LCOV - code coverage report
Current view: top level - gcore - overview.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 2557 2919 87.6 %
Date: 2025-05-31 00:00:17 Functions: 113 138 81.9 %

          Line data    Source code
       1             : 
       2             : /******************************************************************************
       3             :  *
       4             :  * Project:  GDAL Core
       5             :  * Purpose:  Helper code to implement overview support in different drivers.
       6             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       7             :  *
       8             :  ******************************************************************************
       9             :  * Copyright (c) 2000, Frank Warmerdam
      10             :  * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
      11             :  *
      12             :  * SPDX-License-Identifier: MIT
      13             :  ****************************************************************************/
      14             : 
      15             : #include "cpl_port.h"
      16             : #include "gdal_priv.h"
      17             : 
      18             : #include <cmath>
      19             : #include <cstddef>
      20             : #include <cstdlib>
      21             : 
      22             : #include <algorithm>
      23             : #include <complex>
      24             : #include <condition_variable>
      25             : #include <limits>
      26             : #include <list>
      27             : #include <memory>
      28             : #include <mutex>
      29             : #include <vector>
      30             : 
      31             : #include "cpl_conv.h"
      32             : #include "cpl_error.h"
      33             : #include "cpl_float.h"
      34             : #include "cpl_progress.h"
      35             : #include "cpl_vsi.h"
      36             : #include "gdal.h"
      37             : #include "gdal_thread_pool.h"
      38             : #include "gdalwarper.h"
      39             : #include "gdal_vrt.h"
      40             : #include "vrtdataset.h"
      41             : 
      42             : #ifdef USE_NEON_OPTIMIZATIONS
      43             : #include "include_sse2neon.h"
      44             : #define USE_SSE2
      45             : 
      46             : #include "gdalsse_priv.h"
      47             : 
      48             : // Restrict to 64bit processors because they are guaranteed to have SSE2,
      49             : // or if __AVX2__ is defined.
      50             : #elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
      51             : #define USE_SSE2
      52             : 
      53             : #include "gdalsse_priv.h"
      54             : 
      55             : #ifdef __SSE3__
      56             : #include <pmmintrin.h>
      57             : #endif
      58             : #ifdef __SSSE3__
      59             : #include <tmmintrin.h>
      60             : #endif
      61             : #ifdef __SSE4_1__
      62             : #include <smmintrin.h>
      63             : #endif
      64             : #ifdef __AVX2__
      65             : #include <immintrin.h>
      66             : #endif
      67             : 
      68             : #endif
      69             : 
      70             : // To be included after above USE_SSE2 and include gdalsse_priv.h
      71             : // to avoid build issue on Windows x86
      72             : #include "gdal_priv_templates.hpp"
      73             : 
      74             : /************************************************************************/
      75             : /*                      GDALResampleChunk_Near()                        */
      76             : /************************************************************************/
      77             : 
      78             : template <class T>
      79        1233 : static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
      80             :                                       const T *pChunk, T **ppDstBuffer)
      81             : 
      82             : {
      83        1233 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
      84        1233 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
      85        1233 :     const GDALDataType eWrkDataType = args.eWrkDataType;
      86        1233 :     const int nChunkXOff = args.nChunkXOff;
      87        1233 :     const int nChunkXSize = args.nChunkXSize;
      88        1233 :     const int nChunkYOff = args.nChunkYOff;
      89        1233 :     const int nDstXOff = args.nDstXOff;
      90        1233 :     const int nDstXOff2 = args.nDstXOff2;
      91        1233 :     const int nDstYOff = args.nDstYOff;
      92        1233 :     const int nDstYOff2 = args.nDstYOff2;
      93        1233 :     const int nDstXWidth = nDstXOff2 - nDstXOff;
      94             : 
      95             :     /* -------------------------------------------------------------------- */
      96             :     /*      Allocate buffers.                                               */
      97             :     /* -------------------------------------------------------------------- */
      98        1233 :     *ppDstBuffer = static_cast<T *>(
      99        1233 :         VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
     100             :                             GDALGetDataTypeSizeBytes(eWrkDataType)));
     101        1233 :     if (*ppDstBuffer == nullptr)
     102             :     {
     103           0 :         return CE_Failure;
     104             :     }
     105        1233 :     T *const pDstBuffer = *ppDstBuffer;
     106             : 
     107             :     int *panSrcXOff =
     108        1233 :         static_cast<int *>(VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(int)));
     109             : 
     110        1233 :     if (panSrcXOff == nullptr)
     111             :     {
     112           0 :         return CE_Failure;
     113             :     }
     114             : 
     115             :     /* ==================================================================== */
     116             :     /*      Precompute inner loop constants.                                */
     117             :     /* ==================================================================== */
     118      842009 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
     119             :     {
     120      840776 :         int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
     121      840776 :         if (nSrcXOff < nChunkXOff)
     122           0 :             nSrcXOff = nChunkXOff;
     123             : 
     124      840776 :         panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
     125             :     }
     126             : 
     127             :     /* ==================================================================== */
     128             :     /*      Loop over destination scanlines.                                */
     129             :     /* ==================================================================== */
     130      141825 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
     131             :     {
     132      140592 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
     133      140592 :         if (nSrcYOff < nChunkYOff)
     134           0 :             nSrcYOff = nChunkYOff;
     135             : 
     136      140592 :         const T *const pSrcScanline =
     137             :             pChunk +
     138      140592 :             (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
     139      138074 :             nChunkXOff;
     140             : 
     141             :         /* --------------------------------------------------------------------
     142             :          */
     143             :         /*      Loop over destination pixels */
     144             :         /* --------------------------------------------------------------------
     145             :          */
     146      140592 :         T *pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
     147   119627130 :         for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
     148             :         {
     149   119486612 :             pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
     150             :         }
     151             :     }
     152             : 
     153        1233 :     CPLFree(panSrcXOff);
     154             : 
     155        1233 :     return CE_None;
     156             : }
     157             : 
     158        1233 : static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
     159             :                                      const void *pChunk, void **ppDstBuffer,
     160             :                                      GDALDataType *peDstBufferDataType)
     161             : {
     162        1233 :     *peDstBufferDataType = args.eWrkDataType;
     163        1233 :     switch (args.eWrkDataType)
     164             :     {
     165             :         // For nearest resampling, as no computation is done, only the
     166             :         // size of the data type matters.
     167        1081 :         case GDT_Byte:
     168             :         case GDT_Int8:
     169             :         {
     170        1081 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 1);
     171        1081 :             return GDALResampleChunk_NearT(
     172             :                 args, static_cast<const uint8_t *>(pChunk),
     173        1081 :                 reinterpret_cast<uint8_t **>(ppDstBuffer));
     174             :         }
     175             : 
     176          50 :         case GDT_Int16:
     177             :         case GDT_UInt16:
     178             :         case GDT_Float16:
     179             :         {
     180          50 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
     181          50 :             return GDALResampleChunk_NearT(
     182             :                 args, static_cast<const uint16_t *>(pChunk),
     183          50 :                 reinterpret_cast<uint16_t **>(ppDstBuffer));
     184             :         }
     185             : 
     186          55 :         case GDT_CInt16:
     187             :         case GDT_CFloat16:
     188             :         case GDT_Int32:
     189             :         case GDT_UInt32:
     190             :         case GDT_Float32:
     191             :         {
     192          55 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
     193          55 :             return GDALResampleChunk_NearT(
     194             :                 args, static_cast<const uint32_t *>(pChunk),
     195          55 :                 reinterpret_cast<uint32_t **>(ppDstBuffer));
     196             :         }
     197             : 
     198          43 :         case GDT_CInt32:
     199             :         case GDT_CFloat32:
     200             :         case GDT_Int64:
     201             :         case GDT_UInt64:
     202             :         case GDT_Float64:
     203             :         {
     204          43 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
     205          43 :             return GDALResampleChunk_NearT(
     206             :                 args, static_cast<const uint64_t *>(pChunk),
     207          43 :                 reinterpret_cast<uint64_t **>(ppDstBuffer));
     208             :         }
     209             : 
     210           4 :         case GDT_CFloat64:
     211             :         {
     212           4 :             return GDALResampleChunk_NearT(
     213             :                 args, static_cast<const std::complex<double> *>(pChunk),
     214           4 :                 reinterpret_cast<std::complex<double> **>(ppDstBuffer));
     215             :         }
     216             : 
     217           0 :         case GDT_Unknown:
     218             :         case GDT_TypeCount:
     219           0 :             break;
     220             :     }
     221           0 :     CPLAssert(false);
     222             :     return CE_Failure;
     223             : }
     224             : 
     225             : namespace
     226             : {
     227             : 
     228             : // Find in the color table the entry whose RGB value is the closest
     229             : // (using quadratic distance) to the test color, ignoring transparent entries.
     230        3837 : int BestColorEntry(const std::vector<GDALColorEntry> &entries,
     231             :                    const GDALColorEntry &test)
     232             : {
     233        3837 :     int nMinDist = std::numeric_limits<int>::max();
     234        3837 :     size_t bestEntry = 0;
     235      986109 :     for (size_t i = 0; i < entries.size(); ++i)
     236             :     {
     237      982272 :         const GDALColorEntry &entry = entries[i];
     238             :         // Ignore transparent entries
     239      982272 :         if (entry.c4 == 0)
     240        3237 :             continue;
     241             : 
     242      979035 :         int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
     243      979035 :                     ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
     244      979035 :                     ((test.c3 - entry.c3) * (test.c3 - entry.c3));
     245      979035 :         if (nDist < nMinDist)
     246             :         {
     247       15847 :             nMinDist = nDist;
     248       15847 :             bestEntry = i;
     249             :         }
     250             :     }
     251        3837 :     return static_cast<int>(bestEntry);
     252             : }
     253             : 
     254           7 : std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
     255             :                                            int &transparentIdx)
     256             : {
     257           7 :     std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
     258             : 
     259           7 :     transparentIdx = -1;
     260           7 :     int i = 0;
     261        1799 :     for (auto &entry : entries)
     262             :     {
     263        1792 :         table.GetColorEntryAsRGB(i, &entry);
     264        1792 :         if (transparentIdx < 0 && entry.c4 == 0)
     265           1 :             transparentIdx = i;
     266        1792 :         ++i;
     267             :     }
     268           7 :     return entries;
     269             : }
     270             : 
     271             : }  // unnamed  namespace
     272             : 
     273             : /************************************************************************/
     274             : /*                             SQUARE()                                 */
     275             : /************************************************************************/
     276             : 
     277        3721 : template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
     278             : {
     279        3721 :     return static_cast<Tsquare>(val) * val;
     280             : }
     281             : 
     282             : /************************************************************************/
     283             : /*                          ComputeIntegerRMS()                         */
     284             : /************************************************************************/
     285             : // Compute rms = sqrt(sumSquares / weight) in such a way that it is the
     286             : // integer that minimizes abs(rms**2 - sumSquares / weight)
     287             : template <class T, class Twork>
     288          42 : inline T ComputeIntegerRMS(double sumSquares, double weight)
     289             : {
     290          42 :     const double sumDivWeight = sumSquares / weight;
     291          42 :     T rms = static_cast<T>(sqrt(sumDivWeight));
     292             : 
     293             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     294             :     // Naive version:
     295             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     296          42 :     if (static_cast<double>(static_cast<Twork>(2) * rms * (rms + 1) + 1) <
     297          42 :         2 * sumDivWeight)
     298           6 :         rms += 1;
     299          42 :     return rms;
     300             : }
     301             : 
     302           0 : template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
     303             : {
     304           0 :     CPLAssert(false);
     305             :     return 0;
     306             : }
     307             : 
     308          24 : template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
     309             : {
     310             :     // It has been verified that given the correction on rms below, using
     311             :     // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
     312             :     // is equivalent, so use the former as it is used twice.
     313          24 :     const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
     314          24 :     const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
     315          24 :     GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
     316             : 
     317             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     318             :     // Naive version:
     319             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     320             :     // Optimized version for integer case and weight == 4
     321          24 :     if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
     322           5 :         rms += 1;
     323          24 :     return rms;
     324             : }
     325             : 
     326             : template <>
     327          20 : inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
     328             : {
     329          20 :     const double sumDivWeight = sumSquares * 0.25;
     330          20 :     GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
     331             : 
     332             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     333             :     // Naive version:
     334             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     335             :     // Optimized version for integer case and weight == 4
     336          20 :     if (static_cast<GUInt32>(rms) * (rms + 1) <
     337          20 :         static_cast<GUInt32>(sumDivWeight + 0.25))
     338           4 :         rms += 1;
     339          20 :     return rms;
     340             : }
     341             : 
     342             : #ifdef USE_SSE2
     343             : 
     344             : /************************************************************************/
     345             : /*                   QuadraticMeanByteSSE2OrAVX2()                      */
     346             : /************************************************************************/
     347             : 
     348             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     349             : #define sse2_packus_epi32 _mm_packus_epi32
     350             : #else
     351      516119 : inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
     352             : {
     353      516119 :     const auto minus32768_32 = _mm_set1_epi32(-32768);
     354      516119 :     const auto minus32768_16 = _mm_set1_epi16(-32768);
     355      516119 :     a = _mm_add_epi32(a, minus32768_32);
     356      516119 :     b = _mm_add_epi32(b, minus32768_32);
     357      516119 :     a = _mm_packs_epi32(a, b);
     358      516119 :     a = _mm_sub_epi16(a, minus32768_16);
     359      516119 :     return a;
     360             : }
     361             : #endif
     362             : 
     363             : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
     364             : #define sse2_hadd_epi16 _mm_hadd_epi16
     365             : #else
     366     4669030 : inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
     367             : {
     368             :     // Horizontal addition of adjacent pairs
     369     4669030 :     const auto mask = _mm_set1_epi32(0xFFFF);
     370             :     const auto horizLo =
     371    14007100 :         _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
     372             :     const auto horizHi =
     373    14007100 :         _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
     374             : 
     375             :     // Recombine low and high parts
     376     4669030 :     return _mm_packs_epi32(horizLo, horizHi);
     377             : }
     378             : #endif
     379             : 
// Width-agnostic aliases for the vector intrinsics used by the Byte kernels
// below. Each alias maps to the 256-bit (_mm256_*) intrinsic when __AVX2__
// is defined, and to the 128-bit (_mm_*) intrinsic, or to one of the sse2_*
// helpers defined above, otherwise. DEST_ELTS is the number of destination
// pixels produced per loop iteration of those kernels.
#ifdef __AVX2__

// 256-bit registers: 16 destination pixels per iteration
#define DEST_ELTS 16
#define set1_epi16 _mm256_set1_epi16
#define set1_epi32 _mm256_set1_epi32
#define setzero _mm256_setzero_si256
#define set1_ps _mm256_set1_ps
#define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
#define unpacklo_epi8 _mm256_unpacklo_epi8
#define unpackhi_epi8 _mm256_unpackhi_epi8
#define madd_epi16 _mm256_madd_epi16
#define add_epi32 _mm256_add_epi32
#define mul_ps _mm256_mul_ps
#define cvtepi32_ps _mm256_cvtepi32_ps
#define sqrt_ps _mm256_sqrt_ps
#define cvttps_epi32 _mm256_cvttps_epi32
#define packs_epi32 _mm256_packs_epi32
#define packus_epi32 _mm256_packus_epi32
#define srli_epi32 _mm256_srli_epi32
#define mullo_epi16 _mm256_mullo_epi16
#define srli_epi16 _mm256_srli_epi16
#define cmpgt_epi16 _mm256_cmpgt_epi16
#define add_epi16 _mm256_add_epi16
#define sub_epi16 _mm256_sub_epi16
#define packus_epi16 _mm256_packus_epi16
/* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
/* to get the lower 128-bit bits of what would be a true 256-bit vector register
 */
#define store_lo(x, y)                                                         \
    _mm_storeu_si128(reinterpret_cast<__m128i *>(x),                           \
                     _mm256_extracti128_si256(                                 \
                         _mm256_permute4x64_epi64((y), 0 | (2 << 2)), 0))
#define hadd_epi16 _mm256_hadd_epi16
// Only the AVX2 build issues a real zeroupper instruction
#define zeroupper() _mm256_zeroupper()
#else
// 128-bit registers: 8 destination pixels per iteration
#define DEST_ELTS 8
#define set1_epi16 _mm_set1_epi16
#define set1_epi32 _mm_set1_epi32
#define setzero _mm_setzero_si128
#define set1_ps _mm_set1_ps
#define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
#define unpacklo_epi8 _mm_unpacklo_epi8
#define unpackhi_epi8 _mm_unpackhi_epi8
#define madd_epi16 _mm_madd_epi16
#define add_epi32 _mm_add_epi32
#define mul_ps _mm_mul_ps
#define cvtepi32_ps _mm_cvtepi32_ps
#define sqrt_ps _mm_sqrt_ps
#define cvttps_epi32 _mm_cvttps_epi32
#define packs_epi32 _mm_packs_epi32
// sse2_packus_epi32 is either _mm_packus_epi32 or its SSE2 emulation
#define packus_epi32 sse2_packus_epi32
#define srli_epi32 _mm_srli_epi32
#define mullo_epi16 _mm_mullo_epi16
#define srli_epi16 _mm_srli_epi16
#define cmpgt_epi16 _mm_cmpgt_epi16
#define add_epi16 _mm_add_epi16
#define sub_epi16 _mm_sub_epi16
#define packus_epi16 _mm_packus_epi16
// Stores the low 64 bits (8 bytes) of y at address x
#define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
// sse2_hadd_epi16 is either _mm_hadd_epi16 or its SSE2 emulation
#define hadd_epi16 sse2_hadd_epi16
#define zeroupper() (void)0
#endif
     442             : 
// NOINLINE expands to __attribute__((noinline)) for GCC-compatible compilers
// building with AVX2, and to nothing otherwise.
#if defined(__GNUC__) && defined(__AVX2__)
// Disabling inlining works around a bug with gcc 9.3 (Ubuntu 20.04) in
// -O2 -mavx2 mode in QuadraticMeanFloatSSE2(),
// where the register that contains minus_zero is correctly
// loaded the first time the function is called (looking at the disassembly,
// one sees it is loaded much earlier than the function), but gets corrupted
// (zeroed) in following iterations.
// It appears the bug is due to the explicit zeroupper() call at the end of
// the function.
// The bug is at least solved in gcc 10.2.
// Inlining doesn't bring much to performance here.
// This is also needed with gcc 9.3 on QuadraticMeanByteSSE2OrAVX2() in
// -O3 -mavx2 mode
#define NOINLINE __attribute__((noinline))
#else
#define NOINLINE
#endif
     460             : 
// Vectorized 2x2 quadratic mean (RMS) downsampling of one destination row.
//
// Each destination pixel is computed from 2 horizontally adjacent source
// values on the current source line and the 2 values below them on the next
// line (nChunkXSize elements further). Pixels are produced by groups of
// DEST_ELTS; the loop stops as soon as fewer than DEST_ELTS destination
// pixels remain.
//
// nDstXWidth:               number of destination pixels requested.
// nChunkXSize:              distance, in elements, between the two source
//                           lines.
// pSrcScanlineShiftedInOut: in: first source element to read; out: advanced
//                           by 2 elements per destination pixel written.
// pDstScanline:             destination row.
//
// Returns the number of destination pixels written (a multiple of
// DEST_ELTS); the remaining ones are presumably handled by the caller.
// NOTE(review): loads and stores are byte-wise, so T is presumably a 1-byte
// type such as GByte - confirm against the callers.
template <class T>
static int NOINLINE
QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
                            const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
                            T *CPL_RESTRICT pDstScanline)
{
    // Optimized implementation for RMS on Byte by
    // processing by group of DEST_ELTS output pixels, so as to use
    // a single sqrt_ps() call for half of them
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;

    int iDstPixel = 0;
    const auto one16 = set1_epi16(1);
    const auto one32 = set1_epi32(1);
    const auto zero = setzero();
    const auto minus32768 = set1_epi16(-32768);

    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
    {
        // Load 2 * DEST_ELTS bytes from each line
        auto firstLine = loadu_int(pSrcScanlineShifted);
        auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
        // Extend those Bytes as UInt16s
        auto firstLineLo = unpacklo_epi8(firstLine, zero);
        auto firstLineHi = unpackhi_epi8(firstLine, zero);
        auto secondLineLo = unpacklo_epi8(secondLine, zero);
        auto secondLineHi = unpackhi_epi8(secondLine, zero);

        // Multiplication of 16 bit values and horizontal
        // addition of 32 bit results
        // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
        firstLineLo = madd_epi16(firstLineLo, firstLineLo);
        firstLineHi = madd_epi16(firstLineHi, firstLineHi);
        secondLineLo = madd_epi16(secondLineLo, secondLineLo);
        secondLineHi = madd_epi16(secondLineHi, secondLineHi);

        // Vertical addition
        const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
        const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);

        // (sumSquares + 1) / 4, computed with a logical right shift by 2
        const auto sumSquaresPlusOneDiv4Lo =
            srli_epi32(add_epi32(sumSquaresLo, one32), 2);
        const auto sumSquaresPlusOneDiv4Hi =
            srli_epi32(add_epi32(sumSquaresHi, one32), 2);

        // Take square root and truncate/floor to int32
        const auto rmsLo =
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
        const auto rmsHi =
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));

        // Merge back low and high registers with each RMS value
        // as a 16 bit value.
        auto rms = packs_epi32(rmsLo, rmsHi);

        // Round to upper value if it minimizes the
        // error |rms^2 - sumSquares/4|
        // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
        //    rms += 1;
        // which is equivalent to:
        // if( rms * (rms + 1) < (sumSquares+1) / 4 )
        //    rms += 1;
        // And both left and right parts fit on 16 (unsigned) bits
        const auto sumSquaresPlusOneDiv4 =
            packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
        // cmpgt_epi16 operates on signed int16, but here
        // we have unsigned values, so shift them by -32768 before
        auto mask = cmpgt_epi16(
            add_epi16(sumSquaresPlusOneDiv4, minus32768),
            add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
        // The value of the mask will be -1 when the correction needs to be
        // applied
        rms = sub_epi16(rms, mask);

        // Pack each 16 bit RMS value to 8 bits
        rms = packus_epi16(rms, rms /* could be anything */);
        // Write DEST_ELTS bytes to the destination row
        store_lo(&pDstScanline[iDstPixel], rms);
        // 2 source elements are consumed per destination pixel
        pSrcScanlineShifted += 2 * DEST_ELTS;
    }
    zeroupper();

    // Report how far the source pointer advanced and how many pixels were
    // written
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
    return iDstPixel;
}
     545             : 
     546             : /************************************************************************/
     547             : /*                      AverageByteSSE2OrAVX2()                         */
     548             : /************************************************************************/
     549             : 
     550             : template <class T>
     551             : static int
     552      111280 : AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
     553             :                       const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     554             :                       T *CPL_RESTRICT pDstScanline)
     555             : {
     556             :     // Optimized implementation for average on Byte by
     557             :     // processing by group of DEST_ELTS output pixels.
                           //
                           // Each output pixel is (a + b + c + d + 2) >> 2, where a and b are two
                           // horizontally adjacent source values of the first line, and c and d
                           // the values at the same columns of the second line, which starts
                           // nChunkXSize elements further.
                           //
                           // nDstXWidth: number of output pixels requested.
                           // nChunkXSize: offset, in elements, between the two source lines.
                           // pSrcScanlineShiftedInOut: source pointer; on return it has been
                           //     advanced by 2 source elements per output pixel written.
                           // pDstScanline: output line.
                           // Returns the number of output pixels written, a multiple of DEST_ELTS.
                           // The remaining nDstXWidth - returned pixels are not processed here.
                           //
                           // NOTE(review): setzero, loadu_int, hadd_epi16, store_lo, zeroupper and
                           // DEST_ELTS are defined outside this function; this description assumes
                           // they map to the same-named SSE2/AVX2 operations - confirm.
     558             : 
     559      111280 :     const auto zero = setzero();
     560      111280 :     const auto two16 = set1_epi16(2);
     561      111280 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     562             : 
     563      111280 :     int iDstPixel = 0;
                           // Stop as soon as fewer than DEST_ELTS output pixels remain
     564     4780310 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
     565             :     {
     566             :         // Load 2 * DEST_ELTS bytes from each line
     567     4669030 :         const auto firstLine = loadu_int(pSrcScanlineShifted);
     568     9338050 :         const auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
     569             :         // Extend those Bytes as UInt16s
     570     4669030 :         const auto firstLineLo = unpacklo_epi8(firstLine, zero);
     571     4669030 :         const auto firstLineHi = unpackhi_epi8(firstLine, zero);
     572     4669030 :         const auto secondLineLo = unpacklo_epi8(secondLine, zero);
     573     4669030 :         const auto secondLineHi = unpackhi_epi8(secondLine, zero);
     574             : 
     575             :         // Vertical addition
     576     4669030 :         const auto sumLo = add_epi16(firstLineLo, secondLineLo);
     577     4669030 :         const auto sumHi = add_epi16(firstLineHi, secondLineHi);
     578             : 
     579             :         // Horizontal addition of adjacent pairs, and recombine low and high
     580             :         // parts
     581     4669030 :         const auto sum = hadd_epi16(sumLo, sumHi);
     582             : 
     583             :         // average = (sum + 2) / 4
     584     9338050 :         auto average = srli_epi16(add_epi16(sum, two16), 2);
     585             : 
     586             :         // Pack each 16 bit average value to 8 bits
     587     4669030 :         average = packus_epi16(average, average /* could be anything */);
     588     4669030 :         store_lo(&pDstScanline[iDstPixel], average);
                               // Two source columns are consumed per output pixel
     589     4669030 :         pSrcScanlineShifted += 2 * DEST_ELTS;
     590             :     }
     591             :     zeroupper();
     592             : 
                           // Report the new source position back to the caller
     593      111280 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     594      111280 :     return iDstPixel;
     595             : }
     596             : 
     597             : /************************************************************************/
     598             : /*                     QuadraticMeanUInt16SSE2()                        */
     599             : /************************************************************************/
     600             : 
     601             : #ifdef __SSE3__
     602             : #define sse2_hadd_pd _mm_hadd_pd
     603             : #else
                       // Replacement for _mm_hadd_pd() used when __SSE3__ is not defined.
                       // Returns (a[0] + a[1], b[0] + b[1]).
     604           8 : inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
     605             : {
                           // (a[0], b[0]): low double of each operand
     606             :     auto aLo_bLo =
     607          32 :         _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
                           // (a[1], b[1]): high double of each operand
     608             :     auto aHi_bHi =
     609          32 :         _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
     610           8 :     return _mm_add_pd(aLo_bLo, aHi_bHi);  // (aLo + aHi, bLo + bHi)
     611             : }
     612             : #endif
     613             : 
     614          40 : inline __m128d SQUARE_PD(__m128d x)
     615             : {
                           // Element-wise square of the 2 packed doubles of x.
     616          40 :     return _mm_mul_pd(x, x);
     617             : }
     618             : 
     619             : #ifdef __AVX2__
                       // 256-bit helpers, only compiled when __AVX2__ is defined.
     620             : 
     621             : inline __m256d SQUARE_PD(__m256d x)
     622             : {
                           // Element-wise square of the 4 packed doubles of x.
     623             :     return _mm256_mul_pd(x, x);
     624             : }
     625             : 
     626             : inline __m256d FIXUP_LANES(__m256d x)
     627             : {
                           // Reorder the four 64-bit elements as (x[0], x[2], x[1], x[3]), i.e.
                           // swap the two middle ones. QuadraticMeanUInt16SSE2() applies it to
                           // the result of _mm256_hadd_pd().
     628             :     return _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
     629             : }
     630             : 
     631             : inline __m256 FIXUP_LANES(__m256 x)
     632             : {
                           // Same 64-bit element reordering, applied to a vector of floats by
                           // going through the __m256d overload.
     633             :     return _mm256_castpd_ps(FIXUP_LANES(_mm256_castps_pd(x)));
     634             : }
     635             : 
     636             : #endif
     637             : 
     638             : template <class T>
     639             : static int
     640          10 : QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
     641             :                         const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     642             :                         T *CPL_RESTRICT pDstScanline)
     643             : {
     644             :     // Optimized implementation for RMS on UInt16 by
     645             :     // processing by group of 4 output pixels.
                           //
                           // Each output pixel is the quadratic mean
                           // sqrt((a^2 + b^2 + c^2 + d^2) / 4) of a 2x2 source block, a and b being
                           // adjacent values of the first line, and c and d the values at the same
                           // columns of the second line, which starts nChunkXSize elements further.
                           // The square root is truncated, then incremented by one when the
                           // correction tests below require it.
                           //
                           // nDstXWidth: number of output pixels requested.
                           // nChunkXSize: offset, in elements, between the two source lines.
                           // pSrcScanlineShiftedInOut: source pointer; on return it has been
                           //     advanced by 8 elements per group of 4 output pixels written.
                           // pDstScanline: output line.
                           // Returns the number of output pixels written (a multiple of 4).
                           //
                           // NOTE(review): loads and stores go through __m128i casts of T*, so T
                           // is presumably a 16-bit unsigned type - confirm against the callers.
     646          10 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     647             : 
     648          10 :     int iDstPixel = 0;
     649          10 :     const auto zero = _mm_setzero_si128();
     650             : 
     651             : #ifdef __AVX2__
     652             :     const auto zeroDot25 = _mm256_set1_pd(0.25);
     653             :     const auto zeroDot5 = _mm256_set1_pd(0.5);
     654             : 
     655             :     // The first four 0's could be anything, as we only take the bottom
     656             :     // 128 bits.
     657             :     const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
     658             : #else
     659          10 :     const auto zeroDot25 = _mm_set1_pd(0.25);
     660          10 :     const auto zeroDot5 = _mm_set1_pd(0.5);
     661             : #endif
     662             : 
                           // Stop as soon as fewer than 4 output pixels remain
     663          40 :     for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
     664             :     {
     665             :         // Load 8 UInt16 from each line
     666          30 :         const auto firstLine = _mm_loadu_si128(
     667             :             reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
     668             :         const auto secondLine =
     669          30 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     670          30 :                 pSrcScanlineShifted + nChunkXSize));
     671             : 
     672             :         // Detect if all of the source values fit in 14 bits.
     673             :         // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
     674             :         // and we can do a much faster implementation.
     675             :         const auto maskTmp =
     676          60 :             _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
                               // Gather the 8 shifted values as bytes in the low 64 bits: the result
                               // is 0 only if no source value has bit 14 or 15 set. The 32-bit x86
                               // branch reads those 64 bits through memory, the other branch uses
                               // _mm_cvtsi128_si64().
     677             : #if defined(__i386__) || defined(_M_IX86)
     678             :         uint64_t nMaskFitsIn14Bits = 0;
     679             :         _mm_storel_epi64(
     680             :             reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
     681             :             _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
     682             : #else
     683          30 :         const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
     684             :             _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
     685             : #endif
     686          30 :         if (nMaskFitsIn14Bits == 0)
     687             :         {
                                   // Fast path: everything is done with 32-bit integers and floats
     688             :             // Multiplication of 16 bit values and horizontal
     689             :             // addition of 32 bit results
     690             :             const auto firstLineHSumSquare =
     691          26 :                 _mm_madd_epi16(firstLine, firstLine);
     692             :             const auto secondLineHSumSquare =
     693          26 :                 _mm_madd_epi16(secondLine, secondLine);
     694             :             // Vertical addition
     695             :             const auto sumSquares =
     696          26 :                 _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
     697             :             // In theory we should take sqrt(sumSquares * 0.25f)
     698             :             // but given the rounding we do, this is equivalent to
     699             :             // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
     700             :             // sumSquares <= 4 * 16383^2
     701          26 :             const auto one32 = _mm_set1_epi32(1);
     702             :             const auto sumSquaresPlusOneDiv4 =
     703          52 :                 _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
     704             :             // Take square root and truncate/floor to int32
     705          78 :             auto rms = _mm_cvttps_epi32(
     706             :                 _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
     707             : 
     708             :             // Round to upper value if it minimizes the
     709             :             // error |rms^2 - sumSquares/4|
     710             :             // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
     711             :             //    rms += 1;
     712             :             // which is equivalent to:
     713             :             // if( rms * rms + rms < (sumSquares+1) / 4 )
     714             :             //    rms += 1;
                                   // _mm_madd_epi16(rms, rms) yields rms * rms in each 32-bit element
                                   // here, since the upper 16 bits of each rms element are zero.
     715             :             auto mask =
     716          78 :                 _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
     717             :                                 _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
                                   // The mask is -1 where the correction applies: subtracting it
                                   // adds one.
     718          26 :             rms = _mm_sub_epi32(rms, mask);
     719             :             // Pack each 32 bit RMS value to 16 bits
     720          26 :             rms = _mm_packs_epi32(rms, rms /* could be anything */);
     721             :             _mm_storel_epi64(
     722          26 :                 reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
     723          26 :             pSrcScanlineShifted += 8;
     724          26 :             continue;
     725             :         }
     726             : 
                               // General path: squares are computed and summed as doubles
     727             :         // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
     728             :         // to 32 bit would result in 4 multiplications instead of 8, but
     729             :         // mullo/mulhi have a worse throughput than mul_pd.
     730             : 
     731             :         // Extend those UInt16s as UInt32s
     732           4 :         const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
     733           4 :         const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
     734           4 :         const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
     735           4 :         const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
     736             : 
     737             : #ifdef __AVX2__
     738             :         // Multiplication of 32 bit values previously converted to 64 bit double
     739             :         const auto firstLineLoDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineLo));
     740             :         const auto firstLineHiDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineHi));
     741             :         const auto secondLineLoDbl =
     742             :             SQUARE_PD(_mm256_cvtepi32_pd(secondLineLo));
     743             :         const auto secondLineHiDbl =
     744             :             SQUARE_PD(_mm256_cvtepi32_pd(secondLineHi));
     745             : 
     746             :         // Vertical addition of squares
     747             :         const auto sumSquaresLo =
     748             :             _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
     749             :         const auto sumSquaresHi =
     750             :             _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
     751             : 
     752             :         // Horizontal addition of squares
     753             :         const auto sumSquares =
     754             :             FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
     755             : 
     756             :         const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
     757             : 
     758             :         // Take square root and truncate/floor to int32
     759             :         auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
                               // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
                               // if( 0.5 < sumDivWeight - (rms * rms + rms) )
                               //     rms += 1;
     760             :         const auto rmsDouble = _mm256_cvtepi32_pd(rms);
     761             :         const auto right = _mm256_sub_pd(
     762             :             sumDivWeight, _mm256_add_pd(SQUARE_PD(rmsDouble), rmsDouble));
     763             : 
     764             :         auto mask =
     765             :             _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
     766             :         // Extract 32-bit from each of the 4 64-bit masks
     767             :         // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
     768             :         // _MM_SHUFFLE(2,0,2,0)));
     769             :         mask = _mm256_permutevar8x32_ps(mask, permutation);
     770             :         const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
     771             : 
     772             :         // Apply the correction
     773             :         rms = _mm_sub_epi32(rms, maskI);
     774             : 
     775             :         // Pack each 32 bit RMS value to 16 bits
     776             :         rms = _mm_packus_epi32(rms, rms /* could be anything */);
     777             : #else
     778             :         // Multiplication of 32 bit values previously converted to 64 bit double
                               // (_mm_cvtepi32_pd converts the 2 low elements only, hence the
                               // byte shift by 8 to reach the 2 high ones)
     779           4 :         const auto firstLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineLo));
     780             :         const auto firstLineLoHi =
     781           8 :             SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
     782           4 :         const auto firstLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineHi));
     783             :         const auto firstLineHiHi =
     784           8 :             SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
     785             : 
     786           4 :         const auto secondLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineLo));
     787             :         const auto secondLineLoHi =
     788           8 :             SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
     789           4 :         const auto secondLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineHi));
     790             :         const auto secondLineHiHi =
     791           8 :             SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
     792             : 
     793             :         // Vertical addition of squares
     794           4 :         const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
     795           4 :         const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
     796           4 :         const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
     797           4 :         const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
     798             : 
     799             :         // Horizontal addition of squares
     800           4 :         const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
     801           4 :         const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
     802             : 
     803           4 :         const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
     804           4 :         const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
     805             :         // Take square root and truncate/floor to int32
     806           8 :         const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
     807           8 :         const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
     808             : 
     809             :         // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
     810             :         // if( 0.5 < sumDivWeight - (rms * rms + rms) )
     811             :         //     rms += 1;
     812           4 :         const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
     813           4 :         const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
     814           8 :         const auto rightLo = _mm_sub_pd(
     815             :             sumDivWeightLo, _mm_add_pd(SQUARE_PD(rmsLoDouble), rmsLoDouble));
     816          12 :         const auto rightHi = _mm_sub_pd(
     817             :             sumDivWeightHi, _mm_add_pd(SQUARE_PD(rmsHiDouble), rmsHiDouble));
     818             : 
     819           8 :         const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
     820           4 :         const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
     821             :         // The value of the mask will be -1 when the correction needs to be
     822             :         // applied
                               // (32-bit elements 0 and 2 of maskLo, then of maskHi, are kept, i.e.
                               // one 32-bit half of each 64-bit comparison result)
     823           8 :         const auto mask = _mm_castps_si128(_mm_shuffle_ps(
     824             :             maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
     825             : 
                               // Gather the 2 low int32 of rmsLo and rmsHi in a single register
     826          16 :         auto rms = _mm_castps_si128(
     827             :             _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
     828             :         // Apply the correction
     829           4 :         rms = _mm_sub_epi32(rms, mask);
     830             : 
     831             :         // Pack each 32 bit RMS value to 16 bits
     832           4 :         rms = sse2_packus_epi32(rms, rms /* could be anything */);
     833             : #endif
     834             : 
                               // Only the 4 low 16-bit values are stored
     835           4 :         _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
     836             :                          rms);
     837           4 :         pSrcScanlineShifted += 8;
     838             :     }
     839             : 
     840             :     zeroupper();
     841             : 
                           // Report the new source position back to the caller
     842          10 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     843          10 :     return iDstPixel;
     844             : }
     845             : 
     846             : /************************************************************************/
     847             : /*                         AverageUInt16SSE2()                          */
     848             : /************************************************************************/
     849             : 
     850             : template <class T>
     851           9 : static int AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
     852             :                              const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     853             :                              T *CPL_RESTRICT pDstScanline)
     854             : {
     855             :     // Optimized implementation for average on UInt16 by
     856             :     // processing by group of 8 output pixels.
                           //
                           // Each output pixel is (a + b + c + d + 2) >> 2, where a and b are two
                           // horizontally adjacent source values of the first line, and c and d
                           // the values at the same columns of the second line, which starts
                           // nChunkXSize elements further. Sums are carried on 32 bits.
                           //
                           // nDstXWidth: number of output pixels requested.
                           // nChunkXSize: offset, in elements, between the two source lines.
                           // pSrcScanlineShiftedInOut: source pointer; on return it has been
                           //     advanced by 16 elements per group of 8 output pixels written.
                           // pDstScanline: output line.
                           // Returns the number of output pixels written (a multiple of 8).
                           //
                           // NOTE(review): loads and stores go through __m128i casts of T*, so T
                           // is presumably a 16-bit unsigned type - confirm against the callers.
     857             : 
                           // Selects the low 16 bits of each 32-bit element
     858           9 :     const auto mask = _mm_set1_epi32(0xFFFF);
     859           9 :     const auto two = _mm_set1_epi32(2);
     860           9 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     861             : 
     862           9 :     int iDstPixel = 0;
                           // Stop as soon as fewer than 8 output pixels remain
     863          13 :     for (; iDstPixel < nDstXWidth - 7; iDstPixel += 8)
     864             :     {
     865             :         __m128i averageLow;
     866             :         // Load the first 8 UInt16 from each line (output pixels 0 to 3)
     867             :         {
     868           4 :             const auto firstLine = _mm_loadu_si128(
     869             :                 reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
     870             :             const auto secondLine =
     871           4 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     872           4 :                     pSrcScanlineShifted + nChunkXSize));
     873             : 
     874             :             // Horizontal addition and extension to 32 bit
                                   // (low 16 bits + high 16 bits of each 32-bit element)
     875          12 :             const auto horizAddFirstLine = _mm_add_epi32(
     876             :                 _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
     877             :             const auto horizAddSecondLine =
     878          12 :                 _mm_add_epi32(_mm_and_si128(secondLine, mask),
     879             :                               _mm_srli_epi32(secondLine, 16));
     880             : 
     881             :             // Vertical addition and average computation
     882             :             // average = (sum + 2) >> 2
     883           8 :             const auto sum = _mm_add_epi32(
     884             :                 _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
     885           4 :             averageLow = _mm_srli_epi32(sum, 2);
     886             :         }
     887             :         // Load the next 8 UInt16 from each line (output pixels 4 to 7)
     888             :         __m128i averageHigh;
     889             :         {
     890           4 :             const auto firstLine = _mm_loadu_si128(
     891           4 :                 reinterpret_cast<__m128i const *>(pSrcScanlineShifted + 8));
     892             :             const auto secondLine =
     893           4 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     894           4 :                     pSrcScanlineShifted + 8 + nChunkXSize));
     895             : 
     896             :             // Horizontal addition and extension to 32 bit
     897          12 :             const auto horizAddFirstLine = _mm_add_epi32(
     898             :                 _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
     899             :             const auto horizAddSecondLine =
     900          12 :                 _mm_add_epi32(_mm_and_si128(secondLine, mask),
     901             :                               _mm_srli_epi32(secondLine, 16));
     902             : 
     903             :             // Vertical addition and average computation
     904             :             // average = (sum + 2) >> 2
     905           8 :             const auto sum = _mm_add_epi32(
     906             :                 _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
     907           4 :             averageHigh = _mm_srli_epi32(sum, 2);
     908             :         }
     909             : 
     910             :         // Pack each 32 bit average value to 16 bits
     911           4 :         auto average = sse2_packus_epi32(averageLow, averageHigh);
     912           4 :         _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
     913             :                          average);
                               // Two source columns are consumed per output pixel
     914           4 :         pSrcScanlineShifted += 16;
     915             :     }
     916             : 
                           // Report the new source position back to the caller
     917           9 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     918           9 :     return iDstPixel;
     919             : }
     920             : 
     921             : /************************************************************************/
     922             : /*                      QuadraticMeanFloatSSE2()                        */
     923             : /************************************************************************/
     924             : 
     925             : #ifdef __AVX2__
                       // Width-neutral aliases for packed-float operations: the 256-bit (__m256)
                       // intrinsics are selected when __AVX2__ is defined, the 128-bit (__m128)
                       // ones otherwise. RMS_FLOAT_ELTS is the number of output pixels that
                       // QuadraticMeanFloatSSE2() produces per loop iteration (8 or 4).
     926             : #define RMS_FLOAT_ELTS 8
     927             : #define set1_ps _mm256_set1_ps
     928             : #define loadu_ps _mm256_loadu_ps
     929             : #define andnot_ps _mm256_andnot_ps
     930             : #define and_ps _mm256_and_ps
     931             : #define max_ps _mm256_max_ps
     932             : #define shuffle_ps _mm256_shuffle_ps
     933             : #define div_ps _mm256_div_ps
     934             : #define cmpeq_ps(x, y) _mm256_cmp_ps(x, y, _CMP_EQ_OQ)
     935             : #define mul_ps _mm256_mul_ps
     936             : #define add_ps _mm256_add_ps
     937             : #define hadd_ps _mm256_hadd_ps
     938             : #define sqrt_ps _mm256_sqrt_ps
     939             : #define or_ps _mm256_or_ps
     940             : #define unpacklo_ps _mm256_unpacklo_ps
     941             : #define unpackhi_ps _mm256_unpackhi_ps
     942             : #define storeu_ps _mm256_storeu_ps
     943             : 
     944             : inline __m256 SQUARE_PS(__m256 x)
     945             : {
                           // Element-wise square of the 8 packed floats of x.
     946             :     return _mm256_mul_ps(x, x);
     947             : }
     948             : 
     949             : #else
     950             : 
     951             : #ifdef __SSE3__
     952             : #define sse2_hadd_ps _mm_hadd_ps
     953             : #else
                       // Replacement for _mm_hadd_ps() used when __SSE3__ is not defined.
                       // Returns (a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3]).
     954             : inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
     955             : {
                           // (a[0], a[2], b[0], b[2])
     956             :     auto aEven_bEven = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
                           // (a[1], a[3], b[1], b[3])
     957             :     auto aOdd_bOdd = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
     958             :     return _mm_add_ps(aEven_bEven, aOdd_bOdd);  // (aEven + aOdd, bEven + bOdd)
     959             : }
     960             : #endif
     961             : 
                       // 128-bit variants of the aliases
     962             : #define RMS_FLOAT_ELTS 4
     963             : #define set1_ps _mm_set1_ps
     964             : #define loadu_ps _mm_loadu_ps
     965             : #define andnot_ps _mm_andnot_ps
     966             : #define and_ps _mm_and_ps
     967             : #define max_ps _mm_max_ps
     968             : #define shuffle_ps _mm_shuffle_ps
     969             : #define div_ps _mm_div_ps
     970             : #define cmpeq_ps _mm_cmpeq_ps
     971             : #define mul_ps _mm_mul_ps
     972             : #define add_ps _mm_add_ps
     973             : #define hadd_ps sse2_hadd_ps
     974             : #define sqrt_ps _mm_sqrt_ps
     975             : #define or_ps _mm_or_ps
     976             : #define unpacklo_ps _mm_unpacklo_ps
     977             : #define unpackhi_ps _mm_unpackhi_ps
     978             : #define storeu_ps _mm_storeu_ps
     979             : 
     980         272 : inline __m128 SQUARE_PS(__m128 x)
     981             : {
                           // Element-wise square of the 4 packed floats of x.
     982         272 :     return _mm_mul_ps(x, x);
     983             : }
     984             : 
     985          68 : inline __m128 FIXUP_LANES(__m128 x)
     986             : {
                           // Nothing to reorder for a 128-bit vector: this overload lets callers
                           // invoke FIXUP_LANES() whatever the vector width.
     987          68 :     return x;
     988             : }
     989             : 
     990             : #endif
     991             : 
     992             : template <class T>
     993             : static int NOINLINE
     994          34 : QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
     995             :                        const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     996             :                        T *CPL_RESTRICT pDstScanline)
     997             : {
     998             :     // Optimized implementation for RMS on Float32 by
     999             :     // processing by group of RMS_FLOAT_ELTS output pixels.
                           //
                           // Each output pixel is sqrt((a^2 + b^2 + c^2 + d^2) / 4) for the 2x2
                           // block made of 2 adjacent values of the first line and of the 2 values
                           // at the same columns of the second line, which starts nChunkXSize
                           // elements further. The 4 values are divided by the largest absolute
                           // value of the block before being squared, and the result is scaled
                           // back by that maximum, so that the squares do not overflow.
                           //
                           // nDstXWidth: number of output pixels requested.
                           // nChunkXSize: offset, in elements, between the two source lines.
                           // pSrcScanlineShiftedInOut: source pointer; on return it has been
                           //     advanced by 2 source elements per output pixel written.
                           // pDstScanline: output line.
                           // Returns the number of output pixels written, a multiple of
                           // RMS_FLOAT_ELTS.
                           //
                           // NOTE(review): pointers are reinterpreted as float*, so T is presumably
                           // a 32-bit float type - confirm against the callers.
    1000          34 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
    1001             : 
    1002          34 :     int iDstPixel = 0;
    1003          34 :     const auto minus_zero = set1_ps(-0.0f);
    1004          34 :     const auto zeroDot25 = set1_ps(0.25f);
    1005          34 :     const auto one = set1_ps(1.0f);
    1006          68 :     const auto infv = set1_ps(std::numeric_limits<float>::infinity());
    1007             : 
                           // Stop as soon as fewer than RMS_FLOAT_ELTS output pixels remain
    1008         102 :     for (; iDstPixel < nDstXWidth - (RMS_FLOAT_ELTS - 1);
    1009             :          iDstPixel += RMS_FLOAT_ELTS)
    1010             :     {
    1011             :         // Load 2*RMS_FLOAT_ELTS Float32 from each line
    1012             :         auto firstLineLo =
    1013          68 :             loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
    1014          68 :         auto firstLineHi = loadu_ps(reinterpret_cast<float const *>(
    1015          68 :             pSrcScanlineShifted + RMS_FLOAT_ELTS));
    1016          68 :         auto secondLineLo = loadu_ps(
    1017          68 :             reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
    1018          68 :         auto secondLineHi = loadu_ps(reinterpret_cast<float const *>(
    1019          68 :             pSrcScanlineShifted + RMS_FLOAT_ELTS + nChunkXSize));
    1020             : 
    1021             :         // Take the absolute value
                               // (-0.0f only has the sign bit set, so andnot clears the sign)
    1022          68 :         firstLineLo = andnot_ps(minus_zero, firstLineLo);
    1023          68 :         firstLineHi = andnot_ps(minus_zero, firstLineHi);
    1024          68 :         secondLineLo = andnot_ps(minus_zero, secondLineLo);
    1025          68 :         secondLineHi = andnot_ps(minus_zero, secondLineHi);
    1026             : 
                               // Separate even and odd columns, so that the 2 horizontally adjacent
                               // values of an output pixel sit at the same position of the "Even"
                               // and "Odd" registers
    1027             :         auto firstLineEven =
    1028          68 :             shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
    1029             :         auto firstLineOdd =
    1030          68 :             shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
    1031             :         auto secondLineEven =
    1032          68 :             shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
    1033             :         auto secondLineOdd =
    1034          68 :             shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
    1035             : 
    1036             :         // Compute, for each output pixel, the maximum of the 4 absolute
                               // values that are RMS-averaged. All 4 of them must take part:
                               // leaving secondLineOdd out would scale a block it dominates by a
                               // too small value, and zero it out when the 3 other values are 0.
    1037         204 :         const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
    1038             :                                  max_ps(secondLineEven, secondLineOdd));
    1039             : 
    1040             :         // Normalize each value by the maximum of the RMS_FLOAT_ELTS ones.
    1041             :         // This step is important to avoid that the square evaluates to infinity
    1042             :         // for sufficiently big input.
    1043          68 :         auto invMax = div_ps(one, maxV);
    1044             :         // Deal with 0 being the maximum to correct division by zero
    1045             :         // note: comparing to -0 leads to identical results as to comparing with
    1046             :         // 0
    1047         136 :         invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
    1048             : 
    1049          68 :         firstLineEven = mul_ps(firstLineEven, invMax);
    1050          68 :         firstLineOdd = mul_ps(firstLineOdd, invMax);
    1051          68 :         secondLineEven = mul_ps(secondLineEven, invMax);
    1052          68 :         secondLineOdd = mul_ps(secondLineOdd, invMax);
    1053             : 
    1054             :         // Compute squares
    1055          68 :         firstLineEven = SQUARE_PS(firstLineEven);
    1056          68 :         firstLineOdd = SQUARE_PS(firstLineOdd);
    1057          68 :         secondLineEven = SQUARE_PS(secondLineEven);
    1058          68 :         secondLineOdd = SQUARE_PS(secondLineOdd);
    1059             : 
    1060         204 :         const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
    1061             :                                        add_ps(secondLineEven, secondLineOdd));
    1062             : 
                               // Undo the normalization: rms = maxV * sqrt(sumSquares / 4)
    1063         204 :         auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
    1064             : 
    1065             :         // Deal with infinity being the maximum
                               // (force the result to +infinity where maxV is infinite)
    1066          68 :         const auto maskIsInf = cmpeq_ps(maxV, infv);
    1067         136 :         rms = or_ps(andnot_ps(maskIsInf, rms), and_ps(maskIsInf, infv));
    1068             : 
    1069          68 :         rms = FIXUP_LANES(rms);
    1070             : 
    1071             :         // coverity[incompatible_cast]
    1072          68 :         storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]), rms);
    1073          68 :         pSrcScanlineShifted += RMS_FLOAT_ELTS * 2;
    1074             :     }
    1075             : 
    1076             :     zeroupper();
    1077             : 
                           // Report the new source position back to the caller
    1078          34 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
    1079          34 :     return iDstPixel;
    1080             : }
    1081             : 
    1082             : /************************************************************************/
    1083             : /*                        AverageFloatSSE2()                            */
    1084             : /************************************************************************/
    1085             : 
    1086             : template <class T>
    1087          14 : static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
    1088             :                             const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
    1089             :                             T *CPL_RESTRICT pDstScanline)
    1090             : {
    1091             :     // SSE2 fast path: averages 2x2 blocks of Float32 source pixels, writing
    1092             :     // 4 output pixels per iteration. Returns the number of pixels written.
    1093          14 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
    1094             : 
    1095          14 :     int iDstPixel = 0;
    1096          14 :     const auto zeroDot25 = _mm_set1_ps(0.25f);  // 1/4: 4 inputs per output
    1097             : 
    1098          32 :     for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)  // full groups of 4 only
    1099             :     {
    1100             :         // Load 8 Float32 from each line (2nd line is nChunkXSize elements on)
    1101             :         const auto firstLineLo =
    1102          18 :             _mm_loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
    1103          18 :         const auto firstLineHi = _mm_loadu_ps(
    1104          18 :             reinterpret_cast<float const *>(pSrcScanlineShifted + 4));
    1105          18 :         const auto secondLineLo = _mm_loadu_ps(
    1106          18 :             reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
    1107          18 :         const auto secondLineHi = _mm_loadu_ps(reinterpret_cast<float const *>(
    1108          18 :             pSrcScanlineShifted + 4 + nChunkXSize));
    1109             : 
    1110             :         // Vertical addition: sum the two lines column by column
    1111          18 :         const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
    1112          18 :         const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
    1113             : 
    1114             :         // Horizontal addition: A takes the even-indexed sums, B the odd ones
    1115             :         const auto A =
    1116          18 :             _mm_shuffle_ps(sumLo, sumHi, 0 | (2 << 2) | (0 << 4) | (2 << 6));
    1117             :         const auto B =
    1118          18 :             _mm_shuffle_ps(sumLo, sumHi, 1 | (3 << 2) | (1 << 4) | (3 << 6));
    1119          18 :         const auto sum = _mm_add_ps(A, B);  // 4 totals, one per 2x2 block
    1120             : 
    1121          18 :         const auto average = _mm_mul_ps(sum, zeroDot25);
    1122             : 
    1123             :         // coverity[incompatible_cast]
    1124          18 :         _mm_storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]),
    1125             :                       average);
    1126          18 :         pSrcScanlineShifted += 8;  // 8 source columns consumed per iteration
    1127             :     }
    1128             : 
    1129          14 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;  // hand back advanced pointer
    1130          14 :     return iDstPixel;  // caller is left to handle the pixels from here on
    1131             : }
    1132             : 
    1133             : #endif
    1134             : 
    1135             : /************************************************************************/
    1136             : /*                    GDALResampleChunk_AverageOrRMS()                  */
    1137             : /************************************************************************/
    1138             : 
    1139             : template <class T, class Tsum, GDALDataType eWrkDataType>
    1140             : static CPLErr
    1141        2319 : GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
    1142             :                                  const T *pChunk, void **ppDstBuffer)
    1143             : {
    1144        2319 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    1145        2319 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    1146        2319 :     const double dfSrcXDelta = args.dfSrcXDelta;
    1147        2319 :     const double dfSrcYDelta = args.dfSrcYDelta;
    1148        2319 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    1149        2319 :     const int nChunkXOff = args.nChunkXOff;
    1150        2319 :     const int nChunkYOff = args.nChunkYOff;
    1151        2319 :     const int nChunkXSize = args.nChunkXSize;
    1152        2319 :     const int nChunkYSize = args.nChunkYSize;
    1153        2319 :     const int nDstXOff = args.nDstXOff;
    1154        2319 :     const int nDstXOff2 = args.nDstXOff2;
    1155        2319 :     const int nDstYOff = args.nDstYOff;
    1156        2319 :     const int nDstYOff2 = args.nDstYOff2;
    1157        2319 :     const char *pszResampling = args.pszResampling;
    1158        2319 :     bool bHasNoData = args.bHasNoData;
    1159        2319 :     const double dfNoDataValue = args.dfNoDataValue;
    1160        2319 :     const GDALColorTable *poColorTable = args.poColorTable;
    1161        2319 :     const bool bPropagateNoData = args.bPropagateNoData;
    1162             : 
    1163             :     // AVERAGE_BIT2GRAYSCALE
    1164             :     const bool bBit2Grayscale =
    1165        2319 :         CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"));
    1166        2319 :     const bool bQuadraticMean = CPL_TO_BOOL(EQUAL(pszResampling, "RMS"));
    1167        2319 :     if (bBit2Grayscale)
    1168           9 :         poColorTable = nullptr;
    1169             : 
    1170             :     T tNoDataValue;
    1171        2319 :     if (!bHasNoData)
    1172        2263 :         tNoDataValue = 0;
    1173             :     else
    1174          56 :         tNoDataValue = static_cast<T>(dfNoDataValue);
    1175        2319 :     const T tReplacementVal =
    1176         114 :         bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
    1177          56 :                          args.eOvrDataType, dfNoDataValue))
    1178             :                    : 0;
    1179             : 
    1180        2319 :     int nChunkRightXOff = nChunkXOff + nChunkXSize;
    1181        2319 :     int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    1182        2319 :     int nDstXWidth = nDstXOff2 - nDstXOff;
    1183             : 
    1184             :     /* -------------------------------------------------------------------- */
    1185             :     /*      Allocate buffers.                                               */
    1186             :     /* -------------------------------------------------------------------- */
    1187        2319 :     *ppDstBuffer = static_cast<T *>(
    1188        2319 :         VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
    1189             :                             GDALGetDataTypeSizeBytes(eWrkDataType)));
    1190        2319 :     if (*ppDstBuffer == nullptr)
    1191             :     {
    1192           0 :         return CE_Failure;
    1193             :     }
    1194        2319 :     T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
    1195             : 
    1196             :     struct PrecomputedXValue
    1197             :     {
    1198             :         int nLeftXOffShifted;
    1199             :         int nRightXOffShifted;
    1200             :         double dfLeftWeight;
    1201             :         double dfRightWeight;
    1202             :         double dfTotalWeightFullLine;
    1203             :     };
    1204             : 
    1205             :     PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
    1206        2319 :         VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(PrecomputedXValue)));
    1207             : 
    1208        2319 :     if (pasSrcX == nullptr)
    1209             :     {
    1210           0 :         return CE_Failure;
    1211             :     }
    1212             : 
    1213        2319 :     int nTransparentIdx = -1;
    1214        2319 :     std::vector<GDALColorEntry> colorEntries;
    1215        2319 :     if (poColorTable)
    1216           5 :         colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
    1217             : 
    1218             :     // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    1219             :     // it as nodata value
    1220        2349 :     if (bHasNoData && dfNoDataValue >= 0.0f &&
    1221          30 :         tNoDataValue < colorEntries.size())
    1222           1 :         colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
    1223             : 
    1224             :     // Or if we have no explicit nodata, but a color table entry that is
    1225             :     // transparent, consider it as the nodata value
    1226        2318 :     else if (!bHasNoData && nTransparentIdx >= 0)
    1227             :     {
    1228           0 :         bHasNoData = true;
    1229           0 :         tNoDataValue = static_cast<T>(nTransparentIdx);
    1230             :     }
    1231             : 
    1232             :     /* ==================================================================== */
    1233             :     /*      Precompute inner loop constants.                                */
    1234             :     /* ==================================================================== */
    1235        2319 :     bool bSrcXSpacingIsTwo = true;
    1236        2319 :     int nLastSrcXOff2 = -1;
    1237      852277 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    1238             :     {
    1239      849958 :         double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
    1240             :         // Apply some epsilon to avoid numerical precision issues
    1241      849958 :         int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
    1242      849958 :         double dfSrcXOff2 = dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
    1243      849958 :         int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
    1244             : 
    1245      849958 :         if (nSrcXOff < nChunkXOff)
    1246           0 :             nSrcXOff = nChunkXOff;
    1247      849958 :         if (nSrcXOff2 == nSrcXOff)
    1248           0 :             nSrcXOff2++;
    1249      849958 :         if (nSrcXOff2 > nChunkRightXOff)
    1250           1 :             nSrcXOff2 = nChunkRightXOff;
    1251             : 
    1252      849958 :         pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
    1253      849958 :         pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
    1254      849958 :             nSrcXOff2 - nChunkXOff;
    1255          21 :         pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
    1256      849958 :             (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
    1257      849958 :         pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
    1258      849958 :             1 - (nSrcXOff2 - dfSrcXOff2);
    1259      849958 :         pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
    1260      849958 :             pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
    1261      849958 :         if (nSrcXOff + 1 < nSrcXOff2)
    1262             :         {
    1263      849937 :             pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
    1264      849937 :                 nSrcXOff2 - nSrcXOff - 2;
    1265      849937 :             pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
    1266      849937 :                 pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
    1267             :         }
    1268             : 
    1269      849958 :         if (nSrcXOff2 - nSrcXOff != 2 ||
    1270      728548 :             (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
    1271             :         {
    1272      120599 :             bSrcXSpacingIsTwo = false;
    1273             :         }
    1274      849958 :         nLastSrcXOff2 = nSrcXOff2;
    1275             :     }
    1276             : 
    1277             :     /* ==================================================================== */
    1278             :     /*      Loop over destination scanlines.                                */
    1279             :     /* ==================================================================== */
    1280      721820 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    1281             :     {
    1282      719501 :         double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
    1283      719501 :         int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
    1284      719501 :         if (nSrcYOff < nChunkYOff)
    1285           0 :             nSrcYOff = nChunkYOff;
    1286             : 
    1287      719501 :         double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
    1288      719501 :         int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
    1289      719501 :         if (nSrcYOff2 == nSrcYOff)
    1290           0 :             ++nSrcYOff2;
    1291      719501 :         if (nSrcYOff2 > nChunkBottomYOff)
    1292           3 :             nSrcYOff2 = nChunkBottomYOff;
    1293             : 
    1294      719501 :         T *const pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
    1295             : 
    1296             :         /* --------------------------------------------------------------------
    1297             :          */
    1298             :         /*      Loop over destination pixels */
    1299             :         /* --------------------------------------------------------------------
    1300             :          */
    1301      719501 :         if (poColorTable == nullptr)
    1302             :         {
    1303      719386 :             if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
    1304             :                 pabyChunkNodataMask == nullptr)
    1305             :             {
    1306             :                 if (eWrkDataType == GDT_Byte || eWrkDataType == GDT_UInt16)
    1307             :                 {
    1308             :                     // Optimized case : no nodata, overview by a factor of 2 and
    1309             :                     // regular x and y src spacing.
    1310      116684 :                     const T *pSrcScanlineShifted =
    1311      116684 :                         pChunk + pasSrcX[0].nLeftXOffShifted +
    1312      116684 :                         static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
    1313      116684 :                             nChunkXSize;
    1314      116684 :                     int iDstPixel = 0;
    1315             : #ifdef USE_SSE2
    1316      116665 :                     if (bQuadraticMean && eWrkDataType == GDT_Byte)
    1317             :                     {
    1318        5385 :                         iDstPixel = QuadraticMeanByteSSE2OrAVX2(
    1319             :                             nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1320             :                             pDstScanline);
    1321             :                     }
    1322      111299 :                     else if (bQuadraticMean /* && eWrkDataType == GDT_UInt16 */)
    1323             :                     {
    1324          10 :                         iDstPixel = QuadraticMeanUInt16SSE2(
    1325             :                             nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1326             :                             pDstScanline);
    1327             :                     }
    1328             :                     else if (/* !bQuadraticMean && */ eWrkDataType == GDT_Byte)
    1329             :                     {
    1330      111280 :                         iDstPixel = AverageByteSSE2OrAVX2(
    1331             :                             nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1332             :                             pDstScanline);
    1333             :                     }
    1334             :                     else /* if( !bQuadraticMean && eWrkDataType == GDT_UInt16 )
    1335             :                           */
    1336             :                     {
    1337           9 :                         iDstPixel = AverageUInt16SSE2(nDstXWidth, nChunkXSize,
    1338             :                                                       pSrcScanlineShifted,
    1339             :                                                       pDstScanline);
    1340             :                     }
    1341             : #endif
    1342      279043 :                     for (; iDstPixel < nDstXWidth; ++iDstPixel)
    1343             :                     {
    1344      162359 :                         Tsum nTotal = 0;
    1345             :                         T nVal;
    1346      162359 :                         if (bQuadraticMean)
    1347          44 :                             nTotal =
    1348          44 :                                 SQUARE<Tsum>(pSrcScanlineShifted[0]) +
    1349          44 :                                 SQUARE<Tsum>(pSrcScanlineShifted[1]) +
    1350          44 :                                 SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
    1351          44 :                                 SQUARE<Tsum>(
    1352          44 :                                     pSrcScanlineShifted[1 + nChunkXSize]);
    1353             :                         else
    1354      162315 :                             nTotal = pSrcScanlineShifted[0] +
    1355      162315 :                                      pSrcScanlineShifted[1] +
    1356      162315 :                                      pSrcScanlineShifted[nChunkXSize] +
    1357      162315 :                                      pSrcScanlineShifted[1 + nChunkXSize];
    1358             : 
    1359      162359 :                         constexpr int nTotalWeight = 4;
    1360      162359 :                         if (bQuadraticMean)
    1361          44 :                             nVal = ComputeIntegerRMS_4values<T>(nTotal);
    1362             :                         else
    1363      162315 :                             nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
    1364             :                                                   nTotalWeight);
    1365             : 
    1366             :                         // No need to compare nVal against tNoDataValue as we
    1367             :                         // are in a case where pabyChunkNodataMask == nullptr
    1368             :                         // implies the absence of nodata value.
    1369      162359 :                         pDstScanline[iDstPixel] = nVal;
    1370      162359 :                         pSrcScanlineShifted += 2;
    1371             :                     }
    1372             :                 }
    1373             :                 else
    1374             :                 {
    1375             :                     CPLAssert(eWrkDataType == GDT_Float32 ||
    1376             :                               eWrkDataType == GDT_Float64);
    1377          70 :                     const T *pSrcScanlineShifted =
    1378          70 :                         pChunk + pasSrcX[0].nLeftXOffShifted +
    1379          70 :                         static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
    1380          70 :                             nChunkXSize;
    1381          70 :                     int iDstPixel = 0;
    1382             : #ifdef USE_SSE2
    1383             :                     if (eWrkDataType == GDT_Float32)
    1384             :                     {
    1385          48 :                         if (bQuadraticMean)
    1386             :                         {
    1387          34 :                             iDstPixel = QuadraticMeanFloatSSE2(
    1388             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1389             :                                 pDstScanline);
    1390             :                         }
    1391             :                         else
    1392             :                         {
    1393          14 :                             iDstPixel = AverageFloatSSE2(
    1394             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1395             :                                 pDstScanline);
    1396             :                         }
    1397             :                     }
    1398             : #endif
    1399             : 
    1400         268 :                     for (; iDstPixel < nDstXWidth; ++iDstPixel)
    1401             :                     {
    1402             :                         T nVal;
    1403         198 :                         if (bQuadraticMean)
    1404             :                         {
    1405             :                             // Cast to double to avoid overflows
    1406             :                             // (using std::hypot() is much slower)
    1407         100 :                             nVal = static_cast<T>(std::sqrt(
    1408             :                                 0.25 *
    1409         100 :                                 (SQUARE<double>(pSrcScanlineShifted[0]) +
    1410         100 :                                  SQUARE<double>(pSrcScanlineShifted[1]) +
    1411         100 :                                  SQUARE<double>(
    1412         200 :                                      pSrcScanlineShifted[nChunkXSize]) +
    1413         100 :                                  SQUARE<double>(
    1414         100 :                                      pSrcScanlineShifted[1 + nChunkXSize]))));
    1415             :                         }
    1416             :                         else
    1417             :                         {
    1418          98 :                             nVal = static_cast<T>(
    1419          98 :                                 0.25f * (pSrcScanlineShifted[0] +
    1420          98 :                                          pSrcScanlineShifted[1] +
    1421          98 :                                          pSrcScanlineShifted[nChunkXSize] +
    1422          98 :                                          pSrcScanlineShifted[1 + nChunkXSize]));
    1423             :                         }
    1424             : 
    1425             :                         // No need to compare nVal against tNoDataValue as we
    1426             :                         // are in a case where pabyChunkNodataMask == nullptr
    1427             :                         // implies the absence of nodata value.
    1428         198 :                         pDstScanline[iDstPixel] = nVal;
    1429         198 :                         pSrcScanlineShifted += 2;
    1430             :                     }
    1431      116754 :                 }
    1432             :             }
    1433             :             else
    1434             :             {
    1435          17 :                 const double dfBottomWeight =
    1436      602632 :                     (nSrcYOff + 1 == nSrcYOff2) ? 1.0
    1437      602615 :                                                 : 1.0 - (dfSrcYOff - nSrcYOff);
    1438      602632 :                 const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
    1439      602632 :                 nSrcYOff -= nChunkYOff;
    1440      602632 :                 nSrcYOff2 -= nChunkYOff;
    1441             : 
    1442      602632 :                 double dfTotalWeightFullColumn = dfBottomWeight;
    1443      602632 :                 if (nSrcYOff + 1 < nSrcYOff2)
    1444             :                 {
    1445      602615 :                     dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
    1446      602615 :                     dfTotalWeightFullColumn += dfTopWeight;
    1447             :                 }
    1448             : 
    1449    18754460 :                 for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
    1450             :                 {
    1451    18147483 :                     const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
    1452    18147483 :                     const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
    1453             : 
    1454    18147483 :                     double dfTotal = 0;
    1455    18147483 :                     double dfTotalWeight = 0;
    1456    18147483 :                     if (pabyChunkNodataMask == nullptr)
    1457             :                     {
    1458     1746435 :                         auto pChunkShifted =
    1459         115 :                             pChunk +
    1460     1746435 :                             static_cast<GPtrDiff_t>(nSrcYOff) * nChunkXSize;
    1461     1746435 :                         int nCounterY = nSrcYOff2 - nSrcYOff - 1;
    1462     1746435 :                         double dfWeightY = dfBottomWeight;
    1463     3493427 :                         while (true)
    1464             :                         {
    1465             :                             double dfTotalLine;
    1466     5239852 :                             if (bQuadraticMean)
    1467             :                             {
    1468             :                                 // Left pixel
    1469             :                                 {
    1470         104 :                                     const T val = pChunkShifted[nSrcXOff];
    1471         104 :                                     dfTotalLine =
    1472         104 :                                         SQUARE<double>(val) *
    1473         104 :                                         pasSrcX[iDstPixel].dfLeftWeight;
    1474             :                                 }
    1475             : 
    1476         104 :                                 if (nSrcXOff + 1 < nSrcXOff2)
    1477             :                                 {
    1478             :                                     // Middle pixels
    1479         104 :                                     for (int iX = nSrcXOff + 1;
    1480         424 :                                          iX + 1 < nSrcXOff2; ++iX)
    1481             :                                     {
    1482         320 :                                         const T val = pChunkShifted[iX];
    1483         320 :                                         dfTotalLine += SQUARE<double>(val);
    1484             :                                     }
    1485             : 
    1486             :                                     // Right pixel
    1487             :                                     {
    1488         104 :                                         const T val =
    1489         104 :                                             pChunkShifted[nSrcXOff2 - 1];
    1490         104 :                                         dfTotalLine +=
    1491         104 :                                             SQUARE<double>(val) *
    1492         104 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1493             :                                     }
    1494             :                                 }
    1495             :                             }
    1496             :                             else
    1497             :                             {
    1498             :                                 // Left pixel
    1499             :                                 {
    1500     5239756 :                                     const T val = pChunkShifted[nSrcXOff];
    1501     5239756 :                                     dfTotalLine =
    1502     5239756 :                                         val * pasSrcX[iDstPixel].dfLeftWeight;
    1503             :                                 }
    1504             : 
    1505     5239756 :                                 if (nSrcXOff + 1 < nSrcXOff2)
    1506             :                                 {
    1507             :                                     // Middle pixels
    1508     4239330 :                                     for (int iX = nSrcXOff + 1;
    1509    64183126 :                                          iX + 1 < nSrcXOff2; ++iX)
    1510             :                                     {
    1511    59943836 :                                         const T val = pChunkShifted[iX];
    1512    59943836 :                                         dfTotalLine += val;
    1513             :                                     }
    1514             : 
    1515             :                                     // Right pixel
    1516             :                                     {
    1517     4239330 :                                         const T val =
    1518     4239330 :                                             pChunkShifted[nSrcXOff2 - 1];
    1519     4239330 :                                         dfTotalLine +=
    1520     4239330 :                                             val *
    1521     4239330 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1522             :                                     }
    1523             :                                 }
    1524             :                             }
    1525             : 
    1526     5239852 :                             dfTotal += dfTotalLine * dfWeightY;
    1527     5239852 :                             --nCounterY;
    1528     5239852 :                             if (nCounterY < 0)
    1529     1746435 :                                 break;
    1530     3493427 :                             pChunkShifted += nChunkXSize;
    1531     3493427 :                             dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
    1532             :                         }
    1533             : 
    1534     1746435 :                         dfTotalWeight =
    1535     1746435 :                             pasSrcX[iDstPixel].dfTotalWeightFullLine *
    1536             :                             dfTotalWeightFullColumn;
    1537             :                     }
    1538             :                     else
    1539             :                     {
    1540    16401068 :                         GPtrDiff_t nCount = 0;
    1541    71751504 :                         for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    1542             :                         {
    1543    55350336 :                             const auto pChunkShifted =
    1544         136 :                                 pChunk +
    1545    55350336 :                                 static_cast<GPtrDiff_t>(iY) * nChunkXSize;
    1546             : 
    1547    55350336 :                             double dfTotalLine = 0;
    1548    55350336 :                             double dfTotalWeightLine = 0;
    1549             :                             // Left pixel
    1550             :                             {
    1551    55350336 :                                 const int iX = nSrcXOff;
    1552    55350336 :                                 const T val = pChunkShifted[iX];
    1553    55350336 :                                 if (pabyChunkNodataMask[iX + iY * nChunkXSize])
    1554             :                                 {
    1555    23508183 :                                     nCount++;
    1556    23508183 :                                     const double dfWeightX =
    1557    23508183 :                                         pasSrcX[iDstPixel].dfLeftWeight;
    1558    23508183 :                                     dfTotalWeightLine = dfWeightX;
    1559    23508183 :                                     if (bQuadraticMean)
    1560          60 :                                         dfTotalLine =
    1561          60 :                                             SQUARE<double>(val) * dfWeightX;
    1562             :                                     else
    1563    23508083 :                                         dfTotalLine = val * dfWeightX;
    1564             :                                 }
    1565             :                             }
    1566             : 
    1567    55350336 :                             if (nSrcXOff + 1 < nSrcXOff2)
    1568             :                             {
    1569             :                                 // Middle pixels
    1570   152870136 :                                 for (int iX = nSrcXOff + 1; iX + 1 < nSrcXOff2;
    1571             :                                      ++iX)
    1572             :                                 {
    1573    97518100 :                                     const T val = pChunkShifted[iX];
    1574    97518100 :                                     if (pabyChunkNodataMask[iX +
    1575    97518100 :                                                             iY * nChunkXSize])
    1576             :                                     {
    1577    39727100 :                                         nCount++;
    1578    39727100 :                                         dfTotalWeightLine += 1;
    1579    39727100 :                                         if (bQuadraticMean)
    1580           0 :                                             dfTotalLine += SQUARE<double>(val);
    1581             :                                         else
    1582    39727100 :                                             dfTotalLine += val;
    1583             :                                     }
    1584             :                                 }
    1585             : 
    1586             :                                 // Right pixel
    1587             :                                 {
    1588    55351936 :                                     const int iX = nSrcXOff2 - 1;
    1589    55351936 :                                     const T val = pChunkShifted[iX];
    1590    55351936 :                                     if (pabyChunkNodataMask[iX +
    1591    55351936 :                                                             iY * nChunkXSize])
    1592             :                                     {
    1593    23509251 :                                         nCount++;
    1594    23509251 :                                         const double dfWeightX =
    1595    23509251 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1596    23509251 :                                         dfTotalWeightLine += dfWeightX;
    1597    23509251 :                                         if (bQuadraticMean)
    1598           1 :                                             dfTotalLine +=
    1599          61 :                                                 SQUARE<double>(val) * dfWeightX;
    1600             :                                         else
    1601    23509150 :                                             dfTotalLine += val * dfWeightX;
    1602             :                                     }
    1603             :                                 }
    1604             :                             }
    1605             : 
    1606    94311604 :                             const double dfWeightY =
    1607             :                                 (iY == nSrcYOff)        ? dfBottomWeight
    1608    38961168 :                                 : (iY + 1 == nSrcYOff2) ? dfTopWeight
    1609             :                                                         : 1.0;
    1610    55350436 :                             dfTotal += dfTotalLine * dfWeightY;
    1611    55350436 :                             dfTotalWeight += dfTotalWeightLine * dfWeightY;
    1612             :                         }
    1613             : 
    1614    16401068 :                         if (nCount == 0 ||
    1615           8 :                             (bPropagateNoData &&
    1616             :                              nCount <
    1617           8 :                                  static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
    1618           8 :                                      (nSrcXOff2 - nSrcXOff)))
    1619             :                         {
    1620     9607202 :                             pDstScanline[iDstPixel] = tNoDataValue;
    1621     9607202 :                             continue;
    1622             :                         }
    1623             :                     }
    1624             :                     if (eWrkDataType == GDT_Byte)
    1625             :                     {
    1626             :                         T nVal;
    1627     8540160 :                         if (bQuadraticMean)
    1628          38 :                             nVal = ComputeIntegerRMS<T, int>(dfTotal,
    1629             :                                                              dfTotalWeight);
    1630             :                         else
    1631     8540120 :                             nVal =
    1632     8540120 :                                 static_cast<T>(dfTotal / dfTotalWeight + 0.5);
    1633     8544440 :                         if (bHasNoData && nVal == tNoDataValue)
    1634           0 :                             nVal = tReplacementVal;
    1635     8544440 :                         pDstScanline[iDstPixel] = nVal;
    1636             :                     }
    1637             :                     else if (eWrkDataType == GDT_UInt16)
    1638             :                     {
    1639             :                         T nVal;
    1640           8 :                         if (bQuadraticMean)
    1641           4 :                             nVal = ComputeIntegerRMS<T, uint64_t>(
    1642             :                                 dfTotal, dfTotalWeight);
    1643             :                         else
    1644           4 :                             nVal =
    1645           4 :                                 static_cast<T>(dfTotal / dfTotalWeight + 0.5);
    1646           8 :                         if (bHasNoData && nVal == tNoDataValue)
    1647           0 :                             nVal = tReplacementVal;
    1648           8 :                         pDstScanline[iDstPixel] = nVal;
    1649             :                     }
    1650             :                     else
    1651             :                     {
    1652             :                         T nVal;
    1653         153 :                         if (bQuadraticMean)
    1654          20 :                             nVal =
    1655          25 :                                 static_cast<T>(sqrt(dfTotal / dfTotalWeight));
    1656             :                         else
    1657         128 :                             nVal = static_cast<T>(dfTotal / dfTotalWeight);
    1658         153 :                         if (bHasNoData && nVal == tNoDataValue)
    1659           2 :                             nVal = tReplacementVal;
    1660         153 :                         pDstScanline[iDstPixel] = nVal;
    1661             :                     }
    1662             :                 }
    1663             :             }
    1664             :         }
    1665             :         else
    1666             :         {
    1667         115 :             nSrcYOff -= nChunkYOff;
    1668         115 :             nSrcYOff2 -= nChunkYOff;
    1669             : 
    1670        2275 :             for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
    1671             :             {
    1672        6475 :                 const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
    1673        6475 :                 const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
    1674             : 
    1675        6475 :                 GPtrDiff_t nTotalR = 0;
    1676        6475 :                 GPtrDiff_t nTotalG = 0;
    1677        6475 :                 GPtrDiff_t nTotalB = 0;
    1678        6475 :                 GPtrDiff_t nCount = 0;
    1679             : 
    1680       19425 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    1681             :                 {
    1682       38850 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    1683             :                     {
    1684       25900 :                         const T val = pChunk[iX + static_cast<GPtrDiff_t>(iY) *
    1685       25900 :                                                       nChunkXSize];
    1686             :                         // cppcheck-suppress unsignedLessThanZero
    1687       25900 :                         if (val < 0 || val >= colorEntries.size())
    1688           0 :                             continue;
    1689       25900 :                         size_t idx = static_cast<size_t>(val);
    1690       25900 :                         const auto &entry = colorEntries[idx];
    1691       25900 :                         if (entry.c4)
    1692             :                         {
    1693       14128 :                             if (bQuadraticMean)
    1694             :                             {
    1695         800 :                                 nTotalR += SQUARE<int>(entry.c1);
    1696         800 :                                 nTotalG += SQUARE<int>(entry.c2);
    1697         800 :                                 nTotalB += SQUARE<int>(entry.c3);
    1698         800 :                                 ++nCount;
    1699             :                             }
    1700             :                             else
    1701             :                             {
    1702       13328 :                                 nTotalR += entry.c1;
    1703       13328 :                                 nTotalG += entry.c2;
    1704       13328 :                                 nTotalB += entry.c3;
    1705       13328 :                                 ++nCount;
    1706             :                             }
    1707             :                         }
    1708             :                     }
    1709             :                 }
    1710             : 
    1711        6475 :                 if (nCount == 0 ||
    1712           0 :                     (bPropagateNoData &&
    1713           0 :                      nCount < static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
    1714           0 :                                   (nSrcXOff2 - nSrcXOff)))
    1715             :                 {
    1716        2838 :                     pDstScanline[iDstPixel] = tNoDataValue;
    1717             :                 }
    1718             :                 else
    1719             :                 {
    1720             :                     GDALColorEntry color;
    1721        3637 :                     if (bQuadraticMean)
    1722             :                     {
    1723         200 :                         color.c1 =
    1724         200 :                             static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
    1725         200 :                         color.c2 =
    1726         200 :                             static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
    1727         200 :                         color.c3 =
    1728         200 :                             static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
    1729             :                     }
    1730             :                     else
    1731             :                     {
    1732        3437 :                         color.c1 =
    1733        3437 :                             static_cast<short>((nTotalR + nCount / 2) / nCount);
    1734        3437 :                         color.c2 =
    1735        3437 :                             static_cast<short>((nTotalG + nCount / 2) / nCount);
    1736        3437 :                         color.c3 =
    1737        3437 :                             static_cast<short>((nTotalB + nCount / 2) / nCount);
    1738             :                     }
    1739           0 :                     pDstScanline[iDstPixel] =
    1740        3637 :                         static_cast<T>(BestColorEntry(colorEntries, color));
    1741             :                 }
    1742             :             }
    1743             :         }
    1744             :     }
    1745             : 
    1746        2319 :     CPLFree(pasSrcX);
    1747             : 
    1748        2319 :     return CE_None;
    1749             : }
    1750             : // Type dispatcher: forwards to the GDALResampleChunk_AverageOrRMS_T instantiation matching args.eWrkDataType (Byte, UInt16, Float32, Float64) and returns its CPLErr; any other type falls through to CPLAssert(false) / CE_Failure.
    1751             : static CPLErr
    1752        2319 : GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
    1753             :                                const void *pChunk, void **ppDstBuffer,
    1754             :                                GDALDataType *peDstBufferDataType)
    1755             : {
    1756        2319 :     *peDstBufferDataType = args.eWrkDataType;  // data type reported to the caller is the working data type
    1757        2319 :     switch (args.eWrkDataType)  // each case casts pChunk to the element type of that working data type
    1758             :     {
    1759        2252 :         case GDT_Byte:
    1760             :         {
    1761        2252 :             return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_Byte>(
    1762        2252 :                 args, static_cast<const GByte *>(pChunk), ppDstBuffer);
    1763             :         }
    1764             : 
    1765           9 :         case GDT_UInt16:
    1766             :         {
    1767           9 :             if (EQUAL(args.pszResampling, "RMS"))  // the resampling name selects the accumulation type (2nd template argument)
    1768             :             {
    1769             :                 // Use double as accumulation type, because UInt32 could overflow
    1770             :                 return GDALResampleChunk_AverageOrRMS_T<GUInt16, double,
    1771           5 :                                                         GDT_UInt16>(
    1772           5 :                     args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
    1773             :             }
    1774             :             else  // any resampling name other than "RMS": GUInt32 accumulation type
    1775             :             {
    1776             :                 return GDALResampleChunk_AverageOrRMS_T<GUInt16, GUInt32,
    1777           4 :                                                         GDT_UInt16>(
    1778           4 :                     args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
    1779             :             }
    1780             :         }
    1781             : 
    1782          41 :         case GDT_Float32:
    1783             :         {
    1784          41 :             return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32>(
    1785          41 :                 args, static_cast<const float *>(pChunk), ppDstBuffer);
    1786             :         }
    1787             : 
    1788          17 :         case GDT_Float64:
    1789             :         {
    1790             :             return GDALResampleChunk_AverageOrRMS_T<double, double,
    1791          17 :                                                     GDT_Float64>(
    1792          17 :                 args, static_cast<const double *>(pChunk), ppDstBuffer);
    1793             :         }
    1794             : 
    1795           0 :         default:  // no instantiation is provided here for the remaining working data types
    1796           0 :             break;
    1797             :     }
    1798             : 
    1799           0 :     CPLAssert(false);  // only reachable through the default case; every handled case returns above
    1800             :     return CE_Failure;
    1801             : }
    1802             : 
    1803             : /************************************************************************/
    1804             : /*                     GDALResampleChunk_Gauss()                        */
    1805             : /************************************************************************/
    1806             : 
    1807          86 : static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
    1808             :                                       const void *pChunk, void **ppDstBuffer,
    1809             :                                       GDALDataType *peDstBufferDataType)
    1810             : 
    1811             : {
    1812          86 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    1813          86 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    1814          86 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    1815          86 :     const int nChunkXOff = args.nChunkXOff;
    1816          86 :     const int nChunkXSize = args.nChunkXSize;
    1817          86 :     const int nChunkYOff = args.nChunkYOff;
    1818          86 :     const int nChunkYSize = args.nChunkYSize;
    1819          86 :     const int nDstXOff = args.nDstXOff;
    1820          86 :     const int nDstXOff2 = args.nDstXOff2;
    1821          86 :     const int nDstYOff = args.nDstYOff;
    1822          86 :     const int nDstYOff2 = args.nDstYOff2;
    1823          86 :     const bool bHasNoData = args.bHasNoData;
    1824          86 :     double dfNoDataValue = args.dfNoDataValue;
    1825          86 :     const GDALColorTable *poColorTable = args.poColorTable;
    1826             : 
    1827          86 :     const double *const padfChunk = static_cast<const double *>(pChunk);
    1828             : 
    1829          86 :     *ppDstBuffer =
    1830          86 :         VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
    1831             :                             GDALGetDataTypeSizeBytes(GDT_Float64));
    1832          86 :     if (*ppDstBuffer == nullptr)
    1833             :     {
    1834           0 :         return CE_Failure;
    1835             :     }
    1836          86 :     *peDstBufferDataType = GDT_Float64;
    1837          86 :     double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
    1838             : 
    1839             :     /* -------------------------------------------------------------------- */
    1840             :     /*      Create the filter kernel and allocate scanline buffer.          */
    1841             :     /* -------------------------------------------------------------------- */
    1842          86 :     int nGaussMatrixDim = 3;
    1843             :     const int *panGaussMatrix;
    1844          86 :     constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
    1845          86 :     constexpr int anGaussMatrix5x5[] = {1,  4, 6,  4,  1,  4, 16, 24, 16,
    1846             :                                         4,  6, 24, 36, 24, 6, 4,  16, 24,
    1847             :                                         16, 4, 1,  4,  6,  4, 1};
    1848          86 :     constexpr int anGaussMatrix7x7[] = {
    1849             :         1,   6,  15, 20,  15,  6,   1,   6,  36, 90,  120, 90,  36,
    1850             :         6,   15, 90, 225, 300, 225, 90,  15, 20, 120, 300, 400, 300,
    1851             :         120, 20, 15, 90,  225, 300, 225, 90, 15, 6,   36,  90,  120,
    1852             :         90,  36, 6,  1,   6,   15,  20,  15, 6,  1};
    1853             : 
    1854          86 :     const int nOXSize = args.nOvrXSize;
    1855          86 :     const int nOYSize = args.nOvrYSize;
    1856          86 :     const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
    1857             : 
    1858             :     // matrix for gauss filter
    1859          86 :     if (nResYFactor <= 2)
    1860             :     {
    1861          85 :         panGaussMatrix = anGaussMatrix3x3;
    1862          85 :         nGaussMatrixDim = 3;
    1863             :     }
    1864           1 :     else if (nResYFactor <= 4)
    1865             :     {
    1866           0 :         panGaussMatrix = anGaussMatrix5x5;
    1867           0 :         nGaussMatrixDim = 5;
    1868             :     }
    1869             :     else
    1870             :     {
    1871           1 :         panGaussMatrix = anGaussMatrix7x7;
    1872           1 :         nGaussMatrixDim = 7;
    1873             :     }
    1874             : 
    1875             : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
    1876             :     int *panGaussMatrixDup = static_cast<int *>(
    1877             :         CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
    1878             :     memcpy(panGaussMatrixDup, panGaussMatrix,
    1879             :            sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
    1880             :     panGaussMatrix = panGaussMatrixDup;
    1881             : #endif
    1882             : 
    1883          86 :     if (!bHasNoData)
    1884          79 :         dfNoDataValue = 0.0;
    1885             : 
    1886          86 :     std::vector<GDALColorEntry> colorEntries;
    1887          86 :     int nTransparentIdx = -1;
    1888          86 :     if (poColorTable)
    1889           2 :         colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
    1890             : 
    1891             :     // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    1892             :     // it as nodata value.
    1893          92 :     if (bHasNoData && dfNoDataValue >= 0.0f &&
    1894           6 :         dfNoDataValue < colorEntries.size())
    1895           0 :         colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
    1896             : 
    1897             :     // Or if we have no explicit nodata, but a color table entry that is
    1898             :     // transparent, consider it as the nodata value.
    1899          86 :     else if (!bHasNoData && nTransparentIdx >= 0)
    1900             :     {
    1901           0 :         dfNoDataValue = nTransparentIdx;
    1902             :     }
    1903             : 
    1904          86 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    1905          86 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    1906          86 :     const int nDstXWidth = nDstXOff2 - nDstXOff;
    1907             : 
    1908             :     /* ==================================================================== */
    1909             :     /*      Loop over destination scanlines.                                */
    1910             :     /* ==================================================================== */
    1911       16488 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    1912             :     {
    1913       16402 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
    1914       16402 :         int nSrcYOff2 =
    1915       16402 :             static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
    1916             : 
    1917       16402 :         if (nSrcYOff < nChunkYOff)
    1918             :         {
    1919           0 :             nSrcYOff = nChunkYOff;
    1920           0 :             nSrcYOff2++;
    1921             :         }
    1922             : 
    1923       16402 :         const int iSizeY = nSrcYOff2 - nSrcYOff;
    1924       16402 :         nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
    1925       16402 :         nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
    1926             : 
    1927       16402 :         if (nSrcYOff2 > nChunkBottomYOff ||
    1928       16359 :             (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
    1929             :         {
    1930          44 :             nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
    1931             :         }
    1932             : 
    1933       16402 :         int nYShiftGaussMatrix = 0;
    1934       16402 :         if (nSrcYOff < nChunkYOff)
    1935             :         {
    1936           0 :             nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
    1937           0 :             nSrcYOff = nChunkYOff;
    1938             :         }
    1939             : 
    1940       16402 :         const double *const padfSrcScanline =
    1941       16402 :             padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
    1942       16402 :         const GByte *pabySrcScanlineNodataMask = nullptr;
    1943       16402 :         if (pabyChunkNodataMask != nullptr)
    1944         152 :             pabySrcScanlineNodataMask =
    1945         152 :                 pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
    1946             : 
    1947             :         /* --------------------------------------------------------------------
    1948             :          */
    1949             :         /*      Loop over destination pixels */
    1950             :         /* --------------------------------------------------------------------
    1951             :          */
    1952       16402 :         double *const padfDstScanline =
    1953       16402 :             padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
    1954     4149980 :         for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    1955             :         {
    1956     4133580 :             int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
    1957     4133580 :             int nSrcXOff2 =
    1958     4133580 :                 static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
    1959             : 
    1960     4133580 :             if (nSrcXOff < nChunkXOff)
    1961             :             {
    1962           0 :                 nSrcXOff = nChunkXOff;
    1963           0 :                 nSrcXOff2++;
    1964             :             }
    1965             : 
    1966     4133580 :             const int iSizeX = nSrcXOff2 - nSrcXOff;
    1967     4133580 :             nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
    1968     4133580 :             nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
    1969             : 
    1970     4133580 :             if (nSrcXOff2 > nChunkRightXOff ||
    1971     4127930 :                 (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
    1972             :             {
    1973        5650 :                 nSrcXOff2 =
    1974        5650 :                     std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
    1975             :             }
    1976             : 
    1977     4133580 :             int nXShiftGaussMatrix = 0;
    1978     4133580 :             if (nSrcXOff < nChunkXOff)
    1979             :             {
    1980           0 :                 nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
    1981           0 :                 nSrcXOff = nChunkXOff;
    1982             :             }
    1983             : 
    1984     4133580 :             if (poColorTable == nullptr)
    1985             :             {
    1986     4133380 :                 double dfTotal = 0.0;
    1987     4133380 :                 GInt64 nCount = 0;
    1988     4133380 :                 const int *panLineWeight =
    1989     4133380 :                     panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
    1990             :                     nXShiftGaussMatrix;
    1991             : 
    1992    16527900 :                 for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
    1993    12394500 :                      ++iY, ++j, panLineWeight += nGaussMatrixDim)
    1994             :                 {
    1995    49561300 :                     for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
    1996             :                     {
    1997    37166800 :                         const double val =
    1998    37166800 :                             padfSrcScanline[iX - nChunkXOff +
    1999    37166800 :                                             static_cast<GPtrDiff_t>(iY -
    2000    37166800 :                                                                     nSrcYOff) *
    2001    37166800 :                                                 nChunkXSize];
    2002    37166800 :                         if (pabySrcScanlineNodataMask == nullptr ||
    2003       32872 :                             pabySrcScanlineNodataMask[iX - nChunkXOff +
    2004       32872 :                                                       static_cast<GPtrDiff_t>(
    2005       32872 :                                                           iY - nSrcYOff) *
    2006       32872 :                                                           nChunkXSize])
    2007             :                         {
    2008    37146100 :                             const int nWeight = panLineWeight[i];
    2009    37146100 :                             dfTotal += val * nWeight;
    2010    37146100 :                             nCount += nWeight;
    2011             :                         }
    2012             :                     }
    2013             :                 }
    2014             : 
    2015     4133380 :                 if (nCount == 0)
    2016             :                 {
    2017        2217 :                     padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
    2018             :                 }
    2019             :                 else
    2020             :                 {
    2021     4131160 :                     padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
    2022             :                 }
    2023             :             }
    2024             :             else
    2025             :             {
    2026         200 :                 GInt64 nTotalR = 0;
    2027         200 :                 GInt64 nTotalG = 0;
    2028         200 :                 GInt64 nTotalB = 0;
    2029         200 :                 GInt64 nTotalWeight = 0;
    2030         200 :                 const int *panLineWeight =
    2031         200 :                     panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
    2032             :                     nXShiftGaussMatrix;
    2033             : 
    2034         780 :                 for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
    2035         580 :                      ++iY, ++j, panLineWeight += nGaussMatrixDim)
    2036             :                 {
    2037        2262 :                     for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
    2038             :                     {
    2039        1682 :                         const double val =
    2040        1682 :                             padfSrcScanline[iX - nChunkXOff +
    2041        1682 :                                             static_cast<GPtrDiff_t>(iY -
    2042        1682 :                                                                     nSrcYOff) *
    2043        1682 :                                                 nChunkXSize];
    2044        1682 :                         if (val < 0 || val >= colorEntries.size())
    2045           0 :                             continue;
    2046             : 
    2047        1682 :                         size_t idx = static_cast<size_t>(val);
    2048        1682 :                         if (colorEntries[idx].c4)
    2049             :                         {
    2050        1682 :                             const int nWeight = panLineWeight[i];
    2051        1682 :                             nTotalR +=
    2052        1682 :                                 static_cast<GInt64>(colorEntries[idx].c1) *
    2053        1682 :                                 nWeight;
    2054        1682 :                             nTotalG +=
    2055        1682 :                                 static_cast<GInt64>(colorEntries[idx].c2) *
    2056        1682 :                                 nWeight;
    2057        1682 :                             nTotalB +=
    2058        1682 :                                 static_cast<GInt64>(colorEntries[idx].c3) *
    2059        1682 :                                 nWeight;
    2060        1682 :                             nTotalWeight += nWeight;
    2061             :                         }
    2062             :                     }
    2063             :                 }
    2064             : 
    2065         200 :                 if (nTotalWeight == 0)
    2066             :                 {
    2067           0 :                     padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
    2068             :                 }
    2069             :                 else
    2070             :                 {
    2071             :                     GDALColorEntry color;
    2072             : 
    2073         200 :                     color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
    2074             :                                                   nTotalWeight);
    2075         200 :                     color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
    2076             :                                                   nTotalWeight);
    2077         200 :                     color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
    2078             :                                                   nTotalWeight);
    2079         200 :                     padfDstScanline[iDstPixel - nDstXOff] =
    2080         200 :                         BestColorEntry(colorEntries, color);
    2081             :                 }
    2082             :             }
    2083             :         }
    2084             :     }
    2085             : 
    2086             : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
    2087             :     CPLFree(panGaussMatrixDup);
    2088             : #endif
    2089             : 
    2090          86 :     return CE_None;
    2091             : }
    2092             : 
    2093             : /************************************************************************/
    2094             : /*                      GDALResampleChunk_Mode()                        */
    2095             : /************************************************************************/
    2096             : 
                       // Equality predicate used by GDALResampleChunk_ModeT() when tallying
                       // values. Generic case: two values are the same when operator== says so.
    2097        4398 : template <class T> static inline bool IsSame(T a, T b)
    2098             : {
    2099        4398 :     return a == b;
    2100             : }
    2101             : 
                       // float specialization: besides a == b, two NaN operands are also
                       // reported as the same value, so NaN samples are tallied together.
    2102        4854 : template <> bool IsSame<float>(float a, float b)
    2103             : {
    2104        4854 :     return a == b || (std::isnan(a) && std::isnan(b));
    2105             : }
    2106             : 
                       // double specialization: besides a == b, two NaN operands are also
                       // reported as the same value, so NaN samples are tallied together.
    2107         504 : template <> bool IsSame<double>(double a, double b)
    2108             : {
    2109         504 :     return a == b || (std::isnan(a) && std::isnan(b));
    2110             : }
    2111             : 
                       // std::complex<float> specialization: same when a == b, or when the real
                       // AND imaginary parts of BOTH operands are NaN.
                       // NOTE(review): a value with only one NaN component is therefore never
                       // the same as anything, itself included - confirm this is intended.
    2112             : template <>
    2113         480 : bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
    2114             : {
    2115         960 :     return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
    2116         960 :                       std::isnan(b.real()) && std::isnan(b.imag()));
    2117             : }
    2118             : 
                       // std::complex<double> specialization: same when a == b, or when the real
                       // AND imaginary parts of BOTH operands are NaN.
                       // NOTE(review): a value with only one NaN component is therefore never
                       // the same as anything, itself included - confirm this is intended.
    2119             : template <>
    2120         480 : bool IsSame<std::complex<double>>(std::complex<double> a,
    2121             :                                   std::complex<double> b)
    2122             : {
    2123         960 :     return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
    2124         960 :                       std::isnan(b.real()) && std::isnan(b.imag()));
    2125             : }
    2126             : 
                       // Mode (most frequent value) resampling of one chunk for element type T.
                       //
                       // For each destination pixel in [nDstXOff, nDstXOff2) x
                       // [nDstYOff, nDstYOff2) the source window is derived from the ratios and
                       // deltas in args, clamped to the chunk extent, and the value occurring
                       // most often in that window is stored in pDstBuffer (rows of
                       // nDstXOff2 - nDstXOff elements).
                       //
                       // Two strategies are used:
                       //  - generic: a linear table of values (paVals) with their counts
                       //    (panSums), honouring pabyChunkNodataMask when it is non-null;
                       //  - GByte, unless the colour table has more than 256 entries: a 256-bin
                       //    histogram (anVals), where nodata is detected by comparing against
                       //    tNoDataValue.
                       //
                       // Returns CE_None on success, CE_Failure when the source window is too
                       // large or a working buffer cannot be (re)allocated.
    2127             : template <class T>
    2128         136 : static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
    2129             :                                       const T *pChunk, T *const pDstBuffer)
    2130             : 
    2131             : {
    2132         136 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    2133         136 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    2134         136 :     const double dfSrcXDelta = args.dfSrcXDelta;
    2135         136 :     const double dfSrcYDelta = args.dfSrcYDelta;
    2136         136 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    2137         136 :     const int nChunkXOff = args.nChunkXOff;
    2138         136 :     const int nChunkXSize = args.nChunkXSize;
    2139         136 :     const int nChunkYOff = args.nChunkYOff;
    2140         136 :     const int nChunkYSize = args.nChunkYSize;
    2141         136 :     const int nDstXOff = args.nDstXOff;
    2142         136 :     const int nDstXOff2 = args.nDstXOff2;
    2143         136 :     const int nDstYOff = args.nDstYOff;
    2144         136 :     const int nDstYOff2 = args.nDstYOff2;
    2145         136 :     const bool bHasNoData = args.bHasNoData;
    2146         136 :     const GDALColorTable *poColorTable = args.poColorTable;
    2147         136 :     const int nDstXSize = nDstXOff2 - nDstXOff;
    2148             : 
                           // Value written when no valid source pixel contributes:
                           // (NaN, NaN) for complex types; otherwise the nodata value when one
                           // is set and GDALIsValueInRange<T>() accepts it, else 0.
    2149           8 :     T tNoDataValue;
    2150             :     if constexpr (std::is_same<T, std::complex<float>>::value ||
    2151             :                   std::is_same<T, std::complex<double>>::value)
    2152             :     {
    2153             :         using BaseT = typename T::value_type;
    2154           8 :         tNoDataValue =
    2155             :             std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
    2156             :                                 std::numeric_limits<BaseT>::quiet_NaN());
    2157             :     }
    2158         128 :     else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
    2159         127 :         tNoDataValue = 0;
    2160             :     else
    2161           1 :         tNoDataValue = static_cast<T>(args.dfNoDataValue);
    2162             : 
                           // Working buffers of the generic path; grown on demand and reused
                           // across destination pixels. nMaxNumPx is their current capacity.
    2163         136 :     size_t nMaxNumPx = 0;
    2164         136 :     T *paVals = nullptr;
    2165         136 :     int *panSums = nullptr;
    2166             : 
    2167         136 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    2168         136 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
                           // Histogram of the GByte path: one bin per possible byte value.
    2169         272 :     std::vector<int> anVals(256, 0);
    2170             : 
    2171             :     /* ==================================================================== */
    2172             :     /*      Loop over destination scanlines.                                */
    2173             :     /* ==================================================================== */
    2174        7531 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    2175             :     {
                               // Source row window [nSrcYOff, nSrcYOff2) of this destination
                               // line; same epsilon handling as for the columns below.
    2176        7395 :         double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
    2177        7395 :         int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
    2178             : #ifdef only_pixels_with_more_than_10_pct_participation
    2179             :         // When oversampling, don't take into account pixels that have a tiny
    2180             :         // participation in the resulting pixel
    2181             :         if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
    2182             :             nSrcYOff < nChunkBottomYOff)
    2183             :             nSrcYOff++;
    2184             : #endif
    2185        7395 :         if (nSrcYOff < nChunkYOff)
    2186           0 :             nSrcYOff = nChunkYOff;
    2187             : 
    2188        7395 :         double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
    2189        7395 :         int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
    2190             : #ifdef only_pixels_with_more_than_10_pct_participation
    2191             :         // When oversampling, don't take into account pixels that have a tiny
    2192             :         // participation in the resulting pixel
    2193             :         if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
    2194             :             nSrcYOff2 > nChunkYOff)
    2195             :             nSrcYOff2--;
    2196             : #endif
                               // Widen an empty window to one row, then clamp to the chunk.
    2197        7395 :         if (nSrcYOff2 == nSrcYOff)
    2198           0 :             ++nSrcYOff2;
    2199        7395 :         if (nSrcYOff2 > nChunkBottomYOff)
    2200           0 :             nSrcYOff2 = nChunkBottomYOff;
    2201             : 
                               // Start of the first window row inside the chunk (and in the
                               // mask); the iTotYOff offsets used below are relative to it.
    2202        7395 :         const T *const paSrcScanline =
    2203         149 :             pChunk +
    2204        7395 :             (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
    2205        7395 :         const GByte *pabySrcScanlineNodataMask = nullptr;
    2206        7395 :         if (pabyChunkNodataMask != nullptr)
    2207        1810 :             pabySrcScanlineNodataMask =
    2208             :                 pabyChunkNodataMask +
    2209        1810 :                 static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
    2210             : 
    2211        7395 :         T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
    2212             :         /* --------------------------------------------------------------------
    2213             :          */
    2214             :         /*      Loop over destination pixels */
    2215             :         /* --------------------------------------------------------------------
    2216             :          */
    2217     4259580 :         for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    2218             :         {
    2219     4252187 :             double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
    2220             :             // Apply some epsilon to avoid numerical precision issues
    2221     4252187 :             int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
    2222             : #ifdef only_pixels_with_more_than_10_pct_participation
    2223             :             // When oversampling, don't take into account pixels that have a
    2224             :             // tiny participation in the resulting pixel
    2225             :             if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
    2226             :                 nSrcXOff < nChunkRightXOff)
    2227             :                 nSrcXOff++;
    2228             : #endif
    2229     4252187 :             if (nSrcXOff < nChunkXOff)
    2230           0 :                 nSrcXOff = nChunkXOff;
    2231             : 
    2232     4252187 :             double dfSrcXOff2 =
    2233     4252187 :                 dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
    2234     4252187 :             int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
    2235             : #ifdef only_pixels_with_more_than_10_pct_participation
    2236             :             // When oversampling, don't take into account pixels that have a
    2237             :             // tiny participation in the resulting pixel
    2238             :             if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
    2239             :                 nSrcXOff2 > nChunkXOff)
    2240             :                 nSrcXOff2--;
    2241             : #endif
                                   // Widen an empty window to one column, then clamp to the chunk.
    2242     4252187 :             if (nSrcXOff2 == nSrcXOff)
    2243           0 :                 nSrcXOff2++;
    2244     4252187 :             if (nSrcXOff2 > nChunkRightXOff)
    2245           0 :                 nSrcXOff2 = nChunkRightXOff;
    2246             : 
                                   // Generic path for every T other than GByte, and for GByte
                                   // when the colour table has more than 256 entries.
    2247     4252187 :             bool bRegularProcessing = false;
    2248             :             if constexpr (!std::is_same<T, GByte>::value)
    2249         827 :                 bRegularProcessing = true;
    2250     4251360 :             else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
    2251           0 :                 bRegularProcessing = true;
    2252             : 
    2253     4252187 :             if (bRegularProcessing)
    2254             :             {
    2255             :                 // Not sure how much sense it makes to run a majority
    2256             :                 // filter on floating point data, but here it is for the sake
    2257             :                 // of compatibility. It won't look right on RGB images by the
    2258             :                 // nature of the filter.
    2259             : 
                                       // Reject empty windows and windows whose pixel count
                                       // would overflow.
                                       // NOTE(review): the byte-size guard uses sizeof(float)
                                       // while the allocations below use sizeof(T) and
                                       // sizeof(int) - confirm this is enough for wider T.
    2260         827 :                 if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
    2261        2481 :                     nSrcYOff2 - nSrcYOff > INT_MAX / (nSrcXOff2 - nSrcXOff) ||
    2262         827 :                     static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
    2263         827 :                             static_cast<size_t>(nSrcXOff2 - nSrcXOff) >
    2264         827 :                         std::numeric_limits<size_t>::max() / sizeof(float))
    2265             :                 {
    2266           0 :                     CPLError(CE_Failure, CPLE_NotSupported,
    2267             :                              "Too big downsampling factor");
    2268           0 :                     CPLFree(paVals);
    2269           0 :                     CPLFree(panSums);
    2270           0 :                     return CE_Failure;
    2271             :                 }
    2272         827 :                 const size_t nNumPx =
    2273         827 :                     static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
    2274         827 :                     static_cast<size_t>(nSrcXOff2 - nSrcXOff);
                                       // iMaxInd: number of table entries in use.
                                       // iMaxVal: index of the entry with the highest count so
                                       // far; only meaningful once biMaxValdValid is true.
    2275         827 :                 size_t iMaxInd = 0;
    2276         827 :                 size_t iMaxVal = 0;
    2277         827 :                 bool biMaxValdValid = false;
    2278             : 
                                       // Grow both tables to one slot per window pixel. Each
                                       // successful realloc result is stored before the failure
                                       // check so that both current blocks get freed on error.
    2279         827 :                 if (paVals == nullptr || nNumPx > nMaxNumPx)
    2280             :                 {
    2281             :                     T *paValsNew = static_cast<T *>(
    2282          71 :                         VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
    2283             :                     int *panSumsNew = static_cast<int *>(
    2284          71 :                         VSI_REALLOC_VERBOSE(panSums, nNumPx * sizeof(int)));
    2285          71 :                     if (paValsNew != nullptr)
    2286          71 :                         paVals = paValsNew;
    2287          71 :                     if (panSumsNew != nullptr)
    2288          71 :                         panSums = panSumsNew;
    2289          71 :                     if (paValsNew == nullptr || panSumsNew == nullptr)
    2290             :                     {
    2291           0 :                         CPLFree(paVals);
    2292           0 :                         CPLFree(panSums);
    2293           0 :                         return CE_Failure;
    2294             :                     }
    2295          71 :                     nMaxNumPx = nNumPx;
    2296             :                 }
    2297             : 
    2298        2585 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    2299             :                 {
                                           // Offset that turns an absolute source column iX
                                           // into an index relative to paSrcScanline.
    2300        1758 :                     const GPtrDiff_t iTotYOff =
    2301        1758 :                         static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
    2302        1758 :                         nChunkXOff;
    2303        5690 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    2304             :                     {
                                               // Pixels whose mask byte is 0 are skipped.
    2305        3932 :                         if (pabySrcScanlineNodataMask == nullptr ||
    2306          16 :                             pabySrcScanlineNodataMask[iX + iTotYOff])
    2307             :                         {
    2308        3917 :                             const T val = paSrcScanline[iX + iTotYOff];
    2309        3917 :                             size_t i = 0;  // Used after for.
    2310             : 
                                                   // NOTE(review): the scan only breaks when the
                                                   // incremented count exceeds the current
                                                   // maximum. A match that does not (including a
                                                   // match on the current leader, which is
                                                   // compared with itself) runs to i == iMaxInd
                                                   // and appends another entry for the same
                                                   // value below. The first entry of a value
                                                   // still receives every increment.
    2311             :                             // Check array for existing entry.
    2312       14387 :                             for (; i < iMaxInd; ++i)
    2313       17626 :                                 if (IsSame(paVals[i], val) &&
    2314        6910 :                                     ++panSums[i] > panSums[iMaxVal])
    2315             :                                 {
    2316         246 :                                     iMaxVal = i;
    2317         246 :                                     biMaxValdValid = true;
    2318         246 :                                     break;
    2319             :                                 }
    2320             : 
    2321             :                             // Add to arr if entry not already there.
    2322        3917 :                             if (i == iMaxInd)
    2323             :                             {
    2324        3671 :                                 paVals[iMaxInd] = val;
    2325        3671 :                                 panSums[iMaxInd] = 1;
    2326             : 
    2327        3671 :                                 if (!biMaxValdValid)
    2328             :                                 {
    2329         824 :                                     iMaxVal = iMaxInd;
    2330         824 :                                     biMaxValdValid = true;
    2331             :                                 }
    2332             : 
    2333        3671 :                                 ++iMaxInd;
    2334             :                             }
    2335             :                         }
    2336             :                     }
    2337             :                 }
    2338             : 
                                       // No unmasked pixel in the window: emit the nodata value.
    2339         827 :                 if (!biMaxValdValid)
    2340           3 :                     paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
    2341             :                 else
    2342         824 :                     paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
    2343             :             }
    2344             :             else if constexpr (std::is_same<T, GByte>::value)
    2345             :             // ( eSrcDataType == GDT_Byte && nEntryCount < 256 )
    2346             :             {
    2347             :                 // So we go here for a paletted or non-paletted byte band.
    2348             :                 // The input values are then between 0 and 255.
                                       // nMaxVal: highest bin count so far; iMaxInd: the byte
                                       // value holding it (-1 while nothing has been counted).
    2349     4251360 :                 int nMaxVal = 0;
    2350     4251360 :                 int iMaxInd = -1;
    2351             : 
    2352             :                 // The cost of this zeroing might be high. Perhaps we should
    2353             :                 // just use the above generic case, and go to this one if the
    2354             :                 // number of source pixels is large enough
    2355     4251360 :                 std::fill(anVals.begin(), anVals.end(), 0);
    2356             : 
    2357    12777700 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    2358             :                 {
    2359     8526370 :                     const GPtrDiff_t iTotYOff =
    2360     8526370 :                         static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
    2361     8526370 :                         nChunkXOff;
    2362    25649400 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    2363             :                     {
    2364    17123000 :                         const T val = paSrcScanline[iX + iTotYOff];
                                               // NOTE(review): nodata is detected by value
                                               // here; pabySrcScanlineNodataMask is not
                                               // consulted in this branch.
    2365    17123000 :                         if (!bHasNoData || val != tNoDataValue)
    2366             :                         {
    2367    17123000 :                             int nVal = static_cast<int>(val);
                                                   // Strict '>' keeps the value that reached
                                                   // the highest count first when counts tie.
    2368    17123000 :                             if (++anVals[nVal] > nMaxVal)
    2369             :                             {
    2370             :                                 // Sum the density.
    2371             :                                 // Is it the most common value so far?
    2372    17006300 :                                 iMaxInd = nVal;
    2373    17006300 :                                 nMaxVal = anVals[nVal];
    2374             :                             }
    2375             :                         }
    2376             :                     }
    2377             :                 }
    2378             : 
    2379     4251360 :                 if (iMaxInd == -1)
    2380           0 :                     paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
    2381             :                 else
    2382     4251360 :                     paDstScanline[iDstPixel - nDstXOff] =
    2383             :                         static_cast<T>(iMaxInd);
    2384             :             }
    2385             :         }
    2386             :     }
    2387             : 
    2388         136 :     CPLFree(paVals);
    2389         136 :     CPLFree(panSums);
    2390             : 
    2391         136 :     return CE_None;
    2392             : }
    2393             : 
                       // Entry point for mode resampling of one chunk.
                       //
                       // Allocates *ppDstBuffer for (nDstXOff2 - nDstXOff) x
                       // (nDstYOff2 - nDstYOff) elements of args.eWrkDataType, reports that type
                       // through *peDstBufferDataType and dispatches to
                       // GDALResampleChunk_ModeT<T>() with a T chosen from args.eWrkDataType.
                       //
                       // Returns CE_Failure when the allocation fails or the data type is not
                       // handled by the switch; otherwise the result of the typed
                       // implementation.
                       // NOTE(review): *ppDstBuffer is not released here when the typed
                       // implementation fails - presumably the caller frees it; confirm.
    2394         136 : static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
    2395             :                                      const void *pChunk, void **ppDstBuffer,
    2396             :                                      GDALDataType *peDstBufferDataType)
    2397             : {
    2398         136 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(
    2399             :         args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
    2400             :         GDALGetDataTypeSizeBytes(args.eWrkDataType));
    2401         136 :     if (*ppDstBuffer == nullptr)
    2402             :     {
    2403           0 :         return CE_Failure;
    2404             :     }
    2405             : 
                           // pChunk is reinterpreted below according to eWrkDataType, so the
                           // source data must already be stored in that type.
    2406         136 :     CPLAssert(args.eSrcDataType == args.eWrkDataType);
    2407             : 
    2408         136 :     *peDstBufferDataType = args.eWrkDataType;
    2409         136 :     switch (args.eWrkDataType)
    2410             :     {
    2411             :         // For mode resampling, as no computation is done, only the
    2412             :         // size of the data type matters... except for Byte where we have
    2413             :         // special processing. And for floating point values
    2414          65 :         case GDT_Byte:
    2415             :         {
    2416          65 :             return GDALResampleChunk_ModeT(args,
    2417             :                                            static_cast<const GByte *>(pChunk),
    2418          65 :                                            static_cast<GByte *>(*ppDstBuffer));
    2419             :         }
    2420             : 
    2421           4 :         case GDT_Int8:
    2422             :         {
    2423           4 :             return GDALResampleChunk_ModeT(args,
    2424             :                                            static_cast<const int8_t *>(pChunk),
    2425           4 :                                            static_cast<int8_t *>(*ppDstBuffer));
    2426             :         }
    2427             : 
                               // All 2-byte types are tallied as uint16_t.
                               // NOTE(review): the nodata conversion in
                               // GDALResampleChunk_ModeT() (range check + static_cast) is then
                               // also done in terms of the unsigned type, which looks lossy for
                               // negative or fractional nodata values - confirm. The same
                               // applies to the 4-byte and 8-byte groups below.
    2428           9 :         case GDT_Int16:
    2429             :         case GDT_UInt16:
    2430             :         case GDT_Float16:
    2431             :         {
    2432           9 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
    2433           9 :             return GDALResampleChunk_ModeT(
    2434             :                 args, static_cast<const uint16_t *>(pChunk),
    2435           9 :                 static_cast<uint16_t *>(*ppDstBuffer));
    2436             :         }
    2437             : 
                               // 4-byte types other than Float32, tallied as uint32_t.
    2438          15 :         case GDT_CInt16:
    2439             :         case GDT_CFloat16:
    2440             :         case GDT_Int32:
    2441             :         case GDT_UInt32:
    2442             :         {
    2443          15 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
    2444          15 :             return GDALResampleChunk_ModeT(
    2445             :                 args, static_cast<const uint32_t *>(pChunk),
    2446          15 :                 static_cast<uint32_t *>(*ppDstBuffer));
    2447             :         }
    2448             : 
                               // Float32 keeps its own type so that the NaN-aware
                               // IsSame<float>() is used.
    2449          17 :         case GDT_Float32:
    2450             :         {
    2451          17 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
    2452          17 :             return GDALResampleChunk_ModeT(args,
    2453             :                                            static_cast<const float *>(pChunk),
    2454          17 :                                            static_cast<float *>(*ppDstBuffer));
    2455             :         }
    2456             : 
                               // 8-byte types other than Float64, tallied as uint64_t.
    2457          12 :         case GDT_CInt32:
    2458             :         case GDT_Int64:
    2459             :         case GDT_UInt64:
    2460             :         {
    2461          12 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
    2462          12 :             return GDALResampleChunk_ModeT(
    2463             :                 args, static_cast<const uint64_t *>(pChunk),
    2464          12 :                 static_cast<uint64_t *>(*ppDstBuffer));
    2465             :         }
    2466             : 
                               // Float64 keeps its own type so that the NaN-aware
                               // IsSame<double>() is used.
    2467           6 :         case GDT_Float64:
    2468             :         {
    2469           6 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
    2470           6 :             return GDALResampleChunk_ModeT(args,
    2471             :                                            static_cast<const double *>(pChunk),
    2472           6 :                                            static_cast<double *>(*ppDstBuffer));
    2473             :         }
    2474             : 
    2475           4 :         case GDT_CFloat32:
    2476             :         {
    2477           4 :             return GDALResampleChunk_ModeT(
    2478             :                 args, static_cast<const std::complex<float> *>(pChunk),
    2479           4 :                 static_cast<std::complex<float> *>(*ppDstBuffer));
    2480             :         }
    2481             : 
    2482           4 :         case GDT_CFloat64:
    2483             :         {
    2484           4 :             return GDALResampleChunk_ModeT(
    2485             :                 args, static_cast<const std::complex<double> *>(pChunk),
    2486           4 :                 static_cast<std::complex<double> *>(*ppDstBuffer));
    2487             :         }
    2488             : 
    2489           0 :         case GDT_Unknown:
    2490             :         case GDT_TypeCount:
    2491           0 :             break;
    2492             :     }
    2493             : 
                           // Only reached for GDT_Unknown / GDT_TypeCount.
    2494           0 :     CPLAssert(false);
    2495             :     return CE_Failure;
    2496             : }
    2497             : 
    2498             : /************************************************************************/
    2499             : /*                  GDALResampleConvolutionHorizontal()                 */
    2500             : /************************************************************************/
    2501             : 
                       // Returns the weighted sum of pChunk[i] * padfWeights[i] for i in
                       // [0, nSrcPixelCount).
                       //
                       // The main loop is unrolled by four and feeds two separate accumulators
                       // that are added at the end; the trailing loop handles the remaining
                       // elements (all of them when the unrolled loop is compiled out).
    2502             : template <class T>
    2503             : static inline double
    2504       44886 : GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
    2505             :                                   int nSrcPixelCount)
    2506             : {
    2507       44886 :     double dfVal1 = 0.0;
    2508       44886 :     double dfVal2 = 0.0;
    2509       44886 :     int i = 0;  // Used after for.
    2510             :     // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    2511             :     // manually (untypical) unrolled loop in -O2 and -O3:
    2512             :     // https://github.com/OSGeo/gdal/issues/9508
    2513             : #if !defined(__INTEL_CLANG_COMPILER)
    2514       89516 :     for (; i + 3 < nSrcPixelCount; i += 4)
    2515             :     {
    2516       44630 :         dfVal1 += pChunk[i] * padfWeights[i];
    2517       44630 :         dfVal1 += pChunk[i + 1] * padfWeights[i + 1];
    2518       44630 :         dfVal2 += pChunk[i + 2] * padfWeights[i + 2];
    2519       44630 :         dfVal2 += pChunk[i + 3] * padfWeights[i + 3];
    2520             :     }
    2521             : #endif
                           // Tail: at most 3 elements after the unrolled loop.
    2522       46358 :     for (; i < nSrcPixelCount; ++i)
    2523             :     {
    2524        1472 :         dfVal1 += pChunk[i] * padfWeights[i];
    2525             :     }
    2526       44886 :     return dfVal1 + dfVal2;
    2527             : }
    2528             : 
                       // Masked variant of the horizontal convolution.
                       //
                       // On return, for i in [0, nSrcPixelCount):
                       //   dfVal       = sum of pChunk[i] * padfWeights[i] * pabyMask[i]
                       //   dfWeightSum = sum of padfWeights[i] * pabyMask[i]
                       // Both outputs are reset to 0 before accumulating.
                       //
                       // The mask byte multiplies the weight, so a 0 byte drops the sample
                       // (presumably the mask only holds 0 or 1 - confirm with callers).
    2529             : template <class T>
    2530       44576 : static inline void GDALResampleConvolutionHorizontalWithMask(
    2531             :     const T *pChunk, const GByte *pabyMask, const double *padfWeights,
    2532             :     int nSrcPixelCount, double &dfVal, double &dfWeightSum)
    2533             : {
    2534       44576 :     dfVal = 0;
    2535       44576 :     dfWeightSum = 0;
    2536       44576 :     int i = 0;
                           // Main loop, unrolled by four.
    2537       98300 :     for (; i + 3 < nSrcPixelCount; i += 4)
    2538             :     {
    2539       53724 :         const double dfWeight0 = padfWeights[i] * pabyMask[i];
    2540       53724 :         const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
    2541       53724 :         const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
    2542       53724 :         const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
    2543       53724 :         dfVal += pChunk[i] * dfWeight0;
    2544       53724 :         dfVal += pChunk[i + 1] * dfWeight1;
    2545       53724 :         dfVal += pChunk[i + 2] * dfWeight2;
    2546       53724 :         dfVal += pChunk[i + 3] * dfWeight3;
    2547       53724 :         dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
    2548             :     }
                           // Tail: at most 3 remaining elements.
    2549       61162 :     for (; i < nSrcPixelCount; ++i)
    2550             :     {
    2551       16586 :         const double dfWeight = padfWeights[i] * pabyMask[i];
    2552       16586 :         dfVal += pChunk[i] * dfWeight;
    2553       16586 :         dfWeightSum += dfWeight;
    2554             :     }
    2555       44576 : }
    2556             : 
    2557             : template <class T>  // Primary (scalar) template: weighted sum of three rows using one shared weight array.
    2558     1340094 : static inline void GDALResampleConvolutionHorizontal_3rows(
    2559             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,  // Three rows, each read at [0, nSrcPixelCount).
    2560             :     const double *padfWeights, int nSrcPixelCount, double &dfRes1,  // Same weights are applied to every row.
    2561             :     double &dfRes2, double &dfRes3)  // Out: dfResK = sum(pChunkRowK[i] * padfWeights[i]).
    2562             : {
    2563     1340094 :     double dfVal1 = 0.0;  // Row 1, samples i and i + 1 of each group of 4 (plus the tail).
    2564     1340094 :     double dfVal2 = 0.0;  // Row 1, samples i + 2 and i + 3 of each group of 4.
    2565     1340094 :     double dfVal3 = 0.0;  // Row 2, same split as dfVal1.
    2566     1340094 :     double dfVal4 = 0.0;  // Row 2, same split as dfVal2.
    2567     1340094 :     double dfVal5 = 0.0;  // Row 3, same split as dfVal1.
    2568     1340094 :     double dfVal6 = 0.0;  // Row 3, same split as dfVal2.
    2569     1340094 :     int i = 0;  // Declared outside the loops: the tail loop continues from it.
    2570     2733937 :     for (; i + 3 < nSrcPixelCount; i += 4)  // Main loop: 4 samples of each row per iteration.
    2571             :     {
    2572     1393842 :         dfVal1 += pChunkRow1[i] * padfWeights[i];
    2573     1393842 :         dfVal1 += pChunkRow1[i + 1] * padfWeights[i + 1];
    2574     1393842 :         dfVal2 += pChunkRow1[i + 2] * padfWeights[i + 2];
    2575     1393842 :         dfVal2 += pChunkRow1[i + 3] * padfWeights[i + 3];
    2576     1393842 :         dfVal3 += pChunkRow2[i] * padfWeights[i];
    2577     1393842 :         dfVal3 += pChunkRow2[i + 1] * padfWeights[i + 1];
    2578     1393842 :         dfVal4 += pChunkRow2[i + 2] * padfWeights[i + 2];
    2579     1393842 :         dfVal4 += pChunkRow2[i + 3] * padfWeights[i + 3];
    2580     1393842 :         dfVal5 += pChunkRow3[i] * padfWeights[i];
    2581     1393842 :         dfVal5 += pChunkRow3[i + 1] * padfWeights[i + 1];
    2582     1393842 :         dfVal6 += pChunkRow3[i + 2] * padfWeights[i + 2];
    2583     1393842 :         dfVal6 += pChunkRow3[i + 3] * padfWeights[i + 3];
    2584             :     }
    2585     1378621 :     for (; i < nSrcPixelCount; ++i)  // Tail: the remaining 0 to 3 samples go to the first accumulator of each row.
    2586             :     {
    2587       38527 :         dfVal1 += pChunkRow1[i] * padfWeights[i];
    2588       38527 :         dfVal3 += pChunkRow2[i] * padfWeights[i];
    2589       38527 :         dfVal5 += pChunkRow3[i] * padfWeights[i];
    2590             :     }
    2591     1340094 :     dfRes1 = dfVal1 + dfVal2;  // Merge the two partial sums of each row.
    2592     1340094 :     dfRes2 = dfVal3 + dfVal4;
    2593     1340094 :     dfRes3 = dfVal5 + dfVal6;
    2594     1340094 : }
    2595             : 
    2596             : template <class T>  // Primary template: no dedicated code path, simply forwards to the generic 3-row kernel.
    2597       18828 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
    2598             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2599             :     const double *padfWeights, int nSrcPixelCount, double &dfRes1,  // NOTE(review): the name implies
    2600             :     double &dfRes2, double &dfRes3)  // nSrcPixelCount < 8, but nothing here enforces it - confirm with callers.
    2601             : {
    2602       18828 :     GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,  // All arguments passed through
    2603             :                                             padfWeights, nSrcPixelCount, dfRes1,  // unchanged; the results are
    2604             :                                             dfRes2, dfRes3);  // written by the callee.
    2605       18828 : }
    2606             : 
    2607             : template <class T>  // Primary template: 3-row weighted sum for a fixed window of exactly 4 samples.
    2608     1256466 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
    2609             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,  // Elements [0, 4) of each row are read.
    2610             :     const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)  // padfWeights[0..3] are read.
    2611             : {
    2612     1256466 :     GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,  // Forwards to the generic
    2613             :                                             padfWeights, 4, dfRes1, dfRes2,  // kernel with the sample count
    2614             :                                             dfRes3);  // hard-coded to 4.
    2615     1256466 : }
    2616             : 
    2617             : /************************************************************************/
    2618             : /*                  GDALResampleConvolutionVertical()                   */
    2619             : /************************************************************************/
    2620             : 
    2621             : template <class T>  // Weighted sum of nSrcLineCount samples spaced nStride elements apart.
    2622             : static inline double  // Returns sum(pChunk[i * nStride] * padfWeights[i]) for i in [0, nSrcLineCount).
    2623      465199 : GDALResampleConvolutionVertical(const T *pChunk, int nStride,
    2624             :                                 const double *padfWeights, int nSrcLineCount)
    2625             : {
    2626      465199 :     double dfVal1 = 0.0;  // Partial sum: lines i and i + 1 of each group of 4, plus the tail.
    2627      465199 :     double dfVal2 = 0.0;  // Partial sum: lines i + 2 and i + 3 of each group of 4.
    2628      465199 :     int i = 0;  // Line (weight) index.
    2629      465199 :     int j = 0;  // Element offset into pChunk; kept equal to i * nStride by the loop increments.
    2630      916010 :     for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)  // Main loop: 4 lines per iteration.
    2631             :     {
    2632      450811 :         dfVal1 += pChunk[j] * padfWeights[i];
    2633      450811 :         dfVal1 += pChunk[j + nStride] * padfWeights[i + 1];
    2634      450811 :         dfVal2 += pChunk[j + 2 * nStride] * padfWeights[i + 2];
    2635      450811 :         dfVal2 += pChunk[j + 3 * nStride] * padfWeights[i + 3];
    2636             :     }
    2637      518702 :     for (; i < nSrcLineCount; ++i, j += nStride)  // Tail: the remaining 0 to 3 lines.
    2638             :     {
    2639       53503 :         dfVal1 += pChunk[j] * padfWeights[i];
    2640             :     }
    2641      465199 :     return dfVal1 + dfVal2;  // Merge the two partial sums.
    2642             : }
    2643             : 
    2644             : template <class T>  // Strided weighted sum for two adjacent columns (offsets 0 and 1) at once.
    2645     2880000 : static inline void GDALResampleConvolutionVertical_2cols(
    2646             :     const T *pChunk, int nStride, const double *padfWeights, int nSrcLineCount,  // Lines are nStride elements apart.
    2647             :     double &dfRes1, double &dfRes2)  // Out: dfRes1 for pChunk[i * nStride], dfRes2 for pChunk[i * nStride + 1].
    2648             : {
    2649     2880000 :     double dfVal1 = 0.0;  // Column 0: lines i and i + 1 of each group of 4, plus the tail.
    2650     2880000 :     double dfVal2 = 0.0;  // Column 0: lines i + 2 and i + 3 of each group of 4.
    2651     2880000 :     double dfVal3 = 0.0;  // Column 1: same split as dfVal1.
    2652     2880000 :     double dfVal4 = 0.0;  // Column 1: same split as dfVal2.
    2653     2880000 :     int i = 0;  // Line (weight) index.
    2654     2880000 :     int j = 0;  // Element offset of column 0 on line i; kept equal to i * nStride.
    2655     5716800 :     for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)  // Main loop: 4 lines per iteration.
    2656             :     {
    2657     2836800 :         dfVal1 += pChunk[j] * padfWeights[i];
    2658     2836800 :         dfVal3 += pChunk[j + 1] * padfWeights[i];
    2659     2836800 :         dfVal1 += pChunk[j + nStride] * padfWeights[i + 1];
    2660     2836800 :         dfVal3 += pChunk[j + 1 + nStride] * padfWeights[i + 1];
    2661     2836800 :         dfVal2 += pChunk[j + 2 * nStride] * padfWeights[i + 2];
    2662     2836800 :         dfVal4 += pChunk[j + 1 + 2 * nStride] * padfWeights[i + 2];
    2663     2836800 :         dfVal2 += pChunk[j + 3 * nStride] * padfWeights[i + 3];
    2664     2836800 :         dfVal4 += pChunk[j + 1 + 3 * nStride] * padfWeights[i + 3];
    2665             :     }
    2666     2995210 :     for (; i < nSrcLineCount; ++i, j += nStride)  // Tail: the remaining 0 to 3 lines.
    2667             :     {
    2668      115210 :         dfVal1 += pChunk[j] * padfWeights[i];
    2669      115210 :         dfVal3 += pChunk[j + 1] * padfWeights[i];
    2670             :     }
    2671     2880000 :     dfRes1 = dfVal1 + dfVal2;  // Column 0 result.
    2672     2880000 :     dfRes2 = dfVal3 + dfVal4;  // Column 1 result.
    2673     2880000 : }
    2674             : 
    2675             : #ifdef USE_SSE2
    2676             : 
    2677             : #ifdef __AVX__
    2678             : /************************************************************************/
    2679             : /*             GDALResampleConvolutionVertical_16cols<T>                */
    2680             : /************************************************************************/
    2681             : 
    2682             : template <class T>  // Strided weighted sum for 16 adjacent columns; compiled only when __AVX__ is defined.
    2683             : static inline void  // Writes 16 floats to afDest[0..15], one per column.
    2684             : GDALResampleConvolutionVertical_16cols(const T *pChunk, int nStride,
    2685             :                                        const double *padfWeights,
    2686             :                                        int nSrcLineCount, float *afDest)
    2687             : {
    2688             :     int i = 0;  // Line (weight) index.
    2689             :     int j = 0;  // Element offset of column 0 on line i; advanced by nStride per line.
    2690             :     XMMReg4Double v_acc0 = XMMReg4Double::Zero();  // Accumulator for columns 0..3.
    2691             :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();  // Accumulator for columns 4..7.
    2692             :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();  // Accumulator for columns 8..11.
    2693             :     XMMReg4Double v_acc3 = XMMReg4Double::Zero();  // Accumulator for columns 12..15.
    2694             :     for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)  // Main loop: 4 lines per iteration.
    2695             :     {
    2696             :         XMMReg4Double w0 =  // Load1ValHighAndLow presumably replicates one weight into all lanes -
    2697             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);  // confirm in the XMMReg4Double helper.
    2698             :         XMMReg4Double w1 =
    2699             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
    2700             :         XMMReg4Double w2 =
    2701             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
    2702             :         XMMReg4Double w3 =
    2703             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
    2704             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;  // Line i, four column groups.
    2705             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
    2706             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
    2707             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
    2708             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;  // Line i + 1.
    2709             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
    2710             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
    2711             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
    2712             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;  // Line i + 2.
    2713             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
    2714             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
    2715             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
    2716             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;  // Line i + 3.
    2717             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
    2718             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
    2719             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
    2720             :     }
    2721             :     for (; i < nSrcLineCount; ++i, j += nStride)  // Tail: the remaining 0 to 3 lines, one line at a time.
    2722             :     {
    2723             :         XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
    2724             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
    2725             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
    2726             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
    2727             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
    2728             :     }
    2729             :     v_acc0.Store4Val(afDest);  // Store4Val(float *) presumably narrows the four doubles to float - confirm.
    2730             :     v_acc1.Store4Val(afDest + 4);
    2731             :     v_acc2.Store4Val(afDest + 8);
    2732             :     v_acc3.Store4Val(afDest + 12);
    2733             : }
    2734             : 
    2735             : template <class T>  // Overload chosen when the destination buffer is double * instead of float *.
    2736             : static inline void GDALResampleConvolutionVertical_16cols(const T *, int,  // All parameters are unnamed and
    2737             :                                                           const double *, int,  // unused: this overload does no
    2738             :                                                           double *)  // computation.
    2739             : {
    2740             :     // Cannot be reached (presumably exists only so that callers using a double buffer still compile - confirm)
    2741             :     CPLAssert(false);  // Flags any unexpected call; CPLAssert behaviour depends on its definition elsewhere.
    2742             : }
    2743             : 
    2744             : #else
    2745             : 
    2746             : /************************************************************************/
    2747             : /*              GDALResampleConvolutionVertical_8cols<T>                */
    2748             : /************************************************************************/
    2749             : 
    2750             : template <class T>  // Strided weighted sum for 8 adjacent columns; compiled only when __AVX__ is not defined.
    2751             : static inline void  // Writes 8 floats to afDest[0..7], one per column.
    2752    21404100 : GDALResampleConvolutionVertical_8cols(const T *pChunk, int nStride,
    2753             :                                       const double *padfWeights,
    2754             :                                       int nSrcLineCount, float *afDest)
    2755             : {
    2756    21404100 :     int i = 0;  // Line (weight) index.
    2757    21404100 :     int j = 0;  // Element offset of column 0 on line i; advanced by nStride per line.
    2758    21404100 :     XMMReg4Double v_acc0 = XMMReg4Double::Zero();  // Accumulator for columns 0..3.
    2759    21372700 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();  // Accumulator for columns 4..7.
    2760    40878800 :     for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)  // Main loop: 4 lines per iteration.
    2761             :     {
    2762    19489500 :         XMMReg4Double w0 =  // Load1ValHighAndLow presumably replicates one weight into all lanes -
    2763    19489500 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);  // confirm in the XMMReg4Double helper.
    2764    19472200 :         XMMReg4Double w1 =
    2765    19472200 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
    2766    19477400 :         XMMReg4Double w2 =
    2767    19477400 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
    2768    19479600 :         XMMReg4Double w3 =
    2769    19479600 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
    2770    19482100 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;  // Line i, both column groups.
    2771    19463000 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
    2772    19459600 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;  // Line i + 1.
    2773    19442200 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
    2774    19469400 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;  // Line i + 2.
    2775    19470100 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
    2776    19457900 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;  // Line i + 3.
    2777    19453100 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
    2778             :     }
    2779    32915400 :     for (; i < nSrcLineCount; ++i, j += nStride)  // Tail: the remaining 0 to 3 lines, one line at a time.
    2780             :     {
    2781    11526100 :         XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
    2782    11526100 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
    2783    11526100 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
    2784             :     }
    2785    21389300 :     v_acc0.Store4Val(afDest);  // Store4Val(float *) presumably narrows the four doubles to float - confirm.
    2786    21381200 :     v_acc1.Store4Val(afDest + 4);
    2787    21407600 : }
    2788             : 
    2789             : template <class T>  // Overload chosen when the destination buffer is double * instead of float *.
    2790             : static inline void GDALResampleConvolutionVertical_8cols(const T *, int,  // All parameters are unnamed and
    2791             :                                                          const double *, int,  // unused: this overload does no
    2792             :                                                          double *)  // computation.
    2793             : {
    2794             :     // Cannot be reached (presumably exists only so that callers using a double buffer still compile - confirm)
    2795             :     CPLAssert(false);  // Flags any unexpected call; CPLAssert behaviour depends on its definition elsewhere.
    2796             : }
    2797             : 
    2798             : #endif  // __AVX__
    2799             : 
    2800             : /************************************************************************/
    2801             : /*              GDALResampleConvolutionHorizontalSSE2<T>                */
    2802             : /************************************************************************/
    2803             : 
    2804             : template <class T>  // Vectorized weighted sum of one row: returns sum(pChunk[i] * padfWeightsAligned[i]).
    2805     2987566 : static inline double GDALResampleConvolutionHorizontalSSE2(
    2806             :     const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)  // Both arrays read at [0, nSrcPixelCount).
    2807             : {
    2808     2987566 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();  // Partial sums for samples i .. i + 3 of each group of 8.
    2809     2987142 :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();  // Partial sums for samples i + 4 .. i + 7 of each group of 8.
    2810     2986749 :     int i = 0;  // Declared outside the loops: the scalar tail continues from it.
    2811     3213386 :     for (; i + 7 < nSrcPixelCount; i += 8)  // Main loop: 8 samples per iteration.
    2812             :     {
    2813             :         // Load 8 samples and their 8 weights, then accumulate the products.
    2814      226604 :         const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
    2815      226606 :         const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
    2816      226604 :         const XMMReg4Double v_weight1 =  // Weights use the aligned load, so padfWeightsAligned presumably has
    2817      226604 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);  // to meet that helper's alignment
    2818      226603 :         const XMMReg4Double v_weight2 =  // requirement - confirm in the XMMReg4Double helper.
    2819      226603 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
    2820             : 
    2821      226606 :         v_acc1 += v_pixels1 * v_weight1;
    2822      226600 :         v_acc2 += v_pixels2 * v_weight2;
    2823             :     }
    2824             : 
    2825     2986784 :     v_acc1 += v_acc2;  // Merge the two vector accumulators.
    2826             : 
    2827     2987330 :     double dfVal = v_acc1.GetHorizSum();  // Reduce the merged accumulator to a single scalar.
    2828    10151620 :     for (; i < nSrcPixelCount; ++i)  // Scalar tail: the remaining 0 to 7 samples.
    2829             :     {
    2830     7164480 :         dfVal += pChunk[i] * padfWeightsAligned[i];
    2831             :     }
    2832     2987141 :     return dfVal;
    2833             : }
    2834             : 
    2835             : /************************************************************************/
    2836             : /*              GDALResampleConvolutionHorizontal<GByte>                */
    2837             : /************************************************************************/
    2838             : 
    2839             : template <>  // GByte specialization, inside the USE_SSE2 section: routes to the vectorized kernel.
    2840     2438940 : inline double GDALResampleConvolutionHorizontal<GByte>(
    2841             :     const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    2842             : {
    2843     2438940 :     return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,  // Arguments passed through
    2844     2438930 :                                                  nSrcPixelCount);  // unchanged; result returned as is.
    2845             : }
    2846             : 
    2847             : template <>  // GUInt16 specialization, inside the USE_SSE2 section: routes to the vectorized kernel.
    2848      548694 : inline double GDALResampleConvolutionHorizontal<GUInt16>(
    2849             :     const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    2850             : {
    2851      548694 :     return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,  // Arguments passed through
    2852      548752 :                                                  nSrcPixelCount);  // unchanged; result returned as is.
    2853             : }
    2854             : 
    2855             : /************************************************************************/
    2856             : /*              GDALResampleConvolutionHorizontalWithMaskSSE2<T>        */
    2857             : /************************************************************************/
    2858             : 
    2859             : template <class T>  // Vectorized masked weighted sum over one row of samples.
    2860     7067643 : static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
    2861             :     const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,  // Inputs, read at [0, nSrcPixelCount).
    2862             :     int nSrcPixelCount, double &dfVal, double &dfWeightSum)  // Out: dfVal = sum(pixel * weight * mask),
    2863             : {  // dfWeightSum = sum(weight * mask); both are overwritten.
    2864     7067643 :     int i = 0;  // Declared outside the loops: the scalar tail continues from it.
    2865     7067643 :     XMMReg4Double v_acc = XMMReg4Double::Zero();  // Partial sums of pixel * masked weight.
    2866     7066693 :     XMMReg4Double v_acc_weight = XMMReg4Double::Zero();  // Partial sums of the masked weights.
    2867    19752321 :     for (; i + 3 < nSrcPixelCount; i += 4)  // Main loop: 4 samples per iteration.
    2868             :     {
    2869    12685358 :         const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
    2870    12670558 :         const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);  // Mask bytes loaded as numbers.
    2871    12686658 :         XMMReg4Double v_weight =  // Aligned load: padfWeightsAligned presumably has to meet the helper's
    2872    12686658 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);  // alignment requirement - confirm.
    2873    12685158 :         v_weight *= v_mask;  // Masked weight = weight * mask value.
    2874    12671358 :         v_acc += v_pixels * v_weight;
    2875    12685558 :         v_acc_weight += v_weight;
    2876             :     }
    2877             : 
    2878     7066933 :     dfVal = v_acc.GetHorizSum();  // Reduce each vector accumulator to a scalar.
    2879     7066173 :     dfWeightSum = v_acc_weight.GetHorizSum();
    2880     7297133 :     for (; i < nSrcPixelCount; ++i)  // Scalar tail: the remaining 0 to 3 samples.
    2881             :     {
    2882      231086 :         const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
    2883      231086 :         dfVal += pChunk[i] * dfWeight;
    2884      231086 :         dfWeightSum += dfWeight;
    2885             :     }
    2886     7066043 : }
    2887             : 
    2888             : /************************************************************************/
    2889             : /*              GDALResampleConvolutionHorizontalWithMask<GByte>        */
    2890             : /************************************************************************/
    2891             : 
    2892             : template <>  // GByte specialization, inside the USE_SSE2 section: routes to the vectorized masked kernel.
    2893     7067210 : inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
    2894             :     const GByte *pChunk, const GByte *pabyMask,
    2895             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
    2896             :     double &dfWeightSum)  // dfVal and dfWeightSum are outputs written by the callee.
    2897             : {
    2898     7067210 :     GDALResampleConvolutionHorizontalWithMaskSSE2(  // All arguments passed through unchanged.
    2899             :         pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
    2900             :         dfWeightSum);
    2901     7066510 : }
    2902             : 
    2903             : template <>  // GUInt16 specialization, inside the USE_SSE2 section: routes to the vectorized masked kernel.
    2904          63 : inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
    2905             :     const GUInt16 *pChunk, const GByte *pabyMask,
    2906             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
    2907             :     double &dfWeightSum)  // dfVal and dfWeightSum are outputs written by the callee.
    2908             : {
    2909          63 :     GDALResampleConvolutionHorizontalWithMaskSSE2(  // All arguments passed through unchanged.
    2910             :         pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
    2911             :         dfWeightSum);
    2912          63 : }
    2913             : 
    2914             : /************************************************************************/
    2915             : /*              GDALResampleConvolutionHorizontal_3rows_SSE2<T>         */
    2916             : /************************************************************************/
    2917             : 
    2918             : template <class T>  // Vectorized weighted sum of three rows sharing one weight array.
    2919    16989430 : static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
    2920             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,  // Three rows, each read at [0, nSrcPixelCount).
    2921             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    2922             :     double &dfRes2, double &dfRes3)  // Out: dfResK = sum(pChunkRowK[i] * padfWeightsAligned[i]).
    2923             : {
    2924    16989430 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero(),  // One vector accumulator per row.
    2925    16971330 :                   v_acc2 = XMMReg4Double::Zero(),
    2926    16984230 :                   v_acc3 = XMMReg4Double::Zero();
    2927    16986330 :     int i = 0;  // Declared outside the loops: the scalar tail continues from it.
    2928    33843766 :     for (; i + 7 < nSrcPixelCount; i += 8)  // Main loop: 8 samples of each row per iteration.
    2929             :     {
    2930             :         // Row 1: load 8 samples and the 8 weights, then accumulate.
    2931    16870536 :         XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
    2932    16897636 :         XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
    2933    16901936 :         const XMMReg4Double v_weight1 =  // The weights are loaded once here and reused for rows 2 and 3.
    2934    16901936 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    2935    16881736 :         const XMMReg4Double v_weight2 =
    2936    16881736 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
    2937             : 
    2938    16891436 :         v_acc1 += v_pixels1 * v_weight1;
    2939    16871036 :         v_acc1 += v_pixels2 * v_weight2;
    2940             : 
    2941    16875936 :         v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);  // Row 2: the pixel registers are reused.
    2942    16877936 :         v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
    2943    16886436 :         v_acc2 += v_pixels1 * v_weight1;
    2944    16881636 :         v_acc2 += v_pixels2 * v_weight2;
    2945             : 
    2946    16878336 :         v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);  // Row 3.
    2947    16882136 :         v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
    2948    16892736 :         v_acc3 += v_pixels1 * v_weight1;
    2949    16871336 :         v_acc3 += v_pixels2 * v_weight2;
    2950             :     }
    2951             : 
    2952    16973230 :     dfRes1 = v_acc1.GetHorizSum();  // Reduce each row's accumulator to a scalar.
    2953    16961030 :     dfRes2 = v_acc2.GetHorizSum();
    2954    16972230 :     dfRes3 = v_acc3.GetHorizSum();
    2955    28692526 :     for (; i < nSrcPixelCount; ++i)  // Scalar tail: the remaining 0 to 7 samples.
    2956             :     {
    2957    11718796 :         dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
    2958    11718796 :         dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
    2959    11718796 :         dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
    2960             :     }
    2961    16973730 : }
    2962             : 
    2963             : /************************************************************************/
    2964             : /*              GDALResampleConvolutionHorizontal_3rows<GByte>          */
    2965             : /************************************************************************/
    2966             : 
    2967             : template <>  // GByte specialization, inside the USE_SSE2 section: routes to the vectorized 3-row kernel.
    2968    17000100 : inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
    2969             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    2970             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    2971             :     double &dfRes2, double &dfRes3)  // dfRes1..dfRes3 are outputs written by the callee.
    2972             : {
    2973    17000100 :     GDALResampleConvolutionHorizontal_3rows_SSE2(  // All arguments passed through unchanged.
    2974             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    2975             :         dfRes1, dfRes2, dfRes3);
    2976    16954300 : }
    2977             : 
    2978             : template <>  // GUInt16 specialization, inside the USE_SSE2 section: routes to the vectorized 3-row kernel.
    2979          30 : inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
    2980             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    2981             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
    2982             :     int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)  // dfRes1..dfRes3 are outputs.
    2983             : {
    2984          30 :     GDALResampleConvolutionHorizontal_3rows_SSE2(  // All arguments passed through unchanged.
    2985             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    2986             :         dfRes1, dfRes2, dfRes3);
    2987          30 : }
    2988             : 
    2989             : /************************************************************************/
    2990             : /*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T>   */
    2991             : /************************************************************************/
    2992             : 
    2993             : template <class T>  // Vectorized 3-row weighted sum, processing 4 samples per step instead of 8.
    2994     3600705 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    2995             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,  // Three rows, each read at [0, nSrcPixelCount).
    2996             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,  // The name implies nSrcPixelCount < 8,
    2997             :     double &dfRes2, double &dfRes3)  // but the loops below accept any count. Out: one weighted sum per row.
    2998             : {
    2999     3600705 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();  // Row 1 accumulator.
    3000     3600607 :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();  // Row 2 accumulator.
    3001     3600648 :     XMMReg4Double v_acc3 = XMMReg4Double::Zero();  // Row 3 accumulator.
    3002     3600616 :     int i = 0;  // Declared outside the loops: the scalar tail continues from it.
    3003     6419107 :     for (; i + 3 < nSrcPixelCount; i += 4)  // Vector loop: 4 samples of each row per iteration.
    3004             :     {
    3005             :         // Load 4 samples of each row and the 4 matching weights, then accumulate.
    3006     2818480 :         const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
    3007     2818480 :         const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
    3008     2818480 :         const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
    3009     2818480 :         const XMMReg4Double v_weight =  // One weight load shared by the three rows.
    3010     2818480 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    3011             : 
    3012     2818480 :         v_acc1 += v_pixels1 * v_weight;
    3013     2818480 :         v_acc2 += v_pixels2 * v_weight;
    3014     2818480 :         v_acc3 += v_pixels3 * v_weight;
    3015             :     }
    3016             : 
    3017     3600627 :     dfRes1 = v_acc1.GetHorizSum();  // Reduce each row's accumulator to a scalar.
    3018     3600574 :     dfRes2 = v_acc2.GetHorizSum();
    3019     3600605 :     dfRes3 = v_acc3.GetHorizSum();
    3020             : 
    3021     7983110 :     for (; i < nSrcPixelCount; ++i)  // Scalar tail: the remaining 0 to 3 samples.
    3022             :     {
    3023     4382532 :         dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
    3024     4382532 :         dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
    3025     4382532 :         dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
    3026             :     }
    3027     3600578 : }
    3028             : 
    3029             : /************************************************************************/
    3030             : /*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>    */
    3031             : /************************************************************************/
    3032             : 
    3033             : template <>  // GByte specialization, inside the USE_SSE2 section: routes to the 4-samples-per-step kernel.
    3034     3533580 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
    3035             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    3036             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    3037             :     double &dfRes2, double &dfRes3)  // dfRes1..dfRes3 are outputs written by the callee.
    3038             : {
    3039     3533580 :     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(  // All arguments passed through unchanged.
    3040             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3041             :         dfRes1, dfRes2, dfRes3);
    3042     3533720 : }
    3043             : 
    3044             : template <>  // GUInt16 specialization, inside the USE_SSE2 section: routes to the 4-samples-per-step kernel.
    3045       66920 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
    3046             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    3047             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
    3048             :     int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)  // dfRes1..dfRes3 are outputs.
    3049             : {
    3050       66920 :     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(  // All arguments passed through unchanged.
    3051             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3052             :         dfRes1, dfRes2, dfRes3);
    3053       67044 : }
    3054             : 
    3055             : /************************************************************************/
    3056             : /*     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T>       */
    3057             : /************************************************************************/
    3058             : /* Horizontal convolution of exactly four source pixels, applied to three
                     :  * rows at once with a single shared set of four weights.
                     :  *
                     :  * For each row K in 1..3 the result written to dfResK is the horizontal
                     :  * sum of (four pixels loaded from pChunkRowK) * (four weights loaded
                     :  * from padfWeightsAligned).
                     :  *
                     :  *   T                   pixel type; the visible specializations
                     :  *                       instantiate this with GByte and GUInt16
                     :  *   pChunkRow1..3       each must have at least 4 readable elements
                     :  *   padfWeightsAligned  at least 4 doubles; read with Load4ValAligned,
                     :  *                       so presumably it must satisfy the alignment
                     :  *                       that XMMReg4Double requires - confirm there
                     :  *   dfRes1..3           outputs, overwritten unconditionally
                     :  *
                     :  * XMMReg4Double is declared outside this excerpt; the description of
                     :  * Load4Val / operator* / GetHorizSum above is inferred from their names
                     :  * and usage here.
                     :  */
    3059             : template <class T>
    3060    13860200 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    3061             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    3062             :     const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
    3063             :     double &dfRes3)
    3064             : {
    3065    13860200 :     const XMMReg4Double v_weight =
    3066             :         XMMReg4Double::Load4ValAligned(padfWeightsAligned);
    3067             : 
    3068             :     // Load four source pixels from each of the three rows.
    3069    13864860 :     const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
    3070    13901430 :     const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
    3071    13882600 :     const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
    3072             : // Weight each row with the same four coefficients.
    3073    13876000 :     XMMReg4Double v_acc1 = v_pixels1 * v_weight;
    3074    13818110 :     XMMReg4Double v_acc2 = v_pixels2 * v_weight;
    3075    13867390 :     XMMReg4Double v_acc3 = v_pixels3 * v_weight;
    3076             : // Collapse each weighted vector to one scalar per row.
    3077    13807110 :     dfRes1 = v_acc1.GetHorizSum();
    3078    13860060 :     dfRes2 = v_acc2.GetHorizSum();
    3079    13873970 :     dfRes3 = v_acc3.GetHorizSum();
    3080    13875300 : }
    3081             : 
    3082             : /************************************************************************/
    3083             : /*       GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>      */
    3084             : /************************************************************************/
    3085             : /* GByte specialization of GDALResampleConvolutionHorizontalPixelCount4_3rows.
                     :  *
                     :  * Forwards every argument unchanged to
                     :  * GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2, which
                     :  * convolves four pixels of each of the three rows with the four
                     :  * weights in padfWeightsAligned and stores one result per row in
                     :  * dfRes1..3.
                     :  *
                     :  * The call site in GDALResampleChunk_ConvolutionT takes this path only
                     :  * when there is no nodata mask and nSrcPixelCount == 4.
                     :  */
    3086             : template <>
    3087     8262320 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
    3088             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    3089             :     const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
    3090             :     double &dfRes3)
    3091             : {
    3092     8262320 :     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    3093             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
    3094             :         dfRes3);
    3095     8253720 : }
    3096             : /* GUInt16 specialization of
                     :  * GDALResampleConvolutionHorizontalPixelCount4_3rows.
                     :  *
                     :  * Same contract as the GByte specialization, only the pixel type
                     :  * differs: all arguments are forwarded unchanged to
                     :  * GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2 and the three
                     :  * per-row results come back through dfRes1, dfRes2 and dfRes3.
                     :  */
    3097             : template <>
    3098     5601960 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
    3099             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    3100             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
    3101             :     double &dfRes2, double &dfRes3)
    3102             : {
    3103     5601960 :     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    3104             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
    3105             :         dfRes3);
    3106     5626860 : }
    3107             : 
    3108             : #endif  // USE_SSE2
    3109             : 
    3110             : /************************************************************************/
    3111             : /*                    GDALResampleChunk_Convolution()                   */
    3112             : /************************************************************************/
    3113             : 
    3114             : template <class T, class Twork, GDALDataType eWrkDataType>
    3115        4470 : static CPLErr GDALResampleChunk_ConvolutionT(
    3116             :     const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
    3117             :     FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
    3118             :     int nKernelRadius, bool bKernelWithNegativeWeights, float fMaxVal)
    3119             : 
    3120             : {
    3121        4470 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    3122        4470 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    3123        4470 :     const double dfSrcXDelta = args.dfSrcXDelta;
    3124        4470 :     const double dfSrcYDelta = args.dfSrcYDelta;
    3125        4470 :     constexpr int nBands = 1;
    3126        4470 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    3127        4470 :     const int nChunkXOff = args.nChunkXOff;
    3128        4470 :     const int nChunkXSize = args.nChunkXSize;
    3129        4470 :     const int nChunkYOff = args.nChunkYOff;
    3130        4470 :     const int nChunkYSize = args.nChunkYSize;
    3131        4470 :     const int nDstXOff = args.nDstXOff;
    3132        4470 :     const int nDstXOff2 = args.nDstXOff2;
    3133        4470 :     const int nDstYOff = args.nDstYOff;
    3134        4470 :     const int nDstYOff2 = args.nDstYOff2;
    3135        4470 :     const bool bHasNoData = args.bHasNoData;
    3136        4470 :     double dfNoDataValue = args.dfNoDataValue;
    3137             : 
    3138        4470 :     if (!bHasNoData)
    3139        4387 :         dfNoDataValue = 0.0;
    3140        4470 :     const auto dstDataType = args.eOvrDataType;
    3141        4470 :     const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
    3142        4467 :     const double dfReplacementVal =
    3143          75 :         bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
    3144             :                    : dfNoDataValue;
    3145             :     // cppcheck-suppress unreadVariable
    3146        4467 :     const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
    3147        4455 :     const bool bNoDataValueInt64Valid =
    3148        4463 :         isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
    3149        4455 :     const auto nNodataValueInt64 =
    3150             :         bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
    3151        4455 :     constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
    3152             : 
    3153             :     // TODO: we should have some generic function to do this.
    3154        4455 :     Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
    3155        4455 :     Twork fDstMax = cpl::NumericLimits<Twork>::max();
    3156        4455 :     if (dstDataType == GDT_Byte)
    3157             :     {
    3158        3733 :         fDstMin = std::numeric_limits<GByte>::min();
    3159        3730 :         fDstMax = std::numeric_limits<GByte>::max();
    3160             :     }
    3161         725 :     else if (dstDataType == GDT_Int8)
    3162             :     {
    3163           1 :         fDstMin = std::numeric_limits<GInt8>::min();
    3164           1 :         fDstMax = std::numeric_limits<GInt8>::max();
    3165             :     }
    3166         724 :     else if (dstDataType == GDT_UInt16)
    3167             :     {
    3168         388 :         fDstMin = std::numeric_limits<GUInt16>::min();
    3169         385 :         fDstMax = std::numeric_limits<GUInt16>::max();
    3170             :     }
    3171         341 :     else if (dstDataType == GDT_Int16)
    3172             :     {
    3173         291 :         fDstMin = std::numeric_limits<GInt16>::min();
    3174         291 :         fDstMax = std::numeric_limits<GInt16>::max();
    3175             :     }
    3176          50 :     else if (dstDataType == GDT_UInt32)
    3177             :     {
    3178           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
    3179           1 :         fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
    3180             :     }
    3181          49 :     else if (dstDataType == GDT_Int32)
    3182             :     {
    3183             :         // cppcheck-suppress unreadVariable
    3184           2 :         fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
    3185             :         // cppcheck-suppress unreadVariable
    3186           2 :         fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
    3187             :     }
    3188          47 :     else if (dstDataType == GDT_UInt64)
    3189             :     {
    3190             :         // cppcheck-suppress unreadVariable
    3191           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
    3192             :         // cppcheck-suppress unreadVariable
    3193           1 :         fDstMax = static_cast<Twork>(std::numeric_limits<uint64_t>::max());
    3194             :     }
    3195          46 :     else if (dstDataType == GDT_Int64)
    3196             :     {
    3197             :         // cppcheck-suppress unreadVariable
    3198           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
    3199             :         // cppcheck-suppress unreadVariable
    3200           1 :         fDstMax = static_cast<Twork>(std::numeric_limits<int64_t>::max());
    3201             :     }
    3202             : 
    3203    37021372 :     auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
    3204             :                                bNoDataValueInt64Valid, nNodataValueInt64,
    3205             :                                dfNoDataValue, dfReplacementVal](Twork fVal)
    3206             :     {
    3207    16036800 :         if (!bHasNoData)
    3208    11838800 :             return fVal;
    3209             : 
    3210             :         // Clamp value before comparing to nodata: this is only needed for
    3211             :         // kernels with negative weights (Lanczos)
    3212     4197970 :         Twork fClamped = fVal;
    3213     4197970 :         if (fClamped < fDstMin)
    3214       15998 :             fClamped = fDstMin;
    3215     4181970 :         else if (fClamped > fDstMax)
    3216       16406 :             fClamped = fDstMax;
    3217     4197970 :         if (isIntegerDT)
    3218             :         {
    3219     8381680 :             if (bNoDataValueInt64Valid &&
    3220     4197870 :                 nNodataValueInt64 == static_cast<GInt64>(std::round(fClamped)))
    3221             :             {
    3222             :                 // Do not use the nodata value
    3223       14435 :                 return static_cast<Twork>(dfReplacementVal);
    3224             :             }
    3225             :         }
    3226        6165 :         else if (dfNoDataValue == fClamped)
    3227             :         {
    3228             :             // Do not use the nodata value
    3229           1 :             return static_cast<Twork>(dfReplacementVal);
    3230             :         }
    3231     4175550 :         return fClamped;
    3232             :     };
    3233             : 
    3234             :     /* -------------------------------------------------------------------- */
    3235             :     /*      Allocate work buffers.                                          */
    3236             :     /* -------------------------------------------------------------------- */
    3237        4465 :     const int nDstXSize = nDstXOff2 - nDstXOff;
    3238        4465 :     Twork *pafWrkScanline = nullptr;
    3239        4465 :     if (dstDataType != eWrkDataType)
    3240             :     {
    3241             :         pafWrkScanline =
    3242        4420 :             static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
    3243        4423 :         if (pafWrkScanline == nullptr)
    3244           0 :             return CE_Failure;
    3245             :     }
    3246             : 
    3247        4468 :     const double dfXScale = 1.0 / dfXRatioDstToSrc;
    3248        4468 :     const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
    3249        4468 :     const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
    3250        4468 :     const double dfYScale = 1.0 / dfYRatioDstToSrc;
    3251        4468 :     const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
    3252        4468 :     const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
    3253             : 
    3254             :     // Temporary array to store result of horizontal filter.
    3255             :     double *padfHorizontalFiltered = static_cast<double *>(
    3256        4468 :         VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
    3257             : 
    3258             :     // To store convolution coefficients.
    3259        4471 :     double *padfWeights = static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
    3260             :         static_cast<int>(2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) +
    3261             :                          0.5) *
    3262             :         sizeof(double)));
    3263             : 
    3264        4467 :     GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
    3265        4467 :     if (pabyChunkNodataMask)
    3266             :         pabyChunkNodataMaskHorizontalFiltered =
    3267         462 :             static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
    3268        4467 :     if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
    3269         462 :         (pabyChunkNodataMask != nullptr &&
    3270             :          pabyChunkNodataMaskHorizontalFiltered == nullptr))
    3271             :     {
    3272           4 :         VSIFree(pafWrkScanline);
    3273           0 :         VSIFree(padfHorizontalFiltered);
    3274           0 :         VSIFreeAligned(padfWeights);
    3275           0 :         VSIFree(pabyChunkNodataMaskHorizontalFiltered);
    3276           0 :         return CE_Failure;
    3277             :     }
    3278             : 
    3279             :     /* ==================================================================== */
    3280             :     /*      First pass: horizontal filter                                   */
    3281             :     /* ==================================================================== */
    3282        4464 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    3283             : #ifdef USE_SSE2
    3284        4464 :     bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
    3285             : #endif
    3286     2919188 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    3287             :     {
    3288     2914710 :         const double dfSrcPixel =
    3289     2914710 :             (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
    3290     2914710 :         int nSrcPixelStart =
    3291     2914710 :             static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
    3292     2914710 :         if (nSrcPixelStart < nChunkXOff)
    3293       56693 :             nSrcPixelStart = nChunkXOff;
    3294     2914710 :         int nSrcPixelStop =
    3295     2914710 :             static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
    3296     2914710 :         if (nSrcPixelStop > nChunkRightXOff)
    3297       56714 :             nSrcPixelStop = nChunkRightXOff;
    3298             : #if 0
    3299             :         if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
    3300             :         {
    3301             :             printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
    3302             :         }
    3303             :         if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
    3304             :         {
    3305             :             printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
    3306             :         }
    3307             : #endif
    3308     2914710 :         const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
    3309     2914710 :         double dfWeightSum = 0.0;
    3310             : 
    3311             :         // Compute convolution coefficients.
    3312     2914710 :         int nSrcPixel = nSrcPixelStart;
    3313     2914710 :         double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
    3314     4057496 :         for (; nSrcPixel + 3 < nSrcPixelStop; nSrcPixel += 4)
    3315             :         {
    3316     1142685 :             padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
    3317     1142685 :             dfX += dfXScaleWeight;
    3318     1142685 :             padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
    3319     1142685 :             dfX += dfXScaleWeight;
    3320     1142685 :             padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
    3321     1142685 :             dfX += dfXScaleWeight;
    3322     1142685 :             padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
    3323     1142685 :             dfX += dfXScaleWeight;
    3324     1142780 :             dfWeightSum +=
    3325     1142685 :                 pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
    3326             :         }
    3327     6902815 :         for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
    3328             :         {
    3329     3988570 :             const double dfWeight = pfnFilterFunc(dfX);
    3330     3988011 :             padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
    3331     3988011 :             dfWeightSum += dfWeight;
    3332             :         }
    3333             : 
    3334     2914245 :         const int nHeight = nChunkYSize * nBands;
    3335     2914245 :         if (pabyChunkNodataMask == nullptr)
    3336             :         {
    3337     2826730 :             if (dfWeightSum != 0)
    3338             :             {
    3339     2826748 :                 const double dfInvWeightSum = 1.0 / dfWeightSum;
    3340    10735547 :                 for (int i = 0; i < nSrcPixelCount; ++i)
    3341     7908785 :                     padfWeights[i] *= dfInvWeightSum;
    3342             :             }
    3343     2826730 :             int iSrcLineOff = 0;
    3344             : #ifdef USE_SSE2
    3345     2826730 :             if (nSrcPixelCount == 4)
    3346             :             {
    3347    15737504 :                 for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
    3348             :                 {
    3349    15113176 :                     const GPtrDiff_t j =
    3350    15113176 :                         static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
    3351    15113176 :                         (nSrcPixelStart - nChunkXOff);
    3352    15113176 :                     double dfVal1 = 0.0;
    3353    15113176 :                     double dfVal2 = 0.0;
    3354    15113176 :                     double dfVal3 = 0.0;
    3355    15113176 :                     GDALResampleConvolutionHorizontalPixelCount4_3rows(
    3356    15113176 :                         pChunk + j, pChunk + j + nChunkXSize,
    3357    15113176 :                         pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
    3358             :                         dfVal2, dfVal3);
    3359    15124056 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3360    15124056 :                                                nDstXSize +
    3361    15124056 :                                            iDstPixel - nDstXOff] = dfVal1;
    3362    15124056 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3363    15124056 :                                             1) *
    3364    15124056 :                                                nDstXSize +
    3365    15124056 :                                            iDstPixel - nDstXOff] = dfVal2;
    3366    15124056 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3367    15124056 :                                             2) *
    3368    15124056 :                                                nDstXSize +
    3369    15124056 :                                            iDstPixel - nDstXOff] = dfVal3;
    3370             :                 }
    3371             :             }
    3372     2213285 :             else if (bSrcPixelCountLess8)
    3373             :             {
    3374     5663461 :                 for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
    3375             :                 {
    3376     3619333 :                     const GPtrDiff_t j =
    3377     3619333 :                         static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
    3378     3619333 :                         (nSrcPixelStart - nChunkXOff);
    3379     3619333 :                     double dfVal1 = 0.0;
    3380     3619333 :                     double dfVal2 = 0.0;
    3381     3619333 :                     double dfVal3 = 0.0;
    3382     3619333 :                     GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
    3383     3619333 :                         pChunk + j, pChunk + j + nChunkXSize,
    3384     3619333 :                         pChunk + j + 2 * nChunkXSize, padfWeights,
    3385             :                         nSrcPixelCount, dfVal1, dfVal2, dfVal3);
    3386     3619575 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3387     3619575 :                                                nDstXSize +
    3388     3619575 :                                            iDstPixel - nDstXOff] = dfVal1;
    3389     3619575 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3390     3619575 :                                             1) *
    3391     3619575 :                                                nDstXSize +
    3392     3619575 :                                            iDstPixel - nDstXOff] = dfVal2;
    3393     3619575 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3394     3619575 :                                             2) *
    3395     3619575 :                                                nDstXSize +
    3396     3619575 :                                            iDstPixel - nDstXOff] = dfVal3;
    3397             :                 }
    3398             :             }
    3399             :             else
    3400             : #endif
    3401             :             {
    3402    17218239 :                 for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
    3403             :                 {
    3404    17042830 :                     const GPtrDiff_t j =
    3405    17042830 :                         static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
    3406    17042830 :                         (nSrcPixelStart - nChunkXOff);
    3407    17042830 :                     double dfVal1 = 0.0;
    3408    17042830 :                     double dfVal2 = 0.0;
    3409    17042830 :                     double dfVal3 = 0.0;
    3410    17042830 :                     GDALResampleConvolutionHorizontal_3rows(
    3411    17042830 :                         pChunk + j, pChunk + j + nChunkXSize,
    3412    17042830 :                         pChunk + j + 2 * nChunkXSize, padfWeights,
    3413             :                         nSrcPixelCount, dfVal1, dfVal2, dfVal3);
    3414    17048930 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3415    17048930 :                                                nDstXSize +
    3416    17048930 :                                            iDstPixel - nDstXOff] = dfVal1;
    3417    17048930 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3418    17048930 :                                             1) *
    3419    17048930 :                                                nDstXSize +
    3420    17048930 :                                            iDstPixel - nDstXOff] = dfVal2;
    3421    17048930 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3422    17048930 :                                             2) *
    3423    17048930 :                                                nDstXSize +
    3424    17048930 :                                            iDstPixel - nDstXOff] = dfVal3;
    3425             :                 }
    3426             :             }
    3427     5876499 :             for (; iSrcLineOff < nHeight; ++iSrcLineOff)
    3428             :             {
    3429     3032466 :                 const GPtrDiff_t j =
    3430     3032466 :                     static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
    3431     3032466 :                     (nSrcPixelStart - nChunkXOff);
    3432     6020156 :                 const double dfVal = GDALResampleConvolutionHorizontal(
    3433     3032466 :                     pChunk + j, padfWeights, nSrcPixelCount);
    3434     3032579 :                 padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3435     3032579 :                                            nDstXSize +
    3436     3032579 :                                        iDstPixel - nDstXOff] = dfVal;
    3437             :             }
    3438             :         }
    3439             :         else
    3440             :         {
    3441    20497568 :             for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
    3442             :             {
    3443    20412846 :                 const GPtrDiff_t j =
    3444    20412846 :                     static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
    3445    20412846 :                     (nSrcPixelStart - nChunkXOff);
    3446             : 
    3447    20412846 :                 if (bKernelWithNegativeWeights)
    3448             :                 {
    3449    19899712 :                     int nConsecutiveValid = 0;
    3450    19899712 :                     int nMaxConsecutiveValid = 0;
    3451   181965458 :                     for (int k = 0; k < nSrcPixelCount; k++)
    3452             :                     {
    3453   162065146 :                         if (pabyChunkNodataMask[j + k])
    3454    48858253 :                             nConsecutiveValid++;
    3455   113206793 :                         else if (nConsecutiveValid)
    3456             :                         {
    3457      108870 :                             nMaxConsecutiveValid = std::max(
    3458      107790 :                                 nMaxConsecutiveValid, nConsecutiveValid);
    3459      108870 :                             nConsecutiveValid = 0;
    3460             :                         }
    3461             :                     }
    3462    19889412 :                     nMaxConsecutiveValid =
    3463    19900812 :                         std::max(nMaxConsecutiveValid, nConsecutiveValid);
    3464    19889412 :                     if (nMaxConsecutiveValid < nSrcPixelCount / 2)
    3465             :                     {
    3466    13314907 :                         const size_t nTempOffset =
    3467    13314907 :                             static_cast<size_t>(iSrcLineOff) * nDstXSize +
    3468    13314907 :                             iDstPixel - nDstXOff;
    3469    13314907 :                         padfHorizontalFiltered[nTempOffset] = 0.0;
    3470    13314907 :                         pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
    3471    13314907 :                         continue;
    3472             :                     }
    3473             :                 }
    3474             : 
    3475     7087729 :                 double dfVal = 0.0;
    3476     7087729 :                 GDALResampleConvolutionHorizontalWithMask(
    3477     7087729 :                     pChunk + j, pabyChunkNodataMask + j, padfWeights,
    3478             :                     nSrcPixelCount, dfVal, dfWeightSum);
    3479     7095096 :                 const size_t nTempOffset =
    3480     7095096 :                     static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
    3481     7095096 :                     nDstXOff;
    3482     7095096 :                 if (dfWeightSum > 0.0)
    3483             :                 {
    3484     7067156 :                     padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
    3485     7067156 :                     pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
    3486             :                 }
    3487             :                 else
    3488             :                 {
    3489       28002 :                     padfHorizontalFiltered[nTempOffset] = 0.0;
    3490       28002 :                     pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
    3491             :                 }
    3492             :             }
    3493             :         }
    3494             :     }
    3495             : 
    3496             :     /* ==================================================================== */
    3497             :     /*      Second pass: vertical filter                                    */
    3498             :     /* ==================================================================== */
    3499        4473 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    3500             : 
    3501      266292 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    3502             :     {
    3503      261819 :         Twork *const pafDstScanline =
    3504      261819 :             pafWrkScanline ? pafWrkScanline
    3505        8421 :                            : static_cast<Twork *>(pDstBuffer) +
    3506        8421 :                                  (iDstLine - nDstYOff) * nDstXSize;
    3507             : 
    3508      261819 :         const double dfSrcLine =
    3509      261819 :             (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
    3510      261819 :         int nSrcLineStart =
    3511      261819 :             static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
    3512      261819 :         int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
    3513      261819 :         if (nSrcLineStart < nChunkYOff)
    3514        2815 :             nSrcLineStart = nChunkYOff;
    3515      261819 :         if (nSrcLineStop > nChunkBottomYOff)
    3516        2859 :             nSrcLineStop = nChunkBottomYOff;
    3517             : #if 0
    3518             :         if( nSrcLineStart < nChunkYOff &&
    3519             :             nChunkYOff > 0 )
    3520             :         {
    3521             :             printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
    3522             :         }
    3523             :         if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
    3524             :         {
    3525             :             printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
    3526             :         }
    3527             : #endif
    3528      261819 :         const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
    3529      261819 :         double dfWeightSum = 0.0;
    3530             : 
    3531             :         // Compute convolution coefficients.
    3532      261819 :         int nSrcLine = nSrcLineStart;  // Used after for.
    3533      261819 :         double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
    3534      616063 :         for (; nSrcLine + 3 < nSrcLineStop;
    3535      354244 :              nSrcLine += 4, dfY += 4 * dfYScaleWeight)
    3536             :         {
    3537      354243 :             padfWeights[nSrcLine - nSrcLineStart] = dfY;
    3538      354243 :             padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
    3539      354243 :             padfWeights[nSrcLine + 2 - nSrcLineStart] =
    3540      354243 :                 dfY + 2 * dfYScaleWeight;
    3541      354243 :             padfWeights[nSrcLine + 3 - nSrcLineStart] =
    3542      354243 :                 dfY + 3 * dfYScaleWeight;
    3543      354244 :             dfWeightSum +=
    3544      354243 :                 pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
    3545             :         }
    3546      297506 :         for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
    3547             :         {
    3548       35694 :             const double dfWeight = pfnFilterFunc(dfY);
    3549       35686 :             padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
    3550       35686 :             dfWeightSum += dfWeight;
    3551             :         }
    3552             : 
    3553      261812 :         if (pabyChunkNodataMask == nullptr)
    3554             :         {
    3555      222820 :             if (dfWeightSum != 0)
    3556             :             {
    3557      222803 :                 const double dfInvWeightSum = 1.0 / dfWeightSum;
    3558     1402834 :                 for (int i = 0; i < nSrcLineCount; ++i)
    3559     1180031 :                     padfWeights[i] *= dfInvWeightSum;
    3560             :             }
    3561             :         }
    3562             : 
    3563      261812 :         if (pabyChunkNodataMask == nullptr)
    3564             :         {
    3565      222804 :             int iFilteredPixelOff = 0;  // Used after for.
    3566             :             // j used after for.
    3567      222804 :             size_t j =
    3568      222804 :                 (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
    3569             : #ifdef USE_SSE2
    3570             :             if constexpr (eWrkDataType == GDT_Float32)
    3571             :             {
    3572             : #ifdef __AVX__
    3573             :                 for (; iFilteredPixelOff + 15 < nDstXSize;
    3574             :                      iFilteredPixelOff += 16, j += 16)
    3575             :                 {
    3576             :                     GDALResampleConvolutionVertical_16cols(
    3577             :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3578             :                         nSrcLineCount, pafDstScanline + iFilteredPixelOff);
    3579             :                     if (bHasNoData)
    3580             :                     {
    3581             :                         for (int k = 0; k < 16; k++)
    3582             :                         {
    3583             :                             pafDstScanline[iFilteredPixelOff + k] =
    3584             :                                 replaceValIfNodata(
    3585             :                                     pafDstScanline[iFilteredPixelOff + k]);
    3586             :                         }
    3587             :                     }
    3588             :                 }
    3589             : #else
    3590    21591058 :                 for (; iFilteredPixelOff + 7 < nDstXSize;
    3591             :                      iFilteredPixelOff += 8, j += 8)
    3592             :                 {
    3593    21418218 :                     GDALResampleConvolutionVertical_8cols(
    3594    21418218 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3595    21418218 :                         nSrcLineCount, pafDstScanline + iFilteredPixelOff);
    3596    21375448 :                     if (bHasNoData)
    3597             :                     {
    3598      123192 :                         for (int k = 0; k < 8; k++)
    3599             :                         {
    3600      109504 :                             pafDstScanline[iFilteredPixelOff + k] =
    3601      109504 :                                 replaceValIfNodata(
    3602      109504 :                                     pafDstScanline[iFilteredPixelOff + k]);
    3603             :                         }
    3604             :                     }
    3605             :                 }
    3606             : #endif
    3607             : 
    3608      638007 :                 for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
    3609             :                 {
    3610      465244 :                     const Twork fVal =
    3611      465187 :                         static_cast<Twork>(GDALResampleConvolutionVertical(
    3612      465187 :                             padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3613             :                             nSrcLineCount));
    3614      465180 :                     pafDstScanline[iFilteredPixelOff] =
    3615      465244 :                         replaceValIfNodata(fVal);
    3616             :                 }
    3617             :             }
    3618             :             else
    3619             : #endif
    3620             :             {
    3621     2887210 :                 for (; iFilteredPixelOff + 1 < nDstXSize;
    3622             :                      iFilteredPixelOff += 2, j += 2)
    3623             :                 {
    3624     2880000 :                     double dfVal1 = 0.0;
    3625     2880000 :                     double dfVal2 = 0.0;
    3626     2880000 :                     GDALResampleConvolutionVertical_2cols(
    3627     2880000 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3628             :                         nSrcLineCount, dfVal1, dfVal2);
    3629     5760010 :                     pafDstScanline[iFilteredPixelOff] =
    3630     2880000 :                         replaceValIfNodata(static_cast<Twork>(dfVal1));
    3631     2880000 :                     pafDstScanline[iFilteredPixelOff + 1] =
    3632     2880000 :                         replaceValIfNodata(static_cast<Twork>(dfVal2));
    3633             :                 }
    3634        7206 :                 if (iFilteredPixelOff < nDstXSize)
    3635             :                 {
    3636           2 :                     const double dfVal = GDALResampleConvolutionVertical(
    3637           2 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3638             :                         nSrcLineCount);
    3639           2 :                     pafDstScanline[iFilteredPixelOff] =
    3640           2 :                         replaceValIfNodata(static_cast<Twork>(dfVal));
    3641             :                 }
    3642             :             }
    3643             :         }
    3644             :         else
    3645             :         {
    3646    18980239 :             for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
    3647             :                  ++iFilteredPixelOff)
    3648             :             {
    3649    18969033 :                 double dfVal = 0.0;
    3650    18969033 :                 dfWeightSum = 0.0;
    3651    18969033 :                 size_t j = (nSrcLineStart - nChunkYOff) *
    3652    18969033 :                                static_cast<size_t>(nDstXSize) +
    3653    18969033 :                            iFilteredPixelOff;
    3654    18969033 :                 if (bKernelWithNegativeWeights)
    3655             :                 {
    3656    18722001 :                     int nConsecutiveValid = 0;
    3657    18722001 :                     int nMaxConsecutiveValid = 0;
    3658   132756321 :                     for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
    3659             :                     {
    3660   114033020 :                         const double dfWeight =
    3661   114033020 :                             padfWeights[i] *
    3662             :                             pabyChunkNodataMaskHorizontalFiltered[j];
    3663   114033020 :                         if (pabyChunkNodataMaskHorizontalFiltered[j])
    3664             :                         {
    3665    48323137 :                             nConsecutiveValid++;
    3666             :                         }
    3667    65709483 :                         else if (nConsecutiveValid)
    3668             :                         {
    3669      205685 :                             nMaxConsecutiveValid = std::max(
    3670      204376 :                                 nMaxConsecutiveValid, nConsecutiveValid);
    3671      205685 :                             nConsecutiveValid = 0;
    3672             :                         }
    3673   114034020 :                         dfVal += padfHorizontalFiltered[j] * dfWeight;
    3674   114034020 :                         dfWeightSum += dfWeight;
    3675             :                     }
    3676    18693601 :                     nMaxConsecutiveValid =
    3677    18723301 :                         std::max(nMaxConsecutiveValid, nConsecutiveValid);
    3678    18693601 :                     if (nMaxConsecutiveValid < nSrcLineCount / 2)
    3679             :                     {
    3680     9246271 :                         pafDstScanline[iFilteredPixelOff] =
    3681     9246179 :                             static_cast<Twork>(dfNoDataValue);
    3682     9246271 :                         continue;
    3683             :                     }
    3684             :                 }
    3685             :                 else
    3686             :                 {
    3687     1240572 :                     for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
    3688             :                     {
    3689      993504 :                         const double dfWeight =
    3690      993504 :                             padfWeights[i] *
    3691             :                             pabyChunkNodataMaskHorizontalFiltered[j];
    3692      993504 :                         dfVal += padfHorizontalFiltered[j] * dfWeight;
    3693      993504 :                         dfWeightSum += dfWeight;
    3694             :                     }
    3695             :                 }
    3696     9694362 :                 if (dfWeightSum > 0.0)
    3697             :                 {
    3698     9705461 :                     pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
    3699     9704899 :                         static_cast<Twork>(dfVal / dfWeightSum));
    3700             :                 }
    3701             :                 else
    3702             :                 {
    3703          41 :                     pafDstScanline[iFilteredPixelOff] =
    3704          17 :                         static_cast<Twork>(dfNoDataValue);
    3705             :                 }
    3706             :             }
    3707             :         }
    3708             : 
    3709      191178 :         if (fMaxVal != 0.0f)
    3710             :         {
    3711      192324 :             for (int i = 0; i < nDstXSize; ++i)
    3712             :             {
    3713      192088 :                 if (pafDstScanline[i] > fMaxVal)
    3714       96022 :                     pafDstScanline[i] = fMaxVal;
    3715             :             }
    3716             :         }
    3717             : 
    3718      191178 :         if (pafWrkScanline)
    3719             :         {
    3720      253382 :             GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
    3721             :                             static_cast<GByte *>(pDstBuffer) +
    3722      253382 :                                 static_cast<size_t>(iDstLine - nDstYOff) *
    3723      253382 :                                     nDstXSize * nDstDataTypeSize,
    3724             :                             dstDataType, nDstDataTypeSize, nDstXSize);
    3725             :         }
    3726             :     }
    3727             : 
    3728        4473 :     VSIFree(pafWrkScanline);
    3729        4473 :     VSIFreeAligned(padfWeights);
    3730        4473 :     VSIFree(padfHorizontalFiltered);
    3731        4473 :     VSIFree(pabyChunkNodataMaskHorizontalFiltered);
    3732             : 
    3733        4473 :     return CE_None;
    3734             : }
    3735             : 
    3736             : static CPLErr
    3737        4473 : GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
    3738             :                               const void *pChunk, void **ppDstBuffer,
    3739             :                               GDALDataType *peDstBufferDataType)
    3740             : {
    3741             :     GDALResampleAlg eResample;
    3742        4473 :     bool bKernelWithNegativeWeights = false;
    3743        4473 :     if (EQUAL(args.pszResampling, "BILINEAR"))
    3744        2628 :         eResample = GRA_Bilinear;
    3745        1845 :     else if (EQUAL(args.pszResampling, "CUBIC"))
    3746             :     {
    3747        1761 :         eResample = GRA_Cubic;
    3748        1761 :         bKernelWithNegativeWeights = true;
    3749             :     }
    3750          84 :     else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
    3751          23 :         eResample = GRA_CubicSpline;
    3752          61 :     else if (EQUAL(args.pszResampling, "LANCZOS"))
    3753             :     {
    3754          54 :         eResample = GRA_Lanczos;
    3755          54 :         bKernelWithNegativeWeights = true;
    3756             :     }
    3757             :     else
    3758             :     {
    3759           7 :         CPLAssert(false);
    3760             :         return CE_Failure;
    3761             :     }
    3762        4466 :     const int nKernelRadius = GWKGetFilterRadius(eResample);
    3763        4469 :     FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
    3764             :     const FilterFunc4ValuesType pfnFilterFunc4Values =
    3765        4470 :         GWKGetFilterFunc4Values(eResample);
    3766             : 
    3767        4466 :     float fMaxVal = 0.f;
    3768             :     // Cubic, etc... can have overshoots, so make sure we clamp values to the
    3769             :     // maximum value if NBITS is set.
    3770        4466 :     if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
    3771           8 :         (args.eOvrDataType == GDT_Byte || args.eOvrDataType == GDT_UInt16 ||
    3772           0 :          args.eOvrDataType == GDT_UInt32))
    3773             :     {
    3774           8 :         int nBits = args.nOvrNBITS;
    3775           8 :         if (nBits == GDALGetDataTypeSize(args.eOvrDataType))
    3776           1 :             nBits = 0;
    3777           8 :         if (nBits > 0 && nBits < 32)
    3778           7 :             fMaxVal = static_cast<float>((1U << nBits) - 1);
    3779             :     }
    3780             : 
    3781        4466 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(
    3782             :         args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
    3783             :         GDALGetDataTypeSizeBytes(args.eOvrDataType));
    3784        4468 :     if (*ppDstBuffer == nullptr)
    3785             :     {
    3786           0 :         return CE_Failure;
    3787             :     }
    3788        4468 :     *peDstBufferDataType = args.eOvrDataType;
    3789             : 
    3790        4468 :     switch (args.eWrkDataType)
    3791             :     {
    3792        3732 :         case GDT_Byte:
    3793             :         {
    3794        3732 :             return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32>(
    3795             :                 args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
    3796             :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
    3797        3735 :                 bKernelWithNegativeWeights, fMaxVal);
    3798             :         }
    3799             : 
    3800         394 :         case GDT_UInt16:
    3801             :         {
    3802         394 :             return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32>(
    3803             :                 args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
    3804             :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
    3805         396 :                 bKernelWithNegativeWeights, fMaxVal);
    3806             :         }
    3807             : 
    3808         313 :         case GDT_Float32:
    3809             :         {
    3810         313 :             return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32>(
    3811             :                 args, static_cast<const float *>(pChunk), *ppDstBuffer,
    3812             :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
    3813         313 :                 bKernelWithNegativeWeights, fMaxVal);
    3814             :         }
    3815             : 
    3816          29 :         case GDT_Float64:
    3817             :         {
    3818          29 :             return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64>(
    3819             :                 args, static_cast<const double *>(pChunk), *ppDstBuffer,
    3820             :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
    3821          29 :                 bKernelWithNegativeWeights, fMaxVal);
    3822             :         }
    3823             : 
    3824           0 :         default:
    3825           0 :             break;
    3826             :     }
    3827             : 
    3828           0 :     CPLAssert(false);
    3829             :     return CE_Failure;
    3830             : }
    3831             : 
    3832             : /************************************************************************/
    3833             : /*                       GDALResampleChunkC32R()                        */
    3834             : /************************************************************************/
    3835             : 
    3836           2 : static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
    3837             :                                     const float *pafChunk, const int nChunkYOff,
    3838             :                                     const int nChunkYSize, const int nDstYOff,
    3839             :                                     const int nDstYOff2, const int nOvrXSize,
    3840             :                                     const int nOvrYSize, void **ppDstBuffer,
    3841             :                                     GDALDataType *peDstBufferDataType,
    3842             :                                     const char *pszResampling)
    3843             : 
    3844             : {
    3845             :     enum Method
    3846             :     {
    3847             :         NEAR,
    3848             :         AVERAGE,
    3849             :         AVERAGE_MAGPHASE,
    3850             :         RMS,
    3851             :     };
    3852             : 
    3853           2 :     Method eMethod = NEAR;
    3854           2 :     if (STARTS_WITH_CI(pszResampling, "NEAR"))
    3855             :     {
    3856           0 :         eMethod = NEAR;
    3857             :     }
    3858           2 :     else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
    3859             :     {
    3860           0 :         eMethod = AVERAGE_MAGPHASE;
    3861             :     }
    3862           2 :     else if (EQUAL(pszResampling, "RMS"))
    3863             :     {
    3864           2 :         eMethod = RMS;
    3865             :     }
    3866           0 :     else if (STARTS_WITH_CI(pszResampling, "AVER"))
    3867             :     {
    3868           0 :         eMethod = AVERAGE;
    3869             :     }
    3870             :     else
    3871             :     {
    3872           0 :         CPLError(
    3873             :             CE_Failure, CPLE_NotSupported,
    3874             :             "Resampling method %s is not supported for complex data types. "
    3875             :             "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
    3876             :             pszResampling);
    3877           0 :         return CE_Failure;
    3878             :     }
    3879             : 
    3880           2 :     const int nOXSize = nOvrXSize;
    3881           2 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
    3882             :                                        GDALGetDataTypeSizeBytes(GDT_CFloat32));
    3883           2 :     if (*ppDstBuffer == nullptr)
    3884             :     {
    3885           0 :         return CE_Failure;
    3886             :     }
    3887           2 :     float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
    3888           2 :     *peDstBufferDataType = GDT_CFloat32;
    3889             : 
    3890           2 :     const int nOYSize = nOvrYSize;
    3891           2 :     const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
    3892           2 :     const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
    3893             : 
    3894             :     /* ==================================================================== */
    3895             :     /*      Loop over destination scanlines.                                */
    3896             :     /* ==================================================================== */
    3897           8 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    3898             :     {
    3899           6 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
    3900           6 :         if (nSrcYOff < nChunkYOff)
    3901           0 :             nSrcYOff = nChunkYOff;
    3902             : 
    3903           6 :         int nSrcYOff2 =
    3904           6 :             static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
    3905           6 :         if (nSrcYOff2 == nSrcYOff)
    3906           0 :             nSrcYOff2++;
    3907             : 
    3908           6 :         if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
    3909             :         {
    3910           2 :             if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
    3911           0 :                 nSrcYOff = nSrcHeight - 1;
    3912           2 :             nSrcYOff2 = nSrcHeight;
    3913             :         }
    3914           6 :         if (nSrcYOff2 > nChunkYOff + nChunkYSize)
    3915           0 :             nSrcYOff2 = nChunkYOff + nChunkYSize;
    3916             : 
    3917           6 :         const float *const pafSrcScanline =
    3918           6 :             pafChunk + ((nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
    3919           6 :         float *const pafDstScanline =
    3920           6 :             pafDstBuffer + (iDstLine - nDstYOff) * 2 * nOXSize;
    3921             : 
    3922             :         /* --------------------------------------------------------------------
    3923             :          */
    3924             :         /*      Loop over destination pixels */
    3925             :         /* --------------------------------------------------------------------
    3926             :          */
    3927          18 :         for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
    3928             :         {
    3929          12 :             int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
    3930          12 :             int nSrcXOff2 =
    3931          12 :                 static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
    3932          12 :             if (nSrcXOff2 == nSrcXOff)
    3933           0 :                 nSrcXOff2++;
    3934          12 :             if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
    3935             :             {
    3936           6 :                 if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
    3937           0 :                     nSrcXOff = nSrcWidth - 1;
    3938           6 :                 nSrcXOff2 = nSrcWidth;
    3939             :             }
    3940             : 
    3941          12 :             if (eMethod == NEAR)
    3942             :             {
    3943           0 :                 pafDstScanline[iDstPixel * 2] = pafSrcScanline[nSrcXOff * 2];
    3944           0 :                 pafDstScanline[iDstPixel * 2 + 1] =
    3945           0 :                     pafSrcScanline[nSrcXOff * 2 + 1];
    3946             :             }
    3947          12 :             else if (eMethod == AVERAGE_MAGPHASE)
    3948             :             {
    3949           0 :                 double dfTotalR = 0.0;
    3950           0 :                 double dfTotalI = 0.0;
    3951           0 :                 double dfTotalM = 0.0;
    3952           0 :                 int nCount = 0;
    3953             : 
    3954           0 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    3955             :                 {
    3956           0 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    3957             :                     {
    3958           0 :                         const double dfR =
    3959           0 :                             pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
    3960           0 :                                                         iY - nSrcYOff) *
    3961           0 :                                                         nSrcWidth * 2];
    3962           0 :                         const double dfI =
    3963           0 :                             pafSrcScanline[iX * 2 +
    3964           0 :                                            static_cast<GPtrDiff_t>(iY -
    3965           0 :                                                                    nSrcYOff) *
    3966           0 :                                                nSrcWidth * 2 +
    3967           0 :                                            1];
    3968           0 :                         dfTotalR += dfR;
    3969           0 :                         dfTotalI += dfI;
    3970           0 :                         dfTotalM += std::hypot(dfR, dfI);
    3971           0 :                         ++nCount;
    3972             :                     }
    3973             :                 }
    3974             : 
    3975           0 :                 CPLAssert(nCount > 0);
    3976           0 :                 if (nCount == 0)
    3977             :                 {
    3978           0 :                     pafDstScanline[iDstPixel * 2] = 0.0;
    3979           0 :                     pafDstScanline[iDstPixel * 2 + 1] = 0.0;
    3980             :                 }
    3981             :                 else
    3982             :                 {
    3983           0 :                     pafDstScanline[iDstPixel * 2] =
    3984           0 :                         static_cast<float>(dfTotalR / nCount);
    3985           0 :                     pafDstScanline[iDstPixel * 2 + 1] =
    3986           0 :                         static_cast<float>(dfTotalI / nCount);
    3987             :                     const double dfM =
    3988           0 :                         std::hypot(pafDstScanline[iDstPixel * 2],
    3989           0 :                                    pafDstScanline[iDstPixel * 2 + 1]);
    3990           0 :                     const double dfDesiredM = dfTotalM / nCount;
    3991           0 :                     double dfRatio = 1.0;
    3992           0 :                     if (dfM != 0.0)
    3993           0 :                         dfRatio = dfDesiredM / dfM;
    3994             : 
    3995           0 :                     pafDstScanline[iDstPixel * 2] *=
    3996           0 :                         static_cast<float>(dfRatio);
    3997           0 :                     pafDstScanline[iDstPixel * 2 + 1] *=
    3998           0 :                         static_cast<float>(dfRatio);
    3999             :                 }
    4000             :             }
    4001          12 :             else if (eMethod == RMS)
    4002             :             {
    4003          12 :                 double dfTotalR = 0.0;
    4004          12 :                 double dfTotalI = 0.0;
    4005          12 :                 int nCount = 0;
    4006             : 
    4007          36 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    4008             :                 {
    4009          72 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    4010             :                     {
    4011          48 :                         const double dfR =
    4012          48 :                             pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
    4013          48 :                                                         iY - nSrcYOff) *
    4014          48 :                                                         nSrcWidth * 2];
    4015          48 :                         const double dfI =
    4016          48 :                             pafSrcScanline[iX * 2 +
    4017          48 :                                            static_cast<GPtrDiff_t>(iY -
    4018          48 :                                                                    nSrcYOff) *
    4019          48 :                                                nSrcWidth * 2 +
    4020          48 :                                            1];
    4021             : 
    4022          48 :                         dfTotalR += SQUARE(dfR);
    4023          48 :                         dfTotalI += SQUARE(dfI);
    4024             : 
    4025          48 :                         ++nCount;
    4026             :                     }
    4027             :                 }
    4028             : 
    4029          12 :                 CPLAssert(nCount > 0);
    4030          12 :                 if (nCount == 0)
    4031             :                 {
    4032           0 :                     pafDstScanline[iDstPixel * 2] = 0.0;
    4033           0 :                     pafDstScanline[iDstPixel * 2 + 1] = 0.0;
    4034             :                 }
    4035             :                 else
    4036             :                 {
    4037             :                     /* compute RMS */
    4038          12 :                     pafDstScanline[iDstPixel * 2] =
    4039          12 :                         static_cast<float>(sqrt(dfTotalR / nCount));
    4040          12 :                     pafDstScanline[iDstPixel * 2 + 1] =
    4041          12 :                         static_cast<float>(sqrt(dfTotalI / nCount));
    4042             :                 }
    4043             :             }
    4044           0 :             else if (eMethod == AVERAGE)
    4045             :             {
    4046           0 :                 double dfTotalR = 0.0;
    4047           0 :                 double dfTotalI = 0.0;
    4048           0 :                 int nCount = 0;
    4049             : 
    4050           0 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    4051             :                 {
    4052           0 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    4053             :                     {
    4054             :                         // TODO(schwehr): Maybe use std::complex?
    4055           0 :                         dfTotalR +=
    4056           0 :                             pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
    4057           0 :                                                         iY - nSrcYOff) *
    4058           0 :                                                         nSrcWidth * 2];
    4059           0 :                         dfTotalI += pafSrcScanline[iX * 2 +
    4060           0 :                                                    static_cast<GPtrDiff_t>(
    4061           0 :                                                        iY - nSrcYOff) *
    4062           0 :                                                        nSrcWidth * 2 +
    4063           0 :                                                    1];
    4064           0 :                         ++nCount;
    4065             :                     }
    4066             :                 }
    4067             : 
    4068           0 :                 CPLAssert(nCount > 0);
    4069           0 :                 if (nCount == 0)
    4070             :                 {
    4071           0 :                     pafDstScanline[iDstPixel * 2] = 0.0;
    4072           0 :                     pafDstScanline[iDstPixel * 2 + 1] = 0.0;
    4073             :                 }
    4074             :                 else
    4075             :                 {
    4076           0 :                     pafDstScanline[iDstPixel * 2] =
    4077           0 :                         static_cast<float>(dfTotalR / nCount);
    4078           0 :                     pafDstScanline[iDstPixel * 2 + 1] =
    4079           0 :                         static_cast<float>(dfTotalI / nCount);
    4080             :                 }
    4081             :             }
    4082             :         }
    4083             :     }
    4084             : 
    4085           2 :     return CE_None;
    4086             : }
    4087             : 
    4088             : /************************************************************************/
    4089             : /*                  GDALRegenerateCascadingOverviews()                  */
    4090             : /*                                                                      */
    4091             : /*      Generate a list of overviews in order from largest to           */
    4092             : /*      smallest, computing each from the next larger.                  */
    4093             : /************************************************************************/
    4094             : 
    4095          44 : static CPLErr GDALRegenerateCascadingOverviews(
    4096             :     GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
    4097             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    4098             :     void *pProgressData, CSLConstList papszOptions)
    4099             : 
    4100             : {
    4101             :     /* -------------------------------------------------------------------- */
    4102             :     /*      First, we must put the overviews in order from largest to       */
    4103             :     /*      smallest.                                                       */
    4104             :     /* -------------------------------------------------------------------- */
    4105         127 :     for (int i = 0; i < nOverviews - 1; ++i)
    4106             :     {
    4107         292 :         for (int j = 0; j < nOverviews - i - 1; ++j)
    4108             :         {
    4109         209 :             if (papoOvrBands[j]->GetXSize() *
    4110         209 :                     static_cast<float>(papoOvrBands[j]->GetYSize()) <
    4111         209 :                 papoOvrBands[j + 1]->GetXSize() *
    4112         209 :                     static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
    4113             :             {
    4114           0 :                 GDALRasterBand *poTempBand = papoOvrBands[j];
    4115           0 :                 papoOvrBands[j] = papoOvrBands[j + 1];
    4116           0 :                 papoOvrBands[j + 1] = poTempBand;
    4117             :             }
    4118             :         }
    4119             :     }
    4120             : 
    4121             :     /* -------------------------------------------------------------------- */
    4122             :     /*      Count total pixels so we can prepare appropriate scaled         */
    4123             :     /*      progress functions.                                             */
    4124             :     /* -------------------------------------------------------------------- */
    4125          44 :     double dfTotalPixels = 0.0;
    4126             : 
    4127         171 :     for (int i = 0; i < nOverviews; ++i)
    4128             :     {
    4129         127 :         dfTotalPixels += papoOvrBands[i]->GetXSize() *
    4130         127 :                          static_cast<double>(papoOvrBands[i]->GetYSize());
    4131             :     }
    4132             : 
    4133             :     /* -------------------------------------------------------------------- */
    4134             :     /*      Generate all the bands.                                         */
    4135             :     /* -------------------------------------------------------------------- */
    4136          44 :     double dfPixelsProcessed = 0.0;
    4137             : 
    4138         171 :     for (int i = 0; i < nOverviews; ++i)
    4139             :     {
    4140         127 :         GDALRasterBand *poBaseBand = poSrcBand;
    4141         127 :         if (i != 0)
    4142          83 :             poBaseBand = papoOvrBands[i - 1];
    4143             : 
    4144         127 :         double dfPixels = papoOvrBands[i]->GetXSize() *
    4145         127 :                           static_cast<double>(papoOvrBands[i]->GetYSize());
    4146             : 
    4147         254 :         void *pScaledProgressData = GDALCreateScaledProgress(
    4148             :             dfPixelsProcessed / dfTotalPixels,
    4149         127 :             (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
    4150             :             pProgressData);
    4151             : 
    4152         254 :         const CPLErr eErr = GDALRegenerateOverviewsEx(
    4153             :             poBaseBand, 1,
    4154         127 :             reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
    4155             :             pszResampling, GDALScaledProgress, pScaledProgressData,
    4156             :             papszOptions);
    4157         127 :         GDALDestroyScaledProgress(pScaledProgressData);
    4158             : 
    4159         127 :         if (eErr != CE_None)
    4160           0 :             return eErr;
    4161             : 
    4162         127 :         dfPixelsProcessed += dfPixels;
    4163             : 
    4164             :         // Only do the bit2grayscale promotion on the base band.
    4165         127 :         if (STARTS_WITH_CI(pszResampling,
    4166             :                            "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
    4167           8 :             pszResampling = "AVERAGE";
    4168             :     }
    4169             : 
    4170          44 :     return CE_None;
    4171             : }
    4172             : 
    4173             : /************************************************************************/
    4174             : /*                    GDALGetResampleFunction()                         */
    4175             : /************************************************************************/
    4176             : 
    4177        4945 : GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
    4178             :                                              int *pnRadius)
    4179             : {
    4180        4945 :     if (pnRadius)
    4181        4944 :         *pnRadius = 0;
    4182        4945 :     if (STARTS_WITH_CI(pszResampling, "NEAR"))
    4183         501 :         return GDALResampleChunk_Near;
    4184        4444 :     else if (STARTS_WITH_CI(pszResampling, "AVER") ||
    4185        3890 :              EQUAL(pszResampling, "RMS"))
    4186         574 :         return GDALResampleChunk_AverageOrRMS;
    4187        3870 :     else if (EQUAL(pszResampling, "GAUSS"))
    4188             :     {
    4189          26 :         if (pnRadius)
    4190          26 :             *pnRadius = 1;
    4191          26 :         return GDALResampleChunk_Gauss;
    4192             :     }
    4193        3844 :     else if (EQUAL(pszResampling, "MODE"))
    4194          96 :         return GDALResampleChunk_Mode;
    4195        3748 :     else if (EQUAL(pszResampling, "CUBIC"))
    4196             :     {
    4197        1341 :         if (pnRadius)
    4198        1338 :             *pnRadius = GWKGetFilterRadius(GRA_Cubic);
    4199        1337 :         return GDALResampleChunk_Convolution;
    4200             :     }
    4201        2407 :     else if (EQUAL(pszResampling, "CUBICSPLINE"))
    4202             :     {
    4203           3 :         if (pnRadius)
    4204           3 :             *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
    4205           3 :         return GDALResampleChunk_Convolution;
    4206             :     }
    4207        2404 :     else if (EQUAL(pszResampling, "LANCZOS"))
    4208             :     {
    4209           8 :         if (pnRadius)
    4210           8 :             *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
    4211           8 :         return GDALResampleChunk_Convolution;
    4212             :     }
    4213        2396 :     else if (EQUAL(pszResampling, "BILINEAR"))
    4214             :     {
    4215        2398 :         if (pnRadius)
    4216        2398 :             *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
    4217        2398 :         return GDALResampleChunk_Convolution;
    4218             :     }
    4219             :     else
    4220             :     {
    4221           0 :         CPLError(
    4222             :             CE_Failure, CPLE_AppDefined,
    4223             :             "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
    4224             :             pszResampling);
    4225           0 :         return nullptr;
    4226             :     }
    4227             : }
    4228             : 
    4229             : /************************************************************************/
    4230             : /*                      GDALGetOvrWorkDataType()                        */
    4231             : /************************************************************************/
    4232             : 
    4233        4819 : GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
    4234             :                                     GDALDataType eSrcDataType)
    4235             : {
    4236        4819 :     if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
    4237             :     {
    4238         595 :         return eSrcDataType;
    4239             :     }
    4240        4224 :     else if (eSrcDataType == GDT_Byte &&
    4241        3890 :              (STARTS_WITH_CI(pszResampling, "AVER") ||
    4242        3422 :               EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
    4243        2274 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4244        2271 :               EQUAL(pszResampling, "LANCZOS") ||
    4245        2266 :               EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
    4246             :     {
    4247        3887 :         return GDT_Byte;
    4248             :     }
    4249         337 :     else if (eSrcDataType == GDT_UInt16 &&
    4250         120 :              (STARTS_WITH_CI(pszResampling, "AVER") ||
    4251         113 :               EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
    4252           3 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4253           3 :               EQUAL(pszResampling, "LANCZOS") ||
    4254           2 :               EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
    4255             :     {
    4256         111 :         return GDT_UInt16;
    4257             :     }
    4258         226 :     else if (EQUAL(pszResampling, "GAUSS"))
    4259          20 :         return GDT_Float64;
    4260             : 
    4261         206 :     if (eSrcDataType == GDT_Byte || eSrcDataType == GDT_Int8 ||
    4262         204 :         eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
    4263             :         eSrcDataType == GDT_Float32)
    4264             :     {
    4265         164 :         return GDT_Float32;
    4266             :     }
    4267          42 :     return GDT_Float64;
    4268             : }
    4269             : 
    4270             : namespace
    4271             : {
    4272             : // Structure to hold a pointer to free with CPLFree()
    4273             : struct PointerHolder
    4274             : {
    4275             :     void *ptr = nullptr;
    4276             : 
    4277        5792 :     explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
    4278             :     {
    4279        5792 :     }
    4280             : 
    4281        5792 :     ~PointerHolder()
    4282        5792 :     {
    4283        5792 :         CPLFree(ptr);
    4284        5792 :     }
    4285             : 
    4286             :     PointerHolder(const PointerHolder &) = delete;
    4287             :     PointerHolder &operator=(const PointerHolder &) = delete;
    4288             : };
    4289             : }  // namespace
    4290             : 
    4291             : /************************************************************************/
    4292             : /*                      GDALRegenerateOverviews()                       */
    4293             : /************************************************************************/
    4294             : 
    4295             : /**
    4296             :  * \brief Generate downsampled overviews.
    4297             :  *
    4298             :  * This function will generate one or more overview images from a base image
    4299             :  * using the requested downsampling algorithm.  Its primary use is for
    4300             :  * generating overviews via GDALDataset::BuildOverviews(), but it can also be
    4301             :  * used to generate downsampled images in one file from another outside the
    4302             :  * overview architecture.
    4303             :  *
    4304             :  * The output bands need to exist in advance.
    4305             :  *
    4306             :  * The full set of resampling algorithms is documented in
    4307             :  * GDALDataset::BuildOverviews().
    4308             :  *
    4309             :  * This function will honour properly NODATA_VALUES tuples (special dataset
    4310             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    4311             :  * considered as the nodata value and not each value of the triplet
    4312             :  * independently per band.
    4313             :  *
    4314             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    4315             :  * to "ALL_CPUS" or a integer value to specify the number of threads to use for
    4316             :  * overview computation.
    4317             :  *
    4318             :  * @param hSrcBand the source (base level) band.
    4319             :  * @param nOverviewCount the number of downsampled bands being generated.
    4320             :  * @param pahOvrBands the list of downsampled bands to be generated.
    4321             :  * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
    4322             :  * @param pfnProgress progress report function.
    4323             :  * @param pProgressData progress function callback data.
    4324             :  * @return CE_None on success or CE_Failure on failure.
    4325             :  */
    4326         250 : CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
    4327             :                                GDALRasterBandH *pahOvrBands,
    4328             :                                const char *pszResampling,
    4329             :                                GDALProgressFunc pfnProgress,
    4330             :                                void *pProgressData)
    4331             : 
    4332             : {
    4333         250 :     return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
    4334             :                                      pszResampling, pfnProgress, pProgressData,
    4335         250 :                                      nullptr);
    4336             : }
    4337             : 
    4338             : /************************************************************************/
    4339             : /*                     GDALRegenerateOverviewsEx()                      */
    4340             : /************************************************************************/
    4341             : 
// Multiplier turning a radius into a diameter. Used by the chunk height
// computation in GDALRegenerateOverviewsEx().
constexpr int RADIUS_TO_DIAMETER = 2;
    4343             : 
    4344             : /**
    4345             :  * \brief Generate downsampled overviews.
    4346             :  *
    4347             :  * This function will generate one or more overview images from a base image
    4348             :  * using the requested downsampling algorithm.  Its primary use is for
    4349             :  * generating overviews via GDALDataset::BuildOverviews(), but it can also be
    4350             :  * used to generate downsampled images in one file from another outside the
    4351             :  * overview architecture.
    4352             :  *
    4353             :  * The output bands need to exist in advance.
    4354             :  *
    4355             :  * The full set of resampling algorithms is documented in
    4356             :  * GDALDataset::BuildOverviews().
    4357             :  *
    4358             :  * This function will honour properly NODATA_VALUES tuples (special dataset
    4359             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    4360             :  * considered as the nodata value and not each value of the triplet
    4361             :  * independently per band.
    4362             :  *
    4363             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    4364             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    4365             :  * overview computation.
    4366             :  *
    4367             :  * @param hSrcBand the source (base level) band.
    4368             :  * @param nOverviewCount the number of downsampled bands being generated.
    4369             :  * @param pahOvrBands the list of downsampled bands to be generated.
    4370             :  * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
    4371             :  * @param pfnProgress progress report function.
    4372             :  * @param pProgressData progress function callback data.
    4373             :  * @param papszOptions NULL terminated list of options as key=value pairs, or
    4374             :  * NULL
    4375             :  * @return CE_None on success or CE_Failure on failure.
    4376             :  * @since GDAL 3.6
    4377             :  */
    4378         902 : CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
    4379             :                                  GDALRasterBandH *pahOvrBands,
    4380             :                                  const char *pszResampling,
    4381             :                                  GDALProgressFunc pfnProgress,
    4382             :                                  void *pProgressData, CSLConstList papszOptions)
    4383             : 
    4384             : {
    4385         902 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    4386         902 :     GDALRasterBand **papoOvrBands =
    4387             :         reinterpret_cast<GDALRasterBand **>(pahOvrBands);
    4388             : 
    4389         902 :     if (pfnProgress == nullptr)
    4390         252 :         pfnProgress = GDALDummyProgress;
    4391             : 
    4392         902 :     if (EQUAL(pszResampling, "NONE"))
    4393          64 :         return CE_None;
    4394             : 
    4395         838 :     int nKernelRadius = 0;
    4396             :     GDALResampleFunction pfnResampleFn =
    4397         838 :         GDALGetResampleFunction(pszResampling, &nKernelRadius);
    4398             : 
    4399         838 :     if (pfnResampleFn == nullptr)
    4400           0 :         return CE_Failure;
    4401             : 
    4402             :     /* -------------------------------------------------------------------- */
    4403             :     /*      Check color tables...                                           */
    4404             :     /* -------------------------------------------------------------------- */
    4405         838 :     GDALColorTable *poColorTable = nullptr;
    4406             : 
    4407         471 :     if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
    4408        1750 :          EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
    4409         452 :         poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
    4410             :     {
    4411           9 :         poColorTable = poSrcBand->GetColorTable();
    4412           9 :         if (poColorTable != nullptr)
    4413             :         {
    4414           9 :             if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
    4415             :             {
    4416           0 :                 CPLError(CE_Warning, CPLE_AppDefined,
    4417             :                          "Computing overviews on palette index raster bands "
    4418             :                          "with a palette whose color interpretation is not RGB "
    4419             :                          "will probably lead to unexpected results.");
    4420           0 :                 poColorTable = nullptr;
    4421             :             }
    4422           9 :             else if (poColorTable->IsIdentity())
    4423             :             {
    4424           0 :                 poColorTable = nullptr;
    4425             :             }
    4426             :         }
    4427             :         else
    4428             :         {
    4429           0 :             CPLError(CE_Warning, CPLE_AppDefined,
    4430             :                      "Computing overviews on palette index raster bands "
    4431             :                      "without a palette will probably lead to unexpected "
    4432             :                      "results.");
    4433             :         }
    4434             :     }
    4435             :     // Not ready yet
    4436        2433 :     else if ((EQUAL(pszResampling, "CUBIC") ||
    4437         775 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4438         775 :               EQUAL(pszResampling, "LANCZOS") ||
    4439        1684 :               EQUAL(pszResampling, "BILINEAR")) &&
    4440          80 :              poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
    4441             :     {
    4442           0 :         CPLError(CE_Warning, CPLE_AppDefined,
    4443             :                  "Computing %s overviews on palette index raster bands "
    4444             :                  "will probably lead to unexpected results.",
    4445             :                  pszResampling);
    4446             :     }
    4447             : 
    4448             :     // If we have a nodata mask and we are doing something more complicated
    4449             :     // than nearest neighbouring, we have to fetch the nodata mask.
    4450             : 
    4451         838 :     GDALRasterBand *poMaskBand = nullptr;
    4452         838 :     bool bUseNoDataMask = false;
    4453         838 :     bool bCanUseCascaded = true;
    4454             : 
    4455         838 :     if (!STARTS_WITH_CI(pszResampling, "NEAR"))
    4456             :     {
    4457             :         // Special case if we are an alpha/mask band. We want it to be
    4458             :         // considered as the mask band to avoid alpha=0 to be taken into account
    4459             :         // in average computation.
    4460         532 :         if (poSrcBand->IsMaskBand())
    4461             :         {
    4462          91 :             poMaskBand = poSrcBand;
    4463          91 :             bUseNoDataMask = true;
    4464             :         }
    4465             :         else
    4466             :         {
    4467         441 :             poMaskBand = poSrcBand->GetMaskBand();
    4468         441 :             const int nMaskFlags = poSrcBand->GetMaskFlags();
    4469         441 :             bCanUseCascaded =
    4470         441 :                 (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
    4471         441 :             bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
    4472             :         }
    4473             :     }
    4474             : 
    4475             :     /* -------------------------------------------------------------------- */
    4476             :     /*      If we are operating on multiple overviews, and using            */
    4477             :     /*      averaging, lets do them in cascading order to reduce the        */
    4478             :     /*      amount of computation.                                          */
    4479             :     /* -------------------------------------------------------------------- */
    4480             : 
    4481             :     // In case the mask may be computed from another band of the dataset,
    4482             :     // we can't use cascaded generation, as the computation of the overviews
    4483             :     // of the band used for the mask band may not have yet occurred (#3033).
    4484         838 :     if ((STARTS_WITH_CI(pszResampling, "AVER") ||
    4485         471 :          EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
    4486         440 :          EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
    4487         386 :          EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
    4488         838 :          EQUAL(pszResampling, "MODE")) &&
    4489          44 :         nOverviewCount > 1 && bCanUseCascaded)
    4490          44 :         return GDALRegenerateCascadingOverviews(
    4491             :             poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
    4492          44 :             pProgressData, papszOptions);
    4493             : 
    4494             :     /* -------------------------------------------------------------------- */
    4495             :     /*      Setup one horizontal swath to read from the raw buffer.         */
    4496             :     /* -------------------------------------------------------------------- */
    4497         794 :     int nFRXBlockSize = 0;
    4498         794 :     int nFRYBlockSize = 0;
    4499         794 :     poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
    4500             : 
    4501         794 :     const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
    4502        1282 :     const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
    4503        1236 :                                        EQUAL(pszResampling, "MODE") ||
    4504         442 :                                        !GDALDataTypeIsComplex(eSrcDataType);
    4505             :     const GDALDataType eWrkDataType =
    4506             :         bUseGenericResampleFn
    4507         794 :             ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
    4508         794 :             : GDT_CFloat32;
    4509             : 
    4510         794 :     const int nWidth = poSrcBand->GetXSize();
    4511         794 :     const int nHeight = poSrcBand->GetYSize();
    4512             : 
    4513         794 :     int nMaxOvrFactor = 1;
    4514        1705 :     for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
    4515             :     {
    4516         911 :         const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
    4517         911 :         const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
    4518         911 :         nMaxOvrFactor = std::max(
    4519             :             nMaxOvrFactor,
    4520         911 :             static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
    4521         911 :         nMaxOvrFactor = std::max(
    4522             :             nMaxOvrFactor,
    4523         911 :             static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
    4524             :     }
    4525             : 
    4526         794 :     int nFullResYChunk = nFRYBlockSize;
    4527         794 :     int nMaxChunkYSizeQueried = 0;
    4528             : 
    4529             :     const auto UpdateChunkHeightAndGetChunkSize =
    4530       10356 :         [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
    4531       83809 :          eWrkDataType, nWidth]()
    4532             :     {
    4533             :         // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
    4534             :         // + nFullResYChunk) / nMaxOvrFactor)
    4535       10356 :         if (nMaxOvrFactor > INT_MAX / RADIUS_TO_DIAMETER)
    4536             :         {
    4537           1 :             return GINTBIG_MAX;
    4538             :         }
    4539       10355 :         nFullResYChunk =
    4540       10355 :             std::max(nFullResYChunk, RADIUS_TO_DIAMETER * nMaxOvrFactor);
    4541       10355 :         if ((nKernelRadius > 0 &&
    4542         970 :              nMaxOvrFactor > INT_MAX / (RADIUS_TO_DIAMETER * nKernelRadius)) ||
    4543       10355 :             nFullResYChunk >
    4544       10355 :                 INT_MAX - RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor)
    4545             :         {
    4546           0 :             return GINTBIG_MAX;
    4547             :         }
    4548       10355 :         nMaxChunkYSizeQueried =
    4549       10355 :             nFullResYChunk + RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor;
    4550       10355 :         if (GDALGetDataTypeSizeBytes(eWrkDataType) >
    4551       10355 :             std::numeric_limits<int64_t>::max() /
    4552       10355 :                 (static_cast<int64_t>(nMaxChunkYSizeQueried) * nWidth))
    4553             :         {
    4554           1 :             return GINTBIG_MAX;
    4555             :         }
    4556       10354 :         return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
    4557       10354 :                nMaxChunkYSizeQueried * nWidth;
    4558         794 :     };
    4559             : 
    4560             :     // Only configurable for debug / testing
    4561             :     const char *pszChunkYSize =
    4562         794 :         CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
    4563         794 :     if (pszChunkYSize)
    4564             :     {
    4565             :         // coverity[tainted_data]
    4566           0 :         nFullResYChunk = atoi(pszChunkYSize);
    4567             :     }
    4568             : 
    4569             :     // Only configurable for debug / testing
    4570             :     const int nChunkMaxSize =
    4571         794 :         atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
    4572             : 
    4573         794 :     auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
    4574         794 :     if (nChunkSize > nChunkMaxSize)
    4575             :     {
    4576          15 :         if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
    4577          44 :             !GDALDataTypeIsComplex(eSrcDataType) &&
    4578          14 :             (!STARTS_WITH_CI(pszResampling, "AVER") ||
    4579           2 :              EQUAL(pszResampling, "AVERAGE")))
    4580             :         {
    4581             :             // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
    4582             :             // which uses a block-based strategy, which is much less memory
    4583             :             // hungry.
    4584          14 :             return GDALRegenerateOverviewsMultiBand(
    4585             :                 1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
    4586          14 :                 pfnProgress, pProgressData, papszOptions);
    4587             :         }
    4588           1 :         else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
    4589             :         {
    4590           0 :             return GDALRegenerateCascadingOverviews(
    4591             :                 poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
    4592           0 :                 pfnProgress, pProgressData, papszOptions);
    4593             :         }
    4594             :     }
    4595         779 :     else if (pszChunkYSize == nullptr)
    4596             :     {
    4597             :         // Try to get as close as possible to nChunkMaxSize
    4598       10341 :         while (nChunkSize < nChunkMaxSize / 2)
    4599             :         {
    4600        9562 :             nFullResYChunk *= 2;
    4601        9562 :             nChunkSize = UpdateChunkHeightAndGetChunkSize();
    4602             :         }
    4603             :     }
    4604             : 
    4605         780 :     int nHasNoData = 0;
    4606         780 :     const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
    4607         780 :     const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
    4608             :     const bool bPropagateNoData =
    4609         780 :         CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
    4610             : 
    4611             :     // Structure describing a resampling job: the inputs handed to the
                           // resampling function, the outputs it produces, and the primitives
                           // used to signal completion to the code that writes the result.
    4612             :     struct OvrJob
    4613             :     {
    4614             :         // Buffers to free when job is finished
                               // The two source holders are only assigned (through the setters
                               // below) when the job is submitted to the job queue; the same
                               // holder objects are passed to every job created for a chunk.
                               // oDstBufferHolder is assigned by the job function once the
                               // resampling function has returned.
                               // NOTE(review): PointerHolder is defined outside this view;
                               // presumably it frees the wrapped pointer on destruction - confirm.
    4615             :         std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
    4616             :         std::shared_ptr<PointerHolder> oSrcBufferHolder{};
    4617             :         std::unique_ptr<PointerHolder> oDstBufferHolder{};
    4618             : 
                               // Overview band that receives the resampled rows.
    4619             :         GDALRasterBand *poDstBand = nullptr;
    4620             : 
    4621             :         // Input parameters of pfnResampleFn
                               // bUseGenericResampleFn selects between pfnResampleFn (true) and
                               // GDALResampleChunkC32R (false) in the job function. In the visible
                               // code nSrcWidth / nSrcHeight are only passed to the latter, and
                               // nDstWidth is only used when writing the result.
    4622             :         GDALResampleFunction pfnResampleFn = nullptr;
    4623             :         int nSrcWidth = 0;
    4624             :         int nSrcHeight = 0;
    4625             :         int nDstWidth = 0;
    4626             :         GDALOverviewResampleArgs args{};
    4627             :         const void *pChunk = nullptr;
    4628             :         bool bUseGenericResampleFn = false;
    4629             : 
    4630             :         // Output values of resampling function
                               // eErr stays CE_Failure until the resampling function has
                               // returned and its status has been stored.
    4631             :         CPLErr eErr = CE_Failure;
    4632             :         void *pDstBuffer = nullptr;
    4633             :         GDALDataType eDstBufferDataType = GDT_Unknown;
    4634             : 
    4635             :         // Synchronization
                               // bFinished is set to true by the job function while holding
                               // mutex, after which cv is notified.
    4636             :         bool bFinished = false;
    4637             :         std::mutex mutex{};
    4638             :         std::condition_variable cv{};
    4639             : 
                               // Store a shared reference to the holder of the source mask buffer.
    4640           0 :         void SetSrcMaskBufferHolder(
    4641             :             const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
    4642             :         {
    4643           0 :             oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
    4644           0 :         }
    4645             : 
                               // Store a shared reference to the holder of the source data buffer.
    4646           0 :         void SetSrcBufferHolder(
    4647             :             const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
    4648             :         {
    4649           0 :             oSrcBufferHolder = oSrcBufferHolderIn;
    4650           0 :         }
    4651             :     };
    4652             : 
    4653             :     // Thread function to resample
                           // pData must point to an OvrJob. The function runs the resampling
                           // selected by the job, stores the returned status and output buffer
                           // in the job, and finally flags the job as finished. It is either
                           // called directly or submitted to the job queue.
    4654         880 :     const auto JobResampleFunc = [](void *pData)
    4655             :     {
    4656         880 :         OvrJob *poJob = static_cast<OvrJob *>(pData);
    4657             : 
                               // Generic path: delegate to the resampling function pointer
                               // carried by the job.
    4658         880 :         if (poJob->bUseGenericResampleFn)
    4659             :         {
    4660         878 :             poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
    4661             :                                                &(poJob->pDstBuffer),
    4662             :                                                &(poJob->eDstBufferDataType));
    4663             :         }
    4664             :         else
    4665             :         {
                                   // Otherwise call GDALResampleChunkC32R, with the chunk
                                   // accessed as an array of float.
    4666           2 :             poJob->eErr = GDALResampleChunkC32R(
    4667             :                 poJob->nSrcWidth, poJob->nSrcHeight,
    4668           2 :                 static_cast<const float *>(poJob->pChunk),
    4669             :                 poJob->args.nChunkYOff, poJob->args.nChunkYSize,
    4670             :                 poJob->args.nDstYOff, poJob->args.nDstYOff2,
    4671             :                 poJob->args.nOvrXSize, poJob->args.nOvrYSize,
    4672             :                 &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
    4673             :                 poJob->args.pszResampling);
    4674             :         }
    4675             : 
                               // Wrap the output buffer in a holder owned by the job. This is
                               // done unconditionally, whatever status the resampling returned.
    4676             :         poJob->oDstBufferHolder =
    4677         880 :             std::make_unique<PointerHolder>(poJob->pDstBuffer);
    4678             : 
                               // Publish completion: the flag is set and the waiter notified
                               // while the job mutex is held.
                               // NOTE(review): notifying under the lock presumably prevents a
                               // waiter from destroying the job (and its cv) before
                               // notify_one() has returned - confirm.
    4679             :         {
    4680        1760 :             std::lock_guard<std::mutex> guard(poJob->mutex);
    4681         880 :             poJob->bFinished = true;
    4682         880 :             poJob->cv.notify_one();
    4683             :         }
    4684         880 :     };
    4685             : 
    4686             :     // Function to write resample data to target band
                           // Writes the job's output buffer to poDstBand with RasterIO(GF_Write).
                           // The window starts at column 0, row nDstYOff, is nDstWidth wide and
                           // (nDstYOff2 - nDstYOff) rows high; the buffer dimensions passed are
                           // identical to the window dimensions. Returns the RasterIO() status.
    4687         880 :     const auto WriteJobData = [](const OvrJob *poJob)
    4688             :     {
    4689        1760 :         return poJob->poDstBand->RasterIO(
    4690         880 :             GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
    4691         880 :             poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
    4692         880 :             poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
    4693         880 :             poJob->eDstBufferDataType, 0, 0, nullptr);
    4694             :     };
    4695             : 
    4696             :     // Wait for completion of oldest job and serialize it
                           // Blocks until the job at the front of jobList is flagged as finished,
                           // writes its output to the target band if resampling succeeded, then
                           // removes the job from the list. Returns the resampling error if there
                           // was one, otherwise the status of the write.
                           // jobList must not be empty: front() is called unconditionally.
    4697             :     const auto WaitAndFinalizeOldestJob =
    4698           0 :         [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
    4699             :     {
    4700           0 :         auto poOldestJob = jobList.front().get();
    4701             :         {
    4702           0 :             std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
    4703             :             // coverity[missing_lock:FALSE]
                                   // The flag is re-checked after every wake-up, so a spurious
                                   // wake-up simply goes back to waiting.
    4704           0 :             while (!poOldestJob->bFinished)
    4705             :             {
    4706           0 :                 poOldestJob->cv.wait(oGuard);
    4707             :             }
    4708             :         }
                               // The job mutex is released at this point; the job outputs are
                               // read after bFinished has been observed as true.
    4709           0 :         CPLErr l_eErr = poOldestJob->eErr;
    4710           0 :         if (l_eErr == CE_None)
    4711             :         {
    4712           0 :             l_eErr = WriteJobData(poOldestJob);
    4713             :         }
    4714             : 
                               // Removing the list entry destroys the job, since the list
                               // holds it through a std::unique_ptr.
    4715           0 :         jobList.pop_front();
    4716           0 :         return l_eErr;
    4717             :     };
    4718             : 
    4719             :     // Queue of jobs
    4720        1560 :     std::list<std::unique_ptr<OvrJob>> jobList;
    4721             : 
    4722         780 :     GByte *pabyChunkNodataMask = nullptr;
    4723         780 :     void *pChunk = nullptr;
    4724             : 
    4725         780 :     const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
    4726        3120 :     const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
    4727         780 :                                                        ? CPLGetNumCPUs()
    4728         780 :                                                        : atoi(pszThreads)));
    4729             :     auto poThreadPool =
    4730         780 :         nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
    4731             :     auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
    4732        1560 :                                    : std::unique_ptr<CPLJobQueue>(nullptr);
    4733             : 
    4734             :     /* -------------------------------------------------------------------- */
    4735             :     /*      Loop over image operating on chunks.                            */
    4736             :     /* -------------------------------------------------------------------- */
    4737         780 :     int nChunkYOff = 0;
    4738         780 :     CPLErr eErr = CE_None;
    4739             : 
    4740        1565 :     for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
    4741         785 :          nChunkYOff += nFullResYChunk)
    4742             :     {
    4743         785 :         if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
    4744             :                          pProgressData))
    4745             :         {
    4746           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    4747           0 :             eErr = CE_Failure;
    4748             :         }
    4749             : 
    4750         785 :         if (nFullResYChunk + nChunkYOff > nHeight)
    4751         778 :             nFullResYChunk = nHeight - nChunkYOff;
    4752             : 
    4753         785 :         int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
    4754         785 :         int nChunkYSizeQueried =
    4755         785 :             nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
    4756         785 :         if (nChunkYOffQueried < 0)
    4757             :         {
    4758          83 :             nChunkYSizeQueried += nChunkYOffQueried;
    4759          83 :             nChunkYOffQueried = 0;
    4760             :         }
    4761         785 :         if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
    4762          83 :             nChunkYSizeQueried = nHeight - nChunkYOffQueried;
    4763             : 
    4764             :         // Avoid accumulating too many tasks and exhaust RAM
    4765             :         // Try to complete already finished jobs
    4766         785 :         while (eErr == CE_None && !jobList.empty())
    4767             :         {
    4768           0 :             auto poOldestJob = jobList.front().get();
    4769             :             {
    4770           0 :                 std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
    4771           0 :                 if (!poOldestJob->bFinished)
    4772             :                 {
    4773           0 :                     break;
    4774             :                 }
    4775             :             }
    4776           0 :             eErr = poOldestJob->eErr;
    4777           0 :             if (eErr == CE_None)
    4778             :             {
    4779           0 :                 eErr = WriteJobData(poOldestJob);
    4780             :             }
    4781             : 
    4782           0 :             jobList.pop_front();
    4783             :         }
    4784             : 
    4785             :         // And in case we have saturated the number of threads,
    4786             :         // wait for completion of tasks to go below the threshold.
    4787        1570 :         while (eErr == CE_None &&
    4788         785 :                jobList.size() >= static_cast<size_t>(nThreads))
    4789             :         {
    4790           0 :             eErr = WaitAndFinalizeOldestJob(jobList);
    4791             :         }
    4792             : 
    4793             :         // (Re)allocate buffers if needed
    4794         785 :         if (pChunk == nullptr)
    4795             :         {
    4796         780 :             pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
    4797             :                                          nMaxChunkYSizeQueried, nWidth);
    4798             :         }
    4799         785 :         if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
    4800             :         {
    4801             :             pabyChunkNodataMask = static_cast<GByte *>(
    4802         283 :                 VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
    4803             :         }
    4804             : 
    4805         785 :         if (pChunk == nullptr ||
    4806         283 :             (bUseNoDataMask && pabyChunkNodataMask == nullptr))
    4807             :         {
    4808           0 :             CPLFree(pChunk);
    4809           0 :             CPLFree(pabyChunkNodataMask);
    4810           0 :             return CE_Failure;
    4811             :         }
    4812             : 
    4813             :         // Read chunk.
    4814         785 :         if (eErr == CE_None)
    4815         785 :             eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
    4816             :                                        nChunkYSizeQueried, pChunk, nWidth,
    4817             :                                        nChunkYSizeQueried, eWrkDataType, 0, 0,
    4818             :                                        nullptr);
    4819         785 :         if (eErr == CE_None && bUseNoDataMask)
    4820         283 :             eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
    4821             :                                         nChunkYSizeQueried, pabyChunkNodataMask,
    4822             :                                         nWidth, nChunkYSizeQueried, GDT_Byte, 0,
    4823             :                                         0, nullptr);
    4824             : 
    4825             :         // Special case to promote 1bit data to 8bit 0/255 values.
    4826         785 :         if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
    4827             :         {
    4828           9 :             if (eWrkDataType == GDT_Float32)
    4829             :             {
    4830           0 :                 float *pafChunk = static_cast<float *>(pChunk);
    4831           0 :                 for (GPtrDiff_t i = 0;
    4832           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4833             :                      i++)
    4834             :                 {
    4835           0 :                     if (pafChunk[i] == 1.0)
    4836           0 :                         pafChunk[i] = 255.0;
    4837             :                 }
    4838             :             }
    4839           9 :             else if (eWrkDataType == GDT_Byte)
    4840             :             {
    4841           9 :                 GByte *pabyChunk = static_cast<GByte *>(pChunk);
    4842      168417 :                 for (GPtrDiff_t i = 0;
    4843      168417 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4844             :                      i++)
    4845             :                 {
    4846      168408 :                     if (pabyChunk[i] == 1)
    4847      127437 :                         pabyChunk[i] = 255;
    4848             :                 }
    4849             :             }
    4850           0 :             else if (eWrkDataType == GDT_UInt16)
    4851             :             {
    4852           0 :                 GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
    4853           0 :                 for (GPtrDiff_t i = 0;
    4854           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4855             :                      i++)
    4856             :                 {
    4857           0 :                     if (pasChunk[i] == 1)
    4858           0 :                         pasChunk[i] = 255;
    4859             :                 }
    4860             :             }
    4861           0 :             else if (eWrkDataType == GDT_Float64)
    4862             :             {
    4863           0 :                 double *padfChunk = static_cast<double *>(pChunk);
    4864           0 :                 for (GPtrDiff_t i = 0;
    4865           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4866             :                      i++)
    4867             :                 {
    4868           0 :                     if (padfChunk[i] == 1.0)
    4869           0 :                         padfChunk[i] = 255.0;
    4870             :                 }
    4871             :             }
    4872             :             else
    4873             :             {
    4874           0 :                 CPLAssert(false);
    4875             :             }
    4876             :         }
    4877         776 :         else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
    4878             :         {
    4879           0 :             if (eWrkDataType == GDT_Float32)
    4880             :             {
    4881           0 :                 float *pafChunk = static_cast<float *>(pChunk);
    4882           0 :                 for (GPtrDiff_t i = 0;
    4883           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4884             :                      i++)
    4885             :                 {
    4886           0 :                     if (pafChunk[i] == 1.0)
    4887           0 :                         pafChunk[i] = 0.0;
    4888           0 :                     else if (pafChunk[i] == 0.0)
    4889           0 :                         pafChunk[i] = 255.0;
    4890             :                 }
    4891             :             }
    4892           0 :             else if (eWrkDataType == GDT_Byte)
    4893             :             {
    4894           0 :                 GByte *pabyChunk = static_cast<GByte *>(pChunk);
    4895           0 :                 for (GPtrDiff_t i = 0;
    4896           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4897             :                      i++)
    4898             :                 {
    4899           0 :                     if (pabyChunk[i] == 1)
    4900           0 :                         pabyChunk[i] = 0;
    4901           0 :                     else if (pabyChunk[i] == 0)
    4902           0 :                         pabyChunk[i] = 255;
    4903             :                 }
    4904             :             }
    4905           0 :             else if (eWrkDataType == GDT_UInt16)
    4906             :             {
    4907           0 :                 GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
    4908           0 :                 for (GPtrDiff_t i = 0;
    4909           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4910             :                      i++)
    4911             :                 {
    4912           0 :                     if (pasChunk[i] == 1)
    4913           0 :                         pasChunk[i] = 0;
    4914           0 :                     else if (pasChunk[i] == 0)
    4915           0 :                         pasChunk[i] = 255;
    4916             :                 }
    4917             :             }
    4918           0 :             else if (eWrkDataType == GDT_Float64)
    4919             :             {
    4920           0 :                 double *padfChunk = static_cast<double *>(pChunk);
    4921           0 :                 for (GPtrDiff_t i = 0;
    4922           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4923             :                      i++)
    4924             :                 {
    4925           0 :                     if (padfChunk[i] == 1.0)
    4926           0 :                         padfChunk[i] = 0.0;
    4927           0 :                     else if (padfChunk[i] == 0.0)
    4928           0 :                         padfChunk[i] = 255.0;
    4929             :                 }
    4930             :             }
    4931             :             else
    4932             :             {
    4933           0 :                 CPLAssert(false);
    4934             :             }
    4935             :         }
    4936             : 
    4937             :         auto oSrcBufferHolder =
    4938        1570 :             std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
    4939             :         auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
    4940        1570 :             poJobQueue ? pabyChunkNodataMask : nullptr);
    4941             : 
    4942        1665 :         for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
    4943             :              ++iOverview)
    4944             :         {
    4945         880 :             GDALRasterBand *poDstBand = papoOvrBands[iOverview];
    4946         880 :             const int nDstWidth = poDstBand->GetXSize();
    4947         880 :             const int nDstHeight = poDstBand->GetYSize();
    4948             : 
    4949         880 :             const double dfXRatioDstToSrc =
    4950         880 :                 static_cast<double>(nWidth) / nDstWidth;
    4951         880 :             const double dfYRatioDstToSrc =
    4952         880 :                 static_cast<double>(nHeight) / nDstHeight;
    4953             : 
    4954             :             /* --------------------------------------------------------------------
    4955             :              */
    4956             :             /*      Figure out the line to start writing to, and the first line
    4957             :              */
    4958             :             /*      to not write to.  In theory this approach should ensure that
    4959             :              */
    4960             :             /*      every output line will be written if all input chunks are */
    4961             :             /*      processed. */
    4962             :             /* --------------------------------------------------------------------
    4963             :              */
    4964         880 :             int nDstYOff =
    4965         880 :                 static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
    4966         880 :             if (nDstYOff == nDstHeight)
    4967           0 :                 continue;
    4968         880 :             int nDstYOff2 = static_cast<int>(
    4969         880 :                 0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
    4970             : 
    4971         880 :             if (nChunkYOff + nFullResYChunk == nHeight)
    4972         873 :                 nDstYOff2 = nDstHeight;
    4973             : #if DEBUG_VERBOSE
    4974             :             CPLDebug("GDAL",
    4975             :                      "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
    4976             :                      nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
    4977             :                      nDstWidth, nDstYOff2 - nDstYOff);
    4978             : #endif
    4979             : 
    4980        1760 :             auto poJob = std::make_unique<OvrJob>();
    4981         880 :             poJob->pfnResampleFn = pfnResampleFn;
    4982         880 :             poJob->bUseGenericResampleFn = bUseGenericResampleFn;
    4983         880 :             poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
    4984         880 :             poJob->args.nOvrXSize = poDstBand->GetXSize();
    4985         880 :             poJob->args.nOvrYSize = poDstBand->GetYSize();
    4986             :             const char *pszNBITS =
    4987         880 :                 poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    4988         880 :             poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    4989         880 :             poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    4990         880 :             poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    4991         880 :             poJob->args.eWrkDataType = eWrkDataType;
    4992         880 :             poJob->pChunk = pChunk;
    4993         880 :             poJob->args.pabyChunkNodataMask = pabyChunkNodataMask;
    4994         880 :             poJob->nSrcWidth = nWidth;
    4995         880 :             poJob->nSrcHeight = nHeight;
    4996         880 :             poJob->args.nChunkXOff = 0;
    4997         880 :             poJob->args.nChunkXSize = nWidth;
    4998         880 :             poJob->args.nChunkYOff = nChunkYOffQueried;
    4999         880 :             poJob->args.nChunkYSize = nChunkYSizeQueried;
    5000         880 :             poJob->nDstWidth = nDstWidth;
    5001         880 :             poJob->args.nDstXOff = 0;
    5002         880 :             poJob->args.nDstXOff2 = nDstWidth;
    5003         880 :             poJob->args.nDstYOff = nDstYOff;
    5004         880 :             poJob->args.nDstYOff2 = nDstYOff2;
    5005         880 :             poJob->poDstBand = poDstBand;
    5006         880 :             poJob->args.pszResampling = pszResampling;
    5007         880 :             poJob->args.bHasNoData = bHasNoData;
    5008         880 :             poJob->args.dfNoDataValue = dfNoDataValue;
    5009         880 :             poJob->args.poColorTable = poColorTable;
    5010         880 :             poJob->args.eSrcDataType = eSrcDataType;
    5011         880 :             poJob->args.bPropagateNoData = bPropagateNoData;
    5012             : 
    5013         880 :             if (poJobQueue)
    5014             :             {
    5015           0 :                 poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
    5016           0 :                 poJob->SetSrcBufferHolder(oSrcBufferHolder);
    5017           0 :                 poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
    5018           0 :                 jobList.emplace_back(std::move(poJob));
    5019             :             }
    5020             :             else
    5021             :             {
    5022         880 :                 JobResampleFunc(poJob.get());
    5023         880 :                 eErr = poJob->eErr;
    5024         880 :                 if (eErr == CE_None)
    5025             :                 {
    5026         880 :                     eErr = WriteJobData(poJob.get());
    5027             :                 }
    5028             :             }
    5029             :         }
    5030             : 
    5031         785 :         if (poJobQueue)
    5032             :         {
    5033           0 :             pChunk = nullptr;
    5034           0 :             pabyChunkNodataMask = nullptr;
    5035             :         }
    5036             :     }
    5037             : 
    5038         780 :     VSIFree(pChunk);
    5039         780 :     VSIFree(pabyChunkNodataMask);
    5040             : 
    5041             :     // Wait for all pending jobs to complete
    5042         780 :     while (!jobList.empty())
    5043             :     {
    5044           0 :         const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
    5045           0 :         if (l_eErr != CE_None && eErr == CE_None)
    5046           0 :             eErr = l_eErr;
    5047             :     }
    5048             : 
    5049             :     /* -------------------------------------------------------------------- */
    5050             :     /*      Renormalized overview mean / stddev if needed.                  */
    5051             :     /* -------------------------------------------------------------------- */
    5052         780 :     if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
    5053             :     {
    5054           0 :         GDALOverviewMagnitudeCorrection(
    5055             :             poSrcBand, nOverviewCount,
    5056             :             reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
    5057             :             GDALDummyProgress, nullptr);
    5058             :     }
    5059             : 
    5060             :     /* -------------------------------------------------------------------- */
    5061             :     /*      It can be important to flush out data to overviews.             */
    5062             :     /* -------------------------------------------------------------------- */
    5063        1653 :     for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
    5064             :          ++iOverview)
    5065             :     {
    5066         873 :         eErr = papoOvrBands[iOverview]->FlushCache(false);
    5067             :     }
    5068             : 
    5069         780 :     if (eErr == CE_None)
    5070         780 :         pfnProgress(1.0, nullptr, pProgressData);
    5071             : 
    5072         780 :     return eErr;
    5073             : }
    5074             : 
    5075             : /************************************************************************/
    5076             : /*            GDALRegenerateOverviewsMultiBand()                        */
    5077             : /************************************************************************/
    5078             : 
    5079             : /**
    5080             :  * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
    5081             :  * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
    5082             :  *
    5083             :  * This function will generate one or more overview images from a base
    5084             :  * image using the requested downsampling algorithm.  Its primary use
    5085             :  * is for generating overviews via GDALDataset::BuildOverviews(), but it
    5086             :  * can also be used to generate downsampled images in one file from another
    5087             :  * outside the overview architecture.
    5088             :  *
    5089             :  * The output bands need to exist in advance and share the same characteristics
    5090             :  * (type, dimensions)
    5091             :  *
    5092             :  * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
    5093             :  * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
    5094             :  *
    5095             :  * It does not support color tables or complex data types.
    5096             :  *
    5097             :  * The pseudo-algorithm used by the function is :
    5098             :  *    for each overview
    5099             :  *       iterate on lines of the source by a step of deltay
    5100             :  *           iterate on columns of the source  by a step of deltax
    5101             :  *               read the source data of size deltax * deltay for all the bands
    5102             :  *               generate the corresponding overview block for all the bands
    5103             :  *
    5104             :  * This function will properly honour NODATA_VALUES tuples (special dataset
    5105             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    5106             :  * considered as the nodata value and not each value of the triplet
    5107             :  * independently per band.
    5108             :  *
    5109             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    5110             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    5111             :  * overview computation.
    5112             :  *
    5113             :  * @param nBands the number of bands, size of papoSrcBands and size of
    5114             :  *               first dimension of papapoOverviewBands
    5115             :  * @param papoSrcBands the list of source bands to downsample
    5116             :  * @param nOverviews the number of downsampled overview levels being generated.
    5117             :  * @param papapoOverviewBands bidimensional array of bands. First dimension is
    5118             :  *                            indexed by nBands. Second dimension is indexed by
    5119             :  *                            nOverviews.
    5120             :  * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
    5121             :  * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
    5122             :  * @param pfnProgress progress report function.
    5123             :  * @param pProgressData progress function callback data.
    5124             :  * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
    5125             :  *                     key=value pairs, or NULL
    5126             :  *                     Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
    5127             :  *                     options can be specified to express that overviews should
    5128             :  *                     be regenerated only in the specified subset of the source
    5129             :  *                     dataset.
    5130             :  * @return CE_None on success or CE_Failure on failure.
    5131             :  */
    5132             : 
    5133         388 : CPLErr GDALRegenerateOverviewsMultiBand(
    5134             :     int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
    5135             :     GDALRasterBand *const *const *papapoOverviewBands,
    5136             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    5137             :     void *pProgressData, CSLConstList papszOptions)
    5138             : {
    5139         388 :     CPL_IGNORE_RET_VAL(papszOptions);
    5140             : 
    5141         388 :     if (pfnProgress == nullptr)
    5142          11 :         pfnProgress = GDALDummyProgress;
    5143             : 
    5144         388 :     if (EQUAL(pszResampling, "NONE"))
    5145           2 :         return CE_None;
    5146             : 
    5147             :     // Sanity checks.
    5148         386 :     if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
    5149         191 :         !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
    5150          84 :         !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
    5151          22 :         !EQUAL(pszResampling, "CUBICSPLINE") &&
    5152          21 :         !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
    5153           5 :         !EQUAL(pszResampling, "MODE"))
    5154             :     {
    5155           0 :         CPLError(CE_Failure, CPLE_NotSupported,
    5156             :                  "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
    5157             :                  "not supported",
    5158             :                  pszResampling);
    5159           0 :         return CE_Failure;
    5160             :     }
    5161             : 
    5162         386 :     int nKernelRadius = 0;
    5163             :     GDALResampleFunction pfnResampleFn =
    5164         386 :         GDALGetResampleFunction(pszResampling, &nKernelRadius);
    5165         386 :     if (pfnResampleFn == nullptr)
    5166           0 :         return CE_Failure;
    5167             : 
    5168         386 :     const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
    5169         386 :     const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
    5170         386 :     if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
    5171           0 :         return CE_None;
    5172         386 :     GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
    5173       66233 :     for (int iBand = 1; iBand < nBands; ++iBand)
    5174             :     {
    5175      131694 :         if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
    5176       65847 :             papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
    5177             :         {
    5178           0 :             CPLError(
    5179             :                 CE_Failure, CPLE_NotSupported,
    5180             :                 "GDALRegenerateOverviewsMultiBand: all the source bands must "
    5181             :                 "have the same dimensions");
    5182           0 :             return CE_Failure;
    5183             :         }
    5184       65847 :         if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
    5185             :         {
    5186           0 :             CPLError(
    5187             :                 CE_Failure, CPLE_NotSupported,
    5188             :                 "GDALRegenerateOverviewsMultiBand: all the source bands must "
    5189             :                 "have the same data type");
    5190           0 :             return CE_Failure;
    5191             :         }
    5192             :     }
    5193             : 
    5194        1032 :     for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
    5195             :     {
    5196         646 :         const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
    5197         646 :         const int nDstWidth = poOvrFirstBand->GetXSize();
    5198         646 :         const int nDstHeight = poOvrFirstBand->GetYSize();
    5199       66759 :         for (int iBand = 1; iBand < nBands; ++iBand)
    5200             :         {
    5201       66113 :             const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
    5202      132226 :             if (poOvrBand->GetXSize() != nDstWidth ||
    5203       66113 :                 poOvrBand->GetYSize() != nDstHeight)
    5204             :             {
    5205           0 :                 CPLError(
    5206             :                     CE_Failure, CPLE_NotSupported,
    5207             :                     "GDALRegenerateOverviewsMultiBand: all the overviews bands "
    5208             :                     "of the same level must have the same dimensions");
    5209           0 :                 return CE_Failure;
    5210             :             }
    5211       66113 :             if (poOvrBand->GetRasterDataType() != eDataType)
    5212             :             {
    5213           0 :                 CPLError(
    5214             :                     CE_Failure, CPLE_NotSupported,
    5215             :                     "GDALRegenerateOverviewsMultiBand: all the overviews bands "
    5216             :                     "must have the same data type as the source bands");
    5217           0 :                 return CE_Failure;
    5218             :             }
    5219             :         }
    5220             :     }
    5221             : 
    5222             :     // First pass to compute the total number of pixels to write.
    5223         386 :     double dfTotalPixelCount = 0;
    5224         386 :     const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
    5225         386 :     const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
    5226         386 :     const int nSrcXSize = atoi(CSLFetchNameValueDef(
    5227             :         papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
    5228         386 :     const int nSrcYSize = atoi(CSLFetchNameValueDef(
    5229             :         papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
    5230        1032 :     for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
    5231             :     {
    5232         646 :         dfTotalPixelCount +=
    5233        1292 :             static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
    5234         646 :             papapoOverviewBands[0][iOverview]->GetXSize() *
    5235        1292 :             static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
    5236         646 :             papapoOverviewBands[0][iOverview]->GetYSize();
    5237             :     }
    5238             : 
    5239             :     const GDALDataType eWrkDataType =
    5240         386 :         GDALGetOvrWorkDataType(pszResampling, eDataType);
    5241         386 :     const int nWrkDataTypeSize = GDALGetDataTypeSizeBytes(eWrkDataType);
    5242             : 
    5243         386 :     const bool bIsMask = papoSrcBands[0]->IsMaskBand();
    5244             : 
    5245             :     // If we have a nodata mask and we are doing something more complicated
    5246             :     // than nearest neighbouring, we have to fetch the nodata mask.
    5247             :     const bool bUseNoDataMask =
    5248         569 :         !STARTS_WITH_CI(pszResampling, "NEAR") &&
    5249         183 :         (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
    5250             : 
    5251         772 :     std::vector<bool> abHasNoData(nBands);
    5252         772 :     std::vector<double> adfNoDataValue(nBands);
    5253             : 
    5254       66619 :     for (int iBand = 0; iBand < nBands; ++iBand)
    5255             :     {
    5256       66233 :         int nHasNoData = 0;
    5257      132466 :         adfNoDataValue[iBand] =
    5258       66233 :             papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
    5259       66233 :         abHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
    5260             :     }
    5261             :     const bool bPropagateNoData =
    5262         386 :         CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
    5263             : 
    5264         386 :     const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
    5265        1544 :     const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
    5266         386 :                                                        ? CPLGetNumCPUs()
    5267         386 :                                                        : atoi(pszThreads)));
    5268             :     auto poThreadPool =
    5269         386 :         nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
    5270             :     auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
    5271         772 :                                    : std::unique_ptr<CPLJobQueue>(nullptr);
    5272             : 
    5273             :     // Only configurable for debug / testing
    5274         386 :     const GIntBig nChunkMaxSize = []() -> GIntBig
    5275             :     {
    5276             :         const char *pszVal =
    5277         386 :             CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", nullptr);
    5278         386 :         if (pszVal)
    5279             :         {
    5280          15 :             GIntBig nRet = 0;
    5281          15 :             CPLParseMemorySize(pszVal, &nRet, nullptr);
    5282          15 :             return std::max<GIntBig>(100, nRet);
    5283             :         }
    5284         371 :         return 10 * 1024 * 1024;
    5285         386 :     }();
    5286             : 
    5287             :     // Only configurable for debug / testing
    5288         386 :     const GIntBig nChunkMaxSizeForTempFile = []() -> GIntBig
    5289             :     {
    5290         386 :         const char *pszVal = CPLGetConfigOption(
    5291             :             "GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE", nullptr);
    5292         386 :         if (pszVal)
    5293             :         {
    5294          14 :             GIntBig nRet = 0;
    5295          14 :             CPLParseMemorySize(pszVal, &nRet, nullptr);
    5296          14 :             return std::max<GIntBig>(100, nRet);
    5297             :         }
    5298         372 :         const auto nUsableRAM = CPLGetUsablePhysicalRAM();
    5299         372 :         if (nUsableRAM > 0)
    5300         372 :             return nUsableRAM / 10;
    5301             :         // Select a value to be able to at least downsample by 2 for a RGB
    5302             :         // 1024x1024 tiled output: (2 * 1024 + 2) * (2 * 1024 + 2) * 3 = 12 MB
    5303           0 :         return 100 * 1024 * 1024;
    5304         386 :     }();
    5305             : 
    5306             :     // Second pass to do the real job.
    5307         386 :     double dfCurPixelCount = 0;
    5308         386 :     CPLErr eErr = CE_None;
    5309        1026 :     for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
    5310             :          ++iOverview)
    5311             :     {
    5312         645 :         int iSrcOverview = -1;  // -1 means the source bands.
    5313             : 
    5314             :         const int nDstTotalWidth =
    5315         645 :             papapoOverviewBands[0][iOverview]->GetXSize();
    5316             :         const int nDstTotalHeight =
    5317         645 :             papapoOverviewBands[0][iOverview]->GetYSize();
    5318             : 
    5319             :         // Compute the coordinates of the target region to refresh
    5320         645 :         constexpr double EPS = 1e-8;
    5321         645 :         const int nDstXOffStart = static_cast<int>(
    5322         645 :             static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
    5323             :             EPS);
    5324             :         const int nDstXOffEnd =
    5325        1290 :             std::min(static_cast<int>(
    5326         645 :                          std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
    5327         645 :                                        nToplevelSrcWidth * nDstTotalWidth -
    5328             :                                    EPS)),
    5329         645 :                      nDstTotalWidth);
    5330         645 :         const int nDstWidth = nDstXOffEnd - nDstXOffStart;
    5331         645 :         const int nDstYOffStart =
    5332         645 :             static_cast<int>(static_cast<double>(nSrcYOff) /
    5333         645 :                                  nToplevelSrcHeight * nDstTotalHeight +
    5334             :                              EPS);
    5335             :         const int nDstYOffEnd =
    5336        1290 :             std::min(static_cast<int>(
    5337         645 :                          std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
    5338         645 :                                        nToplevelSrcHeight * nDstTotalHeight -
    5339             :                                    EPS)),
    5340         645 :                      nDstTotalHeight);
    5341         645 :         const int nDstHeight = nDstYOffEnd - nDstYOffStart;
    5342             : 
    5343             :         // Try to use previous level of overview as the source to compute
    5344             :         // the next level.
    5345         645 :         int nSrcWidth = nToplevelSrcWidth;
    5346         645 :         int nSrcHeight = nToplevelSrcHeight;
    5347         905 :         if (iOverview > 0 &&
    5348         260 :             papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
    5349             :         {
    5350         252 :             nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
    5351         252 :             nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
    5352         252 :             iSrcOverview = iOverview - 1;
    5353             :         }
    5354             : 
    5355         645 :         const double dfXRatioDstToSrc =
    5356         645 :             static_cast<double>(nSrcWidth) / nDstTotalWidth;
    5357         645 :         const double dfYRatioDstToSrc =
    5358         645 :             static_cast<double>(nSrcHeight) / nDstTotalHeight;
    5359             : 
    5360             :         const int nOvrFactor =
    5361        1935 :             std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
    5362         645 :                                  static_cast<int>(0.5 + dfYRatioDstToSrc)));
    5363             : 
    5364         645 :         int nDstChunkXSize = 0;
    5365         645 :         int nDstChunkYSize = 0;
    5366         645 :         papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
    5367             :                                                         &nDstChunkYSize);
    5368             : 
    5369         645 :         constexpr int PIXEL_MARGIN = 2;
    5370             :         // Try to extend the chunk size so that the memory needed to acquire
    5371             :         // source pixels goes up to 10 MB.
    5372             :         // This can help for drivers that support multi-threaded reading
    5373         645 :         const int nFullResYChunk = static_cast<int>(std::min<double>(
    5374         645 :             nSrcHeight, PIXEL_MARGIN + nDstChunkYSize * dfYRatioDstToSrc));
    5375         645 :         const int nFullResYChunkQueried = static_cast<int>(std::min<int64_t>(
    5376        1290 :             nSrcHeight,
    5377        1290 :             nFullResYChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
    5378         645 :                                  nKernelRadius * nOvrFactor));
    5379         881 :         while (nDstChunkXSize < nDstWidth)
    5380             :         {
    5381         255 :             constexpr int INCREASE_FACTOR = 2;
    5382             : 
    5383         255 :             const int nFullResXChunk = static_cast<int>(std::min<double>(
    5384         510 :                 nSrcWidth, PIXEL_MARGIN + INCREASE_FACTOR * nDstChunkXSize *
    5385         255 :                                               dfXRatioDstToSrc));
    5386             : 
    5387             :             const int nFullResXChunkQueried =
    5388         255 :                 static_cast<int>(std::min<int64_t>(
    5389         510 :                     nSrcWidth,
    5390         510 :                     nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
    5391         255 :                                          nKernelRadius * nOvrFactor));
    5392             : 
    5393         255 :             if (static_cast<GIntBig>(nFullResXChunkQueried) *
    5394         255 :                     nFullResYChunkQueried >
    5395         255 :                 nChunkMaxSize / (nBands * nWrkDataTypeSize))
    5396             :             {
    5397          19 :                 break;
    5398             :             }
    5399             : 
    5400         236 :             nDstChunkXSize *= INCREASE_FACTOR;
    5401             :         }
    5402         645 :         nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
    5403             : 
    5404         645 :         const int nFullResXChunk = static_cast<int>(std::min<double>(
    5405         645 :             nSrcWidth, PIXEL_MARGIN + nDstChunkXSize * dfXRatioDstToSrc));
    5406         645 :         const int nFullResXChunkQueried = static_cast<int>(std::min<int64_t>(
    5407        1290 :             nSrcWidth,
    5408        1290 :             nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
    5409         645 :                                  nKernelRadius * nOvrFactor));
    5410             : 
    5411             :         // Make sure that the RAM requirements to acquire the source data does
    5412             :         // not exceed nChunkMaxSizeForTempFile
    5413             :         // If so, reduce the destination chunk size, generate overviews in a
    5414             :         // temporary dataset, and copy that temporary dataset over the target
    5415             :         // overview bands (to avoid issues with lossy compression)
    5416             :         const bool bOverflowFullResXChunkYChunkQueried =
    5417        1286 :             nFullResYChunkQueried > INT_MAX / (nBands * nWrkDataTypeSize) ||
    5418         641 :             nFullResXChunkQueried >
    5419         641 :                 std::numeric_limits<int64_t>::max() /
    5420         641 :                     (nFullResYChunkQueried * nBands * nWrkDataTypeSize);
    5421             : 
    5422         645 :         const auto nMemRequirement =
    5423             :             bOverflowFullResXChunkYChunkQueried
    5424         645 :                 ? 0
    5425         641 :                 : static_cast<GIntBig>(nFullResXChunkQueried) *
    5426         641 :                       nFullResYChunkQueried * nBands * nWrkDataTypeSize;
    5427             :         // Use a temporary dataset with a smaller destination chunk size
    5428         645 :         const auto nOverShootFactor =
    5429             :             nMemRequirement / nChunkMaxSizeForTempFile;
    5430             : 
    5431         645 :         constexpr int MIN_OVERSHOOT_FACTOR = 4;
    5432             :         const auto nSqrtOverShootFactor = std::max<GIntBig>(
    5433        1290 :             MIN_OVERSHOOT_FACTOR, static_cast<GIntBig>(std::ceil(std::sqrt(
    5434         645 :                                       static_cast<double>(nOverShootFactor)))));
    5435         645 :         constexpr int DEFAULT_CHUNK_SIZE = 256;
    5436         645 :         constexpr int GTIFF_BLOCK_SIZE_MULTIPLE = 16;
    5437             :         const int nReducedDstChunkXSize =
    5438             :             bOverflowFullResXChunkYChunkQueried
    5439        1286 :                 ? DEFAULT_CHUNK_SIZE
    5440        1286 :                 : std::max(1, static_cast<int>(nDstChunkXSize /
    5441        1286 :                                                nSqrtOverShootFactor) &
    5442         641 :                                   ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
    5443             :         const int nReducedDstChunkYSize =
    5444             :             bOverflowFullResXChunkYChunkQueried
    5445        1286 :                 ? DEFAULT_CHUNK_SIZE
    5446        1286 :                 : std::max(1, static_cast<int>(nDstChunkYSize /
    5447        1286 :                                                nSqrtOverShootFactor) &
    5448         641 :                                   ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
    5449             : 
    5450         645 :         if (bOverflowFullResXChunkYChunkQueried ||
    5451             :             nMemRequirement > nChunkMaxSizeForTempFile)
    5452             :         {
    5453          43 :             const auto nDTSize = GDALGetDataTypeSizeBytes(eDataType);
    5454             :             const bool bTmpDSMemRequirementOverflow =
    5455          43 :                 nDTSize * nBands >
    5456          43 :                 std::numeric_limits<int64_t>::max() /
    5457          43 :                     (static_cast<int64_t>(nDstWidth) * nDstHeight);
    5458          43 :             const auto nTmpDSMemRequirement =
    5459             :                 bTmpDSMemRequirementOverflow
    5460          43 :                     ? 0
    5461          41 :                     : static_cast<GIntBig>(nDstWidth) * nDstHeight * nBands *
    5462          41 :                           nDTSize;
    5463             : 
    5464             :             // make sure that one band buffer doesn't overflow size_t
    5465             :             const bool bChunkSizeOverflow =
    5466          43 :                 static_cast<size_t>(nDTSize) >
    5467          43 :                 std::numeric_limits<size_t>::max() /
    5468          43 :                     (static_cast<uint64_t>(nDstWidth) * nDstHeight);
    5469          43 :             const size_t nChunkSize =
    5470             :                 bChunkSizeOverflow
    5471          43 :                     ? 0
    5472          41 :                     : static_cast<size_t>(nDstWidth) * nDstHeight * nDTSize;
    5473             : 
    5474             :             const auto CreateVRT =
    5475          41 :                 [nBands, nSrcWidth, nSrcHeight, nDstTotalWidth, nDstTotalHeight,
    5476             :                  pszResampling, eWrkDataType, papoSrcBands, papapoOverviewBands,
    5477             :                  iSrcOverview, &abHasNoData,
    5478      393585 :                  &adfNoDataValue](int nVRTBlockXSize, int nVRTBlockYSize)
    5479             :             {
    5480             :                 auto poVRTDS = std::make_unique<VRTDataset>(
    5481          41 :                     nDstTotalWidth, nDstTotalHeight, nVRTBlockXSize,
    5482          41 :                     nVRTBlockYSize);
    5483             : 
    5484       65620 :                 for (int iBand = 0; iBand < nBands; ++iBand)
    5485             :                 {
    5486      131158 :                     auto poVRTSrc = std::make_unique<VRTSimpleSource>();
    5487       65579 :                     poVRTSrc->SetResampling(pszResampling);
    5488       65579 :                     poVRTDS->AddBand(eWrkDataType);
    5489             :                     auto poVRTBand = static_cast<VRTSourcedRasterBand *>(
    5490       65579 :                         poVRTDS->GetRasterBand(iBand + 1));
    5491             : 
    5492       65579 :                     auto poSrcBand = papoSrcBands[iBand];
    5493       65579 :                     if (iSrcOverview != -1)
    5494          24 :                         poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
    5495       65579 :                     poVRTBand->ConfigureSource(
    5496             :                         poVRTSrc.get(), poSrcBand, false, 0, 0, nSrcWidth,
    5497             :                         nSrcHeight, 0, 0, nDstTotalWidth, nDstTotalHeight);
    5498             :                     // Add the source to the band
    5499       65579 :                     poVRTBand->AddSource(poVRTSrc.release());
    5500       65579 :                     if (abHasNoData[iBand])
    5501           3 :                         poVRTBand->SetNoDataValue(adfNoDataValue[iBand]);
    5502             :                 }
    5503             : 
    5504          42 :                 if (papoSrcBands[0]->GetMaskFlags() == GMF_PER_DATASET &&
    5505           1 :                     poVRTDS->CreateMaskBand(GMF_PER_DATASET) == CE_None)
    5506             :                 {
    5507             :                     VRTSourcedRasterBand *poMaskVRTBand =
    5508           1 :                         cpl::down_cast<VRTSourcedRasterBand *>(
    5509           1 :                             poVRTDS->GetRasterBand(1)->GetMaskBand());
    5510           1 :                     auto poSrcBand = papoSrcBands[0];
    5511           1 :                     if (iSrcOverview != -1)
    5512           0 :                         poSrcBand = papapoOverviewBands[0][iSrcOverview];
    5513           1 :                     poMaskVRTBand->AddMaskBandSource(
    5514           1 :                         poSrcBand->GetMaskBand(), 0, 0, nSrcWidth, nSrcHeight,
    5515             :                         0, 0, nDstTotalWidth, nDstTotalHeight);
    5516             :                 }
    5517             : 
    5518          41 :                 return poVRTDS;
    5519          43 :             };
    5520             : 
    5521             :             // If the overview accommodates chunking, do so and recurse
    5522             :             // to avoid generating full size temporary files
    5523          43 :             if (!bOverflowFullResXChunkYChunkQueried &&
    5524          39 :                 !bTmpDSMemRequirementOverflow && !bChunkSizeOverflow &&
    5525          39 :                 (nDstChunkXSize < nDstWidth || nDstChunkYSize < nDstHeight))
    5526             :             {
    5527             :                 // Create a VRT with the smaller chunk to do the scaling
    5528             :                 auto poVRTDS =
    5529          13 :                     CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
    5530             : 
    5531          13 :                 std::vector<GDALRasterBand *> apoVRTBand(nBands);
    5532          13 :                 std::vector<GDALRasterBand *> apoDstBand(nBands);
    5533       65560 :                 for (int iBand = 0; iBand < nBands; ++iBand)
    5534             :                 {
    5535       65547 :                     apoDstBand[iBand] = papapoOverviewBands[iBand][iOverview];
    5536       65547 :                     apoVRTBand[iBand] = poVRTDS->GetRasterBand(iBand + 1);
    5537             :                 }
    5538             : 
    5539             :                 // Use a flag to avoid reading from the overview being built
    5540             :                 GDALRasterIOExtraArg sExtraArg;
    5541          13 :                 INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    5542          13 :                 if (iSrcOverview == -1)
    5543          13 :                     sExtraArg.bUseOnlyThisScale = true;
    5544             : 
    5545             :                 // A single band buffer for data transfer to the overview
    5546          13 :                 std::vector<GByte> abyChunk;
    5547             :                 try
    5548             :                 {
    5549          13 :                     abyChunk.resize(nChunkSize);
    5550             :                 }
    5551           0 :                 catch (const std::exception &)
    5552             :                 {
    5553           0 :                     CPLError(CE_Failure, CPLE_OutOfMemory,
    5554             :                              "Out of memory allocating temporary buffer");
    5555           0 :                     return CE_Failure;
    5556             :                 }
    5557             : 
    5558             :                 // Loop over output height, in chunks
    5559          13 :                 for (int nDstYOff = nDstYOffStart;
    5560          38 :                      nDstYOff < nDstYOffEnd && eErr == CE_None;
    5561             :                      /* */)
    5562             :                 {
    5563             :                     const int nDstYCount =
    5564          25 :                         std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
    5565             :                     // Loop over output width, in output chunks
    5566          25 :                     for (int nDstXOff = nDstXOffStart;
    5567          74 :                          nDstXOff < nDstXOffEnd && eErr == CE_None;
    5568             :                          /* */)
    5569             :                     {
    5570             :                         const int nDstXCount =
    5571          49 :                             std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
    5572             :                         // Read and transfer the chunk to the overview
    5573          98 :                         for (int iBand = 0; iBand < nBands && eErr == CE_None;
    5574             :                              ++iBand)
    5575             :                         {
    5576          98 :                             eErr = apoVRTBand[iBand]->RasterIO(
    5577             :                                 GF_Read, nDstXOff, nDstYOff, nDstXCount,
    5578          49 :                                 nDstYCount, abyChunk.data(), nDstXCount,
    5579             :                                 nDstYCount, eDataType, 0, 0, &sExtraArg);
    5580          49 :                             if (eErr == CE_None)
    5581             :                             {
    5582          96 :                                 eErr = apoDstBand[iBand]->RasterIO(
    5583             :                                     GF_Write, nDstXOff, nDstYOff, nDstXCount,
    5584          48 :                                     nDstYCount, abyChunk.data(), nDstXCount,
    5585             :                                     nDstYCount, eDataType, 0, 0, nullptr);
    5586             :                             }
    5587             :                         }
    5588             : 
    5589          49 :                         dfCurPixelCount +=
    5590          49 :                             static_cast<double>(nDstXCount) * nDstYCount;
    5591             : 
    5592          49 :                         nDstXOff += nDstXCount;
    5593             :                     }  // width
    5594             : 
    5595          25 :                     if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount,
    5596             :                                      nullptr, pProgressData))
    5597             :                     {
    5598           0 :                         CPLError(CE_Failure, CPLE_UserInterrupt,
    5599             :                                  "User terminated");
    5600           0 :                         eErr = CE_Failure;
    5601             :                     }
    5602             : 
    5603          25 :                     nDstYOff += nDstYCount;
    5604             :                 }  // height
    5605             : 
    5606          13 :                 if (CE_None != eErr)
    5607             :                 {
    5608           1 :                     CPLError(CE_Failure, CPLE_AppDefined,
    5609             :                              "Error while writing overview");
    5610           1 :                     return CE_Failure;
    5611             :                 }
    5612             : 
    5613          12 :                 pfnProgress(1.0, nullptr, pProgressData);
    5614             :                 // Flush the overviews we just generated
    5615          24 :                 for (int iBand = 0; iBand < nBands; ++iBand)
    5616          12 :                     apoDstBand[iBand]->FlushCache(false);
    5617             : 
    5618          12 :                 continue;  // Next overview
    5619             :             }              // chunking via temporary dataset
    5620             : 
    5621           0 :             std::unique_ptr<GDALDataset> poTmpDS;
    5622             :             // Config option mostly/only for autotest purposes
    5623             :             const char *pszGDAL_OVR_TEMP_DRIVER =
    5624          30 :                 CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
    5625          30 :             if ((!bTmpDSMemRequirementOverflow &&
    5626           4 :                  nTmpDSMemRequirement <= nChunkMaxSizeForTempFile &&
    5627           4 :                  !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
    5628          26 :                 EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
    5629             :             {
    5630          10 :                 auto poTmpDrv = GetGDALDriverManager()->GetDriverByName("MEM");
    5631          10 :                 if (!poTmpDrv)
    5632             :                 {
    5633           0 :                     eErr = CE_Failure;
    5634           0 :                     break;
    5635             :                 }
    5636          10 :                 poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
    5637             :                                                nDstTotalHeight, nBands,
    5638          10 :                                                eDataType, nullptr));
    5639             :             }
    5640             :             else
    5641             :             {
    5642             :                 // Create a temporary file for the overview
    5643             :                 auto poTmpDrv =
    5644          20 :                     GetGDALDriverManager()->GetDriverByName("GTiff");
    5645          20 :                 if (!poTmpDrv)
    5646             :                 {
    5647           0 :                     eErr = CE_Failure;
    5648           0 :                     break;
    5649             :                 }
    5650          40 :                 std::string osTmpFilename;
    5651          20 :                 auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
    5652          20 :                 if (poDstDS)
    5653             :                 {
    5654          20 :                     osTmpFilename = poDstDS->GetDescription();
    5655             :                     VSIStatBufL sStatBuf;
    5656          20 :                     if (!osTmpFilename.empty() &&
    5657           0 :                         VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
    5658           0 :                         osTmpFilename += "_tmp_ovr.tif";
    5659             :                 }
    5660          20 :                 if (osTmpFilename.empty())
    5661             :                 {
    5662          20 :                     osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
    5663          20 :                     osTmpFilename += ".tif";
    5664             :                 }
    5665          20 :                 CPLDebug("GDAL", "Creating temporary file %s of %d x %d x %d",
    5666             :                          osTmpFilename.c_str(), nDstWidth, nDstHeight, nBands);
    5667          40 :                 CPLStringList aosCO;
    5668          20 :                 if (0 == ((nReducedDstChunkXSize % GTIFF_BLOCK_SIZE_MULTIPLE) |
    5669          20 :                           (nReducedDstChunkYSize % GTIFF_BLOCK_SIZE_MULTIPLE)))
    5670             :                 {
    5671          14 :                     aosCO.SetNameValue("TILED", "YES");
    5672             :                     aosCO.SetNameValue("BLOCKXSIZE",
    5673          14 :                                        CPLSPrintf("%d", nReducedDstChunkXSize));
    5674             :                     aosCO.SetNameValue("BLOCKYSIZE",
    5675          14 :                                        CPLSPrintf("%d", nReducedDstChunkYSize));
    5676             :                 }
    5677          20 :                 if (const char *pszCOList =
    5678          20 :                         poTmpDrv->GetMetadataItem(GDAL_DMD_CREATIONOPTIONLIST))
    5679             :                 {
    5680             :                     aosCO.SetNameValue(
    5681          20 :                         "COMPRESS", strstr(pszCOList, "ZSTD") ? "ZSTD" : "LZW");
    5682             :                 }
    5683          20 :                 poTmpDS.reset(poTmpDrv->Create(osTmpFilename.c_str(), nDstWidth,
    5684             :                                                nDstHeight, nBands, eDataType,
    5685          20 :                                                aosCO.List()));
    5686          20 :                 if (poTmpDS)
    5687             :                 {
    5688          18 :                     poTmpDS->MarkSuppressOnClose();
    5689          18 :                     VSIUnlink(osTmpFilename.c_str());
    5690             :                 }
    5691             :             }
    5692          30 :             if (!poTmpDS)
    5693             :             {
    5694           2 :                 eErr = CE_Failure;
    5695           2 :                 break;
    5696             :             }
    5697             : 
    5698             :             // Create a full size VRT to do the resampling without edge effects
    5699             :             auto poVRTDS =
    5700          28 :                 CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
    5701             : 
    5702             :             // Allocate a band buffer with the overview chunk size
    5703             :             std::unique_ptr<void, VSIFreeReleaser> pDstBuffer(
    5704             :                 VSI_MALLOC3_VERBOSE(size_t(nWrkDataTypeSize), nDstChunkXSize,
    5705          28 :                                     nDstChunkYSize));
    5706          28 :             if (pDstBuffer == nullptr)
    5707             :             {
    5708           0 :                 eErr = CE_Failure;
    5709           0 :                 break;
    5710             :             }
    5711             : 
    5712             :             // Use a flag to avoid reading the overview being built
    5713             :             GDALRasterIOExtraArg sExtraArg;
    5714          28 :             INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    5715          28 :             if (iSrcOverview == -1)
    5716           4 :                 sExtraArg.bUseOnlyThisScale = true;
    5717             : 
    5718             :             // Scale and copy data from the VRT to the temp file
    5719          28 :             for (int nDstYOff = nDstYOffStart;
    5720         914 :                  nDstYOff < nDstYOffEnd && eErr == CE_None;
    5721             :                  /* */)
    5722             :             {
    5723             :                 const int nDstYCount =
    5724         886 :                     std::min(nReducedDstChunkYSize, nDstYOffEnd - nDstYOff);
    5725         886 :                 for (int nDstXOff = nDstXOffStart;
    5726      201218 :                      nDstXOff < nDstXOffEnd && eErr == CE_None;
    5727             :                      /* */)
    5728             :                 {
    5729             :                     const int nDstXCount =
    5730      200332 :                         std::min(nReducedDstChunkXSize, nDstXOffEnd - nDstXOff);
    5731      400668 :                     for (int iBand = 0; iBand < nBands && eErr == CE_None;
    5732             :                          ++iBand)
    5733             :                     {
    5734      200336 :                         auto poSrcBand = poVRTDS->GetRasterBand(iBand + 1);
    5735      200336 :                         eErr = poSrcBand->RasterIO(
    5736             :                             GF_Read, nDstXOff, nDstYOff, nDstXCount, nDstYCount,
    5737             :                             pDstBuffer.get(), nDstXCount, nDstYCount,
    5738             :                             eWrkDataType, 0, 0, &sExtraArg);
    5739      200336 :                         if (eErr == CE_None)
    5740             :                         {
    5741             :                             // Write to the temporary dataset, shifted
    5742      200334 :                             auto poOvrBand = poTmpDS->GetRasterBand(iBand + 1);
    5743      200334 :                             eErr = poOvrBand->RasterIO(
    5744             :                                 GF_Write, nDstXOff - nDstXOffStart,
    5745             :                                 nDstYOff - nDstYOffStart, nDstXCount,
    5746             :                                 nDstYCount, pDstBuffer.get(), nDstXCount,
    5747             :                                 nDstYCount, eWrkDataType, 0, 0, nullptr);
    5748             :                         }
    5749             :                     }
    5750      200332 :                     nDstXOff += nDstXCount;
    5751             :                 }
    5752         886 :                 nDstYOff += nDstYCount;
    5753             :             }
    5754             : 
    5755             :             // Copy from the temporary to the overview
    5756          28 :             for (int nDstYOff = nDstYOffStart;
    5757          54 :                  nDstYOff < nDstYOffEnd && eErr == CE_None;
    5758             :                  /* */)
    5759             :             {
    5760             :                 const int nDstYCount =
    5761          26 :                     std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
    5762          26 :                 for (int nDstXOff = nDstXOffStart;
    5763          52 :                      nDstXOff < nDstXOffEnd && eErr == CE_None;
    5764             :                      /* */)
    5765             :                 {
    5766             :                     const int nDstXCount =
    5767          26 :                         std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
    5768          56 :                     for (int iBand = 0; iBand < nBands && eErr == CE_None;
    5769             :                          ++iBand)
    5770             :                     {
    5771          30 :                         auto poSrcBand = poTmpDS->GetRasterBand(iBand + 1);
    5772          30 :                         eErr = poSrcBand->RasterIO(
    5773             :                             GF_Read, nDstXOff - nDstXOffStart,
    5774             :                             nDstYOff - nDstYOffStart, nDstXCount, nDstYCount,
    5775             :                             pDstBuffer.get(), nDstXCount, nDstYCount,
    5776             :                             eWrkDataType, 0, 0, nullptr);
    5777          30 :                         if (eErr == CE_None)
    5778             :                         {
    5779             :                             // Write to the destination overview bands
    5780          30 :                             auto poOvrBand =
    5781          30 :                                 papapoOverviewBands[iBand][iOverview];
    5782          30 :                             eErr = poOvrBand->RasterIO(
    5783             :                                 GF_Write, nDstXOff, nDstYOff, nDstXCount,
    5784             :                                 nDstYCount, pDstBuffer.get(), nDstXCount,
    5785             :                                 nDstYCount, eWrkDataType, 0, 0, nullptr);
    5786             :                         }
    5787             :                     }
    5788          26 :                     nDstXOff += nDstXCount;
    5789             :                 }
    5790          26 :                 nDstYOff += nDstYCount;
    5791             :             }
    5792             : 
    5793          28 :             if (eErr != CE_None)
    5794             :             {
    5795           2 :                 CPLError(CE_Failure, CPLE_AppDefined,
    5796             :                          "Failed to write overview %d", iOverview);
    5797           2 :                 return eErr;
    5798             :             }
    5799             : 
    5800             :             // Flush the data to overviews.
    5801          56 :             for (int iBand = 0; iBand < nBands; ++iBand)
    5802          30 :                 papapoOverviewBands[iBand][iOverview]->FlushCache(false);
    5803             : 
    5804          26 :             continue;
    5805             :         }
    5806             : 
    5807             :         // Structure describing a resampling job (one is created per band for each destination chunk)
    5808             :         struct OvrJob
    5809             :         {
    5810             :             // Buffers to free when job is finished (presumably released when the holders are destroyed with the job)
    5811             :             std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
    5812             :             std::unique_ptr<PointerHolder> oSrcBufferHolder{};
    5813             :             std::unique_ptr<PointerHolder> oDstBufferHolder{};  // set by JobResampleFunc to wrap pDstBuffer
    5814             :             // Overview band that WriteJobData() writes the resampled block to
    5815             :             GDALRasterBand *poDstBand = nullptr;
    5816             : 
    5817             :             // Input parameters of pfnResampleFn
    5818             :             GDALResampleFunction pfnResampleFn = nullptr;  // resampling routine invoked by JobResampleFunc
    5819             :             GDALOverviewResampleArgs args{};  // includes the destination window (nDstXOff/nDstYOff/nDstXOff2/nDstYOff2)
    5820             :             const void *pChunk = nullptr;  // source pixel buffer, not owned by this pointer
    5821             : 
    5822             :             // Output values of resampling function
    5823             :             CPLErr eErr = CE_Failure;  // stays CE_Failure until pfnResampleFn has run and returned its status
    5824             :             void *pDstBuffer = nullptr;  // resampled pixels, filled in by pfnResampleFn
    5825             :             GDALDataType eDstBufferDataType = GDT_Unknown;  // data type of pDstBuffer, reported by pfnResampleFn
    5826             : 
    5827             :             // Synchronization: bFinished is set under mutex and signalled through cv
    5828             :             bool bFinished = false;
    5829             :             std::mutex mutex{};
    5830             :             std::condition_variable cv{};
    5831             :         };
    5832             : 
    5833             :         // Thread function to resample. pData must point to an OvrJob; the job is updated in place.
    5834        3310 :         const auto JobResampleFunc = [](void *pData)
    5835             :         {
    5836        3310 :             OvrJob *poJob = static_cast<OvrJob *>(pData);
    5837             :             // Run the resampling; status, output buffer and its data type are stored in the job
    5838        3310 :             poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
    5839             :                                                &(poJob->pDstBuffer),
    5840             :                                                &(poJob->eDstBufferDataType));
    5841             :             // Wrap the output buffer in a holder owned by the job (done whatever the value of eErr)
    5842        3310 :             poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
    5843             :             // Publish completion: set the flag under the job mutex, then wake one waiter
    5844             :             {
    5845        6620 :                 std::lock_guard<std::mutex> guard(poJob->mutex);
    5846        3310 :                 poJob->bFinished = true;
    5847        3310 :                 poJob->cv.notify_one();
    5848             :             }
    5849        3310 :         };
    5850             : 
    5851             :         // Function to write resample data to target band. Returns the CPLErr of the RasterIO() call.
    5852        3310 :         const auto WriteJobData = [](const OvrJob *poJob)
    5853             :         {
    5854        6620 :             return poJob->poDstBand->RasterIO(
    5855        3310 :                 GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,  // top-left of the destination window
    5856        3310 :                 poJob->args.nDstXOff2 - poJob->args.nDstXOff,  // window width
    5857        3310 :                 poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,  // window height, then data
    5858        3310 :                 poJob->args.nDstXOff2 - poJob->args.nDstXOff,  // buffer width, same as window width
    5859        3310 :                 poJob->args.nDstYOff2 - poJob->args.nDstYOff,  // buffer height, same as window height
    5860        3310 :                 poJob->eDstBufferDataType, 0, 0, nullptr);  // buffer type as reported by the resampling function
    5861             :         };
    5862             : 
    5863             :         // Wait for completion of oldest job and serialize it. jobList must not be empty (front() is unchecked).
    5864             :         const auto WaitAndFinalizeOldestJob =
    5865          16 :             [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
    5866             :         {
    5867          16 :             auto poOldestJob = jobList.front().get();  // the job at the head of the list
    5868             :             {
    5869          32 :                 std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);  // wait() needs a unique_lock
    5870             :                 // coverity[missing_lock:FALSE]
    5871          22 :                 while (!poOldestJob->bFinished)  // re-check the flag after every wakeup
    5872             :                 {
    5873           6 :                     poOldestJob->cv.wait(oGuard);
    5874             :                 }
    5875             :             }
    5876          16 :             CPLErr l_eErr = poOldestJob->eErr;  // status of the resampling step
    5877          16 :             if (l_eErr == CE_None)  // only write the block when resampling succeeded
    5878             :             {
    5879          16 :                 l_eErr = WriteJobData(poOldestJob);
    5880             :             }
    5881             :             // Remove the job from the list in all cases; the list's unique_ptr destroys it
    5882          16 :             jobList.pop_front();
    5883          16 :             return l_eErr;  // resampling error if any, otherwise the write status
    5884             :         };
    5885             : 
    5886             :         // Queue of jobs
    5887        1204 :         std::list<std::unique_ptr<OvrJob>> jobList;
    5888             : 
    5889        1204 :         std::vector<std::unique_ptr<void, VSIFreeReleaser>> apaChunk(nBands);
    5890             :         std::vector<std::unique_ptr<GByte, VSIFreeReleaser>>
    5891        1204 :             apabyChunkNoDataMask(nBands);
    5892             : 
    5893             :         // Iterate on destination overview, block by block.
    5894         602 :         for (int nDstYOff = nDstYOffStart;
    5895        2111 :              nDstYOff < nDstYOffEnd && eErr == CE_None;
    5896        1509 :              nDstYOff += nDstChunkYSize)
    5897             :         {
    5898             :             int nDstYCount;
    5899        1509 :             if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
    5900        1099 :                 nDstYCount = nDstChunkYSize;
    5901             :             else
    5902         410 :                 nDstYCount = nDstYOffEnd - nDstYOff;
    5903             : 
    5904        1509 :             int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    5905        1509 :             int nChunkYOff2 = static_cast<int>(
    5906        1509 :                 ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
    5907        1509 :             if (nChunkYOff2 > nSrcHeight ||
    5908        1509 :                 nDstYOff + nDstYCount == nDstTotalHeight)
    5909         595 :                 nChunkYOff2 = nSrcHeight;
    5910        1509 :             int nYCount = nChunkYOff2 - nChunkYOff;
    5911        1509 :             CPLAssert(nYCount <= nFullResYChunk);
    5912             : 
    5913        1509 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
    5914        1509 :             int nChunkYSizeQueried =
    5915        1509 :                 nYCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
    5916        1509 :             if (nChunkYOffQueried < 0)
    5917             :             {
    5918         148 :                 nChunkYSizeQueried += nChunkYOffQueried;
    5919         148 :                 nChunkYOffQueried = 0;
    5920             :             }
    5921        1509 :             if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
    5922         147 :                 nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
    5923        1509 :             CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
    5924             : 
    5925        1509 :             if (!pfnProgress(std::min(1.0, dfCurPixelCount / dfTotalPixelCount),
    5926             :                              nullptr, pProgressData))
    5927             :             {
    5928           1 :                 CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    5929           1 :                 eErr = CE_Failure;
    5930             :             }
    5931             : 
    5932             :             // Iterate on destination overview, block by block.
    5933        1509 :             for (int nDstXOff = nDstXOffStart;
    5934        3057 :                  nDstXOff < nDstXOffEnd && eErr == CE_None;
    5935        1548 :                  nDstXOff += nDstChunkXSize)
    5936             :             {
    5937        1548 :                 int nDstXCount = 0;
    5938        1548 :                 if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
    5939        1531 :                     nDstXCount = nDstChunkXSize;
    5940             :                 else
    5941          17 :                     nDstXCount = nDstXOffEnd - nDstXOff;
    5942             : 
    5943        1548 :                 dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
    5944             : 
    5945        1548 :                 int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    5946        1548 :                 int nChunkXOff2 = static_cast<int>(
    5947        1548 :                     ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    5948        1548 :                 if (nChunkXOff2 > nSrcWidth ||
    5949        1548 :                     nDstXOff + nDstXCount == nDstTotalWidth)
    5950        1473 :                     nChunkXOff2 = nSrcWidth;
    5951        1548 :                 const int nXCount = nChunkXOff2 - nChunkXOff;
    5952        1548 :                 CPLAssert(nXCount <= nFullResXChunk);
    5953             : 
    5954        1548 :                 int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
    5955        1548 :                 int nChunkXSizeQueried =
    5956        1548 :                     nXCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
    5957        1548 :                 if (nChunkXOffQueried < 0)
    5958             :                 {
    5959         208 :                     nChunkXSizeQueried += nChunkXOffQueried;
    5960         208 :                     nChunkXOffQueried = 0;
    5961             :                 }
    5962        1548 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
    5963         217 :                     nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
    5964        1548 :                 CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
    5965             : #if DEBUG_VERBOSE
    5966             :                 CPLDebug("GDAL",
    5967             :                          "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
    5968             :                          nChunkXOffQueried, nChunkYOffQueried,
    5969             :                          nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
    5970             :                          nDstYOff, nDstXCount, nDstYCount);
    5971             : #endif
    5972             : 
    5973             :                 // Avoid accumulating too many tasks and exhaust RAM
    5974             : 
    5975             :                 // Try to complete already finished jobs
    5976        1548 :                 while (eErr == CE_None && !jobList.empty())
    5977             :                 {
    5978           2 :                     auto poOldestJob = jobList.front().get();
    5979             :                     {
    5980           2 :                         std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
    5981           2 :                         if (!poOldestJob->bFinished)
    5982             :                         {
    5983           2 :                             break;
    5984             :                         }
    5985             :                     }
    5986           0 :                     eErr = poOldestJob->eErr;
    5987           0 :                     if (eErr == CE_None)
    5988             :                     {
    5989           0 :                         eErr = WriteJobData(poOldestJob);
    5990             :                     }
    5991             : 
    5992           0 :                     jobList.pop_front();
    5993             :                 }
    5994             : 
    5995             :                 // And in case we have saturated the number of threads,
    5996             :                 // wait for completion of tasks to go below the threshold.
    5997        3096 :                 while (eErr == CE_None &&
    5998        1548 :                        jobList.size() >= static_cast<size_t>(nThreads))
    5999             :                 {
    6000           0 :                     eErr = WaitAndFinalizeOldestJob(jobList);
    6001             :                 }
    6002             : 
    6003             :                 // Read the source buffers for all the bands.
    6004        4859 :                 for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
    6005             :                 {
    6006             :                     // (Re)allocate buffers if needed
    6007        3311 :                     if (apaChunk[iBand] == nullptr)
    6008             :                     {
    6009        1179 :                         apaChunk[iBand].reset(VSI_MALLOC3_VERBOSE(
    6010             :                             nFullResXChunkQueried, nFullResYChunkQueried,
    6011             :                             nWrkDataTypeSize));
    6012        1179 :                         if (apaChunk[iBand] == nullptr)
    6013             :                         {
    6014           0 :                             eErr = CE_Failure;
    6015             :                         }
    6016             :                     }
    6017        3652 :                     if (bUseNoDataMask &&
    6018         341 :                         apabyChunkNoDataMask[iBand] == nullptr)
    6019             :                     {
    6020         282 :                         apabyChunkNoDataMask[iBand].reset(
    6021         282 :                             static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    6022             :                                 nFullResXChunkQueried, nFullResYChunkQueried)));
    6023         282 :                         if (apabyChunkNoDataMask[iBand] == nullptr)
    6024             :                         {
    6025           0 :                             eErr = CE_Failure;
    6026             :                         }
    6027             :                     }
    6028             : 
    6029        3311 :                     if (eErr == CE_None)
    6030             :                     {
    6031        3311 :                         GDALRasterBand *poSrcBand = nullptr;
    6032        3311 :                         if (iSrcOverview == -1)
    6033        2409 :                             poSrcBand = papoSrcBands[iBand];
    6034             :                         else
    6035         902 :                             poSrcBand =
    6036         902 :                                 papapoOverviewBands[iBand][iSrcOverview];
    6037        3311 :                         eErr = poSrcBand->RasterIO(
    6038             :                             GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    6039             :                             nChunkXSizeQueried, nChunkYSizeQueried,
    6040        3311 :                             apaChunk[iBand].get(), nChunkXSizeQueried,
    6041             :                             nChunkYSizeQueried, eWrkDataType, 0, 0, nullptr);
    6042             : 
    6043        3311 :                         if (bUseNoDataMask && eErr == CE_None)
    6044             :                         {
    6045         341 :                             auto poMaskBand = poSrcBand->IsMaskBand()
    6046         341 :                                                   ? poSrcBand
    6047         262 :                                                   : poSrcBand->GetMaskBand();
    6048         341 :                             eErr = poMaskBand->RasterIO(
    6049             :                                 GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    6050             :                                 nChunkXSizeQueried, nChunkYSizeQueried,
    6051         341 :                                 apabyChunkNoDataMask[iBand].get(),
    6052             :                                 nChunkXSizeQueried, nChunkYSizeQueried,
    6053             :                                 GDT_Byte, 0, 0, nullptr);
    6054             :                         }
    6055             :                     }
    6056             :                 }
    6057             : 
    6058             :                 // Compute the resulting overview block.
    6059        4858 :                 for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
    6060             :                 {
    6061        6620 :                     auto poJob = std::make_unique<OvrJob>();
    6062        3310 :                     poJob->pfnResampleFn = pfnResampleFn;
    6063        3310 :                     poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
    6064        6620 :                     poJob->args.eOvrDataType =
    6065        3310 :                         poJob->poDstBand->GetRasterDataType();
    6066        3310 :                     poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
    6067        3310 :                     poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
    6068        3310 :                     const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
    6069        3310 :                         "NBITS", "IMAGE_STRUCTURE");
    6070        3310 :                     poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    6071        3310 :                     poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    6072        3310 :                     poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    6073        3310 :                     poJob->args.eWrkDataType = eWrkDataType;
    6074        3310 :                     poJob->pChunk = apaChunk[iBand].get();
    6075        3310 :                     poJob->args.pabyChunkNodataMask =
    6076        3310 :                         apabyChunkNoDataMask[iBand].get();
    6077        3310 :                     poJob->args.nChunkXOff = nChunkXOffQueried;
    6078        3310 :                     poJob->args.nChunkXSize = nChunkXSizeQueried;
    6079        3310 :                     poJob->args.nChunkYOff = nChunkYOffQueried;
    6080        3310 :                     poJob->args.nChunkYSize = nChunkYSizeQueried;
    6081        3310 :                     poJob->args.nDstXOff = nDstXOff;
    6082        3310 :                     poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
    6083        3310 :                     poJob->args.nDstYOff = nDstYOff;
    6084        3310 :                     poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
    6085        3310 :                     poJob->args.pszResampling = pszResampling;
    6086        3310 :                     poJob->args.bHasNoData = abHasNoData[iBand];
    6087        3310 :                     poJob->args.dfNoDataValue = adfNoDataValue[iBand];
    6088        3310 :                     poJob->args.eSrcDataType = eDataType;
    6089        3310 :                     poJob->args.bPropagateNoData = bPropagateNoData;
    6090             : 
    6091        3310 :                     if (poJobQueue)
    6092             :                     {
    6093          32 :                         poJob->oSrcMaskBufferHolder.reset(new PointerHolder(
    6094          16 :                             apabyChunkNoDataMask[iBand].release()));
    6095             : 
    6096          32 :                         poJob->oSrcBufferHolder.reset(
    6097          16 :                             new PointerHolder(apaChunk[iBand].release()));
    6098             : 
    6099          16 :                         poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
    6100          16 :                         jobList.emplace_back(std::move(poJob));
    6101             :                     }
    6102             :                     else
    6103             :                     {
    6104        3294 :                         JobResampleFunc(poJob.get());
    6105        3294 :                         eErr = poJob->eErr;
    6106        3294 :                         if (eErr == CE_None)
    6107             :                         {
    6108        3294 :                             eErr = WriteJobData(poJob.get());
    6109             :                         }
    6110             :                     }
    6111             :                 }
    6112             :             }
    6113             :         }
    6114             : 
    6115             :         // Wait for all pending jobs to complete
    6116         618 :         while (!jobList.empty())
    6117             :         {
    6118          16 :             const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
    6119          16 :             if (l_eErr != CE_None && eErr == CE_None)
    6120           0 :                 eErr = l_eErr;
    6121             :         }
    6122             : 
    6123             :         // Flush the data to overviews.
    6124        1779 :         for (int iBand = 0; iBand < nBands; ++iBand)
    6125             :         {
    6126        1177 :             if (papapoOverviewBands[iBand][iOverview]->FlushCache(false) !=
    6127             :                 CE_None)
    6128           0 :                 eErr = CE_Failure;
    6129             :         }
    6130             :     }
    6131             : 
    6132         383 :     if (eErr == CE_None)
    6133         379 :         pfnProgress(1.0, nullptr, pProgressData);
    6134             : 
    6135         383 :     return eErr;
    6136             : }
    6137             : 
    6138             : /************************************************************************/
    6139             : /*            GDALRegenerateOverviewsMultiBand()                        */
    6140             : /************************************************************************/
    6141             : 
    6142             : /**
    6143             :  * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
    6144             :  * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
    6145             :  *
    6146             :  * This function will generate one or more overview images from a base
    6147             :  * image using the requested downsampling algorithm.  Its primary use
    6148             :  * is for generating overviews via GDALDataset::BuildOverviews(), but it
    6149             :  * can also be used to generate downsampled images in one file from another
    6150             :  * outside the overview architecture.
    6151             :  *
    6152             :  * The output bands need to exist in advance and share the same characteristics
    6153             :  * (type, dimensions)
    6154             :  *
    6155             :  * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
    6156             :  * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
    6157             :  *
    6158             :  * It does not support color tables or complex data types.
    6159             :  *
    6160             :  * The pseudo-algorithm used by the function is :
    6161             :  *    for each overview
    6162             :  *       iterate on lines of the source by a step of deltay
    6163             :  *           iterate on columns of the source  by a step of deltax
    6164             :  *               read the source data of size deltax * deltay for all the bands
    6165             :  *               generate the corresponding overview block for all the bands
    6166             :  *
    6167             :  * This function will honour properly NODATA_VALUES tuples (special dataset
    6168             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    6169             :  * considered as the nodata value and not each value of the triplet
    6170             :  * independently per band.
    6171             :  *
    6172             :  * The GDAL_NUM_THREADS configuration option can be set
    6173             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    6174             :  * overview computation.
    6175             :  *
    6176             :  * @param apoSrcBands the list of source bands to downsample
    6177             :  * @param aapoOverviewBands two-dimensional array of bands. First dimension is
    6178             :  *                          indexed by bands. Second dimension is indexed by
    6179             :  *                          overview levels. All aapoOverviewBands[i] arrays
    6180             :  *                          must have the same size (i.e. same number of
    6181             :  *                          overviews)
    6182             :  * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
    6183             :  * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
    6184             :  * @param pfnProgress progress report function.
    6185             :  * @param pProgressData progress function callback data.
    6186             :  * @param papszOptions NULL terminated list of options as
    6187             :  *                     key=value pairs, or NULL
    6188             :  *                     The XOFF, YOFF, XSIZE and YSIZE
    6189             :  *                     options can be specified to express that overviews should
    6190             :  *                     be regenerated only in the specified subset of the source
    6191             :  *                     dataset.
    6192             :  * @return CE_None on success or CE_Failure on failure.
    6193             :  * @since 3.10
    6194             :  */
    6195             : 
    6196          19 : CPLErr GDALRegenerateOverviewsMultiBand(
    6197             :     const std::vector<GDALRasterBand *> &apoSrcBands,
    6198             :     const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
    6199             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    6200             :     void *pProgressData, CSLConstList papszOptions)
    6201             : {
    6202          19 :     CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
    6203          29 :     for (size_t i = 1; i < aapoOverviewBands.size(); ++i)
    6204             :     {
    6205          10 :         CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());
    6206             :     }
    6207             : 
    6208          19 :     if (aapoOverviewBands.empty())
    6209           0 :         return CE_None;
    6210             : 
    6211          19 :     std::vector<GDALRasterBand **> apapoOverviewBands;
    6212          48 :     for (auto &apoOverviewBands : aapoOverviewBands)
    6213             :     {
    6214             :         auto papoOverviewBands = static_cast<GDALRasterBand **>(
    6215          29 :             CPLMalloc(apoOverviewBands.size() * sizeof(GDALRasterBand *)));
    6216          61 :         for (size_t i = 0; i < apoOverviewBands.size(); ++i)
    6217             :         {
    6218          32 :             papoOverviewBands[i] = apoOverviewBands[i];
    6219             :         }
    6220          29 :         apapoOverviewBands.push_back(papoOverviewBands);
    6221             :     }
    6222          38 :     const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
    6223          19 :         static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
    6224          19 :         static_cast<int>(aapoOverviewBands[0].size()),
    6225          19 :         apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
    6226             :         papszOptions);
    6227          48 :     for (GDALRasterBand **papoOverviewBands : apapoOverviewBands)
    6228          29 :         CPLFree(papoOverviewBands);
    6229          19 :     return eErr;
    6230             : }
    6231             : 
    6232             : /************************************************************************/
    6233             : /*                        GDALComputeBandStats()                        */
    6234             : /************************************************************************/
    6235             : 
    6236             : /** Undocumented
    6237             :  * @param hSrcBand undocumented.
    6238             :  * @param nSampleStep Step between scanlines used to compute statistics.
    6239             :  *                    When nSampleStep is equal to 1, all scanlines will
    6240             :  *                    be processed.
    6241             :  * @param pdfMean undocumented.
    6242             :  * @param pdfStdDev undocumented.
    6243             :  * @param pfnProgress undocumented.
    6244             :  * @param pProgressData undocumented.
    6245             :  * @return undocumented
    6246             :  */
    6247          18 : CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
    6248             :                                         int nSampleStep, double *pdfMean,
    6249             :                                         double *pdfStdDev,
    6250             :                                         GDALProgressFunc pfnProgress,
    6251             :                                         void *pProgressData)
    6252             : 
    6253             : {
    6254          18 :     VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
    6255             : 
    6256          18 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    6257             : 
    6258          18 :     if (pfnProgress == nullptr)
    6259          18 :         pfnProgress = GDALDummyProgress;
    6260             : 
    6261          18 :     const int nWidth = poSrcBand->GetXSize();
    6262          18 :     const int nHeight = poSrcBand->GetYSize();
    6263             : 
    6264          18 :     if (nSampleStep >= nHeight || nSampleStep < 1)
    6265           5 :         nSampleStep = 1;
    6266             : 
    6267          18 :     GDALDataType eWrkType = GDT_Unknown;
    6268          18 :     float *pafData = nullptr;
    6269          18 :     GDALDataType eType = poSrcBand->GetRasterDataType();
    6270          18 :     const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
    6271          18 :     if (bComplex)
    6272             :     {
    6273             :         pafData = static_cast<float *>(
    6274           0 :             VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
    6275           0 :         eWrkType = GDT_CFloat32;
    6276             :     }
    6277             :     else
    6278             :     {
    6279             :         pafData =
    6280          18 :             static_cast<float *>(VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
    6281          18 :         eWrkType = GDT_Float32;
    6282             :     }
    6283             : 
    6284          18 :     if (nWidth == 0 || pafData == nullptr)
    6285             :     {
    6286           0 :         VSIFree(pafData);
    6287           0 :         return CE_Failure;
    6288             :     }
    6289             : 
    6290             :     /* -------------------------------------------------------------------- */
    6291             :     /*      Loop over all sample lines.                                     */
    6292             :     /* -------------------------------------------------------------------- */
    6293          18 :     double dfSum = 0.0;
    6294          18 :     double dfSum2 = 0.0;
    6295          18 :     int iLine = 0;
    6296          18 :     GIntBig nSamples = 0;
    6297             : 
    6298        2143 :     do
    6299             :     {
    6300        2161 :         if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
    6301             :                          pProgressData))
    6302             :         {
    6303           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6304           0 :             CPLFree(pafData);
    6305           0 :             return CE_Failure;
    6306             :         }
    6307             : 
    6308             :         const CPLErr eErr =
    6309        2161 :             poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
    6310             :                                 1, eWrkType, 0, 0, nullptr);
    6311        2161 :         if (eErr != CE_None)
    6312             :         {
    6313           1 :             CPLFree(pafData);
    6314           1 :             return eErr;
    6315             :         }
    6316             : 
    6317      725208 :         for (int iPixel = 0; iPixel < nWidth; ++iPixel)
    6318             :         {
    6319      723048 :             float fValue = 0.0f;
    6320             : 
    6321      723048 :             if (bComplex)
    6322             :             {
    6323             :                 // Compute the magnitude of the complex value.
    6324             :                 fValue =
    6325           0 :                     std::hypot(pafData[iPixel * 2], pafData[iPixel * 2 + 1]);
    6326             :             }
    6327             :             else
    6328             :             {
    6329      723048 :                 fValue = pafData[iPixel];
    6330             :             }
    6331             : 
    6332      723048 :             dfSum += fValue;
    6333      723048 :             dfSum2 += static_cast<double>(fValue) * fValue;
    6334             :         }
    6335             : 
    6336        2160 :         nSamples += nWidth;
    6337        2160 :         iLine += nSampleStep;
    6338        2160 :     } while (iLine < nHeight);
    6339             : 
    6340          17 :     if (!pfnProgress(1.0, nullptr, pProgressData))
    6341             :     {
    6342           0 :         CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6343           0 :         CPLFree(pafData);
    6344           0 :         return CE_Failure;
    6345             :     }
    6346             : 
    6347             :     /* -------------------------------------------------------------------- */
    6348             :     /*      Produce the result values.                                      */
    6349             :     /* -------------------------------------------------------------------- */
    6350          17 :     if (pdfMean != nullptr)
    6351          17 :         *pdfMean = dfSum / nSamples;
    6352             : 
    6353          17 :     if (pdfStdDev != nullptr)
    6354             :     {
    6355          17 :         const double dfMean = dfSum / nSamples;
    6356             : 
    6357          17 :         *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
    6358             :     }
    6359             : 
    6360          17 :     CPLFree(pafData);
    6361             : 
    6362          17 :     return CE_None;
    6363             : }
    6364             : 
    6365             : /************************************************************************/
    6366             : /*                  GDALOverviewMagnitudeCorrection()                   */
    6367             : /*                                                                      */
    6368             : /*      Correct the mean and standard deviation of the overviews of     */
    6369             : /*      the given band to match the base layer approximately.           */
    6370             : /************************************************************************/
    6371             : 
    6372             : /** Undocumented
    6373             :  * @param hBaseBand undocumented.
    6374             :  * @param nOverviewCount undocumented.
    6375             :  * @param pahOverviews undocumented.
    6376             :  * @param pfnProgress undocumented.
    6377             :  * @param pProgressData undocumented.
    6378             :  * @return undocumented
    6379             :  */
    6380           0 : CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
    6381             :                                        int nOverviewCount,
    6382             :                                        GDALRasterBandH *pahOverviews,
    6383             :                                        GDALProgressFunc pfnProgress,
    6384             :                                        void *pProgressData)
    6385             : 
    6386             : {
    6387           0 :     VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
    6388             : 
    6389             :     /* -------------------------------------------------------------------- */
    6390             :     /*      Compute mean/stddev for source raster.                          */
    6391             :     /* -------------------------------------------------------------------- */
    6392           0 :     double dfOrigMean = 0.0;
    6393           0 :     double dfOrigStdDev = 0.0;
    6394             :     {
    6395             :         const CPLErr eErr =
    6396           0 :             GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
    6397             :                                  pfnProgress, pProgressData);
    6398             : 
    6399           0 :         if (eErr != CE_None)
    6400           0 :             return eErr;
    6401             :     }
    6402             : 
    6403             :     /* -------------------------------------------------------------------- */
    6404             :     /*      Loop on overview bands.                                         */
    6405             :     /* -------------------------------------------------------------------- */
    6406           0 :     for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
    6407             :     {
    6408             :         GDALRasterBand *poOverview =
    6409           0 :             GDALRasterBand::FromHandle(pahOverviews[iOverview]);
    6410             :         double dfOverviewMean, dfOverviewStdDev;
    6411             : 
    6412             :         const CPLErr eErr =
    6413           0 :             GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
    6414             :                                  &dfOverviewStdDev, pfnProgress, pProgressData);
    6415             : 
    6416           0 :         if (eErr != CE_None)
    6417           0 :             return eErr;
    6418             : 
    6419           0 :         double dfGain = 1.0;
    6420           0 :         if (dfOrigStdDev >= 0.0001)
    6421           0 :             dfGain = dfOrigStdDev / dfOverviewStdDev;
    6422             : 
    6423             :         /* --------------------------------------------------------------------
    6424             :          */
    6425             :         /*      Apply gain and offset. */
    6426             :         /* --------------------------------------------------------------------
    6427             :          */
    6428           0 :         const int nWidth = poOverview->GetXSize();
    6429           0 :         const int nHeight = poOverview->GetYSize();
    6430             : 
    6431           0 :         GDALDataType eWrkType = GDT_Unknown;
    6432           0 :         float *pafData = nullptr;
    6433           0 :         const GDALDataType eType = poOverview->GetRasterDataType();
    6434           0 :         const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
    6435           0 :         if (bComplex)
    6436             :         {
    6437             :             pafData = static_cast<float *>(
    6438           0 :                 VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
    6439           0 :             eWrkType = GDT_CFloat32;
    6440             :         }
    6441             :         else
    6442             :         {
    6443             :             pafData = static_cast<float *>(
    6444           0 :                 VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
    6445           0 :             eWrkType = GDT_Float32;
    6446             :         }
    6447             : 
    6448           0 :         if (pafData == nullptr)
    6449             :         {
    6450           0 :             return CE_Failure;
    6451             :         }
    6452             : 
    6453           0 :         for (int iLine = 0; iLine < nHeight; ++iLine)
    6454             :         {
    6455           0 :             if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
    6456             :                              pProgressData))
    6457             :             {
    6458           0 :                 CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6459           0 :                 CPLFree(pafData);
    6460           0 :                 return CE_Failure;
    6461             :             }
    6462             : 
    6463           0 :             if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
    6464             :                                      nWidth, 1, eWrkType, 0, 0,
    6465           0 :                                      nullptr) != CE_None)
    6466             :             {
    6467           0 :                 CPLFree(pafData);
    6468           0 :                 return CE_Failure;
    6469             :             }
    6470             : 
    6471           0 :             for (int iPixel = 0; iPixel < nWidth; ++iPixel)
    6472             :             {
    6473           0 :                 if (bComplex)
    6474             :                 {
    6475           0 :                     pafData[iPixel * 2] *= static_cast<float>(dfGain);
    6476           0 :                     pafData[iPixel * 2 + 1] *= static_cast<float>(dfGain);
    6477             :                 }
    6478             :                 else
    6479             :                 {
    6480           0 :                     pafData[iPixel] = static_cast<float>(
    6481           0 :                         (pafData[iPixel] - dfOverviewMean) * dfGain +
    6482             :                         dfOrigMean);
    6483             :                 }
    6484             :             }
    6485             : 
    6486           0 :             if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
    6487             :                                      nWidth, 1, eWrkType, 0, 0,
    6488           0 :                                      nullptr) != CE_None)
    6489             :             {
    6490           0 :                 CPLFree(pafData);
    6491           0 :                 return CE_Failure;
    6492             :             }
    6493             :         }
    6494             : 
    6495           0 :         if (!pfnProgress(1.0, nullptr, pProgressData))
    6496             :         {
    6497           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6498           0 :             CPLFree(pafData);
    6499           0 :             return CE_Failure;
    6500             :         }
    6501             : 
    6502           0 :         CPLFree(pafData);
    6503             :     }
    6504             : 
    6505           0 :     return CE_None;
    6506             : }

Generated by: LCOV version 1.14