LCOV - code coverage report
Current view: top level - gcore - overview.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 2586 2956 87.5 %
Date: 2025-07-09 17:50:03 Functions: 118 127 92.9 %

          Line data    Source code
       1             : 
       2             : /******************************************************************************
       3             :  *
       4             :  * Project:  GDAL Core
       5             :  * Purpose:  Helper code to implement overview support in different drivers.
       6             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       7             :  *
       8             :  ******************************************************************************
       9             :  * Copyright (c) 2000, Frank Warmerdam
      10             :  * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
      11             :  *
      12             :  * SPDX-License-Identifier: MIT
      13             :  ****************************************************************************/
      14             : 
      15             : #include "cpl_port.h"
      16             : #include "gdal_priv.h"
      17             : 
      18             : #include <cmath>
      19             : #include <cstddef>
      20             : #include <cstdlib>
      21             : 
      22             : #include <algorithm>
      23             : #include <complex>
      24             : #include <condition_variable>
      25             : #include <limits>
      26             : #include <list>
      27             : #include <memory>
      28             : #include <mutex>
      29             : #include <vector>
      30             : 
      31             : #include "cpl_conv.h"
      32             : #include "cpl_error.h"
      33             : #include "cpl_float.h"
      34             : #include "cpl_progress.h"
      35             : #include "cpl_vsi.h"
      36             : #include "gdal.h"
      37             : #include "gdal_thread_pool.h"
      38             : #include "gdalwarper.h"
      39             : #include "gdal_vrt.h"
      40             : #include "vrtdataset.h"
      41             : 
      42             : #ifdef USE_NEON_OPTIMIZATIONS
      43             : #include "include_sse2neon.h"
      44             : #define USE_SSE2
      45             : 
      46             : #include "gdalsse_priv.h"
      47             : 
      48             : // Restrict to 64bit processors because they are guaranteed to have SSE2,
      49             : // or if __AVX2__ is defined.
      50             : #elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
      51             : #define USE_SSE2
      52             : 
      53             : #include "gdalsse_priv.h"
      54             : 
      55             : #ifdef __SSE3__
      56             : #include <pmmintrin.h>
      57             : #endif
      58             : #ifdef __SSSE3__
      59             : #include <tmmintrin.h>
      60             : #endif
      61             : #ifdef __SSE4_1__
      62             : #include <smmintrin.h>
      63             : #endif
      64             : #ifdef __AVX2__
      65             : #include <immintrin.h>
      66             : #endif
      67             : 
      68             : #endif
      69             : 
      70             : // To be included after above USE_SSE2 and include gdalsse_priv.h
      71             : // to avoid build issue on Windows x86
      72             : #include "gdal_priv_templates.hpp"
      73             : 
      74             : /************************************************************************/
      75             : /*                      GDALResampleChunk_Near()                        */
      76             : /************************************************************************/
      77             : 
      78             : template <class T>
      79        1233 : static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
      80             :                                       const T *pChunk, T **ppDstBuffer)
      81             : 
      82             : {
      83        1233 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
      84        1233 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
      85        1233 :     const GDALDataType eWrkDataType = args.eWrkDataType;
      86        1233 :     const int nChunkXOff = args.nChunkXOff;
      87        1233 :     const int nChunkXSize = args.nChunkXSize;
      88        1233 :     const int nChunkYOff = args.nChunkYOff;
      89        1233 :     const int nDstXOff = args.nDstXOff;
      90        1233 :     const int nDstXOff2 = args.nDstXOff2;
      91        1233 :     const int nDstYOff = args.nDstYOff;
      92        1233 :     const int nDstYOff2 = args.nDstYOff2;
      93        1233 :     const int nDstXWidth = nDstXOff2 - nDstXOff;
      94             : 
      95             :     /* -------------------------------------------------------------------- */
      96             :     /*      Allocate buffers.                                               */
      97             :     /* -------------------------------------------------------------------- */
      98        1233 :     *ppDstBuffer = static_cast<T *>(
      99        1233 :         VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
     100             :                             GDALGetDataTypeSizeBytes(eWrkDataType)));
     101        1233 :     if (*ppDstBuffer == nullptr)
     102             :     {
     103           0 :         return CE_Failure;
     104             :     }
     105        1233 :     T *const pDstBuffer = *ppDstBuffer;
     106             : 
     107             :     int *panSrcXOff =
     108        1233 :         static_cast<int *>(VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(int)));
     109             : 
     110        1233 :     if (panSrcXOff == nullptr)
     111             :     {
     112           0 :         return CE_Failure;
     113             :     }
     114             : 
     115             :     /* ==================================================================== */
     116             :     /*      Precompute inner loop constants.                                */
     117             :     /* ==================================================================== */
     118      842009 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
     119             :     {
     120      840776 :         int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
     121      840776 :         if (nSrcXOff < nChunkXOff)
     122           0 :             nSrcXOff = nChunkXOff;
     123             : 
     124      840776 :         panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
     125             :     }
     126             : 
     127             :     /* ==================================================================== */
     128             :     /*      Loop over destination scanlines.                                */
     129             :     /* ==================================================================== */
     130      141825 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
     131             :     {
     132      140592 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
     133      140592 :         if (nSrcYOff < nChunkYOff)
     134           0 :             nSrcYOff = nChunkYOff;
     135             : 
     136      140592 :         const T *const pSrcScanline =
     137             :             pChunk +
     138      140592 :             (static_cast<size_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
     139      138074 :             nChunkXOff;
     140             : 
     141             :         /* --------------------------------------------------------------------
     142             :          */
     143             :         /*      Loop over destination pixels */
     144             :         /* --------------------------------------------------------------------
     145             :          */
     146      140592 :         T *pDstScanline =
     147      140592 :             pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
     148   119627130 :         for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
     149             :         {
     150   119486612 :             pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
     151             :         }
     152             :     }
     153             : 
     154        1233 :     CPLFree(panSrcXOff);
     155             : 
     156        1233 :     return CE_None;
     157             : }
     158             : 
     159        1233 : static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
     160             :                                      const void *pChunk, void **ppDstBuffer,
     161             :                                      GDALDataType *peDstBufferDataType)
     162             : {
     163        1233 :     *peDstBufferDataType = args.eWrkDataType;
     164        1233 :     switch (args.eWrkDataType)
     165             :     {
     166             :         // For nearest resampling, as no computation is done, only the
     167             :         // size of the data type matters.
     168        1081 :         case GDT_Byte:
     169             :         case GDT_Int8:
     170             :         {
     171        1081 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 1);
     172        1081 :             return GDALResampleChunk_NearT(
     173             :                 args, static_cast<const uint8_t *>(pChunk),
     174        1081 :                 reinterpret_cast<uint8_t **>(ppDstBuffer));
     175             :         }
     176             : 
     177          50 :         case GDT_Int16:
     178             :         case GDT_UInt16:
     179             :         case GDT_Float16:
     180             :         {
     181          50 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
     182          50 :             return GDALResampleChunk_NearT(
     183             :                 args, static_cast<const uint16_t *>(pChunk),
     184          50 :                 reinterpret_cast<uint16_t **>(ppDstBuffer));
     185             :         }
     186             : 
     187          55 :         case GDT_CInt16:
     188             :         case GDT_CFloat16:
     189             :         case GDT_Int32:
     190             :         case GDT_UInt32:
     191             :         case GDT_Float32:
     192             :         {
     193          55 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
     194          55 :             return GDALResampleChunk_NearT(
     195             :                 args, static_cast<const uint32_t *>(pChunk),
     196          55 :                 reinterpret_cast<uint32_t **>(ppDstBuffer));
     197             :         }
     198             : 
     199          43 :         case GDT_CInt32:
     200             :         case GDT_CFloat32:
     201             :         case GDT_Int64:
     202             :         case GDT_UInt64:
     203             :         case GDT_Float64:
     204             :         {
     205          43 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
     206          43 :             return GDALResampleChunk_NearT(
     207             :                 args, static_cast<const uint64_t *>(pChunk),
     208          43 :                 reinterpret_cast<uint64_t **>(ppDstBuffer));
     209             :         }
     210             : 
     211           4 :         case GDT_CFloat64:
     212             :         {
     213           4 :             return GDALResampleChunk_NearT(
     214             :                 args, static_cast<const std::complex<double> *>(pChunk),
     215           4 :                 reinterpret_cast<std::complex<double> **>(ppDstBuffer));
     216             :         }
     217             : 
     218           0 :         case GDT_Unknown:
     219             :         case GDT_TypeCount:
     220           0 :             break;
     221             :     }
     222           0 :     CPLAssert(false);
     223             :     return CE_Failure;
     224             : }
     225             : 
     226             : namespace
     227             : {
     228             : 
     229             : // Find in the color table the entry whose RGB value is the closest
     230             : // (using quadratic distance) to the test color, ignoring transparent entries.
     231        3837 : int BestColorEntry(const std::vector<GDALColorEntry> &entries,
     232             :                    const GDALColorEntry &test)
     233             : {
     234        3837 :     int nMinDist = std::numeric_limits<int>::max();
     235        3837 :     size_t bestEntry = 0;
     236      986109 :     for (size_t i = 0; i < entries.size(); ++i)
     237             :     {
     238      982272 :         const GDALColorEntry &entry = entries[i];
     239             :         // Ignore transparent entries
     240      982272 :         if (entry.c4 == 0)
     241        3237 :             continue;
     242             : 
     243      979035 :         int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
     244      979035 :                     ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
     245      979035 :                     ((test.c3 - entry.c3) * (test.c3 - entry.c3));
     246      979035 :         if (nDist < nMinDist)
     247             :         {
     248       15847 :             nMinDist = nDist;
     249       15847 :             bestEntry = i;
     250             :         }
     251             :     }
     252        3837 :     return static_cast<int>(bestEntry);
     253             : }
     254             : 
     255           7 : std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
     256             :                                            int &transparentIdx)
     257             : {
     258           7 :     std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
     259             : 
     260           7 :     transparentIdx = -1;
     261           7 :     int i = 0;
     262        1799 :     for (auto &entry : entries)
     263             :     {
     264        1792 :         table.GetColorEntryAsRGB(i, &entry);
     265        1792 :         if (transparentIdx < 0 && entry.c4 == 0)
     266           1 :             transparentIdx = i;
     267        1792 :         ++i;
     268             :     }
     269           7 :     return entries;
     270             : }
     271             : 
     272             : }  // unnamed  namespace
     273             : 
     274             : /************************************************************************/
     275             : /*                             SQUARE()                                 */
     276             : /************************************************************************/
     277             : 
     278        3721 : template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
     279             : {
     280        3721 :     return static_cast<Tsquare>(val) * val;
     281             : }
     282             : 
     283             : /************************************************************************/
     284             : /*                          ComputeIntegerRMS()                         */
     285             : /************************************************************************/
     286             : // Compute rms = sqrt(sumSquares / weight) in such a way that it is the
     287             : // integer that minimizes abs(rms**2 - sumSquares / weight)
     288             : template <class T, class Twork>
     289          42 : inline T ComputeIntegerRMS(double sumSquares, double weight)
     290             : {
     291          42 :     const double sumDivWeight = sumSquares / weight;
     292          42 :     T rms = static_cast<T>(sqrt(sumDivWeight));
     293             : 
     294             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     295             :     // Naive version:
     296             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     297          42 :     if (static_cast<double>(static_cast<Twork>(2) * rms * (rms + 1) + 1) <
     298          42 :         2 * sumDivWeight)
     299           6 :         rms += 1;
     300          42 :     return rms;
     301             : }
     302             : 
     303           0 : template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
     304             : {
     305           0 :     CPLAssert(false);
     306             :     return 0;
     307             : }
     308             : 
     309          24 : template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
     310             : {
     311             :     // It has been verified that given the correction on rms below, using
     312             :     // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
     313             :     // is equivalent, so use the former as it is used twice.
     314          24 :     const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
     315          24 :     const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
     316          24 :     GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
     317             : 
     318             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     319             :     // Naive version:
     320             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     321             :     // Optimized version for integer case and weight == 4
     322          24 :     if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
     323           5 :         rms += 1;
     324          24 :     return rms;
     325             : }
     326             : 
     327             : template <>
     328          20 : inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
     329             : {
     330          20 :     const double sumDivWeight = sumSquares * 0.25;
     331          20 :     GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
     332             : 
     333             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     334             :     // Naive version:
     335             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     336             :     // Optimized version for integer case and weight == 4
     337          20 :     if (static_cast<GUInt32>(rms) * (rms + 1) <
     338          20 :         static_cast<GUInt32>(sumDivWeight + 0.25))
     339           4 :         rms += 1;
     340          20 :     return rms;
     341             : }
     342             : 
     343             : #ifdef USE_SSE2
     344             : 
     345             : /************************************************************************/
     346             : /*                   QuadraticMeanByteSSE2OrAVX2()                      */
     347             : /************************************************************************/
     348             : 
     349             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     350             : #define sse2_packus_epi32 _mm_packus_epi32
     351             : #else
     352      516119 : inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
     353             : {
     354      516119 :     const auto minus32768_32 = _mm_set1_epi32(-32768);
     355      516119 :     const auto minus32768_16 = _mm_set1_epi16(-32768);
     356      516119 :     a = _mm_add_epi32(a, minus32768_32);
     357      516119 :     b = _mm_add_epi32(b, minus32768_32);
     358      516119 :     a = _mm_packs_epi32(a, b);
     359      516119 :     a = _mm_sub_epi16(a, minus32768_16);
     360      516119 :     return a;
     361             : }
     362             : #endif
     363             : 
     364             : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
     365             : #define sse2_hadd_epi16 _mm_hadd_epi16
     366             : #else
     367     4667530 : inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
     368             : {
     369             :     // Horizontal addition of adjacent pairs
     370     4667530 :     const auto mask = _mm_set1_epi32(0xFFFF);
     371             :     const auto horizLo =
     372    14002600 :         _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
     373             :     const auto horizHi =
     374    14002600 :         _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
     375             : 
     376             :     // Recombine low and high parts
     377     4667530 :     return _mm_packs_epi32(horizLo, horizHi);
     378             : }
     379             : #endif
     380             : 
     381             : #ifdef __AVX2__
     382             : 
     383             : #define DEST_ELTS 16
     384             : #define set1_epi16 _mm256_set1_epi16
     385             : #define set1_epi32 _mm256_set1_epi32
     386             : #define setzero _mm256_setzero_si256
     387             : #define set1_ps _mm256_set1_ps
     388             : #define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
     389             : #define unpacklo_epi8 _mm256_unpacklo_epi8
     390             : #define unpackhi_epi8 _mm256_unpackhi_epi8
     391             : #define madd_epi16 _mm256_madd_epi16
     392             : #define add_epi32 _mm256_add_epi32
     393             : #define mul_ps _mm256_mul_ps
     394             : #define cvtepi32_ps _mm256_cvtepi32_ps
     395             : #define sqrt_ps _mm256_sqrt_ps
     396             : #define cvttps_epi32 _mm256_cvttps_epi32
     397             : #define packs_epi32 _mm256_packs_epi32
     398             : #define packus_epi32 _mm256_packus_epi32
     399             : #define srli_epi32 _mm256_srli_epi32
     400             : #define mullo_epi16 _mm256_mullo_epi16
     401             : #define srli_epi16 _mm256_srli_epi16
     402             : #define cmpgt_epi16 _mm256_cmpgt_epi16
     403             : #define add_epi16 _mm256_add_epi16
     404             : #define sub_epi16 _mm256_sub_epi16
     405             : #define packus_epi16 _mm256_packus_epi16
     406             : 
     407             : /* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
     408             : /* to get the lower 128 bits of what would be a true 256-bit vector register
     409             :  */
     410             : 
     411             : inline __m256i FIXUP_LANES(__m256i x)
     412             : {
     413             :     return _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0));
     414             : }
     415             : 
     416             : #define store_lo(x, y)                                                         \
     417             :     _mm_storeu_si128(reinterpret_cast<__m128i *>(x),                           \
     418             :                      _mm256_extracti128_si256(FIXUP_LANES(y), 0))
     419             : #define storeu_int(x, y)                                                       \
     420             :     _mm256_storeu_si256(reinterpret_cast<__m256i *>(x), FIXUP_LANES(y))
     421             : #define hadd_epi16 _mm256_hadd_epi16
     422             : #else
     423             : #define DEST_ELTS 8
     424             : #define set1_epi16 _mm_set1_epi16
     425             : #define set1_epi32 _mm_set1_epi32
     426             : #define setzero _mm_setzero_si128
     427             : #define set1_ps _mm_set1_ps
     428             : #define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
     429             : #define unpacklo_epi8 _mm_unpacklo_epi8
     430             : #define unpackhi_epi8 _mm_unpackhi_epi8
     431             : #define madd_epi16 _mm_madd_epi16
     432             : #define add_epi32 _mm_add_epi32
     433             : #define mul_ps _mm_mul_ps
     434             : #define cvtepi32_ps _mm_cvtepi32_ps
     435             : #define sqrt_ps _mm_sqrt_ps
     436             : #define cvttps_epi32 _mm_cvttps_epi32
     437             : #define packs_epi32 _mm_packs_epi32
     438             : #define packus_epi32 sse2_packus_epi32
     439             : #define srli_epi32 _mm_srli_epi32
     440             : #define mullo_epi16 _mm_mullo_epi16
     441             : #define srli_epi16 _mm_srli_epi16
     442             : #define cmpgt_epi16 _mm_cmpgt_epi16
     443             : #define add_epi16 _mm_add_epi16
     444             : #define sub_epi16 _mm_sub_epi16
     445             : #define packus_epi16 _mm_packus_epi16
     446             : #define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
     447             : #define storeu_int(x, y) _mm_storeu_si128(reinterpret_cast<__m128i *>(x), (y))
     448             : #define hadd_epi16 sse2_hadd_epi16
     449             : #endif
     450             : 
     451             : template <class T>
     452             : static int
     453             : #if defined(__GNUC__)
     454             :     __attribute__((noinline))
     455             : #endif
     456        5385 :     QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
     457             :                                 const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     458             :                                 T *CPL_RESTRICT pDstScanline)
     459             : {
     460             :     // Optimized implementation for RMS on Byte by
     461             :     // processing by group of 8 output pixels, so as to use
     462             :     // a single _mm_sqrt_ps() call for 4 output pixels
     463        5385 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     464             : 
     465        5385 :     int iDstPixel = 0;
     466        5385 :     const auto one16 = set1_epi16(1);
     467        5385 :     const auto one32 = set1_epi32(1);
     468        5385 :     const auto zero = setzero();
     469        5385 :     const auto minus32768 = set1_epi16(-32768);
     470             : 
     471      521496 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
     472             :     {
     473             :         // Load 2 * DEST_ELTS bytes from each line
     474      516111 :         auto firstLine = loadu_int(pSrcScanlineShifted);
     475     1032220 :         auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
     476             :         // Extend those Bytes as UInt16s
     477      516111 :         auto firstLineLo = unpacklo_epi8(firstLine, zero);
     478      516111 :         auto firstLineHi = unpackhi_epi8(firstLine, zero);
     479      516111 :         auto secondLineLo = unpacklo_epi8(secondLine, zero);
     480      516111 :         auto secondLineHi = unpackhi_epi8(secondLine, zero);
     481             : 
     482             :         // Multiplication of 16 bit values and horizontal
     483             :         // addition of 32 bit results
     484             :         // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
     485      516111 :         firstLineLo = madd_epi16(firstLineLo, firstLineLo);
     486      516111 :         firstLineHi = madd_epi16(firstLineHi, firstLineHi);
     487      516111 :         secondLineLo = madd_epi16(secondLineLo, secondLineLo);
     488      516111 :         secondLineHi = madd_epi16(secondLineHi, secondLineHi);
     489             : 
     490             :         // Vertical addition
     491      516111 :         const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
     492      516111 :         const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
     493             : 
     494             :         const auto sumSquaresPlusOneDiv4Lo =
     495     1032220 :             srli_epi32(add_epi32(sumSquaresLo, one32), 2);
     496             :         const auto sumSquaresPlusOneDiv4Hi =
     497     1032220 :             srli_epi32(add_epi32(sumSquaresHi, one32), 2);
     498             : 
     499             :         // Take square root and truncate/floor to int32
     500             :         const auto rmsLo =
     501     1548330 :             cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
     502             :         const auto rmsHi =
     503     1548330 :             cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
     504             : 
     505             :         // Merge back low and high registers with each RMS value
     506             :         // as a 16 bit value.
     507      516111 :         auto rms = packs_epi32(rmsLo, rmsHi);
     508             : 
     509             :         // Round to upper value if it minimizes the
     510             :         // error |rms^2 - sumSquares/4|
     511             :         // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
     512             :         //    rms += 1;
     513             :         // which is equivalent to:
     514             :         // if( rms * (rms + 1) < (sumSquares+1) / 4 )
     515             :         //    rms += 1;
     516             :         // And both left and right parts fit on 16 (unsigned) bits
     517             :         const auto sumSquaresPlusOneDiv4 =
     518      516111 :             packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
     519             :         // cmpgt_epi16 operates on signed int16, but here
     520             :         // we have unsigned values, so shift them by -32768 before
     521     2580560 :         auto mask = cmpgt_epi16(
     522             :             add_epi16(sumSquaresPlusOneDiv4, minus32768),
     523             :             add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
     524             :         // The value of the mask will be -1 when the correction needs to be
     525             :         // applied
     526      516111 :         rms = sub_epi16(rms, mask);
     527             : 
     528             :         // Pack each 16 bit RMS value to 8 bits
     529      516111 :         rms = packus_epi16(rms, rms /* could be anything */);
     530      516111 :         store_lo(&pDstScanline[iDstPixel], rms);
     531      516111 :         pSrcScanlineShifted += 2 * DEST_ELTS;
     532             :     }
     533             : 
     534        5385 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     535        5385 :     return iDstPixel;
     536             : }
     537             : 
     538             : /************************************************************************/
     539             : /*                      AverageByteSSE2OrAVX2()                         */
     540             : /************************************************************************/
     541             : 
     542             : template <class T>
     543             : static int
     544      111280 : AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
     545             :                       const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     546             :                       T *CPL_RESTRICT pDstScanline)
     547             : {
     548             :     // Optimized implementation for average on Byte by
     549             :     // processing by group of 16 output pixels for SSE2, or 32 for AVX2
                           //
                           // Each destination pixel is (a + b + c + d + 2) / 4, where a and b are
                           // two horizontally adjacent source values and c and d are the values
                           // located nChunkXSize elements after them (presumably the next source
                           // line; confirm against the caller).
                           //
                           // Only whole groups of 2 * DEST_ELTS destination pixels are produced.
                           // The function returns the number of destination pixels written and
                           // advances pSrcScanlineShiftedInOut past the consumed source values, so
                           // the remaining pixels are presumably handled by the caller.
                           //
                           // setzero, loadu_int, hadd_epi16, etc. are vector wrappers defined
                           // outside this block (not visible here).
     550             : 
     551      111280 :     const auto zero = setzero();
     552      111280 :     const auto two16 = set1_epi16(2);
     553      111280 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     554             : 
     555      111280 :     int iDstPixel = 0;
                           // Stop as soon as fewer than 2 * DEST_ELTS destination pixels remain.
     556     2445050 :     for (; iDstPixel < nDstXWidth - (2 * DEST_ELTS - 1);
     557             :          iDstPixel += 2 * DEST_ELTS)
     558             :     {
                               // First half of the group: DEST_ELTS destination pixels, kept as
                               // 16-bit values until the final pack.
     559             :         decltype(setzero()) average0;
     560             :         {
     561             :             // Load 2 * DEST_ELTS bytes from each line
     562     2333770 :             const auto firstLine = loadu_int(pSrcScanlineShifted);
     563             :             const auto secondLine =
     564     4667530 :                 loadu_int(pSrcScanlineShifted + nChunkXSize);
     565             :             // Extend those Bytes as UInt16s
     566     2333770 :             const auto firstLineLo = unpacklo_epi8(firstLine, zero);
     567     2333770 :             const auto firstLineHi = unpackhi_epi8(firstLine, zero);
     568     2333770 :             const auto secondLineLo = unpacklo_epi8(secondLine, zero);
     569     2333770 :             const auto secondLineHi = unpackhi_epi8(secondLine, zero);
     570             : 
     571             :             // Vertical addition
     572     2333770 :             const auto sumLo = add_epi16(firstLineLo, secondLineLo);
     573     2333770 :             const auto sumHi = add_epi16(firstLineHi, secondLineHi);
     574             : 
     575             :             // Horizontal addition of adjacent pairs, and recombine low and high
     576             :             // parts
     577     2333770 :             const auto sum = hadd_epi16(sumLo, sumHi);
     578             : 
     579             :             // average = (sum + 2) / 4
                                   // The sum of four bytes plus 2 is at most 1022, so it cannot
                                   // overflow a 16-bit element.
     580     2333770 :             average0 = srli_epi16(add_epi16(sum, two16), 2);
     581             : 
     582     2333770 :             pSrcScanlineShifted += 2 * DEST_ELTS;
     583             :         }
     584             : 
                               // Second half of the group: same computation on the next
                               // 2 * DEST_ELTS source values of each line.
     585             :         decltype(setzero()) average1;
     586             :         {
     587             :             // Load 2 * DEST_ELTS bytes from each line
     588     2333770 :             const auto firstLine = loadu_int(pSrcScanlineShifted);
     589             :             const auto secondLine =
     590     4667530 :                 loadu_int(pSrcScanlineShifted + nChunkXSize);
     591             :             // Extend those Bytes as UInt16s
     592     2333770 :             const auto firstLineLo = unpacklo_epi8(firstLine, zero);
     593     2333770 :             const auto firstLineHi = unpackhi_epi8(firstLine, zero);
     594     2333770 :             const auto secondLineLo = unpacklo_epi8(secondLine, zero);
     595     2333770 :             const auto secondLineHi = unpackhi_epi8(secondLine, zero);
     596             : 
     597             :             // Vertical addition
     598     2333770 :             const auto sumLo = add_epi16(firstLineLo, secondLineLo);
     599     2333770 :             const auto sumHi = add_epi16(firstLineHi, secondLineHi);
     600             : 
     601             :             // Horizontal addition of adjacent pairs, and recombine low and high
     602             :             // parts
     603     2333770 :             const auto sum = hadd_epi16(sumLo, sumHi);
     604             : 
     605             :             // average = (sum + 2) / 4
     606     2333770 :             average1 = srli_epi16(add_epi16(sum, two16), 2);
     607             : 
     608     2333770 :             pSrcScanlineShifted += 2 * DEST_ELTS;
     609             :         }
     610             : 
     611             :         // Pack each 16 bit average value to 8 bits
     612     2333770 :         const auto average = packus_epi16(average0, average1);
     613     2333770 :         storeu_int(&pDstScanline[iDstPixel], average);
     614             :     }
     615             : 
                           // Report back how far the source pointer has moved and how many
                           // destination pixels were written.
     616      111280 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     617      111280 :     return iDstPixel;
     618             : }
     619             : 
     620             : /************************************************************************/
     621             : /*                     QuadraticMeanUInt16SSE2()                        */
     622             : /************************************************************************/
     623             : 
     624             : #ifdef __SSE3__
     625             : #define sse2_hadd_pd _mm_hadd_pd
     626             : #else
                       // Fallback used when SSE3 is not enabled at compile time (otherwise
                       // sse2_hadd_pd is an alias of _mm_hadd_pd, see the #define above).
                       // Horizontal addition of doubles: returns (a[0] + a[1], b[0] + b[1]).
     627           8 : inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
     628             : {
                           // Low doubles of both inputs: (a[0], b[0])
     629             :     auto aLo_bLo =
     630          32 :         _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
                           // High doubles of both inputs: (a[1], b[1])
     631             :     auto aHi_bHi =
     632          32 :         _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
     633           8 :     return _mm_add_pd(aLo_bLo, aHi_bHi);  // (aLo + aHi, bLo + bHi)
     634             : }
     635             : #endif
     636             : 
     637          40 : inline __m128d SQUARE_PD(__m128d x)
     638             : {
                           // Element-wise square of the two doubles: (x[0] * x[0], x[1] * x[1])
     639          40 :     return _mm_mul_pd(x, x);
     640             : }
     641             : 
     642             : #ifdef __AVX2__
     643             : 
                       // 256-bit overload: element-wise square of the four doubles of x.
     644             : inline __m256d SQUARE_PD(__m256d x)
     645             : {
     646             :     return _mm256_mul_pd(x, x);
     647             : }
     648             : 
                       // Reorders the four 64-bit elements of x as (x[0], x[2], x[1], x[3]).
                       // QuadraticMeanUInt16SSE2() applies it to the result of _mm256_hadd_pd,
                       // which operates separately on each 128-bit half, to put the sums back in
                       // pixel order.
     649             : inline __m256d FIXUP_LANES(__m256d x)
     650             : {
     651             :     return _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
     652             : }
     653             : 
                       // Same permutation for a vector of floats: the eight floats are moved as
                       // four 64-bit units (pairs of floats).
     654             : inline __m256 FIXUP_LANES(__m256 x)
     655             : {
     656             :     return _mm256_castpd_ps(FIXUP_LANES(_mm256_castps_pd(x)));
     657             : }
     658             : 
     659             : #endif
     660             : 
     661             : template <class T>
     662             : static int
     663          10 : QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
     664             :                         const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     665             :                         T *CPL_RESTRICT pDstScanline)
     666             : {
     667             :     // Optimized implementation for RMS on UInt16 by
     668             :     // processing by group of 4 output pixels.
                           //
                           // Each destination pixel is sqrt((a^2 + b^2 + c^2 + d^2) / 4) rounded to
                           // an integer, where a and b are two adjacent source values and c and d
                           // are the values nChunkXSize elements after them (presumably the next
                           // source line; confirm against the caller).
                           //
                           // Two code paths exist for each group of 4 destination pixels:
                           // - if all 16 loaded source values are below 2^14, squares are summed
                           //   in 32-bit integers and the root is taken in single precision;
                           // - otherwise squares are computed in double precision.
                           //
                           // Returns the number of destination pixels written and advances
                           // pSrcScanlineShiftedInOut past the consumed source values; the
                           // remaining pixels are presumably handled by the caller.
     669          10 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     670             : 
     671          10 :     int iDstPixel = 0;
     672          10 :     const auto zero = _mm_setzero_si128();
     673             : 
     674             : #ifdef __AVX2__
     675             :     const auto zeroDot25 = _mm256_set1_pd(0.25);
     676             :     const auto zeroDot5 = _mm256_set1_pd(0.5);
     677             : 
     678             :     // The first four 0's could be anything, as we only take the bottom
     679             :     // 128 bits.
                           // The low four indices (0, 2, 4, 6) select the low 32 bits of each
                           // 64-bit comparison mask.
     680             :     const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
     681             : #else
     682          10 :     const auto zeroDot25 = _mm_set1_pd(0.25);
     683          10 :     const auto zeroDot5 = _mm_set1_pd(0.5);
     684             : #endif
     685             : 
     686          40 :     for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
     687             :     {
     688             :         // Load 8 UInt16 from each line
     689          30 :         const auto firstLine = _mm_loadu_si128(
     690             :             reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
     691             :         const auto secondLine =
     692          30 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     693          30 :                 pSrcScanlineShifted + nChunkXSize));
     694             : 
     695             :         // Detect if all of the source values fit in 14 bits.
     696             :         // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
     697             :         // and we can do a much faster implementation.
                               // Shifting right by 14 keeps only the two top bits of each value, so
                               // maskTmp is all zero exactly when every value is below 2^14.
     698             :         const auto maskTmp =
     699          60 :             _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
                               // Pack the eight 16-bit flags into the low 64 bits and read them as
                               // one scalar. On 32-bit x86 the value goes through memory, presumably
                               // because _mm_cvtsi128_si64 is not available there.
     700             : #if defined(__i386__) || defined(_M_IX86)
     701             :         uint64_t nMaskFitsIn14Bits = 0;
     702             :         _mm_storel_epi64(
     703             :             reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
     704             :             _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
     705             : #else
     706          30 :         const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
     707             :             _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
     708             : #endif
     709          30 :         if (nMaskFitsIn14Bits == 0)
     710             :         {
     711             :             // Multiplication of 16 bit values and horizontal
     712             :             // addition of 32 bit results
     713             :             const auto firstLineHSumSquare =
     714          26 :                 _mm_madd_epi16(firstLine, firstLine);
     715             :             const auto secondLineHSumSquare =
     716          26 :                 _mm_madd_epi16(secondLine, secondLine);
     717             :             // Vertical addition
     718             :             const auto sumSquares =
     719          26 :                 _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
     720             :             // In theory we should take sqrt(sumSquares * 0.25f)
     721             :             // but given the rounding we do, this is equivalent to
     722             :             // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
     723             :             // sumSquares <= 4 * 16383^2
     724          26 :             const auto one32 = _mm_set1_epi32(1);
     725             :             const auto sumSquaresPlusOneDiv4 =
     726          52 :                 _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
     727             :             // Take square root and truncate/floor to int32
     728          78 :             auto rms = _mm_cvttps_epi32(
     729             :                 _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
     730             : 
     731             :             // Round to upper value if it minimizes the
     732             :             // error |rms^2 - sumSquares/4|
     733             :             // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
     734             :             //    rms += 1;
     735             :             // which is equivalent to:
     736             :             // if( rms * rms + rms < (sumSquares+1) / 4 )
     737             :             //    rms += 1;
                                   // rms is below 2^14 here, so the upper 16 bits of each 32-bit
                                   // element are zero and _mm_madd_epi16(rms, rms) is simply
                                   // rms * rms per element.
     738             :             auto mask =
     739          78 :                 _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
     740             :                                 _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
                                   // mask is -1 where the correction applies, so subtracting it
                                   // adds 1.
     741          26 :             rms = _mm_sub_epi32(rms, mask);
     742             :             // Pack each 32 bit RMS value to 16 bits
                                   // The signed saturation of _mm_packs_epi32 is harmless on this
                                   // path since the values are far below 2^15.
     743          26 :             rms = _mm_packs_epi32(rms, rms /* could be anything */);
     744             :             _mm_storel_epi64(
     745          26 :                 reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
     746          26 :             pSrcScanlineShifted += 8;
     747          26 :             continue;
     748             :         }
     749             : 
     750             :         // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
     751             :         // to 32 bit would result in 4 multiplications instead of 8, but
     752             :         // mullo/mulhi have a worse throughput than mul_pd.
     753             : 
     754             :         // Extend those UInt16s as UInt32s
     755           4 :         const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
     756           4 :         const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
     757           4 :         const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
     758           4 :         const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
     759             : 
     760             : #ifdef __AVX2__
     761             :         // Multiplication of 32 bit values previously converted to 64 bit double
     762             :         const auto firstLineLoDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineLo));
     763             :         const auto firstLineHiDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineHi));
     764             :         const auto secondLineLoDbl =
     765             :             SQUARE_PD(_mm256_cvtepi32_pd(secondLineLo));
     766             :         const auto secondLineHiDbl =
     767             :             SQUARE_PD(_mm256_cvtepi32_pd(secondLineHi));
     768             : 
     769             :         // Vertical addition of squares
     770             :         const auto sumSquaresLo =
     771             :             _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
     772             :         const auto sumSquaresHi =
     773             :             _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
     774             : 
     775             :         // Horizontal addition of squares
                               // _mm256_hadd_pd works separately on each 128-bit half, hence the
                               // FIXUP_LANES() call to restore pixel order.
     776             :         const auto sumSquares =
     777             :             FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
     778             : 
     779             :         const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
     780             : 
     781             :         // Take square root and truncate/floor to int32
     782             :         auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
                               // Same rounding correction as in the non-AVX2 branch:
                               // rms += 1 when 0.5 < sumDivWeight - (rms * rms + rms)
     783             :         const auto rmsDouble = _mm256_cvtepi32_pd(rms);
     784             :         const auto right = _mm256_sub_pd(
     785             :             sumDivWeight, _mm256_add_pd(SQUARE_PD(rmsDouble), rmsDouble));
     786             : 
     787             :         auto mask =
     788             :             _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
     789             :         // Extract 32-bit from each of the 4 64-bit masks
     790             :         // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
     791             :         // _MM_SHUFFLE(2,0,2,0)));
     792             :         mask = _mm256_permutevar8x32_ps(mask, permutation);
     793             :         const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
     794             : 
     795             :         // Apply the correction
     796             :         rms = _mm_sub_epi32(rms, maskI);
     797             : 
     798             :         // Pack each 32 bit RMS value to 16 bits
     799             :         rms = _mm_packus_epi32(rms, rms /* could be anything */);
     800             : #else
     801             :         // Multiplication of 32 bit values previously converted to 64 bit double
                               // _mm_cvtepi32_pd converts only the two low int32, so the two high
                               // ones are first moved down with a byte shift of 8.
     802           4 :         const auto firstLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineLo));
     803             :         const auto firstLineLoHi =
     804           8 :             SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
     805           4 :         const auto firstLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineHi));
     806             :         const auto firstLineHiHi =
     807           8 :             SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
     808             : 
     809           4 :         const auto secondLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineLo));
     810             :         const auto secondLineLoHi =
     811           8 :             SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
     812           4 :         const auto secondLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineHi));
     813             :         const auto secondLineHiHi =
     814           8 :             SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
     815             : 
     816             :         // Vertical addition of squares
     817           4 :         const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
     818           4 :         const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
     819           4 :         const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
     820           4 :         const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
     821             : 
     822             :         // Horizontal addition of squares
     823           4 :         const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
     824           4 :         const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
     825             : 
     826           4 :         const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
     827           4 :         const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
     828             :         // Take square root and truncate/floor to int32
     829           8 :         const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
     830           8 :         const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
     831             : 
     832             :         // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
     833             :         // if( 0.5 < sumDivWeight - (rms * rms + rms) )
     834             :         //     rms += 1;
     835           4 :         const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
     836           4 :         const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
     837           8 :         const auto rightLo = _mm_sub_pd(
     838             :             sumDivWeightLo, _mm_add_pd(SQUARE_PD(rmsLoDouble), rmsLoDouble));
     839          12 :         const auto rightHi = _mm_sub_pd(
     840             :             sumDivWeightHi, _mm_add_pd(SQUARE_PD(rmsHiDouble), rmsHiDouble));
     841             : 
     842           8 :         const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
     843           4 :         const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
     844             :         // The value of the mask will be -1 when the correction needs to be
     845             :         // applied
                               // The shuffle keeps 32-bit elements 0 and 2 of maskLo then of maskHi,
                               // i.e. the low half of each of the four 64-bit comparison masks.
     846           8 :         const auto mask = _mm_castps_si128(_mm_shuffle_ps(
     847             :             maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
     848             : 
                               // _mm_cvttpd_epi32 leaves its two results in the low 64 bits;
                               // concatenate both pairs into one vector of four int32.
     849          16 :         auto rms = _mm_castps_si128(
     850             :             _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
     851             :         // Apply the correction
     852           4 :         rms = _mm_sub_epi32(rms, mask);
     853             : 
     854             :         // Pack each 32 bit RMS value to 16 bits
                               // sse2_packus_epi32 is defined outside this block (not visible here).
     855           4 :         rms = sse2_packus_epi32(rms, rms /* could be anything */);
     856             : #endif
     857             : 
                               // Only the low 64 bits (4 UInt16 results) are meaningful and stored.
     858           4 :         _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
     859             :                          rms);
     860           4 :         pSrcScanlineShifted += 8;
     861             :     }
     862             : 
     863          10 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     864          10 :     return iDstPixel;
     865             : }
     866             : 
     867             : /************************************************************************/
     868             : /*                         AverageUInt16SSE2()                          */
     869             : /************************************************************************/
     870             : 
     871             : template <class T>
     872           9 : static int AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
     873             :                              const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     874             :                              T *CPL_RESTRICT pDstScanline)
     875             : {
     876             :     // Optimized implementation for average on UInt16 by
     877             :     // processing by group of 8 output pixels.
                           //
                           // Each destination pixel is (a + b + c + d + 2) >> 2, where a and b are
                           // two adjacent source values and c and d are the values nChunkXSize
                           // elements after them (presumably the next source line; confirm against
                           // the caller).
                           //
                           // Each iteration consumes 16 source values from each of the two lines.
                           // Returns the number of destination pixels written and advances
                           // pSrcScanlineShiftedInOut past the consumed source values; the
                           // remaining pixels are presumably handled by the caller.
     878             : 
                           // Selects the low 16 bits of each 32-bit element.
     879           9 :     const auto mask = _mm_set1_epi32(0xFFFF);
     880           9 :     const auto two = _mm_set1_epi32(2);
     881           9 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     882             : 
     883           9 :     int iDstPixel = 0;
     884          13 :     for (; iDstPixel < nDstXWidth - 7; iDstPixel += 8)
     885             :     {
     886             :         __m128i averageLow;
     887             :         // Load 8 UInt16 from each line
     888             :         {
     889           4 :             const auto firstLine = _mm_loadu_si128(
     890             :                 reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
     891             :             const auto secondLine =
     892           4 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     893           4 :                     pSrcScanlineShifted + nChunkXSize));
     894             : 
     895             :             // Horizontal addition and extension to 32 bit
                                   // Within each 32-bit element, the AND keeps the even-indexed
                                   // UInt16 and the shift brings down the odd-indexed one, so their
                                   // sum is the pair sum held as a 32-bit value.
     896          12 :             const auto horizAddFirstLine = _mm_add_epi32(
     897             :                 _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
     898             :             const auto horizAddSecondLine =
     899          12 :                 _mm_add_epi32(_mm_and_si128(secondLine, mask),
     900             :                               _mm_srli_epi32(secondLine, 16));
     901             : 
     902             :             // Vertical addition and average computation
     903             :             // average = (sum + 2) >> 2
                                   // The sum of four UInt16 plus 2 is below 2^18, so it cannot
                                   // overflow a 32-bit element.
     904           8 :             const auto sum = _mm_add_epi32(
     905             :                 _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
     906           4 :             averageLow = _mm_srli_epi32(sum, 2);
     907             :         }
     908             :         // Load 8 UInt16 from each line
                               // (the next 8 values, at offset 8 from the current position)
     909             :         __m128i averageHigh;
     910             :         {
     911           4 :             const auto firstLine = _mm_loadu_si128(
     912           4 :                 reinterpret_cast<__m128i const *>(pSrcScanlineShifted + 8));
     913             :             const auto secondLine =
     914           4 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     915           4 :                     pSrcScanlineShifted + 8 + nChunkXSize));
     916             : 
     917             :             // Horizontal addition and extension to 32 bit
     918          12 :             const auto horizAddFirstLine = _mm_add_epi32(
     919             :                 _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
     920             :             const auto horizAddSecondLine =
     921          12 :                 _mm_add_epi32(_mm_and_si128(secondLine, mask),
     922             :                               _mm_srli_epi32(secondLine, 16));
     923             : 
     924             :             // Vertical addition and average computation
     925             :             // average = (sum + 2) >> 2
     926           8 :             const auto sum = _mm_add_epi32(
     927             :                 _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
     928           4 :             averageHigh = _mm_srli_epi32(sum, 2);
     929             :         }
     930             : 
     931             :         // Pack each 32 bit average value to 16 bits
                               // sse2_packus_epi32 is defined outside this block (not visible here).
     932           4 :         auto average = sse2_packus_epi32(averageLow, averageHigh);
     933           4 :         _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
     934             :                          average);
     935           4 :         pSrcScanlineShifted += 16;
     936             :     }
     937             : 
     938           9 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     939           9 :     return iDstPixel;
     940             : }
     941             : 
     942             : /************************************************************************/
     943             : /*                      QuadraticMeanFloatSSE2()                        */
     944             : /************************************************************************/
     945             : 
     946             : #ifdef __SSE3__
     947             : #define sse2_hadd_ps _mm_hadd_ps
     948             : #else
                       // Fallback used when SSE3 is not enabled at compile time (otherwise
                       // sse2_hadd_ps is an alias of _mm_hadd_ps, see the #define above).
                       // Horizontal addition of floats: returns
                       // (a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3]).
     949          18 : inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
     950             : {
                           // Even-indexed elements: (a[0], a[2], b[0], b[2])
     951          18 :     auto aEven_bEven = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
                           // Odd-indexed elements: (a[1], a[3], b[1], b[3])
     952          18 :     auto aOdd_bOdd = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
     953          18 :     return _mm_add_ps(aEven_bEven, aOdd_bOdd);  // (aEven + aOdd, bEven + bOdd)
     954             : }
     955             : #endif
     956             : 
     957             : #ifdef __AVX2__
                       // Width-agnostic aliases for the float intrinsics used by
                       // QuadraticMeanFloatSSE2(): 256-bit vectors (8 floats) when AVX2 is
                       // enabled at compile time, 128-bit vectors (4 floats) otherwise.
                       // RMS_FLOAT_ELTS is the number of floats held by one vector.
     958             : #define RMS_FLOAT_ELTS 8
     959             : #define set1_ps _mm256_set1_ps
     960             : #define loadu_ps _mm256_loadu_ps
     961             : #define andnot_ps _mm256_andnot_ps
     962             : #define and_ps _mm256_and_ps
     963             : #define max_ps _mm256_max_ps
     964             : #define shuffle_ps _mm256_shuffle_ps
     965             : #define div_ps _mm256_div_ps
                       // _CMP_EQ_OQ: ordered, non-signaling equality comparison.
     966             : #define cmpeq_ps(x, y) _mm256_cmp_ps(x, y, _CMP_EQ_OQ)
     967             : #define mul_ps _mm256_mul_ps
     968             : #define add_ps _mm256_add_ps
     969             : #define hadd_ps _mm256_hadd_ps
     970             : #define sqrt_ps _mm256_sqrt_ps
     971             : #define or_ps _mm256_or_ps
     972             : #define unpacklo_ps _mm256_unpacklo_ps
     973             : #define unpackhi_ps _mm256_unpackhi_ps
     974             : #define storeu_ps _mm256_storeu_ps
     975             : 
                       // Element-wise square of the eight floats of x.
     976             : inline __m256 SQUARE_PS(__m256 x)
     977             : {
     978             :     return _mm256_mul_ps(x, x);
     979             : }
     980             : 
     981             : #else
     982             : 
                       // 128-bit (4 floats) variants of the aliases defined in the AVX2 branch.
     983             : #define RMS_FLOAT_ELTS 4
     984             : #define set1_ps _mm_set1_ps
     985             : #define loadu_ps _mm_loadu_ps
     986             : #define andnot_ps _mm_andnot_ps
     987             : #define and_ps _mm_and_ps
     988             : #define max_ps _mm_max_ps
     989             : #define shuffle_ps _mm_shuffle_ps
     990             : #define div_ps _mm_div_ps
     991             : #define cmpeq_ps _mm_cmpeq_ps
     992             : #define mul_ps _mm_mul_ps
     993             : #define add_ps _mm_add_ps
                       // sse2_hadd_ps is either _mm_hadd_ps or its SSE2 emulation.
     994             : #define hadd_ps sse2_hadd_ps
     995             : #define sqrt_ps _mm_sqrt_ps
     996             : #define or_ps _mm_or_ps
     997             : #define unpacklo_ps _mm_unpacklo_ps
     998             : #define unpackhi_ps _mm_unpackhi_ps
     999             : #define storeu_ps _mm_storeu_ps
    1000             : 
                       // Element-wise square of the four floats of x.
    1001         272 : inline __m128 SQUARE_PS(__m128 x)
    1002             : {
    1003         272 :     return _mm_mul_ps(x, x);
    1004             : }
    1005             : 
                       // Identity: a single 128-bit vector has no lanes to reorder. It exists
                       // so that callers can invoke FIXUP_LANES() unconditionally; the 256-bit
                       // overloads that do permute are defined earlier under #ifdef __AVX2__.
    1006          68 : inline __m128 FIXUP_LANES(__m128 x)
    1007             : {
    1008          68 :     return x;
    1009             : }
    1010             : 
    1011             : #endif
    1012             : 
    1013             : static int
    1014             : #if defined(__GNUC__)
    1015             :     __attribute__((noinline))
    1016             : #endif
    1017          34 :     QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
    1018             :                            const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
    1019             :                            float *CPL_RESTRICT pDstScanline)
    1020             : {
    1021             :     // Optimized implementation for RMS on Float32 by
    1022             :     // processing by group of RMS_FLOAT_ELTS output pixels.
    1023          34 :     const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
    1024             : 
    1025          34 :     int iDstPixel = 0;
    1026          34 :     const auto minus_zero = set1_ps(-0.0f);
    1027          34 :     const auto zeroDot25 = set1_ps(0.25f);
    1028          34 :     const auto one = set1_ps(1.0f);
    1029          68 :     const auto infv = set1_ps(std::numeric_limits<float>::infinity());
    1030             : 
    1031         102 :     for (; iDstPixel < nDstXWidth - (RMS_FLOAT_ELTS - 1);
    1032          68 :          iDstPixel += RMS_FLOAT_ELTS)
    1033             :     {
    1034             :         // Load 2*RMS_FLOAT_ELTS Float32 from each line
    1035          68 :         auto firstLineLo = loadu_ps(pSrcScanlineShifted);
    1036          68 :         auto firstLineHi = loadu_ps(pSrcScanlineShifted + RMS_FLOAT_ELTS);
    1037          68 :         auto secondLineLo = loadu_ps(pSrcScanlineShifted + nChunkXSize);
    1038             :         auto secondLineHi =
    1039         136 :             loadu_ps(pSrcScanlineShifted + RMS_FLOAT_ELTS + nChunkXSize);
    1040             : 
    1041             :         // Take the absolute value
    1042          68 :         firstLineLo = andnot_ps(minus_zero, firstLineLo);
    1043          68 :         firstLineHi = andnot_ps(minus_zero, firstLineHi);
    1044          68 :         secondLineLo = andnot_ps(minus_zero, secondLineLo);
    1045          68 :         secondLineHi = andnot_ps(minus_zero, secondLineHi);
    1046             : 
    1047             :         auto firstLineEven =
    1048          68 :             shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
    1049             :         auto firstLineOdd =
    1050          68 :             shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
    1051             :         auto secondLineEven =
    1052          68 :             shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
    1053             :         auto secondLineOdd =
    1054          68 :             shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
    1055             : 
    1056             :         // Compute the maximum of each RMS_FLOAT_ELTS value to RMS-average
    1057         204 :         const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
    1058             :                                  max_ps(secondLineEven, secondLineEven));
    1059             : 
    1060             :         // Normalize each value by the maximum of the RMS_FLOAT_ELTS ones.
    1061             :         // This step is important to avoid that the square evaluates to infinity
    1062             :         // for sufficiently big input.
    1063          68 :         auto invMax = div_ps(one, maxV);
    1064             :         // Deal with 0 being the maximum to correct division by zero
    1065             :         // note: comparing to -0 leads to identical results as to comparing with
    1066             :         // 0
    1067         136 :         invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
    1068             : 
    1069          68 :         firstLineEven = mul_ps(firstLineEven, invMax);
    1070          68 :         firstLineOdd = mul_ps(firstLineOdd, invMax);
    1071          68 :         secondLineEven = mul_ps(secondLineEven, invMax);
    1072          68 :         secondLineOdd = mul_ps(secondLineOdd, invMax);
    1073             : 
    1074             :         // Compute squares
    1075          68 :         firstLineEven = SQUARE_PS(firstLineEven);
    1076          68 :         firstLineOdd = SQUARE_PS(firstLineOdd);
    1077          68 :         secondLineEven = SQUARE_PS(secondLineEven);
    1078          68 :         secondLineOdd = SQUARE_PS(secondLineOdd);
    1079             : 
    1080         204 :         const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
    1081             :                                        add_ps(secondLineEven, secondLineOdd));
    1082             : 
    1083         204 :         auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
    1084             : 
    1085             :         // Deal with infinity being the maximum
    1086          68 :         const auto maskIsInf = cmpeq_ps(maxV, infv);
    1087         136 :         rms = or_ps(andnot_ps(maskIsInf, rms), and_ps(maskIsInf, infv));
    1088             : 
    1089          68 :         rms = FIXUP_LANES(rms);
    1090             : 
    1091          68 :         storeu_ps(&pDstScanline[iDstPixel], rms);
    1092          68 :         pSrcScanlineShifted += RMS_FLOAT_ELTS * 2;
    1093             :     }
    1094             : 
    1095          34 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
    1096          34 :     return iDstPixel;
    1097             : }
    1098             : 
    1099             : /************************************************************************/
    1100             : /*                        AverageFloatSSE2()                            */
    1101             : /************************************************************************/
    1102             : 
    1103          14 : static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
    1104             :                             const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
    1105             :                             float *CPL_RESTRICT pDstScanline)
    1106             : {
    1107             :     // Optimized implementation for average on Float32 by
    1108             :     // processing by group of 4 output pixels.
    1109          14 :     const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
    1110             : 
    1111          14 :     int iDstPixel = 0;
    1112          14 :     const auto zeroDot25 = _mm_set1_ps(0.25f);
    1113             : 
    1114          32 :     for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
    1115             :     {
    1116             :         // Load 8 Float32 from each line
    1117          18 :         const auto firstLineLo = _mm_loadu_ps(pSrcScanlineShifted);
    1118          18 :         const auto firstLineHi = _mm_loadu_ps(pSrcScanlineShifted + 4);
    1119             :         const auto secondLineLo =
    1120          18 :             _mm_loadu_ps(pSrcScanlineShifted + nChunkXSize);
    1121             :         const auto secondLineHi =
    1122          36 :             _mm_loadu_ps(pSrcScanlineShifted + 4 + nChunkXSize);
    1123             : 
    1124             :         // Vertical addition
    1125          18 :         const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
    1126          18 :         const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
    1127             : 
    1128             :         // Horizontal addition
    1129          18 :         const auto sum = sse2_hadd_ps(sumLo, sumHi);
    1130             : 
    1131          18 :         const auto average = _mm_mul_ps(sum, zeroDot25);
    1132             : 
    1133          18 :         _mm_storeu_ps(&pDstScanline[iDstPixel], average);
    1134          18 :         pSrcScanlineShifted += 8;
    1135             :     }
    1136             : 
    1137          14 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
    1138          14 :     return iDstPixel;
    1139             : }
    1140             : 
    1141             : #endif
    1142             : 
    1143             : /************************************************************************/
    1144             : /*                    GDALResampleChunk_AverageOrRMS()                  */
    1145             : /************************************************************************/
    1146             : 
    1147             : template <class T, class Tsum, GDALDataType eWrkDataType>
    1148             : static CPLErr
    1149        2319 : GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
    1150             :                                  const T *pChunk, void **ppDstBuffer)
    1151             : {
    1152        2319 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    1153        2319 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    1154        2319 :     const double dfSrcXDelta = args.dfSrcXDelta;
    1155        2319 :     const double dfSrcYDelta = args.dfSrcYDelta;
    1156        2319 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    1157        2319 :     const int nChunkXOff = args.nChunkXOff;
    1158        2319 :     const int nChunkYOff = args.nChunkYOff;
    1159        2319 :     const int nChunkXSize = args.nChunkXSize;
    1160        2319 :     const int nChunkYSize = args.nChunkYSize;
    1161        2319 :     const int nDstXOff = args.nDstXOff;
    1162        2319 :     const int nDstXOff2 = args.nDstXOff2;
    1163        2319 :     const int nDstYOff = args.nDstYOff;
    1164        2319 :     const int nDstYOff2 = args.nDstYOff2;
    1165        2319 :     const char *pszResampling = args.pszResampling;
    1166        2319 :     bool bHasNoData = args.bHasNoData;
    1167        2319 :     const double dfNoDataValue = args.dfNoDataValue;
    1168        2319 :     const GDALColorTable *poColorTable = args.poColorTable;
    1169        2319 :     const bool bPropagateNoData = args.bPropagateNoData;
    1170             : 
    1171             :     // AVERAGE_BIT2GRAYSCALE
    1172             :     const bool bBit2Grayscale =
    1173        2319 :         CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"));
    1174        2319 :     const bool bQuadraticMean = CPL_TO_BOOL(EQUAL(pszResampling, "RMS"));
    1175        2319 :     if (bBit2Grayscale)
    1176           9 :         poColorTable = nullptr;
    1177             : 
    1178             :     T tNoDataValue;
    1179        2319 :     if (!bHasNoData)
    1180        2263 :         tNoDataValue = 0;
    1181             :     else
    1182          56 :         tNoDataValue = static_cast<T>(dfNoDataValue);
    1183        2319 :     const T tReplacementVal =
    1184         114 :         bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
    1185          56 :                          args.eOvrDataType, dfNoDataValue))
    1186             :                    : 0;
    1187             : 
    1188        2319 :     int nChunkRightXOff = nChunkXOff + nChunkXSize;
    1189        2319 :     int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    1190        2319 :     int nDstXWidth = nDstXOff2 - nDstXOff;
    1191             : 
    1192             :     /* -------------------------------------------------------------------- */
    1193             :     /*      Allocate buffers.                                               */
    1194             :     /* -------------------------------------------------------------------- */
    1195        2319 :     *ppDstBuffer = static_cast<T *>(
    1196        2319 :         VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
    1197             :                             GDALGetDataTypeSizeBytes(eWrkDataType)));
    1198        2319 :     if (*ppDstBuffer == nullptr)
    1199             :     {
    1200           0 :         return CE_Failure;
    1201             :     }
    1202        2319 :     T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
    1203             : 
    1204             :     struct PrecomputedXValue
    1205             :     {
    1206             :         int nLeftXOffShifted;
    1207             :         int nRightXOffShifted;
    1208             :         double dfLeftWeight;
    1209             :         double dfRightWeight;
    1210             :         double dfTotalWeightFullLine;
    1211             :     };
    1212             : 
    1213             :     PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
    1214        2319 :         VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(PrecomputedXValue)));
    1215             : 
    1216        2319 :     if (pasSrcX == nullptr)
    1217             :     {
    1218           0 :         return CE_Failure;
    1219             :     }
    1220             : 
    1221        2319 :     int nTransparentIdx = -1;
    1222        2319 :     std::vector<GDALColorEntry> colorEntries;
    1223        2319 :     if (poColorTable)
    1224           5 :         colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
    1225             : 
    1226             :     // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    1227             :     // it as nodata value
    1228        2349 :     if (bHasNoData && dfNoDataValue >= 0.0f &&
    1229          30 :         tNoDataValue < colorEntries.size())
    1230           1 :         colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
    1231             : 
    1232             :     // Or if we have no explicit nodata, but a color table entry that is
    1233             :     // transparent, consider it as the nodata value
    1234        2318 :     else if (!bHasNoData && nTransparentIdx >= 0)
    1235             :     {
    1236           0 :         bHasNoData = true;
    1237           0 :         tNoDataValue = static_cast<T>(nTransparentIdx);
    1238             :     }
    1239             : 
    1240             :     /* ==================================================================== */
    1241             :     /*      Precompute inner loop constants.                                */
    1242             :     /* ==================================================================== */
    1243        2319 :     bool bSrcXSpacingIsTwo = true;
    1244        2319 :     int nLastSrcXOff2 = -1;
    1245      852325 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    1246             :     {
    1247      850006 :         double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
    1248             :         // Apply some epsilon to avoid numerical precision issues
    1249      850006 :         int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
    1250      850006 :         double dfSrcXOff2 = dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
    1251      850006 :         int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
    1252             : 
    1253      850006 :         if (nSrcXOff < nChunkXOff)
    1254           0 :             nSrcXOff = nChunkXOff;
    1255      850006 :         if (nSrcXOff2 == nSrcXOff)
    1256           0 :             nSrcXOff2++;
    1257      850006 :         if (nSrcXOff2 > nChunkRightXOff)
    1258           1 :             nSrcXOff2 = nChunkRightXOff;
    1259             : 
    1260      850006 :         pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
    1261      850006 :         pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
    1262      850006 :             nSrcXOff2 - nChunkXOff;
    1263          21 :         pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
    1264      850006 :             (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
    1265      850006 :         pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
    1266      850006 :             1 - (nSrcXOff2 - dfSrcXOff2);
    1267      850006 :         pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
    1268      850006 :             pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
    1269      850006 :         if (nSrcXOff + 1 < nSrcXOff2)
    1270             :         {
    1271      849985 :             pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
    1272      849985 :                 nSrcXOff2 - nSrcXOff - 2;
    1273      849985 :             pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
    1274      849985 :                 pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
    1275             :         }
    1276             : 
    1277      850006 :         if (nSrcXOff2 - nSrcXOff != 2 ||
    1278      728596 :             (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
    1279             :         {
    1280      120599 :             bSrcXSpacingIsTwo = false;
    1281             :         }
    1282      850006 :         nLastSrcXOff2 = nSrcXOff2;
    1283             :     }
    1284             : 
    1285             :     /* ==================================================================== */
    1286             :     /*      Loop over destination scanlines.                                */
    1287             :     /* ==================================================================== */
    1288      721819 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    1289             :     {
    1290      719500 :         double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
    1291      719500 :         int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
    1292      719500 :         if (nSrcYOff < nChunkYOff)
    1293           0 :             nSrcYOff = nChunkYOff;
    1294             : 
    1295      719500 :         double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
    1296      719500 :         int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
    1297      719500 :         if (nSrcYOff2 == nSrcYOff)
    1298           0 :             ++nSrcYOff2;
    1299      719500 :         if (nSrcYOff2 > nChunkBottomYOff)
    1300           3 :             nSrcYOff2 = nChunkBottomYOff;
    1301             : 
    1302      719500 :         T *const pDstScanline =
    1303      719500 :             pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
    1304             : 
    1305             :         /* --------------------------------------------------------------------
    1306             :          */
    1307             :         /*      Loop over destination pixels */
    1308             :         /* --------------------------------------------------------------------
    1309             :          */
    1310      719500 :         if (poColorTable == nullptr)
    1311             :         {
    1312      719385 :             if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
    1313             :                 pabyChunkNodataMask == nullptr)
    1314             :             {
    1315             :                 if constexpr (eWrkDataType == GDT_Byte ||
    1316             :                               eWrkDataType == GDT_UInt16)
    1317             :                 {
    1318             :                     // Optimized case : no nodata, overview by a factor of 2 and
    1319             :                     // regular x and y src spacing.
    1320      116684 :                     const T *pSrcScanlineShifted =
    1321      116684 :                         pChunk + pasSrcX[0].nLeftXOffShifted +
    1322      116684 :                         static_cast<size_t>(nSrcYOff - nChunkYOff) *
    1323      116684 :                             nChunkXSize;
    1324      116684 :                     int iDstPixel = 0;
    1325             : #ifdef USE_SSE2
    1326             :                     if constexpr (eWrkDataType == GDT_Byte)
    1327             :                     {
    1328      116665 :                         if (bQuadraticMean)
    1329             :                         {
    1330        5385 :                             iDstPixel = QuadraticMeanByteSSE2OrAVX2(
    1331             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1332             :                                 pDstScanline);
    1333             :                         }
    1334             :                         else
    1335             :                         {
    1336      111280 :                             iDstPixel = AverageByteSSE2OrAVX2(
    1337             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1338             :                                 pDstScanline);
    1339             :                         }
    1340             :                     }
    1341             :                     else
    1342             :                     {
    1343             :                         static_assert(eWrkDataType == GDT_UInt16);
    1344          19 :                         if (bQuadraticMean)
    1345             :                         {
    1346          10 :                             iDstPixel = QuadraticMeanUInt16SSE2(
    1347             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1348             :                                 pDstScanline);
    1349             :                         }
    1350             :                         else
    1351             :                         {
    1352           9 :                             iDstPixel = AverageUInt16SSE2(
    1353             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1354             :                                 pDstScanline);
    1355             :                         }
    1356             :                     }
    1357             : #endif
    1358      291091 :                     for (; iDstPixel < nDstXWidth; ++iDstPixel)
    1359             :                     {
    1360      174407 :                         Tsum nTotal = 0;
    1361             :                         T nVal;
    1362      174407 :                         if (bQuadraticMean)
    1363          44 :                             nTotal =
    1364          44 :                                 SQUARE<Tsum>(pSrcScanlineShifted[0]) +
    1365          44 :                                 SQUARE<Tsum>(pSrcScanlineShifted[1]) +
    1366          44 :                                 SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
    1367          44 :                                 SQUARE<Tsum>(
    1368          44 :                                     pSrcScanlineShifted[1 + nChunkXSize]);
    1369             :                         else
    1370      174363 :                             nTotal = pSrcScanlineShifted[0] +
    1371      174363 :                                      pSrcScanlineShifted[1] +
    1372      174363 :                                      pSrcScanlineShifted[nChunkXSize] +
    1373      174363 :                                      pSrcScanlineShifted[1 + nChunkXSize];
    1374             : 
    1375      174407 :                         constexpr int nTotalWeight = 4;
    1376      174407 :                         if (bQuadraticMean)
    1377          44 :                             nVal = ComputeIntegerRMS_4values<T>(nTotal);
    1378             :                         else
    1379      174363 :                             nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
    1380             :                                                   nTotalWeight);
    1381             : 
    1382             :                         // No need to compare nVal against tNoDataValue as we
    1383             :                         // are in a case where pabyChunkNodataMask == nullptr
    1384             :                         // implies the absence of nodata value.
    1385      174407 :                         pDstScanline[iDstPixel] = nVal;
    1386      174407 :                         pSrcScanlineShifted += 2;
    1387             :                     }
    1388             :                 }
    1389             :                 else
    1390             :                 {
    1391             :                     static_assert(eWrkDataType == GDT_Float32 ||
    1392             :                                   eWrkDataType == GDT_Float64);
    1393          70 :                     const T *pSrcScanlineShifted =
    1394          70 :                         pChunk + pasSrcX[0].nLeftXOffShifted +
    1395          70 :                         static_cast<size_t>(nSrcYOff - nChunkYOff) *
    1396          70 :                             nChunkXSize;
    1397          70 :                     int iDstPixel = 0;
    1398             : #ifdef USE_SSE2
    1399             :                     if constexpr (eWrkDataType == GDT_Float32)
    1400             :                     {
    1401             :                         static_assert(std::is_same_v<T, float>);
    1402          48 :                         if (bQuadraticMean)
    1403             :                         {
    1404          34 :                             iDstPixel = QuadraticMeanFloatSSE2(
    1405             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1406             :                                 pDstScanline);
    1407             :                         }
    1408             :                         else
    1409             :                         {
    1410          14 :                             iDstPixel = AverageFloatSSE2(
    1411             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1412             :                                 pDstScanline);
    1413             :                         }
    1414             :                     }
    1415             : #endif
    1416             : 
    1417         268 :                     for (; iDstPixel < nDstXWidth; ++iDstPixel)
    1418             :                     {
    1419             :                         T nVal;
    1420         198 :                         if (bQuadraticMean)
    1421             :                         {
    1422             :                             // Cast to double to avoid overflows
    1423             :                             // (using std::hypot() is much slower)
    1424         100 :                             nVal = static_cast<T>(std::sqrt(
    1425             :                                 0.25 *
    1426         100 :                                 (SQUARE<double>(pSrcScanlineShifted[0]) +
    1427         100 :                                  SQUARE<double>(pSrcScanlineShifted[1]) +
    1428         100 :                                  SQUARE<double>(
    1429         200 :                                      pSrcScanlineShifted[nChunkXSize]) +
    1430         100 :                                  SQUARE<double>(
    1431         100 :                                      pSrcScanlineShifted[1 + nChunkXSize]))));
    1432             :                         }
    1433             :                         else
    1434             :                         {
    1435          98 :                             nVal = static_cast<T>(
    1436          98 :                                 0.25f * (pSrcScanlineShifted[0] +
    1437          98 :                                          pSrcScanlineShifted[1] +
    1438          98 :                                          pSrcScanlineShifted[nChunkXSize] +
    1439          98 :                                          pSrcScanlineShifted[1 + nChunkXSize]));
    1440             :                         }
    1441             : 
    1442             :                         // No need to compare nVal against tNoDataValue as we
    1443             :                         // are in a case where pabyChunkNodataMask == nullptr
    1444             :                         // implies the absence of nodata value.
    1445         198 :                         pDstScanline[iDstPixel] = nVal;
    1446         198 :                         pSrcScanlineShifted += 2;
    1447             :                     }
    1448      116754 :                 }
    1449             :             }
    1450             :             else
    1451             :             {
    1452          17 :                 const double dfBottomWeight =
    1453      602631 :                     (nSrcYOff + 1 == nSrcYOff2) ? 1.0
    1454      602614 :                                                 : 1.0 - (dfSrcYOff - nSrcYOff);
    1455      602631 :                 const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
    1456      602631 :                 nSrcYOff -= nChunkYOff;
    1457      602631 :                 nSrcYOff2 -= nChunkYOff;
    1458             : 
    1459      602631 :                 double dfTotalWeightFullColumn = dfBottomWeight;
    1460      602631 :                 if (nSrcYOff + 1 < nSrcYOff2)
    1461             :                 {
    1462      602615 :                     dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
    1463      602615 :                     dfTotalWeightFullColumn += dfTopWeight;
    1464             :                 }
    1465             : 
    1466    18757660 :                 for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
    1467             :                 {
    1468    18151383 :                     const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
    1469    18151383 :                     const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
    1470             : 
    1471    18151383 :                     double dfTotal = 0;
    1472    18151383 :                     double dfTotalWeight = 0;
    1473    18151383 :                     if (pabyChunkNodataMask == nullptr)
    1474             :                     {
    1475     1746435 :                         auto pChunkShifted =
    1476         115 :                             pChunk +
    1477     1746435 :                             static_cast<size_t>(nSrcYOff) * nChunkXSize;
    1478     1746435 :                         int nCounterY = nSrcYOff2 - nSrcYOff - 1;
    1479     1746435 :                         double dfWeightY = dfBottomWeight;
    1480     3493427 :                         while (true)
    1481             :                         {
    1482             :                             double dfTotalLine;
    1483     5239852 :                             if (bQuadraticMean)
    1484             :                             {
    1485             :                                 // Left pixel
    1486             :                                 {
    1487         104 :                                     const T val = pChunkShifted[nSrcXOff];
    1488         104 :                                     dfTotalLine =
    1489         104 :                                         SQUARE<double>(val) *
    1490         104 :                                         pasSrcX[iDstPixel].dfLeftWeight;
    1491             :                                 }
    1492             : 
    1493         104 :                                 if (nSrcXOff + 1 < nSrcXOff2)
    1494             :                                 {
    1495             :                                     // Middle pixels
    1496         104 :                                     for (int iX = nSrcXOff + 1;
    1497         424 :                                          iX < nSrcXOff2 - 1; ++iX)
    1498             :                                     {
    1499         320 :                                         const T val = pChunkShifted[iX];
    1500         320 :                                         dfTotalLine += SQUARE<double>(val);
    1501             :                                     }
    1502             : 
    1503             :                                     // Right pixel
    1504             :                                     {
    1505         104 :                                         const T val =
    1506         104 :                                             pChunkShifted[nSrcXOff2 - 1];
    1507         104 :                                         dfTotalLine +=
    1508         104 :                                             SQUARE<double>(val) *
    1509         104 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1510             :                                     }
    1511             :                                 }
    1512             :                             }
    1513             :                             else
    1514             :                             {
    1515             :                                 // Left pixel
    1516             :                                 {
    1517     5239756 :                                     const T val = pChunkShifted[nSrcXOff];
    1518     5239756 :                                     dfTotalLine =
    1519     5239756 :                                         val * pasSrcX[iDstPixel].dfLeftWeight;
    1520             :                                 }
    1521             : 
    1522     5239756 :                                 if (nSrcXOff + 1 < nSrcXOff2)
    1523             :                                 {
    1524             :                                     // Middle pixels
    1525     4239330 :                                     for (int iX = nSrcXOff + 1;
    1526    64183126 :                                          iX < nSrcXOff2 - 1; ++iX)
    1527             :                                     {
    1528    59943836 :                                         const T val = pChunkShifted[iX];
    1529    59943836 :                                         dfTotalLine += val;
    1530             :                                     }
    1531             : 
    1532             :                                     // Right pixel
    1533             :                                     {
    1534     4239330 :                                         const T val =
    1535     4239330 :                                             pChunkShifted[nSrcXOff2 - 1];
    1536     4239330 :                                         dfTotalLine +=
    1537     4239330 :                                             val *
    1538     4239330 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1539             :                                     }
    1540             :                                 }
    1541             :                             }
    1542             : 
    1543     5239852 :                             dfTotal += dfTotalLine * dfWeightY;
    1544     5239852 :                             --nCounterY;
    1545     5239852 :                             if (nCounterY < 0)
    1546     1746435 :                                 break;
    1547     3493427 :                             pChunkShifted += nChunkXSize;
    1548     3493427 :                             dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
    1549             :                         }
    1550             : 
    1551     1746435 :                         dfTotalWeight =
    1552     1746435 :                             pasSrcX[iDstPixel].dfTotalWeightFullLine *
    1553             :                             dfTotalWeightFullColumn;
    1554             :                     }
    1555             :                     else
    1556             :                     {
    1557    16404968 :                         size_t nCount = 0;
    1558    71769204 :                         for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    1559             :                         {
    1560    55362736 :                             const auto pChunkShifted =
    1561    55362736 :                                 pChunk + static_cast<size_t>(iY) * nChunkXSize;
    1562             : 
    1563    55362736 :                             double dfTotalLine = 0;
    1564    55362736 :                             double dfTotalWeightLine = 0;
    1565             :                             // Left pixel
    1566             :                             {
    1567    55362736 :                                 const int iX = nSrcXOff;
    1568    55362736 :                                 const T val = pChunkShifted[iX];
    1569    55362736 :                                 if (pabyChunkNodataMask
    1570    55362736 :                                         [iX +
    1571    55362736 :                                          static_cast<size_t>(iY) * nChunkXSize])
    1572             :                                 {
    1573    23514583 :                                     nCount++;
    1574    23514583 :                                     const double dfWeightX =
    1575    23514583 :                                         pasSrcX[iDstPixel].dfLeftWeight;
    1576    23514583 :                                     dfTotalWeightLine = dfWeightX;
    1577    23514583 :                                     if (bQuadraticMean)
    1578          60 :                                         dfTotalLine =
    1579          60 :                                             SQUARE<double>(val) * dfWeightX;
    1580             :                                     else
    1581    23514583 :                                         dfTotalLine = val * dfWeightX;
    1582             :                                 }
    1583             :                             }
    1584             : 
    1585    55362736 :                             if (nSrcXOff < nSrcXOff2 - 1)
    1586             :                             {
    1587             :                                 // Middle pixels
    1588   152883136 :                                 for (int iX = nSrcXOff + 1; iX < nSrcXOff2 - 1;
    1589             :                                      ++iX)
    1590             :                                 {
    1591    97520300 :                                     const T val = pChunkShifted[iX];
    1592    97520300 :                                     if (pabyChunkNodataMask
    1593    97520300 :                                             [iX + static_cast<size_t>(iY) *
    1594    97520300 :                                                       nChunkXSize])
    1595             :                                     {
    1596    39728000 :                                         nCount++;
    1597    39728000 :                                         dfTotalWeightLine += 1;
    1598    39728000 :                                         if (bQuadraticMean)
    1599           0 :                                             dfTotalLine += SQUARE<double>(val);
    1600             :                                         else
    1601    39728000 :                                             dfTotalLine += val;
    1602             :                                     }
    1603             :                                 }
    1604             : 
    1605             :                                 // Right pixel
    1606             :                                 {
    1607    55362636 :                                     const int iX = nSrcXOff2 - 1;
    1608    55362636 :                                     const T val = pChunkShifted[iX];
    1609    55362636 :                                     if (pabyChunkNodataMask
    1610    55362636 :                                             [iX + static_cast<size_t>(iY) *
    1611    55362636 :                                                       nChunkXSize])
    1612             :                                     {
    1613    23514151 :                                         nCount++;
    1614    23514151 :                                         const double dfWeightX =
    1615    23514151 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1616    23514151 :                                         dfTotalWeightLine += dfWeightX;
    1617    23514151 :                                         if (bQuadraticMean)
    1618         782 :                                             dfTotalLine +=
    1619          61 :                                                 SQUARE<double>(val) * dfWeightX;
    1620             :                                         else
    1621    23514050 :                                             dfTotalLine += val * dfWeightX;
    1622             :                                     }
    1623             :                                 }
    1624             :                             }
    1625             : 
    1626    94328104 :                             const double dfWeightY =
    1627             :                                 (iY == nSrcYOff)        ? dfBottomWeight
    1628    38963768 :                                 : (iY + 1 == nSrcYOff2) ? dfTopWeight
    1629             :                                                         : 1.0;
    1630    55364236 :                             dfTotal += dfTotalLine * dfWeightY;
    1631    55364236 :                             dfTotalWeight += dfTotalWeightLine * dfWeightY;
    1632             :                         }
    1633             : 
    1634    16406468 :                         if (nCount == 0 ||
    1635           8 :                             (bPropagateNoData &&
    1636             :                              nCount <
    1637           8 :                                  static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
    1638           8 :                                      (nSrcXOff2 - nSrcXOff)))
    1639             :                         {
    1640     9608362 :                             pDstScanline[iDstPixel] = tNoDataValue;
    1641     9608362 :                             continue;
    1642             :                         }
    1643             :                     }
    1644             :                     if constexpr (eWrkDataType == GDT_Byte)
    1645             :                     {
    1646             :                         T nVal;
    1647     8544340 :                         if (bQuadraticMean)
    1648          38 :                             nVal = ComputeIntegerRMS<T, int>(dfTotal,
    1649             :                                                              dfTotalWeight);
    1650             :                         else
    1651     8544300 :                             nVal =
    1652     8544300 :                                 static_cast<T>(dfTotal / dfTotalWeight + 0.5);
    1653     8546550 :                         if (bHasNoData && nVal == tNoDataValue)
    1654           0 :                             nVal = tReplacementVal;
    1655     8546550 :                         pDstScanline[iDstPixel] = nVal;
    1656             :                     }
    1657             :                     else if constexpr (eWrkDataType == GDT_UInt16)
    1658             :                     {
    1659             :                         T nVal;
    1660           8 :                         if (bQuadraticMean)
    1661           4 :                             nVal = ComputeIntegerRMS<T, uint64_t>(
    1662             :                                 dfTotal, dfTotalWeight);
    1663             :                         else
    1664           4 :                             nVal =
    1665           4 :                                 static_cast<T>(dfTotal / dfTotalWeight + 0.5);
    1666           8 :                         if (bHasNoData && nVal == tNoDataValue)
    1667           0 :                             nVal = tReplacementVal;
    1668           8 :                         pDstScanline[iDstPixel] = nVal;
    1669             :                     }
    1670             :                     else
    1671             :                     {
    1672             :                         T nVal;
    1673         153 :                         if (bQuadraticMean)
    1674          20 :                             nVal =
    1675          25 :                                 static_cast<T>(sqrt(dfTotal / dfTotalWeight));
    1676             :                         else
    1677         128 :                             nVal = static_cast<T>(dfTotal / dfTotalWeight);
    1678         153 :                         if (bHasNoData && nVal == tNoDataValue)
    1679           2 :                             nVal = tReplacementVal;
    1680         153 :                         pDstScanline[iDstPixel] = nVal;
    1681             :                     }
    1682             :                 }
    1683             :             }
    1684             :         }
    1685             :         else
    1686             :         {
    1687         115 :             nSrcYOff -= nChunkYOff;
    1688         115 :             nSrcYOff2 -= nChunkYOff;
    1689             : 
    1690        2878 :             for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
    1691             :             {
    1692        6475 :                 const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
    1693        6475 :                 const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
    1694             : 
    1695        6475 :                 uint64_t nTotalR = 0;
    1696        6475 :                 uint64_t nTotalG = 0;
    1697        6475 :                 uint64_t nTotalB = 0;
    1698        6475 :                 size_t nCount = 0;
    1699             : 
    1700       19425 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    1701             :                 {
    1702       38850 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    1703             :                     {
    1704       25900 :                         const T val =
    1705       25900 :                             pChunk[iX + static_cast<size_t>(iY) * nChunkXSize];
    1706             :                         // cppcheck-suppress unsignedLessThanZero
    1707       25900 :                         if (val < 0 || val >= colorEntries.size())
    1708           0 :                             continue;
    1709       25900 :                         const size_t idx = static_cast<size_t>(val);
    1710       25900 :                         const auto &entry = colorEntries[idx];
    1711       25900 :                         if (entry.c4)
    1712             :                         {
    1713       14128 :                             if (bQuadraticMean)
    1714             :                             {
    1715         800 :                                 nTotalR += SQUARE<int>(entry.c1);
    1716         800 :                                 nTotalG += SQUARE<int>(entry.c2);
    1717         800 :                                 nTotalB += SQUARE<int>(entry.c3);
    1718         800 :                                 ++nCount;
    1719             :                             }
    1720             :                             else
    1721             :                             {
    1722       13328 :                                 nTotalR += entry.c1;
    1723       13328 :                                 nTotalG += entry.c2;
    1724       13328 :                                 nTotalB += entry.c3;
    1725       13328 :                                 ++nCount;
    1726             :                             }
    1727             :                         }
    1728             :                     }
    1729             :                 }
    1730             : 
    1731        6475 :                 if (nCount == 0 ||
    1732           0 :                     (bPropagateNoData &&
    1733           0 :                      nCount < static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
    1734           0 :                                   (nSrcXOff2 - nSrcXOff)))
    1735             :                 {
    1736        2838 :                     pDstScanline[iDstPixel] = tNoDataValue;
    1737             :                 }
    1738             :                 else
    1739             :                 {
    1740             :                     GDALColorEntry color;
    1741        3637 :                     if (bQuadraticMean)
    1742             :                     {
    1743         200 :                         color.c1 =
    1744         200 :                             static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
    1745         200 :                         color.c2 =
    1746         200 :                             static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
    1747         200 :                         color.c3 =
    1748         200 :                             static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
    1749             :                     }
    1750             :                     else
    1751             :                     {
    1752        3437 :                         color.c1 =
    1753        3437 :                             static_cast<short>((nTotalR + nCount / 2) / nCount);
    1754        3437 :                         color.c2 =
    1755        3437 :                             static_cast<short>((nTotalG + nCount / 2) / nCount);
    1756        3437 :                         color.c3 =
    1757        3437 :                             static_cast<short>((nTotalB + nCount / 2) / nCount);
    1758             :                     }
    1759           0 :                     pDstScanline[iDstPixel] =
    1760        3637 :                         static_cast<T>(BestColorEntry(colorEntries, color));
    1761             :                 }
    1762             :             }
    1763             :         }
    1764             :     }
    1765             : 
    1766        2319 :     CPLFree(pasSrcX);
    1767             : 
    1768        2319 :     return CE_None;
    1769             : }
    1770             : // Dispatches average/RMS resampling to the template instantiation matching args.eWrkDataType.
    1771             : static CPLErr
    1772        2319 : GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
    1773             :                                const void *pChunk, void **ppDstBuffer,  // pChunk is cast to the working type; ppDstBuffer is forwarded as-is
    1774             :                                GDALDataType *peDstBufferDataType)  // out: receives args.eWrkDataType
    1775             : {
    1776        2319 :     *peDstBufferDataType = args.eWrkDataType;  // set unconditionally, before the dispatch below
    1777        2319 :     switch (args.eWrkDataType)
    1778             :     {
    1779        2252 :         case GDT_Byte:
    1780             :         {
    1781        2252 :             return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_Byte>(
    1782        2252 :                 args, static_cast<const GByte *>(pChunk), ppDstBuffer);
    1783             :         }
    1784             : // UInt16: the second template argument depends on whether args.pszResampling is "RMS".
    1785           9 :         case GDT_UInt16:
    1786             :         {
    1787           9 :             if (EQUAL(args.pszResampling, "RMS"))
    1788             :             {
    1789             :                 // Use double as accumulation type, because UInt32 could overflow
    1790             :                 return GDALResampleChunk_AverageOrRMS_T<GUInt16, double,
    1791           5 :                                                         GDT_UInt16>(
    1792           5 :                     args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
    1793             :             }
    1794             :             else
    1795             :             {
    1796             :                 return GDALResampleChunk_AverageOrRMS_T<GUInt16, GUInt32,
    1797           4 :                                                         GDT_UInt16>(
    1798           4 :                     args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
    1799             :             }
    1800             :         }
    1801             : // Float32 chunks are instantiated with double as the second template argument.
    1802          41 :         case GDT_Float32:
    1803             :         {
    1804          41 :             return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32>(
    1805          41 :                 args, static_cast<const float *>(pChunk), ppDstBuffer);
    1806             :         }
    1807             : // Float64 chunks likewise use double for both template type arguments.
    1808          17 :         case GDT_Float64:
    1809             :         {
    1810             :             return GDALResampleChunk_AverageOrRMS_T<double, double,
    1811          17 :                                                     GDT_Float64>(
    1812          17 :                 args, static_cast<const double *>(pChunk), ppDstBuffer);
    1813             :         }
    1814             : // Any other working data type leaves the switch and reaches the assert below.
    1815           0 :         default:
    1816           0 :             break;
    1817             :     }
    1818             : // Unsupported working type: CPLAssert(false) flags it; CE_Failure is returned if execution continues.
    1819           0 :     CPLAssert(false);
    1820             :     return CE_Failure;
    1821             : }
    1822             : 
    1823             : /************************************************************************/
    1824             : /*                     GDALResampleChunk_Gauss()                        */
    1825             : /************************************************************************/
    1826             : 
    1827          86 : static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
    1828             :                                       const void *pChunk, void **ppDstBuffer,
    1829             :                                       GDALDataType *peDstBufferDataType)
    1830             : 
    1831             : {
    1832          86 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    1833          86 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    1834          86 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    1835          86 :     const int nChunkXOff = args.nChunkXOff;
    1836          86 :     const int nChunkXSize = args.nChunkXSize;
    1837          86 :     const int nChunkYOff = args.nChunkYOff;
    1838          86 :     const int nChunkYSize = args.nChunkYSize;
    1839          86 :     const int nDstXOff = args.nDstXOff;
    1840          86 :     const int nDstXOff2 = args.nDstXOff2;
    1841          86 :     const int nDstYOff = args.nDstYOff;
    1842          86 :     const int nDstYOff2 = args.nDstYOff2;
    1843          86 :     const bool bHasNoData = args.bHasNoData;
    1844          86 :     double dfNoDataValue = args.dfNoDataValue;
    1845          86 :     const GDALColorTable *poColorTable = args.poColorTable;
    1846             : 
    1847          86 :     const double *const padfChunk = static_cast<const double *>(pChunk);
    1848             : 
    1849          86 :     *ppDstBuffer =
    1850          86 :         VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
    1851             :                             GDALGetDataTypeSizeBytes(GDT_Float64));
    1852          86 :     if (*ppDstBuffer == nullptr)
    1853             :     {
    1854           0 :         return CE_Failure;
    1855             :     }
    1856          86 :     *peDstBufferDataType = GDT_Float64;
    1857          86 :     double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
    1858             : 
    1859             :     /* -------------------------------------------------------------------- */
    1860             :     /*      Create the filter kernel and allocate scanline buffer.          */
    1861             :     /* -------------------------------------------------------------------- */
    1862          86 :     int nGaussMatrixDim = 3;
    1863             :     const int *panGaussMatrix;
    1864          86 :     constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
    1865          86 :     constexpr int anGaussMatrix5x5[] = {1,  4, 6,  4,  1,  4, 16, 24, 16,
    1866             :                                         4,  6, 24, 36, 24, 6, 4,  16, 24,
    1867             :                                         16, 4, 1,  4,  6,  4, 1};
    1868          86 :     constexpr int anGaussMatrix7x7[] = {
    1869             :         1,   6,  15, 20,  15,  6,   1,   6,  36, 90,  120, 90,  36,
    1870             :         6,   15, 90, 225, 300, 225, 90,  15, 20, 120, 300, 400, 300,
    1871             :         120, 20, 15, 90,  225, 300, 225, 90, 15, 6,   36,  90,  120,
    1872             :         90,  36, 6,  1,   6,   15,  20,  15, 6,  1};
    1873             : 
    1874          86 :     const int nOXSize = args.nOvrXSize;
    1875          86 :     const int nOYSize = args.nOvrYSize;
    1876          86 :     const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
    1877             : 
    1878             :     // matrix for gauss filter
    1879          86 :     if (nResYFactor <= 2)
    1880             :     {
    1881          85 :         panGaussMatrix = anGaussMatrix3x3;
    1882          85 :         nGaussMatrixDim = 3;
    1883             :     }
    1884           1 :     else if (nResYFactor <= 4)
    1885             :     {
    1886           0 :         panGaussMatrix = anGaussMatrix5x5;
    1887           0 :         nGaussMatrixDim = 5;
    1888             :     }
    1889             :     else
    1890             :     {
    1891           1 :         panGaussMatrix = anGaussMatrix7x7;
    1892           1 :         nGaussMatrixDim = 7;
    1893             :     }
    1894             : 
    1895             : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
    1896             :     int *panGaussMatrixDup = static_cast<int *>(
    1897             :         CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
    1898             :     memcpy(panGaussMatrixDup, panGaussMatrix,
    1899             :            sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
    1900             :     panGaussMatrix = panGaussMatrixDup;
    1901             : #endif
    1902             : 
    1903          86 :     if (!bHasNoData)
    1904          79 :         dfNoDataValue = 0.0;
    1905             : 
    1906          86 :     std::vector<GDALColorEntry> colorEntries;
    1907          86 :     int nTransparentIdx = -1;
    1908          86 :     if (poColorTable)
    1909           2 :         colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
    1910             : 
    1911             :     // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    1912             :     // it as nodata value.
    1913          92 :     if (bHasNoData && dfNoDataValue >= 0.0f &&
    1914           6 :         dfNoDataValue < colorEntries.size())
    1915           0 :         colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
    1916             : 
    1917             :     // Or if we have no explicit nodata, but a color table entry that is
    1918             :     // transparent, consider it as the nodata value.
    1919          86 :     else if (!bHasNoData && nTransparentIdx >= 0)
    1920             :     {
    1921           0 :         dfNoDataValue = nTransparentIdx;
    1922             :     }
    1923             : 
    1924          86 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    1925          86 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    1926          86 :     const int nDstXWidth = nDstXOff2 - nDstXOff;
    1927             : 
    1928             :     /* ==================================================================== */
    1929             :     /*      Loop over destination scanlines.                                */
    1930             :     /* ==================================================================== */
    1931       16488 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    1932             :     {
    1933       16402 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
    1934       16402 :         int nSrcYOff2 =
    1935       16402 :             static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
    1936             : 
    1937       16402 :         if (nSrcYOff < nChunkYOff)
    1938             :         {
    1939           0 :             nSrcYOff = nChunkYOff;
    1940           0 :             nSrcYOff2++;
    1941             :         }
    1942             : 
    1943       16402 :         const int iSizeY = nSrcYOff2 - nSrcYOff;
    1944       16402 :         nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
    1945       16402 :         nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
    1946             : 
    1947       16402 :         if (nSrcYOff2 > nChunkBottomYOff ||
    1948       16359 :             (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
    1949             :         {
    1950          44 :             nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
    1951             :         }
    1952             : 
    1953       16402 :         int nYShiftGaussMatrix = 0;
    1954       16402 :         if (nSrcYOff < nChunkYOff)
    1955             :         {
    1956           0 :             nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
    1957           0 :             nSrcYOff = nChunkYOff;
    1958             :         }
    1959             : 
    1960       16402 :         const double *const padfSrcScanline =
    1961       16402 :             padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
    1962       16402 :         const GByte *pabySrcScanlineNodataMask = nullptr;
    1963       16402 :         if (pabyChunkNodataMask != nullptr)
    1964         152 :             pabySrcScanlineNodataMask =
    1965         152 :                 pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
    1966             : 
    1967             :         /* --------------------------------------------------------------------
    1968             :          */
    1969             :         /*      Loop over destination pixels */
    1970             :         /* --------------------------------------------------------------------
    1971             :          */
    1972       16402 :         double *const padfDstScanline =
    1973       16402 :             padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
    1974     4149980 :         for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    1975             :         {
    1976     4133580 :             int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
    1977     4133580 :             int nSrcXOff2 =
    1978     4133580 :                 static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
    1979             : 
    1980     4133580 :             if (nSrcXOff < nChunkXOff)
    1981             :             {
    1982           0 :                 nSrcXOff = nChunkXOff;
    1983           0 :                 nSrcXOff2++;
    1984             :             }
    1985             : 
    1986     4133580 :             const int iSizeX = nSrcXOff2 - nSrcXOff;
    1987     4133580 :             nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
    1988     4133580 :             nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
    1989             : 
    1990     4133580 :             if (nSrcXOff2 > nChunkRightXOff ||
    1991     4127930 :                 (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
    1992             :             {
    1993        5650 :                 nSrcXOff2 =
    1994        5650 :                     std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
    1995             :             }
    1996             : 
    1997     4133580 :             int nXShiftGaussMatrix = 0;
    1998     4133580 :             if (nSrcXOff < nChunkXOff)
    1999             :             {
    2000           0 :                 nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
    2001           0 :                 nSrcXOff = nChunkXOff;
    2002             :             }
    2003             : 
    2004     4133580 :             if (poColorTable == nullptr)
    2005             :             {
    2006     4133380 :                 double dfTotal = 0.0;
    2007     4133380 :                 GInt64 nCount = 0;
    2008     4133380 :                 const int *panLineWeight =
    2009     4133380 :                     panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
    2010             :                     nXShiftGaussMatrix;
    2011             : 
    2012    16527900 :                 for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
    2013    12394500 :                      ++iY, ++j, panLineWeight += nGaussMatrixDim)
    2014             :                 {
    2015    49561300 :                     for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
    2016             :                     {
    2017    37166800 :                         const double val =
    2018    37166800 :                             padfSrcScanline[iX - nChunkXOff +
    2019    37166800 :                                             static_cast<GPtrDiff_t>(iY -
    2020    37166800 :                                                                     nSrcYOff) *
    2021    37166800 :                                                 nChunkXSize];
    2022    37166800 :                         if (pabySrcScanlineNodataMask == nullptr ||
    2023       32872 :                             pabySrcScanlineNodataMask[iX - nChunkXOff +
    2024       32872 :                                                       static_cast<GPtrDiff_t>(
    2025       32872 :                                                           iY - nSrcYOff) *
    2026       32872 :                                                           nChunkXSize])
    2027             :                         {
    2028    37146100 :                             const int nWeight = panLineWeight[i];
    2029    37146100 :                             dfTotal += val * nWeight;
    2030    37146100 :                             nCount += nWeight;
    2031             :                         }
    2032             :                     }
    2033             :                 }
    2034             : 
    2035     4133380 :                 if (nCount == 0)
    2036             :                 {
    2037        2217 :                     padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
    2038             :                 }
    2039             :                 else
    2040             :                 {
    2041     4131160 :                     padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
    2042             :                 }
    2043             :             }
    2044             :             else
    2045             :             {
    2046         200 :                 GInt64 nTotalR = 0;
    2047         200 :                 GInt64 nTotalG = 0;
    2048         200 :                 GInt64 nTotalB = 0;
    2049         200 :                 GInt64 nTotalWeight = 0;
    2050         200 :                 const int *panLineWeight =
    2051         200 :                     panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
    2052             :                     nXShiftGaussMatrix;
    2053             : 
    2054         780 :                 for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
    2055         580 :                      ++iY, ++j, panLineWeight += nGaussMatrixDim)
    2056             :                 {
    2057        2262 :                     for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
    2058             :                     {
    2059        1682 :                         const double val =
    2060        1682 :                             padfSrcScanline[iX - nChunkXOff +
    2061        1682 :                                             static_cast<GPtrDiff_t>(iY -
    2062        1682 :                                                                     nSrcYOff) *
    2063        1682 :                                                 nChunkXSize];
    2064        1682 :                         if (val < 0 || val >= colorEntries.size())
    2065           0 :                             continue;
    2066             : 
    2067        1682 :                         size_t idx = static_cast<size_t>(val);
    2068        1682 :                         if (colorEntries[idx].c4)
    2069             :                         {
    2070        1682 :                             const int nWeight = panLineWeight[i];
    2071        1682 :                             nTotalR +=
    2072        1682 :                                 static_cast<GInt64>(colorEntries[idx].c1) *
    2073        1682 :                                 nWeight;
    2074        1682 :                             nTotalG +=
    2075        1682 :                                 static_cast<GInt64>(colorEntries[idx].c2) *
    2076        1682 :                                 nWeight;
    2077        1682 :                             nTotalB +=
    2078        1682 :                                 static_cast<GInt64>(colorEntries[idx].c3) *
    2079        1682 :                                 nWeight;
    2080        1682 :                             nTotalWeight += nWeight;
    2081             :                         }
    2082             :                     }
    2083             :                 }
    2084             : 
    2085         200 :                 if (nTotalWeight == 0)
    2086             :                 {
    2087           0 :                     padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
    2088             :                 }
    2089             :                 else
    2090             :                 {
    2091             :                     GDALColorEntry color;
    2092             : 
    2093         200 :                     color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
    2094             :                                                   nTotalWeight);
    2095         200 :                     color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
    2096             :                                                   nTotalWeight);
    2097         200 :                     color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
    2098             :                                                   nTotalWeight);
    2099         200 :                     padfDstScanline[iDstPixel - nDstXOff] =
    2100         200 :                         BestColorEntry(colorEntries, color);
    2101             :                 }
    2102             :             }
    2103             :         }
    2104             :     }
    2105             : 
    2106             : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
    2107             :     CPLFree(panGaussMatrixDup);
    2108             : #endif
    2109             : 
    2110          86 :     return CE_None;
    2111             : }
    2112             : 
    2113             : /************************************************************************/
    2114             : /*                      GDALResampleChunk_Mode()                        */
    2115             : /************************************************************************/
    2116             : 
    2117        4398 : template <class T> static inline bool IsSame(T a, T b)
    2118             : {
    2119        4398 :     return a == b;
    2120             : }
    2121             : 
    2122        4854 : template <> bool IsSame<float>(float a, float b)
    2123             : {
    2124        4854 :     return a == b || (std::isnan(a) && std::isnan(b));
    2125             : }
    2126             : 
    2127         504 : template <> bool IsSame<double>(double a, double b)
    2128             : {
    2129         504 :     return a == b || (std::isnan(a) && std::isnan(b));
    2130             : }
    2131             : 
    2132             : template <>
    2133         480 : bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
    2134             : {
    2135         960 :     return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
    2136         960 :                       std::isnan(b.real()) && std::isnan(b.imag()));
    2137             : }
    2138             : 
    2139             : template <>
    2140         480 : bool IsSame<std::complex<double>>(std::complex<double> a,
    2141             :                                   std::complex<double> b)
    2142             : {
    2143         960 :     return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
    2144         960 :                       std::isnan(b.real()) && std::isnan(b.imag()));
    2145             : }
    2146             : 
    2147             : template <class T>
    2148         136 : static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
    2149             :                                       const T *pChunk, T *const pDstBuffer)
    2150             : 
    2151             : {
    2152         136 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    2153         136 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    2154         136 :     const double dfSrcXDelta = args.dfSrcXDelta;
    2155         136 :     const double dfSrcYDelta = args.dfSrcYDelta;
    2156         136 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    2157         136 :     const int nChunkXOff = args.nChunkXOff;
    2158         136 :     const int nChunkXSize = args.nChunkXSize;
    2159         136 :     const int nChunkYOff = args.nChunkYOff;
    2160         136 :     const int nChunkYSize = args.nChunkYSize;
    2161         136 :     const int nDstXOff = args.nDstXOff;
    2162         136 :     const int nDstXOff2 = args.nDstXOff2;
    2163         136 :     const int nDstYOff = args.nDstYOff;
    2164         136 :     const int nDstYOff2 = args.nDstYOff2;
    2165         136 :     const bool bHasNoData = args.bHasNoData;
    2166         136 :     const GDALColorTable *poColorTable = args.poColorTable;
    2167         136 :     const int nDstXSize = nDstXOff2 - nDstXOff;
    2168             : 
    2169           8 :     T tNoDataValue;
    2170             :     if constexpr (std::is_same<T, std::complex<float>>::value ||
    2171             :                   std::is_same<T, std::complex<double>>::value)
    2172             :     {
    2173             :         using BaseT = typename T::value_type;
    2174           8 :         tNoDataValue =
    2175             :             std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
    2176             :                                 std::numeric_limits<BaseT>::quiet_NaN());
    2177             :     }
    2178         128 :     else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
    2179         127 :         tNoDataValue = 0;
    2180             :     else
    2181           1 :         tNoDataValue = static_cast<T>(args.dfNoDataValue);
    2182             : 
    2183         136 :     size_t nMaxNumPx = 0;
    2184         136 :     T *paVals = nullptr;
    2185         136 :     int *panSums = nullptr;
    2186             : 
    2187         136 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    2188         136 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    2189         272 :     std::vector<int> anVals(256, 0);
    2190             : 
    2191             :     /* ==================================================================== */
    2192             :     /*      Loop over destination scanlines.                                */
    2193             :     /* ==================================================================== */
    2194        7531 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    2195             :     {
    2196        7395 :         double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
    2197        7395 :         int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
    2198             : #ifdef only_pixels_with_more_than_10_pct_participation
    2199             :         // When oversampling, don't take into account pixels that have a tiny
    2200             :         // participation in the resulting pixel
    2201             :         if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
    2202             :             nSrcYOff < nChunkBottomYOff)
    2203             :             nSrcYOff++;
    2204             : #endif
    2205        7395 :         if (nSrcYOff < nChunkYOff)
    2206           0 :             nSrcYOff = nChunkYOff;
    2207             : 
    2208        7395 :         double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
    2209        7395 :         int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
    2210             : #ifdef only_pixels_with_more_than_10_pct_participation
    2211             :         // When oversampling, don't take into account pixels that have a tiny
    2212             :         // participation in the resulting pixel
    2213             :         if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
    2214             :             nSrcYOff2 > nChunkYOff)
    2215             :             nSrcYOff2--;
    2216             : #endif
    2217        7395 :         if (nSrcYOff2 == nSrcYOff)
    2218           0 :             ++nSrcYOff2;
    2219        7395 :         if (nSrcYOff2 > nChunkBottomYOff)
    2220           0 :             nSrcYOff2 = nChunkBottomYOff;
    2221             : 
    2222        7395 :         const T *const paSrcScanline =
    2223         149 :             pChunk +
    2224        7395 :             (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
    2225        7395 :         const GByte *pabySrcScanlineNodataMask = nullptr;
    2226        7395 :         if (pabyChunkNodataMask != nullptr)
    2227        1810 :             pabySrcScanlineNodataMask =
    2228             :                 pabyChunkNodataMask +
    2229        1810 :                 static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
    2230             : 
    2231        7395 :         T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
    2232             :         /* --------------------------------------------------------------------
    2233             :          */
    2234             :         /*      Loop over destination pixels */
    2235             :         /* --------------------------------------------------------------------
    2236             :          */
    2237     4259580 :         for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    2238             :         {
    2239     4252187 :             double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
    2240             :             // Apply some epsilon to avoid numerical precision issues
    2241     4252187 :             int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
    2242             : #ifdef only_pixels_with_more_than_10_pct_participation
    2243             :             // When oversampling, don't take into account pixels that have a
    2244             :             // tiny participation in the resulting pixel
    2245             :             if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
    2246             :                 nSrcXOff < nChunkRightXOff)
    2247             :                 nSrcXOff++;
    2248             : #endif
    2249     4252187 :             if (nSrcXOff < nChunkXOff)
    2250           0 :                 nSrcXOff = nChunkXOff;
    2251             : 
    2252     4252187 :             double dfSrcXOff2 =
    2253     4252187 :                 dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
    2254     4252187 :             int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
    2255             : #ifdef only_pixels_with_more_than_10_pct_participation
    2256             :             // When oversampling, don't take into account pixels that have a
    2257             :             // tiny participation in the resulting pixel
    2258             :             if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
    2259             :                 nSrcXOff2 > nChunkXOff)
    2260             :                 nSrcXOff2--;
    2261             : #endif
    2262     4252187 :             if (nSrcXOff2 == nSrcXOff)
    2263           0 :                 nSrcXOff2++;
    2264     4252187 :             if (nSrcXOff2 > nChunkRightXOff)
    2265           0 :                 nSrcXOff2 = nChunkRightXOff;
    2266             : 
    2267     4252187 :             bool bRegularProcessing = false;
    2268             :             if constexpr (!std::is_same<T, GByte>::value)
    2269         827 :                 bRegularProcessing = true;
    2270     4251360 :             else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
    2271           0 :                 bRegularProcessing = true;
    2272             : 
    2273     4252187 :             if (bRegularProcessing)
    2274             :             {
    2275             :                 // Not sure how much sense it makes to run a majority
    2276             :                 // filter on floating point data, but here it is for the sake
    2277             :                 // of compatibility. It won't look right on RGB images by the
    2278             :                 // nature of the filter.
    2279             : 
    2280         827 :                 if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
    2281        2481 :                     nSrcYOff2 - nSrcYOff > INT_MAX / (nSrcXOff2 - nSrcXOff) ||
    2282         827 :                     static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
    2283         827 :                             static_cast<size_t>(nSrcXOff2 - nSrcXOff) >
    2284         827 :                         std::numeric_limits<size_t>::max() / sizeof(float))
    2285             :                 {
    2286           0 :                     CPLError(CE_Failure, CPLE_NotSupported,
    2287             :                              "Too big downsampling factor");
    2288           0 :                     CPLFree(paVals);
    2289           0 :                     CPLFree(panSums);
    2290           0 :                     return CE_Failure;
    2291             :                 }
    2292         827 :                 const size_t nNumPx =
    2293         827 :                     static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
    2294         827 :                     static_cast<size_t>(nSrcXOff2 - nSrcXOff);
    2295         827 :                 size_t iMaxInd = 0;
    2296         827 :                 size_t iMaxVal = 0;
    2297         827 :                 bool biMaxValdValid = false;
    2298             : 
    2299         827 :                 if (paVals == nullptr || nNumPx > nMaxNumPx)
    2300             :                 {
    2301             :                     T *paValsNew = static_cast<T *>(
    2302          71 :                         VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
    2303             :                     int *panSumsNew = static_cast<int *>(
    2304          71 :                         VSI_REALLOC_VERBOSE(panSums, nNumPx * sizeof(int)));
    2305          71 :                     if (paValsNew != nullptr)
    2306          71 :                         paVals = paValsNew;
    2307          71 :                     if (panSumsNew != nullptr)
    2308          71 :                         panSums = panSumsNew;
    2309          71 :                     if (paValsNew == nullptr || panSumsNew == nullptr)
    2310             :                     {
    2311           0 :                         CPLFree(paVals);
    2312           0 :                         CPLFree(panSums);
    2313           0 :                         return CE_Failure;
    2314             :                     }
    2315          71 :                     nMaxNumPx = nNumPx;
    2316             :                 }
    2317             : 
    2318        2585 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    2319             :                 {
    2320        1758 :                     const GPtrDiff_t iTotYOff =
    2321        1758 :                         static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
    2322        1758 :                         nChunkXOff;
    2323        5690 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    2324             :                     {
    2325        3932 :                         if (pabySrcScanlineNodataMask == nullptr ||
    2326          16 :                             pabySrcScanlineNodataMask[iX + iTotYOff])
    2327             :                         {
    2328        3917 :                             const T val = paSrcScanline[iX + iTotYOff];
    2329        3917 :                             size_t i = 0;  // Used after for.
    2330             : 
    2331             :                             // Check array for existing entry.
    2332       14387 :                             for (; i < iMaxInd; ++i)
    2333       17626 :                                 if (IsSame(paVals[i], val) &&
    2334        6910 :                                     ++panSums[i] > panSums[iMaxVal])
    2335             :                                 {
    2336         246 :                                     iMaxVal = i;
    2337         246 :                                     biMaxValdValid = true;
    2338         246 :                                     break;
    2339             :                                 }
    2340             : 
    2341             :                             // Add to arr if entry not already there.
    2342        3917 :                             if (i == iMaxInd)
    2343             :                             {
    2344        3671 :                                 paVals[iMaxInd] = val;
    2345        3671 :                                 panSums[iMaxInd] = 1;
    2346             : 
    2347        3671 :                                 if (!biMaxValdValid)
    2348             :                                 {
    2349         824 :                                     iMaxVal = iMaxInd;
    2350         824 :                                     biMaxValdValid = true;
    2351             :                                 }
    2352             : 
    2353        3671 :                                 ++iMaxInd;
    2354             :                             }
    2355             :                         }
    2356             :                     }
    2357             :                 }
    2358             : 
    2359         827 :                 if (!biMaxValdValid)
    2360           3 :                     paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
    2361             :                 else
    2362         824 :                     paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
    2363             :             }
    2364             :             else if constexpr (std::is_same<T, GByte>::value)
    2365             :             // ( eSrcDataType == GDT_Byte && nEntryCount < 256 )
    2366             :             {
    2367             :                 // So we go here for a paletted or non-paletted byte band.
    2368             :                 // The input values are then between 0 and 255.
    2369     4251360 :                 int nMaxVal = 0;
    2370     4251360 :                 int iMaxInd = -1;
    2371             : 
    2372             :                 // The cost of this zeroing might be high. Perhaps we should
    2373             :                 // just use the above generic case, and go to this one if the
    2374             :                 // number of source pixels is large enough
    2375     4251360 :                 std::fill(anVals.begin(), anVals.end(), 0);
    2376             : 
    2377    12777700 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    2378             :                 {
    2379     8526370 :                     const GPtrDiff_t iTotYOff =
    2380     8526370 :                         static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
    2381     8526370 :                         nChunkXOff;
    2382    25649400 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    2383             :                     {
    2384    17123000 :                         const T val = paSrcScanline[iX + iTotYOff];
    2385    17123000 :                         if (!bHasNoData || val != tNoDataValue)
    2386             :                         {
    2387    17123000 :                             int nVal = static_cast<int>(val);
    2388    17123000 :                             if (++anVals[nVal] > nMaxVal)
    2389             :                             {
    2390             :                                 // Sum the density.
    2391             :                                 // Is it the most common value so far?
    2392    17006300 :                                 iMaxInd = nVal;
    2393    17006300 :                                 nMaxVal = anVals[nVal];
    2394             :                             }
    2395             :                         }
    2396             :                     }
    2397             :                 }
    2398             : 
    2399     4251360 :                 if (iMaxInd == -1)
    2400           0 :                     paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
    2401             :                 else
    2402     4251360 :                     paDstScanline[iDstPixel - nDstXOff] =
    2403             :                         static_cast<T>(iMaxInd);
    2404             :             }
    2405             :         }
    2406             :     }
    2407             : 
    2408         136 :     CPLFree(paVals);
    2409         136 :     CPLFree(panSums);
    2410             : 
    2411         136 :     return CE_None;
    2412             : }
    2413             : 
    2414         136 : static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
    2415             :                                      const void *pChunk, void **ppDstBuffer,
    2416             :                                      GDALDataType *peDstBufferDataType)
    2417             : {
    2418         136 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(
    2419             :         args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
    2420             :         GDALGetDataTypeSizeBytes(args.eWrkDataType));
    2421         136 :     if (*ppDstBuffer == nullptr)
    2422             :     {
    2423           0 :         return CE_Failure;
    2424             :     }
    2425             : 
    2426         136 :     CPLAssert(args.eSrcDataType == args.eWrkDataType);
    2427             : 
    2428         136 :     *peDstBufferDataType = args.eWrkDataType;
    2429         136 :     switch (args.eWrkDataType)
    2430             :     {
    2431             :         // For mode resampling, as no computation is done, only the
    2432             :         // size of the data type matters... except for Byte where we have
    2433             :         // special processing. And for floating point values
    2434          65 :         case GDT_Byte:
    2435             :         {
    2436          65 :             return GDALResampleChunk_ModeT(args,
    2437             :                                            static_cast<const GByte *>(pChunk),
    2438          65 :                                            static_cast<GByte *>(*ppDstBuffer));
    2439             :         }
    2440             : 
    2441           4 :         case GDT_Int8:
    2442             :         {
    2443           4 :             return GDALResampleChunk_ModeT(args,
    2444             :                                            static_cast<const int8_t *>(pChunk),
    2445           4 :                                            static_cast<int8_t *>(*ppDstBuffer));
    2446             :         }
    2447             : 
    2448           9 :         case GDT_Int16:
    2449             :         case GDT_UInt16:
    2450             :         case GDT_Float16:
    2451             :         {
    2452           9 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
    2453           9 :             return GDALResampleChunk_ModeT(
    2454             :                 args, static_cast<const uint16_t *>(pChunk),
    2455           9 :                 static_cast<uint16_t *>(*ppDstBuffer));
    2456             :         }
    2457             : 
    2458          15 :         case GDT_CInt16:
    2459             :         case GDT_CFloat16:
    2460             :         case GDT_Int32:
    2461             :         case GDT_UInt32:
    2462             :         {
    2463          15 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
    2464          15 :             return GDALResampleChunk_ModeT(
    2465             :                 args, static_cast<const uint32_t *>(pChunk),
    2466          15 :                 static_cast<uint32_t *>(*ppDstBuffer));
    2467             :         }
    2468             : 
    2469          17 :         case GDT_Float32:
    2470             :         {
    2471          17 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
    2472          17 :             return GDALResampleChunk_ModeT(args,
    2473             :                                            static_cast<const float *>(pChunk),
    2474          17 :                                            static_cast<float *>(*ppDstBuffer));
    2475             :         }
    2476             : 
    2477          12 :         case GDT_CInt32:
    2478             :         case GDT_Int64:
    2479             :         case GDT_UInt64:
    2480             :         {
    2481          12 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
    2482          12 :             return GDALResampleChunk_ModeT(
    2483             :                 args, static_cast<const uint64_t *>(pChunk),
    2484          12 :                 static_cast<uint64_t *>(*ppDstBuffer));
    2485             :         }
    2486             : 
    2487           6 :         case GDT_Float64:
    2488             :         {
    2489           6 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
    2490           6 :             return GDALResampleChunk_ModeT(args,
    2491             :                                            static_cast<const double *>(pChunk),
    2492           6 :                                            static_cast<double *>(*ppDstBuffer));
    2493             :         }
    2494             : 
    2495           4 :         case GDT_CFloat32:
    2496             :         {
    2497           4 :             return GDALResampleChunk_ModeT(
    2498             :                 args, static_cast<const std::complex<float> *>(pChunk),
    2499           4 :                 static_cast<std::complex<float> *>(*ppDstBuffer));
    2500             :         }
    2501             : 
    2502           4 :         case GDT_CFloat64:
    2503             :         {
    2504           4 :             return GDALResampleChunk_ModeT(
    2505             :                 args, static_cast<const std::complex<double> *>(pChunk),
    2506           4 :                 static_cast<std::complex<double> *>(*ppDstBuffer));
    2507             :         }
    2508             : 
    2509           0 :         case GDT_Unknown:
    2510             :         case GDT_TypeCount:
    2511           0 :             break;
    2512             :     }
    2513             : 
    2514           0 :     CPLAssert(false);
    2515             :     return CE_Failure;
    2516             : }
    2517             : 
    2518             : /************************************************************************/
    2519             : /*                  GDALResampleConvolutionHorizontal()                 */
    2520             : /************************************************************************/
    2521             : 
    2522             : template <class T>
    2523             : static inline double
    2524       44886 : GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
    2525             :                                   int nSrcPixelCount)
    2526             : {
    2527       44886 :     double dfVal1 = 0.0;
    2528       44886 :     double dfVal2 = 0.0;
    2529       44886 :     int i = 0;  // Used after for.
    2530             :     // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    2531             :     // manually (untypical) unrolled loop in -O2 and -O3:
    2532             :     // https://github.com/OSGeo/gdal/issues/9508
    2533             : #if !defined(__INTEL_CLANG_COMPILER)
    2534       89516 :     for (; i < nSrcPixelCount - 3; i += 4)
    2535             :     {
    2536       44630 :         dfVal1 += pChunk[i] * padfWeights[i];
    2537       44630 :         dfVal1 += pChunk[i + 1] * padfWeights[i + 1];
    2538       44630 :         dfVal2 += pChunk[i + 2] * padfWeights[i + 2];
    2539       44630 :         dfVal2 += pChunk[i + 3] * padfWeights[i + 3];
    2540             :     }
    2541             : #endif
    2542       46358 :     for (; i < nSrcPixelCount; ++i)
    2543             :     {
    2544        1472 :         dfVal1 += pChunk[i] * padfWeights[i];
    2545             :     }
    2546       44886 :     return dfVal1 + dfVal2;
    2547             : }
    2548             : 
    2549             : template <class T>
    2550       44576 : static inline void GDALResampleConvolutionHorizontalWithMask(
    2551             :     const T *pChunk, const GByte *pabyMask, const double *padfWeights,
    2552             :     int nSrcPixelCount, double &dfVal, double &dfWeightSum)
    2553             : {
    2554       44576 :     dfVal = 0;
    2555       44576 :     dfWeightSum = 0;
    2556       44576 :     int i = 0;
    2557       98300 :     for (; i < nSrcPixelCount - 3; i += 4)
    2558             :     {
    2559       53724 :         const double dfWeight0 = padfWeights[i] * pabyMask[i];
    2560       53724 :         const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
    2561       53724 :         const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
    2562       53724 :         const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
    2563       53724 :         dfVal += pChunk[i] * dfWeight0;
    2564       53724 :         dfVal += pChunk[i + 1] * dfWeight1;
    2565       53724 :         dfVal += pChunk[i + 2] * dfWeight2;
    2566       53724 :         dfVal += pChunk[i + 3] * dfWeight3;
    2567       53724 :         dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
    2568             :     }
    2569       61162 :     for (; i < nSrcPixelCount; ++i)
    2570             :     {
    2571       16586 :         const double dfWeight = padfWeights[i] * pabyMask[i];
    2572       16586 :         dfVal += pChunk[i] * dfWeight;
    2573       16586 :         dfWeightSum += dfWeight;
    2574             :     }
    2575       44576 : }
    2576             : 
    2577             : template <class T>
    2578     1340094 : static inline void GDALResampleConvolutionHorizontal_3rows(
    2579             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2580             :     const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    2581             :     double &dfRes2, double &dfRes3)
    2582             : {
    2583     1340094 :     double dfVal1 = 0.0;
    2584     1340094 :     double dfVal2 = 0.0;
    2585     1340094 :     double dfVal3 = 0.0;
    2586     1340094 :     double dfVal4 = 0.0;
    2587     1340094 :     double dfVal5 = 0.0;
    2588     1340094 :     double dfVal6 = 0.0;
    2589     1340094 :     int i = 0;  // Used after for.
    2590     2733937 :     for (; i < nSrcPixelCount - 3; i += 4)
    2591             :     {
    2592     1393842 :         dfVal1 += pChunkRow1[i] * padfWeights[i];
    2593     1393842 :         dfVal1 += pChunkRow1[i + 1] * padfWeights[i + 1];
    2594     1393842 :         dfVal2 += pChunkRow1[i + 2] * padfWeights[i + 2];
    2595     1393842 :         dfVal2 += pChunkRow1[i + 3] * padfWeights[i + 3];
    2596     1393842 :         dfVal3 += pChunkRow2[i] * padfWeights[i];
    2597     1393842 :         dfVal3 += pChunkRow2[i + 1] * padfWeights[i + 1];
    2598     1393842 :         dfVal4 += pChunkRow2[i + 2] * padfWeights[i + 2];
    2599     1393842 :         dfVal4 += pChunkRow2[i + 3] * padfWeights[i + 3];
    2600     1393842 :         dfVal5 += pChunkRow3[i] * padfWeights[i];
    2601     1393842 :         dfVal5 += pChunkRow3[i + 1] * padfWeights[i + 1];
    2602     1393842 :         dfVal6 += pChunkRow3[i + 2] * padfWeights[i + 2];
    2603     1393842 :         dfVal6 += pChunkRow3[i + 3] * padfWeights[i + 3];
    2604             :     }
    2605     1378621 :     for (; i < nSrcPixelCount; ++i)
    2606             :     {
    2607       38527 :         dfVal1 += pChunkRow1[i] * padfWeights[i];
    2608       38527 :         dfVal3 += pChunkRow2[i] * padfWeights[i];
    2609       38527 :         dfVal5 += pChunkRow3[i] * padfWeights[i];
    2610             :     }
    2611     1340094 :     dfRes1 = dfVal1 + dfVal2;
    2612     1340094 :     dfRes2 = dfVal3 + dfVal4;
    2613     1340094 :     dfRes3 = dfVal5 + dfVal6;
    2614     1340094 : }
    2615             : 
    2616             : template <class T>
    2617       18828 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
    2618             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2619             :     const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    2620             :     double &dfRes2, double &dfRes3)
    2621             : {
    2622       18828 :     GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
    2623             :                                             padfWeights, nSrcPixelCount, dfRes1,
    2624             :                                             dfRes2, dfRes3);
    2625       18828 : }
    2626             : 
    2627             : template <class T>
    2628     1256466 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
    2629             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2630             :     const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
    2631             : {
    2632     1256466 :     GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
    2633             :                                             padfWeights, 4, dfRes1, dfRes2,
    2634             :                                             dfRes3);
    2635     1256466 : }
    2636             : 
    2637             : /************************************************************************/
    2638             : /*                  GDALResampleConvolutionVertical()                   */
    2639             : /************************************************************************/
    2640             : 
    2641             : template <class T>
    2642             : static inline double
    2643      465244 : GDALResampleConvolutionVertical(const T *pChunk, size_t nStride,
    2644             :                                 const double *padfWeights, int nSrcLineCount)
    2645             : {
    2646      465244 :     double dfVal1 = 0.0;
    2647      465244 :     double dfVal2 = 0.0;
    2648      465244 :     int i = 0;
    2649      465244 :     size_t j = 0;
    2650      916100 :     for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
    2651             :     {
    2652      450856 :         dfVal1 += pChunk[j + 0 * nStride] * padfWeights[i + 0];
    2653      450856 :         dfVal1 += pChunk[j + 1 * nStride] * padfWeights[i + 1];
    2654      450856 :         dfVal2 += pChunk[j + 2 * nStride] * padfWeights[i + 2];
    2655      450856 :         dfVal2 += pChunk[j + 3 * nStride] * padfWeights[i + 3];
    2656             :     }
    2657      518747 :     for (; i < nSrcLineCount; ++i, j += nStride)
    2658             :     {
    2659       53503 :         dfVal1 += pChunk[j] * padfWeights[i];
    2660             :     }
    2661      465244 :     return dfVal1 + dfVal2;
    2662             : }
    2663             : 
    2664             : template <class T>
    2665     2880000 : static inline void GDALResampleConvolutionVertical_2cols(
    2666             :     const T *pChunk, size_t nStride, const double *padfWeights,
    2667             :     int nSrcLineCount, double &dfRes1, double &dfRes2)
    2668             : {
    2669     2880000 :     double dfVal1 = 0.0;
    2670     2880000 :     double dfVal2 = 0.0;
    2671     2880000 :     double dfVal3 = 0.0;
    2672     2880000 :     double dfVal4 = 0.0;
    2673     2880000 :     int i = 0;
    2674     2880000 :     size_t j = 0;
    2675     5716800 :     for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
    2676             :     {
    2677     2836800 :         dfVal1 += pChunk[j + 0 + 0 * nStride] * padfWeights[i + 0];
    2678     2836800 :         dfVal3 += pChunk[j + 1 + 0 * nStride] * padfWeights[i + 0];
    2679     2836800 :         dfVal1 += pChunk[j + 0 + 1 * nStride] * padfWeights[i + 1];
    2680     2836800 :         dfVal3 += pChunk[j + 1 + 1 * nStride] * padfWeights[i + 1];
    2681     2836800 :         dfVal2 += pChunk[j + 0 + 2 * nStride] * padfWeights[i + 2];
    2682     2836800 :         dfVal4 += pChunk[j + 1 + 2 * nStride] * padfWeights[i + 2];
    2683     2836800 :         dfVal2 += pChunk[j + 0 + 3 * nStride] * padfWeights[i + 3];
    2684     2836800 :         dfVal4 += pChunk[j + 1 + 3 * nStride] * padfWeights[i + 3];
    2685             :     }
    2686     2995210 :     for (; i < nSrcLineCount; ++i, j += nStride)
    2687             :     {
    2688      115210 :         dfVal1 += pChunk[j + 0] * padfWeights[i];
    2689      115210 :         dfVal3 += pChunk[j + 1] * padfWeights[i];
    2690             :     }
    2691     2880000 :     dfRes1 = dfVal1 + dfVal2;
    2692     2880000 :     dfRes2 = dfVal3 + dfVal4;
    2693     2880000 : }
    2694             : 
    2695             : #ifdef USE_SSE2
    2696             : 
    2697             : #ifdef __AVX__
    2698             : /************************************************************************/
    2699             : /*             GDALResampleConvolutionVertical_16cols<T>                */
    2700             : /************************************************************************/
    2701             : 
    2702             : template <class T>
    2703             : static inline void
    2704             : GDALResampleConvolutionVertical_16cols(const T *pChunk, size_t nStride,
    2705             :                                        const double *padfWeights,
    2706             :                                        int nSrcLineCount, float *afDest)
    2707             : {
    2708             :     int i = 0;
    2709             :     size_t j = 0;
    2710             :     XMMReg4Double v_acc0 = XMMReg4Double::Zero();
    2711             :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    2712             :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
    2713             :     XMMReg4Double v_acc3 = XMMReg4Double::Zero();
    2714             :     for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
    2715             :     {
    2716             :         XMMReg4Double w0 =
    2717             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
    2718             :         XMMReg4Double w1 =
    2719             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
    2720             :         XMMReg4Double w2 =
    2721             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
    2722             :         XMMReg4Double w3 =
    2723             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
    2724             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
    2725             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
    2726             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
    2727             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
    2728             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
    2729             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
    2730             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
    2731             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
    2732             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
    2733             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
    2734             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
    2735             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
    2736             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
    2737             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
    2738             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
    2739             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
    2740             :     }
    2741             :     for (; i < nSrcLineCount; ++i, j += nStride)
    2742             :     {
    2743             :         XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
    2744             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
    2745             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
    2746             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
    2747             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
    2748             :     }
    2749             :     v_acc0.Store4Val(afDest);
    2750             :     v_acc1.Store4Val(afDest + 4);
    2751             :     v_acc2.Store4Val(afDest + 8);
    2752             :     v_acc3.Store4Val(afDest + 12);
    2753             : }
    2754             : 
    2755             : template <class T>
    2756             : static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
    2757             :                                                           const double *, int,
    2758             :                                                           double *)
    2759             : {
    2760             :     // Cannot be reached
    2761             :     CPLAssert(false);
    2762             : }
    2763             : 
    2764             : #else
    2765             : 
    2766             : /************************************************************************/
    2767             : /*              GDALResampleConvolutionVertical_8cols<T>                */
    2768             : /************************************************************************/
    2769             : 
    2770             : template <class T>
    2771             : static inline void
    2772    22764800 : GDALResampleConvolutionVertical_8cols(const T *pChunk, size_t nStride,
    2773             :                                       const double *padfWeights,
    2774             :                                       int nSrcLineCount, float *afDest)
    2775             : {
    2776    22764800 :     int i = 0;
    2777    22764800 :     size_t j = 0;
    2778    22764800 :     XMMReg4Double v_acc0 = XMMReg4Double::Zero();
    2779    22753300 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    2780    44995200 :     for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
    2781             :     {
    2782    22228600 :         XMMReg4Double w0 =
    2783    22228600 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
    2784    22204400 :         XMMReg4Double w1 =
    2785    22204400 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
    2786    22195800 :         XMMReg4Double w2 =
    2787    22195800 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
    2788    22217900 :         XMMReg4Double w3 =
    2789    22217900 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
    2790    22211200 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
    2791    22210300 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
    2792    22192500 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
    2793    22205500 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
    2794    22209700 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
    2795    22209000 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
    2796    22207200 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
    2797    22217600 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
    2798             :     }
    2799    34307000 :     for (; i < nSrcLineCount; ++i, j += nStride)
    2800             :     {
    2801    11540500 :         XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
    2802    11540500 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
    2803    11540500 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
    2804             :     }
    2805    22766500 :     v_acc0.Store4Val(afDest);
    2806    22745600 :     v_acc1.Store4Val(afDest + 4);
    2807    22779400 : }
    2808             : 
    2809             : template <class T>
    2810             : static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
    2811             :                                                          const double *, int,
    2812             :                                                          double *)
    2813             : {
    2814             :     // Cannot be reached
    2815             :     CPLAssert(false);
    2816             : }
    2817             : 
    2818             : #endif  // __AVX__
    2819             : 
    2820             : /************************************************************************/
    2821             : /*              GDALResampleConvolutionHorizontalSSE2<T>                */
    2822             : /************************************************************************/
    2823             : 
    2824             : template <class T>
    2825     3037035 : static inline double GDALResampleConvolutionHorizontalSSE2(
    2826             :     const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    2827             : {
    2828     3037035 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    2829     3036859 :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
    2830     3036899 :     int i = 0;  // Used after for.
    2831     3312425 :     for (; i < nSrcPixelCount - 7; i += 8)
    2832             :     {
    2833             :         // Retrieve the pixel & accumulate
    2834      275504 :         const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
    2835      275506 :         const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
    2836      275506 :         const XMMReg4Double v_weight1 =
    2837      275506 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    2838      275502 :         const XMMReg4Double v_weight2 =
    2839      275502 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
    2840             : 
    2841      275503 :         v_acc1 += v_pixels1 * v_weight1;
    2842      275504 :         v_acc2 += v_pixels2 * v_weight2;
    2843             :     }
    2844             : 
    2845     3036923 :     v_acc1 += v_acc2;
    2846             : 
    2847     3036890 :     double dfVal = v_acc1.GetHorizSum();
    2848    10209620 :     for (; i < nSrcPixelCount; ++i)
    2849             :     {
    2850     7172770 :         dfVal += pChunk[i] * padfWeightsAligned[i];
    2851             :     }
    2852     3036858 :     return dfVal;
    2853             : }
    2854             : 
    2855             : /************************************************************************/
    2856             : /*              GDALResampleConvolutionHorizontal<GByte>                */
    2857             : /************************************************************************/
    2858             : 
    2859             : template <>
    2860     2488100 : inline double GDALResampleConvolutionHorizontal<GByte>(
    2861             :     const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    2862             : {
    2863     2488100 :     return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
    2864     2488130 :                                                  nSrcPixelCount);
    2865             : }
    2866             : 
    2867             : template <>
    2868      548956 : inline double GDALResampleConvolutionHorizontal<GUInt16>(
    2869             :     const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    2870             : {
    2871      548956 :     return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
    2872      548950 :                                                  nSrcPixelCount);
    2873             : }
    2874             : 
    2875             : /************************************************************************/
    2876             : /*              GDALResampleConvolutionHorizontalWithMaskSSE2<T>        */
    2877             : /************************************************************************/
    2878             : 
    2879             : template <class T>
    2880     7062423 : static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
    2881             :     const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
    2882             :     int nSrcPixelCount, double &dfVal, double &dfWeightSum)
    2883             : {
    2884     7062423 :     int i = 0;  // Used after for.
    2885     7062423 :     XMMReg4Double v_acc = XMMReg4Double::Zero();
    2886     7052503 :     XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
    2887    19726921 :     for (; i < nSrcPixelCount - 3; i += 4)
    2888             :     {
    2889    12681358 :         const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
    2890    12686158 :         const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
    2891    12687258 :         XMMReg4Double v_weight =
    2892    12687258 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    2893    12685458 :         v_weight *= v_mask;
    2894    12681858 :         v_acc += v_pixels * v_weight;
    2895    12684258 :         v_acc_weight += v_weight;
    2896             :     }
    2897             : 
    2898     7045503 :     dfVal = v_acc.GetHorizSum();
    2899     7065433 :     dfWeightSum = v_acc_weight.GetHorizSum();
    2900     7296643 :     for (; i < nSrcPixelCount; ++i)
    2901             :     {
    2902      231077 :         const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
    2903      231077 :         dfVal += pChunk[i] * dfWeight;
    2904      231077 :         dfWeightSum += dfWeight;
    2905             :     }
    2906     7065563 : }
    2907             : 
    2908             : /************************************************************************/
    2909             : /*              GDALResampleConvolutionHorizontalWithMask<GByte>        */
    2910             : /************************************************************************/
    2911             : 
    2912             : template <>
    2913     7067680 : inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
    2914             :     const GByte *pChunk, const GByte *pabyMask,
    2915             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
    2916             :     double &dfWeightSum)
    2917             : {
    2918     7067680 :     GDALResampleConvolutionHorizontalWithMaskSSE2(
    2919             :         pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
    2920             :         dfWeightSum);
    2921     7058330 : }
    2922             : 
    2923             : template <>
    2924          63 : inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
    2925             :     const GUInt16 *pChunk, const GByte *pabyMask,
    2926             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
    2927             :     double &dfWeightSum)
    2928             : {
    2929          63 :     GDALResampleConvolutionHorizontalWithMaskSSE2(
    2930             :         pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
    2931             :         dfWeightSum);
    2932          63 : }
    2933             : 
    2934             : /************************************************************************/
    2935             : /*              GDALResampleConvolutionHorizontal_3rows_SSE2<T>         */
    2936             : /************************************************************************/
    2937             : 
    2938             : template <class T>
    2939    22991830 : static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
    2940             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2941             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    2942             :     double &dfRes2, double &dfRes3)
    2943             : {
    2944    22991830 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
    2945    22959530 :                   v_acc2 = XMMReg4Double::Zero(),
    2946    22977230 :                   v_acc3 = XMMReg4Double::Zero();
    2947    22973930 :     int i = 0;
    2948    45803666 :     for (; i < nSrcPixelCount - 7; i += 8)
    2949             :     {
    2950             :         // Retrieve the pixel & accumulate.
    2951    22841336 :         XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
    2952    22870536 :         XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
    2953    22870736 :         const XMMReg4Double v_weight1 =
    2954    22870736 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    2955    22843236 :         const XMMReg4Double v_weight2 =
    2956    22843236 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
    2957             : 
    2958    22857336 :         v_acc1 += v_pixels1 * v_weight1;
    2959    22836636 :         v_acc1 += v_pixels2 * v_weight2;
    2960             : 
    2961    22839936 :         v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
    2962    22845936 :         v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
    2963    22851336 :         v_acc2 += v_pixels1 * v_weight1;
    2964    22844636 :         v_acc2 += v_pixels2 * v_weight2;
    2965             : 
    2966    22848736 :         v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
    2967    22843136 :         v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
    2968    22851136 :         v_acc3 += v_pixels1 * v_weight1;
    2969    22848536 :         v_acc3 += v_pixels2 * v_weight2;
    2970             :     }
    2971             : 
    2972    22962330 :     dfRes1 = v_acc1.GetHorizSum();
    2973    22958930 :     dfRes2 = v_acc2.GetHorizSum();
    2974    22950030 :     dfRes3 = v_acc3.GetHorizSum();
    2975    34850126 :     for (; i < nSrcPixelCount; ++i)
    2976             :     {
    2977    11883796 :         dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
    2978    11883796 :         dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
    2979    11883796 :         dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
    2980             :     }
    2981    22966430 : }
    2982             : 
    2983             : /************************************************************************/
    2984             : /*              GDALResampleConvolutionHorizontal_3rows<GByte>          */
    2985             : /************************************************************************/
    2986             : 
    2987             : template <>
    2988    22975600 : inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
    2989             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    2990             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    2991             :     double &dfRes2, double &dfRes3)
    2992             : {
    2993    22975600 :     GDALResampleConvolutionHorizontal_3rows_SSE2(
    2994             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    2995             :         dfRes1, dfRes2, dfRes3);
    2996    22956000 : }
    2997             : 
    2998             : template <>
    2999          30 : inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
    3000             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    3001             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
    3002             :     int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
    3003             : {
    3004          30 :     GDALResampleConvolutionHorizontal_3rows_SSE2(
    3005             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3006             :         dfRes1, dfRes2, dfRes3);
    3007          30 : }
    3008             : 
    3009             : /************************************************************************/
    3010             : /*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T>   */
    3011             : /************************************************************************/
    3012             : 
    3013             : template <class T>
    3014     5004922 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    3015             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    3016             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    3017             :     double &dfRes2, double &dfRes3)
    3018             : {
    3019     5004922 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    3020     4997450 :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
    3021     5003038 :     XMMReg4Double v_acc3 = XMMReg4Double::Zero();
    3022     5002819 :     int i = 0;  // Use after for.
    3023    10581930 :     for (; i < nSrcPixelCount - 3; i += 4)
    3024             :     {
    3025             :         // Retrieve the pixel & accumulate.
    3026     5581700 :         const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
    3027     5607630 :         const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
    3028     5602430 :         const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
    3029     5613620 :         const XMMReg4Double v_weight =
    3030     5613620 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    3031             : 
    3032     5583600 :         v_acc1 += v_pixels1 * v_weight;
    3033     5587950 :         v_acc2 += v_pixels2 * v_weight;
    3034     5594870 :         v_acc3 += v_pixels3 * v_weight;
    3035             :     }
    3036             : 
    3037     5000200 :     dfRes1 = v_acc1.GetHorizSum();
    3038     5000936 :     dfRes2 = v_acc2.GetHorizSum();
    3039     5000057 :     dfRes3 = v_acc3.GetHorizSum();
    3040             : 
    3041     9406369 :     for (; i < nSrcPixelCount; ++i)
    3042             :     {
    3043     4405122 :         dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
    3044     4405122 :         dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
    3045     4405122 :         dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
    3046             :     }
    3047     5001247 : }
    3048             : 
    3049             : /************************************************************************/
    3050             : /*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>    */
    3051             : /************************************************************************/
    3052             : 
    3053             : template <>
    3054     4934660 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
    3055             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    3056             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    3057             :     double &dfRes2, double &dfRes3)
    3058             : {
    3059     4934660 :     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    3060             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3061             :         dfRes1, dfRes2, dfRes3);
    3062     4933680 : }
    3063             : 
    3064             : template <>
                       // GUInt16 specialization of
                       // GDALResampleConvolutionHorizontalPixelCountLess8_3rows(): same
                       // contract as the GByte specialization, for 16-bit unsigned source
                       // pixels. It only forwards its arguments to
                       // GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2().
                       //
                       // pChunkRow1..3       start of the three source rows handled together.
                       // padfWeightsAligned  convolution weights, one per source pixel.
                       //                     NOTE(review): presumably must be aligned as the
                       //                     SSE2 helper requires - confirm there.
                       // nSrcPixelCount      number of source pixels (and weights) per row.
                       // dfRes1..3           outputs: one result per row.
    3065       67024 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
    3066             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    3067             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
    3068             :     int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
    3069             : {
                           // Pure delegation; the helper is instantiated for GUInt16.
    3070       67024 :     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    3071             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3072             :         dfRes1, dfRes2, dfRes3);
    3073       67089 : }
    3074             : 
    3075             : /************************************************************************/
    3076             : /*     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T>       */
    3077             : /************************************************************************/
    3078             : 
    3079             : template <class T>
                       // Horizontal convolution of exactly four source pixels, applied to
                       // three rows at once with a shared set of four weights.
                       //
                       // For each row r in 1..3 the function stores in dfRes<r> the
                       // horizontal sum of (four values loaded from pChunkRow<r>) multiplied
                       // element-wise by (four values loaded from padfWeightsAligned).
                       // dfRes1..3 are overwritten, not accumulated into.
                       //
                       // pChunkRow1..3       start of the four pixels to read in each row
                       //                     (read with XMMReg4Double::Load4Val).
                       // padfWeightsAligned  four weights, read with Load4ValAligned.
                       //                     NOTE(review): presumably requires the alignment
                       //                     of an aligned SSE load - confirm in
                       //                     XMMReg4Double.
                       // dfRes1..3           outputs: weighted sum for each row.
    3080    13894130 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    3081             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    3082             :     const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
    3083             :     double &dfRes3)
    3084             : {
                           // The weights are loaded once and reused for all three rows.
    3085    13894130 :     const XMMReg4Double v_weight =
    3086             :         XMMReg4Double::Load4ValAligned(padfWeightsAligned);
    3087             : 
    3088             :     // Load the four pixels of each row.
    3089    13926200 :     const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
    3090    13935150 :     const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
    3091    13938510 :     const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
    3092             : 
                           // Element-wise pixel * weight products, one vector per row.
    3093    13955150 :     XMMReg4Double v_acc1 = v_pixels1 * v_weight;
    3094    13878870 :     XMMReg4Double v_acc2 = v_pixels2 * v_weight;
    3095    13899650 :     XMMReg4Double v_acc3 = v_pixels3 * v_weight;
    3096             : 
                           // Reduce each vector of products to a single result per row.
    3097    13899040 :     dfRes1 = v_acc1.GetHorizSum();
    3098    13905290 :     dfRes2 = v_acc2.GetHorizSum();
    3099    13921340 :     dfRes3 = v_acc3.GetHorizSum();
    3100    13908980 : }
    3101             : 
    3102             : /************************************************************************/
    3103             : /*       GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>      */
    3104             : /************************************************************************/
    3105             : 
    3106             : template <>
                       // GByte specialization of
                       // GDALResampleConvolutionHorizontalPixelCount4_3rows(): forwards all
                       // arguments, unchanged, to
                       // GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(), which
                       // computes a four-pixel weighted sum for each of the three rows.
                       //
                       // pChunkRow1..3       start of the four pixels to read in each row.
                       // padfWeightsAligned  four convolution weights (the helper reads them
                       //                     with XMMReg4Double::Load4ValAligned).
                       // dfRes1..3           outputs: weighted sum for each row.
    3107     8241970 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
    3108             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    3109             :     const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
    3110             :     double &dfRes3)
    3111             : {
                           // Pure delegation; the helper is instantiated for GByte.
    3112     8241970 :     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    3113             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
    3114             :         dfRes3);
    3115     8247550 : }
    3116             : 
    3117             : template <>
                       // GUInt16 specialization of
                       // GDALResampleConvolutionHorizontalPixelCount4_3rows(): forwards all
                       // arguments, unchanged, to
                       // GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(), which
                       // computes a four-pixel weighted sum for each of the three rows.
                       //
                       // pChunkRow1..3       start of the four pixels to read in each row.
                       // padfWeightsAligned  four convolution weights (the helper reads them
                       //                     with XMMReg4Double::Load4ValAligned).
                       // dfRes1..3           outputs: weighted sum for each row.
    3118     5676770 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
    3119             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    3120             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
    3121             :     double &dfRes2, double &dfRes3)
    3122             : {
                           // Pure delegation; the helper is instantiated for GUInt16.
    3123     5676770 :     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    3124             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
    3125             :         dfRes3);
    3126     5670600 : }
    3127             : 
    3128             : #endif  // USE_SSE2
    3129             : 
    3130             : /************************************************************************/
    3131             : /*                    GDALResampleChunk_Convolution()                   */
    3132             : /************************************************************************/
    3133             : 
    3134             : template <class T, class Twork, GDALDataType eWrkDataType>
    3135        4639 : static CPLErr GDALResampleChunk_ConvolutionT(
    3136             :     const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
    3137             :     FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
    3138             :     int nKernelRadius, bool bKernelWithNegativeWeights, float fMaxVal)
    3139             : 
    3140             : {
    3141        4639 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    3142        4639 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    3143        4639 :     const double dfSrcXDelta = args.dfSrcXDelta;
    3144        4639 :     const double dfSrcYDelta = args.dfSrcYDelta;
    3145        4639 :     constexpr int nBands = 1;
    3146        4639 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    3147        4639 :     const int nChunkXOff = args.nChunkXOff;
    3148        4639 :     const int nChunkXSize = args.nChunkXSize;
    3149        4639 :     const int nChunkYOff = args.nChunkYOff;
    3150        4639 :     const int nChunkYSize = args.nChunkYSize;
    3151        4639 :     const int nDstXOff = args.nDstXOff;
    3152        4639 :     const int nDstXOff2 = args.nDstXOff2;
    3153        4639 :     const int nDstYOff = args.nDstYOff;
    3154        4639 :     const int nDstYOff2 = args.nDstYOff2;
    3155        4639 :     const bool bHasNoData = args.bHasNoData;
    3156        4639 :     double dfNoDataValue = args.dfNoDataValue;
    3157             : 
    3158        4639 :     if (!bHasNoData)
    3159        4562 :         dfNoDataValue = 0.0;
    3160        4639 :     const auto dstDataType = args.eOvrDataType;
    3161        4639 :     const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
    3162        4639 :     const double dfReplacementVal =
    3163          75 :         bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
    3164             :                    : dfNoDataValue;
    3165             :     // cppcheck-suppress unreadVariable
    3166        4639 :     const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
    3167        4638 :     const bool bNoDataValueInt64Valid =
    3168        4638 :         isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
    3169        4638 :     const auto nNodataValueInt64 =
    3170             :         bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
    3171        4638 :     constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
    3172             : 
    3173             :     // TODO: we should have some generic function to do this.
    3174        4638 :     Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
    3175        4638 :     Twork fDstMax = cpl::NumericLimits<Twork>::max();
    3176        4638 :     if (dstDataType == GDT_Byte)
    3177             :     {
    3178        3903 :         fDstMin = std::numeric_limits<GByte>::min();
    3179        3901 :         fDstMax = std::numeric_limits<GByte>::max();
    3180             :     }
    3181         737 :     else if (dstDataType == GDT_Int8)
    3182             :     {
    3183           1 :         fDstMin = std::numeric_limits<GInt8>::min();
    3184           1 :         fDstMax = std::numeric_limits<GInt8>::max();
    3185             :     }
    3186         736 :     else if (dstDataType == GDT_UInt16)
    3187             :     {
    3188         393 :         fDstMin = std::numeric_limits<GUInt16>::min();
    3189         387 :         fDstMax = std::numeric_limits<GUInt16>::max();
    3190             :     }
    3191         343 :     else if (dstDataType == GDT_Int16)
    3192             :     {
    3193         291 :         fDstMin = std::numeric_limits<GInt16>::min();
    3194         291 :         fDstMax = std::numeric_limits<GInt16>::max();
    3195             :     }
    3196          52 :     else if (dstDataType == GDT_UInt32)
    3197             :     {
    3198           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
    3199           1 :         fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
    3200             :     }
    3201          51 :     else if (dstDataType == GDT_Int32)
    3202             :     {
    3203             :         // cppcheck-suppress unreadVariable
    3204           2 :         fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
    3205             :         // cppcheck-suppress unreadVariable
    3206           2 :         fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
    3207             :     }
    3208          49 :     else if (dstDataType == GDT_UInt64)
    3209             :     {
    3210             :         // cppcheck-suppress unreadVariable
    3211           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
    3212             :         // cppcheck-suppress unreadVariable
    3213             :         // (1 << 64) - 2048: largest uint64 value a double can hold
    3214           1 :         fDstMax = static_cast<Twork>(18446744073709549568ULL);
    3215             :     }
    3216          48 :     else if (dstDataType == GDT_Int64)
    3217             :     {
    3218             :         // cppcheck-suppress unreadVariable
    3219           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
    3220             :         // cppcheck-suppress unreadVariable
    3221             :         // (1 << 63) - 1024: largest int64 that a double can hold
    3222           1 :         fDstMax = static_cast<Twork>(9223372036854774784LL);
    3223             :     }
    3224             : 
    3225    36966174 :     auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
    3226             :                                bNoDataValueInt64Valid, nNodataValueInt64,
    3227             :                                dfNoDataValue, dfReplacementVal](Twork fVal)
    3228             :     {
    3229    16023700 :         if (!bHasNoData)
    3230    11838700 :             return fVal;
    3231             : 
    3232             :         // Clamp value before comparing to nodata: this is only needed for
    3233             :         // kernels with negative weights (Lanczos)
    3234     4185030 :         Twork fClamped = fVal;
    3235     4185030 :         if (fClamped < fDstMin)
    3236       15998 :             fClamped = fDstMin;
    3237     4169030 :         else if (fClamped > fDstMax)
    3238       16406 :             fClamped = fDstMax;
    3239     4185030 :         if (isIntegerDT)
    3240             :         {
    3241     4216630 :             if (bNoDataValueInt64Valid)
    3242             :             {
    3243     4214900 :                 const double fClampedRounded = std::round(fClamped);
    3244     8417100 :                 if (fClampedRounded >=
    3245             :                         static_cast<Twork>(
    3246     8417760 :                             std::numeric_limits<int64_t>::min()) &&
    3247             :                     fClampedRounded <=
    3248     8412010 :                         static_cast<Twork>(9223372036854774784LL) &&
    3249     4199200 :                     nNodataValueInt64 ==
    3250     4202550 :                         static_cast<GInt64>(std::round(fClamped)))
    3251             :                 {
    3252             :                     // Do not use the nodata value
    3253       14435 :                     return static_cast<Twork>(dfReplacementVal);
    3254             :                 }
    3255             :             }
    3256             :         }
    3257           0 :         else if (dfNoDataValue == fClamped)
    3258             :         {
    3259             :             // Do not use the nodata value
    3260           1 :             return static_cast<Twork>(dfReplacementVal);
    3261             :         }
    3262     4162560 :         return fClamped;
    3263             :     };
    3264             : 
    3265             :     /* -------------------------------------------------------------------- */
    3266             :     /*      Allocate work buffers.                                          */
    3267             :     /* -------------------------------------------------------------------- */
    3268        4635 :     const int nDstXSize = nDstXOff2 - nDstXOff;
    3269        4635 :     Twork *pafWrkScanline = nullptr;
    3270        4635 :     if (dstDataType != eWrkDataType)
    3271             :     {
    3272             :         pafWrkScanline =
    3273        4590 :             static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
    3274        4596 :         if (pafWrkScanline == nullptr)
    3275           0 :             return CE_Failure;
    3276             :     }
    3277             : 
    3278        4641 :     const double dfXScale = 1.0 / dfXRatioDstToSrc;
    3279        4641 :     const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
    3280        4641 :     const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
    3281        4641 :     const double dfYScale = 1.0 / dfYRatioDstToSrc;
    3282        4641 :     const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
    3283        4641 :     const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
    3284             : 
    3285             :     // Temporary array to store result of horizontal filter.
    3286             :     double *padfHorizontalFiltered = static_cast<double *>(
    3287        4641 :         VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
    3288             : 
    3289             :     // To store convolution coefficients.
    3290        4641 :     double *padfWeights = static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
    3291             :         static_cast<int>(2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) +
    3292             :                          0.5) *
    3293             :         sizeof(double)));
    3294             : 
    3295        4639 :     GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
    3296        4639 :     if (pabyChunkNodataMask)
    3297             :         pabyChunkNodataMaskHorizontalFiltered =
    3298         462 :             static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
    3299        4639 :     if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
    3300         462 :         (pabyChunkNodataMask != nullptr &&
    3301             :          pabyChunkNodataMaskHorizontalFiltered == nullptr))
    3302             :     {
    3303           1 :         VSIFree(pafWrkScanline);
    3304           0 :         VSIFree(padfHorizontalFiltered);
    3305           0 :         VSIFreeAligned(padfWeights);
    3306           0 :         VSIFree(pabyChunkNodataMaskHorizontalFiltered);
    3307           0 :         return CE_Failure;
    3308             :     }
    3309             : 
    3310             :     /* ==================================================================== */
    3311             :     /*      First pass: horizontal filter                                   */
    3312             :     /* ==================================================================== */
    3313        4639 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    3314             : #ifdef USE_SSE2
    3315        4639 :     bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
    3316             : #endif
    3317     2962381 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    3318             :     {
    3319     2957733 :         const double dfSrcPixel =
    3320     2957733 :             (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
    3321     2957733 :         int nSrcPixelStart =
    3322     2957733 :             static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
    3323     2957733 :         if (nSrcPixelStart < nChunkXOff)
    3324       56807 :             nSrcPixelStart = nChunkXOff;
    3325     2957733 :         int nSrcPixelStop =
    3326     2957733 :             static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
    3327     2957733 :         if (nSrcPixelStop > nChunkRightXOff)
    3328       56826 :             nSrcPixelStop = nChunkRightXOff;
    3329             : #if 0
    3330             :         if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
    3331             :         {
    3332             :             printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
    3333             :         }
    3334             :         if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
    3335             :         {
    3336             :             printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
    3337             :         }
    3338             : #endif
    3339     2957733 :         const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
    3340     2957733 :         double dfWeightSum = 0.0;
    3341             : 
    3342             :         // Compute convolution coefficients.
    3343     2957733 :         int nSrcPixel = nSrcPixelStart;
    3344     2957733 :         double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
    3345     4186216 :         for (; nSrcPixel < nSrcPixelStop - 3; nSrcPixel += 4)
    3346             :         {
    3347     1228748 :             padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
    3348     1228748 :             dfX += dfXScaleWeight;
    3349     1228748 :             padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
    3350     1228748 :             dfX += dfXScaleWeight;
    3351     1228748 :             padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
    3352     1228748 :             dfX += dfXScaleWeight;
    3353     1228748 :             padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
    3354     1228748 :             dfX += dfXScaleWeight;
    3355     1228474 :             dfWeightSum +=
    3356     1228748 :                 pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
    3357             :         }
    3358     6946891 :         for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
    3359             :         {
    3360     3989048 :             const double dfWeight = pfnFilterFunc(dfX);
    3361     3989420 :             padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
    3362     3989420 :             dfWeightSum += dfWeight;
    3363             :         }
    3364             : 
    3365     2957843 :         const int nHeight = nChunkYSize * nBands;
    3366     2957843 :         if (pabyChunkNodataMask == nullptr)
    3367             :         {
    3368     2869857 :             if (dfWeightSum != 0)
    3369             :             {
    3370     2869845 :                 const double dfInvWeightSum = 1.0 / dfWeightSum;
    3371    11124967 :                 for (int i = 0; i < nSrcPixelCount; ++i)
    3372     8255125 :                     padfWeights[i] *= dfInvWeightSum;
    3373             :             }
    3374     2869857 :             int iSrcLineOff = 0;
    3375             : #ifdef USE_SSE2
    3376     2869857 :             if (nSrcPixelCount == 4)
    3377             :             {
    3378    15780864 :                 for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
    3379             :                 {
    3380    15171416 :                     const size_t j =
    3381    15171416 :                         static_cast<size_t>(iSrcLineOff) * nChunkXSize +
    3382    15171416 :                         (nSrcPixelStart - nChunkXOff);
    3383    15171416 :                     double dfVal1 = 0.0;
    3384    15171416 :                     double dfVal2 = 0.0;
    3385    15171416 :                     double dfVal3 = 0.0;
    3386    15171416 :                     GDALResampleConvolutionHorizontalPixelCount4_3rows(
    3387    15171416 :                         pChunk + j, pChunk + j + nChunkXSize,
    3388    15171416 :                         pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
    3389             :                         dfVal2, dfVal3);
    3390    15167306 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3391    15167306 :                                                nDstXSize +
    3392    15167306 :                                            iDstPixel - nDstXOff] = dfVal1;
    3393    15167306 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3394    15167306 :                                             1) *
    3395    15167306 :                                                nDstXSize +
    3396    15167306 :                                            iDstPixel - nDstXOff] = dfVal2;
    3397    15167306 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3398    15167306 :                                             2) *
    3399    15167306 :                                                nDstXSize +
    3400    15167306 :                                            iDstPixel - nDstXOff] = dfVal3;
    3401             :                 }
    3402             :             }
    3403     2256292 :             else if (bSrcPixelCountLess8)
    3404             :             {
    3405     7072188 :                 for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
    3406             :                 {
    3407     5017447 :                     const size_t j =
    3408     5017447 :                         static_cast<size_t>(iSrcLineOff) * nChunkXSize +
    3409     5017447 :                         (nSrcPixelStart - nChunkXOff);
    3410     5017447 :                     double dfVal1 = 0.0;
    3411     5017447 :                     double dfVal2 = 0.0;
    3412     5017447 :                     double dfVal3 = 0.0;
    3413     5017447 :                     GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
    3414     5017447 :                         pChunk + j, pChunk + j + nChunkXSize,
    3415     5017447 :                         pChunk + j + 2 * nChunkXSize, padfWeights,
    3416             :                         nSrcPixelCount, dfVal1, dfVal2, dfVal3);
    3417     5020102 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3418     5020102 :                                                nDstXSize +
    3419     5020102 :                                            iDstPixel - nDstXOff] = dfVal1;
    3420     5020102 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3421     5020102 :                                             1) *
    3422     5020102 :                                                nDstXSize +
    3423     5020102 :                                            iDstPixel - nDstXOff] = dfVal2;
    3424     5020102 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3425     5020102 :                                             2) *
    3426     5020102 :                                                nDstXSize +
    3427     5020102 :                                            iDstPixel - nDstXOff] = dfVal3;
    3428             :                 }
    3429             :             }
    3430             :             else
    3431             : #endif
    3432             :             {
    3433    23238126 :                 for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
    3434             :                 {
    3435    23034430 :                     const size_t j =
    3436    23034430 :                         static_cast<size_t>(iSrcLineOff) * nChunkXSize +
    3437    23034430 :                         (nSrcPixelStart - nChunkXOff);
    3438    23034430 :                     double dfVal1 = 0.0;
    3439    23034430 :                     double dfVal2 = 0.0;
    3440    23034430 :                     double dfVal3 = 0.0;
    3441    23034430 :                     GDALResampleConvolutionHorizontal_3rows(
    3442    23034430 :                         pChunk + j, pChunk + j + nChunkXSize,
    3443    23034430 :                         pChunk + j + 2 * nChunkXSize, padfWeights,
    3444             :                         nSrcPixelCount, dfVal1, dfVal2, dfVal3);
    3445    23033930 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3446    23033930 :                                                nDstXSize +
    3447    23033930 :                                            iDstPixel - nDstXOff] = dfVal1;
    3448    23033930 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3449    23033930 :                                             1) *
    3450    23033930 :                                                nDstXSize +
    3451    23033930 :                                            iDstPixel - nDstXOff] = dfVal2;
    3452    23033930 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3453    23033930 :                                             2) *
    3454    23033930 :                                                nDstXSize +
    3455    23033930 :                                            iDstPixel - nDstXOff] = dfVal3;
    3456             :                 }
    3457             :             }
    3458     5949777 :             for (; iSrcLineOff < nHeight; ++iSrcLineOff)
    3459             :             {
    3460     3081953 :                 const size_t j =
    3461     3081953 :                     static_cast<size_t>(iSrcLineOff) * nChunkXSize +
    3462     3081953 :                     (nSrcPixelStart - nChunkXOff);
    3463     3630906 :                 const double dfVal = GDALResampleConvolutionHorizontal(
    3464      593853 :                     pChunk + j, padfWeights, nSrcPixelCount);
    3465     3081973 :                 padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3466     3081973 :                                            nDstXSize +
    3467     3081973 :                                        iDstPixel - nDstXOff] = dfVal;
    3468             :             }
    3469             :         }
    3470             :         else
    3471             :         {
    3472    20503076 :             for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
    3473             :             {
    3474    20413146 :                 const size_t j =
    3475    20413146 :                     static_cast<size_t>(iSrcLineOff) * nChunkXSize +
    3476    20413146 :                     (nSrcPixelStart - nChunkXOff);
    3477             : 
    3478    20413146 :                 if (bKernelWithNegativeWeights)
    3479             :                 {
    3480    19899912 :                     int nConsecutiveValid = 0;
    3481    19899912 :                     int nMaxConsecutiveValid = 0;
    3482   181970458 :                     for (int k = 0; k < nSrcPixelCount; k++)
    3483             :                     {
    3484   162066146 :                         if (pabyChunkNodataMask[j + k])
    3485    48904253 :                             nConsecutiveValid++;
    3486   113162793 :                         else if (nConsecutiveValid)
    3487             :                         {
    3488      111293 :                             nMaxConsecutiveValid = std::max(
    3489      107790 :                                 nMaxConsecutiveValid, nConsecutiveValid);
    3490      111293 :                             nConsecutiveValid = 0;
    3491             :                         }
    3492             :                     }
    3493    19902112 :                     nMaxConsecutiveValid =
    3494    19903412 :                         std::max(nMaxConsecutiveValid, nConsecutiveValid);
    3495    19902112 :                     if (nMaxConsecutiveValid < nSrcPixelCount / 2)
    3496             :                     {
    3497    13314907 :                         const size_t nTempOffset =
    3498    13314907 :                             static_cast<size_t>(iSrcLineOff) * nDstXSize +
    3499    13314907 :                             iDstPixel - nDstXOff;
    3500    13314907 :                         padfHorizontalFiltered[nTempOffset] = 0.0;
    3501    13314907 :                         pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
    3502    13314907 :                         continue;
    3503             :                     }
    3504             :                 }
    3505             : 
    3506     7100439 :                 double dfVal = 0.0;
    3507     7100439 :                 GDALResampleConvolutionHorizontalWithMask(
    3508       44639 :                     pChunk + j, pabyChunkNodataMask + j, padfWeights,
    3509             :                     nSrcPixelCount, dfVal, dfWeightSum);
    3510     7100213 :                 const size_t nTempOffset =
    3511     7100213 :                     static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
    3512     7100213 :                     nDstXOff;
    3513     7100213 :                 if (dfWeightSum > 0.0)
    3514             :                 {
    3515     7056846 :                     padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
    3516     7056846 :                     pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
    3517             :                 }
    3518             :                 else
    3519             :                 {
    3520       43368 :                     padfHorizontalFiltered[nTempOffset] = 0.0;
    3521       43368 :                     pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
    3522             :                 }
    3523             :             }
    3524             :         }
    3525             :     }
    3526             : 
    3527             :     /* ==================================================================== */
    3528             :     /*      Second pass: vertical filter                                    */
    3529             :     /* ==================================================================== */
    3530        4641 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    3531             : 
    3532      309474 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    3533             :     {
    3534      304833 :         Twork *const pafDstScanline =
    3535             :             pafWrkScanline
    3536      304833 :                 ? pafWrkScanline
    3537        8421 :                 : static_cast<Twork *>(pDstBuffer) +
    3538        8421 :                       static_cast<size_t>(iDstLine - nDstYOff) * nDstXSize;
    3539             : 
    3540      304833 :         const double dfSrcLine =
    3541      304833 :             (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
    3542      304833 :         int nSrcLineStart =
    3543      304833 :             static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
    3544      304833 :         int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
    3545      304833 :         if (nSrcLineStart < nChunkYOff)
    3546        2927 :             nSrcLineStart = nChunkYOff;
    3547      304833 :         if (nSrcLineStop > nChunkBottomYOff)
    3548        2971 :             nSrcLineStop = nChunkBottomYOff;
    3549             : #if 0
    3550             :         if( nSrcLineStart < nChunkYOff &&
    3551             :             nChunkYOff > 0 )
    3552             :         {
    3553             :             printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
    3554             :         }
    3555             :         if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
    3556             :         {
    3557             :             printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
    3558             :         }
    3559             : #endif
    3560      304833 :         const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
    3561      304833 :         double dfWeightSum = 0.0;
    3562             : 
    3563             :         // Compute convolution coefficients.
    3564      304833 :         int nSrcLine = nSrcLineStart;  // Used after for.
    3565      304833 :         double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
    3566      744873 :         for (; nSrcLine < nSrcLineStop - 3;
    3567      440040 :              nSrcLine += 4, dfY += 4 * dfYScaleWeight)
    3568             :         {
    3569      440038 :             padfWeights[nSrcLine - nSrcLineStart] = dfY;
    3570      440038 :             padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
    3571      440038 :             padfWeights[nSrcLine + 2 - nSrcLineStart] =
    3572      440038 :                 dfY + 2 * dfYScaleWeight;
    3573      440038 :             padfWeights[nSrcLine + 3 - nSrcLineStart] =
    3574      440038 :                 dfY + 3 * dfYScaleWeight;
    3575      440040 :             dfWeightSum +=
    3576      440038 :                 pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
    3577             :         }
    3578      340967 :         for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
    3579             :         {
    3580       36142 :             const double dfWeight = pfnFilterFunc(dfY);
    3581       36132 :             padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
    3582       36132 :             dfWeightSum += dfWeight;
    3583             :         }
    3584             : 
    3585      304825 :         if (pabyChunkNodataMask == nullptr)
    3586             :         {
    3587      265832 :             if (dfWeightSum != 0)
    3588             :             {
    3589      265834 :                 const double dfInvWeightSum = 1.0 / dfWeightSum;
    3590     1789495 :                 for (int i = 0; i < nSrcLineCount; ++i)
    3591     1523661 :                     padfWeights[i] *= dfInvWeightSum;
    3592             :             }
    3593             :         }
    3594             : 
    3595      304825 :         if (pabyChunkNodataMask == nullptr)
    3596             :         {
    3597      265831 :             int iFilteredPixelOff = 0;  // Used after for.
    3598             :             // j used after for.
    3599      265831 :             size_t j =
    3600      265831 :                 (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
    3601             : #ifdef USE_SSE2
    3602             :             if constexpr (eWrkDataType == GDT_Float32)
    3603             :             {
    3604             : #ifdef __AVX__
    3605             :                 for (; iFilteredPixelOff < nDstXSize - 15;
    3606             :                      iFilteredPixelOff += 16, j += 16)
    3607             :                 {
    3608             :                     GDALResampleConvolutionVertical_16cols(
    3609             :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3610             :                         nSrcLineCount, pafDstScanline + iFilteredPixelOff);
    3611             :                     if (bHasNoData)
    3612             :                     {
    3613             :                         for (int k = 0; k < 16; k++)
    3614             :                         {
    3615             :                             pafDstScanline[iFilteredPixelOff + k] =
    3616             :                                 replaceValIfNodata(
    3617             :                                     pafDstScanline[iFilteredPixelOff + k]);
    3618             :                         }
    3619             :                     }
    3620             :                 }
    3621             : #else
    3622    23017798 :                 for (; iFilteredPixelOff < nDstXSize - 7;
    3623             :                      iFilteredPixelOff += 8, j += 8)
    3624             :                 {
    3625    22799608 :                     GDALResampleConvolutionVertical_8cols(
    3626    22799608 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3627    22799608 :                         nSrcLineCount, pafDstScanline + iFilteredPixelOff);
    3628    22759178 :                     if (bHasNoData)
    3629             :                     {
    3630      123192 :                         for (int k = 0; k < 8; k++)
    3631             :                         {
    3632      109504 :                             pafDstScanline[iFilteredPixelOff + k] =
    3633      109504 :                                 replaceValIfNodata(
    3634      109504 :                                     pafDstScanline[iFilteredPixelOff + k]);
    3635             :                         }
    3636             :                     }
    3637             :                 }
    3638             : #endif
    3639             : 
    3640      683426 :                 for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
    3641             :                 {
    3642      465258 :                     const Twork fVal =
    3643      465241 :                         static_cast<Twork>(GDALResampleConvolutionVertical(
    3644      465241 :                             padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3645             :                             nSrcLineCount));
    3646      465239 :                     pafDstScanline[iFilteredPixelOff] =
    3647      465258 :                         replaceValIfNodata(fVal);
    3648             :                 }
    3649             :             }
    3650             :             else
    3651             : #endif
    3652             :             {
    3653     2887210 :                 for (; iFilteredPixelOff < nDstXSize - 1;
    3654             :                      iFilteredPixelOff += 2, j += 2)
    3655             :                 {
    3656     2880000 :                     double dfVal1 = 0.0;
    3657     2880000 :                     double dfVal2 = 0.0;
    3658     2880000 :                     GDALResampleConvolutionVertical_2cols(
    3659     2880000 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3660             :                         nSrcLineCount, dfVal1, dfVal2);
    3661     5760010 :                     pafDstScanline[iFilteredPixelOff] =
    3662     2880000 :                         replaceValIfNodata(static_cast<Twork>(dfVal1));
    3663     2880000 :                     pafDstScanline[iFilteredPixelOff + 1] =
    3664     2880000 :                         replaceValIfNodata(static_cast<Twork>(dfVal2));
    3665             :                 }
    3666        7206 :                 if (iFilteredPixelOff < nDstXSize)
    3667             :                 {
    3668           2 :                     const double dfVal = GDALResampleConvolutionVertical(
    3669           2 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3670             :                         nSrcLineCount);
    3671           2 :                     pafDstScanline[iFilteredPixelOff] =
    3672           2 :                         replaceValIfNodata(static_cast<Twork>(dfVal));
    3673             :                 }
    3674             :             }
    3675             :         }
    3676             :         else
    3677             :         {
    3678    18979539 :             for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
    3679             :                  ++iFilteredPixelOff)
    3680             :             {
    3681    18940633 :                 double dfVal = 0.0;
    3682    18940633 :                 dfWeightSum = 0.0;
    3683    18940633 :                 size_t j = (nSrcLineStart - nChunkYOff) *
    3684    18940633 :                                static_cast<size_t>(nDstXSize) +
    3685    18940633 :                            iFilteredPixelOff;
    3686    18940633 :                 if (bKernelWithNegativeWeights)
    3687             :                 {
    3688    18700801 :                     int nConsecutiveValid = 0;
    3689    18700801 :                     int nMaxConsecutiveValid = 0;
    3690   133007321 :                     for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
    3691             :                     {
    3692   114268020 :                         const double dfWeight =
    3693   114268020 :                             padfWeights[i] *
    3694             :                             pabyChunkNodataMaskHorizontalFiltered[j];
    3695   114268020 :                         if (pabyChunkNodataMaskHorizontalFiltered[j])
    3696             :                         {
    3697    48650337 :                             nConsecutiveValid++;
    3698             :                         }
    3699    65617183 :                         else if (nConsecutiveValid)
    3700             :                         {
    3701      243325 :                             nMaxConsecutiveValid = std::max(
    3702      204376 :                                 nMaxConsecutiveValid, nConsecutiveValid);
    3703      243325 :                             nConsecutiveValid = 0;
    3704             :                         }
    3705   114307020 :                         dfVal += padfHorizontalFiltered[j] * dfWeight;
    3706   114307020 :                         dfWeightSum += dfWeight;
    3707             :                     }
    3708    18740901 :                     nMaxConsecutiveValid =
    3709    18739801 :                         std::max(nMaxConsecutiveValid, nConsecutiveValid);
    3710    18740901 :                     if (nMaxConsecutiveValid < nSrcLineCount / 2)
    3711             :                     {
    3712     9246271 :                         pafDstScanline[iFilteredPixelOff] =
    3713     9246179 :                             static_cast<Twork>(dfNoDataValue);
    3714     9246271 :                         continue;
    3715             :                     }
    3716             :                 }
    3717             :                 else
    3718             :                 {
    3719     1233322 :                     for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
    3720             :                     {
    3721      993504 :                         const double dfWeight =
    3722      993504 :                             padfWeights[i] *
    3723             :                             pabyChunkNodataMaskHorizontalFiltered[j];
    3724      993504 :                         dfVal += padfHorizontalFiltered[j] * dfWeight;
    3725      993504 :                         dfWeightSum += dfWeight;
    3726             :                     }
    3727             :                 }
    3728     9734482 :                 if (dfWeightSum > 0.0)
    3729             :                 {
    3730     9682091 :                     pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
    3731     9722259 :                         static_cast<Twork>(dfVal / dfWeightSum));
    3732             :                 }
    3733             :                 else
    3734             :                 {
    3735       12205 :                     pafDstScanline[iFilteredPixelOff] =
    3736       12181 :                         static_cast<Twork>(dfNoDataValue);
    3737             :                 }
    3738             :             }
    3739             :         }
    3740             : 
    3741      264321 :         if (fMaxVal != 0.0f)
    3742             :         {
    3743      192324 :             for (int i = 0; i < nDstXSize; ++i)
    3744             :             {
    3745      192088 :                 if (pafDstScanline[i] > fMaxVal)
    3746       96022 :                     pafDstScanline[i] = fMaxVal;
    3747             :             }
    3748             :         }
    3749             : 
    3750      264321 :         if (pafWrkScanline)
    3751             :         {
    3752      296414 :             GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
    3753             :                             static_cast<GByte *>(pDstBuffer) +
    3754      296414 :                                 static_cast<size_t>(iDstLine - nDstYOff) *
    3755      296414 :                                     nDstXSize * nDstDataTypeSize,
    3756             :                             dstDataType, nDstDataTypeSize, nDstXSize);
    3757             :         }
    3758             :     }
    3759             : 
    3760        4641 :     VSIFree(pafWrkScanline);
    3761        4641 :     VSIFreeAligned(padfWeights);
    3762        4641 :     VSIFree(padfHorizontalFiltered);
    3763        4641 :     VSIFree(pabyChunkNodataMaskHorizontalFiltered);
    3764             : 
    3765        4641 :     return CE_None;
    3766             : }
    3767             : 
    3768             : static CPLErr
    3769        4640 : GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
    3770             :                               const void *pChunk, void **ppDstBuffer,
    3771             :                               GDALDataType *peDstBufferDataType)
    3772             : {
    3773             :     GDALResampleAlg eResample;
    3774        4640 :     bool bKernelWithNegativeWeights = false;
    3775        4640 :     if (EQUAL(args.pszResampling, "BILINEAR"))
    3776        2628 :         eResample = GRA_Bilinear;
    3777        2012 :     else if (EQUAL(args.pszResampling, "CUBIC"))
    3778             :     {
    3779        1935 :         eResample = GRA_Cubic;
    3780        1935 :         bKernelWithNegativeWeights = true;
    3781             :     }
    3782          77 :     else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
    3783          23 :         eResample = GRA_CubicSpline;
    3784          54 :     else if (EQUAL(args.pszResampling, "LANCZOS"))
    3785             :     {
    3786          54 :         eResample = GRA_Lanczos;
    3787          54 :         bKernelWithNegativeWeights = true;
    3788             :     }
    3789             :     else
    3790             :     {
    3791           0 :         CPLAssert(false);
    3792             :         return CE_Failure;
    3793             :     }
    3794        4640 :     const int nKernelRadius = GWKGetFilterRadius(eResample);
    3795        4636 :     FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
    3796             :     const FilterFunc4ValuesType pfnFilterFunc4Values =
    3797        4638 :         GWKGetFilterFunc4Values(eResample);
    3798             : 
    3799        4638 :     float fMaxVal = 0.f;
    3800             :     // Cubic, etc... can have overshoots, so make sure we clamp values to the
    3801             :     // maximum value if NBITS is set.
    3802        4638 :     if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
    3803           8 :         (args.eOvrDataType == GDT_Byte || args.eOvrDataType == GDT_UInt16 ||
    3804           0 :          args.eOvrDataType == GDT_UInt32))
    3805             :     {
    3806           8 :         int nBits = args.nOvrNBITS;
    3807           8 :         if (nBits == GDALGetDataTypeSizeBits(args.eOvrDataType))
    3808           1 :             nBits = 0;
    3809           8 :         if (nBits > 0 && nBits < 32)
    3810           7 :             fMaxVal = static_cast<float>((1U << nBits) - 1);
    3811             :     }
    3812             : 
    3813        4638 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(
    3814             :         args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
    3815             :         GDALGetDataTypeSizeBytes(args.eOvrDataType));
    3816        4640 :     if (*ppDstBuffer == nullptr)
    3817             :     {
    3818           0 :         return CE_Failure;
    3819             :     }
    3820        4640 :     *peDstBufferDataType = args.eOvrDataType;
    3821             : 
    3822        4640 :     switch (args.eWrkDataType)
    3823             :     {
    3824        3903 :         case GDT_Byte:
    3825             :         {
    3826        3903 :             return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32>(
    3827             :                 args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
    3828             :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
    3829        3903 :                 bKernelWithNegativeWeights, fMaxVal);
    3830             :         }
    3831             : 
    3832         395 :         case GDT_UInt16:
    3833             :         {
    3834         395 :             return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32>(
    3835             :                 args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
    3836             :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
    3837         396 :                 bKernelWithNegativeWeights, fMaxVal);
    3838             :         }
    3839             : 
    3840         313 :         case GDT_Float32:
    3841             :         {
    3842         313 :             return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32>(
    3843             :                 args, static_cast<const float *>(pChunk), *ppDstBuffer,
    3844             :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
    3845         313 :                 bKernelWithNegativeWeights, fMaxVal);
    3846             :         }
    3847             : 
    3848          29 :         case GDT_Float64:
    3849             :         {
    3850          29 :             return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64>(
    3851             :                 args, static_cast<const double *>(pChunk), *ppDstBuffer,
    3852             :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
    3853          29 :                 bKernelWithNegativeWeights, fMaxVal);
    3854             :         }
    3855             : 
    3856           0 :         default:
    3857           0 :             break;
    3858             :     }
    3859             : 
    3860           0 :     CPLAssert(false);
    3861             :     return CE_Failure;
    3862             : }
    3863             : 
    3864             : /************************************************************************/
    3865             : /*                       GDALResampleChunkC32R()                        */
    3866             : /************************************************************************/
    3867             : 
    3868           2 : static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
    3869             :                                     const float *pafChunk, const int nChunkYOff,
    3870             :                                     const int nChunkYSize, const int nDstYOff,
    3871             :                                     const int nDstYOff2, const int nOvrXSize,
    3872             :                                     const int nOvrYSize, void **ppDstBuffer,
    3873             :                                     GDALDataType *peDstBufferDataType,
    3874             :                                     const char *pszResampling)
    3875             : 
    3876             : {
    3877             :     enum Method
    3878             :     {
    3879             :         NEAR,
    3880             :         AVERAGE,
    3881             :         AVERAGE_MAGPHASE,
    3882             :         RMS,
    3883             :     };
    3884             : 
    3885           2 :     Method eMethod = NEAR;
    3886           2 :     if (STARTS_WITH_CI(pszResampling, "NEAR"))
    3887             :     {
    3888           0 :         eMethod = NEAR;
    3889             :     }
    3890           2 :     else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
    3891             :     {
    3892           0 :         eMethod = AVERAGE_MAGPHASE;
    3893             :     }
    3894           2 :     else if (EQUAL(pszResampling, "RMS"))
    3895             :     {
    3896           2 :         eMethod = RMS;
    3897             :     }
    3898           0 :     else if (STARTS_WITH_CI(pszResampling, "AVER"))
    3899             :     {
    3900           0 :         eMethod = AVERAGE;
    3901             :     }
    3902             :     else
    3903             :     {
    3904           0 :         CPLError(
    3905             :             CE_Failure, CPLE_NotSupported,
    3906             :             "Resampling method %s is not supported for complex data types. "
    3907             :             "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
    3908             :             pszResampling);
    3909           0 :         return CE_Failure;
    3910             :     }
    3911             : 
    3912           2 :     const int nOXSize = nOvrXSize;
    3913           2 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
    3914             :                                        GDALGetDataTypeSizeBytes(GDT_CFloat32));
    3915           2 :     if (*ppDstBuffer == nullptr)
    3916             :     {
    3917           0 :         return CE_Failure;
    3918             :     }
    3919           2 :     float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
    3920           2 :     *peDstBufferDataType = GDT_CFloat32;
    3921             : 
    3922           2 :     const int nOYSize = nOvrYSize;
    3923           2 :     const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
    3924           2 :     const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
    3925             : 
    3926             :     /* ==================================================================== */
    3927             :     /*      Loop over destination scanlines.                                */
    3928             :     /* ==================================================================== */
    3929           8 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    3930             :     {
    3931           6 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
    3932           6 :         if (nSrcYOff < nChunkYOff)
    3933           0 :             nSrcYOff = nChunkYOff;
    3934             : 
    3935           6 :         int nSrcYOff2 =
    3936           6 :             static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
    3937           6 :         if (nSrcYOff2 == nSrcYOff)
    3938           0 :             nSrcYOff2++;
    3939             : 
    3940           6 :         if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
    3941             :         {
    3942           2 :             if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
    3943           0 :                 nSrcYOff = nSrcHeight - 1;
    3944           2 :             nSrcYOff2 = nSrcHeight;
    3945             :         }
    3946           6 :         if (nSrcYOff2 > nChunkYOff + nChunkYSize)
    3947           0 :             nSrcYOff2 = nChunkYOff + nChunkYSize;
    3948             : 
    3949           6 :         const float *const pafSrcScanline =
    3950           6 :             pafChunk +
    3951           6 :             (static_cast<size_t>(nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
    3952           6 :         float *const pafDstScanline =
    3953           6 :             pafDstBuffer +
    3954           6 :             static_cast<size_t>(iDstLine - nDstYOff) * 2 * nOXSize;
    3955             : 
    3956             :         /* --------------------------------------------------------------------
    3957             :          */
    3958             :         /*      Loop over destination pixels */
    3959             :         /* --------------------------------------------------------------------
    3960             :          */
    3961          18 :         for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
    3962             :         {
    3963          12 :             const size_t iDstPixelSZ = static_cast<size_t>(iDstPixel);
    3964          12 :             int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
    3965          12 :             int nSrcXOff2 =
    3966          12 :                 static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
    3967          12 :             if (nSrcXOff2 == nSrcXOff)
    3968           0 :                 nSrcXOff2++;
    3969          12 :             if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
    3970             :             {
    3971           6 :                 if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
    3972           0 :                     nSrcXOff = nSrcWidth - 1;
    3973           6 :                 nSrcXOff2 = nSrcWidth;
    3974             :             }
    3975          12 :             const size_t nSrcXOffSZ = static_cast<size_t>(nSrcXOff);
    3976             : 
    3977          12 :             if (eMethod == NEAR)
    3978             :             {
    3979           0 :                 pafDstScanline[iDstPixelSZ * 2] =
    3980           0 :                     pafSrcScanline[nSrcXOffSZ * 2];
    3981           0 :                 pafDstScanline[iDstPixelSZ * 2 + 1] =
    3982           0 :                     pafSrcScanline[nSrcXOffSZ * 2 + 1];
    3983             :             }
    3984          12 :             else if (eMethod == AVERAGE_MAGPHASE)
    3985             :             {
    3986           0 :                 double dfTotalR = 0.0;
    3987           0 :                 double dfTotalI = 0.0;
    3988           0 :                 double dfTotalM = 0.0;
    3989           0 :                 size_t nCount = 0;
    3990             : 
    3991           0 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    3992             :                 {
    3993           0 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    3994             :                     {
    3995           0 :                         const double dfR =
    3996           0 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    3997           0 :                                            static_cast<size_t>(iY - nSrcYOff) *
    3998           0 :                                                nSrcWidth * 2];
    3999           0 :                         const double dfI =
    4000           0 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4001           0 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4002           0 :                                                nSrcWidth * 2 +
    4003           0 :                                            1];
    4004           0 :                         dfTotalR += dfR;
    4005           0 :                         dfTotalI += dfI;
    4006           0 :                         dfTotalM += std::hypot(dfR, dfI);
    4007           0 :                         ++nCount;
    4008             :                     }
    4009             :                 }
    4010             : 
    4011           0 :                 CPLAssert(nCount > 0);
    4012           0 :                 if (nCount == 0)
    4013             :                 {
    4014           0 :                     pafDstScanline[iDstPixelSZ * 2] = 0.0;
    4015           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
    4016             :                 }
    4017             :                 else
    4018             :                 {
    4019           0 :                     pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
    4020           0 :                         dfTotalR / static_cast<double>(nCount));
    4021           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
    4022           0 :                         dfTotalI / static_cast<double>(nCount));
    4023             :                     const double dfM =
    4024           0 :                         std::hypot(pafDstScanline[iDstPixelSZ * 2],
    4025           0 :                                    pafDstScanline[iDstPixelSZ * 2 + 1]);
    4026           0 :                     const double dfDesiredM =
    4027           0 :                         dfTotalM / static_cast<double>(nCount);
    4028           0 :                     double dfRatio = 1.0;
    4029           0 :                     if (dfM != 0.0)
    4030           0 :                         dfRatio = dfDesiredM / dfM;
    4031             : 
    4032           0 :                     pafDstScanline[iDstPixelSZ * 2] *=
    4033           0 :                         static_cast<float>(dfRatio);
    4034           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] *=
    4035           0 :                         static_cast<float>(dfRatio);
    4036             :                 }
    4037             :             }
    4038          12 :             else if (eMethod == RMS)
    4039             :             {
    4040          12 :                 double dfTotalR = 0.0;
    4041          12 :                 double dfTotalI = 0.0;
    4042          12 :                 size_t nCount = 0;
    4043             : 
    4044          36 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    4045             :                 {
    4046          72 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    4047             :                     {
    4048          48 :                         const double dfR =
    4049          48 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4050          48 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4051          48 :                                                nSrcWidth * 2];
    4052          48 :                         const double dfI =
    4053          48 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4054          48 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4055          48 :                                                nSrcWidth * 2 +
    4056          48 :                                            1];
    4057             : 
    4058          48 :                         dfTotalR += SQUARE(dfR);
    4059          48 :                         dfTotalI += SQUARE(dfI);
    4060             : 
    4061          48 :                         ++nCount;
    4062             :                     }
    4063             :                 }
    4064             : 
    4065          12 :                 CPLAssert(nCount > 0);
    4066          12 :                 if (nCount == 0)
    4067             :                 {
    4068           0 :                     pafDstScanline[iDstPixelSZ * 2] = 0.0;
    4069           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
    4070             :                 }
    4071             :                 else
    4072             :                 {
    4073             :                     /* compute RMS */
    4074          12 :                     pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
    4075          12 :                         sqrt(dfTotalR / static_cast<double>(nCount)));
    4076          12 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
    4077          12 :                         sqrt(dfTotalI / static_cast<double>(nCount)));
    4078             :                 }
    4079             :             }
    4080           0 :             else if (eMethod == AVERAGE)
    4081             :             {
    4082           0 :                 double dfTotalR = 0.0;
    4083           0 :                 double dfTotalI = 0.0;
    4084           0 :                 size_t nCount = 0;
    4085             : 
    4086           0 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    4087             :                 {
    4088           0 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    4089             :                     {
    4090             :                         // TODO(schwehr): Maybe use std::complex?
    4091           0 :                         dfTotalR +=
    4092           0 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4093           0 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4094           0 :                                                nSrcWidth * 2];
    4095           0 :                         dfTotalI +=
    4096           0 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4097           0 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4098           0 :                                                nSrcWidth * 2 +
    4099           0 :                                            1];
    4100           0 :                         ++nCount;
    4101             :                     }
    4102             :                 }
    4103             : 
    4104           0 :                 CPLAssert(nCount > 0);
    4105           0 :                 if (nCount == 0)
    4106             :                 {
    4107           0 :                     pafDstScanline[iDstPixelSZ * 2] = 0.0;
    4108           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
    4109             :                 }
    4110             :                 else
    4111             :                 {
    4112           0 :                     pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
    4113           0 :                         dfTotalR / static_cast<double>(nCount));
    4114           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
    4115           0 :                         dfTotalI / static_cast<double>(nCount));
    4116             :                 }
    4117             :             }
    4118             :         }
    4119             :     }
    4120             : 
    4121           2 :     return CE_None;
    4122             : }
    4123             : 
    4124             : /************************************************************************/
    4125             : /*                  GDALRegenerateCascadingOverviews()                  */
    4126             : /*                                                                      */
    4127             : /*      Generate a list of overviews in order from largest to           */
    4128             : /*      smallest, computing each from the next larger.                  */
    4129             : /************************************************************************/
    4130             : 
    4131          44 : static CPLErr GDALRegenerateCascadingOverviews(
    4132             :     GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
    4133             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    4134             :     void *pProgressData, CSLConstList papszOptions)
    4135             : 
    4136             : {
    4137             :     /* -------------------------------------------------------------------- */
    4138             :     /*      First, we must put the overviews in order from largest to       */
    4139             :     /*      smallest.                                                       */
    4140             :     /* -------------------------------------------------------------------- */
    4141         127 :     for (int i = 0; i < nOverviews - 1; ++i)
    4142             :     {
    4143         292 :         for (int j = 0; j < nOverviews - i - 1; ++j)
    4144             :         {
    4145         209 :             if (papoOvrBands[j]->GetXSize() *
    4146         209 :                     static_cast<float>(papoOvrBands[j]->GetYSize()) <
    4147         209 :                 papoOvrBands[j + 1]->GetXSize() *
    4148         209 :                     static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
    4149             :             {
    4150           0 :                 GDALRasterBand *poTempBand = papoOvrBands[j];
    4151           0 :                 papoOvrBands[j] = papoOvrBands[j + 1];
    4152           0 :                 papoOvrBands[j + 1] = poTempBand;
    4153             :             }
    4154             :         }
    4155             :     }
    4156             : 
    4157             :     /* -------------------------------------------------------------------- */
    4158             :     /*      Count total pixels so we can prepare appropriate scaled         */
    4159             :     /*      progress functions.                                             */
    4160             :     /* -------------------------------------------------------------------- */
    4161          44 :     double dfTotalPixels = 0.0;
    4162             : 
    4163         171 :     for (int i = 0; i < nOverviews; ++i)
    4164             :     {
    4165         127 :         dfTotalPixels += papoOvrBands[i]->GetXSize() *
    4166         127 :                          static_cast<double>(papoOvrBands[i]->GetYSize());
    4167             :     }
    4168             : 
    4169             :     /* -------------------------------------------------------------------- */
    4170             :     /*      Generate all the bands.                                         */
    4171             :     /* -------------------------------------------------------------------- */
    4172          44 :     double dfPixelsProcessed = 0.0;
    4173             : 
    4174         171 :     for (int i = 0; i < nOverviews; ++i)
    4175             :     {
    4176         127 :         GDALRasterBand *poBaseBand = poSrcBand;
    4177         127 :         if (i != 0)
    4178          83 :             poBaseBand = papoOvrBands[i - 1];
    4179             : 
    4180         127 :         double dfPixels = papoOvrBands[i]->GetXSize() *
    4181         127 :                           static_cast<double>(papoOvrBands[i]->GetYSize());
    4182             : 
    4183         254 :         void *pScaledProgressData = GDALCreateScaledProgress(
    4184             :             dfPixelsProcessed / dfTotalPixels,
    4185         127 :             (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
    4186             :             pProgressData);
    4187             : 
    4188         254 :         const CPLErr eErr = GDALRegenerateOverviewsEx(
    4189             :             poBaseBand, 1,
    4190         127 :             reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
    4191             :             pszResampling, GDALScaledProgress, pScaledProgressData,
    4192             :             papszOptions);
    4193         127 :         GDALDestroyScaledProgress(pScaledProgressData);
    4194             : 
    4195         127 :         if (eErr != CE_None)
    4196           0 :             return eErr;
    4197             : 
    4198         127 :         dfPixelsProcessed += dfPixels;
    4199             : 
    4200             :         // Only do the bit2grayscale promotion on the base band.
    4201         127 :         if (STARTS_WITH_CI(pszResampling,
    4202             :                            "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
    4203           8 :             pszResampling = "AVERAGE";
    4204             :     }
    4205             : 
    4206          44 :     return CE_None;
    4207             : }
    4208             : 
    4209             : /************************************************************************/
    4210             : /*                    GDALGetResampleFunction()                         */
    4211             : /************************************************************************/
    4212             : 
    4213        5028 : GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
    4214             :                                              int *pnRadius)
    4215             : {
    4216        5028 :     if (pnRadius)
    4217        5028 :         *pnRadius = 0;
    4218        5028 :     if (STARTS_WITH_CI(pszResampling, "NEAR"))
    4219         500 :         return GDALResampleChunk_Near;
    4220        4528 :     else if (STARTS_WITH_CI(pszResampling, "AVER") ||
    4221        3988 :              EQUAL(pszResampling, "RMS"))
    4222         565 :         return GDALResampleChunk_AverageOrRMS;
    4223        3963 :     else if (EQUAL(pszResampling, "GAUSS"))
    4224             :     {
    4225          26 :         if (pnRadius)
    4226          26 :             *pnRadius = 1;
    4227          26 :         return GDALResampleChunk_Gauss;
    4228             :     }
    4229        3937 :     else if (EQUAL(pszResampling, "MODE"))
    4230          96 :         return GDALResampleChunk_Mode;
    4231        3841 :     else if (EQUAL(pszResampling, "CUBIC"))
    4232             :     {
    4233        1432 :         if (pnRadius)
    4234        1432 :             *pnRadius = GWKGetFilterRadius(GRA_Cubic);
    4235        1431 :         return GDALResampleChunk_Convolution;
    4236             :     }
    4237        2409 :     else if (EQUAL(pszResampling, "CUBICSPLINE"))
    4238             :     {
    4239           3 :         if (pnRadius)
    4240           3 :             *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
    4241           3 :         return GDALResampleChunk_Convolution;
    4242             :     }
    4243        2406 :     else if (EQUAL(pszResampling, "LANCZOS"))
    4244             :     {
    4245           8 :         if (pnRadius)
    4246           8 :             *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
    4247           8 :         return GDALResampleChunk_Convolution;
    4248             :     }
    4249        2398 :     else if (EQUAL(pszResampling, "BILINEAR"))
    4250             :     {
    4251        2398 :         if (pnRadius)
    4252        2398 :             *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
    4253        2398 :         return GDALResampleChunk_Convolution;
    4254             :     }
    4255             :     else
    4256             :     {
    4257           0 :         CPLError(
    4258             :             CE_Failure, CPLE_AppDefined,
    4259             :             "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
    4260             :             pszResampling);
    4261           0 :         return nullptr;
    4262             :     }
    4263             : }
    4264             : 
    4265             : /************************************************************************/
    4266             : /*                      GDALGetOvrWorkDataType()                        */
    4267             : /************************************************************************/
    4268             : 
    4269        4910 : GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
    4270             :                                     GDALDataType eSrcDataType)
    4271             : {
    4272        4910 :     if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
    4273             :     {
    4274         591 :         return eSrcDataType;
    4275             :     }
    4276        4319 :     else if (eSrcDataType == GDT_Byte &&
    4277        3985 :              (STARTS_WITH_CI(pszResampling, "AVER") ||
    4278        3507 :               EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
    4279        2274 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4280        2271 :               EQUAL(pszResampling, "LANCZOS") ||
    4281        2266 :               EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
    4282             :     {
    4283        3977 :         return GDT_Byte;
    4284             :     }
    4285         342 :     else if (eSrcDataType == GDT_UInt16 &&
    4286         122 :              (STARTS_WITH_CI(pszResampling, "AVER") ||
    4287         113 :               EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
    4288           3 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4289           3 :               EQUAL(pszResampling, "LANCZOS") ||
    4290           2 :               EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
    4291             :     {
    4292         119 :         return GDT_UInt16;
    4293             :     }
    4294         223 :     else if (EQUAL(pszResampling, "GAUSS"))
    4295          20 :         return GDT_Float64;
    4296             : 
    4297         203 :     if (eSrcDataType == GDT_Byte || eSrcDataType == GDT_Int8 ||
    4298         204 :         eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
    4299             :         eSrcDataType == GDT_Float32)
    4300             :     {
    4301         161 :         return GDT_Float32;
    4302             :     }
    4303          42 :     return GDT_Float64;
    4304             : }
    4305             : 
    4306             : namespace
    4307             : {
    4308             : // Structure to hold a pointer to free with CPLFree()
    4309             : struct PointerHolder
    4310             : {
    4311             :     void *ptr = nullptr;
    4312             : 
    4313        5792 :     explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
    4314             :     {
    4315        5792 :     }
    4316             : 
    4317        5792 :     ~PointerHolder()
    4318        5792 :     {
    4319        5792 :         CPLFree(ptr);
    4320        5792 :     }
    4321             : 
    4322             :     PointerHolder(const PointerHolder &) = delete;
    4323             :     PointerHolder &operator=(const PointerHolder &) = delete;
    4324             : };
    4325             : }  // namespace
    4326             : 
    4327             : /************************************************************************/
    4328             : /*                      GDALRegenerateOverviews()                       */
    4329             : /************************************************************************/
    4330             : 
    4331             : /**
    4332             :  * \brief Generate downsampled overviews.
    4333             :  *
    4334             :  * This function will generate one or more overview images from a base image
    4335             :  * using the requested downsampling algorithm.  Its primary use is for
    4336             :  * generating overviews via GDALDataset::BuildOverviews(), but it can also be
    4337             :  * used to generate downsampled images in one file from another outside the
    4338             :  * overview architecture.
    4339             :  *
    4340             :  * The output bands need to exist in advance.
    4341             :  *
    4342             :  * The full set of resampling algorithms is documented in
    4343             :  * GDALDataset::BuildOverviews().
    4344             :  *
    4345             :  * This function will honour properly NODATA_VALUES tuples (special dataset
    4346             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    4347             :  * considered as the nodata value and not each value of the triplet
    4348             :  * independently per band.
    4349             :  *
    4350             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    4351             :  * to "ALL_CPUS" or a integer value to specify the number of threads to use for
    4352             :  * overview computation.
    4353             :  *
    4354             :  * @param hSrcBand the source (base level) band.
    4355             :  * @param nOverviewCount the number of downsampled bands being generated.
    4356             :  * @param pahOvrBands the list of downsampled bands to be generated.
    4357             :  * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
    4358             :  * @param pfnProgress progress report function.
    4359             :  * @param pProgressData progress function callback data.
    4360             :  * @return CE_None on success or CE_Failure on failure.
    4361             :  */
    4362         250 : CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
    4363             :                                GDALRasterBandH *pahOvrBands,
    4364             :                                const char *pszResampling,
    4365             :                                GDALProgressFunc pfnProgress,
    4366             :                                void *pProgressData)
    4367             : 
    4368             : {
    4369         250 :     return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
    4370             :                                      pszResampling, pfnProgress, pProgressData,
    4371         250 :                                      nullptr);
    4372             : }
    4373             : 
    4374             : /************************************************************************/
    4375             : /*                     GDALRegenerateOverviewsEx()                      */
    4376             : /************************************************************************/
    4377             : 
    4378             : constexpr int RADIUS_TO_DIAMETER = 2;
    4379             : 
    4380             : /**
    4381             :  * \brief Generate downsampled overviews.
    4382             :  *
    4383             :  * This function will generate one or more overview images from a base image
    4384             :  * using the requested downsampling algorithm.  Its primary use is for
    4385             :  * generating overviews via GDALDataset::BuildOverviews(), but it can also be
    4386             :  * used to generate downsampled images in one file from another outside the
    4387             :  * overview architecture.
    4388             :  *
    4389             :  * The output bands need to exist in advance.
    4390             :  *
    4391             :  * The full set of resampling algorithms is documented in
    4392             :  * GDALDataset::BuildOverviews().
    4393             :  *
    4394             :  * This function will honour properly NODATA_VALUES tuples (special dataset
    4395             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    4396             :  * considered as the nodata value and not each value of the triplet
    4397             :  * independently per band.
    4398             :  *
    4399             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    4400             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    4401             :  * overview computation.
    4402             :  *
    4403             :  * @param hSrcBand the source (base level) band.
    4404             :  * @param nOverviewCount the number of downsampled bands being generated.
    4405             :  * @param pahOvrBands the list of downsampled bands to be generated.
    4406             :  * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
    4407             :  * @param pfnProgress progress report function.
    4408             :  * @param pProgressData progress function callback data.
    4409             :  * @param papszOptions NULL terminated list of options as key=value pairs, or
    4410             :  * NULL
    4411             :  * @return CE_None on success or CE_Failure on failure.
    4412             :  * @since GDAL 3.6
    4413             :  */
    4414         887 : CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
    4415             :                                  GDALRasterBandH *pahOvrBands,
    4416             :                                  const char *pszResampling,
    4417             :                                  GDALProgressFunc pfnProgress,
    4418             :                                  void *pProgressData, CSLConstList papszOptions)
    4419             : 
    4420             : {
    4421         887 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    4422         887 :     GDALRasterBand **papoOvrBands =
    4423             :         reinterpret_cast<GDALRasterBand **>(pahOvrBands);
    4424             : 
    4425         887 :     if (pfnProgress == nullptr)
    4426         252 :         pfnProgress = GDALDummyProgress;
    4427             : 
    4428         887 :     if (EQUAL(pszResampling, "NONE"))
    4429          49 :         return CE_None;
    4430             : 
    4431         838 :     int nKernelRadius = 0;
    4432             :     GDALResampleFunction pfnResampleFn =
    4433         838 :         GDALGetResampleFunction(pszResampling, &nKernelRadius);
    4434             : 
    4435         838 :     if (pfnResampleFn == nullptr)
    4436           0 :         return CE_Failure;
    4437             : 
    4438             :     /* -------------------------------------------------------------------- */
    4439             :     /*      Check color tables...                                           */
    4440             :     /* -------------------------------------------------------------------- */
    4441         838 :     GDALColorTable *poColorTable = nullptr;
    4442             : 
    4443         471 :     if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
    4444        1750 :          EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
    4445         452 :         poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
    4446             :     {
    4447           9 :         poColorTable = poSrcBand->GetColorTable();
    4448           9 :         if (poColorTable != nullptr)
    4449             :         {
    4450           9 :             if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
    4451             :             {
    4452           0 :                 CPLError(CE_Warning, CPLE_AppDefined,
    4453             :                          "Computing overviews on palette index raster bands "
    4454             :                          "with a palette whose color interpretation is not RGB "
    4455             :                          "will probably lead to unexpected results.");
    4456           0 :                 poColorTable = nullptr;
    4457             :             }
    4458           9 :             else if (poColorTable->IsIdentity())
    4459             :             {
    4460           0 :                 poColorTable = nullptr;
    4461             :             }
    4462             :         }
    4463             :         else
    4464             :         {
    4465           0 :             CPLError(CE_Warning, CPLE_AppDefined,
    4466             :                      "Computing overviews on palette index raster bands "
    4467             :                      "without a palette will probably lead to unexpected "
    4468             :                      "results.");
    4469             :         }
    4470             :     }
    4471             :     // Not ready yet
    4472        2433 :     else if ((EQUAL(pszResampling, "CUBIC") ||
    4473         775 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4474         775 :               EQUAL(pszResampling, "LANCZOS") ||
    4475        1684 :               EQUAL(pszResampling, "BILINEAR")) &&
    4476          80 :              poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
    4477             :     {
    4478           0 :         CPLError(CE_Warning, CPLE_AppDefined,
    4479             :                  "Computing %s overviews on palette index raster bands "
    4480             :                  "will probably lead to unexpected results.",
    4481             :                  pszResampling);
    4482             :     }
    4483             : 
    4484             :     // If we have a nodata mask and we are doing something more complicated
    4485             :     // than nearest neighbouring, we have to fetch the nodata mask.
    4486             : 
    4487         838 :     GDALRasterBand *poMaskBand = nullptr;
    4488         838 :     bool bUseNoDataMask = false;
    4489         838 :     bool bCanUseCascaded = true;
    4490             : 
    4491         838 :     if (!STARTS_WITH_CI(pszResampling, "NEAR"))
    4492             :     {
    4493             :         // Special case if we are an alpha/mask band. We want it to be
    4494             :         // considered as the mask band to avoid alpha=0 to be taken into account
    4495             :         // in average computation.
    4496         532 :         if (poSrcBand->IsMaskBand())
    4497             :         {
    4498          91 :             poMaskBand = poSrcBand;
    4499          91 :             bUseNoDataMask = true;
    4500             :         }
    4501             :         else
    4502             :         {
    4503         441 :             poMaskBand = poSrcBand->GetMaskBand();
    4504         441 :             const int nMaskFlags = poSrcBand->GetMaskFlags();
    4505         441 :             bCanUseCascaded =
    4506         441 :                 (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
    4507         441 :             bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
    4508             :         }
    4509             :     }
    4510             : 
    4511             :     /* -------------------------------------------------------------------- */
    4512             :     /*      If we are operating on multiple overviews, and using            */
    4513             :     /*      averaging, lets do them in cascading order to reduce the        */
    4514             :     /*      amount of computation.                                          */
    4515             :     /* -------------------------------------------------------------------- */
    4516             : 
    4517             :     // In case the mask may be computed from another band of the dataset,
    4518             :     // we can't use cascaded generation, as the computation of the overviews
    4519             :     // of the band used for the mask band may not have yet occurred (#3033).
    4520         838 :     if ((STARTS_WITH_CI(pszResampling, "AVER") ||
    4521         471 :          EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
    4522         440 :          EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
    4523         386 :          EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
    4524         838 :          EQUAL(pszResampling, "MODE")) &&
    4525          44 :         nOverviewCount > 1 && bCanUseCascaded)
    4526          44 :         return GDALRegenerateCascadingOverviews(
    4527             :             poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
    4528          44 :             pProgressData, papszOptions);
    4529             : 
    4530             :     /* -------------------------------------------------------------------- */
    4531             :     /*      Setup one horizontal swath to read from the raw buffer.         */
    4532             :     /* -------------------------------------------------------------------- */
    4533         794 :     int nFRXBlockSize = 0;
    4534         794 :     int nFRYBlockSize = 0;
    4535         794 :     poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
    4536             : 
    4537         794 :     const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
    4538        1282 :     const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
    4539        1236 :                                        EQUAL(pszResampling, "MODE") ||
    4540         442 :                                        !GDALDataTypeIsComplex(eSrcDataType);
    4541             :     const GDALDataType eWrkDataType =
    4542             :         bUseGenericResampleFn
    4543         794 :             ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
    4544         794 :             : GDT_CFloat32;
    4545             : 
    4546         794 :     const int nWidth = poSrcBand->GetXSize();
    4547         794 :     const int nHeight = poSrcBand->GetYSize();
    4548             : 
    4549         794 :     int nMaxOvrFactor = 1;
    4550        1705 :     for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
    4551             :     {
    4552         911 :         const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
    4553         911 :         const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
    4554         911 :         nMaxOvrFactor = std::max(
    4555             :             nMaxOvrFactor,
    4556         911 :             static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
    4557         911 :         nMaxOvrFactor = std::max(
    4558             :             nMaxOvrFactor,
    4559         911 :             static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
    4560             :     }
    4561             : 
    4562         794 :     int nFullResYChunk = nFRYBlockSize;
    4563         794 :     int nMaxChunkYSizeQueried = 0;
    4564             : 
    4565             :     const auto UpdateChunkHeightAndGetChunkSize =  // Adjusts nFullResYChunk / nMaxChunkYSizeQueried (captured by reference) and returns the source chunk buffer size in bytes, or GINTBIG_MAX if any intermediate computation would overflow
    4566       10354 :         [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
    4567       83793 :          eWrkDataType, nWidth]()
    4568             :     {
    4569             :         // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
    4570             :         // + nFullResYChunk) / nMaxOvrFactor)
    4571       10354 :         if (nMaxOvrFactor > INT_MAX / RADIUS_TO_DIAMETER)  // guard: RADIUS_TO_DIAMETER * nMaxOvrFactor below would overflow int
    4572             :         {
    4573           1 :             return GINTBIG_MAX;
    4574             :         }
    4575       10353 :         nFullResYChunk =
    4576       10353 :             std::max(nFullResYChunk, RADIUS_TO_DIAMETER * nMaxOvrFactor);  // enforce a minimum chunk height; never shrinks the current value
    4577       10353 :         if ((nKernelRadius > 0 &&
    4578         970 :              nMaxOvrFactor > INT_MAX / (RADIUS_TO_DIAMETER * nKernelRadius)) ||  // guard: kernel margin product would overflow int
    4579       10353 :             nFullResYChunk >
    4580       10353 :                 INT_MAX - RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor)  // guard: chunk height + kernel margin would overflow int
    4581             :         {
    4582           0 :             return GINTBIG_MAX;
    4583             :         }
    4584       10353 :         nMaxChunkYSizeQueried =
    4585       10353 :             nFullResYChunk + RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor;  // chunk height plus the extra source rows needed by the resampling kernel
    4586       10353 :         if (GDALGetDataTypeSizeBytes(eWrkDataType) >
    4587       10353 :             std::numeric_limits<int64_t>::max() /
    4588       10353 :                 (static_cast<int64_t>(nMaxChunkYSizeQueried) * nWidth))  // guard: bytes-per-pixel * rows * width would overflow int64_t
    4589             :         {
    4590           1 :             return GINTBIG_MAX;
    4591             :         }
    4592       10352 :         return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
    4593       10352 :                nMaxChunkYSizeQueried * nWidth;  // size in bytes of a buffer holding nMaxChunkYSizeQueried full-width rows of eWrkDataType
    4594         794 :     };
    4595             : 
    4596             :     const char *pszChunkYSize =
    4597         794 :         CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
    4598             : #ifndef __COVERITY__
    4599             :     // Only configurable for debug / testing
    4600         794 :     if (pszChunkYSize)
    4601             :     {
    4602           0 :         nFullResYChunk = atoi(pszChunkYSize);
    4603             :     }
    4604             : #endif
    4605             : 
    4606             :     // Only configurable for debug / testing
    4607             :     const int nChunkMaxSize =
    4608         794 :         atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
    4609             : 
    4610         794 :     auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
    4611         794 :     if (nChunkSize > nChunkMaxSize)
    4612             :     {
    4613          15 :         if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
    4614          44 :             !GDALDataTypeIsComplex(eSrcDataType) &&
    4615          14 :             (!STARTS_WITH_CI(pszResampling, "AVER") ||
    4616           2 :              EQUAL(pszResampling, "AVERAGE")))
    4617             :         {
    4618             :             // If this is tiled, then use GDALRegenerateOverviewsMultiBand(),
    4619             :             // which uses a block-based strategy and is much less memory
    4620             :             // hungry.
    4621          14 :             return GDALRegenerateOverviewsMultiBand(
    4622             :                 1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
    4623          14 :                 pfnProgress, pProgressData, papszOptions);
    4624             :         }
    4625           1 :         else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
    4626             :         {
    4627           0 :             return GDALRegenerateCascadingOverviews(
    4628             :                 poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
    4629           0 :                 pfnProgress, pProgressData, papszOptions);
    4630             :         }
    4631             :     }
    4632         779 :     else if (pszChunkYSize == nullptr)
    4633             :     {
    4634             :         // Try to get as close as possible to nChunkMaxSize
    4635       10339 :         while (nChunkSize < nChunkMaxSize / 2)
    4636             :         {
    4637        9560 :             nFullResYChunk *= 2;
    4638        9560 :             nChunkSize = UpdateChunkHeightAndGetChunkSize();
    4639             :         }
    4640             :     }
    4641             : 
    4642         780 :     int nHasNoData = 0;
    4643         780 :     const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
    4644         780 :     const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
    4645             :     const bool bPropagateNoData =
    4646         780 :         CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
    4647             : 
    4648             :     // Structure describing a resampling job: one source chunk resampled into one overview band, run either inline or on a worker thread
    4649             :     struct OvrJob
    4650             :     {
    4651             :         // Buffers to free when job is finished (PointerHolder is defined elsewhere; presumably it frees the wrapped pointer on destruction - confirm)
    4652             :         std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};  // shared: the same source mask chunk is handed to the job of every overview level
    4653             :         std::shared_ptr<PointerHolder> oSrcBufferHolder{};  // shared: the same source chunk is handed to the job of every overview level
    4654             :         std::unique_ptr<PointerHolder> oDstBufferHolder{};  // set by the resample function to wrap pDstBuffer
    4655             : 
    4656             :         GDALRasterBand *poDstBand = nullptr;  // overview band the resampled rows are written to
    4657             : 
    4658             :         // Input parameters of pfnResampleFn
    4659             :         GDALResampleFunction pfnResampleFn = nullptr;
    4660             :         int nSrcWidth = 0;
    4661             :         int nSrcHeight = 0;
    4662             :         int nDstWidth = 0;
    4663             :         GDALOverviewResampleArgs args{};
    4664             :         const void *pChunk = nullptr;  // source pixels for this chunk (not owned through this pointer)
    4665             :         bool bUseGenericResampleFn = false;  // true: call pfnResampleFn; false: call GDALResampleChunkC32R
    4666             : 
    4667             :         // Output values of resampling function
    4668             :         CPLErr eErr = CE_Failure;  // stays CE_Failure until the resample function has stored its result
    4669             :         void *pDstBuffer = nullptr;
    4670             :         GDALDataType eDstBufferDataType = GDT_Unknown;
    4671             : 
    4672           0 :         void SetSrcMaskBufferHolder(  // stores a reference on the shared source mask buffer holder in this job
    4673             :             const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
    4674             :         {
    4675           0 :             oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
    4676           0 :         }
    4677             : 
    4678           0 :         void SetSrcBufferHolder(  // stores a reference on the shared source buffer holder in this job
    4679             :             const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
    4680             :         {
    4681           0 :             oSrcBufferHolder = oSrcBufferHolderIn;
    4682           0 :         }
    4683             : 
    4684         880 :         void NotifyFinished()  // marks the job finished under the mutex and wakes one thread blocked in WaitFinished()
    4685             :         {
    4686        1760 :             std::lock_guard guard(mutex);
    4687         880 :             bFinished = true;
    4688         880 :             cv.notify_one();
    4689         880 :         }
    4690             : 
    4691           0 :         bool IsFinished()  // non-blocking: returns the current value of the finished flag, read under the mutex
    4692             :         {
    4693           0 :             std::lock_guard guard(mutex);
    4694           0 :             return bFinished;
    4695             :         }
    4696             : 
    4697           0 :         void WaitFinished()  // blocks the caller until NotifyFinished() has set the finished flag
    4698             :         {
    4699           0 :             std::unique_lock oGuard(mutex);
    4700           0 :             while (!bFinished)  // re-check the flag after every wakeup (handles spurious wakeups)
    4701             :             {
    4702           0 :                 cv.wait(oGuard);
    4703             :             }
    4704           0 :         }
    4705             : 
    4706             :       private:
    4707             :         // Synchronization: bFinished is only read or written with mutex held; cv signals its transition to true
    4708             :         bool bFinished = false;
    4709             :         std::mutex mutex{};
    4710             :         std::condition_variable cv{};
    4711             :     };
    4712             : 
    4713             :     // Thread function to resample: runs the resampling described by an OvrJob, stores the result in the job and flags it finished. Also called directly when no job queue is used.
    4714         880 :     const auto JobResampleFunc = [](void *pData)
    4715             :     {
    4716         880 :         OvrJob *poJob = static_cast<OvrJob *>(pData);  // pData is the OvrJob pointer supplied by the submitter
    4717             : 
    4718         880 :         if (poJob->bUseGenericResampleFn)
    4719             :         {
    4720         878 :             poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
    4721             :                                                &(poJob->pDstBuffer),
    4722             :                                                &(poJob->eDstBufferDataType));
    4723             :         }
    4724             :         else  // non-generic path: the chunk is interpreted as a float array and resampled by GDALResampleChunkC32R
    4725             :         {
    4726           2 :             poJob->eErr = GDALResampleChunkC32R(
    4727             :                 poJob->nSrcWidth, poJob->nSrcHeight,
    4728           2 :                 static_cast<const float *>(poJob->pChunk),
    4729             :                 poJob->args.nChunkYOff, poJob->args.nChunkYSize,
    4730             :                 poJob->args.nDstYOff, poJob->args.nDstYOff2,
    4731             :                 poJob->args.nOvrXSize, poJob->args.nOvrYSize,
    4732             :                 &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
    4733             :                 poJob->args.pszResampling);
    4734             :         }
    4735             : 
    4736             :         poJob->oDstBufferHolder =  // hand the output buffer to a holder owned by the job, whatever the value of eErr
    4737         880 :             std::make_unique<PointerHolder>(poJob->pDstBuffer);
    4738             : 
    4739         880 :         poJob->NotifyFinished();  // signal last: once flagged finished, the waiting thread may write the buffer and drop the job
    4740         880 :     };
    4741             : 
    4742             :     // Function to write resample data to target band: writes rows [nDstYOff, nDstYOff2) over the full nDstWidth of poDstBand from pDstBuffer and returns the RasterIO() result
    4743         880 :     const auto WriteJobData = [](const OvrJob *poJob)
    4744             :     {
    4745        1760 :         return poJob->poDstBand->RasterIO(
    4746         880 :             GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
    4747         880 :             poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,  // window height = number of destination rows produced by this job
    4748         880 :             poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,  // buffer dimensions equal the window dimensions
    4749         880 :             poJob->eDstBufferDataType, 0, 0, nullptr);
    4750             :     };
    4751             : 
    4752             :     // Wait for completion of oldest job and serialize it. jobList must not be empty (front() is called unconditionally). Returns the job's resampling error, or the write result if resampling succeeded.
    4753             :     const auto WaitAndFinalizeOldestJob =
    4754           0 :         [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
    4755             :     {
    4756           0 :         auto poOldestJob = jobList.front().get();
    4757           0 :         poOldestJob->WaitFinished();  // blocks until the job has been flagged finished
    4758           0 :         CPLErr l_eErr = poOldestJob->eErr;
    4759           0 :         if (l_eErr == CE_None)  // only write the output when resampling succeeded
    4760             :         {
    4761           0 :             l_eErr = WriteJobData(poOldestJob);
    4762             :         }
    4763             : 
    4764           0 :         jobList.pop_front();  // the job is removed (and destroyed) whether or not it succeeded
    4765           0 :         return l_eErr;
    4766             :     };
    4767             : 
    4768             :     // Queue of jobs
    4769        1560 :     std::list<std::unique_ptr<OvrJob>> jobList;
    4770             : 
    4771         780 :     GByte *pabyChunkNodataMask = nullptr;
    4772         780 :     void *pChunk = nullptr;
    4773             : 
    4774         780 :     const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
    4775        3120 :     const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
    4776         780 :                                                        ? CPLGetNumCPUs()
    4777         780 :                                                        : atoi(pszThreads)));
    4778             :     auto poThreadPool =
    4779         780 :         nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
    4780             :     auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
    4781        1560 :                                    : std::unique_ptr<CPLJobQueue>(nullptr);
    4782             : 
    4783             :     /* -------------------------------------------------------------------- */
    4784             :     /*      Loop over image operating on chunks.                            */
    4785             :     /* -------------------------------------------------------------------- */
    4786         780 :     int nChunkYOff = 0;
    4787         780 :     CPLErr eErr = CE_None;
    4788             : 
    4789        1565 :     for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
    4790         785 :          nChunkYOff += nFullResYChunk)
    4791             :     {
    4792         785 :         if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
    4793             :                          pProgressData))
    4794             :         {
    4795           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    4796           0 :             eErr = CE_Failure;
    4797             :         }
    4798             : 
    4799         785 :         if (nFullResYChunk + nChunkYOff > nHeight)
    4800         778 :             nFullResYChunk = nHeight - nChunkYOff;
    4801             : 
    4802         785 :         int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
    4803         785 :         int nChunkYSizeQueried =
    4804         785 :             nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
    4805         785 :         if (nChunkYOffQueried < 0)
    4806             :         {
    4807          83 :             nChunkYSizeQueried += nChunkYOffQueried;
    4808          83 :             nChunkYOffQueried = 0;
    4809             :         }
    4810         785 :         if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
    4811          83 :             nChunkYSizeQueried = nHeight - nChunkYOffQueried;
    4812             : 
    4813             :         // Avoid accumulating too many tasks and exhausting RAM:
    4814             :         // try to complete already finished jobs first.
    4815         785 :         while (eErr == CE_None && !jobList.empty())
    4816             :         {
    4817           0 :             auto poOldestJob = jobList.front().get();
    4818           0 :             if (!poOldestJob->IsFinished())
    4819           0 :                 break;
    4820           0 :             eErr = poOldestJob->eErr;
    4821           0 :             if (eErr == CE_None)
    4822             :             {
    4823           0 :                 eErr = WriteJobData(poOldestJob);
    4824             :             }
    4825             : 
    4826           0 :             jobList.pop_front();
    4827             :         }
    4828             : 
    4829             :         // And in case we have saturated the number of threads,
    4830             :         // wait for completion of tasks to go below the threshold.
    4831        1570 :         while (eErr == CE_None &&
    4832         785 :                jobList.size() >= static_cast<size_t>(nThreads))
    4833             :         {
    4834           0 :             eErr = WaitAndFinalizeOldestJob(jobList);
    4835             :         }
    4836             : 
    4837             :         // (Re)allocate buffers if needed
    4838         785 :         if (pChunk == nullptr)
    4839             :         {
    4840         780 :             pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
    4841             :                                          nMaxChunkYSizeQueried, nWidth);
    4842             :         }
    4843         785 :         if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
    4844             :         {
    4845             :             pabyChunkNodataMask = static_cast<GByte *>(
    4846         283 :                 VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
    4847             :         }
    4848             : 
    4849         785 :         if (pChunk == nullptr ||
    4850         283 :             (bUseNoDataMask && pabyChunkNodataMask == nullptr))
    4851             :         {
    4852           0 :             CPLFree(pChunk);
    4853           0 :             CPLFree(pabyChunkNodataMask);
    4854           0 :             return CE_Failure;
    4855             :         }
    4856             : 
    4857             :         // Read chunk.
    4858         785 :         if (eErr == CE_None)
    4859         785 :             eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
    4860             :                                        nChunkYSizeQueried, pChunk, nWidth,
    4861             :                                        nChunkYSizeQueried, eWrkDataType, 0, 0,
    4862             :                                        nullptr);
    4863         785 :         if (eErr == CE_None && bUseNoDataMask)
    4864         283 :             eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
    4865             :                                         nChunkYSizeQueried, pabyChunkNodataMask,
    4866             :                                         nWidth, nChunkYSizeQueried, GDT_Byte, 0,
    4867             :                                         0, nullptr);
    4868             : 
    4869             :         // Special case to promote 1bit data to 8bit 0/255 values.
    4870         785 :         if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
    4871             :         {
    4872           9 :             if (eWrkDataType == GDT_Float32)
    4873             :             {
    4874           0 :                 float *pafChunk = static_cast<float *>(pChunk);
    4875           0 :                 for (size_t i = 0;
    4876           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    4877             :                 {
    4878           0 :                     if (pafChunk[i] == 1.0)
    4879           0 :                         pafChunk[i] = 255.0;
    4880             :                 }
    4881             :             }
    4882           9 :             else if (eWrkDataType == GDT_Byte)
    4883             :             {
    4884           9 :                 GByte *pabyChunk = static_cast<GByte *>(pChunk);
    4885      168417 :                 for (size_t i = 0;
    4886      168417 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    4887             :                 {
    4888      168408 :                     if (pabyChunk[i] == 1)
    4889      127437 :                         pabyChunk[i] = 255;
    4890             :                 }
    4891             :             }
    4892           0 :             else if (eWrkDataType == GDT_UInt16)
    4893             :             {
    4894           0 :                 GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
    4895           0 :                 for (size_t i = 0;
    4896           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    4897             :                 {
    4898           0 :                     if (pasChunk[i] == 1)
    4899           0 :                         pasChunk[i] = 255;
    4900             :                 }
    4901             :             }
    4902           0 :             else if (eWrkDataType == GDT_Float64)
    4903             :             {
    4904           0 :                 double *padfChunk = static_cast<double *>(pChunk);
    4905           0 :                 for (size_t i = 0;
    4906           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    4907             :                 {
    4908           0 :                     if (padfChunk[i] == 1.0)
    4909           0 :                         padfChunk[i] = 255.0;
    4910             :                 }
    4911             :             }
    4912             :             else
    4913             :             {
    4914           0 :                 CPLAssert(false);
    4915             :             }
    4916             :         }
    4917         776 :         else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
    4918             :         {
    4919           0 :             if (eWrkDataType == GDT_Float32)
    4920             :             {
    4921           0 :                 float *pafChunk = static_cast<float *>(pChunk);
    4922           0 :                 for (size_t i = 0;
    4923           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    4924             :                 {
    4925           0 :                     if (pafChunk[i] == 1.0)
    4926           0 :                         pafChunk[i] = 0.0;
    4927           0 :                     else if (pafChunk[i] == 0.0)
    4928           0 :                         pafChunk[i] = 255.0;
    4929             :                 }
    4930             :             }
    4931           0 :             else if (eWrkDataType == GDT_Byte)
    4932             :             {
    4933           0 :                 GByte *pabyChunk = static_cast<GByte *>(pChunk);
    4934           0 :                 for (size_t i = 0;
    4935           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    4936             :                 {
    4937           0 :                     if (pabyChunk[i] == 1)
    4938           0 :                         pabyChunk[i] = 0;
    4939           0 :                     else if (pabyChunk[i] == 0)
    4940           0 :                         pabyChunk[i] = 255;
    4941             :                 }
    4942             :             }
    4943           0 :             else if (eWrkDataType == GDT_UInt16)
    4944             :             {
    4945           0 :                 GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
    4946           0 :                 for (size_t i = 0;
    4947           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    4948             :                 {
    4949           0 :                     if (pasChunk[i] == 1)
    4950           0 :                         pasChunk[i] = 0;
    4951           0 :                     else if (pasChunk[i] == 0)
    4952           0 :                         pasChunk[i] = 255;
    4953             :                 }
    4954             :             }
    4955           0 :             else if (eWrkDataType == GDT_Float64)
    4956             :             {
    4957           0 :                 double *padfChunk = static_cast<double *>(pChunk);
    4958           0 :                 for (size_t i = 0;
    4959           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    4960             :                 {
    4961           0 :                     if (padfChunk[i] == 1.0)
    4962           0 :                         padfChunk[i] = 0.0;
    4963           0 :                     else if (padfChunk[i] == 0.0)
    4964           0 :                         padfChunk[i] = 255.0;
    4965             :                 }
    4966             :             }
    4967             :             else
    4968             :             {
    4969           0 :                 CPLAssert(false);
    4970             :             }
    4971             :         }
    4972             : 
    4973             :         auto oSrcBufferHolder =
    4974        1570 :             std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
    4975             :         auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
    4976        1570 :             poJobQueue ? pabyChunkNodataMask : nullptr);
    4977             : 
    4978        1665 :         for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
    4979             :              ++iOverview)
    4980             :         {
    4981         880 :             GDALRasterBand *poDstBand = papoOvrBands[iOverview];
    4982         880 :             const int nDstWidth = poDstBand->GetXSize();
    4983         880 :             const int nDstHeight = poDstBand->GetYSize();
    4984             : 
    4985         880 :             const double dfXRatioDstToSrc =
    4986         880 :                 static_cast<double>(nWidth) / nDstWidth;
    4987         880 :             const double dfYRatioDstToSrc =
    4988         880 :                 static_cast<double>(nHeight) / nDstHeight;
    4989             : 
    4990             :             /* --------------------------------------------------------------------
    4991             :              */
    4992             :             /*      Figure out the line to start writing to, and the first line
    4993             :              */
    4994             :             /*      to not write to.  In theory this approach should ensure that
    4995             :              */
    4996             :             /*      every output line will be written if all input chunks are */
    4997             :             /*      processed. */
    4998             :             /* --------------------------------------------------------------------
    4999             :              */
    5000         880 :             int nDstYOff =
    5001         880 :                 static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
    5002         880 :             if (nDstYOff == nDstHeight)
    5003           0 :                 continue;
    5004         880 :             int nDstYOff2 = static_cast<int>(
    5005         880 :                 0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
    5006             : 
    5007         880 :             if (nChunkYOff + nFullResYChunk == nHeight)
    5008         873 :                 nDstYOff2 = nDstHeight;
    5009             : #if DEBUG_VERBOSE
    5010             :             CPLDebug("GDAL",
    5011             :                      "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
    5012             :                      nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
    5013             :                      nDstWidth, nDstYOff2 - nDstYOff);
    5014             : #endif
    5015             : 
    5016        1760 :             auto poJob = std::make_unique<OvrJob>();
    5017         880 :             poJob->pfnResampleFn = pfnResampleFn;
    5018         880 :             poJob->bUseGenericResampleFn = bUseGenericResampleFn;
    5019         880 :             poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
    5020         880 :             poJob->args.nOvrXSize = poDstBand->GetXSize();
    5021         880 :             poJob->args.nOvrYSize = poDstBand->GetYSize();
    5022             :             const char *pszNBITS =
    5023         880 :                 poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    5024         880 :             poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    5025         880 :             poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    5026         880 :             poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    5027         880 :             poJob->args.eWrkDataType = eWrkDataType;
    5028         880 :             poJob->pChunk = pChunk;
    5029         880 :             poJob->args.pabyChunkNodataMask = pabyChunkNodataMask;
    5030         880 :             poJob->nSrcWidth = nWidth;
    5031         880 :             poJob->nSrcHeight = nHeight;
    5032         880 :             poJob->args.nChunkXOff = 0;
    5033         880 :             poJob->args.nChunkXSize = nWidth;
    5034         880 :             poJob->args.nChunkYOff = nChunkYOffQueried;
    5035         880 :             poJob->args.nChunkYSize = nChunkYSizeQueried;
    5036         880 :             poJob->nDstWidth = nDstWidth;
    5037         880 :             poJob->args.nDstXOff = 0;
    5038         880 :             poJob->args.nDstXOff2 = nDstWidth;
    5039         880 :             poJob->args.nDstYOff = nDstYOff;
    5040         880 :             poJob->args.nDstYOff2 = nDstYOff2;
    5041         880 :             poJob->poDstBand = poDstBand;
    5042         880 :             poJob->args.pszResampling = pszResampling;
    5043         880 :             poJob->args.bHasNoData = bHasNoData;
    5044         880 :             poJob->args.dfNoDataValue = dfNoDataValue;
    5045         880 :             poJob->args.poColorTable = poColorTable;
    5046         880 :             poJob->args.eSrcDataType = eSrcDataType;
    5047         880 :             poJob->args.bPropagateNoData = bPropagateNoData;
    5048             : 
    5049         880 :             if (poJobQueue)
    5050             :             {
    5051           0 :                 poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
    5052           0 :                 poJob->SetSrcBufferHolder(oSrcBufferHolder);
    5053           0 :                 poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
    5054           0 :                 jobList.emplace_back(std::move(poJob));
    5055             :             }
    5056             :             else
    5057             :             {
    5058         880 :                 JobResampleFunc(poJob.get());
    5059         880 :                 eErr = poJob->eErr;
    5060         880 :                 if (eErr == CE_None)
    5061             :                 {
    5062         880 :                     eErr = WriteJobData(poJob.get());
    5063             :                 }
    5064             :             }
    5065             :         }
    5066             : 
    5067         785 :         if (poJobQueue)
    5068             :         {
    5069           0 :             pChunk = nullptr;
    5070           0 :             pabyChunkNodataMask = nullptr;
    5071             :         }
    5072             :     }
    5073             : 
    5074         780 :     VSIFree(pChunk);
    5075         780 :     VSIFree(pabyChunkNodataMask);
    5076             : 
    5077             :     // Wait for all pending jobs to complete
    5078         780 :     while (!jobList.empty())
    5079             :     {
    5080           0 :         const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
    5081           0 :         if (l_eErr != CE_None && eErr == CE_None)
    5082           0 :             eErr = l_eErr;
    5083             :     }
    5084             : 
    5085             :     /* -------------------------------------------------------------------- */
    5086             :     /*      Renormalize overview mean / stddev if needed.                   */
    5087             :     /* -------------------------------------------------------------------- */
    5088         780 :     if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
    5089             :     {
    5090           0 :         GDALOverviewMagnitudeCorrection(
    5091             :             poSrcBand, nOverviewCount,
    5092             :             reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
    5093             :             GDALDummyProgress, nullptr);
    5094             :     }
    5095             : 
    5096             :     /* -------------------------------------------------------------------- */
    5097             :     /*      It can be important to flush out data to overviews.             */
    5098             :     /* -------------------------------------------------------------------- */
    5099        1653 :     for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
    5100             :          ++iOverview)
    5101             :     {
    5102         873 :         eErr = papoOvrBands[iOverview]->FlushCache(false);
    5103             :     }
    5104             : 
    5105         780 :     if (eErr == CE_None)
    5106         780 :         pfnProgress(1.0, nullptr, pProgressData);
    5107             : 
    5108         780 :     return eErr;
    5109             : }
    5110             : 
    5111             : /************************************************************************/
    5112             : /*            GDALRegenerateOverviewsMultiBand()                        */
    5113             : /************************************************************************/
    5114             : 
    5115             : /**
    5116             :  * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
    5117             :  * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
    5118             :  *
    5119             :  * This function will generate one or more overview images from a base
    5120             :  * image using the requested downsampling algorithm.  Its primary use
    5121             :  * is for generating overviews via GDALDataset::BuildOverviews(), but it
    5122             :  * can also be used to generate downsampled images in one file from another
    5123             :  * outside the overview architecture.
    5124             :  *
    5125             :  * The output bands need to exist in advance and share the same characteristics
    5126             :  * (type, dimensions)
    5127             :  *
    5128             :  * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
    5129             :  * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE"
    5130             :  *
    5131             :  * It does not support color tables or complex data types.
    5132             :  *
    5133             :  * The pseudo-algorithm used by the function is :
    5134             :  *    for each overview
    5135             :  *       iterate on lines of the source by a step of deltay
    5136             :  *           iterate on columns of the source  by a step of deltax
    5137             :  *               read the source data of size deltax * deltay for all the bands
    5138             :  *               generate the corresponding overview block for all the bands
    5139             :  *
    5140             :  * This function will properly honour NODATA_VALUES tuples (special dataset
    5141             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    5142             :  * considered as the nodata value and not each value of the triplet
    5143             :  * independently per band.
    5144             :  *
    5145             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    5146             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    5147             :  * overview computation.
    5148             :  *
    5149             :  * @param nBands the number of bands, size of papoSrcBands and size of
    5150             :  *               first dimension of papapoOverviewBands
    5151             :  * @param papoSrcBands the list of source bands to downsample
    5152             :  * @param nOverviews the number of downsampled overview levels being generated.
    5153             :  * @param papapoOverviewBands two-dimensional array of bands. First dimension is
    5154             :  *                            indexed by nBands. Second dimension is indexed by
    5155             :  *                            nOverviews.
    5156             :  * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
    5157             :  * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
    5158             :  * @param pfnProgress progress report function.
    5159             :  * @param pProgressData progress function callback data.
    5160             :  * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
    5161             :  *                     key=value pairs, or NULL
    5162             :  *                     Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
    5163             :  *                     options can be specified to express that overviews should
    5164             :  *                     be regenerated only in the specified subset of the source
    5165             :  *                     dataset.
    5166             :  * @return CE_None on success or CE_Failure on failure.
    5167             :  */
    5168             : 
    5169         388 : CPLErr GDALRegenerateOverviewsMultiBand(
    5170             :     int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
    5171             :     GDALRasterBand *const *const *papapoOverviewBands,
    5172             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    5173             :     void *pProgressData, CSLConstList papszOptions)
    5174             : {
    5175         388 :     CPL_IGNORE_RET_VAL(papszOptions);
    5176             : 
    5177         388 :     if (pfnProgress == nullptr)
    5178          11 :         pfnProgress = GDALDummyProgress;
    5179             : 
    5180         388 :     if (EQUAL(pszResampling, "NONE") || nBands == 0 || nOverviews == 0)
    5181           3 :         return CE_None;
    5182             : 
    5183             :     // Sanity checks.
    5184         385 :     if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
    5185         191 :         !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
    5186          84 :         !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
    5187          22 :         !EQUAL(pszResampling, "CUBICSPLINE") &&
    5188          21 :         !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
    5189           5 :         !EQUAL(pszResampling, "MODE"))
    5190             :     {
    5191           0 :         CPLError(CE_Failure, CPLE_NotSupported,
    5192             :                  "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
    5193             :                  "not supported",
    5194             :                  pszResampling);
    5195           0 :         return CE_Failure;
    5196             :     }
    5197             : 
    5198         385 :     int nKernelRadius = 0;
    5199             :     GDALResampleFunction pfnResampleFn =
    5200         385 :         GDALGetResampleFunction(pszResampling, &nKernelRadius);
    5201         385 :     if (pfnResampleFn == nullptr)
    5202           0 :         return CE_Failure;
    5203             : 
    5204         385 :     const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
    5205         385 :     const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
    5206         385 :     if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
    5207           0 :         return CE_None;
    5208         385 :     GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
    5209       66232 :     for (int iBand = 1; iBand < nBands; ++iBand)
    5210             :     {
    5211      131694 :         if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
    5212       65847 :             papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
    5213             :         {
    5214           0 :             CPLError(
    5215             :                 CE_Failure, CPLE_NotSupported,
    5216             :                 "GDALRegenerateOverviewsMultiBand: all the source bands must "
    5217             :                 "have the same dimensions");
    5218           0 :             return CE_Failure;
    5219             :         }
    5220       65847 :         if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
    5221             :         {
    5222           0 :             CPLError(
    5223             :                 CE_Failure, CPLE_NotSupported,
    5224             :                 "GDALRegenerateOverviewsMultiBand: all the source bands must "
    5225             :                 "have the same data type");
    5226           0 :             return CE_Failure;
    5227             :         }
    5228             :     }
    5229             : 
    5230        1031 :     for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
    5231             :     {
    5232         646 :         const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
    5233         646 :         const int nDstWidth = poOvrFirstBand->GetXSize();
    5234         646 :         const int nDstHeight = poOvrFirstBand->GetYSize();
    5235       66759 :         for (int iBand = 1; iBand < nBands; ++iBand)
    5236             :         {
    5237       66113 :             const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
    5238      132226 :             if (poOvrBand->GetXSize() != nDstWidth ||
    5239       66113 :                 poOvrBand->GetYSize() != nDstHeight)
    5240             :             {
    5241           0 :                 CPLError(
    5242             :                     CE_Failure, CPLE_NotSupported,
    5243             :                     "GDALRegenerateOverviewsMultiBand: all the overviews bands "
    5244             :                     "of the same level must have the same dimensions");
    5245           0 :                 return CE_Failure;
    5246             :             }
    5247       66113 :             if (poOvrBand->GetRasterDataType() != eDataType)
    5248             :             {
    5249           0 :                 CPLError(
    5250             :                     CE_Failure, CPLE_NotSupported,
    5251             :                     "GDALRegenerateOverviewsMultiBand: all the overviews bands "
    5252             :                     "must have the same data type as the source bands");
    5253           0 :                 return CE_Failure;
    5254             :             }
    5255             :         }
    5256             :     }
    5257             : 
    5258             :     // First pass to compute the total number of pixels to write.
    5259         385 :     double dfTotalPixelCount = 0;
    5260         385 :     const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
    5261         385 :     const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
    5262         385 :     const int nSrcXSize = atoi(CSLFetchNameValueDef(
    5263             :         papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
    5264         385 :     const int nSrcYSize = atoi(CSLFetchNameValueDef(
    5265             :         papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
    5266        1031 :     for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
    5267             :     {
    5268         646 :         dfTotalPixelCount +=
    5269        1292 :             static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
    5270         646 :             papapoOverviewBands[0][iOverview]->GetXSize() *
    5271        1292 :             static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
    5272         646 :             papapoOverviewBands[0][iOverview]->GetYSize();
    5273             :     }
    5274             : 
    5275             :     const GDALDataType eWrkDataType =
    5276         385 :         GDALGetOvrWorkDataType(pszResampling, eDataType);
    5277             :     const int nWrkDataTypeSize =
    5278         385 :         std::max(1, GDALGetDataTypeSizeBytes(eWrkDataType));
    5279             : 
    5280         385 :     const bool bIsMask = papoSrcBands[0]->IsMaskBand();
    5281             : 
    5282             :     // If we have a nodata mask and we are doing something more complicated
    5283             :     // than nearest neighbouring, we have to fetch the nodata mask.
    5284             :     const bool bUseNoDataMask =
    5285         568 :         !STARTS_WITH_CI(pszResampling, "NEAR") &&
    5286         183 :         (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
    5287             : 
    5288         770 :     std::vector<bool> abHasNoData(nBands);
    5289         770 :     std::vector<double> adfNoDataValue(nBands);
    5290             : 
    5291       66617 :     for (int iBand = 0; iBand < nBands; ++iBand)
    5292             :     {
    5293       66232 :         int nHasNoData = 0;
    5294      132464 :         adfNoDataValue[iBand] =
    5295       66232 :             papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
    5296       66232 :         abHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
    5297             :     }
    5298             :     const bool bPropagateNoData =
    5299         385 :         CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
    5300             : 
    5301         385 :     const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
    5302        1540 :     const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
    5303         385 :                                                        ? CPLGetNumCPUs()
    5304         385 :                                                        : atoi(pszThreads)));
    5305             :     auto poThreadPool =
    5306         385 :         nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
    5307             :     auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
    5308         770 :                                    : std::unique_ptr<CPLJobQueue>(nullptr);
    5309             : 
    5310             :     // Only configurable for debug / testing
    5311         385 :     const GIntBig nChunkMaxSize = []() -> GIntBig
    5312             :     {
    5313             :         const char *pszVal =
    5314         385 :             CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", nullptr);
    5315         385 :         if (pszVal)
    5316             :         {
    5317          15 :             GIntBig nRet = 0;
    5318          15 :             CPLParseMemorySize(pszVal, &nRet, nullptr);
    5319          15 :             return std::max<GIntBig>(100, nRet);
    5320             :         }
    5321         370 :         return 10 * 1024 * 1024;
    5322         385 :     }();
    5323             : 
    5324             :     // Only configurable for debug / testing
    5325         385 :     const GIntBig nChunkMaxSizeForTempFile = []() -> GIntBig
    5326             :     {
    5327         385 :         const char *pszVal = CPLGetConfigOption(
    5328             :             "GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE", nullptr);
    5329         385 :         if (pszVal)
    5330             :         {
    5331          14 :             GIntBig nRet = 0;
    5332          14 :             CPLParseMemorySize(pszVal, &nRet, nullptr);
    5333          14 :             return std::max<GIntBig>(100, nRet);
    5334             :         }
    5335         371 :         const auto nUsableRAM = CPLGetUsablePhysicalRAM();
    5336         371 :         if (nUsableRAM > 0)
    5337         371 :             return nUsableRAM / 10;
    5338             :         // Select a value to be able to at least downsample by 2 for a RGB
    5339             :         // 1024x1024 tiled output: (2 * 1024 + 2) * (2 * 1024 + 2) * 3 = 12 MB
    5340           0 :         return 100 * 1024 * 1024;
    5341         385 :     }();
    5342             : 
    5343             :     // Second pass to do the real job.
    5344         385 :     double dfCurPixelCount = 0;
    5345         385 :     CPLErr eErr = CE_None;
    5346        1025 :     for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
    5347             :          ++iOverview)
    5348             :     {
    5349         645 :         int iSrcOverview = -1;  // -1 means the source bands.
    5350             : 
    5351             :         const int nDstTotalWidth =
    5352         645 :             papapoOverviewBands[0][iOverview]->GetXSize();
    5353             :         const int nDstTotalHeight =
    5354         645 :             papapoOverviewBands[0][iOverview]->GetYSize();
    5355             : 
    5356             :         // Compute the coordinates of the target region to refresh
    5357         645 :         constexpr double EPS = 1e-8;
    5358         645 :         const int nDstXOffStart = static_cast<int>(
    5359         645 :             static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
    5360             :             EPS);
    5361             :         const int nDstXOffEnd =
    5362        1290 :             std::min(static_cast<int>(
    5363         645 :                          std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
    5364         645 :                                        nToplevelSrcWidth * nDstTotalWidth -
    5365             :                                    EPS)),
    5366         645 :                      nDstTotalWidth);
    5367         645 :         const int nDstWidth = nDstXOffEnd - nDstXOffStart;
    5368         645 :         const int nDstYOffStart =
    5369         645 :             static_cast<int>(static_cast<double>(nSrcYOff) /
    5370         645 :                                  nToplevelSrcHeight * nDstTotalHeight +
    5371             :                              EPS);
    5372             :         const int nDstYOffEnd =
    5373        1290 :             std::min(static_cast<int>(
    5374         645 :                          std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
    5375         645 :                                        nToplevelSrcHeight * nDstTotalHeight -
    5376             :                                    EPS)),
    5377         645 :                      nDstTotalHeight);
    5378         645 :         const int nDstHeight = nDstYOffEnd - nDstYOffStart;
    5379             : 
    5380             :         // Try to use previous level of overview as the source to compute
    5381             :         // the next level.
    5382         645 :         int nSrcWidth = nToplevelSrcWidth;
    5383         645 :         int nSrcHeight = nToplevelSrcHeight;
    5384         905 :         if (iOverview > 0 &&
    5385         260 :             papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
    5386             :         {
    5387         252 :             nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
    5388         252 :             nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
    5389         252 :             iSrcOverview = iOverview - 1;
    5390             :         }
    5391             : 
    5392         645 :         const double dfXRatioDstToSrc =
    5393         645 :             static_cast<double>(nSrcWidth) / nDstTotalWidth;
    5394         645 :         const double dfYRatioDstToSrc =
    5395         645 :             static_cast<double>(nSrcHeight) / nDstTotalHeight;
    5396             : 
    5397             :         const int nOvrFactor =
    5398        1935 :             std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
    5399         645 :                                  static_cast<int>(0.5 + dfYRatioDstToSrc)));
    5400             : 
    5401         645 :         int nDstChunkXSize = 0;
    5402         645 :         int nDstChunkYSize = 0;
    5403         645 :         papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
    5404             :                                                         &nDstChunkYSize);
    5405             : 
    5406         645 :         constexpr int PIXEL_MARGIN = 2;
    5407             :         // Try to extend the chunk size so that the memory needed to acquire
    5408             :         // source pixels goes up to 10 MB.
    5409             :         // This can help for drivers that support multi-threaded reading
    5410         645 :         const int nFullResYChunk = static_cast<int>(std::min<double>(
    5411         645 :             nSrcHeight, PIXEL_MARGIN + nDstChunkYSize * dfYRatioDstToSrc));
    5412         645 :         const int nFullResYChunkQueried = static_cast<int>(std::min<int64_t>(
    5413        1290 :             nSrcHeight,
    5414        1290 :             nFullResYChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
    5415         645 :                                  nKernelRadius * nOvrFactor));
    5416         881 :         while (nDstChunkXSize < nDstWidth)
    5417             :         {
    5418         255 :             constexpr int INCREASE_FACTOR = 2;
    5419             : 
    5420         255 :             const int nFullResXChunk = static_cast<int>(std::min<double>(
    5421         510 :                 nSrcWidth, PIXEL_MARGIN + INCREASE_FACTOR * nDstChunkXSize *
    5422         255 :                                               dfXRatioDstToSrc));
    5423             : 
    5424             :             const int nFullResXChunkQueried =
    5425         255 :                 static_cast<int>(std::min<int64_t>(
    5426         510 :                     nSrcWidth,
    5427         510 :                     nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
    5428         255 :                                          nKernelRadius * nOvrFactor));
    5429             : 
    5430         255 :             if (nBands > nChunkMaxSize / nFullResXChunkQueried /
    5431         255 :                              nFullResYChunkQueried / nWrkDataTypeSize)
    5432             :             {
    5433          19 :                 break;
    5434             :             }
    5435             : 
    5436         236 :             nDstChunkXSize *= INCREASE_FACTOR;
    5437             :         }
    5438         645 :         nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
    5439             : 
    5440         645 :         const int nFullResXChunk = static_cast<int>(std::min<double>(
    5441         645 :             nSrcWidth, PIXEL_MARGIN + nDstChunkXSize * dfXRatioDstToSrc));
    5442         645 :         const int nFullResXChunkQueried = static_cast<int>(std::min<int64_t>(
    5443        1290 :             nSrcWidth,
    5444        1290 :             nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
    5445         645 :                                  nKernelRadius * nOvrFactor));
    5446             : 
    5447             :         // Make sure that the RAM requirement to acquire the source data does
    5448             :         // not exceed nChunkMaxSizeForTempFile
    5449             :         // If so, reduce the destination chunk size, generate overviews in a
    5450             :         // temporary dataset, and copy that temporary dataset over the target
    5451             :         // overview bands (to avoid issues with lossy compression)
    5452             :         const bool bOverflowFullResXChunkYChunkQueried =
    5453         645 :             nBands > std::numeric_limits<int64_t>::max() /
    5454         645 :                          nFullResXChunkQueried / nFullResYChunkQueried /
    5455         645 :                          nWrkDataTypeSize;
    5456             : 
    5457         645 :         const auto nMemRequirement =
    5458             :             bOverflowFullResXChunkYChunkQueried
    5459         645 :                 ? 0
    5460         641 :                 : static_cast<GIntBig>(nFullResXChunkQueried) *
    5461         641 :                       nFullResYChunkQueried * nBands * nWrkDataTypeSize;
    5462             :         // Use a temporary dataset with a smaller destination chunk size
    5463         645 :         const auto nOverShootFactor =
    5464             :             nMemRequirement / nChunkMaxSizeForTempFile;
    5465             : 
    5466         645 :         constexpr int MIN_OVERSHOOT_FACTOR = 4;
    5467             :         const auto nSqrtOverShootFactor = std::max<GIntBig>(
    5468        1290 :             MIN_OVERSHOOT_FACTOR, static_cast<GIntBig>(std::ceil(std::sqrt(
    5469         645 :                                       static_cast<double>(nOverShootFactor)))));
    5470         645 :         constexpr int DEFAULT_CHUNK_SIZE = 256;
    5471         645 :         constexpr int GTIFF_BLOCK_SIZE_MULTIPLE = 16;
    5472             :         const int nReducedDstChunkXSize =
    5473             :             bOverflowFullResXChunkYChunkQueried
    5474        1286 :                 ? DEFAULT_CHUNK_SIZE
    5475        1286 :                 : std::max(1, static_cast<int>(nDstChunkXSize /
    5476        1286 :                                                nSqrtOverShootFactor) &
    5477         641 :                                   ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
    5478             :         const int nReducedDstChunkYSize =
    5479             :             bOverflowFullResXChunkYChunkQueried
    5480        1286 :                 ? DEFAULT_CHUNK_SIZE
    5481        1286 :                 : std::max(1, static_cast<int>(nDstChunkYSize /
    5482        1286 :                                                nSqrtOverShootFactor) &
    5483         641 :                                   ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
    5484             : 
    5485         645 :         if (bOverflowFullResXChunkYChunkQueried ||
    5486             :             nMemRequirement > nChunkMaxSizeForTempFile)
    5487             :         {
    5488             :             const auto nDTSize =
    5489          43 :                 std::max(1, GDALGetDataTypeSizeBytes(eDataType));
    5490             :             const bool bTmpDSMemRequirementOverflow =
    5491          43 :                 nBands > std::numeric_limits<int64_t>::max() / nDstWidth /
    5492          43 :                              nDstHeight / nDTSize;
    5493          43 :             const auto nTmpDSMemRequirement =
    5494             :                 bTmpDSMemRequirementOverflow
    5495          43 :                     ? 0
    5496          41 :                     : static_cast<GIntBig>(nDstWidth) * nDstHeight * nBands *
    5497          41 :                           nDTSize;
    5498             : 
    5499             :             // make sure that one band buffer doesn't overflow size_t
    5500             :             const bool bChunkSizeOverflow =
    5501          43 :                 static_cast<size_t>(nDTSize) >
    5502          43 :                 std::numeric_limits<size_t>::max() / nDstWidth / nDstHeight;
    5503          43 :             const size_t nChunkSize =
    5504             :                 bChunkSizeOverflow
    5505          43 :                     ? 0
    5506          41 :                     : static_cast<size_t>(nDstWidth) * nDstHeight * nDTSize;
    5507             : 
    5508             :             const auto CreateVRT =
    5509          41 :                 [nBands, nSrcWidth, nSrcHeight, nDstTotalWidth, nDstTotalHeight,
    5510             :                  pszResampling, eWrkDataType, papoSrcBands, papapoOverviewBands,
    5511             :                  iSrcOverview, &abHasNoData,
    5512      393585 :                  &adfNoDataValue](int nVRTBlockXSize, int nVRTBlockYSize)
    5513             :             {
    5514             :                 auto poVRTDS = std::make_unique<VRTDataset>(
    5515          41 :                     nDstTotalWidth, nDstTotalHeight, nVRTBlockXSize,
    5516          41 :                     nVRTBlockYSize);
    5517             : 
    5518       65620 :                 for (int iBand = 0; iBand < nBands; ++iBand)
    5519             :                 {
    5520      131158 :                     auto poVRTSrc = std::make_unique<VRTSimpleSource>();
    5521       65579 :                     poVRTSrc->SetResampling(pszResampling);
    5522       65579 :                     poVRTDS->AddBand(eWrkDataType);
    5523             :                     auto poVRTBand = static_cast<VRTSourcedRasterBand *>(
    5524       65579 :                         poVRTDS->GetRasterBand(iBand + 1));
    5525             : 
    5526       65579 :                     auto poSrcBand = papoSrcBands[iBand];
    5527       65579 :                     if (iSrcOverview != -1)
    5528          24 :                         poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
    5529       65579 :                     poVRTBand->ConfigureSource(
    5530             :                         poVRTSrc.get(), poSrcBand, false, 0, 0, nSrcWidth,
    5531             :                         nSrcHeight, 0, 0, nDstTotalWidth, nDstTotalHeight);
    5532             :                     // Add the source to the band
    5533       65579 :                     poVRTBand->AddSource(poVRTSrc.release());
    5534       65579 :                     if (abHasNoData[iBand])
    5535           3 :                         poVRTBand->SetNoDataValue(adfNoDataValue[iBand]);
    5536             :                 }
    5537             : 
    5538          42 :                 if (papoSrcBands[0]->GetMaskFlags() == GMF_PER_DATASET &&
    5539           1 :                     poVRTDS->CreateMaskBand(GMF_PER_DATASET) == CE_None)
    5540             :                 {
    5541             :                     VRTSourcedRasterBand *poMaskVRTBand =
    5542           1 :                         cpl::down_cast<VRTSourcedRasterBand *>(
    5543           1 :                             poVRTDS->GetRasterBand(1)->GetMaskBand());
    5544           1 :                     auto poSrcBand = papoSrcBands[0];
    5545           1 :                     if (iSrcOverview != -1)
    5546           0 :                         poSrcBand = papapoOverviewBands[0][iSrcOverview];
    5547           1 :                     poMaskVRTBand->AddMaskBandSource(
    5548           1 :                         poSrcBand->GetMaskBand(), 0, 0, nSrcWidth, nSrcHeight,
    5549             :                         0, 0, nDstTotalWidth, nDstTotalHeight);
    5550             :                 }
    5551             : 
    5552          41 :                 return poVRTDS;
    5553          43 :             };
    5554             : 
    5555             :             // If the overview accommodates chunking, do so and recurse
    5556             :             // to avoid generating full size temporary files
    5557          43 :             if (!bOverflowFullResXChunkYChunkQueried &&
    5558          39 :                 !bTmpDSMemRequirementOverflow && !bChunkSizeOverflow &&
    5559          39 :                 (nDstChunkXSize < nDstWidth || nDstChunkYSize < nDstHeight))
    5560             :             {
    5561             :                 // Create a VRT with the smaller chunk to do the scaling
    5562             :                 auto poVRTDS =
    5563          13 :                     CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
    5564             : 
    5565          13 :                 std::vector<GDALRasterBand *> apoVRTBand(nBands);
    5566          13 :                 std::vector<GDALRasterBand *> apoDstBand(nBands);
    5567       65560 :                 for (int iBand = 0; iBand < nBands; ++iBand)
    5568             :                 {
    5569       65547 :                     apoDstBand[iBand] = papapoOverviewBands[iBand][iOverview];
    5570       65547 :                     apoVRTBand[iBand] = poVRTDS->GetRasterBand(iBand + 1);
    5571             :                 }
    5572             : 
    5573             :                 // Use a flag to avoid reading from the overview being built
    5574             :                 GDALRasterIOExtraArg sExtraArg;
    5575          13 :                 INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    5576          13 :                 if (iSrcOverview == -1)
    5577          13 :                     sExtraArg.bUseOnlyThisScale = true;
    5578             : 
    5579             :                 // A single band buffer for data transfer to the overview
    5580          13 :                 std::vector<GByte> abyChunk;
    5581             :                 try
    5582             :                 {
    5583          13 :                     abyChunk.resize(nChunkSize);
    5584             :                 }
    5585           0 :                 catch (const std::exception &)
    5586             :                 {
    5587           0 :                     CPLError(CE_Failure, CPLE_OutOfMemory,
    5588             :                              "Out of memory allocating temporary buffer");
    5589           0 :                     return CE_Failure;
    5590             :                 }
    5591             : 
    5592             :                 // Loop over output height, in chunks
    5593          13 :                 for (int nDstYOff = nDstYOffStart;
    5594          38 :                      nDstYOff < nDstYOffEnd && eErr == CE_None;
    5595             :                      /* */)
    5596             :                 {
    5597             :                     const int nDstYCount =
    5598          25 :                         std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
    5599             :                     // Loop over output width, in output chunks
    5600          25 :                     for (int nDstXOff = nDstXOffStart;
    5601          74 :                          nDstXOff < nDstXOffEnd && eErr == CE_None;
    5602             :                          /* */)
    5603             :                     {
    5604             :                         const int nDstXCount =
    5605          49 :                             std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
    5606             :                         // Read and transfer the chunk to the overview
    5607          98 :                         for (int iBand = 0; iBand < nBands && eErr == CE_None;
    5608             :                              ++iBand)
    5609             :                         {
    5610          98 :                             eErr = apoVRTBand[iBand]->RasterIO(
    5611             :                                 GF_Read, nDstXOff, nDstYOff, nDstXCount,
    5612          49 :                                 nDstYCount, abyChunk.data(), nDstXCount,
    5613             :                                 nDstYCount, eDataType, 0, 0, &sExtraArg);
    5614          49 :                             if (eErr == CE_None)
    5615             :                             {
    5616          96 :                                 eErr = apoDstBand[iBand]->RasterIO(
    5617             :                                     GF_Write, nDstXOff, nDstYOff, nDstXCount,
    5618          48 :                                     nDstYCount, abyChunk.data(), nDstXCount,
    5619             :                                     nDstYCount, eDataType, 0, 0, nullptr);
    5620             :                             }
    5621             :                         }
    5622             : 
    5623          49 :                         dfCurPixelCount +=
    5624          49 :                             static_cast<double>(nDstXCount) * nDstYCount;
    5625             : 
    5626          49 :                         nDstXOff += nDstXCount;
    5627             :                     }  // width
    5628             : 
    5629          25 :                     if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount,
    5630             :                                      nullptr, pProgressData))
    5631             :                     {
    5632           0 :                         CPLError(CE_Failure, CPLE_UserInterrupt,
    5633             :                                  "User terminated");
    5634           0 :                         eErr = CE_Failure;
    5635             :                     }
    5636             : 
    5637          25 :                     nDstYOff += nDstYCount;
    5638             :                 }  // height
    5639             : 
    5640          13 :                 if (CE_None != eErr)
    5641             :                 {
    5642           1 :                     CPLError(CE_Failure, CPLE_AppDefined,
    5643             :                              "Error while writing overview");
    5644           1 :                     return CE_Failure;
    5645             :                 }
    5646             : 
    5647          12 :                 pfnProgress(1.0, nullptr, pProgressData);
    5648             :                 // Flush the overviews we just generated
    5649          24 :                 for (int iBand = 0; iBand < nBands; ++iBand)
    5650          12 :                     apoDstBand[iBand]->FlushCache(false);
    5651             : 
    5652          12 :                 continue;  // Next overview
    5653             :             }              // chunking via temporary dataset
    5654             : 
    5655           0 :             std::unique_ptr<GDALDataset> poTmpDS;
    5656             :             // Config option mostly/only for autotest purposes
    5657             :             const char *pszGDAL_OVR_TEMP_DRIVER =
    5658          30 :                 CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
    5659          30 :             if ((!bTmpDSMemRequirementOverflow &&
    5660           4 :                  nTmpDSMemRequirement <= nChunkMaxSizeForTempFile &&
    5661           4 :                  !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
    5662          26 :                 EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
    5663             :             {
    5664          10 :                 auto poTmpDrv = GetGDALDriverManager()->GetDriverByName("MEM");
    5665          10 :                 if (!poTmpDrv)
    5666             :                 {
    5667           0 :                     eErr = CE_Failure;
    5668           0 :                     break;
    5669             :                 }
    5670          10 :                 poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
    5671             :                                                nDstTotalHeight, nBands,
    5672          10 :                                                eDataType, nullptr));
    5673             :             }
    5674             :             else
    5675             :             {
    5676             :                 // Create a temporary file for the overview
    5677             :                 auto poTmpDrv =
    5678          20 :                     GetGDALDriverManager()->GetDriverByName("GTiff");
    5679          20 :                 if (!poTmpDrv)
    5680             :                 {
    5681           0 :                     eErr = CE_Failure;
    5682           0 :                     break;
    5683             :                 }
    5684          40 :                 std::string osTmpFilename;
    5685          20 :                 auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
    5686          20 :                 if (poDstDS)
    5687             :                 {
    5688          20 :                     osTmpFilename = poDstDS->GetDescription();
    5689             :                     VSIStatBufL sStatBuf;
    5690          20 :                     if (!osTmpFilename.empty() &&
    5691           0 :                         VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
    5692           0 :                         osTmpFilename += "_tmp_ovr.tif";
    5693             :                 }
    5694          20 :                 if (osTmpFilename.empty())
    5695             :                 {
    5696          20 :                     osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
    5697          20 :                     osTmpFilename += ".tif";
    5698             :                 }
    5699          20 :                 CPLDebug("GDAL", "Creating temporary file %s of %d x %d x %d",
    5700             :                          osTmpFilename.c_str(), nDstWidth, nDstHeight, nBands);
    5701          40 :                 CPLStringList aosCO;
    5702          20 :                 if (0 == ((nReducedDstChunkXSize % GTIFF_BLOCK_SIZE_MULTIPLE) |
    5703          20 :                           (nReducedDstChunkYSize % GTIFF_BLOCK_SIZE_MULTIPLE)))
    5704             :                 {
    5705          14 :                     aosCO.SetNameValue("TILED", "YES");
    5706             :                     aosCO.SetNameValue("BLOCKXSIZE",
    5707          14 :                                        CPLSPrintf("%d", nReducedDstChunkXSize));
    5708             :                     aosCO.SetNameValue("BLOCKYSIZE",
    5709          14 :                                        CPLSPrintf("%d", nReducedDstChunkYSize));
    5710             :                 }
    5711          20 :                 if (const char *pszCOList =
    5712          20 :                         poTmpDrv->GetMetadataItem(GDAL_DMD_CREATIONOPTIONLIST))
    5713             :                 {
    5714             :                     aosCO.SetNameValue(
    5715          20 :                         "COMPRESS", strstr(pszCOList, "ZSTD") ? "ZSTD" : "LZW");
    5716             :                 }
    5717          20 :                 poTmpDS.reset(poTmpDrv->Create(osTmpFilename.c_str(), nDstWidth,
    5718             :                                                nDstHeight, nBands, eDataType,
    5719          20 :                                                aosCO.List()));
    5720          20 :                 if (poTmpDS)
    5721             :                 {
    5722          18 :                     poTmpDS->MarkSuppressOnClose();
    5723          18 :                     VSIUnlink(osTmpFilename.c_str());
    5724             :                 }
    5725             :             }
    5726          30 :             if (!poTmpDS)
    5727             :             {
    5728           2 :                 eErr = CE_Failure;
    5729           2 :                 break;
    5730             :             }
    5731             : 
    5732             :             // Create a full size VRT to do the resampling without edge effects
    5733             :             auto poVRTDS =
    5734          28 :                 CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
    5735             : 
    5736             :             // Allocate a band buffer with the overview chunk size
    5737             :             std::unique_ptr<void, VSIFreeReleaser> pDstBuffer(
    5738             :                 VSI_MALLOC3_VERBOSE(size_t(nWrkDataTypeSize), nDstChunkXSize,
    5739          28 :                                     nDstChunkYSize));
    5740          28 :             if (pDstBuffer == nullptr)
    5741             :             {
    5742           0 :                 eErr = CE_Failure;
    5743           0 :                 break;
    5744             :             }
    5745             : 
    5746             :             // Use a flag to avoid reading the overview being built
    5747             :             GDALRasterIOExtraArg sExtraArg;
    5748          28 :             INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    5749          28 :             if (iSrcOverview == -1)
    5750           4 :                 sExtraArg.bUseOnlyThisScale = true;
    5751             : 
    5752             :             // Scale and copy data from the VRT to the temp file
    5753          28 :             for (int nDstYOff = nDstYOffStart;
    5754         914 :                  nDstYOff < nDstYOffEnd && eErr == CE_None;
    5755             :                  /* */)
    5756             :             {
    5757             :                 const int nDstYCount =
    5758         886 :                     std::min(nReducedDstChunkYSize, nDstYOffEnd - nDstYOff);
    5759         886 :                 for (int nDstXOff = nDstXOffStart;
    5760      201218 :                      nDstXOff < nDstXOffEnd && eErr == CE_None;
    5761             :                      /* */)
    5762             :                 {
    5763             :                     const int nDstXCount =
    5764      200332 :                         std::min(nReducedDstChunkXSize, nDstXOffEnd - nDstXOff);
    5765      400668 :                     for (int iBand = 0; iBand < nBands && eErr == CE_None;
    5766             :                          ++iBand)
    5767             :                     {
    5768      200336 :                         auto poSrcBand = poVRTDS->GetRasterBand(iBand + 1);
    5769      200336 :                         eErr = poSrcBand->RasterIO(
    5770             :                             GF_Read, nDstXOff, nDstYOff, nDstXCount, nDstYCount,
    5771             :                             pDstBuffer.get(), nDstXCount, nDstYCount,
    5772             :                             eWrkDataType, 0, 0, &sExtraArg);
    5773      200336 :                         if (eErr == CE_None)
    5774             :                         {
    5775             :                             // Write to the temporary dataset, shifted
    5776      200334 :                             auto poOvrBand = poTmpDS->GetRasterBand(iBand + 1);
    5777      200334 :                             eErr = poOvrBand->RasterIO(
    5778             :                                 GF_Write, nDstXOff - nDstXOffStart,
    5779             :                                 nDstYOff - nDstYOffStart, nDstXCount,
    5780             :                                 nDstYCount, pDstBuffer.get(), nDstXCount,
    5781             :                                 nDstYCount, eWrkDataType, 0, 0, nullptr);
    5782             :                         }
    5783             :                     }
    5784      200332 :                     nDstXOff += nDstXCount;
    5785             :                 }
    5786         886 :                 nDstYOff += nDstYCount;
    5787             :             }
    5788             : 
    5789             :             // Copy from the temporary to the overview
    5790          28 :             for (int nDstYOff = nDstYOffStart;
    5791          54 :                  nDstYOff < nDstYOffEnd && eErr == CE_None;
    5792             :                  /* */)
    5793             :             {
    5794             :                 const int nDstYCount =
    5795          26 :                     std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
    5796          26 :                 for (int nDstXOff = nDstXOffStart;
    5797          52 :                      nDstXOff < nDstXOffEnd && eErr == CE_None;
    5798             :                      /* */)
    5799             :                 {
    5800             :                     const int nDstXCount =
    5801          26 :                         std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
    5802          56 :                     for (int iBand = 0; iBand < nBands && eErr == CE_None;
    5803             :                          ++iBand)
    5804             :                     {
    5805          30 :                         auto poSrcBand = poTmpDS->GetRasterBand(iBand + 1);
    5806          30 :                         eErr = poSrcBand->RasterIO(
    5807             :                             GF_Read, nDstXOff - nDstXOffStart,
    5808             :                             nDstYOff - nDstYOffStart, nDstXCount, nDstYCount,
    5809             :                             pDstBuffer.get(), nDstXCount, nDstYCount,
    5810             :                             eWrkDataType, 0, 0, nullptr);
    5811          30 :                         if (eErr == CE_None)
    5812             :                         {
    5813             :                             // Write to the destination overview bands
    5814          30 :                             auto poOvrBand =
    5815          30 :                                 papapoOverviewBands[iBand][iOverview];
    5816          30 :                             eErr = poOvrBand->RasterIO(
    5817             :                                 GF_Write, nDstXOff, nDstYOff, nDstXCount,
    5818             :                                 nDstYCount, pDstBuffer.get(), nDstXCount,
    5819             :                                 nDstYCount, eWrkDataType, 0, 0, nullptr);
    5820             :                         }
    5821             :                     }
    5822          26 :                     nDstXOff += nDstXCount;
    5823             :                 }
    5824          26 :                 nDstYOff += nDstYCount;
    5825             :             }
    5826             : 
    5827          28 :             if (eErr != CE_None)
    5828             :             {
    5829           2 :                 CPLError(CE_Failure, CPLE_AppDefined,
    5830             :                          "Failed to write overview %d", iOverview);
    5831           2 :                 return eErr;
    5832             :             }
    5833             : 
    5834             :             // Flush the data to overviews.
    5835          56 :             for (int iBand = 0; iBand < nBands; ++iBand)
    5836          30 :                 papapoOverviewBands[iBand][iOverview]->FlushCache(false);
    5837             : 
    5838          26 :             continue;
    5839             :         }
    5840             : 
    5841             :         // Structure describing a resampling job
                               //
                               // One OvrJob bundles everything needed to run a single call of
                               // pfnResampleFn and to write its result afterwards: the inputs
                               // handed to the resampling function, the outputs it fills in,
                               // the destination band, and a "finished" flag that another
                               // thread can poll (IsFinished) or block on (WaitFinished).
    5842             :         struct OvrJob
    5843             :         {
    5844             :             // Buffers to free when job is finished
                                   // (PointerHolder is declared outside this section; presumably
                                   // it releases the wrapped pointer on destruction - confirm.)
    5845             :             std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
    5846             :             std::unique_ptr<PointerHolder> oSrcBufferHolder{};
    5847             :             std::unique_ptr<PointerHolder> oDstBufferHolder{};
    5848             : 
                                   // Band that WriteJobData writes pDstBuffer into (not owned:
                                   // it is a raw pointer and nothing in this struct deletes it).
    5849             :             GDALRasterBand *poDstBand = nullptr;
    5850             : 
    5851             :             // Input parameters of pfnResampleFn
                                   // args also supplies the destination window
                                   // (nDstXOff/nDstYOff/nDstXOff2/nDstYOff2) used by WriteJobData.
    5852             :             GDALResampleFunction pfnResampleFn = nullptr;
    5853             :             GDALOverviewResampleArgs args{};
    5854             :             const void *pChunk = nullptr;
    5855             : 
    5856             :             // Output values of resampling function
                                   // eErr starts as CE_Failure and is overwritten with the
                                   // return value of pfnResampleFn by JobResampleFunc, so a job
                                   // that never ran is reported as failed.
    5857             :             CPLErr eErr = CE_Failure;
    5858             :             void *pDstBuffer = nullptr;
    5859             :             GDALDataType eDstBufferDataType = GDT_Unknown;
    5860             : 
                                   // Mark the job as finished and wake one thread blocked in
                                   // WaitFinished(). The flag is set while holding the mutex.
    5861        3310 :             void NotifyFinished()
    5862             :             {
    5863        6620 :                 std::lock_guard guard(mutex);
    5864        3310 :                 bFinished = true;
    5865        3310 :                 cv.notify_one();
    5866        3310 :             }
    5867             : 
                                   // Non-blocking query: returns the current value of the
                                   // finished flag, read under the mutex.
    5868           2 :             bool IsFinished()
    5869             :             {
    5870           2 :                 std::lock_guard guard(mutex);
    5871           4 :                 return bFinished;
    5872             :             }
    5873             : 
                                   // Block the calling thread until NotifyFinished() has been
                                   // called. Returns immediately if the job is already finished.
    5874          14 :             void WaitFinished()
    5875             :             {
    5876          28 :                 std::unique_lock oGuard(mutex);
                                       // Re-check the flag after every wake-up rather than
                                       // trusting a single wait() return.
    5877          18 :                 while (!bFinished)
    5878             :                 {
    5879           4 :                     cv.wait(oGuard);
    5880             :                 }
    5881          14 :             }
    5882             : 
    5883             :           private:
    5884             :             // Synchronization
                                   // bFinished is only read or written with mutex held; cv is
                                   // signalled by NotifyFinished() and waited on by WaitFinished().
    5885             :             bool bFinished = false;
    5886             :             std::mutex mutex{};
    5887             :             std::condition_variable cv{};
    5888             :         };
    5889             : 
    5890             :         // Thread function to resample
                               //
                               // pData must point to an OvrJob (it is cast without checking).
                               // Runs the job's pfnResampleFn on (args, pChunk) and stores the
                               // results back into the job: the return code in eErr, the output
                               // buffer in pDstBuffer and its type in eDstBufferDataType.
                               // NOTE(review): the void* signature presumably matches a generic
                               // worker callback type; the call site is outside this section.
    5891        3310 :         const auto JobResampleFunc = [](void *pData)
    5892             :         {
    5893        3310 :             OvrJob *poJob = static_cast<OvrJob *>(pData);
    5894             : 
    5895        3310 :             poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
    5896             :                                                &(poJob->pDstBuffer),
    5897             :                                                &(poJob->eDstBufferDataType));
    5898             : 
                                   // Wrap the output buffer in a holder stored on the job, whatever
                                   // eErr is. Presumably this ties the buffer's release to the
                                   // job's destruction - confirm against PointerHolder.
    5899        3310 :             poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
    5900             : 
                                   // Signal completion last, once eErr, pDstBuffer and
                                   // eDstBufferDataType have all been written.
    5901        3310 :             poJob->NotifyFinished();
    5902        3310 :         };
    5903             : 
    5904             :         // Function to write resample data to target band
                               //
                               // Writes poJob->pDstBuffer into poJob->poDstBand with one RasterIO
                               // call and returns that call's CPLErr. The destination window
                               // starts at (args.nDstXOff, args.nDstYOff) and is
                               // (nDstXOff2 - nDstXOff) x (nDstYOff2 - nDstYOff) pixels. The
                               // buffer is declared with the same dimensions as the window and
                               // with type eDstBufferDataType.
                               // This function does not look at poJob->eErr: the callers in this
                               // section only invoke it after checking that eErr is CE_None.
    5905        3310 :         const auto WriteJobData = [](const OvrJob *poJob)
    5906             :         {
    5907        6620 :             return poJob->poDstBand->RasterIO(
    5908        3310 :                 GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
    5909        3310 :                 poJob->args.nDstXOff2 - poJob->args.nDstXOff,
    5910        3310 :                 poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
    5911        3310 :                 poJob->args.nDstXOff2 - poJob->args.nDstXOff,
    5912        3310 :                 poJob->args.nDstYOff2 - poJob->args.nDstYOff,
    5913        3310 :                 poJob->eDstBufferDataType, 0, 0, nullptr);
    5914             :         };
    5915             : 
    5916             :         // Wait for completion of oldest job and serialize it
                               //
                               // Precondition: jobList is not empty (front() is called
                               // unconditionally).
                               // Blocks until the job at the front of jobList has finished. If
                               // its resampling succeeded, writes its output with WriteJobData.
                               // The job is then removed from the list whether or not it
                               // succeeded.
                               // Returns the job's resampling error if it is not CE_None,
                               // otherwise the result of the write.
    5917             :         const auto WaitAndFinalizeOldestJob =
    5918          14 :             [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
    5919             :         {
    5920          14 :             auto poOldestJob = jobList.front().get();
    5921          14 :             poOldestJob->WaitFinished();
    5922          14 :             CPLErr l_eErr = poOldestJob->eErr;
    5923          14 :             if (l_eErr == CE_None)
    5924             :             {
    5925          14 :                 l_eErr = WriteJobData(poOldestJob);
    5926             :             }
    5927             : 
                                   // pop_front() destroys the unique_ptr and therefore the OvrJob;
                                   // poOldestJob must not be used after this point.
    5928          14 :             jobList.pop_front();
    5929          14 :             return l_eErr;
    5930             :         };
    5931             : 
    5932             :         // Queue of jobs
    5933        1204 :         std::list<std::unique_ptr<OvrJob>> jobList;
    5934             : 
    5935        1204 :         std::vector<std::unique_ptr<void, VSIFreeReleaser>> apaChunk(nBands);
    5936             :         std::vector<std::unique_ptr<GByte, VSIFreeReleaser>>
    5937        1204 :             apabyChunkNoDataMask(nBands);
    5938             : 
    5939             :         // Iterate on destination overview, block by block.
    5940         602 :         for (int nDstYOff = nDstYOffStart;
    5941        2111 :              nDstYOff < nDstYOffEnd && eErr == CE_None;
    5942        1509 :              nDstYOff += nDstChunkYSize)
    5943             :         {
    5944             :             int nDstYCount;
    5945        1509 :             if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
    5946        1099 :                 nDstYCount = nDstChunkYSize;
    5947             :             else
    5948         410 :                 nDstYCount = nDstYOffEnd - nDstYOff;
    5949             : 
    5950        1509 :             int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    5951        1509 :             int nChunkYOff2 = static_cast<int>(
    5952        1509 :                 ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
    5953        1509 :             if (nChunkYOff2 > nSrcHeight ||
    5954        1509 :                 nDstYOff + nDstYCount == nDstTotalHeight)
    5955         595 :                 nChunkYOff2 = nSrcHeight;
    5956        1509 :             int nYCount = nChunkYOff2 - nChunkYOff;
    5957        1509 :             CPLAssert(nYCount <= nFullResYChunk);
    5958             : 
    5959        1509 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
    5960        1509 :             int nChunkYSizeQueried =
    5961        1509 :                 nYCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
    5962        1509 :             if (nChunkYOffQueried < 0)
    5963             :             {
    5964         148 :                 nChunkYSizeQueried += nChunkYOffQueried;
    5965         148 :                 nChunkYOffQueried = 0;
    5966             :             }
    5967        1509 :             if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
    5968         147 :                 nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
    5969        1509 :             CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
    5970             : 
    5971        1509 :             if (!pfnProgress(std::min(1.0, dfCurPixelCount / dfTotalPixelCount),
    5972             :                              nullptr, pProgressData))
    5973             :             {
    5974           1 :                 CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    5975           1 :                 eErr = CE_Failure;
    5976             :             }
    5977             : 
    5978             :             // Iterate on destination overview, block by block.
    5979        1509 :             for (int nDstXOff = nDstXOffStart;
    5980        3057 :                  nDstXOff < nDstXOffEnd && eErr == CE_None;
    5981        1548 :                  nDstXOff += nDstChunkXSize)
    5982             :             {
    5983        1548 :                 int nDstXCount = 0;
    5984        1548 :                 if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
    5985        1531 :                     nDstXCount = nDstChunkXSize;
    5986             :                 else
    5987          17 :                     nDstXCount = nDstXOffEnd - nDstXOff;
    5988             : 
    5989        1548 :                 dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
    5990             : 
    5991        1548 :                 int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    5992        1548 :                 int nChunkXOff2 = static_cast<int>(
    5993        1548 :                     ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    5994        1548 :                 if (nChunkXOff2 > nSrcWidth ||
    5995        1548 :                     nDstXOff + nDstXCount == nDstTotalWidth)
    5996        1473 :                     nChunkXOff2 = nSrcWidth;
    5997        1548 :                 const int nXCount = nChunkXOff2 - nChunkXOff;
    5998        1548 :                 CPLAssert(nXCount <= nFullResXChunk);
    5999             : 
    6000        1548 :                 int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
    6001        1548 :                 int nChunkXSizeQueried =
    6002        1548 :                     nXCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
    6003        1548 :                 if (nChunkXOffQueried < 0)
    6004             :                 {
    6005         208 :                     nChunkXSizeQueried += nChunkXOffQueried;
    6006         208 :                     nChunkXOffQueried = 0;
    6007             :                 }
    6008        1548 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
    6009         217 :                     nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
    6010        1548 :                 CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
    6011             : #if DEBUG_VERBOSE
    6012             :                 CPLDebug("GDAL",
    6013             :                          "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
    6014             :                          nChunkXOffQueried, nChunkYOffQueried,
    6015             :                          nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
    6016             :                          nDstYOff, nDstXCount, nDstYCount);
    6017             : #endif
    6018             : 
    6019             :                 // Avoid accumulating too many tasks and exhaust RAM
    6020             : 
    6021             :                 // Try to complete already finished jobs
    6022        1550 :                 while (eErr == CE_None && !jobList.empty())
    6023             :                 {
    6024           2 :                     auto poOldestJob = jobList.front().get();
    6025           2 :                     if (!poOldestJob->IsFinished())
    6026           0 :                         break;
    6027           2 :                     eErr = poOldestJob->eErr;
    6028           2 :                     if (eErr == CE_None)
    6029             :                     {
    6030           2 :                         eErr = WriteJobData(poOldestJob);
    6031             :                     }
    6032             : 
    6033           2 :                     jobList.pop_front();
    6034             :                 }
    6035             : 
    6036             :                 // And in case we have saturated the number of threads,
    6037             :                 // wait for completion of tasks to go below the threshold.
    6038        3096 :                 while (eErr == CE_None &&
    6039        1548 :                        jobList.size() >= static_cast<size_t>(nThreads))
    6040             :                 {
    6041           0 :                     eErr = WaitAndFinalizeOldestJob(jobList);
    6042             :                 }
    6043             : 
    6044             :                 // Read the source buffers for all the bands.
    6045        4859 :                 for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
    6046             :                 {
    6047             :                     // (Re)allocate buffers if needed
    6048        3311 :                     if (apaChunk[iBand] == nullptr)
    6049             :                     {
    6050        1179 :                         apaChunk[iBand].reset(VSI_MALLOC3_VERBOSE(
    6051             :                             nFullResXChunkQueried, nFullResYChunkQueried,
    6052             :                             nWrkDataTypeSize));
    6053        1179 :                         if (apaChunk[iBand] == nullptr)
    6054             :                         {
    6055           0 :                             eErr = CE_Failure;
    6056             :                         }
    6057             :                     }
    6058        3652 :                     if (bUseNoDataMask &&
    6059         341 :                         apabyChunkNoDataMask[iBand] == nullptr)
    6060             :                     {
    6061         282 :                         apabyChunkNoDataMask[iBand].reset(
    6062         282 :                             static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    6063             :                                 nFullResXChunkQueried, nFullResYChunkQueried)));
    6064         282 :                         if (apabyChunkNoDataMask[iBand] == nullptr)
    6065             :                         {
    6066           0 :                             eErr = CE_Failure;
    6067             :                         }
    6068             :                     }
    6069             : 
    6070        3311 :                     if (eErr == CE_None)
    6071             :                     {
    6072        3311 :                         GDALRasterBand *poSrcBand = nullptr;
    6073        3311 :                         if (iSrcOverview == -1)
    6074        2409 :                             poSrcBand = papoSrcBands[iBand];
    6075             :                         else
    6076         902 :                             poSrcBand =
    6077         902 :                                 papapoOverviewBands[iBand][iSrcOverview];
    6078        3311 :                         eErr = poSrcBand->RasterIO(
    6079             :                             GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    6080             :                             nChunkXSizeQueried, nChunkYSizeQueried,
    6081        3311 :                             apaChunk[iBand].get(), nChunkXSizeQueried,
    6082             :                             nChunkYSizeQueried, eWrkDataType, 0, 0, nullptr);
    6083             : 
    6084        3311 :                         if (bUseNoDataMask && eErr == CE_None)
    6085             :                         {
    6086         341 :                             auto poMaskBand = poSrcBand->IsMaskBand()
    6087         341 :                                                   ? poSrcBand
    6088         262 :                                                   : poSrcBand->GetMaskBand();
    6089         341 :                             eErr = poMaskBand->RasterIO(
    6090             :                                 GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    6091             :                                 nChunkXSizeQueried, nChunkYSizeQueried,
    6092         341 :                                 apabyChunkNoDataMask[iBand].get(),
    6093             :                                 nChunkXSizeQueried, nChunkYSizeQueried,
    6094             :                                 GDT_Byte, 0, 0, nullptr);
    6095             :                         }
    6096             :                     }
    6097             :                 }
    6098             : 
    6099             :                 // Compute the resulting overview block.
    6100        4858 :                 for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
    6101             :                 {
    6102        6620 :                     auto poJob = std::make_unique<OvrJob>();
    6103        3310 :                     poJob->pfnResampleFn = pfnResampleFn;
    6104        3310 :                     poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
    6105        6620 :                     poJob->args.eOvrDataType =
    6106        3310 :                         poJob->poDstBand->GetRasterDataType();
    6107        3310 :                     poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
    6108        3310 :                     poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
    6109        3310 :                     const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
    6110        3310 :                         "NBITS", "IMAGE_STRUCTURE");
    6111        3310 :                     poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    6112        3310 :                     poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    6113        3310 :                     poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    6114        3310 :                     poJob->args.eWrkDataType = eWrkDataType;
    6115        3310 :                     poJob->pChunk = apaChunk[iBand].get();
    6116        3310 :                     poJob->args.pabyChunkNodataMask =
    6117        3310 :                         apabyChunkNoDataMask[iBand].get();
    6118        3310 :                     poJob->args.nChunkXOff = nChunkXOffQueried;
    6119        3310 :                     poJob->args.nChunkXSize = nChunkXSizeQueried;
    6120        3310 :                     poJob->args.nChunkYOff = nChunkYOffQueried;
    6121        3310 :                     poJob->args.nChunkYSize = nChunkYSizeQueried;
    6122        3310 :                     poJob->args.nDstXOff = nDstXOff;
    6123        3310 :                     poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
    6124        3310 :                     poJob->args.nDstYOff = nDstYOff;
    6125        3310 :                     poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
    6126        3310 :                     poJob->args.pszResampling = pszResampling;
    6127        3310 :                     poJob->args.bHasNoData = abHasNoData[iBand];
    6128        3310 :                     poJob->args.dfNoDataValue = adfNoDataValue[iBand];
    6129        3310 :                     poJob->args.eSrcDataType = eDataType;
    6130        3310 :                     poJob->args.bPropagateNoData = bPropagateNoData;
    6131             : 
    6132        3310 :                     if (poJobQueue)
    6133             :                     {
    6134          32 :                         poJob->oSrcMaskBufferHolder.reset(new PointerHolder(
    6135          16 :                             apabyChunkNoDataMask[iBand].release()));
    6136             : 
    6137          32 :                         poJob->oSrcBufferHolder.reset(
    6138          16 :                             new PointerHolder(apaChunk[iBand].release()));
    6139             : 
    6140          16 :                         poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
    6141          16 :                         jobList.emplace_back(std::move(poJob));
    6142             :                     }
    6143             :                     else
    6144             :                     {
    6145        3294 :                         JobResampleFunc(poJob.get());
    6146        3294 :                         eErr = poJob->eErr;
    6147        3294 :                         if (eErr == CE_None)
    6148             :                         {
    6149        3294 :                             eErr = WriteJobData(poJob.get());
    6150             :                         }
    6151             :                     }
    6152             :                 }
    6153             :             }
    6154             :         }
    6155             : 
    6156             :         // Wait for all pending jobs to complete
    6157         616 :         while (!jobList.empty())
    6158             :         {
    6159          14 :             const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
    6160          14 :             if (l_eErr != CE_None && eErr == CE_None)
    6161           0 :                 eErr = l_eErr;
    6162             :         }
    6163             : 
    6164             :         // Flush the data to overviews.
    6165        1779 :         for (int iBand = 0; iBand < nBands; ++iBand)
    6166             :         {
    6167        1177 :             if (papapoOverviewBands[iBand][iOverview]->FlushCache(false) !=
    6168             :                 CE_None)
    6169           0 :                 eErr = CE_Failure;
    6170             :         }
    6171             :     }
    6172             : 
    6173         382 :     if (eErr == CE_None)
    6174         378 :         pfnProgress(1.0, nullptr, pProgressData);
    6175             : 
    6176         382 :     return eErr;
    6177             : }
    6178             : 
    6179             : /************************************************************************/
    6180             : /*            GDALRegenerateOverviewsMultiBand()                        */
    6181             : /************************************************************************/
    6182             : 
    6183             : /**
    6184             :  * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
    6185             :  * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
    6186             :  *
    6187             :  * This function will generate one or more overview images from a base
    6188             :  * image using the requested downsampling algorithm.  Its primary use
    6189             :  * is for generating overviews via GDALDataset::BuildOverviews(), but it
    6190             :  * can also be used to generate downsampled images in one file from another
    6191             :  * outside the overview architecture.
    6192             :  *
    6193             :  * The output bands need to exist in advance and share the same characteristics
    6194             :  * (type, dimensions)
    6195             :  *
    6196             :  * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
    6197             :  * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
    6198             :  *
    6199             :  * It does not support color tables or complex data types.
    6200             :  *
    6201             :  * The pseudo-algorithm used by the function is :
    6202             :  *    for each overview
    6203             :  *       iterate on lines of the source by a step of deltay
    6204             :  *           iterate on columns of the source  by a step of deltax
    6205             :  *               read the source data of size deltax * deltay for all the bands
    6206             :  *               generate the corresponding overview block for all the bands
    6207             :  *
    6208             :  * This function will properly honour NODATA_VALUES tuples (special dataset
    6209             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    6210             :  * considered as the nodata value and not each value of the triplet
    6211             :  * independently per band.
    6212             :  *
    6213             :  * The GDAL_NUM_THREADS configuration option can be set
    6214             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    6215             :  * overview computation.
    6216             :  *
    6217             :  * @param apoSrcBands the list of source bands to downsample
    6218             :  * @param aapoOverviewBands two-dimensional array of bands. First dimension is
    6219             :  *                          indexed by bands. Second dimension is indexed by
    6220             :  *                          overview levels. All aapoOverviewBands[i] arrays
    6221             :  *                          must have the same size (i.e. same number of
    6222             :  *                          overviews)
    6223             :  * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
    6224             :  * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
    6225             :  * @param pfnProgress progress report function.
    6226             :  * @param pProgressData progress function callback data.
    6227             :  * @param papszOptions NULL terminated list of options as
    6228             :  *                     key=value pairs, or NULL
    6229             :  *                     The XOFF, YOFF, XSIZE and YSIZE
    6230             :  *                     options can be specified to express that overviews should
    6231             :  *                     be regenerated only in the specified subset of the source
    6232             :  *                     dataset.
    6233             :  * @return CE_None on success or CE_Failure on failure.
    6234             :  * @since 3.10
    6235             :  */
    6236             : 
    6237          19 : CPLErr GDALRegenerateOverviewsMultiBand(
    6238             :     const std::vector<GDALRasterBand *> &apoSrcBands,
    6239             :     const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
    6240             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    6241             :     void *pProgressData, CSLConstList papszOptions)
    6242             : {
    6243          19 :     CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());  // NOTE(review): presumably a debug-only check - confirm
    6244          29 :     for (size_t i = 1; i < aapoOverviewBands.size(); ++i)
    6245             :     {
    6246          10 :         CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());
    6247             :     }
    6248             :     // No bands: nothing to regenerate (also protects aapoOverviewBands[0] below).
    6249          19 :     if (aapoOverviewBands.empty())
    6250           0 :         return CE_None;
    6251             :     // Copy each per-band overview list into a plain C array of band pointers.
    6252          19 :     std::vector<GDALRasterBand **> apapoOverviewBands;
    6253          48 :     for (auto &apoOverviewBands : aapoOverviewBands)
    6254             :     {
    6255             :         auto papoOverviewBands = static_cast<GDALRasterBand **>(
    6256          29 :             CPLMalloc(apoOverviewBands.size() * sizeof(GDALRasterBand *)));
    6257          61 :         for (size_t i = 0; i < apoOverviewBands.size(); ++i)
    6258             :         {
    6259          32 :             papoOverviewBands[i] = apoOverviewBands[i];
    6260             :         }
    6261          29 :         apapoOverviewBands.push_back(papoOverviewBands);
    6262             :     }
    6263          38 :     const CPLErr eErr = GDALRegenerateOverviewsMultiBand(  // forwards to the (count, array) overload
    6264          19 :         static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
    6265          19 :         static_cast<int>(aapoOverviewBands[0].size()),
    6266          19 :         apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
    6267             :         papszOptions);
    6268          48 :     for (GDALRasterBand **papoOverviewBands : apapoOverviewBands)
    6269          29 :         CPLFree(papoOverviewBands);  // frees only the temporary arrays, not the bands
    6270          19 :     return eErr;
    6271             : }
    6272             : 
    6273             : /************************************************************************/
    6274             : /*                        GDALComputeBandStats()                        */
    6275             : /************************************************************************/
    6276             : 
    6277             : /** Compute the mean and standard deviation of a band from sampled scanlines.
    6278             :  * @param hSrcBand Source band handle (checked with VALIDATE_POINTER1).
    6279             :  * @param nSampleStep Step between scanlines used to compute statistics.
    6280             :  *                    When nSampleStep is equal to 1, all scanlines will
    6281             :  *                    be processed. Values < 1 or >= band height are reset to 1.
    6282             :  * @param pdfMean If not NULL, receives the mean (of magnitudes for complex types).
    6283             :  * @param pdfStdDev If not NULL, receives the standard deviation.
    6284             :  * @param pfnProgress Progress callback, or NULL for none; a false return aborts.
    6285             :  * @param pProgressData User data passed to pfnProgress.
    6286             :  * @return CE_None on success, otherwise CE_Failure or the RasterIO() error.
    6287             :  */
    6288          18 : CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
    6289             :                                         int nSampleStep, double *pdfMean,
    6290             :                                         double *pdfStdDev,
    6291             :                                         GDALProgressFunc pfnProgress,
    6292             :                                         void *pProgressData)
    6293             : 
    6294             : {
    6295          18 :     VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
    6296             : 
    6297          18 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    6298             : 
    6299          18 :     if (pfnProgress == nullptr)
    6300          18 :         pfnProgress = GDALDummyProgress;
    6301             : 
    6302          18 :     const int nWidth = poSrcBand->GetXSize();
    6303          18 :     const int nHeight = poSrcBand->GetYSize();
    6304             : 
    6305          18 :     if (nSampleStep >= nHeight || nSampleStep < 1)
    6306           5 :         nSampleStep = 1;
    6307             : 
    6308          18 :     GDALDataType eWrkType = GDT_Unknown;
    6309          18 :     float *pafData = nullptr;
    6310          18 :     GDALDataType eType = poSrcBand->GetRasterDataType();
    6311          18 :     const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
    6312          18 :     if (bComplex)
    6313             :     {
    6314             :         pafData = static_cast<float *>(
    6315           0 :             VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));  // two floats (real, imaginary) per pixel
    6316           0 :         eWrkType = GDT_CFloat32;
    6317             :     }
    6318             :     else
    6319             :     {
    6320             :         pafData =
    6321          18 :             static_cast<float *>(VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
    6322          18 :         eWrkType = GDT_Float32;
    6323             :     }
    6324             :     // Bail out on a zero-width band or a failed line-buffer allocation.
    6325          18 :     if (nWidth == 0 || pafData == nullptr)
    6326             :     {
    6327           0 :         VSIFree(pafData);
    6328           0 :         return CE_Failure;
    6329             :     }
    6330             : 
    6331             :     /* -------------------------------------------------------------------- */
    6332             :     /*      Loop over all sample lines.                                     */
    6333             :     /* -------------------------------------------------------------------- */
    6334          18 :     double dfSum = 0.0;
    6335          18 :     double dfSum2 = 0.0;
    6336          18 :     int iLine = 0;
    6337          18 :     GIntBig nSamples = 0;
    6338             : 
    6339        2143 :     do
    6340             :     {
    6341        2161 :         if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
    6342             :                          pProgressData))
    6343             :         {
    6344           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6345           0 :             CPLFree(pafData);
    6346           0 :             return CE_Failure;
    6347             :         }
    6348             :         // Read one full scanline, converted to the working float type.
    6349             :         const CPLErr eErr =
    6350        2161 :             poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
    6351             :                                 1, eWrkType, 0, 0, nullptr);
    6352        2161 :         if (eErr != CE_None)
    6353             :         {
    6354           1 :             CPLFree(pafData);
    6355           1 :             return eErr;
    6356             :         }
    6357             :         // Accumulate the sum and the sum of squares over the scanline.
    6358      725208 :         for (int iPixel = 0; iPixel < nWidth; ++iPixel)
    6359             :         {
    6360      723048 :             float fValue = 0.0f;
    6361             : 
    6362      723048 :             if (bComplex)
    6363             :             {
    6364             :                 // Use the magnitude of the complex value as the sample.
    6365             :                 fValue =
    6366           0 :                     std::hypot(pafData[static_cast<size_t>(iPixel) * 2],
    6367           0 :                                pafData[static_cast<size_t>(iPixel) * 2 + 1]);
    6368             :             }
    6369             :             else
    6370             :             {
    6371      723048 :                 fValue = pafData[iPixel];
    6372             :             }
    6373             : 
    6374      723048 :             dfSum += fValue;
    6375      723048 :             dfSum2 += static_cast<double>(fValue) * fValue;
    6376             :         }
    6377             : 
    6378        2160 :         nSamples += nWidth;
    6379        2160 :         iLine += nSampleStep;
    6380        2160 :     } while (iLine < nHeight);
    6381             : 
    6382          17 :     if (!pfnProgress(1.0, nullptr, pProgressData))
    6383             :     {
    6384           0 :         CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6385           0 :         CPLFree(pafData);
    6386           0 :         return CE_Failure;
    6387             :     }
    6388             : 
    6389             :     /* -------------------------------------------------------------------- */
    6390             :     /*      Produce the result values.                                      */
    6391             :     /* -------------------------------------------------------------------- */
    6392          17 :     if (pdfMean != nullptr)
    6393          17 :         *pdfMean = dfSum / nSamples;
    6394             :     // Standard deviation as sqrt(E[x^2] - E[x]^2) over the sampled pixels.
    6395          17 :     if (pdfStdDev != nullptr)
    6396             :     {
    6397          17 :         const double dfMean = dfSum / nSamples;
    6398             : 
    6399          17 :         *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));  // NOTE(review): argument may round slightly below 0 (NaN) - confirm
    6400             :     }
    6401             : 
    6402          17 :     CPLFree(pafData);
    6403             : 
    6404          17 :     return CE_None;
    6405             : }
    6406             : 
    6407             : /************************************************************************/
    6408             : /*                  GDALOverviewMagnitudeCorrection()                   */
    6409             : /*                                                                      */
    6410             : /*      Correct the mean and standard deviation of the overviews of     */
    6411             : /*      the given band to match the base layer approximately.           */
    6412             : /************************************************************************/
    6413             : 
    6414             : /** Rescale overview pixels so their mean/std dev approximate the base band's.
    6415             :  * @param hBaseBand Base band handle (checked with VALIDATE_POINTER1).
    6416             :  * @param nOverviewCount Number of entries in pahOverviews.
    6417             :  * @param pahOverviews Overview bands; each is read and rewritten line by line.
    6418             :  * @param pfnProgress Progress callback; called directly here without a NULL check.
    6419             :  * @param pProgressData User data passed to pfnProgress.
    6420             :  * @return CE_None on success, otherwise CE_Failure or a band statistics error.
    6421             :  */
    6422           0 : CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
    6423             :                                        int nOverviewCount,
    6424             :                                        GDALRasterBandH *pahOverviews,
    6425             :                                        GDALProgressFunc pfnProgress,
    6426             :                                        void *pProgressData)
    6427             : 
    6428             : {
    6429           0 :     VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
    6430             : 
    6431             :     /* -------------------------------------------------------------------- */
    6432             :     /*      Compute mean/stddev for source raster.                          */
    6433             :     /* -------------------------------------------------------------------- */
    6434           0 :     double dfOrigMean = 0.0;
    6435           0 :     double dfOrigStdDev = 0.0;
    6436             :     {
    6437             :         const CPLErr eErr =
    6438           0 :             GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,  // sample step 2 on the base band
    6439             :                                  pfnProgress, pProgressData);
    6440             : 
    6441           0 :         if (eErr != CE_None)
    6442           0 :             return eErr;
    6443             :     }
    6444             : 
    6445             :     /* -------------------------------------------------------------------- */
    6446             :     /*      Loop on overview bands.                                         */
    6447             :     /* -------------------------------------------------------------------- */
    6448           0 :     for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
    6449             :     {
    6450             :         GDALRasterBand *poOverview =
    6451           0 :             GDALRasterBand::FromHandle(pahOverviews[iOverview]);
    6452             :         double dfOverviewMean, dfOverviewStdDev;
    6453             :         // Overview statistics use every scanline (sample step 1).
    6454             :         const CPLErr eErr =
    6455           0 :             GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
    6456             :                                  &dfOverviewStdDev, pfnProgress, pProgressData);
    6457             : 
    6458           0 :         if (eErr != CE_None)
    6459           0 :             return eErr;
    6460             :         // Gain stays at 1.0 when the base band std dev is below 0.0001.
    6461           0 :         double dfGain = 1.0;
    6462           0 :         if (dfOrigStdDev >= 0.0001)
    6463           0 :             dfGain = dfOrigStdDev / dfOverviewStdDev;  // NOTE(review): dfOverviewStdDev is not checked for 0
    6464             : 
    6465             :         /* --------------------------------------------------------------------
    6466             :          */
    6467             :         /*      Apply gain and offset. */
    6468             :         /* --------------------------------------------------------------------
    6469             :          */
    6470           0 :         const int nWidth = poOverview->GetXSize();
    6471           0 :         const int nHeight = poOverview->GetYSize();
    6472             : 
    6473           0 :         GDALDataType eWrkType = GDT_Unknown;
    6474           0 :         float *pafData = nullptr;
    6475           0 :         const GDALDataType eType = poOverview->GetRasterDataType();
    6476           0 :         const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
    6477           0 :         if (bComplex)
    6478             :         {
    6479             :             pafData = static_cast<float *>(
    6480           0 :                 VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
    6481           0 :             eWrkType = GDT_CFloat32;
    6482             :         }
    6483             :         else
    6484             :         {
    6485             :             pafData = static_cast<float *>(
    6486           0 :                 VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
    6487           0 :             eWrkType = GDT_Float32;
    6488             :         }
    6489             : 
    6490           0 :         if (pafData == nullptr)
    6491             :         {
    6492           0 :             return CE_Failure;
    6493             :         }
    6494             : 
    6495           0 :         for (int iLine = 0; iLine < nHeight; ++iLine)
    6496             :         {
    6497           0 :             if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
    6498             :                              pProgressData))
    6499             :             {
    6500           0 :                 CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6501           0 :                 CPLFree(pafData);
    6502           0 :                 return CE_Failure;
    6503             :             }
    6504             : 
    6505           0 :             if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
    6506             :                                      nWidth, 1, eWrkType, 0, 0,
    6507           0 :                                      nullptr) != CE_None)
    6508             :             {
    6509           0 :                 CPLFree(pafData);
    6510           0 :                 return CE_Failure;
    6511             :             }
    6512             :             // Complex samples get the gain only; real samples are also re-centered.
    6513           0 :             for (int iPixel = 0; iPixel < nWidth; ++iPixel)
    6514             :             {
    6515           0 :                 if (bComplex)
    6516             :                 {
    6517           0 :                     pafData[static_cast<size_t>(iPixel) * 2] *=
    6518           0 :                         static_cast<float>(dfGain);
    6519           0 :                     pafData[static_cast<size_t>(iPixel) * 2 + 1] *=
    6520           0 :                         static_cast<float>(dfGain);
    6521             :                 }
    6522             :                 else
    6523             :                 {
    6524           0 :                     pafData[iPixel] = static_cast<float>(
    6525           0 :                         (pafData[iPixel] - dfOverviewMean) * dfGain +
    6526             :                         dfOrigMean);
    6527             :                 }
    6528             :             }
    6529             :             // Write the corrected scanline back to the overview band.
    6530           0 :             if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
    6531             :                                      nWidth, 1, eWrkType, 0, 0,
    6532           0 :                                      nullptr) != CE_None)
    6533             :             {
    6534           0 :                 CPLFree(pafData);
    6535           0 :                 return CE_Failure;
    6536             :             }
    6537             :         }
    6538             : 
    6539           0 :         if (!pfnProgress(1.0, nullptr, pProgressData))
    6540             :         {
    6541           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6542           0 :             CPLFree(pafData);
    6543           0 :             return CE_Failure;
    6544             :         }
    6545             : 
    6546           0 :         CPLFree(pafData);
    6547             :     }
    6548             : 
    6549           0 :     return CE_None;
    6550             : }

Generated by: LCOV version 1.14