Line data Source code
1 :
2 : /******************************************************************************
3 : *
4 : * Project: GDAL Core
5 : * Purpose: Helper code to implement overview support in different drivers.
6 : * Author: Frank Warmerdam, warmerdam@pobox.com
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 2000, Frank Warmerdam
10 : * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * SPDX-License-Identifier: MIT
13 : ****************************************************************************/
14 :
15 : #include "cpl_port.h"
16 : #include "gdal_priv.h"
17 :
18 : #include <cmath>
19 : #include <cstddef>
20 : #include <cstdlib>
21 :
22 : #include <algorithm>
23 : #include <complex>
24 : #include <condition_variable>
25 : #include <limits>
26 : #include <list>
27 : #include <memory>
28 : #include <mutex>
29 : #include <vector>
30 :
31 : #include "cpl_conv.h"
32 : #include "cpl_error.h"
33 : #include "cpl_progress.h"
34 : #include "cpl_vsi.h"
35 : #include "gdal.h"
36 : #include "gdal_thread_pool.h"
37 : #include "gdalwarper.h"
38 :
39 : #ifdef USE_NEON_OPTIMIZATIONS
40 : #include "include_sse2neon.h"
41 : #define USE_SSE2
42 :
43 : #include "gdalsse_priv.h"
44 :
45 : // Restrict to 64bit processors because they are guaranteed to have SSE2,
46 : // or if __AVX2__ is defined.
47 : #elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
48 : #define USE_SSE2
49 :
50 : #include "gdalsse_priv.h"
51 :
52 : #ifdef __SSE3__
53 : #include <pmmintrin.h>
54 : #endif
55 : #ifdef __SSSE3__
56 : #include <tmmintrin.h>
57 : #endif
58 : #ifdef __SSE4_1__
59 : #include <smmintrin.h>
60 : #endif
61 : #ifdef __AVX2__
62 : #include <immintrin.h>
63 : #endif
64 :
65 : #endif
66 :
67 : // To be included after above USE_SSE2 and include gdalsse_priv.h
68 : // to avoid build issue on Windows x86
69 : #include "gdal_priv_templates.hpp"
70 :
71 : /************************************************************************/
72 : /* GDALResampleChunk_Near() */
73 : /************************************************************************/
74 :
75 : template <class T>
76 6034 : static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
77 : const T *pChunk, T **ppDstBuffer)
78 :
79 : {
80 6034 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
81 6034 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
82 6034 : const GDALDataType eWrkDataType = args.eWrkDataType;
83 6034 : const int nChunkXOff = args.nChunkXOff;
84 6034 : const int nChunkXSize = args.nChunkXSize;
85 6034 : const int nChunkYOff = args.nChunkYOff;
86 6034 : const int nDstXOff = args.nDstXOff;
87 6034 : const int nDstXOff2 = args.nDstXOff2;
88 6034 : const int nDstYOff = args.nDstYOff;
89 6034 : const int nDstYOff2 = args.nDstYOff2;
90 6034 : const int nDstXWidth = nDstXOff2 - nDstXOff;
91 :
92 : /* -------------------------------------------------------------------- */
93 : /* Allocate buffers. */
94 : /* -------------------------------------------------------------------- */
95 6034 : *ppDstBuffer = static_cast<T *>(
96 6034 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
97 : GDALGetDataTypeSizeBytes(eWrkDataType)));
98 6034 : if (*ppDstBuffer == nullptr)
99 : {
100 0 : return CE_Failure;
101 : }
102 6034 : T *const pDstBuffer = *ppDstBuffer;
103 :
104 : int *panSrcXOff =
105 6034 : static_cast<int *>(VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(int)));
106 :
107 6034 : if (panSrcXOff == nullptr)
108 : {
109 0 : VSIFree(panSrcXOff);
110 0 : return CE_Failure;
111 : }
112 :
113 : /* ==================================================================== */
114 : /* Precompute inner loop constants. */
115 : /* ==================================================================== */
116 572295 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
117 : {
118 566261 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
119 566261 : if (nSrcXOff < nChunkXOff)
120 0 : nSrcXOff = nChunkXOff;
121 :
122 566261 : panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
123 : }
124 :
125 : /* ==================================================================== */
126 : /* Loop over destination scanlines. */
127 : /* ==================================================================== */
128 210662 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
129 : {
130 204628 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
131 204628 : if (nSrcYOff < nChunkYOff)
132 0 : nSrcYOff = nChunkYOff;
133 :
134 204628 : const T *const pSrcScanline =
135 : pChunk +
136 204628 : (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
137 202158 : nChunkXOff;
138 :
139 : /* --------------------------------------------------------------------
140 : */
141 : /* Loop over destination pixels */
142 : /* --------------------------------------------------------------------
143 : */
144 204628 : T *pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
145 116297034 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
146 : {
147 116092564 : pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
148 : }
149 : }
150 :
151 6034 : CPLFree(panSrcXOff);
152 :
153 6034 : return CE_None;
154 : }
155 :
156 6034 : static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
157 : const void *pChunk, void **ppDstBuffer,
158 : GDALDataType *peDstBufferDataType)
159 : {
160 6034 : *peDstBufferDataType = args.eWrkDataType;
161 6034 : switch (args.eWrkDataType)
162 : {
163 : // For nearest resampling, as no computation is done, only the
164 : // size of the data type matters.
165 5906 : case GDT_Byte:
166 : case GDT_Int8:
167 : {
168 5906 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 1);
169 5906 : return GDALResampleChunk_NearT(
170 : args, static_cast<const uint8_t *>(pChunk),
171 5906 : reinterpret_cast<uint8_t **>(ppDstBuffer));
172 : }
173 :
174 26 : case GDT_Int16:
175 : case GDT_UInt16:
176 : {
177 26 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
178 26 : return GDALResampleChunk_NearT(
179 : args, static_cast<const uint16_t *>(pChunk),
180 26 : reinterpret_cast<uint16_t **>(ppDstBuffer));
181 : }
182 :
183 55 : case GDT_CInt16:
184 : case GDT_Int32:
185 : case GDT_UInt32:
186 : case GDT_Float32:
187 : {
188 55 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
189 55 : return GDALResampleChunk_NearT(
190 : args, static_cast<const uint32_t *>(pChunk),
191 55 : reinterpret_cast<uint32_t **>(ppDstBuffer));
192 : }
193 :
194 43 : case GDT_CInt32:
195 : case GDT_CFloat32:
196 : case GDT_Int64:
197 : case GDT_UInt64:
198 : case GDT_Float64:
199 : {
200 43 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
201 43 : return GDALResampleChunk_NearT(
202 : args, static_cast<const uint64_t *>(pChunk),
203 43 : reinterpret_cast<uint64_t **>(ppDstBuffer));
204 : }
205 :
206 4 : case GDT_CFloat64:
207 : {
208 4 : return GDALResampleChunk_NearT(
209 : args, static_cast<const std::complex<double> *>(pChunk),
210 4 : reinterpret_cast<std::complex<double> **>(ppDstBuffer));
211 : }
212 :
213 0 : case GDT_Unknown:
214 : case GDT_TypeCount:
215 0 : break;
216 : }
217 0 : CPLAssert(false);
218 : return CE_Failure;
219 : }
220 :
221 : namespace
222 : {
223 :
224 : // Find in the color table the entry whose RGB value is the closest
225 : // (using quadratic distance) to the test color, ignoring transparent entries.
226 3837 : int BestColorEntry(const std::vector<GDALColorEntry> &entries,
227 : const GDALColorEntry &test)
228 : {
229 3837 : int nMinDist = std::numeric_limits<int>::max();
230 3837 : size_t bestEntry = 0;
231 986109 : for (size_t i = 0; i < entries.size(); ++i)
232 : {
233 982272 : const GDALColorEntry &entry = entries[i];
234 : // Ignore transparent entries
235 982272 : if (entry.c4 == 0)
236 3237 : continue;
237 :
238 979035 : int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
239 979035 : ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
240 979035 : ((test.c3 - entry.c3) * (test.c3 - entry.c3));
241 979035 : if (nDist < nMinDist)
242 : {
243 15847 : nMinDist = nDist;
244 15847 : bestEntry = i;
245 : }
246 : }
247 3837 : return static_cast<int>(bestEntry);
248 : }
249 :
250 7 : std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
251 : int &transparentIdx)
252 : {
253 7 : std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
254 :
255 7 : transparentIdx = -1;
256 7 : int i = 0;
257 1799 : for (auto &entry : entries)
258 : {
259 1792 : table.GetColorEntryAsRGB(i, &entry);
260 1792 : if (transparentIdx < 0 && entry.c4 == 0)
261 1 : transparentIdx = i;
262 1792 : ++i;
263 : }
264 7 : return entries;
265 : }
266 :
267 : } // unnamed namespace
268 :
269 : /************************************************************************/
270 : /* SQUARE() */
271 : /************************************************************************/
272 :
// Returns val * val. The left operand is first converted to Tsquare, so
// that the product can be evaluated on a wider type than T.
template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
{
    const Tsquare widened = static_cast<Tsquare>(val);
    return widened * val;
}
277 :
278 : /************************************************************************/
279 : /* ComputeIntegerRMS() */
280 : /************************************************************************/
// Integer root mean square: returns the integer rms that minimizes
// abs(rms**2 - sumSquares / weight).
//
// T is the returned integer type. Twork is the integer type on which
// 2 * rms * (rms + 1) + 1 is evaluated.
template <class T, class Twork>
inline T ComputeIntegerRMS(double sumSquares, double weight)
{
    const double meanSquare = sumSquares / weight;

    // Start from the truncated square root...
    T rms = static_cast<T>(std::sqrt(meanSquare));

    // ... and move to rms + 1 when (rms+1)**2 is closer to the mean square
    // than rms**2, i.e. when
    //   weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2
    // which simplifies to
    //   2 * rms * (rms + 1) + 1 < 2 * sumSquares / weight
    const Twork twiceRms = static_cast<Twork>(2) * rms;
    const double threshold = static_cast<double>(twiceRms * (rms + 1) + 1);
    if (threshold < 2 * meanSquare)
        rms += 1;

    return rms;
}
297 :
298 0 : template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
299 : {
300 0 : CPLAssert(false);
301 : return 0;
302 : }
303 :
304 24 : template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
305 : {
306 : // It has been verified that given the correction on rms below, using
307 : // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
308 : // is equivalent, so use the former as it is used twice.
309 24 : const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
310 24 : const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
311 24 : GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
312 :
313 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
314 : // Naive version:
315 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
316 : // Optimized version for integer case and weight == 4
317 24 : if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
318 5 : rms += 1;
319 24 : return rms;
320 : }
321 :
322 : template <>
323 20 : inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
324 : {
325 20 : const double sumDivWeight = sumSquares * 0.25;
326 20 : GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
327 :
328 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
329 : // Naive version:
330 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
331 : // Optimized version for integer case and weight == 4
332 20 : if (static_cast<GUInt32>(rms) * (rms + 1) <
333 20 : static_cast<GUInt32>(sumDivWeight + 0.25))
334 4 : rms += 1;
335 20 : return rms;
336 : }
337 :
338 : #ifdef USE_SSE2
339 :
340 : /************************************************************************/
341 : /* QuadraticMeanByteSSE2OrAVX2() */
342 : /************************************************************************/
343 :
344 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
345 : #define sse2_packus_epi32 _mm_packus_epi32
346 : #else
347 516119 : inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
348 : {
349 516119 : const auto minus32768_32 = _mm_set1_epi32(-32768);
350 516119 : const auto minus32768_16 = _mm_set1_epi16(-32768);
351 516119 : a = _mm_add_epi32(a, minus32768_32);
352 516119 : b = _mm_add_epi32(b, minus32768_32);
353 516119 : a = _mm_packs_epi32(a, b);
354 516119 : a = _mm_sub_epi16(a, minus32768_16);
355 516119 : return a;
356 : }
357 : #endif
358 :
359 : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
360 : #define sse2_hadd_epi16 _mm_hadd_epi16
361 : #else
362 4660800 : inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
363 : {
364 : // Horizontal addition of adjacent pairs
365 4660800 : const auto mask = _mm_set1_epi32(0xFFFF);
366 : const auto horizLo =
367 13982400 : _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
368 : const auto horizHi =
369 13982400 : _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
370 :
371 : // Recombine low and high parts
372 4660800 : return _mm_packs_epi32(horizLo, horizHi);
373 : }
374 : #endif
375 :
// Width-agnostic aliases used by the Byte kernels below: they map to the
// 256-bit intrinsics when AVX2 is available, and to the 128-bit ones
// otherwise.
#ifdef __AVX2__

// Number of destination pixels produced per loop iteration
#define DEST_ELTS 16

// Constants
#define setzero _mm256_setzero_si256
#define set1_epi16 _mm256_set1_epi16
#define set1_epi32 _mm256_set1_epi32
#define set1_ps _mm256_set1_ps

// Loads and stores
#define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
/* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
/* to get the lower 128-bit bits of what would be a true 256-bit vector
 * register: the low 64 bits of each lane are gathered in the low 128 bits
 * before being stored. */
#define store_lo(x, y)                                                         \
    _mm_storeu_si128(reinterpret_cast<__m128i *>(x),                           \
                     _mm256_extracti128_si256(                                 \
                         _mm256_permute4x64_epi64((y), 0 | (2 << 2)), 0))

// Widening and narrowing
#define unpacklo_epi8 _mm256_unpacklo_epi8
#define unpackhi_epi8 _mm256_unpackhi_epi8
#define packs_epi32 _mm256_packs_epi32
#define packus_epi32 _mm256_packus_epi32
#define packus_epi16 _mm256_packus_epi16

// 16-bit integer arithmetic
#define add_epi16 _mm256_add_epi16
#define sub_epi16 _mm256_sub_epi16
#define mullo_epi16 _mm256_mullo_epi16
#define madd_epi16 _mm256_madd_epi16
#define hadd_epi16 _mm256_hadd_epi16
#define srli_epi16 _mm256_srli_epi16
#define cmpgt_epi16 _mm256_cmpgt_epi16

// 32-bit integer arithmetic
#define add_epi32 _mm256_add_epi32
#define srli_epi32 _mm256_srli_epi32

// Single precision floating point
#define cvtepi32_ps _mm256_cvtepi32_ps
#define cvttps_epi32 _mm256_cvttps_epi32
#define mul_ps _mm256_mul_ps
#define sqrt_ps _mm256_sqrt_ps

// Issued at the end of the kernels
#define zeroupper() _mm256_zeroupper()

#else

// Number of destination pixels produced per loop iteration
#define DEST_ELTS 8

// Constants
#define setzero _mm_setzero_si128
#define set1_epi16 _mm_set1_epi16
#define set1_epi32 _mm_set1_epi32
#define set1_ps _mm_set1_ps

// Loads and stores
#define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
#define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))

// Widening and narrowing
#define unpacklo_epi8 _mm_unpacklo_epi8
#define unpackhi_epi8 _mm_unpackhi_epi8
#define packs_epi32 _mm_packs_epi32
#define packus_epi32 sse2_packus_epi32
#define packus_epi16 _mm_packus_epi16

// 16-bit integer arithmetic
#define add_epi16 _mm_add_epi16
#define sub_epi16 _mm_sub_epi16
#define mullo_epi16 _mm_mullo_epi16
#define madd_epi16 _mm_madd_epi16
#define hadd_epi16 sse2_hadd_epi16
#define srli_epi16 _mm_srli_epi16
#define cmpgt_epi16 _mm_cmpgt_epi16

// 32-bit integer arithmetic
#define add_epi32 _mm_add_epi32
#define srli_epi32 _mm_srli_epi32

// Single precision floating point
#define cvtepi32_ps _mm_cvtepi32_ps
#define cvttps_epi32 _mm_cvttps_epi32
#define mul_ps _mm_mul_ps
#define sqrt_ps _mm_sqrt_ps

// Nothing to do without AVX2
#define zeroupper() (void)0

#endif
438 :
#if !(defined(__GNUC__) && defined(__AVX2__))
#define NOINLINE
#else
// Inlining is disabled to work around a bug of gcc 9.3 (Ubuntu 20.04) in
// -O2 -mavx2 mode in QuadraticMeanFloatSSE2(): the register that contains
// minus_zero is correctly loaded the first time the function is called
// (looking at the disassembly, one sees it is loaded much earlier than the
// function), but gets corrupted (zeroed) in following iterations.
// The bug appears to be due to the explicit zeroupper() call at the end of
// the function, and is at least solved in gcc 10.2.
// The same workaround is needed with gcc 9.3 for
// QuadraticMeanByteSSE2OrAVX2() in -O3 -mavx2 mode.
// Inlining doesn't bring much to performance here anyway.
#define NOINLINE __attribute__((noinline))
#endif
456 :
457 : template <class T>
458 : static int NOINLINE
459 5385 : QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
460 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
461 : T *CPL_RESTRICT pDstScanline)
462 : {
463 : // Optimized implementation for RMS on Byte by
464 : // processing by group of 8 output pixels, so as to use
465 : // a single _mm_sqrt_ps() call for 4 output pixels
466 5385 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
467 :
468 5385 : int iDstPixel = 0;
469 5385 : const auto one16 = set1_epi16(1);
470 5385 : const auto one32 = set1_epi32(1);
471 5385 : const auto zero = setzero();
472 5385 : const auto minus32768 = set1_epi16(-32768);
473 :
474 521496 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
475 : {
476 : // Load 2 * DEST_ELTS bytes from each line
477 516111 : auto firstLine = loadu_int(pSrcScanlineShifted);
478 1032220 : auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
479 : // Extend those Bytes as UInt16s
480 516111 : auto firstLineLo = unpacklo_epi8(firstLine, zero);
481 516111 : auto firstLineHi = unpackhi_epi8(firstLine, zero);
482 516111 : auto secondLineLo = unpacklo_epi8(secondLine, zero);
483 516111 : auto secondLineHi = unpackhi_epi8(secondLine, zero);
484 :
485 : // Multiplication of 16 bit values and horizontal
486 : // addition of 32 bit results
487 : // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
488 516111 : firstLineLo = madd_epi16(firstLineLo, firstLineLo);
489 516111 : firstLineHi = madd_epi16(firstLineHi, firstLineHi);
490 516111 : secondLineLo = madd_epi16(secondLineLo, secondLineLo);
491 516111 : secondLineHi = madd_epi16(secondLineHi, secondLineHi);
492 :
493 : // Vertical addition
494 516111 : const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
495 516111 : const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
496 :
497 : const auto sumSquaresPlusOneDiv4Lo =
498 1032220 : srli_epi32(add_epi32(sumSquaresLo, one32), 2);
499 : const auto sumSquaresPlusOneDiv4Hi =
500 1032220 : srli_epi32(add_epi32(sumSquaresHi, one32), 2);
501 :
502 : // Take square root and truncate/floor to int32
503 : const auto rmsLo =
504 1548330 : cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
505 : const auto rmsHi =
506 1548330 : cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
507 :
508 : // Merge back low and high registers with each RMS value
509 : // as a 16 bit value.
510 516111 : auto rms = packs_epi32(rmsLo, rmsHi);
511 :
512 : // Round to upper value if it minimizes the
513 : // error |rms^2 - sumSquares/4|
514 : // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
515 : // rms += 1;
516 : // which is equivalent to:
517 : // if( rms * (rms + 1) < (sumSquares+1) / 4 )
518 : // rms += 1;
519 : // And both left and right parts fit on 16 (unsigned) bits
520 : const auto sumSquaresPlusOneDiv4 =
521 516111 : packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
522 : // cmpgt_epi16 operates on signed int16, but here
523 : // we have unsigned values, so shift them by -32768 before
524 2580560 : auto mask = cmpgt_epi16(
525 : add_epi16(sumSquaresPlusOneDiv4, minus32768),
526 : add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
527 : // The value of the mask will be -1 when the correction needs to be
528 : // applied
529 516111 : rms = sub_epi16(rms, mask);
530 :
531 : // Pack each 16 bit RMS value to 8 bits
532 516111 : rms = packus_epi16(rms, rms /* could be anything */);
533 516111 : store_lo(&pDstScanline[iDstPixel], rms);
534 516111 : pSrcScanlineShifted += 2 * DEST_ELTS;
535 : }
536 : zeroupper();
537 :
538 5385 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
539 5385 : return iDstPixel;
540 : }
541 :
542 : /************************************************************************/
543 : /* AverageByteSSE2OrAVX2() */
544 : /************************************************************************/
545 :
546 : template <class T>
547 : static int
548 110996 : AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
549 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
550 : T *CPL_RESTRICT pDstScanline)
551 : {
552 : // Optimized implementation for average on Byte by
553 : // processing by group of 8 output pixels.
554 :
555 110996 : const auto zero = setzero();
556 110996 : const auto two16 = set1_epi16(2);
557 110996 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
558 :
559 110996 : int iDstPixel = 0;
560 4771800 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
561 : {
562 : // Load 2 * DEST_ELTS bytes from each line
563 4660800 : const auto firstLine = loadu_int(pSrcScanlineShifted);
564 9321610 : const auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
565 : // Extend those Bytes as UInt16s
566 4660800 : const auto firstLineLo = unpacklo_epi8(firstLine, zero);
567 4660800 : const auto firstLineHi = unpackhi_epi8(firstLine, zero);
568 4660800 : const auto secondLineLo = unpacklo_epi8(secondLine, zero);
569 4660800 : const auto secondLineHi = unpackhi_epi8(secondLine, zero);
570 :
571 : // Vertical addition
572 4660800 : const auto sumLo = add_epi16(firstLineLo, secondLineLo);
573 4660800 : const auto sumHi = add_epi16(firstLineHi, secondLineHi);
574 :
575 : // Horizontal addition of adjacent pairs, and recombine low and high
576 : // parts
577 4660800 : const auto sum = hadd_epi16(sumLo, sumHi);
578 :
579 : // average = (sum + 2) / 4
580 9321610 : auto average = srli_epi16(add_epi16(sum, two16), 2);
581 :
582 : // Pack each 16 bit average value to 8 bits
583 4660800 : average = packus_epi16(average, average /* could be anything */);
584 4660800 : store_lo(&pDstScanline[iDstPixel], average);
585 4660800 : pSrcScanlineShifted += 2 * DEST_ELTS;
586 : }
587 : zeroupper();
588 :
589 110996 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
590 110996 : return iDstPixel;
591 : }
592 :
593 : /************************************************************************/
594 : /* QuadraticMeanUInt16SSE2() */
595 : /************************************************************************/
596 :
597 : #ifdef __SSE3__
598 : #define sse2_hadd_pd _mm_hadd_pd
599 : #else
600 8 : inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
601 : {
602 : auto aLo_bLo =
603 32 : _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
604 : auto aHi_bHi =
605 32 : _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
606 8 : return _mm_add_pd(aLo_bLo, aHi_bHi); // (aLo + aHi, bLo + bHi)
607 : }
608 : #endif
609 :
610 40 : inline __m128d SQUARE(__m128d x)
611 : {
612 40 : return _mm_mul_pd(x, x);
613 : }
614 :
#ifdef __AVX2__

// Element-wise square of the four doubles of x.
inline __m256d SQUARE(__m256d x)
{
    const __m256d squared = _mm256_mul_pd(x, x);
    return squared;
}

// Reorders the four 64-bit elements (e0, e1, e2, e3) of x as
// (e0, e2, e1, e3), i.e. swaps the two middle elements.
inline __m256d FIXUP_LANES(__m256d x)
{
    return _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
}

// Same reordering, applied to a register of floats handled as four 64-bit
// elements.
inline __m256 FIXUP_LANES(__m256 x)
{
    const __m256d asDoubles = _mm256_castps_pd(x);
    return _mm256_castpd_ps(FIXUP_LANES(asDoubles));
}

#endif
633 :
634 : template <class T>
635 : static int
636 10 : QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
637 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
638 : T *CPL_RESTRICT pDstScanline)
639 : {
640 : // Optimized implementation for RMS on UInt16 by
641 : // processing by group of 4 output pixels.
642 10 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
643 :
644 10 : int iDstPixel = 0;
645 10 : const auto zero = _mm_setzero_si128();
646 :
647 : #ifdef __AVX2__
648 : const auto zeroDot25 = _mm256_set1_pd(0.25);
649 : const auto zeroDot5 = _mm256_set1_pd(0.5);
650 :
651 : // The first four 0's could be anything, as we only take the bottom
652 : // 128 bits.
653 : const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
654 : #else
655 10 : const auto zeroDot25 = _mm_set1_pd(0.25);
656 10 : const auto zeroDot5 = _mm_set1_pd(0.5);
657 : #endif
658 :
659 40 : for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
660 : {
661 : // Load 8 UInt16 from each line
662 30 : const auto firstLine = _mm_loadu_si128(
663 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
664 : const auto secondLine =
665 30 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
666 30 : pSrcScanlineShifted + nChunkXSize));
667 :
668 : // Detect if all of the source values fit in 14 bits.
669 : // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
670 : // and we can do a much faster implementation.
671 : const auto maskTmp =
672 60 : _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
673 : #if defined(__i386__) || defined(_M_IX86)
674 : uint64_t nMaskFitsIn14Bits = 0;
675 : _mm_storel_epi64(
676 : reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
677 : _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
678 : #else
679 30 : const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
680 : _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
681 : #endif
682 30 : if (nMaskFitsIn14Bits == 0)
683 : {
684 : // Multiplication of 16 bit values and horizontal
685 : // addition of 32 bit results
686 : const auto firstLineHSumSquare =
687 26 : _mm_madd_epi16(firstLine, firstLine);
688 : const auto secondLineHSumSquare =
689 26 : _mm_madd_epi16(secondLine, secondLine);
690 : // Vertical addition
691 : const auto sumSquares =
692 26 : _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
693 : // In theory we should take sqrt(sumSquares * 0.25f)
694 : // but given the rounding we do, this is equivalent to
695 : // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
696 : // sumSquares <= 4 * 16383^2
697 26 : const auto one32 = _mm_set1_epi32(1);
698 : const auto sumSquaresPlusOneDiv4 =
699 52 : _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
700 : // Take square root and truncate/floor to int32
701 78 : auto rms = _mm_cvttps_epi32(
702 : _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
703 :
704 : // Round to upper value if it minimizes the
705 : // error |rms^2 - sumSquares/4|
706 : // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
707 : // rms += 1;
708 : // which is equivalent to:
709 : // if( rms * rms + rms < (sumSquares+1) / 4 )
710 : // rms += 1;
711 : auto mask =
712 78 : _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
713 : _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
714 26 : rms = _mm_sub_epi32(rms, mask);
715 : // Pack each 32 bit RMS value to 16 bits
716 26 : rms = _mm_packs_epi32(rms, rms /* could be anything */);
717 : _mm_storel_epi64(
718 26 : reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
719 26 : pSrcScanlineShifted += 8;
720 26 : continue;
721 : }
722 :
723 : // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
724 : // to 32 bit would result in 4 multiplications instead of 8, but
725 : // mullo/mulhi have a worse throughput than mul_pd.
726 :
727 : // Extend those UInt16s as UInt32s
728 4 : const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
729 4 : const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
730 4 : const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
731 4 : const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
732 :
733 : #ifdef __AVX2__
734 : // Multiplication of 32 bit values previously converted to 64 bit double
735 : const auto firstLineLoDbl = SQUARE(_mm256_cvtepi32_pd(firstLineLo));
736 : const auto firstLineHiDbl = SQUARE(_mm256_cvtepi32_pd(firstLineHi));
737 : const auto secondLineLoDbl = SQUARE(_mm256_cvtepi32_pd(secondLineLo));
738 : const auto secondLineHiDbl = SQUARE(_mm256_cvtepi32_pd(secondLineHi));
739 :
740 : // Vertical addition of squares
741 : const auto sumSquaresLo =
742 : _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
743 : const auto sumSquaresHi =
744 : _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
745 :
746 : // Horizontal addition of squares
747 : const auto sumSquares =
748 : FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
749 :
750 : const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
751 :
752 : // Take square root and truncate/floor to int32
753 : auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
754 : const auto rmsDouble = _mm256_cvtepi32_pd(rms);
755 : const auto right = _mm256_sub_pd(
756 : sumDivWeight, _mm256_add_pd(SQUARE(rmsDouble), rmsDouble));
757 :
758 : auto mask =
759 : _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
760 : // Extract 32-bit from each of the 4 64-bit masks
761 : // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
762 : // _MM_SHUFFLE(2,0,2,0)));
763 : mask = _mm256_permutevar8x32_ps(mask, permutation);
764 : const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
765 :
766 : // Apply the correction
767 : rms = _mm_sub_epi32(rms, maskI);
768 :
769 : // Pack each 32 bit RMS value to 16 bits
770 : rms = _mm_packus_epi32(rms, rms /* could be anything */);
771 : #else
772 : // Multiplication of 32 bit values previously converted to 64 bit double
773 4 : const auto firstLineLoLo = SQUARE(_mm_cvtepi32_pd(firstLineLo));
774 : const auto firstLineLoHi =
775 8 : SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
776 4 : const auto firstLineHiLo = SQUARE(_mm_cvtepi32_pd(firstLineHi));
777 : const auto firstLineHiHi =
778 8 : SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
779 :
780 4 : const auto secondLineLoLo = SQUARE(_mm_cvtepi32_pd(secondLineLo));
781 : const auto secondLineLoHi =
782 8 : SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
783 4 : const auto secondLineHiLo = SQUARE(_mm_cvtepi32_pd(secondLineHi));
784 : const auto secondLineHiHi =
785 8 : SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
786 :
787 : // Vertical addition of squares
788 4 : const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
789 4 : const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
790 4 : const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
791 4 : const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
792 :
793 : // Horizontal addition of squares
794 4 : const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
795 4 : const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
796 :
797 4 : const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
798 4 : const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
799 : // Take square root and truncate/floor to int32
800 8 : const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
801 8 : const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
802 :
803 : // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
804 : // if( 0.5 < sumDivWeight - (rms * rms + rms) )
805 : // rms += 1;
806 4 : const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
807 4 : const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
808 8 : const auto rightLo = _mm_sub_pd(
809 : sumDivWeightLo, _mm_add_pd(SQUARE(rmsLoDouble), rmsLoDouble));
810 12 : const auto rightHi = _mm_sub_pd(
811 : sumDivWeightHi, _mm_add_pd(SQUARE(rmsHiDouble), rmsHiDouble));
812 :
813 8 : const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
814 4 : const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
815 : // The value of the mask will be -1 when the correction needs to be
816 : // applied
817 8 : const auto mask = _mm_castps_si128(_mm_shuffle_ps(
818 : maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
819 :
820 16 : auto rms = _mm_castps_si128(
821 : _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
822 : // Apply the correction
823 4 : rms = _mm_sub_epi32(rms, mask);
824 :
825 : // Pack each 32 bit RMS value to 16 bits
826 4 : rms = sse2_packus_epi32(rms, rms /* could be anything */);
827 : #endif
828 :
829 4 : _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
830 : rms);
831 4 : pSrcScanlineShifted += 8;
832 : }
833 :
834 : zeroupper();
835 :
836 10 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
837 10 : return iDstPixel;
838 : }
839 :
840 : /************************************************************************/
841 : /* AverageUInt16SSE2() */
842 : /************************************************************************/
843 :
844 : template <class T>
845 9 : static int AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
846 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
847 : T *CPL_RESTRICT pDstScanline)
848 : {
849 : // Optimized implementation for average on UInt16 by
850 : // processing by group of 8 output pixels.
851 :
852 9 : const auto mask = _mm_set1_epi32(0xFFFF);
853 9 : const auto two = _mm_set1_epi32(2);
854 9 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
855 :
856 9 : int iDstPixel = 0;
857 13 : for (; iDstPixel < nDstXWidth - 7; iDstPixel += 8)
858 : {
859 : __m128i averageLow;
860 : // Load 8 UInt16 from each line
861 : {
862 4 : const auto firstLine = _mm_loadu_si128(
863 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
864 : const auto secondLine =
865 4 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
866 4 : pSrcScanlineShifted + nChunkXSize));
867 :
868 : // Horizontal addition and extension to 32 bit
869 12 : const auto horizAddFirstLine = _mm_add_epi32(
870 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
871 : const auto horizAddSecondLine =
872 12 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
873 : _mm_srli_epi32(secondLine, 16));
874 :
875 : // Vertical addition and average computation
876 : // average = (sum + 2) >> 2
877 8 : const auto sum = _mm_add_epi32(
878 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
879 4 : averageLow = _mm_srli_epi32(sum, 2);
880 : }
881 : // Load 8 UInt16 from each line
882 : __m128i averageHigh;
883 : {
884 4 : const auto firstLine = _mm_loadu_si128(
885 4 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted + 8));
886 : const auto secondLine =
887 4 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
888 4 : pSrcScanlineShifted + 8 + nChunkXSize));
889 :
890 : // Horizontal addition and extension to 32 bit
891 12 : const auto horizAddFirstLine = _mm_add_epi32(
892 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
893 : const auto horizAddSecondLine =
894 12 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
895 : _mm_srli_epi32(secondLine, 16));
896 :
897 : // Vertical addition and average computation
898 : // average = (sum + 2) >> 2
899 8 : const auto sum = _mm_add_epi32(
900 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
901 4 : averageHigh = _mm_srli_epi32(sum, 2);
902 : }
903 :
904 : // Pack each 32 bit average value to 16 bits
905 4 : auto average = sse2_packus_epi32(averageLow, averageHigh);
906 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
907 : average);
908 4 : pSrcScanlineShifted += 16;
909 : }
910 :
911 9 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
912 9 : return iDstPixel;
913 : }
914 :
915 : /************************************************************************/
916 : /* QuadraticMeanFloatSSE2() */
917 : /************************************************************************/
918 :
// Width-agnostic aliases for the single-precision SIMD intrinsics used by
// the Float32 RMS code: with AVX2 they map to the 256-bit (8 floats)
// intrinsics, otherwise to the 128-bit (4 floats) ones. RMS_FLOAT_ELTS is
// the number of floats held by one vector.
#ifdef __AVX2__
#define RMS_FLOAT_ELTS 8
#define set1_ps _mm256_set1_ps
#define loadu_ps _mm256_loadu_ps
#define andnot_ps _mm256_andnot_ps
#define and_ps _mm256_and_ps
#define max_ps _mm256_max_ps
#define shuffle_ps _mm256_shuffle_ps
#define div_ps _mm256_div_ps
#define cmpeq_ps(x, y) _mm256_cmp_ps(x, y, _CMP_EQ_OQ)
#define mul_ps _mm256_mul_ps
#define add_ps _mm256_add_ps
#define hadd_ps _mm256_hadd_ps
#define sqrt_ps _mm256_sqrt_ps
#define or_ps _mm256_or_ps
#define unpacklo_ps _mm256_unpacklo_ps
#define unpackhi_ps _mm256_unpackhi_ps
#define storeu_ps _mm256_storeu_ps

// Element-wise square of 8 floats.
inline __m256 SQUARE(__m256 x)
{
    return _mm256_mul_ps(x, x);
}

// _mm256_shuffle_ps() operates independently on each 128-bit lane, so
// shuffling (lo, hi) yields the 64-bit blocks in the order
// [lo.lane0, hi.lane0, lo.lane1, hi.lane1]. Swap the two middle blocks to
// restore the sequential order [lo.lane0, lo.lane1, hi.lane0, hi.lane1].
inline __m256 FIXUP_LANES(__m256 x)
{
    return _mm256_castpd_ps(
        _mm256_permute4x64_pd(_mm256_castps_pd(x), _MM_SHUFFLE(3, 1, 2, 0)));
}

#else

#ifdef __SSE3__
#define sse2_hadd_ps _mm_hadd_ps
#else
// SSE2 emulation of _mm_hadd_ps():
// returns (a0 + a1, a2 + a3, b0 + b1, b2 + b3).
inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
{
    auto aEven_bEven = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
    auto aOdd_bOdd = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
    return _mm_add_ps(aEven_bEven, aOdd_bOdd);  // (aEven + aOdd, bEven + bOdd)
}
#endif

#define RMS_FLOAT_ELTS 4
#define set1_ps _mm_set1_ps
#define loadu_ps _mm_loadu_ps
#define andnot_ps _mm_andnot_ps
#define and_ps _mm_and_ps
#define max_ps _mm_max_ps
#define shuffle_ps _mm_shuffle_ps
#define div_ps _mm_div_ps
#define cmpeq_ps _mm_cmpeq_ps
#define mul_ps _mm_mul_ps
#define add_ps _mm_add_ps
#define hadd_ps sse2_hadd_ps
#define sqrt_ps _mm_sqrt_ps
#define or_ps _mm_or_ps
#define unpacklo_ps _mm_unpacklo_ps
#define unpackhi_ps _mm_unpackhi_ps
#define storeu_ps _mm_storeu_ps

// Element-wise square of 4 floats.
inline __m128 SQUARE(__m128 x)
{
    return _mm_mul_ps(x, x);
}

// With a single 128-bit lane, shuffle_ps() already produces elements in
// sequential order: nothing to fix.
inline __m128 FIXUP_LANES(__m128 x)
{
    return x;
}

#endif
985 :
986 : template <class T>
987 : static int NOINLINE
988 34 : QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
989 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
990 : T *CPL_RESTRICT pDstScanline)
991 : {
992 : // Optimized implementation for RMS on Float32 by
993 : // processing by group of RMS_FLOAT_ELTS output pixels.
994 34 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
995 :
996 34 : int iDstPixel = 0;
997 34 : const auto minus_zero = set1_ps(-0.0f);
998 34 : const auto zeroDot25 = set1_ps(0.25f);
999 34 : const auto one = set1_ps(1.0f);
1000 68 : const auto infv = set1_ps(std::numeric_limits<float>::infinity());
1001 :
1002 102 : for (; iDstPixel < nDstXWidth - (RMS_FLOAT_ELTS - 1);
1003 : iDstPixel += RMS_FLOAT_ELTS)
1004 : {
1005 : // Load 2*RMS_FLOAT_ELTS Float32 from each line
1006 : auto firstLineLo =
1007 68 : loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
1008 68 : auto firstLineHi = loadu_ps(reinterpret_cast<float const *>(
1009 68 : pSrcScanlineShifted + RMS_FLOAT_ELTS));
1010 68 : auto secondLineLo = loadu_ps(
1011 68 : reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
1012 68 : auto secondLineHi = loadu_ps(reinterpret_cast<float const *>(
1013 68 : pSrcScanlineShifted + RMS_FLOAT_ELTS + nChunkXSize));
1014 :
1015 : // Take the absolute value
1016 68 : firstLineLo = andnot_ps(minus_zero, firstLineLo);
1017 68 : firstLineHi = andnot_ps(minus_zero, firstLineHi);
1018 68 : secondLineLo = andnot_ps(minus_zero, secondLineLo);
1019 68 : secondLineHi = andnot_ps(minus_zero, secondLineHi);
1020 :
1021 : auto firstLineEven =
1022 68 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1023 : auto firstLineOdd =
1024 68 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1025 : auto secondLineEven =
1026 68 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1027 : auto secondLineOdd =
1028 68 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1029 :
1030 : // Compute the maximum of each RMS_FLOAT_ELTS value to RMS-average
1031 204 : const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
1032 : max_ps(secondLineEven, secondLineEven));
1033 :
1034 : // Normalize each value by the maximum of the RMS_FLOAT_ELTS ones.
1035 : // This step is important to avoid that the square evaluates to infinity
1036 : // for sufficiently big input.
1037 68 : auto invMax = div_ps(one, maxV);
1038 : // Deal with 0 being the maximum to correct division by zero
1039 : // note: comparing to -0 leads to identical results as to comparing with
1040 : // 0
1041 136 : invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
1042 :
1043 68 : firstLineEven = mul_ps(firstLineEven, invMax);
1044 68 : firstLineOdd = mul_ps(firstLineOdd, invMax);
1045 68 : secondLineEven = mul_ps(secondLineEven, invMax);
1046 68 : secondLineOdd = mul_ps(secondLineOdd, invMax);
1047 :
1048 : // Compute squares
1049 68 : firstLineEven = SQUARE(firstLineEven);
1050 68 : firstLineOdd = SQUARE(firstLineOdd);
1051 68 : secondLineEven = SQUARE(secondLineEven);
1052 68 : secondLineOdd = SQUARE(secondLineOdd);
1053 :
1054 204 : const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
1055 : add_ps(secondLineEven, secondLineOdd));
1056 :
1057 204 : auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
1058 :
1059 : // Deal with infinity being the maximum
1060 68 : const auto maskIsInf = cmpeq_ps(maxV, infv);
1061 136 : rms = or_ps(andnot_ps(maskIsInf, rms), and_ps(maskIsInf, infv));
1062 :
1063 68 : rms = FIXUP_LANES(rms);
1064 :
1065 : // coverity[incompatible_cast]
1066 68 : storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]), rms);
1067 68 : pSrcScanlineShifted += RMS_FLOAT_ELTS * 2;
1068 : }
1069 :
1070 : zeroupper();
1071 :
1072 34 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1073 34 : return iDstPixel;
1074 : }
1075 :
1076 : /************************************************************************/
1077 : /* AverageFloatSSE2() */
1078 : /************************************************************************/
1079 :
1080 : template <class T>
1081 14 : static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
1082 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1083 : T *CPL_RESTRICT pDstScanline)
1084 : {
1085 : // Optimized implementation for average on Float32 by
1086 : // processing by group of 4 output pixels.
1087 14 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1088 :
1089 14 : int iDstPixel = 0;
1090 14 : const auto zeroDot25 = _mm_set1_ps(0.25f);
1091 :
1092 32 : for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
1093 : {
1094 : // Load 8 Float32 from each line
1095 : const auto firstLineLo =
1096 18 : _mm_loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
1097 18 : const auto firstLineHi = _mm_loadu_ps(
1098 18 : reinterpret_cast<float const *>(pSrcScanlineShifted + 4));
1099 18 : const auto secondLineLo = _mm_loadu_ps(
1100 18 : reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
1101 18 : const auto secondLineHi = _mm_loadu_ps(reinterpret_cast<float const *>(
1102 18 : pSrcScanlineShifted + 4 + nChunkXSize));
1103 :
1104 : // Vertical addition
1105 18 : const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
1106 18 : const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
1107 :
1108 : // Horizontal addition
1109 : const auto A =
1110 18 : _mm_shuffle_ps(sumLo, sumHi, 0 | (2 << 2) | (0 << 4) | (2 << 6));
1111 : const auto B =
1112 18 : _mm_shuffle_ps(sumLo, sumHi, 1 | (3 << 2) | (1 << 4) | (3 << 6));
1113 18 : const auto sum = _mm_add_ps(A, B);
1114 :
1115 18 : const auto average = _mm_mul_ps(sum, zeroDot25);
1116 :
1117 : // coverity[incompatible_cast]
1118 18 : _mm_storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]),
1119 : average);
1120 18 : pSrcScanlineShifted += 8;
1121 : }
1122 :
1123 14 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1124 14 : return iDstPixel;
1125 : }
1126 :
1127 : #endif
1128 :
1129 : /************************************************************************/
1130 : /* GDALResampleChunk_AverageOrRMS() */
1131 : /************************************************************************/
1132 :
1133 : template <class T, class Tsum, GDALDataType eWrkDataType>
1134 : static CPLErr
1135 10390 : GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
1136 : const T *pChunk, void **ppDstBuffer)
1137 : {
1138 10390 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1139 10390 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1140 10390 : const double dfSrcXDelta = args.dfSrcXDelta;
1141 10390 : const double dfSrcYDelta = args.dfSrcYDelta;
1142 10390 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1143 10390 : const int nChunkXOff = args.nChunkXOff;
1144 10390 : const int nChunkYOff = args.nChunkYOff;
1145 10390 : const int nChunkXSize = args.nChunkXSize;
1146 10390 : const int nChunkYSize = args.nChunkYSize;
1147 10390 : const int nDstXOff = args.nDstXOff;
1148 10390 : const int nDstXOff2 = args.nDstXOff2;
1149 10390 : const int nDstYOff = args.nDstYOff;
1150 10390 : const int nDstYOff2 = args.nDstYOff2;
1151 10390 : const char *pszResampling = args.pszResampling;
1152 10390 : bool bHasNoData = args.bHasNoData;
1153 10390 : const double dfNoDataValue = args.dfNoDataValue;
1154 10390 : const GDALColorTable *poColorTable = args.poColorTable;
1155 10390 : const bool bPropagateNoData = args.bPropagateNoData;
1156 :
1157 : // AVERAGE_BIT2GRAYSCALE
1158 : const bool bBit2Grayscale =
1159 10390 : CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"));
1160 10397 : const bool bQuadraticMean = CPL_TO_BOOL(EQUAL(pszResampling, "RMS"));
1161 10395 : if (bBit2Grayscale)
1162 9 : poColorTable = nullptr;
1163 :
1164 : T tNoDataValue;
1165 10395 : if (!bHasNoData)
1166 10344 : tNoDataValue = 0;
1167 : else
1168 51 : tNoDataValue = static_cast<T>(dfNoDataValue);
1169 10395 : const T tReplacementVal =
1170 107 : bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
1171 51 : args.eOvrDataType, dfNoDataValue))
1172 : : 0;
1173 :
1174 10395 : int nChunkRightXOff = nChunkXOff + nChunkXSize;
1175 10395 : int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1176 10395 : int nDstXWidth = nDstXOff2 - nDstXOff;
1177 :
1178 : /* -------------------------------------------------------------------- */
1179 : /* Allocate buffers. */
1180 : /* -------------------------------------------------------------------- */
1181 10397 : *ppDstBuffer = static_cast<T *>(
1182 10395 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
1183 : GDALGetDataTypeSizeBytes(eWrkDataType)));
1184 10397 : if (*ppDstBuffer == nullptr)
1185 : {
1186 0 : return CE_Failure;
1187 : }
1188 10397 : T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
1189 :
1190 : struct PrecomputedXValue
1191 : {
1192 : int nLeftXOffShifted;
1193 : int nRightXOffShifted;
1194 : double dfLeftWeight;
1195 : double dfRightWeight;
1196 : double dfTotalWeightFullLine;
1197 : };
1198 :
1199 : PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
1200 10397 : VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(PrecomputedXValue)));
1201 :
1202 10393 : if (pasSrcX == nullptr)
1203 : {
1204 0 : VSIFree(pasSrcX);
1205 0 : return CE_Failure;
1206 : }
1207 :
1208 10393 : int nTransparentIdx = -1;
1209 10393 : std::vector<GDALColorEntry> colorEntries;
1210 10391 : if (poColorTable)
1211 5 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1212 :
1213 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1214 : // it as nodata value
1215 10420 : if (bHasNoData && dfNoDataValue >= 0.0f &&
1216 27 : tNoDataValue < colorEntries.size())
1217 1 : colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1218 :
1219 : // Or if we have no explicit nodata, but a color table entry that is
1220 : // transparent, consider it as the nodata value
1221 10392 : else if (!bHasNoData && nTransparentIdx >= 0)
1222 : {
1223 0 : bHasNoData = true;
1224 0 : tNoDataValue = static_cast<T>(nTransparentIdx);
1225 : }
1226 :
1227 : /* ==================================================================== */
1228 : /* Precompute inner loop constants. */
1229 : /* ==================================================================== */
1230 10393 : bool bSrcXSpacingIsTwo = true;
1231 10393 : int nLastSrcXOff2 = -1;
1232 867056 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1233 : {
1234 856663 : double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
1235 : // Apply some epsilon to avoid numerical precision issues
1236 856663 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
1237 856663 : double dfSrcXOff2 = dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
1238 856663 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
1239 :
1240 856663 : if (nSrcXOff < nChunkXOff)
1241 0 : nSrcXOff = nChunkXOff;
1242 856663 : if (nSrcXOff2 == nSrcXOff)
1243 0 : nSrcXOff2++;
1244 856663 : if (nSrcXOff2 > nChunkRightXOff)
1245 1 : nSrcXOff2 = nChunkRightXOff;
1246 :
1247 856663 : pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
1248 856663 : pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
1249 856663 : nSrcXOff2 - nChunkXOff;
1250 18 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
1251 856663 : (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
1252 856663 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
1253 856663 : 1 - (nSrcXOff2 - dfSrcXOff2);
1254 856663 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
1255 856663 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
1256 856663 : if (nSrcXOff + 1 < nSrcXOff2)
1257 : {
1258 856635 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1259 856635 : nSrcXOff2 - nSrcXOff - 2;
1260 856635 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1261 856635 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
1262 : }
1263 :
1264 856663 : if (nSrcXOff2 - nSrcXOff != 2 ||
1265 727183 : (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
1266 : {
1267 120592 : bSrcXSpacingIsTwo = false;
1268 : }
1269 856663 : nLastSrcXOff2 = nSrcXOff2;
1270 : }
1271 :
1272 : /* ==================================================================== */
1273 : /* Loop over destination scanlines. */
1274 : /* ==================================================================== */
1275 752820 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1276 : {
1277 742425 : double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
1278 742425 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
1279 742425 : if (nSrcYOff < nChunkYOff)
1280 0 : nSrcYOff = nChunkYOff;
1281 :
1282 742425 : double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
1283 742425 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
1284 742425 : if (nSrcYOff2 == nSrcYOff)
1285 0 : ++nSrcYOff2;
1286 742425 : if (nSrcYOff2 > nChunkBottomYOff)
1287 3 : nSrcYOff2 = nChunkBottomYOff;
1288 :
1289 742425 : T *const pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
1290 :
1291 : /* --------------------------------------------------------------------
1292 : */
1293 : /* Loop over destination pixels */
1294 : /* --------------------------------------------------------------------
1295 : */
1296 742425 : if (poColorTable == nullptr)
1297 : {
1298 742321 : if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
1299 : pabyChunkNodataMask == nullptr)
1300 : {
1301 : if (eWrkDataType == GDT_Byte || eWrkDataType == GDT_UInt16)
1302 : {
1303 : // Optimized case : no nodata, overview by a factor of 2 and
1304 : // regular x and y src spacing.
1305 116400 : const T *pSrcScanlineShifted =
1306 116400 : pChunk + pasSrcX[0].nLeftXOffShifted +
1307 116400 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
1308 116400 : nChunkXSize;
1309 116400 : int iDstPixel = 0;
1310 : #ifdef USE_SSE2
1311 116381 : if (bQuadraticMean && eWrkDataType == GDT_Byte)
1312 : {
1313 5385 : iDstPixel = QuadraticMeanByteSSE2OrAVX2(
1314 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1315 : pDstScanline);
1316 : }
1317 111015 : else if (bQuadraticMean /* && eWrkDataType == GDT_UInt16 */)
1318 : {
1319 10 : iDstPixel = QuadraticMeanUInt16SSE2(
1320 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1321 : pDstScanline);
1322 : }
1323 : else if (/* !bQuadraticMean && */ eWrkDataType == GDT_Byte)
1324 : {
1325 110996 : iDstPixel = AverageByteSSE2OrAVX2(
1326 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1327 : pDstScanline);
1328 : }
1329 : else /* if( !bQuadraticMean && eWrkDataType == GDT_UInt16 )
1330 : */
1331 : {
1332 9 : iDstPixel = AverageUInt16SSE2(nDstXWidth, nChunkXSize,
1333 : pSrcScanlineShifted,
1334 : pDstScanline);
1335 : }
1336 : #endif
1337 278721 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1338 : {
1339 162321 : Tsum nTotal = 0;
1340 : T nVal;
1341 162321 : if (bQuadraticMean)
1342 44 : nTotal =
1343 44 : SQUARE<Tsum>(pSrcScanlineShifted[0]) +
1344 44 : SQUARE<Tsum>(pSrcScanlineShifted[1]) +
1345 44 : SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
1346 44 : SQUARE<Tsum>(
1347 44 : pSrcScanlineShifted[1 + nChunkXSize]);
1348 : else
1349 162277 : nTotal = pSrcScanlineShifted[0] +
1350 162277 : pSrcScanlineShifted[1] +
1351 162277 : pSrcScanlineShifted[nChunkXSize] +
1352 162277 : pSrcScanlineShifted[1 + nChunkXSize];
1353 :
1354 162321 : constexpr int nTotalWeight = 4;
1355 162321 : if (bQuadraticMean)
1356 44 : nVal = ComputeIntegerRMS_4values<T>(nTotal);
1357 : else
1358 162277 : nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
1359 : nTotalWeight);
1360 :
1361 : // No need to compare nVal against tNoDataValue as we
1362 : // are in a case where pabyChunkNodataMask == nullptr
1363 : // implies the absence of nodata value.
1364 162321 : pDstScanline[iDstPixel] = nVal;
1365 162321 : pSrcScanlineShifted += 2;
1366 : }
1367 : }
1368 : else
1369 : {
1370 : CPLAssert(eWrkDataType == GDT_Float32 ||
1371 : eWrkDataType == GDT_Float64);
1372 70 : const T *pSrcScanlineShifted =
1373 70 : pChunk + pasSrcX[0].nLeftXOffShifted +
1374 70 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
1375 70 : nChunkXSize;
1376 70 : int iDstPixel = 0;
1377 : #ifdef USE_SSE2
1378 : if (eWrkDataType == GDT_Float32)
1379 : {
1380 48 : if (bQuadraticMean)
1381 : {
1382 34 : iDstPixel = QuadraticMeanFloatSSE2(
1383 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1384 : pDstScanline);
1385 : }
1386 : else
1387 : {
1388 14 : iDstPixel = AverageFloatSSE2(
1389 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1390 : pDstScanline);
1391 : }
1392 : }
1393 : #endif
1394 :
1395 268 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1396 : {
1397 : T nVal;
1398 198 : if (bQuadraticMean)
1399 : {
1400 : // Cast to double to avoid overflows
1401 : // (using std::hypot() is much slower)
1402 100 : nVal = static_cast<T>(std::sqrt(
1403 : 0.25 *
1404 100 : (SQUARE<double>(pSrcScanlineShifted[0]) +
1405 100 : SQUARE<double>(pSrcScanlineShifted[1]) +
1406 100 : SQUARE<double>(
1407 200 : pSrcScanlineShifted[nChunkXSize]) +
1408 100 : SQUARE<double>(
1409 100 : pSrcScanlineShifted[1 + nChunkXSize]))));
1410 : }
1411 : else
1412 : {
1413 98 : nVal = static_cast<T>(
1414 98 : 0.25f * (pSrcScanlineShifted[0] +
1415 98 : pSrcScanlineShifted[1] +
1416 98 : pSrcScanlineShifted[nChunkXSize] +
1417 98 : pSrcScanlineShifted[1 + nChunkXSize]));
1418 : }
1419 :
1420 : // No need to compare nVal against tNoDataValue as we
1421 : // are in a case where pabyChunkNodataMask == nullptr
1422 : // implies the absence of nodata value.
1423 198 : pDstScanline[iDstPixel] = nVal;
1424 198 : pSrcScanlineShifted += 2;
1425 : }
1426 116470 : }
1427 : }
1428 : else
1429 : {
1430 24 : const double dfBottomWeight =
1431 625851 : (nSrcYOff + 1 == nSrcYOff2) ? 1.0
1432 625827 : : 1.0 - (dfSrcYOff - nSrcYOff);
1433 625851 : const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
1434 625851 : nSrcYOff -= nChunkYOff;
1435 625851 : nSrcYOff2 -= nChunkYOff;
1436 :
1437 625851 : double dfTotalWeightFullColumn = dfBottomWeight;
1438 625851 : if (nSrcYOff + 1 < nSrcYOff2)
1439 : {
1440 625820 : dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
1441 625820 : dfTotalWeightFullColumn += dfTopWeight;
1442 : }
1443 :
1444 18585256 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1445 : {
1446 17959281 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1447 17959281 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1448 :
1449 17959281 : double dfTotal = 0;
1450 17959281 : double dfTotalWeight = 0;
1451 17959281 : if (pabyChunkNodataMask == nullptr)
1452 : {
1453 1746435 : auto pChunkShifted =
1454 115 : pChunk +
1455 1746435 : static_cast<GPtrDiff_t>(nSrcYOff) * nChunkXSize;
1456 1746435 : int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1457 1746435 : double dfWeightY = dfBottomWeight;
1458 3493427 : while (true)
1459 : {
1460 : double dfTotalLine;
1461 5239852 : if (bQuadraticMean)
1462 : {
1463 : // Left pixel
1464 : {
1465 104 : const T val = pChunkShifted[nSrcXOff];
1466 104 : dfTotalLine =
1467 104 : SQUARE<double>(val) *
1468 104 : pasSrcX[iDstPixel].dfLeftWeight;
1469 : }
1470 :
1471 104 : if (nSrcXOff + 1 < nSrcXOff2)
1472 : {
1473 : // Middle pixels
1474 104 : for (int iX = nSrcXOff + 1;
1475 424 : iX + 1 < nSrcXOff2; ++iX)
1476 : {
1477 320 : const T val = pChunkShifted[iX];
1478 320 : dfTotalLine += SQUARE<double>(val);
1479 : }
1480 :
1481 : // Right pixel
1482 : {
1483 104 : const T val =
1484 104 : pChunkShifted[nSrcXOff2 - 1];
1485 104 : dfTotalLine +=
1486 104 : SQUARE<double>(val) *
1487 104 : pasSrcX[iDstPixel].dfRightWeight;
1488 : }
1489 : }
1490 : }
1491 : else
1492 : {
1493 : // Left pixel
1494 : {
1495 5239756 : const T val = pChunkShifted[nSrcXOff];
1496 5239756 : dfTotalLine =
1497 5239756 : val * pasSrcX[iDstPixel].dfLeftWeight;
1498 : }
1499 :
1500 5239756 : if (nSrcXOff + 1 < nSrcXOff2)
1501 : {
1502 : // Middle pixels
1503 4239330 : for (int iX = nSrcXOff + 1;
1504 64183126 : iX + 1 < nSrcXOff2; ++iX)
1505 : {
1506 59943836 : const T val = pChunkShifted[iX];
1507 59943836 : dfTotalLine += val;
1508 : }
1509 :
1510 : // Right pixel
1511 : {
1512 4239330 : const T val =
1513 4239330 : pChunkShifted[nSrcXOff2 - 1];
1514 4239330 : dfTotalLine +=
1515 4239330 : val *
1516 4239330 : pasSrcX[iDstPixel].dfRightWeight;
1517 : }
1518 : }
1519 : }
1520 :
1521 5239852 : dfTotal += dfTotalLine * dfWeightY;
1522 5239852 : --nCounterY;
1523 5239852 : if (nCounterY < 0)
1524 1746435 : break;
1525 3493427 : pChunkShifted += nChunkXSize;
1526 3493427 : dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
1527 : }
1528 :
1529 1746435 : dfTotalWeight =
1530 1746435 : pasSrcX[iDstPixel].dfTotalWeightFullLine *
1531 : dfTotalWeightFullColumn;
1532 : }
1533 : else
1534 : {
1535 16212866 : GPtrDiff_t nCount = 0;
1536 71187098 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1537 : {
1538 54974032 : const auto pChunkShifted =
1539 132 : pChunk +
1540 54974032 : static_cast<GPtrDiff_t>(iY) * nChunkXSize;
1541 :
1542 54974032 : double dfTotalLine = 0;
1543 54974032 : double dfTotalWeightLine = 0;
1544 : // Left pixel
1545 : {
1546 54974032 : const int iX = nSrcXOff;
1547 54974032 : const T val = pChunkShifted[iX];
1548 54974032 : if (pabyChunkNodataMask[iX + iY * nChunkXSize])
1549 : {
1550 23417381 : nCount++;
1551 23417381 : const double dfWeightX =
1552 23417381 : pasSrcX[iDstPixel].dfLeftWeight;
1553 23417381 : dfTotalWeightLine = dfWeightX;
1554 23417381 : if (bQuadraticMean)
1555 60 : dfTotalLine =
1556 60 : SQUARE<double>(val) * dfWeightX;
1557 : else
1558 23417381 : dfTotalLine = val * dfWeightX;
1559 : }
1560 : }
1561 :
1562 54974032 : if (nSrcXOff + 1 < nSrcXOff2)
1563 : {
1564 : // Middle pixels
1565 145163132 : for (int iX = nSrcXOff + 1; iX + 1 < nSrcXOff2;
1566 : ++iX)
1567 : {
1568 90193400 : const T val = pChunkShifted[iX];
1569 90193400 : if (pabyChunkNodataMask[iX +
1570 90193400 : iY * nChunkXSize])
1571 : {
1572 39727500 : nCount++;
1573 39727500 : dfTotalWeightLine += 1;
1574 39727500 : if (bQuadraticMean)
1575 0 : dfTotalLine += SQUARE<double>(val);
1576 : else
1577 39727500 : dfTotalLine += val;
1578 : }
1579 : }
1580 :
1581 : // Right pixel
1582 : {
1583 54969432 : const int iX = nSrcXOff2 - 1;
1584 54969432 : const T val = pChunkShifted[iX];
1585 54969432 : if (pabyChunkNodataMask[iX +
1586 54969432 : iY * nChunkXSize])
1587 : {
1588 23418047 : nCount++;
1589 23418047 : const double dfWeightX =
1590 23418047 : pasSrcX[iDstPixel].dfRightWeight;
1591 23418047 : dfTotalWeightLine += dfWeightX;
1592 23418047 : if (bQuadraticMean)
1593 531 : dfTotalLine +=
1594 61 : SQUARE<double>(val) * dfWeightX;
1595 : else
1596 23417946 : dfTotalLine += val * dfWeightX;
1597 : }
1598 : }
1599 : }
1600 :
1601 93736998 : const double dfWeightY =
1602 : (iY == nSrcYOff) ? dfBottomWeight
1603 38762766 : : (iY + 1 == nSrcYOff2) ? dfTopWeight
1604 : : 1.0;
1605 54974232 : dfTotal += dfTotalLine * dfWeightY;
1606 54974232 : dfTotalWeight += dfTotalWeightLine * dfWeightY;
1607 : }
1608 :
1609 16213066 : if (nCount == 0 ||
1610 8 : (bPropagateNoData &&
1611 : nCount <
1612 8 : static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
1613 8 : (nSrcXOff2 - nSrcXOff)))
1614 : {
1615 9461432 : pDstScanline[iDstPixel] = tNoDataValue;
1616 9461432 : continue;
1617 : }
1618 : }
1619 : if (eWrkDataType == GDT_Byte)
1620 : {
1621 : T nVal;
1622 8497910 : if (bQuadraticMean)
1623 38 : nVal = ComputeIntegerRMS<T, int>(dfTotal,
1624 : dfTotalWeight);
1625 : else
1626 8497870 : nVal =
1627 8497870 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1628 8497780 : if (bHasNoData && nVal == tNoDataValue)
1629 0 : nVal = tReplacementVal;
1630 8497780 : pDstScanline[iDstPixel] = nVal;
1631 : }
1632 : else if (eWrkDataType == GDT_UInt16)
1633 : {
1634 : T nVal;
1635 8 : if (bQuadraticMean)
1636 4 : nVal = ComputeIntegerRMS<T, uint64_t>(
1637 : dfTotal, dfTotalWeight);
1638 : else
1639 4 : nVal =
1640 4 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1641 8 : if (bHasNoData && nVal == tNoDataValue)
1642 0 : nVal = tReplacementVal;
1643 8 : pDstScanline[iDstPixel] = nVal;
1644 : }
1645 : else
1646 : {
1647 : T nVal;
1648 151 : if (bQuadraticMean)
1649 20 : nVal =
1650 25 : static_cast<T>(sqrt(dfTotal / dfTotalWeight));
1651 : else
1652 126 : nVal = static_cast<T>(dfTotal / dfTotalWeight);
1653 151 : if (bHasNoData && nVal == tNoDataValue)
1654 2 : nVal = tReplacementVal;
1655 151 : pDstScanline[iDstPixel] = nVal;
1656 : }
1657 : }
1658 : }
1659 : }
1660 : else
1661 : {
1662 104 : nSrcYOff -= nChunkYOff;
1663 104 : nSrcYOff2 -= nChunkYOff;
1664 :
1665 6505 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1666 : {
1667 6475 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1668 6475 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1669 :
1670 6475 : GPtrDiff_t nTotalR = 0;
1671 6475 : GPtrDiff_t nTotalG = 0;
1672 6475 : GPtrDiff_t nTotalB = 0;
1673 6475 : GPtrDiff_t nCount = 0;
1674 :
1675 19425 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1676 : {
1677 38850 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
1678 : {
1679 25900 : const T val = pChunk[iX + static_cast<GPtrDiff_t>(iY) *
1680 25900 : nChunkXSize];
1681 : // cppcheck-suppress unsignedLessThanZero
1682 25900 : if (val < 0 || val >= colorEntries.size())
1683 0 : continue;
1684 25900 : size_t idx = static_cast<size_t>(val);
1685 25900 : const auto &entry = colorEntries[idx];
1686 25900 : if (entry.c4)
1687 : {
1688 14128 : if (bQuadraticMean)
1689 : {
1690 800 : nTotalR += SQUARE<int>(entry.c1);
1691 800 : nTotalG += SQUARE<int>(entry.c2);
1692 800 : nTotalB += SQUARE<int>(entry.c3);
1693 800 : ++nCount;
1694 : }
1695 : else
1696 : {
1697 13328 : nTotalR += entry.c1;
1698 13328 : nTotalG += entry.c2;
1699 13328 : nTotalB += entry.c3;
1700 13328 : ++nCount;
1701 : }
1702 : }
1703 : }
1704 : }
1705 :
1706 6475 : if (nCount == 0 ||
1707 0 : (bPropagateNoData &&
1708 0 : nCount < static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
1709 0 : (nSrcXOff2 - nSrcXOff)))
1710 : {
1711 2838 : pDstScanline[iDstPixel] = tNoDataValue;
1712 : }
1713 : else
1714 : {
1715 : GDALColorEntry color;
1716 3637 : if (bQuadraticMean)
1717 : {
1718 200 : color.c1 =
1719 200 : static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
1720 200 : color.c2 =
1721 200 : static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
1722 200 : color.c3 =
1723 200 : static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
1724 : }
1725 : else
1726 : {
1727 3437 : color.c1 =
1728 3437 : static_cast<short>((nTotalR + nCount / 2) / nCount);
1729 3437 : color.c2 =
1730 3437 : static_cast<short>((nTotalG + nCount / 2) / nCount);
1731 3437 : color.c3 =
1732 3437 : static_cast<short>((nTotalB + nCount / 2) / nCount);
1733 : }
1734 3563 : pDstScanline[iDstPixel] =
1735 3637 : static_cast<T>(BestColorEntry(colorEntries, color));
1736 : }
1737 : }
1738 : }
1739 : }
1740 :
1741 10395 : CPLFree(pasSrcX);
1742 :
1743 10394 : return CE_None;
1744 : }
1745 :
1746 : static CPLErr
1747 10391 : GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
1748 : const void *pChunk, void **ppDstBuffer,
1749 : GDALDataType *peDstBufferDataType)
1750 : {
1751 10391 : *peDstBufferDataType = args.eWrkDataType;
1752 10391 : switch (args.eWrkDataType)
1753 : {
1754 10326 : case GDT_Byte:
1755 : {
1756 10326 : return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_Byte>(
1757 10324 : args, static_cast<const GByte *>(pChunk), ppDstBuffer);
1758 : }
1759 :
1760 9 : case GDT_UInt16:
1761 : {
1762 9 : if (EQUAL(args.pszResampling, "RMS"))
1763 : {
1764 : // Use double as accumulation type, because UInt32 could overflow
1765 : return GDALResampleChunk_AverageOrRMS_T<GUInt16, double,
1766 5 : GDT_UInt16>(
1767 5 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1768 : }
1769 : else
1770 : {
1771 : return GDALResampleChunk_AverageOrRMS_T<GUInt16, GUInt32,
1772 4 : GDT_UInt16>(
1773 4 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1774 : }
1775 : }
1776 :
1777 39 : case GDT_Float32:
1778 : {
1779 39 : return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32>(
1780 39 : args, static_cast<const float *>(pChunk), ppDstBuffer);
1781 : }
1782 :
1783 17 : case GDT_Float64:
1784 : {
1785 : return GDALResampleChunk_AverageOrRMS_T<double, double,
1786 17 : GDT_Float64>(
1787 17 : args, static_cast<const double *>(pChunk), ppDstBuffer);
1788 : }
1789 :
1790 0 : default:
1791 0 : break;
1792 : }
1793 :
1794 0 : CPLAssert(false);
1795 : return CE_Failure;
1796 : }
1797 :
1798 : /************************************************************************/
1799 : /* GDALResampleChunk_Gauss() */
1800 : /************************************************************************/
1801 :
1802 86 : static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
1803 : const void *pChunk, void **ppDstBuffer,
1804 : GDALDataType *peDstBufferDataType)
1805 :
1806 : {
1807 86 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1808 86 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1809 86 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1810 86 : const int nChunkXOff = args.nChunkXOff;
1811 86 : const int nChunkXSize = args.nChunkXSize;
1812 86 : const int nChunkYOff = args.nChunkYOff;
1813 86 : const int nChunkYSize = args.nChunkYSize;
1814 86 : const int nDstXOff = args.nDstXOff;
1815 86 : const int nDstXOff2 = args.nDstXOff2;
1816 86 : const int nDstYOff = args.nDstYOff;
1817 86 : const int nDstYOff2 = args.nDstYOff2;
1818 86 : const bool bHasNoData = args.bHasNoData;
1819 86 : double dfNoDataValue = args.dfNoDataValue;
1820 86 : const GDALColorTable *poColorTable = args.poColorTable;
1821 :
1822 86 : const double *const padfChunk = static_cast<const double *>(pChunk);
1823 :
1824 86 : *ppDstBuffer =
1825 86 : VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
1826 : GDALGetDataTypeSizeBytes(GDT_Float64));
1827 86 : if (*ppDstBuffer == nullptr)
1828 : {
1829 0 : return CE_Failure;
1830 : }
1831 86 : *peDstBufferDataType = GDT_Float64;
1832 86 : double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
1833 :
1834 : /* -------------------------------------------------------------------- */
1835 : /* Create the filter kernel and allocate scanline buffer. */
1836 : /* -------------------------------------------------------------------- */
1837 86 : int nGaussMatrixDim = 3;
1838 : const int *panGaussMatrix;
1839 86 : constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
1840 86 : constexpr int anGaussMatrix5x5[] = {1, 4, 6, 4, 1, 4, 16, 24, 16,
1841 : 4, 6, 24, 36, 24, 6, 4, 16, 24,
1842 : 16, 4, 1, 4, 6, 4, 1};
1843 86 : constexpr int anGaussMatrix7x7[] = {
1844 : 1, 6, 15, 20, 15, 6, 1, 6, 36, 90, 120, 90, 36,
1845 : 6, 15, 90, 225, 300, 225, 90, 15, 20, 120, 300, 400, 300,
1846 : 120, 20, 15, 90, 225, 300, 225, 90, 15, 6, 36, 90, 120,
1847 : 90, 36, 6, 1, 6, 15, 20, 15, 6, 1};
1848 :
1849 86 : const int nOXSize = args.nOvrXSize;
1850 86 : const int nOYSize = args.nOvrYSize;
1851 86 : const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
1852 :
1853 : // matrix for gauss filter
1854 86 : if (nResYFactor <= 2)
1855 : {
1856 85 : panGaussMatrix = anGaussMatrix3x3;
1857 85 : nGaussMatrixDim = 3;
1858 : }
1859 1 : else if (nResYFactor <= 4)
1860 : {
1861 0 : panGaussMatrix = anGaussMatrix5x5;
1862 0 : nGaussMatrixDim = 5;
1863 : }
1864 : else
1865 : {
1866 1 : panGaussMatrix = anGaussMatrix7x7;
1867 1 : nGaussMatrixDim = 7;
1868 : }
1869 :
1870 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
1871 : int *panGaussMatrixDup = static_cast<int *>(
1872 : CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
1873 : memcpy(panGaussMatrixDup, panGaussMatrix,
1874 : sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
1875 : panGaussMatrix = panGaussMatrixDup;
1876 : #endif
1877 :
1878 86 : if (!bHasNoData)
1879 79 : dfNoDataValue = 0.0;
1880 :
1881 86 : std::vector<GDALColorEntry> colorEntries;
1882 86 : int nTransparentIdx = -1;
1883 86 : if (poColorTable)
1884 2 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1885 :
1886 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1887 : // it as nodata value.
1888 92 : if (bHasNoData && dfNoDataValue >= 0.0f &&
1889 6 : dfNoDataValue < colorEntries.size())
1890 0 : colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
1891 :
1892 : // Or if we have no explicit nodata, but a color table entry that is
1893 : // transparent, consider it as the nodata value.
1894 86 : else if (!bHasNoData && nTransparentIdx >= 0)
1895 : {
1896 0 : dfNoDataValue = nTransparentIdx;
1897 : }
1898 :
1899 86 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
1900 86 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1901 86 : const int nDstXWidth = nDstXOff2 - nDstXOff;
1902 :
1903 : /* ==================================================================== */
1904 : /* Loop over destination scanlines. */
1905 : /* ==================================================================== */
1906 16488 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1907 : {
1908 16402 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
1909 16402 : int nSrcYOff2 =
1910 16402 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
1911 :
1912 16402 : if (nSrcYOff < nChunkYOff)
1913 : {
1914 0 : nSrcYOff = nChunkYOff;
1915 0 : nSrcYOff2++;
1916 : }
1917 :
1918 16402 : const int iSizeY = nSrcYOff2 - nSrcYOff;
1919 16402 : nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
1920 16402 : nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
1921 :
1922 16402 : if (nSrcYOff2 > nChunkBottomYOff ||
1923 16359 : (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
1924 : {
1925 44 : nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
1926 : }
1927 :
1928 16402 : int nYShiftGaussMatrix = 0;
1929 16402 : if (nSrcYOff < nChunkYOff)
1930 : {
1931 0 : nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
1932 0 : nSrcYOff = nChunkYOff;
1933 : }
1934 :
1935 16402 : const double *const padfSrcScanline =
1936 16402 : padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
1937 16402 : const GByte *pabySrcScanlineNodataMask = nullptr;
1938 16402 : if (pabyChunkNodataMask != nullptr)
1939 152 : pabySrcScanlineNodataMask =
1940 152 : pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
1941 :
1942 : /* --------------------------------------------------------------------
1943 : */
1944 : /* Loop over destination pixels */
1945 : /* --------------------------------------------------------------------
1946 : */
1947 16402 : double *const padfDstScanline =
1948 16402 : padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
1949 4149980 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1950 : {
1951 4133580 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
1952 4133580 : int nSrcXOff2 =
1953 4133580 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
1954 :
1955 4133580 : if (nSrcXOff < nChunkXOff)
1956 : {
1957 0 : nSrcXOff = nChunkXOff;
1958 0 : nSrcXOff2++;
1959 : }
1960 :
1961 4133580 : const int iSizeX = nSrcXOff2 - nSrcXOff;
1962 4133580 : nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
1963 4133580 : nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
1964 :
1965 4133580 : if (nSrcXOff2 > nChunkRightXOff ||
1966 4127930 : (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
1967 : {
1968 5650 : nSrcXOff2 =
1969 5650 : std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
1970 : }
1971 :
1972 4133580 : int nXShiftGaussMatrix = 0;
1973 4133580 : if (nSrcXOff < nChunkXOff)
1974 : {
1975 0 : nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
1976 0 : nSrcXOff = nChunkXOff;
1977 : }
1978 :
1979 4133580 : if (poColorTable == nullptr)
1980 : {
1981 4133380 : double dfTotal = 0.0;
1982 4133380 : GInt64 nCount = 0;
1983 4133380 : const int *panLineWeight =
1984 4133380 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
1985 : nXShiftGaussMatrix;
1986 :
1987 16527900 : for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
1988 12394500 : ++iY, ++j, panLineWeight += nGaussMatrixDim)
1989 : {
1990 49561300 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
1991 : {
1992 37166800 : const double val =
1993 37166800 : padfSrcScanline[iX - nChunkXOff +
1994 37166800 : static_cast<GPtrDiff_t>(iY -
1995 37166800 : nSrcYOff) *
1996 37166800 : nChunkXSize];
1997 37166800 : if (pabySrcScanlineNodataMask == nullptr ||
1998 32872 : pabySrcScanlineNodataMask[iX - nChunkXOff +
1999 32872 : static_cast<GPtrDiff_t>(
2000 32872 : iY - nSrcYOff) *
2001 32872 : nChunkXSize])
2002 : {
2003 37146100 : const int nWeight = panLineWeight[i];
2004 37146100 : dfTotal += val * nWeight;
2005 37146100 : nCount += nWeight;
2006 : }
2007 : }
2008 : }
2009 :
2010 4133380 : if (nCount == 0)
2011 : {
2012 2217 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2013 : }
2014 : else
2015 : {
2016 4131160 : padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
2017 : }
2018 : }
2019 : else
2020 : {
2021 200 : GInt64 nTotalR = 0;
2022 200 : GInt64 nTotalG = 0;
2023 200 : GInt64 nTotalB = 0;
2024 200 : GInt64 nTotalWeight = 0;
2025 200 : const int *panLineWeight =
2026 200 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
2027 : nXShiftGaussMatrix;
2028 :
2029 780 : for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
2030 580 : ++iY, ++j, panLineWeight += nGaussMatrixDim)
2031 : {
2032 2262 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2033 : {
2034 1682 : const double val =
2035 1682 : padfSrcScanline[iX - nChunkXOff +
2036 1682 : static_cast<GPtrDiff_t>(iY -
2037 1682 : nSrcYOff) *
2038 1682 : nChunkXSize];
2039 1682 : if (val < 0 || val >= colorEntries.size())
2040 0 : continue;
2041 :
2042 1682 : size_t idx = static_cast<size_t>(val);
2043 1682 : if (colorEntries[idx].c4)
2044 : {
2045 1682 : const int nWeight = panLineWeight[i];
2046 1682 : nTotalR +=
2047 1682 : static_cast<GInt64>(colorEntries[idx].c1) *
2048 1682 : nWeight;
2049 1682 : nTotalG +=
2050 1682 : static_cast<GInt64>(colorEntries[idx].c2) *
2051 1682 : nWeight;
2052 1682 : nTotalB +=
2053 1682 : static_cast<GInt64>(colorEntries[idx].c3) *
2054 1682 : nWeight;
2055 1682 : nTotalWeight += nWeight;
2056 : }
2057 : }
2058 : }
2059 :
2060 200 : if (nTotalWeight == 0)
2061 : {
2062 0 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2063 : }
2064 : else
2065 : {
2066 : GDALColorEntry color;
2067 :
2068 200 : color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
2069 : nTotalWeight);
2070 200 : color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
2071 : nTotalWeight);
2072 200 : color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
2073 : nTotalWeight);
2074 200 : padfDstScanline[iDstPixel - nDstXOff] =
2075 200 : BestColorEntry(colorEntries, color);
2076 : }
2077 : }
2078 : }
2079 : }
2080 :
2081 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
2082 : CPLFree(panGaussMatrixDup);
2083 : #endif
2084 :
2085 86 : return CE_None;
2086 : }
2087 :
2088 : /************************************************************************/
2089 : /* GDALResampleChunk_Mode() */
2090 : /************************************************************************/
2091 :
/** Return whether two values must be counted as the same value by mode
 * resampling. The generic version is plain equality. */
template <class T> static inline bool IsSame(T a, T b)
{
    return a == b;
}

/** float: equal values, or both NaN (NaN never compares equal to itself). */
template <> bool IsSame<float>(float a, float b)
{
    return a == b || (std::isnan(a) && std::isnan(b));
}

/** double: equal values, or both NaN. */
template <> bool IsSame<double>(double a, double b)
{
    return a == b || (std::isnan(a) && std::isnan(b));
}

/** complex<float>: real and imaginary parts are compared independently
 * with the float rule, so that a value with a single NaN component is
 * still identical to itself. */
template <>
bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
{
    return IsSame<float>(a.real(), b.real()) &&
           IsSame<float>(a.imag(), b.imag());
}

/** complex<double>: real and imaginary parts are compared independently
 * with the double rule. */
template <>
bool IsSame<std::complex<double>>(std::complex<double> a,
                                  std::complex<double> b)
{
    return IsSame<double>(a.real(), b.real()) &&
           IsSame<double>(a.imag(), b.imag());
}
2121 :
2122 : template <class T>
2123 136 : static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
2124 : const T *pChunk, T *const pDstBuffer)
2125 :
2126 : {
2127 136 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2128 136 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2129 136 : const double dfSrcXDelta = args.dfSrcXDelta;
2130 136 : const double dfSrcYDelta = args.dfSrcYDelta;
2131 136 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2132 136 : const int nChunkXOff = args.nChunkXOff;
2133 136 : const int nChunkXSize = args.nChunkXSize;
2134 136 : const int nChunkYOff = args.nChunkYOff;
2135 136 : const int nChunkYSize = args.nChunkYSize;
2136 136 : const int nDstXOff = args.nDstXOff;
2137 136 : const int nDstXOff2 = args.nDstXOff2;
2138 136 : const int nDstYOff = args.nDstYOff;
2139 136 : const int nDstYOff2 = args.nDstYOff2;
2140 136 : const bool bHasNoData = args.bHasNoData;
2141 136 : const GDALColorTable *poColorTable = args.poColorTable;
2142 136 : const int nDstXSize = nDstXOff2 - nDstXOff;
2143 :
2144 8 : T tNoDataValue;
2145 : if constexpr (std::is_same<T, std::complex<float>>::value ||
2146 : std::is_same<T, std::complex<double>>::value)
2147 : {
2148 : using BaseT = typename T::value_type;
2149 8 : tNoDataValue =
2150 : std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
2151 : std::numeric_limits<BaseT>::quiet_NaN());
2152 : }
2153 128 : else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
2154 127 : tNoDataValue = 0;
2155 : else
2156 1 : tNoDataValue = static_cast<T>(args.dfNoDataValue);
2157 :
2158 136 : size_t nMaxNumPx = 0;
2159 136 : T *paVals = nullptr;
2160 136 : int *panSums = nullptr;
2161 :
2162 136 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2163 136 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2164 272 : std::vector<int> anVals(256, 0);
2165 :
2166 : /* ==================================================================== */
2167 : /* Loop over destination scanlines. */
2168 : /* ==================================================================== */
2169 7531 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2170 : {
2171 7395 : double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
2172 7395 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
2173 : #ifdef only_pixels_with_more_than_10_pct_participation
2174 : // When oversampling, don't take into account pixels that have a tiny
2175 : // participation in the resulting pixel
2176 : if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2177 : nSrcYOff < nChunkBottomYOff)
2178 : nSrcYOff++;
2179 : #endif
2180 7395 : if (nSrcYOff < nChunkYOff)
2181 0 : nSrcYOff = nChunkYOff;
2182 :
2183 7395 : double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
2184 7395 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
2185 : #ifdef only_pixels_with_more_than_10_pct_participation
2186 : // When oversampling, don't take into account pixels that have a tiny
2187 : // participation in the resulting pixel
2188 : if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2189 : nSrcYOff2 > nChunkYOff)
2190 : nSrcYOff2--;
2191 : #endif
2192 7395 : if (nSrcYOff2 == nSrcYOff)
2193 0 : ++nSrcYOff2;
2194 7395 : if (nSrcYOff2 > nChunkBottomYOff)
2195 0 : nSrcYOff2 = nChunkBottomYOff;
2196 :
2197 7395 : const T *const paSrcScanline =
2198 149 : pChunk +
2199 7395 : (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
2200 7395 : const GByte *pabySrcScanlineNodataMask = nullptr;
2201 7395 : if (pabyChunkNodataMask != nullptr)
2202 1810 : pabySrcScanlineNodataMask =
2203 : pabyChunkNodataMask +
2204 1810 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
2205 :
2206 7395 : T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
2207 : /* --------------------------------------------------------------------
2208 : */
2209 : /* Loop over destination pixels */
2210 : /* --------------------------------------------------------------------
2211 : */
2212 4259580 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2213 : {
2214 4252187 : double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
2215 : // Apply some epsilon to avoid numerical precision issues
2216 4252187 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
2217 : #ifdef only_pixels_with_more_than_10_pct_participation
2218 : // When oversampling, don't take into account pixels that have a
2219 : // tiny participation in the resulting pixel
2220 : if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2221 : nSrcXOff < nChunkRightXOff)
2222 : nSrcXOff++;
2223 : #endif
2224 4252187 : if (nSrcXOff < nChunkXOff)
2225 0 : nSrcXOff = nChunkXOff;
2226 :
2227 4252187 : double dfSrcXOff2 =
2228 4252187 : dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
2229 4252187 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
2230 : #ifdef only_pixels_with_more_than_10_pct_participation
2231 : // When oversampling, don't take into account pixels that have a
2232 : // tiny participation in the resulting pixel
2233 : if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2234 : nSrcXOff2 > nChunkXOff)
2235 : nSrcXOff2--;
2236 : #endif
2237 4252187 : if (nSrcXOff2 == nSrcXOff)
2238 0 : nSrcXOff2++;
2239 4252187 : if (nSrcXOff2 > nChunkRightXOff)
2240 0 : nSrcXOff2 = nChunkRightXOff;
2241 :
2242 4252187 : bool bRegularProcessing = false;
2243 : if constexpr (!std::is_same<T, GByte>::value)
2244 827 : bRegularProcessing = true;
2245 4251360 : else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
2246 0 : bRegularProcessing = true;
2247 :
2248 4252187 : if (bRegularProcessing)
2249 : {
2250 : // Not sure how much sense it makes to run a majority
2251 : // filter on floating point data, but here it is for the sake
2252 : // of compatibility. It won't look right on RGB images by the
2253 : // nature of the filter.
2254 :
2255 827 : if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
2256 2481 : nSrcYOff2 - nSrcYOff > INT_MAX / (nSrcXOff2 - nSrcXOff) ||
2257 827 : static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
2258 827 : static_cast<size_t>(nSrcXOff2 - nSrcXOff) >
2259 827 : std::numeric_limits<size_t>::max() / sizeof(float))
2260 : {
2261 0 : CPLError(CE_Failure, CPLE_NotSupported,
2262 : "Too big downsampling factor");
2263 0 : CPLFree(paVals);
2264 0 : CPLFree(panSums);
2265 0 : return CE_Failure;
2266 : }
2267 827 : const size_t nNumPx =
2268 827 : static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
2269 827 : static_cast<size_t>(nSrcXOff2 - nSrcXOff);
2270 827 : size_t iMaxInd = 0;
2271 827 : size_t iMaxVal = 0;
2272 827 : bool biMaxValdValid = false;
2273 :
2274 827 : if (paVals == nullptr || nNumPx > nMaxNumPx)
2275 : {
2276 : T *paValsNew = static_cast<T *>(
2277 71 : VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
2278 : int *panSumsNew = static_cast<int *>(
2279 71 : VSI_REALLOC_VERBOSE(panSums, nNumPx * sizeof(int)));
2280 71 : if (paValsNew != nullptr)
2281 71 : paVals = paValsNew;
2282 71 : if (panSumsNew != nullptr)
2283 71 : panSums = panSumsNew;
2284 71 : if (paValsNew == nullptr || panSumsNew == nullptr)
2285 : {
2286 0 : CPLFree(paVals);
2287 0 : CPLFree(panSums);
2288 0 : return CE_Failure;
2289 : }
2290 71 : nMaxNumPx = nNumPx;
2291 : }
2292 :
2293 2585 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2294 : {
2295 1758 : const GPtrDiff_t iTotYOff =
2296 1758 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2297 1758 : nChunkXOff;
2298 5690 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2299 : {
2300 3932 : if (pabySrcScanlineNodataMask == nullptr ||
2301 16 : pabySrcScanlineNodataMask[iX + iTotYOff])
2302 : {
2303 3917 : const T val = paSrcScanline[iX + iTotYOff];
2304 3917 : size_t i = 0; // Used after for.
2305 :
2306 : // Check array for existing entry.
2307 14387 : for (; i < iMaxInd; ++i)
2308 17626 : if (IsSame(paVals[i], val) &&
2309 6910 : ++panSums[i] > panSums[iMaxVal])
2310 : {
2311 246 : iMaxVal = i;
2312 246 : biMaxValdValid = true;
2313 246 : break;
2314 : }
2315 :
2316 : // Add to arr if entry not already there.
2317 3917 : if (i == iMaxInd)
2318 : {
2319 3671 : paVals[iMaxInd] = val;
2320 3671 : panSums[iMaxInd] = 1;
2321 :
2322 3671 : if (!biMaxValdValid)
2323 : {
2324 824 : iMaxVal = iMaxInd;
2325 824 : biMaxValdValid = true;
2326 : }
2327 :
2328 3671 : ++iMaxInd;
2329 : }
2330 : }
2331 : }
2332 : }
2333 :
2334 827 : if (!biMaxValdValid)
2335 3 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2336 : else
2337 824 : paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
2338 : }
2339 : else if constexpr (std::is_same<T, GByte>::value)
2340 : // ( eSrcDataType == GDT_Byte && nEntryCount < 256 )
2341 : {
2342 : // So we go here for a paletted or non-paletted byte band.
2343 : // The input values are then between 0 and 255.
2344 4251360 : int nMaxVal = 0;
2345 4251360 : int iMaxInd = -1;
2346 :
2347 : // The cost of this zeroing might be high. Perhaps we should
2348 : // just use the above generic case, and go to this one if the
2349 : // number of source pixels is large enough
2350 4251360 : std::fill(anVals.begin(), anVals.end(), 0);
2351 :
2352 12777700 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2353 : {
2354 8526370 : const GPtrDiff_t iTotYOff =
2355 8526370 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2356 8526370 : nChunkXOff;
2357 25649400 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2358 : {
2359 17123000 : const T val = paSrcScanline[iX + iTotYOff];
2360 17123000 : if (!bHasNoData || val != tNoDataValue)
2361 : {
2362 17123000 : int nVal = static_cast<int>(val);
2363 17123000 : if (++anVals[nVal] > nMaxVal)
2364 : {
2365 : // Sum the density.
2366 : // Is it the most common value so far?
2367 17006300 : iMaxInd = nVal;
2368 17006300 : nMaxVal = anVals[nVal];
2369 : }
2370 : }
2371 : }
2372 : }
2373 :
2374 4251360 : if (iMaxInd == -1)
2375 0 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2376 : else
2377 4251360 : paDstScanline[iDstPixel - nDstXOff] =
2378 : static_cast<T>(iMaxInd);
2379 : }
2380 : }
2381 : }
2382 :
2383 136 : CPLFree(paVals);
2384 136 : CPLFree(panSums);
2385 :
2386 136 : return CE_None;
2387 : }
2388 :
2389 136 : static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
2390 : const void *pChunk, void **ppDstBuffer,
2391 : GDALDataType *peDstBufferDataType)
2392 : {
2393 136 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
2394 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
2395 : GDALGetDataTypeSizeBytes(args.eWrkDataType));
2396 136 : if (*ppDstBuffer == nullptr)
2397 : {
2398 0 : return CE_Failure;
2399 : }
2400 :
2401 136 : CPLAssert(args.eSrcDataType == args.eWrkDataType);
2402 :
2403 136 : *peDstBufferDataType = args.eWrkDataType;
2404 136 : switch (args.eWrkDataType)
2405 : {
2406 : // For mode resampling, as no computation is done, only the
2407 : // size of the data type matters... except for Byte where we have
2408 : // special processing. And for floating point values
2409 65 : case GDT_Byte:
2410 : {
2411 65 : return GDALResampleChunk_ModeT(args,
2412 : static_cast<const GByte *>(pChunk),
2413 65 : static_cast<GByte *>(*ppDstBuffer));
2414 : }
2415 :
2416 4 : case GDT_Int8:
2417 : {
2418 4 : return GDALResampleChunk_ModeT(args,
2419 : static_cast<const int8_t *>(pChunk),
2420 4 : static_cast<int8_t *>(*ppDstBuffer));
2421 : }
2422 :
2423 9 : case GDT_Int16:
2424 : case GDT_UInt16:
2425 : {
2426 9 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
2427 9 : return GDALResampleChunk_ModeT(
2428 : args, static_cast<const uint16_t *>(pChunk),
2429 9 : static_cast<uint16_t *>(*ppDstBuffer));
2430 : }
2431 :
2432 15 : case GDT_CInt16:
2433 : case GDT_Int32:
2434 : case GDT_UInt32:
2435 : {
2436 15 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
2437 15 : return GDALResampleChunk_ModeT(
2438 : args, static_cast<const uint32_t *>(pChunk),
2439 15 : static_cast<uint32_t *>(*ppDstBuffer));
2440 : }
2441 :
2442 17 : case GDT_Float32:
2443 : {
2444 17 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
2445 17 : return GDALResampleChunk_ModeT(args,
2446 : static_cast<const float *>(pChunk),
2447 17 : static_cast<float *>(*ppDstBuffer));
2448 : }
2449 :
2450 12 : case GDT_CInt32:
2451 : case GDT_Int64:
2452 : case GDT_UInt64:
2453 : {
2454 12 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
2455 12 : return GDALResampleChunk_ModeT(
2456 : args, static_cast<const uint64_t *>(pChunk),
2457 12 : static_cast<uint64_t *>(*ppDstBuffer));
2458 : }
2459 :
2460 6 : case GDT_Float64:
2461 : {
2462 6 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
2463 6 : return GDALResampleChunk_ModeT(args,
2464 : static_cast<const double *>(pChunk),
2465 6 : static_cast<double *>(*ppDstBuffer));
2466 : }
2467 :
2468 4 : case GDT_CFloat32:
2469 : {
2470 4 : return GDALResampleChunk_ModeT(
2471 : args, static_cast<const std::complex<float> *>(pChunk),
2472 4 : static_cast<std::complex<float> *>(*ppDstBuffer));
2473 : }
2474 :
2475 4 : case GDT_CFloat64:
2476 : {
2477 4 : return GDALResampleChunk_ModeT(
2478 : args, static_cast<const std::complex<double> *>(pChunk),
2479 4 : static_cast<std::complex<double> *>(*ppDstBuffer));
2480 : }
2481 :
2482 0 : case GDT_Unknown:
2483 : case GDT_TypeCount:
2484 0 : break;
2485 : }
2486 :
2487 0 : CPLAssert(false);
2488 : return CE_Failure;
2489 : }
2490 :
2491 : /************************************************************************/
2492 : /* GDALResampleConvolutionHorizontal() */
2493 : /************************************************************************/
2494 :
/**
 * Weighted sum of nSrcPixelCount consecutive values of a row.
 *
 * Two partial sums are kept: within each group of four pixels, the first
 * two products go to the first sum and the last two to the second one.
 * Remaining pixels go to the first sum. The result is the sum of both.
 *
 * @param pChunk          First source value.
 * @param padfWeights     One weight per source value.
 * @param nSrcPixelCount  Number of values / weights to combine.
 * @return Sum of pChunk[k] * padfWeights[k].
 */
template <class T>
static inline double
GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                  int nSrcPixelCount)
{
    double dfSumLow = 0.0;
    double dfSumHigh = 0.0;
    int iPixel = 0;  // Shared by both loops.
// Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
// manually (untypical) unrolled loop in -O2 and -O3:
// https://github.com/OSGeo/gdal/issues/9508
#if !defined(__INTEL_CLANG_COMPILER)
    while (iPixel + 3 < nSrcPixelCount)
    {
        const T *const pSrc = pChunk + iPixel;
        const double *const pWeight = padfWeights + iPixel;
        dfSumLow += pSrc[0] * pWeight[0];
        dfSumLow += pSrc[1] * pWeight[1];
        dfSumHigh += pSrc[2] * pWeight[2];
        dfSumHigh += pSrc[3] * pWeight[3];
        iPixel += 4;
    }
#endif
    // Leftover pixels (or all of them when the unrolled loop is disabled).
    while (iPixel < nSrcPixelCount)
    {
        dfSumLow += pChunk[iPixel] * padfWeights[iPixel];
        ++iPixel;
    }
    return dfSumLow + dfSumHigh;
}
2521 :
2522 : template <class T>
2523 48 : static inline void GDALResampleConvolutionHorizontalWithMask(
2524 : const T *pChunk, const GByte *pabyMask, const double *padfWeights,
2525 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2526 : {
2527 48 : dfVal = 0;
2528 48 : dfWeightSum = 0;
2529 48 : int i = 0;
2530 48 : for (; i + 3 < nSrcPixelCount; i += 4)
2531 : {
2532 0 : const double dfWeight0 = padfWeights[i] * pabyMask[i];
2533 0 : const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
2534 0 : const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
2535 0 : const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
2536 0 : dfVal += pChunk[i] * dfWeight0;
2537 0 : dfVal += pChunk[i + 1] * dfWeight1;
2538 0 : dfVal += pChunk[i + 2] * dfWeight2;
2539 0 : dfVal += pChunk[i + 3] * dfWeight3;
2540 0 : dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
2541 : }
2542 178 : for (; i < nSrcPixelCount; ++i)
2543 : {
2544 130 : const double dfWeight = padfWeights[i] * pabyMask[i];
2545 130 : dfVal += pChunk[i] * dfWeight;
2546 130 : dfWeightSum += dfWeight;
2547 : }
2548 48 : }
2549 :
/**
 * Weighted sum of nSrcPixelCount consecutive values, for three rows that
 * share the same weights.
 *
 * Each row keeps two partial sums: within each group of four pixels, the
 * first two products go to the first sum and the last two to the second
 * one. Remaining pixels go to the first sum of their row.
 *
 * @param pChunkRow1      First value of row 1.
 * @param pChunkRow2      First value of row 2.
 * @param pChunkRow3      First value of row 3.
 * @param padfWeights     One weight per source column.
 * @param nSrcPixelCount  Number of columns to combine.
 * @param[out] dfRes1     Weighted sum for row 1.
 * @param[out] dfRes2     Weighted sum for row 2.
 * @param[out] dfRes3     Weighted sum for row 3.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontal_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    double dfRow1Low = 0.0;
    double dfRow1High = 0.0;
    double dfRow2Low = 0.0;
    double dfRow2High = 0.0;
    double dfRow3Low = 0.0;
    double dfRow3High = 0.0;

    int iCol = 0;  // Shared by both loops.
    while (iCol + 3 < nSrcPixelCount)
    {
        const double *const pWeight = padfWeights + iCol;
        const T *const pRow1 = pChunkRow1 + iCol;
        const T *const pRow2 = pChunkRow2 + iCol;
        const T *const pRow3 = pChunkRow3 + iCol;

        dfRow1Low += pRow1[0] * pWeight[0];
        dfRow1Low += pRow1[1] * pWeight[1];
        dfRow1High += pRow1[2] * pWeight[2];
        dfRow1High += pRow1[3] * pWeight[3];

        dfRow2Low += pRow2[0] * pWeight[0];
        dfRow2Low += pRow2[1] * pWeight[1];
        dfRow2High += pRow2[2] * pWeight[2];
        dfRow2High += pRow2[3] * pWeight[3];

        dfRow3Low += pRow3[0] * pWeight[0];
        dfRow3Low += pRow3[1] * pWeight[1];
        dfRow3High += pRow3[2] * pWeight[2];
        dfRow3High += pRow3[3] * pWeight[3];

        iCol += 4;
    }

    // Leftover columns.
    while (iCol < nSrcPixelCount)
    {
        const double dfWeight = padfWeights[iCol];
        dfRow1Low += pChunkRow1[iCol] * dfWeight;
        dfRow2Low += pChunkRow2[iCol] * dfWeight;
        dfRow3Low += pChunkRow3[iCol] * dfWeight;
        ++iCol;
    }

    dfRes1 = dfRow1Low + dfRow1High;
    dfRes2 = dfRow2Low + dfRow2High;
    dfRes3 = dfRow3Low + dfRow3High;
}
2588 :
/**
 * Three-row weighted sum; generic version that simply forwards every
 * argument to GDALResampleConvolutionHorizontal_3rows().
 *
 * @param pChunkRow1      First value of row 1.
 * @param pChunkRow2      First value of row 2.
 * @param pChunkRow3      First value of row 3.
 * @param padfWeights     One weight per source column.
 * @param nSrcPixelCount  Number of columns to combine.
 * @param[out] dfRes1     Weighted sum for row 1.
 * @param[out] dfRes2     Weighted sum for row 2.
 * @param[out] dfRes3     Weighted sum for row 3.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    // No type-specific shortcut here: use the general implementation.
    GDALResampleConvolutionHorizontal_3rows(
        pChunkRow1, pChunkRow2, pChunkRow3,  // source rows
        padfWeights, nSrcPixelCount,         // shared weights
        dfRes1, dfRes2, dfRes3);             // outputs
}
2599 :
// Generic (non-specialized) version for exactly 4 source pixels per row:
// forwards to GDALResampleConvolutionHorizontal_3rows() with a fixed pixel
// count of 4. The weighted sums for the three rows are returned through
// dfRes1, dfRes2 and dfRes3.
template <class T>
static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
{
    constexpr int nFixedSrcPixelCount = 4;
    GDALResampleConvolutionHorizontal_3rows(
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeights, nFixedSrcPixelCount,
        dfRes1, dfRes2, dfRes3);
}
2609 :
2610 : /************************************************************************/
2611 : /* GDALResampleConvolutionVertical() */
2612 : /************************************************************************/
2613 :
// Returns the weighted sum
//   sum(pChunk[k * nStride] * padfWeights[k]) for k in [0, nSrcLineCount).
//
// pChunk         first sample of the column to filter.
// nStride        distance, in elements of T, between two consecutive samples.
// padfWeights    nSrcLineCount weights.
// nSrcLineCount  number of samples/weights; 0 yields 0.0.
//
// The main loop is unrolled by 4 and uses two independent accumulators, so
// the floating-point summation order is: dfVal1 gets lines 4n and 4n+1 plus
// the tail, dfVal2 gets lines 4n+2 and 4n+3.
template <class T>
static inline double
GDALResampleConvolutionVertical(const T *pChunk, int nStride,
                                const double *padfWeights, int nSrcLineCount)
{
    // Element offsets are computed with std::ptrdiff_t: the product of a line
    // index and nStride is not guaranteed to fit in an int, and signed int
    // overflow is undefined behavior.
    const std::ptrdiff_t nStep = nStride;
    double dfVal1 = 0.0;
    double dfVal2 = 0.0;
    int i = 0;
    std::ptrdiff_t j = 0;
    for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStep)
    {
        dfVal1 += pChunk[j] * padfWeights[i];
        dfVal1 += pChunk[j + nStep] * padfWeights[i + 1];
        dfVal2 += pChunk[j + 2 * nStep] * padfWeights[i + 2];
        dfVal2 += pChunk[j + 3 * nStep] * padfWeights[i + 3];
    }
    // Remaining 0 to 3 lines.
    for (; i < nSrcLineCount; ++i, j += nStep)
    {
        dfVal1 += pChunk[j] * padfWeights[i];
    }
    return dfVal1 + dfVal2;
}
2636 :
// Same weighted sum as GDALResampleConvolutionVertical(), computed for two
// adjacent columns at once:
//   dfRes1 = sum(pChunk[k * nStride]     * padfWeights[k])
//   dfRes2 = sum(pChunk[k * nStride + 1] * padfWeights[k])
// for k in [0, nSrcLineCount).
//
// pChunk         first sample of the first column.
// nStride        distance, in elements of T, between two consecutive lines.
// padfWeights    nSrcLineCount weights, shared by both columns.
// nSrcLineCount  number of lines/weights; 0 yields 0.0 for both results.
//
// Each column uses two accumulators (lines 4n, 4n+1 and tail / lines 4n+2,
// 4n+3), which fixes the floating-point summation order.
template <class T>
static inline void GDALResampleConvolutionVertical_2cols(
    const T *pChunk, int nStride, const double *padfWeights, int nSrcLineCount,
    double &dfRes1, double &dfRes2)
{
    // Element offsets are computed with std::ptrdiff_t: the product of a line
    // index and nStride is not guaranteed to fit in an int, and signed int
    // overflow is undefined behavior.
    const std::ptrdiff_t nStep = nStride;
    double dfVal1 = 0.0;
    double dfVal2 = 0.0;
    double dfVal3 = 0.0;
    double dfVal4 = 0.0;
    int i = 0;
    std::ptrdiff_t j = 0;
    for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStep)
    {
        dfVal1 += pChunk[j] * padfWeights[i];
        dfVal3 += pChunk[j + 1] * padfWeights[i];
        dfVal1 += pChunk[j + nStep] * padfWeights[i + 1];
        dfVal3 += pChunk[j + 1 + nStep] * padfWeights[i + 1];
        dfVal2 += pChunk[j + 2 * nStep] * padfWeights[i + 2];
        dfVal4 += pChunk[j + 1 + 2 * nStep] * padfWeights[i + 2];
        dfVal2 += pChunk[j + 3 * nStep] * padfWeights[i + 3];
        dfVal4 += pChunk[j + 1 + 3 * nStep] * padfWeights[i + 3];
    }
    // Remaining 0 to 3 lines.
    for (; i < nSrcLineCount; ++i, j += nStep)
    {
        dfVal1 += pChunk[j] * padfWeights[i];
        dfVal3 += pChunk[j + 1] * padfWeights[i];
    }
    dfRes1 = dfVal1 + dfVal2;
    dfRes2 = dfVal3 + dfVal4;
}
2667 :
2668 : #ifdef USE_SSE2
2669 :
2670 : #ifdef __AVX__
2671 : /************************************************************************/
2672 : /* GDALResampleConvolutionVertical_16cols<T> */
2673 : /************************************************************************/
2674 :
2675 : template <class T>
2676 : static inline void
2677 : GDALResampleConvolutionVertical_16cols(const T *pChunk, int nStride,
2678 : const double *padfWeights,
2679 : int nSrcLineCount, float *afDest)
2680 : {
2681 : int i = 0;
2682 : int j = 0;
2683 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2684 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2685 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2686 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2687 : for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
2688 : {
2689 : XMMReg4Double w0 =
2690 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2691 : XMMReg4Double w1 =
2692 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2693 : XMMReg4Double w2 =
2694 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2695 : XMMReg4Double w3 =
2696 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2697 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2698 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2699 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
2700 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
2701 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2702 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2703 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
2704 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
2705 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2706 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2707 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
2708 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
2709 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2710 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2711 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
2712 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
2713 : }
2714 : for (; i < nSrcLineCount; ++i, j += nStride)
2715 : {
2716 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2717 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2718 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2719 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
2720 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
2721 : }
2722 : v_acc0.Store4Val(afDest);
2723 : v_acc1.Store4Val(afDest + 4);
2724 : v_acc2.Store4Val(afDest + 8);
2725 : v_acc3.Store4Val(afDest + 12);
2726 : }
2727 :
2728 : template <class T>
2729 : static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
2730 : const double *, int,
2731 : double *)
2732 : {
2733 : // Cannot be reached
2734 : CPLAssert(false);
2735 : }
2736 :
2737 : #else
2738 :
2739 : /************************************************************************/
2740 : /* GDALResampleConvolutionVertical_8cols<T> */
2741 : /************************************************************************/
2742 :
2743 : template <class T>
2744 : static inline void
2745 18613000 : GDALResampleConvolutionVertical_8cols(const T *pChunk, int nStride,
2746 : const double *padfWeights,
2747 : int nSrcLineCount, float *afDest)
2748 : {
2749 18613000 : int i = 0;
2750 18613000 : int j = 0;
2751 18613000 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2752 18428000 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2753 33729700 : for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
2754 : {
2755 15155500 : XMMReg4Double w0 =
2756 15155500 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2757 15090400 : XMMReg4Double w1 =
2758 15090400 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2759 15045700 : XMMReg4Double w2 =
2760 15045700 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2761 15073900 : XMMReg4Double w3 =
2762 15073900 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2763 15084900 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2764 15065600 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2765 15116300 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2766 15137700 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2767 15137000 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2768 15110400 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2769 15126700 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2770 15132700 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2771 : }
2772 29981300 : for (; i < nSrcLineCount; ++i, j += nStride)
2773 : {
2774 11407100 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2775 11407100 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2776 11407100 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2777 : }
2778 18574200 : v_acc0.Store4Val(afDest);
2779 18545900 : v_acc1.Store4Val(afDest + 4);
2780 18572300 : }
2781 :
2782 : template <class T>
2783 : static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
2784 : const double *, int,
2785 : double *)
2786 : {
2787 : // Cannot be reached
2788 : CPLAssert(false);
2789 : }
2790 :
2791 : #endif // __AVX__
2792 :
2793 : /************************************************************************/
2794 : /* GDALResampleConvolutionHorizontalSSE2<T> */
2795 : /************************************************************************/
2796 :
2797 : template <class T>
2798 2737535 : static inline double GDALResampleConvolutionHorizontalSSE2(
2799 : const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2800 : {
2801 2737535 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2802 2737003 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2803 2737344 : int i = 0; // Used after for.
2804 2813017 : for (; i + 7 < nSrcPixelCount; i += 8)
2805 : {
2806 : // Retrieve the pixel & accumulate
2807 75571 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
2808 75571 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
2809 75571 : const XMMReg4Double v_weight1 =
2810 75571 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2811 75571 : const XMMReg4Double v_weight2 =
2812 75571 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
2813 :
2814 75571 : v_acc1 += v_pixels1 * v_weight1;
2815 75571 : v_acc2 += v_pixels2 * v_weight2;
2816 : }
2817 :
2818 2737445 : v_acc1 += v_acc2;
2819 :
2820 2737358 : double dfVal = v_acc1.GetHorizSum();
2821 9501560 : for (; i < nSrcPixelCount; ++i)
2822 : {
2823 6764520 : dfVal += pChunk[i] * padfWeightsAligned[i];
2824 : }
2825 2737029 : return dfVal;
2826 : }
2827 :
2828 : /************************************************************************/
2829 : /* GDALResampleConvolutionHorizontal<GByte> */
2830 : /************************************************************************/
2831 :
2832 : template <>
2833 2189530 : inline double GDALResampleConvolutionHorizontal<GByte>(
2834 : const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2835 : {
2836 2189530 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2837 2189530 : nSrcPixelCount);
2838 : }
2839 :
2840 : template <>
2841 548160 : inline double GDALResampleConvolutionHorizontal<GUInt16>(
2842 : const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2843 : {
2844 548160 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2845 548433 : nSrcPixelCount);
2846 : }
2847 :
2848 : /************************************************************************/
2849 : /* GDALResampleConvolutionHorizontalWithMaskSSE2<T> */
2850 : /************************************************************************/
2851 :
2852 : template <class T>
2853 5736213 : static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
2854 : const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
2855 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2856 : {
2857 5736213 : int i = 0; // Used after for.
2858 5736213 : XMMReg4Double v_acc = XMMReg4Double::Zero();
2859 5736213 : XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
2860 16247021 : for (; i + 3 < nSrcPixelCount; i += 4)
2861 : {
2862 10510858 : const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
2863 10510858 : const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
2864 10510858 : XMMReg4Double v_weight =
2865 10510858 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2866 10510858 : v_weight *= v_mask;
2867 10510858 : v_acc += v_pixels * v_weight;
2868 10510858 : v_acc_weight += v_weight;
2869 : }
2870 :
2871 5736213 : dfVal = v_acc.GetHorizSum();
2872 5736213 : dfWeightSum = v_acc_weight.GetHorizSum();
2873 5927983 : for (; i < nSrcPixelCount; ++i)
2874 : {
2875 191772 : const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
2876 191772 : dfVal += pChunk[i] * dfWeight;
2877 191772 : dfWeightSum += dfWeight;
2878 : }
2879 5736213 : }
2880 :
2881 : /************************************************************************/
2882 : /* GDALResampleConvolutionHorizontalWithMask<GByte> */
2883 : /************************************************************************/
2884 :
2885 : template <>
2886 5736150 : inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
2887 : const GByte *pChunk, const GByte *pabyMask,
2888 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2889 : double &dfWeightSum)
2890 : {
2891 5736150 : GDALResampleConvolutionHorizontalWithMaskSSE2(
2892 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2893 : dfWeightSum);
2894 5736150 : }
2895 :
2896 : template <>
2897 63 : inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
2898 : const GUInt16 *pChunk, const GByte *pabyMask,
2899 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2900 : double &dfWeightSum)
2901 : {
2902 63 : GDALResampleConvolutionHorizontalWithMaskSSE2(
2903 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2904 : dfWeightSum);
2905 63 : }
2906 :
2907 : /************************************************************************/
2908 : /* GDALResampleConvolutionHorizontal_3rows_SSE2<T> */
2909 : /************************************************************************/
2910 :
2911 : template <class T>
2912 10023630 : static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
2913 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2914 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2915 : double &dfRes2, double &dfRes3)
2916 : {
2917 10023630 : XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
2918 10023630 : v_acc2 = XMMReg4Double::Zero(),
2919 10023630 : v_acc3 = XMMReg4Double::Zero();
2920 10023630 : int i = 0;
2921 19989466 : for (; i + 7 < nSrcPixelCount; i += 8)
2922 : {
2923 : // Retrieve the pixel & accumulate.
2924 9965826 : XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
2925 9965826 : XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
2926 9965826 : const XMMReg4Double v_weight1 =
2927 9965826 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2928 9965826 : const XMMReg4Double v_weight2 =
2929 9965826 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
2930 :
2931 9965826 : v_acc1 += v_pixels1 * v_weight1;
2932 9965826 : v_acc1 += v_pixels2 * v_weight2;
2933 :
2934 9965826 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
2935 9965826 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
2936 9965826 : v_acc2 += v_pixels1 * v_weight1;
2937 9965826 : v_acc2 += v_pixels2 * v_weight2;
2938 :
2939 9965826 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
2940 9965826 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
2941 9965826 : v_acc3 += v_pixels1 * v_weight1;
2942 9965826 : v_acc3 += v_pixels2 * v_weight2;
2943 : }
2944 :
2945 10023630 : dfRes1 = v_acc1.GetHorizSum();
2946 10023630 : dfRes2 = v_acc2.GetHorizSum();
2947 10023630 : dfRes3 = v_acc3.GetHorizSum();
2948 21487226 : for (; i < nSrcPixelCount; ++i)
2949 : {
2950 11463596 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
2951 11463596 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
2952 11463596 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
2953 : }
2954 10023630 : }
2955 :
2956 : /************************************************************************/
2957 : /* GDALResampleConvolutionHorizontal_3rows<GByte> */
2958 : /************************************************************************/
2959 :
2960 : template <>
2961 10023600 : inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
2962 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
2963 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2964 : double &dfRes2, double &dfRes3)
2965 : {
2966 10023600 : GDALResampleConvolutionHorizontal_3rows_SSE2(
2967 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
2968 : dfRes1, dfRes2, dfRes3);
2969 10023600 : }
2970 :
2971 : template <>
2972 30 : inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
2973 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
2974 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
2975 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
2976 : {
2977 30 : GDALResampleConvolutionHorizontal_3rows_SSE2(
2978 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
2979 : dfRes1, dfRes2, dfRes3);
2980 30 : }
2981 :
2982 : /************************************************************************/
2983 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T> */
2984 : /************************************************************************/
2985 :
2986 : template <class T>
2987 2173256 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
2988 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2989 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2990 : double &dfRes2, double &dfRes3)
2991 : {
2992 2173256 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2993 2173019 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2994 2173119 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2995 2173145 : int i = 0; // Use after for.
2996 2176400 : for (; i + 3 < nSrcPixelCount; i += 4)
2997 : {
2998 : // Retrieve the pixel & accumulate.
2999 3236 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3000 3236 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3001 3236 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3002 3236 : const XMMReg4Double v_weight =
3003 3236 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3004 :
3005 3236 : v_acc1 += v_pixels1 * v_weight;
3006 3236 : v_acc2 += v_pixels2 * v_weight;
3007 3236 : v_acc3 += v_pixels3 * v_weight;
3008 : }
3009 :
3010 2173170 : dfRes1 = v_acc1.GetHorizSum();
3011 2173005 : dfRes2 = v_acc2.GetHorizSum();
3012 2173052 : dfRes3 = v_acc3.GetHorizSum();
3013 :
3014 6494420 : for (; i < nSrcPixelCount; ++i)
3015 : {
3016 4321375 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3017 4321375 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3018 4321375 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3019 : }
3020 2173045 : }
3021 :
3022 : /************************************************************************/
3023 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte> */
3024 : /************************************************************************/
3025 :
3026 : template <>
3027 2106350 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
3028 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3029 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3030 : double &dfRes2, double &dfRes3)
3031 : {
3032 2106350 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3033 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3034 : dfRes1, dfRes2, dfRes3);
3035 2106360 : }
3036 :
3037 : template <>
3038 66764 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
3039 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3040 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3041 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3042 : {
3043 66764 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3044 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3045 : dfRes1, dfRes2, dfRes3);
3046 66958 : }
3047 :
3048 : /************************************************************************/
3049 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T> */
3050 : /************************************************************************/
3051 :
3052 : template <class T>
3053 12204790 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3054 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3055 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3056 : double &dfRes3)
3057 : {
3058 12204790 : const XMMReg4Double v_weight =
3059 : XMMReg4Double::Load4ValAligned(padfWeightsAligned);
3060 :
3061 : // Retrieve the pixel & accumulate.
3062 12149390 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
3063 12232740 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
3064 12238650 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
3065 :
3066 12242310 : XMMReg4Double v_acc1 = v_pixels1 * v_weight;
3067 12150770 : XMMReg4Double v_acc2 = v_pixels2 * v_weight;
3068 12173520 : XMMReg4Double v_acc3 = v_pixels3 * v_weight;
3069 :
3070 12143270 : dfRes1 = v_acc1.GetHorizSum();
3071 12160310 : dfRes2 = v_acc2.GetHorizSum();
3072 12177470 : dfRes3 = v_acc3.GetHorizSum();
3073 12182550 : }
3074 :
3075 : /************************************************************************/
3076 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte> */
3077 : /************************************************************************/
3078 :
3079 : template <>
3080 6635020 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
3081 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3082 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3083 : double &dfRes3)
3084 : {
3085 6635020 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3086 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3087 : dfRes3);
3088 6632800 : }
3089 :
3090 : template <>
3091 5572300 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
3092 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3093 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
3094 : double &dfRes2, double &dfRes3)
3095 : {
3096 5572300 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3097 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3098 : dfRes3);
3099 5531470 : }
3100 :
3101 : #endif // USE_SSE2
3102 :
3103 : /************************************************************************/
3104 : /* GDALResampleChunk_Convolution() */
3105 : /************************************************************************/
3106 :
3107 : template <class T, class Twork, GDALDataType eWrkDataType>
3108 3650 : static CPLErr GDALResampleChunk_ConvolutionT(
3109 : const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
3110 : FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
3111 : int nKernelRadius, bool bKernelWithNegativeWeights, float fMaxVal)
3112 :
3113 : {
3114 3650 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
3115 3650 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
3116 3650 : const double dfSrcXDelta = args.dfSrcXDelta;
3117 3650 : const double dfSrcYDelta = args.dfSrcYDelta;
3118 3650 : constexpr int nBands = 1;
3119 3650 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
3120 3650 : const int nChunkXOff = args.nChunkXOff;
3121 3650 : const int nChunkXSize = args.nChunkXSize;
3122 3650 : const int nChunkYOff = args.nChunkYOff;
3123 3650 : const int nChunkYSize = args.nChunkYSize;
3124 3650 : const int nDstXOff = args.nDstXOff;
3125 3650 : const int nDstXOff2 = args.nDstXOff2;
3126 3650 : const int nDstYOff = args.nDstYOff;
3127 3650 : const int nDstYOff2 = args.nDstYOff2;
3128 3650 : const bool bHasNoData = args.bHasNoData;
3129 3650 : double dfNoDataValue = args.dfNoDataValue;
3130 :
3131 3650 : if (!bHasNoData)
3132 3595 : dfNoDataValue = 0.0;
3133 3650 : const auto dstDataType = args.eOvrDataType;
3134 3650 : const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
3135 3641 : const double dfReplacementVal =
3136 46 : bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
3137 : : dfNoDataValue;
3138 : // cppcheck-suppress unreadVariable
3139 3641 : const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
3140 3631 : const auto nNodataValueInt64 = static_cast<GInt64>(dfNoDataValue);
3141 3631 : constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
3142 :
3143 : // TODO: we should have some generic function to do this.
3144 3631 : Twork fDstMin = -std::numeric_limits<Twork>::max();
3145 3631 : Twork fDstMax = std::numeric_limits<Twork>::max();
3146 3631 : if (dstDataType == GDT_Byte)
3147 : {
3148 2924 : fDstMin = std::numeric_limits<GByte>::min();
3149 2922 : fDstMax = std::numeric_limits<GByte>::max();
3150 : }
3151 709 : else if (dstDataType == GDT_Int8)
3152 : {
3153 1 : fDstMin = std::numeric_limits<GInt8>::min();
3154 1 : fDstMax = std::numeric_limits<GInt8>::max();
3155 : }
3156 708 : else if (dstDataType == GDT_UInt16)
3157 : {
3158 383 : fDstMin = std::numeric_limits<GUInt16>::min();
3159 382 : fDstMax = std::numeric_limits<GUInt16>::max();
3160 : }
3161 329 : else if (dstDataType == GDT_Int16)
3162 : {
3163 279 : fDstMin = std::numeric_limits<GInt16>::min();
3164 279 : fDstMax = std::numeric_limits<GInt16>::max();
3165 : }
3166 50 : else if (dstDataType == GDT_UInt32)
3167 : {
3168 1 : fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
3169 1 : fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
3170 : }
3171 49 : else if (dstDataType == GDT_Int32)
3172 : {
3173 : // cppcheck-suppress unreadVariable
3174 2 : fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
3175 : // cppcheck-suppress unreadVariable
3176 2 : fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
3177 : }
3178 47 : else if (dstDataType == GDT_UInt64)
3179 : {
3180 : // cppcheck-suppress unreadVariable
3181 1 : fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
3182 : // cppcheck-suppress unreadVariable
3183 1 : fDstMax = static_cast<Twork>(std::numeric_limits<uint64_t>::max());
3184 : }
3185 46 : else if (dstDataType == GDT_Int64)
3186 : {
3187 : // cppcheck-suppress unreadVariable
3188 1 : fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
3189 : // cppcheck-suppress unreadVariable
3190 1 : fDstMax = static_cast<Twork>(std::numeric_limits<int64_t>::max());
3191 : }
3192 :
3193 27544448 : auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
3194 : nNodataValueInt64, dfNoDataValue,
3195 : dfReplacementVal](Twork fVal)
3196 : {
3197 14634400 : if (!bHasNoData)
3198 11408000 : return fVal;
3199 :
3200 : // Clamp value before comparing to nodata: this is only needed for
3201 : // kernels with negative weights (Lanczos)
3202 3226360 : Twork fClamped = fVal;
3203 3226360 : if (fClamped < fDstMin)
3204 12874 : fClamped = fDstMin;
3205 3213490 : else if (fClamped > fDstMax)
3206 12852 : fClamped = fDstMax;
3207 3226360 : if (isIntegerDT)
3208 : {
3209 3226370 : if (nNodataValueInt64 == static_cast<GInt64>(std::round(fClamped)))
3210 : {
3211 : // Do not use the nodata value
3212 13869 : return static_cast<Twork>(dfReplacementVal);
3213 : }
3214 : }
3215 0 : else if (dfNoDataValue == fClamped)
3216 : {
3217 : // Do not use the nodata value
3218 1 : return static_cast<Twork>(dfReplacementVal);
3219 : }
3220 3212490 : return fClamped;
3221 : };
3222 :
3223 : /* -------------------------------------------------------------------- */
3224 : /* Allocate work buffers. */
3225 : /* -------------------------------------------------------------------- */
3226 3631 : const int nDstXSize = nDstXOff2 - nDstXOff;
3227 3631 : Twork *pafWrkScanline = nullptr;
3228 3631 : if (dstDataType != eWrkDataType)
3229 : {
3230 : pafWrkScanline =
3231 3592 : static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
3232 3601 : if (pafWrkScanline == nullptr)
3233 0 : return CE_Failure;
3234 : }
3235 :
3236 3640 : const double dfXScale = 1.0 / dfXRatioDstToSrc;
3237 3640 : const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
3238 3640 : const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
3239 3640 : const double dfYScale = 1.0 / dfYRatioDstToSrc;
3240 3640 : const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
3241 3640 : const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
3242 :
3243 : // Temporary array to store result of horizontal filter.
3244 : double *padfHorizontalFiltered = static_cast<double *>(
3245 3640 : VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
3246 :
3247 : // To store convolution coefficients.
3248 3645 : double *padfWeights = static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
3249 : static_cast<int>(2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) +
3250 : 0.5) *
3251 : sizeof(double)));
3252 :
3253 3639 : GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
3254 3639 : if (pabyChunkNodataMask)
3255 : pabyChunkNodataMaskHorizontalFiltered =
3256 377 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
3257 3639 : if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
3258 377 : (pabyChunkNodataMask != nullptr &&
3259 : pabyChunkNodataMaskHorizontalFiltered == nullptr))
3260 : {
3261 5 : VSIFree(pafWrkScanline);
3262 0 : VSIFree(padfHorizontalFiltered);
3263 0 : VSIFreeAligned(padfWeights);
3264 0 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3265 0 : return CE_Failure;
3266 : }
3267 :
3268 : /* ==================================================================== */
3269 : /* First pass: horizontal filter */
3270 : /* ==================================================================== */
3271 3634 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
3272 : #ifdef USE_SSE2
3273 3634 : bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
3274 : #endif
3275 2723275 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
3276 : {
3277 2719619 : const double dfSrcPixel =
3278 2719619 : (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
3279 2719619 : int nSrcPixelStart =
3280 2719619 : static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
3281 2719619 : if (nSrcPixelStart < nChunkXOff)
3282 55092 : nSrcPixelStart = nChunkXOff;
3283 2719619 : int nSrcPixelStop =
3284 2719619 : static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
3285 2719619 : if (nSrcPixelStop > nChunkRightXOff)
3286 55115 : nSrcPixelStop = nChunkRightXOff;
3287 : #if 0
3288 : if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
3289 : {
3290 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3291 : }
3292 : if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
3293 : {
3294 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3295 : }
3296 : #endif
3297 2719619 : const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
3298 2719619 : double dfWeightSum = 0.0;
3299 :
3300 : // Compute convolution coefficients.
3301 2719619 : int nSrcPixel = nSrcPixelStart;
3302 2719619 : double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
3303 3563017 : for (; nSrcPixel + 3 < nSrcPixelStop; nSrcPixel += 4)
3304 : {
3305 843637 : padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
3306 843637 : dfX += dfXScaleWeight;
3307 843637 : padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
3308 843637 : dfX += dfXScaleWeight;
3309 843637 : padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
3310 843637 : dfX += dfXScaleWeight;
3311 843637 : padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
3312 843637 : dfX += dfXScaleWeight;
3313 843390 : dfWeightSum +=
3314 843637 : pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
3315 : }
3316 6700731 : for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
3317 : {
3318 3981305 : const double dfWeight = pfnFilterFunc(dfX);
3319 3981348 : padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3320 3981348 : dfWeightSum += dfWeight;
3321 : }
3322 :
3323 2719426 : const int nHeight = nChunkYSize * nBands;
3324 2719426 : if (pabyChunkNodataMask == nullptr)
3325 : {
3326 2648612 : if (dfWeightSum != 0)
3327 : {
3328 2648622 : const double dfInvWeightSum = 1.0 / dfWeightSum;
3329 9452133 : for (int i = 0; i < nSrcPixelCount; ++i)
3330 6803505 : padfWeights[i] *= dfInvWeightSum;
3331 : }
3332 2648612 : int iSrcLineOff = 0;
3333 : #ifdef USE_SSE2
3334 2648612 : if (nSrcPixelCount == 4)
3335 : {
3336 13959446 : for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3337 : {
3338 13427436 : const GPtrDiff_t j =
3339 13427436 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3340 13427436 : (nSrcPixelStart - nChunkXOff);
3341 13427436 : double dfVal1 = 0.0;
3342 13427436 : double dfVal2 = 0.0;
3343 13427436 : double dfVal3 = 0.0;
3344 13427436 : GDALResampleConvolutionHorizontalPixelCount4_3rows(
3345 13427436 : pChunk + j, pChunk + j + nChunkXSize,
3346 13427436 : pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
3347 : dfVal2, dfVal3);
3348 13423926 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3349 13423926 : nDstXSize +
3350 13423926 : iDstPixel - nDstXOff] = dfVal1;
3351 13423926 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3352 13423926 : 1) *
3353 13423926 : nDstXSize +
3354 13423926 : iDstPixel - nDstXOff] = dfVal2;
3355 13423926 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3356 13423926 : 2) *
3357 13423926 : nDstXSize +
3358 13423926 : iDstPixel - nDstXOff] = dfVal3;
3359 : }
3360 : }
3361 2113101 : else if (bSrcPixelCountLess8)
3362 : {
3363 4226204 : for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3364 : {
3365 2191176 : const GPtrDiff_t j =
3366 2191176 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3367 2191176 : (nSrcPixelStart - nChunkXOff);
3368 2191176 : double dfVal1 = 0.0;
3369 2191176 : double dfVal2 = 0.0;
3370 2191176 : double dfVal3 = 0.0;
3371 2191176 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
3372 2191176 : pChunk + j, pChunk + j + nChunkXSize,
3373 2191176 : pChunk + j + 2 * nChunkXSize, padfWeights,
3374 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3375 2191495 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3376 2191495 : nDstXSize +
3377 2191495 : iDstPixel - nDstXOff] = dfVal1;
3378 2191495 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3379 2191495 : 1) *
3380 2191495 : nDstXSize +
3381 2191495 : iDstPixel - nDstXOff] = dfVal2;
3382 2191495 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3383 2191495 : 2) *
3384 2191495 : nDstXSize +
3385 2191495 : iDstPixel - nDstXOff] = dfVal3;
3386 : }
3387 : }
3388 : else
3389 : #endif
3390 : {
3391 10166842 : for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3392 : {
3393 10088430 : const GPtrDiff_t j =
3394 10088430 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3395 10088430 : (nSrcPixelStart - nChunkXOff);
3396 10088430 : double dfVal1 = 0.0;
3397 10088430 : double dfVal2 = 0.0;
3398 10088430 : double dfVal3 = 0.0;
3399 10088430 : GDALResampleConvolutionHorizontal_3rows(
3400 10088430 : pChunk + j, pChunk + j + nChunkXSize,
3401 10088430 : pChunk + j + 2 * nChunkXSize, padfWeights,
3402 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3403 10088430 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3404 10088430 : nDstXSize +
3405 10088430 : iDstPixel - nDstXOff] = dfVal1;
3406 10088430 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3407 10088430 : 1) *
3408 10088430 : nDstXSize +
3409 10088430 : iDstPixel - nDstXOff] = dfVal2;
3410 10088430 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3411 10088430 : 2) *
3412 10088430 : nDstXSize +
3413 10088430 : iDstPixel - nDstXOff] = dfVal3;
3414 : }
3415 : }
3416 5428005 : for (; iSrcLineOff < nHeight; ++iSrcLineOff)
3417 : {
3418 2782344 : const GPtrDiff_t j =
3419 2782344 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3420 2782344 : (nSrcPixelStart - nChunkXOff);
3421 5520282 : const double dfVal = GDALResampleConvolutionHorizontal(
3422 2782344 : pChunk + j, padfWeights, nSrcPixelCount);
3423 2782583 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3424 2782583 : nDstXSize +
3425 2782583 : iDstPixel - nDstXOff] = dfVal;
3426 : }
3427 : }
3428 : else
3429 : {
3430 18280051 : for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
3431 : {
3432 18206118 : const GPtrDiff_t j =
3433 18206118 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3434 18206118 : (nSrcPixelStart - nChunkXOff);
3435 :
3436 18206118 : if (bKernelWithNegativeWeights)
3437 : {
3438 17725512 : int nConsecutiveValid = 0;
3439 17725512 : int nMaxConsecutiveValid = 0;
3440 164371458 : for (int k = 0; k < nSrcPixelCount; k++)
3441 : {
3442 146646146 : if (pabyChunkNodataMask[j + k])
3443 40208853 : nConsecutiveValid++;
3444 106436793 : else if (nConsecutiveValid)
3445 : {
3446 96592 : nMaxConsecutiveValid = std::max(
3447 96592 : nMaxConsecutiveValid, nConsecutiveValid);
3448 96592 : nConsecutiveValid = 0;
3449 : }
3450 : }
3451 17725512 : nMaxConsecutiveValid =
3452 17725512 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
3453 17725512 : if (nMaxConsecutiveValid < nSrcPixelCount / 2)
3454 : {
3455 12469807 : const size_t nTempOffset =
3456 12469807 : static_cast<size_t>(iSrcLineOff) * nDstXSize +
3457 12469807 : iDstPixel - nDstXOff;
3458 12469807 : padfHorizontalFiltered[nTempOffset] = 0.0;
3459 12469807 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3460 12469807 : continue;
3461 : }
3462 : }
3463 :
3464 5736261 : double dfVal = 0.0;
3465 5736261 : GDALResampleConvolutionHorizontalWithMask(
3466 5736261 : pChunk + j, pabyChunkNodataMask + j, padfWeights,
3467 : nSrcPixelCount, dfVal, dfWeightSum);
3468 5739430 : const size_t nTempOffset =
3469 5739430 : static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
3470 5739430 : nDstXOff;
3471 5739430 : if (dfWeightSum > 0.0)
3472 : {
3473 5691828 : padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
3474 5691828 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
3475 : }
3476 : else
3477 : {
3478 47595 : padfHorizontalFiltered[nTempOffset] = 0.0;
3479 47595 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3480 : }
3481 : }
3482 : }
3483 : }
3484 :
3485 : /* ==================================================================== */
3486 : /* Second pass: vertical filter */
3487 : /* ==================================================================== */
3488 3651 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3489 :
3490 196900 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3491 : {
3492 193249 : Twork *const pafDstScanline =
3493 193249 : pafWrkScanline ? pafWrkScanline
3494 8421 : : static_cast<Twork *>(pDstBuffer) +
3495 8421 : (iDstLine - nDstYOff) * nDstXSize;
3496 :
3497 193249 : const double dfSrcLine =
3498 193249 : (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
3499 193249 : int nSrcLineStart =
3500 193249 : static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
3501 193249 : int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
3502 193249 : if (nSrcLineStart < nChunkYOff)
3503 2275 : nSrcLineStart = nChunkYOff;
3504 193249 : if (nSrcLineStop > nChunkBottomYOff)
3505 2311 : nSrcLineStop = nChunkBottomYOff;
3506 : #if 0
3507 : if( nSrcLineStart < nChunkYOff &&
3508 : nChunkYOff > 0 )
3509 : {
3510 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3511 : }
3512 : if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3513 : {
3514 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3515 : }
3516 : #endif
3517 193249 : const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
3518 193249 : double dfWeightSum = 0.0;
3519 :
3520 : // Compute convolution coefficients.
3521 193249 : int nSrcLine = nSrcLineStart; // Used after for.
3522 193249 : double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
3523 428660 : for (; nSrcLine + 3 < nSrcLineStop;
3524 235411 : nSrcLine += 4, dfY += 4 * dfYScaleWeight)
3525 : {
3526 235402 : padfWeights[nSrcLine - nSrcLineStart] = dfY;
3527 235402 : padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
3528 235402 : padfWeights[nSrcLine + 2 - nSrcLineStart] =
3529 235402 : dfY + 2 * dfYScaleWeight;
3530 235402 : padfWeights[nSrcLine + 3 - nSrcLineStart] =
3531 235402 : dfY + 3 * dfYScaleWeight;
3532 235411 : dfWeightSum +=
3533 235402 : pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
3534 : }
3535 226006 : for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
3536 : {
3537 32772 : const double dfWeight = pfnFilterFunc(dfY);
3538 32748 : padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
3539 32748 : dfWeightSum += dfWeight;
3540 : }
3541 :
3542 193234 : if (pabyChunkNodataMask == nullptr)
3543 : {
3544 159721 : if (dfWeightSum != 0)
3545 : {
3546 159726 : const double dfInvWeightSum = 1.0 / dfWeightSum;
3547 899685 : for (int i = 0; i < nSrcLineCount; ++i)
3548 739959 : padfWeights[i] *= dfInvWeightSum;
3549 : }
3550 : }
3551 :
3552 193234 : if (pabyChunkNodataMask == nullptr)
3553 : {
3554 159731 : int iFilteredPixelOff = 0; // Used after for.
3555 : // j used after for.
3556 159731 : size_t j =
3557 159731 : (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
3558 : #ifdef USE_SSE2
3559 : if constexpr (eWrkDataType == GDT_Float32)
3560 : {
3561 : #ifdef __AVX__
3562 : for (; iFilteredPixelOff + 15 < nDstXSize;
3563 : iFilteredPixelOff += 16, j += 16)
3564 : {
3565 : GDALResampleConvolutionVertical_16cols(
3566 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3567 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3568 : if (bHasNoData)
3569 : {
3570 : for (int k = 0; k < 16; k++)
3571 : {
3572 : pafDstScanline[iFilteredPixelOff + k] =
3573 : replaceValIfNodata(
3574 : pafDstScanline[iFilteredPixelOff + k]);
3575 : }
3576 : }
3577 : }
3578 : #else
3579 18739546 : for (; iFilteredPixelOff + 7 < nDstXSize;
3580 : iFilteredPixelOff += 8, j += 8)
3581 : {
3582 18625610 : GDALResampleConvolutionVertical_8cols(
3583 18625610 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3584 18625610 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3585 18587070 : if (bHasNoData)
3586 : {
3587 17820 : for (int k = 0; k < 8; k++)
3588 : {
3589 15840 : pafDstScanline[iFilteredPixelOff + k] =
3590 15840 : replaceValIfNodata(
3591 15840 : pafDstScanline[iFilteredPixelOff + k]);
3592 : }
3593 : }
3594 : }
3595 : #endif
3596 :
3597 577109 : for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
3598 : {
3599 463230 : const Twork fVal =
3600 463149 : static_cast<Twork>(GDALResampleConvolutionVertical(
3601 463149 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3602 : nSrcLineCount));
3603 463155 : pafDstScanline[iFilteredPixelOff] =
3604 463230 : replaceValIfNodata(fVal);
3605 : }
3606 : }
3607 : else
3608 : #endif
3609 : {
3610 2887210 : for (; iFilteredPixelOff + 1 < nDstXSize;
3611 : iFilteredPixelOff += 2, j += 2)
3612 : {
3613 2880000 : double dfVal1 = 0.0;
3614 2880000 : double dfVal2 = 0.0;
3615 2880000 : GDALResampleConvolutionVertical_2cols(
3616 2880000 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3617 : nSrcLineCount, dfVal1, dfVal2);
3618 5760010 : pafDstScanline[iFilteredPixelOff] =
3619 2880000 : replaceValIfNodata(static_cast<Twork>(dfVal1));
3620 2880000 : pafDstScanline[iFilteredPixelOff + 1] =
3621 2880000 : replaceValIfNodata(static_cast<Twork>(dfVal2));
3622 : }
3623 7206 : if (iFilteredPixelOff < nDstXSize)
3624 : {
3625 2 : const double dfVal = GDALResampleConvolutionVertical(
3626 2 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3627 : nSrcLineCount);
3628 2 : pafDstScanline[iFilteredPixelOff] =
3629 2 : replaceValIfNodata(static_cast<Twork>(dfVal));
3630 : }
3631 : }
3632 : }
3633 : else
3634 : {
3635 17284632 : for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
3636 : ++iFilteredPixelOff)
3637 : {
3638 17251205 : double dfVal = 0.0;
3639 17251205 : dfWeightSum = 0.0;
3640 17251205 : size_t j = (nSrcLineStart - nChunkYOff) *
3641 17251205 : static_cast<size_t>(nDstXSize) +
3642 17251205 : iFilteredPixelOff;
3643 17251205 : if (bKernelWithNegativeWeights)
3644 : {
3645 17026301 : int nConsecutiveValid = 0;
3646 17026301 : int nMaxConsecutiveValid = 0;
3647 121244321 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3648 : {
3649 104218020 : const double dfWeight =
3650 104218020 : padfWeights[i] *
3651 : pabyChunkNodataMaskHorizontalFiltered[j];
3652 104218020 : if (pabyChunkNodataMaskHorizontalFiltered[j])
3653 : {
3654 41787737 : nConsecutiveValid++;
3655 : }
3656 62429783 : else if (nConsecutiveValid)
3657 : {
3658 199248 : nMaxConsecutiveValid = std::max(
3659 199248 : nMaxConsecutiveValid, nConsecutiveValid);
3660 199248 : nConsecutiveValid = 0;
3661 : }
3662 104218020 : dfVal += padfHorizontalFiltered[j] * dfWeight;
3663 104218020 : dfWeightSum += dfWeight;
3664 : }
3665 17026301 : nMaxConsecutiveValid =
3666 17026301 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
3667 17026301 : if (nMaxConsecutiveValid < nSrcLineCount / 2)
3668 : {
3669 8839831 : pafDstScanline[iFilteredPixelOff] =
3670 8839739 : static_cast<Twork>(dfNoDataValue);
3671 8839831 : continue;
3672 : }
3673 : }
3674 : else
3675 : {
3676 1130262 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3677 : {
3678 905432 : const double dfWeight =
3679 905432 : padfWeights[i] *
3680 : pabyChunkNodataMaskHorizontalFiltered[j];
3681 905432 : dfVal += padfHorizontalFiltered[j] * dfWeight;
3682 905432 : dfWeightSum += dfWeight;
3683 : }
3684 : }
3685 8411324 : if (dfWeightSum > 0.0)
3686 : {
3687 8395283 : pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
3688 8395271 : static_cast<Twork>(dfVal / dfWeightSum));
3689 : }
3690 : else
3691 : {
3692 16039 : pafDstScanline[iFilteredPixelOff] =
3693 16015 : static_cast<Twork>(dfNoDataValue);
3694 : }
3695 : }
3696 : }
3697 :
3698 154669 : if (fMaxVal != 0.0f)
3699 : {
3700 192324 : for (int i = 0; i < nDstXSize; ++i)
3701 : {
3702 192088 : if (pafDstScanline[i] > fMaxVal)
3703 96022 : pafDstScanline[i] = fMaxVal;
3704 : }
3705 : }
3706 :
3707 154669 : if (pafWrkScanline)
3708 : {
3709 184833 : GDALCopyWords(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
3710 : static_cast<GByte *>(pDstBuffer) +
3711 184833 : static_cast<size_t>(iDstLine - nDstYOff) *
3712 184833 : nDstXSize * nDstDataTypeSize,
3713 : dstDataType, nDstDataTypeSize, nDstXSize);
3714 : }
3715 : }
3716 :
3717 3651 : VSIFree(pafWrkScanline);
3718 3651 : VSIFreeAligned(padfWeights);
3719 3651 : VSIFree(padfHorizontalFiltered);
3720 3651 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3721 :
3722 3651 : return CE_None;
3723 : }
3724 :
3725 : static CPLErr
3726 3650 : GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
3727 : const void *pChunk, void **ppDstBuffer,
3728 : GDALDataType *peDstBufferDataType)
3729 : {
3730 : GDALResampleAlg eResample;
3731 3650 : bool bKernelWithNegativeWeights = false;
3732 3650 : if (EQUAL(args.pszResampling, "BILINEAR"))
3733 2579 : eResample = GRA_Bilinear;
3734 1071 : else if (EQUAL(args.pszResampling, "CUBIC"))
3735 : {
3736 991 : eResample = GRA_Cubic;
3737 991 : bKernelWithNegativeWeights = true;
3738 : }
3739 80 : else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
3740 23 : eResample = GRA_CubicSpline;
3741 57 : else if (EQUAL(args.pszResampling, "LANCZOS"))
3742 : {
3743 54 : eResample = GRA_Lanczos;
3744 54 : bKernelWithNegativeWeights = true;
3745 : }
3746 : else
3747 : {
3748 3 : CPLAssert(false);
3749 : return CE_Failure;
3750 : }
3751 3647 : const int nKernelRadius = GWKGetFilterRadius(eResample);
3752 3644 : FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
3753 : const FilterFunc4ValuesType pfnFilterFunc4Values =
3754 3646 : GWKGetFilterFunc4Values(eResample);
3755 :
3756 3644 : float fMaxVal = 0.f;
3757 : // Cubic, etc... can have overshoots, so make sure we clamp values to the
3758 : // maximum value if NBITS is set.
3759 3644 : if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
3760 8 : (args.eOvrDataType == GDT_Byte || args.eOvrDataType == GDT_UInt16 ||
3761 0 : args.eOvrDataType == GDT_UInt32))
3762 : {
3763 8 : int nBits = args.nOvrNBITS;
3764 8 : if (nBits == GDALGetDataTypeSize(args.eOvrDataType))
3765 1 : nBits = 0;
3766 8 : if (nBits > 0 && nBits < 32)
3767 7 : fMaxVal = static_cast<float>((1U << nBits) - 1);
3768 : }
3769 :
3770 3644 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
3771 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
3772 : GDALGetDataTypeSizeBytes(args.eOvrDataType));
3773 3648 : if (*ppDstBuffer == nullptr)
3774 : {
3775 0 : return CE_Failure;
3776 : }
3777 3648 : *peDstBufferDataType = args.eOvrDataType;
3778 :
3779 3648 : switch (args.eWrkDataType)
3780 : {
3781 2923 : case GDT_Byte:
3782 : {
3783 2923 : return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32>(
3784 : args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
3785 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3786 2925 : bKernelWithNegativeWeights, fMaxVal);
3787 : }
3788 :
3789 395 : case GDT_UInt16:
3790 : {
3791 395 : return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32>(
3792 : args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
3793 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3794 396 : bKernelWithNegativeWeights, fMaxVal);
3795 : }
3796 :
3797 301 : case GDT_Float32:
3798 : {
3799 301 : return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32>(
3800 : args, static_cast<const float *>(pChunk), *ppDstBuffer,
3801 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3802 301 : bKernelWithNegativeWeights, fMaxVal);
3803 : }
3804 :
3805 29 : case GDT_Float64:
3806 : {
3807 29 : return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64>(
3808 : args, static_cast<const double *>(pChunk), *ppDstBuffer,
3809 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3810 29 : bKernelWithNegativeWeights, fMaxVal);
3811 : }
3812 :
3813 0 : default:
3814 0 : break;
3815 : }
3816 :
3817 0 : CPLAssert(false);
3818 : return CE_Failure;
3819 : }
3820 :
3821 : /************************************************************************/
3822 : /* GDALResampleChunkC32R() */
3823 : /************************************************************************/
3824 :
3825 2 : static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
3826 : const float *pafChunk, const int nChunkYOff,
3827 : const int nChunkYSize, const int nDstYOff,
3828 : const int nDstYOff2, const int nOvrXSize,
3829 : const int nOvrYSize, void **ppDstBuffer,
3830 : GDALDataType *peDstBufferDataType,
3831 : const char *pszResampling)
3832 :
3833 : {
3834 : enum Method
3835 : {
3836 : NEAR,
3837 : AVERAGE,
3838 : AVERAGE_MAGPHASE,
3839 : RMS,
3840 : };
3841 :
3842 2 : Method eMethod = NEAR;
3843 2 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
3844 : {
3845 0 : eMethod = NEAR;
3846 : }
3847 2 : else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
3848 : {
3849 0 : eMethod = AVERAGE_MAGPHASE;
3850 : }
3851 2 : else if (EQUAL(pszResampling, "RMS"))
3852 : {
3853 2 : eMethod = RMS;
3854 : }
3855 0 : else if (STARTS_WITH_CI(pszResampling, "AVER"))
3856 : {
3857 0 : eMethod = AVERAGE;
3858 : }
3859 : else
3860 : {
3861 0 : CPLError(
3862 : CE_Failure, CPLE_NotSupported,
3863 : "Resampling method %s is not supported for complex data types. "
3864 : "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
3865 : pszResampling);
3866 0 : return CE_Failure;
3867 : }
3868 :
3869 2 : const int nOXSize = nOvrXSize;
3870 2 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
3871 : GDALGetDataTypeSizeBytes(GDT_CFloat32));
3872 2 : if (*ppDstBuffer == nullptr)
3873 : {
3874 0 : return CE_Failure;
3875 : }
3876 2 : float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
3877 2 : *peDstBufferDataType = GDT_CFloat32;
3878 :
3879 2 : const int nOYSize = nOvrYSize;
3880 2 : const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
3881 2 : const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
3882 :
3883 : /* ==================================================================== */
3884 : /* Loop over destination scanlines. */
3885 : /* ==================================================================== */
3886 8 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3887 : {
3888 6 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
3889 6 : if (nSrcYOff < nChunkYOff)
3890 0 : nSrcYOff = nChunkYOff;
3891 :
3892 6 : int nSrcYOff2 =
3893 6 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
3894 6 : if (nSrcYOff2 == nSrcYOff)
3895 0 : nSrcYOff2++;
3896 :
3897 6 : if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
3898 : {
3899 2 : if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
3900 0 : nSrcYOff = nSrcHeight - 1;
3901 2 : nSrcYOff2 = nSrcHeight;
3902 : }
3903 6 : if (nSrcYOff2 > nChunkYOff + nChunkYSize)
3904 0 : nSrcYOff2 = nChunkYOff + nChunkYSize;
3905 :
3906 6 : const float *const pafSrcScanline =
3907 6 : pafChunk + ((nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
3908 6 : float *const pafDstScanline =
3909 6 : pafDstBuffer + (iDstLine - nDstYOff) * 2 * nOXSize;
3910 :
3911 : /* --------------------------------------------------------------------
3912 : */
3913 : /* Loop over destination pixels */
3914 : /* --------------------------------------------------------------------
3915 : */
3916 18 : for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
3917 : {
3918 12 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
3919 12 : int nSrcXOff2 =
3920 12 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
3921 12 : if (nSrcXOff2 == nSrcXOff)
3922 0 : nSrcXOff2++;
3923 12 : if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
3924 : {
3925 6 : if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
3926 0 : nSrcXOff = nSrcWidth - 1;
3927 6 : nSrcXOff2 = nSrcWidth;
3928 : }
3929 :
3930 12 : if (eMethod == NEAR)
3931 : {
3932 0 : pafDstScanline[iDstPixel * 2] = pafSrcScanline[nSrcXOff * 2];
3933 0 : pafDstScanline[iDstPixel * 2 + 1] =
3934 0 : pafSrcScanline[nSrcXOff * 2 + 1];
3935 : }
3936 12 : else if (eMethod == AVERAGE_MAGPHASE)
3937 : {
3938 0 : double dfTotalR = 0.0;
3939 0 : double dfTotalI = 0.0;
3940 0 : double dfTotalM = 0.0;
3941 0 : int nCount = 0;
3942 :
3943 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
3944 : {
3945 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
3946 : {
3947 0 : const double dfR =
3948 0 : pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
3949 0 : iY - nSrcYOff) *
3950 0 : nSrcWidth * 2];
3951 0 : const double dfI =
3952 0 : pafSrcScanline[iX * 2 +
3953 0 : static_cast<GPtrDiff_t>(iY -
3954 0 : nSrcYOff) *
3955 0 : nSrcWidth * 2 +
3956 0 : 1];
3957 0 : dfTotalR += dfR;
3958 0 : dfTotalI += dfI;
3959 0 : dfTotalM += std::hypot(dfR, dfI);
3960 0 : ++nCount;
3961 : }
3962 : }
3963 :
3964 0 : CPLAssert(nCount > 0);
3965 0 : if (nCount == 0)
3966 : {
3967 0 : pafDstScanline[iDstPixel * 2] = 0.0;
3968 0 : pafDstScanline[iDstPixel * 2 + 1] = 0.0;
3969 : }
3970 : else
3971 : {
3972 0 : pafDstScanline[iDstPixel * 2] =
3973 0 : static_cast<float>(dfTotalR / nCount);
3974 0 : pafDstScanline[iDstPixel * 2 + 1] =
3975 0 : static_cast<float>(dfTotalI / nCount);
3976 : const double dfM =
3977 0 : std::hypot(pafDstScanline[iDstPixel * 2],
3978 0 : pafDstScanline[iDstPixel * 2 + 1]);
3979 0 : const double dfDesiredM = dfTotalM / nCount;
3980 0 : double dfRatio = 1.0;
3981 0 : if (dfM != 0.0)
3982 0 : dfRatio = dfDesiredM / dfM;
3983 :
3984 0 : pafDstScanline[iDstPixel * 2] *=
3985 0 : static_cast<float>(dfRatio);
3986 0 : pafDstScanline[iDstPixel * 2 + 1] *=
3987 0 : static_cast<float>(dfRatio);
3988 : }
3989 : }
3990 12 : else if (eMethod == RMS)
3991 : {
3992 12 : double dfTotalR = 0.0;
3993 12 : double dfTotalI = 0.0;
3994 12 : int nCount = 0;
3995 :
3996 36 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
3997 : {
3998 72 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
3999 : {
4000 48 : const double dfR =
4001 48 : pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
4002 48 : iY - nSrcYOff) *
4003 48 : nSrcWidth * 2];
4004 48 : const double dfI =
4005 48 : pafSrcScanline[iX * 2 +
4006 48 : static_cast<GPtrDiff_t>(iY -
4007 48 : nSrcYOff) *
4008 48 : nSrcWidth * 2 +
4009 48 : 1];
4010 :
4011 48 : dfTotalR += SQUARE(dfR);
4012 48 : dfTotalI += SQUARE(dfI);
4013 :
4014 48 : ++nCount;
4015 : }
4016 : }
4017 :
4018 12 : CPLAssert(nCount > 0);
4019 12 : if (nCount == 0)
4020 : {
4021 0 : pafDstScanline[iDstPixel * 2] = 0.0;
4022 0 : pafDstScanline[iDstPixel * 2 + 1] = 0.0;
4023 : }
4024 : else
4025 : {
4026 : /* compute RMS */
4027 12 : pafDstScanline[iDstPixel * 2] =
4028 12 : static_cast<float>(sqrt(dfTotalR / nCount));
4029 12 : pafDstScanline[iDstPixel * 2 + 1] =
4030 12 : static_cast<float>(sqrt(dfTotalI / nCount));
4031 : }
4032 : }
4033 0 : else if (eMethod == AVERAGE)
4034 : {
4035 0 : double dfTotalR = 0.0;
4036 0 : double dfTotalI = 0.0;
4037 0 : int nCount = 0;
4038 :
4039 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4040 : {
4041 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4042 : {
4043 : // TODO(schwehr): Maybe use std::complex?
4044 0 : dfTotalR +=
4045 0 : pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
4046 0 : iY - nSrcYOff) *
4047 0 : nSrcWidth * 2];
4048 0 : dfTotalI += pafSrcScanline[iX * 2 +
4049 0 : static_cast<GPtrDiff_t>(
4050 0 : iY - nSrcYOff) *
4051 0 : nSrcWidth * 2 +
4052 0 : 1];
4053 0 : ++nCount;
4054 : }
4055 : }
4056 :
4057 0 : CPLAssert(nCount > 0);
4058 0 : if (nCount == 0)
4059 : {
4060 0 : pafDstScanline[iDstPixel * 2] = 0.0;
4061 0 : pafDstScanline[iDstPixel * 2 + 1] = 0.0;
4062 : }
4063 : else
4064 : {
4065 0 : pafDstScanline[iDstPixel * 2] =
4066 0 : static_cast<float>(dfTotalR / nCount);
4067 0 : pafDstScanline[iDstPixel * 2 + 1] =
4068 0 : static_cast<float>(dfTotalI / nCount);
4069 : }
4070 : }
4071 : }
4072 : }
4073 :
4074 2 : return CE_None;
4075 : }
4076 :
4077 : /************************************************************************/
4078 : /* GDALRegenerateCascadingOverviews() */
4079 : /* */
4080 : /* Generate a list of overviews in order from largest to */
4081 : /* smallest, computing each from the next larger. */
4082 : /************************************************************************/
4083 :
4084 42 : static CPLErr GDALRegenerateCascadingOverviews(
4085 : GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
4086 : const char *pszResampling, GDALProgressFunc pfnProgress,
4087 : void *pProgressData, CSLConstList papszOptions)
4088 :
4089 : {
4090 : /* -------------------------------------------------------------------- */
4091 : /* First, we must put the overviews in order from largest to */
4092 : /* smallest. */
4093 : /* -------------------------------------------------------------------- */
4094 120 : for (int i = 0; i < nOverviews - 1; ++i)
4095 : {
4096 270 : for (int j = 0; j < nOverviews - i - 1; ++j)
4097 : {
4098 192 : if (papoOvrBands[j]->GetXSize() *
4099 192 : static_cast<float>(papoOvrBands[j]->GetYSize()) <
4100 192 : papoOvrBands[j + 1]->GetXSize() *
4101 192 : static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
4102 : {
4103 0 : GDALRasterBand *poTempBand = papoOvrBands[j];
4104 0 : papoOvrBands[j] = papoOvrBands[j + 1];
4105 0 : papoOvrBands[j + 1] = poTempBand;
4106 : }
4107 : }
4108 : }
4109 :
4110 : /* -------------------------------------------------------------------- */
4111 : /* Count total pixels so we can prepare appropriate scaled */
4112 : /* progress functions. */
4113 : /* -------------------------------------------------------------------- */
4114 42 : double dfTotalPixels = 0.0;
4115 :
4116 162 : for (int i = 0; i < nOverviews; ++i)
4117 : {
4118 120 : dfTotalPixels += papoOvrBands[i]->GetXSize() *
4119 120 : static_cast<double>(papoOvrBands[i]->GetYSize());
4120 : }
4121 :
4122 : /* -------------------------------------------------------------------- */
4123 : /* Generate all the bands. */
4124 : /* -------------------------------------------------------------------- */
4125 42 : double dfPixelsProcessed = 0.0;
4126 :
4127 162 : for (int i = 0; i < nOverviews; ++i)
4128 : {
4129 120 : GDALRasterBand *poBaseBand = poSrcBand;
4130 120 : if (i != 0)
4131 78 : poBaseBand = papoOvrBands[i - 1];
4132 :
4133 120 : double dfPixels = papoOvrBands[i]->GetXSize() *
4134 120 : static_cast<double>(papoOvrBands[i]->GetYSize());
4135 :
4136 240 : void *pScaledProgressData = GDALCreateScaledProgress(
4137 : dfPixelsProcessed / dfTotalPixels,
4138 120 : (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
4139 : pProgressData);
4140 :
4141 240 : const CPLErr eErr = GDALRegenerateOverviewsEx(
4142 : poBaseBand, 1,
4143 120 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
4144 : pszResampling, GDALScaledProgress, pScaledProgressData,
4145 : papszOptions);
4146 120 : GDALDestroyScaledProgress(pScaledProgressData);
4147 :
4148 120 : if (eErr != CE_None)
4149 0 : return eErr;
4150 :
4151 120 : dfPixelsProcessed += dfPixels;
4152 :
4153 : // Only do the bit2grayscale promotion on the base band.
4154 120 : if (STARTS_WITH_CI(pszResampling,
4155 : "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
4156 8 : pszResampling = "AVERAGE";
4157 : }
4158 :
4159 42 : return CE_None;
4160 : }
4161 :
4162 : /************************************************************************/
4163 : /* GDALGetResampleFunction() */
4164 : /************************************************************************/
4165 :
4166 3839 : GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
4167 : int *pnRadius)
4168 : {
4169 3839 : if (pnRadius)
4170 3839 : *pnRadius = 0;
4171 3839 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
4172 425 : return GDALResampleChunk_Near;
4173 3414 : else if (STARTS_WITH_CI(pszResampling, "AVER") ||
4174 2889 : EQUAL(pszResampling, "RMS"))
4175 553 : return GDALResampleChunk_AverageOrRMS;
4176 2861 : else if (EQUAL(pszResampling, "GAUSS"))
4177 : {
4178 26 : if (pnRadius)
4179 26 : *pnRadius = 1;
4180 26 : return GDALResampleChunk_Gauss;
4181 : }
4182 2835 : else if (EQUAL(pszResampling, "MODE"))
4183 96 : return GDALResampleChunk_Mode;
4184 2739 : else if (EQUAL(pszResampling, "CUBIC"))
4185 : {
4186 377 : if (pnRadius)
4187 375 : *pnRadius = GWKGetFilterRadius(GRA_Cubic);
4188 375 : return GDALResampleChunk_Convolution;
4189 : }
4190 2362 : else if (EQUAL(pszResampling, "CUBICSPLINE"))
4191 : {
4192 3 : if (pnRadius)
4193 3 : *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
4194 3 : return GDALResampleChunk_Convolution;
4195 : }
4196 2359 : else if (EQUAL(pszResampling, "LANCZOS"))
4197 : {
4198 8 : if (pnRadius)
4199 8 : *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
4200 8 : return GDALResampleChunk_Convolution;
4201 : }
4202 2351 : else if (EQUAL(pszResampling, "BILINEAR"))
4203 : {
4204 2357 : if (pnRadius)
4205 2357 : *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
4206 2357 : return GDALResampleChunk_Convolution;
4207 : }
4208 : else
4209 : {
4210 0 : CPLError(
4211 : CE_Failure, CPLE_AppDefined,
4212 : "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
4213 : pszResampling);
4214 0 : return nullptr;
4215 : }
4216 : }
4217 :
4218 : /************************************************************************/
4219 : /* GDALGetOvrWorkDataType() */
4220 : /************************************************************************/
4221 :
4222 3729 : GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
4223 : GDALDataType eSrcDataType)
4224 : {
4225 3729 : if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
4226 : {
4227 511 : return eSrcDataType;
4228 : }
4229 3218 : else if (eSrcDataType == GDT_Byte &&
4230 2907 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4231 2450 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4232 2247 : EQUAL(pszResampling, "CUBICSPLINE") ||
4233 2244 : EQUAL(pszResampling, "LANCZOS") ||
4234 2239 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4235 : {
4236 2900 : return GDT_Byte;
4237 : }
4238 318 : else if (eSrcDataType == GDT_UInt16 &&
4239 119 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4240 108 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4241 3 : EQUAL(pszResampling, "CUBICSPLINE") ||
4242 3 : EQUAL(pszResampling, "LANCZOS") ||
4243 2 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4244 : {
4245 102 : return GDT_UInt16;
4246 : }
4247 216 : else if (EQUAL(pszResampling, "GAUSS"))
4248 20 : return GDT_Float64;
4249 :
4250 196 : if (eSrcDataType == GDT_Byte || eSrcDataType == GDT_Int8 ||
4251 184 : eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
4252 : eSrcDataType == GDT_Float32)
4253 : {
4254 160 : return GDT_Float32;
4255 : }
4256 36 : return GDT_Float64;
4257 : }
4258 :
4259 : namespace
4260 : {
4261 : // Structure to hold a pointer to free with CPLFree()
4262 : struct PointerHolder
4263 : {
4264 : void *ptr = nullptr;
4265 :
4266 34637 : explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
4267 : {
4268 34637 : }
4269 :
4270 34646 : ~PointerHolder()
4271 34646 : {
4272 34646 : CPLFree(ptr);
4273 34646 : }
4274 :
4275 : PointerHolder(const PointerHolder &) = delete;
4276 : PointerHolder &operator=(const PointerHolder &) = delete;
4277 : };
4278 : } // namespace
4279 :
4280 : /************************************************************************/
4281 : /* GDALRegenerateOverviews() */
4282 : /************************************************************************/
4283 :
4284 : /**
4285 : * \brief Generate downsampled overviews.
4286 : *
4287 : * This function will generate one or more overview images from a base image
4288 : * using the requested downsampling algorithm. Its primary use is for
4289 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4290 : * used to generate downsampled images in one file from another outside the
4291 : * overview architecture.
4292 : *
4293 : * The output bands need to exist in advance.
4294 : *
4295 : * The full set of resampling algorithms is documented in
4296 : * GDALDataset::BuildOverviews().
4297 : *
4298 : * This function will honour properly NODATA_VALUES tuples (special dataset
4299 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4300 : * considered as the nodata value and not each value of the triplet
4301 : * independently per band.
4302 : *
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
 * overview computation.
4306 : *
4307 : * @param hSrcBand the source (base level) band.
4308 : * @param nOverviewCount the number of downsampled bands being generated.
4309 : * @param pahOvrBands the list of downsampled bands to be generated.
4310 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4311 : * @param pfnProgress progress report function.
4312 : * @param pProgressData progress function callback data.
4313 : * @return CE_None on success or CE_Failure on failure.
4314 : */
4315 252 : CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
4316 : GDALRasterBandH *pahOvrBands,
4317 : const char *pszResampling,
4318 : GDALProgressFunc pfnProgress,
4319 : void *pProgressData)
4320 :
4321 : {
4322 252 : return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
4323 : pszResampling, pfnProgress, pProgressData,
4324 252 : nullptr);
4325 : }
4326 :
4327 : /************************************************************************/
4328 : /* GDALRegenerateOverviewsEx() */
4329 : /************************************************************************/
4330 :
4331 : /**
4332 : * \brief Generate downsampled overviews.
4333 : *
4334 : * This function will generate one or more overview images from a base image
4335 : * using the requested downsampling algorithm. Its primary use is for
4336 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4337 : * used to generate downsampled images in one file from another outside the
4338 : * overview architecture.
4339 : *
4340 : * The output bands need to exist in advance.
4341 : *
4342 : * The full set of resampling algorithms is documented in
4343 : * GDALDataset::BuildOverviews().
4344 : *
4345 : * This function will honour properly NODATA_VALUES tuples (special dataset
4346 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4347 : * considered as the nodata value and not each value of the triplet
4348 : * independently per band.
4349 : *
4350 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4351 : * to "ALL_CPUS" or a integer value to specify the number of threads to use for
4352 : * overview computation.
4353 : *
4354 : * @param hSrcBand the source (base level) band.
4355 : * @param nOverviewCount the number of downsampled bands being generated.
4356 : * @param pahOvrBands the list of downsampled bands to be generated.
4357 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4358 : * @param pfnProgress progress report function.
4359 : * @param pProgressData progress function callback data.
4360 : * @param papszOptions NULL terminated list of options as key=value pairs, or
4361 : * NULL
4362 : * @return CE_None on success or CE_Failure on failure.
4363 : * @since GDAL 3.6
4364 : */
4365 806 : CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
4366 : GDALRasterBandH *pahOvrBands,
4367 : const char *pszResampling,
4368 : GDALProgressFunc pfnProgress,
4369 : void *pProgressData, CSLConstList papszOptions)
4370 :
4371 : {
4372 806 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
4373 806 : GDALRasterBand **papoOvrBands =
4374 : reinterpret_cast<GDALRasterBand **>(pahOvrBands);
4375 :
4376 806 : if (pfnProgress == nullptr)
4377 252 : pfnProgress = GDALDummyProgress;
4378 :
4379 806 : if (EQUAL(pszResampling, "NONE"))
4380 61 : return CE_None;
4381 :
4382 745 : int nKernelRadius = 0;
4383 : GDALResampleFunction pfnResampleFn =
4384 745 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
4385 :
4386 745 : if (pfnResampleFn == nullptr)
4387 0 : return CE_Failure;
4388 :
4389 : /* -------------------------------------------------------------------- */
4390 : /* Check color tables... */
4391 : /* -------------------------------------------------------------------- */
4392 745 : GDALColorTable *poColorTable = nullptr;
4393 :
4394 384 : if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
4395 1564 : EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
4396 446 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4397 : {
4398 9 : poColorTable = poSrcBand->GetColorTable();
4399 9 : if (poColorTable != nullptr)
4400 : {
4401 9 : if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
4402 : {
4403 0 : CPLError(CE_Warning, CPLE_AppDefined,
4404 : "Computing overviews on palette index raster bands "
4405 : "with a palette whose color interpretation is not RGB "
4406 : "will probably lead to unexpected results.");
4407 0 : poColorTable = nullptr;
4408 : }
4409 9 : else if (poColorTable->IsIdentity())
4410 : {
4411 0 : poColorTable = nullptr;
4412 : }
4413 : }
4414 : else
4415 : {
4416 0 : CPLError(CE_Warning, CPLE_AppDefined,
4417 : "Computing overviews on palette index raster bands "
4418 : "without a palette will probably lead to unexpected "
4419 : "results.");
4420 : }
4421 : }
4422 : // Not ready yet
4423 2154 : else if ((EQUAL(pszResampling, "CUBIC") ||
4424 682 : EQUAL(pszResampling, "CUBICSPLINE") ||
4425 682 : EQUAL(pszResampling, "LANCZOS") ||
4426 1475 : EQUAL(pszResampling, "BILINEAR")) &&
4427 57 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4428 : {
4429 0 : CPLError(CE_Warning, CPLE_AppDefined,
4430 : "Computing %s overviews on palette index raster bands "
4431 : "will probably lead to unexpected results.",
4432 : pszResampling);
4433 : }
4434 :
4435 : // If we have a nodata mask and we are doing something more complicated
4436 : // than nearest neighbouring, we have to fetch to nodata mask.
4437 :
4438 745 : GDALRasterBand *poMaskBand = nullptr;
4439 745 : bool bUseNoDataMask = false;
4440 745 : bool bCanUseCascaded = true;
4441 :
4442 745 : if (!STARTS_WITH_CI(pszResampling, "NEAR"))
4443 : {
4444 : // Special case if we are an alpha/mask band. We want it to be
4445 : // considered as the mask band to avoid alpha=0 to be taken into account
4446 : // in average computation.
4447 503 : if (poSrcBand->IsMaskBand())
4448 : {
4449 90 : poMaskBand = poSrcBand;
4450 90 : bUseNoDataMask = true;
4451 : }
4452 : else
4453 : {
4454 413 : poMaskBand = poSrcBand->GetMaskBand();
4455 413 : const int nMaskFlags = poSrcBand->GetMaskFlags();
4456 413 : bCanUseCascaded =
4457 413 : (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
4458 413 : bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
4459 : }
4460 : }
4461 :
4462 : /* -------------------------------------------------------------------- */
4463 : /* If we are operating on multiple overviews, and using */
4464 : /* averaging, lets do them in cascading order to reduce the */
4465 : /* amount of computation. */
4466 : /* -------------------------------------------------------------------- */
4467 :
4468 : // In case the mask made be computed from another band of the dataset,
4469 : // we can't use cascaded generation, as the computation of the overviews
4470 : // of the band used for the mask band may not have yet occurred (#3033).
4471 745 : if ((STARTS_WITH_CI(pszResampling, "AVER") ||
4472 384 : EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
4473 353 : EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
4474 299 : EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
4475 745 : EQUAL(pszResampling, "MODE")) &&
4476 42 : nOverviewCount > 1 && bCanUseCascaded)
4477 42 : return GDALRegenerateCascadingOverviews(
4478 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
4479 42 : pProgressData, papszOptions);
4480 :
4481 : /* -------------------------------------------------------------------- */
4482 : /* Setup one horizontal swath to read from the raw buffer. */
4483 : /* -------------------------------------------------------------------- */
4484 703 : int nFRXBlockSize = 0;
4485 703 : int nFRYBlockSize = 0;
4486 703 : poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
4487 :
4488 703 : const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
4489 1164 : const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
4490 1118 : EQUAL(pszResampling, "MODE") ||
4491 415 : !GDALDataTypeIsComplex(eSrcDataType);
4492 : const GDALDataType eWrkDataType =
4493 : bUseGenericResampleFn
4494 703 : ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
4495 703 : : GDT_CFloat32;
4496 :
4497 703 : const int nWidth = poSrcBand->GetXSize();
4498 703 : const int nHeight = poSrcBand->GetYSize();
4499 :
4500 703 : int nMaxOvrFactor = 1;
4501 1487 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
4502 : {
4503 784 : const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
4504 784 : const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
4505 784 : nMaxOvrFactor = std::max(
4506 : nMaxOvrFactor,
4507 784 : static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
4508 784 : nMaxOvrFactor = std::max(
4509 : nMaxOvrFactor,
4510 784 : static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
4511 : }
4512 :
4513 703 : int nFullResYChunk = nFRYBlockSize;
4514 703 : int nMaxChunkYSizeQueried = 0;
4515 :
4516 : const auto UpdateChunkHeightAndGetChunkSize =
4517 9137 : [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
4518 27411 : eWrkDataType, nWidth]()
4519 : {
4520 : // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
4521 : // + nFullResYChunk) / nMaxOvrFactor)
4522 9137 : nFullResYChunk = std::max(nFullResYChunk, 2 * nMaxOvrFactor);
4523 9137 : nMaxChunkYSizeQueried =
4524 9137 : nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
4525 9137 : return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
4526 9137 : nMaxChunkYSizeQueried * nWidth;
4527 703 : };
4528 :
4529 : // Only configurable for debug / testing
4530 : const char *pszChunkYSize =
4531 703 : CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
4532 703 : if (pszChunkYSize)
4533 : {
4534 : // coverity[tainted_data]
4535 0 : nFullResYChunk = atoi(pszChunkYSize);
4536 : }
4537 :
4538 : // Only configurable for debug / testing
4539 : const int nChunkMaxSize =
4540 703 : atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
4541 :
4542 703 : auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
4543 703 : if (nChunkSize > nChunkMaxSize)
4544 : {
4545 3 : if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
4546 9 : !GDALDataTypeIsComplex(eSrcDataType) &&
4547 3 : (!STARTS_WITH_CI(pszResampling, "AVER") ||
4548 0 : EQUAL(pszResampling, "AVERAGE")))
4549 : {
4550 : // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
4551 : // which use a block based strategy, which is much less memory
4552 : // hungry.
4553 3 : return GDALRegenerateOverviewsMultiBand(
4554 : 1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
4555 3 : pfnProgress, pProgressData, papszOptions);
4556 : }
4557 0 : else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
4558 : {
4559 0 : return GDALRegenerateCascadingOverviews(
4560 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
4561 0 : pfnProgress, pProgressData, papszOptions);
4562 : }
4563 : }
4564 700 : else if (pszChunkYSize == nullptr)
4565 : {
4566 : // Try to get as close as possible to nChunkMaxSize
4567 9134 : while (nChunkSize * 2 < nChunkMaxSize)
4568 : {
4569 8434 : nFullResYChunk *= 2;
4570 8434 : nChunkSize = UpdateChunkHeightAndGetChunkSize();
4571 : }
4572 : }
4573 :
4574 700 : int nHasNoData = 0;
4575 700 : const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
4576 700 : const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
4577 : const bool bPropagateNoData =
4578 700 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
4579 :
4580 : // Structure describing a resampling job
4581 : struct OvrJob
4582 : {
4583 : // Buffers to free when job is finished
4584 : std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
4585 : std::shared_ptr<PointerHolder> oSrcBufferHolder{};
4586 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
4587 :
4588 : GDALRasterBand *poDstBand = nullptr;
4589 :
4590 : // Input parameters of pfnResampleFn
4591 : GDALResampleFunction pfnResampleFn = nullptr;
4592 : int nSrcWidth = 0;
4593 : int nSrcHeight = 0;
4594 : int nDstWidth = 0;
4595 : GDALOverviewResampleArgs args{};
4596 : const void *pChunk = nullptr;
4597 : bool bUseGenericResampleFn = false;
4598 :
4599 : // Output values of resampling function
4600 : CPLErr eErr = CE_Failure;
4601 : void *pDstBuffer = nullptr;
4602 : GDALDataType eDstBufferDataType = GDT_Unknown;
4603 :
4604 : // Synchronization
4605 : bool bFinished = false;
4606 : std::mutex mutex{};
4607 : std::condition_variable cv{};
4608 :
4609 0 : void SetSrcMaskBufferHolder(
4610 : const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
4611 : {
4612 0 : oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
4613 0 : }
4614 :
4615 0 : void SetSrcBufferHolder(
4616 : const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
4617 : {
4618 0 : oSrcBufferHolder = oSrcBufferHolderIn;
4619 0 : }
4620 : };
4621 :
4622 : // Thread function to resample
4623 782 : const auto JobResampleFunc = [](void *pData)
4624 : {
4625 782 : OvrJob *poJob = static_cast<OvrJob *>(pData);
4626 :
4627 782 : if (poJob->bUseGenericResampleFn)
4628 : {
4629 780 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
4630 : &(poJob->pDstBuffer),
4631 : &(poJob->eDstBufferDataType));
4632 : }
4633 : else
4634 : {
4635 2 : poJob->eErr = GDALResampleChunkC32R(
4636 : poJob->nSrcWidth, poJob->nSrcHeight,
4637 2 : static_cast<const float *>(poJob->pChunk),
4638 : poJob->args.nChunkYOff, poJob->args.nChunkYSize,
4639 : poJob->args.nDstYOff, poJob->args.nDstYOff2,
4640 : poJob->args.nOvrXSize, poJob->args.nOvrYSize,
4641 : &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
4642 : poJob->args.pszResampling);
4643 : }
4644 :
4645 : poJob->oDstBufferHolder =
4646 782 : std::make_unique<PointerHolder>(poJob->pDstBuffer);
4647 :
4648 : {
4649 1564 : std::lock_guard<std::mutex> guard(poJob->mutex);
4650 782 : poJob->bFinished = true;
4651 782 : poJob->cv.notify_one();
4652 : }
4653 782 : };
4654 :
4655 : // Function to write resample data to target band
4656 782 : const auto WriteJobData = [](const OvrJob *poJob)
4657 : {
4658 1564 : return poJob->poDstBand->RasterIO(
4659 782 : GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
4660 782 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
4661 782 : poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
4662 782 : poJob->eDstBufferDataType, 0, 0, nullptr);
4663 : };
4664 :
4665 : // Wait for completion of oldest job and serialize it
4666 : const auto WaitAndFinalizeOldestJob =
4667 0 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
4668 : {
4669 0 : auto poOldestJob = jobList.front().get();
4670 : {
4671 0 : std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
4672 : // coverity[missing_lock:FALSE]
4673 0 : while (!poOldestJob->bFinished)
4674 : {
4675 0 : poOldestJob->cv.wait(oGuard);
4676 : }
4677 : }
4678 0 : CPLErr l_eErr = poOldestJob->eErr;
4679 0 : if (l_eErr == CE_None)
4680 : {
4681 0 : l_eErr = WriteJobData(poOldestJob);
4682 : }
4683 :
4684 0 : jobList.pop_front();
4685 0 : return l_eErr;
4686 : };
4687 :
4688 : // Queue of jobs
4689 1400 : std::list<std::unique_ptr<OvrJob>> jobList;
4690 :
4691 700 : GByte *pabyChunkNodataMask = nullptr;
4692 700 : void *pChunk = nullptr;
4693 :
4694 700 : const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
4695 2800 : const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
4696 700 : ? CPLGetNumCPUs()
4697 700 : : atoi(pszThreads)));
4698 : auto poThreadPool =
4699 700 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
4700 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
4701 1400 : : std::unique_ptr<CPLJobQueue>(nullptr);
4702 :
4703 : /* -------------------------------------------------------------------- */
4704 : /* Loop over image operating on chunks. */
4705 : /* -------------------------------------------------------------------- */
4706 700 : int nChunkYOff = 0;
4707 700 : CPLErr eErr = CE_None;
4708 :
4709 1405 : for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
4710 705 : nChunkYOff += nFullResYChunk)
4711 : {
4712 705 : if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
4713 : pProgressData))
4714 : {
4715 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
4716 0 : eErr = CE_Failure;
4717 : }
4718 :
4719 705 : if (nFullResYChunk + nChunkYOff > nHeight)
4720 698 : nFullResYChunk = nHeight - nChunkYOff;
4721 :
4722 705 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
4723 705 : int nChunkYSizeQueried =
4724 705 : nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
4725 705 : if (nChunkYOffQueried < 0)
4726 : {
4727 62 : nChunkYSizeQueried += nChunkYOffQueried;
4728 62 : nChunkYOffQueried = 0;
4729 : }
4730 705 : if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
4731 62 : nChunkYSizeQueried = nHeight - nChunkYOffQueried;
4732 :
4733 : // Avoid accumulating too many tasks and exhaust RAM
4734 : // Try to complete already finished jobs
4735 705 : while (eErr == CE_None && !jobList.empty())
4736 : {
4737 0 : auto poOldestJob = jobList.front().get();
4738 : {
4739 0 : std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
4740 0 : if (!poOldestJob->bFinished)
4741 : {
4742 0 : break;
4743 : }
4744 : }
4745 0 : eErr = poOldestJob->eErr;
4746 0 : if (eErr == CE_None)
4747 : {
4748 0 : eErr = WriteJobData(poOldestJob);
4749 : }
4750 :
4751 0 : jobList.pop_front();
4752 : }
4753 :
4754 : // And in case we have saturated the number of threads,
4755 : // wait for completion of tasks to go below the threshold.
4756 1410 : while (eErr == CE_None &&
4757 705 : jobList.size() >= static_cast<size_t>(nThreads))
4758 : {
4759 0 : eErr = WaitAndFinalizeOldestJob(jobList);
4760 : }
4761 :
4762 : // (Re)allocate buffers if needed
4763 705 : if (pChunk == nullptr)
4764 : {
4765 700 : pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
4766 : nMaxChunkYSizeQueried, nWidth);
4767 : }
4768 705 : if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
4769 : {
4770 : pabyChunkNodataMask = static_cast<GByte *>(
4771 274 : VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
4772 : }
4773 :
4774 705 : if (pChunk == nullptr ||
4775 274 : (bUseNoDataMask && pabyChunkNodataMask == nullptr))
4776 : {
4777 0 : CPLFree(pChunk);
4778 0 : CPLFree(pabyChunkNodataMask);
4779 0 : return CE_Failure;
4780 : }
4781 :
4782 : // Read chunk.
4783 705 : if (eErr == CE_None)
4784 705 : eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
4785 : nChunkYSizeQueried, pChunk, nWidth,
4786 : nChunkYSizeQueried, eWrkDataType, 0, 0,
4787 : nullptr);
4788 705 : if (eErr == CE_None && bUseNoDataMask)
4789 274 : eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
4790 : nChunkYSizeQueried, pabyChunkNodataMask,
4791 : nWidth, nChunkYSizeQueried, GDT_Byte, 0,
4792 : 0, nullptr);
4793 :
4794 : // Special case to promote 1bit data to 8bit 0/255 values.
4795 705 : if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
4796 : {
4797 9 : if (eWrkDataType == GDT_Float32)
4798 : {
4799 0 : float *pafChunk = static_cast<float *>(pChunk);
4800 0 : for (GPtrDiff_t i = 0;
4801 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4802 : i++)
4803 : {
4804 0 : if (pafChunk[i] == 1.0)
4805 0 : pafChunk[i] = 255.0;
4806 : }
4807 : }
4808 9 : else if (eWrkDataType == GDT_Byte)
4809 : {
4810 9 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
4811 168417 : for (GPtrDiff_t i = 0;
4812 168417 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4813 : i++)
4814 : {
4815 168408 : if (pabyChunk[i] == 1)
4816 127437 : pabyChunk[i] = 255;
4817 : }
4818 : }
4819 0 : else if (eWrkDataType == GDT_UInt16)
4820 : {
4821 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
4822 0 : for (GPtrDiff_t i = 0;
4823 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4824 : i++)
4825 : {
4826 0 : if (pasChunk[i] == 1)
4827 0 : pasChunk[i] = 255;
4828 : }
4829 : }
4830 0 : else if (eWrkDataType == GDT_Float64)
4831 : {
4832 0 : double *padfChunk = static_cast<double *>(pChunk);
4833 0 : for (GPtrDiff_t i = 0;
4834 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4835 : i++)
4836 : {
4837 0 : if (padfChunk[i] == 1.0)
4838 0 : padfChunk[i] = 255.0;
4839 : }
4840 : }
4841 : else
4842 : {
4843 0 : CPLAssert(false);
4844 : }
4845 : }
4846 696 : else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
4847 : {
4848 0 : if (eWrkDataType == GDT_Float32)
4849 : {
4850 0 : float *pafChunk = static_cast<float *>(pChunk);
4851 0 : for (GPtrDiff_t i = 0;
4852 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4853 : i++)
4854 : {
4855 0 : if (pafChunk[i] == 1.0)
4856 0 : pafChunk[i] = 0.0;
4857 0 : else if (pafChunk[i] == 0.0)
4858 0 : pafChunk[i] = 255.0;
4859 : }
4860 : }
4861 0 : else if (eWrkDataType == GDT_Byte)
4862 : {
4863 0 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
4864 0 : for (GPtrDiff_t i = 0;
4865 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4866 : i++)
4867 : {
4868 0 : if (pabyChunk[i] == 1)
4869 0 : pabyChunk[i] = 0;
4870 0 : else if (pabyChunk[i] == 0)
4871 0 : pabyChunk[i] = 255;
4872 : }
4873 : }
4874 0 : else if (eWrkDataType == GDT_UInt16)
4875 : {
4876 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
4877 0 : for (GPtrDiff_t i = 0;
4878 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4879 : i++)
4880 : {
4881 0 : if (pasChunk[i] == 1)
4882 0 : pasChunk[i] = 0;
4883 0 : else if (pasChunk[i] == 0)
4884 0 : pasChunk[i] = 255;
4885 : }
4886 : }
4887 0 : else if (eWrkDataType == GDT_Float64)
4888 : {
4889 0 : double *padfChunk = static_cast<double *>(pChunk);
4890 0 : for (GPtrDiff_t i = 0;
4891 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4892 : i++)
4893 : {
4894 0 : if (padfChunk[i] == 1.0)
4895 0 : padfChunk[i] = 0.0;
4896 0 : else if (padfChunk[i] == 0.0)
4897 0 : padfChunk[i] = 255.0;
4898 : }
4899 : }
4900 : else
4901 : {
4902 0 : CPLAssert(false);
4903 : }
4904 : }
4905 :
4906 : auto oSrcBufferHolder =
4907 1410 : std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
4908 : auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
4909 1410 : poJobQueue ? pabyChunkNodataMask : nullptr);
4910 :
4911 1487 : for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
4912 : ++iOverview)
4913 : {
4914 782 : GDALRasterBand *poDstBand = papoOvrBands[iOverview];
4915 782 : const int nDstWidth = poDstBand->GetXSize();
4916 782 : const int nDstHeight = poDstBand->GetYSize();
4917 :
4918 782 : const double dfXRatioDstToSrc =
4919 782 : static_cast<double>(nWidth) / nDstWidth;
4920 782 : const double dfYRatioDstToSrc =
4921 782 : static_cast<double>(nHeight) / nDstHeight;
4922 :
4923 : /* --------------------------------------------------------------------
4924 : */
4925 : /* Figure out the line to start writing to, and the first line
4926 : */
4927 : /* to not write to. In theory this approach should ensure that
4928 : */
4929 : /* every output line will be written if all input chunks are */
4930 : /* processed. */
4931 : /* --------------------------------------------------------------------
4932 : */
4933 782 : int nDstYOff =
4934 782 : static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
4935 782 : if (nDstYOff == nDstHeight)
4936 0 : continue;
4937 782 : int nDstYOff2 = static_cast<int>(
4938 782 : 0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
4939 :
4940 782 : if (nChunkYOff + nFullResYChunk == nHeight)
4941 775 : nDstYOff2 = nDstHeight;
4942 : #if DEBUG_VERBOSE
4943 : CPLDebug("GDAL",
4944 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
4945 : nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
4946 : nDstWidth, nDstYOff2 - nDstYOff);
4947 : #endif
4948 :
4949 1564 : auto poJob = std::make_unique<OvrJob>();
4950 782 : poJob->pfnResampleFn = pfnResampleFn;
4951 782 : poJob->bUseGenericResampleFn = bUseGenericResampleFn;
4952 782 : poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
4953 782 : poJob->args.nOvrXSize = poDstBand->GetXSize();
4954 782 : poJob->args.nOvrYSize = poDstBand->GetYSize();
4955 : const char *pszNBITS =
4956 782 : poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
4957 782 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
4958 782 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
4959 782 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
4960 782 : poJob->args.eWrkDataType = eWrkDataType;
4961 782 : poJob->pChunk = pChunk;
4962 782 : poJob->args.pabyChunkNodataMask = pabyChunkNodataMask;
4963 782 : poJob->nSrcWidth = nWidth;
4964 782 : poJob->nSrcHeight = nHeight;
4965 782 : poJob->args.nChunkXOff = 0;
4966 782 : poJob->args.nChunkXSize = nWidth;
4967 782 : poJob->args.nChunkYOff = nChunkYOff;
4968 782 : poJob->args.nChunkYSize = nChunkYSizeQueried;
4969 782 : poJob->nDstWidth = nDstWidth;
4970 782 : poJob->args.nDstXOff = 0;
4971 782 : poJob->args.nDstXOff2 = nDstWidth;
4972 782 : poJob->args.nDstYOff = nDstYOff;
4973 782 : poJob->args.nDstYOff2 = nDstYOff2;
4974 782 : poJob->poDstBand = poDstBand;
4975 782 : poJob->args.pszResampling = pszResampling;
4976 782 : poJob->args.bHasNoData = bHasNoData;
4977 782 : poJob->args.dfNoDataValue = dfNoDataValue;
4978 782 : poJob->args.poColorTable = poColorTable;
4979 782 : poJob->args.eSrcDataType = eSrcDataType;
4980 782 : poJob->args.bPropagateNoData = bPropagateNoData;
4981 :
4982 782 : if (poJobQueue)
4983 : {
4984 0 : poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
4985 0 : poJob->SetSrcBufferHolder(oSrcBufferHolder);
4986 0 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
4987 0 : jobList.emplace_back(std::move(poJob));
4988 : }
4989 : else
4990 : {
4991 782 : JobResampleFunc(poJob.get());
4992 782 : eErr = poJob->eErr;
4993 782 : if (eErr == CE_None)
4994 : {
4995 782 : eErr = WriteJobData(poJob.get());
4996 : }
4997 : }
4998 : }
4999 :
5000 705 : if (poJobQueue)
5001 : {
5002 0 : pChunk = nullptr;
5003 0 : pabyChunkNodataMask = nullptr;
5004 : }
5005 : }
5006 :
5007 700 : VSIFree(pChunk);
5008 700 : VSIFree(pabyChunkNodataMask);
5009 :
5010 : // Wait for all pending jobs to complete
5011 700 : while (!jobList.empty())
5012 : {
5013 0 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5014 0 : if (l_eErr != CE_None && eErr == CE_None)
5015 0 : eErr = l_eErr;
5016 : }
5017 :
5018 : /* -------------------------------------------------------------------- */
5019 : /* Renormalized overview mean / stddev if needed. */
5020 : /* -------------------------------------------------------------------- */
5021 700 : if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
5022 : {
5023 0 : GDALOverviewMagnitudeCorrection(
5024 : poSrcBand, nOverviewCount,
5025 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
5026 : GDALDummyProgress, nullptr);
5027 : }
5028 :
5029 : /* -------------------------------------------------------------------- */
5030 : /* It can be important to flush out data to overviews. */
5031 : /* -------------------------------------------------------------------- */
5032 1475 : for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
5033 : ++iOverview)
5034 : {
5035 775 : eErr = papoOvrBands[iOverview]->FlushCache(false);
5036 : }
5037 :
5038 700 : if (eErr == CE_None)
5039 700 : pfnProgress(1.0, nullptr, pProgressData);
5040 :
5041 700 : return eErr;
5042 : }
5043 :
5044 : /************************************************************************/
5045 : /* GDALRegenerateOverviewsMultiBand() */
5046 : /************************************************************************/
5047 :
5048 : /**
5049 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5050 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5051 : *
5052 : * This function will generate one or more overview images from a base
5053 : * image using the requested downsampling algorithm. Its primary use
5054 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
5055 : * can also be used to generate downsampled images in one file from another
5056 : * outside the overview architecture.
5057 : *
5058 : * The output bands need to exist in advance and share the same characteristics
5059 : * (type, dimensions)
5060 : *
5061 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5062 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
5063 : *
5064 : * It does not support color tables or complex data types.
5065 : *
5066 : * The pseudo-algorithm used by the function is :
5067 : * for each overview
5068 : * iterate on lines of the source by a step of deltay
5069 : * iterate on columns of the source by a step of deltax
5070 : * read the source data of size deltax * deltay for all the bands
5071 : * generate the corresponding overview block for all the bands
5072 : *
5073 : * This function will honour properly NODATA_VALUES tuples (special dataset
5074 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5075 : * considered as the nodata value and not each value of the triplet
5076 : * independently per band.
5077 : *
5078 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
5079 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
5080 : * overview computation.
5081 : *
5082 : * @param nBands the number of bands, size of papoSrcBands and size of
5083 : * first dimension of papapoOverviewBands
5084 : * @param papoSrcBands the list of source bands to downsample
5085 : * @param nOverviews the number of downsampled overview levels being generated.
5086 : * @param papapoOverviewBands two-dimensional array of bands. First dimension is
5087 : * indexed by nBands. Second dimension is indexed by
5088 : * nOverviews.
5089 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5090 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
5091 : * @param pfnProgress progress report function.
5092 : * @param pProgressData progress function callback data.
5093 : * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
5094 : * key=value pairs, or NULL
5095 : * Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
5096 : * options can be specified to express that overviews should
5097 : * be regenerated only in the specified subset of the source
5098 : * dataset.
5099 : * @return CE_None on success or CE_Failure on failure.
5100 : */
5101 :
5102 354 : CPLErr GDALRegenerateOverviewsMultiBand(
5103 : int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
5104 : GDALRasterBand *const *const *papapoOverviewBands,
5105 : const char *pszResampling, GDALProgressFunc pfnProgress,
5106 : void *pProgressData, CSLConstList papszOptions)
5107 : {
5108 354 : CPL_IGNORE_RET_VAL(papszOptions);
5109 : // NOTE(review): CPL_IGNORE_RET_VAL above looks stale: papszOptions is read below.
5110 354 : if (pfnProgress == nullptr)
5111 6 : pfnProgress = GDALDummyProgress;
5112 : // "NONE" requests no overview computation: report success without doing anything.
5113 354 : if (EQUAL(pszResampling, "NONE"))
5114 2 : return CE_None;
5115 :
5116 : // Sanity checks: reject resampling names outside of the list below (any name starting with NEAR is accepted).
5117 352 : if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
5118 169 : !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
5119 70 : !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
5120 18 : !EQUAL(pszResampling, "CUBICSPLINE") &&
5121 17 : !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
5122 5 : !EQUAL(pszResampling, "MODE"))
5123 : {
5124 0 : CPLError(CE_Failure, CPLE_NotSupported,
5125 : "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
5126 : "not supported",
5127 : pszResampling);
5128 0 : return CE_Failure;
5129 : }
5130 : // Resolve the resampling callback; nKernelRadius is used below to pad source windows.
5131 352 : int nKernelRadius = 0;
5132 : GDALResampleFunction pfnResampleFn =
5133 352 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
5134 352 : if (pfnResampleFn == nullptr)
5135 0 : return CE_Failure;
5136 : // NOTE(review): papoSrcBands[0] is read unconditionally; presumably callers pass nBands >= 1.
5137 352 : const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
5138 352 : const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
5139 352 : if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
5140 0 : return CE_None;
5141 352 : GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
5142 647 : for (int iBand = 1; iBand < nBands; ++iBand)
5143 : {
5144 590 : if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
5145 295 : papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
5146 : {
5147 0 : CPLError(
5148 : CE_Failure, CPLE_NotSupported,
5149 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5150 : "have the same dimensions");
5151 0 : return CE_Failure;
5152 : }
5153 295 : if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
5154 : {
5155 0 : CPLError(
5156 : CE_Failure, CPLE_NotSupported,
5157 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5158 : "have the same data type");
5159 0 : return CE_Failure;
5160 : }
5161 : }
5162 : // Same consistency checks, level by level, for the overview bands.
5163 938 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5164 : {
5165 586 : const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
5166 586 : const int nDstWidth = poOvrFirstBand->GetXSize();
5167 586 : const int nDstHeight = poOvrFirstBand->GetYSize();
5168 1151 : for (int iBand = 1; iBand < nBands; ++iBand)
5169 : {
5170 565 : const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
5171 1130 : if (poOvrBand->GetXSize() != nDstWidth ||
5172 565 : poOvrBand->GetYSize() != nDstHeight)
5173 : {
5174 0 : CPLError(
5175 : CE_Failure, CPLE_NotSupported,
5176 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5177 : "of the same level must have the same dimensions");
5178 0 : return CE_Failure;
5179 : }
5180 565 : if (poOvrBand->GetRasterDataType() != eDataType)
5181 : {
5182 0 : CPLError(
5183 : CE_Failure, CPLE_NotSupported,
5184 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5185 : "must have the same data type as the source bands");
5186 0 : return CE_Failure;
5187 : }
5188 : }
5189 : }
5190 :
5191 : // First pass to compute the total number of pixels to write (only used to scale progress reports).
5192 352 : double dfTotalPixelCount = 0;
5193 352 : const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
5194 352 : const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
5195 352 : const int nSrcXSize = atoi(CSLFetchNameValueDef(
5196 : papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
5197 352 : const int nSrcYSize = atoi(CSLFetchNameValueDef(
5198 : papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
5199 938 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5200 : {
5201 586 : dfTotalPixelCount +=
5202 1172 : static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
5203 586 : papapoOverviewBands[0][iOverview]->GetXSize() *
5204 1172 : static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
5205 586 : papapoOverviewBands[0][iOverview]->GetYSize();
5206 : }
5207 : // Data type of the in-memory source chunks handed to the resampling function.
5208 : const GDALDataType eWrkDataType =
5209 352 : GDALGetOvrWorkDataType(pszResampling, eDataType);
5210 352 : const int nWrkDataTypeSize = GDALGetDataTypeSizeBytes(eWrkDataType);
5211 :
5212 352 : const bool bIsMask = papoSrcBands[0]->IsMaskBand();
5213 :
5214 : // If we have a nodata mask and we are doing something more complicated
5215 : // than nearest neighbouring, we have to fetch the nodata mask.
5216 : const bool bUseNoDataMask =
5217 515 : !STARTS_WITH_CI(pszResampling, "NEAR") &&
5218 163 : (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
5219 :
5220 : bool *const pabHasNoData =
5221 352 : static_cast<bool *>(VSI_MALLOC_VERBOSE(nBands * sizeof(bool)));
5222 : double *const padfNoDataValue =
5223 352 : static_cast<double *>(VSI_MALLOC_VERBOSE(nBands * sizeof(double)));
5224 352 : if (pabHasNoData == nullptr || padfNoDataValue == nullptr)
5225 : {
5226 0 : CPLFree(pabHasNoData);
5227 0 : CPLFree(padfNoDataValue);
5228 0 : return CE_Failure;
5229 : }
5230 : // Remember, per band, the nodata value and whether one is set.
5231 999 : for (int iBand = 0; iBand < nBands; ++iBand)
5232 : {
5233 647 : int nHasNoData = 0;
5234 1294 : padfNoDataValue[iBand] =
5235 647 : papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
5236 647 : pabHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
5237 : }
5238 : const bool bPropagateNoData =
5239 352 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5240 : // GDAL_NUM_THREADS (default "1", or ALL_CPUS) clamped to [1, 128]; no thread pool is requested when single-threaded.
5241 352 : const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
5242 1408 : const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
5243 352 : ? CPLGetNumCPUs()
5244 352 : : atoi(pszThreads)));
5245 : auto poThreadPool =
5246 352 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5247 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5248 352 : : std::unique_ptr<CPLJobQueue>(nullptr);
5249 :
5250 : // Only configurable for debug / testing. Byte budget for the source chunks (at least 100).
5251 : const int nChunkMaxSize = std::max(
5252 352 : 100, atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760")));
5253 :
5254 : // Second pass to do the real job.
5255 352 : double dfCurPixelCount = 0;
5256 352 : CPLErr eErr = CE_None;
5257 937 : for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
5258 : ++iOverview)
5259 : {
5260 585 : int iSrcOverview = -1; // -1 means the source bands.
5261 :
5262 : const int nDstTotalWidth =
5263 585 : papapoOverviewBands[0][iOverview]->GetXSize();
5264 : const int nDstTotalHeight =
5265 585 : papapoOverviewBands[0][iOverview]->GetYSize();
5266 :
5267 : // Compute the coordinates of the target region to refresh (source window scaled to this level; EPS presumably absorbs floating-point rounding)
5268 585 : constexpr double EPS = 1e-8;
5269 585 : const int nDstXOffStart = static_cast<int>(
5270 585 : static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
5271 : EPS);
5272 : const int nDstXOffEnd =
5273 1170 : std::min(static_cast<int>(
5274 585 : std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
5275 585 : nToplevelSrcWidth * nDstTotalWidth -
5276 : EPS)),
5277 585 : nDstTotalWidth);
5278 585 : const int nDstWidth = nDstXOffEnd - nDstXOffStart;
5279 585 : const int nDstYOffStart =
5280 585 : static_cast<int>(static_cast<double>(nSrcYOff) /
5281 585 : nToplevelSrcHeight * nDstTotalHeight +
5282 : EPS);
5283 : const int nDstYOffEnd =
5284 1170 : std::min(static_cast<int>(
5285 585 : std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
5286 585 : nToplevelSrcHeight * nDstTotalHeight -
5287 : EPS)),
5288 585 : nDstTotalHeight);
5289 :
5290 : // Try to use previous level of overview as the source to compute
5291 : // the next level.
5292 585 : int nSrcWidth = nToplevelSrcWidth;
5293 585 : int nSrcHeight = nToplevelSrcHeight;
5294 818 : if (iOverview > 0 &&
5295 233 : papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
5296 : {
5297 225 : nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
5298 225 : nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
5299 225 : iSrcOverview = iOverview - 1;
5300 : }
5301 : // Number of source pixels per destination pixel along each axis.
5302 585 : const double dfXRatioDstToSrc =
5303 585 : static_cast<double>(nSrcWidth) / nDstTotalWidth;
5304 585 : const double dfYRatioDstToSrc =
5305 585 : static_cast<double>(nSrcHeight) / nDstTotalHeight;
5306 : // Rounded decimation factor (at least 1); scales nKernelRadius into source pixels.
5307 1170 : int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
5308 585 : static_cast<int>(0.5 + dfYRatioDstToSrc));
5309 585 : if (nOvrFactor == 0)
5310 0 : nOvrFactor = 1;
5311 : // Start from the block size of the destination band.
5312 585 : int nDstChunkXSize = 0;
5313 585 : int nDstChunkYSize = 0;
5314 585 : papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
5315 : &nDstChunkYSize);
5316 : // DST_CHUNK_X_SIZE / DST_CHUNK_Y_SIZE, when both set, override that block size (see recursive call below).
5317 : const char *pszDST_CHUNK_X_SIZE =
5318 585 : CSLFetchNameValue(papszOptions, "DST_CHUNK_X_SIZE");
5319 : const char *pszDST_CHUNK_Y_SIZE =
5320 585 : CSLFetchNameValue(papszOptions, "DST_CHUNK_Y_SIZE");
5321 585 : if (pszDST_CHUNK_X_SIZE && pszDST_CHUNK_Y_SIZE)
5322 : {
5323 12 : nDstChunkXSize = std::max(1, atoi(pszDST_CHUNK_X_SIZE));
5324 12 : nDstChunkYSize = std::max(1, atoi(pszDST_CHUNK_Y_SIZE));
5325 12 : CPLDebug("GDAL", "Using dst chunk size %d x %d", nDstChunkXSize,
5326 : nDstChunkYSize);
5327 : }
5328 :
5329 : // Try to extend the chunk size so that the memory needed to acquire
5330 : // source pixels goes up to nChunkMaxSize (10 MB by default).
5331 : // This can help for drivers that support multi-threaded reading
5332 585 : const int nFullResYChunk =
5333 585 : 2 + static_cast<int>(nDstChunkYSize * dfYRatioDstToSrc);
5334 585 : const int nFullResYChunkQueried =
5335 585 : nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
5336 821 : while (nDstChunkXSize < nDstWidth)
5337 : {
5338 253 : const int nFullResXChunk =
5339 253 : 2 + static_cast<int>(2 * nDstChunkXSize * dfXRatioDstToSrc);
5340 :
5341 253 : const int nFullResXChunkQueried =
5342 253 : nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
5343 :
5344 253 : if (static_cast<GIntBig>(nFullResXChunkQueried) *
5345 253 : nFullResYChunkQueried * nBands * nWrkDataTypeSize >
5346 253 : nChunkMaxSize)
5347 : {
5348 17 : break;
5349 : }
5350 :
5351 236 : nDstChunkXSize *= 2;
5352 : }
5353 585 : nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
5354 : // Source window width for a full chunk, without and with the kernel margin.
5355 585 : const int nFullResXChunk =
5356 585 : 2 + static_cast<int>(nDstChunkXSize * dfXRatioDstToSrc);
5357 585 : const int nFullResXChunkQueried =
5358 585 : nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
5359 :
5360 : // Make sure that the RAM requirements to acquire the source data does
5361 : // not exceed nChunkMaxSize
5362 : // If so, reduce the destination chunk size, generate overviews in a
5363 : // temporary dataset, and copy that temporary dataset over the target
5364 : // overview bands (to avoid issues with lossy compression)
5365 585 : const auto nMemRequirement =
5366 585 : static_cast<GIntBig>(nFullResXChunkQueried) *
5367 585 : nFullResYChunkQueried * nBands * nWrkDataTypeSize;
5368 585 : if (nMemRequirement > nChunkMaxSize &&
5369 10 : !(pszDST_CHUNK_X_SIZE && pszDST_CHUNK_Y_SIZE))
5370 : {
5371 : // Compute a smaller destination chunk size (each dimension divided by at least 4)
5372 12 : const auto nOverShootFactor = nMemRequirement / nChunkMaxSize;
5373 : const auto nSqrtOverShootFactor = std::max<GIntBig>(
5374 24 : 4, static_cast<GIntBig>(std::ceil(
5375 12 : std::sqrt(static_cast<double>(nOverShootFactor)))));
5376 : const int nReducedDstChunkXSize = std::max(
5377 12 : 1, static_cast<int>(nDstChunkXSize / nSqrtOverShootFactor));
5378 : const int nReducedDstChunkYSize = std::max(
5379 12 : 1, static_cast<int>(nDstChunkYSize / nSqrtOverShootFactor));
5380 12 : if (nReducedDstChunkXSize < nDstChunkXSize ||
5381 0 : nReducedDstChunkYSize < nDstChunkYSize)
5382 : {
5383 12 : CPLStringList aosOptions(papszOptions);
5384 : aosOptions.SetNameValue(
5385 : "DST_CHUNK_X_SIZE",
5386 12 : CPLSPrintf("%d", nReducedDstChunkXSize));
5387 : aosOptions.SetNameValue(
5388 : "DST_CHUNK_Y_SIZE",
5389 12 : CPLSPrintf("%d", nReducedDstChunkYSize));
5390 : // Temporary dataset: MEM when it fits in nChunkMaxSize and GTIFF is not forced, or when MEM is forced; a GTiff file otherwise.
5391 : const auto nTmpDSMemRequirement =
5392 12 : static_cast<GIntBig>(nDstTotalWidth) * nDstTotalHeight *
5393 12 : nBands * GDALGetDataTypeSizeBytes(eDataType);
5394 0 : std::unique_ptr<GDALDataset> poTmpDS;
5395 : // Config option mostly/only for autotest purposes
5396 : const char *pszGDAL_OVR_TEMP_DRIVER =
5397 12 : CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
5398 12 : if ((nTmpDSMemRequirement <= nChunkMaxSize &&
5399 2 : !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
5400 10 : EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
5401 : {
5402 : auto poTmpDrv =
5403 11 : GetGDALDriverManager()->GetDriverByName("MEM");
5404 11 : if (!poTmpDrv)
5405 : {
5406 0 : eErr = CE_Failure;
5407 0 : break;
5408 : }
5409 11 : poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
5410 : nDstTotalHeight, nBands,
5411 11 : eDataType, nullptr));
5412 : }
5413 : else
5414 : {
5415 : auto poTmpDrv =
5416 1 : GetGDALDriverManager()->GetDriverByName("GTiff");
5417 1 : if (!poTmpDrv)
5418 : {
5419 0 : eErr = CE_Failure;
5420 0 : break;
5421 : }
5422 2 : std::string osTmpFilename;
5423 1 : auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
5424 1 : if (poDstDS)
5425 : {
5426 1 : osTmpFilename = poDstDS->GetDescription();
5427 : VSIStatBufL sStatBuf;
5428 1 : if (!osTmpFilename.empty() &&
5429 0 : VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
5430 0 : osTmpFilename += "_tmp_ovr.tif";
5431 : }
5432 1 : if (osTmpFilename.empty())
5433 : {
5434 1 : osTmpFilename = CPLGenerateTempFilename(nullptr);
5435 1 : osTmpFilename += ".tif";
5436 : }
5437 1 : CPLDebug("GDAL",
5438 : "Creating temporary file %s of %d x %d x %d",
5439 : osTmpFilename.c_str(), nDstTotalWidth,
5440 : nDstTotalHeight, nBands);
5441 2 : CPLStringList aosCO;
5442 1 : poTmpDS.reset(poTmpDrv->Create(
5443 : osTmpFilename.c_str(), nDstTotalWidth, nDstTotalHeight,
5444 1 : nBands, eDataType, aosCO.List()));
5445 1 : if (poTmpDS)
5446 : {
5447 1 : poTmpDS->MarkSuppressOnClose();
5448 1 : VSIUnlink(osTmpFilename.c_str());
5449 : }
5450 : }
5451 12 : if (!poTmpDS)
5452 : {
5453 0 : eErr = CE_Failure;
5454 0 : break;
5455 : }
5456 : // One-element band arrays so the temporary bands can be passed as a single overview level.
5457 12 : std::vector<GDALRasterBand **> apapoOverviewBands(nBands);
5458 27 : for (int i = 0; i < nBands; ++i)
5459 : {
5460 30 : apapoOverviewBands[i] = static_cast<GDALRasterBand **>(
5461 15 : CPLMalloc(sizeof(GDALRasterBand *)));
5462 15 : apapoOverviewBands[i][0] = poTmpDS->GetRasterBand(i + 1);
5463 : }
5464 : // Pixel count of this level, i.e. its share of the overall progress range.
5465 : const double dfExtraPixels =
5466 24 : static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
5467 12 : papapoOverviewBands[0][iOverview]->GetXSize() *
5468 24 : static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
5469 12 : papapoOverviewBands[0][iOverview]->GetYSize();
5470 :
5471 24 : void *pScaledProgressData = GDALCreateScaledProgress(
5472 : dfCurPixelCount / dfTotalPixelCount,
5473 12 : (dfCurPixelCount + dfExtraPixels) / dfTotalPixelCount,
5474 : pfnProgress, pProgressData);
5475 :
5476 : // Generate overviews in temporary dataset (recursive call with the reduced chunk size)
5477 12 : eErr = GDALRegenerateOverviewsMultiBand(
5478 12 : nBands, papoSrcBands, 1, apapoOverviewBands.data(),
5479 : pszResampling, GDALScaledProgress, pScaledProgressData,
5480 12 : aosOptions.List());
5481 :
5482 12 : GDALDestroyScaledProgress(pScaledProgressData);
5483 :
5484 12 : dfCurPixelCount += dfExtraPixels;
5485 : // Release the one-element arrays allocated above.
5486 27 : for (int i = 0; i < nBands; ++i)
5487 : {
5488 15 : CPLFree(apapoOverviewBands[i]);
5489 : }
5490 :
5491 : // Copy temporary dataset to destination overview bands
5492 :
5493 12 : if (eErr == CE_None)
5494 : {
5495 : // Check if all papapoOverviewBands[][iOverview] bands point
5496 : // to the same dataset. If so, we can use
5497 : // GDALDatasetCopyWholeRaster()
5498 : GDALDataset *poDstOvrBandDS =
5499 12 : papapoOverviewBands[0][iOverview]->GetDataset();
5500 12 : if (poDstOvrBandDS)
5501 : {
5502 15 : if (poDstOvrBandDS->GetRasterCount() != nBands ||
5503 3 : poDstOvrBandDS->GetRasterBand(1) !=
5504 3 : papapoOverviewBands[0][iOverview])
5505 : {
5506 9 : poDstOvrBandDS = nullptr;
5507 : }
5508 : else
5509 : {
5510 6 : for (int i = 1; poDstOvrBandDS && i < nBands; ++i)
5511 : {
5512 : GDALDataset *poThisDstOvrBandDS =
5513 3 : papapoOverviewBands[i][iOverview]
5514 3 : ->GetDataset();
5515 3 : if (poThisDstOvrBandDS == nullptr ||
5516 6 : poThisDstOvrBandDS != poDstOvrBandDS ||
5517 3 : poThisDstOvrBandDS->GetRasterBand(i + 1) !=
5518 3 : papapoOverviewBands[i][iOverview])
5519 : {
5520 0 : poDstOvrBandDS = nullptr;
5521 : }
5522 : }
5523 : }
5524 : }
5525 12 : if (poDstOvrBandDS)
5526 : {
5527 3 : eErr = GDALDatasetCopyWholeRaster(
5528 : GDALDataset::ToHandle(poTmpDS.get()),
5529 : GDALDataset::ToHandle(poDstOvrBandDS), nullptr,
5530 : nullptr, nullptr);
5531 : }
5532 : else
5533 : {
5534 18 : for (int i = 0; eErr == CE_None && i < nBands; ++i)
5535 : {
5536 9 : eErr = GDALRasterBandCopyWholeRaster(
5537 : GDALRasterBand::ToHandle(
5538 : poTmpDS->GetRasterBand(i + 1)),
5539 : GDALRasterBand::ToHandle(
5540 9 : papapoOverviewBands[i][iOverview]),
5541 : nullptr, nullptr, nullptr);
5542 : }
5543 : }
5544 : }
5545 :
5546 12 : if (eErr != CE_None)
5547 0 : break;
5548 : // This level is complete: skip the chunked processing below.
5549 12 : continue;
5550 : }
5551 : }
5552 :
5553 : // Structure describing a resampling job
5554 : struct OvrJob
5555 : {
5556 : // Buffers to free when job is finished
5557 : std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
5558 : std::unique_ptr<PointerHolder> oSrcBufferHolder{};
5559 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
5560 :
5561 : GDALRasterBand *poDstBand = nullptr;
5562 :
5563 : // Input parameters of pfnResampleFn
5564 : GDALResampleFunction pfnResampleFn = nullptr;
5565 : GDALOverviewResampleArgs args{};
5566 : const void *pChunk = nullptr;
5567 :
5568 : // Output values of resampling function
5569 : CPLErr eErr = CE_Failure;
5570 : void *pDstBuffer = nullptr;
5571 : GDALDataType eDstBufferDataType = GDT_Unknown;
5572 :
5573 : // Synchronization
5574 : bool bFinished = false;
5575 : std::mutex mutex{};
5576 : std::condition_variable cv{};
5577 : };
5578 :
5579 : // Thread function to resample
5580 16228 : const auto JobResampleFunc = [](void *pData)
5581 : {
5582 16228 : OvrJob *poJob = static_cast<OvrJob *>(pData);
5583 :
5584 16228 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
5585 : &(poJob->pDstBuffer),
5586 : &(poJob->eDstBufferDataType));
5587 :
5588 16224 : poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
5589 : // Flag the job as finished and wake up the thread waiting on it, if any.
5590 : {
5591 32460 : std::lock_guard<std::mutex> guard(poJob->mutex);
5592 16230 : poJob->bFinished = true;
5593 16230 : poJob->cv.notify_one();
5594 : }
5595 16230 : };
5596 :
5597 : // Function to write resample data to target band
5598 16230 : const auto WriteJobData = [](const OvrJob *poJob)
5599 : {
5600 32460 : return poJob->poDstBand->RasterIO(
5601 16230 : GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
5602 16230 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
5603 16230 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
5604 16230 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
5605 16230 : poJob->args.nDstYOff2 - poJob->args.nDstYOff,
5606 16230 : poJob->eDstBufferDataType, 0, 0, nullptr);
5607 : };
5608 :
5609 : // Wait for completion of oldest job and serialize it
5610 : const auto WaitAndFinalizeOldestJob =
5611 15 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
5612 : {
5613 15 : auto poOldestJob = jobList.front().get();
5614 : {
5615 30 : std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
5616 : // coverity[missing_lock:FALSE]
5617 18 : while (!poOldestJob->bFinished)
5618 : {
5619 3 : poOldestJob->cv.wait(oGuard);
5620 : }
5621 : }
5622 15 : CPLErr l_eErr = poOldestJob->eErr;
5623 15 : if (l_eErr == CE_None)
5624 : {
5625 15 : l_eErr = WriteJobData(poOldestJob);
5626 : }
5627 :
5628 15 : jobList.pop_front();
5629 15 : return l_eErr;
5630 : };
5631 :
5632 : // Queue of jobs
5633 1146 : std::list<std::unique_ptr<OvrJob>> jobList;
5634 :
5635 1146 : std::vector<void *> apaChunk(nBands);
5636 1146 : std::vector<GByte *> apabyChunkNoDataMask(nBands);
5637 :
5638 : // Iterate on destination overview, block by block.
5639 573 : for (int nDstYOff = nDstYOffStart;
5640 2209 : nDstYOff < nDstYOffEnd && eErr == CE_None;
5641 1636 : nDstYOff += nDstChunkYSize)
5642 : {
5643 : int nDstYCount;
5644 1636 : if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
5645 1248 : nDstYCount = nDstChunkYSize;
5646 : else
5647 388 : nDstYCount = nDstYOffEnd - nDstYOff;
5648 : // Source row range [nChunkYOff, nChunkYOff2) covered by these destination rows.
5649 1636 : int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
5650 1636 : int nChunkYOff2 = static_cast<int>(
5651 1636 : ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
5652 1636 : if (nChunkYOff2 > nSrcHeight ||
5653 1636 : nDstYOff + nDstYCount == nDstTotalHeight)
5654 570 : nChunkYOff2 = nSrcHeight;
5655 1636 : int nYCount = nChunkYOff2 - nChunkYOff;
5656 1636 : CPLAssert(nYCount <= nFullResYChunk);
5657 : // Extend it by the kernel margin on both sides, clipped to the source extent.
5658 1636 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
5659 1636 : int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
5660 1636 : if (nChunkYOffQueried < 0)
5661 : {
5662 126 : nChunkYSizeQueried += nChunkYOffQueried;
5663 126 : nChunkYOffQueried = 0;
5664 : }
5665 1636 : if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
5666 125 : nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
5667 1636 : CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
5668 : // Report progress once per row of chunks; a false return cancels the operation.
5669 1636 : if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount, nullptr,
5670 : pProgressData))
5671 : {
5672 1 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5673 1 : eErr = CE_Failure;
5674 : }
5675 :
5676 : // Iterate on destination columns, chunk by chunk.
5677 1636 : for (int nDstXOff = nDstXOffStart;
5678 10051 : nDstXOff < nDstXOffEnd && eErr == CE_None;
5679 8415 : nDstXOff += nDstChunkXSize)
5680 : {
5681 8415 : int nDstXCount = 0;
5682 8415 : if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
5683 8218 : nDstXCount = nDstChunkXSize;
5684 : else
5685 197 : nDstXCount = nDstXOffEnd - nDstXOff;
5686 :
5687 8415 : dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
5688 :
5689 8415 : int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
5690 8415 : int nChunkXOff2 = static_cast<int>(
5691 8415 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
5692 8415 : if (nChunkXOff2 > nSrcWidth ||
5693 8415 : nDstXOff + nDstXCount == nDstTotalWidth)
5694 1634 : nChunkXOff2 = nSrcWidth;
5695 8415 : const int nXCount = nChunkXOff2 - nChunkXOff;
5696 8415 : CPLAssert(nXCount <= nFullResXChunk);
5697 : // Same kernel margin and clipping along X.
5698 8415 : int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
5699 8415 : int nChunkXSizeQueried =
5700 8415 : nXCount + 2 * nKernelRadius * nOvrFactor;
5701 8415 : if (nChunkXOffQueried < 0)
5702 : {
5703 186 : nChunkXSizeQueried += nChunkXOffQueried;
5704 186 : nChunkXOffQueried = 0;
5705 : }
5706 8415 : if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
5707 189 : nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
5708 8415 : CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
5709 : #if DEBUG_VERBOSE
5710 : CPLDebug("GDAL",
5711 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
5712 : nChunkXOffQueried, nChunkYOffQueried,
5713 : nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
5714 : nDstYOff, nDstXCount, nDstYCount);
5715 : #endif
5716 :
5717 : // Avoid accumulating too many tasks and exhaust RAM
5718 :
5719 : // Try to complete already finished jobs
5720 16512 : while (eErr == CE_None && !jobList.empty())
5721 : {
5722 8192 : auto poOldestJob = jobList.front().get();
5723 : {
5724 8192 : std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
5725 8192 : if (!poOldestJob->bFinished)
5726 : {
5727 95 : break;
5728 : }
5729 : }
5730 8097 : eErr = poOldestJob->eErr;
5731 8097 : if (eErr == CE_None)
5732 : {
5733 8097 : eErr = WriteJobData(poOldestJob);
5734 : }
5735 :
5736 8097 : jobList.pop_front();
5737 : }
5738 :
5739 : // And in case we have saturated the number of threads,
5740 : // wait for completion of tasks to go below the threshold.
5741 16830 : while (eErr == CE_None &&
5742 8415 : jobList.size() >= static_cast<size_t>(nThreads))
5743 : {
5744 0 : eErr = WaitAndFinalizeOldestJob(jobList);
5745 : }
5746 :
5747 : // (Re)allocate buffers if needed (entries are reset to null once handed over to a queued job)
5748 24646 : for (int iBand = 0; iBand < nBands; ++iBand)
5749 : {
5750 16231 : if (apaChunk[iBand] == nullptr)
5751 : {
5752 9233 : apaChunk[iBand] = VSI_MALLOC3_VERBOSE(
5753 : nFullResXChunkQueried, nFullResYChunkQueried,
5754 : nWrkDataTypeSize);
5755 9233 : if (apaChunk[iBand] == nullptr)
5756 : {
5757 0 : eErr = CE_Failure;
5758 : }
5759 : }
5760 24644 : if (bUseNoDataMask &&
5761 8413 : apabyChunkNoDataMask[iBand] == nullptr)
5762 : {
5763 16708 : apabyChunkNoDataMask[iBand] =
5764 8354 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
5765 : nFullResXChunkQueried, nFullResYChunkQueried));
5766 8354 : if (apabyChunkNoDataMask[iBand] == nullptr)
5767 : {
5768 0 : eErr = CE_Failure;
5769 : }
5770 : }
5771 : }
5772 :
5773 : // Read the source buffers for all the bands.
5774 24646 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
5775 : {
5776 16231 : GDALRasterBand *poSrcBand = nullptr;
5777 16231 : if (iSrcOverview == -1)
5778 15337 : poSrcBand = papoSrcBands[iBand];
5779 : else
5780 894 : poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
5781 16231 : eErr = poSrcBand->RasterIO(
5782 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
5783 16231 : nChunkXSizeQueried, nChunkYSizeQueried, apaChunk[iBand],
5784 : nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType, 0,
5785 : 0, nullptr);
5786 :
5787 16231 : if (bUseNoDataMask && eErr == CE_None)
5788 : {
5789 8413 : auto poMaskBand = poSrcBand->IsMaskBand()
5790 8413 : ? poSrcBand
5791 6312 : : poSrcBand->GetMaskBand();
5792 8413 : eErr = poMaskBand->RasterIO(
5793 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
5794 : nChunkXSizeQueried, nChunkYSizeQueried,
5795 8413 : apabyChunkNoDataMask[iBand], nChunkXSizeQueried,
5796 : nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
5797 : }
5798 : }
5799 :
5800 : // Compute the resulting overview block.
5801 24645 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
5802 : {
5803 32460 : auto poJob = std::make_unique<OvrJob>();
5804 16230 : poJob->pfnResampleFn = pfnResampleFn;
5805 16230 : poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
5806 32460 : poJob->args.eOvrDataType =
5807 16230 : poJob->poDstBand->GetRasterDataType();
5808 16230 : poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
5809 16230 : poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
5810 16230 : const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
5811 16230 : "NBITS", "IMAGE_STRUCTURE");
5812 16230 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
5813 16230 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
5814 16230 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
5815 16230 : poJob->args.eWrkDataType = eWrkDataType;
5816 16230 : poJob->pChunk = apaChunk[iBand];
5817 16230 : poJob->args.pabyChunkNodataMask =
5818 16230 : apabyChunkNoDataMask[iBand];
5819 16230 : poJob->args.nChunkXOff = nChunkXOffQueried;
5820 16230 : poJob->args.nChunkXSize = nChunkXSizeQueried;
5821 16230 : poJob->args.nChunkYOff = nChunkYOffQueried;
5822 16230 : poJob->args.nChunkYSize = nChunkYSizeQueried;
5823 16230 : poJob->args.nDstXOff = nDstXOff;
5824 16230 : poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
5825 16230 : poJob->args.nDstYOff = nDstYOff;
5826 16230 : poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
5827 16230 : poJob->args.pszResampling = pszResampling;
5828 16230 : poJob->args.bHasNoData = pabHasNoData[iBand];
5829 16230 : poJob->args.dfNoDataValue = padfNoDataValue[iBand];
5830 16230 : poJob->args.eSrcDataType = eDataType;
5831 16230 : poJob->args.bPropagateNoData = bPropagateNoData;
5832 : // With a job queue: hand the source buffers over to the job and submit it. Otherwise run it inline and write the result.
5833 16230 : if (poJobQueue)
5834 : {
5835 16224 : poJob->oSrcMaskBufferHolder.reset(
5836 8112 : new PointerHolder(apabyChunkNoDataMask[iBand]));
5837 8112 : apabyChunkNoDataMask[iBand] = nullptr;
5838 :
5839 16224 : poJob->oSrcBufferHolder.reset(
5840 8112 : new PointerHolder(apaChunk[iBand]));
5841 8112 : apaChunk[iBand] = nullptr;
5842 :
5843 8112 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
5844 8112 : jobList.emplace_back(std::move(poJob));
5845 : }
5846 : else
5847 : {
5848 8118 : JobResampleFunc(poJob.get());
5849 8118 : eErr = poJob->eErr;
5850 8118 : if (eErr == CE_None)
5851 : {
5852 8118 : eErr = WriteJobData(poJob.get());
5853 : }
5854 : }
5855 : }
5856 : }
5857 : }
5858 :
5859 : // Wait for all pending jobs to complete
5860 588 : while (!jobList.empty())
5861 : {
5862 15 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5863 15 : if (l_eErr != CE_None && eErr == CE_None)
5864 0 : eErr = l_eErr;
5865 : }
5866 :
5867 : // Flush the data to overviews and free the per-band buffers. NOTE(review): the FlushCache() result is not checked here.
5868 1708 : for (int iBand = 0; iBand < nBands; ++iBand)
5869 : {
5870 1135 : CPLFree(apaChunk[iBand]);
5871 1135 : papapoOverviewBands[iBand][iOverview]->FlushCache(false);
5872 :
5873 1135 : CPLFree(apabyChunkNoDataMask[iBand]);
5874 : }
5875 : }
5876 :
5877 352 : CPLFree(pabHasNoData);
5878 352 : CPLFree(padfNoDataValue);
5879 : // Final progress notification, on success only.
5880 352 : if (eErr == CE_None)
5881 350 : pfnProgress(1.0, nullptr, pProgressData);
5882 :
5883 352 : return eErr;
5884 : }
5885 :
5886 : /************************************************************************/
5887 : /* GDALRegenerateOverviewsMultiBand() */
5888 : /************************************************************************/
5889 :
5890 : /**
5891 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5892 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5893 : *
5894 : * This function will generate one or more overview images from a base
5895 : * image using the requested downsampling algorithm. Its primary use
5896 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
5897 : * can also be used to generate downsampled images in one file from another
5898 : * outside the overview architecture.
5899 : *
5900 : * The output bands need to exist in advance and share the same characteristics
5901 : * (type, dimensions)
5902 : *
5903 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5904 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
5905 : *
5906 : * It does not support color tables or complex data types.
5907 : *
5908 : * The pseudo-algorithm used by the function is :
5909 : * for each overview
5910 : * iterate on lines of the source by a step of deltay
5911 : * iterate on columns of the source by a step of deltax
5912 : * read the source data of size deltax * deltay for all the bands
5913 : * generate the corresponding overview block for all the bands
5914 : *
5915 : * This function will honour properly NODATA_VALUES tuples (special dataset
5916 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5917 : * considered as the nodata value and not each value of the triplet
5918 : * independently per band.
5919 : *
5920 : * The GDAL_NUM_THREADS configuration option can be set
5921 : * to "ALL_CPUS" or a integer value to specify the number of threads to use for
5922 : * overview computation.
5923 : *
5924 : * @param apoSrcBands the list of source bands to downsample
5925 : * @param aapoOverviewBands bidimension array of bands. First dimension is
5926 : * indexed by bands. Second dimension is indexed by
5927 : * overview levels. All aapoOverviewBands[i] arrays
5928 : * must have the same size (i.e. same number of
5929 : * overviews)
5930 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5931 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
5932 : * @param pfnProgress progress report function.
5933 : * @param pProgressData progress function callback data.
5934 : * @param papszOptions NULL terminated list of options as
5935 : * key=value pairs, or NULL
5936 : * The XOFF, YOFF, XSIZE and YSIZE
5937 : * options can be specified to express that overviews should
5938 : * be regenerated only in the specified subset of the source
5939 : * dataset.
5940 : * @return CE_None on success or CE_Failure on failure.
5941 : * @since 3.10
5942 : */
5943 :
5944 5 : CPLErr GDALRegenerateOverviewsMultiBand(
5945 : const std::vector<GDALRasterBand *> &apoSrcBands,
5946 : const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
5947 : const char *pszResampling, GDALProgressFunc pfnProgress,
5948 : void *pProgressData, CSLConstList papszOptions)
5949 : {
5950 5 : CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
5951 15 : for (size_t i = 1; i < aapoOverviewBands.size(); ++i)
5952 : {
5953 10 : CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());
5954 : }
5955 :
5956 5 : if (aapoOverviewBands.empty())
5957 0 : return CE_None;
5958 :
5959 5 : std::vector<GDALRasterBand **> apapoOverviewBands;
5960 20 : for (auto &apoOverviewBands : aapoOverviewBands)
5961 : {
5962 : auto papoOverviewBands = static_cast<GDALRasterBand **>(
5963 15 : CPLMalloc(apoOverviewBands.size() * sizeof(GDALRasterBand *)));
5964 30 : for (size_t i = 0; i < apoOverviewBands.size(); ++i)
5965 : {
5966 15 : papoOverviewBands[i] = apoOverviewBands[i];
5967 : }
5968 15 : apapoOverviewBands.push_back(papoOverviewBands);
5969 : }
5970 10 : const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
5971 5 : static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
5972 5 : static_cast<int>(aapoOverviewBands[0].size()),
5973 5 : apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
5974 : papszOptions);
5975 20 : for (GDALRasterBand **papoOverviewBands : apapoOverviewBands)
5976 15 : CPLFree(papoOverviewBands);
5977 5 : return eErr;
5978 : }
5979 :
5980 : /************************************************************************/
5981 : /* GDALComputeBandStats() */
5982 : /************************************************************************/
5983 :
5984 : /** Undocumented
5985 : * @param hSrcBand undocumented.
5986 : * @param nSampleStep Step between scanlines used to compute statistics.
5987 : * When nSampleStep is equal to 1, all scanlines will
5988 : * be processed.
5989 : * @param pdfMean undocumented.
5990 : * @param pdfStdDev undocumented.
5991 : * @param pfnProgress undocumented.
5992 : * @param pProgressData undocumented.
5993 : * @return undocumented
5994 : */
5995 16 : CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
5996 : int nSampleStep, double *pdfMean,
5997 : double *pdfStdDev,
5998 : GDALProgressFunc pfnProgress,
5999 : void *pProgressData)
6000 :
6001 : {
6002 16 : VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
6003 :
6004 16 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
6005 :
6006 16 : if (pfnProgress == nullptr)
6007 16 : pfnProgress = GDALDummyProgress;
6008 :
6009 16 : const int nWidth = poSrcBand->GetXSize();
6010 16 : const int nHeight = poSrcBand->GetYSize();
6011 :
6012 16 : if (nSampleStep >= nHeight || nSampleStep < 1)
6013 3 : nSampleStep = 1;
6014 :
6015 16 : GDALDataType eWrkType = GDT_Unknown;
6016 16 : float *pafData = nullptr;
6017 16 : GDALDataType eType = poSrcBand->GetRasterDataType();
6018 16 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6019 16 : if (bComplex)
6020 : {
6021 : pafData = static_cast<float *>(
6022 0 : VSI_MALLOC_VERBOSE(nWidth * 2 * sizeof(float)));
6023 0 : eWrkType = GDT_CFloat32;
6024 : }
6025 : else
6026 : {
6027 : pafData =
6028 16 : static_cast<float *>(VSI_MALLOC_VERBOSE(nWidth * sizeof(float)));
6029 16 : eWrkType = GDT_Float32;
6030 : }
6031 :
6032 16 : if (nWidth == 0 || pafData == nullptr)
6033 : {
6034 0 : VSIFree(pafData);
6035 0 : return CE_Failure;
6036 : }
6037 :
6038 : /* -------------------------------------------------------------------- */
6039 : /* Loop over all sample lines. */
6040 : /* -------------------------------------------------------------------- */
6041 16 : double dfSum = 0.0;
6042 16 : double dfSum2 = 0.0;
6043 16 : int iLine = 0;
6044 16 : GIntBig nSamples = 0;
6045 :
6046 2143 : do
6047 : {
6048 2159 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6049 : pProgressData))
6050 : {
6051 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6052 0 : CPLFree(pafData);
6053 0 : return CE_Failure;
6054 : }
6055 :
6056 : const CPLErr eErr =
6057 2159 : poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
6058 : 1, eWrkType, 0, 0, nullptr);
6059 2159 : if (eErr != CE_None)
6060 : {
6061 1 : CPLFree(pafData);
6062 1 : return eErr;
6063 : }
6064 :
6065 725204 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6066 : {
6067 723046 : float fValue = 0.0f;
6068 :
6069 723046 : if (bComplex)
6070 : {
6071 : // Compute the magnitude of the complex value.
6072 : fValue =
6073 0 : std::hypot(pafData[iPixel * 2], pafData[iPixel * 2 + 1]);
6074 : }
6075 : else
6076 : {
6077 723046 : fValue = pafData[iPixel];
6078 : }
6079 :
6080 723046 : dfSum += fValue;
6081 723046 : dfSum2 += static_cast<double>(fValue) * fValue;
6082 : }
6083 :
6084 2158 : nSamples += nWidth;
6085 2158 : iLine += nSampleStep;
6086 2158 : } while (iLine < nHeight);
6087 :
6088 15 : if (!pfnProgress(1.0, nullptr, pProgressData))
6089 : {
6090 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6091 0 : CPLFree(pafData);
6092 0 : return CE_Failure;
6093 : }
6094 :
6095 : /* -------------------------------------------------------------------- */
6096 : /* Produce the result values. */
6097 : /* -------------------------------------------------------------------- */
6098 15 : if (pdfMean != nullptr)
6099 15 : *pdfMean = dfSum / nSamples;
6100 :
6101 15 : if (pdfStdDev != nullptr)
6102 : {
6103 15 : const double dfMean = dfSum / nSamples;
6104 :
6105 15 : *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
6106 : }
6107 :
6108 15 : CPLFree(pafData);
6109 :
6110 15 : return CE_None;
6111 : }
6112 :
6113 : /************************************************************************/
6114 : /* GDALOverviewMagnitudeCorrection() */
6115 : /* */
6116 : /* Correct the mean and standard deviation of the overviews of */
6117 : /* the given band to match the base layer approximately. */
6118 : /************************************************************************/
6119 :
6120 : /** Undocumented
6121 : * @param hBaseBand undocumented.
6122 : * @param nOverviewCount undocumented.
6123 : * @param pahOverviews undocumented.
6124 : * @param pfnProgress undocumented.
6125 : * @param pProgressData undocumented.
6126 : * @return undocumented
6127 : */
6128 0 : CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
6129 : int nOverviewCount,
6130 : GDALRasterBandH *pahOverviews,
6131 : GDALProgressFunc pfnProgress,
6132 : void *pProgressData)
6133 :
6134 : {
6135 0 : VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
6136 :
6137 : /* -------------------------------------------------------------------- */
6138 : /* Compute mean/stddev for source raster. */
6139 : /* -------------------------------------------------------------------- */
6140 0 : double dfOrigMean = 0.0;
6141 0 : double dfOrigStdDev = 0.0;
6142 : {
6143 : const CPLErr eErr =
6144 0 : GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
6145 : pfnProgress, pProgressData);
6146 :
6147 0 : if (eErr != CE_None)
6148 0 : return eErr;
6149 : }
6150 :
6151 : /* -------------------------------------------------------------------- */
6152 : /* Loop on overview bands. */
6153 : /* -------------------------------------------------------------------- */
6154 0 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
6155 : {
6156 : GDALRasterBand *poOverview =
6157 0 : GDALRasterBand::FromHandle(pahOverviews[iOverview]);
6158 : double dfOverviewMean, dfOverviewStdDev;
6159 :
6160 : const CPLErr eErr =
6161 0 : GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
6162 : &dfOverviewStdDev, pfnProgress, pProgressData);
6163 :
6164 0 : if (eErr != CE_None)
6165 0 : return eErr;
6166 :
6167 0 : double dfGain = 1.0;
6168 0 : if (dfOrigStdDev >= 0.0001)
6169 0 : dfGain = dfOrigStdDev / dfOverviewStdDev;
6170 :
6171 : /* --------------------------------------------------------------------
6172 : */
6173 : /* Apply gain and offset. */
6174 : /* --------------------------------------------------------------------
6175 : */
6176 0 : const int nWidth = poOverview->GetXSize();
6177 0 : const int nHeight = poOverview->GetYSize();
6178 :
6179 0 : GDALDataType eWrkType = GDT_Unknown;
6180 0 : float *pafData = nullptr;
6181 0 : const GDALDataType eType = poOverview->GetRasterDataType();
6182 0 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6183 0 : if (bComplex)
6184 : {
6185 : pafData = static_cast<float *>(
6186 0 : VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6187 0 : eWrkType = GDT_CFloat32;
6188 : }
6189 : else
6190 : {
6191 : pafData = static_cast<float *>(
6192 0 : VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6193 0 : eWrkType = GDT_Float32;
6194 : }
6195 :
6196 0 : if (pafData == nullptr)
6197 : {
6198 0 : return CE_Failure;
6199 : }
6200 :
6201 0 : for (int iLine = 0; iLine < nHeight; ++iLine)
6202 : {
6203 0 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6204 : pProgressData))
6205 : {
6206 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6207 0 : CPLFree(pafData);
6208 0 : return CE_Failure;
6209 : }
6210 :
6211 0 : if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
6212 : nWidth, 1, eWrkType, 0, 0,
6213 0 : nullptr) != CE_None)
6214 : {
6215 0 : CPLFree(pafData);
6216 0 : return CE_Failure;
6217 : }
6218 :
6219 0 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6220 : {
6221 0 : if (bComplex)
6222 : {
6223 0 : pafData[iPixel * 2] *= static_cast<float>(dfGain);
6224 0 : pafData[iPixel * 2 + 1] *= static_cast<float>(dfGain);
6225 : }
6226 : else
6227 : {
6228 0 : pafData[iPixel] = static_cast<float>(
6229 0 : (pafData[iPixel] - dfOverviewMean) * dfGain +
6230 : dfOrigMean);
6231 : }
6232 : }
6233 :
6234 0 : if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
6235 : nWidth, 1, eWrkType, 0, 0,
6236 0 : nullptr) != CE_None)
6237 : {
6238 0 : CPLFree(pafData);
6239 0 : return CE_Failure;
6240 : }
6241 : }
6242 :
6243 0 : if (!pfnProgress(1.0, nullptr, pProgressData))
6244 : {
6245 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6246 0 : CPLFree(pafData);
6247 0 : return CE_Failure;
6248 : }
6249 :
6250 0 : CPLFree(pafData);
6251 : }
6252 :
6253 0 : return CE_None;
6254 : }
|