Line data Source code
1 :
2 : /******************************************************************************
3 : *
4 : * Project: GDAL Core
5 : * Purpose: Helper code to implement overview support in different drivers.
6 : * Author: Frank Warmerdam, warmerdam@pobox.com
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 2000, Frank Warmerdam
10 : * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * SPDX-License-Identifier: MIT
13 : ****************************************************************************/
14 :
15 : #include "cpl_port.h"
16 : #include "gdal_priv.h"
17 :
18 : #include <cmath>
19 : #include <cstddef>
20 : #include <cstdlib>
21 :
22 : #include <algorithm>
23 : #include <complex>
24 : #include <condition_variable>
25 : #include <limits>
26 : #include <list>
27 : #include <memory>
28 : #include <mutex>
29 : #include <vector>
30 :
31 : #include "cpl_conv.h"
32 : #include "cpl_error.h"
33 : #include "cpl_float.h"
34 : #include "cpl_progress.h"
35 : #include "cpl_vsi.h"
36 : #include "cpl_worker_thread_pool.h"
37 : #include "gdal.h"
38 : #include "gdal_thread_pool.h"
39 : #include "gdalwarper.h"
40 : #include "gdal_vrt.h"
41 : #include "vrtdataset.h"
42 :
43 : #ifdef USE_NEON_OPTIMIZATIONS
44 : #include "include_sse2neon.h"
45 :
46 : #if (!defined(__aarch64__) && !defined(_M_ARM64))
47 : #define ARM_V7
48 : #endif
49 :
50 : #define USE_SSE2
51 :
52 : #include "gdalsse_priv.h"
53 :
54 : // Restrict to 64bit processors because they are guaranteed to have SSE2,
55 : // or if __AVX2__ is defined.
56 : #elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
57 : #define USE_SSE2
58 :
59 : #include "gdalsse_priv.h"
60 :
61 : #ifdef __SSE3__
62 : #include <pmmintrin.h>
63 : #endif
64 : #ifdef __SSSE3__
65 : #include <tmmintrin.h>
66 : #endif
67 : #ifdef __SSE4_1__
68 : #include <smmintrin.h>
69 : #endif
70 : #ifdef __AVX2__
71 : #include <immintrin.h>
72 : #endif
73 :
74 : #endif
75 :
76 : // To be included after above USE_SSE2 and include gdalsse_priv.h
77 : // to avoid build issue on Windows x86
78 : #include "gdal_priv_templates.hpp"
79 :
80 : /************************************************************************/
81 : /* GDALResampleChunk_Near() */
82 : /************************************************************************/
83 :
// Nearest-neighbour resampling of one source chunk into a newly allocated
// destination buffer whose elements are of type T.
//
// args        : resampling parameters; this function reads the X/Y
//               destination-to-source ratios, the working data type, the chunk
//               X/Y offsets and X size, and the destination window
//               [nDstXOff, nDstXOff2) x [nDstYOff, nDstYOff2).
// pChunk      : source samples, addressed as rows of nChunkXSize elements
//               whose first row/column correspond to nChunkYOff/nChunkXOff.
// ppDstBuffer : output; receives a buffer allocated here with
//               VSI_MALLOC3_VERBOSE. It is not freed by this function.
//
// Returns CE_None on success, CE_Failure if either allocation fails.
84 : template <class T>
85 1266 : static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
86 : const T *pChunk, T **ppDstBuffer)
87 :
88 : {
89 1266 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
90 1266 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
91 1266 : const GDALDataType eWrkDataType = args.eWrkDataType;
92 1266 : const int nChunkXOff = args.nChunkXOff;
93 1266 : const int nChunkXSize = args.nChunkXSize;
94 1266 : const int nChunkYOff = args.nChunkYOff;
95 1266 : const int nDstXOff = args.nDstXOff;
96 1266 : const int nDstXOff2 = args.nDstXOff2;
97 1266 : const int nDstYOff = args.nDstYOff;
98 1266 : const int nDstYOff2 = args.nDstYOff2;
99 1266 : const int nDstXWidth = nDstXOff2 - nDstXOff;
100 :
101 : /* -------------------------------------------------------------------- */
102 : /* Allocate buffers. */
103 : /* -------------------------------------------------------------------- */
// The element size passed to the allocator comes from eWrkDataType, not from
// sizeof(T).
// NOTE(review): presumably the caller always picks T so that both agree
// (see the size assertions in GDALResampleChunk_Near) - confirm.
104 1266 : *ppDstBuffer = static_cast<T *>(
105 1266 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
106 : GDALGetDataTypeSizeBytes(eWrkDataType)));
107 1266 : if (*ppDstBuffer == nullptr)
108 : {
109 0 : return CE_Failure;
110 : }
111 1266 : T *const pDstBuffer = *ppDstBuffer;
112 :
// One source column index per destination column, computed once and reused
// for every destination line.
113 : int *panSrcXOff =
114 1266 : static_cast<int *>(VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(int)));
115 :
// NOTE(review): on this failure path *ppDstBuffer has already been allocated
// and is not released here; presumably the caller frees it even when
// CE_Failure is returned - confirm against the caller.
116 1266 : if (panSrcXOff == nullptr)
117 : {
118 0 : return CE_Failure;
119 : }
120 :
121 : /* ==================================================================== */
122 : /* Precompute inner loop constants. */
123 : /* ==================================================================== */
// Source column = truncation of (0.5 + dst column * ratio), clamped from
// below to the first column of the chunk. No upper clamp is applied here.
// NOTE(review): presumably the caller guarantees that the chunk extends far
// enough to the right and bottom - confirm.
124 840896 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
125 : {
126 839630 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
127 839630 : if (nSrcXOff < nChunkXOff)
128 0 : nSrcXOff = nChunkXOff;
129 :
130 839630 : panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
131 : }
132 :
133 : /* ==================================================================== */
134 : /* Loop over destination scanlines. */
135 : /* ==================================================================== */
136 142457 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
137 : {
// Same rounding and lower clamp as for columns, applied to the line index.
138 141191 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
139 141191 : if (nSrcYOff < nChunkYOff)
140 0 : nSrcYOff = nChunkYOff;
141 :
// The row pointer is biased by -nChunkXOff so that it can be indexed directly
// with the absolute source column values stored in panSrcXOff.
142 141191 : const T *const pSrcScanline =
143 : pChunk +
144 141191 : (static_cast<size_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
145 137798 : nChunkXOff;
146 :
147 : /* --------------------------------------------------------------------
148 : */
149 : /* Loop over destination pixels */
150 : /* --------------------------------------------------------------------
151 : */
152 141191 : T *pDstScanline =
153 141191 : pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
154 120247393 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
155 : {
156 120106000 : pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
157 : }
158 : }
159 :
160 1266 : CPLFree(panSrcXOff);
161 :
162 1266 : return CE_None;
163 : }
164 :
// Type-erased entry point for nearest-neighbour resampling.
//
// Sets *peDstBufferDataType to args.eWrkDataType, then forwards to the
// GDALResampleChunk_NearT instantiation chosen from args.eWrkDataType:
// an unsigned integer type of 1, 2, 4 or 8 bytes for the data types grouped
// below, or std::complex<double> for GDT_CFloat64.
//
// Returns the result of the forwarded call. For GDT_Unknown / GDT_TypeCount
// the switch breaks, CPLAssert(false) is evaluated and CE_Failure is returned.
165 1266 : static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
166 : const void *pChunk, void **ppDstBuffer,
167 : GDALDataType *peDstBufferDataType)
168 : {
169 1266 : *peDstBufferDataType = args.eWrkDataType;
170 1266 : switch (args.eWrkDataType)
171 : {
172 : // For nearest resampling, as no computation is done, only the
173 : // size of the data type matters.
174 1098 : case GDT_UInt8:
175 : case GDT_Int8:
176 : {
// Each group asserts the byte size it relies on before reinterpreting the
// untyped buffers.
177 1098 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 1);
178 1098 : return GDALResampleChunk_NearT(
179 : args, static_cast<const uint8_t *>(pChunk),
180 1098 : reinterpret_cast<uint8_t **>(ppDstBuffer));
181 : }
182 :
183 52 : case GDT_Int16:
184 : case GDT_UInt16:
185 : case GDT_Float16:
186 : {
187 52 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
188 52 : return GDALResampleChunk_NearT(
189 : args, static_cast<const uint16_t *>(pChunk),
190 52 : reinterpret_cast<uint16_t **>(ppDstBuffer));
191 : }
192 :
193 68 : case GDT_CInt16:
194 : case GDT_CFloat16:
195 : case GDT_Int32:
196 : case GDT_UInt32:
197 : case GDT_Float32:
198 : {
199 68 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
200 68 : return GDALResampleChunk_NearT(
201 : args, static_cast<const uint32_t *>(pChunk),
202 68 : reinterpret_cast<uint32_t **>(ppDstBuffer));
203 : }
204 :
205 44 : case GDT_CInt32:
206 : case GDT_CFloat32:
207 : case GDT_Int64:
208 : case GDT_UInt64:
209 : case GDT_Float64:
210 : {
211 44 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
212 44 : return GDALResampleChunk_NearT(
213 : args, static_cast<const uint64_t *>(pChunk),
214 44 : reinterpret_cast<uint64_t **>(ppDstBuffer));
215 : }
216 :
// This case is copied as std::complex<double> elements and, unlike the
// groups above, carries no size assertion.
217 4 : case GDT_CFloat64:
218 : {
219 4 : return GDALResampleChunk_NearT(
220 : args, static_cast<const std::complex<double> *>(pChunk),
221 4 : reinterpret_cast<std::complex<double> **>(ppDstBuffer));
222 : }
223 :
224 0 : case GDT_Unknown:
225 : case GDT_TypeCount:
226 0 : break;
227 : }
228 0 : CPLAssert(false);
229 : return CE_Failure;
230 : }
231 :
// File-local colour table helpers.
232 : namespace
233 : {
234 :
235 : // Find in the color table the entry whose RGB value is the closest
236 : // (using quadratic distance) to the test color, ignoring transparent entries.
// Returns the index of that entry. Only a strictly smaller distance replaces
// the current best, so the first of several equally close entries wins.
// If 'entries' is empty or every entry has c4 == 0, the initial value 0 is
// returned.
237 3837 : int BestColorEntry(const std::vector<GDALColorEntry> &entries,
238 : const GDALColorEntry &test)
239 : {
240 3837 : int nMinDist = std::numeric_limits<int>::max();
241 3837 : size_t bestEntry = 0;
242 986109 : for (size_t i = 0; i < entries.size(); ++i)
243 : {
244 982272 : const GDALColorEntry &entry = entries[i];
245 : // Ignore transparent entries
246 982272 : if (entry.c4 == 0)
247 3237 : continue;
248 :
// Sum of squared differences over the c1, c2 and c3 components.
249 979035 : int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
250 979035 : ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
251 979035 : ((test.c3 - entry.c3) * (test.c3 - entry.c3));
252 979035 : if (nDist < nMinDist)
253 : {
254 15847 : nMinDist = nDist;
255 15847 : bestEntry = i;
256 : }
257 : }
258 3837 : return static_cast<int>(bestEntry);
259 : }
260 :
// Copies the colour table into a vector holding GetColorEntryCount() entries,
// each filled through GetColorEntryAsRGB().
// transparentIdx (output) is set to the index of the first entry whose c4 is
// 0, or to -1 when there is none.
261 7 : std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
262 : int &transparentIdx)
263 : {
264 7 : std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
265 :
266 7 : transparentIdx = -1;
267 7 : int i = 0;
268 1799 : for (auto &entry : entries)
269 : {
270 1792 : table.GetColorEntryAsRGB(i, &entry);
271 1792 : if (transparentIdx < 0 && entry.c4 == 0)
272 1 : transparentIdx = i;
273 1792 : ++i;
274 : }
275 7 : return entries;
276 : }
277 :
278 : } // unnamed namespace
279 :
280 : /************************************************************************/
281 : /* SQUARE() */
282 : /************************************************************************/
283 :
// Returns val * val. The left operand is converted to Tsquare before the
// multiplication, so a wider Tsquare can be requested to hold the product.
// Tsquare defaults to T.
284 6427 : template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
285 : {
286 6427 : return static_cast<Tsquare>(val) * val;
287 : }
288 :
289 : /************************************************************************/
290 : /* ComputeIntegerRMS() */
291 : /************************************************************************/
292 : // Compute rms = sqrt(sumSquares / weight) in such a way that it is the
293 : // integer that minimizes abs(rms**2 - sumSquares / weight)
// T     : type of the returned root mean square.
// Twork : type in which the product 2 * rms * (rms + 1) + 1 is evaluated
//         before being converted to double for the comparison.
// Returns the truncated square root of sumSquares / weight, incremented by
// one when the test below holds.
294 : template <class T, class Twork>
295 42 : inline T ComputeIntegerRMS(double sumSquares, double weight)
296 : {
297 42 : const double sumDivWeight = sumSquares / weight;
298 42 : T rms = static_cast<T>(sqrt(sumDivWeight));
299 :
300 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
301 : // Naive version:
302 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
// Dividing the naive inequality by weight and expanding (rms+1)**2 gives
// 2*rms**2 + 2*rms + 1 < 2 * sumDivWeight, which is what is tested here.
303 42 : if (static_cast<double>(static_cast<Twork>(2) * rms * (rms + 1) + 1) <
304 42 : 2 * sumDivWeight)
305 6 : rms += 1;
306 42 : return rms;
307 : }
308 :
// Generic fallback of ComputeIntegerRMS_4values: it evaluates CPLAssert(false)
// and returns 0. The explicit specializations that follow provide the actual
// computations.
309 : template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
310 : {
311 : CPLAssert(false);
312 : return 0;
313 : }
314 :
// Specialization returning a GByte from an int sum of squares, with the
// weight fixed to 4 (see the comments in the body).
// Computes the truncated square root of (sumSquares + 1) / 4 (integer
// division) and increments it when rms * (rms + 1) is below that quotient.
315 28 : template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
316 : {
317 : // It has been verified that given the correction on rms below, using
318 : // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
319 : // is equivalent, so use the former as it is used twice.
320 28 : const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
321 28 : const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
322 28 : GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
323 :
324 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
325 : // Naive version:
326 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
327 : // Optimized version for integer case and weight == 4
328 28 : if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
329 5 : rms += 1;
330 28 : return rms;
331 : }
332 :
// Specialization returning a GUInt16 from a double sum of squares, with the
// weight fixed to 4 (sumSquares is multiplied by 0.25).
// Computes the truncated square root of sumSquares * 0.25 and increments it
// when rms * (rms + 1), evaluated as GUInt32, is below
// static_cast<GUInt32>(sumSquares * 0.25 + 0.25).
333 : template <>
334 24 : inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
335 : {
336 24 : const double sumDivWeight = sumSquares * 0.25;
337 24 : GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
338 :
339 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
340 : // Naive version:
341 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
342 : // Optimized version for integer case and weight == 4
343 24 : if (static_cast<GUInt32>(rms) * (rms + 1) <
344 24 : static_cast<GUInt32>(sumDivWeight + 0.25))
345 4 : rms += 1;
346 24 : return rms;
347 : }
348 :
349 : #ifdef USE_SSE2
350 :
351 : /************************************************************************/
352 : /* QuadraticMeanByteSSE2OrAVX2() */
353 : /************************************************************************/
354 :
// sse2_hadd_epi16(a, b): sums adjacent pairs of 16-bit lanes of a, then of b,
// into one vector of eight 16-bit results.
// With SSSE3 or the NEON mapping it is an alias of _mm_hadd_epi16; otherwise
// the function below emulates it with SSE2 instructions.
355 : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
356 : #define sse2_hadd_epi16 _mm_hadd_epi16
357 : #else
358 4104270 : inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
359 : {
360 : // Horizontal addition of adjacent pairs
// Within each 32-bit lane, the low 16 bits (kept by the mask) and the high
// 16 bits (brought down by the logical shift) are added as a 32-bit sum,
// i.e. both halves are treated as unsigned.
361 4104270 : const auto mask = _mm_set1_epi32(0xFFFF);
362 : const auto horizLo =
363 12312800 : _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
364 : const auto horizHi =
365 12312800 : _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
366 :
367 : // Recombine low and high parts
// _mm_packs_epi32 narrows with signed saturation.
// NOTE(review): the result therefore matches _mm_hadd_epi16 only while each
// pair sum fits in a signed 16-bit value - confirm the callers stay in that
// range.
368 4104270 : return _mm_packs_epi32(horizLo, horizHi);
369 : }
370 : #endif
371 :
// Width-neutral aliases for the SIMD intrinsics used by the functions that
// follow: with __AVX2__ they map to the 256-bit _mm256_* forms, otherwise to
// the 128-bit _mm_* forms (or to a GDAL/local substitute for packus_epi32 and
// hadd_epi16), so the same function body can be compiled for either width.
372 : #ifdef __AVX2__
373 :
374 : #define set1_epi16 _mm256_set1_epi16
375 : #define set1_epi32 _mm256_set1_epi32
376 : #define setzero _mm256_setzero_si256
377 : #define set1_ps _mm256_set1_ps
378 : #define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
379 : #define unpacklo_epi8 _mm256_unpacklo_epi8
380 : #define unpackhi_epi8 _mm256_unpackhi_epi8
381 : #define madd_epi16 _mm256_madd_epi16
382 : #define add_epi32 _mm256_add_epi32
383 : #define mul_ps _mm256_mul_ps
384 : #define cvtepi32_ps _mm256_cvtepi32_ps
385 : #define sqrt_ps _mm256_sqrt_ps
386 : #define cvttps_epi32 _mm256_cvttps_epi32
387 : #define packs_epi32 _mm256_packs_epi32
388 : #define packus_epi32 _mm256_packus_epi32
389 : #define srli_epi32 _mm256_srli_epi32
390 : #define mullo_epi16 _mm256_mullo_epi16
391 : #define srli_epi16 _mm256_srli_epi16
392 : #define cmpgt_epi16 _mm256_cmpgt_epi16
393 : #define add_epi16 _mm256_add_epi16
394 : #define sub_epi16 _mm256_sub_epi16
395 : #define packus_epi16 _mm256_packus_epi16
396 :
397 : /* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
398 : /* to get the lower 128-bit bits of what would be a true 256-bit vector register
399 : */
400 :
// Reorders the four 64-bit elements of x from (0, 1, 2, 3) to (0, 2, 1, 3),
// i.e. swaps the two middle quadwords.
401 : inline __m256i FIXUP_LANES(__m256i x)
402 : {
403 : return _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0));
404 : }
405 :
// AVX2 store_lo: unaligned store of the low 128 bits of the lane-fixed value.
// AVX2 storeu_int: unaligned store of the whole lane-fixed 256-bit value.
406 : #define store_lo(x, y) \
407 : _mm_storeu_si128(reinterpret_cast<__m128i *>(x), \
408 : _mm256_extracti128_si256(FIXUP_LANES(y), 0))
409 : #define storeu_int(x, y) \
410 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(x), FIXUP_LANES(y))
411 : #define hadd_epi16 _mm256_hadd_epi16
412 : #else
413 : #define set1_epi16 _mm_set1_epi16
414 : #define set1_epi32 _mm_set1_epi32
415 : #define setzero _mm_setzero_si128
416 : #define set1_ps _mm_set1_ps
417 : #define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
418 : #define unpacklo_epi8 _mm_unpacklo_epi8
419 : #define unpackhi_epi8 _mm_unpackhi_epi8
420 : #define madd_epi16 _mm_madd_epi16
421 : #define add_epi32 _mm_add_epi32
422 : #define mul_ps _mm_mul_ps
423 : #define cvtepi32_ps _mm_cvtepi32_ps
424 : #define sqrt_ps _mm_sqrt_ps
425 : #define cvttps_epi32 _mm_cvttps_epi32
426 : #define packs_epi32 _mm_packs_epi32
427 : #define packus_epi32 GDAL_mm_packus_epi32
428 : #define srli_epi32 _mm_srli_epi32
429 : #define mullo_epi16 _mm_mullo_epi16
430 : #define srli_epi16 _mm_srli_epi16
431 : #define cmpgt_epi16 _mm_cmpgt_epi16
432 : #define add_epi16 _mm_add_epi16
433 : #define sub_epi16 _mm_sub_epi16
434 : #define packus_epi16 _mm_packus_epi16
// 128-bit store_lo: stores only the low 64 bits of y.
// 128-bit storeu_int: unaligned store of all 128 bits of y.
435 : #define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
436 : #define storeu_int(x, y) _mm_storeu_si128(reinterpret_cast<__m128i *>(x), (y))
437 : #define hadd_epi16 sse2_hadd_epi16
438 : #endif
439 :
// Vectorized quadratic mean (RMS) of 2x2 source blocks for byte samples.
//
// nDstXWidth               : number of destination pixels wanted on this line.
// nChunkXSize              : distance, in elements, between the first and the
//                            second source line.
// pSrcScanlineShiftedInOut : in: first source element to read; out: advanced
//                            past the source elements consumed here.
// pDstScanline             : destination line, written from index 0.
//
// Returns the number of destination pixels produced. Only whole groups of
// DEST_ELTS pixels are handled, so the return value can be smaller than
// nDstXWidth.
// NOTE(review): presumably the caller finishes the remaining pixels with
// scalar code, and T is a 1-byte type (loads are done on raw bytes while the
// pointer is advanced in units of T) - confirm against the call site.
440 : template <class T>
441 : static int
442 : #if defined(__GNUC__)
443 : __attribute__((noinline))
444 : #endif
445 5389 : QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
446 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
447 : T *CPL_RESTRICT pDstScanline)
448 : {
449 : // Optimized implementation for RMS on Byte by
450 : // processing by group of 8 output pixels, so as to use
451 : // a single _mm_sqrt_ps() call for 4 output pixels
452 5389 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
453 :
454 5389 : int iDstPixel = 0;
455 5389 : const auto one16 = set1_epi16(1);
456 5389 : const auto one32 = set1_epi32(1);
457 5389 : const auto zero = setzero();
458 5389 : const auto minus32768 = set1_epi16(-32768);
459 :
// DEST_ELTS is half the vector size in bytes: one vector load of source bytes
// per line yields that many 2x2 blocks.
460 5389 : constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;
461 521504 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
462 : {
463 : // Load 2 * DEST_ELTS bytes from each line
464 516115 : auto firstLine = loadu_int(pSrcScanlineShifted);
465 1032230 : auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
466 : // Extend those Bytes as UInt16s
467 516115 : auto firstLineLo = unpacklo_epi8(firstLine, zero);
468 516115 : auto firstLineHi = unpackhi_epi8(firstLine, zero);
469 516115 : auto secondLineLo = unpacklo_epi8(secondLine, zero);
470 516115 : auto secondLineHi = unpackhi_epi8(secondLine, zero);
471 :
472 : // Multiplication of 16 bit values and horizontal
473 : // addition of 32 bit results
474 : // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
475 516115 : firstLineLo = madd_epi16(firstLineLo, firstLineLo);
476 516115 : firstLineHi = madd_epi16(firstLineHi, firstLineHi);
477 516115 : secondLineLo = madd_epi16(secondLineLo, secondLineLo);
478 516115 : secondLineHi = madd_epi16(secondLineHi, secondLineHi);
479 :
480 : // Vertical addition
481 516115 : const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
482 516115 : const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
483 :
// (sumSquares + 1) / 4 computed with a logical right shift by 2.
484 : const auto sumSquaresPlusOneDiv4Lo =
485 1032230 : srli_epi32(add_epi32(sumSquaresLo, one32), 2);
486 : const auto sumSquaresPlusOneDiv4Hi =
487 1032230 : srli_epi32(add_epi32(sumSquaresHi, one32), 2);
488 :
489 : // Take square root and truncate/floor to int32
490 : const auto rmsLo =
491 1548340 : cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
492 : const auto rmsHi =
493 1548340 : cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
494 :
495 : // Merge back low and high registers with each RMS value
496 : // as a 16 bit value.
497 516115 : auto rms = packs_epi32(rmsLo, rmsHi);
498 :
499 : // Round to upper value if it minimizes the
500 : // error |rms^2 - sumSquares/4|
501 : // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
502 : // rms += 1;
503 : // which is equivalent to:
504 : // if( rms * (rms + 1) < (sumSquares+1) / 4 )
505 : // rms += 1;
506 : // And both left and right parts fit on 16 (unsigned) bits
507 : const auto sumSquaresPlusOneDiv4 =
508 516115 : packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
509 : // cmpgt_epi16 operates on signed int16, but here
510 : // we have unsigned values, so shift them by -32768 before
511 2580580 : const auto mask = cmpgt_epi16(
512 : add_epi16(sumSquaresPlusOneDiv4, minus32768),
513 : add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
514 : // The value of the mask will be -1 when the correction needs to be
515 : // applied
// Subtracting a lane equal to -1 adds one to rms in that lane.
516 516115 : rms = sub_epi16(rms, mask);
517 :
518 : // Pack each 16 bit RMS value to 8 bits
519 516115 : rms = packus_epi16(rms, rms /* could be anything */);
520 516115 : store_lo(&pDstScanline[iDstPixel], rms);
521 516115 : pSrcScanlineShifted += 2 * DEST_ELTS;
522 : }
523 :
// Report how far the source pointer advanced so that the caller can resume
// from there.
524 5389 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
525 5389 : return iDstPixel;
526 : }
527 :
528 : /************************************************************************/
529 : /* AverageByteSSE2OrAVX2() */
530 : /************************************************************************/
531 :
// Vectorized average of 2x2 source blocks for byte samples, computed as
// (sum of the four samples + 2) >> 2.
//
// nDstXWidth               : number of destination pixels wanted on this line.
// nChunkXSize              : distance, in bytes, between the first and the
//                            second source line.
// pSrcScanlineShiftedInOut : in: first source byte to read; out: advanced
//                            past the source bytes consumed here.
// pDstScanline             : destination line, written from index 0.
//
// Returns the number of destination pixels produced. Only whole groups of
// 2 * DEST_ELTS pixels are handled, so the return value can be smaller than
// nDstXWidth.
// NOTE(review): presumably the caller handles the remaining pixels - confirm.
532 : static int
533 120136 : AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
534 : const GByte *&CPL_RESTRICT pSrcScanlineShiftedInOut,
535 : GByte *CPL_RESTRICT pDstScanline)
536 : {
537 : // Optimized implementation for average on Byte by
538 : // processing by group of 16 output pixels for SSE2, or 32 for AVX2
539 :
540 120136 : const auto zero = setzero();
541 120136 : const auto two16 = set1_epi16(2);
542 120136 : const GByte *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
543 :
544 120136 : constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;
545 120136 : int iDstPixel = 0;
// Each iteration runs the same load/sum/shift sequence twice (average0 and
// average1), each half consuming 2 * DEST_ELTS source bytes per line, and
// then packs both halves into a single store.
546 2172270 : for (; iDstPixel < nDstXWidth - (2 * DEST_ELTS - 1);
547 2052130 : iDstPixel += 2 * DEST_ELTS)
548 : {
549 : decltype(setzero()) average0;
550 : {
551 : // Load 2 * DEST_ELTS bytes from each line
552 2052130 : const auto firstLine = loadu_int(pSrcScanlineShifted);
553 : const auto secondLine =
554 4104270 : loadu_int(pSrcScanlineShifted + nChunkXSize);
555 : // Extend those Bytes as UInt16s
556 2052130 : const auto firstLineLo = unpacklo_epi8(firstLine, zero);
557 2052130 : const auto firstLineHi = unpackhi_epi8(firstLine, zero);
558 2052130 : const auto secondLineLo = unpacklo_epi8(secondLine, zero);
559 2052130 : const auto secondLineHi = unpackhi_epi8(secondLine, zero);
560 :
561 : // Vertical addition
562 2052130 : const auto sumLo = add_epi16(firstLineLo, secondLineLo);
563 2052130 : const auto sumHi = add_epi16(firstLineHi, secondLineHi);
564 :
565 : // Horizontal addition of adjacent pairs, and recombine low and high
566 : // parts
567 2052130 : const auto sum = hadd_epi16(sumLo, sumHi);
568 :
569 : // average = (sum + 2) / 4
570 2052130 : average0 = srli_epi16(add_epi16(sum, two16), 2);
571 :
572 2052130 : pSrcScanlineShifted += 2 * DEST_ELTS;
573 : }
574 :
575 : decltype(setzero()) average1;
576 : {
577 : // Load 2 * DEST_ELTS bytes from each line
578 2052130 : const auto firstLine = loadu_int(pSrcScanlineShifted);
579 : const auto secondLine =
580 4104270 : loadu_int(pSrcScanlineShifted + nChunkXSize);
581 : // Extend those Bytes as UInt16s
582 2052130 : const auto firstLineLo = unpacklo_epi8(firstLine, zero);
583 2052130 : const auto firstLineHi = unpackhi_epi8(firstLine, zero);
584 2052130 : const auto secondLineLo = unpacklo_epi8(secondLine, zero);
585 2052130 : const auto secondLineHi = unpackhi_epi8(secondLine, zero);
586 :
587 : // Vertical addition
588 2052130 : const auto sumLo = add_epi16(firstLineLo, secondLineLo);
589 2052130 : const auto sumHi = add_epi16(firstLineHi, secondLineHi);
590 :
591 : // Horizontal addition of adjacent pairs, and recombine low and high
592 : // parts
593 2052130 : const auto sum = hadd_epi16(sumLo, sumHi);
594 :
595 : // average = (sum + 2) / 4
596 2052130 : average1 = srli_epi16(add_epi16(sum, two16), 2);
597 :
598 2052130 : pSrcScanlineShifted += 2 * DEST_ELTS;
599 : }
600 :
601 : // Pack each 16 bit average value to 8 bits
602 2052130 : const auto average = packus_epi16(average0, average1);
603 2052130 : storeu_int(&pDstScanline[iDstPixel], average);
604 : }
605 :
// Report how far the source pointer advanced so that the caller can resume
// from there.
606 120136 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
607 120136 : return iDstPixel;
608 : }
609 :
610 : /************************************************************************/
611 : /* QuadraticMeanUInt16SSE2() */
612 : /************************************************************************/
613 :
// sse2_hadd_pd(a, b): returns (a.lo + a.hi, b.lo + b.hi).
// With SSE3 it is an alias of _mm_hadd_pd; otherwise the function below
// builds the same result from SSE/SSE2 shuffles and one addition.
614 : #ifdef __SSE3__
615 : #define sse2_hadd_pd _mm_hadd_pd
616 : #else
617 185 : inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
618 : {
// (a.lo, b.lo): low 64 bits of a followed by low 64 bits of b.
619 : auto aLo_bLo =
620 740 : _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
// (a.hi, b.hi): high 64 bits of a followed by high 64 bits of b.
621 : auto aHi_bHi =
622 740 : _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
623 185 : return _mm_add_pd(aLo_bLo, aHi_bHi); // (aLo + aHi, bLo + bHi)
624 : }
625 : #endif
626 :
// Returns the element-wise square of the two doubles held in x.
627 120 : inline __m128d SQUARE_PD(__m128d x)
628 : {
629 120 : return _mm_mul_pd(x, x);
630 : }
631 :
// 256-bit overloads, compiled only when __AVX2__ is defined.
632 : #ifdef __AVX2__
633 :
// Returns the element-wise square of the four doubles held in x.
634 : inline __m256d SQUARE_PD(__m256d x)
635 : {
636 : return _mm256_mul_pd(x, x);
637 : }
638 :
// Reorders the four doubles of x from (0, 1, 2, 3) to (0, 2, 1, 3).
639 : inline __m256d FIXUP_LANES(__m256d x)
640 : {
641 : return _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
642 : }
643 :
// Same 64-bit reordering applied to a vector of floats, by reinterpreting it
// as doubles and back; pairs of adjacent floats therefore move together.
644 : inline __m256 FIXUP_LANES(__m256 x)
645 : {
646 : return _mm256_castpd_ps(FIXUP_LANES(_mm256_castps_pd(x)));
647 : }
648 :
649 : #endif
650 :
// Vectorized quadratic mean (RMS) of 2x2 source blocks for uint16_t samples.
//
// nDstXWidth               : number of destination pixels wanted on this line.
// nChunkXSize              : distance, in elements, between the first and the
//                            second source line.
// pSrcScanlineShiftedInOut : in: first source element to read; out: advanced
//                            past the source elements consumed here.
// pDstScanline             : destination line, written from index 0.
//
// Returns the number of destination pixels produced. Only whole groups of
// DEST_ELTS (4) pixels are handled, so the return value can be smaller than
// nDstXWidth.
// NOTE(review): presumably the caller handles the remaining pixels - confirm.
//
// Each group takes one of two paths: an all-integer/float path when every
// loaded sample is below 2^14, and a double-precision path otherwise.
651 : static int
652 14 : QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
653 : const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
654 : uint16_t *CPL_RESTRICT pDstScanline)
655 : {
656 : // Optimized implementation for RMS on UInt16 by
657 : // processing by group of 4 output pixels.
658 14 : const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
659 :
660 14 : int iDstPixel = 0;
661 14 : const auto zero = _mm_setzero_si128();
662 :
663 : #ifdef __AVX2__
664 : const auto zeroDot25 = _mm256_set1_pd(0.25);
665 : const auto zeroDot5 = _mm256_set1_pd(0.5);
666 :
667 : // The first four 0's could be anything, as we only take the bottom
668 : // 128 bits.
669 : const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
670 : #else
671 14 : const auto zeroDot25 = _mm_set1_pd(0.25);
672 14 : const auto zeroDot5 = _mm_set1_pd(0.5);
673 : #endif
674 :
// 'zero' is always a 128-bit vector here, so DEST_ELTS is 8 / 2 = 4 in both
// the SSE2 and the AVX2 builds.
675 14 : constexpr int DEST_ELTS =
676 : static_cast<int>(sizeof(zero) / sizeof(uint16_t)) / 2;
677 52 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
678 : {
679 : // Load 8 UInt16 from each line
680 38 : const auto firstLine = _mm_loadu_si128(
681 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
682 : const auto secondLine =
683 38 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
684 38 : pSrcScanlineShifted + nChunkXSize));
685 :
686 : // Detect if all of the source values fit in 14 bits.
687 : // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
688 : // and we can do a much faster implementation.
// OR-ing both lines and shifting each 16-bit lane right by 14 leaves a
// non-zero lane wherever a sample is >= 2^14. The lanes are then narrowed to
// bytes and the low 64 bits are read back as one scalar to test against zero.
689 : const auto maskTmp =
690 76 : _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
691 : #if defined(__i386__) || defined(_M_IX86)
692 : uint64_t nMaskFitsIn14Bits = 0;
693 : _mm_storel_epi64(
694 : reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
695 : _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
696 : #else
697 38 : const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
698 : _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
699 : #endif
700 38 : if (nMaskFitsIn14Bits == 0)
701 : {
702 : // Multiplication of 16 bit values and horizontal
703 : // addition of 32 bit results
704 : const auto firstLineHSumSquare =
705 26 : _mm_madd_epi16(firstLine, firstLine);
706 : const auto secondLineHSumSquare =
707 26 : _mm_madd_epi16(secondLine, secondLine);
708 : // Vertical addition
709 : const auto sumSquares =
710 26 : _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
711 : // In theory we should take sqrt(sumSquares * 0.25f)
712 : // but given the rounding we do, this is equivalent to
713 : // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
714 : // sumSquares <= 4 * 16383^2
715 26 : const auto one32 = _mm_set1_epi32(1);
716 : const auto sumSquaresPlusOneDiv4 =
717 52 : _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
718 : // Take square root and truncate/floor to int32
719 78 : auto rms = _mm_cvttps_epi32(
720 : _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
721 :
722 : // Round to upper value if it minimizes the
723 : // error |rms^2 - sumSquares/4|
724 : // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
725 : // rms += 1;
726 : // which is equivalent to:
727 : // if( rms * rms + rms < (sumSquares+1) / 4 )
728 : // rms += 1;
// _mm_madd_epi16(rms, rms) yields rms^2 per 32-bit lane here because, on this
// path, each rms is below 2^14 and so the upper 16 bits of every lane are 0.
729 : auto mask =
730 78 : _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
731 : _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
// Subtracting a lane equal to -1 adds one to rms in that lane.
732 26 : rms = _mm_sub_epi32(rms, mask);
733 : // Pack each 32 bit RMS value to 16 bits
734 26 : rms = _mm_packs_epi32(rms, rms /* could be anything */);
735 : _mm_storel_epi64(
736 26 : reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
737 26 : pSrcScanlineShifted += 2 * DEST_ELTS;
738 26 : continue;
739 : }
740 :
741 : // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
742 : // to 32 bit would result in 4 multiplications instead of 8, but
743 : // mullo/mulhi have a worse throughput than mul_pd.
744 :
745 : // Extend those UInt16s as UInt32s
746 12 : const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
747 12 : const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
748 12 : const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
749 12 : const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
750 :
751 : #ifdef __AVX2__
752 : // Multiplication of 32 bit values previously converted to 64 bit double
753 : const auto firstLineLoDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineLo));
754 : const auto firstLineHiDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineHi));
755 : const auto secondLineLoDbl =
756 : SQUARE_PD(_mm256_cvtepi32_pd(secondLineLo));
757 : const auto secondLineHiDbl =
758 : SQUARE_PD(_mm256_cvtepi32_pd(secondLineHi));
759 :
760 : // Vertical addition of squares
761 : const auto sumSquaresLo =
762 : _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
763 : const auto sumSquaresHi =
764 : _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
765 :
766 : // Horizontal addition of squares
767 : const auto sumSquares =
768 : FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
769 :
770 : const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
771 :
772 : // Take square root and truncate/floor to int32
773 : auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
774 : const auto rmsDouble = _mm256_cvtepi32_pd(rms);
// right = sumDivWeight - (rms^2 + rms); the correction applies when it
// exceeds 0.5, the same test as in the 128-bit branch below.
775 : const auto right = _mm256_sub_pd(
776 : sumDivWeight, _mm256_add_pd(SQUARE_PD(rmsDouble), rmsDouble));
777 :
778 : auto mask =
779 : _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
780 : // Extract 32-bit from each of the 4 64-bit masks
781 : // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
782 : // _MM_SHUFFLE(2,0,2,0)));
783 : mask = _mm256_permutevar8x32_ps(mask, permutation);
784 : const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
785 :
786 : // Apply the correction
787 : rms = _mm_sub_epi32(rms, maskI);
788 :
789 : // Pack each 32 bit RMS value to 16 bits
790 : rms = _mm_packus_epi32(rms, rms /* could be anything */);
791 : #else
792 : // Multiplication of 32 bit values previously converted to 64 bit double
// _mm_cvtepi32_pd converts only the two low 32-bit lanes, so each vector is
// also shifted right by 8 bytes to convert its two upper lanes.
793 12 : const auto firstLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineLo));
794 : const auto firstLineLoHi =
795 24 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
796 12 : const auto firstLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineHi));
797 : const auto firstLineHiHi =
798 24 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
799 :
800 12 : const auto secondLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineLo));
801 : const auto secondLineLoHi =
802 24 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
803 12 : const auto secondLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineHi));
804 : const auto secondLineHiHi =
805 24 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
806 :
807 : // Vertical addition of squares
808 12 : const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
809 12 : const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
810 12 : const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
811 12 : const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
812 :
813 : // Horizontal addition of squares
814 12 : const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
815 12 : const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
816 :
817 12 : const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
818 12 : const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
819 : // Take square root and truncate/floor to int32
820 24 : const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
821 24 : const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
822 :
823 : // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
824 : // if( 0.5 < sumDivWeight - (rms * rms + rms) )
825 : // rms += 1;
826 12 : const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
827 12 : const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
828 24 : const auto rightLo = _mm_sub_pd(
829 : sumDivWeightLo, _mm_add_pd(SQUARE_PD(rmsLoDouble), rmsLoDouble));
830 36 : const auto rightHi = _mm_sub_pd(
831 : sumDivWeightHi, _mm_add_pd(SQUARE_PD(rmsHiDouble), rmsHiDouble));
832 :
833 24 : const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
834 12 : const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
835 : // The value of the mask will be -1 when the correction needs to be
836 : // applied
// The shuffle keeps 32-bit elements 0 and 2 of maskLo, then 0 and 2 of
// maskHi, i.e. one 32-bit half of each 64-bit comparison result.
837 24 : const auto mask = _mm_castps_si128(_mm_shuffle_ps(
838 : maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
839 :
// Gather the two int32 results of rmsLo and of rmsHi into one vector of four.
840 48 : auto rms = _mm_castps_si128(
841 : _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
842 : // Apply the correction
843 12 : rms = _mm_sub_epi32(rms, mask);
844 :
845 : // Pack each 32 bit RMS value to 16 bits
846 12 : rms = GDAL_mm_int32_to_uint16(rms);
847 : #endif
848 :
849 12 : _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
850 : rms);
851 12 : pSrcScanlineShifted += 2 * DEST_ELTS;
852 : }
853 :
// Report how far the source pointer advanced so that the caller can resume
// from there.
854 14 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
855 14 : return iDstPixel;
856 : }
857 :
858 : /************************************************************************/
859 : /* AverageUInt16SSE2() */
860 : /************************************************************************/
861 :
862 : static int
863 13 : AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
864 : const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
865 : uint16_t *CPL_RESTRICT pDstScanline)
866 : {
867 : // Optimized implementation for average on UInt16 by
868 : // processing by group of 8 output pixels.
869 :
870 13 : const auto mask = _mm_set1_epi32(0xFFFF);
871 13 : const auto two = _mm_set1_epi32(2);
872 13 : const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
873 :
874 13 : int iDstPixel = 0;
875 13 : constexpr int DEST_ELTS = static_cast<int>(sizeof(mask) / sizeof(uint16_t));
876 25 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
877 : {
878 : __m128i averageLow;
879 : // Load 8 UInt16 from each line
880 : {
881 12 : const auto firstLine = _mm_loadu_si128(
882 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
883 : const auto secondLine =
884 12 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
885 12 : pSrcScanlineShifted + nChunkXSize));
886 :
887 : // Horizontal addition and extension to 32 bit
888 36 : const auto horizAddFirstLine = _mm_add_epi32(
889 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
890 : const auto horizAddSecondLine =
891 36 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
892 : _mm_srli_epi32(secondLine, 16));
893 :
894 : // Vertical addition and average computation
895 : // average = (sum + 2) >> 2
896 24 : const auto sum = _mm_add_epi32(
897 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
898 12 : averageLow = _mm_srli_epi32(sum, 2);
899 : }
900 : // Load 8 UInt16 from each line
901 : __m128i averageHigh;
902 : {
903 : const auto firstLine =
904 12 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
905 12 : pSrcScanlineShifted + DEST_ELTS));
906 : const auto secondLine =
907 12 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
908 12 : pSrcScanlineShifted + DEST_ELTS + nChunkXSize));
909 :
910 : // Horizontal addition and extension to 32 bit
911 36 : const auto horizAddFirstLine = _mm_add_epi32(
912 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
913 : const auto horizAddSecondLine =
914 36 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
915 : _mm_srli_epi32(secondLine, 16));
916 :
917 : // Vertical addition and average computation
918 : // average = (sum + 2) >> 2
919 24 : const auto sum = _mm_add_epi32(
920 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
921 12 : averageHigh = _mm_srli_epi32(sum, 2);
922 : }
923 :
924 : // Pack each 32 bit average value to 16 bits
925 12 : auto average = GDAL_mm_packus_epi32(averageLow, averageHigh);
926 12 : _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
927 : average);
928 12 : pSrcScanlineShifted += 2 * DEST_ELTS;
929 : }
930 :
931 13 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
932 13 : return iDstPixel;
933 : }
934 :
935 : /************************************************************************/
936 : /* QuadraticMeanFloatSSE2() */
937 : /************************************************************************/
938 :
939 : #if !defined(ARM_V7)
940 :
941 : #ifdef __SSE3__
942 : #define sse2_hadd_ps _mm_hadd_ps
943 : #else
// Horizontal pairwise addition of two packed float vectors without SSE3.
// Returns (a0 + a1, a2 + a3, b0 + b1, b2 + b3).
inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
{
    // (a0, a2, b0, b2)
    const __m128 evenLanes = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
    // (a1, a3, b1, b3)
    const __m128 oddLanes = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
    const __m128 pairSums = _mm_add_ps(evenLanes, oddLanes);
    return pairSums;
}
950 : #endif
951 :
952 : #ifdef __AVX2__
// 256-bit bindings of the width-neutral *_ps names (AVX2 build).

// Constants and memory access
#define set1_ps _mm256_set1_ps
#define loadu_ps _mm256_loadu_ps
#define storeu_ps _mm256_storeu_ps

// Arithmetic
#define add_ps _mm256_add_ps
#define hadd_ps _mm256_hadd_ps
#define mul_ps _mm256_mul_ps
#define div_ps _mm256_div_ps
#define sqrt_ps _mm256_sqrt_ps
#define max_ps _mm256_max_ps

// Bitwise logic, comparison and selection
#define and_ps _mm256_and_ps
#define andnot_ps _mm256_andnot_ps
#define or_ps _mm256_or_ps
// The 256-bit comparison takes its predicate as a third argument.
#define cmpeq_ps(x, y) _mm256_cmp_ps((x), (y), _CMP_EQ_OQ)
#define blendv_ps _mm256_blendv_ps

// Element rearrangement
#define shuffle_ps _mm256_shuffle_ps
#define unpacklo_ps _mm256_unpacklo_ps
#define unpackhi_ps _mm256_unpackhi_ps
970 :
// Element-wise square of a packed 8 x float vector.
inline __m256 SQUARE_PS(__m256 x)
{
    const __m256 squared = _mm256_mul_ps(x, x);
    return squared;
}
975 :
976 : #else
977 :
// 128-bit bindings of the width-neutral *_ps names (non-AVX2 build).

// Constants and memory access
#define set1_ps _mm_set1_ps
#define loadu_ps _mm_loadu_ps
#define storeu_ps _mm_storeu_ps

// Arithmetic
#define add_ps _mm_add_ps
#define hadd_ps sse2_hadd_ps
#define mul_ps _mm_mul_ps
#define div_ps _mm_div_ps
#define sqrt_ps _mm_sqrt_ps
#define max_ps _mm_max_ps

// Bitwise logic and comparison
#define and_ps _mm_and_ps
#define andnot_ps _mm_andnot_ps
#define or_ps _mm_or_ps
#define cmpeq_ps _mm_cmpeq_ps

// Element rearrangement
#define shuffle_ps _mm_shuffle_ps
#define unpacklo_ps _mm_unpacklo_ps
#define unpackhi_ps _mm_unpackhi_ps
994 :
// Per-element selection between a and b driven by mask: elements of b are
// taken where mask is set, elements of a elsewhere.
// The fallback path combines the operands bit by bit, so mask elements are
// expected to be either all ones or all zeros (e.g. a comparison result).
inline __m128 blendv_ps(__m128 a, __m128 b, __m128 mask)
{
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    return _mm_blendv_ps(a, b, mask);
#else
    const __m128 keptFromA = _mm_andnot_ps(mask, a);  // a where mask is clear
    const __m128 takenFromB = _mm_and_ps(mask, b);    // b where mask is set
    return _mm_or_ps(keptFromA, takenFromB);
#endif
}
1003 :
// Element-wise square of a packed 4 x float vector.
inline __m128 SQUARE_PS(__m128 x)
{
    const __m128 squared = _mm_mul_ps(x, x);
    return squared;
}
1008 :
// Returns its argument unchanged.
// NOTE(review): presumably the counterpart of a wider-vector variant that
// reorders elements; no such variant is visible in this part of the file.
inline __m128 FIXUP_LANES(__m128 x)
{
    const __m128 unchanged = x;
    return unchanged;
}
1013 :
1014 : #endif
1015 :
1016 : static int
1017 : #if defined(__GNUC__)
1018 : __attribute__((noinline))
1019 : #endif
1020 66 : QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
1021 : const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1022 : float *CPL_RESTRICT pDstScanline)
1023 : {
1024 : // Optimized implementation for RMS on Float32 by
1025 : // processing by group of output pixels.
1026 66 : const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1027 :
1028 66 : int iDstPixel = 0;
1029 66 : const auto minus_zero = set1_ps(-0.0f);
1030 66 : const auto zeroDot25 = set1_ps(0.25f);
1031 66 : const auto one = set1_ps(1.0f);
1032 66 : const auto infv = set1_ps(std::numeric_limits<float>::infinity());
1033 66 : constexpr int DEST_ELTS = static_cast<int>(sizeof(one) / sizeof(float));
1034 :
1035 198 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1036 : {
1037 : // Load 2*DEST_ELTS Float32 from each line
1038 132 : auto firstLineLo = loadu_ps(pSrcScanlineShifted);
1039 132 : auto firstLineHi = loadu_ps(pSrcScanlineShifted + DEST_ELTS);
1040 132 : auto secondLineLo = loadu_ps(pSrcScanlineShifted + nChunkXSize);
1041 : auto secondLineHi =
1042 264 : loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize);
1043 :
1044 : // Take the absolute value
1045 132 : firstLineLo = andnot_ps(minus_zero, firstLineLo);
1046 132 : firstLineHi = andnot_ps(minus_zero, firstLineHi);
1047 132 : secondLineLo = andnot_ps(minus_zero, secondLineLo);
1048 132 : secondLineHi = andnot_ps(minus_zero, secondLineHi);
1049 :
1050 : auto firstLineEven =
1051 132 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1052 : auto firstLineOdd =
1053 132 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1054 : auto secondLineEven =
1055 132 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1056 : auto secondLineOdd =
1057 132 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1058 :
1059 : // Compute the maximum of each DEST_ELTS value to RMS-average
1060 396 : const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
1061 : max_ps(secondLineEven, secondLineOdd));
1062 :
1063 : // Normalize each value by the maximum of the DEST_ELTS ones.
1064 : // This step is important to avoid that the square evaluates to infinity
1065 : // for sufficiently big input.
1066 132 : auto invMax = div_ps(one, maxV);
1067 : // Deal with 0 being the maximum to correct division by zero
1068 : // note: comparing to -0 leads to identical results as to comparing with
1069 : // 0
1070 264 : invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
1071 :
1072 132 : firstLineEven = mul_ps(firstLineEven, invMax);
1073 132 : firstLineOdd = mul_ps(firstLineOdd, invMax);
1074 132 : secondLineEven = mul_ps(secondLineEven, invMax);
1075 132 : secondLineOdd = mul_ps(secondLineOdd, invMax);
1076 :
1077 : // Compute squares
1078 132 : firstLineEven = SQUARE_PS(firstLineEven);
1079 132 : firstLineOdd = SQUARE_PS(firstLineOdd);
1080 132 : secondLineEven = SQUARE_PS(secondLineEven);
1081 132 : secondLineOdd = SQUARE_PS(secondLineOdd);
1082 :
1083 396 : const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
1084 : add_ps(secondLineEven, secondLineOdd));
1085 :
1086 396 : auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
1087 :
1088 : // Deal with infinity being the maximum
1089 132 : const auto maskIsInf = cmpeq_ps(maxV, infv);
1090 132 : rms = blendv_ps(rms, infv, maskIsInf);
1091 :
1092 132 : rms = FIXUP_LANES(rms);
1093 :
1094 132 : storeu_ps(&pDstScanline[iDstPixel], rms);
1095 132 : pSrcScanlineShifted += DEST_ELTS * 2;
1096 : }
1097 :
1098 66 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1099 66 : return iDstPixel;
1100 : }
1101 :
1102 : /************************************************************************/
1103 : /* AverageFloatSSE2() */
1104 : /************************************************************************/
1105 :
1106 50 : static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
1107 : const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1108 : float *CPL_RESTRICT pDstScanline)
1109 : {
1110 : // Optimized implementation for average on Float32 by
1111 : // processing by group of output pixels.
1112 50 : const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1113 :
1114 50 : int iDstPixel = 0;
1115 50 : const auto zeroDot25 = _mm_set1_ps(0.25f);
1116 50 : constexpr int DEST_ELTS =
1117 : static_cast<int>(sizeof(zeroDot25) / sizeof(float));
1118 :
1119 132 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1120 : {
1121 : // Load 2 * DEST_ELTS Float32 from each line
1122 : const auto firstLineLo =
1123 82 : _mm_mul_ps(_mm_loadu_ps(pSrcScanlineShifted), zeroDot25);
1124 164 : const auto firstLineHi = _mm_mul_ps(
1125 : _mm_loadu_ps(pSrcScanlineShifted + DEST_ELTS), zeroDot25);
1126 82 : const auto secondLineLo = _mm_mul_ps(
1127 82 : _mm_loadu_ps(pSrcScanlineShifted + nChunkXSize), zeroDot25);
1128 164 : const auto secondLineHi = _mm_mul_ps(
1129 82 : _mm_loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize),
1130 : zeroDot25);
1131 :
1132 : // Vertical addition
1133 82 : const auto tmpLo = _mm_add_ps(firstLineLo, secondLineLo);
1134 82 : const auto tmpHi = _mm_add_ps(firstLineHi, secondLineHi);
1135 :
1136 : // Horizontal addition
1137 82 : const auto average = sse2_hadd_ps(tmpLo, tmpHi);
1138 :
1139 82 : _mm_storeu_ps(&pDstScanline[iDstPixel], average);
1140 82 : pSrcScanlineShifted += DEST_ELTS * 2;
1141 : }
1142 :
1143 50 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1144 50 : return iDstPixel;
1145 : }
1146 :
1147 : /************************************************************************/
1148 : /* AverageDoubleSSE2() */
1149 : /************************************************************************/
1150 :
1151 : static int
1152 50 : AverageDoubleSSE2(int nDstXWidth, int nChunkXSize,
1153 : const double *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1154 : double *CPL_RESTRICT pDstScanline)
1155 : {
1156 : // Optimized implementation for average on Float64 by
1157 : // processing by group of output pixels.
1158 50 : const double *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1159 :
1160 50 : int iDstPixel = 0;
1161 50 : const auto zeroDot25 = _mm_set1_pd(0.25);
1162 50 : constexpr int DEST_ELTS =
1163 : static_cast<int>(sizeof(zeroDot25) / sizeof(double));
1164 :
1165 211 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1166 : {
1167 : // Load 4 * DEST_ELTS Float64 from each line
1168 161 : const auto firstLine0 = _mm_mul_pd(
1169 : _mm_loadu_pd(pSrcScanlineShifted + 0 * DEST_ELTS), zeroDot25);
1170 322 : const auto firstLine1 = _mm_mul_pd(
1171 : _mm_loadu_pd(pSrcScanlineShifted + 1 * DEST_ELTS), zeroDot25);
1172 161 : const auto secondLine0 = _mm_mul_pd(
1173 161 : _mm_loadu_pd(pSrcScanlineShifted + 0 * DEST_ELTS + nChunkXSize),
1174 : zeroDot25);
1175 322 : const auto secondLine1 = _mm_mul_pd(
1176 161 : _mm_loadu_pd(pSrcScanlineShifted + 1 * DEST_ELTS + nChunkXSize),
1177 : zeroDot25);
1178 :
1179 : // Vertical addition
1180 161 : const auto tmp0 = _mm_add_pd(firstLine0, secondLine0);
1181 161 : const auto tmp1 = _mm_add_pd(firstLine1, secondLine1);
1182 :
1183 : // Horizontal addition
1184 161 : const auto average0 = sse2_hadd_pd(tmp0, tmp1);
1185 :
1186 161 : _mm_storeu_pd(&pDstScanline[iDstPixel + 0], average0);
1187 161 : pSrcScanlineShifted += DEST_ELTS * 2;
1188 : }
1189 :
1190 50 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1191 50 : return iDstPixel;
1192 : }
1193 :
1194 : #endif
1195 :
1196 : #endif
1197 :
1198 : /************************************************************************/
1199 : /* GDALResampleChunk_AverageOrRMS() */
1200 : /************************************************************************/
1201 :
1202 : template <class T, class Tsum, GDALDataType eWrkDataType, bool bQuadraticMean>
1203 : static CPLErr
1204 7347 : GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
1205 : const T *pChunk, void **ppDstBuffer)
1206 : {
1207 7347 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1208 7347 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1209 7347 : const double dfSrcXDelta = args.dfSrcXDelta;
1210 7347 : const double dfSrcYDelta = args.dfSrcYDelta;
1211 7347 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1212 7347 : const int nChunkXOff = args.nChunkXOff;
1213 7347 : const int nChunkYOff = args.nChunkYOff;
1214 7347 : const int nChunkXSize = args.nChunkXSize;
1215 7347 : const int nChunkYSize = args.nChunkYSize;
1216 7347 : const int nDstXOff = args.nDstXOff;
1217 7347 : const int nDstXOff2 = args.nDstXOff2;
1218 7347 : const int nDstYOff = args.nDstYOff;
1219 7347 : const int nDstYOff2 = args.nDstYOff2;
1220 7347 : const char *pszResampling = args.pszResampling;
1221 7347 : bool bHasNoData = args.bHasNoData;
1222 7347 : const double dfNoDataValue = args.dfNoDataValue;
1223 7347 : const GDALColorTable *const poColorTable =
1224 : !bQuadraticMean &&
1225 : // AVERAGE_BIT2GRAYSCALE
1226 7264 : STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G")
1227 : ? nullptr
1228 : : args.poColorTable;
1229 7347 : const bool bPropagateNoData = args.bPropagateNoData;
1230 :
1231 7347 : T tNoDataValue = (!bHasNoData) ? 0 : static_cast<T>(dfNoDataValue);
1232 7347 : const T tReplacementVal =
1233 206 : bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
1234 72 : args.eOvrDataType, dfNoDataValue))
1235 : : 0;
1236 :
1237 7347 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
1238 7347 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1239 7347 : const int nDstXWidth = nDstXOff2 - nDstXOff;
1240 :
1241 : /* -------------------------------------------------------------------- */
1242 : /* Allocate buffers. */
1243 : /* -------------------------------------------------------------------- */
1244 7347 : *ppDstBuffer = static_cast<T *>(
1245 7347 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
1246 : GDALGetDataTypeSizeBytes(eWrkDataType)));
1247 7347 : if (*ppDstBuffer == nullptr)
1248 : {
1249 0 : return CE_Failure;
1250 : }
1251 7347 : T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
1252 :
1253 : struct PrecomputedXValue
1254 : {
1255 : int nLeftXOffShifted;
1256 : int nRightXOffShifted;
1257 : double dfLeftWeight;
1258 : double dfRightWeight;
1259 : double dfTotalWeightFullLine;
1260 : };
1261 :
1262 : PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
1263 7347 : VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(PrecomputedXValue)));
1264 :
1265 7347 : if (pasSrcX == nullptr)
1266 : {
1267 0 : return CE_Failure;
1268 : }
1269 :
1270 7347 : std::vector<GDALColorEntry> colorEntries;
1271 :
1272 7347 : if (poColorTable)
1273 : {
1274 5 : int nTransparentIdx = -1;
1275 5 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1276 :
1277 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1278 : // it as nodata value
1279 6 : if (bHasNoData && dfNoDataValue >= 0.0 &&
1280 1 : tNoDataValue < colorEntries.size())
1281 1 : colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1282 :
1283 : // Or if we have no explicit nodata, but a color table entry that is
1284 : // transparent, consider it as the nodata value
1285 4 : else if (!bHasNoData && nTransparentIdx >= 0)
1286 : {
1287 0 : bHasNoData = true;
1288 0 : tNoDataValue = static_cast<T>(nTransparentIdx);
1289 : }
1290 : }
1291 :
1292 : /* ==================================================================== */
1293 : /* Precompute inner loop constants. */
1294 : /* ==================================================================== */
1295 7347 : bool bSrcXSpacingIsTwo = true;
1296 7347 : int nLastSrcXOff2 = -1;
1297 1659150 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1298 : {
1299 1651805 : const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
1300 : // Apply some epsilon to avoid numerical precision issues
1301 1651805 : const int nSrcXOff =
1302 1651805 : std::max(static_cast<int>(dfSrcXOff + 1e-8), nChunkXOff);
1303 1651805 : const double dfSrcXOff2 =
1304 1651805 : dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
1305 1651805 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
1306 1651805 : if (nSrcXOff2 == nSrcXOff)
1307 0 : nSrcXOff2++;
1308 1651805 : if (nSrcXOff2 > nChunkRightXOff)
1309 1 : nSrcXOff2 = nChunkRightXOff;
1310 :
1311 1651805 : pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
1312 1651805 : pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
1313 1651805 : nSrcXOff2 - nChunkXOff;
1314 21 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
1315 1651805 : (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
1316 1651805 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
1317 1651805 : 1 - (nSrcXOff2 - dfSrcXOff2);
1318 1651805 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
1319 1651805 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
1320 1651805 : if (nSrcXOff + 1 < nSrcXOff2)
1321 : {
1322 1651779 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1323 1651779 : nSrcXOff2 - nSrcXOff - 2;
1324 1651779 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1325 1651779 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
1326 : }
1327 :
1328 1651805 : if (nSrcXOff2 - nSrcXOff != 2 ||
1329 1553902 : (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
1330 : {
1331 91989 : bSrcXSpacingIsTwo = false;
1332 : }
1333 1651805 : nLastSrcXOff2 = nSrcXOff2;
1334 : }
1335 :
1336 : /* ==================================================================== */
1337 : /* Loop over destination scanlines. */
1338 : /* ==================================================================== */
1339 701567 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1340 : {
1341 694220 : const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
1342 694220 : int nSrcYOff = std::max(static_cast<int>(dfSrcYOff + 1e-8), nChunkYOff);
1343 :
1344 694220 : const double dfSrcYOff2 =
1345 694220 : dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
1346 694220 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
1347 694220 : if (nSrcYOff2 == nSrcYOff)
1348 0 : ++nSrcYOff2;
1349 694220 : if (nSrcYOff2 > nChunkBottomYOff)
1350 3 : nSrcYOff2 = nChunkBottomYOff;
1351 :
1352 694220 : T *const pDstScanline =
1353 694220 : pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
1354 :
1355 : /* --------------------------------------------------------------------
1356 : */
1357 : /* Loop over destination pixels */
1358 : /* --------------------------------------------------------------------
1359 : */
1360 694220 : if (poColorTable == nullptr)
1361 : {
1362 694105 : if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
1363 : pabyChunkNodataMask == nullptr)
1364 : {
1365 : if constexpr (eWrkDataType == GDT_UInt8 ||
1366 : eWrkDataType == GDT_UInt16)
1367 : {
1368 : // Optimized case : no nodata, overview by a factor of 2 and
1369 : // regular x and y src spacing.
1370 125552 : const T *pSrcScanlineShifted =
1371 125552 : pChunk + pasSrcX[0].nLeftXOffShifted +
1372 125552 : static_cast<size_t>(nSrcYOff - nChunkYOff) *
1373 125552 : nChunkXSize;
1374 125552 : int iDstPixel = 0;
1375 : #ifdef USE_SSE2
1376 : if constexpr (eWrkDataType == GDT_UInt8)
1377 : {
1378 : if constexpr (bQuadraticMean)
1379 : {
1380 5389 : iDstPixel = QuadraticMeanByteSSE2OrAVX2(
1381 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1382 : pDstScanline);
1383 : }
1384 : else
1385 : {
1386 120136 : iDstPixel = AverageByteSSE2OrAVX2(
1387 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1388 : pDstScanline);
1389 : }
1390 : }
1391 : else
1392 : {
1393 : static_assert(eWrkDataType == GDT_UInt16);
1394 : if constexpr (bQuadraticMean)
1395 : {
1396 14 : iDstPixel = QuadraticMeanUInt16SSE2(
1397 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1398 : pDstScanline);
1399 : }
1400 : else
1401 : {
1402 13 : iDstPixel = AverageUInt16SSE2(
1403 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1404 : pDstScanline);
1405 : }
1406 : }
1407 : #endif
1408 300011 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1409 : {
1410 174459 : Tsum nTotal = 0;
1411 : T nVal;
1412 : if constexpr (bQuadraticMean)
1413 52 : nTotal =
1414 52 : SQUARE<Tsum>(pSrcScanlineShifted[0]) +
1415 52 : SQUARE<Tsum>(pSrcScanlineShifted[1]) +
1416 52 : SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
1417 52 : SQUARE<Tsum>(
1418 52 : pSrcScanlineShifted[1 + nChunkXSize]);
1419 : else
1420 174407 : nTotal = pSrcScanlineShifted[0] +
1421 174407 : pSrcScanlineShifted[1] +
1422 174407 : pSrcScanlineShifted[nChunkXSize] +
1423 174407 : pSrcScanlineShifted[1 + nChunkXSize];
1424 :
1425 174459 : constexpr int nTotalWeight = 4;
1426 : if constexpr (bQuadraticMean)
1427 52 : nVal = ComputeIntegerRMS_4values<T>(nTotal);
1428 : else
1429 174407 : nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
1430 : nTotalWeight);
1431 :
1432 : // No need to compare nVal against tNoDataValue as we
1433 : // are in a case where pabyChunkNodataMask == nullptr
1434 : // implies the absence of nodata value.
1435 174459 : pDstScanline[iDstPixel] = nVal;
1436 174459 : pSrcScanlineShifted += 2;
1437 : }
1438 : }
1439 : else
1440 : {
1441 : static_assert(eWrkDataType == GDT_Float32 ||
1442 : eWrkDataType == GDT_Float64);
1443 202 : const T *pSrcScanlineShifted =
1444 202 : pChunk + pasSrcX[0].nLeftXOffShifted +
1445 202 : static_cast<size_t>(nSrcYOff - nChunkYOff) *
1446 202 : nChunkXSize;
1447 202 : int iDstPixel = 0;
1448 : #if defined(USE_SSE2) && !defined(ARM_V7)
1449 : if constexpr (eWrkDataType == GDT_Float32)
1450 : {
1451 : static_assert(std::is_same_v<T, float>);
1452 : if constexpr (bQuadraticMean)
1453 : {
1454 66 : iDstPixel = QuadraticMeanFloatSSE2(
1455 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1456 : pDstScanline);
1457 : }
1458 : else
1459 : {
1460 50 : iDstPixel = AverageFloatSSE2(
1461 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1462 : pDstScanline);
1463 : }
1464 : }
1465 : else
1466 : {
1467 : if constexpr (!bQuadraticMean)
1468 : {
1469 50 : iDstPixel = AverageDoubleSSE2(
1470 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1471 : pDstScanline);
1472 : }
1473 : }
1474 : #endif
1475 :
1476 726 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1477 : {
1478 : T nVal;
1479 :
1480 : if constexpr (bQuadraticMean)
1481 : {
1482 : // Avoid issues with large values by renormalizing
1483 96 : const auto max = std::max(
1484 420 : {std::fabs(pSrcScanlineShifted[0]),
1485 420 : std::fabs(pSrcScanlineShifted[1]),
1486 420 : std::fabs(pSrcScanlineShifted[nChunkXSize]),
1487 420 : std::fabs(
1488 420 : pSrcScanlineShifted[1 + nChunkXSize])});
1489 420 : if (max == 0)
1490 : {
1491 8 : nVal = 0;
1492 : }
1493 412 : else if (std::isinf(max))
1494 : {
1495 : // If there is at least one infinity value,
1496 : // then just summing, and taking the abs
1497 : // value will give the expected result:
1498 : // * +inf if all values are +inf
1499 : // * +inf if all values are -inf
1500 : // * NaN otherwise
1501 82 : nVal = std::fabs(
1502 82 : pSrcScanlineShifted[0] +
1503 82 : pSrcScanlineShifted[1] +
1504 82 : pSrcScanlineShifted[nChunkXSize] +
1505 82 : pSrcScanlineShifted[1 + nChunkXSize]);
1506 : }
1507 : else
1508 : {
1509 330 : const auto inv_max = static_cast<T>(1.0) / max;
1510 330 : nVal =
1511 : max *
1512 330 : std::sqrt(
1513 : static_cast<T>(0.25) *
1514 330 : (SQUARE(pSrcScanlineShifted[0] *
1515 330 : inv_max) +
1516 330 : SQUARE(pSrcScanlineShifted[1] *
1517 330 : inv_max) +
1518 330 : SQUARE(
1519 330 : pSrcScanlineShifted[nChunkXSize] *
1520 330 : inv_max) +
1521 330 : SQUARE(
1522 330 : pSrcScanlineShifted[1 +
1523 : nChunkXSize] *
1524 : inv_max)));
1525 : }
1526 : }
1527 : else
1528 : {
1529 104 : constexpr auto weight = static_cast<T>(0.25);
1530 : // Multiply each value by weight to avoid
1531 : // potential overflow
1532 104 : nVal =
1533 104 : (weight * pSrcScanlineShifted[0] +
1534 104 : weight * pSrcScanlineShifted[1] +
1535 104 : weight * pSrcScanlineShifted[nChunkXSize] +
1536 104 : weight * pSrcScanlineShifted[1 + nChunkXSize]);
1537 : }
1538 :
1539 : // No need to compare nVal against tNoDataValue as we
1540 : // are in a case where pabyChunkNodataMask == nullptr
1541 : // implies the absence of nodata value.
1542 524 : pDstScanline[iDstPixel] = nVal;
1543 524 : pSrcScanlineShifted += 2;
1544 : }
1545 125754 : }
1546 : }
1547 : else
1548 : {
1549 17 : const double dfBottomWeight =
1550 568351 : (nSrcYOff + 1 == nSrcYOff2) ? 1.0
1551 568334 : : 1.0 - (dfSrcYOff - nSrcYOff);
1552 568351 : const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
1553 568351 : nSrcYOff -= nChunkYOff;
1554 568351 : nSrcYOff2 -= nChunkYOff;
1555 :
1556 568351 : double dfTotalWeightFullColumn = dfBottomWeight;
1557 568351 : if (nSrcYOff + 1 < nSrcYOff2)
1558 : {
1559 568334 : dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
1560 568334 : dfTotalWeightFullColumn += dfTopWeight;
1561 : }
1562 :
1563 9784185 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1564 : {
1565 9215839 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1566 9215839 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1567 :
1568 9215839 : double dfTotal = 0;
1569 9215839 : double dfTotalWeight = 0;
1570 9215839 : [[maybe_unused]] double dfMulFactor = 1.0;
1571 9215839 : [[maybe_unused]] double dfInvMulFactor = 1.0;
1572 9215839 : constexpr bool bUseMulFactor =
1573 : (eWrkDataType == GDT_Float32 ||
1574 : eWrkDataType == GDT_Float64);
1575 9215839 : if (pabyChunkNodataMask == nullptr)
1576 : {
1577 : if constexpr (bUseMulFactor)
1578 : {
1579 : if constexpr (bQuadraticMean)
1580 : {
1581 80 : T mulFactor = 0;
1582 80 : auto pChunkShifted =
1583 80 : pChunk +
1584 80 : static_cast<size_t>(nSrcYOff) * nChunkXSize;
1585 :
1586 240 : for (int iY = nSrcYOff; iY < nSrcYOff2;
1587 160 : ++iY, pChunkShifted += nChunkXSize)
1588 : {
1589 480 : for (int iX = nSrcXOff; iX < nSrcXOff2;
1590 : ++iX)
1591 640 : mulFactor = std::max(
1592 : mulFactor,
1593 320 : std::fabs(pChunkShifted[iX]));
1594 : }
1595 80 : dfMulFactor = double(mulFactor);
1596 142 : dfInvMulFactor =
1597 62 : dfMulFactor > 0 &&
1598 62 : std::isfinite(dfMulFactor)
1599 : ? 1.0 / dfMulFactor
1600 : : 1.0;
1601 : }
1602 : else
1603 : {
1604 139 : dfMulFactor = (nSrcYOff2 - nSrcYOff) *
1605 139 : (nSrcXOff2 - nSrcXOff);
1606 139 : dfInvMulFactor = 1.0 / dfMulFactor;
1607 : }
1608 : }
1609 :
1610 1746545 : auto pChunkShifted =
1611 227 : pChunk +
1612 1746545 : static_cast<size_t>(nSrcYOff) * nChunkXSize;
1613 1746545 : int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1614 1746545 : double dfWeightY = dfBottomWeight;
1615 3493539 : while (true)
1616 : {
1617 : double dfTotalLine;
1618 : if constexpr (bQuadraticMean)
1619 : {
1620 : // Left pixel
1621 : {
1622 216 : const T val = pChunkShifted[nSrcXOff];
1623 216 : dfTotalLine =
1624 216 : SQUARE(double(val) * dfInvMulFactor) *
1625 216 : pasSrcX[iDstPixel].dfLeftWeight;
1626 : }
1627 :
1628 216 : if (nSrcXOff + 1 < nSrcXOff2)
1629 : {
1630 : // Middle pixels
1631 216 : for (int iX = nSrcXOff + 1;
1632 536 : iX < nSrcXOff2 - 1; ++iX)
1633 : {
1634 320 : const T val = pChunkShifted[iX];
1635 320 : dfTotalLine += SQUARE(double(val) *
1636 : dfInvMulFactor);
1637 : }
1638 :
1639 : // Right pixel
1640 : {
1641 216 : const T val =
1642 216 : pChunkShifted[nSrcXOff2 - 1];
1643 216 : dfTotalLine +=
1644 216 : SQUARE(double(val) *
1645 216 : dfInvMulFactor) *
1646 216 : pasSrcX[iDstPixel].dfRightWeight;
1647 : }
1648 : }
1649 : }
1650 : else
1651 : {
1652 : // Left pixel
1653 : {
1654 5239868 : const T val = pChunkShifted[nSrcXOff];
1655 5239868 : dfTotalLine =
1656 5239868 : double(val) * dfInvMulFactor *
1657 5239868 : pasSrcX[iDstPixel].dfLeftWeight;
1658 : }
1659 :
1660 5239868 : if (nSrcXOff + 1 < nSrcXOff2)
1661 : {
1662 : // Middle pixels
1663 4239442 : for (int iX = nSrcXOff + 1;
1664 64183238 : iX < nSrcXOff2 - 1; ++iX)
1665 : {
1666 59943836 : const T val = pChunkShifted[iX];
1667 59943836 : dfTotalLine +=
1668 59943836 : double(val) * dfInvMulFactor;
1669 : }
1670 :
1671 : // Right pixel
1672 : {
1673 4239442 : const T val =
1674 4239442 : pChunkShifted[nSrcXOff2 - 1];
1675 4239442 : dfTotalLine +=
1676 4239442 : double(val) * dfInvMulFactor *
1677 4239442 : pasSrcX[iDstPixel].dfRightWeight;
1678 : }
1679 : }
1680 : }
1681 :
1682 5240084 : dfTotal += dfTotalLine * dfWeightY;
1683 5240084 : --nCounterY;
1684 5240084 : if (nCounterY < 0)
1685 1746545 : break;
1686 3493539 : pChunkShifted += nChunkXSize;
1687 3493539 : dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
1688 : }
1689 :
1690 1746545 : dfTotalWeight =
1691 1746545 : pasSrcX[iDstPixel].dfTotalWeightFullLine *
1692 : dfTotalWeightFullColumn;
1693 : }
1694 : else
1695 : {
1696 7469294 : size_t nCount = 0;
1697 30285576 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1698 : {
1699 22816292 : const auto pChunkShifted =
1700 22816292 : pChunk + static_cast<size_t>(iY) * nChunkXSize;
1701 :
1702 22816292 : double dfTotalLine = 0;
1703 22816292 : double dfTotalWeightLine = 0;
1704 : // Left pixel
1705 : {
1706 22816292 : const int iX = nSrcXOff;
1707 22816292 : const T val = pChunkShifted[iX];
1708 22816292 : if (pabyChunkNodataMask
1709 22816292 : [iX +
1710 22816292 : static_cast<size_t>(iY) * nChunkXSize])
1711 : {
1712 17325139 : nCount++;
1713 17325139 : const double dfWeightX =
1714 17325139 : pasSrcX[iDstPixel].dfLeftWeight;
1715 17325139 : dfTotalWeightLine = dfWeightX;
1716 : if constexpr (bQuadraticMean)
1717 508 : dfTotalLine =
1718 508 : SQUARE(double(val)) * dfWeightX;
1719 : else
1720 17324631 : dfTotalLine = double(val) * dfWeightX;
1721 : }
1722 : }
1723 :
1724 22816292 : if (nSrcXOff < nSrcXOff2 - 1)
1725 : {
1726 : // Middle pixels
1727 61618372 : for (int iX = nSrcXOff + 1; iX < nSrcXOff2 - 1;
1728 : ++iX)
1729 : {
1730 38802080 : const T val = pChunkShifted[iX];
1731 38802080 : if (pabyChunkNodataMask
1732 38802080 : [iX + static_cast<size_t>(iY) *
1733 38802080 : nChunkXSize])
1734 : {
1735 28038780 : nCount++;
1736 28038780 : dfTotalWeightLine += 1;
1737 : if constexpr (bQuadraticMean)
1738 640 : dfTotalLine += SQUARE(double(val));
1739 : else
1740 28038140 : dfTotalLine += double(val);
1741 : }
1742 : }
1743 :
1744 : // Right pixel
1745 : {
1746 22816292 : const int iX = nSrcXOff2 - 1;
1747 22816292 : const T val = pChunkShifted[iX];
1748 22816292 : if (pabyChunkNodataMask
1749 22816292 : [iX + static_cast<size_t>(iY) *
1750 22816292 : nChunkXSize])
1751 : {
1752 17324495 : nCount++;
1753 17324495 : const double dfWeightX =
1754 17324495 : pasSrcX[iDstPixel].dfRightWeight;
1755 17324495 : dfTotalWeightLine += dfWeightX;
1756 : if constexpr (bQuadraticMean)
1757 503 : dfTotalLine +=
1758 503 : SQUARE(double(val)) * dfWeightX;
1759 : else
1760 17323992 : dfTotalLine +=
1761 17323992 : double(val) * dfWeightX;
1762 : }
1763 : }
1764 : }
1765 :
1766 38163300 : const double dfWeightY =
1767 : (iY == nSrcYOff) ? dfBottomWeight
1768 15347008 : : (iY + 1 == nSrcYOff2) ? dfTopWeight
1769 : : 1.0;
1770 22816292 : dfTotal += dfTotalLine * dfWeightY;
1771 22816292 : dfTotalWeight += dfTotalWeightLine * dfWeightY;
1772 : }
1773 :
1774 7469294 : if (nCount == 0 ||
1775 8 : (bPropagateNoData &&
1776 : nCount <
1777 8 : static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
1778 8 : (nSrcXOff2 - nSrcXOff)))
1779 : {
1780 2307682 : pDstScanline[iDstPixel] = tNoDataValue;
1781 2307682 : continue;
1782 : }
1783 : }
1784 : if constexpr (eWrkDataType == GDT_UInt8)
1785 : {
1786 : T nVal;
1787 : if constexpr (bQuadraticMean)
1788 38 : nVal = ComputeIntegerRMS<T, int>(dfTotal,
1789 : dfTotalWeight);
1790 : else
1791 6901260 : nVal =
1792 6901260 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1793 6901298 : if (bHasNoData && nVal == tNoDataValue)
1794 0 : nVal = tReplacementVal;
1795 6901298 : pDstScanline[iDstPixel] = nVal;
1796 : }
1797 : else if constexpr (eWrkDataType == GDT_UInt16)
1798 : {
1799 : T nVal;
1800 : if constexpr (bQuadraticMean)
1801 4 : nVal = ComputeIntegerRMS<T, uint64_t>(
1802 : dfTotal, dfTotalWeight);
1803 : else
1804 4 : nVal =
1805 4 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1806 8 : if (bHasNoData && nVal == tNoDataValue)
1807 0 : nVal = tReplacementVal;
1808 8 : pDstScanline[iDstPixel] = nVal;
1809 : }
1810 : else
1811 : {
1812 : T nVal;
1813 : if constexpr (bQuadraticMean)
1814 : {
1815 : if constexpr (bUseMulFactor)
1816 249 : nVal = static_cast<T>(
1817 132 : dfMulFactor *
1818 249 : sqrt(dfTotal / dfTotalWeight));
1819 : else
1820 : nVal = static_cast<T>(
1821 : sqrt(dfTotal / dfTotalWeight));
1822 : }
1823 : else
1824 : {
1825 : if constexpr (bUseMulFactor)
1826 6602 : nVal = static_cast<T>(
1827 6602 : dfMulFactor * (dfTotal / dfTotalWeight));
1828 : else
1829 : nVal = static_cast<T>(dfTotal / dfTotalWeight);
1830 : }
1831 6851 : if (bHasNoData && nVal == tNoDataValue)
1832 2 : nVal = tReplacementVal;
1833 6851 : pDstScanline[iDstPixel] = nVal;
1834 : }
1835 : }
1836 : }
1837 : }
1838 : else
1839 : {
1840 115 : nSrcYOff -= nChunkYOff;
1841 115 : nSrcYOff2 -= nChunkYOff;
1842 :
1843 6590 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1844 : {
1845 6475 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1846 6475 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1847 :
1848 6475 : uint64_t nTotalR = 0;
1849 6475 : uint64_t nTotalG = 0;
1850 6475 : uint64_t nTotalB = 0;
1851 6475 : size_t nCount = 0;
1852 :
1853 19425 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1854 : {
1855 38850 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
1856 : {
1857 25900 : const T val =
1858 25900 : pChunk[iX + static_cast<size_t>(iY) * nChunkXSize];
1859 : // cppcheck-suppress unsignedLessThanZero
1860 25900 : if (val < 0 || val >= colorEntries.size())
1861 0 : continue;
1862 25900 : const size_t idx = static_cast<size_t>(val);
1863 25900 : const auto &entry = colorEntries[idx];
1864 25900 : if (entry.c4)
1865 : {
1866 : if constexpr (bQuadraticMean)
1867 : {
1868 800 : nTotalR += SQUARE<int>(entry.c1);
1869 800 : nTotalG += SQUARE<int>(entry.c2);
1870 800 : nTotalB += SQUARE<int>(entry.c3);
1871 800 : ++nCount;
1872 : }
1873 : else
1874 : {
1875 13328 : nTotalR += entry.c1;
1876 13328 : nTotalG += entry.c2;
1877 13328 : nTotalB += entry.c3;
1878 13328 : ++nCount;
1879 : }
1880 : }
1881 : }
1882 : }
1883 :
1884 6475 : if (nCount == 0 ||
1885 0 : (bPropagateNoData &&
1886 0 : nCount < static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
1887 0 : (nSrcXOff2 - nSrcXOff)))
1888 : {
1889 2838 : pDstScanline[iDstPixel] = tNoDataValue;
1890 : }
1891 : else
1892 : {
1893 : GDALColorEntry color;
1894 : if constexpr (bQuadraticMean)
1895 : {
1896 200 : color.c1 =
1897 200 : static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
1898 200 : color.c2 =
1899 200 : static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
1900 200 : color.c3 =
1901 200 : static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
1902 : }
1903 : else
1904 : {
1905 3437 : color.c1 =
1906 3437 : static_cast<short>((nTotalR + nCount / 2) / nCount);
1907 3437 : color.c2 =
1908 3437 : static_cast<short>((nTotalG + nCount / 2) / nCount);
1909 3437 : color.c3 =
1910 3437 : static_cast<short>((nTotalB + nCount / 2) / nCount);
1911 : }
1912 3637 : pDstScanline[iDstPixel] =
1913 3637 : static_cast<T>(BestColorEntry(colorEntries, color));
1914 : }
1915 : }
1916 : }
1917 : }
1918 :
1919 7347 : CPLFree(pasSrcX);
1920 :
1921 7347 : return CE_None;
1922 : }
1923 :
1924 : template <bool bQuadraticMean>
1925 : static CPLErr
1926 7347 : GDALResampleChunk_AverageOrRMSInternal(const GDALOverviewResampleArgs &args,
1927 : const void *pChunk, void **ppDstBuffer,
1928 : GDALDataType *peDstBufferDataType)
1929 : {
1930 7347 : *peDstBufferDataType = args.eWrkDataType;
1931 7347 : switch (args.eWrkDataType)
1932 : {
1933 7202 : case GDT_UInt8:
1934 : {
1935 : return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_UInt8,
1936 7202 : bQuadraticMean>(
1937 7202 : args, static_cast<const GByte *>(pChunk), ppDstBuffer);
1938 : }
1939 :
1940 11 : case GDT_UInt16:
1941 : {
1942 : if constexpr (bQuadraticMean)
1943 : {
1944 : // Use double as accumulation type, because UInt32 could overflow
1945 : return GDALResampleChunk_AverageOrRMS_T<
1946 6 : GUInt16, double, GDT_UInt16, bQuadraticMean>(
1947 6 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1948 : }
1949 : else
1950 : {
1951 : return GDALResampleChunk_AverageOrRMS_T<
1952 5 : GUInt16, GUInt32, GDT_UInt16, bQuadraticMean>(
1953 5 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1954 : }
1955 : }
1956 :
1957 81 : case GDT_Float32:
1958 : {
1959 : return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32,
1960 81 : bQuadraticMean>(
1961 81 : args, static_cast<const float *>(pChunk), ppDstBuffer);
1962 : }
1963 :
1964 53 : case GDT_Float64:
1965 : {
1966 : return GDALResampleChunk_AverageOrRMS_T<double, double, GDT_Float64,
1967 53 : bQuadraticMean>(
1968 53 : args, static_cast<const double *>(pChunk), ppDstBuffer);
1969 : }
1970 :
1971 0 : default:
1972 0 : break;
1973 : }
1974 :
1975 0 : CPLAssert(false);
1976 : return CE_Failure;
1977 : }
1978 :
1979 : static CPLErr
1980 7347 : GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
1981 : const void *pChunk, void **ppDstBuffer,
1982 : GDALDataType *peDstBufferDataType)
1983 : {
1984 7347 : if (EQUAL(args.pszResampling, "RMS"))
1985 83 : return GDALResampleChunk_AverageOrRMSInternal<true>(
1986 83 : args, pChunk, ppDstBuffer, peDstBufferDataType);
1987 : else
1988 7264 : return GDALResampleChunk_AverageOrRMSInternal<false>(
1989 7264 : args, pChunk, ppDstBuffer, peDstBufferDataType);
1990 : }
1991 :
1992 : /************************************************************************/
1993 : /* GDALResampleChunk_Gauss() */
1994 : /************************************************************************/
1995 :
1996 86 : static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
1997 : const void *pChunk, void **ppDstBuffer,
1998 : GDALDataType *peDstBufferDataType)
1999 :
2000 : {
2001 86 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2002 86 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2003 86 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2004 86 : const int nChunkXOff = args.nChunkXOff;
2005 86 : const int nChunkXSize = args.nChunkXSize;
2006 86 : const int nChunkYOff = args.nChunkYOff;
2007 86 : const int nChunkYSize = args.nChunkYSize;
2008 86 : const int nDstXOff = args.nDstXOff;
2009 86 : const int nDstXOff2 = args.nDstXOff2;
2010 86 : const int nDstYOff = args.nDstYOff;
2011 86 : const int nDstYOff2 = args.nDstYOff2;
2012 86 : const bool bHasNoData = args.bHasNoData;
2013 86 : double dfNoDataValue = args.dfNoDataValue;
2014 86 : const GDALColorTable *poColorTable = args.poColorTable;
2015 :
2016 86 : const double *const padfChunk = static_cast<const double *>(pChunk);
2017 :
2018 86 : *ppDstBuffer =
2019 86 : VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
2020 : GDALGetDataTypeSizeBytes(GDT_Float64));
2021 86 : if (*ppDstBuffer == nullptr)
2022 : {
2023 0 : return CE_Failure;
2024 : }
2025 86 : *peDstBufferDataType = GDT_Float64;
2026 86 : double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
2027 :
2028 : /* -------------------------------------------------------------------- */
2029 : /* Create the filter kernel and allocate scanline buffer. */
2030 : /* -------------------------------------------------------------------- */
2031 86 : int nGaussMatrixDim = 3;
2032 : const int *panGaussMatrix;
2033 86 : constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
2034 86 : constexpr int anGaussMatrix5x5[] = {1, 4, 6, 4, 1, 4, 16, 24, 16,
2035 : 4, 6, 24, 36, 24, 6, 4, 16, 24,
2036 : 16, 4, 1, 4, 6, 4, 1};
2037 86 : constexpr int anGaussMatrix7x7[] = {
2038 : 1, 6, 15, 20, 15, 6, 1, 6, 36, 90, 120, 90, 36,
2039 : 6, 15, 90, 225, 300, 225, 90, 15, 20, 120, 300, 400, 300,
2040 : 120, 20, 15, 90, 225, 300, 225, 90, 15, 6, 36, 90, 120,
2041 : 90, 36, 6, 1, 6, 15, 20, 15, 6, 1};
2042 :
2043 86 : const int nOXSize = args.nOvrXSize;
2044 86 : const int nOYSize = args.nOvrYSize;
2045 86 : const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
2046 :
2047 : // matrix for gauss filter
2048 86 : if (nResYFactor <= 2)
2049 : {
2050 85 : panGaussMatrix = anGaussMatrix3x3;
2051 85 : nGaussMatrixDim = 3;
2052 : }
2053 1 : else if (nResYFactor <= 4)
2054 : {
2055 0 : panGaussMatrix = anGaussMatrix5x5;
2056 0 : nGaussMatrixDim = 5;
2057 : }
2058 : else
2059 : {
2060 1 : panGaussMatrix = anGaussMatrix7x7;
2061 1 : nGaussMatrixDim = 7;
2062 : }
2063 :
2064 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
2065 : int *panGaussMatrixDup = static_cast<int *>(
2066 : CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
2067 : memcpy(panGaussMatrixDup, panGaussMatrix,
2068 : sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
2069 : panGaussMatrix = panGaussMatrixDup;
2070 : #endif
2071 :
2072 86 : if (!bHasNoData)
2073 79 : dfNoDataValue = 0.0;
2074 :
2075 86 : std::vector<GDALColorEntry> colorEntries;
2076 86 : int nTransparentIdx = -1;
2077 86 : if (poColorTable)
2078 2 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
2079 :
2080 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
2081 : // it as nodata value.
2082 92 : if (bHasNoData && dfNoDataValue >= 0.0 &&
2083 6 : dfNoDataValue < colorEntries.size())
2084 0 : colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
2085 :
2086 : // Or if we have no explicit nodata, but a color table entry that is
2087 : // transparent, consider it as the nodata value.
2088 86 : else if (!bHasNoData && nTransparentIdx >= 0)
2089 : {
2090 0 : dfNoDataValue = nTransparentIdx;
2091 : }
2092 :
2093 86 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2094 86 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2095 86 : const int nDstXWidth = nDstXOff2 - nDstXOff;
2096 :
2097 : /* ==================================================================== */
2098 : /* Loop over destination scanlines. */
2099 : /* ==================================================================== */
2100 16488 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2101 : {
2102 16402 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
2103 16402 : int nSrcYOff2 =
2104 16402 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
2105 :
2106 16402 : if (nSrcYOff < nChunkYOff)
2107 : {
2108 0 : nSrcYOff = nChunkYOff;
2109 0 : nSrcYOff2++;
2110 : }
2111 :
2112 16402 : const int iSizeY = nSrcYOff2 - nSrcYOff;
2113 16402 : nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
2114 16402 : nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
2115 :
2116 16402 : if (nSrcYOff2 > nChunkBottomYOff ||
2117 16359 : (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
2118 : {
2119 44 : nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
2120 : }
2121 :
2122 16402 : int nYShiftGaussMatrix = 0;
2123 16402 : if (nSrcYOff < nChunkYOff)
2124 : {
2125 0 : nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
2126 0 : nSrcYOff = nChunkYOff;
2127 : }
2128 :
2129 16402 : const double *const padfSrcScanline =
2130 16402 : padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
2131 16402 : const GByte *pabySrcScanlineNodataMask = nullptr;
2132 16402 : if (pabyChunkNodataMask != nullptr)
2133 152 : pabySrcScanlineNodataMask =
2134 152 : pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
2135 :
2136 : /* --------------------------------------------------------------------
2137 : */
2138 : /* Loop over destination pixels */
2139 : /* --------------------------------------------------------------------
2140 : */
2141 16402 : double *const padfDstScanline =
2142 16402 : padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
2143 4149980 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2144 : {
2145 4133580 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
2146 4133580 : int nSrcXOff2 =
2147 4133580 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
2148 :
2149 4133580 : if (nSrcXOff < nChunkXOff)
2150 : {
2151 0 : nSrcXOff = nChunkXOff;
2152 0 : nSrcXOff2++;
2153 : }
2154 :
2155 4133580 : const int iSizeX = nSrcXOff2 - nSrcXOff;
2156 4133580 : nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
2157 4133580 : nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
2158 :
2159 4133580 : if (nSrcXOff2 > nChunkRightXOff ||
2160 4127930 : (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
2161 : {
2162 5650 : nSrcXOff2 =
2163 5650 : std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
2164 : }
2165 :
2166 4133580 : int nXShiftGaussMatrix = 0;
2167 4133580 : if (nSrcXOff < nChunkXOff)
2168 : {
2169 0 : nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
2170 0 : nSrcXOff = nChunkXOff;
2171 : }
2172 :
2173 4133580 : if (poColorTable == nullptr)
2174 : {
2175 4133380 : double dfTotal = 0.0;
2176 4133380 : GInt64 nCount = 0;
2177 4133380 : const int *panLineWeight =
2178 4133380 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
2179 : nXShiftGaussMatrix;
2180 :
2181 16527900 : for (int iY = nSrcYOff; iY < nSrcYOff2;
2182 12394500 : ++iY, panLineWeight += nGaussMatrixDim)
2183 : {
2184 49561300 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2185 : {
2186 37166800 : const double val =
2187 37166800 : padfSrcScanline[iX - nChunkXOff +
2188 37166800 : static_cast<GPtrDiff_t>(iY -
2189 37166800 : nSrcYOff) *
2190 37166800 : nChunkXSize];
2191 37166800 : if (pabySrcScanlineNodataMask == nullptr ||
2192 32872 : pabySrcScanlineNodataMask[iX - nChunkXOff +
2193 32872 : static_cast<GPtrDiff_t>(
2194 32872 : iY - nSrcYOff) *
2195 32872 : nChunkXSize])
2196 : {
2197 37146100 : const int nWeight = panLineWeight[i];
2198 37146100 : dfTotal += val * nWeight;
2199 37146100 : nCount += nWeight;
2200 : }
2201 : }
2202 : }
2203 :
2204 4133380 : if (nCount == 0)
2205 : {
2206 2217 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2207 : }
2208 : else
2209 : {
2210 4131160 : padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
2211 : }
2212 : }
2213 : else
2214 : {
2215 200 : GInt64 nTotalR = 0;
2216 200 : GInt64 nTotalG = 0;
2217 200 : GInt64 nTotalB = 0;
2218 200 : GInt64 nTotalWeight = 0;
2219 200 : const int *panLineWeight =
2220 200 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
2221 : nXShiftGaussMatrix;
2222 :
2223 780 : for (int iY = nSrcYOff; iY < nSrcYOff2;
2224 580 : ++iY, panLineWeight += nGaussMatrixDim)
2225 : {
2226 2262 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2227 : {
2228 1682 : const double val =
2229 1682 : padfSrcScanline[iX - nChunkXOff +
2230 1682 : static_cast<GPtrDiff_t>(iY -
2231 1682 : nSrcYOff) *
2232 1682 : nChunkXSize];
2233 1682 : if (val < 0 || val >= colorEntries.size())
2234 0 : continue;
2235 :
2236 1682 : size_t idx = static_cast<size_t>(val);
2237 1682 : if (colorEntries[idx].c4)
2238 : {
2239 1682 : const int nWeight = panLineWeight[i];
2240 1682 : nTotalR +=
2241 1682 : static_cast<GInt64>(colorEntries[idx].c1) *
2242 1682 : nWeight;
2243 1682 : nTotalG +=
2244 1682 : static_cast<GInt64>(colorEntries[idx].c2) *
2245 1682 : nWeight;
2246 1682 : nTotalB +=
2247 1682 : static_cast<GInt64>(colorEntries[idx].c3) *
2248 1682 : nWeight;
2249 1682 : nTotalWeight += nWeight;
2250 : }
2251 : }
2252 : }
2253 :
2254 200 : if (nTotalWeight == 0)
2255 : {
2256 0 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2257 : }
2258 : else
2259 : {
2260 : GDALColorEntry color;
2261 :
2262 200 : color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
2263 : nTotalWeight);
2264 200 : color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
2265 : nTotalWeight);
2266 200 : color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
2267 : nTotalWeight);
2268 200 : padfDstScanline[iDstPixel - nDstXOff] =
2269 200 : BestColorEntry(colorEntries, color);
2270 : }
2271 : }
2272 : }
2273 : }
2274 :
2275 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
2276 : CPLFree(panGaussMatrixDup);
2277 : #endif
2278 :
2279 86 : return CE_None;
2280 : }
2281 :
2282 : /************************************************************************/
2283 : /* GDALResampleChunk_Mode() */
2284 : /************************************************************************/
2285 :
/** Generic sample comparison used by mode resampling: plain operator==. */
template <class T> static inline bool IsSame(T a, T b)
{
    const bool bIdentical = (a == b);
    return bIdentical;
}
2290 :
2291 60 : template <> bool IsSame<GFloat16>(GFloat16 a, GFloat16 b)
2292 : {
2293 60 : return a == b || (CPLIsNan(a) && CPLIsNan(b));
2294 : }
2295 :
2296 5583 : template <> bool IsSame<float>(float a, float b)
2297 : {
2298 5583 : return a == b || (std::isnan(a) && std::isnan(b));
2299 : }
2300 :
2301 1701 : template <> bool IsSame<double>(double a, double b)
2302 : {
2303 1701 : return a == b || (std::isnan(a) && std::isnan(b));
2304 : }
2305 :
2306 : namespace
2307 : {
2308 : struct ComplexFloat16
2309 : {
2310 : GFloat16 r;
2311 : GFloat16 i;
2312 : };
2313 : } // namespace
2314 :
2315 60 : template <> bool IsSame<ComplexFloat16>(ComplexFloat16 a, ComplexFloat16 b)
2316 : {
2317 90 : return (a.r == b.r && a.i == b.i) ||
2318 90 : (CPLIsNan(a.r) && CPLIsNan(a.i) && CPLIsNan(b.r) && CPLIsNan(b.i));
2319 : }
2320 :
2321 : template <>
2322 60 : bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
2323 : {
2324 120 : return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
2325 120 : std::isnan(b.real()) && std::isnan(b.imag()));
2326 : }
2327 :
2328 : template <>
2329 60 : bool IsSame<std::complex<double>>(std::complex<double> a,
2330 : std::complex<double> b)
2331 : {
2332 120 : return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
2333 120 : std::isnan(b.real()) && std::isnan(b.imag()));
2334 : }
2335 :
2336 : template <class T>
2337 182 : static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
2338 : const T *pChunk, T *const pDstBuffer)
2339 :
2340 : {
2341 182 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2342 182 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2343 182 : const double dfSrcXDelta = args.dfSrcXDelta;
2344 182 : const double dfSrcYDelta = args.dfSrcYDelta;
2345 182 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2346 182 : const int nChunkXOff = args.nChunkXOff;
2347 182 : const int nChunkXSize = args.nChunkXSize;
2348 182 : const int nChunkYOff = args.nChunkYOff;
2349 182 : const int nChunkYSize = args.nChunkYSize;
2350 182 : const int nDstXOff = args.nDstXOff;
2351 182 : const int nDstXOff2 = args.nDstXOff2;
2352 182 : const int nDstYOff = args.nDstYOff;
2353 182 : const int nDstYOff2 = args.nDstYOff2;
2354 182 : const bool bHasNoData = args.bHasNoData;
2355 182 : const GDALColorTable *poColorTable = args.poColorTable;
2356 182 : const int nDstXSize = nDstXOff2 - nDstXOff;
2357 :
2358 8 : T tNoDataValue;
2359 : if constexpr (std::is_same<T, ComplexFloat16>::value)
2360 : {
2361 4 : tNoDataValue.r = cpl::NumericLimits<GFloat16>::quiet_NaN();
2362 4 : tNoDataValue.i = cpl::NumericLimits<GFloat16>::quiet_NaN();
2363 : }
2364 : else if constexpr (std::is_same<T, std::complex<float>>::value ||
2365 : std::is_same<T, std::complex<double>>::value)
2366 : {
2367 : using BaseT = typename T::value_type;
2368 8 : tNoDataValue =
2369 : std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
2370 : std::numeric_limits<BaseT>::quiet_NaN());
2371 : }
2372 170 : else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
2373 169 : tNoDataValue = 0;
2374 : else
2375 1 : tNoDataValue = static_cast<T>(args.dfNoDataValue);
2376 :
2377 : using CountType = uint32_t;
2378 182 : CountType nMaxNumPx = 0;
2379 182 : T *paVals = nullptr;
2380 182 : CountType *panCounts = nullptr;
2381 :
2382 182 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2383 182 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2384 364 : std::vector<int> anVals(256, 0);
2385 :
2386 : /* ==================================================================== */
2387 : /* Loop over destination scanlines. */
2388 : /* ==================================================================== */
2389 7713 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2390 : {
2391 7531 : const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
2392 7531 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
2393 : #ifdef only_pixels_with_more_than_10_pct_participation
2394 : // When oversampling, don't take into account pixels that have a tiny
2395 : // participation in the resulting pixel
2396 : if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2397 : nSrcYOff < nChunkBottomYOff)
2398 : nSrcYOff++;
2399 : #endif
2400 7531 : if (nSrcYOff < nChunkYOff)
2401 0 : nSrcYOff = nChunkYOff;
2402 :
2403 7531 : const double dfSrcYOff2 =
2404 7531 : dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
2405 7531 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
2406 : #ifdef only_pixels_with_more_than_10_pct_participation
2407 : // When oversampling, don't take into account pixels that have a tiny
2408 : // participation in the resulting pixel
2409 : if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2410 : nSrcYOff2 > nChunkYOff)
2411 : nSrcYOff2--;
2412 : #endif
2413 7531 : if (nSrcYOff2 == nSrcYOff)
2414 0 : ++nSrcYOff2;
2415 7531 : if (nSrcYOff2 > nChunkBottomYOff)
2416 0 : nSrcYOff2 = nChunkBottomYOff;
2417 :
2418 7531 : const T *const paSrcScanline =
2419 281 : pChunk +
2420 7531 : (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
2421 7531 : const GByte *pabySrcScanlineNodataMask = nullptr;
2422 7531 : if (pabyChunkNodataMask != nullptr)
2423 1838 : pabySrcScanlineNodataMask =
2424 : pabyChunkNodataMask +
2425 1838 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
2426 :
2427 7531 : T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
2428 : /* --------------------------------------------------------------------
2429 : */
2430 : /* Loop over destination pixels */
2431 : /* --------------------------------------------------------------------
2432 : */
2433 4260596 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2434 : {
2435 4253061 : const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
2436 : // Apply some epsilon to avoid numerical precision issues
2437 4253061 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
2438 : #ifdef only_pixels_with_more_than_10_pct_participation
2439 : // When oversampling, don't take into account pixels that have a
2440 : // tiny participation in the resulting pixel
2441 : if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2442 : nSrcXOff < nChunkRightXOff)
2443 : nSrcXOff++;
2444 : #endif
2445 4253061 : if (nSrcXOff < nChunkXOff)
2446 0 : nSrcXOff = nChunkXOff;
2447 :
2448 4253061 : const double dfSrcXOff2 =
2449 4253061 : dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
2450 4253061 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
2451 : #ifdef only_pixels_with_more_than_10_pct_participation
2452 : // When oversampling, don't take into account pixels that have a
2453 : // tiny participation in the resulting pixel
2454 : if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2455 : nSrcXOff2 > nChunkXOff)
2456 : nSrcXOff2--;
2457 : #endif
2458 4253061 : if (nSrcXOff2 == nSrcXOff)
2459 0 : nSrcXOff2++;
2460 4253061 : if (nSrcXOff2 > nChunkRightXOff)
2461 0 : nSrcXOff2 = nChunkRightXOff;
2462 :
2463 4253061 : bool bRegularProcessing = false;
2464 : if constexpr (!std::is_same<T, GByte>::value)
2465 1671 : bRegularProcessing = true;
2466 4251390 : else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
2467 0 : bRegularProcessing = true;
2468 :
2469 4253061 : if (bRegularProcessing)
2470 : {
2471 : // Sanity check to make sure the allocation of paVals and
2472 : // panCounts don't overflow.
2473 : static_assert(sizeof(CountType) <= sizeof(size_t));
2474 3342 : if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
2475 1671 : static_cast<CountType>(nSrcYOff2 - nSrcYOff) >
2476 1671 : (std::numeric_limits<CountType>::max() /
2477 3342 : std::max(sizeof(T), sizeof(CountType))) /
2478 1671 : static_cast<CountType>(nSrcXOff2 - nSrcXOff))
2479 : {
2480 0 : CPLError(CE_Failure, CPLE_NotSupported,
2481 : "Too big downsampling factor");
2482 0 : CPLFree(paVals);
2483 0 : CPLFree(panCounts);
2484 0 : return CE_Failure;
2485 : }
2486 1671 : const CountType nNumPx =
2487 1671 : static_cast<CountType>(nSrcYOff2 - nSrcYOff) *
2488 1671 : (nSrcXOff2 - nSrcXOff);
2489 1671 : CountType iMaxInd = 0;
2490 1671 : CountType iMaxVal = 0;
2491 :
2492 1671 : if (paVals == nullptr || nNumPx > nMaxNumPx)
2493 : {
2494 : T *paValsNew = static_cast<T *>(
2495 116 : VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
2496 : CountType *panCountsNew =
2497 116 : static_cast<CountType *>(VSI_REALLOC_VERBOSE(
2498 : panCounts, nNumPx * sizeof(CountType)));
2499 116 : if (paValsNew != nullptr)
2500 116 : paVals = paValsNew;
2501 116 : if (panCountsNew != nullptr)
2502 116 : panCounts = panCountsNew;
2503 116 : if (paValsNew == nullptr || panCountsNew == nullptr)
2504 : {
2505 0 : CPLFree(paVals);
2506 0 : CPLFree(panCounts);
2507 0 : return CE_Failure;
2508 : }
2509 116 : nMaxNumPx = nNumPx;
2510 : }
2511 :
2512 5245 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2513 : {
2514 3574 : const GPtrDiff_t iTotYOff =
2515 3574 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2516 3574 : nChunkXOff;
2517 11842 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2518 : {
2519 8268 : if (pabySrcScanlineNodataMask == nullptr ||
2520 1552 : pabySrcScanlineNodataMask[iX + iTotYOff])
2521 : {
2522 8247 : const T val = paSrcScanline[iX + iTotYOff];
2523 8247 : CountType i = 0; // Used after for.
2524 :
2525 : // Check array for existing entry.
2526 11611 : for (; i < iMaxInd; ++i)
2527 : {
2528 8212 : if (IsSame(paVals[i], val))
2529 : {
2530 4848 : if (++panCounts[i] > panCounts[iMaxVal])
2531 : {
2532 246 : iMaxVal = i;
2533 : }
2534 4848 : break;
2535 : }
2536 : }
2537 :
2538 : // Add to arr if entry not already there.
2539 8247 : if (i == iMaxInd)
2540 : {
2541 3399 : paVals[iMaxInd] = val;
2542 3399 : panCounts[iMaxInd] = 1;
2543 :
2544 3399 : if (iMaxInd == 0)
2545 : {
2546 1668 : iMaxVal = iMaxInd;
2547 : }
2548 :
2549 3399 : ++iMaxInd;
2550 : }
2551 : }
2552 : }
2553 : }
2554 :
2555 1671 : if (iMaxInd == 0)
2556 3 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2557 : else
2558 1668 : paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
2559 : }
2560 : else if constexpr (std::is_same<T, GByte>::value)
2561 : // ( eSrcDataType == GDT_UInt8 && nEntryCount < 256 )
2562 : {
2563 : // So we go here for a paletted or non-paletted byte band.
2564 : // The input values are then between 0 and 255.
2565 4251390 : int nMaxVal = 0;
2566 4251390 : int iMaxInd = -1;
2567 :
2568 : // The cost of this zeroing might be high. Perhaps we should
2569 : // just use the above generic case, and go to this one if the
2570 : // number of source pixels is large enough
2571 4251390 : std::fill(anVals.begin(), anVals.end(), 0);
2572 :
2573 12777800 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2574 : {
2575 8526440 : const GPtrDiff_t iTotYOff =
2576 8526440 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2577 8526440 : nChunkXOff;
2578 25649600 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2579 : {
2580 17123100 : const T val = paSrcScanline[iX + iTotYOff];
2581 17123100 : if (!bHasNoData || val != tNoDataValue)
2582 : {
2583 17123100 : int nVal = static_cast<int>(val);
2584 17123100 : if (++anVals[nVal] > nMaxVal)
2585 : {
2586 : // Sum the density.
2587 : // Is it the most common value so far?
2588 17006400 : iMaxInd = nVal;
2589 17006400 : nMaxVal = anVals[nVal];
2590 : }
2591 : }
2592 : }
2593 : }
2594 :
2595 4251390 : if (iMaxInd == -1)
2596 0 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2597 : else
2598 4251390 : paDstScanline[iDstPixel - nDstXOff] =
2599 : static_cast<T>(iMaxInd);
2600 : }
2601 : }
2602 : }
2603 :
2604 182 : CPLFree(paVals);
2605 182 : CPLFree(panCounts);
2606 :
2607 182 : return CE_None;
2608 : }
2609 :
2610 182 : static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
2611 : const void *pChunk, void **ppDstBuffer,
2612 : GDALDataType *peDstBufferDataType)
2613 : {
2614 182 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
2615 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
2616 : GDALGetDataTypeSizeBytes(args.eWrkDataType));
2617 182 : if (*ppDstBuffer == nullptr)
2618 : {
2619 0 : return CE_Failure;
2620 : }
2621 :
2622 182 : CPLAssert(args.eSrcDataType == args.eWrkDataType);
2623 :
2624 182 : *peDstBufferDataType = args.eWrkDataType;
2625 182 : switch (args.eWrkDataType)
2626 : {
2627 : // For mode resampling, as no computation is done, only the
2628 : // size of the data type matters... except for Byte where we have
2629 : // special processing. And for floating point values
2630 66 : case GDT_UInt8:
2631 : {
2632 66 : return GDALResampleChunk_ModeT(args,
2633 : static_cast<const GByte *>(pChunk),
2634 66 : static_cast<GByte *>(*ppDstBuffer));
2635 : }
2636 :
2637 4 : case GDT_Int8:
2638 : {
2639 4 : return GDALResampleChunk_ModeT(args,
2640 : static_cast<const int8_t *>(pChunk),
2641 4 : static_cast<int8_t *>(*ppDstBuffer));
2642 : }
2643 :
2644 10 : case GDT_Int16:
2645 : case GDT_UInt16:
2646 : {
2647 10 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
2648 10 : return GDALResampleChunk_ModeT(
2649 : args, static_cast<const uint16_t *>(pChunk),
2650 10 : static_cast<uint16_t *>(*ppDstBuffer));
2651 : }
2652 :
2653 15 : case GDT_CInt16:
2654 : case GDT_Int32:
2655 : case GDT_UInt32:
2656 : {
2657 15 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
2658 15 : return GDALResampleChunk_ModeT(
2659 : args, static_cast<const uint32_t *>(pChunk),
2660 15 : static_cast<uint32_t *>(*ppDstBuffer));
2661 : }
2662 :
2663 12 : case GDT_CInt32:
2664 : case GDT_Int64:
2665 : case GDT_UInt64:
2666 : {
2667 12 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
2668 12 : return GDALResampleChunk_ModeT(
2669 : args, static_cast<const uint64_t *>(pChunk),
2670 12 : static_cast<uint64_t *>(*ppDstBuffer));
2671 : }
2672 :
2673 4 : case GDT_Float16:
2674 : {
2675 4 : return GDALResampleChunk_ModeT(
2676 : args, static_cast<const GFloat16 *>(pChunk),
2677 4 : static_cast<GFloat16 *>(*ppDstBuffer));
2678 : }
2679 :
2680 35 : case GDT_Float32:
2681 : {
2682 35 : return GDALResampleChunk_ModeT(args,
2683 : static_cast<const float *>(pChunk),
2684 35 : static_cast<float *>(*ppDstBuffer));
2685 : }
2686 :
2687 24 : case GDT_Float64:
2688 : {
2689 24 : return GDALResampleChunk_ModeT(args,
2690 : static_cast<const double *>(pChunk),
2691 24 : static_cast<double *>(*ppDstBuffer));
2692 : }
2693 :
2694 4 : case GDT_CFloat16:
2695 : {
2696 4 : return GDALResampleChunk_ModeT(
2697 : args, static_cast<const ComplexFloat16 *>(pChunk),
2698 4 : static_cast<ComplexFloat16 *>(*ppDstBuffer));
2699 : }
2700 :
2701 4 : case GDT_CFloat32:
2702 : {
2703 4 : return GDALResampleChunk_ModeT(
2704 : args, static_cast<const std::complex<float> *>(pChunk),
2705 4 : static_cast<std::complex<float> *>(*ppDstBuffer));
2706 : }
2707 :
2708 4 : case GDT_CFloat64:
2709 : {
2710 4 : return GDALResampleChunk_ModeT(
2711 : args, static_cast<const std::complex<double> *>(pChunk),
2712 4 : static_cast<std::complex<double> *>(*ppDstBuffer));
2713 : }
2714 :
2715 0 : case GDT_Unknown:
2716 : case GDT_TypeCount:
2717 0 : break;
2718 : }
2719 :
2720 0 : CPLAssert(false);
2721 : return CE_Failure;
2722 : }
2723 :
2724 : /************************************************************************/
2725 : /* GDALResampleConvolutionHorizontal() */
2726 : /************************************************************************/
2727 :
/** Weighted sum of consecutive samples.
 *
 * Computes sum(pChunk[k] * padfWeights[k]) for k in [0, nSrcPixelCount),
 * each sample being converted to double before the multiplication.
 *
 * @param pChunk          First sample.
 * @param padfWeights     One weight per sample.
 * @param nSrcPixelCount  Number of samples / weights to combine.
 * @return the weighted sum (0 when nSrcPixelCount <= 0).
 */
template <class T>
static inline double
GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                  int nSrcPixelCount)
{
    // Two independent partial sums: within each group of 4 samples the first
    // pair goes to dfSumA and the second pair to dfSumB. Leftover samples go
    // to dfSumA.
    double dfSumA = 0.0;
    double dfSumB = 0.0;
    int iPixel = 0;  // Shared by both loops.
    // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    // manually (untypical) unrolled loop in -O2 and -O3:
    // https://github.com/OSGeo/gdal/issues/9508
#if !defined(__INTEL_CLANG_COMPILER)
    while (iPixel < nSrcPixelCount - 3)
    {
        const T *const pGroup = pChunk + iPixel;
        const double *const padfGroupWeights = padfWeights + iPixel;
        dfSumA += double(pGroup[0]) * padfGroupWeights[0];
        dfSumA += double(pGroup[1]) * padfGroupWeights[1];
        dfSumB += double(pGroup[2]) * padfGroupWeights[2];
        dfSumB += double(pGroup[3]) * padfGroupWeights[3];
        iPixel += 4;
    }
#endif
    while (iPixel < nSrcPixelCount)
    {
        dfSumA += double(pChunk[iPixel]) * padfWeights[iPixel];
        ++iPixel;
    }
    return dfSumA + dfSumB;
}
2754 :
/** Masked weighted sum of consecutive samples.
 *
 * For k in [0, nSrcPixelCount) this accumulates:
 *  - dfWeightValMaskSum: pChunk[k] * padfWeights[k] * pabyMask[k]
 *  - dfWeightMaskSum:    padfWeights[k] * pabyMask[k]
 *  - dfWeightSum:        padfWeights[k]
 *
 * When bHasNaN is true, a NaN sample is handled as if its mask value was 0:
 * it contributes nothing to dfWeightValMaskSum and dfWeightMaskSum. Its raw
 * weight is still added to dfWeightSum, whatever the position of the sample
 * (the remainder loop used to skip it, unlike the unrolled loop).
 *
 * @param pChunk              First sample.
 * @param pabyMask            One mask value per sample, used as a multiplier.
 * @param padfWeights         One weight per sample.
 * @param nSrcPixelCount      Number of samples to combine.
 * @param dfWeightValMaskSum  Output (overwritten), see above.
 * @param dfWeightMaskSum     Output (overwritten), see above.
 * @param dfWeightSum         Output (overwritten), see above.
 */
template <class T, bool bHasNaN>
static inline void GDALResampleConvolutionHorizontalWithMask(
    const T *pChunk, const GByte *pabyMask, const double *padfWeights,
    int nSrcPixelCount, double &dfWeightValMaskSum, double &dfWeightMaskSum,
    double &dfWeightSum)
{
    dfWeightValMaskSum = 0;
    dfWeightMaskSum = 0;
    dfWeightSum = 0;

    // Adds v * w to val, unless v is NaN (only checked when bHasNaN), in
    // which case the masked weight w is zeroed so that the sample is also
    // left out of the masked weight sum.
    const auto MulNaNAware = [](double v, double &w, double &val)
    {
        if constexpr (bHasNaN)
        {
            if (std::isnan(v))
            {
                w = 0;
                return;
            }
        }
        val += v * w;
    };

    int i = 0;
    for (; i < nSrcPixelCount - 3; i += 4)
    {
        double dfWeightMask0 = padfWeights[i + 0] * pabyMask[i + 0];
        double dfWeightMask1 = padfWeights[i + 1] * pabyMask[i + 1];
        double dfWeightMask2 = padfWeights[i + 2] * pabyMask[i + 2];
        double dfWeightMask3 = padfWeights[i + 3] * pabyMask[i + 3];

        MulNaNAware(double(pChunk[i + 0]), dfWeightMask0, dfWeightValMaskSum);
        MulNaNAware(double(pChunk[i + 1]), dfWeightMask1, dfWeightValMaskSum);
        MulNaNAware(double(pChunk[i + 2]), dfWeightMask2, dfWeightValMaskSum);
        MulNaNAware(double(pChunk[i + 3]), dfWeightMask3, dfWeightValMaskSum);
        dfWeightMaskSum +=
            dfWeightMask0 + dfWeightMask1 + dfWeightMask2 + dfWeightMask3;
        dfWeightSum += padfWeights[i + 0] + padfWeights[i + 1] +
                       padfWeights[i + 2] + padfWeights[i + 3];
    }
    for (; i < nSrcPixelCount; ++i)
    {
        double dfWeightMask = padfWeights[i] * pabyMask[i];
        MulNaNAware(double(pChunk[i]), dfWeightMask, dfWeightValMaskSum);
        dfWeightMaskSum += dfWeightMask;
        // The raw weight always counts, as in the unrolled loop above.
        dfWeightSum += padfWeights[i];
    }
}
2814 :
/** Weighted sum of consecutive samples, for 3 rows sharing the same weights.
 *
 * For each row r, dfRes<r> receives sum(pChunkRow<r>[k] * padfWeights[k])
 * for k in [0, nSrcPixelCount). When bHasNaN is true, NaN samples contribute
 * 0 to the sum.
 *
 * @param pChunkRow1      First sample of the first row.
 * @param pChunkRow2      First sample of the second row.
 * @param pChunkRow3      First sample of the third row.
 * @param padfWeights     One weight per sample.
 * @param nSrcPixelCount  Number of samples to combine per row.
 * @param dfRes1          Output (overwritten): result for the first row.
 * @param dfRes2          Output (overwritten): result for the second row.
 * @param dfRes3          Output (overwritten): result for the third row.
 */
template <class T, bool bHasNaN>
static inline void GDALResampleConvolutionHorizontal_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    // Contribution of a single sample.
    const auto WeightedTerm = [](double dfSample, double dfWeight)
    {
        if constexpr (bHasNaN)
        {
            if (std::isnan(dfSample))
                return 0.0;
        }
        return dfSample * dfWeight;
    };

    // Accumulates 4 consecutive samples of one row: the first pair into
    // dfSumA, the second pair into dfSumB.
    const auto AddGroupOf4 = [&WeightedTerm, padfWeights](
                                 const T *pRow, int iFirst, double &dfSumA,
                                 double &dfSumB)
    {
        dfSumA += WeightedTerm(double(pRow[iFirst + 0]), padfWeights[iFirst + 0]);
        dfSumA += WeightedTerm(double(pRow[iFirst + 1]), padfWeights[iFirst + 1]);
        dfSumB += WeightedTerm(double(pRow[iFirst + 2]), padfWeights[iFirst + 2]);
        dfSumB += WeightedTerm(double(pRow[iFirst + 3]), padfWeights[iFirst + 3]);
    };

    // Two partial sums per row.
    double dfRow1A = 0.0, dfRow1B = 0.0;
    double dfRow2A = 0.0, dfRow2B = 0.0;
    double dfRow3A = 0.0, dfRow3B = 0.0;

    int iPixel = 0;  // Shared by both loops.
    while (iPixel < nSrcPixelCount - 3)
    {
        AddGroupOf4(pChunkRow1, iPixel, dfRow1A, dfRow1B);
        AddGroupOf4(pChunkRow2, iPixel, dfRow2A, dfRow2B);
        AddGroupOf4(pChunkRow3, iPixel, dfRow3A, dfRow3B);
        iPixel += 4;
    }
    while (iPixel < nSrcPixelCount)
    {
        const double dfWeight = padfWeights[iPixel];
        dfRow1A += WeightedTerm(double(pChunkRow1[iPixel]), dfWeight);
        dfRow2A += WeightedTerm(double(pChunkRow2[iPixel]), dfWeight);
        dfRow3A += WeightedTerm(double(pChunkRow3[iPixel]), dfWeight);
        ++iPixel;
    }

    dfRes1 = dfRow1A + dfRow1B;
    dfRes2 = dfRow2A + dfRow2B;
    dfRes3 = dfRow3A + dfRow3B;
}
2864 :
2865 : template <class T, bool bHasNaN>
2866 18980 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
2867 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2868 : const double *padfWeights, int nSrcPixelCount, double &dfRes1,
2869 : double &dfRes2, double &dfRes3)
2870 : {
2871 18980 : GDALResampleConvolutionHorizontal_3rows<T, bHasNaN>(
2872 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeights, nSrcPixelCount, dfRes1,
2873 : dfRes2, dfRes3);
2874 18980 : }
2875 :
2876 : template <class T, bool bHasNaN>
2877 1256690 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
2878 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2879 : const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
2880 : {
2881 1256690 : GDALResampleConvolutionHorizontal_3rows<T, bHasNaN>(
2882 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeights, 4, dfRes1, dfRes2,
2883 : dfRes3);
2884 1256690 : }
2885 :
2886 : /************************************************************************/
2887 : /* GDALResampleConvolutionVertical() */
2888 : /************************************************************************/
2889 :
/** Weighted sum of samples taken nStride elements apart.
 *
 * Computes sum(pChunk[k * nStride] * padfWeights[k]) for k in
 * [0, nSrcLineCount).
 *
 * @param pChunk         First sample.
 * @param nStride        Distance, in elements, between two successive samples.
 * @param padfWeights    One weight per sample.
 * @param nSrcLineCount  Number of samples to combine.
 * @return the weighted sum (0 when nSrcLineCount <= 0).
 */
template <class T>
static inline double
GDALResampleConvolutionVertical(const T *pChunk, size_t nStride,
                                const double *padfWeights, int nSrcLineCount)
{
    // Within each group of 4 lines, the first pair is summed into dfSumA and
    // the second pair into dfSumB. Leftover lines go to dfSumA.
    double dfSumA = 0.0;
    double dfSumB = 0.0;
    int iLine = 0;
    size_t nOffset = 0;  // Always iLine * nStride.
    while (iLine < nSrcLineCount - 3)
    {
        dfSumA += pChunk[nOffset] * padfWeights[iLine];
        dfSumA += pChunk[nOffset + nStride] * padfWeights[iLine + 1];
        dfSumB += pChunk[nOffset + 2 * nStride] * padfWeights[iLine + 2];
        dfSumB += pChunk[nOffset + 3 * nStride] * padfWeights[iLine + 3];
        iLine += 4;
        nOffset += 4 * nStride;
    }
    while (iLine < nSrcLineCount)
    {
        dfSumA += pChunk[nOffset] * padfWeights[iLine];
        ++iLine;
        nOffset += nStride;
    }
    return dfSumA + dfSumB;
}
2912 :
/** Same as GDALResampleConvolutionVertical(), for 2 adjacent columns.
 *
 * dfRes1 receives sum(pChunk[k * nStride] * padfWeights[k]) and dfRes2
 * receives sum(pChunk[k * nStride + 1] * padfWeights[k]), for k in
 * [0, nSrcLineCount).
 *
 * @param pChunk         First sample of the first column.
 * @param nStride        Distance, in elements, between two successive lines.
 * @param padfWeights    One weight per line.
 * @param nSrcLineCount  Number of lines to combine.
 * @param dfRes1         Output (overwritten): result for the first column.
 * @param dfRes2         Output (overwritten): result for the second column.
 */
template <class T>
static inline void GDALResampleConvolutionVertical_2cols(
    const T *pChunk, size_t nStride, const double *padfWeights,
    int nSrcLineCount, double &dfRes1, double &dfRes2)
{
    // Per column: lines 0-1 of each group of 4 go to the "A" sum, lines 2-3
    // to the "B" sum. Leftover lines go to the "A" sum.
    double dfCol0A = 0.0, dfCol0B = 0.0;
    double dfCol1A = 0.0, dfCol1B = 0.0;
    int iLine = 0;
    size_t nOffset = 0;  // Always iLine * nStride.
    while (iLine < nSrcLineCount - 3)
    {
        const size_t nLine0 = nOffset;
        const size_t nLine1 = nOffset + nStride;
        const size_t nLine2 = nOffset + 2 * nStride;
        const size_t nLine3 = nOffset + 3 * nStride;

        dfCol0A += pChunk[nLine0] * padfWeights[iLine + 0];
        dfCol1A += pChunk[nLine0 + 1] * padfWeights[iLine + 0];
        dfCol0A += pChunk[nLine1] * padfWeights[iLine + 1];
        dfCol1A += pChunk[nLine1 + 1] * padfWeights[iLine + 1];
        dfCol0B += pChunk[nLine2] * padfWeights[iLine + 2];
        dfCol1B += pChunk[nLine2 + 1] * padfWeights[iLine + 2];
        dfCol0B += pChunk[nLine3] * padfWeights[iLine + 3];
        dfCol1B += pChunk[nLine3 + 1] * padfWeights[iLine + 3];

        iLine += 4;
        nOffset += 4 * nStride;
    }
    while (iLine < nSrcLineCount)
    {
        dfCol0A += pChunk[nOffset] * padfWeights[iLine];
        dfCol1A += pChunk[nOffset + 1] * padfWeights[iLine];
        ++iLine;
        nOffset += nStride;
    }
    dfRes1 = dfCol0A + dfCol0B;
    dfRes2 = dfCol1A + dfCol1B;
}
2943 :
2944 : #ifdef USE_SSE2
2945 :
2946 : #ifdef __AVX__
2947 : /************************************************************************/
2948 : /* GDALResampleConvolutionVertical_16cols<T> */
2949 : /************************************************************************/
2950 :
2951 : template <class T>
2952 : static inline void
2953 : GDALResampleConvolutionVertical_16cols(const T *pChunk, size_t nStride,
2954 : const double *padfWeights,
2955 : int nSrcLineCount, float *afDest)
2956 : {
2957 : int i = 0;
2958 : size_t j = 0;
2959 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2960 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2961 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2962 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2963 : for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
2964 : {
2965 : XMMReg4Double w0 =
2966 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2967 : XMMReg4Double w1 =
2968 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2969 : XMMReg4Double w2 =
2970 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2971 : XMMReg4Double w3 =
2972 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2973 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2974 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2975 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
2976 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
2977 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2978 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2979 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
2980 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
2981 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2982 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2983 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
2984 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
2985 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2986 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2987 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
2988 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
2989 : }
2990 : for (; i < nSrcLineCount; ++i, j += nStride)
2991 : {
2992 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2993 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2994 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2995 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
2996 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
2997 : }
2998 : v_acc0.Store4Val(afDest);
2999 : v_acc1.Store4Val(afDest + 4);
3000 : v_acc2.Store4Val(afDest + 8);
3001 : v_acc3.Store4Val(afDest + 12);
3002 : }
3003 :
3004 : template <class T>
3005 : static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
3006 : const double *, int,
3007 : double *)
3008 : {
3009 : // Cannot be reached
3010 : CPLAssert(false);
3011 : }
3012 :
3013 : #else
3014 :
3015 : /************************************************************************/
3016 : /* GDALResampleConvolutionVertical_8cols<T> */
3017 : /************************************************************************/
3018 :
3019 : template <class T>
3020 : static inline void
3021 25804000 : GDALResampleConvolutionVertical_8cols(const T *pChunk, size_t nStride,
3022 : const double *padfWeights,
3023 : int nSrcLineCount, float *afDest)
3024 : {
3025 25804000 : int i = 0;
3026 25804000 : size_t j = 0;
3027 25804000 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
3028 25804000 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3029 53883400 : for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
3030 : {
3031 28079400 : XMMReg4Double w0 =
3032 28079400 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
3033 28079400 : XMMReg4Double w1 =
3034 28079400 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
3035 28079400 : XMMReg4Double w2 =
3036 28079400 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
3037 28079400 : XMMReg4Double w3 =
3038 28079400 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
3039 28079400 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
3040 28079400 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
3041 28079400 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
3042 28079400 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
3043 28079400 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
3044 28079400 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
3045 28079400 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
3046 28079400 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
3047 : }
3048 37376100 : for (; i < nSrcLineCount; ++i, j += nStride)
3049 : {
3050 11572100 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
3051 11572100 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
3052 11572100 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
3053 : }
3054 25804000 : v_acc0.Store4Val(afDest);
3055 25804000 : v_acc1.Store4Val(afDest + 4);
3056 25804000 : }
3057 :
3058 : template <class T>
3059 : static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
3060 : const double *, int,
3061 : double *)
3062 : {
3063 : // Cannot be reached
3064 : CPLAssert(false);
3065 : }
3066 :
3067 : #endif // __AVX__
3068 :
3069 : /************************************************************************/
3070 : /* GDALResampleConvolutionHorizontalSSE2<T> */
3071 : /************************************************************************/
3072 :
3073 : template <class T>
3074 3375702 : static inline double GDALResampleConvolutionHorizontalSSE2(
3075 : const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3076 : {
3077 3375702 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3078 3375702 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3079 3375702 : int i = 0; // Used after for.
3080 3754648 : for (; i < nSrcPixelCount - 7; i += 8)
3081 : {
3082 : // Retrieve the pixel & accumulate
3083 378952 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
3084 378952 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
3085 378952 : const XMMReg4Double v_weight1 =
3086 378952 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3087 378952 : const XMMReg4Double v_weight2 =
3088 378952 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
3089 :
3090 378952 : v_acc1 += v_pixels1 * v_weight1;
3091 378952 : v_acc2 += v_pixels2 * v_weight2;
3092 : }
3093 :
3094 3375702 : v_acc1 += v_acc2;
3095 :
3096 3375702 : double dfVal = v_acc1.GetHorizSum();
3097 11491480 : for (; i < nSrcPixelCount; ++i)
3098 : {
3099 8115780 : dfVal += pChunk[i] * padfWeightsAligned[i];
3100 : }
3101 3375702 : return dfVal;
3102 : }
3103 :
3104 : /************************************************************************/
3105 : /* GDALResampleConvolutionHorizontal<GByte> */
3106 : /************************************************************************/
3107 :
3108 : template <>
3109 2826540 : inline double GDALResampleConvolutionHorizontal<GByte>(
3110 : const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3111 : {
3112 2826540 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
3113 2826540 : nSrcPixelCount);
3114 : }
3115 :
3116 : template <>
3117 549162 : inline double GDALResampleConvolutionHorizontal<GUInt16>(
3118 : const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3119 : {
3120 549162 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
3121 549162 : nSrcPixelCount);
3122 : }
3123 :
3124 : /************************************************************************/
3125 : /* GDALResampleConvolutionHorizontalWithMaskSSE2<T> */
3126 : /************************************************************************/
3127 :
3128 : template <class T>
3129 10626463 : static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
3130 : const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
3131 : int nSrcPixelCount, double &dfWeightValMaskSum, double &dfWeightMaskSum,
3132 : double &dfWeightSum)
3133 : {
3134 10626463 : int i = 0; // Used after for.
3135 10626463 : XMMReg4Double v_acc_val_mask_weight = XMMReg4Double::Zero();
3136 10626463 : XMMReg4Double v_acc_mask_weight = XMMReg4Double::Zero();
3137 10626463 : XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
3138 26199121 : for (; i < nSrcPixelCount - 3; i += 4)
3139 : {
3140 15572658 : const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
3141 15572658 : const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
3142 15572658 : XMMReg4Double v_weight =
3143 15572658 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3144 15572658 : v_acc_weight += v_weight;
3145 15572658 : v_weight *= v_mask;
3146 15572658 : v_acc_val_mask_weight += v_pixels * v_weight;
3147 15572658 : v_acc_mask_weight += v_weight;
3148 : }
3149 :
3150 10626463 : dfWeightValMaskSum = v_acc_val_mask_weight.GetHorizSum();
3151 10626463 : dfWeightMaskSum = v_acc_mask_weight.GetHorizSum();
3152 10626463 : dfWeightSum = v_acc_weight.GetHorizSum();
3153 10910963 : for (; i < nSrcPixelCount; ++i)
3154 : {
3155 284454 : const double dfWeight = padfWeightsAligned[i];
3156 284454 : const double dfWeightMask = dfWeight * pabyMask[i];
3157 284454 : dfWeightValMaskSum += pChunk[i] * dfWeightMask;
3158 284454 : dfWeightMaskSum += dfWeightMask;
3159 284454 : dfWeightSum += dfWeight;
3160 : }
3161 10626463 : }
3162 :
3163 : /************************************************************************/
3164 : /* GDALResampleConvolutionHorizontalWithMask<GByte> */
3165 : /************************************************************************/
3166 :
3167 : template <>
3168 10626400 : inline void GDALResampleConvolutionHorizontalWithMask<GByte, false>(
3169 : const GByte *pChunk, const GByte *pabyMask,
3170 : const double *padfWeightsAligned, int nSrcPixelCount,
3171 : double &dfWeightValMaskSum, double &dfWeightMaskSum, double &dfWeightSum)
3172 : {
3173 10626400 : GDALResampleConvolutionHorizontalWithMaskSSE2(
3174 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount,
3175 : dfWeightValMaskSum, dfWeightMaskSum, dfWeightSum);
3176 10626400 : }
3177 :
3178 : template <>
3179 63 : inline void GDALResampleConvolutionHorizontalWithMask<GUInt16, false>(
3180 : const GUInt16 *pChunk, const GByte *pabyMask,
3181 : const double *padfWeightsAligned, int nSrcPixelCount,
3182 : double &dfWeightValMaskSum, double &dfWeightMaskSum, double &dfWeightSum)
3183 : {
3184 63 : GDALResampleConvolutionHorizontalWithMaskSSE2(
3185 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount,
3186 : dfWeightValMaskSum, dfWeightMaskSum, dfWeightSum);
3187 63 : }
3188 :
3189 : /************************************************************************/
3190 : /* GDALResampleConvolutionHorizontal_3rows_SSE2<T> */
3191 : /************************************************************************/
3192 :
3193 : template <class T>
3194 35560186 : static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
3195 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3196 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3197 : double &dfRes2, double &dfRes3)
3198 : {
3199 35560186 : XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
3200 35560186 : v_acc2 = XMMReg4Double::Zero(),
3201 35560186 : v_acc3 = XMMReg4Double::Zero();
3202 35560186 : int i = 0;
3203 70929556 : for (; i < nSrcPixelCount - 7; i += 8)
3204 : {
3205 : // Retrieve the pixel & accumulate.
3206 35369370 : XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3207 35369370 : XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
3208 35369370 : const XMMReg4Double v_weight1 =
3209 35369370 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3210 35369370 : const XMMReg4Double v_weight2 =
3211 35369370 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
3212 :
3213 35369370 : v_acc1 += v_pixels1 * v_weight1;
3214 35369370 : v_acc1 += v_pixels2 * v_weight2;
3215 :
3216 35369370 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3217 35369370 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
3218 35369370 : v_acc2 += v_pixels1 * v_weight1;
3219 35369370 : v_acc2 += v_pixels2 * v_weight2;
3220 :
3221 35369370 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3222 35369370 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
3223 35369370 : v_acc3 += v_pixels1 * v_weight1;
3224 35369370 : v_acc3 += v_pixels2 * v_weight2;
3225 : }
3226 :
3227 35560186 : dfRes1 = v_acc1.GetHorizSum();
3228 35560186 : dfRes2 = v_acc2.GetHorizSum();
3229 35560186 : dfRes3 = v_acc3.GetHorizSum();
3230 47825952 : for (; i < nSrcPixelCount; ++i)
3231 : {
3232 12265766 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3233 12265766 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3234 12265766 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3235 : }
3236 35560186 : }
3237 :
3238 : /************************************************************************/
3239 : /* GDALResampleConvolutionHorizontal_3rows<GByte> */
3240 : /************************************************************************/
3241 :
3242 : template <>
3243 35560100 : inline void GDALResampleConvolutionHorizontal_3rows<GByte, false>(
3244 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3245 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3246 : double &dfRes2, double &dfRes3)
3247 : {
3248 35560100 : GDALResampleConvolutionHorizontal_3rows_SSE2(
3249 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3250 : dfRes1, dfRes2, dfRes3);
3251 35560100 : }
3252 :
3253 : template <>
3254 86 : inline void GDALResampleConvolutionHorizontal_3rows<GUInt16, false>(
3255 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3256 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3257 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3258 : {
3259 86 : GDALResampleConvolutionHorizontal_3rows_SSE2(
3260 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3261 : dfRes1, dfRes2, dfRes3);
3262 86 : }
3263 :
3264 : /************************************************************************/
3265 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T> */
3266 : /************************************************************************/
3267 :
3268 : template <class T>
3269 7849120 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3270 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3271 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3272 : double &dfRes2, double &dfRes3)
3273 : {
3274 7849120 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3275 7849120 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3276 7849120 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
3277 7849120 : int i = 0; // Use after for.
3278 19113750 : for (; i < nSrcPixelCount - 3; i += 4)
3279 : {
3280 : // Retrieve the pixel & accumulate.
3281 11264600 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3282 11264600 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3283 11264600 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3284 11264600 : const XMMReg4Double v_weight =
3285 11264600 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3286 :
3287 11264600 : v_acc1 += v_pixels1 * v_weight;
3288 11264600 : v_acc2 += v_pixels2 * v_weight;
3289 11264600 : v_acc3 += v_pixels3 * v_weight;
3290 : }
3291 :
3292 7849120 : dfRes1 = v_acc1.GetHorizSum();
3293 7849120 : dfRes2 = v_acc2.GetHorizSum();
3294 7849120 : dfRes3 = v_acc3.GetHorizSum();
3295 :
3296 12324622 : for (; i < nSrcPixelCount; ++i)
3297 : {
3298 4475522 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3299 4475522 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3300 4475522 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3301 : }
3302 7849120 : }
3303 :
3304 : /************************************************************************/
3305 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte> */
3306 : /************************************************************************/
3307 :
3308 : template <>
3309 : inline void
3310 7781970 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte, false>(
3311 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3312 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3313 : double &dfRes2, double &dfRes3)
3314 : {
3315 7781970 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3316 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3317 : dfRes1, dfRes2, dfRes3);
3318 7781970 : }
3319 :
3320 : template <>
3321 : inline void
3322 67150 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16, false>(
3323 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3324 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3325 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3326 : {
3327 67150 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3328 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3329 : dfRes1, dfRes2, dfRes3);
3330 67150 : }
3331 :
3332 : /************************************************************************/
3333 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T> */
3334 : /************************************************************************/
3335 :
3336 : template <class T>
3337 14904860 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3338 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3339 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3340 : double &dfRes3)
3341 : {
3342 14904860 : const XMMReg4Double v_weight =
3343 : XMMReg4Double::Load4ValAligned(padfWeightsAligned);
3344 :
3345 : // Retrieve the pixel & accumulate.
3346 14904860 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
3347 14904860 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
3348 14904860 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
3349 :
3350 14904860 : XMMReg4Double v_acc1 = v_pixels1 * v_weight;
3351 14904860 : XMMReg4Double v_acc2 = v_pixels2 * v_weight;
3352 14904860 : XMMReg4Double v_acc3 = v_pixels3 * v_weight;
3353 :
3354 14904860 : dfRes1 = v_acc1.GetHorizSum();
3355 14904860 : dfRes2 = v_acc2.GetHorizSum();
3356 14904860 : dfRes3 = v_acc3.GetHorizSum();
3357 14904860 : }
3358 :
3359 : /************************************************************************/
3360 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte> */
3361 : /************************************************************************/
3362 :
3363 : template <>
3364 9192140 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte, false>(
3365 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3366 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3367 : double &dfRes3)
3368 : {
3369 9192140 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3370 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3371 : dfRes3);
3372 9192140 : }
3373 :
3374 : template <>
3375 5712720 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16, false>(
3376 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3377 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
3378 : double &dfRes2, double &dfRes3)
3379 : {
3380 5712720 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3381 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3382 : dfRes3);
3383 5712720 : }
3384 :
3385 : #endif // USE_SSE2
3386 :
3387 : /************************************************************************/
3388 : /* GDALResampleChunk_Convolution() */
3389 : /************************************************************************/
3390 :
3391 : template <class T, class Twork, GDALDataType eWrkDataType,
3392 : bool bKernelWithNegativeWeights, bool bNeedRescale>
3393 9597 : static CPLErr GDALResampleChunk_ConvolutionT(
3394 : const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
3395 : FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
3396 : int nKernelRadius, float fMaxVal)
3397 :
3398 : {
3399 9597 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
3400 9597 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
3401 9597 : const double dfSrcXDelta = args.dfSrcXDelta;
3402 9597 : const double dfSrcYDelta = args.dfSrcYDelta;
3403 9597 : constexpr int nBands = 1;
3404 9597 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
3405 9597 : const int nChunkXOff = args.nChunkXOff;
3406 9597 : const int nChunkXSize = args.nChunkXSize;
3407 9597 : const int nChunkYOff = args.nChunkYOff;
3408 9597 : const int nChunkYSize = args.nChunkYSize;
3409 9597 : const int nDstXOff = args.nDstXOff;
3410 9597 : const int nDstXOff2 = args.nDstXOff2;
3411 9597 : const int nDstYOff = args.nDstYOff;
3412 9597 : const int nDstYOff2 = args.nDstYOff2;
3413 9597 : const bool bHasNoData = args.bHasNoData;
3414 9597 : double dfNoDataValue = args.dfNoDataValue;
3415 :
3416 9597 : if (!bHasNoData)
3417 9498 : dfNoDataValue = 0.0;
3418 9597 : const auto dstDataType = args.eOvrDataType;
3419 9597 : const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
3420 9597 : const double dfReplacementVal =
3421 99 : bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
3422 : : dfNoDataValue;
3423 : // cppcheck-suppress unreadVariable
3424 9597 : const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
3425 9597 : const bool bNoDataValueInt64Valid =
3426 9597 : isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
3427 9597 : const auto nNodataValueInt64 =
3428 : bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
3429 9597 : constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
3430 :
3431 : // TODO: we should have some generic function to do this.
3432 9597 : Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
3433 9597 : Twork fDstMax = cpl::NumericLimits<Twork>::max();
3434 9597 : if (dstDataType == GDT_UInt8)
3435 : {
3436 8667 : fDstMin = std::numeric_limits<GByte>::min();
3437 8667 : fDstMax = std::numeric_limits<GByte>::max();
3438 : }
3439 930 : else if (dstDataType == GDT_Int8)
3440 : {
3441 1 : fDstMin = std::numeric_limits<GInt8>::min();
3442 1 : fDstMax = std::numeric_limits<GInt8>::max();
3443 : }
3444 929 : else if (dstDataType == GDT_UInt16)
3445 : {
3446 402 : fDstMin = std::numeric_limits<GUInt16>::min();
3447 402 : fDstMax = std::numeric_limits<GUInt16>::max();
3448 : }
3449 527 : else if (dstDataType == GDT_Int16)
3450 : {
3451 292 : fDstMin = std::numeric_limits<GInt16>::min();
3452 292 : fDstMax = std::numeric_limits<GInt16>::max();
3453 : }
3454 235 : else if (dstDataType == GDT_UInt32)
3455 : {
3456 1 : fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
3457 1 : fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
3458 : }
3459 234 : else if (dstDataType == GDT_Int32)
3460 : {
3461 : // cppcheck-suppress unreadVariable
3462 6 : fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
3463 : // cppcheck-suppress unreadVariable
3464 6 : fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
3465 : }
3466 228 : else if (dstDataType == GDT_UInt64)
3467 : {
3468 : // cppcheck-suppress unreadVariable
3469 1 : fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
3470 : // cppcheck-suppress unreadVariable
3471 : // (1 << 64) - 2048: largest uint64 value a double can hold
3472 1 : fDstMax = static_cast<Twork>(18446744073709549568ULL);
3473 : }
3474 227 : else if (dstDataType == GDT_Int64)
3475 : {
3476 : // cppcheck-suppress unreadVariable
3477 1 : fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
3478 : // cppcheck-suppress unreadVariable
3479 : // (1 << 63) - 1024: largest int64 that a double can hold
3480 1 : fDstMax = static_cast<Twork>(9223372036854774784LL);
3481 : }
3482 :
3483 9597 : bool bHasNaN = false;
3484 490 : if (pabyChunkNodataMask)
3485 : {
3486 : if constexpr (std::is_floating_point_v<T>)
3487 : {
3488 120140 : for (size_t i = 0;
3489 120140 : i < static_cast<size_t>(nChunkXSize) * nChunkYSize; ++i)
3490 : {
3491 120122 : if (std::isnan(pChunk[i]))
3492 : {
3493 24 : bHasNaN = true;
3494 24 : break;
3495 : }
3496 : }
3497 : }
3498 : }
3499 :
3500 37413146 : auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
3501 : bNoDataValueInt64Valid, nNodataValueInt64,
3502 : dfNoDataValue, dfReplacementVal](Twork fVal)
3503 : {
3504 16299800 : if (!bHasNoData)
3505 12078600 : return fVal;
3506 :
3507 : // Clamp value before comparing to nodata: this is only needed for
3508 : // kernels with negative weights (Lanczos)
3509 4221160 : Twork fClamped = fVal;
3510 4221160 : if (fClamped < fDstMin)
3511 14504 : fClamped = fDstMin;
3512 4206660 : else if (fClamped > fDstMax)
3513 13638 : fClamped = fDstMax;
3514 4221160 : if (isIntegerDT)
3515 : {
3516 4220480 : if (bNoDataValueInt64Valid)
3517 : {
3518 4220470 : const double fClampedRounded = double(std::round(fClamped));
3519 8440960 : if (fClampedRounded >=
3520 : static_cast<double>(static_cast<Twork>(
3521 8440960 : std::numeric_limits<int64_t>::min())) &&
3522 : fClampedRounded <= static_cast<double>(static_cast<Twork>(
3523 8440960 : 9223372036854774784LL)) &&
3524 4220470 : nNodataValueInt64 ==
3525 4220480 : static_cast<GInt64>(std::round(fClamped)))
3526 : {
3527 : // Do not use the nodata value
3528 13195 : return static_cast<Twork>(dfReplacementVal);
3529 : }
3530 : }
3531 : }
3532 679 : else if (dfNoDataValue == static_cast<double>(fClamped))
3533 : {
3534 : // Do not use the nodata value
3535 1 : return static_cast<Twork>(dfReplacementVal);
3536 : }
3537 4207960 : return fClamped;
3538 : };
3539 :
3540 : /* -------------------------------------------------------------------- */
3541 : /* Allocate work buffers. */
3542 : /* -------------------------------------------------------------------- */
3543 9597 : const int nDstXSize = nDstXOff2 - nDstXOff;
3544 9597 : Twork *pafWrkScanline = nullptr;
3545 9597 : if (dstDataType != eWrkDataType)
3546 : {
3547 : pafWrkScanline =
3548 9385 : static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
3549 9385 : if (pafWrkScanline == nullptr)
3550 0 : return CE_Failure;
3551 : }
3552 :
3553 9597 : const double dfXScale = 1.0 / dfXRatioDstToSrc;
3554 9597 : const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
3555 9597 : const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
3556 9597 : const double dfYScale = 1.0 / dfYRatioDstToSrc;
3557 9597 : const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
3558 9597 : const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
3559 :
3560 : // Temporary array to store result of horizontal filter.
3561 : double *const padfHorizontalFiltered = static_cast<double *>(
3562 9597 : VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
3563 9597 : const uint64_t nWeightCount = static_cast<uint64_t>(
3564 9597 : 2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) + 0.5);
3565 9597 : if (nWeightCount > std::numeric_limits<uint32_t>::max() / sizeof(double))
3566 : {
3567 0 : VSIFree(pafWrkScanline);
3568 0 : CPLError(CE_Failure, CPLE_NotSupported,
3569 : "Too large downsampling factor");
3570 0 : return CE_Failure;
3571 : }
3572 : // To store convolution coefficients.
3573 : double *const padfWeights =
3574 9597 : static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
3575 : static_cast<size_t>(nWeightCount) * sizeof(double)));
3576 :
3577 9597 : GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
3578 9597 : if (pabyChunkNodataMask)
3579 : pabyChunkNodataMaskHorizontalFiltered =
3580 3357 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
3581 9597 : if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
3582 3357 : (pabyChunkNodataMask != nullptr &&
3583 : pabyChunkNodataMaskHorizontalFiltered == nullptr))
3584 : {
3585 0 : VSIFree(pafWrkScanline);
3586 0 : VSIFree(padfHorizontalFiltered);
3587 0 : VSIFreeAligned(padfWeights);
3588 0 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3589 0 : return CE_Failure;
3590 : }
3591 :
3592 : /* ==================================================================== */
3593 : /* First pass: horizontal filter */
3594 : /* ==================================================================== */
3595 9597 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
3596 : #ifdef USE_SSE2
3597 9597 : const bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
3598 : #endif
3599 3723494 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
3600 : {
3601 3713892 : const double dfSrcPixel =
3602 3713892 : (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
3603 3713892 : const int nSrcPixelStart = std::max(
3604 3713892 : static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5)),
3605 3713892 : nChunkXOff);
3606 3713892 : const int nSrcPixelStop =
3607 3713892 : std::min(static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5),
3608 3713892 : nChunkRightXOff);
3609 : #if 0
3610 : if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
3611 : {
3612 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3613 : }
3614 : if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
3615 : {
3616 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3617 : }
3618 : #endif
3619 3713892 : const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
3620 3713892 : double dfWeightSum = 0.0;
3621 :
3622 : // Compute convolution coefficients.
3623 3713892 : int nSrcPixel = nSrcPixelStart;
3624 3713892 : double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
3625 5823636 : for (; nSrcPixel < nSrcPixelStop - 3; nSrcPixel += 4)
3626 : {
3627 2109749 : padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
3628 2109749 : dfX += dfXScaleWeight;
3629 2109749 : padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
3630 2109749 : dfX += dfXScaleWeight;
3631 2109749 : padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
3632 2109749 : dfX += dfXScaleWeight;
3633 2109749 : padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
3634 2109749 : dfX += dfXScaleWeight;
3635 2109749 : dfWeightSum +=
3636 2109749 : pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
3637 : }
3638 7719022 : for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
3639 : {
3640 4005130 : const double dfWeight = pfnFilterFunc(dfX);
3641 4005130 : padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3642 4005130 : dfWeightSum += dfWeight;
3643 : }
3644 :
3645 3713892 : const int nHeight = nChunkYSize * nBands;
3646 3713892 : if (pabyChunkNodataMask == nullptr)
3647 : {
3648 : // For floating-point data types, we must scale down a bit values
3649 : // if input values are close to +/- std::numeric_limits<T>::max()
3650 : #ifdef OLD_CPPCHECK
3651 : constexpr double mulFactor = 1;
3652 : #else
3653 3191883 : constexpr double mulFactor =
3654 : (bNeedRescale &&
3655 : (std::is_same_v<T, float> || std::is_same_v<T, double>))
3656 : ? 2
3657 : : 1;
3658 : #endif
3659 :
3660 3191883 : if (dfWeightSum != 0)
3661 : {
3662 3191883 : const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
3663 13086524 : for (int i = 0; i < nSrcPixelCount; ++i)
3664 : {
3665 9894651 : padfWeights[i] *= dfInvWeightSum;
3666 : }
3667 : }
3668 :
3669 182388430 : const auto ScaleValue = [
3670 : #ifdef _MSC_VER
3671 : mulFactor
3672 : #endif
3673 : ](double dfVal, [[maybe_unused]] const T *inputValues,
3674 : [[maybe_unused]] int nInputValues)
3675 : {
3676 182388000 : constexpr bool isFloat =
3677 : std::is_same_v<T, float> || std::is_same_v<T, double>;
3678 : if constexpr (isFloat)
3679 : {
3680 4070140 : if (std::isfinite(dfVal))
3681 : {
3682 : return std::clamp(dfVal,
3683 12204800 : -std::numeric_limits<double>::max() /
3684 : mulFactor,
3685 4068260 : std::numeric_limits<double>::max() /
3686 4068260 : mulFactor) *
3687 4068260 : mulFactor;
3688 : }
3689 : else if constexpr (bKernelWithNegativeWeights)
3690 : {
3691 936 : if (std::isnan(dfVal))
3692 : {
3693 : // Either one of the input value is NaN or they are +/-Inf
3694 936 : const bool isPositive = inputValues[0] >= 0;
3695 6008 : for (int i = 0; i < nInputValues; ++i)
3696 : {
3697 5384 : if (std::isnan(inputValues[i]))
3698 312 : return dfVal;
3699 : // cppcheck-suppress knownConditionTrueFalse
3700 5072 : if ((inputValues[i] >= 0) != isPositive)
3701 0 : return dfVal;
3702 : }
3703 : // All values are positive or negative infinity
3704 624 : return static_cast<double>(inputValues[0]);
3705 : }
3706 : }
3707 : }
3708 178319000 : return dfVal;
3709 : };
3710 :
3711 3191883 : int iSrcLineOff = 0;
3712 : #ifdef USE_SSE2
3713 3191883 : if (nSrcPixelCount == 4)
3714 : {
3715 17007029 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3716 : {
3717 16161558 : const size_t j =
3718 16161558 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3719 16161558 : (nSrcPixelStart - nChunkXOff);
3720 16161558 : double dfVal1 = 0.0;
3721 16161558 : double dfVal2 = 0.0;
3722 16161558 : double dfVal3 = 0.0;
3723 : if constexpr (std::is_floating_point_v<T>)
3724 : {
3725 1256690 : if (bHasNaN)
3726 : {
3727 : GDALResampleConvolutionHorizontalPixelCount4_3rows<
3728 0 : T, true>(pChunk + j, pChunk + j + nChunkXSize,
3729 0 : pChunk + j + 2 * nChunkXSize,
3730 : padfWeights, dfVal1, dfVal2, dfVal3);
3731 : }
3732 : else
3733 : {
3734 : GDALResampleConvolutionHorizontalPixelCount4_3rows<
3735 1256690 : T, false>(pChunk + j, pChunk + j + nChunkXSize,
3736 1256690 : pChunk + j + 2 * nChunkXSize,
3737 : padfWeights, dfVal1, dfVal2, dfVal3);
3738 : }
3739 : }
3740 : else
3741 : {
3742 : GDALResampleConvolutionHorizontalPixelCount4_3rows<
3743 14904868 : T, false>(pChunk + j, pChunk + j + nChunkXSize,
3744 14904868 : pChunk + j + 2 * nChunkXSize, padfWeights,
3745 : dfVal1, dfVal2, dfVal3);
3746 : }
3747 32323080 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3748 16161558 : nDstXSize +
3749 16161558 : iDstPixel - nDstXOff] =
3750 16161558 : ScaleValue(dfVal1, pChunk + j, 4);
3751 32323080 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3752 16161558 : 1) *
3753 16161558 : nDstXSize +
3754 16161558 : iDstPixel - nDstXOff] =
3755 16161558 : ScaleValue(dfVal2, pChunk + j + nChunkXSize, 4);
3756 16161967 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3757 16161558 : 2) *
3758 16161558 : nDstXSize +
3759 16161558 : iDstPixel - nDstXOff] =
3760 16161558 : ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize, 4);
3761 : }
3762 : }
3763 2346404 : else if (bSrcPixelCountLess8)
3764 : {
3765 9938308 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3766 : {
3767 7868098 : const size_t j =
3768 7868098 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3769 7868098 : (nSrcPixelStart - nChunkXOff);
3770 7868098 : double dfVal1 = 0.0;
3771 7868098 : double dfVal2 = 0.0;
3772 7868098 : double dfVal3 = 0.0;
3773 : if constexpr (std::is_floating_point_v<T>)
3774 : {
3775 18980 : if (bHasNaN)
3776 : {
3777 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows<
3778 0 : T, true>(pChunk + j, pChunk + j + nChunkXSize,
3779 0 : pChunk + j + 2 * nChunkXSize,
3780 : padfWeights, nSrcPixelCount, dfVal1,
3781 : dfVal2, dfVal3);
3782 : }
3783 : else
3784 : {
3785 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows<
3786 18980 : T, false>(pChunk + j, pChunk + j + nChunkXSize,
3787 18980 : pChunk + j + 2 * nChunkXSize,
3788 : padfWeights, nSrcPixelCount, dfVal1,
3789 : dfVal2, dfVal3);
3790 : }
3791 : }
3792 : else
3793 : {
3794 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows<
3795 7849118 : T, false>(pChunk + j, pChunk + j + nChunkXSize,
3796 7849118 : pChunk + j + 2 * nChunkXSize, padfWeights,
3797 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3798 : }
3799 15736156 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3800 7868098 : nDstXSize +
3801 7868098 : iDstPixel - nDstXOff] =
3802 7868098 : ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
3803 15736156 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3804 7868098 : 1) *
3805 7868098 : nDstXSize +
3806 7868098 : iDstPixel - nDstXOff] =
3807 7868098 : ScaleValue(dfVal2, pChunk + j + nChunkXSize,
3808 : nSrcPixelCount);
3809 7868186 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3810 7868098 : 2) *
3811 7868098 : nDstXSize +
3812 7868098 : iDstPixel - nDstXOff] =
3813 7868098 : ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
3814 : nSrcPixelCount);
3815 : }
3816 : }
3817 : else
3818 : #endif
3819 : {
3820 35902058 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3821 : {
3822 35625944 : const size_t j =
3823 35625944 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3824 35625944 : (nSrcPixelStart - nChunkXOff);
3825 35625944 : double dfVal1 = 0.0;
3826 35625944 : double dfVal2 = 0.0;
3827 35625944 : double dfVal3 = 0.0;
3828 : if constexpr (std::is_floating_point_v<T>)
3829 : {
3830 65696 : if (bHasNaN)
3831 : {
3832 0 : GDALResampleConvolutionHorizontal_3rows<T, true>(
3833 0 : pChunk + j, pChunk + j + nChunkXSize,
3834 0 : pChunk + j + 2 * nChunkXSize, padfWeights,
3835 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3836 : }
3837 : else
3838 : {
3839 65696 : GDALResampleConvolutionHorizontal_3rows<T, false>(
3840 65696 : pChunk + j, pChunk + j + nChunkXSize,
3841 65696 : pChunk + j + 2 * nChunkXSize, padfWeights,
3842 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3843 : }
3844 : }
3845 : else
3846 : {
3847 35560248 : GDALResampleConvolutionHorizontal_3rows<T, false>(
3848 35560248 : pChunk + j, pChunk + j + nChunkXSize,
3849 35560248 : pChunk + j + 2 * nChunkXSize, padfWeights,
3850 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3851 : }
3852 71251798 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3853 35625944 : nDstXSize +
3854 35625944 : iDstPixel - nDstXOff] =
3855 35625944 : ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
3856 71251798 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3857 35625944 : 1) *
3858 35625944 : nDstXSize +
3859 35625944 : iDstPixel - nDstXOff] =
3860 35625944 : ScaleValue(dfVal2, pChunk + j + nChunkXSize,
3861 : nSrcPixelCount);
3862 35691048 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3863 35625944 : 2) *
3864 35625944 : nDstXSize +
3865 35625944 : iDstPixel - nDstXOff] =
3866 35625944 : ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
3867 : nSrcPixelCount);
3868 : }
3869 : }
3870 6613620 : for (; iSrcLineOff < nHeight; ++iSrcLineOff)
3871 : {
3872 3421743 : const size_t j =
3873 3421743 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3874 3421743 : (nSrcPixelStart - nChunkXOff);
3875 3970903 : const double dfVal = GDALResampleConvolutionHorizontal(
3876 595200 : pChunk + j, padfWeights, nSrcPixelCount);
3877 3422192 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3878 3421743 : nDstXSize +
3879 3421743 : iDstPixel - nDstXOff] =
3880 3421743 : ScaleValue(dfVal, pChunk + j, nSrcPixelCount);
3881 : }
3882 : }
3883 : else
3884 : {
3885 32759623 : for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
3886 : {
3887 32237528 : const size_t j =
3888 32237528 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3889 32237528 : (nSrcPixelStart - nChunkXOff);
3890 :
3891 : if (bKernelWithNegativeWeights)
3892 : {
3893 27492508 : int nConsecutiveValid = 0;
3894 27492508 : int nMaxConsecutiveValid = 0;
3895 747674146 : for (int k = 0; k < nSrcPixelCount; k++)
3896 : {
3897 720181938 : if (pabyChunkNodataMask[j + k])
3898 43694301 : nConsecutiveValid++;
3899 676487837 : else if (nConsecutiveValid)
3900 : {
3901 107658 : nMaxConsecutiveValid = std::max(
3902 107658 : nMaxConsecutiveValid, nConsecutiveValid);
3903 107658 : nConsecutiveValid = 0;
3904 : }
3905 : }
3906 27492508 : nMaxConsecutiveValid =
3907 27492508 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
3908 27492508 : if (nMaxConsecutiveValid < nSrcPixelCount / 2)
3909 : {
3910 21564707 : const size_t nTempOffset =
3911 21564707 : static_cast<size_t>(iSrcLineOff) * nDstXSize +
3912 21564707 : iDstPixel - nDstXOff;
3913 21564707 : padfHorizontalFiltered[nTempOffset] = 0.0;
3914 21564707 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3915 21564707 : continue;
3916 : }
3917 : }
3918 :
3919 10672871 : double dfSumWeightedVal = 0.0;
3920 10672871 : double dfSumWeightedAlpha = 0.0;
3921 : if constexpr (std::is_floating_point_v<T>)
3922 : {
3923 46368 : if (bHasNaN)
3924 : {
3925 1792 : GDALResampleConvolutionHorizontalWithMask<T, true>(
3926 1792 : pChunk + j, pabyChunkNodataMask + j, padfWeights,
3927 : nSrcPixelCount, dfSumWeightedVal,
3928 : dfSumWeightedAlpha, dfWeightSum);
3929 : }
3930 : else
3931 : {
3932 44576 : GDALResampleConvolutionHorizontalWithMask<T, false>(
3933 44576 : pChunk + j, pabyChunkNodataMask + j, padfWeights,
3934 : nSrcPixelCount, dfSumWeightedVal,
3935 : dfSumWeightedAlpha, dfWeightSum);
3936 : }
3937 : }
3938 : else
3939 : {
3940 10626503 : GDALResampleConvolutionHorizontalWithMask<T, false>(
3941 63 : pChunk + j, pabyChunkNodataMask + j, padfWeights,
3942 : nSrcPixelCount, dfSumWeightedVal, dfSumWeightedAlpha,
3943 : dfWeightSum);
3944 : }
3945 10672871 : const size_t nTempOffset =
3946 10672871 : static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
3947 10672871 : nDstXOff;
3948 10672871 : if (dfSumWeightedAlpha > 0.0)
3949 : {
3950 8760088 : padfHorizontalFiltered[nTempOffset] =
3951 8760088 : dfSumWeightedVal / dfSumWeightedAlpha;
3952 : // Not entirely clear if clamping values in the horizontal filter
3953 : // is the right thing to do, but otherwise, for
3954 : // https://github.com/OSGeo/gdal/issues/14728
3955 : // with very small values of alpha, we get very strong under
3956 : // and over shoots.
3957 : if constexpr (std::is_same_v<T, uint8_t>)
3958 : {
3959 8713690 : padfHorizontalFiltered[nTempOffset] = std::clamp(
3960 8713690 : padfHorizontalFiltered[nTempOffset], 0.0, 255.0);
3961 : }
3962 : else if constexpr (std::is_same_v<T, uint16_t>)
3963 : {
3964 60 : padfHorizontalFiltered[nTempOffset] = std::clamp(
3965 60 : padfHorizontalFiltered[nTempOffset], 0.0, 65535.0);
3966 : }
3967 8760088 : const double dfAlpha = dfSumWeightedAlpha / dfWeightSum;
3968 8760088 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] =
3969 8760088 : static_cast<uint8_t>(std::min(dfAlpha + 0.5, 255.0));
3970 : }
3971 : else
3972 : {
3973 1912797 : padfHorizontalFiltered[nTempOffset] = 0.0;
3974 1912797 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3975 : }
3976 : }
3977 : }
3978 : }
3979 :
3980 : /* ==================================================================== */
3981 : /* Second pass: vertical filter */
3982 : /* ==================================================================== */
3983 9597 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3984 :
3985 414141 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3986 : {
3987 404544 : Twork *const pafDstScanline =
3988 : pafWrkScanline
3989 404544 : ? pafWrkScanline
3990 14028 : : static_cast<Twork *>(pDstBuffer) +
3991 14028 : static_cast<size_t>(iDstLine - nDstYOff) * nDstXSize;
3992 :
3993 404544 : const double dfSrcLine =
3994 404544 : (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
3995 404544 : const int nSrcLineStart =
3996 404544 : std::max(static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5)),
3997 404544 : nChunkYOff);
3998 404544 : const int nSrcLineStop =
3999 404544 : std::min(static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5),
4000 404544 : nChunkBottomYOff);
4001 : #if 0
4002 : if( nSrcLineStart < nChunkYOff &&
4003 : nChunkYOff > 0 )
4004 : {
4005 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
4006 : }
4007 : if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
4008 : {
4009 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
4010 : }
4011 : #endif
4012 404544 : const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
4013 404544 : double dfWeightSum = 0.0;
4014 :
4015 : // Compute convolution coefficients.
4016 404544 : int nSrcLine = nSrcLineStart; // Used after for.
4017 404544 : double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
4018 1076797 : for (; nSrcLine < nSrcLineStop - 3;
4019 672253 : nSrcLine += 4, dfY += 4 * dfYScaleWeight)
4020 : {
4021 672253 : padfWeights[nSrcLine - nSrcLineStart] = dfY;
4022 672253 : padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
4023 672253 : padfWeights[nSrcLine + 2 - nSrcLineStart] =
4024 672253 : dfY + 2 * dfYScaleWeight;
4025 672253 : padfWeights[nSrcLine + 3 - nSrcLineStart] =
4026 672253 : dfY + 3 * dfYScaleWeight;
4027 672253 : dfWeightSum +=
4028 672253 : pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
4029 : }
4030 443434 : for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
4031 : {
4032 38890 : const double dfWeight = pfnFilterFunc(dfY);
4033 38890 : padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
4034 38890 : dfWeightSum += dfWeight;
4035 : }
4036 :
4037 404544 : if (pabyChunkNodataMask == nullptr)
4038 : {
4039 : // For floating-point data types, we must scale down a bit values
4040 : // if input values are close to +/- std::numeric_limits<T>::max()
4041 : #ifdef OLD_CPPCHECK
4042 : constexpr double mulFactor = 1;
4043 : #else
4044 360192 : constexpr double mulFactor =
4045 : (bNeedRescale &&
4046 : (std::is_same_v<T, float> || std::is_same_v<T, double>))
4047 : ? 2
4048 : : 1;
4049 : #endif
4050 :
4051 360192 : if (dfWeightSum != 0)
4052 : {
4053 360192 : const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
4054 2617653 : for (int i = 0; i < nSrcLineCount; ++i)
4055 2257467 : padfWeights[i] *= dfInvWeightSum;
4056 : }
4057 :
4058 360192 : int iFilteredPixelOff = 0; // Used after for.
4059 : // j used after for.
4060 360192 : size_t j =
4061 360192 : (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
4062 : #ifdef USE_SSE2
4063 : if constexpr ((!bNeedRescale || !std::is_same_v<T, float>) &&
4064 : eWrkDataType == GDT_Float32)
4065 : {
4066 : #ifdef __AVX__
4067 : for (; iFilteredPixelOff < nDstXSize - 15;
4068 : iFilteredPixelOff += 16, j += 16)
4069 : {
4070 : GDALResampleConvolutionVertical_16cols(
4071 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4072 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
4073 : if (bHasNoData)
4074 : {
4075 : for (int k = 0; k < 16; k++)
4076 : {
4077 : pafDstScanline[iFilteredPixelOff + k] =
4078 : replaceValIfNodata(
4079 : pafDstScanline[iFilteredPixelOff + k]);
4080 : }
4081 : }
4082 : }
4083 : #else
4084 26155459 : for (; iFilteredPixelOff < nDstXSize - 7;
4085 : iFilteredPixelOff += 8, j += 8)
4086 : {
4087 25804048 : GDALResampleConvolutionVertical_8cols(
4088 25804048 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4089 25804048 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
4090 25804048 : if (bHasNoData)
4091 : {
4092 123192 : for (int k = 0; k < 8; k++)
4093 : {
4094 109504 : pafDstScanline[iFilteredPixelOff + k] =
4095 109504 : replaceValIfNodata(
4096 109504 : pafDstScanline[iFilteredPixelOff + k]);
4097 : }
4098 : }
4099 : }
4100 : #endif
4101 :
4102 822491 : for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
4103 : {
4104 471118 : const Twork fVal =
4105 471118 : static_cast<Twork>(GDALResampleConvolutionVertical(
4106 471118 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4107 : nSrcLineCount));
4108 471118 : pafDstScanline[iFilteredPixelOff] =
4109 471118 : replaceValIfNodata(fVal);
4110 : }
4111 : }
4112 : else
4113 : #endif
4114 : {
4115 5862642 : const auto ScaleValue = [
4116 : #ifdef _MSC_VER
4117 : mulFactor
4118 : #endif
4119 : ](double dfVal, [[maybe_unused]] const double *inputValues,
4120 : [[maybe_unused]] int nStride,
4121 : [[maybe_unused]] int nInputValues)
4122 : {
4123 5862640 : constexpr bool isFloat =
4124 : std::is_same_v<T, float> || std::is_same_v<T, double>;
4125 : if constexpr (isFloat)
4126 : {
4127 5862640 : if (std::isfinite(dfVal))
4128 : {
4129 : return std::clamp(
4130 : dfVal,
4131 : static_cast<double>(
4132 17585400 : -std::numeric_limits<Twork>::max()) /
4133 : mulFactor,
4134 : static_cast<double>(
4135 5861800 : std::numeric_limits<Twork>::max()) /
4136 5861800 : mulFactor) *
4137 5861800 : mulFactor;
4138 : }
4139 : else if constexpr (bKernelWithNegativeWeights)
4140 : {
4141 480 : if (std::isnan(dfVal))
4142 : {
4143 : // Either one of the input value is NaN or they are +/-Inf
4144 480 : const bool isPositive = inputValues[0] >= 0;
4145 2520 : for (int i = 0; i < nInputValues; ++i)
4146 : {
4147 2200 : if (std::isnan(inputValues[i * nStride]))
4148 160 : return dfVal;
4149 : // cppcheck-suppress knownConditionTrueFalse
4150 2040 : if ((inputValues[i] >= 0) != isPositive)
4151 0 : return dfVal;
4152 : }
4153 : // All values are positive or negative infinity
4154 320 : return inputValues[0];
4155 : }
4156 : }
4157 : }
4158 :
4159 360 : return dfVal;
4160 : };
4161 :
4162 2939422 : for (; iFilteredPixelOff < nDstXSize - 1;
4163 : iFilteredPixelOff += 2, j += 2)
4164 : {
4165 2930610 : double dfVal1 = 0.0;
4166 2930610 : double dfVal2 = 0.0;
4167 2930610 : GDALResampleConvolutionVertical_2cols(
4168 2930610 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4169 : nSrcLineCount, dfVal1, dfVal2);
4170 5861220 : pafDstScanline[iFilteredPixelOff] =
4171 2930610 : replaceValIfNodata(static_cast<Twork>(
4172 2930610 : ScaleValue(dfVal1, padfHorizontalFiltered + j,
4173 : nDstXSize, nSrcLineCount)));
4174 2930610 : pafDstScanline[iFilteredPixelOff + 1] =
4175 2930610 : replaceValIfNodata(static_cast<Twork>(
4176 2930610 : ScaleValue(dfVal2, padfHorizontalFiltered + j + 1,
4177 : nDstXSize, nSrcLineCount)));
4178 : }
4179 8819 : if (iFilteredPixelOff < nDstXSize)
4180 : {
4181 1427 : const double dfVal = GDALResampleConvolutionVertical(
4182 1427 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4183 : nSrcLineCount);
4184 1427 : pafDstScanline[iFilteredPixelOff] =
4185 1427 : replaceValIfNodata(static_cast<Twork>(
4186 1427 : ScaleValue(dfVal, padfHorizontalFiltered + j,
4187 : nDstXSize, nSrcLineCount)));
4188 : }
4189 : }
4190 : }
4191 : else
4192 : {
4193 19948965 : for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
4194 : ++iFilteredPixelOff)
4195 : {
4196 19904685 : double dfVal = 0.0;
4197 19904685 : dfWeightSum = 0.0;
4198 19904685 : size_t j = (nSrcLineStart - nChunkYOff) *
4199 19904685 : static_cast<size_t>(nDstXSize) +
4200 19904685 : iFilteredPixelOff;
4201 : if (bKernelWithNegativeWeights)
4202 : {
4203 18637437 : int nConsecutiveValid = 0;
4204 18637437 : int nMaxConsecutiveValid = 0;
4205 162845921 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
4206 : {
4207 144208284 : const double dfWeight =
4208 144208284 : padfWeights[i] *
4209 : pabyChunkNodataMaskHorizontalFiltered[j];
4210 144208284 : if (pabyChunkNodataMaskHorizontalFiltered[j])
4211 : {
4212 45969501 : nConsecutiveValid++;
4213 : }
4214 98238683 : else if (nConsecutiveValid)
4215 : {
4216 211128 : nMaxConsecutiveValid = std::max(
4217 211128 : nMaxConsecutiveValid, nConsecutiveValid);
4218 211128 : nConsecutiveValid = 0;
4219 : }
4220 144208284 : dfVal += padfHorizontalFiltered[j] * dfWeight;
4221 144208284 : dfWeightSum += dfWeight;
4222 : }
4223 18637437 : nMaxConsecutiveValid =
4224 18637437 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
4225 18637437 : if (nMaxConsecutiveValid < nSrcLineCount / 2)
4226 : {
4227 9501801 : pafDstScanline[iFilteredPixelOff] =
4228 9501709 : static_cast<Twork>(dfNoDataValue);
4229 9501801 : continue;
4230 : }
4231 : }
4232 : else
4233 : {
4234 6353336 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
4235 : {
4236 5086078 : const double dfWeight =
4237 5086078 : padfWeights[i] *
4238 : pabyChunkNodataMaskHorizontalFiltered[j];
4239 5086078 : dfVal += padfHorizontalFiltered[j] * dfWeight;
4240 5086078 : dfWeightSum += dfWeight;
4241 : }
4242 : }
4243 10402854 : if (dfWeightSum > 0.0)
4244 : {
4245 9856520 : pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
4246 9856172 : static_cast<Twork>(dfVal / dfWeightSum));
4247 : }
4248 : else
4249 : {
4250 546347 : pafDstScanline[iFilteredPixelOff] =
4251 546323 : static_cast<Twork>(dfNoDataValue);
4252 : }
4253 : }
4254 : }
4255 :
4256 404544 : if (fMaxVal != 0.0f)
4257 : {
4258 : if constexpr (std::is_same_v<T, double>)
4259 : {
4260 0 : for (int i = 0; i < nDstXSize; ++i)
4261 : {
4262 0 : if (pafDstScanline[i] > static_cast<double>(fMaxVal))
4263 0 : pafDstScanline[i] = static_cast<double>(fMaxVal);
4264 : }
4265 : }
4266 : else
4267 : {
4268 192324 : for (int i = 0; i < nDstXSize; ++i)
4269 : {
4270 192088 : if (pafDstScanline[i] > fMaxVal)
4271 96022 : pafDstScanline[i] = fMaxVal;
4272 : }
4273 : }
4274 : }
4275 :
4276 404544 : if (pafWrkScanline)
4277 : {
4278 390516 : GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
4279 : static_cast<GByte *>(pDstBuffer) +
4280 390516 : static_cast<size_t>(iDstLine - nDstYOff) *
4281 390516 : nDstXSize * nDstDataTypeSize,
4282 : dstDataType, nDstDataTypeSize, nDstXSize);
4283 : }
4284 : }
4285 :
4286 9597 : VSIFree(pafWrkScanline);
4287 9597 : VSIFreeAligned(padfWeights);
4288 9597 : VSIFree(padfHorizontalFiltered);
4289 9597 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
4290 :
4291 9597 : return CE_None;
4292 : }
4293 :
4294 : template <bool bKernelWithNegativeWeights, bool bNeedRescale>
4295 : static CPLErr
4296 9597 : GDALResampleChunk_ConvolutionInternal(const GDALOverviewResampleArgs &args,
4297 : const void *pChunk, void **ppDstBuffer,
4298 : GDALDataType *peDstBufferDataType)
4299 : {
4300 : GDALResampleAlg eResample;
4301 9597 : if (EQUAL(args.pszResampling, "BILINEAR"))
4302 7097 : eResample = GRA_Bilinear;
4303 2500 : else if (EQUAL(args.pszResampling, "CUBIC"))
4304 2318 : eResample = GRA_Cubic;
4305 182 : else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
4306 86 : eResample = GRA_CubicSpline;
4307 96 : else if (EQUAL(args.pszResampling, "LANCZOS"))
4308 96 : eResample = GRA_Lanczos;
4309 : else
4310 : {
4311 0 : CPLAssert(false);
4312 : return CE_Failure;
4313 : }
4314 9597 : const int nKernelRadius = GWKGetFilterRadius(eResample);
4315 9597 : FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
4316 : const FilterFunc4ValuesType pfnFilterFunc4Values =
4317 9597 : GWKGetFilterFunc4Values(eResample);
4318 :
4319 9597 : float fMaxVal = 0.f;
4320 : // Cubic, etc... can have overshoots, so make sure we clamp values to the
4321 : // maximum value if NBITS is set.
4322 9597 : if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
4323 8 : (args.eOvrDataType == GDT_UInt8 || args.eOvrDataType == GDT_UInt16 ||
4324 0 : args.eOvrDataType == GDT_UInt32))
4325 : {
4326 8 : int nBits = args.nOvrNBITS;
4327 8 : if (nBits == GDALGetDataTypeSizeBits(args.eOvrDataType))
4328 1 : nBits = 0;
4329 8 : if (nBits > 0 && nBits < 32)
4330 7 : fMaxVal = static_cast<float>((1U << nBits) - 1);
4331 : }
4332 :
4333 9597 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
4334 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
4335 : GDALGetDataTypeSizeBytes(args.eOvrDataType));
4336 9597 : if (*ppDstBuffer == nullptr)
4337 : {
4338 0 : return CE_Failure;
4339 : }
4340 9597 : *peDstBufferDataType = args.eOvrDataType;
4341 :
4342 9597 : switch (args.eWrkDataType)
4343 : {
4344 8705 : case GDT_UInt8:
4345 : {
4346 : return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32,
4347 : bKernelWithNegativeWeights,
4348 8705 : bNeedRescale>(
4349 : args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
4350 8705 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4351 : }
4352 :
4353 402 : case GDT_UInt16:
4354 : {
4355 : return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32,
4356 : bKernelWithNegativeWeights,
4357 402 : bNeedRescale>(
4358 : args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
4359 402 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4360 : }
4361 :
4362 387 : case GDT_Float32:
4363 : {
4364 : return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32,
4365 : bKernelWithNegativeWeights,
4366 387 : bNeedRescale>(
4367 : args, static_cast<const float *>(pChunk), *ppDstBuffer,
4368 387 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4369 : }
4370 :
4371 103 : case GDT_Float64:
4372 : {
4373 : return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64,
4374 : bKernelWithNegativeWeights,
4375 103 : bNeedRescale>(
4376 : args, static_cast<const double *>(pChunk), *ppDstBuffer,
4377 103 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4378 : }
4379 :
4380 0 : default:
4381 0 : break;
4382 : }
4383 :
4384 0 : CPLAssert(false);
4385 : return CE_Failure;
4386 : }
4387 :
4388 : static CPLErr
4389 9597 : GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
4390 : const void *pChunk, void **ppDstBuffer,
4391 : GDALDataType *peDstBufferDataType)
4392 : {
4393 9597 : if (EQUAL(args.pszResampling, "CUBIC") ||
4394 7279 : EQUAL(args.pszResampling, "LANCZOS"))
4395 : return GDALResampleChunk_ConvolutionInternal<
4396 2414 : /* bKernelWithNegativeWeights=*/true, /* bNeedRescale = */ true>(
4397 2414 : args, pChunk, ppDstBuffer, peDstBufferDataType);
4398 7183 : else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
4399 86 : return GDALResampleChunk_ConvolutionInternal<false, true>(
4400 86 : args, pChunk, ppDstBuffer, peDstBufferDataType);
4401 : else
4402 7097 : return GDALResampleChunk_ConvolutionInternal<false, false>(
4403 7097 : args, pChunk, ppDstBuffer, peDstBufferDataType);
4404 : }
4405 :
4406 : /************************************************************************/
4407 : /* GDALResampleChunkC32R() */
4408 : /************************************************************************/
4409 :
4410 2 : static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
4411 : const float *pafChunk, const int nChunkYOff,
4412 : const int nChunkYSize, const int nDstYOff,
4413 : const int nDstYOff2, const int nOvrXSize,
4414 : const int nOvrYSize, void **ppDstBuffer,
4415 : GDALDataType *peDstBufferDataType,
4416 : const char *pszResampling)
4417 :
4418 : {
4419 : enum Method
4420 : {
4421 : NEAR,
4422 : AVERAGE,
4423 : AVERAGE_MAGPHASE,
4424 : RMS,
4425 : };
4426 :
4427 2 : Method eMethod = NEAR;
4428 2 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
4429 : {
4430 0 : eMethod = NEAR;
4431 : }
4432 2 : else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
4433 : {
4434 0 : eMethod = AVERAGE_MAGPHASE;
4435 : }
4436 2 : else if (EQUAL(pszResampling, "RMS"))
4437 : {
4438 2 : eMethod = RMS;
4439 : }
4440 0 : else if (STARTS_WITH_CI(pszResampling, "AVER"))
4441 : {
4442 0 : eMethod = AVERAGE;
4443 : }
4444 : else
4445 : {
4446 0 : CPLError(
4447 : CE_Failure, CPLE_NotSupported,
4448 : "Resampling method %s is not supported for complex data types. "
4449 : "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
4450 : pszResampling);
4451 0 : return CE_Failure;
4452 : }
4453 :
4454 2 : const int nOXSize = nOvrXSize;
4455 2 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
4456 : GDALGetDataTypeSizeBytes(GDT_CFloat32));
4457 2 : if (*ppDstBuffer == nullptr)
4458 : {
4459 0 : return CE_Failure;
4460 : }
4461 2 : float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
4462 2 : *peDstBufferDataType = GDT_CFloat32;
4463 :
4464 2 : const int nOYSize = nOvrYSize;
4465 2 : const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
4466 2 : const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
4467 :
4468 : /* ==================================================================== */
4469 : /* Loop over destination scanlines. */
4470 : /* ==================================================================== */
4471 8 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
4472 : {
4473 6 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
4474 6 : if (nSrcYOff < nChunkYOff)
4475 0 : nSrcYOff = nChunkYOff;
4476 :
4477 6 : int nSrcYOff2 =
4478 6 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
4479 6 : if (nSrcYOff2 == nSrcYOff)
4480 0 : nSrcYOff2++;
4481 :
4482 6 : if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
4483 : {
4484 2 : if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
4485 0 : nSrcYOff = nSrcHeight - 1;
4486 2 : nSrcYOff2 = nSrcHeight;
4487 : }
4488 6 : if (nSrcYOff2 > nChunkYOff + nChunkYSize)
4489 0 : nSrcYOff2 = nChunkYOff + nChunkYSize;
4490 :
4491 6 : const float *const pafSrcScanline =
4492 6 : pafChunk +
4493 6 : (static_cast<size_t>(nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
4494 6 : float *const pafDstScanline =
4495 6 : pafDstBuffer +
4496 6 : static_cast<size_t>(iDstLine - nDstYOff) * 2 * nOXSize;
4497 :
4498 : /* --------------------------------------------------------------------
4499 : */
4500 : /* Loop over destination pixels */
4501 : /* --------------------------------------------------------------------
4502 : */
4503 18 : for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
4504 : {
4505 12 : const size_t iDstPixelSZ = static_cast<size_t>(iDstPixel);
4506 12 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
4507 12 : int nSrcXOff2 =
4508 12 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
4509 12 : if (nSrcXOff2 == nSrcXOff)
4510 0 : nSrcXOff2++;
4511 12 : if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
4512 : {
4513 6 : if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
4514 0 : nSrcXOff = nSrcWidth - 1;
4515 6 : nSrcXOff2 = nSrcWidth;
4516 : }
4517 12 : const size_t nSrcXOffSZ = static_cast<size_t>(nSrcXOff);
4518 :
4519 12 : if (eMethod == NEAR)
4520 : {
4521 0 : pafDstScanline[iDstPixelSZ * 2] =
4522 0 : pafSrcScanline[nSrcXOffSZ * 2];
4523 0 : pafDstScanline[iDstPixelSZ * 2 + 1] =
4524 0 : pafSrcScanline[nSrcXOffSZ * 2 + 1];
4525 : }
4526 12 : else if (eMethod == AVERAGE_MAGPHASE)
4527 : {
4528 0 : double dfTotalR = 0.0;
4529 0 : double dfTotalI = 0.0;
4530 0 : double dfTotalM = 0.0;
4531 0 : size_t nCount = 0;
4532 :
4533 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4534 : {
4535 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4536 : {
4537 0 : const double dfR = double(
4538 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4539 0 : static_cast<size_t>(iY - nSrcYOff) *
4540 0 : nSrcWidth * 2]);
4541 0 : const double dfI = double(
4542 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4543 0 : static_cast<size_t>(iY - nSrcYOff) *
4544 0 : nSrcWidth * 2 +
4545 0 : 1]);
4546 0 : dfTotalR += dfR;
4547 0 : dfTotalI += dfI;
4548 0 : dfTotalM += std::hypot(dfR, dfI);
4549 0 : ++nCount;
4550 : }
4551 : }
4552 :
4553 0 : CPLAssert(nCount > 0);
4554 0 : if (nCount == 0)
4555 : {
4556 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4557 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4558 : }
4559 : else
4560 : {
4561 0 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4562 0 : dfTotalR / static_cast<double>(nCount));
4563 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4564 0 : dfTotalI / static_cast<double>(nCount));
4565 : const double dfM =
4566 0 : double(std::hypot(pafDstScanline[iDstPixelSZ * 2],
4567 0 : pafDstScanline[iDstPixelSZ * 2 + 1]));
4568 0 : const double dfDesiredM =
4569 0 : dfTotalM / static_cast<double>(nCount);
4570 0 : double dfRatio = 1.0;
4571 0 : if (dfM != 0.0)
4572 0 : dfRatio = dfDesiredM / dfM;
4573 :
4574 0 : pafDstScanline[iDstPixelSZ * 2] *=
4575 0 : static_cast<float>(dfRatio);
4576 0 : pafDstScanline[iDstPixelSZ * 2 + 1] *=
4577 0 : static_cast<float>(dfRatio);
4578 : }
4579 : }
4580 12 : else if (eMethod == RMS)
4581 : {
4582 12 : double dfTotalR = 0.0;
4583 12 : double dfTotalI = 0.0;
4584 12 : size_t nCount = 0;
4585 :
4586 36 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4587 : {
4588 72 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4589 : {
4590 48 : const double dfR = double(
4591 48 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4592 48 : static_cast<size_t>(iY - nSrcYOff) *
4593 48 : nSrcWidth * 2]);
4594 48 : const double dfI = double(
4595 48 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4596 48 : static_cast<size_t>(iY - nSrcYOff) *
4597 48 : nSrcWidth * 2 +
4598 48 : 1]);
4599 :
4600 48 : dfTotalR += SQUARE(dfR);
4601 48 : dfTotalI += SQUARE(dfI);
4602 :
4603 48 : ++nCount;
4604 : }
4605 : }
4606 :
4607 12 : CPLAssert(nCount > 0);
4608 12 : if (nCount == 0)
4609 : {
4610 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4611 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4612 : }
4613 : else
4614 : {
4615 : /* compute RMS */
4616 12 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4617 12 : sqrt(dfTotalR / static_cast<double>(nCount)));
4618 12 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4619 12 : sqrt(dfTotalI / static_cast<double>(nCount)));
4620 : }
4621 : }
4622 0 : else if (eMethod == AVERAGE)
4623 : {
4624 0 : double dfTotalR = 0.0;
4625 0 : double dfTotalI = 0.0;
4626 0 : size_t nCount = 0;
4627 :
4628 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4629 : {
4630 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4631 : {
4632 : // TODO(schwehr): Maybe use std::complex?
4633 0 : dfTotalR += double(
4634 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4635 0 : static_cast<size_t>(iY - nSrcYOff) *
4636 0 : nSrcWidth * 2]);
4637 0 : dfTotalI += double(
4638 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4639 0 : static_cast<size_t>(iY - nSrcYOff) *
4640 0 : nSrcWidth * 2 +
4641 0 : 1]);
4642 0 : ++nCount;
4643 : }
4644 : }
4645 :
4646 0 : CPLAssert(nCount > 0);
4647 0 : if (nCount == 0)
4648 : {
4649 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4650 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4651 : }
4652 : else
4653 : {
4654 0 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4655 0 : dfTotalR / static_cast<double>(nCount));
4656 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4657 0 : dfTotalI / static_cast<double>(nCount));
4658 : }
4659 : }
4660 : }
4661 : }
4662 :
4663 2 : return CE_None;
4664 : }
4665 :
4666 : /************************************************************************/
4667 : /* GDALRegenerateCascadingOverviews() */
4668 : /* */
4669 : /* Generate a list of overviews in order from largest to */
4670 : /* smallest, computing each from the next larger. */
4671 : /************************************************************************/
4672 :
4673 44 : static CPLErr GDALRegenerateCascadingOverviews(
4674 : GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
4675 : const char *pszResampling, GDALProgressFunc pfnProgress,
4676 : void *pProgressData, CSLConstList papszOptions)
4677 :
4678 : {
4679 : /* -------------------------------------------------------------------- */
4680 : /* First, we must put the overviews in order from largest to */
4681 : /* smallest. */
4682 : /* -------------------------------------------------------------------- */
4683 127 : for (int i = 0; i < nOverviews - 1; ++i)
4684 : {
4685 292 : for (int j = 0; j < nOverviews - i - 1; ++j)
4686 : {
4687 209 : if (papoOvrBands[j]->GetXSize() *
4688 209 : static_cast<float>(papoOvrBands[j]->GetYSize()) <
4689 209 : papoOvrBands[j + 1]->GetXSize() *
4690 209 : static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
4691 : {
4692 0 : GDALRasterBand *poTempBand = papoOvrBands[j];
4693 0 : papoOvrBands[j] = papoOvrBands[j + 1];
4694 0 : papoOvrBands[j + 1] = poTempBand;
4695 : }
4696 : }
4697 : }
4698 :
4699 : /* -------------------------------------------------------------------- */
4700 : /* Count total pixels so we can prepare appropriate scaled */
4701 : /* progress functions. */
4702 : /* -------------------------------------------------------------------- */
4703 44 : double dfTotalPixels = 0.0;
4704 :
4705 171 : for (int i = 0; i < nOverviews; ++i)
4706 : {
4707 127 : dfTotalPixels += papoOvrBands[i]->GetXSize() *
4708 127 : static_cast<double>(papoOvrBands[i]->GetYSize());
4709 : }
4710 :
4711 : /* -------------------------------------------------------------------- */
4712 : /* Generate all the bands. */
4713 : /* -------------------------------------------------------------------- */
4714 44 : double dfPixelsProcessed = 0.0;
4715 :
4716 88 : CPLStringList aosOptions(papszOptions);
4717 44 : aosOptions.SetNameValue("CASCADING", "YES");
4718 171 : for (int i = 0; i < nOverviews; ++i)
4719 : {
4720 127 : GDALRasterBand *poBaseBand = poSrcBand;
4721 127 : if (i != 0)
4722 83 : poBaseBand = papoOvrBands[i - 1];
4723 :
4724 127 : double dfPixels = papoOvrBands[i]->GetXSize() *
4725 127 : static_cast<double>(papoOvrBands[i]->GetYSize());
4726 :
4727 254 : void *pScaledProgressData = GDALCreateScaledProgress(
4728 : dfPixelsProcessed / dfTotalPixels,
4729 127 : (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
4730 : pProgressData);
4731 :
4732 254 : const CPLErr eErr = GDALRegenerateOverviewsEx(
4733 : poBaseBand, 1,
4734 127 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
4735 : pszResampling, GDALScaledProgress, pScaledProgressData,
4736 127 : aosOptions.List());
4737 127 : GDALDestroyScaledProgress(pScaledProgressData);
4738 :
4739 127 : if (eErr != CE_None)
4740 0 : return eErr;
4741 :
4742 127 : dfPixelsProcessed += dfPixels;
4743 :
4744 : // Only do the bit2grayscale promotion on the base band.
4745 127 : if (STARTS_WITH_CI(pszResampling,
4746 : "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
4747 8 : pszResampling = "AVERAGE";
4748 : }
4749 :
4750 44 : return CE_None;
4751 : }
4752 :
4753 : /************************************************************************/
4754 : /* GDALGetResampleFunction() */
4755 : /************************************************************************/
4756 :
4757 19281 : GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
4758 : int *pnRadius)
4759 : {
4760 19281 : if (pnRadius)
4761 19281 : *pnRadius = 0;
4762 19281 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
4763 545 : return GDALResampleChunk_Near;
4764 18736 : else if (STARTS_WITH_CI(pszResampling, "AVER") ||
4765 7508 : EQUAL(pszResampling, "RMS"))
4766 11293 : return GDALResampleChunk_AverageOrRMS;
4767 7443 : else if (EQUAL(pszResampling, "GAUSS"))
4768 : {
4769 26 : if (pnRadius)
4770 26 : *pnRadius = 1;
4771 26 : return GDALResampleChunk_Gauss;
4772 : }
4773 7417 : else if (EQUAL(pszResampling, "MODE"))
4774 142 : return GDALResampleChunk_Mode;
4775 7275 : else if (EQUAL(pszResampling, "CUBIC"))
4776 : {
4777 1648 : if (pnRadius)
4778 1648 : *pnRadius = GWKGetFilterRadius(GRA_Cubic);
4779 1648 : return GDALResampleChunk_Convolution;
4780 : }
4781 5627 : else if (EQUAL(pszResampling, "CUBICSPLINE"))
4782 : {
4783 60 : if (pnRadius)
4784 60 : *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
4785 60 : return GDALResampleChunk_Convolution;
4786 : }
4787 5567 : else if (EQUAL(pszResampling, "LANCZOS"))
4788 : {
4789 50 : if (pnRadius)
4790 50 : *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
4791 50 : return GDALResampleChunk_Convolution;
4792 : }
4793 5517 : else if (EQUAL(pszResampling, "BILINEAR"))
4794 : {
4795 5517 : if (pnRadius)
4796 5517 : *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
4797 5517 : return GDALResampleChunk_Convolution;
4798 : }
4799 : else
4800 : {
4801 0 : CPLError(
4802 : CE_Failure, CPLE_AppDefined,
4803 : "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
4804 : pszResampling);
4805 0 : return nullptr;
4806 : }
4807 : }
4808 :
4809 : /************************************************************************/
4810 : /* GDALGetOvrWorkDataType() */
4811 : /************************************************************************/
4812 :
4813 19163 : GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
4814 : GDALDataType eSrcDataType)
4815 : {
4816 19163 : if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
4817 : {
4818 679 : return eSrcDataType;
4819 : }
4820 18484 : else if (eSrcDataType == GDT_UInt8 &&
4821 17911 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4822 6781 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4823 5375 : EQUAL(pszResampling, "CUBICSPLINE") ||
4824 5355 : EQUAL(pszResampling, "LANCZOS") ||
4825 5348 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4826 : {
4827 17904 : return GDT_UInt8;
4828 : }
4829 580 : else if (eSrcDataType == GDT_UInt16 &&
4830 131 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4831 126 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4832 8 : EQUAL(pszResampling, "CUBICSPLINE") ||
4833 6 : EQUAL(pszResampling, "LANCZOS") ||
4834 3 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4835 : {
4836 131 : return GDT_UInt16;
4837 : }
4838 449 : else if (EQUAL(pszResampling, "GAUSS"))
4839 20 : return GDT_Float64;
4840 :
4841 429 : if (eSrcDataType == GDT_UInt8 || eSrcDataType == GDT_Int8 ||
4842 428 : eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
4843 : eSrcDataType == GDT_Float32)
4844 : {
4845 277 : return GDT_Float32;
4846 : }
4847 152 : return GDT_Float64;
4848 : }
4849 :
4850 : namespace
4851 : {
4852 : // Structure to hold a pointer to free with CPLFree()
4853 : struct PointerHolder
4854 : {
4855 : void *ptr = nullptr;
4856 :
4857 4087 : template <class T> explicit PointerHolder(T *&ptrIn) : ptr(ptrIn)
4858 : {
4859 4087 : ptrIn = nullptr;
4860 4087 : }
4861 :
4862 : template <class T>
4863 32 : explicit PointerHolder(std::unique_ptr<T, VSIFreeReleaser> ptrIn)
4864 32 : : ptr(ptrIn.release())
4865 : {
4866 32 : }
4867 :
4868 4119 : ~PointerHolder()
4869 4119 : {
4870 4119 : CPLFree(ptr);
4871 4119 : }
4872 :
4873 : PointerHolder(const PointerHolder &) = delete;
4874 : PointerHolder &operator=(const PointerHolder &) = delete;
4875 : };
4876 : } // namespace
4877 :
4878 : /************************************************************************/
4879 : /* GDALRegenerateOverviews() */
4880 : /************************************************************************/
4881 :
4882 : /**
4883 : * \brief Generate downsampled overviews.
4884 : *
4885 : * This function will generate one or more overview images from a base image
4886 : * using the requested downsampling algorithm. Its primary use is for
4887 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4888 : * used to generate downsampled images in one file from another outside the
4889 : * overview architecture.
4890 : *
4891 : * The output bands need to exist in advance.
4892 : *
4893 : * The full set of resampling algorithms is documented in
4894 : * GDALDataset::BuildOverviews().
4895 : *
4896 : * This function will honour properly NODATA_VALUES tuples (special dataset
4897 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4898 : * considered as the nodata value and not each value of the triplet
4899 : * independently per band.
4900 : *
4901 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4902 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4903 : * overview computation.
4904 : *
4905 : * @param hSrcBand the source (base level) band.
4906 : * @param nOverviewCount the number of downsampled bands being generated.
4907 : * @param pahOvrBands the list of downsampled bands to be generated.
4908 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4909 : * @param pfnProgress progress report function.
4910 : * @param pProgressData progress function callback data.
4911 : * @return CE_None on success or CE_Failure on failure.
4912 : */
4913 113 : CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
4914 : GDALRasterBandH *pahOvrBands,
4915 : const char *pszResampling,
4916 : GDALProgressFunc pfnProgress,
4917 : void *pProgressData)
4918 :
4919 : {
4920 113 : return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
4921 : pszResampling, pfnProgress, pProgressData,
4922 113 : nullptr);
4923 : }
4924 :
4925 : /************************************************************************/
4926 : /* GDALRegenerateOverviewsEx() */
4927 : /************************************************************************/
4928 :
// Factor by which a radius is multiplied to obtain the matching diameter.
constexpr int RADIUS_TO_DIAMETER{2};
4930 :
4931 : /**
4932 : * \brief Generate downsampled overviews.
4933 : *
4934 : * This function will generate one or more overview images from a base image
4935 : * using the requested downsampling algorithm. Its primary use is for
4936 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4937 : * used to generate downsampled images in one file from another outside the
4938 : * overview architecture.
4939 : *
4940 : * The output bands need to exist in advance.
4941 : *
4942 : * The full set of resampling algorithms is documented in
4943 : * GDALDataset::BuildOverviews().
4944 : *
4945 : * This function will honour properly NODATA_VALUES tuples (special dataset
4946 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4947 : * considered as the nodata value and not each value of the triplet
4948 : * independently per band.
4949 : *
4950 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4951 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4952 : * overview computation.
4953 : *
4954 : * @param hSrcBand the source (base level) band.
4955 : * @param nOverviewCount the number of downsampled bands being generated.
4956 : * @param pahOvrBands the list of downsampled bands to be generated.
4957 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4958 : * @param pfnProgress progress report function.
4959 : * @param pProgressData progress function callback data.
4960 : * @param papszOptions NULL terminated list of options as key=value pairs, or
4961 : * NULL
4962 : * @return CE_None on success or CE_Failure on failure.
4963 : * @since GDAL 3.6
4964 : */
4965 794 : CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
4966 : GDALRasterBandH *pahOvrBands,
4967 : const char *pszResampling,
4968 : GDALProgressFunc pfnProgress,
4969 : void *pProgressData, CSLConstList papszOptions)
4970 :
4971 : {
4972 794 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
4973 794 : GDALRasterBand **papoOvrBands =
4974 : reinterpret_cast<GDALRasterBand **>(pahOvrBands);
4975 :
4976 794 : if (pfnProgress == nullptr)
4977 102 : pfnProgress = GDALDummyProgress;
4978 :
4979 794 : if (EQUAL(pszResampling, "NONE"))
4980 51 : return CE_None;
4981 :
4982 743 : int nKernelRadius = 0;
4983 : GDALResampleFunction pfnResampleFn =
4984 743 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
4985 :
4986 743 : if (pfnResampleFn == nullptr)
4987 0 : return CE_Failure;
4988 :
4989 : /* -------------------------------------------------------------------- */
4990 : /* Check color tables... */
4991 : /* -------------------------------------------------------------------- */
4992 743 : GDALColorTable *poColorTable = nullptr;
4993 :
4994 520 : if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
4995 1564 : EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
4996 312 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4997 : {
4998 9 : poColorTable = poSrcBand->GetColorTable();
4999 9 : if (poColorTable != nullptr)
5000 : {
5001 9 : if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
5002 : {
5003 0 : CPLError(CE_Warning, CPLE_AppDefined,
5004 : "Computing overviews on palette index raster bands "
5005 : "with a palette whose color interpretation is not RGB "
5006 : "will probably lead to unexpected results.");
5007 0 : poColorTable = nullptr;
5008 : }
5009 9 : else if (poColorTable->IsIdentity())
5010 : {
5011 0 : poColorTable = nullptr;
5012 : }
5013 : }
5014 : else
5015 : {
5016 0 : CPLError(CE_Warning, CPLE_AppDefined,
5017 : "Computing overviews on palette index raster bands "
5018 : "without a palette will probably lead to unexpected "
5019 : "results.");
5020 : }
5021 : }
5022 : // Not ready yet
5023 2148 : else if ((EQUAL(pszResampling, "CUBIC") ||
5024 680 : EQUAL(pszResampling, "CUBICSPLINE") ||
5025 680 : EQUAL(pszResampling, "LANCZOS") ||
5026 1494 : EQUAL(pszResampling, "BILINEAR")) &&
5027 80 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
5028 : {
5029 0 : CPLError(CE_Warning, CPLE_AppDefined,
5030 : "Computing %s overviews on palette index raster bands "
5031 : "will probably lead to unexpected results.",
5032 : pszResampling);
5033 : }
5034 :
5035 : // If we have a nodata mask and we are doing something more complicated
5036 : // than nearest neighbouring, we have to fetch to nodata mask.
5037 :
5038 743 : GDALRasterBand *poMaskBand = nullptr;
5039 743 : bool bUseNoDataMask = false;
5040 743 : bool bCanUseCascaded = true;
5041 :
5042 743 : if (!STARTS_WITH_CI(pszResampling, "NEAR"))
5043 : {
5044 : // Special case if we are an alpha/mask band. We want it to be
5045 : // considered as the mask band to avoid alpha=0 to be taken into account
5046 : // in average computation.
5047 392 : if (poSrcBand->IsMaskBand())
5048 : {
5049 51 : poMaskBand = poSrcBand;
5050 51 : bUseNoDataMask = true;
5051 : }
5052 : else
5053 : {
5054 341 : poMaskBand = poSrcBand->GetMaskBand();
5055 341 : const int nMaskFlags = poSrcBand->GetMaskFlags();
5056 341 : bCanUseCascaded =
5057 341 : (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
5058 341 : bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
5059 : }
5060 : }
5061 :
5062 743 : int nHasNoData = 0;
5063 743 : const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
5064 743 : const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
5065 : const bool bPropagateNoData =
5066 743 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5067 :
5068 811 : if (poSrcBand->GetBand() == 1 && bUseNoDataMask &&
5069 68 : CSLFetchNameValue(papszOptions, "CASCADING") == nullptr)
5070 : {
5071 112 : std::string osDetailMessage;
5072 56 : if (poSrcBand->HasConflictingMaskSources(&osDetailMessage, false))
5073 : {
5074 2 : CPLError(
5075 : CE_Warning, CPLE_AppDefined, "%s%s", osDetailMessage.c_str(),
5076 : bHasNoData
5077 : ? "Only the nodata value will be taken into account."
5078 : : "Only the first listed one will be taken into account.");
5079 : }
5080 : }
5081 :
5082 : /* -------------------------------------------------------------------- */
5083 : /* If we are operating on multiple overviews, and using */
5084 : /* averaging, lets do them in cascading order to reduce the */
5085 : /* amount of computation. */
5086 : /* -------------------------------------------------------------------- */
5087 :
5088 : // In case the mask made be computed from another band of the dataset,
5089 : // we can't use cascaded generation, as the computation of the overviews
5090 : // of the band used for the mask band may not have yet occurred (#3033).
5091 743 : if ((STARTS_WITH_CI(pszResampling, "AVER") ||
5092 520 : EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
5093 489 : EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
5094 435 : EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
5095 743 : EQUAL(pszResampling, "MODE")) &&
5096 44 : nOverviewCount > 1 && bCanUseCascaded)
5097 44 : return GDALRegenerateCascadingOverviews(
5098 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
5099 44 : pProgressData, papszOptions);
5100 :
5101 : /* -------------------------------------------------------------------- */
5102 : /* Setup one horizontal swath to read from the raw buffer. */
5103 : /* -------------------------------------------------------------------- */
5104 699 : int nFRXBlockSize = 0;
5105 699 : int nFRYBlockSize = 0;
5106 699 : poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
5107 :
5108 699 : const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
5109 1047 : const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
5110 997 : EQUAL(pszResampling, "MODE") ||
5111 298 : !GDALDataTypeIsComplex(eSrcDataType);
5112 : const GDALDataType eWrkDataType =
5113 : bUseGenericResampleFn
5114 699 : ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
5115 699 : : GDT_CFloat32;
5116 :
5117 699 : const int nWidth = poSrcBand->GetXSize();
5118 699 : const int nHeight = poSrcBand->GetYSize();
5119 :
5120 699 : int nMaxOvrFactor = 1;
5121 1521 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
5122 : {
5123 822 : const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
5124 822 : const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
5125 822 : nMaxOvrFactor = std::max(
5126 : nMaxOvrFactor,
5127 822 : static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
5128 822 : nMaxOvrFactor = std::max(
5129 : nMaxOvrFactor,
5130 822 : static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
5131 : }
5132 :
5133 699 : int nFullResYChunk = nFRYBlockSize;
5134 699 : int nMaxChunkYSizeQueried = 0;
5135 :
5136 : const auto UpdateChunkHeightAndGetChunkSize =
5137 9441 : [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
5138 76489 : eWrkDataType, nWidth]()
5139 : {
5140 : // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
5141 : // + nFullResYChunk) / nMaxOvrFactor)
5142 9441 : if (nMaxOvrFactor > INT_MAX / RADIUS_TO_DIAMETER)
5143 : {
5144 1 : return GINTBIG_MAX;
5145 : }
5146 9440 : nFullResYChunk =
5147 9440 : std::max(nFullResYChunk, RADIUS_TO_DIAMETER * nMaxOvrFactor);
5148 9440 : if ((nKernelRadius > 0 &&
5149 970 : nMaxOvrFactor > INT_MAX / (RADIUS_TO_DIAMETER * nKernelRadius)) ||
5150 9440 : nFullResYChunk >
5151 9440 : INT_MAX - RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor)
5152 : {
5153 0 : return GINTBIG_MAX;
5154 : }
5155 9440 : nMaxChunkYSizeQueried =
5156 9440 : nFullResYChunk + RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor;
5157 9440 : if (GDALGetDataTypeSizeBytes(eWrkDataType) >
5158 9440 : std::numeric_limits<int64_t>::max() /
5159 9440 : (static_cast<int64_t>(nMaxChunkYSizeQueried) * nWidth))
5160 : {
5161 1 : return GINTBIG_MAX;
5162 : }
5163 9439 : return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
5164 9439 : nMaxChunkYSizeQueried * nWidth;
5165 699 : };
5166 :
5167 : const char *pszChunkYSize =
5168 699 : CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
5169 : #ifndef __COVERITY__
5170 : // Only configurable for debug / testing
5171 699 : if (pszChunkYSize)
5172 : {
5173 0 : nFullResYChunk = atoi(pszChunkYSize);
5174 : }
5175 : #endif
5176 :
5177 : // Only configurable for debug / testing
5178 : const int nChunkMaxSize =
5179 699 : atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
5180 :
5181 699 : auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
5182 699 : if (nChunkSize > nChunkMaxSize)
5183 : {
5184 15 : if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
5185 44 : !GDALDataTypeIsComplex(eSrcDataType) &&
5186 14 : (!STARTS_WITH_CI(pszResampling, "AVER") ||
5187 2 : EQUAL(pszResampling, "AVERAGE")))
5188 : {
5189 : // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
5190 : // which uses a block-based strategy, which is much less memory
5191 : // hungry.
5192 14 : return GDALRegenerateOverviewsMultiBand(
5193 : 1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
5194 14 : pfnProgress, pProgressData, papszOptions);
5195 : }
5196 1 : else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
5197 : {
5198 0 : return GDALRegenerateCascadingOverviews(
5199 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
5200 0 : pfnProgress, pProgressData, papszOptions);
5201 : }
5202 : }
5203 684 : else if (pszChunkYSize == nullptr)
5204 : {
5205 : // Try to get as close as possible to nChunkMaxSize
5206 9426 : while (nChunkSize < nChunkMaxSize / 2)
5207 : {
5208 8742 : nFullResYChunk *= 2;
5209 8742 : nChunkSize = UpdateChunkHeightAndGetChunkSize();
5210 : }
5211 : }
5212 :
5213 : // Structure describing a resampling job
5214 : struct OvrJob
5215 : {
5216 : // Buffers to free when job is finished
5217 : std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
5218 : std::shared_ptr<PointerHolder> oSrcBufferHolder{};
5219 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
5220 :
5221 : GDALRasterBand *poDstBand = nullptr;
5222 :
5223 : // Input parameters of pfnResampleFn
5224 : GDALResampleFunction pfnResampleFn = nullptr;
5225 : int nSrcWidth = 0;
5226 : int nSrcHeight = 0;
5227 : int nDstWidth = 0;
5228 : GDALOverviewResampleArgs args{};
5229 : const void *pChunk = nullptr;
5230 : bool bUseGenericResampleFn = false;
5231 :
5232 : // Output values of resampling function
5233 : CPLErr eErr = CE_Failure;
5234 : void *pDstBuffer = nullptr;
5235 : GDALDataType eDstBufferDataType = GDT_Unknown;
5236 :
5237 0 : void SetSrcMaskBufferHolder(
5238 : const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
5239 : {
5240 0 : oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
5241 0 : }
5242 :
5243 0 : void SetSrcBufferHolder(
5244 : const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
5245 : {
5246 0 : oSrcBufferHolder = oSrcBufferHolderIn;
5247 0 : }
5248 :
5249 791 : void NotifyFinished()
5250 : {
5251 1582 : std::lock_guard guard(mutex);
5252 791 : bFinished = true;
5253 791 : cv.notify_one();
5254 791 : }
5255 :
5256 0 : bool IsFinished()
5257 : {
5258 0 : std::lock_guard guard(mutex);
5259 0 : return bFinished;
5260 : }
5261 :
5262 0 : void WaitFinished()
5263 : {
5264 0 : std::unique_lock oGuard(mutex);
5265 0 : while (!bFinished)
5266 : {
5267 0 : cv.wait(oGuard);
5268 : }
5269 0 : }
5270 :
5271 : private:
5272 : // Synchronization
5273 : bool bFinished = false;
5274 : std::mutex mutex{};
5275 : std::condition_variable cv{};
5276 : };
5277 :
5278 : // Thread function to resample
5279 791 : const auto JobResampleFunc = [](void *pData)
5280 : {
5281 791 : OvrJob *poJob = static_cast<OvrJob *>(pData);
5282 :
5283 791 : if (poJob->bUseGenericResampleFn)
5284 : {
5285 789 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
5286 : &(poJob->pDstBuffer),
5287 : &(poJob->eDstBufferDataType));
5288 : }
5289 : else
5290 : {
5291 2 : poJob->eErr = GDALResampleChunkC32R(
5292 : poJob->nSrcWidth, poJob->nSrcHeight,
5293 2 : static_cast<const float *>(poJob->pChunk),
5294 : poJob->args.nChunkYOff, poJob->args.nChunkYSize,
5295 : poJob->args.nDstYOff, poJob->args.nDstYOff2,
5296 : poJob->args.nOvrXSize, poJob->args.nOvrYSize,
5297 : &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
5298 : poJob->args.pszResampling);
5299 : }
5300 :
5301 791 : auto pDstBuffer = poJob->pDstBuffer;
5302 791 : poJob->oDstBufferHolder = std::make_unique<PointerHolder>(pDstBuffer);
5303 :
5304 791 : poJob->NotifyFinished();
5305 791 : };
5306 :
5307 : // Function to write resample data to target band
5308 791 : const auto WriteJobData = [](const OvrJob *poJob)
5309 : {
5310 1582 : return poJob->poDstBand->RasterIO(
5311 791 : GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
5312 791 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
5313 791 : poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
5314 791 : poJob->eDstBufferDataType, 0, 0, nullptr);
5315 : };
5316 :
5317 : // Wait for completion of oldest job and serialize it
5318 : const auto WaitAndFinalizeOldestJob =
5319 0 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
5320 : {
5321 0 : auto poOldestJob = jobList.front().get();
5322 0 : poOldestJob->WaitFinished();
5323 0 : CPLErr l_eErr = poOldestJob->eErr;
5324 0 : if (l_eErr == CE_None)
5325 : {
5326 0 : l_eErr = WriteJobData(poOldestJob);
5327 : }
5328 :
5329 0 : jobList.pop_front();
5330 0 : return l_eErr;
5331 : };
5332 :
5333 : // Queue of jobs
5334 1370 : std::list<std::unique_ptr<OvrJob>> jobList;
5335 :
5336 685 : GByte *pabyChunkNodataMask = nullptr;
5337 685 : void *pChunk = nullptr;
5338 :
5339 685 : const int nThreads = GDALGetNumThreads(GDAL_DEFAULT_MAX_THREAD_COUNT,
5340 : /* bDefaultToAllCPUs=*/false);
5341 : auto poThreadPool =
5342 685 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5343 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5344 1370 : : std::unique_ptr<CPLJobQueue>(nullptr);
5345 :
5346 : /* -------------------------------------------------------------------- */
5347 : /* Loop over image operating on chunks. */
5348 : /* -------------------------------------------------------------------- */
5349 685 : int nChunkYOff = 0;
5350 685 : CPLErr eErr = CE_None;
5351 :
5352 1375 : for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
5353 690 : nChunkYOff += nFullResYChunk)
5354 : {
5355 690 : if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
5356 : pProgressData))
5357 : {
5358 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5359 0 : eErr = CE_Failure;
5360 : }
5361 :
5362 690 : if (nFullResYChunk + nChunkYOff > nHeight)
5363 682 : nFullResYChunk = nHeight - nChunkYOff;
5364 :
5365 690 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
5366 690 : int nChunkYSizeQueried =
5367 690 : nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
5368 690 : if (nChunkYOffQueried < 0)
5369 : {
5370 83 : nChunkYSizeQueried += nChunkYOffQueried;
5371 83 : nChunkYOffQueried = 0;
5372 : }
5373 690 : if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
5374 83 : nChunkYSizeQueried = nHeight - nChunkYOffQueried;
5375 :
5376 : // Avoid accumulating too many tasks and exhaust RAM
5377 : // Try to complete already finished jobs
5378 690 : while (eErr == CE_None && !jobList.empty())
5379 : {
5380 0 : auto poOldestJob = jobList.front().get();
5381 0 : if (!poOldestJob->IsFinished())
5382 0 : break;
5383 0 : eErr = poOldestJob->eErr;
5384 0 : if (eErr == CE_None)
5385 : {
5386 0 : eErr = WriteJobData(poOldestJob);
5387 : }
5388 :
5389 0 : jobList.pop_front();
5390 : }
5391 :
5392 : // And in case we have saturated the number of threads,
5393 : // wait for completion of tasks to go below the threshold.
5394 1380 : while (eErr == CE_None &&
5395 690 : jobList.size() >= static_cast<size_t>(nThreads))
5396 : {
5397 0 : eErr = WaitAndFinalizeOldestJob(jobList);
5398 : }
5399 :
5400 : // (Re)allocate buffers if needed
5401 690 : if (pChunk == nullptr)
5402 : {
5403 685 : pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
5404 : nMaxChunkYSizeQueried, nWidth);
5405 : }
5406 690 : if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
5407 : {
5408 139 : pabyChunkNodataMask = static_cast<GByte *>(
5409 139 : VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
5410 : }
5411 :
5412 690 : if (pChunk == nullptr ||
5413 139 : (bUseNoDataMask && pabyChunkNodataMask == nullptr))
5414 : {
5415 0 : CPLFree(pChunk);
5416 0 : CPLFree(pabyChunkNodataMask);
5417 0 : return CE_Failure;
5418 : }
5419 :
5420 : // Read chunk.
5421 690 : if (eErr == CE_None)
5422 690 : eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
5423 : nChunkYSizeQueried, pChunk, nWidth,
5424 : nChunkYSizeQueried, eWrkDataType, 0, 0,
5425 : nullptr);
5426 690 : if (eErr == CE_None && bUseNoDataMask)
5427 139 : eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
5428 : nChunkYSizeQueried, pabyChunkNodataMask,
5429 : nWidth, nChunkYSizeQueried, GDT_UInt8,
5430 : 0, 0, nullptr);
5431 :
5432 : // Special case to promote 1bit data to 8bit 0/255 values.
5433 690 : if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
5434 : {
5435 9 : if (eWrkDataType == GDT_Float32)
5436 : {
5437 0 : float *pafChunk = static_cast<float *>(pChunk);
5438 0 : for (size_t i = 0;
5439 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5440 : {
5441 0 : if (pafChunk[i] == 1.0f)
5442 0 : pafChunk[i] = 255.0f;
5443 : }
5444 : }
5445 9 : else if (eWrkDataType == GDT_UInt8)
5446 : {
5447 9 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
5448 168417 : for (size_t i = 0;
5449 168417 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5450 : {
5451 168408 : if (pabyChunk[i] == 1)
5452 127437 : pabyChunk[i] = 255;
5453 : }
5454 : }
5455 0 : else if (eWrkDataType == GDT_UInt16)
5456 : {
5457 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
5458 0 : for (size_t i = 0;
5459 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5460 : {
5461 0 : if (pasChunk[i] == 1)
5462 0 : pasChunk[i] = 255;
5463 : }
5464 : }
5465 0 : else if (eWrkDataType == GDT_Float64)
5466 : {
5467 0 : double *padfChunk = static_cast<double *>(pChunk);
5468 0 : for (size_t i = 0;
5469 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5470 : {
5471 0 : if (padfChunk[i] == 1.0)
5472 0 : padfChunk[i] = 255.0;
5473 : }
5474 : }
5475 : else
5476 : {
5477 0 : CPLAssert(false);
5478 : }
5479 : }
5480 681 : else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
5481 : {
5482 0 : if (eWrkDataType == GDT_Float32)
5483 : {
5484 0 : float *pafChunk = static_cast<float *>(pChunk);
5485 0 : for (size_t i = 0;
5486 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5487 : {
5488 0 : if (pafChunk[i] == 1.0f)
5489 0 : pafChunk[i] = 0.0f;
5490 0 : else if (pafChunk[i] == 0.0f)
5491 0 : pafChunk[i] = 255.0f;
5492 : }
5493 : }
5494 0 : else if (eWrkDataType == GDT_UInt8)
5495 : {
5496 0 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
5497 0 : for (size_t i = 0;
5498 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5499 : {
5500 0 : if (pabyChunk[i] == 1)
5501 0 : pabyChunk[i] = 0;
5502 0 : else if (pabyChunk[i] == 0)
5503 0 : pabyChunk[i] = 255;
5504 : }
5505 : }
5506 0 : else if (eWrkDataType == GDT_UInt16)
5507 : {
5508 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
5509 0 : for (size_t i = 0;
5510 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5511 : {
5512 0 : if (pasChunk[i] == 1)
5513 0 : pasChunk[i] = 0;
5514 0 : else if (pasChunk[i] == 0)
5515 0 : pasChunk[i] = 255;
5516 : }
5517 : }
5518 0 : else if (eWrkDataType == GDT_Float64)
5519 : {
5520 0 : double *padfChunk = static_cast<double *>(pChunk);
5521 0 : for (size_t i = 0;
5522 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5523 : {
5524 0 : if (padfChunk[i] == 1.0)
5525 0 : padfChunk[i] = 0.0;
5526 0 : else if (padfChunk[i] == 0.0)
5527 0 : padfChunk[i] = 255.0;
5528 : }
5529 : }
5530 : else
5531 : {
5532 0 : CPLAssert(false);
5533 : }
5534 : }
5535 :
5536 690 : auto pChunkRaw = pChunk;
5537 690 : auto pabyChunkNodataMaskRaw = pabyChunkNodataMask;
5538 690 : std::shared_ptr<PointerHolder> oSrcBufferHolder;
5539 690 : std::shared_ptr<PointerHolder> oSrcMaskBufferHolder;
5540 690 : if (poJobQueue)
5541 : {
5542 0 : oSrcBufferHolder = std::make_shared<PointerHolder>(pChunk);
5543 : oSrcMaskBufferHolder =
5544 0 : std::make_shared<PointerHolder>(pabyChunkNodataMask);
5545 : }
5546 :
5547 1481 : for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
5548 : ++iOverview)
5549 : {
5550 791 : GDALRasterBand *poDstBand = papoOvrBands[iOverview];
5551 791 : const int nDstWidth = poDstBand->GetXSize();
5552 791 : const int nDstHeight = poDstBand->GetYSize();
5553 :
5554 791 : const double dfXRatioDstToSrc =
5555 791 : static_cast<double>(nWidth) / nDstWidth;
5556 791 : const double dfYRatioDstToSrc =
5557 791 : static_cast<double>(nHeight) / nDstHeight;
5558 :
5559 : /* --------------------------------------------------------------------
5560 : */
5561 : /* Figure out the line to start writing to, and the first line
5562 : */
5563 : /* to not write to. In theory this approach should ensure that
5564 : */
5565 : /* every output line will be written if all input chunks are */
5566 : /* processed. */
5567 : /* --------------------------------------------------------------------
5568 : */
5569 791 : int nDstYOff =
5570 791 : static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
5571 791 : if (nDstYOff == nDstHeight)
5572 0 : continue;
5573 791 : int nDstYOff2 = static_cast<int>(
5574 791 : 0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
5575 :
5576 791 : if (nChunkYOff + nFullResYChunk == nHeight)
5577 784 : nDstYOff2 = nDstHeight;
5578 : #if DEBUG_VERBOSE
5579 : CPLDebug("GDAL",
5580 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
5581 : nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
5582 : nDstWidth, nDstYOff2 - nDstYOff);
5583 : #endif
5584 :
5585 1582 : auto poJob = std::make_unique<OvrJob>();
5586 791 : poJob->pfnResampleFn = pfnResampleFn;
5587 791 : poJob->bUseGenericResampleFn = bUseGenericResampleFn;
5588 791 : poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
5589 791 : poJob->args.nOvrXSize = poDstBand->GetXSize();
5590 791 : poJob->args.nOvrYSize = poDstBand->GetYSize();
5591 1582 : const char *pszNBITS = poDstBand->GetMetadataItem(
5592 791 : GDALMD_NBITS, GDAL_MDD_IMAGE_STRUCTURE);
5593 791 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
5594 791 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
5595 791 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
5596 791 : poJob->args.eWrkDataType = eWrkDataType;
5597 791 : poJob->pChunk = pChunkRaw;
5598 791 : poJob->args.pabyChunkNodataMask = pabyChunkNodataMaskRaw;
5599 791 : poJob->nSrcWidth = nWidth;
5600 791 : poJob->nSrcHeight = nHeight;
5601 791 : poJob->args.nChunkXOff = 0;
5602 791 : poJob->args.nChunkXSize = nWidth;
5603 791 : poJob->args.nChunkYOff = nChunkYOffQueried;
5604 791 : poJob->args.nChunkYSize = nChunkYSizeQueried;
5605 791 : poJob->nDstWidth = nDstWidth;
5606 791 : poJob->args.nDstXOff = 0;
5607 791 : poJob->args.nDstXOff2 = nDstWidth;
5608 791 : poJob->args.nDstYOff = nDstYOff;
5609 791 : poJob->args.nDstYOff2 = nDstYOff2;
5610 791 : poJob->poDstBand = poDstBand;
5611 791 : poJob->args.pszResampling = pszResampling;
5612 791 : poJob->args.bHasNoData = bHasNoData;
5613 791 : poJob->args.dfNoDataValue = dfNoDataValue;
5614 791 : poJob->args.poColorTable = poColorTable;
5615 791 : poJob->args.eSrcDataType = eSrcDataType;
5616 791 : poJob->args.bPropagateNoData = bPropagateNoData;
5617 :
5618 791 : if (poJobQueue)
5619 : {
5620 0 : poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
5621 0 : poJob->SetSrcBufferHolder(oSrcBufferHolder);
5622 0 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
5623 0 : jobList.emplace_back(std::move(poJob));
5624 : }
5625 : else
5626 : {
5627 791 : JobResampleFunc(poJob.get());
5628 791 : eErr = poJob->eErr;
5629 791 : if (eErr == CE_None)
5630 : {
5631 791 : eErr = WriteJobData(poJob.get());
5632 : }
5633 : }
5634 : }
5635 : }
5636 :
5637 685 : VSIFree(pChunk);
5638 685 : VSIFree(pabyChunkNodataMask);
5639 :
5640 : // Wait for all pending jobs to complete
5641 685 : while (!jobList.empty())
5642 : {
5643 0 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5644 0 : if (l_eErr != CE_None && eErr == CE_None)
5645 0 : eErr = l_eErr;
5646 : }
5647 :
5648 : /* -------------------------------------------------------------------- */
5649 : /* Renormalized overview mean / stddev if needed. */
5650 : /* -------------------------------------------------------------------- */
5651 685 : if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
5652 : {
5653 0 : GDALOverviewMagnitudeCorrection(
5654 : poSrcBand, nOverviewCount,
5655 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
5656 : GDALDummyProgress, nullptr);
5657 : }
5658 :
5659 : /* -------------------------------------------------------------------- */
5660 : /* It can be important to flush out data to overviews. */
5661 : /* -------------------------------------------------------------------- */
5662 1469 : for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
5663 : ++iOverview)
5664 : {
5665 784 : eErr = papoOvrBands[iOverview]->FlushCache(false);
5666 : }
5667 :
5668 685 : if (eErr == CE_None)
5669 685 : pfnProgress(1.0, nullptr, pProgressData);
5670 :
5671 685 : return eErr;
5672 : }
5673 :
5674 : /************************************************************************/
5675 : /* GDALRegenerateOverviewsMultiBand() */
5676 : /************************************************************************/
5677 :
5678 : /**
5679 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5680 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5681 : *
5682 : * This function will generate one or more overview images from a base
5683 : * image using the requested downsampling algorithm. Its primary use
5684 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
5685 : * can also be used to generate downsampled images in one file from another
5686 : * outside the overview architecture.
5687 : *
5688 : * The output bands need to exist in advance and share the same characteristics
5689 : * (type, dimensions)
5690 : *
5691 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5692 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE"
5693 : *
5694 : * It does not support color tables or complex data types.
5695 : *
5696 : * The pseudo-algorithm used by the function is :
5697 : * for each overview
5698 : * iterate on lines of the source by a step of deltay
5699 : * iterate on columns of the source by a step of deltax
5700 : * read the source data of size deltax * deltay for all the bands
5701 : * generate the corresponding overview block for all the bands
5702 : *
5703 : * This function will properly honour NODATA_VALUES tuples (special dataset
5704 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5705 : * considered as the nodata value and not each value of the triplet
5706 : * independently per band.
5707 : *
5708 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
5709 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
5710 : * overview computation.
5711 : *
5712 : * @param nBands the number of bands, size of papoSrcBands and size of
5713 : * first dimension of papapoOverviewBands
5714 : * @param papoSrcBands the list of source bands to downsample
5715 : * @param nOverviews the number of downsampled overview levels being generated.
5716 : * @param papapoOverviewBands two-dimensional array of bands. The first
5717 : * dimension is indexed by nBands. The second dimension is
5718 : * indexed by nOverviews.
5719 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5720 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
5721 : * @param pfnProgress progress report function.
5722 : * @param pProgressData progress function callback data.
5723 : * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
5724 : * key=value pairs, or NULL
5725 : * Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
5726 : * options can be specified to express that overviews should
5727 : * be regenerated only in the specified subset of the source
5728 : * dataset.
5729 : * @return CE_None on success or CE_Failure on failure.
5730 : */
5731 :
5732 390 : CPLErr GDALRegenerateOverviewsMultiBand(
5733 : int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
5734 : GDALRasterBand *const *const *papapoOverviewBands,
5735 : const char *pszResampling, GDALProgressFunc pfnProgress,
5736 : void *pProgressData, CSLConstList papszOptions)
5737 : {
5738 390 : CPL_IGNORE_RET_VAL(papszOptions);
5739 :
5740 390 : if (pfnProgress == nullptr)
5741 11 : pfnProgress = GDALDummyProgress;
5742 :
5743 390 : if (EQUAL(pszResampling, "NONE") || nBands == 0 || nOverviews == 0)
5744 3 : return CE_None;
5745 :
5746 : // Sanity checks.
5747 387 : if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
5748 193 : !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
5749 84 : !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
5750 25 : !EQUAL(pszResampling, "CUBICSPLINE") &&
5751 24 : !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
5752 5 : !EQUAL(pszResampling, "MODE"))
5753 : {
5754 0 : CPLError(CE_Failure, CPLE_NotSupported,
5755 : "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
5756 : "not supported",
5757 : pszResampling);
5758 0 : return CE_Failure;
5759 : }
5760 :
5761 387 : int nKernelRadius = 0;
5762 : GDALResampleFunction pfnResampleFn =
5763 387 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
5764 387 : if (pfnResampleFn == nullptr)
5765 0 : return CE_Failure;
5766 :
5767 387 : const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
5768 387 : const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
5769 387 : if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
5770 0 : return CE_None;
5771 387 : GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
5772 66235 : for (int iBand = 1; iBand < nBands; ++iBand)
5773 : {
5774 131696 : if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
5775 65848 : papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
5776 : {
5777 0 : CPLError(
5778 : CE_Failure, CPLE_NotSupported,
5779 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5780 : "have the same dimensions");
5781 0 : return CE_Failure;
5782 : }
5783 65848 : if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
5784 : {
5785 0 : CPLError(
5786 : CE_Failure, CPLE_NotSupported,
5787 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5788 : "have the same data type");
5789 0 : return CE_Failure;
5790 : }
5791 : }
5792 :
5793 1030 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5794 : {
5795 643 : const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
5796 643 : const int nDstWidth = poOvrFirstBand->GetXSize();
5797 643 : const int nDstHeight = poOvrFirstBand->GetYSize();
5798 66751 : for (int iBand = 1; iBand < nBands; ++iBand)
5799 : {
5800 66108 : const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
5801 132216 : if (poOvrBand->GetXSize() != nDstWidth ||
5802 66108 : poOvrBand->GetYSize() != nDstHeight)
5803 : {
5804 0 : CPLError(
5805 : CE_Failure, CPLE_NotSupported,
5806 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5807 : "of the same level must have the same dimensions");
5808 0 : return CE_Failure;
5809 : }
5810 66108 : if (poOvrBand->GetRasterDataType() != eDataType)
5811 : {
5812 0 : CPLError(
5813 : CE_Failure, CPLE_NotSupported,
5814 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5815 : "must have the same data type as the source bands");
5816 0 : return CE_Failure;
5817 : }
5818 : }
5819 : }
5820 :
5821 : // First pass to compute the total number of pixels to write.
5822 387 : double dfTotalPixelCount = 0;
5823 387 : const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
5824 387 : const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
5825 387 : const int nSrcXSize = atoi(CSLFetchNameValueDef(
5826 : papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
5827 387 : const int nSrcYSize = atoi(CSLFetchNameValueDef(
5828 : papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
5829 1030 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5830 : {
5831 643 : dfTotalPixelCount +=
5832 1286 : static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
5833 643 : papapoOverviewBands[0][iOverview]->GetXSize() *
5834 1286 : static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
5835 643 : papapoOverviewBands[0][iOverview]->GetYSize();
5836 : }
5837 :
5838 : const GDALDataType eWrkDataType =
5839 387 : GDALGetOvrWorkDataType(pszResampling, eDataType);
5840 : const int nWrkDataTypeSize =
5841 387 : std::max(1, GDALGetDataTypeSizeBytes(eWrkDataType));
5842 :
5843 387 : const bool bIsMask = papoSrcBands[0]->IsMaskBand();
5844 :
5845 : // If we have a nodata mask and we are doing something more complicated
5846 : // than nearest neighbouring, we have to fetch the nodata mask.
5847 : const bool bUseNoDataMask =
5848 574 : !STARTS_WITH_CI(pszResampling, "NEAR") &&
5849 187 : (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
5850 :
5851 774 : std::vector<bool> abHasNoData(nBands);
5852 774 : std::vector<double> adfNoDataValue(nBands);
5853 :
5854 66622 : for (int iBand = 0; iBand < nBands; ++iBand)
5855 : {
5856 66235 : int nHasNoData = 0;
5857 132470 : adfNoDataValue[iBand] =
5858 66235 : papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
5859 66235 : abHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
5860 : }
5861 :
5862 774 : std::string osDetailMessage;
5863 440 : if (bUseNoDataMask &&
5864 53 : papoSrcBands[0]->HasConflictingMaskSources(&osDetailMessage, false))
5865 : {
5866 9 : CPLError(CE_Warning, CPLE_AppDefined, "%s%s", osDetailMessage.c_str(),
5867 18 : abHasNoData[0]
5868 : ? "Only the nodata value will be taken into account."
5869 9 : : "Only the first listed one will be taken into account.");
5870 : }
5871 :
5872 : const bool bPropagateNoData =
5873 387 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5874 :
5875 387 : const int nThreads = GDALGetNumThreads(GDAL_DEFAULT_MAX_THREAD_COUNT,
5876 : /* bDefaultToAllCPUs=*/false);
5877 : auto poThreadPool =
5878 387 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5879 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5880 774 : : std::unique_ptr<CPLJobQueue>(nullptr);
5881 :
5882 : // Only configurable for debug / testing
5883 387 : const GIntBig nChunkMaxSize = []() -> GIntBig
5884 : {
5885 : const char *pszVal =
5886 387 : CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", nullptr);
5887 387 : if (pszVal)
5888 : {
5889 15 : GIntBig nRet = 0;
5890 15 : CPLParseMemorySize(pszVal, &nRet, nullptr);
5891 15 : return std::max<GIntBig>(100, nRet);
5892 : }
5893 372 : return 10 * 1024 * 1024;
5894 387 : }();
5895 :
5896 : // Only configurable for debug / testing
5897 387 : const GIntBig nChunkMaxSizeForTempFile = []() -> GIntBig
5898 : {
5899 387 : const char *pszVal = CPLGetConfigOption(
5900 : "GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE", nullptr);
5901 387 : if (pszVal)
5902 : {
5903 14 : GIntBig nRet = 0;
5904 14 : CPLParseMemorySize(pszVal, &nRet, nullptr);
5905 14 : return std::max<GIntBig>(100, nRet);
5906 : }
5907 373 : const auto nUsableRAM = CPLGetUsablePhysicalRAM();
5908 373 : if (nUsableRAM > 0)
5909 373 : return nUsableRAM / 10;
5910 : // Select a value to be able to at least downsample by 2 for a RGB
5911 : // 1024x1024 tiled output: (2 * 1024 + 2) * (2 * 1024 + 2) * 3 = 12 MB
5912 0 : return 100 * 1024 * 1024;
5913 387 : }();
5914 :
5915 : // Second pass to do the real job.
5916 387 : double dfCurPixelCount = 0;
5917 387 : CPLErr eErr = CE_None;
5918 1024 : for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
5919 : ++iOverview)
5920 : {
5921 642 : int iSrcOverview = -1; // -1 means the source bands.
5922 :
5923 : const int nDstTotalWidth =
5924 642 : papapoOverviewBands[0][iOverview]->GetXSize();
5925 : const int nDstTotalHeight =
5926 642 : papapoOverviewBands[0][iOverview]->GetYSize();
5927 :
5928 : // Compute the coordinates of the target region to refresh
5929 642 : constexpr double EPS = 1e-8;
5930 642 : const int nDstXOffStart = static_cast<int>(
5931 642 : static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
5932 : EPS);
5933 : const int nDstXOffEnd =
5934 1284 : std::min(static_cast<int>(
5935 642 : std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
5936 642 : nToplevelSrcWidth * nDstTotalWidth -
5937 : EPS)),
5938 642 : nDstTotalWidth);
5939 642 : const int nDstWidth = nDstXOffEnd - nDstXOffStart;
5940 642 : const int nDstYOffStart =
5941 642 : static_cast<int>(static_cast<double>(nSrcYOff) /
5942 642 : nToplevelSrcHeight * nDstTotalHeight +
5943 : EPS);
5944 : const int nDstYOffEnd =
5945 1284 : std::min(static_cast<int>(
5946 642 : std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
5947 642 : nToplevelSrcHeight * nDstTotalHeight -
5948 : EPS)),
5949 642 : nDstTotalHeight);
5950 642 : const int nDstHeight = nDstYOffEnd - nDstYOffStart;
5951 :
5952 : // Try to use previous level of overview as the source to compute
5953 : // the next level.
5954 642 : int nSrcWidth = nToplevelSrcWidth;
5955 642 : int nSrcHeight = nToplevelSrcHeight;
5956 897 : if (iOverview > 0 &&
5957 255 : papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
5958 : {
5959 247 : nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
5960 247 : nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
5961 247 : iSrcOverview = iOverview - 1;
5962 : }
5963 :
5964 642 : const double dfXRatioDstToSrc =
5965 642 : static_cast<double>(nSrcWidth) / nDstTotalWidth;
5966 642 : const double dfYRatioDstToSrc =
5967 642 : static_cast<double>(nSrcHeight) / nDstTotalHeight;
5968 :
5969 : const int nOvrFactor =
5970 1926 : std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
5971 642 : static_cast<int>(0.5 + dfYRatioDstToSrc)));
5972 :
5973 642 : int nDstChunkXSize = 0;
5974 642 : int nDstChunkYSize = 0;
5975 642 : papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
5976 : &nDstChunkYSize);
5977 :
5978 642 : constexpr int PIXEL_MARGIN = 2;
5979 : // Try to extend the chunk size so that the memory needed to acquire
5980 : // source pixels goes up to 10 MB.
5981 : // This can help for drivers that support multi-threaded reading
5982 642 : const int nFullResYChunk = static_cast<int>(std::min<double>(
5983 642 : nSrcHeight, PIXEL_MARGIN + nDstChunkYSize * dfYRatioDstToSrc));
5984 642 : const int nFullResYChunkQueried = static_cast<int>(std::min<int64_t>(
5985 1284 : nSrcHeight,
5986 1284 : nFullResYChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5987 642 : nKernelRadius * nOvrFactor));
5988 876 : while (nDstChunkXSize < nDstWidth)
5989 : {
5990 254 : constexpr int INCREASE_FACTOR = 2;
5991 :
5992 254 : const int nFullResXChunk = static_cast<int>(std::min<double>(
5993 508 : nSrcWidth, PIXEL_MARGIN + INCREASE_FACTOR * nDstChunkXSize *
5994 254 : dfXRatioDstToSrc));
5995 :
5996 : const int nFullResXChunkQueried =
5997 254 : static_cast<int>(std::min<int64_t>(
5998 508 : nSrcWidth,
5999 508 : nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
6000 254 : nKernelRadius * nOvrFactor));
6001 :
6002 254 : if (nBands > nChunkMaxSize / nFullResXChunkQueried /
6003 254 : nFullResYChunkQueried / nWrkDataTypeSize)
6004 : {
6005 20 : break;
6006 : }
6007 :
6008 234 : nDstChunkXSize *= INCREASE_FACTOR;
6009 : }
6010 642 : nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
6011 :
6012 642 : const int nFullResXChunk = static_cast<int>(std::min<double>(
6013 642 : nSrcWidth, PIXEL_MARGIN + nDstChunkXSize * dfXRatioDstToSrc));
6014 642 : const int nFullResXChunkQueried = static_cast<int>(std::min<int64_t>(
6015 1284 : nSrcWidth,
6016 1284 : nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
6017 642 : nKernelRadius * nOvrFactor));
6018 :
6019 : // Make sure that the RAM requirements to acquire the source data does
6020 : // not exceed nChunkMaxSizeForTempFile
6021 : // If so, reduce the destination chunk size, generate overviews in a
6022 : // temporary dataset, and copy that temporary dataset over the target
6023 : // overview bands (to avoid issues with lossy compression)
6024 : const bool bOverflowFullResXChunkYChunkQueried =
6025 642 : nBands > std::numeric_limits<int64_t>::max() /
6026 642 : nFullResXChunkQueried / nFullResYChunkQueried /
6027 642 : nWrkDataTypeSize;
6028 :
6029 642 : const auto nMemRequirement =
6030 : bOverflowFullResXChunkYChunkQueried
6031 642 : ? 0
6032 638 : : static_cast<GIntBig>(nFullResXChunkQueried) *
6033 638 : nFullResYChunkQueried * nBands * nWrkDataTypeSize;
6034 : // Use a temporary dataset with a smaller destination chunk size
6035 642 : const auto nOverShootFactor =
6036 : nMemRequirement / nChunkMaxSizeForTempFile;
6037 :
6038 642 : constexpr int MIN_OVERSHOOT_FACTOR = 4;
6039 : const auto nSqrtOverShootFactor = std::max<GIntBig>(
6040 1284 : MIN_OVERSHOOT_FACTOR, static_cast<GIntBig>(std::ceil(std::sqrt(
6041 642 : static_cast<double>(nOverShootFactor)))));
6042 642 : constexpr int DEFAULT_CHUNK_SIZE = 256;
6043 642 : constexpr int GTIFF_BLOCK_SIZE_MULTIPLE = 16;
6044 : const int nReducedDstChunkXSize =
6045 : bOverflowFullResXChunkYChunkQueried
6046 1280 : ? DEFAULT_CHUNK_SIZE
6047 1280 : : std::max(1, static_cast<int>(nDstChunkXSize /
6048 1280 : nSqrtOverShootFactor) &
6049 638 : ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
6050 : const int nReducedDstChunkYSize =
6051 : bOverflowFullResXChunkYChunkQueried
6052 1280 : ? DEFAULT_CHUNK_SIZE
6053 1280 : : std::max(1, static_cast<int>(nDstChunkYSize /
6054 1280 : nSqrtOverShootFactor) &
6055 638 : ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
6056 :
6057 642 : if (bOverflowFullResXChunkYChunkQueried ||
6058 : nMemRequirement > nChunkMaxSizeForTempFile)
6059 : {
6060 : const auto nDTSize =
6061 43 : std::max(1, GDALGetDataTypeSizeBytes(eDataType));
6062 : const bool bTmpDSMemRequirementOverflow =
6063 43 : nBands > std::numeric_limits<int64_t>::max() / nDstWidth /
6064 43 : nDstHeight / nDTSize;
6065 43 : const auto nTmpDSMemRequirement =
6066 : bTmpDSMemRequirementOverflow
6067 43 : ? 0
6068 41 : : static_cast<GIntBig>(nDstWidth) * nDstHeight * nBands *
6069 41 : nDTSize;
6070 :
6071 : // make sure that one band buffer doesn't overflow size_t
6072 : const bool bChunkSizeOverflow =
6073 43 : static_cast<size_t>(nDTSize) >
6074 43 : std::numeric_limits<size_t>::max() / nDstWidth / nDstHeight;
6075 43 : const size_t nChunkSize =
6076 : bChunkSizeOverflow
6077 43 : ? 0
6078 41 : : static_cast<size_t>(nDstWidth) * nDstHeight * nDTSize;
6079 : // Builds a VRTDataset of the full overview size with the requested block size; each band exposes the matching source band (or the previous overview level when iSrcOverview != -1) scaled to the overview size using pszResampling
6080 : const auto CreateVRT =
6081 41 : [nBands, nSrcWidth, nSrcHeight, nDstTotalWidth, nDstTotalHeight,
6082 : pszResampling, eWrkDataType, papoSrcBands, papapoOverviewBands,
6083 : iSrcOverview, &abHasNoData,
6084 393585 : &adfNoDataValue](int nVRTBlockXSize, int nVRTBlockYSize)
6085 : {
6086 : auto poVRTDS = std::make_unique<VRTDataset>(
6087 41 : nDstTotalWidth, nDstTotalHeight, nVRTBlockXSize,
6088 41 : nVRTBlockYSize);
6089 :
6090 65620 : for (int iBand = 0; iBand < nBands; ++iBand)
6091 : {
6092 131158 : auto poVRTSrc = std::make_unique<VRTSimpleSource>();
6093 65579 : poVRTSrc->SetResampling(pszResampling);
6094 65579 : poVRTDS->AddBand(eWrkDataType);
6095 : auto poVRTBand = static_cast<VRTSourcedRasterBand *>(
6096 65579 : poVRTDS->GetRasterBand(iBand + 1));
6097 :
6098 65579 : auto poSrcBand = papoSrcBands[iBand]; // default source: the full resolution band
6099 65579 : if (iSrcOverview != -1)
6100 24 : poSrcBand = papapoOverviewBands[iBand][iSrcOverview]; // otherwise read from the selected overview level
6101 65579 : poVRTBand->ConfigureSource(
6102 : poVRTSrc.get(), poSrcBand, false, 0, 0, nSrcWidth,
6103 : nSrcHeight, 0, 0, nDstTotalWidth, nDstTotalHeight);
6104 : // Add the source to the band; release() gives up the unique_ptr's ownership (presumably taken over by the band - confirm)
6105 65579 : poVRTBand->AddSource(poVRTSrc.release());
6106 65579 : if (abHasNoData[iBand])
6107 3 : poVRTBand->SetNoDataValue(adfNoDataValue[iBand]);
6108 : }
6109 : // When the first source band reports a per-dataset mask, expose that mask through a VRT mask band as well
6110 42 : if (papoSrcBands[0]->GetMaskFlags() == GMF_PER_DATASET &&
6111 1 : poVRTDS->CreateMaskBand(GMF_PER_DATASET) == CE_None)
6112 : {
6113 : VRTSourcedRasterBand *poMaskVRTBand =
6114 1 : cpl::down_cast<VRTSourcedRasterBand *>(
6115 1 : poVRTDS->GetRasterBand(1)->GetMaskBand());
6116 1 : auto poSrcBand = papoSrcBands[0];
6117 1 : if (iSrcOverview != -1)
6118 0 : poSrcBand = papapoOverviewBands[0][iSrcOverview];
6119 1 : poMaskVRTBand->AddMaskBandSource(
6120 1 : poSrcBand->GetMaskBand(), 0, 0, nSrcWidth, nSrcHeight,
6121 : 0, 0, nDstTotalWidth, nDstTotalHeight);
6122 : }
6123 :
6124 41 : return poVRTDS; // std::unique_ptr<VRTDataset> handed to the caller
6125 43 : };
6126 :
6127 : // If the overview accommodates chunking, do so and recurse
6128 : // to avoid generating full size temporary files
6129 43 : if (!bOverflowFullResXChunkYChunkQueried &&
6130 39 : !bTmpDSMemRequirementOverflow && !bChunkSizeOverflow &&
6131 39 : (nDstChunkXSize < nDstWidth || nDstChunkYSize < nDstHeight))
6132 : {
6133 : // Create a VRT with the smaller chunk to do the scaling
6134 : auto poVRTDS =
6135 13 : CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
6136 :
6137 13 : std::vector<GDALRasterBand *> apoVRTBand(nBands);
6138 13 : std::vector<GDALRasterBand *> apoDstBand(nBands);
6139 65560 : for (int iBand = 0; iBand < nBands; ++iBand)
6140 : {
6141 65547 : apoDstBand[iBand] = papapoOverviewBands[iBand][iOverview];
6142 65547 : apoVRTBand[iBand] = poVRTDS->GetRasterBand(iBand + 1);
6143 : }
6144 :
6145 : // Use a flag to avoid reading from the overview being built
6146 : GDALRasterIOExtraArg sExtraArg;
6147 13 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
6148 13 : if (iSrcOverview == -1)
6149 13 : sExtraArg.bUseOnlyThisScale = true;
6150 :
6151 : // A single band buffer for data transfer to the overview
6152 13 : std::vector<GByte> abyChunk;
6153 : try
6154 : {
6155 13 : abyChunk.resize(nChunkSize);
6156 : }
6157 0 : catch (const std::exception &)
6158 : {
6159 0 : CPLError(CE_Failure, CPLE_OutOfMemory,
6160 : "Out of memory allocating temporary buffer");
6161 0 : return CE_Failure;
6162 : }
6163 :
6164 : // Loop over output height, in chunks
6165 13 : for (int nDstYOff = nDstYOffStart;
6166 38 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6167 : /* */)
6168 : {
6169 : const int nDstYCount =
6170 25 : std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
6171 : // Loop over output width, in output chunks
6172 25 : for (int nDstXOff = nDstXOffStart;
6173 74 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6174 : /* */)
6175 : {
6176 : const int nDstXCount =
6177 49 : std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
6178 : // Read and transfer the chunk to the overview
6179 98 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
6180 : ++iBand)
6181 : {
6182 98 : eErr = apoVRTBand[iBand]->RasterIO(
6183 : GF_Read, nDstXOff, nDstYOff, nDstXCount,
6184 49 : nDstYCount, abyChunk.data(), nDstXCount,
6185 : nDstYCount, eDataType, 0, 0, &sExtraArg);
6186 49 : if (eErr == CE_None)
6187 : {
6188 96 : eErr = apoDstBand[iBand]->RasterIO(
6189 : GF_Write, nDstXOff, nDstYOff, nDstXCount,
6190 48 : nDstYCount, abyChunk.data(), nDstXCount,
6191 : nDstYCount, eDataType, 0, 0, nullptr);
6192 : }
6193 : }
6194 :
6195 49 : dfCurPixelCount +=
6196 49 : static_cast<double>(nDstXCount) * nDstYCount;
6197 :
6198 49 : nDstXOff += nDstXCount;
6199 : } // width
6200 :
6201 25 : if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount,
6202 : nullptr, pProgressData))
6203 : {
6204 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
6205 : "User terminated");
6206 0 : eErr = CE_Failure;
6207 : }
6208 :
6209 25 : nDstYOff += nDstYCount;
6210 : } // height
6211 :
6212 13 : if (CE_None != eErr)
6213 : {
6214 1 : CPLError(CE_Failure, CPLE_AppDefined,
6215 : "Error while writing overview");
6216 1 : return CE_Failure;
6217 : }
6218 :
6219 12 : pfnProgress(1.0, nullptr, pProgressData);
6220 : // Flush the overviews we just generated
6221 24 : for (int iBand = 0; iBand < nBands; ++iBand)
6222 12 : apoDstBand[iBand]->FlushCache(false);
6223 :
6224 12 : continue; // Next overview
6225 : } // chunking via temporary dataset
6226 :
6227 0 : std::unique_ptr<GDALDataset> poTmpDS;
6228 : // Config option mostly/only for autotest purposes
6229 : const char *pszGDAL_OVR_TEMP_DRIVER =
6230 30 : CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
6231 30 : if ((!bTmpDSMemRequirementOverflow &&
6232 4 : nTmpDSMemRequirement <= nChunkMaxSizeForTempFile &&
6233 4 : !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
6234 26 : EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
6235 : {
6236 10 : auto poTmpDrv = GetGDALDriverManager()->GetDriverByName("MEM");
6237 10 : if (!poTmpDrv)
6238 : {
6239 0 : eErr = CE_Failure;
6240 0 : break;
6241 : }
6242 10 : poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
6243 : nDstTotalHeight, nBands,
6244 10 : eDataType, nullptr));
6245 : }
6246 : else
6247 : {
6248 : // Create a temporary file for the overview
6249 : auto poTmpDrv =
6250 20 : GetGDALDriverManager()->GetDriverByName("GTiff");
6251 20 : if (!poTmpDrv)
6252 : {
6253 0 : eErr = CE_Failure;
6254 0 : break;
6255 : }
6256 40 : std::string osTmpFilename;
6257 20 : auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
6258 20 : if (poDstDS)
6259 : {
6260 20 : osTmpFilename = poDstDS->GetDescription();
6261 : VSIStatBufL sStatBuf;
6262 20 : if (!osTmpFilename.empty() &&
6263 0 : VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
6264 0 : osTmpFilename += "_tmp_ovr.tif";
6265 : }
6266 20 : if (osTmpFilename.empty())
6267 : {
6268 20 : osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
6269 20 : osTmpFilename += ".tif";
6270 : }
6271 20 : CPLDebug("GDAL", "Creating temporary file %s of %d x %d x %d",
6272 : osTmpFilename.c_str(), nDstWidth, nDstHeight, nBands);
6273 40 : CPLStringList aosCO;
6274 20 : if (0 == ((nReducedDstChunkXSize % GTIFF_BLOCK_SIZE_MULTIPLE) |
6275 20 : (nReducedDstChunkYSize % GTIFF_BLOCK_SIZE_MULTIPLE)))
6276 : {
6277 14 : aosCO.SetNameValue("TILED", "YES");
6278 : aosCO.SetNameValue("BLOCKXSIZE",
6279 14 : CPLSPrintf("%d", nReducedDstChunkXSize));
6280 : aosCO.SetNameValue("BLOCKYSIZE",
6281 14 : CPLSPrintf("%d", nReducedDstChunkYSize));
6282 : }
6283 20 : if (const char *pszCOList =
6284 20 : poTmpDrv->GetMetadataItem(GDAL_DMD_CREATIONOPTIONLIST))
6285 : {
6286 : aosCO.SetNameValue(
6287 20 : "COMPRESS", strstr(pszCOList, "ZSTD") ? "ZSTD" : "LZW");
6288 : }
6289 20 : poTmpDS.reset(poTmpDrv->Create(osTmpFilename.c_str(), nDstWidth,
6290 : nDstHeight, nBands, eDataType,
6291 20 : aosCO.List()));
6292 20 : if (poTmpDS)
6293 : {
6294 18 : poTmpDS->MarkSuppressOnClose();
6295 18 : VSIUnlink(osTmpFilename.c_str());
6296 : }
6297 : }
6298 30 : if (!poTmpDS)
6299 : {
6300 2 : eErr = CE_Failure;
6301 2 : break;
6302 : }
6303 :
6304 : // Create a full size VRT to do the resampling without edge effects
6305 : auto poVRTDS =
6306 28 : CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
6307 :
6308 : // Allocate a band buffer with the overview chunk size
6309 : std::unique_ptr<void, VSIFreeReleaser> pDstBuffer(
6310 : VSI_MALLOC3_VERBOSE(size_t(nWrkDataTypeSize), nDstChunkXSize,
6311 28 : nDstChunkYSize));
6312 28 : if (pDstBuffer == nullptr)
6313 : {
6314 0 : eErr = CE_Failure;
6315 0 : break;
6316 : }
6317 :
6318 : // Use a flag to avoid reading the overview being built
6319 : GDALRasterIOExtraArg sExtraArg;
6320 28 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
6321 28 : if (iSrcOverview == -1)
6322 4 : sExtraArg.bUseOnlyThisScale = true;
6323 :
6324 : // Scale and copy data from the VRT to the temp file
6325 28 : for (int nDstYOff = nDstYOffStart;
6326 914 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6327 : /* */)
6328 : {
6329 : const int nDstYCount =
6330 886 : std::min(nReducedDstChunkYSize, nDstYOffEnd - nDstYOff);
6331 886 : for (int nDstXOff = nDstXOffStart;
6332 201218 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6333 : /* */)
6334 : {
6335 : const int nDstXCount =
6336 200332 : std::min(nReducedDstChunkXSize, nDstXOffEnd - nDstXOff);
6337 400668 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
6338 : ++iBand)
6339 : {
6340 200336 : auto poSrcBand = poVRTDS->GetRasterBand(iBand + 1);
6341 200336 : eErr = poSrcBand->RasterIO(
6342 : GF_Read, nDstXOff, nDstYOff, nDstXCount, nDstYCount,
6343 : pDstBuffer.get(), nDstXCount, nDstYCount,
6344 : eWrkDataType, 0, 0, &sExtraArg);
6345 200336 : if (eErr == CE_None)
6346 : {
6347 : // Write to the temporary dataset, shifted
6348 200334 : auto poOvrBand = poTmpDS->GetRasterBand(iBand + 1);
6349 200334 : eErr = poOvrBand->RasterIO(
6350 : GF_Write, nDstXOff - nDstXOffStart,
6351 : nDstYOff - nDstYOffStart, nDstXCount,
6352 : nDstYCount, pDstBuffer.get(), nDstXCount,
6353 : nDstYCount, eWrkDataType, 0, 0, nullptr);
6354 : }
6355 : }
6356 200332 : nDstXOff += nDstXCount;
6357 : }
6358 886 : nDstYOff += nDstYCount;
6359 : }
6360 :
6361 : // Copy from the temporary to the overview
6362 28 : for (int nDstYOff = nDstYOffStart;
6363 54 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6364 : /* */)
6365 : {
6366 : const int nDstYCount =
6367 26 : std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
6368 26 : for (int nDstXOff = nDstXOffStart;
6369 52 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6370 : /* */)
6371 : {
6372 : const int nDstXCount =
6373 26 : std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
6374 56 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
6375 : ++iBand)
6376 : {
6377 30 : auto poSrcBand = poTmpDS->GetRasterBand(iBand + 1);
6378 30 : eErr = poSrcBand->RasterIO(
6379 : GF_Read, nDstXOff - nDstXOffStart,
6380 : nDstYOff - nDstYOffStart, nDstXCount, nDstYCount,
6381 : pDstBuffer.get(), nDstXCount, nDstYCount,
6382 : eWrkDataType, 0, 0, nullptr);
6383 30 : if (eErr == CE_None)
6384 : {
6385 : // Write to the destination overview bands
6386 30 : auto poOvrBand =
6387 30 : papapoOverviewBands[iBand][iOverview];
6388 30 : eErr = poOvrBand->RasterIO(
6389 : GF_Write, nDstXOff, nDstYOff, nDstXCount,
6390 : nDstYCount, pDstBuffer.get(), nDstXCount,
6391 : nDstYCount, eWrkDataType, 0, 0, nullptr);
6392 : }
6393 : }
6394 26 : nDstXOff += nDstXCount;
6395 : }
6396 26 : nDstYOff += nDstYCount;
6397 : }
6398 :
6399 28 : if (eErr != CE_None)
6400 : {
6401 2 : CPLError(CE_Failure, CPLE_AppDefined,
6402 : "Failed to write overview %d", iOverview);
6403 2 : return eErr;
6404 : }
6405 :
6406 : // Flush the data to overviews.
6407 56 : for (int iBand = 0; iBand < nBands; ++iBand)
6408 30 : papapoOverviewBands[iBand][iOverview]->FlushCache(false);
6409 :
6410 26 : continue;
6411 : }
6412 :
6413 : // Structure describing a resampling job: the inputs passed to pfnResampleFn, the outputs it produces, and a completion flag protected by a mutex / condition variable
6414 : struct OvrJob
6415 : {
6416 : // Buffers to free when job is finished (each holder is destroyed together with the job)
6417 : std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
6418 : std::unique_ptr<PointerHolder> oSrcBufferHolder{};
6419 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
6420 :
6421 : GDALRasterBand *poDstBand = nullptr; // overview band the resampled data is written to
6422 :
6423 : // Input parameters of pfnResampleFn
6424 : GDALResampleFunction pfnResampleFn = nullptr;
6425 : GDALOverviewResampleArgs args{};
6426 : const void *pChunk = nullptr; // source pixels handed to pfnResampleFn
6427 :
6428 : // Output values of resampling function
6429 : CPLErr eErr = CE_Failure; // stays CE_Failure until the resampling function's result is stored
6430 : void *pDstBuffer = nullptr;
6431 : GDALDataType eDstBufferDataType = GDT_Unknown;
6432 :
6433 3296 : void NotifyFinished() // marks the job finished and wakes one thread waiting in WaitFinished()
6434 : {
6435 6592 : std::lock_guard guard(mutex);
6436 3296 : bFinished = true;
6437 3296 : cv.notify_one();
6438 3296 : }
6439 :
6440 2 : bool IsFinished() // returns the completion flag, read under the mutex
6441 : {
6442 2 : std::lock_guard guard(mutex);
6443 4 : return bFinished;
6444 : }
6445 :
6446 14 : void WaitFinished() // blocks until NotifyFinished() has set the completion flag
6447 : {
6448 28 : std::unique_lock oGuard(mutex);
6449 21 : while (!bFinished) // re-check the flag after every wake-up
6450 : {
6451 7 : cv.wait(oGuard);
6452 : }
6453 14 : }
6454 :
6455 : private:
6456 : // Synchronization: bFinished is only read or written with mutex held; cv is signalled when it becomes true
6457 : bool bFinished = false;
6458 : std::mutex mutex{};
6459 : std::condition_variable cv{};
6460 : };
6461 :
6462 : // Thread function to resample: pData must point to an OvrJob; runs its resampling function, stores status and output buffer in the job, then signals completion
6463 3296 : const auto JobResampleFunc = [](void *pData)
6464 : {
6465 3296 : OvrJob *poJob = static_cast<OvrJob *>(pData);
6466 :
6467 3296 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
6468 : &(poJob->pDstBuffer),
6469 : &(poJob->eDstBufferDataType));
6470 :
6471 3296 : auto pDstBuffer = poJob->pDstBuffer;
6472 : poJob->oDstBufferHolder = // wrap the output buffer in a holder owned by the job
6473 3296 : std::make_unique<PointerHolder>(pDstBuffer);
6474 :
6475 3296 : poJob->NotifyFinished(); // last step: results above are stored before waiters are woken
6476 3296 : };
6477 :
6478 : // Function to write resample data to target band: writes the job's output buffer to its destination band over the window [nDstXOff, nDstXOff2) x [nDstYOff, nDstYOff2) and returns the RasterIO() status
6479 3296 : const auto WriteJobData = [](const OvrJob *poJob)
6480 : {
6481 6592 : return poJob->poDstBand->RasterIO(
6482 3296 : GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
6483 3296 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
6484 3296 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
6485 3296 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
6486 3296 : poJob->args.nDstYOff2 - poJob->args.nDstYOff,
6487 3296 : poJob->eDstBufferDataType, 0, 0, nullptr);
6488 : };
6489 :
6490 : // Wait for completion of oldest job and serialize it: blocks on the front job of jobList (which must not be empty), writes its result if resampling succeeded, removes it from the list and returns the resulting error code
6491 : const auto WaitAndFinalizeOldestJob =
6492 14 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
6493 : {
6494 14 : auto poOldestJob = jobList.front().get();
6495 14 : poOldestJob->WaitFinished();
6496 14 : CPLErr l_eErr = poOldestJob->eErr;
6497 14 : if (l_eErr == CE_None) // only write when the resampling itself succeeded
6498 : {
6499 14 : l_eErr = WriteJobData(poOldestJob);
6500 : }
6501 :
6502 14 : jobList.pop_front(); // destroys the job, which the list owns through unique_ptr
6503 14 : return l_eErr;
6504 : };
6505 :
6506 : // Queue of jobs
6507 1198 : std::list<std::unique_ptr<OvrJob>> jobList;
6508 :
6509 1198 : std::vector<std::unique_ptr<void, VSIFreeReleaser>> apaChunk(nBands);
6510 : std::vector<std::unique_ptr<GByte, VSIFreeReleaser>>
6511 1198 : apabyChunkNoDataMask(nBands);
6512 :
6513 : // Iterate on destination overview, block by block.
6514 599 : for (int nDstYOff = nDstYOffStart;
6515 2105 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6516 1506 : nDstYOff += nDstChunkYSize)
6517 : {
6518 : int nDstYCount;
6519 1506 : if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
6520 1085 : nDstYCount = nDstChunkYSize;
6521 : else
6522 421 : nDstYCount = nDstYOffEnd - nDstYOff;
6523 :
6524 1506 : int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
6525 1506 : int nChunkYOff2 = static_cast<int>(
6526 1506 : ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
6527 1506 : if (nChunkYOff2 > nSrcHeight ||
6528 1506 : nDstYOff + nDstYCount == nDstTotalHeight)
6529 592 : nChunkYOff2 = nSrcHeight;
6530 1506 : int nYCount = nChunkYOff2 - nChunkYOff;
6531 1506 : CPLAssert(nYCount <= nFullResYChunk);
6532 :
6533 1506 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
6534 1506 : int nChunkYSizeQueried =
6535 1506 : nYCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
6536 1506 : if (nChunkYOffQueried < 0)
6537 : {
6538 146 : nChunkYSizeQueried += nChunkYOffQueried;
6539 146 : nChunkYOffQueried = 0;
6540 : }
6541 1506 : if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
6542 146 : nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
6543 1506 : CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
6544 :
6545 1506 : if (!pfnProgress(std::min(1.0, dfCurPixelCount / dfTotalPixelCount),
6546 : nullptr, pProgressData))
6547 : {
6548 1 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6549 1 : eErr = CE_Failure;
6550 : }
6551 :
6552 : // Iterate on destination overview, block by block.
6553 1506 : for (int nDstXOff = nDstXOffStart;
6554 3053 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6555 1547 : nDstXOff += nDstChunkXSize)
6556 : {
6557 1547 : int nDstXCount = 0;
6558 1547 : if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
6559 1528 : nDstXCount = nDstChunkXSize;
6560 : else
6561 19 : nDstXCount = nDstXOffEnd - nDstXOff;
6562 :
6563 1547 : dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
6564 :
6565 1547 : int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
6566 1547 : int nChunkXOff2 = static_cast<int>(
6567 1547 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
6568 1547 : if (nChunkXOff2 > nSrcWidth ||
6569 1547 : nDstXOff + nDstXCount == nDstTotalWidth)
6570 1470 : nChunkXOff2 = nSrcWidth;
6571 1547 : const int nXCount = nChunkXOff2 - nChunkXOff;
6572 1547 : CPLAssert(nXCount <= nFullResXChunk);
6573 :
6574 1547 : int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
6575 1547 : int nChunkXSizeQueried =
6576 1547 : nXCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
6577 1547 : if (nChunkXOffQueried < 0)
6578 : {
6579 209 : nChunkXSizeQueried += nChunkXOffQueried;
6580 209 : nChunkXOffQueried = 0;
6581 : }
6582 1547 : if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
6583 218 : nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
6584 1547 : CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
6585 : #if DEBUG_VERBOSE
6586 : CPLDebug("GDAL",
6587 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
6588 : nChunkXOffQueried, nChunkYOffQueried,
6589 : nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
6590 : nDstYOff, nDstXCount, nDstYCount);
6591 : #endif
6592 :
6593 : // Avoid accumulating too many tasks and exhaust RAM
6594 :
6595 : // Try to complete already finished jobs
6596 1549 : while (eErr == CE_None && !jobList.empty())
6597 : {
6598 2 : auto poOldestJob = jobList.front().get();
6599 2 : if (!poOldestJob->IsFinished())
6600 0 : break;
6601 2 : eErr = poOldestJob->eErr;
6602 2 : if (eErr == CE_None)
6603 : {
6604 2 : eErr = WriteJobData(poOldestJob);
6605 : }
6606 :
6607 2 : jobList.pop_front();
6608 : }
6609 :
6610 : // And in case we have saturated the number of threads,
6611 : // wait for completion of tasks to go below the threshold.
6612 3094 : while (eErr == CE_None &&
6613 1547 : jobList.size() >= static_cast<size_t>(nThreads))
6614 : {
6615 0 : eErr = WaitAndFinalizeOldestJob(jobList);
6616 : }
6617 :
6618 : // Read the source buffers for all the bands.
6619 4844 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6620 : {
6621 : // (Re)allocate buffers if needed
6622 3297 : if (apaChunk[iBand] == nullptr)
6623 : {
6624 1171 : apaChunk[iBand].reset(VSI_MALLOC3_VERBOSE(
6625 : nFullResXChunkQueried, nFullResYChunkQueried,
6626 : nWrkDataTypeSize));
6627 1171 : if (apaChunk[iBand] == nullptr)
6628 : {
6629 0 : eErr = CE_Failure;
6630 : }
6631 : }
6632 3632 : if (bUseNoDataMask &&
6633 335 : apabyChunkNoDataMask[iBand] == nullptr)
6634 : {
6635 268 : apabyChunkNoDataMask[iBand].reset(
6636 268 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
6637 : nFullResXChunkQueried, nFullResYChunkQueried)));
6638 268 : if (apabyChunkNoDataMask[iBand] == nullptr)
6639 : {
6640 0 : eErr = CE_Failure;
6641 : }
6642 : }
6643 :
6644 3297 : if (eErr == CE_None)
6645 : {
6646 3297 : GDALRasterBand *poSrcBand = nullptr;
6647 3297 : if (iSrcOverview == -1)
6648 2405 : poSrcBand = papoSrcBands[iBand];
6649 : else
6650 892 : poSrcBand =
6651 892 : papapoOverviewBands[iBand][iSrcOverview];
6652 3297 : eErr = poSrcBand->RasterIO(
6653 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6654 : nChunkXSizeQueried, nChunkYSizeQueried,
6655 3297 : apaChunk[iBand].get(), nChunkXSizeQueried,
6656 : nChunkYSizeQueried, eWrkDataType, 0, 0, nullptr);
6657 :
6658 3297 : if (bUseNoDataMask && eErr == CE_None)
6659 : {
6660 335 : auto poMaskBand = poSrcBand->IsMaskBand()
6661 335 : ? poSrcBand
6662 253 : : poSrcBand->GetMaskBand();
6663 335 : eErr = poMaskBand->RasterIO(
6664 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6665 : nChunkXSizeQueried, nChunkYSizeQueried,
6666 335 : apabyChunkNoDataMask[iBand].get(),
6667 : nChunkXSizeQueried, nChunkYSizeQueried,
6668 : GDT_UInt8, 0, 0, nullptr);
6669 : }
6670 : }
6671 : }
6672 :
6673 : // Compute the resulting overview block.
6674 4843 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6675 : {
6676 6592 : auto poJob = std::make_unique<OvrJob>();
6677 3296 : poJob->pfnResampleFn = pfnResampleFn;
6678 3296 : poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
6679 6592 : poJob->args.eOvrDataType =
6680 3296 : poJob->poDstBand->GetRasterDataType();
6681 3296 : poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
6682 3296 : poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
6683 3296 : const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
6684 3296 : GDALMD_NBITS, GDAL_MDD_IMAGE_STRUCTURE);
6685 3296 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
6686 3296 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
6687 3296 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
6688 3296 : poJob->args.eWrkDataType = eWrkDataType;
6689 3296 : poJob->pChunk = apaChunk[iBand].get();
6690 3296 : poJob->args.pabyChunkNodataMask =
6691 3296 : apabyChunkNoDataMask[iBand].get();
6692 3296 : poJob->args.nChunkXOff = nChunkXOffQueried;
6693 3296 : poJob->args.nChunkXSize = nChunkXSizeQueried;
6694 3296 : poJob->args.nChunkYOff = nChunkYOffQueried;
6695 3296 : poJob->args.nChunkYSize = nChunkYSizeQueried;
6696 3296 : poJob->args.nDstXOff = nDstXOff;
6697 3296 : poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
6698 3296 : poJob->args.nDstYOff = nDstYOff;
6699 3296 : poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
6700 3296 : poJob->args.pszResampling = pszResampling;
6701 3296 : poJob->args.bHasNoData = abHasNoData[iBand];
6702 3296 : poJob->args.dfNoDataValue = adfNoDataValue[iBand];
6703 3296 : poJob->args.eSrcDataType = eDataType;
6704 3296 : poJob->args.bPropagateNoData = bPropagateNoData;
6705 :
6706 3296 : if (poJobQueue)
6707 : {
6708 16 : poJob->oSrcMaskBufferHolder =
6709 32 : std::make_unique<PointerHolder>(
6710 32 : std::move(apabyChunkNoDataMask[iBand]));
6711 :
6712 16 : poJob->oSrcBufferHolder =
6713 32 : std::make_unique<PointerHolder>(
6714 32 : std::move(apaChunk[iBand]));
6715 :
6716 16 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
6717 16 : jobList.emplace_back(std::move(poJob));
6718 : }
6719 : else
6720 : {
6721 3280 : JobResampleFunc(poJob.get());
6722 3280 : eErr = poJob->eErr;
6723 3280 : if (eErr == CE_None)
6724 : {
6725 3280 : eErr = WriteJobData(poJob.get());
6726 : }
6727 : }
6728 : }
6729 : }
6730 : }
6731 :
6732 : // Wait for all pending jobs to complete
6733 613 : while (!jobList.empty())
6734 : {
6735 14 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
6736 14 : if (l_eErr != CE_None && eErr == CE_None)
6737 0 : eErr = l_eErr;
6738 : }
6739 :
6740 : // Flush the data to overviews.
6741 1768 : for (int iBand = 0; iBand < nBands; ++iBand)
6742 : {
6743 1169 : if (papapoOverviewBands[iBand][iOverview]->FlushCache(false) !=
6744 : CE_None)
6745 0 : eErr = CE_Failure;
6746 : }
6747 : }
6748 :
6749 384 : if (eErr == CE_None)
6750 380 : pfnProgress(1.0, nullptr, pProgressData);
6751 :
6752 384 : return eErr;
6753 : }
6754 :
6755 : /************************************************************************/
6756 : /* GDALRegenerateOverviewsMultiBand() */
6757 : /************************************************************************/
6758 :
6759 : /**
6760 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
6761 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
6762 : *
6763 : * This function will generate one or more overview images from a base
6764 : * image using the requested downsampling algorithm. Its primary use
6765 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
6766 : * can also be used to generate downsampled images in one file from another
6767 : * outside the overview architecture.
6768 : *
6769 : * The output bands need to exist in advance and share the same characteristics
6770 : * (type, dimensions)
6771 : *
6772 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
6773 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
6774 : *
6775 : * It does not support color tables or complex data types.
6776 : *
6777 : * The pseudo-algorithm used by the function is :
6778 : * for each overview
6779 : * iterate on lines of the source by a step of deltay
6780 : * iterate on columns of the source by a step of deltax
6781 : * read the source data of size deltax * deltay for all the bands
6782 : * generate the corresponding overview block for all the bands
6783 : *
6784 : * This function will honour properly NODATA_VALUES tuples (special dataset
6785 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
6786 : * considered as the nodata value and not each value of the triplet
6787 : * independently per band.
6788 : *
6789 : * The GDAL_NUM_THREADS configuration option can be set
6790 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
6791 : * overview computation.
6792 : *
6793 : * @param apoSrcBands the list of source bands to downsample
6794 : * @param aapoOverviewBands two-dimensional array of bands. First dimension is
6795 : * indexed by bands. Second dimension is indexed by
6796 : * overview levels. All aapoOverviewBands[i] arrays
6797 : * must have the same size (i.e. same number of
6798 : * overviews)
6799 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
6800 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
6801 : * @param pfnProgress progress report function.
6802 : * @param pProgressData progress function callback data.
6803 : * @param papszOptions NULL terminated list of options as
6804 : * key=value pairs, or NULL
6805 : * The XOFF, YOFF, XSIZE and YSIZE
6806 : * options can be specified to express that overviews should
6807 : * be regenerated only in the specified subset of the source
6808 : * dataset.
6809 : * @return CE_None on success or CE_Failure on failure.
6810 : * @since 3.10
6811 : */
6812 :
6813 19 : CPLErr GDALRegenerateOverviewsMultiBand(
6814 : const std::vector<GDALRasterBand *> &apoSrcBands,
6815 : const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
6816 : const char *pszResampling, GDALProgressFunc pfnProgress,
6817 : void *pProgressData, CSLConstList papszOptions)
6818 : {
6819 19 : CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
6820 29 : for (size_t i = 1; i < aapoOverviewBands.size(); ++i)
6821 : {
6822 10 : CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());
6823 : }
6824 :
6825 19 : if (aapoOverviewBands.empty())
6826 0 : return CE_None;
6827 :
6828 19 : std::vector<GDALRasterBand **> apapoOverviewBands;
6829 48 : for (auto &apoOverviewBands : aapoOverviewBands)
6830 : {
6831 : auto papoOverviewBands = static_cast<GDALRasterBand **>(
6832 29 : CPLMalloc(apoOverviewBands.size() * sizeof(GDALRasterBand *)));
6833 61 : for (size_t i = 0; i < apoOverviewBands.size(); ++i)
6834 : {
6835 32 : papoOverviewBands[i] = apoOverviewBands[i];
6836 : }
6837 29 : apapoOverviewBands.push_back(papoOverviewBands);
6838 : }
6839 38 : const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
6840 19 : static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
6841 19 : static_cast<int>(aapoOverviewBands[0].size()),
6842 19 : apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
6843 : papszOptions);
6844 48 : for (GDALRasterBand **papoOverviewBands : apapoOverviewBands)
6845 29 : CPLFree(papoOverviewBands);
6846 19 : return eErr;
6847 : }
6848 :
6849 : /************************************************************************/
6850 : /* GDALComputeBandStats() */
6851 : /************************************************************************/
6852 :
6853 : /** Undocumented
6854 : * @param hSrcBand undocumented.
6855 : * @param nSampleStep Step between scanlines used to compute statistics.
6856 : * When nSampleStep is equal to 1, all scanlines will
6857 : * be processed.
6858 : * @param pdfMean undocumented.
6859 : * @param pdfStdDev undocumented.
6860 : * @param pfnProgress undocumented.
6861 : * @param pProgressData undocumented.
6862 : * @return undocumented
6863 : */
6864 18 : CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
6865 : int nSampleStep, double *pdfMean,
6866 : double *pdfStdDev,
6867 : GDALProgressFunc pfnProgress,
6868 : void *pProgressData)
6869 :
6870 : {
6871 18 : VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
6872 :
6873 18 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
6874 :
6875 18 : if (pfnProgress == nullptr)
6876 18 : pfnProgress = GDALDummyProgress;
6877 :
6878 18 : const int nWidth = poSrcBand->GetXSize();
6879 18 : const int nHeight = poSrcBand->GetYSize();
6880 :
6881 18 : if (nSampleStep >= nHeight || nSampleStep < 1)
6882 5 : nSampleStep = 1;
6883 :
6884 18 : GDALDataType eWrkType = GDT_Unknown;
6885 18 : float *pafData = nullptr;
6886 18 : GDALDataType eType = poSrcBand->GetRasterDataType();
6887 18 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6888 18 : if (bComplex)
6889 : {
6890 : pafData = static_cast<float *>(
6891 0 : VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6892 0 : eWrkType = GDT_CFloat32;
6893 : }
6894 : else
6895 : {
6896 : pafData =
6897 18 : static_cast<float *>(VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6898 18 : eWrkType = GDT_Float32;
6899 : }
6900 :
6901 18 : if (nWidth == 0 || pafData == nullptr)
6902 : {
6903 0 : VSIFree(pafData);
6904 0 : return CE_Failure;
6905 : }
6906 :
6907 : /* -------------------------------------------------------------------- */
6908 : /* Loop over all sample lines. */
6909 : /* -------------------------------------------------------------------- */
6910 18 : double dfSum = 0.0;
6911 18 : double dfSum2 = 0.0;
6912 18 : int iLine = 0;
6913 18 : GIntBig nSamples = 0;
6914 :
6915 2143 : do
6916 : {
6917 2161 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6918 : pProgressData))
6919 : {
6920 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6921 0 : CPLFree(pafData);
6922 0 : return CE_Failure;
6923 : }
6924 :
6925 : const CPLErr eErr =
6926 2161 : poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
6927 : 1, eWrkType, 0, 0, nullptr);
6928 2161 : if (eErr != CE_None)
6929 : {
6930 1 : CPLFree(pafData);
6931 1 : return eErr;
6932 : }
6933 :
6934 725208 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6935 : {
6936 723048 : float fValue = 0.0f;
6937 :
6938 723048 : if (bComplex)
6939 : {
6940 : // Compute the magnitude of the complex value.
6941 : fValue =
6942 0 : std::hypot(pafData[static_cast<size_t>(iPixel) * 2],
6943 0 : pafData[static_cast<size_t>(iPixel) * 2 + 1]);
6944 : }
6945 : else
6946 : {
6947 723048 : fValue = pafData[iPixel];
6948 : }
6949 :
6950 723048 : dfSum += static_cast<double>(fValue);
6951 723048 : dfSum2 += static_cast<double>(fValue) * static_cast<double>(fValue);
6952 : }
6953 :
6954 2160 : nSamples += nWidth;
6955 2160 : iLine += nSampleStep;
6956 2160 : } while (iLine < nHeight);
6957 :
6958 17 : if (!pfnProgress(1.0, nullptr, pProgressData))
6959 : {
6960 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6961 0 : CPLFree(pafData);
6962 0 : return CE_Failure;
6963 : }
6964 :
6965 : /* -------------------------------------------------------------------- */
6966 : /* Produce the result values. */
6967 : /* -------------------------------------------------------------------- */
6968 17 : if (pdfMean != nullptr)
6969 17 : *pdfMean = dfSum / nSamples;
6970 :
6971 17 : if (pdfStdDev != nullptr)
6972 : {
6973 17 : const double dfMean = dfSum / nSamples;
6974 :
6975 17 : *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
6976 : }
6977 :
6978 17 : CPLFree(pafData);
6979 :
6980 17 : return CE_None;
6981 : }
6982 :
6983 : /************************************************************************/
6984 : /* GDALOverviewMagnitudeCorrection() */
6985 : /* */
6986 : /* Correct the mean and standard deviation of the overviews of */
6987 : /* the given band to match the base layer approximately. */
6988 : /************************************************************************/
6989 :
6990 : /** Undocumented
6991 : * @param hBaseBand undocumented.
6992 : * @param nOverviewCount undocumented.
6993 : * @param pahOverviews undocumented.
6994 : * @param pfnProgress undocumented.
6995 : * @param pProgressData undocumented.
6996 : * @return undocumented
6997 : */
6998 0 : CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
6999 : int nOverviewCount,
7000 : GDALRasterBandH *pahOverviews,
7001 : GDALProgressFunc pfnProgress,
7002 : void *pProgressData)
7003 :
7004 : {
7005 0 : VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
7006 :
7007 : /* -------------------------------------------------------------------- */
7008 : /* Compute mean/stddev for source raster. */
7009 : /* -------------------------------------------------------------------- */
7010 0 : double dfOrigMean = 0.0;
7011 0 : double dfOrigStdDev = 0.0;
7012 : {
7013 : const CPLErr eErr =
7014 0 : GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
7015 : pfnProgress, pProgressData);
7016 :
7017 0 : if (eErr != CE_None)
7018 0 : return eErr;
7019 : }
7020 :
7021 : /* -------------------------------------------------------------------- */
7022 : /* Loop on overview bands. */
7023 : /* -------------------------------------------------------------------- */
7024 0 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
7025 : {
7026 : GDALRasterBand *poOverview =
7027 0 : GDALRasterBand::FromHandle(pahOverviews[iOverview]);
7028 : double dfOverviewMean, dfOverviewStdDev;
7029 :
7030 : const CPLErr eErr =
7031 0 : GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
7032 : &dfOverviewStdDev, pfnProgress, pProgressData);
7033 :
7034 0 : if (eErr != CE_None)
7035 0 : return eErr;
7036 :
7037 0 : double dfGain = 1.0;
7038 0 : if (dfOrigStdDev >= 0.0001)
7039 0 : dfGain = dfOrigStdDev / dfOverviewStdDev;
7040 :
7041 : /* --------------------------------------------------------------------
7042 : */
7043 : /* Apply gain and offset. */
7044 : /* --------------------------------------------------------------------
7045 : */
7046 0 : const int nWidth = poOverview->GetXSize();
7047 0 : const int nHeight = poOverview->GetYSize();
7048 :
7049 0 : GDALDataType eWrkType = GDT_Unknown;
7050 0 : float *pafData = nullptr;
7051 0 : const GDALDataType eType = poOverview->GetRasterDataType();
7052 0 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
7053 0 : if (bComplex)
7054 : {
7055 : pafData = static_cast<float *>(
7056 0 : VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
7057 0 : eWrkType = GDT_CFloat32;
7058 : }
7059 : else
7060 : {
7061 : pafData = static_cast<float *>(
7062 0 : VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
7063 0 : eWrkType = GDT_Float32;
7064 : }
7065 :
7066 0 : if (pafData == nullptr)
7067 : {
7068 0 : return CE_Failure;
7069 : }
7070 :
7071 0 : for (int iLine = 0; iLine < nHeight; ++iLine)
7072 : {
7073 0 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
7074 : pProgressData))
7075 : {
7076 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
7077 0 : CPLFree(pafData);
7078 0 : return CE_Failure;
7079 : }
7080 :
7081 0 : if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
7082 : nWidth, 1, eWrkType, 0, 0,
7083 0 : nullptr) != CE_None)
7084 : {
7085 0 : CPLFree(pafData);
7086 0 : return CE_Failure;
7087 : }
7088 :
7089 0 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
7090 : {
7091 0 : if (bComplex)
7092 : {
7093 0 : pafData[static_cast<size_t>(iPixel) * 2] *=
7094 0 : static_cast<float>(dfGain);
7095 0 : pafData[static_cast<size_t>(iPixel) * 2 + 1] *=
7096 0 : static_cast<float>(dfGain);
7097 : }
7098 : else
7099 : {
7100 0 : pafData[iPixel] = static_cast<float>(
7101 0 : (double(pafData[iPixel]) - dfOverviewMean) * dfGain +
7102 : dfOrigMean);
7103 : }
7104 : }
7105 :
7106 0 : if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
7107 : nWidth, 1, eWrkType, 0, 0,
7108 0 : nullptr) != CE_None)
7109 : {
7110 0 : CPLFree(pafData);
7111 0 : return CE_Failure;
7112 : }
7113 : }
7114 :
7115 0 : if (!pfnProgress(1.0, nullptr, pProgressData))
7116 : {
7117 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
7118 0 : CPLFree(pafData);
7119 0 : return CE_Failure;
7120 : }
7121 :
7122 0 : CPLFree(pafData);
7123 : }
7124 :
7125 0 : return CE_None;
7126 : }
|