Line data Source code
1 :
2 : /******************************************************************************
3 : *
4 : * Project: GDAL Core
5 : * Purpose: Helper code to implement overview support in different drivers.
6 : * Author: Frank Warmerdam, warmerdam@pobox.com
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 2000, Frank Warmerdam
10 : * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * SPDX-License-Identifier: MIT
13 : ****************************************************************************/
14 :
15 : #include "cpl_port.h"
16 : #include "gdal_priv.h"
17 :
18 : #include <cmath>
19 : #include <cstddef>
20 : #include <cstdlib>
21 :
22 : #include <algorithm>
23 : #include <complex>
24 : #include <condition_variable>
25 : #include <limits>
26 : #include <list>
27 : #include <memory>
28 : #include <mutex>
29 : #include <vector>
30 :
31 : #include "cpl_conv.h"
32 : #include "cpl_error.h"
33 : #include "cpl_float.h"
34 : #include "cpl_progress.h"
35 : #include "cpl_vsi.h"
36 : #include "cpl_worker_thread_pool.h"
37 : #include "gdal.h"
38 : #include "gdal_thread_pool.h"
39 : #include "gdalwarper.h"
40 : #include "gdal_vrt.h"
41 : #include "vrtdataset.h"
42 :
43 : #ifdef USE_NEON_OPTIMIZATIONS
44 : #include "include_sse2neon.h"
45 :
46 : #if (!defined(__aarch64__) && !defined(_M_ARM64))
47 : #define ARM_V7
48 : #endif
49 :
50 : #define USE_SSE2
51 :
52 : #include "gdalsse_priv.h"
53 :
54 : // Restrict to 64bit processors because they are guaranteed to have SSE2,
55 : // or if __AVX2__ is defined.
56 : #elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
57 : #define USE_SSE2
58 :
59 : #include "gdalsse_priv.h"
60 :
61 : #ifdef __SSE3__
62 : #include <pmmintrin.h>
63 : #endif
64 : #ifdef __SSSE3__
65 : #include <tmmintrin.h>
66 : #endif
67 : #ifdef __SSE4_1__
68 : #include <smmintrin.h>
69 : #endif
70 : #ifdef __AVX2__
71 : #include <immintrin.h>
72 : #endif
73 :
74 : #endif
75 :
76 : // To be included after above USE_SSE2 and include gdalsse_priv.h
77 : // to avoid build issue on Windows x86
78 : #include "gdal_priv_templates.hpp"
79 :
80 : /************************************************************************/
81 : /* GDALResampleChunk_Near() */
82 : /************************************************************************/
83 :
84 : template <class T>
85 1249 : static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
86 : const T *pChunk, T **ppDstBuffer)
87 :
88 : {
89 1249 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
90 1249 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
91 1249 : const GDALDataType eWrkDataType = args.eWrkDataType;
92 1249 : const int nChunkXOff = args.nChunkXOff;
93 1249 : const int nChunkXSize = args.nChunkXSize;
94 1249 : const int nChunkYOff = args.nChunkYOff;
95 1249 : const int nDstXOff = args.nDstXOff;
96 1249 : const int nDstXOff2 = args.nDstXOff2;
97 1249 : const int nDstYOff = args.nDstYOff;
98 1249 : const int nDstYOff2 = args.nDstYOff2;
99 1249 : const int nDstXWidth = nDstXOff2 - nDstXOff;
100 :
101 : /* -------------------------------------------------------------------- */
102 : /* Allocate buffers. */
103 : /* -------------------------------------------------------------------- */
104 1249 : *ppDstBuffer = static_cast<T *>(
105 1249 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
106 : GDALGetDataTypeSizeBytes(eWrkDataType)));
107 1249 : if (*ppDstBuffer == nullptr)
108 : {
109 0 : return CE_Failure;
110 : }
111 1249 : T *const pDstBuffer = *ppDstBuffer;
112 :
113 : int *panSrcXOff =
114 1249 : static_cast<int *>(VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(int)));
115 :
116 1249 : if (panSrcXOff == nullptr)
117 : {
118 0 : return CE_Failure;
119 : }
120 :
121 : /* ==================================================================== */
122 : /* Precompute inner loop constants. */
123 : /* ==================================================================== */
124 840811 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
125 : {
126 839562 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
127 839562 : if (nSrcXOff < nChunkXOff)
128 0 : nSrcXOff = nChunkXOff;
129 :
130 839562 : panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
131 : }
132 :
133 : /* ==================================================================== */
134 : /* Loop over destination scanlines. */
135 : /* ==================================================================== */
136 142386 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
137 : {
138 141137 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
139 141137 : if (nSrcYOff < nChunkYOff)
140 0 : nSrcYOff = nChunkYOff;
141 :
142 141137 : const T *const pSrcScanline =
143 : pChunk +
144 141137 : (static_cast<size_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
145 137744 : nChunkXOff;
146 :
147 : /* --------------------------------------------------------------------
148 : */
149 : /* Loop over destination pixels */
150 : /* --------------------------------------------------------------------
151 : */
152 141137 : T *pDstScanline =
153 141137 : pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
154 120249393 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
155 : {
156 120108000 : pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
157 : }
158 : }
159 :
160 1249 : CPLFree(panSrcXOff);
161 :
162 1249 : return CE_None;
163 : }
164 :
165 1249 : static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
166 : const void *pChunk, void **ppDstBuffer,
167 : GDALDataType *peDstBufferDataType)
168 : {
169 1249 : *peDstBufferDataType = args.eWrkDataType;
170 1249 : switch (args.eWrkDataType)
171 : {
172 : // For nearest resampling, as no computation is done, only the
173 : // size of the data type matters.
174 1081 : case GDT_UInt8:
175 : case GDT_Int8:
176 : {
177 1081 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 1);
178 1081 : return GDALResampleChunk_NearT(
179 : args, static_cast<const uint8_t *>(pChunk),
180 1081 : reinterpret_cast<uint8_t **>(ppDstBuffer));
181 : }
182 :
183 52 : case GDT_Int16:
184 : case GDT_UInt16:
185 : case GDT_Float16:
186 : {
187 52 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
188 52 : return GDALResampleChunk_NearT(
189 : args, static_cast<const uint16_t *>(pChunk),
190 52 : reinterpret_cast<uint16_t **>(ppDstBuffer));
191 : }
192 :
193 68 : case GDT_CInt16:
194 : case GDT_CFloat16:
195 : case GDT_Int32:
196 : case GDT_UInt32:
197 : case GDT_Float32:
198 : {
199 68 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
200 68 : return GDALResampleChunk_NearT(
201 : args, static_cast<const uint32_t *>(pChunk),
202 68 : reinterpret_cast<uint32_t **>(ppDstBuffer));
203 : }
204 :
205 44 : case GDT_CInt32:
206 : case GDT_CFloat32:
207 : case GDT_Int64:
208 : case GDT_UInt64:
209 : case GDT_Float64:
210 : {
211 44 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
212 44 : return GDALResampleChunk_NearT(
213 : args, static_cast<const uint64_t *>(pChunk),
214 44 : reinterpret_cast<uint64_t **>(ppDstBuffer));
215 : }
216 :
217 4 : case GDT_CFloat64:
218 : {
219 4 : return GDALResampleChunk_NearT(
220 : args, static_cast<const std::complex<double> *>(pChunk),
221 4 : reinterpret_cast<std::complex<double> **>(ppDstBuffer));
222 : }
223 :
224 0 : case GDT_Unknown:
225 : case GDT_TypeCount:
226 0 : break;
227 : }
228 0 : CPLAssert(false);
229 : return CE_Failure;
230 : }
231 :
232 : namespace
233 : {
234 :
235 : // Find in the color table the entry whose RGB value is the closest
236 : // (using quadratic distance) to the test color, ignoring transparent entries.
237 3837 : int BestColorEntry(const std::vector<GDALColorEntry> &entries,
238 : const GDALColorEntry &test)
239 : {
240 3837 : int nMinDist = std::numeric_limits<int>::max();
241 3837 : size_t bestEntry = 0;
242 986109 : for (size_t i = 0; i < entries.size(); ++i)
243 : {
244 982272 : const GDALColorEntry &entry = entries[i];
245 : // Ignore transparent entries
246 982272 : if (entry.c4 == 0)
247 3237 : continue;
248 :
249 979035 : int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
250 979035 : ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
251 979035 : ((test.c3 - entry.c3) * (test.c3 - entry.c3));
252 979035 : if (nDist < nMinDist)
253 : {
254 15847 : nMinDist = nDist;
255 15847 : bestEntry = i;
256 : }
257 : }
258 3837 : return static_cast<int>(bestEntry);
259 : }
260 :
261 7 : std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
262 : int &transparentIdx)
263 : {
264 7 : std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
265 :
266 7 : transparentIdx = -1;
267 7 : int i = 0;
268 1799 : for (auto &entry : entries)
269 : {
270 1792 : table.GetColorEntryAsRGB(i, &entry);
271 1792 : if (transparentIdx < 0 && entry.c4 == 0)
272 1 : transparentIdx = i;
273 1792 : ++i;
274 : }
275 7 : return entries;
276 : }
277 :
278 : } // unnamed namespace
279 :
280 : /************************************************************************/
281 : /* SQUARE() */
282 : /************************************************************************/
283 :
// Returns val * val. The first operand is converted to Tsquare before the
// multiplication, so a wider Tsquare lets the caller avoid overflow of T.
// constexpr so that it can also be used in constant expressions.
template <class T, class Tsquare = T> constexpr Tsquare SQUARE(T val)
{
    return static_cast<Tsquare>(val) * val;
}
288 :
289 : /************************************************************************/
290 : /* ComputeIntegerRMS() */
291 : /************************************************************************/
292 : // Compute rms = sqrt(sumSquares / weight) in such a way that it is the
293 : // integer that minimizes abs(rms**2 - sumSquares / weight)
// Returns the integer r of type T that minimizes |r^2 - sumSquares / weight|.
//
// T     : integer result type.
// Twork : integer type used to evaluate 2 * r * (r + 1) + 1. It must be wide
//         enough to hold that product.
// The candidate is floor(sqrt(sumSquares / weight)); it is bumped by one when
// the next integer is strictly closer. On an exact tie the lower one is kept.
template <class T, class Twork>
inline T ComputeIntegerRMS(double sumSquares, double weight)
{
    const double sumDivWeight = sumSquares / weight;
    // std::sqrt: <cmath> only guarantees the std:: name, and this matches
    // the ComputeIntegerRMS_4values specializations.
    T rms = static_cast<T>(std::sqrt(sumDivWeight));

    // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
    // Naive version:
    // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
    // Dividing by weight and simplifying gives the test below.
    if (static_cast<double>(static_cast<Twork>(2) * rms * (rms + 1) + 1) <
        2 * sumDivWeight)
        rms += 1;
    return rms;
}
308 :
309 : template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
310 : {
311 : CPLAssert(false);
312 : return 0;
313 : }
314 :
315 28 : template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
316 : {
317 : // It has been verified that given the correction on rms below, using
318 : // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
319 : // is equivalent, so use the former as it is used twice.
320 28 : const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
321 28 : const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
322 28 : GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
323 :
324 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
325 : // Naive version:
326 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
327 : // Optimized version for integer case and weight == 4
328 28 : if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
329 5 : rms += 1;
330 28 : return rms;
331 : }
332 :
333 : template <>
334 24 : inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
335 : {
336 24 : const double sumDivWeight = sumSquares * 0.25;
337 24 : GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
338 :
339 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
340 : // Naive version:
341 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
342 : // Optimized version for integer case and weight == 4
343 24 : if (static_cast<GUInt32>(rms) * (rms + 1) <
344 24 : static_cast<GUInt32>(sumDivWeight + 0.25))
345 4 : rms += 1;
346 24 : return rms;
347 : }
348 :
349 : #ifdef USE_SSE2
350 :
351 : /************************************************************************/
352 : /* QuadraticMeanByteSSE2OrAVX2() */
353 : /************************************************************************/
354 :
355 : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
356 : #define sse2_hadd_epi16 _mm_hadd_epi16
357 : #else
// SSE2-only stand-in for _mm_hadd_epi16(): sums adjacent pairs of 16-bit
// elements of a (low half of the result) and of b (high half).
// The 16-bit inputs are treated as unsigned and the final pack saturates to
// the signed 16-bit range, so results are only meaningful while every pair
// sum is at most 32767.
inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
{
    const __m128i lowHalfMask = _mm_set1_epi32(0xFFFF);

    // Each 32-bit lane receives (odd element) + (even element) of its pair.
    const __m128i pairSumsA =
        _mm_add_epi32(_mm_and_si128(a, lowHalfMask), _mm_srli_epi32(a, 16));
    const __m128i pairSumsB =
        _mm_add_epi32(_mm_and_si128(b, lowHalfMask), _mm_srli_epi32(b, 16));

    // Narrow the eight 32-bit sums back to 16 bits: a's first, then b's.
    return _mm_packs_epi32(pairSumsA, pairSumsB);
}
370 : #endif
371 :
// Width-agnostic intrinsic aliases.
// When __AVX2__ is defined every short name below expands to the 256-bit
// (_mm256_*) intrinsic; otherwise it expands to the 128-bit (_mm_*) one, or
// to a local/GDAL fallback (GDAL_mm_packus_epi32, sse2_hadd_epi16). This lets
// the kernels that follow be written once for both register widths.
372 : #ifdef __AVX2__
373 :
374 : #define set1_epi16 _mm256_set1_epi16
375 : #define set1_epi32 _mm256_set1_epi32
376 : #define setzero _mm256_setzero_si256
377 : #define set1_ps _mm256_set1_ps
378 : #define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
379 : #define unpacklo_epi8 _mm256_unpacklo_epi8
380 : #define unpackhi_epi8 _mm256_unpackhi_epi8
381 : #define madd_epi16 _mm256_madd_epi16
382 : #define add_epi32 _mm256_add_epi32
383 : #define mul_ps _mm256_mul_ps
384 : #define cvtepi32_ps _mm256_cvtepi32_ps
385 : #define sqrt_ps _mm256_sqrt_ps
386 : #define cvttps_epi32 _mm256_cvttps_epi32
387 : #define packs_epi32 _mm256_packs_epi32
388 : #define packus_epi32 _mm256_packus_epi32
389 : #define srli_epi32 _mm256_srli_epi32
390 : #define mullo_epi16 _mm256_mullo_epi16
391 : #define srli_epi16 _mm256_srli_epi16
392 : #define cmpgt_epi16 _mm256_cmpgt_epi16
393 : #define add_epi16 _mm256_add_epi16
394 : #define sub_epi16 _mm256_sub_epi16
395 : #define packus_epi16 _mm256_packus_epi16
396 :
397 : /* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
398 : /* to get the lower 128-bit bits of what would be a true 256-bit vector register
399 : */
400 :
// Reorders the four 64-bit quads of x from (0, 1, 2, 3) to (0, 2, 1, 3),
// i.e. swaps the two middle quads.
401 : inline __m256i FIXUP_LANES(__m256i x)
402 : {
403 : return _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0));
404 : }
405 :
// AVX2 stores: both apply FIXUP_LANES first. store_lo writes the low 128 bits
// of the fixed-up register, storeu_int writes all 256 bits.
406 : #define store_lo(x, y) \
407 : _mm_storeu_si128(reinterpret_cast<__m128i *>(x), \
408 : _mm256_extracti128_si256(FIXUP_LANES(y), 0))
409 : #define storeu_int(x, y) \
410 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(x), FIXUP_LANES(y))
411 : #define hadd_epi16 _mm256_hadd_epi16
412 : #else
413 : #define set1_epi16 _mm_set1_epi16
414 : #define set1_epi32 _mm_set1_epi32
415 : #define setzero _mm_setzero_si128
416 : #define set1_ps _mm_set1_ps
417 : #define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
418 : #define unpacklo_epi8 _mm_unpacklo_epi8
419 : #define unpackhi_epi8 _mm_unpackhi_epi8
420 : #define madd_epi16 _mm_madd_epi16
421 : #define add_epi32 _mm_add_epi32
422 : #define mul_ps _mm_mul_ps
423 : #define cvtepi32_ps _mm_cvtepi32_ps
424 : #define sqrt_ps _mm_sqrt_ps
425 : #define cvttps_epi32 _mm_cvttps_epi32
426 : #define packs_epi32 _mm_packs_epi32
427 : #define packus_epi32 GDAL_mm_packus_epi32
428 : #define srli_epi32 _mm_srli_epi32
429 : #define mullo_epi16 _mm_mullo_epi16
430 : #define srli_epi16 _mm_srli_epi16
431 : #define cmpgt_epi16 _mm_cmpgt_epi16
432 : #define add_epi16 _mm_add_epi16
433 : #define sub_epi16 _mm_sub_epi16
434 : #define packus_epi16 _mm_packus_epi16
// 128-bit stores: store_lo writes the low 64 bits, storeu_int all 128 bits.
435 : #define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
436 : #define storeu_int(x, y) _mm_storeu_si128(reinterpret_cast<__m128i *>(x), (y))
437 : #define hadd_epi16 sse2_hadd_epi16
438 : #endif
439 :
// Vectorized 2x2 quadratic-mean (RMS) downsampling of one output row of
// byte samples.
//
// nDstXWidth               : number of output pixels wanted for this row.
// nChunkXSize              : element distance between the two source rows
//                            that are combined (second row is read at
//                            pSrcScanlineShifted + nChunkXSize).
// pSrcScanlineShiftedInOut : in: first source sample of the upper row;
//                            out: advanced by 2 source samples per output
//                            pixel produced.
// pDstScanline             : output row, written from index 0.
//
// Only whole groups of DEST_ELTS output pixels are processed (DEST_ELTS is
// half the vector register size in bytes). Returns the number of output
// pixels written; any remaining pixels are not touched here (presumably
// handled by the caller's scalar loop - confirm at the call site).
// NOTE(review): loads are done with sizeof(register) bytes per row, so T is
// assumed to be a 1-byte type.
440 : template <class T>
441 : static int
442 : #if defined(__GNUC__)
443 : __attribute__((noinline))
444 : #endif
445 5389 : QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
446 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
447 : T *CPL_RESTRICT pDstScanline)
448 : {
449 : // Optimized implementation for RMS on Byte by
450 : // processing by group of 8 output pixels, so as to use
451 : // a single _mm_sqrt_ps() call for 4 output pixels
452 5389 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
453 :
454 5389 : int iDstPixel = 0;
455 5389 : const auto one16 = set1_epi16(1);
456 5389 : const auto one32 = set1_epi32(1);
457 5389 : const auto zero = setzero();
458 5389 : const auto minus32768 = set1_epi16(-32768);
459 :
// Output pixels per iteration: 8 with 128-bit registers, 16 with 256-bit.
460 5389 : constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;
461 521504 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
462 : {
463 : // Load 2 * DEST_ELTS bytes from each line
464 516115 : auto firstLine = loadu_int(pSrcScanlineShifted);
465 1032230 : auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
466 : // Extend those Bytes as UInt16s
467 516115 : auto firstLineLo = unpacklo_epi8(firstLine, zero);
468 516115 : auto firstLineHi = unpackhi_epi8(firstLine, zero);
469 516115 : auto secondLineLo = unpacklo_epi8(secondLine, zero);
470 516115 : auto secondLineHi = unpackhi_epi8(secondLine, zero);
471 :
472 : // Multiplication of 16 bit values and horizontal
473 : // addition of 32 bit results
474 : // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
475 516115 : firstLineLo = madd_epi16(firstLineLo, firstLineLo);
476 516115 : firstLineHi = madd_epi16(firstLineHi, firstLineHi);
477 516115 : secondLineLo = madd_epi16(secondLineLo, secondLineLo);
478 516115 : secondLineHi = madd_epi16(secondLineHi, secondLineHi);
479 :
480 : // Vertical addition
481 516115 : const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
482 516115 : const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
483 :
// (sumSquares + 1) / 4, computed with a logical right shift by 2.
484 : const auto sumSquaresPlusOneDiv4Lo =
485 1032230 : srli_epi32(add_epi32(sumSquaresLo, one32), 2);
486 : const auto sumSquaresPlusOneDiv4Hi =
487 1032230 : srli_epi32(add_epi32(sumSquaresHi, one32), 2);
488 :
489 : // Take square root and truncate/floor to int32
490 : const auto rmsLo =
491 1548340 : cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
492 : const auto rmsHi =
493 1548340 : cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
494 :
495 : // Merge back low and high registers with each RMS value
496 : // as a 16 bit value.
497 516115 : auto rms = packs_epi32(rmsLo, rmsHi);
498 :
499 : // Round to upper value if it minimizes the
500 : // error |rms^2 - sumSquares/4|
501 : // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
502 : // rms += 1;
503 : // which is equivalent to:
504 : // if( rms * (rms + 1) < (sumSquares+1) / 4 )
505 : // rms += 1;
506 : // And both left and right parts fit on 16 (unsigned) bits
507 : const auto sumSquaresPlusOneDiv4 =
508 516115 : packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
509 : // cmpgt_epi16 operates on signed int16, but here
510 : // we have unsigned values, so shift them by -32768 before
511 2580580 : const auto mask = cmpgt_epi16(
512 : add_epi16(sumSquaresPlusOneDiv4, minus32768),
513 : add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
514 : // The value of the mask will be -1 when the correction needs to be
515 : // applied
// Subtracting -1 adds one to exactly the lanes that need rounding up.
516 516115 : rms = sub_epi16(rms, mask);
517 :
518 : // Pack each 16 bit RMS value to 8 bits
519 516115 : rms = packus_epi16(rms, rms /* could be anything */);
520 516115 : store_lo(&pDstScanline[iDstPixel], rms);
521 516115 : pSrcScanlineShifted += 2 * DEST_ELTS;
522 : }
523 :
// Publish how far the source pointer advanced so the caller can continue
// from the first unprocessed sample.
524 5389 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
525 5389 : return iDstPixel;
526 : }
527 :
528 : /************************************************************************/
529 : /* AverageByteSSE2OrAVX2() */
530 : /************************************************************************/
531 :
// Vectorized 2x2 average downsampling of one output row of GByte samples.
//
// nDstXWidth               : number of output pixels wanted for this row.
// nChunkXSize              : element distance between the two source rows
//                            that are averaged together.
// pSrcScanlineShiftedInOut : in: first source sample of the upper row;
//                            out: advanced by 2 source samples per output
//                            pixel produced.
// pDstScanline             : output row, written from index 0.
//
// Each output pixel is (a + b + c + d + 2) >> 2 over its 2x2 source block.
// Only whole groups of 2 * DEST_ELTS output pixels are processed. Returns the
// number of output pixels written; the remainder is not touched here
// (presumably left to the caller's scalar loop - confirm at the call site).
532 : static int
533 123976 : AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
534 : const GByte *&CPL_RESTRICT pSrcScanlineShiftedInOut,
535 : GByte *CPL_RESTRICT pDstScanline)
536 : {
537 : // Optimized implementation for average on Byte by
538 : // processing by group of 16 output pixels for SSE2, or 32 for AVX2
539 :
540 123976 : const auto zero = setzero();
541 123976 : const auto two16 = set1_epi16(2);
542 123976 : const GByte *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
543 :
// Half of the output pixels produced per iteration (one register of 16-bit
// averages); two such halves are packed into one full register of bytes.
544 123976 : constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;
545 123976 : int iDstPixel = 0;
546 2656110 : for (; iDstPixel < nDstXWidth - (2 * DEST_ELTS - 1);
547 2532130 : iDstPixel += 2 * DEST_ELTS)
548 : {
// First half: DEST_ELTS averages kept as 16-bit values.
549 : decltype(setzero()) average0;
550 : {
551 : // Load 2 * DEST_ELTS bytes from each line
552 2532130 : const auto firstLine = loadu_int(pSrcScanlineShifted);
553 : const auto secondLine =
554 5064270 : loadu_int(pSrcScanlineShifted + nChunkXSize);
555 : // Extend those Bytes as UInt16s
556 2532130 : const auto firstLineLo = unpacklo_epi8(firstLine, zero);
557 2532130 : const auto firstLineHi = unpackhi_epi8(firstLine, zero);
558 2532130 : const auto secondLineLo = unpacklo_epi8(secondLine, zero);
559 2532130 : const auto secondLineHi = unpackhi_epi8(secondLine, zero);
560 :
561 : // Vertical addition
562 2532130 : const auto sumLo = add_epi16(firstLineLo, secondLineLo);
563 2532130 : const auto sumHi = add_epi16(firstLineHi, secondLineHi);
564 :
565 : // Horizontal addition of adjacent pairs, and recombine low and high
566 : // parts
567 2532130 : const auto sum = hadd_epi16(sumLo, sumHi);
568 :
569 : // average = (sum + 2) / 4
570 2532130 : average0 = srli_epi16(add_epi16(sum, two16), 2);
571 :
572 2532130 : pSrcScanlineShifted += 2 * DEST_ELTS;
573 : }
574 :
// Second half: same computation on the next 2 * DEST_ELTS source bytes.
575 : decltype(setzero()) average1;
576 : {
577 : // Load 2 * DEST_ELTS bytes from each line
578 2532130 : const auto firstLine = loadu_int(pSrcScanlineShifted);
579 : const auto secondLine =
580 5064270 : loadu_int(pSrcScanlineShifted + nChunkXSize);
581 : // Extend those Bytes as UInt16s
582 2532130 : const auto firstLineLo = unpacklo_epi8(firstLine, zero);
583 2532130 : const auto firstLineHi = unpackhi_epi8(firstLine, zero);
584 2532130 : const auto secondLineLo = unpacklo_epi8(secondLine, zero);
585 2532130 : const auto secondLineHi = unpackhi_epi8(secondLine, zero);
586 :
587 : // Vertical addition
588 2532130 : const auto sumLo = add_epi16(firstLineLo, secondLineLo);
589 2532130 : const auto sumHi = add_epi16(firstLineHi, secondLineHi);
590 :
591 : // Horizontal addition of adjacent pairs, and recombine low and high
592 : // parts
593 2532130 : const auto sum = hadd_epi16(sumLo, sumHi);
594 :
595 : // average = (sum + 2) / 4
596 2532130 : average1 = srli_epi16(add_epi16(sum, two16), 2);
597 :
598 2532130 : pSrcScanlineShifted += 2 * DEST_ELTS;
599 : }
600 :
601 : // Pack each 16 bit average value to 8 bits
602 2532130 : const auto average = packus_epi16(average0, average1);
603 2532130 : storeu_int(&pDstScanline[iDstPixel], average);
604 : }
605 :
// Publish how far the source pointer advanced so the caller can continue
// from the first unprocessed sample.
606 123976 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
607 123976 : return iDstPixel;
608 : }
609 :
610 : /************************************************************************/
611 : /* QuadraticMeanUInt16SSE2() */
612 : /************************************************************************/
613 :
614 : #ifdef __SSE3__
615 : #define sse2_hadd_pd _mm_hadd_pd
616 : #else
// SSE2-only stand-in for _mm_hadd_pd(): returns (a.lo + a.hi, b.lo + b.hi).
inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
{
    // The single-precision move instructions are used purely as 64-bit
    // shuffles; no value is ever interpreted as float.
    const __m128 aBits = _mm_castpd_ps(a);
    const __m128 bBits = _mm_castpd_ps(b);

    const __m128d lowParts = _mm_castps_pd(_mm_movelh_ps(aBits, bBits));
    const __m128d highParts = _mm_castps_pd(_mm_movehl_ps(bBits, aBits));

    return _mm_add_pd(lowParts, highParts);
}
625 : #endif
626 :
// Lane-wise square of the two doubles held in x.
inline __m128d SQUARE_PD(__m128d x)
{
    const __m128d squared = _mm_mul_pd(x, x);
    return squared;
}
631 :
632 : #ifdef __AVX2__
633 :
// Lane-wise square of the four doubles held in x.
inline __m256d SQUARE_PD(__m256d x)
{
    const __m256d squared = _mm256_mul_pd(x, x);
    return squared;
}

// Reorders the four doubles of x from (0, 1, 2, 3) to (0, 2, 1, 3).
inline __m256d FIXUP_LANES(__m256d x)
{
    const __m256d reordered =
        _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
    return reordered;
}

// Same 64-bit reordering applied to a register of floats, by viewing it as
// four doubles for the duration of the shuffle.
inline __m256 FIXUP_LANES(__m256 x)
{
    const __m256d asDoubles = _mm256_castps_pd(x);
    return _mm256_castpd_ps(FIXUP_LANES(asDoubles));
}
648 :
649 : #endif
650 :
// Vectorized 2x2 quadratic-mean (RMS) downsampling of one output row of
// uint16_t samples.
//
// nDstXWidth               : number of output pixels wanted for this row.
// nChunkXSize              : element distance between the two source rows
//                            that are combined.
// pSrcScanlineShiftedInOut : in: first source sample of the upper row;
//                            out: advanced by 2 source samples per output
//                            pixel produced.
// pDstScanline             : output row, written from index 0.
//
// Each iteration produces DEST_ELTS (4) output pixels from 8 samples of each
// row, using one of two paths:
//  - if every loaded sample is below 2^14, an integer/float path;
//  - otherwise a double-precision path (AVX2 or plain SSE2 variant).
// Returns the number of output pixels written; the remainder is not touched
// here (presumably left to the caller's scalar loop - confirm at call site).
651 : static int
652 14 : QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
653 : const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
654 : uint16_t *CPL_RESTRICT pDstScanline)
655 : {
656 : // Optimized implementation for RMS on UInt16 by
657 : // processing by group of 4 output pixels.
658 14 : const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
659 :
660 14 : int iDstPixel = 0;
661 14 : const auto zero = _mm_setzero_si128();
662 :
663 : #ifdef __AVX2__
664 : const auto zeroDot25 = _mm256_set1_pd(0.25);
665 : const auto zeroDot5 = _mm256_set1_pd(0.5);
666 :
667 : // The first four 0's could be anything, as we only take the bottom
668 : // 128 bits.
669 : const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
670 : #else
671 14 : const auto zeroDot25 = _mm_set1_pd(0.25);
672 14 : const auto zeroDot5 = _mm_set1_pd(0.5);
673 : #endif
674 :
// 8 uint16_t per 128-bit register, 2 source columns per output pixel.
675 14 : constexpr int DEST_ELTS =
676 : static_cast<int>(sizeof(zero) / sizeof(uint16_t)) / 2;
677 52 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
678 : {
679 : // Load 8 UInt16 from each line
680 38 : const auto firstLine = _mm_loadu_si128(
681 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
682 : const auto secondLine =
683 38 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
684 38 : pSrcScanlineShifted + nChunkXSize));
685 :
686 : // Detect if all of the source values fit in 14 bits.
687 : // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
688 : // and we can do a much faster implementation.
689 : const auto maskTmp =
690 76 : _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
// 32-bit x86 builds store the packed mask to memory instead of using
// _mm_cvtsi128_si64.
691 : #if defined(__i386__) || defined(_M_IX86)
692 : uint64_t nMaskFitsIn14Bits = 0;
693 : _mm_storel_epi64(
694 : reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
695 : _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
696 : #else
697 38 : const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
698 : _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
699 : #endif
// Fast path: no sample has bit 14 or 15 set.
700 38 : if (nMaskFitsIn14Bits == 0)
701 : {
702 : // Multiplication of 16 bit values and horizontal
703 : // addition of 32 bit results
704 : const auto firstLineHSumSquare =
705 26 : _mm_madd_epi16(firstLine, firstLine);
706 : const auto secondLineHSumSquare =
707 26 : _mm_madd_epi16(secondLine, secondLine);
708 : // Vertical addition
709 : const auto sumSquares =
710 26 : _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
711 : // In theory we should take sqrt(sumSquares * 0.25f)
712 : // but given the rounding we do, this is equivalent to
713 : // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
714 : // sumSquares <= 4 * 16383^2
715 26 : const auto one32 = _mm_set1_epi32(1);
716 : const auto sumSquaresPlusOneDiv4 =
717 52 : _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
718 : // Take square root and truncate/floor to int32
719 78 : auto rms = _mm_cvttps_epi32(
720 : _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
721 :
722 : // Round to upper value if it minimizes the
723 : // error |rms^2 - sumSquares/4|
724 : // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
725 : // rms += 1;
726 : // which is equivalent to:
727 : // if( rms * rms + rms < (sumSquares+1) / 4 )
728 : // rms += 1;
// _mm_madd_epi16(rms, rms) yields rms^2 per 32-bit lane here because each
// rms is below 2^14, so the upper 16-bit half of every lane is zero.
729 : auto mask =
730 78 : _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
731 : _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
// mask lanes are -1 where rounding up is needed; subtracting adds one.
732 26 : rms = _mm_sub_epi32(rms, mask);
733 : // Pack each 32 bit RMS value to 16 bits
734 26 : rms = _mm_packs_epi32(rms, rms /* could be anything */);
735 : _mm_storel_epi64(
736 26 : reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
737 26 : pSrcScanlineShifted += 2 * DEST_ELTS;
738 26 : continue;
739 : }
740 :
741 : // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
742 : // to 32 bit would result in 4 multiplications instead of 8, but
743 : // mullo/mulhi have a worse throughput than mul_pd.
744 :
745 : // Extend those UInt16s as UInt32s
746 12 : const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
747 12 : const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
748 12 : const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
749 12 : const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
750 :
751 : #ifdef __AVX2__
752 : // Multiplication of 32 bit values previously converted to 64 bit double
753 : const auto firstLineLoDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineLo));
754 : const auto firstLineHiDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineHi));
755 : const auto secondLineLoDbl =
756 : SQUARE_PD(_mm256_cvtepi32_pd(secondLineLo));
757 : const auto secondLineHiDbl =
758 : SQUARE_PD(_mm256_cvtepi32_pd(secondLineHi));
759 :
760 : // Vertical addition of squares
761 : const auto sumSquaresLo =
762 : _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
763 : const auto sumSquaresHi =
764 : _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
765 :
766 : // Horizontal addition of squares
767 : const auto sumSquares =
768 : FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
769 :
770 : const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
771 :
772 : // Take square root and truncate/floor to int32
773 : auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
774 : const auto rmsDouble = _mm256_cvtepi32_pd(rms);
775 : const auto right = _mm256_sub_pd(
776 : sumDivWeight, _mm256_add_pd(SQUARE_PD(rmsDouble), rmsDouble));
777 :
778 : auto mask =
779 : _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
780 : // Extract 32-bit from each of the 4 64-bit masks
781 : // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
782 : // _MM_SHUFFLE(2,0,2,0)));
783 : mask = _mm256_permutevar8x32_ps(mask, permutation);
784 : const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
785 :
786 : // Apply the correction
787 : rms = _mm_sub_epi32(rms, maskI);
788 :
789 : // Pack each 32 bit RMS value to 16 bits
790 : rms = _mm_packus_epi32(rms, rms /* could be anything */);
791 : #else
792 : // Multiplication of 32 bit values previously converted to 64 bit double
// _mm_cvtepi32_pd converts only the two low 32-bit lanes, hence the 8-byte
// shift to reach the two upper lanes of each register.
793 12 : const auto firstLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineLo));
794 : const auto firstLineLoHi =
795 24 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
796 12 : const auto firstLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineHi));
797 : const auto firstLineHiHi =
798 24 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
799 :
800 12 : const auto secondLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineLo));
801 : const auto secondLineLoHi =
802 24 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
803 12 : const auto secondLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineHi));
804 : const auto secondLineHiHi =
805 24 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
806 :
807 : // Vertical addition of squares
808 12 : const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
809 12 : const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
810 12 : const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
811 12 : const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
812 :
813 : // Horizontal addition of squares
814 12 : const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
815 12 : const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
816 :
817 12 : const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
818 12 : const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
819 : // Take square root and truncate/floor to int32
820 24 : const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
821 24 : const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
822 :
823 : // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
824 : // if( 0.5 < sumDivWeight - (rms * rms + rms) )
825 : // rms += 1;
826 12 : const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
827 12 : const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
828 24 : const auto rightLo = _mm_sub_pd(
829 : sumDivWeightLo, _mm_add_pd(SQUARE_PD(rmsLoDouble), rmsLoDouble));
830 36 : const auto rightHi = _mm_sub_pd(
831 : sumDivWeightHi, _mm_add_pd(SQUARE_PD(rmsHiDouble), rmsHiDouble));
832 :
833 24 : const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
834 12 : const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
835 : // The value of the mask will be -1 when the correction needs to be
836 : // applied
// Keeps one 32-bit word of each 64-bit comparison result: words 0 and 2 of
// maskLo followed by words 0 and 2 of maskHi.
837 24 : const auto mask = _mm_castps_si128(_mm_shuffle_ps(
838 : maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
839 :
// Concatenates the two valid low halves of rmsLo and rmsHi into one register
// of four 32-bit results.
840 48 : auto rms = _mm_castps_si128(
841 : _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
842 : // Apply the correction
843 12 : rms = _mm_sub_epi32(rms, mask);
844 :
845 : // Pack each 32 bit RMS value to 16 bits
846 12 : rms = GDAL_mm_int32_to_uint16(rms);
847 : #endif
848 :
849 12 : _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
850 : rms);
851 12 : pSrcScanlineShifted += 2 * DEST_ELTS;
852 : }
853 :
// Publish how far the source pointer advanced so the caller can continue
// from the first unprocessed sample.
854 14 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
855 14 : return iDstPixel;
856 : }
857 :
858 : /************************************************************************/
859 : /* AverageUInt16SSE2() */
860 : /************************************************************************/
861 :
862 : static int
863 13 : AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
864 : const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
865 : uint16_t *CPL_RESTRICT pDstScanline)
866 : {
867 : // Optimized implementation for average on UInt16 by
868 : // processing by group of 8 output pixels.
869 :
870 13 : const auto mask = _mm_set1_epi32(0xFFFF);
871 13 : const auto two = _mm_set1_epi32(2);
872 13 : const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
873 :
874 13 : int iDstPixel = 0;
875 13 : constexpr int DEST_ELTS = static_cast<int>(sizeof(mask) / sizeof(uint16_t));
876 25 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
877 : {
878 : __m128i averageLow;
879 : // Load 8 UInt16 from each line
880 : {
881 12 : const auto firstLine = _mm_loadu_si128(
882 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
883 : const auto secondLine =
884 12 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
885 12 : pSrcScanlineShifted + nChunkXSize));
886 :
887 : // Horizontal addition and extension to 32 bit
888 36 : const auto horizAddFirstLine = _mm_add_epi32(
889 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
890 : const auto horizAddSecondLine =
891 36 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
892 : _mm_srli_epi32(secondLine, 16));
893 :
894 : // Vertical addition and average computation
895 : // average = (sum + 2) >> 2
896 24 : const auto sum = _mm_add_epi32(
897 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
898 12 : averageLow = _mm_srli_epi32(sum, 2);
899 : }
900 : // Load 8 UInt16 from each line
901 : __m128i averageHigh;
902 : {
903 : const auto firstLine =
904 12 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
905 12 : pSrcScanlineShifted + DEST_ELTS));
906 : const auto secondLine =
907 12 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
908 12 : pSrcScanlineShifted + DEST_ELTS + nChunkXSize));
909 :
910 : // Horizontal addition and extension to 32 bit
911 36 : const auto horizAddFirstLine = _mm_add_epi32(
912 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
913 : const auto horizAddSecondLine =
914 36 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
915 : _mm_srli_epi32(secondLine, 16));
916 :
917 : // Vertical addition and average computation
918 : // average = (sum + 2) >> 2
919 24 : const auto sum = _mm_add_epi32(
920 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
921 12 : averageHigh = _mm_srli_epi32(sum, 2);
922 : }
923 :
924 : // Pack each 32 bit average value to 16 bits
925 12 : auto average = GDAL_mm_packus_epi32(averageLow, averageHigh);
926 12 : _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
927 : average);
928 12 : pSrcScanlineShifted += 2 * DEST_ELTS;
929 : }
930 :
931 13 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
932 13 : return iDstPixel;
933 : }
934 :
935 : /************************************************************************/
936 : /* QuadraticMeanFloatSSE2() */
937 : /************************************************************************/
938 :
939 : #if !defined(ARM_V7)
940 :
#ifdef __SSE3__
// SSE3 has a native horizontal add
#define sse2_hadd_ps _mm_hadd_ps
#else
// SSE2 emulation of _mm_hadd_ps():
// returns (a0 + a1, a2 + a3, b0 + b1, b2 + b3)
inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
{
    // (a0, a2, b0, b2) + (a1, a3, b1, b3)
    return _mm_add_ps(_mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)),
                      _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)));
}
#endif
951 :
// Width-agnostic aliases used by QuadraticMeanFloatSSE2(): they resolve to
// the 256 bit intrinsics when AVX2 is available and to the 128 bit ones
// otherwise. Both branches must provide the same set of names, including the
// SQUARE_PS() and FIXUP_LANES() helpers.
#ifdef __AVX2__
#define set1_ps _mm256_set1_ps
#define loadu_ps _mm256_loadu_ps
#define andnot_ps _mm256_andnot_ps
#define and_ps _mm256_and_ps
#define max_ps _mm256_max_ps
#define shuffle_ps _mm256_shuffle_ps
#define div_ps _mm256_div_ps
#define cmpeq_ps(x, y) _mm256_cmp_ps((x), (y), _CMP_EQ_OQ)
#define mul_ps _mm256_mul_ps
#define add_ps _mm256_add_ps
#define hadd_ps _mm256_hadd_ps
#define sqrt_ps _mm256_sqrt_ps
#define or_ps _mm256_or_ps
#define unpacklo_ps _mm256_unpacklo_ps
#define unpackhi_ps _mm256_unpackhi_ps
#define storeu_ps _mm256_storeu_ps
#define blendv_ps _mm256_blendv_ps

// Element-wise square
inline __m256 SQUARE_PS(__m256 x)
{
    return _mm256_mul_ps(x, x);
}

// _mm256_shuffle_ps() works independently on each 128 bit lane, so the
// even/odd de-interleaving of two consecutive registers yields results for
// output pixels in the order (0, 1, 4, 5, 2, 3, 6, 7). Swap the two middle
// 64 bit quarters to restore the natural order (0, 1, 2, 3, 4, 5, 6, 7).
inline __m256 FIXUP_LANES(__m256 x)
{
    return _mm256_castpd_ps(
        _mm256_permute4x64_pd(_mm256_castps_pd(x), _MM_SHUFFLE(3, 1, 2, 0)));
}

#else

#define set1_ps _mm_set1_ps
#define loadu_ps _mm_loadu_ps
#define andnot_ps _mm_andnot_ps
#define and_ps _mm_and_ps
#define max_ps _mm_max_ps
#define shuffle_ps _mm_shuffle_ps
#define div_ps _mm_div_ps
#define cmpeq_ps _mm_cmpeq_ps
#define mul_ps _mm_mul_ps
#define add_ps _mm_add_ps
#define hadd_ps sse2_hadd_ps
#define sqrt_ps _mm_sqrt_ps
#define or_ps _mm_or_ps
#define unpacklo_ps _mm_unpacklo_ps
#define unpackhi_ps _mm_unpackhi_ps
#define storeu_ps _mm_storeu_ps

// Per-element selection: takes the element of b where mask is set, and the
// element of a otherwise. The fallback expects mask elements to be all-ones
// or all-zeros, such as those produced by a comparison.
inline __m128 blendv_ps(__m128 a, __m128 b, __m128 mask)
{
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    return _mm_blendv_ps(a, b, mask);
#else
    return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b));
#endif
}

// Element-wise square
inline __m128 SQUARE_PS(__m128 x)
{
    return _mm_mul_ps(x, x);
}

// With a single 128 bit lane, the shuffled results are already in order
inline __m128 FIXUP_LANES(__m128 x)
{
    return x;
}

#endif
1015 :
1016 : static int
1017 : #if defined(__GNUC__)
1018 : __attribute__((noinline))
1019 : #endif
1020 66 : QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
1021 : const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1022 : float *CPL_RESTRICT pDstScanline)
1023 : {
1024 : // Optimized implementation for RMS on Float32 by
1025 : // processing by group of output pixels.
1026 66 : const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1027 :
1028 66 : int iDstPixel = 0;
1029 66 : const auto minus_zero = set1_ps(-0.0f);
1030 66 : const auto zeroDot25 = set1_ps(0.25f);
1031 66 : const auto one = set1_ps(1.0f);
1032 66 : const auto infv = set1_ps(std::numeric_limits<float>::infinity());
1033 66 : constexpr int DEST_ELTS = static_cast<int>(sizeof(one) / sizeof(float));
1034 :
1035 198 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1036 : {
1037 : // Load 2*DEST_ELTS Float32 from each line
1038 132 : auto firstLineLo = loadu_ps(pSrcScanlineShifted);
1039 132 : auto firstLineHi = loadu_ps(pSrcScanlineShifted + DEST_ELTS);
1040 132 : auto secondLineLo = loadu_ps(pSrcScanlineShifted + nChunkXSize);
1041 : auto secondLineHi =
1042 264 : loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize);
1043 :
1044 : // Take the absolute value
1045 132 : firstLineLo = andnot_ps(minus_zero, firstLineLo);
1046 132 : firstLineHi = andnot_ps(minus_zero, firstLineHi);
1047 132 : secondLineLo = andnot_ps(minus_zero, secondLineLo);
1048 132 : secondLineHi = andnot_ps(minus_zero, secondLineHi);
1049 :
1050 : auto firstLineEven =
1051 132 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1052 : auto firstLineOdd =
1053 132 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1054 : auto secondLineEven =
1055 132 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1056 : auto secondLineOdd =
1057 132 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1058 :
1059 : // Compute the maximum of each DEST_ELTS value to RMS-average
1060 396 : const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
1061 : max_ps(secondLineEven, secondLineOdd));
1062 :
1063 : // Normalize each value by the maximum of the DEST_ELTS ones.
1064 : // This step is important to avoid that the square evaluates to infinity
1065 : // for sufficiently big input.
1066 132 : auto invMax = div_ps(one, maxV);
1067 : // Deal with 0 being the maximum to correct division by zero
1068 : // note: comparing to -0 leads to identical results as to comparing with
1069 : // 0
1070 264 : invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
1071 :
1072 132 : firstLineEven = mul_ps(firstLineEven, invMax);
1073 132 : firstLineOdd = mul_ps(firstLineOdd, invMax);
1074 132 : secondLineEven = mul_ps(secondLineEven, invMax);
1075 132 : secondLineOdd = mul_ps(secondLineOdd, invMax);
1076 :
1077 : // Compute squares
1078 132 : firstLineEven = SQUARE_PS(firstLineEven);
1079 132 : firstLineOdd = SQUARE_PS(firstLineOdd);
1080 132 : secondLineEven = SQUARE_PS(secondLineEven);
1081 132 : secondLineOdd = SQUARE_PS(secondLineOdd);
1082 :
1083 396 : const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
1084 : add_ps(secondLineEven, secondLineOdd));
1085 :
1086 396 : auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
1087 :
1088 : // Deal with infinity being the maximum
1089 132 : const auto maskIsInf = cmpeq_ps(maxV, infv);
1090 132 : rms = blendv_ps(rms, infv, maskIsInf);
1091 :
1092 132 : rms = FIXUP_LANES(rms);
1093 :
1094 132 : storeu_ps(&pDstScanline[iDstPixel], rms);
1095 132 : pSrcScanlineShifted += DEST_ELTS * 2;
1096 : }
1097 :
1098 66 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1099 66 : return iDstPixel;
1100 : }
1101 :
1102 : /************************************************************************/
1103 : /* AverageFloatSSE2() */
1104 : /************************************************************************/
1105 :
1106 50 : static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
1107 : const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1108 : float *CPL_RESTRICT pDstScanline)
1109 : {
1110 : // Optimized implementation for average on Float32 by
1111 : // processing by group of output pixels.
1112 50 : const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1113 :
1114 50 : int iDstPixel = 0;
1115 50 : const auto zeroDot25 = _mm_set1_ps(0.25f);
1116 50 : constexpr int DEST_ELTS =
1117 : static_cast<int>(sizeof(zeroDot25) / sizeof(float));
1118 :
1119 132 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1120 : {
1121 : // Load 2 * DEST_ELTS Float32 from each line
1122 : const auto firstLineLo =
1123 82 : _mm_mul_ps(_mm_loadu_ps(pSrcScanlineShifted), zeroDot25);
1124 164 : const auto firstLineHi = _mm_mul_ps(
1125 : _mm_loadu_ps(pSrcScanlineShifted + DEST_ELTS), zeroDot25);
1126 82 : const auto secondLineLo = _mm_mul_ps(
1127 82 : _mm_loadu_ps(pSrcScanlineShifted + nChunkXSize), zeroDot25);
1128 164 : const auto secondLineHi = _mm_mul_ps(
1129 82 : _mm_loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize),
1130 : zeroDot25);
1131 :
1132 : // Vertical addition
1133 82 : const auto tmpLo = _mm_add_ps(firstLineLo, secondLineLo);
1134 82 : const auto tmpHi = _mm_add_ps(firstLineHi, secondLineHi);
1135 :
1136 : // Horizontal addition
1137 82 : const auto average = sse2_hadd_ps(tmpLo, tmpHi);
1138 :
1139 82 : _mm_storeu_ps(&pDstScanline[iDstPixel], average);
1140 82 : pSrcScanlineShifted += DEST_ELTS * 2;
1141 : }
1142 :
1143 50 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1144 50 : return iDstPixel;
1145 : }
1146 :
1147 : /************************************************************************/
1148 : /* AverageDoubleSSE2() */
1149 : /************************************************************************/
1150 :
1151 : static int
1152 50 : AverageDoubleSSE2(int nDstXWidth, int nChunkXSize,
1153 : const double *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1154 : double *CPL_RESTRICT pDstScanline)
1155 : {
1156 : // Optimized implementation for average on Float64 by
1157 : // processing by group of output pixels.
1158 50 : const double *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1159 :
1160 50 : int iDstPixel = 0;
1161 50 : const auto zeroDot25 = _mm_set1_pd(0.25);
1162 50 : constexpr int DEST_ELTS =
1163 : static_cast<int>(sizeof(zeroDot25) / sizeof(double));
1164 :
1165 211 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1166 : {
1167 : // Load 4 * DEST_ELTS Float64 from each line
1168 161 : const auto firstLine0 = _mm_mul_pd(
1169 : _mm_loadu_pd(pSrcScanlineShifted + 0 * DEST_ELTS), zeroDot25);
1170 322 : const auto firstLine1 = _mm_mul_pd(
1171 : _mm_loadu_pd(pSrcScanlineShifted + 1 * DEST_ELTS), zeroDot25);
1172 161 : const auto secondLine0 = _mm_mul_pd(
1173 161 : _mm_loadu_pd(pSrcScanlineShifted + 0 * DEST_ELTS + nChunkXSize),
1174 : zeroDot25);
1175 322 : const auto secondLine1 = _mm_mul_pd(
1176 161 : _mm_loadu_pd(pSrcScanlineShifted + 1 * DEST_ELTS + nChunkXSize),
1177 : zeroDot25);
1178 :
1179 : // Vertical addition
1180 161 : const auto tmp0 = _mm_add_pd(firstLine0, secondLine0);
1181 161 : const auto tmp1 = _mm_add_pd(firstLine1, secondLine1);
1182 :
1183 : // Horizontal addition
1184 161 : const auto average0 = sse2_hadd_pd(tmp0, tmp1);
1185 :
1186 161 : _mm_storeu_pd(&pDstScanline[iDstPixel + 0], average0);
1187 161 : pSrcScanlineShifted += DEST_ELTS * 2;
1188 : }
1189 :
1190 50 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1191 50 : return iDstPixel;
1192 : }
1193 :
1194 : #endif
1195 :
1196 : #endif
1197 :
1198 : /************************************************************************/
1199 : /* GDALResampleChunk_AverageOrRMS() */
1200 : /************************************************************************/
1201 :
1202 : template <class T, class Tsum, GDALDataType eWrkDataType, bool bQuadraticMean>
1203 : static CPLErr
1204 7362 : GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
1205 : const T *pChunk, void **ppDstBuffer)
1206 : {
1207 7362 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1208 7362 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1209 7362 : const double dfSrcXDelta = args.dfSrcXDelta;
1210 7362 : const double dfSrcYDelta = args.dfSrcYDelta;
1211 7362 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1212 7362 : const int nChunkXOff = args.nChunkXOff;
1213 7362 : const int nChunkYOff = args.nChunkYOff;
1214 7362 : const int nChunkXSize = args.nChunkXSize;
1215 7362 : const int nChunkYSize = args.nChunkYSize;
1216 7362 : const int nDstXOff = args.nDstXOff;
1217 7362 : const int nDstXOff2 = args.nDstXOff2;
1218 7362 : const int nDstYOff = args.nDstYOff;
1219 7362 : const int nDstYOff2 = args.nDstYOff2;
1220 7362 : const char *pszResampling = args.pszResampling;
1221 7362 : bool bHasNoData = args.bHasNoData;
1222 7362 : const double dfNoDataValue = args.dfNoDataValue;
1223 7362 : const GDALColorTable *const poColorTable =
1224 : !bQuadraticMean &&
1225 : // AVERAGE_BIT2GRAYSCALE
1226 7279 : STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G")
1227 : ? nullptr
1228 : : args.poColorTable;
1229 7362 : const bool bPropagateNoData = args.bPropagateNoData;
1230 :
1231 7362 : T tNoDataValue = (!bHasNoData) ? 0 : static_cast<T>(dfNoDataValue);
1232 7362 : const T tReplacementVal =
1233 206 : bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
1234 72 : args.eOvrDataType, dfNoDataValue))
1235 : : 0;
1236 :
1237 7362 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
1238 7362 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1239 7362 : const int nDstXWidth = nDstXOff2 - nDstXOff;
1240 :
1241 : /* -------------------------------------------------------------------- */
1242 : /* Allocate buffers. */
1243 : /* -------------------------------------------------------------------- */
1244 7362 : *ppDstBuffer = static_cast<T *>(
1245 7362 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
1246 : GDALGetDataTypeSizeBytes(eWrkDataType)));
1247 7362 : if (*ppDstBuffer == nullptr)
1248 : {
1249 0 : return CE_Failure;
1250 : }
1251 7362 : T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
1252 :
1253 : struct PrecomputedXValue
1254 : {
1255 : int nLeftXOffShifted;
1256 : int nRightXOffShifted;
1257 : double dfLeftWeight;
1258 : double dfRightWeight;
1259 : double dfTotalWeightFullLine;
1260 : };
1261 :
1262 : PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
1263 7362 : VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(PrecomputedXValue)));
1264 :
1265 7362 : if (pasSrcX == nullptr)
1266 : {
1267 0 : return CE_Failure;
1268 : }
1269 :
1270 7362 : std::vector<GDALColorEntry> colorEntries;
1271 :
1272 7362 : if (poColorTable)
1273 : {
1274 5 : int nTransparentIdx = -1;
1275 5 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1276 :
1277 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1278 : // it as nodata value
1279 6 : if (bHasNoData && dfNoDataValue >= 0.0 &&
1280 1 : tNoDataValue < colorEntries.size())
1281 1 : colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1282 :
1283 : // Or if we have no explicit nodata, but a color table entry that is
1284 : // transparent, consider it as the nodata value
1285 4 : else if (!bHasNoData && nTransparentIdx >= 0)
1286 : {
1287 0 : bHasNoData = true;
1288 0 : tNoDataValue = static_cast<T>(nTransparentIdx);
1289 : }
1290 : }
1291 :
1292 : /* ==================================================================== */
1293 : /* Precompute inner loop constants. */
1294 : /* ==================================================================== */
1295 7362 : bool bSrcXSpacingIsTwo = true;
1296 7362 : int nLastSrcXOff2 = -1;
1297 1689160 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1298 : {
1299 1681805 : const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
1300 : // Apply some epsilon to avoid numerical precision issues
1301 1681805 : const int nSrcXOff =
1302 1681805 : std::max(static_cast<int>(dfSrcXOff + 1e-8), nChunkXOff);
1303 1681805 : const double dfSrcXOff2 =
1304 1681805 : dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
1305 1681805 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
1306 1681805 : if (nSrcXOff2 == nSrcXOff)
1307 0 : nSrcXOff2++;
1308 1681805 : if (nSrcXOff2 > nChunkRightXOff)
1309 1 : nSrcXOff2 = nChunkRightXOff;
1310 :
1311 1681805 : pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
1312 1681805 : pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
1313 1681805 : nSrcXOff2 - nChunkXOff;
1314 21 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
1315 1681805 : (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
1316 1681805 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
1317 1681805 : 1 - (nSrcXOff2 - dfSrcXOff2);
1318 1681805 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
1319 1681805 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
1320 1681805 : if (nSrcXOff + 1 < nSrcXOff2)
1321 : {
1322 1681779 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1323 1681779 : nSrcXOff2 - nSrcXOff - 2;
1324 1681779 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1325 1681779 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
1326 : }
1327 :
1328 1681805 : if (nSrcXOff2 - nSrcXOff != 2 ||
1329 1583882 : (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
1330 : {
1331 91989 : bSrcXSpacingIsTwo = false;
1332 : }
1333 1681805 : nLastSrcXOff2 = nSrcXOff2;
1334 : }
1335 :
1336 : /* ==================================================================== */
1337 : /* Loop over destination scanlines. */
1338 : /* ==================================================================== */
1339 705422 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1340 : {
1341 698060 : const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
1342 698060 : int nSrcYOff = std::max(static_cast<int>(dfSrcYOff + 1e-8), nChunkYOff);
1343 :
1344 698060 : const double dfSrcYOff2 =
1345 698060 : dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
1346 698060 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
1347 698060 : if (nSrcYOff2 == nSrcYOff)
1348 0 : ++nSrcYOff2;
1349 698060 : if (nSrcYOff2 > nChunkBottomYOff)
1350 3 : nSrcYOff2 = nChunkBottomYOff;
1351 :
1352 698060 : T *const pDstScanline =
1353 698060 : pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
1354 :
1355 : /* --------------------------------------------------------------------
1356 : */
1357 : /* Loop over destination pixels */
1358 : /* --------------------------------------------------------------------
1359 : */
1360 698060 : if (poColorTable == nullptr)
1361 : {
1362 697945 : if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
1363 : pabyChunkNodataMask == nullptr)
1364 : {
1365 : if constexpr (eWrkDataType == GDT_UInt8 ||
1366 : eWrkDataType == GDT_UInt16)
1367 : {
1368 : // Optimized case : no nodata, overview by a factor of 2 and
1369 : // regular x and y src spacing.
1370 129392 : const T *pSrcScanlineShifted =
1371 129392 : pChunk + pasSrcX[0].nLeftXOffShifted +
1372 129392 : static_cast<size_t>(nSrcYOff - nChunkYOff) *
1373 129392 : nChunkXSize;
1374 129392 : int iDstPixel = 0;
1375 : #ifdef USE_SSE2
1376 : if constexpr (eWrkDataType == GDT_UInt8)
1377 : {
1378 : if constexpr (bQuadraticMean)
1379 : {
1380 5389 : iDstPixel = QuadraticMeanByteSSE2OrAVX2(
1381 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1382 : pDstScanline);
1383 : }
1384 : else
1385 : {
1386 123976 : iDstPixel = AverageByteSSE2OrAVX2(
1387 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1388 : pDstScanline);
1389 : }
1390 : }
1391 : else
1392 : {
1393 : static_assert(eWrkDataType == GDT_UInt16);
1394 : if constexpr (bQuadraticMean)
1395 : {
1396 14 : iDstPixel = QuadraticMeanUInt16SSE2(
1397 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1398 : pDstScanline);
1399 : }
1400 : else
1401 : {
1402 13 : iDstPixel = AverageUInt16SSE2(
1403 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1404 : pDstScanline);
1405 : }
1406 : }
1407 : #endif
1408 303851 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1409 : {
1410 174459 : Tsum nTotal = 0;
1411 : T nVal;
1412 : if constexpr (bQuadraticMean)
1413 52 : nTotal =
1414 52 : SQUARE<Tsum>(pSrcScanlineShifted[0]) +
1415 52 : SQUARE<Tsum>(pSrcScanlineShifted[1]) +
1416 52 : SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
1417 52 : SQUARE<Tsum>(
1418 52 : pSrcScanlineShifted[1 + nChunkXSize]);
1419 : else
1420 174407 : nTotal = pSrcScanlineShifted[0] +
1421 174407 : pSrcScanlineShifted[1] +
1422 174407 : pSrcScanlineShifted[nChunkXSize] +
1423 174407 : pSrcScanlineShifted[1 + nChunkXSize];
1424 :
1425 174459 : constexpr int nTotalWeight = 4;
1426 : if constexpr (bQuadraticMean)
1427 52 : nVal = ComputeIntegerRMS_4values<T>(nTotal);
1428 : else
1429 174407 : nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
1430 : nTotalWeight);
1431 :
1432 : // No need to compare nVal against tNoDataValue as we
1433 : // are in a case where pabyChunkNodataMask == nullptr
1434 : // implies the absence of nodata value.
1435 174459 : pDstScanline[iDstPixel] = nVal;
1436 174459 : pSrcScanlineShifted += 2;
1437 : }
1438 : }
1439 : else
1440 : {
1441 : static_assert(eWrkDataType == GDT_Float32 ||
1442 : eWrkDataType == GDT_Float64);
1443 202 : const T *pSrcScanlineShifted =
1444 202 : pChunk + pasSrcX[0].nLeftXOffShifted +
1445 202 : static_cast<size_t>(nSrcYOff - nChunkYOff) *
1446 202 : nChunkXSize;
1447 202 : int iDstPixel = 0;
1448 : #if defined(USE_SSE2) && !defined(ARM_V7)
1449 : if constexpr (eWrkDataType == GDT_Float32)
1450 : {
1451 : static_assert(std::is_same_v<T, float>);
1452 : if constexpr (bQuadraticMean)
1453 : {
1454 66 : iDstPixel = QuadraticMeanFloatSSE2(
1455 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1456 : pDstScanline);
1457 : }
1458 : else
1459 : {
1460 50 : iDstPixel = AverageFloatSSE2(
1461 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1462 : pDstScanline);
1463 : }
1464 : }
1465 : else
1466 : {
1467 : if constexpr (!bQuadraticMean)
1468 : {
1469 50 : iDstPixel = AverageDoubleSSE2(
1470 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1471 : pDstScanline);
1472 : }
1473 : }
1474 : #endif
1475 :
1476 726 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1477 : {
1478 : T nVal;
1479 :
1480 : if constexpr (bQuadraticMean)
1481 : {
1482 : // Avoid issues with large values by renormalizing
1483 96 : const auto max = std::max(
1484 420 : {std::fabs(pSrcScanlineShifted[0]),
1485 420 : std::fabs(pSrcScanlineShifted[1]),
1486 420 : std::fabs(pSrcScanlineShifted[nChunkXSize]),
1487 420 : std::fabs(
1488 420 : pSrcScanlineShifted[1 + nChunkXSize])});
1489 420 : if (max == 0)
1490 : {
1491 8 : nVal = 0;
1492 : }
1493 412 : else if (std::isinf(max))
1494 : {
1495 : // If there is at least one infinity value,
1496 : // then just summing, and taking the abs
1497 : // value will give the expected result:
1498 : // * +inf if all values are +inf
1499 : // * +inf if all values are -inf
1500 : // * NaN otherwise
1501 82 : nVal = std::fabs(
1502 82 : pSrcScanlineShifted[0] +
1503 82 : pSrcScanlineShifted[1] +
1504 82 : pSrcScanlineShifted[nChunkXSize] +
1505 82 : pSrcScanlineShifted[1 + nChunkXSize]);
1506 : }
1507 : else
1508 : {
1509 330 : const auto inv_max = static_cast<T>(1.0) / max;
1510 330 : nVal =
1511 : max *
1512 330 : std::sqrt(
1513 : static_cast<T>(0.25) *
1514 330 : (SQUARE(pSrcScanlineShifted[0] *
1515 330 : inv_max) +
1516 330 : SQUARE(pSrcScanlineShifted[1] *
1517 330 : inv_max) +
1518 330 : SQUARE(
1519 330 : pSrcScanlineShifted[nChunkXSize] *
1520 330 : inv_max) +
1521 330 : SQUARE(
1522 330 : pSrcScanlineShifted[1 +
1523 : nChunkXSize] *
1524 : inv_max)));
1525 : }
1526 : }
1527 : else
1528 : {
1529 104 : constexpr auto weight = static_cast<T>(0.25);
1530 : // Multiply each value by weight to avoid
1531 : // potential overflow
1532 104 : nVal =
1533 104 : (weight * pSrcScanlineShifted[0] +
1534 104 : weight * pSrcScanlineShifted[1] +
1535 104 : weight * pSrcScanlineShifted[nChunkXSize] +
1536 104 : weight * pSrcScanlineShifted[1 + nChunkXSize]);
1537 : }
1538 :
1539 : // No need to compare nVal against tNoDataValue as we
1540 : // are in a case where pabyChunkNodataMask == nullptr
1541 : // implies the absence of nodata value.
1542 524 : pDstScanline[iDstPixel] = nVal;
1543 524 : pSrcScanlineShifted += 2;
1544 : }
1545 129594 : }
1546 : }
1547 : else
1548 : {
1549 17 : const double dfBottomWeight =
1550 568351 : (nSrcYOff + 1 == nSrcYOff2) ? 1.0
1551 568334 : : 1.0 - (dfSrcYOff - nSrcYOff);
1552 568351 : const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
1553 568351 : nSrcYOff -= nChunkYOff;
1554 568351 : nSrcYOff2 -= nChunkYOff;
1555 :
1556 568351 : double dfTotalWeightFullColumn = dfBottomWeight;
1557 568351 : if (nSrcYOff + 1 < nSrcYOff2)
1558 : {
1559 568334 : dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
1560 568334 : dfTotalWeightFullColumn += dfTopWeight;
1561 : }
1562 :
1563 9784185 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1564 : {
1565 9215839 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1566 9215839 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1567 :
1568 9215839 : double dfTotal = 0;
1569 9215839 : double dfTotalWeight = 0;
1570 9215839 : [[maybe_unused]] double dfMulFactor = 1.0;
1571 9215839 : [[maybe_unused]] double dfInvMulFactor = 1.0;
1572 9215839 : constexpr bool bUseMulFactor =
1573 : (eWrkDataType == GDT_Float32 ||
1574 : eWrkDataType == GDT_Float64);
1575 9215839 : if (pabyChunkNodataMask == nullptr)
1576 : {
1577 : if constexpr (bUseMulFactor)
1578 : {
1579 : if constexpr (bQuadraticMean)
1580 : {
1581 80 : T mulFactor = 0;
1582 80 : auto pChunkShifted =
1583 80 : pChunk +
1584 80 : static_cast<size_t>(nSrcYOff) * nChunkXSize;
1585 :
1586 240 : for (int iY = nSrcYOff; iY < nSrcYOff2;
1587 160 : ++iY, pChunkShifted += nChunkXSize)
1588 : {
1589 480 : for (int iX = nSrcXOff; iX < nSrcXOff2;
1590 : ++iX)
1591 640 : mulFactor = std::max(
1592 : mulFactor,
1593 320 : std::fabs(pChunkShifted[iX]));
1594 : }
1595 80 : dfMulFactor = double(mulFactor);
1596 142 : dfInvMulFactor =
1597 62 : dfMulFactor > 0 &&
1598 62 : std::isfinite(dfMulFactor)
1599 : ? 1.0 / dfMulFactor
1600 : : 1.0;
1601 : }
1602 : else
1603 : {
1604 139 : dfMulFactor = (nSrcYOff2 - nSrcYOff) *
1605 139 : (nSrcXOff2 - nSrcXOff);
1606 139 : dfInvMulFactor = 1.0 / dfMulFactor;
1607 : }
1608 : }
1609 :
1610 1746545 : auto pChunkShifted =
1611 227 : pChunk +
1612 1746545 : static_cast<size_t>(nSrcYOff) * nChunkXSize;
1613 1746545 : int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1614 1746545 : double dfWeightY = dfBottomWeight;
1615 3493539 : while (true)
1616 : {
1617 : double dfTotalLine;
1618 : if constexpr (bQuadraticMean)
1619 : {
1620 : // Left pixel
1621 : {
1622 216 : const T val = pChunkShifted[nSrcXOff];
1623 216 : dfTotalLine =
1624 216 : SQUARE(double(val) * dfInvMulFactor) *
1625 216 : pasSrcX[iDstPixel].dfLeftWeight;
1626 : }
1627 :
1628 216 : if (nSrcXOff + 1 < nSrcXOff2)
1629 : {
1630 : // Middle pixels
1631 216 : for (int iX = nSrcXOff + 1;
1632 536 : iX < nSrcXOff2 - 1; ++iX)
1633 : {
1634 320 : const T val = pChunkShifted[iX];
1635 320 : dfTotalLine += SQUARE(double(val) *
1636 : dfInvMulFactor);
1637 : }
1638 :
1639 : // Right pixel
1640 : {
1641 216 : const T val =
1642 216 : pChunkShifted[nSrcXOff2 - 1];
1643 216 : dfTotalLine +=
1644 216 : SQUARE(double(val) *
1645 216 : dfInvMulFactor) *
1646 216 : pasSrcX[iDstPixel].dfRightWeight;
1647 : }
1648 : }
1649 : }
1650 : else
1651 : {
1652 : // Left pixel
1653 : {
1654 5239868 : const T val = pChunkShifted[nSrcXOff];
1655 5239868 : dfTotalLine =
1656 5239868 : double(val) * dfInvMulFactor *
1657 5239868 : pasSrcX[iDstPixel].dfLeftWeight;
1658 : }
1659 :
1660 5239868 : if (nSrcXOff + 1 < nSrcXOff2)
1661 : {
1662 : // Middle pixels
1663 4239442 : for (int iX = nSrcXOff + 1;
1664 64183238 : iX < nSrcXOff2 - 1; ++iX)
1665 : {
1666 59943836 : const T val = pChunkShifted[iX];
1667 59943836 : dfTotalLine +=
1668 59943836 : double(val) * dfInvMulFactor;
1669 : }
1670 :
1671 : // Right pixel
1672 : {
1673 4239442 : const T val =
1674 4239442 : pChunkShifted[nSrcXOff2 - 1];
1675 4239442 : dfTotalLine +=
1676 4239442 : double(val) * dfInvMulFactor *
1677 4239442 : pasSrcX[iDstPixel].dfRightWeight;
1678 : }
1679 : }
1680 : }
1681 :
1682 5240084 : dfTotal += dfTotalLine * dfWeightY;
1683 5240084 : --nCounterY;
1684 5240084 : if (nCounterY < 0)
1685 1746545 : break;
1686 3493539 : pChunkShifted += nChunkXSize;
1687 3493539 : dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
1688 : }
1689 :
1690 1746545 : dfTotalWeight =
1691 1746545 : pasSrcX[iDstPixel].dfTotalWeightFullLine *
1692 : dfTotalWeightFullColumn;
1693 : }
1694 : else
1695 : {
1696 7469294 : size_t nCount = 0;
1697 30285576 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1698 : {
1699 22816292 : const auto pChunkShifted =
1700 22816292 : pChunk + static_cast<size_t>(iY) * nChunkXSize;
1701 :
1702 22816292 : double dfTotalLine = 0;
1703 22816292 : double dfTotalWeightLine = 0;
1704 : // Left pixel
1705 : {
1706 22816292 : const int iX = nSrcXOff;
1707 22816292 : const T val = pChunkShifted[iX];
1708 22816292 : if (pabyChunkNodataMask
1709 22816292 : [iX +
1710 22816292 : static_cast<size_t>(iY) * nChunkXSize])
1711 : {
1712 17325139 : nCount++;
1713 17325139 : const double dfWeightX =
1714 17325139 : pasSrcX[iDstPixel].dfLeftWeight;
1715 17325139 : dfTotalWeightLine = dfWeightX;
1716 : if constexpr (bQuadraticMean)
1717 508 : dfTotalLine =
1718 508 : SQUARE(double(val)) * dfWeightX;
1719 : else
1720 17324631 : dfTotalLine = double(val) * dfWeightX;
1721 : }
1722 : }
1723 :
1724 22816292 : if (nSrcXOff < nSrcXOff2 - 1)
1725 : {
1726 : // Middle pixels
1727 61618372 : for (int iX = nSrcXOff + 1; iX < nSrcXOff2 - 1;
1728 : ++iX)
1729 : {
1730 38802080 : const T val = pChunkShifted[iX];
1731 38802080 : if (pabyChunkNodataMask
1732 38802080 : [iX + static_cast<size_t>(iY) *
1733 38802080 : nChunkXSize])
1734 : {
1735 28038780 : nCount++;
1736 28038780 : dfTotalWeightLine += 1;
1737 : if constexpr (bQuadraticMean)
1738 640 : dfTotalLine += SQUARE(double(val));
1739 : else
1740 28038140 : dfTotalLine += double(val);
1741 : }
1742 : }
1743 :
1744 : // Right pixel
1745 : {
1746 22816292 : const int iX = nSrcXOff2 - 1;
1747 22816292 : const T val = pChunkShifted[iX];
1748 22816292 : if (pabyChunkNodataMask
1749 22816292 : [iX + static_cast<size_t>(iY) *
1750 22816292 : nChunkXSize])
1751 : {
1752 17324495 : nCount++;
1753 17324495 : const double dfWeightX =
1754 17324495 : pasSrcX[iDstPixel].dfRightWeight;
1755 17324495 : dfTotalWeightLine += dfWeightX;
1756 : if constexpr (bQuadraticMean)
1757 503 : dfTotalLine +=
1758 503 : SQUARE(double(val)) * dfWeightX;
1759 : else
1760 17323992 : dfTotalLine +=
1761 17323992 : double(val) * dfWeightX;
1762 : }
1763 : }
1764 : }
1765 :
1766 38163300 : const double dfWeightY =
1767 : (iY == nSrcYOff) ? dfBottomWeight
1768 15347008 : : (iY + 1 == nSrcYOff2) ? dfTopWeight
1769 : : 1.0;
1770 22816292 : dfTotal += dfTotalLine * dfWeightY;
1771 22816292 : dfTotalWeight += dfTotalWeightLine * dfWeightY;
1772 : }
1773 :
1774 7469294 : if (nCount == 0 ||
1775 8 : (bPropagateNoData &&
1776 : nCount <
1777 8 : static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
1778 8 : (nSrcXOff2 - nSrcXOff)))
1779 : {
1780 2307682 : pDstScanline[iDstPixel] = tNoDataValue;
1781 2307682 : continue;
1782 : }
1783 : }
1784 : if constexpr (eWrkDataType == GDT_UInt8)
1785 : {
1786 : T nVal;
1787 : if constexpr (bQuadraticMean)
1788 38 : nVal = ComputeIntegerRMS<T, int>(dfTotal,
1789 : dfTotalWeight);
1790 : else
1791 6901260 : nVal =
1792 6901260 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1793 6901298 : if (bHasNoData && nVal == tNoDataValue)
1794 0 : nVal = tReplacementVal;
1795 6901298 : pDstScanline[iDstPixel] = nVal;
1796 : }
1797 : else if constexpr (eWrkDataType == GDT_UInt16)
1798 : {
1799 : T nVal;
1800 : if constexpr (bQuadraticMean)
1801 4 : nVal = ComputeIntegerRMS<T, uint64_t>(
1802 : dfTotal, dfTotalWeight);
1803 : else
1804 4 : nVal =
1805 4 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1806 8 : if (bHasNoData && nVal == tNoDataValue)
1807 0 : nVal = tReplacementVal;
1808 8 : pDstScanline[iDstPixel] = nVal;
1809 : }
1810 : else
1811 : {
1812 : T nVal;
1813 : if constexpr (bQuadraticMean)
1814 : {
1815 : if constexpr (bUseMulFactor)
1816 249 : nVal = static_cast<T>(
1817 132 : dfMulFactor *
1818 249 : sqrt(dfTotal / dfTotalWeight));
1819 : else
1820 : nVal = static_cast<T>(
1821 : sqrt(dfTotal / dfTotalWeight));
1822 : }
1823 : else
1824 : {
1825 : if constexpr (bUseMulFactor)
1826 6602 : nVal = static_cast<T>(
1827 6602 : dfMulFactor * (dfTotal / dfTotalWeight));
1828 : else
1829 : nVal = static_cast<T>(dfTotal / dfTotalWeight);
1830 : }
1831 6851 : if (bHasNoData && nVal == tNoDataValue)
1832 2 : nVal = tReplacementVal;
1833 6851 : pDstScanline[iDstPixel] = nVal;
1834 : }
1835 : }
1836 : }
1837 : }
1838 : else
1839 : {
1840 115 : nSrcYOff -= nChunkYOff;
1841 115 : nSrcYOff2 -= nChunkYOff;
1842 :
1843 6590 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1844 : {
1845 6475 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1846 6475 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1847 :
1848 6475 : uint64_t nTotalR = 0;
1849 6475 : uint64_t nTotalG = 0;
1850 6475 : uint64_t nTotalB = 0;
1851 6475 : size_t nCount = 0;
1852 :
1853 19425 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1854 : {
1855 38850 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
1856 : {
1857 25900 : const T val =
1858 25900 : pChunk[iX + static_cast<size_t>(iY) * nChunkXSize];
1859 : // cppcheck-suppress unsignedLessThanZero
1860 25900 : if (val < 0 || val >= colorEntries.size())
1861 0 : continue;
1862 25900 : const size_t idx = static_cast<size_t>(val);
1863 25900 : const auto &entry = colorEntries[idx];
1864 25900 : if (entry.c4)
1865 : {
1866 : if constexpr (bQuadraticMean)
1867 : {
1868 800 : nTotalR += SQUARE<int>(entry.c1);
1869 800 : nTotalG += SQUARE<int>(entry.c2);
1870 800 : nTotalB += SQUARE<int>(entry.c3);
1871 800 : ++nCount;
1872 : }
1873 : else
1874 : {
1875 13328 : nTotalR += entry.c1;
1876 13328 : nTotalG += entry.c2;
1877 13328 : nTotalB += entry.c3;
1878 13328 : ++nCount;
1879 : }
1880 : }
1881 : }
1882 : }
1883 :
1884 6475 : if (nCount == 0 ||
1885 0 : (bPropagateNoData &&
1886 0 : nCount < static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
1887 0 : (nSrcXOff2 - nSrcXOff)))
1888 : {
1889 2838 : pDstScanline[iDstPixel] = tNoDataValue;
1890 : }
1891 : else
1892 : {
1893 : GDALColorEntry color;
1894 : if constexpr (bQuadraticMean)
1895 : {
1896 200 : color.c1 =
1897 200 : static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
1898 200 : color.c2 =
1899 200 : static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
1900 200 : color.c3 =
1901 200 : static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
1902 : }
1903 : else
1904 : {
1905 3437 : color.c1 =
1906 3437 : static_cast<short>((nTotalR + nCount / 2) / nCount);
1907 3437 : color.c2 =
1908 3437 : static_cast<short>((nTotalG + nCount / 2) / nCount);
1909 3437 : color.c3 =
1910 3437 : static_cast<short>((nTotalB + nCount / 2) / nCount);
1911 : }
1912 3637 : pDstScanline[iDstPixel] =
1913 3637 : static_cast<T>(BestColorEntry(colorEntries, color));
1914 : }
1915 : }
1916 : }
1917 : }
1918 :
1919 7362 : CPLFree(pasSrcX);
1920 :
1921 7362 : return CE_None;
1922 : }
1923 :
1924 : template <bool bQuadraticMean>
1925 : static CPLErr
1926 7362 : GDALResampleChunk_AverageOrRMSInternal(const GDALOverviewResampleArgs &args,
1927 : const void *pChunk, void **ppDstBuffer,
1928 : GDALDataType *peDstBufferDataType)
1929 : {
1930 7362 : *peDstBufferDataType = args.eWrkDataType;
1931 7362 : switch (args.eWrkDataType)
1932 : {
1933 7217 : case GDT_UInt8:
1934 : {
1935 : return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_UInt8,
1936 7217 : bQuadraticMean>(
1937 7217 : args, static_cast<const GByte *>(pChunk), ppDstBuffer);
1938 : }
1939 :
1940 11 : case GDT_UInt16:
1941 : {
1942 : if constexpr (bQuadraticMean)
1943 : {
1944 : // Use double as accumulation type, because UInt32 could overflow
1945 : return GDALResampleChunk_AverageOrRMS_T<
1946 6 : GUInt16, double, GDT_UInt16, bQuadraticMean>(
1947 6 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1948 : }
1949 : else
1950 : {
1951 : return GDALResampleChunk_AverageOrRMS_T<
1952 5 : GUInt16, GUInt32, GDT_UInt16, bQuadraticMean>(
1953 5 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1954 : }
1955 : }
1956 :
1957 81 : case GDT_Float32:
1958 : {
1959 : return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32,
1960 81 : bQuadraticMean>(
1961 81 : args, static_cast<const float *>(pChunk), ppDstBuffer);
1962 : }
1963 :
1964 53 : case GDT_Float64:
1965 : {
1966 : return GDALResampleChunk_AverageOrRMS_T<double, double, GDT_Float64,
1967 53 : bQuadraticMean>(
1968 53 : args, static_cast<const double *>(pChunk), ppDstBuffer);
1969 : }
1970 :
1971 0 : default:
1972 0 : break;
1973 : }
1974 :
1975 0 : CPLAssert(false);
1976 : return CE_Failure;
1977 : }
1978 :
1979 : static CPLErr
1980 7362 : GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
1981 : const void *pChunk, void **ppDstBuffer,
1982 : GDALDataType *peDstBufferDataType)
1983 : {
1984 7362 : if (EQUAL(args.pszResampling, "RMS"))
1985 83 : return GDALResampleChunk_AverageOrRMSInternal<true>(
1986 83 : args, pChunk, ppDstBuffer, peDstBufferDataType);
1987 : else
1988 7279 : return GDALResampleChunk_AverageOrRMSInternal<false>(
1989 7279 : args, pChunk, ppDstBuffer, peDstBufferDataType);
1990 : }
1991 :
1992 : /************************************************************************/
1993 : /* GDALResampleChunk_Gauss() */
1994 : /************************************************************************/
1995 :
1996 86 : static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
1997 : const void *pChunk, void **ppDstBuffer,
1998 : GDALDataType *peDstBufferDataType)
1999 :
2000 : {
2001 86 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2002 86 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2003 86 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2004 86 : const int nChunkXOff = args.nChunkXOff;
2005 86 : const int nChunkXSize = args.nChunkXSize;
2006 86 : const int nChunkYOff = args.nChunkYOff;
2007 86 : const int nChunkYSize = args.nChunkYSize;
2008 86 : const int nDstXOff = args.nDstXOff;
2009 86 : const int nDstXOff2 = args.nDstXOff2;
2010 86 : const int nDstYOff = args.nDstYOff;
2011 86 : const int nDstYOff2 = args.nDstYOff2;
2012 86 : const bool bHasNoData = args.bHasNoData;
2013 86 : double dfNoDataValue = args.dfNoDataValue;
2014 86 : const GDALColorTable *poColorTable = args.poColorTable;
2015 :
2016 86 : const double *const padfChunk = static_cast<const double *>(pChunk);
2017 :
2018 86 : *ppDstBuffer =
2019 86 : VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
2020 : GDALGetDataTypeSizeBytes(GDT_Float64));
2021 86 : if (*ppDstBuffer == nullptr)
2022 : {
2023 0 : return CE_Failure;
2024 : }
2025 86 : *peDstBufferDataType = GDT_Float64;
2026 86 : double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
2027 :
2028 : /* -------------------------------------------------------------------- */
2029 : /* Create the filter kernel and allocate scanline buffer. */
2030 : /* -------------------------------------------------------------------- */
2031 86 : int nGaussMatrixDim = 3;
2032 : const int *panGaussMatrix;
2033 86 : constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
2034 86 : constexpr int anGaussMatrix5x5[] = {1, 4, 6, 4, 1, 4, 16, 24, 16,
2035 : 4, 6, 24, 36, 24, 6, 4, 16, 24,
2036 : 16, 4, 1, 4, 6, 4, 1};
2037 86 : constexpr int anGaussMatrix7x7[] = {
2038 : 1, 6, 15, 20, 15, 6, 1, 6, 36, 90, 120, 90, 36,
2039 : 6, 15, 90, 225, 300, 225, 90, 15, 20, 120, 300, 400, 300,
2040 : 120, 20, 15, 90, 225, 300, 225, 90, 15, 6, 36, 90, 120,
2041 : 90, 36, 6, 1, 6, 15, 20, 15, 6, 1};
2042 :
2043 86 : const int nOXSize = args.nOvrXSize;
2044 86 : const int nOYSize = args.nOvrYSize;
2045 86 : const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
2046 :
2047 : // matrix for gauss filter
2048 86 : if (nResYFactor <= 2)
2049 : {
2050 85 : panGaussMatrix = anGaussMatrix3x3;
2051 85 : nGaussMatrixDim = 3;
2052 : }
2053 1 : else if (nResYFactor <= 4)
2054 : {
2055 0 : panGaussMatrix = anGaussMatrix5x5;
2056 0 : nGaussMatrixDim = 5;
2057 : }
2058 : else
2059 : {
2060 1 : panGaussMatrix = anGaussMatrix7x7;
2061 1 : nGaussMatrixDim = 7;
2062 : }
2063 :
2064 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
2065 : int *panGaussMatrixDup = static_cast<int *>(
2066 : CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
2067 : memcpy(panGaussMatrixDup, panGaussMatrix,
2068 : sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
2069 : panGaussMatrix = panGaussMatrixDup;
2070 : #endif
2071 :
2072 86 : if (!bHasNoData)
2073 79 : dfNoDataValue = 0.0;
2074 :
2075 86 : std::vector<GDALColorEntry> colorEntries;
2076 86 : int nTransparentIdx = -1;
2077 86 : if (poColorTable)
2078 2 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
2079 :
2080 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
2081 : // it as nodata value.
2082 92 : if (bHasNoData && dfNoDataValue >= 0.0 &&
2083 6 : dfNoDataValue < colorEntries.size())
2084 0 : colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
2085 :
2086 : // Or if we have no explicit nodata, but a color table entry that is
2087 : // transparent, consider it as the nodata value.
2088 86 : else if (!bHasNoData && nTransparentIdx >= 0)
2089 : {
2090 0 : dfNoDataValue = nTransparentIdx;
2091 : }
2092 :
2093 86 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2094 86 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2095 86 : const int nDstXWidth = nDstXOff2 - nDstXOff;
2096 :
2097 : /* ==================================================================== */
2098 : /* Loop over destination scanlines. */
2099 : /* ==================================================================== */
2100 16488 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2101 : {
2102 16402 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
2103 16402 : int nSrcYOff2 =
2104 16402 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
2105 :
2106 16402 : if (nSrcYOff < nChunkYOff)
2107 : {
2108 0 : nSrcYOff = nChunkYOff;
2109 0 : nSrcYOff2++;
2110 : }
2111 :
2112 16402 : const int iSizeY = nSrcYOff2 - nSrcYOff;
2113 16402 : nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
2114 16402 : nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
2115 :
2116 16402 : if (nSrcYOff2 > nChunkBottomYOff ||
2117 16359 : (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
2118 : {
2119 44 : nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
2120 : }
2121 :
2122 16402 : int nYShiftGaussMatrix = 0;
2123 16402 : if (nSrcYOff < nChunkYOff)
2124 : {
2125 0 : nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
2126 0 : nSrcYOff = nChunkYOff;
2127 : }
2128 :
2129 16402 : const double *const padfSrcScanline =
2130 16402 : padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
2131 16402 : const GByte *pabySrcScanlineNodataMask = nullptr;
2132 16402 : if (pabyChunkNodataMask != nullptr)
2133 152 : pabySrcScanlineNodataMask =
2134 152 : pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
2135 :
2136 : /* --------------------------------------------------------------------
2137 : */
2138 : /* Loop over destination pixels */
2139 : /* --------------------------------------------------------------------
2140 : */
2141 16402 : double *const padfDstScanline =
2142 16402 : padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
2143 4149980 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2144 : {
2145 4133580 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
2146 4133580 : int nSrcXOff2 =
2147 4133580 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
2148 :
2149 4133580 : if (nSrcXOff < nChunkXOff)
2150 : {
2151 0 : nSrcXOff = nChunkXOff;
2152 0 : nSrcXOff2++;
2153 : }
2154 :
2155 4133580 : const int iSizeX = nSrcXOff2 - nSrcXOff;
2156 4133580 : nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
2157 4133580 : nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
2158 :
2159 4133580 : if (nSrcXOff2 > nChunkRightXOff ||
2160 4127930 : (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
2161 : {
2162 5650 : nSrcXOff2 =
2163 5650 : std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
2164 : }
2165 :
2166 4133580 : int nXShiftGaussMatrix = 0;
2167 4133580 : if (nSrcXOff < nChunkXOff)
2168 : {
2169 0 : nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
2170 0 : nSrcXOff = nChunkXOff;
2171 : }
2172 :
2173 4133580 : if (poColorTable == nullptr)
2174 : {
2175 4133380 : double dfTotal = 0.0;
2176 4133380 : GInt64 nCount = 0;
2177 4133380 : const int *panLineWeight =
2178 4133380 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
2179 : nXShiftGaussMatrix;
2180 :
2181 16527900 : for (int iY = nSrcYOff; iY < nSrcYOff2;
2182 12394500 : ++iY, panLineWeight += nGaussMatrixDim)
2183 : {
2184 49561300 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2185 : {
2186 37166800 : const double val =
2187 37166800 : padfSrcScanline[iX - nChunkXOff +
2188 37166800 : static_cast<GPtrDiff_t>(iY -
2189 37166800 : nSrcYOff) *
2190 37166800 : nChunkXSize];
2191 37166800 : if (pabySrcScanlineNodataMask == nullptr ||
2192 32872 : pabySrcScanlineNodataMask[iX - nChunkXOff +
2193 32872 : static_cast<GPtrDiff_t>(
2194 32872 : iY - nSrcYOff) *
2195 32872 : nChunkXSize])
2196 : {
2197 37146100 : const int nWeight = panLineWeight[i];
2198 37146100 : dfTotal += val * nWeight;
2199 37146100 : nCount += nWeight;
2200 : }
2201 : }
2202 : }
2203 :
2204 4133380 : if (nCount == 0)
2205 : {
2206 2217 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2207 : }
2208 : else
2209 : {
2210 4131160 : padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
2211 : }
2212 : }
2213 : else
2214 : {
2215 200 : GInt64 nTotalR = 0;
2216 200 : GInt64 nTotalG = 0;
2217 200 : GInt64 nTotalB = 0;
2218 200 : GInt64 nTotalWeight = 0;
2219 200 : const int *panLineWeight =
2220 200 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
2221 : nXShiftGaussMatrix;
2222 :
2223 780 : for (int iY = nSrcYOff; iY < nSrcYOff2;
2224 580 : ++iY, panLineWeight += nGaussMatrixDim)
2225 : {
2226 2262 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2227 : {
2228 1682 : const double val =
2229 1682 : padfSrcScanline[iX - nChunkXOff +
2230 1682 : static_cast<GPtrDiff_t>(iY -
2231 1682 : nSrcYOff) *
2232 1682 : nChunkXSize];
2233 1682 : if (val < 0 || val >= colorEntries.size())
2234 0 : continue;
2235 :
2236 1682 : size_t idx = static_cast<size_t>(val);
2237 1682 : if (colorEntries[idx].c4)
2238 : {
2239 1682 : const int nWeight = panLineWeight[i];
2240 1682 : nTotalR +=
2241 1682 : static_cast<GInt64>(colorEntries[idx].c1) *
2242 1682 : nWeight;
2243 1682 : nTotalG +=
2244 1682 : static_cast<GInt64>(colorEntries[idx].c2) *
2245 1682 : nWeight;
2246 1682 : nTotalB +=
2247 1682 : static_cast<GInt64>(colorEntries[idx].c3) *
2248 1682 : nWeight;
2249 1682 : nTotalWeight += nWeight;
2250 : }
2251 : }
2252 : }
2253 :
2254 200 : if (nTotalWeight == 0)
2255 : {
2256 0 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2257 : }
2258 : else
2259 : {
2260 : GDALColorEntry color;
2261 :
2262 200 : color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
2263 : nTotalWeight);
2264 200 : color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
2265 : nTotalWeight);
2266 200 : color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
2267 : nTotalWeight);
2268 200 : padfDstScanline[iDstPixel - nDstXOff] =
2269 200 : BestColorEntry(colorEntries, color);
2270 : }
2271 : }
2272 : }
2273 : }
2274 :
2275 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
2276 : CPLFree(panGaussMatrixDup);
2277 : #endif
2278 :
2279 86 : return CE_None;
2280 : }
2281 :
2282 : /************************************************************************/
2283 : /* GDALResampleChunk_Mode() */
2284 : /************************************************************************/
2285 :
/** Generic equality test used by mode resampling: plain operator==.
 *
 * Floating-point and complex types have dedicated specializations.
 */
template <class T> static inline bool IsSame(T a, T b)
{
    const bool bEqual = (a == b);
    return bEqual;
}
2290 :
2291 60 : template <> bool IsSame<GFloat16>(GFloat16 a, GFloat16 b)
2292 : {
2293 60 : return a == b || (CPLIsNan(a) && CPLIsNan(b));
2294 : }
2295 :
2296 5583 : template <> bool IsSame<float>(float a, float b)
2297 : {
2298 5583 : return a == b || (std::isnan(a) && std::isnan(b));
2299 : }
2300 :
2301 1701 : template <> bool IsSame<double>(double a, double b)
2302 : {
2303 1701 : return a == b || (std::isnan(a) && std::isnan(b));
2304 : }
2305 :
2306 : namespace
2307 : {
2308 : struct ComplexFloat16
2309 : {
2310 : GFloat16 r;
2311 : GFloat16 i;
2312 : };
2313 : } // namespace
2314 :
2315 60 : template <> bool IsSame<ComplexFloat16>(ComplexFloat16 a, ComplexFloat16 b)
2316 : {
2317 90 : return (a.r == b.r && a.i == b.i) ||
2318 90 : (CPLIsNan(a.r) && CPLIsNan(a.i) && CPLIsNan(b.r) && CPLIsNan(b.i));
2319 : }
2320 :
2321 : template <>
2322 60 : bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
2323 : {
2324 120 : return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
2325 120 : std::isnan(b.real()) && std::isnan(b.imag()));
2326 : }
2327 :
2328 : template <>
2329 60 : bool IsSame<std::complex<double>>(std::complex<double> a,
2330 : std::complex<double> b)
2331 : {
2332 120 : return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
2333 120 : std::isnan(b.real()) && std::isnan(b.imag()));
2334 : }
2335 :
2336 : template <class T>
2337 182 : static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
2338 : const T *pChunk, T *const pDstBuffer)
2339 :
2340 : {
2341 182 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2342 182 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2343 182 : const double dfSrcXDelta = args.dfSrcXDelta;
2344 182 : const double dfSrcYDelta = args.dfSrcYDelta;
2345 182 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2346 182 : const int nChunkXOff = args.nChunkXOff;
2347 182 : const int nChunkXSize = args.nChunkXSize;
2348 182 : const int nChunkYOff = args.nChunkYOff;
2349 182 : const int nChunkYSize = args.nChunkYSize;
2350 182 : const int nDstXOff = args.nDstXOff;
2351 182 : const int nDstXOff2 = args.nDstXOff2;
2352 182 : const int nDstYOff = args.nDstYOff;
2353 182 : const int nDstYOff2 = args.nDstYOff2;
2354 182 : const bool bHasNoData = args.bHasNoData;
2355 182 : const GDALColorTable *poColorTable = args.poColorTable;
2356 182 : const int nDstXSize = nDstXOff2 - nDstXOff;
2357 :
2358 8 : T tNoDataValue;
2359 : if constexpr (std::is_same<T, ComplexFloat16>::value)
2360 : {
2361 4 : tNoDataValue.r = cpl::NumericLimits<GFloat16>::quiet_NaN();
2362 4 : tNoDataValue.i = cpl::NumericLimits<GFloat16>::quiet_NaN();
2363 : }
2364 : else if constexpr (std::is_same<T, std::complex<float>>::value ||
2365 : std::is_same<T, std::complex<double>>::value)
2366 : {
2367 : using BaseT = typename T::value_type;
2368 8 : tNoDataValue =
2369 : std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
2370 : std::numeric_limits<BaseT>::quiet_NaN());
2371 : }
2372 170 : else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
2373 169 : tNoDataValue = 0;
2374 : else
2375 1 : tNoDataValue = static_cast<T>(args.dfNoDataValue);
2376 :
2377 : using CountType = uint32_t;
2378 182 : CountType nMaxNumPx = 0;
2379 182 : T *paVals = nullptr;
2380 182 : CountType *panCounts = nullptr;
2381 :
2382 182 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2383 182 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2384 364 : std::vector<int> anVals(256, 0);
2385 :
2386 : /* ==================================================================== */
2387 : /* Loop over destination scanlines. */
2388 : /* ==================================================================== */
2389 7713 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2390 : {
2391 7531 : const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
2392 7531 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
2393 : #ifdef only_pixels_with_more_than_10_pct_participation
2394 : // When oversampling, don't take into account pixels that have a tiny
2395 : // participation in the resulting pixel
2396 : if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2397 : nSrcYOff < nChunkBottomYOff)
2398 : nSrcYOff++;
2399 : #endif
2400 7531 : if (nSrcYOff < nChunkYOff)
2401 0 : nSrcYOff = nChunkYOff;
2402 :
2403 7531 : const double dfSrcYOff2 =
2404 7531 : dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
2405 7531 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
2406 : #ifdef only_pixels_with_more_than_10_pct_participation
2407 : // When oversampling, don't take into account pixels that have a tiny
2408 : // participation in the resulting pixel
2409 : if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2410 : nSrcYOff2 > nChunkYOff)
2411 : nSrcYOff2--;
2412 : #endif
2413 7531 : if (nSrcYOff2 == nSrcYOff)
2414 0 : ++nSrcYOff2;
2415 7531 : if (nSrcYOff2 > nChunkBottomYOff)
2416 0 : nSrcYOff2 = nChunkBottomYOff;
2417 :
2418 7531 : const T *const paSrcScanline =
2419 281 : pChunk +
2420 7531 : (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
2421 7531 : const GByte *pabySrcScanlineNodataMask = nullptr;
2422 7531 : if (pabyChunkNodataMask != nullptr)
2423 1838 : pabySrcScanlineNodataMask =
2424 : pabyChunkNodataMask +
2425 1838 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
2426 :
2427 7531 : T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
2428 : /* --------------------------------------------------------------------
2429 : */
2430 : /* Loop over destination pixels */
2431 : /* --------------------------------------------------------------------
2432 : */
2433 4260596 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2434 : {
2435 4253061 : const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
2436 : // Apply some epsilon to avoid numerical precision issues
2437 4253061 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
2438 : #ifdef only_pixels_with_more_than_10_pct_participation
2439 : // When oversampling, don't take into account pixels that have a
2440 : // tiny participation in the resulting pixel
2441 : if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2442 : nSrcXOff < nChunkRightXOff)
2443 : nSrcXOff++;
2444 : #endif
2445 4253061 : if (nSrcXOff < nChunkXOff)
2446 0 : nSrcXOff = nChunkXOff;
2447 :
2448 4253061 : const double dfSrcXOff2 =
2449 4253061 : dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
2450 4253061 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
2451 : #ifdef only_pixels_with_more_than_10_pct_participation
2452 : // When oversampling, don't take into account pixels that have a
2453 : // tiny participation in the resulting pixel
2454 : if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2455 : nSrcXOff2 > nChunkXOff)
2456 : nSrcXOff2--;
2457 : #endif
2458 4253061 : if (nSrcXOff2 == nSrcXOff)
2459 0 : nSrcXOff2++;
2460 4253061 : if (nSrcXOff2 > nChunkRightXOff)
2461 0 : nSrcXOff2 = nChunkRightXOff;
2462 :
2463 4253061 : bool bRegularProcessing = false;
2464 : if constexpr (!std::is_same<T, GByte>::value)
2465 1671 : bRegularProcessing = true;
2466 4251390 : else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
2467 0 : bRegularProcessing = true;
2468 :
2469 4253061 : if (bRegularProcessing)
2470 : {
2471 : // Sanity check to make sure the allocation of paVals and
2472 : // panCounts don't overflow.
2473 : static_assert(sizeof(CountType) <= sizeof(size_t));
2474 3342 : if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
2475 1671 : static_cast<CountType>(nSrcYOff2 - nSrcYOff) >
2476 1671 : (std::numeric_limits<CountType>::max() /
2477 3342 : std::max(sizeof(T), sizeof(CountType))) /
2478 1671 : static_cast<CountType>(nSrcXOff2 - nSrcXOff))
2479 : {
2480 0 : CPLError(CE_Failure, CPLE_NotSupported,
2481 : "Too big downsampling factor");
2482 0 : CPLFree(paVals);
2483 0 : CPLFree(panCounts);
2484 0 : return CE_Failure;
2485 : }
2486 1671 : const CountType nNumPx =
2487 1671 : static_cast<CountType>(nSrcYOff2 - nSrcYOff) *
2488 1671 : (nSrcXOff2 - nSrcXOff);
2489 1671 : CountType iMaxInd = 0;
2490 1671 : CountType iMaxVal = 0;
2491 :
2492 1671 : if (paVals == nullptr || nNumPx > nMaxNumPx)
2493 : {
2494 : T *paValsNew = static_cast<T *>(
2495 116 : VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
2496 : CountType *panCountsNew =
2497 116 : static_cast<CountType *>(VSI_REALLOC_VERBOSE(
2498 : panCounts, nNumPx * sizeof(CountType)));
2499 116 : if (paValsNew != nullptr)
2500 116 : paVals = paValsNew;
2501 116 : if (panCountsNew != nullptr)
2502 116 : panCounts = panCountsNew;
2503 116 : if (paValsNew == nullptr || panCountsNew == nullptr)
2504 : {
2505 0 : CPLFree(paVals);
2506 0 : CPLFree(panCounts);
2507 0 : return CE_Failure;
2508 : }
2509 116 : nMaxNumPx = nNumPx;
2510 : }
2511 :
2512 5245 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2513 : {
2514 3574 : const GPtrDiff_t iTotYOff =
2515 3574 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2516 3574 : nChunkXOff;
2517 11842 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2518 : {
2519 8268 : if (pabySrcScanlineNodataMask == nullptr ||
2520 1552 : pabySrcScanlineNodataMask[iX + iTotYOff])
2521 : {
2522 8247 : const T val = paSrcScanline[iX + iTotYOff];
2523 8247 : CountType i = 0; // Used after for.
2524 :
2525 : // Check array for existing entry.
2526 11611 : for (; i < iMaxInd; ++i)
2527 : {
2528 8212 : if (IsSame(paVals[i], val))
2529 : {
2530 4848 : if (++panCounts[i] > panCounts[iMaxVal])
2531 : {
2532 246 : iMaxVal = i;
2533 : }
2534 4848 : break;
2535 : }
2536 : }
2537 :
2538 : // Add to arr if entry not already there.
2539 8247 : if (i == iMaxInd)
2540 : {
2541 3399 : paVals[iMaxInd] = val;
2542 3399 : panCounts[iMaxInd] = 1;
2543 :
2544 3399 : if (iMaxInd == 0)
2545 : {
2546 1668 : iMaxVal = iMaxInd;
2547 : }
2548 :
2549 3399 : ++iMaxInd;
2550 : }
2551 : }
2552 : }
2553 : }
2554 :
2555 1671 : if (iMaxInd == 0)
2556 3 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2557 : else
2558 1668 : paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
2559 : }
2560 : else if constexpr (std::is_same<T, GByte>::value)
2561 : // ( eSrcDataType == GDT_UInt8 && nEntryCount < 256 )
2562 : {
2563 : // So we go here for a paletted or non-paletted byte band.
2564 : // The input values are then between 0 and 255.
2565 4251390 : int nMaxVal = 0;
2566 4251390 : int iMaxInd = -1;
2567 :
2568 : // The cost of this zeroing might be high. Perhaps we should
2569 : // just use the above generic case, and go to this one if the
2570 : // number of source pixels is large enough
2571 4251390 : std::fill(anVals.begin(), anVals.end(), 0);
2572 :
2573 12777800 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2574 : {
2575 8526440 : const GPtrDiff_t iTotYOff =
2576 8526440 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2577 8526440 : nChunkXOff;
2578 25649600 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2579 : {
2580 17123100 : const T val = paSrcScanline[iX + iTotYOff];
2581 17123100 : if (!bHasNoData || val != tNoDataValue)
2582 : {
2583 17123100 : int nVal = static_cast<int>(val);
2584 17123100 : if (++anVals[nVal] > nMaxVal)
2585 : {
2586 : // Sum the density.
2587 : // Is it the most common value so far?
2588 17006400 : iMaxInd = nVal;
2589 17006400 : nMaxVal = anVals[nVal];
2590 : }
2591 : }
2592 : }
2593 : }
2594 :
2595 4251390 : if (iMaxInd == -1)
2596 0 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2597 : else
2598 4251390 : paDstScanline[iDstPixel - nDstXOff] =
2599 : static_cast<T>(iMaxInd);
2600 : }
2601 : }
2602 : }
2603 :
2604 182 : CPLFree(paVals);
2605 182 : CPLFree(panCounts);
2606 :
2607 182 : return CE_None;
2608 : }
2609 :
2610 182 : static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
2611 : const void *pChunk, void **ppDstBuffer,
2612 : GDALDataType *peDstBufferDataType)
2613 : {
2614 182 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
2615 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
2616 : GDALGetDataTypeSizeBytes(args.eWrkDataType));
2617 182 : if (*ppDstBuffer == nullptr)
2618 : {
2619 0 : return CE_Failure;
2620 : }
2621 :
2622 182 : CPLAssert(args.eSrcDataType == args.eWrkDataType);
2623 :
2624 182 : *peDstBufferDataType = args.eWrkDataType;
2625 182 : switch (args.eWrkDataType)
2626 : {
2627 : // For mode resampling, as no computation is done, only the
2628 : // size of the data type matters... except for Byte where we have
2629 : // special processing. And for floating point values
2630 66 : case GDT_UInt8:
2631 : {
2632 66 : return GDALResampleChunk_ModeT(args,
2633 : static_cast<const GByte *>(pChunk),
2634 66 : static_cast<GByte *>(*ppDstBuffer));
2635 : }
2636 :
2637 4 : case GDT_Int8:
2638 : {
2639 4 : return GDALResampleChunk_ModeT(args,
2640 : static_cast<const int8_t *>(pChunk),
2641 4 : static_cast<int8_t *>(*ppDstBuffer));
2642 : }
2643 :
2644 10 : case GDT_Int16:
2645 : case GDT_UInt16:
2646 : {
2647 10 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
2648 10 : return GDALResampleChunk_ModeT(
2649 : args, static_cast<const uint16_t *>(pChunk),
2650 10 : static_cast<uint16_t *>(*ppDstBuffer));
2651 : }
2652 :
2653 15 : case GDT_CInt16:
2654 : case GDT_Int32:
2655 : case GDT_UInt32:
2656 : {
2657 15 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
2658 15 : return GDALResampleChunk_ModeT(
2659 : args, static_cast<const uint32_t *>(pChunk),
2660 15 : static_cast<uint32_t *>(*ppDstBuffer));
2661 : }
2662 :
2663 12 : case GDT_CInt32:
2664 : case GDT_Int64:
2665 : case GDT_UInt64:
2666 : {
2667 12 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
2668 12 : return GDALResampleChunk_ModeT(
2669 : args, static_cast<const uint64_t *>(pChunk),
2670 12 : static_cast<uint64_t *>(*ppDstBuffer));
2671 : }
2672 :
2673 4 : case GDT_Float16:
2674 : {
2675 4 : return GDALResampleChunk_ModeT(
2676 : args, static_cast<const GFloat16 *>(pChunk),
2677 4 : static_cast<GFloat16 *>(*ppDstBuffer));
2678 : }
2679 :
2680 35 : case GDT_Float32:
2681 : {
2682 35 : return GDALResampleChunk_ModeT(args,
2683 : static_cast<const float *>(pChunk),
2684 35 : static_cast<float *>(*ppDstBuffer));
2685 : }
2686 :
2687 24 : case GDT_Float64:
2688 : {
2689 24 : return GDALResampleChunk_ModeT(args,
2690 : static_cast<const double *>(pChunk),
2691 24 : static_cast<double *>(*ppDstBuffer));
2692 : }
2693 :
2694 4 : case GDT_CFloat16:
2695 : {
2696 4 : return GDALResampleChunk_ModeT(
2697 : args, static_cast<const ComplexFloat16 *>(pChunk),
2698 4 : static_cast<ComplexFloat16 *>(*ppDstBuffer));
2699 : }
2700 :
2701 4 : case GDT_CFloat32:
2702 : {
2703 4 : return GDALResampleChunk_ModeT(
2704 : args, static_cast<const std::complex<float> *>(pChunk),
2705 4 : static_cast<std::complex<float> *>(*ppDstBuffer));
2706 : }
2707 :
2708 4 : case GDT_CFloat64:
2709 : {
2710 4 : return GDALResampleChunk_ModeT(
2711 : args, static_cast<const std::complex<double> *>(pChunk),
2712 4 : static_cast<std::complex<double> *>(*ppDstBuffer));
2713 : }
2714 :
2715 0 : case GDT_Unknown:
2716 : case GDT_TypeCount:
2717 0 : break;
2718 : }
2719 :
2720 0 : CPLAssert(false);
2721 : return CE_Failure;
2722 : }
2723 :
2724 : /************************************************************************/
2725 : /* GDALResampleConvolutionHorizontal() */
2726 : /************************************************************************/
2727 :
// Weighted sum of one row: returns sum(pChunk[k] * padfWeights[k]) for
// k in [0, nSrcPixelCount), with every sample converted to double first.
//
// The main loop handles four samples per iteration and feeds two separate
// partial sums (samples 0-1 and samples 2-3 of each group). Left-over
// samples go into the first partial sum, and the two are added at the end.
template <class T>
static inline double
GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                  int nSrcPixelCount)
{
    double dfSumA = 0.0;
    double dfSumB = 0.0;
    int iPix = 0;  // Shared by both loops.
    // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    // manually (untypical) unrolled loop in -O2 and -O3:
    // https://github.com/OSGeo/gdal/issues/9508
#if !defined(__INTEL_CLANG_COMPILER)
    const int nUnrolledEnd = nSrcPixelCount - 3;
    while (iPix < nUnrolledEnd)
    {
        dfSumA += double(pChunk[iPix]) * padfWeights[iPix];
        dfSumA += double(pChunk[iPix + 1]) * padfWeights[iPix + 1];
        dfSumB += double(pChunk[iPix + 2]) * padfWeights[iPix + 2];
        dfSumB += double(pChunk[iPix + 3]) * padfWeights[iPix + 3];
        iPix += 4;
    }
#endif
    // Remaining samples (or all of them when the unrolled loop is disabled).
    while (iPix < nSrcPixelCount)
    {
        dfSumA += double(pChunk[iPix]) * padfWeights[iPix];
        ++iPix;
    }
    return dfSumA + dfSumB;
}
2754 :
2755 : template <class T, bool bHasNaN>
2756 46368 : static inline void GDALResampleConvolutionHorizontalWithMask(
2757 : const T *pChunk, const GByte *pabyMask, const double *padfWeights,
2758 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2759 : {
2760 46368 : dfVal = 0;
2761 46368 : dfWeightSum = 0;
2762 46368 : int i = 0;
2763 103804 : for (; i < nSrcPixelCount - 3; i += 4)
2764 : {
2765 57436 : double dfWeight0 = padfWeights[i + 0] * pabyMask[i + 0];
2766 57436 : double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
2767 57436 : double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
2768 57436 : double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
2769 :
2770 229744 : const auto MulNaNAware = [](double v, double &w, double &val)
2771 : {
2772 : if constexpr (bHasNaN)
2773 : {
2774 14848 : if (std::isnan(v))
2775 : {
2776 76 : w = 0;
2777 76 : return;
2778 : }
2779 : }
2780 14772 : val += v * w;
2781 : };
2782 :
2783 57436 : MulNaNAware(double(pChunk[i + 0]), dfWeight0, dfVal);
2784 57436 : MulNaNAware(double(pChunk[i + 1]), dfWeight1, dfVal);
2785 57436 : MulNaNAware(double(pChunk[i + 2]), dfWeight2, dfVal);
2786 57436 : MulNaNAware(double(pChunk[i + 3]), dfWeight3, dfVal);
2787 57436 : dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
2788 : }
2789 64874 : for (; i < nSrcPixelCount; ++i)
2790 : {
2791 18506 : const double dfWeight = padfWeights[i] * pabyMask[i];
2792 : if constexpr (bHasNaN)
2793 : {
2794 1920 : if (!std::isnan(pChunk[i]))
2795 : {
2796 1920 : dfVal += double(pChunk[i]) * dfWeight;
2797 1920 : dfWeightSum += dfWeight;
2798 : }
2799 : }
2800 : else
2801 : {
2802 16586 : dfVal += double(pChunk[i]) * dfWeight;
2803 16586 : dfWeightSum += dfWeight;
2804 : }
2805 : }
2806 46368 : }
2807 :
// Weighted sums of three rows that share one set of weights.
//
// dfRes1, dfRes2 and dfRes3 receive sum(row[k] * padfWeights[k]) over
// k in [0, nSrcPixelCount) for pChunkRow1, pChunkRow2 and pChunkRow3
// respectively. When bHasNaN is true, a NaN sample contributes 0.0.
//
// Each row keeps two partial sums in the 4-wide main loop (samples 0-1 and
// samples 2-3 of each group). Left-over samples are added to the first one.
template <class T, bool bHasNaN>
static inline void GDALResampleConvolutionHorizontal_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    // Product of a sample and its weight, or 0.0 for a NaN sample in the
    // NaN-aware variant.
    const auto Term = [](double dfSample, double dfWeight)
    {
        if constexpr (bHasNaN)
        {
            if (std::isnan(dfSample))
                return 0.0;
        }
        return dfSample * dfWeight;
    };

    double dfRow1Lo = 0.0;
    double dfRow1Hi = 0.0;
    double dfRow2Lo = 0.0;
    double dfRow2Hi = 0.0;
    double dfRow3Lo = 0.0;
    double dfRow3Hi = 0.0;

    int iPix = 0;  // Shared by both loops.
    const int nUnrolledEnd = nSrcPixelCount - 3;
    for (; iPix < nUnrolledEnd; iPix += 4)
    {
        const double dfW0 = padfWeights[iPix + 0];
        const double dfW1 = padfWeights[iPix + 1];
        const double dfW2 = padfWeights[iPix + 2];
        const double dfW3 = padfWeights[iPix + 3];

        dfRow1Lo += Term(double(pChunkRow1[iPix + 0]), dfW0);
        dfRow1Lo += Term(double(pChunkRow1[iPix + 1]), dfW1);
        dfRow1Hi += Term(double(pChunkRow1[iPix + 2]), dfW2);
        dfRow1Hi += Term(double(pChunkRow1[iPix + 3]), dfW3);

        dfRow2Lo += Term(double(pChunkRow2[iPix + 0]), dfW0);
        dfRow2Lo += Term(double(pChunkRow2[iPix + 1]), dfW1);
        dfRow2Hi += Term(double(pChunkRow2[iPix + 2]), dfW2);
        dfRow2Hi += Term(double(pChunkRow2[iPix + 3]), dfW3);

        dfRow3Lo += Term(double(pChunkRow3[iPix + 0]), dfW0);
        dfRow3Lo += Term(double(pChunkRow3[iPix + 1]), dfW1);
        dfRow3Hi += Term(double(pChunkRow3[iPix + 2]), dfW2);
        dfRow3Hi += Term(double(pChunkRow3[iPix + 3]), dfW3);
    }

    // Left-over samples.
    for (; iPix < nSrcPixelCount; ++iPix)
    {
        const double dfW = padfWeights[iPix];
        dfRow1Lo += Term(double(pChunkRow1[iPix]), dfW);
        dfRow2Lo += Term(double(pChunkRow2[iPix]), dfW);
        dfRow3Lo += Term(double(pChunkRow3[iPix]), dfW);
    }

    dfRes1 = dfRow1Lo + dfRow1Hi;
    dfRes2 = dfRow2Lo + dfRow2Hi;
    dfRes3 = dfRow3Lo + dfRow3Hi;
}
2857 :
2858 : template <class T, bool bHasNaN>
2859 18980 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
2860 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2861 : const double *padfWeights, int nSrcPixelCount, double &dfRes1,
2862 : double &dfRes2, double &dfRes3)
2863 : {
2864 18980 : GDALResampleConvolutionHorizontal_3rows<T, bHasNaN>(
2865 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeights, nSrcPixelCount, dfRes1,
2866 : dfRes2, dfRes3);
2867 18980 : }
2868 :
2869 : template <class T, bool bHasNaN>
2870 1256690 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
2871 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2872 : const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
2873 : {
2874 1256690 : GDALResampleConvolutionHorizontal_3rows<T, bHasNaN>(
2875 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeights, 4, dfRes1, dfRes2,
2876 : dfRes3);
2877 1256690 : }
2878 :
2879 : /************************************************************************/
2880 : /* GDALResampleConvolutionVertical() */
2881 : /************************************************************************/
2882 :
// Weighted sum down one column: returns
// sum(pChunk[k * nStride] * padfWeights[k]) for k in [0, nSrcLineCount).
//
// The main loop consumes four lines per iteration and feeds two partial sums
// (lines 0-1 and lines 2-3 of each group). Left-over lines go into the
// first partial sum.
template <class T>
static inline double
GDALResampleConvolutionVertical(const T *pChunk, size_t nStride,
                                const double *padfWeights, int nSrcLineCount)
{
    double dfSumA = 0.0;
    double dfSumB = 0.0;
    int iLine = 0;
    size_t nOffset = 0;  // Always iLine * nStride.

    const int nUnrolledEnd = nSrcLineCount - 3;
    while (iLine < nUnrolledEnd)
    {
        dfSumA += pChunk[nOffset + 0 * nStride] * padfWeights[iLine + 0];
        dfSumA += pChunk[nOffset + 1 * nStride] * padfWeights[iLine + 1];
        dfSumB += pChunk[nOffset + 2 * nStride] * padfWeights[iLine + 2];
        dfSumB += pChunk[nOffset + 3 * nStride] * padfWeights[iLine + 3];
        iLine += 4;
        nOffset += 4 * nStride;
    }
    while (iLine < nSrcLineCount)
    {
        dfSumA += pChunk[nOffset] * padfWeights[iLine];
        ++iLine;
        nOffset += nStride;
    }
    return dfSumA + dfSumB;
}
2905 :
// Weighted sums down two adjacent columns in one pass.
//
// dfRes1 receives sum(pChunk[k * nStride] * padfWeights[k]) and dfRes2
// receives sum(pChunk[k * nStride + 1] * padfWeights[k]), both for
// k in [0, nSrcLineCount). Each column uses two partial sums in the 4-line
// main loop. Left-over lines go into the first partial sum of each column.
template <class T>
static inline void GDALResampleConvolutionVertical_2cols(
    const T *pChunk, size_t nStride, const double *padfWeights,
    int nSrcLineCount, double &dfRes1, double &dfRes2)
{
    double dfCol0A = 0.0;
    double dfCol0B = 0.0;
    double dfCol1A = 0.0;
    double dfCol1B = 0.0;

    int iLine = 0;
    size_t nOffset = 0;  // Always iLine * nStride.
    const int nUnrolledEnd = nSrcLineCount - 3;
    for (; iLine < nUnrolledEnd; iLine += 4, nOffset += 4 * nStride)
    {
        const double dfW0 = padfWeights[iLine + 0];
        const double dfW1 = padfWeights[iLine + 1];
        const double dfW2 = padfWeights[iLine + 2];
        const double dfW3 = padfWeights[iLine + 3];

        // First column.
        dfCol0A += pChunk[nOffset + 0 + 0 * nStride] * dfW0;
        dfCol0A += pChunk[nOffset + 0 + 1 * nStride] * dfW1;
        dfCol0B += pChunk[nOffset + 0 + 2 * nStride] * dfW2;
        dfCol0B += pChunk[nOffset + 0 + 3 * nStride] * dfW3;

        // Second column.
        dfCol1A += pChunk[nOffset + 1 + 0 * nStride] * dfW0;
        dfCol1A += pChunk[nOffset + 1 + 1 * nStride] * dfW1;
        dfCol1B += pChunk[nOffset + 1 + 2 * nStride] * dfW2;
        dfCol1B += pChunk[nOffset + 1 + 3 * nStride] * dfW3;
    }
    for (; iLine < nSrcLineCount; ++iLine, nOffset += nStride)
    {
        const double dfW = padfWeights[iLine];
        dfCol0A += pChunk[nOffset + 0] * dfW;
        dfCol1A += pChunk[nOffset + 1] * dfW;
    }

    dfRes1 = dfCol0A + dfCol0B;
    dfRes2 = dfCol1A + dfCol1B;
}
2936 :
2937 : #ifdef USE_SSE2
2938 :
2939 : #ifdef __AVX__
2940 : /************************************************************************/
2941 : /* GDALResampleConvolutionVertical_16cols<T> */
2942 : /************************************************************************/
2943 :
2944 : template <class T>
2945 : static inline void
2946 : GDALResampleConvolutionVertical_16cols(const T *pChunk, size_t nStride,
2947 : const double *padfWeights,
2948 : int nSrcLineCount, float *afDest)
2949 : {
2950 : int i = 0;
2951 : size_t j = 0;
2952 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2953 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2954 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2955 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2956 : for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
2957 : {
2958 : XMMReg4Double w0 =
2959 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2960 : XMMReg4Double w1 =
2961 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2962 : XMMReg4Double w2 =
2963 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2964 : XMMReg4Double w3 =
2965 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2966 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2967 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2968 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
2969 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
2970 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2971 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2972 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
2973 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
2974 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2975 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2976 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
2977 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
2978 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2979 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2980 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
2981 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
2982 : }
2983 : for (; i < nSrcLineCount; ++i, j += nStride)
2984 : {
2985 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2986 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2987 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2988 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
2989 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
2990 : }
2991 : v_acc0.Store4Val(afDest);
2992 : v_acc1.Store4Val(afDest + 4);
2993 : v_acc2.Store4Val(afDest + 8);
2994 : v_acc3.Store4Val(afDest + 12);
2995 : }
2996 :
2997 : template <class T>
2998 : static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
2999 : const double *, int,
3000 : double *)
3001 : {
3002 : // Cannot be reached
3003 : CPLAssert(false);
3004 : }
3005 :
3006 : #else
3007 :
3008 : /************************************************************************/
3009 : /* GDALResampleConvolutionVertical_8cols<T> */
3010 : /************************************************************************/
3011 :
3012 : template <class T>
3013 : static inline void
3014 25804000 : GDALResampleConvolutionVertical_8cols(const T *pChunk, size_t nStride,
3015 : const double *padfWeights,
3016 : int nSrcLineCount, float *afDest)
3017 : {
3018 25804000 : int i = 0;
3019 25804000 : size_t j = 0;
3020 25804000 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
3021 25804000 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3022 53883400 : for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
3023 : {
3024 28079400 : XMMReg4Double w0 =
3025 28079400 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
3026 28079400 : XMMReg4Double w1 =
3027 28079400 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
3028 28079400 : XMMReg4Double w2 =
3029 28079400 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
3030 28079400 : XMMReg4Double w3 =
3031 28079400 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
3032 28079400 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
3033 28079400 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
3034 28079400 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
3035 28079400 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
3036 28079400 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
3037 28079400 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
3038 28079400 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
3039 28079400 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
3040 : }
3041 37376100 : for (; i < nSrcLineCount; ++i, j += nStride)
3042 : {
3043 11572100 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
3044 11572100 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
3045 11572100 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
3046 : }
3047 25804000 : v_acc0.Store4Val(afDest);
3048 25804000 : v_acc1.Store4Val(afDest + 4);
3049 25804000 : }
3050 :
3051 : template <class T>
3052 : static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
3053 : const double *, int,
3054 : double *)
3055 : {
3056 : // Cannot be reached
3057 : CPLAssert(false);
3058 : }
3059 :
3060 : #endif // __AVX__
3061 :
3062 : /************************************************************************/
3063 : /* GDALResampleConvolutionHorizontalSSE2<T> */
3064 : /************************************************************************/
3065 :
3066 : template <class T>
3067 3375702 : static inline double GDALResampleConvolutionHorizontalSSE2(
3068 : const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3069 : {
3070 3375702 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3071 3375702 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3072 3375702 : int i = 0; // Used after for.
3073 3754648 : for (; i < nSrcPixelCount - 7; i += 8)
3074 : {
3075 : // Retrieve the pixel & accumulate
3076 378952 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
3077 378952 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
3078 378952 : const XMMReg4Double v_weight1 =
3079 378952 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3080 378952 : const XMMReg4Double v_weight2 =
3081 378952 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
3082 :
3083 378952 : v_acc1 += v_pixels1 * v_weight1;
3084 378952 : v_acc2 += v_pixels2 * v_weight2;
3085 : }
3086 :
3087 3375702 : v_acc1 += v_acc2;
3088 :
3089 3375702 : double dfVal = v_acc1.GetHorizSum();
3090 11491480 : for (; i < nSrcPixelCount; ++i)
3091 : {
3092 8115780 : dfVal += pChunk[i] * padfWeightsAligned[i];
3093 : }
3094 3375702 : return dfVal;
3095 : }
3096 :
3097 : /************************************************************************/
3098 : /* GDALResampleConvolutionHorizontal<GByte> */
3099 : /************************************************************************/
3100 :
3101 : template <>
3102 2826540 : inline double GDALResampleConvolutionHorizontal<GByte>(
3103 : const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3104 : {
3105 2826540 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
3106 2826540 : nSrcPixelCount);
3107 : }
3108 :
3109 : template <>
3110 549162 : inline double GDALResampleConvolutionHorizontal<GUInt16>(
3111 : const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3112 : {
3113 549162 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
3114 549162 : nSrcPixelCount);
3115 : }
3116 :
3117 : /************************************************************************/
3118 : /* GDALResampleConvolutionHorizontalWithMaskSSE2<T> */
3119 : /************************************************************************/
3120 :
3121 : template <class T>
3122 10627663 : static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
3123 : const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
3124 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
3125 : {
3126 10627663 : int i = 0; // Used after for.
3127 10627663 : XMMReg4Double v_acc = XMMReg4Double::Zero();
3128 10627663 : XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
3129 26196921 : for (; i < nSrcPixelCount - 3; i += 4)
3130 : {
3131 15569258 : const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
3132 15569258 : const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
3133 15569258 : XMMReg4Double v_weight =
3134 15569258 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3135 15569258 : v_weight *= v_mask;
3136 15569258 : v_acc += v_pixels * v_weight;
3137 15569258 : v_acc_weight += v_weight;
3138 : }
3139 :
3140 10627663 : dfVal = v_acc.GetHorizSum();
3141 10627663 : dfWeightSum = v_acc_weight.GetHorizSum();
3142 10912663 : for (; i < nSrcPixelCount; ++i)
3143 : {
3144 284972 : const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
3145 284972 : dfVal += pChunk[i] * dfWeight;
3146 284972 : dfWeightSum += dfWeight;
3147 : }
3148 10627663 : }
3149 :
3150 : /************************************************************************/
3151 : /* GDALResampleConvolutionHorizontalWithMask<GByte> */
3152 : /************************************************************************/
3153 :
3154 : template <>
3155 10627600 : inline void GDALResampleConvolutionHorizontalWithMask<GByte, false>(
3156 : const GByte *pChunk, const GByte *pabyMask,
3157 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
3158 : double &dfWeightSum)
3159 : {
3160 10627600 : GDALResampleConvolutionHorizontalWithMaskSSE2(
3161 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
3162 : dfWeightSum);
3163 10627600 : }
3164 :
3165 : template <>
3166 63 : inline void GDALResampleConvolutionHorizontalWithMask<GUInt16, false>(
3167 : const GUInt16 *pChunk, const GByte *pabyMask,
3168 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
3169 : double &dfWeightSum)
3170 : {
3171 63 : GDALResampleConvolutionHorizontalWithMaskSSE2(
3172 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
3173 : dfWeightSum);
3174 63 : }
3175 :
3176 : /************************************************************************/
3177 : /* GDALResampleConvolutionHorizontal_3rows_SSE2<T> */
3178 : /************************************************************************/
3179 :
3180 : template <class T>
3181 35560186 : static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
3182 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3183 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3184 : double &dfRes2, double &dfRes3)
3185 : {
3186 35560186 : XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
3187 35560186 : v_acc2 = XMMReg4Double::Zero(),
3188 35560186 : v_acc3 = XMMReg4Double::Zero();
3189 35560186 : int i = 0;
3190 70929556 : for (; i < nSrcPixelCount - 7; i += 8)
3191 : {
3192 : // Retrieve the pixel & accumulate.
3193 35369370 : XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3194 35369370 : XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
3195 35369370 : const XMMReg4Double v_weight1 =
3196 35369370 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3197 35369370 : const XMMReg4Double v_weight2 =
3198 35369370 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
3199 :
3200 35369370 : v_acc1 += v_pixels1 * v_weight1;
3201 35369370 : v_acc1 += v_pixels2 * v_weight2;
3202 :
3203 35369370 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3204 35369370 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
3205 35369370 : v_acc2 += v_pixels1 * v_weight1;
3206 35369370 : v_acc2 += v_pixels2 * v_weight2;
3207 :
3208 35369370 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3209 35369370 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
3210 35369370 : v_acc3 += v_pixels1 * v_weight1;
3211 35369370 : v_acc3 += v_pixels2 * v_weight2;
3212 : }
3213 :
3214 35560186 : dfRes1 = v_acc1.GetHorizSum();
3215 35560186 : dfRes2 = v_acc2.GetHorizSum();
3216 35560186 : dfRes3 = v_acc3.GetHorizSum();
3217 47825952 : for (; i < nSrcPixelCount; ++i)
3218 : {
3219 12265766 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3220 12265766 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3221 12265766 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3222 : }
3223 35560186 : }
3224 :
3225 : /************************************************************************/
3226 : /* GDALResampleConvolutionHorizontal_3rows<GByte> */
3227 : /************************************************************************/
3228 :
3229 : template <>
3230 35560100 : inline void GDALResampleConvolutionHorizontal_3rows<GByte, false>(
3231 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3232 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3233 : double &dfRes2, double &dfRes3)
3234 : {
3235 35560100 : GDALResampleConvolutionHorizontal_3rows_SSE2(
3236 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3237 : dfRes1, dfRes2, dfRes3);
3238 35560100 : }
3239 :
3240 : template <>
3241 86 : inline void GDALResampleConvolutionHorizontal_3rows<GUInt16, false>(
3242 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3243 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3244 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3245 : {
3246 86 : GDALResampleConvolutionHorizontal_3rows_SSE2(
3247 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3248 : dfRes1, dfRes2, dfRes3);
3249 86 : }
3250 :
3251 : /************************************************************************/
3252 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T> */
3253 : /************************************************************************/
3254 :
3255 : template <class T>
3256 7849120 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3257 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3258 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3259 : double &dfRes2, double &dfRes3)
3260 : {
3261 7849120 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3262 7849120 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3263 7849120 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
3264 7849120 : int i = 0; // Use after for.
3265 19113750 : for (; i < nSrcPixelCount - 3; i += 4)
3266 : {
3267 : // Retrieve the pixel & accumulate.
3268 11264600 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3269 11264600 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3270 11264600 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3271 11264600 : const XMMReg4Double v_weight =
3272 11264600 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3273 :
3274 11264600 : v_acc1 += v_pixels1 * v_weight;
3275 11264600 : v_acc2 += v_pixels2 * v_weight;
3276 11264600 : v_acc3 += v_pixels3 * v_weight;
3277 : }
3278 :
3279 7849120 : dfRes1 = v_acc1.GetHorizSum();
3280 7849120 : dfRes2 = v_acc2.GetHorizSum();
3281 7849120 : dfRes3 = v_acc3.GetHorizSum();
3282 :
3283 12324622 : for (; i < nSrcPixelCount; ++i)
3284 : {
3285 4475522 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3286 4475522 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3287 4475522 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3288 : }
3289 7849120 : }
3290 :
3291 : /************************************************************************/
3292 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte> */
3293 : /************************************************************************/
3294 :
3295 : template <>
3296 : inline void
3297 7781970 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte, false>(
3298 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3299 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3300 : double &dfRes2, double &dfRes3)
3301 : {
3302 7781970 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3303 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3304 : dfRes1, dfRes2, dfRes3);
3305 7781970 : }
3306 :
3307 : template <>
3308 : inline void
3309 67150 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16, false>(
3310 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3311 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3312 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3313 : {
3314 67150 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3315 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3316 : dfRes1, dfRes2, dfRes3);
3317 67150 : }
3318 :
3319 : /************************************************************************/
3320 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T> */
3321 : /************************************************************************/
3322 :
3323 : template <class T>
3324 14904860 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3325 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3326 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3327 : double &dfRes3)
3328 : {
3329 14904860 : const XMMReg4Double v_weight =
3330 : XMMReg4Double::Load4ValAligned(padfWeightsAligned);
3331 :
3332 : // Retrieve the pixel & accumulate.
3333 14904860 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
3334 14904860 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
3335 14904860 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
3336 :
3337 14904860 : XMMReg4Double v_acc1 = v_pixels1 * v_weight;
3338 14904860 : XMMReg4Double v_acc2 = v_pixels2 * v_weight;
3339 14904860 : XMMReg4Double v_acc3 = v_pixels3 * v_weight;
3340 :
3341 14904860 : dfRes1 = v_acc1.GetHorizSum();
3342 14904860 : dfRes2 = v_acc2.GetHorizSum();
3343 14904860 : dfRes3 = v_acc3.GetHorizSum();
3344 14904860 : }
3345 :
3346 : /************************************************************************/
3347 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte> */
3348 : /************************************************************************/
3349 :
3350 : template <>
3351 9192140 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte, false>(
3352 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3353 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3354 : double &dfRes3)
3355 : {
3356 9192140 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3357 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3358 : dfRes3);
3359 9192140 : }
3360 :
3361 : template <>
3362 5712720 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16, false>(
3363 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3364 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
3365 : double &dfRes2, double &dfRes3)
3366 : {
3367 5712720 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3368 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3369 : dfRes3);
3370 5712720 : }
3371 :
3372 : #endif // USE_SSE2
3373 :
3374 : /************************************************************************/
3375 : /* GDALResampleChunk_Convolution() */
3376 : /************************************************************************/
3377 :
3378 : template <class T, class Twork, GDALDataType eWrkDataType,
3379 : bool bKernelWithNegativeWeights, bool bNeedRescale>
3380 9579 : static CPLErr GDALResampleChunk_ConvolutionT(
3381 : const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
3382 : FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
3383 : int nKernelRadius, float fMaxVal)
3384 :
3385 : {
3386 9579 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
3387 9579 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
3388 9579 : const double dfSrcXDelta = args.dfSrcXDelta;
3389 9579 : const double dfSrcYDelta = args.dfSrcYDelta;
3390 9579 : constexpr int nBands = 1;
3391 9579 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
3392 9579 : const int nChunkXOff = args.nChunkXOff;
3393 9579 : const int nChunkXSize = args.nChunkXSize;
3394 9579 : const int nChunkYOff = args.nChunkYOff;
3395 9579 : const int nChunkYSize = args.nChunkYSize;
3396 9579 : const int nDstXOff = args.nDstXOff;
3397 9579 : const int nDstXOff2 = args.nDstXOff2;
3398 9579 : const int nDstYOff = args.nDstYOff;
3399 9579 : const int nDstYOff2 = args.nDstYOff2;
3400 9579 : const bool bHasNoData = args.bHasNoData;
3401 9579 : double dfNoDataValue = args.dfNoDataValue;
3402 :
3403 9579 : if (!bHasNoData)
3404 9480 : dfNoDataValue = 0.0;
3405 9579 : const auto dstDataType = args.eOvrDataType;
3406 9579 : const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
3407 9579 : const double dfReplacementVal =
3408 99 : bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
3409 : : dfNoDataValue;
3410 : // cppcheck-suppress unreadVariable
3411 9579 : const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
3412 9579 : const bool bNoDataValueInt64Valid =
3413 9579 : isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
3414 9579 : const auto nNodataValueInt64 =
3415 : bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
3416 9579 : constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
3417 :
3418 : // TODO: we should have some generic function to do this.
3419 9579 : Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
3420 9579 : Twork fDstMax = cpl::NumericLimits<Twork>::max();
3421 9579 : if (dstDataType == GDT_UInt8)
3422 : {
3423 8649 : fDstMin = std::numeric_limits<GByte>::min();
3424 8649 : fDstMax = std::numeric_limits<GByte>::max();
3425 : }
3426 930 : else if (dstDataType == GDT_Int8)
3427 : {
3428 1 : fDstMin = std::numeric_limits<GInt8>::min();
3429 1 : fDstMax = std::numeric_limits<GInt8>::max();
3430 : }
3431 929 : else if (dstDataType == GDT_UInt16)
3432 : {
3433 402 : fDstMin = std::numeric_limits<GUInt16>::min();
3434 402 : fDstMax = std::numeric_limits<GUInt16>::max();
3435 : }
3436 527 : else if (dstDataType == GDT_Int16)
3437 : {
3438 292 : fDstMin = std::numeric_limits<GInt16>::min();
3439 292 : fDstMax = std::numeric_limits<GInt16>::max();
3440 : }
3441 235 : else if (dstDataType == GDT_UInt32)
3442 : {
3443 1 : fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
3444 1 : fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
3445 : }
3446 234 : else if (dstDataType == GDT_Int32)
3447 : {
3448 : // cppcheck-suppress unreadVariable
3449 6 : fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
3450 : // cppcheck-suppress unreadVariable
3451 6 : fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
3452 : }
3453 228 : else if (dstDataType == GDT_UInt64)
3454 : {
3455 : // cppcheck-suppress unreadVariable
3456 1 : fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
3457 : // cppcheck-suppress unreadVariable
3458 : // (1 << 64) - 2048: largest uint64 value a double can hold
3459 1 : fDstMax = static_cast<Twork>(18446744073709549568ULL);
3460 : }
3461 227 : else if (dstDataType == GDT_Int64)
3462 : {
3463 : // cppcheck-suppress unreadVariable
3464 1 : fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
3465 : // cppcheck-suppress unreadVariable
3466 : // (1 << 63) - 1024: largest int64 that a double can hold
3467 1 : fDstMax = static_cast<Twork>(9223372036854774784LL);
3468 : }
3469 :
3470 9579 : bool bHasNaN = false;
3471 490 : if (pabyChunkNodataMask)
3472 : {
3473 : if constexpr (std::is_floating_point_v<T>)
3474 : {
3475 120140 : for (size_t i = 0;
3476 120140 : i < static_cast<size_t>(nChunkXSize) * nChunkYSize; ++i)
3477 : {
3478 120122 : if (std::isnan(pChunk[i]))
3479 : {
3480 24 : bHasNaN = true;
3481 24 : break;
3482 : }
3483 : }
3484 : }
3485 : }
3486 :
3487 37455524 : auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
3488 : bNoDataValueInt64Valid, nNodataValueInt64,
3489 : dfNoDataValue, dfReplacementVal](Twork fVal)
3490 : {
3491 16342300 : if (!bHasNoData)
3492 12121200 : return fVal;
3493 :
3494 : // Clamp value before comparing to nodata: this is only needed for
3495 : // kernels with negative weights (Lanczos)
3496 4221160 : Twork fClamped = fVal;
3497 4221160 : if (fClamped < fDstMin)
3498 15998 : fClamped = fDstMin;
3499 4205160 : else if (fClamped > fDstMax)
3500 16406 : fClamped = fDstMax;
3501 4221160 : if (isIntegerDT)
3502 : {
3503 4220480 : if (bNoDataValueInt64Valid)
3504 : {
3505 4220470 : const double fClampedRounded = double(std::round(fClamped));
3506 8440960 : if (fClampedRounded >=
3507 : static_cast<double>(static_cast<Twork>(
3508 8440960 : std::numeric_limits<int64_t>::min())) &&
3509 : fClampedRounded <= static_cast<double>(static_cast<Twork>(
3510 8440960 : 9223372036854774784LL)) &&
3511 4220470 : nNodataValueInt64 ==
3512 4220480 : static_cast<GInt64>(std::round(fClamped)))
3513 : {
3514 : // Do not use the nodata value
3515 14435 : return static_cast<Twork>(dfReplacementVal);
3516 : }
3517 : }
3518 : }
3519 679 : else if (dfNoDataValue == static_cast<double>(fClamped))
3520 : {
3521 : // Do not use the nodata value
3522 1 : return static_cast<Twork>(dfReplacementVal);
3523 : }
3524 4206720 : return fClamped;
3525 : };
3526 :
3527 : /* -------------------------------------------------------------------- */
3528 : /* Allocate work buffers. */
3529 : /* -------------------------------------------------------------------- */
3530 9579 : const int nDstXSize = nDstXOff2 - nDstXOff;
3531 9579 : Twork *pafWrkScanline = nullptr;
3532 9579 : if (dstDataType != eWrkDataType)
3533 : {
3534 : pafWrkScanline =
3535 9367 : static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
3536 9367 : if (pafWrkScanline == nullptr)
3537 0 : return CE_Failure;
3538 : }
3539 :
3540 9579 : const double dfXScale = 1.0 / dfXRatioDstToSrc;
3541 9579 : const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
3542 9579 : const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
3543 9579 : const double dfYScale = 1.0 / dfYRatioDstToSrc;
3544 9579 : const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
3545 9579 : const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
3546 :
3547 : // Temporary array to store result of horizontal filter.
3548 : double *const padfHorizontalFiltered = static_cast<double *>(
3549 9579 : VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
3550 9579 : const uint64_t nWeightCount = static_cast<uint64_t>(
3551 9579 : 2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) + 0.5);
3552 9579 : if (nWeightCount > std::numeric_limits<uint32_t>::max() / sizeof(double))
3553 : {
3554 0 : VSIFree(pafWrkScanline);
3555 0 : CPLError(CE_Failure, CPLE_NotSupported,
3556 : "Too large downsampling factor");
3557 0 : return CE_Failure;
3558 : }
3559 : // To store convolution coefficients.
3560 : double *const padfWeights =
3561 9579 : static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
3562 : static_cast<size_t>(nWeightCount) * sizeof(double)));
3563 :
3564 9579 : GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
3565 9579 : if (pabyChunkNodataMask)
3566 : pabyChunkNodataMaskHorizontalFiltered =
3567 3339 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
3568 9579 : if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
3569 3339 : (pabyChunkNodataMask != nullptr &&
3570 : pabyChunkNodataMaskHorizontalFiltered == nullptr))
3571 : {
3572 0 : VSIFree(pafWrkScanline);
3573 0 : VSIFree(padfHorizontalFiltered);
3574 0 : VSIFreeAligned(padfWeights);
3575 0 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3576 0 : return CE_Failure;
3577 : }
3578 :
3579 : /* ==================================================================== */
3580 : /* First pass: horizontal filter */
3581 : /* ==================================================================== */
3582 9579 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
3583 : #ifdef USE_SSE2
3584 9579 : const bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
3585 : #endif
3586 3720332 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
3587 : {
3588 3710748 : const double dfSrcPixel =
3589 3710748 : (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
3590 3710748 : const int nSrcPixelStart = std::max(
3591 3710748 : static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5)),
3592 3710748 : nChunkXOff);
3593 3710748 : const int nSrcPixelStop =
3594 3710748 : std::min(static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5),
3595 3710748 : nChunkRightXOff);
3596 : #if 0
3597 : if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
3598 : {
3599 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3600 : }
3601 : if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
3602 : {
3603 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3604 : }
3605 : #endif
3606 3710748 : const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
3607 3710748 : double dfWeightSum = 0.0;
3608 :
3609 : // Compute convolution coefficients.
3610 3710748 : int nSrcPixel = nSrcPixelStart;
3611 3710748 : double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
3612 5770526 : for (; nSrcPixel < nSrcPixelStop - 3; nSrcPixel += 4)
3613 : {
3614 2059781 : padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
3615 2059781 : dfX += dfXScaleWeight;
3616 2059781 : padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
3617 2059781 : dfX += dfXScaleWeight;
3618 2059781 : padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
3619 2059781 : dfX += dfXScaleWeight;
3620 2059781 : padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
3621 2059781 : dfX += dfXScaleWeight;
3622 2059781 : dfWeightSum +=
3623 2059781 : pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
3624 : }
3625 7715158 : for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
3626 : {
3627 4004410 : const double dfWeight = pfnFilterFunc(dfX);
3628 4004410 : padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3629 4004410 : dfWeightSum += dfWeight;
3630 : }
3631 :
3632 3710748 : const int nHeight = nChunkYSize * nBands;
3633 3710748 : if (pabyChunkNodataMask == nullptr)
3634 : {
3635 : // For floating-point data types, we must scale down a bit values
3636 : // if input values are close to +/- std::numeric_limits<T>::max()
3637 : #ifdef OLD_CPPCHECK
3638 : constexpr double mulFactor = 1;
3639 : #else
3640 3191883 : constexpr double mulFactor =
3641 : (bNeedRescale &&
3642 : (std::is_same_v<T, float> || std::is_same_v<T, double>))
3643 : ? 2
3644 : : 1;
3645 : #endif
3646 :
3647 3191883 : if (dfWeightSum != 0)
3648 : {
3649 3191883 : const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
3650 13086524 : for (int i = 0; i < nSrcPixelCount; ++i)
3651 : {
3652 9894651 : padfWeights[i] *= dfInvWeightSum;
3653 : }
3654 : }
3655 :
3656 182388430 : const auto ScaleValue = [
3657 : #ifdef _MSC_VER
3658 : mulFactor
3659 : #endif
3660 : ](double dfVal, [[maybe_unused]] const T *inputValues,
3661 : [[maybe_unused]] int nInputValues)
3662 : {
3663 182388000 : constexpr bool isFloat =
3664 : std::is_same_v<T, float> || std::is_same_v<T, double>;
3665 : if constexpr (isFloat)
3666 : {
3667 4070140 : if (std::isfinite(dfVal))
3668 : {
3669 : return std::clamp(dfVal,
3670 12204800 : -std::numeric_limits<double>::max() /
3671 : mulFactor,
3672 4068260 : std::numeric_limits<double>::max() /
3673 4068260 : mulFactor) *
3674 4068260 : mulFactor;
3675 : }
3676 : else if constexpr (bKernelWithNegativeWeights)
3677 : {
3678 936 : if (std::isnan(dfVal))
3679 : {
3680 : // Either one of the input value is NaN or they are +/-Inf
3681 936 : const bool isPositive = inputValues[0] >= 0;
3682 6008 : for (int i = 0; i < nInputValues; ++i)
3683 : {
3684 5384 : if (std::isnan(inputValues[i]))
3685 312 : return dfVal;
3686 : // cppcheck-suppress knownConditionTrueFalse
3687 5072 : if ((inputValues[i] >= 0) != isPositive)
3688 0 : return dfVal;
3689 : }
3690 : // All values are positive or negative infinity
3691 624 : return static_cast<double>(inputValues[0]);
3692 : }
3693 : }
3694 : }
3695 178319000 : return dfVal;
3696 : };
3697 :
3698 3191883 : int iSrcLineOff = 0;
3699 : #ifdef USE_SSE2
3700 3191883 : if (nSrcPixelCount == 4)
3701 : {
3702 17007029 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3703 : {
3704 16161558 : const size_t j =
3705 16161558 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3706 16161558 : (nSrcPixelStart - nChunkXOff);
3707 16161558 : double dfVal1 = 0.0;
3708 16161558 : double dfVal2 = 0.0;
3709 16161558 : double dfVal3 = 0.0;
3710 : if constexpr (std::is_floating_point_v<T>)
3711 : {
3712 1256690 : if (bHasNaN)
3713 : {
3714 : GDALResampleConvolutionHorizontalPixelCount4_3rows<
3715 0 : T, true>(pChunk + j, pChunk + j + nChunkXSize,
3716 0 : pChunk + j + 2 * nChunkXSize,
3717 : padfWeights, dfVal1, dfVal2, dfVal3);
3718 : }
3719 : else
3720 : {
3721 : GDALResampleConvolutionHorizontalPixelCount4_3rows<
3722 1256690 : T, false>(pChunk + j, pChunk + j + nChunkXSize,
3723 1256690 : pChunk + j + 2 * nChunkXSize,
3724 : padfWeights, dfVal1, dfVal2, dfVal3);
3725 : }
3726 : }
3727 : else
3728 : {
3729 : GDALResampleConvolutionHorizontalPixelCount4_3rows<
3730 14904868 : T, false>(pChunk + j, pChunk + j + nChunkXSize,
3731 14904868 : pChunk + j + 2 * nChunkXSize, padfWeights,
3732 : dfVal1, dfVal2, dfVal3);
3733 : }
3734 32323080 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3735 16161558 : nDstXSize +
3736 16161558 : iDstPixel - nDstXOff] =
3737 16161558 : ScaleValue(dfVal1, pChunk + j, 4);
3738 32323080 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3739 16161558 : 1) *
3740 16161558 : nDstXSize +
3741 16161558 : iDstPixel - nDstXOff] =
3742 16161558 : ScaleValue(dfVal2, pChunk + j + nChunkXSize, 4);
3743 16161967 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3744 16161558 : 2) *
3745 16161558 : nDstXSize +
3746 16161558 : iDstPixel - nDstXOff] =
3747 16161558 : ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize, 4);
3748 : }
3749 : }
3750 2346404 : else if (bSrcPixelCountLess8)
3751 : {
3752 9938308 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3753 : {
3754 7868098 : const size_t j =
3755 7868098 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3756 7868098 : (nSrcPixelStart - nChunkXOff);
3757 7868098 : double dfVal1 = 0.0;
3758 7868098 : double dfVal2 = 0.0;
3759 7868098 : double dfVal3 = 0.0;
3760 : if constexpr (std::is_floating_point_v<T>)
3761 : {
3762 18980 : if (bHasNaN)
3763 : {
3764 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows<
3765 0 : T, true>(pChunk + j, pChunk + j + nChunkXSize,
3766 0 : pChunk + j + 2 * nChunkXSize,
3767 : padfWeights, nSrcPixelCount, dfVal1,
3768 : dfVal2, dfVal3);
3769 : }
3770 : else
3771 : {
3772 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows<
3773 18980 : T, false>(pChunk + j, pChunk + j + nChunkXSize,
3774 18980 : pChunk + j + 2 * nChunkXSize,
3775 : padfWeights, nSrcPixelCount, dfVal1,
3776 : dfVal2, dfVal3);
3777 : }
3778 : }
3779 : else
3780 : {
3781 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows<
3782 7849118 : T, false>(pChunk + j, pChunk + j + nChunkXSize,
3783 7849118 : pChunk + j + 2 * nChunkXSize, padfWeights,
3784 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3785 : }
3786 15736156 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3787 7868098 : nDstXSize +
3788 7868098 : iDstPixel - nDstXOff] =
3789 7868098 : ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
3790 15736156 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3791 7868098 : 1) *
3792 7868098 : nDstXSize +
3793 7868098 : iDstPixel - nDstXOff] =
3794 7868098 : ScaleValue(dfVal2, pChunk + j + nChunkXSize,
3795 : nSrcPixelCount);
3796 7868186 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3797 7868098 : 2) *
3798 7868098 : nDstXSize +
3799 7868098 : iDstPixel - nDstXOff] =
3800 7868098 : ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
3801 : nSrcPixelCount);
3802 : }
3803 : }
3804 : else
3805 : #endif
3806 : {
3807 35902058 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3808 : {
3809 35625944 : const size_t j =
3810 35625944 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3811 35625944 : (nSrcPixelStart - nChunkXOff);
3812 35625944 : double dfVal1 = 0.0;
3813 35625944 : double dfVal2 = 0.0;
3814 35625944 : double dfVal3 = 0.0;
3815 : if constexpr (std::is_floating_point_v<T>)
3816 : {
3817 65696 : if (bHasNaN)
3818 : {
3819 0 : GDALResampleConvolutionHorizontal_3rows<T, true>(
3820 0 : pChunk + j, pChunk + j + nChunkXSize,
3821 0 : pChunk + j + 2 * nChunkXSize, padfWeights,
3822 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3823 : }
3824 : else
3825 : {
3826 65696 : GDALResampleConvolutionHorizontal_3rows<T, false>(
3827 65696 : pChunk + j, pChunk + j + nChunkXSize,
3828 65696 : pChunk + j + 2 * nChunkXSize, padfWeights,
3829 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3830 : }
3831 : }
3832 : else
3833 : {
3834 35560248 : GDALResampleConvolutionHorizontal_3rows<T, false>(
3835 35560248 : pChunk + j, pChunk + j + nChunkXSize,
3836 35560248 : pChunk + j + 2 * nChunkXSize, padfWeights,
3837 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3838 : }
3839 71251798 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3840 35625944 : nDstXSize +
3841 35625944 : iDstPixel - nDstXOff] =
3842 35625944 : ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
3843 71251798 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3844 35625944 : 1) *
3845 35625944 : nDstXSize +
3846 35625944 : iDstPixel - nDstXOff] =
3847 35625944 : ScaleValue(dfVal2, pChunk + j + nChunkXSize,
3848 : nSrcPixelCount);
3849 35691048 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3850 35625944 : 2) *
3851 35625944 : nDstXSize +
3852 35625944 : iDstPixel - nDstXOff] =
3853 35625944 : ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
3854 : nSrcPixelCount);
3855 : }
3856 : }
3857 6613620 : for (; iSrcLineOff < nHeight; ++iSrcLineOff)
3858 : {
3859 3421743 : const size_t j =
3860 3421743 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3861 3421743 : (nSrcPixelStart - nChunkXOff);
3862 3970903 : const double dfVal = GDALResampleConvolutionHorizontal(
3863 595200 : pChunk + j, padfWeights, nSrcPixelCount);
3864 3422192 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3865 3421743 : nDstXSize +
3866 3421743 : iDstPixel - nDstXOff] =
3867 3421743 : ScaleValue(dfVal, pChunk + j, nSrcPixelCount);
3868 : }
3869 : }
3870 : else
3871 : {
3872 23844223 : for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
3873 : {
3874 23325328 : const size_t j =
3875 23325328 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3876 23325328 : (nSrcPixelStart - nChunkXOff);
3877 :
3878 : if (bKernelWithNegativeWeights)
3879 : {
3880 18580308 : int nConsecutiveValid = 0;
3881 18580308 : int nMaxConsecutiveValid = 0;
3882 170151146 : for (int k = 0; k < nSrcPixelCount; k++)
3883 : {
3884 151569938 : if (pabyChunkNodataMask[j + k])
3885 43681801 : nConsecutiveValid++;
3886 107888837 : else if (nConsecutiveValid)
3887 : {
3888 107830 : nMaxConsecutiveValid = std::max(
3889 107830 : nMaxConsecutiveValid, nConsecutiveValid);
3890 107830 : nConsecutiveValid = 0;
3891 : }
3892 : }
3893 18580308 : nMaxConsecutiveValid =
3894 18580308 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
3895 18580308 : if (nMaxConsecutiveValid < nSrcPixelCount / 2)
3896 : {
3897 12651307 : const size_t nTempOffset =
3898 12651307 : static_cast<size_t>(iSrcLineOff) * nDstXSize +
3899 12651307 : iDstPixel - nDstXOff;
3900 12651307 : padfHorizontalFiltered[nTempOffset] = 0.0;
3901 12651307 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3902 12651307 : continue;
3903 : }
3904 : }
3905 :
3906 10674031 : double dfVal = 0.0;
3907 : if constexpr (std::is_floating_point_v<T>)
3908 : {
3909 46368 : if (bHasNaN)
3910 : {
3911 1792 : GDALResampleConvolutionHorizontalWithMask<T, true>(
3912 1792 : pChunk + j, pabyChunkNodataMask + j, padfWeights,
3913 : nSrcPixelCount, dfVal, dfWeightSum);
3914 : }
3915 : else
3916 : {
3917 44576 : GDALResampleConvolutionHorizontalWithMask<T, false>(
3918 44576 : pChunk + j, pabyChunkNodataMask + j, padfWeights,
3919 : nSrcPixelCount, dfVal, dfWeightSum);
3920 : }
3921 : }
3922 : else
3923 : {
3924 10627663 : GDALResampleConvolutionHorizontalWithMask<T, false>(
3925 63 : pChunk + j, pabyChunkNodataMask + j, padfWeights,
3926 : nSrcPixelCount, dfVal, dfWeightSum);
3927 : }
3928 10674031 : const size_t nTempOffset =
3929 10674031 : static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
3930 10674031 : nDstXOff;
3931 10674031 : if (dfWeightSum > 0.0)
3932 : {
3933 8761258 : padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
3934 8761258 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
3935 : }
3936 : else
3937 : {
3938 1912781 : padfHorizontalFiltered[nTempOffset] = 0.0;
3939 1912781 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3940 : }
3941 : }
3942 : }
3943 : }
3944 :
3945 : /* ==================================================================== */
3946 : /* Second pass: vertical filter */
3947 : /* ==================================================================== */
3948 9579 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3949 :
3950 410979 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3951 : {
3952 401400 : Twork *const pafDstScanline =
3953 : pafWrkScanline
3954 401400 : ? pafWrkScanline
3955 14028 : : static_cast<Twork *>(pDstBuffer) +
3956 14028 : static_cast<size_t>(iDstLine - nDstYOff) * nDstXSize;
3957 :
3958 401400 : const double dfSrcLine =
3959 401400 : (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
3960 401400 : const int nSrcLineStart =
3961 401400 : std::max(static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5)),
3962 401400 : nChunkYOff);
3963 401400 : const int nSrcLineStop =
3964 401400 : std::min(static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5),
3965 401400 : nChunkBottomYOff);
3966 : #if 0
3967 : if( nSrcLineStart < nChunkYOff &&
3968 : nChunkYOff > 0 )
3969 : {
3970 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3971 : }
3972 : if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3973 : {
3974 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3975 : }
3976 : #endif
3977 401400 : const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
3978 401400 : double dfWeightSum = 0.0;
3979 :
3980 : // Compute convolution coefficients.
3981 401400 : int nSrcLine = nSrcLineStart; // Used after for.
3982 401400 : double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
3983 1023685 : for (; nSrcLine < nSrcLineStop - 3;
3984 622285 : nSrcLine += 4, dfY += 4 * dfYScaleWeight)
3985 : {
3986 622285 : padfWeights[nSrcLine - nSrcLineStart] = dfY;
3987 622285 : padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
3988 622285 : padfWeights[nSrcLine + 2 - nSrcLineStart] =
3989 622285 : dfY + 2 * dfYScaleWeight;
3990 622285 : padfWeights[nSrcLine + 3 - nSrcLineStart] =
3991 622285 : dfY + 3 * dfYScaleWeight;
3992 622285 : dfWeightSum +=
3993 622285 : pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
3994 : }
3995 439570 : for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
3996 : {
3997 38170 : const double dfWeight = pfnFilterFunc(dfY);
3998 38170 : padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
3999 38170 : dfWeightSum += dfWeight;
4000 : }
4001 :
4002 401400 : if (pabyChunkNodataMask == nullptr)
4003 : {
4004 : // For floating-point data types, we must scale down a bit values
4005 : // if input values are close to +/- std::numeric_limits<T>::max()
4006 : #ifdef OLD_CPPCHECK
4007 : constexpr double mulFactor = 1;
4008 : #else
4009 360192 : constexpr double mulFactor =
4010 : (bNeedRescale &&
4011 : (std::is_same_v<T, float> || std::is_same_v<T, double>))
4012 : ? 2
4013 : : 1;
4014 : #endif
4015 :
4016 360192 : if (dfWeightSum != 0)
4017 : {
4018 360192 : const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
4019 2617653 : for (int i = 0; i < nSrcLineCount; ++i)
4020 2257467 : padfWeights[i] *= dfInvWeightSum;
4021 : }
4022 :
4023 360192 : int iFilteredPixelOff = 0; // Used after for.
4024 : // j used after for.
4025 360192 : size_t j =
4026 360192 : (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
4027 : #ifdef USE_SSE2
4028 : if constexpr ((!bNeedRescale || !std::is_same_v<T, float>) &&
4029 : eWrkDataType == GDT_Float32)
4030 : {
4031 : #ifdef __AVX__
4032 : for (; iFilteredPixelOff < nDstXSize - 15;
4033 : iFilteredPixelOff += 16, j += 16)
4034 : {
4035 : GDALResampleConvolutionVertical_16cols(
4036 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4037 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
4038 : if (bHasNoData)
4039 : {
4040 : for (int k = 0; k < 16; k++)
4041 : {
4042 : pafDstScanline[iFilteredPixelOff + k] =
4043 : replaceValIfNodata(
4044 : pafDstScanline[iFilteredPixelOff + k]);
4045 : }
4046 : }
4047 : }
4048 : #else
4049 26155459 : for (; iFilteredPixelOff < nDstXSize - 7;
4050 : iFilteredPixelOff += 8, j += 8)
4051 : {
4052 25804048 : GDALResampleConvolutionVertical_8cols(
4053 25804048 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4054 25804048 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
4055 25804048 : if (bHasNoData)
4056 : {
4057 123192 : for (int k = 0; k < 8; k++)
4058 : {
4059 109504 : pafDstScanline[iFilteredPixelOff + k] =
4060 109504 : replaceValIfNodata(
4061 109504 : pafDstScanline[iFilteredPixelOff + k]);
4062 : }
4063 : }
4064 : }
4065 : #endif
4066 :
4067 822491 : for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
4068 : {
4069 471118 : const Twork fVal =
4070 471118 : static_cast<Twork>(GDALResampleConvolutionVertical(
4071 471118 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4072 : nSrcLineCount));
4073 471118 : pafDstScanline[iFilteredPixelOff] =
4074 471118 : replaceValIfNodata(fVal);
4075 : }
4076 : }
4077 : else
4078 : #endif
4079 : {
4080 5862642 : const auto ScaleValue = [
4081 : #ifdef _MSC_VER
4082 : mulFactor
4083 : #endif
4084 : ](double dfVal, [[maybe_unused]] const double *inputValues,
4085 : [[maybe_unused]] int nStride,
4086 : [[maybe_unused]] int nInputValues)
4087 : {
4088 5862640 : constexpr bool isFloat =
4089 : std::is_same_v<T, float> || std::is_same_v<T, double>;
4090 : if constexpr (isFloat)
4091 : {
4092 5862640 : if (std::isfinite(dfVal))
4093 : {
4094 : return std::clamp(
4095 : dfVal,
4096 : static_cast<double>(
4097 17585400 : -std::numeric_limits<Twork>::max()) /
4098 : mulFactor,
4099 : static_cast<double>(
4100 5861800 : std::numeric_limits<Twork>::max()) /
4101 5861800 : mulFactor) *
4102 5861800 : mulFactor;
4103 : }
4104 : else if constexpr (bKernelWithNegativeWeights)
4105 : {
4106 480 : if (std::isnan(dfVal))
4107 : {
4108 : // Either one of the input value is NaN or they are +/-Inf
4109 480 : const bool isPositive = inputValues[0] >= 0;
4110 2520 : for (int i = 0; i < nInputValues; ++i)
4111 : {
4112 2200 : if (std::isnan(inputValues[i * nStride]))
4113 160 : return dfVal;
4114 : // cppcheck-suppress knownConditionTrueFalse
4115 2040 : if ((inputValues[i] >= 0) != isPositive)
4116 0 : return dfVal;
4117 : }
4118 : // All values are positive or negative infinity
4119 320 : return inputValues[0];
4120 : }
4121 : }
4122 : }
4123 :
4124 360 : return dfVal;
4125 : };
4126 :
4127 2939422 : for (; iFilteredPixelOff < nDstXSize - 1;
4128 : iFilteredPixelOff += 2, j += 2)
4129 : {
4130 2930610 : double dfVal1 = 0.0;
4131 2930610 : double dfVal2 = 0.0;
4132 2930610 : GDALResampleConvolutionVertical_2cols(
4133 2930610 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4134 : nSrcLineCount, dfVal1, dfVal2);
4135 5861220 : pafDstScanline[iFilteredPixelOff] =
4136 2930610 : replaceValIfNodata(static_cast<Twork>(
4137 2930610 : ScaleValue(dfVal1, padfHorizontalFiltered + j,
4138 : nDstXSize, nSrcLineCount)));
4139 2930610 : pafDstScanline[iFilteredPixelOff + 1] =
4140 2930610 : replaceValIfNodata(static_cast<Twork>(
4141 2930610 : ScaleValue(dfVal2, padfHorizontalFiltered + j + 1,
4142 : nDstXSize, nSrcLineCount)));
4143 : }
4144 8819 : if (iFilteredPixelOff < nDstXSize)
4145 : {
4146 1427 : const double dfVal = GDALResampleConvolutionVertical(
4147 1427 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4148 : nSrcLineCount);
4149 1427 : pafDstScanline[iFilteredPixelOff] =
4150 1427 : replaceValIfNodata(static_cast<Twork>(
4151 1427 : ScaleValue(dfVal, padfHorizontalFiltered + j,
4152 : nDstXSize, nSrcLineCount)));
4153 : }
4154 : }
4155 : }
4156 : else
4157 : {
4158 19396665 : for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
4159 : ++iFilteredPixelOff)
4160 : {
4161 19355485 : double dfVal = 0.0;
4162 19355485 : dfWeightSum = 0.0;
4163 19355485 : size_t j = (nSrcLineStart - nChunkYOff) *
4164 19355485 : static_cast<size_t>(nDstXSize) +
4165 19355485 : iFilteredPixelOff;
4166 : if (bKernelWithNegativeWeights)
4167 : {
4168 18088237 : int nConsecutiveValid = 0;
4169 18088237 : int nMaxConsecutiveValid = 0;
4170 127259921 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
4171 : {
4172 109171284 : const double dfWeight =
4173 109171284 : padfWeights[i] *
4174 : pabyChunkNodataMaskHorizontalFiltered[j];
4175 109171284 : if (pabyChunkNodataMaskHorizontalFiltered[j])
4176 : {
4177 46111301 : nConsecutiveValid++;
4178 : }
4179 63060183 : else if (nConsecutiveValid)
4180 : {
4181 204376 : nMaxConsecutiveValid = std::max(
4182 204376 : nMaxConsecutiveValid, nConsecutiveValid);
4183 204376 : nConsecutiveValid = 0;
4184 : }
4185 109171284 : dfVal += padfHorizontalFiltered[j] * dfWeight;
4186 109171284 : dfWeightSum += dfWeight;
4187 : }
4188 18088237 : nMaxConsecutiveValid =
4189 18088237 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
4190 18088237 : if (nMaxConsecutiveValid < nSrcLineCount / 2)
4191 : {
4192 8918591 : pafDstScanline[iFilteredPixelOff] =
4193 8918499 : static_cast<Twork>(dfNoDataValue);
4194 8918591 : continue;
4195 : }
4196 : }
4197 : else
4198 : {
4199 6353336 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
4200 : {
4201 5086078 : const double dfWeight =
4202 5086078 : padfWeights[i] *
4203 : pabyChunkNodataMaskHorizontalFiltered[j];
4204 5086078 : dfVal += padfHorizontalFiltered[j] * dfWeight;
4205 5086078 : dfWeightSum += dfWeight;
4206 : }
4207 : }
4208 10436914 : if (dfWeightSum > 0.0)
4209 : {
4210 9899086 : pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
4211 9898738 : static_cast<Twork>(dfVal / dfWeightSum));
4212 : }
4213 : else
4214 : {
4215 537838 : pafDstScanline[iFilteredPixelOff] =
4216 537814 : static_cast<Twork>(dfNoDataValue);
4217 : }
4218 : }
4219 : }
4220 :
4221 401400 : if (fMaxVal != 0.0f)
4222 : {
4223 : if constexpr (std::is_same_v<T, double>)
4224 : {
4225 0 : for (int i = 0; i < nDstXSize; ++i)
4226 : {
4227 0 : if (pafDstScanline[i] > static_cast<double>(fMaxVal))
4228 0 : pafDstScanline[i] = static_cast<double>(fMaxVal);
4229 : }
4230 : }
4231 : else
4232 : {
4233 192324 : for (int i = 0; i < nDstXSize; ++i)
4234 : {
4235 192088 : if (pafDstScanline[i] > fMaxVal)
4236 96022 : pafDstScanline[i] = fMaxVal;
4237 : }
4238 : }
4239 : }
4240 :
4241 401400 : if (pafWrkScanline)
4242 : {
4243 387372 : GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
4244 : static_cast<GByte *>(pDstBuffer) +
4245 387372 : static_cast<size_t>(iDstLine - nDstYOff) *
4246 387372 : nDstXSize * nDstDataTypeSize,
4247 : dstDataType, nDstDataTypeSize, nDstXSize);
4248 : }
4249 : }
4250 :
4251 9579 : VSIFree(pafWrkScanline);
4252 9579 : VSIFreeAligned(padfWeights);
4253 9579 : VSIFree(padfHorizontalFiltered);
4254 9579 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
4255 :
4256 9579 : return CE_None;
4257 : }
4258 :
4259 : template <bool bKernelWithNegativeWeights, bool bNeedRescale>
4260 : static CPLErr
4261 9579 : GDALResampleChunk_ConvolutionInternal(const GDALOverviewResampleArgs &args,
4262 : const void *pChunk, void **ppDstBuffer,
4263 : GDALDataType *peDstBufferDataType)
4264 : {
4265 : GDALResampleAlg eResample;
4266 9579 : if (EQUAL(args.pszResampling, "BILINEAR"))
4267 7097 : eResample = GRA_Bilinear;
4268 2482 : else if (EQUAL(args.pszResampling, "CUBIC"))
4269 2300 : eResample = GRA_Cubic;
4270 182 : else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
4271 86 : eResample = GRA_CubicSpline;
4272 96 : else if (EQUAL(args.pszResampling, "LANCZOS"))
4273 96 : eResample = GRA_Lanczos;
4274 : else
4275 : {
4276 0 : CPLAssert(false);
4277 : return CE_Failure;
4278 : }
4279 9579 : const int nKernelRadius = GWKGetFilterRadius(eResample);
4280 9579 : FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
4281 : const FilterFunc4ValuesType pfnFilterFunc4Values =
4282 9579 : GWKGetFilterFunc4Values(eResample);
4283 :
4284 9579 : float fMaxVal = 0.f;
4285 : // Cubic, etc... can have overshoots, so make sure we clamp values to the
4286 : // maximum value if NBITS is set.
4287 9579 : if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
4288 8 : (args.eOvrDataType == GDT_UInt8 || args.eOvrDataType == GDT_UInt16 ||
4289 0 : args.eOvrDataType == GDT_UInt32))
4290 : {
4291 8 : int nBits = args.nOvrNBITS;
4292 8 : if (nBits == GDALGetDataTypeSizeBits(args.eOvrDataType))
4293 1 : nBits = 0;
4294 8 : if (nBits > 0 && nBits < 32)
4295 7 : fMaxVal = static_cast<float>((1U << nBits) - 1);
4296 : }
4297 :
4298 9579 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
4299 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
4300 : GDALGetDataTypeSizeBytes(args.eOvrDataType));
4301 9579 : if (*ppDstBuffer == nullptr)
4302 : {
4303 0 : return CE_Failure;
4304 : }
4305 9579 : *peDstBufferDataType = args.eOvrDataType;
4306 :
4307 9579 : switch (args.eWrkDataType)
4308 : {
4309 8687 : case GDT_UInt8:
4310 : {
4311 : return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32,
4312 : bKernelWithNegativeWeights,
4313 8687 : bNeedRescale>(
4314 : args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
4315 8687 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4316 : }
4317 :
4318 402 : case GDT_UInt16:
4319 : {
4320 : return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32,
4321 : bKernelWithNegativeWeights,
4322 402 : bNeedRescale>(
4323 : args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
4324 402 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4325 : }
4326 :
4327 387 : case GDT_Float32:
4328 : {
4329 : return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32,
4330 : bKernelWithNegativeWeights,
4331 387 : bNeedRescale>(
4332 : args, static_cast<const float *>(pChunk), *ppDstBuffer,
4333 387 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4334 : }
4335 :
4336 103 : case GDT_Float64:
4337 : {
4338 : return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64,
4339 : bKernelWithNegativeWeights,
4340 103 : bNeedRescale>(
4341 : args, static_cast<const double *>(pChunk), *ppDstBuffer,
4342 103 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4343 : }
4344 :
4345 0 : default:
4346 0 : break;
4347 : }
4348 :
4349 0 : CPLAssert(false);
4350 : return CE_Failure;
4351 : }
4352 :
4353 : static CPLErr
4354 9579 : GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
4355 : const void *pChunk, void **ppDstBuffer,
4356 : GDALDataType *peDstBufferDataType)
4357 : {
4358 9579 : if (EQUAL(args.pszResampling, "CUBIC") ||
4359 7279 : EQUAL(args.pszResampling, "LANCZOS"))
4360 : return GDALResampleChunk_ConvolutionInternal<
4361 2396 : /* bKernelWithNegativeWeights=*/true, /* bNeedRescale = */ true>(
4362 2396 : args, pChunk, ppDstBuffer, peDstBufferDataType);
4363 7183 : else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
4364 86 : return GDALResampleChunk_ConvolutionInternal<false, true>(
4365 86 : args, pChunk, ppDstBuffer, peDstBufferDataType);
4366 : else
4367 7097 : return GDALResampleChunk_ConvolutionInternal<false, false>(
4368 7097 : args, pChunk, ppDstBuffer, peDstBufferDataType);
4369 : }
4370 :
4371 : /************************************************************************/
4372 : /* GDALResampleChunkC32R() */
4373 : /************************************************************************/
4374 :
4375 2 : static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
4376 : const float *pafChunk, const int nChunkYOff,
4377 : const int nChunkYSize, const int nDstYOff,
4378 : const int nDstYOff2, const int nOvrXSize,
4379 : const int nOvrYSize, void **ppDstBuffer,
4380 : GDALDataType *peDstBufferDataType,
4381 : const char *pszResampling)
4382 :
4383 : {
4384 : enum Method
4385 : {
4386 : NEAR,
4387 : AVERAGE,
4388 : AVERAGE_MAGPHASE,
4389 : RMS,
4390 : };
4391 :
4392 2 : Method eMethod = NEAR;
4393 2 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
4394 : {
4395 0 : eMethod = NEAR;
4396 : }
4397 2 : else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
4398 : {
4399 0 : eMethod = AVERAGE_MAGPHASE;
4400 : }
4401 2 : else if (EQUAL(pszResampling, "RMS"))
4402 : {
4403 2 : eMethod = RMS;
4404 : }
4405 0 : else if (STARTS_WITH_CI(pszResampling, "AVER"))
4406 : {
4407 0 : eMethod = AVERAGE;
4408 : }
4409 : else
4410 : {
4411 0 : CPLError(
4412 : CE_Failure, CPLE_NotSupported,
4413 : "Resampling method %s is not supported for complex data types. "
4414 : "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
4415 : pszResampling);
4416 0 : return CE_Failure;
4417 : }
4418 :
4419 2 : const int nOXSize = nOvrXSize;
4420 2 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
4421 : GDALGetDataTypeSizeBytes(GDT_CFloat32));
4422 2 : if (*ppDstBuffer == nullptr)
4423 : {
4424 0 : return CE_Failure;
4425 : }
4426 2 : float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
4427 2 : *peDstBufferDataType = GDT_CFloat32;
4428 :
4429 2 : const int nOYSize = nOvrYSize;
4430 2 : const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
4431 2 : const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
4432 :
4433 : /* ==================================================================== */
4434 : /* Loop over destination scanlines. */
4435 : /* ==================================================================== */
4436 8 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
4437 : {
4438 6 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
4439 6 : if (nSrcYOff < nChunkYOff)
4440 0 : nSrcYOff = nChunkYOff;
4441 :
4442 6 : int nSrcYOff2 =
4443 6 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
4444 6 : if (nSrcYOff2 == nSrcYOff)
4445 0 : nSrcYOff2++;
4446 :
4447 6 : if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
4448 : {
4449 2 : if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
4450 0 : nSrcYOff = nSrcHeight - 1;
4451 2 : nSrcYOff2 = nSrcHeight;
4452 : }
4453 6 : if (nSrcYOff2 > nChunkYOff + nChunkYSize)
4454 0 : nSrcYOff2 = nChunkYOff + nChunkYSize;
4455 :
4456 6 : const float *const pafSrcScanline =
4457 6 : pafChunk +
4458 6 : (static_cast<size_t>(nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
4459 6 : float *const pafDstScanline =
4460 6 : pafDstBuffer +
4461 6 : static_cast<size_t>(iDstLine - nDstYOff) * 2 * nOXSize;
4462 :
4463 : /* --------------------------------------------------------------------
4464 : */
4465 : /* Loop over destination pixels */
4466 : /* --------------------------------------------------------------------
4467 : */
4468 18 : for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
4469 : {
4470 12 : const size_t iDstPixelSZ = static_cast<size_t>(iDstPixel);
4471 12 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
4472 12 : int nSrcXOff2 =
4473 12 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
4474 12 : if (nSrcXOff2 == nSrcXOff)
4475 0 : nSrcXOff2++;
4476 12 : if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
4477 : {
4478 6 : if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
4479 0 : nSrcXOff = nSrcWidth - 1;
4480 6 : nSrcXOff2 = nSrcWidth;
4481 : }
4482 12 : const size_t nSrcXOffSZ = static_cast<size_t>(nSrcXOff);
4483 :
4484 12 : if (eMethod == NEAR)
4485 : {
4486 0 : pafDstScanline[iDstPixelSZ * 2] =
4487 0 : pafSrcScanline[nSrcXOffSZ * 2];
4488 0 : pafDstScanline[iDstPixelSZ * 2 + 1] =
4489 0 : pafSrcScanline[nSrcXOffSZ * 2 + 1];
4490 : }
4491 12 : else if (eMethod == AVERAGE_MAGPHASE)
4492 : {
4493 0 : double dfTotalR = 0.0;
4494 0 : double dfTotalI = 0.0;
4495 0 : double dfTotalM = 0.0;
4496 0 : size_t nCount = 0;
4497 :
4498 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4499 : {
4500 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4501 : {
4502 0 : const double dfR = double(
4503 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4504 0 : static_cast<size_t>(iY - nSrcYOff) *
4505 0 : nSrcWidth * 2]);
4506 0 : const double dfI = double(
4507 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4508 0 : static_cast<size_t>(iY - nSrcYOff) *
4509 0 : nSrcWidth * 2 +
4510 0 : 1]);
4511 0 : dfTotalR += dfR;
4512 0 : dfTotalI += dfI;
4513 0 : dfTotalM += std::hypot(dfR, dfI);
4514 0 : ++nCount;
4515 : }
4516 : }
4517 :
4518 0 : CPLAssert(nCount > 0);
4519 0 : if (nCount == 0)
4520 : {
4521 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4522 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4523 : }
4524 : else
4525 : {
4526 0 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4527 0 : dfTotalR / static_cast<double>(nCount));
4528 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4529 0 : dfTotalI / static_cast<double>(nCount));
4530 : const double dfM =
4531 0 : double(std::hypot(pafDstScanline[iDstPixelSZ * 2],
4532 0 : pafDstScanline[iDstPixelSZ * 2 + 1]));
4533 0 : const double dfDesiredM =
4534 0 : dfTotalM / static_cast<double>(nCount);
4535 0 : double dfRatio = 1.0;
4536 0 : if (dfM != 0.0)
4537 0 : dfRatio = dfDesiredM / dfM;
4538 :
4539 0 : pafDstScanline[iDstPixelSZ * 2] *=
4540 0 : static_cast<float>(dfRatio);
4541 0 : pafDstScanline[iDstPixelSZ * 2 + 1] *=
4542 0 : static_cast<float>(dfRatio);
4543 : }
4544 : }
4545 12 : else if (eMethod == RMS)
4546 : {
4547 12 : double dfTotalR = 0.0;
4548 12 : double dfTotalI = 0.0;
4549 12 : size_t nCount = 0;
4550 :
4551 36 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4552 : {
4553 72 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4554 : {
4555 48 : const double dfR = double(
4556 48 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4557 48 : static_cast<size_t>(iY - nSrcYOff) *
4558 48 : nSrcWidth * 2]);
4559 48 : const double dfI = double(
4560 48 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4561 48 : static_cast<size_t>(iY - nSrcYOff) *
4562 48 : nSrcWidth * 2 +
4563 48 : 1]);
4564 :
4565 48 : dfTotalR += SQUARE(dfR);
4566 48 : dfTotalI += SQUARE(dfI);
4567 :
4568 48 : ++nCount;
4569 : }
4570 : }
4571 :
4572 12 : CPLAssert(nCount > 0);
4573 12 : if (nCount == 0)
4574 : {
4575 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4576 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4577 : }
4578 : else
4579 : {
4580 : /* compute RMS */
4581 12 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4582 12 : sqrt(dfTotalR / static_cast<double>(nCount)));
4583 12 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4584 12 : sqrt(dfTotalI / static_cast<double>(nCount)));
4585 : }
4586 : }
4587 0 : else if (eMethod == AVERAGE)
4588 : {
4589 0 : double dfTotalR = 0.0;
4590 0 : double dfTotalI = 0.0;
4591 0 : size_t nCount = 0;
4592 :
4593 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4594 : {
4595 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4596 : {
4597 : // TODO(schwehr): Maybe use std::complex?
4598 0 : dfTotalR += double(
4599 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4600 0 : static_cast<size_t>(iY - nSrcYOff) *
4601 0 : nSrcWidth * 2]);
4602 0 : dfTotalI += double(
4603 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4604 0 : static_cast<size_t>(iY - nSrcYOff) *
4605 0 : nSrcWidth * 2 +
4606 0 : 1]);
4607 0 : ++nCount;
4608 : }
4609 : }
4610 :
4611 0 : CPLAssert(nCount > 0);
4612 0 : if (nCount == 0)
4613 : {
4614 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4615 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4616 : }
4617 : else
4618 : {
4619 0 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4620 0 : dfTotalR / static_cast<double>(nCount));
4621 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4622 0 : dfTotalI / static_cast<double>(nCount));
4623 : }
4624 : }
4625 : }
4626 : }
4627 :
4628 2 : return CE_None;
4629 : }
4630 :
4631 : /************************************************************************/
4632 : /* GDALRegenerateCascadingOverviews() */
4633 : /* */
4634 : /* Generate a list of overviews in order from largest to */
4635 : /* smallest, computing each from the next larger. */
4636 : /************************************************************************/
4637 :
4638 44 : static CPLErr GDALRegenerateCascadingOverviews(
4639 : GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
4640 : const char *pszResampling, GDALProgressFunc pfnProgress,
4641 : void *pProgressData, CSLConstList papszOptions)
4642 :
4643 : {
4644 : /* -------------------------------------------------------------------- */
4645 : /* First, we must put the overviews in order from largest to */
4646 : /* smallest. */
4647 : /* -------------------------------------------------------------------- */
4648 127 : for (int i = 0; i < nOverviews - 1; ++i)
4649 : {
4650 292 : for (int j = 0; j < nOverviews - i - 1; ++j)
4651 : {
4652 209 : if (papoOvrBands[j]->GetXSize() *
4653 209 : static_cast<float>(papoOvrBands[j]->GetYSize()) <
4654 209 : papoOvrBands[j + 1]->GetXSize() *
4655 209 : static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
4656 : {
4657 0 : GDALRasterBand *poTempBand = papoOvrBands[j];
4658 0 : papoOvrBands[j] = papoOvrBands[j + 1];
4659 0 : papoOvrBands[j + 1] = poTempBand;
4660 : }
4661 : }
4662 : }
4663 :
4664 : /* -------------------------------------------------------------------- */
4665 : /* Count total pixels so we can prepare appropriate scaled */
4666 : /* progress functions. */
4667 : /* -------------------------------------------------------------------- */
4668 44 : double dfTotalPixels = 0.0;
4669 :
4670 171 : for (int i = 0; i < nOverviews; ++i)
4671 : {
4672 127 : dfTotalPixels += papoOvrBands[i]->GetXSize() *
4673 127 : static_cast<double>(papoOvrBands[i]->GetYSize());
4674 : }
4675 :
4676 : /* -------------------------------------------------------------------- */
4677 : /* Generate all the bands. */
4678 : /* -------------------------------------------------------------------- */
4679 44 : double dfPixelsProcessed = 0.0;
4680 :
4681 88 : CPLStringList aosOptions(papszOptions);
4682 44 : aosOptions.SetNameValue("CASCADING", "YES");
4683 171 : for (int i = 0; i < nOverviews; ++i)
4684 : {
4685 127 : GDALRasterBand *poBaseBand = poSrcBand;
4686 127 : if (i != 0)
4687 83 : poBaseBand = papoOvrBands[i - 1];
4688 :
4689 127 : double dfPixels = papoOvrBands[i]->GetXSize() *
4690 127 : static_cast<double>(papoOvrBands[i]->GetYSize());
4691 :
4692 254 : void *pScaledProgressData = GDALCreateScaledProgress(
4693 : dfPixelsProcessed / dfTotalPixels,
4694 127 : (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
4695 : pProgressData);
4696 :
4697 254 : const CPLErr eErr = GDALRegenerateOverviewsEx(
4698 : poBaseBand, 1,
4699 127 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
4700 : pszResampling, GDALScaledProgress, pScaledProgressData,
4701 127 : aosOptions.List());
4702 127 : GDALDestroyScaledProgress(pScaledProgressData);
4703 :
4704 127 : if (eErr != CE_None)
4705 0 : return eErr;
4706 :
4707 127 : dfPixelsProcessed += dfPixels;
4708 :
4709 : // Only do the bit2grayscale promotion on the base band.
4710 127 : if (STARTS_WITH_CI(pszResampling,
4711 : "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
4712 8 : pszResampling = "AVERAGE";
4713 : }
4714 :
4715 44 : return CE_None;
4716 : }
4717 :
4718 : /************************************************************************/
4719 : /* GDALGetResampleFunction() */
4720 : /************************************************************************/
4721 :
4722 19267 : GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
4723 : int *pnRadius)
4724 : {
4725 19267 : if (pnRadius)
4726 19267 : *pnRadius = 0;
4727 19267 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
4728 532 : return GDALResampleChunk_Near;
4729 18735 : else if (STARTS_WITH_CI(pszResampling, "AVER") ||
4730 7507 : EQUAL(pszResampling, "RMS"))
4731 11293 : return GDALResampleChunk_AverageOrRMS;
4732 7442 : else if (EQUAL(pszResampling, "GAUSS"))
4733 : {
4734 26 : if (pnRadius)
4735 26 : *pnRadius = 1;
4736 26 : return GDALResampleChunk_Gauss;
4737 : }
4738 7416 : else if (EQUAL(pszResampling, "MODE"))
4739 142 : return GDALResampleChunk_Mode;
4740 7274 : else if (EQUAL(pszResampling, "CUBIC"))
4741 : {
4742 1647 : if (pnRadius)
4743 1647 : *pnRadius = GWKGetFilterRadius(GRA_Cubic);
4744 1647 : return GDALResampleChunk_Convolution;
4745 : }
4746 5627 : else if (EQUAL(pszResampling, "CUBICSPLINE"))
4747 : {
4748 60 : if (pnRadius)
4749 60 : *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
4750 60 : return GDALResampleChunk_Convolution;
4751 : }
4752 5567 : else if (EQUAL(pszResampling, "LANCZOS"))
4753 : {
4754 50 : if (pnRadius)
4755 50 : *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
4756 50 : return GDALResampleChunk_Convolution;
4757 : }
4758 5517 : else if (EQUAL(pszResampling, "BILINEAR"))
4759 : {
4760 5517 : if (pnRadius)
4761 5517 : *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
4762 5517 : return GDALResampleChunk_Convolution;
4763 : }
4764 : else
4765 : {
4766 0 : CPLError(
4767 : CE_Failure, CPLE_AppDefined,
4768 : "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
4769 : pszResampling);
4770 0 : return nullptr;
4771 : }
4772 : }
4773 :
4774 : /************************************************************************/
4775 : /* GDALGetOvrWorkDataType() */
4776 : /************************************************************************/
4777 :
4778 19149 : GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
4779 : GDALDataType eSrcDataType)
4780 : {
4781 19149 : if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
4782 : {
4783 666 : return eSrcDataType;
4784 : }
4785 18483 : else if (eSrcDataType == GDT_UInt8 &&
4786 17910 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4787 6780 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4788 5375 : EQUAL(pszResampling, "CUBICSPLINE") ||
4789 5355 : EQUAL(pszResampling, "LANCZOS") ||
4790 5348 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4791 : {
4792 17903 : return GDT_UInt8;
4793 : }
4794 580 : else if (eSrcDataType == GDT_UInt16 &&
4795 131 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4796 126 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4797 8 : EQUAL(pszResampling, "CUBICSPLINE") ||
4798 6 : EQUAL(pszResampling, "LANCZOS") ||
4799 3 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4800 : {
4801 131 : return GDT_UInt16;
4802 : }
4803 449 : else if (EQUAL(pszResampling, "GAUSS"))
4804 20 : return GDT_Float64;
4805 :
4806 429 : if (eSrcDataType == GDT_UInt8 || eSrcDataType == GDT_Int8 ||
4807 428 : eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
4808 : eSrcDataType == GDT_Float32)
4809 : {
4810 277 : return GDT_Float32;
4811 : }
4812 152 : return GDT_Float64;
4813 : }
4814 :
4815 : namespace
4816 : {
4817 : // Structure to hold a pointer to free with CPLFree()
4818 : struct PointerHolder
4819 : {
4820 : void *ptr = nullptr;
4821 :
4822 4067 : template <class T> explicit PointerHolder(T *&ptrIn) : ptr(ptrIn)
4823 : {
4824 4067 : ptrIn = nullptr;
4825 4067 : }
4826 :
4827 : template <class T>
4828 32 : explicit PointerHolder(std::unique_ptr<T, VSIFreeReleaser> ptrIn)
4829 32 : : ptr(ptrIn.release())
4830 : {
4831 32 : }
4832 :
4833 4099 : ~PointerHolder()
4834 4099 : {
4835 4099 : CPLFree(ptr);
4836 4099 : }
4837 :
4838 : PointerHolder(const PointerHolder &) = delete;
4839 : PointerHolder &operator=(const PointerHolder &) = delete;
4840 : };
4841 : } // namespace
4842 :
4843 : /************************************************************************/
4844 : /* GDALRegenerateOverviews() */
4845 : /************************************************************************/
4846 :
4847 : /**
4848 : * \brief Generate downsampled overviews.
4849 : *
4850 : * This function will generate one or more overview images from a base image
4851 : * using the requested downsampling algorithm. Its primary use is for
4852 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4853 : * used to generate downsampled images in one file from another outside the
4854 : * overview architecture.
4855 : *
4856 : * The output bands need to exist in advance.
4857 : *
4858 : * The full set of resampling algorithms is documented in
4859 : * GDALDataset::BuildOverviews().
4860 : *
4861 : * This function will honour properly NODATA_VALUES tuples (special dataset
4862 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4863 : * considered as the nodata value and not each value of the triplet
4864 : * independently per band.
4865 : *
4866 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4867 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4868 : * overview computation.
4869 : *
4870 : * @param hSrcBand the source (base level) band.
4871 : * @param nOverviewCount the number of downsampled bands being generated.
4872 : * @param pahOvrBands the list of downsampled bands to be generated.
4873 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4874 : * @param pfnProgress progress report function.
4875 : * @param pProgressData progress function callback data.
4876 : * @return CE_None on success or CE_Failure on failure.
4877 : */
4878 113 : CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
4879 : GDALRasterBandH *pahOvrBands,
4880 : const char *pszResampling,
4881 : GDALProgressFunc pfnProgress,
4882 : void *pProgressData)
4883 :
4884 : {
4885 113 : return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
4886 : pszResampling, pfnProgress, pProgressData,
4887 113 : nullptr);
4888 : }
4889 :
4890 : /************************************************************************/
4891 : /* GDALRegenerateOverviewsEx() */
4892 : /************************************************************************/
4893 :
// Factor converting a radius into the corresponding diameter. It is applied
// to the kernel radius and to the largest overview factor when sizing the
// source chunks in GDALRegenerateOverviewsEx().
4894 : constexpr int RADIUS_TO_DIAMETER = 2;
4895 :
4896 : /**
4897 : * \brief Generate downsampled overviews.
4898 : *
4899 : * This function will generate one or more overview images from a base image
4900 : * using the requested downsampling algorithm. Its primary use is for
4901 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4902 : * used to generate downsampled images in one file from another outside the
4903 : * overview architecture.
4904 : *
4905 : * The output bands need to exist in advance.
4906 : *
4907 : * The full set of resampling algorithms is documented in
4908 : * GDALDataset::BuildOverviews().
4909 : *
4910 : * This function will honour properly NODATA_VALUES tuples (special dataset
4911 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4912 : * considered as the nodata value and not each value of the triplet
4913 : * independently per band.
4914 : *
4915 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4916 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4917 : * overview computation.
4918 : *
4919 : * @param hSrcBand the source (base level) band.
4920 : * @param nOverviewCount the number of downsampled bands being generated.
4921 : * @param pahOvrBands the list of downsampled bands to be generated.
4922 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4923 : * @param pfnProgress progress report function.
4924 : * @param pProgressData progress function callback data.
4925 : * @param papszOptions NULL terminated list of options as key=value pairs, or
4926 : * NULL
4927 : * @return CE_None on success or CE_Failure on failure.
4928 : * @since GDAL 3.6
4929 : */
4930 781 : CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
4931 : GDALRasterBandH *pahOvrBands,
4932 : const char *pszResampling,
4933 : GDALProgressFunc pfnProgress,
4934 : void *pProgressData, CSLConstList papszOptions)
4935 :
4936 : {
4937 781 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
4938 781 : GDALRasterBand **papoOvrBands =
4939 : reinterpret_cast<GDALRasterBand **>(pahOvrBands);
4940 :
4941 781 : if (pfnProgress == nullptr)
4942 102 : pfnProgress = GDALDummyProgress;
4943 :
4944 781 : if (EQUAL(pszResampling, "NONE"))
4945 51 : return CE_None;
4946 :
4947 730 : int nKernelRadius = 0;
4948 : GDALResampleFunction pfnResampleFn =
4949 730 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
4950 :
4951 730 : if (pfnResampleFn == nullptr)
4952 0 : return CE_Failure;
4953 :
4954 : /* -------------------------------------------------------------------- */
4955 : /* Check color tables... */
4956 : /* -------------------------------------------------------------------- */
4957 730 : GDALColorTable *poColorTable = nullptr;
4958 :
4959 507 : if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
4960 1538 : EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
4961 312 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4962 : {
4963 9 : poColorTable = poSrcBand->GetColorTable();
4964 9 : if (poColorTable != nullptr)
4965 : {
4966 9 : if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
4967 : {
4968 0 : CPLError(CE_Warning, CPLE_AppDefined,
4969 : "Computing overviews on palette index raster bands "
4970 : "with a palette whose color interpretation is not RGB "
4971 : "will probably lead to unexpected results.");
4972 0 : poColorTable = nullptr;
4973 : }
4974 9 : else if (poColorTable->IsIdentity())
4975 : {
4976 0 : poColorTable = nullptr;
4977 : }
4978 : }
4979 : else
4980 : {
4981 0 : CPLError(CE_Warning, CPLE_AppDefined,
4982 : "Computing overviews on palette index raster bands "
4983 : "without a palette will probably lead to unexpected "
4984 : "results.");
4985 : }
4986 : }
4987 : // Not ready yet
4988 2109 : else if ((EQUAL(pszResampling, "CUBIC") ||
4989 667 : EQUAL(pszResampling, "CUBICSPLINE") ||
4990 667 : EQUAL(pszResampling, "LANCZOS") ||
4991 1468 : EQUAL(pszResampling, "BILINEAR")) &&
4992 80 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4993 : {
4994 0 : CPLError(CE_Warning, CPLE_AppDefined,
4995 : "Computing %s overviews on palette index raster bands "
4996 : "will probably lead to unexpected results.",
4997 : pszResampling);
4998 : }
4999 :
5000 : // If we have a nodata mask and we are doing something more complicated
5001 : // than nearest neighbouring, we have to fetch the nodata mask.
5002 :
5003 730 : GDALRasterBand *poMaskBand = nullptr;
5004 730 : bool bUseNoDataMask = false;
5005 730 : bool bCanUseCascaded = true;
5006 :
5007 730 : if (!STARTS_WITH_CI(pszResampling, "NEAR"))
5008 : {
5009 : // Special case if we are an alpha/mask band. We want it to be
5010 : // considered as the mask band to avoid alpha=0 to be taken into account
5011 : // in average computation.
5012 392 : if (poSrcBand->IsMaskBand())
5013 : {
5014 51 : poMaskBand = poSrcBand;
5015 51 : bUseNoDataMask = true;
5016 : }
5017 : else
5018 : {
5019 341 : poMaskBand = poSrcBand->GetMaskBand();
5020 341 : const int nMaskFlags = poSrcBand->GetMaskFlags();
5021 341 : bCanUseCascaded =
5022 341 : (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
5023 341 : bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
5024 : }
5025 : }
5026 :
5027 730 : int nHasNoData = 0;
5028 730 : const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
5029 730 : const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
5030 : const bool bPropagateNoData =
5031 730 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5032 :
5033 798 : if (poSrcBand->GetBand() == 1 && bUseNoDataMask &&
5034 68 : CSLFetchNameValue(papszOptions, "CASCADING") == nullptr)
5035 : {
5036 112 : std::string osDetailMessage;
5037 56 : if (poSrcBand->HasConflictingMaskSources(&osDetailMessage, false))
5038 : {
5039 2 : CPLError(
5040 : CE_Warning, CPLE_AppDefined, "%s%s", osDetailMessage.c_str(),
5041 : bHasNoData
5042 : ? "Only the nodata value will be taken into account."
5043 : : "Only the first listed one will be taken into account.");
5044 : }
5045 : }
5046 :
5047 : /* -------------------------------------------------------------------- */
5048 : /* If we are operating on multiple overviews, and using */
5049 : /* averaging, lets do them in cascading order to reduce the */
5050 : /* amount of computation. */
5051 : /* -------------------------------------------------------------------- */
5052 :
5053 : // In case the mask may be computed from another band of the dataset,
5054 : // we can't use cascaded generation, as the computation of the overviews
5055 : // of the band used for the mask band may not have yet occurred (#3033).
5056 730 : if ((STARTS_WITH_CI(pszResampling, "AVER") ||
5057 507 : EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
5058 476 : EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
5059 422 : EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
5060 730 : EQUAL(pszResampling, "MODE")) &&
5061 44 : nOverviewCount > 1 && bCanUseCascaded)
5062 44 : return GDALRegenerateCascadingOverviews(
5063 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
5064 44 : pProgressData, papszOptions);
5065 :
5066 : /* -------------------------------------------------------------------- */
5067 : /* Setup one horizontal swath to read from the raw buffer. */
5068 : /* -------------------------------------------------------------------- */
5069 686 : int nFRXBlockSize = 0;
5070 686 : int nFRYBlockSize = 0;
5071 686 : poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
5072 :
5073 686 : const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
5074 1034 : const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
5075 984 : EQUAL(pszResampling, "MODE") ||
5076 298 : !GDALDataTypeIsComplex(eSrcDataType);
5077 : const GDALDataType eWrkDataType =
5078 : bUseGenericResampleFn
5079 686 : ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
5080 686 : : GDT_CFloat32;
5081 :
5082 686 : const int nWidth = poSrcBand->GetXSize();
5083 686 : const int nHeight = poSrcBand->GetYSize();
5084 :
5085 686 : int nMaxOvrFactor = 1;
5086 1491 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
5087 : {
5088 805 : const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
5089 805 : const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
5090 805 : nMaxOvrFactor = std::max(
5091 : nMaxOvrFactor,
5092 805 : static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
5093 805 : nMaxOvrFactor = std::max(
5094 : nMaxOvrFactor,
5095 805 : static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
5096 : }
5097 :
5098 686 : int nFullResYChunk = nFRYBlockSize;
5099 686 : int nMaxChunkYSizeQueried = 0;
5100 :
5101 : const auto UpdateChunkHeightAndGetChunkSize =
5102 9220 : [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
5103 74721 : eWrkDataType, nWidth]()
5104 : {
5105 : // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
5106 : // + nFullResYChunk) / nMaxOvrFactor)
5107 9220 : if (nMaxOvrFactor > INT_MAX / RADIUS_TO_DIAMETER)
5108 : {
5109 1 : return GINTBIG_MAX;
5110 : }
5111 9219 : nFullResYChunk =
5112 9219 : std::max(nFullResYChunk, RADIUS_TO_DIAMETER * nMaxOvrFactor);
5113 9219 : if ((nKernelRadius > 0 &&
5114 970 : nMaxOvrFactor > INT_MAX / (RADIUS_TO_DIAMETER * nKernelRadius)) ||
5115 9219 : nFullResYChunk >
5116 9219 : INT_MAX - RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor)
5117 : {
5118 0 : return GINTBIG_MAX;
5119 : }
5120 9219 : nMaxChunkYSizeQueried =
5121 9219 : nFullResYChunk + RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor;
5122 9219 : if (GDALGetDataTypeSizeBytes(eWrkDataType) >
5123 9219 : std::numeric_limits<int64_t>::max() /
5124 9219 : (static_cast<int64_t>(nMaxChunkYSizeQueried) * nWidth))
5125 : {
5126 1 : return GINTBIG_MAX;
5127 : }
5128 9218 : return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
5129 9218 : nMaxChunkYSizeQueried * nWidth;
5130 686 : };
5131 :
5132 : const char *pszChunkYSize =
5133 686 : CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
5134 : #ifndef __COVERITY__
5135 : // Only configurable for debug / testing
5136 686 : if (pszChunkYSize)
5137 : {
5138 0 : nFullResYChunk = atoi(pszChunkYSize);
5139 : }
5140 : #endif
5141 :
5142 : // Only configurable for debug / testing
5143 : const int nChunkMaxSize =
5144 686 : atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
5145 :
5146 686 : auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
5147 686 : if (nChunkSize > nChunkMaxSize)
5148 : {
5149 15 : if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
5150 44 : !GDALDataTypeIsComplex(eSrcDataType) &&
5151 14 : (!STARTS_WITH_CI(pszResampling, "AVER") ||
5152 2 : EQUAL(pszResampling, "AVERAGE")))
5153 : {
5154 : // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
5155 : // which uses a block based strategy, which is much less memory
5156 : // hungry.
5157 14 : return GDALRegenerateOverviewsMultiBand(
5158 : 1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
5159 14 : pfnProgress, pProgressData, papszOptions);
5160 : }
5161 1 : else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
5162 : {
5163 0 : return GDALRegenerateCascadingOverviews(
5164 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
5165 0 : pfnProgress, pProgressData, papszOptions);
5166 : }
5167 : }
5168 671 : else if (pszChunkYSize == nullptr)
5169 : {
5170 : // Try to get as close as possible to nChunkMaxSize
5171 9205 : while (nChunkSize < nChunkMaxSize / 2)
5172 : {
5173 8534 : nFullResYChunk *= 2;
5174 8534 : nChunkSize = UpdateChunkHeightAndGetChunkSize();
5175 : }
5176 : }
5177 :
5178 : // Describes one resampling job (one source chunk resampled for one destination band) together with the state used to signal its completion
5179 : struct OvrJob
5180 : {
5181 : // Holders of the buffers to free when the job is finished (the source ones are shared_ptr, so they may be shared between several jobs)
5182 : std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
5183 : std::shared_ptr<PointerHolder> oSrcBufferHolder{};
5184 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
5185 : // Destination band of this job
5186 : GDALRasterBand *poDstBand = nullptr;
5187 :
5188 : // Input parameters of pfnResampleFn (GDALResampleChunkC32R() is called instead when bUseGenericResampleFn is false)
5189 : GDALResampleFunction pfnResampleFn = nullptr;
5190 : int nSrcWidth = 0;
5191 : int nSrcHeight = 0;
5192 : int nDstWidth = 0;
5193 : GDALOverviewResampleArgs args{};
5194 : const void *pChunk = nullptr;
5195 : bool bUseGenericResampleFn = false;
5196 :
5197 : // Output values of resampling function (eErr defaults to CE_Failure until a result is stored)
5198 : CPLErr eErr = CE_Failure;
5199 : void *pDstBuffer = nullptr;
5200 : GDALDataType eDstBufferDataType = GDT_Unknown;
5201 : // Store a reference to the holder of the source mask buffer
5202 0 : void SetSrcMaskBufferHolder(
5203 : const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
5204 : {
5205 0 : oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
5206 0 : }
5207 : // Store a reference to the holder of the source buffer
5208 0 : void SetSrcBufferHolder(
5209 : const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
5210 : {
5211 0 : oSrcBufferHolder = oSrcBufferHolderIn;
5212 0 : }
5213 : // Flag the job as finished while holding the mutex, and notify the condition variable
5214 774 : void NotifyFinished()
5215 : {
5216 1548 : std::lock_guard guard(mutex);
5217 774 : bFinished = true;
5218 774 : cv.notify_one();
5219 774 : }
5220 : // Return, while holding the mutex, whether the job has been flagged as finished (does not wait)
5221 0 : bool IsFinished()
5222 : {
5223 0 : std::lock_guard guard(mutex);
5224 0 : return bFinished;
5225 : }
5226 : // Block until the job has been flagged as finished; bFinished is re-checked after each wakeup
5227 0 : void WaitFinished()
5228 : {
5229 0 : std::unique_lock oGuard(mutex);
5230 0 : while (!bFinished)
5231 : {
5232 0 : cv.wait(oGuard);
5233 : }
5234 0 : }
5235 :
5236 : private:
5237 : // Synchronization: bFinished is read and written under mutex, and cv is notified when it is set to true
5238 : bool bFinished = false;
5239 : std::mutex mutex{};
5240 : std::condition_variable cv{};
5241 : };
5242 :
5243 : // Thread function to resample
5244 774 : const auto JobResampleFunc = [](void *pData)
5245 : {
5246 774 : OvrJob *poJob = static_cast<OvrJob *>(pData);
5247 :
5248 774 : if (poJob->bUseGenericResampleFn)
5249 : {
5250 772 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
5251 : &(poJob->pDstBuffer),
5252 : &(poJob->eDstBufferDataType));
5253 : }
5254 : else
5255 : {
5256 2 : poJob->eErr = GDALResampleChunkC32R(
5257 : poJob->nSrcWidth, poJob->nSrcHeight,
5258 2 : static_cast<const float *>(poJob->pChunk),
5259 : poJob->args.nChunkYOff, poJob->args.nChunkYSize,
5260 : poJob->args.nDstYOff, poJob->args.nDstYOff2,
5261 : poJob->args.nOvrXSize, poJob->args.nOvrYSize,
5262 : &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
5263 : poJob->args.pszResampling);
5264 : }
5265 :
5266 774 : auto pDstBuffer = poJob->pDstBuffer;
5267 774 : poJob->oDstBufferHolder = std::make_unique<PointerHolder>(pDstBuffer);
5268 :
5269 774 : poJob->NotifyFinished();
5270 774 : };
5271 :
5272 : // Function to write resample data to target band
5273 774 : const auto WriteJobData = [](const OvrJob *poJob)
5274 : {
5275 1548 : return poJob->poDstBand->RasterIO(
5276 774 : GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
5277 774 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
5278 774 : poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
5279 774 : poJob->eDstBufferDataType, 0, 0, nullptr);
5280 : };
5281 :
5282 : // Wait for completion of oldest job and serialize it
5283 : const auto WaitAndFinalizeOldestJob =
5284 0 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
5285 : {
5286 0 : auto poOldestJob = jobList.front().get();
5287 0 : poOldestJob->WaitFinished();
5288 0 : CPLErr l_eErr = poOldestJob->eErr;
5289 0 : if (l_eErr == CE_None)
5290 : {
5291 0 : l_eErr = WriteJobData(poOldestJob);
5292 : }
5293 :
5294 0 : jobList.pop_front();
5295 0 : return l_eErr;
5296 : };
5297 :
5298 : // Queue of jobs
5299 1344 : std::list<std::unique_ptr<OvrJob>> jobList;
5300 :
5301 672 : GByte *pabyChunkNodataMask = nullptr;
5302 672 : void *pChunk = nullptr;
5303 :
5304 672 : const int nThreads = GDALGetNumThreads(GDAL_DEFAULT_MAX_THREAD_COUNT,
5305 : /* bDefaultToAllCPUs=*/false);
5306 : auto poThreadPool =
5307 672 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5308 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5309 1344 : : std::unique_ptr<CPLJobQueue>(nullptr);
5310 :
5311 : /* -------------------------------------------------------------------- */
5312 : /* Loop over image operating on chunks. */
5313 : /* -------------------------------------------------------------------- */
5314 672 : int nChunkYOff = 0;
5315 672 : CPLErr eErr = CE_None;
5316 :
5317 1349 : for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
5318 677 : nChunkYOff += nFullResYChunk)
5319 : {
5320 677 : if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
5321 : pProgressData))
5322 : {
5323 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5324 0 : eErr = CE_Failure;
5325 : }
5326 :
5327 677 : if (nFullResYChunk + nChunkYOff > nHeight)
5328 669 : nFullResYChunk = nHeight - nChunkYOff;
5329 :
5330 677 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
5331 677 : int nChunkYSizeQueried =
5332 677 : nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
5333 677 : if (nChunkYOffQueried < 0)
5334 : {
5335 83 : nChunkYSizeQueried += nChunkYOffQueried;
5336 83 : nChunkYOffQueried = 0;
5337 : }
5338 677 : if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
5339 83 : nChunkYSizeQueried = nHeight - nChunkYOffQueried;
5340 :
5341 : // Avoid accumulating too many tasks and exhausting RAM
5342 : // Try to complete already finished jobs
5343 677 : while (eErr == CE_None && !jobList.empty())
5344 : {
5345 0 : auto poOldestJob = jobList.front().get();
5346 0 : if (!poOldestJob->IsFinished())
5347 0 : break;
5348 0 : eErr = poOldestJob->eErr;
5349 0 : if (eErr == CE_None)
5350 : {
5351 0 : eErr = WriteJobData(poOldestJob);
5352 : }
5353 :
5354 0 : jobList.pop_front();
5355 : }
5356 :
5357 : // And in case we have saturated the number of threads,
5358 : // wait for completion of tasks to go below the threshold.
5359 1354 : while (eErr == CE_None &&
5360 677 : jobList.size() >= static_cast<size_t>(nThreads))
5361 : {
5362 0 : eErr = WaitAndFinalizeOldestJob(jobList);
5363 : }
5364 :
5365 : // (Re)allocate buffers if needed
5366 677 : if (pChunk == nullptr)
5367 : {
5368 672 : pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
5369 : nMaxChunkYSizeQueried, nWidth);
5370 : }
5371 677 : if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
5372 : {
5373 139 : pabyChunkNodataMask = static_cast<GByte *>(
5374 139 : VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
5375 : }
5376 :
5377 677 : if (pChunk == nullptr ||
5378 139 : (bUseNoDataMask && pabyChunkNodataMask == nullptr))
5379 : {
5380 0 : CPLFree(pChunk);
5381 0 : CPLFree(pabyChunkNodataMask);
5382 0 : return CE_Failure;
5383 : }
5384 :
5385 : // Read chunk.
5386 677 : if (eErr == CE_None)
5387 677 : eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
5388 : nChunkYSizeQueried, pChunk, nWidth,
5389 : nChunkYSizeQueried, eWrkDataType, 0, 0,
5390 : nullptr);
5391 677 : if (eErr == CE_None && bUseNoDataMask)
5392 139 : eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
5393 : nChunkYSizeQueried, pabyChunkNodataMask,
5394 : nWidth, nChunkYSizeQueried, GDT_UInt8,
5395 : 0, 0, nullptr);
5396 :
5397 : // Special case to promote 1bit data to 8bit 0/255 values.
5398 677 : if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
5399 : {
5400 9 : if (eWrkDataType == GDT_Float32)
5401 : {
5402 0 : float *pafChunk = static_cast<float *>(pChunk);
5403 0 : for (size_t i = 0;
5404 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5405 : {
5406 0 : if (pafChunk[i] == 1.0f)
5407 0 : pafChunk[i] = 255.0f;
5408 : }
5409 : }
5410 9 : else if (eWrkDataType == GDT_UInt8)
5411 : {
5412 9 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
5413 168417 : for (size_t i = 0;
5414 168417 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5415 : {
5416 168408 : if (pabyChunk[i] == 1)
5417 127437 : pabyChunk[i] = 255;
5418 : }
5419 : }
5420 0 : else if (eWrkDataType == GDT_UInt16)
5421 : {
5422 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
5423 0 : for (size_t i = 0;
5424 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5425 : {
5426 0 : if (pasChunk[i] == 1)
5427 0 : pasChunk[i] = 255;
5428 : }
5429 : }
5430 0 : else if (eWrkDataType == GDT_Float64)
5431 : {
5432 0 : double *padfChunk = static_cast<double *>(pChunk);
5433 0 : for (size_t i = 0;
5434 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5435 : {
5436 0 : if (padfChunk[i] == 1.0)
5437 0 : padfChunk[i] = 255.0;
5438 : }
5439 : }
5440 : else
5441 : {
5442 0 : CPLAssert(false);
5443 : }
5444 : }
5445 668 : else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
5446 : {
5447 0 : if (eWrkDataType == GDT_Float32)
5448 : {
5449 0 : float *pafChunk = static_cast<float *>(pChunk);
5450 0 : for (size_t i = 0;
5451 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5452 : {
5453 0 : if (pafChunk[i] == 1.0f)
5454 0 : pafChunk[i] = 0.0f;
5455 0 : else if (pafChunk[i] == 0.0f)
5456 0 : pafChunk[i] = 255.0f;
5457 : }
5458 : }
5459 0 : else if (eWrkDataType == GDT_UInt8)
5460 : {
5461 0 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
5462 0 : for (size_t i = 0;
5463 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5464 : {
5465 0 : if (pabyChunk[i] == 1)
5466 0 : pabyChunk[i] = 0;
5467 0 : else if (pabyChunk[i] == 0)
5468 0 : pabyChunk[i] = 255;
5469 : }
5470 : }
5471 0 : else if (eWrkDataType == GDT_UInt16)
5472 : {
5473 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
5474 0 : for (size_t i = 0;
5475 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5476 : {
5477 0 : if (pasChunk[i] == 1)
5478 0 : pasChunk[i] = 0;
5479 0 : else if (pasChunk[i] == 0)
5480 0 : pasChunk[i] = 255;
5481 : }
5482 : }
5483 0 : else if (eWrkDataType == GDT_Float64)
5484 : {
5485 0 : double *padfChunk = static_cast<double *>(pChunk);
5486 0 : for (size_t i = 0;
5487 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5488 : {
5489 0 : if (padfChunk[i] == 1.0)
5490 0 : padfChunk[i] = 0.0;
5491 0 : else if (padfChunk[i] == 0.0)
5492 0 : padfChunk[i] = 255.0;
5493 : }
5494 : }
5495 : else
5496 : {
5497 0 : CPLAssert(false);
5498 : }
5499 : }
5500 :
5501 677 : auto pChunkRaw = pChunk;
5502 677 : auto pabyChunkNodataMaskRaw = pabyChunkNodataMask;
5503 677 : std::shared_ptr<PointerHolder> oSrcBufferHolder;
5504 677 : std::shared_ptr<PointerHolder> oSrcMaskBufferHolder;
5505 677 : if (poJobQueue)
5506 : {
5507 0 : oSrcBufferHolder = std::make_shared<PointerHolder>(pChunk);
5508 : oSrcMaskBufferHolder =
5509 0 : std::make_shared<PointerHolder>(pabyChunkNodataMask);
5510 : }
5511 :
5512 1451 : for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
5513 : ++iOverview)
5514 : {
5515 774 : GDALRasterBand *poDstBand = papoOvrBands[iOverview];
5516 774 : const int nDstWidth = poDstBand->GetXSize();
5517 774 : const int nDstHeight = poDstBand->GetYSize();
5518 :
5519 774 : const double dfXRatioDstToSrc =
5520 774 : static_cast<double>(nWidth) / nDstWidth;
5521 774 : const double dfYRatioDstToSrc =
5522 774 : static_cast<double>(nHeight) / nDstHeight;
5523 :
5524 : /* --------------------------------------------------------------------
5525 : */
5526 : /* Figure out the line to start writing to, and the first line
5527 : */
5528 : /* to not write to. In theory this approach should ensure that
5529 : */
5530 : /* every output line will be written if all input chunks are */
5531 : /* processed. */
5532 : /* --------------------------------------------------------------------
5533 : */
5534 774 : int nDstYOff =
5535 774 : static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
5536 774 : if (nDstYOff == nDstHeight)
5537 0 : continue;
5538 774 : int nDstYOff2 = static_cast<int>(
5539 774 : 0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
5540 :
5541 774 : if (nChunkYOff + nFullResYChunk == nHeight)
5542 767 : nDstYOff2 = nDstHeight;
5543 : #if DEBUG_VERBOSE
5544 : CPLDebug("GDAL",
5545 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
5546 : nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
5547 : nDstWidth, nDstYOff2 - nDstYOff);
5548 : #endif
5549 :
5550 1548 : auto poJob = std::make_unique<OvrJob>();
5551 774 : poJob->pfnResampleFn = pfnResampleFn;
5552 774 : poJob->bUseGenericResampleFn = bUseGenericResampleFn;
5553 774 : poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
5554 774 : poJob->args.nOvrXSize = poDstBand->GetXSize();
5555 774 : poJob->args.nOvrYSize = poDstBand->GetYSize();
5556 : const char *pszNBITS =
5557 774 : poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
5558 774 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
5559 774 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
5560 774 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
5561 774 : poJob->args.eWrkDataType = eWrkDataType;
5562 774 : poJob->pChunk = pChunkRaw;
5563 774 : poJob->args.pabyChunkNodataMask = pabyChunkNodataMaskRaw;
5564 774 : poJob->nSrcWidth = nWidth;
5565 774 : poJob->nSrcHeight = nHeight;
5566 774 : poJob->args.nChunkXOff = 0;
5567 774 : poJob->args.nChunkXSize = nWidth;
5568 774 : poJob->args.nChunkYOff = nChunkYOffQueried;
5569 774 : poJob->args.nChunkYSize = nChunkYSizeQueried;
5570 774 : poJob->nDstWidth = nDstWidth;
5571 774 : poJob->args.nDstXOff = 0;
5572 774 : poJob->args.nDstXOff2 = nDstWidth;
5573 774 : poJob->args.nDstYOff = nDstYOff;
5574 774 : poJob->args.nDstYOff2 = nDstYOff2;
5575 774 : poJob->poDstBand = poDstBand;
5576 774 : poJob->args.pszResampling = pszResampling;
5577 774 : poJob->args.bHasNoData = bHasNoData;
5578 774 : poJob->args.dfNoDataValue = dfNoDataValue;
5579 774 : poJob->args.poColorTable = poColorTable;
5580 774 : poJob->args.eSrcDataType = eSrcDataType;
5581 774 : poJob->args.bPropagateNoData = bPropagateNoData;
5582 :
5583 774 : if (poJobQueue)
5584 : {
5585 0 : poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
5586 0 : poJob->SetSrcBufferHolder(oSrcBufferHolder);
5587 0 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
5588 0 : jobList.emplace_back(std::move(poJob));
5589 : }
5590 : else
5591 : {
5592 774 : JobResampleFunc(poJob.get());
5593 774 : eErr = poJob->eErr;
5594 774 : if (eErr == CE_None)
5595 : {
5596 774 : eErr = WriteJobData(poJob.get());
5597 : }
5598 : }
5599 : }
5600 : }
5601 :
5602 672 : VSIFree(pChunk);
5603 672 : VSIFree(pabyChunkNodataMask);
5604 :
5605 : // Wait for all pending jobs to complete
5606 672 : while (!jobList.empty())
5607 : {
5608 0 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5609 0 : if (l_eErr != CE_None && eErr == CE_None)
5610 0 : eErr = l_eErr;
5611 : }
5612 :
5613 : /* -------------------------------------------------------------------- */
5614 : /* Renormalize overview mean / stddev if needed. */
5615 : /* -------------------------------------------------------------------- */
5616 672 : if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
5617 : {
5618 0 : GDALOverviewMagnitudeCorrection(
5619 : poSrcBand, nOverviewCount,
5620 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
5621 : GDALDummyProgress, nullptr);
5622 : }
5623 :
5624 : /* -------------------------------------------------------------------- */
5625 : /* It can be important to flush out data to overviews. */
5626 : /* -------------------------------------------------------------------- */
5627 1439 : for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
5628 : ++iOverview)
5629 : {
5630 767 : eErr = papoOvrBands[iOverview]->FlushCache(false);
5631 : }
5632 :
5633 672 : if (eErr == CE_None)
5634 672 : pfnProgress(1.0, nullptr, pProgressData);
5635 :
5636 672 : return eErr;
5637 : }
5638 :
5639 : /************************************************************************/
5640 : /* GDALRegenerateOverviewsMultiBand() */
5641 : /************************************************************************/
5642 :
5643 : /**
5644 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5645 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5646 : *
5647 : * This function will generate one or more overview images from a base
5648 : * image using the requested downsampling algorithm. Its primary use
5649 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
5650 : * can also be used to generate downsampled images in one file from another
5651 : * outside the overview architecture.
5652 : *
5653 : * The output bands need to exist in advance and share the same characteristics
5654 : * (type, dimensions)
5655 : *
5656 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5657 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE"
5658 : *
5659 : * It does not support color tables or complex data types.
5660 : *
5661 : * The pseudo-algorithm used by the function is :
5662 : * for each overview
5663 : * iterate on lines of the source by a step of deltay
5664 : * iterate on columns of the source by a step of deltax
5665 : * read the source data of size deltax * deltay for all the bands
5666 : * generate the corresponding overview block for all the bands
5667 : *
5668 : * This function will properly honour NODATA_VALUES tuples (special dataset
5669 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5670 : * considered as the nodata value and not each value of the triplet
5671 : * independently per band.
5672 : *
5673 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
5674 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
5675 : * overview computation.
5676 : *
5677 : * @param nBands the number of bands, size of papoSrcBands and size of
5678 : * first dimension of papapoOverviewBands
5679 : * @param papoSrcBands the list of source bands to downsample
5680 : * @param nOverviews the number of downsampled overview levels being generated.
5681 : * @param papapoOverviewBands two-dimensional array of bands. First dimension is
5682 : * indexed by nBands. Second dimension is indexed by
5683 : * nOverviews.
5684 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5685 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
5686 : * @param pfnProgress progress report function.
5687 : * @param pProgressData progress function callback data.
5688 : * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
5689 : * key=value pairs, or NULL
5690 : * Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
5691 : * options can be specified to express that overviews should
5692 : * be regenerated only in the specified subset of the source
5693 : * dataset.
5694 : * @return CE_None on success or CE_Failure on failure.
5695 : */
5696 :
5697 389 : CPLErr GDALRegenerateOverviewsMultiBand(
5698 : int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
5699 : GDALRasterBand *const *const *papapoOverviewBands,
5700 : const char *pszResampling, GDALProgressFunc pfnProgress,
5701 : void *pProgressData, CSLConstList papszOptions)
5702 : {
5703 389 : CPL_IGNORE_RET_VAL(papszOptions);
5704 :
5705 389 : if (pfnProgress == nullptr)
5706 11 : pfnProgress = GDALDummyProgress;
5707 :
5708 389 : if (EQUAL(pszResampling, "NONE") || nBands == 0 || nOverviews == 0)
5709 3 : return CE_None;
5710 :
5711 : // Sanity checks.
5712 386 : if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
5713 192 : !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
5714 83 : !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
5715 25 : !EQUAL(pszResampling, "CUBICSPLINE") &&
5716 24 : !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
5717 5 : !EQUAL(pszResampling, "MODE"))
5718 : {
5719 0 : CPLError(CE_Failure, CPLE_NotSupported,
5720 : "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
5721 : "not supported",
5722 : pszResampling);
5723 0 : return CE_Failure;
5724 : }
5725 :
5726 386 : int nKernelRadius = 0;
5727 : GDALResampleFunction pfnResampleFn =
5728 386 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
5729 386 : if (pfnResampleFn == nullptr)
5730 0 : return CE_Failure;
5731 :
5732 386 : const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
5733 386 : const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
5734 386 : if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
5735 0 : return CE_None;
5736 386 : GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
5737 66233 : for (int iBand = 1; iBand < nBands; ++iBand)
5738 : {
5739 131694 : if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
5740 65847 : papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
5741 : {
5742 0 : CPLError(
5743 : CE_Failure, CPLE_NotSupported,
5744 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5745 : "have the same dimensions");
5746 0 : return CE_Failure;
5747 : }
5748 65847 : if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
5749 : {
5750 0 : CPLError(
5751 : CE_Failure, CPLE_NotSupported,
5752 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5753 : "have the same data type");
5754 0 : return CE_Failure;
5755 : }
5756 : }
5757 :
5758 1028 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5759 : {
5760 642 : const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
5761 642 : const int nDstWidth = poOvrFirstBand->GetXSize();
5762 642 : const int nDstHeight = poOvrFirstBand->GetYSize();
5763 66749 : for (int iBand = 1; iBand < nBands; ++iBand)
5764 : {
5765 66107 : const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
5766 132214 : if (poOvrBand->GetXSize() != nDstWidth ||
5767 66107 : poOvrBand->GetYSize() != nDstHeight)
5768 : {
5769 0 : CPLError(
5770 : CE_Failure, CPLE_NotSupported,
5771 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5772 : "of the same level must have the same dimensions");
5773 0 : return CE_Failure;
5774 : }
5775 66107 : if (poOvrBand->GetRasterDataType() != eDataType)
5776 : {
5777 0 : CPLError(
5778 : CE_Failure, CPLE_NotSupported,
5779 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5780 : "must have the same data type as the source bands");
5781 0 : return CE_Failure;
5782 : }
5783 : }
5784 : }
5785 :
5786 : // First pass to compute the total number of pixels to write.
5787 386 : double dfTotalPixelCount = 0;
5788 386 : const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
5789 386 : const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
5790 386 : const int nSrcXSize = atoi(CSLFetchNameValueDef(
5791 : papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
5792 386 : const int nSrcYSize = atoi(CSLFetchNameValueDef(
5793 : papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
5794 1028 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5795 : {
5796 642 : dfTotalPixelCount +=
5797 1284 : static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
5798 642 : papapoOverviewBands[0][iOverview]->GetXSize() *
5799 1284 : static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
5800 642 : papapoOverviewBands[0][iOverview]->GetYSize();
5801 : }
5802 :
5803 : const GDALDataType eWrkDataType =
5804 386 : GDALGetOvrWorkDataType(pszResampling, eDataType);
5805 : const int nWrkDataTypeSize =
5806 386 : std::max(1, GDALGetDataTypeSizeBytes(eWrkDataType));
5807 :
5808 386 : const bool bIsMask = papoSrcBands[0]->IsMaskBand();
5809 :
5810 : // If we have a nodata mask and we are doing something more complicated
5811 : // than nearest neighbouring, we have to fetch the nodata mask.
5812 : const bool bUseNoDataMask =
5813 572 : !STARTS_WITH_CI(pszResampling, "NEAR") &&
5814 186 : (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
5815 :
5816 772 : std::vector<bool> abHasNoData(nBands);
5817 772 : std::vector<double> adfNoDataValue(nBands);
5818 :
5819 66619 : for (int iBand = 0; iBand < nBands; ++iBand)
5820 : {
5821 66233 : int nHasNoData = 0;
5822 132466 : adfNoDataValue[iBand] =
5823 66233 : papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
5824 66233 : abHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
5825 : }
5826 :
5827 772 : std::string osDetailMessage;
5828 438 : if (bUseNoDataMask &&
5829 52 : papoSrcBands[0]->HasConflictingMaskSources(&osDetailMessage, false))
5830 : {
5831 9 : CPLError(CE_Warning, CPLE_AppDefined, "%s%s", osDetailMessage.c_str(),
5832 18 : abHasNoData[0]
5833 : ? "Only the nodata value will be taken into account."
5834 9 : : "Only the first listed one will be taken into account.");
5835 : }
5836 :
5837 : const bool bPropagateNoData =
5838 386 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5839 :
5840 386 : const int nThreads = GDALGetNumThreads(GDAL_DEFAULT_MAX_THREAD_COUNT,
5841 : /* bDefaultToAllCPUs=*/false);
5842 : auto poThreadPool =
5843 386 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5844 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5845 772 : : std::unique_ptr<CPLJobQueue>(nullptr);
5846 :
5847 : // Only configurable for debug / testing
5848 386 : const GIntBig nChunkMaxSize = []() -> GIntBig
5849 : {
5850 : const char *pszVal =
5851 386 : CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", nullptr);
5852 386 : if (pszVal)
5853 : {
5854 15 : GIntBig nRet = 0;
5855 15 : CPLParseMemorySize(pszVal, &nRet, nullptr);
5856 15 : return std::max<GIntBig>(100, nRet);
5857 : }
5858 371 : return 10 * 1024 * 1024;
5859 386 : }();
5860 :
5861 : // Only configurable for debug / testing
5862 386 : const GIntBig nChunkMaxSizeForTempFile = []() -> GIntBig
5863 : {
5864 386 : const char *pszVal = CPLGetConfigOption(
5865 : "GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE", nullptr);
5866 386 : if (pszVal)
5867 : {
5868 14 : GIntBig nRet = 0;
5869 14 : CPLParseMemorySize(pszVal, &nRet, nullptr);
5870 14 : return std::max<GIntBig>(100, nRet);
5871 : }
5872 372 : const auto nUsableRAM = CPLGetUsablePhysicalRAM();
5873 372 : if (nUsableRAM > 0)
5874 372 : return nUsableRAM / 10;
5875 : // Select a value to be able to at least downsample by 2 for a RGB
5876 : // 1024x1024 tiled output: (2 * 1024 + 2) * (2 * 1024 + 2) * 3 = 12 MB
5877 0 : return 100 * 1024 * 1024;
5878 386 : }();
5879 :
5880 : // Second pass to do the real job.
5881 386 : double dfCurPixelCount = 0;
5882 386 : CPLErr eErr = CE_None;
5883 1022 : for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
5884 : ++iOverview)
5885 : {
5886 641 : int iSrcOverview = -1; // -1 means the source bands.
5887 :
5888 : const int nDstTotalWidth =
5889 641 : papapoOverviewBands[0][iOverview]->GetXSize();
5890 : const int nDstTotalHeight =
5891 641 : papapoOverviewBands[0][iOverview]->GetYSize();
5892 :
5893 : // Compute the coordinates of the target region to refresh
5894 641 : constexpr double EPS = 1e-8;
5895 641 : const int nDstXOffStart = static_cast<int>(
5896 641 : static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
5897 : EPS);
5898 : const int nDstXOffEnd =
5899 1282 : std::min(static_cast<int>(
5900 641 : std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
5901 641 : nToplevelSrcWidth * nDstTotalWidth -
5902 : EPS)),
5903 641 : nDstTotalWidth);
5904 641 : const int nDstWidth = nDstXOffEnd - nDstXOffStart;
5905 641 : const int nDstYOffStart =
5906 641 : static_cast<int>(static_cast<double>(nSrcYOff) /
5907 641 : nToplevelSrcHeight * nDstTotalHeight +
5908 : EPS);
5909 : const int nDstYOffEnd =
5910 1282 : std::min(static_cast<int>(
5911 641 : std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
5912 641 : nToplevelSrcHeight * nDstTotalHeight -
5913 : EPS)),
5914 641 : nDstTotalHeight);
5915 641 : const int nDstHeight = nDstYOffEnd - nDstYOffStart;
5916 :
5917 : // Try to use previous level of overview as the source to compute
5918 : // the next level.
5919 641 : int nSrcWidth = nToplevelSrcWidth;
5920 641 : int nSrcHeight = nToplevelSrcHeight;
5921 896 : if (iOverview > 0 &&
5922 255 : papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
5923 : {
5924 247 : nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
5925 247 : nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
5926 247 : iSrcOverview = iOverview - 1;
5927 : }
5928 :
5929 641 : const double dfXRatioDstToSrc =
5930 641 : static_cast<double>(nSrcWidth) / nDstTotalWidth;
5931 641 : const double dfYRatioDstToSrc =
5932 641 : static_cast<double>(nSrcHeight) / nDstTotalHeight;
5933 :
5934 : const int nOvrFactor =
5935 1923 : std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
5936 641 : static_cast<int>(0.5 + dfYRatioDstToSrc)));
5937 :
5938 641 : int nDstChunkXSize = 0;
5939 641 : int nDstChunkYSize = 0;
5940 641 : papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
5941 : &nDstChunkYSize);
5942 :
5943 641 : constexpr int PIXEL_MARGIN = 2;
5944 : // Try to extend the chunk size so that the memory needed to acquire
5945 : // source pixels goes up to 10 MB.
5946 : // This can help for drivers that support multi-threaded reading
5947 641 : const int nFullResYChunk = static_cast<int>(std::min<double>(
5948 641 : nSrcHeight, PIXEL_MARGIN + nDstChunkYSize * dfYRatioDstToSrc));
5949 641 : const int nFullResYChunkQueried = static_cast<int>(std::min<int64_t>(
5950 1282 : nSrcHeight,
5951 1282 : nFullResYChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5952 641 : nKernelRadius * nOvrFactor));
5953 875 : while (nDstChunkXSize < nDstWidth)
5954 : {
5955 253 : constexpr int INCREASE_FACTOR = 2;
5956 :
5957 253 : const int nFullResXChunk = static_cast<int>(std::min<double>(
5958 506 : nSrcWidth, PIXEL_MARGIN + INCREASE_FACTOR * nDstChunkXSize *
5959 253 : dfXRatioDstToSrc));
5960 :
5961 : const int nFullResXChunkQueried =
5962 253 : static_cast<int>(std::min<int64_t>(
5963 506 : nSrcWidth,
5964 506 : nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5965 253 : nKernelRadius * nOvrFactor));
5966 :
5967 253 : if (nBands > nChunkMaxSize / nFullResXChunkQueried /
5968 253 : nFullResYChunkQueried / nWrkDataTypeSize)
5969 : {
5970 19 : break;
5971 : }
5972 :
5973 234 : nDstChunkXSize *= INCREASE_FACTOR;
5974 : }
5975 641 : nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
5976 :
5977 641 : const int nFullResXChunk = static_cast<int>(std::min<double>(
5978 641 : nSrcWidth, PIXEL_MARGIN + nDstChunkXSize * dfXRatioDstToSrc));
5979 641 : const int nFullResXChunkQueried = static_cast<int>(std::min<int64_t>(
5980 1282 : nSrcWidth,
5981 1282 : nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5982 641 : nKernelRadius * nOvrFactor));
5983 :
5984 : // Make sure that the RAM requirements to acquire the source data does
5985 : // not exceed nChunkMaxSizeForTempFile
5986 : // If so, reduce the destination chunk size, generate overviews in a
5987 : // temporary dataset, and copy that temporary dataset over the target
5988 : // overview bands (to avoid issues with lossy compression)
5989 : const bool bOverflowFullResXChunkYChunkQueried =
5990 641 : nBands > std::numeric_limits<int64_t>::max() /
5991 641 : nFullResXChunkQueried / nFullResYChunkQueried /
5992 641 : nWrkDataTypeSize;
5993 :
5994 641 : const auto nMemRequirement =
5995 : bOverflowFullResXChunkYChunkQueried
5996 641 : ? 0
5997 637 : : static_cast<GIntBig>(nFullResXChunkQueried) *
5998 637 : nFullResYChunkQueried * nBands * nWrkDataTypeSize;
5999 : // Use a temporary dataset with a smaller destination chunk size
6000 641 : const auto nOverShootFactor =
6001 : nMemRequirement / nChunkMaxSizeForTempFile;
6002 :
6003 641 : constexpr int MIN_OVERSHOOT_FACTOR = 4;
6004 : const auto nSqrtOverShootFactor = std::max<GIntBig>(
6005 1282 : MIN_OVERSHOOT_FACTOR, static_cast<GIntBig>(std::ceil(std::sqrt(
6006 641 : static_cast<double>(nOverShootFactor)))));
6007 641 : constexpr int DEFAULT_CHUNK_SIZE = 256;
6008 641 : constexpr int GTIFF_BLOCK_SIZE_MULTIPLE = 16;
6009 : const int nReducedDstChunkXSize =
6010 : bOverflowFullResXChunkYChunkQueried
6011 1278 : ? DEFAULT_CHUNK_SIZE
6012 1278 : : std::max(1, static_cast<int>(nDstChunkXSize /
6013 1278 : nSqrtOverShootFactor) &
6014 637 : ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
6015 : const int nReducedDstChunkYSize =
6016 : bOverflowFullResXChunkYChunkQueried
6017 1278 : ? DEFAULT_CHUNK_SIZE
6018 1278 : : std::max(1, static_cast<int>(nDstChunkYSize /
6019 1278 : nSqrtOverShootFactor) &
6020 637 : ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
6021 :
6022 641 : if (bOverflowFullResXChunkYChunkQueried ||
6023 : nMemRequirement > nChunkMaxSizeForTempFile)
6024 : {
6025 : const auto nDTSize =
6026 43 : std::max(1, GDALGetDataTypeSizeBytes(eDataType));
6027 : const bool bTmpDSMemRequirementOverflow =
6028 43 : nBands > std::numeric_limits<int64_t>::max() / nDstWidth /
6029 43 : nDstHeight / nDTSize;
6030 43 : const auto nTmpDSMemRequirement =
6031 : bTmpDSMemRequirementOverflow
6032 43 : ? 0
6033 41 : : static_cast<GIntBig>(nDstWidth) * nDstHeight * nBands *
6034 41 : nDTSize;
6035 :
6036 : // make sure that one band buffer doesn't overflow size_t
6037 : const bool bChunkSizeOverflow =
6038 43 : static_cast<size_t>(nDTSize) >
6039 43 : std::numeric_limits<size_t>::max() / nDstWidth / nDstHeight;
6040 43 : const size_t nChunkSize =
6041 : bChunkSizeOverflow
6042 43 : ? 0
6043 41 : : static_cast<size_t>(nDstWidth) * nDstHeight * nDTSize;
6044 :
6045 : const auto CreateVRT =
6046 41 : [nBands, nSrcWidth, nSrcHeight, nDstTotalWidth, nDstTotalHeight,
6047 : pszResampling, eWrkDataType, papoSrcBands, papapoOverviewBands,
6048 : iSrcOverview, &abHasNoData,
6049 393585 : &adfNoDataValue](int nVRTBlockXSize, int nVRTBlockYSize)
6050 : {
6051 : auto poVRTDS = std::make_unique<VRTDataset>(
6052 41 : nDstTotalWidth, nDstTotalHeight, nVRTBlockXSize,
6053 41 : nVRTBlockYSize);
6054 :
6055 65620 : for (int iBand = 0; iBand < nBands; ++iBand)
6056 : {
6057 131158 : auto poVRTSrc = std::make_unique<VRTSimpleSource>();
6058 65579 : poVRTSrc->SetResampling(pszResampling);
6059 65579 : poVRTDS->AddBand(eWrkDataType);
6060 : auto poVRTBand = static_cast<VRTSourcedRasterBand *>(
6061 65579 : poVRTDS->GetRasterBand(iBand + 1));
6062 :
6063 65579 : auto poSrcBand = papoSrcBands[iBand];
6064 65579 : if (iSrcOverview != -1)
6065 24 : poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
6066 65579 : poVRTBand->ConfigureSource(
6067 : poVRTSrc.get(), poSrcBand, false, 0, 0, nSrcWidth,
6068 : nSrcHeight, 0, 0, nDstTotalWidth, nDstTotalHeight);
6069 : // Add the source to the band
6070 65579 : poVRTBand->AddSource(poVRTSrc.release());
6071 65579 : if (abHasNoData[iBand])
6072 3 : poVRTBand->SetNoDataValue(adfNoDataValue[iBand]);
6073 : }
6074 :
6075 42 : if (papoSrcBands[0]->GetMaskFlags() == GMF_PER_DATASET &&
6076 1 : poVRTDS->CreateMaskBand(GMF_PER_DATASET) == CE_None)
6077 : {
6078 : VRTSourcedRasterBand *poMaskVRTBand =
6079 1 : cpl::down_cast<VRTSourcedRasterBand *>(
6080 1 : poVRTDS->GetRasterBand(1)->GetMaskBand());
6081 1 : auto poSrcBand = papoSrcBands[0];
6082 1 : if (iSrcOverview != -1)
6083 0 : poSrcBand = papapoOverviewBands[0][iSrcOverview];
6084 1 : poMaskVRTBand->AddMaskBandSource(
6085 1 : poSrcBand->GetMaskBand(), 0, 0, nSrcWidth, nSrcHeight,
6086 : 0, 0, nDstTotalWidth, nDstTotalHeight);
6087 : }
6088 :
6089 41 : return poVRTDS;
6090 43 : };
6091 :
6092 : // If the overview accommodates chunking, do so and recurse
6093 : // to avoid generating full size temporary files
6094 43 : if (!bOverflowFullResXChunkYChunkQueried &&
6095 39 : !bTmpDSMemRequirementOverflow && !bChunkSizeOverflow &&
6096 39 : (nDstChunkXSize < nDstWidth || nDstChunkYSize < nDstHeight))
6097 : {
6098 : // Create a VRT with the smaller chunk to do the scaling
6099 : auto poVRTDS =
6100 13 : CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
6101 :
6102 13 : std::vector<GDALRasterBand *> apoVRTBand(nBands);
6103 13 : std::vector<GDALRasterBand *> apoDstBand(nBands);
6104 65560 : for (int iBand = 0; iBand < nBands; ++iBand)
6105 : {
6106 65547 : apoDstBand[iBand] = papapoOverviewBands[iBand][iOverview];
6107 65547 : apoVRTBand[iBand] = poVRTDS->GetRasterBand(iBand + 1);
6108 : }
6109 :
6110 : // Use a flag to avoid reading from the overview being built
6111 : GDALRasterIOExtraArg sExtraArg;
6112 13 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
6113 13 : if (iSrcOverview == -1)
6114 13 : sExtraArg.bUseOnlyThisScale = true;
6115 :
6116 : // A single band buffer for data transfer to the overview
6117 13 : std::vector<GByte> abyChunk;
6118 : try
6119 : {
6120 13 : abyChunk.resize(nChunkSize);
6121 : }
6122 0 : catch (const std::exception &)
6123 : {
6124 0 : CPLError(CE_Failure, CPLE_OutOfMemory,
6125 : "Out of memory allocating temporary buffer");
6126 0 : return CE_Failure;
6127 : }
6128 :
6129 : // Loop over output height, in chunks
6130 13 : for (int nDstYOff = nDstYOffStart;
6131 38 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6132 : /* */)
6133 : {
6134 : const int nDstYCount =
6135 25 : std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
6136 : // Loop over output width, in output chunks
6137 25 : for (int nDstXOff = nDstXOffStart;
6138 74 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6139 : /* */)
6140 : {
6141 : const int nDstXCount =
6142 49 : std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
6143 : // Read and transfer the chunk to the overview
6144 98 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
6145 : ++iBand)
6146 : {
6147 98 : eErr = apoVRTBand[iBand]->RasterIO(
6148 : GF_Read, nDstXOff, nDstYOff, nDstXCount,
6149 49 : nDstYCount, abyChunk.data(), nDstXCount,
6150 : nDstYCount, eDataType, 0, 0, &sExtraArg);
6151 49 : if (eErr == CE_None)
6152 : {
6153 96 : eErr = apoDstBand[iBand]->RasterIO(
6154 : GF_Write, nDstXOff, nDstYOff, nDstXCount,
6155 48 : nDstYCount, abyChunk.data(), nDstXCount,
6156 : nDstYCount, eDataType, 0, 0, nullptr);
6157 : }
6158 : }
6159 :
6160 49 : dfCurPixelCount +=
6161 49 : static_cast<double>(nDstXCount) * nDstYCount;
6162 :
6163 49 : nDstXOff += nDstXCount;
6164 : } // width
6165 :
6166 25 : if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount,
6167 : nullptr, pProgressData))
6168 : {
6169 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
6170 : "User terminated");
6171 0 : eErr = CE_Failure;
6172 : }
6173 :
6174 25 : nDstYOff += nDstYCount;
6175 : } // height
6176 :
6177 13 : if (CE_None != eErr)
6178 : {
6179 1 : CPLError(CE_Failure, CPLE_AppDefined,
6180 : "Error while writing overview");
6181 1 : return CE_Failure;
6182 : }
6183 :
6184 12 : pfnProgress(1.0, nullptr, pProgressData);
6185 : // Flush the overviews we just generated
6186 24 : for (int iBand = 0; iBand < nBands; ++iBand)
6187 12 : apoDstBand[iBand]->FlushCache(false);
6188 :
6189 12 : continue; // Next overview
6190 : } // chunking via temporary dataset
6191 :
6192 0 : std::unique_ptr<GDALDataset> poTmpDS;
6193 : // Config option mostly/only for autotest purposes
6194 : const char *pszGDAL_OVR_TEMP_DRIVER =
6195 30 : CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
6196 30 : if ((!bTmpDSMemRequirementOverflow &&
6197 4 : nTmpDSMemRequirement <= nChunkMaxSizeForTempFile &&
6198 4 : !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
6199 26 : EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
6200 : {
6201 10 : auto poTmpDrv = GetGDALDriverManager()->GetDriverByName("MEM");
6202 10 : if (!poTmpDrv)
6203 : {
6204 0 : eErr = CE_Failure;
6205 0 : break;
6206 : }
6207 10 : poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
6208 : nDstTotalHeight, nBands,
6209 10 : eDataType, nullptr));
6210 : }
6211 : else
6212 : {
6213 : // Create a temporary file for the overview
6214 : auto poTmpDrv =
6215 20 : GetGDALDriverManager()->GetDriverByName("GTiff");
6216 20 : if (!poTmpDrv)
6217 : {
6218 0 : eErr = CE_Failure;
6219 0 : break;
6220 : }
6221 40 : std::string osTmpFilename;
6222 20 : auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
6223 20 : if (poDstDS)
6224 : {
6225 20 : osTmpFilename = poDstDS->GetDescription();
6226 : VSIStatBufL sStatBuf;
6227 20 : if (!osTmpFilename.empty() &&
6228 0 : VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
6229 0 : osTmpFilename += "_tmp_ovr.tif";
6230 : }
6231 20 : if (osTmpFilename.empty())
6232 : {
6233 20 : osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
6234 20 : osTmpFilename += ".tif";
6235 : }
6236 20 : CPLDebug("GDAL", "Creating temporary file %s of %d x %d x %d",
6237 : osTmpFilename.c_str(), nDstWidth, nDstHeight, nBands);
6238 40 : CPLStringList aosCO;
6239 20 : if (0 == ((nReducedDstChunkXSize % GTIFF_BLOCK_SIZE_MULTIPLE) |
6240 20 : (nReducedDstChunkYSize % GTIFF_BLOCK_SIZE_MULTIPLE)))
6241 : {
6242 14 : aosCO.SetNameValue("TILED", "YES");
6243 : aosCO.SetNameValue("BLOCKXSIZE",
6244 14 : CPLSPrintf("%d", nReducedDstChunkXSize));
6245 : aosCO.SetNameValue("BLOCKYSIZE",
6246 14 : CPLSPrintf("%d", nReducedDstChunkYSize));
6247 : }
6248 20 : if (const char *pszCOList =
6249 20 : poTmpDrv->GetMetadataItem(GDAL_DMD_CREATIONOPTIONLIST))
6250 : {
6251 : aosCO.SetNameValue(
6252 20 : "COMPRESS", strstr(pszCOList, "ZSTD") ? "ZSTD" : "LZW");
6253 : }
6254 20 : poTmpDS.reset(poTmpDrv->Create(osTmpFilename.c_str(), nDstWidth,
6255 : nDstHeight, nBands, eDataType,
6256 20 : aosCO.List()));
6257 20 : if (poTmpDS)
6258 : {
6259 18 : poTmpDS->MarkSuppressOnClose();
6260 18 : VSIUnlink(osTmpFilename.c_str());
6261 : }
6262 : }
6263 30 : if (!poTmpDS)
6264 : {
6265 2 : eErr = CE_Failure;
6266 2 : break;
6267 : }
6268 :
6269 : // Create a full size VRT to do the resampling without edge effects
6270 : auto poVRTDS =
6271 28 : CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
6272 :
6273 : // Allocate a band buffer with the overview chunk size
6274 : std::unique_ptr<void, VSIFreeReleaser> pDstBuffer(
6275 : VSI_MALLOC3_VERBOSE(size_t(nWrkDataTypeSize), nDstChunkXSize,
6276 28 : nDstChunkYSize));
6277 28 : if (pDstBuffer == nullptr)
6278 : {
6279 0 : eErr = CE_Failure;
6280 0 : break;
6281 : }
6282 :
6283 : // Use a flag to avoid reading the overview being built
6284 : GDALRasterIOExtraArg sExtraArg;
6285 28 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
6286 28 : if (iSrcOverview == -1)
6287 4 : sExtraArg.bUseOnlyThisScale = true;
6288 :
6289 : // Scale and copy data from the VRT to the temp file
6290 28 : for (int nDstYOff = nDstYOffStart;
6291 914 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6292 : /* */)
6293 : {
6294 : const int nDstYCount =
6295 886 : std::min(nReducedDstChunkYSize, nDstYOffEnd - nDstYOff);
6296 886 : for (int nDstXOff = nDstXOffStart;
6297 201218 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6298 : /* */)
6299 : {
6300 : const int nDstXCount =
6301 200332 : std::min(nReducedDstChunkXSize, nDstXOffEnd - nDstXOff);
6302 400668 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
6303 : ++iBand)
6304 : {
6305 200336 : auto poSrcBand = poVRTDS->GetRasterBand(iBand + 1);
6306 200336 : eErr = poSrcBand->RasterIO(
6307 : GF_Read, nDstXOff, nDstYOff, nDstXCount, nDstYCount,
6308 : pDstBuffer.get(), nDstXCount, nDstYCount,
6309 : eWrkDataType, 0, 0, &sExtraArg);
6310 200336 : if (eErr == CE_None)
6311 : {
6312 : // Write to the temporary dataset, shifted
6313 200334 : auto poOvrBand = poTmpDS->GetRasterBand(iBand + 1);
6314 200334 : eErr = poOvrBand->RasterIO(
6315 : GF_Write, nDstXOff - nDstXOffStart,
6316 : nDstYOff - nDstYOffStart, nDstXCount,
6317 : nDstYCount, pDstBuffer.get(), nDstXCount,
6318 : nDstYCount, eWrkDataType, 0, 0, nullptr);
6319 : }
6320 : }
6321 200332 : nDstXOff += nDstXCount;
6322 : }
6323 886 : nDstYOff += nDstYCount;
6324 : }
6325 :
6326 : // Copy from the temporary to the overview
6327 28 : for (int nDstYOff = nDstYOffStart;
6328 54 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6329 : /* */)
6330 : {
6331 : const int nDstYCount =
6332 26 : std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
6333 26 : for (int nDstXOff = nDstXOffStart;
6334 52 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6335 : /* */)
6336 : {
6337 : const int nDstXCount =
6338 26 : std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
6339 56 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
6340 : ++iBand)
6341 : {
6342 30 : auto poSrcBand = poTmpDS->GetRasterBand(iBand + 1);
6343 30 : eErr = poSrcBand->RasterIO(
6344 : GF_Read, nDstXOff - nDstXOffStart,
6345 : nDstYOff - nDstYOffStart, nDstXCount, nDstYCount,
6346 : pDstBuffer.get(), nDstXCount, nDstYCount,
6347 : eWrkDataType, 0, 0, nullptr);
6348 30 : if (eErr == CE_None)
6349 : {
6350 : // Write to the destination overview bands
6351 30 : auto poOvrBand =
6352 30 : papapoOverviewBands[iBand][iOverview];
6353 30 : eErr = poOvrBand->RasterIO(
6354 : GF_Write, nDstXOff, nDstYOff, nDstXCount,
6355 : nDstYCount, pDstBuffer.get(), nDstXCount,
6356 : nDstYCount, eWrkDataType, 0, 0, nullptr);
6357 : }
6358 : }
6359 26 : nDstXOff += nDstXCount;
6360 : }
6361 26 : nDstYOff += nDstYCount;
6362 : }
6363 :
6364 28 : if (eErr != CE_None)
6365 : {
6366 2 : CPLError(CE_Failure, CPLE_AppDefined,
6367 : "Failed to write overview %d", iOverview);
6368 2 : return eErr;
6369 : }
6370 :
6371 : // Flush the data to overviews.
6372 56 : for (int iBand = 0; iBand < nBands; ++iBand)
6373 30 : papapoOverviewBands[iBand][iOverview]->FlushCache(false);
6374 :
6375 26 : continue;
6376 : }
6377 :
6378 : // Structure describing a resampling job
6379 : struct OvrJob
6380 : {
6381 : // Buffers to free when job is finished
6382 : std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
6383 : std::unique_ptr<PointerHolder> oSrcBufferHolder{};
6384 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
6385 :
6386 : GDALRasterBand *poDstBand = nullptr;
6387 :
6388 : // Input parameters of pfnResampleFn
6389 : GDALResampleFunction pfnResampleFn = nullptr;
6390 : GDALOverviewResampleArgs args{};
6391 : const void *pChunk = nullptr;
6392 :
6393 : // Output values of resampling function
6394 : CPLErr eErr = CE_Failure;
6395 : void *pDstBuffer = nullptr;
6396 : GDALDataType eDstBufferDataType = GDT_Unknown;
6397 :
6398 3293 : void NotifyFinished()
6399 : {
6400 6586 : std::lock_guard guard(mutex);
6401 3293 : bFinished = true;
6402 3293 : cv.notify_one();
6403 3293 : }
6404 :
6405 2 : bool IsFinished()
6406 : {
6407 2 : std::lock_guard guard(mutex);
6408 4 : return bFinished;
6409 : }
6410 :
6411 14 : void WaitFinished()
6412 : {
6413 28 : std::unique_lock oGuard(mutex);
6414 21 : while (!bFinished)
6415 : {
6416 7 : cv.wait(oGuard);
6417 : }
6418 14 : }
6419 :
6420 : private:
6421 : // Synchronization
6422 : bool bFinished = false;
6423 : std::mutex mutex{};
6424 : std::condition_variable cv{};
6425 : };
6426 :
6427 : // Thread function to resample
6428 3293 : const auto JobResampleFunc = [](void *pData)
6429 : {
6430 3293 : OvrJob *poJob = static_cast<OvrJob *>(pData);
6431 :
6432 3293 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
6433 : &(poJob->pDstBuffer),
6434 : &(poJob->eDstBufferDataType));
6435 :
6436 3293 : auto pDstBuffer = poJob->pDstBuffer;
6437 : poJob->oDstBufferHolder =
6438 3293 : std::make_unique<PointerHolder>(pDstBuffer);
6439 :
6440 3293 : poJob->NotifyFinished();
6441 3293 : };
6442 :
6443 : // Function to write resample data to target band
6444 3293 : const auto WriteJobData = [](const OvrJob *poJob)
6445 : {
6446 6586 : return poJob->poDstBand->RasterIO(
6447 3293 : GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
6448 3293 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
6449 3293 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
6450 3293 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
6451 3293 : poJob->args.nDstYOff2 - poJob->args.nDstYOff,
6452 3293 : poJob->eDstBufferDataType, 0, 0, nullptr);
6453 : };
6454 :
6455 : // Wait for completion of oldest job and serialize it
6456 : const auto WaitAndFinalizeOldestJob =
6457 14 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
6458 : {
6459 14 : auto poOldestJob = jobList.front().get();
6460 14 : poOldestJob->WaitFinished();
6461 14 : CPLErr l_eErr = poOldestJob->eErr;
6462 14 : if (l_eErr == CE_None)
6463 : {
6464 14 : l_eErr = WriteJobData(poOldestJob);
6465 : }
6466 :
6467 14 : jobList.pop_front();
6468 14 : return l_eErr;
6469 : };
6470 :
6471 : // Queue of jobs
6472 1196 : std::list<std::unique_ptr<OvrJob>> jobList;
6473 :
6474 1196 : std::vector<std::unique_ptr<void, VSIFreeReleaser>> apaChunk(nBands);
6475 : std::vector<std::unique_ptr<GByte, VSIFreeReleaser>>
6476 1196 : apabyChunkNoDataMask(nBands);
6477 :
6478 : // Iterate on destination overview, block by block.
6479 598 : for (int nDstYOff = nDstYOffStart;
6480 2102 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6481 1504 : nDstYOff += nDstChunkYSize)
6482 : {
6483 : int nDstYCount;
6484 1504 : if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
6485 1084 : nDstYCount = nDstChunkYSize;
6486 : else
6487 420 : nDstYCount = nDstYOffEnd - nDstYOff;
6488 :
6489 1504 : int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
6490 1504 : int nChunkYOff2 = static_cast<int>(
6491 1504 : ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
6492 1504 : if (nChunkYOff2 > nSrcHeight ||
6493 1504 : nDstYOff + nDstYCount == nDstTotalHeight)
6494 591 : nChunkYOff2 = nSrcHeight;
6495 1504 : int nYCount = nChunkYOff2 - nChunkYOff;
6496 1504 : CPLAssert(nYCount <= nFullResYChunk);
6497 :
6498 1504 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
6499 1504 : int nChunkYSizeQueried =
6500 1504 : nYCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
6501 1504 : if (nChunkYOffQueried < 0)
6502 : {
6503 145 : nChunkYSizeQueried += nChunkYOffQueried;
6504 145 : nChunkYOffQueried = 0;
6505 : }
6506 1504 : if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
6507 145 : nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
6508 1504 : CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
6509 :
6510 1504 : if (!pfnProgress(std::min(1.0, dfCurPixelCount / dfTotalPixelCount),
6511 : nullptr, pProgressData))
6512 : {
6513 1 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6514 1 : eErr = CE_Failure;
6515 : }
6516 :
6517 : // Iterate on destination overview, block by block.
6518 1504 : for (int nDstXOff = nDstXOffStart;
6519 3047 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6520 1543 : nDstXOff += nDstChunkXSize)
6521 : {
6522 1543 : int nDstXCount = 0;
6523 1543 : if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
6524 1526 : nDstXCount = nDstChunkXSize;
6525 : else
6526 17 : nDstXCount = nDstXOffEnd - nDstXOff;
6527 :
6528 1543 : dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
6529 :
6530 1543 : int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
6531 1543 : int nChunkXOff2 = static_cast<int>(
6532 1543 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
6533 1543 : if (nChunkXOff2 > nSrcWidth ||
6534 1543 : nDstXOff + nDstXCount == nDstTotalWidth)
6535 1468 : nChunkXOff2 = nSrcWidth;
6536 1543 : const int nXCount = nChunkXOff2 - nChunkXOff;
6537 1543 : CPLAssert(nXCount <= nFullResXChunk);
6538 :
6539 1543 : int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
6540 1543 : int nChunkXSizeQueried =
6541 1543 : nXCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
6542 1543 : if (nChunkXOffQueried < 0)
6543 : {
6544 206 : nChunkXSizeQueried += nChunkXOffQueried;
6545 206 : nChunkXOffQueried = 0;
6546 : }
6547 1543 : if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
6548 215 : nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
6549 1543 : CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
6550 : #if DEBUG_VERBOSE
6551 : CPLDebug("GDAL",
6552 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
6553 : nChunkXOffQueried, nChunkYOffQueried,
6554 : nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
6555 : nDstYOff, nDstXCount, nDstYCount);
6556 : #endif
6557 :
6558 : // Avoid accumulating too many tasks and exhaust RAM
6559 :
6560 : // Try to complete already finished jobs
6561 1545 : while (eErr == CE_None && !jobList.empty())
6562 : {
6563 2 : auto poOldestJob = jobList.front().get();
6564 2 : if (!poOldestJob->IsFinished())
6565 0 : break;
6566 2 : eErr = poOldestJob->eErr;
6567 2 : if (eErr == CE_None)
6568 : {
6569 2 : eErr = WriteJobData(poOldestJob);
6570 : }
6571 :
6572 2 : jobList.pop_front();
6573 : }
6574 :
6575 : // And in case we have saturated the number of threads,
6576 : // wait for completion of tasks to go below the threshold.
6577 3086 : while (eErr == CE_None &&
6578 1543 : jobList.size() >= static_cast<size_t>(nThreads))
6579 : {
6580 0 : eErr = WaitAndFinalizeOldestJob(jobList);
6581 : }
6582 :
6583 : // Read the source buffers for all the bands.
6584 4837 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6585 : {
6586 : // (Re)allocate buffers if needed
6587 3294 : if (apaChunk[iBand] == nullptr)
6588 : {
6589 1169 : apaChunk[iBand].reset(VSI_MALLOC3_VERBOSE(
6590 : nFullResXChunkQueried, nFullResYChunkQueried,
6591 : nWrkDataTypeSize));
6592 1169 : if (apaChunk[iBand] == nullptr)
6593 : {
6594 0 : eErr = CE_Failure;
6595 : }
6596 : }
6597 3611 : if (bUseNoDataMask &&
6598 317 : apabyChunkNoDataMask[iBand] == nullptr)
6599 : {
6600 266 : apabyChunkNoDataMask[iBand].reset(
6601 266 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
6602 : nFullResXChunkQueried, nFullResYChunkQueried)));
6603 266 : if (apabyChunkNoDataMask[iBand] == nullptr)
6604 : {
6605 0 : eErr = CE_Failure;
6606 : }
6607 : }
6608 :
6609 3294 : if (eErr == CE_None)
6610 : {
6611 3294 : GDALRasterBand *poSrcBand = nullptr;
6612 3294 : if (iSrcOverview == -1)
6613 2402 : poSrcBand = papoSrcBands[iBand];
6614 : else
6615 892 : poSrcBand =
6616 892 : papapoOverviewBands[iBand][iSrcOverview];
6617 3294 : eErr = poSrcBand->RasterIO(
6618 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6619 : nChunkXSizeQueried, nChunkYSizeQueried,
6620 3294 : apaChunk[iBand].get(), nChunkXSizeQueried,
6621 : nChunkYSizeQueried, eWrkDataType, 0, 0, nullptr);
6622 :
6623 3294 : if (bUseNoDataMask && eErr == CE_None)
6624 : {
6625 317 : auto poMaskBand = poSrcBand->IsMaskBand()
6626 317 : ? poSrcBand
6627 244 : : poSrcBand->GetMaskBand();
6628 317 : eErr = poMaskBand->RasterIO(
6629 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6630 : nChunkXSizeQueried, nChunkYSizeQueried,
6631 317 : apabyChunkNoDataMask[iBand].get(),
6632 : nChunkXSizeQueried, nChunkYSizeQueried,
6633 : GDT_UInt8, 0, 0, nullptr);
6634 : }
6635 : }
6636 : }
6637 :
6638 : // Compute the resulting overview block.
6639 4836 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6640 : {
6641 6586 : auto poJob = std::make_unique<OvrJob>();
6642 3293 : poJob->pfnResampleFn = pfnResampleFn;
6643 3293 : poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
6644 6586 : poJob->args.eOvrDataType =
6645 3293 : poJob->poDstBand->GetRasterDataType();
6646 3293 : poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
6647 3293 : poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
6648 3293 : const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
6649 3293 : "NBITS", "IMAGE_STRUCTURE");
6650 3293 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
6651 3293 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
6652 3293 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
6653 3293 : poJob->args.eWrkDataType = eWrkDataType;
6654 3293 : poJob->pChunk = apaChunk[iBand].get();
6655 3293 : poJob->args.pabyChunkNodataMask =
6656 3293 : apabyChunkNoDataMask[iBand].get();
6657 3293 : poJob->args.nChunkXOff = nChunkXOffQueried;
6658 3293 : poJob->args.nChunkXSize = nChunkXSizeQueried;
6659 3293 : poJob->args.nChunkYOff = nChunkYOffQueried;
6660 3293 : poJob->args.nChunkYSize = nChunkYSizeQueried;
6661 3293 : poJob->args.nDstXOff = nDstXOff;
6662 3293 : poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
6663 3293 : poJob->args.nDstYOff = nDstYOff;
6664 3293 : poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
6665 3293 : poJob->args.pszResampling = pszResampling;
6666 3293 : poJob->args.bHasNoData = abHasNoData[iBand];
6667 3293 : poJob->args.dfNoDataValue = adfNoDataValue[iBand];
6668 3293 : poJob->args.eSrcDataType = eDataType;
6669 3293 : poJob->args.bPropagateNoData = bPropagateNoData;
6670 :
6671 3293 : if (poJobQueue)
6672 : {
6673 16 : poJob->oSrcMaskBufferHolder =
6674 32 : std::make_unique<PointerHolder>(
6675 32 : std::move(apabyChunkNoDataMask[iBand]));
6676 :
6677 16 : poJob->oSrcBufferHolder =
6678 32 : std::make_unique<PointerHolder>(
6679 32 : std::move(apaChunk[iBand]));
6680 :
6681 16 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
6682 16 : jobList.emplace_back(std::move(poJob));
6683 : }
6684 : else
6685 : {
6686 3277 : JobResampleFunc(poJob.get());
6687 3277 : eErr = poJob->eErr;
6688 3277 : if (eErr == CE_None)
6689 : {
6690 3277 : eErr = WriteJobData(poJob.get());
6691 : }
6692 : }
6693 : }
6694 : }
6695 : }
6696 :
6697 : // Wait for all pending jobs to complete
6698 612 : while (!jobList.empty())
6699 : {
6700 14 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
6701 14 : if (l_eErr != CE_None && eErr == CE_None)
6702 0 : eErr = l_eErr;
6703 : }
6704 :
6705 : // Flush the data to overviews.
6706 1765 : for (int iBand = 0; iBand < nBands; ++iBand)
6707 : {
6708 1167 : if (papapoOverviewBands[iBand][iOverview]->FlushCache(false) !=
6709 : CE_None)
6710 0 : eErr = CE_Failure;
6711 : }
6712 : }
6713 :
6714 383 : if (eErr == CE_None)
6715 379 : pfnProgress(1.0, nullptr, pProgressData);
6716 :
6717 383 : return eErr;
6718 : }
6719 :
6720 : /************************************************************************/
6721 : /* GDALRegenerateOverviewsMultiBand() */
6722 : /************************************************************************/
6723 :
6724 : /**
6725 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
6726 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
6727 : *
6728 : * This function will generate one or more overview images from a base
6729 : * image using the requested downsampling algorithm. Its primary use
6730 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
6731 : * can also be used to generate downsampled images in one file from another
6732 : * outside the overview architecture.
6733 : *
6734 : * The output bands need to exist in advance and share the same characteristics
6735 : * (type, dimensions)
6736 : *
6737 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
6738 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
6739 : *
6740 : * It does not support color tables or complex data types.
6741 : *
6742 : * The pseudo-algorithm used by the function is :
6743 : * for each overview
6744 : * iterate on lines of the source by a step of deltay
6745 : * iterate on columns of the source by a step of deltax
6746 : * read the source data of size deltax * deltay for all the bands
6747 : * generate the corresponding overview block for all the bands
6748 : *
6749 : * This function will honour properly NODATA_VALUES tuples (special dataset
6750 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
6751 : * considered as the nodata value and not each value of the triplet
6752 : * independently per band.
6753 : *
6754 : * The GDAL_NUM_THREADS configuration option can be set
6755 : * to "ALL_CPUS" or a integer value to specify the number of threads to use for
6756 : * overview computation.
6757 : *
6758 : * @param apoSrcBands the list of source bands to downsample
6759 : * @param aapoOverviewBands bidimension array of bands. First dimension is
6760 : * indexed by bands. Second dimension is indexed by
6761 : * overview levels. All aapoOverviewBands[i] arrays
6762 : * must have the same size (i.e. same number of
6763 : * overviews)
6764 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
6765 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
6766 : * @param pfnProgress progress report function.
6767 : * @param pProgressData progress function callback data.
6768 : * @param papszOptions NULL terminated list of options as
6769 : * key=value pairs, or NULL
6770 : * The XOFF, YOFF, XSIZE and YSIZE
6771 : * options can be specified to express that overviews should
6772 : * be regenerated only in the specified subset of the source
6773 : * dataset.
6774 : * @return CE_None on success or CE_Failure on failure.
6775 : * @since 3.10
6776 : */
6777 :
6778 19 : CPLErr GDALRegenerateOverviewsMultiBand(
6779 : const std::vector<GDALRasterBand *> &apoSrcBands,
6780 : const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
6781 : const char *pszResampling, GDALProgressFunc pfnProgress,
6782 : void *pProgressData, CSLConstList papszOptions)
6783 : {
6784 19 : CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
6785 29 : for (size_t i = 1; i < aapoOverviewBands.size(); ++i)
6786 : {
6787 10 : CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());
6788 : }
6789 :
6790 19 : if (aapoOverviewBands.empty())
6791 0 : return CE_None;
6792 :
6793 19 : std::vector<GDALRasterBand **> apapoOverviewBands;
6794 48 : for (auto &apoOverviewBands : aapoOverviewBands)
6795 : {
6796 : auto papoOverviewBands = static_cast<GDALRasterBand **>(
6797 29 : CPLMalloc(apoOverviewBands.size() * sizeof(GDALRasterBand *)));
6798 61 : for (size_t i = 0; i < apoOverviewBands.size(); ++i)
6799 : {
6800 32 : papoOverviewBands[i] = apoOverviewBands[i];
6801 : }
6802 29 : apapoOverviewBands.push_back(papoOverviewBands);
6803 : }
6804 38 : const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
6805 19 : static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
6806 19 : static_cast<int>(aapoOverviewBands[0].size()),
6807 19 : apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
6808 : papszOptions);
6809 48 : for (GDALRasterBand **papoOverviewBands : apapoOverviewBands)
6810 29 : CPLFree(papoOverviewBands);
6811 19 : return eErr;
6812 : }
6813 :
6814 : /************************************************************************/
6815 : /* GDALComputeBandStats() */
6816 : /************************************************************************/
6817 :
6818 : /** Undocumented
6819 : * @param hSrcBand undocumented.
6820 : * @param nSampleStep Step between scanlines used to compute statistics.
6821 : * When nSampleStep is equal to 1, all scanlines will
6822 : * be processed.
6823 : * @param pdfMean undocumented.
6824 : * @param pdfStdDev undocumented.
6825 : * @param pfnProgress undocumented.
6826 : * @param pProgressData undocumented.
6827 : * @return undocumented
6828 : */
6829 18 : CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
6830 : int nSampleStep, double *pdfMean,
6831 : double *pdfStdDev,
6832 : GDALProgressFunc pfnProgress,
6833 : void *pProgressData)
6834 :
6835 : {
6836 18 : VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
6837 :
6838 18 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
6839 :
6840 18 : if (pfnProgress == nullptr)
6841 18 : pfnProgress = GDALDummyProgress;
6842 :
6843 18 : const int nWidth = poSrcBand->GetXSize();
6844 18 : const int nHeight = poSrcBand->GetYSize();
6845 :
6846 18 : if (nSampleStep >= nHeight || nSampleStep < 1)
6847 5 : nSampleStep = 1;
6848 :
6849 18 : GDALDataType eWrkType = GDT_Unknown;
6850 18 : float *pafData = nullptr;
6851 18 : GDALDataType eType = poSrcBand->GetRasterDataType();
6852 18 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6853 18 : if (bComplex)
6854 : {
6855 : pafData = static_cast<float *>(
6856 0 : VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6857 0 : eWrkType = GDT_CFloat32;
6858 : }
6859 : else
6860 : {
6861 : pafData =
6862 18 : static_cast<float *>(VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6863 18 : eWrkType = GDT_Float32;
6864 : }
6865 :
6866 18 : if (nWidth == 0 || pafData == nullptr)
6867 : {
6868 0 : VSIFree(pafData);
6869 0 : return CE_Failure;
6870 : }
6871 :
6872 : /* -------------------------------------------------------------------- */
6873 : /* Loop over all sample lines. */
6874 : /* -------------------------------------------------------------------- */
6875 18 : double dfSum = 0.0;
6876 18 : double dfSum2 = 0.0;
6877 18 : int iLine = 0;
6878 18 : GIntBig nSamples = 0;
6879 :
6880 2143 : do
6881 : {
6882 2161 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6883 : pProgressData))
6884 : {
6885 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6886 0 : CPLFree(pafData);
6887 0 : return CE_Failure;
6888 : }
6889 :
6890 : const CPLErr eErr =
6891 2161 : poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
6892 : 1, eWrkType, 0, 0, nullptr);
6893 2161 : if (eErr != CE_None)
6894 : {
6895 1 : CPLFree(pafData);
6896 1 : return eErr;
6897 : }
6898 :
6899 725208 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6900 : {
6901 723048 : float fValue = 0.0f;
6902 :
6903 723048 : if (bComplex)
6904 : {
6905 : // Compute the magnitude of the complex value.
6906 : fValue =
6907 0 : std::hypot(pafData[static_cast<size_t>(iPixel) * 2],
6908 0 : pafData[static_cast<size_t>(iPixel) * 2 + 1]);
6909 : }
6910 : else
6911 : {
6912 723048 : fValue = pafData[iPixel];
6913 : }
6914 :
6915 723048 : dfSum += static_cast<double>(fValue);
6916 723048 : dfSum2 += static_cast<double>(fValue) * static_cast<double>(fValue);
6917 : }
6918 :
6919 2160 : nSamples += nWidth;
6920 2160 : iLine += nSampleStep;
6921 2160 : } while (iLine < nHeight);
6922 :
6923 17 : if (!pfnProgress(1.0, nullptr, pProgressData))
6924 : {
6925 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6926 0 : CPLFree(pafData);
6927 0 : return CE_Failure;
6928 : }
6929 :
6930 : /* -------------------------------------------------------------------- */
6931 : /* Produce the result values. */
6932 : /* -------------------------------------------------------------------- */
6933 17 : if (pdfMean != nullptr)
6934 17 : *pdfMean = dfSum / nSamples;
6935 :
6936 17 : if (pdfStdDev != nullptr)
6937 : {
6938 17 : const double dfMean = dfSum / nSamples;
6939 :
6940 17 : *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
6941 : }
6942 :
6943 17 : CPLFree(pafData);
6944 :
6945 17 : return CE_None;
6946 : }
6947 :
6948 : /************************************************************************/
6949 : /* GDALOverviewMagnitudeCorrection() */
6950 : /* */
6951 : /* Correct the mean and standard deviation of the overviews of */
6952 : /* the given band to match the base layer approximately. */
6953 : /************************************************************************/
6954 :
6955 : /** Undocumented
6956 : * @param hBaseBand undocumented.
6957 : * @param nOverviewCount undocumented.
6958 : * @param pahOverviews undocumented.
6959 : * @param pfnProgress undocumented.
6960 : * @param pProgressData undocumented.
6961 : * @return undocumented
6962 : */
6963 0 : CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
6964 : int nOverviewCount,
6965 : GDALRasterBandH *pahOverviews,
6966 : GDALProgressFunc pfnProgress,
6967 : void *pProgressData)
6968 :
6969 : {
6970 0 : VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
6971 :
6972 : /* -------------------------------------------------------------------- */
6973 : /* Compute mean/stddev for source raster. */
6974 : /* -------------------------------------------------------------------- */
6975 0 : double dfOrigMean = 0.0;
6976 0 : double dfOrigStdDev = 0.0;
6977 : {
6978 : const CPLErr eErr =
6979 0 : GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
6980 : pfnProgress, pProgressData);
6981 :
6982 0 : if (eErr != CE_None)
6983 0 : return eErr;
6984 : }
6985 :
6986 : /* -------------------------------------------------------------------- */
6987 : /* Loop on overview bands. */
6988 : /* -------------------------------------------------------------------- */
6989 0 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
6990 : {
6991 : GDALRasterBand *poOverview =
6992 0 : GDALRasterBand::FromHandle(pahOverviews[iOverview]);
6993 : double dfOverviewMean, dfOverviewStdDev;
6994 :
6995 : const CPLErr eErr =
6996 0 : GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
6997 : &dfOverviewStdDev, pfnProgress, pProgressData);
6998 :
6999 0 : if (eErr != CE_None)
7000 0 : return eErr;
7001 :
7002 0 : double dfGain = 1.0;
7003 0 : if (dfOrigStdDev >= 0.0001)
7004 0 : dfGain = dfOrigStdDev / dfOverviewStdDev;
7005 :
7006 : /* --------------------------------------------------------------------
7007 : */
7008 : /* Apply gain and offset. */
7009 : /* --------------------------------------------------------------------
7010 : */
7011 0 : const int nWidth = poOverview->GetXSize();
7012 0 : const int nHeight = poOverview->GetYSize();
7013 :
7014 0 : GDALDataType eWrkType = GDT_Unknown;
7015 0 : float *pafData = nullptr;
7016 0 : const GDALDataType eType = poOverview->GetRasterDataType();
7017 0 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
7018 0 : if (bComplex)
7019 : {
7020 : pafData = static_cast<float *>(
7021 0 : VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
7022 0 : eWrkType = GDT_CFloat32;
7023 : }
7024 : else
7025 : {
7026 : pafData = static_cast<float *>(
7027 0 : VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
7028 0 : eWrkType = GDT_Float32;
7029 : }
7030 :
7031 0 : if (pafData == nullptr)
7032 : {
7033 0 : return CE_Failure;
7034 : }
7035 :
7036 0 : for (int iLine = 0; iLine < nHeight; ++iLine)
7037 : {
7038 0 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
7039 : pProgressData))
7040 : {
7041 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
7042 0 : CPLFree(pafData);
7043 0 : return CE_Failure;
7044 : }
7045 :
7046 0 : if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
7047 : nWidth, 1, eWrkType, 0, 0,
7048 0 : nullptr) != CE_None)
7049 : {
7050 0 : CPLFree(pafData);
7051 0 : return CE_Failure;
7052 : }
7053 :
7054 0 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
7055 : {
7056 0 : if (bComplex)
7057 : {
7058 0 : pafData[static_cast<size_t>(iPixel) * 2] *=
7059 0 : static_cast<float>(dfGain);
7060 0 : pafData[static_cast<size_t>(iPixel) * 2 + 1] *=
7061 0 : static_cast<float>(dfGain);
7062 : }
7063 : else
7064 : {
7065 0 : pafData[iPixel] = static_cast<float>(
7066 0 : (double(pafData[iPixel]) - dfOverviewMean) * dfGain +
7067 : dfOrigMean);
7068 : }
7069 : }
7070 :
7071 0 : if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
7072 : nWidth, 1, eWrkType, 0, 0,
7073 0 : nullptr) != CE_None)
7074 : {
7075 0 : CPLFree(pafData);
7076 0 : return CE_Failure;
7077 : }
7078 : }
7079 :
7080 0 : if (!pfnProgress(1.0, nullptr, pProgressData))
7081 : {
7082 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
7083 0 : CPLFree(pafData);
7084 0 : return CE_Failure;
7085 : }
7086 :
7087 0 : CPLFree(pafData);
7088 : }
7089 :
7090 0 : return CE_None;
7091 : }
|