Line data Source code
1 :
2 : /******************************************************************************
3 : *
4 : * Project: GDAL Core
5 : * Purpose: Helper code to implement overview support in different drivers.
6 : * Author: Frank Warmerdam, warmerdam@pobox.com
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 2000, Frank Warmerdam
10 : * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * SPDX-License-Identifier: MIT
13 : ****************************************************************************/
14 :
15 : #include "cpl_port.h"
16 : #include "gdal_priv.h"
17 :
18 : #include <cmath>
19 : #include <cstddef>
20 : #include <cstdlib>
21 :
22 : #include <algorithm>
23 : #include <complex>
24 : #include <condition_variable>
25 : #include <limits>
26 : #include <list>
27 : #include <memory>
28 : #include <mutex>
29 : #include <vector>
30 :
31 : #include "cpl_conv.h"
32 : #include "cpl_error.h"
33 : #include "cpl_float.h"
34 : #include "cpl_progress.h"
35 : #include "cpl_vsi.h"
36 : #include "gdal.h"
37 : #include "gdal_thread_pool.h"
38 : #include "gdalwarper.h"
39 : #include "gdal_vrt.h"
40 : #include "vrtdataset.h"
41 :
42 : #ifdef USE_NEON_OPTIMIZATIONS
43 : #include "include_sse2neon.h"
44 :
45 : #if (!defined(__aarch64__) && !defined(_M_ARM64))
46 : #define ARM_V7
47 : #endif
48 :
49 : #define USE_SSE2
50 :
51 : #include "gdalsse_priv.h"
52 :
53 : // Restrict to 64bit processors because they are guaranteed to have SSE2,
54 : // or if __AVX2__ is defined.
55 : #elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
56 : #define USE_SSE2
57 :
58 : #include "gdalsse_priv.h"
59 :
60 : #ifdef __SSE3__
61 : #include <pmmintrin.h>
62 : #endif
63 : #ifdef __SSSE3__
64 : #include <tmmintrin.h>
65 : #endif
66 : #ifdef __SSE4_1__
67 : #include <smmintrin.h>
68 : #endif
69 : #ifdef __AVX2__
70 : #include <immintrin.h>
71 : #endif
72 :
73 : #endif
74 :
75 : // To be included after above USE_SSE2 and include gdalsse_priv.h
76 : // to avoid build issue on Windows x86
77 : #include "gdal_priv_templates.hpp"
78 :
79 : /************************************************************************/
80 : /* GDALResampleChunk_Near() */
81 : /************************************************************************/
82 :
83 : template <class T>
84 1236 : static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
85 : const T *pChunk, T **ppDstBuffer)
86 :
87 : {
88 1236 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
89 1236 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
90 1236 : const GDALDataType eWrkDataType = args.eWrkDataType;
91 1236 : const int nChunkXOff = args.nChunkXOff;
92 1236 : const int nChunkXSize = args.nChunkXSize;
93 1236 : const int nChunkYOff = args.nChunkYOff;
94 1236 : const int nDstXOff = args.nDstXOff;
95 1236 : const int nDstXOff2 = args.nDstXOff2;
96 1236 : const int nDstYOff = args.nDstYOff;
97 1236 : const int nDstYOff2 = args.nDstYOff2;
98 1236 : const int nDstXWidth = nDstXOff2 - nDstXOff;
99 :
100 : /* -------------------------------------------------------------------- */
101 : /* Allocate buffers. */
102 : /* -------------------------------------------------------------------- */
103 1236 : *ppDstBuffer = static_cast<T *>(
104 1236 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
105 : GDALGetDataTypeSizeBytes(eWrkDataType)));
106 1236 : if (*ppDstBuffer == nullptr)
107 : {
108 0 : return CE_Failure;
109 : }
110 1236 : T *const pDstBuffer = *ppDstBuffer;
111 :
112 : int *panSrcXOff =
113 1236 : static_cast<int *>(VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(int)));
114 :
115 1236 : if (panSrcXOff == nullptr)
116 : {
117 0 : return CE_Failure;
118 : }
119 :
120 : /* ==================================================================== */
121 : /* Precompute inner loop constants. */
122 : /* ==================================================================== */
123 840457 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
124 : {
125 839221 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
126 839221 : if (nSrcXOff < nChunkXOff)
127 0 : nSrcXOff = nChunkXOff;
128 :
129 839221 : panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
130 : }
131 :
132 : /* ==================================================================== */
133 : /* Loop over destination scanlines. */
134 : /* ==================================================================== */
135 142064 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
136 : {
137 140828 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
138 140828 : if (nSrcYOff < nChunkYOff)
139 0 : nSrcYOff = nChunkYOff;
140 :
141 140828 : const T *const pSrcScanline =
142 : pChunk +
143 140828 : (static_cast<size_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
144 137794 : nChunkXOff;
145 :
146 : /* --------------------------------------------------------------------
147 : */
148 : /* Loop over destination pixels */
149 : /* --------------------------------------------------------------------
150 : */
151 140828 : T *pDstScanline =
152 140828 : pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
153 120237794 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
154 : {
155 120096760 : pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
156 : }
157 : }
158 :
159 1236 : CPLFree(panSrcXOff);
160 :
161 1236 : return CE_None;
162 : }
163 :
164 1236 : static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
165 : const void *pChunk, void **ppDstBuffer,
166 : GDALDataType *peDstBufferDataType)
167 : {
168 1236 : *peDstBufferDataType = args.eWrkDataType;
169 1236 : switch (args.eWrkDataType)
170 : {
171 : // For nearest resampling, as no computation is done, only the
172 : // size of the data type matters.
173 1079 : case GDT_UInt8:
174 : case GDT_Int8:
175 : {
176 1079 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 1);
177 1079 : return GDALResampleChunk_NearT(
178 : args, static_cast<const uint8_t *>(pChunk),
179 1079 : reinterpret_cast<uint8_t **>(ppDstBuffer));
180 : }
181 :
182 52 : case GDT_Int16:
183 : case GDT_UInt16:
184 : case GDT_Float16:
185 : {
186 52 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
187 52 : return GDALResampleChunk_NearT(
188 : args, static_cast<const uint16_t *>(pChunk),
189 52 : reinterpret_cast<uint16_t **>(ppDstBuffer));
190 : }
191 :
192 57 : case GDT_CInt16:
193 : case GDT_CFloat16:
194 : case GDT_Int32:
195 : case GDT_UInt32:
196 : case GDT_Float32:
197 : {
198 57 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
199 57 : return GDALResampleChunk_NearT(
200 : args, static_cast<const uint32_t *>(pChunk),
201 57 : reinterpret_cast<uint32_t **>(ppDstBuffer));
202 : }
203 :
204 44 : case GDT_CInt32:
205 : case GDT_CFloat32:
206 : case GDT_Int64:
207 : case GDT_UInt64:
208 : case GDT_Float64:
209 : {
210 44 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
211 44 : return GDALResampleChunk_NearT(
212 : args, static_cast<const uint64_t *>(pChunk),
213 44 : reinterpret_cast<uint64_t **>(ppDstBuffer));
214 : }
215 :
216 4 : case GDT_CFloat64:
217 : {
218 4 : return GDALResampleChunk_NearT(
219 : args, static_cast<const std::complex<double> *>(pChunk),
220 4 : reinterpret_cast<std::complex<double> **>(ppDstBuffer));
221 : }
222 :
223 0 : case GDT_Unknown:
224 : case GDT_TypeCount:
225 0 : break;
226 : }
227 0 : CPLAssert(false);
228 : return CE_Failure;
229 : }
230 :
231 : namespace
232 : {
233 :
234 : // Find in the color table the entry whose RGB value is the closest
235 : // (using quadratic distance) to the test color, ignoring transparent entries.
236 3837 : int BestColorEntry(const std::vector<GDALColorEntry> &entries,
237 : const GDALColorEntry &test)
238 : {
239 3837 : int nMinDist = std::numeric_limits<int>::max();
240 3837 : size_t bestEntry = 0;
241 986109 : for (size_t i = 0; i < entries.size(); ++i)
242 : {
243 982272 : const GDALColorEntry &entry = entries[i];
244 : // Ignore transparent entries
245 982272 : if (entry.c4 == 0)
246 3237 : continue;
247 :
248 979035 : int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
249 979035 : ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
250 979035 : ((test.c3 - entry.c3) * (test.c3 - entry.c3));
251 979035 : if (nDist < nMinDist)
252 : {
253 15847 : nMinDist = nDist;
254 15847 : bestEntry = i;
255 : }
256 : }
257 3837 : return static_cast<int>(bestEntry);
258 : }
259 :
260 7 : std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
261 : int &transparentIdx)
262 : {
263 7 : std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
264 :
265 7 : transparentIdx = -1;
266 7 : int i = 0;
267 1799 : for (auto &entry : entries)
268 : {
269 1792 : table.GetColorEntryAsRGB(i, &entry);
270 1792 : if (transparentIdx < 0 && entry.c4 == 0)
271 1 : transparentIdx = i;
272 1792 : ++i;
273 : }
274 7 : return entries;
275 : }
276 :
277 : } // unnamed namespace
278 :
279 : /************************************************************************/
280 : /* SQUARE() */
281 : /************************************************************************/
282 :
/** Return val * val, with the first operand converted to Tsquare before
 * the multiplication (Tsquare defaults to T).
 */
template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
{
    const Tsquare widened = static_cast<Tsquare>(val);
    return widened * val;
}
287 :
288 : /************************************************************************/
289 : /* ComputeIntegerRMS() */
290 : /************************************************************************/
// Compute rms = sqrt(sumSquares / weight), rounded to the integer that
// minimizes abs(rms**2 - sumSquares / weight).
// T is the returned type; Twork is the integer type in which
// 2 * rms * (rms + 1) + 1 is evaluated.
template <class T, class Twork>
inline T ComputeIntegerRMS(double sumSquares, double weight)
{
    const double meanSquare = sumSquares / weight;

    // Start from the truncated square root...
    T rms = static_cast<T>(std::sqrt(meanSquare));

    // ... and decide whether rms**2 or (rms+1)**2 is closest to meanSquare.
    // Naive version:
    // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
    const double roundUpThreshold =
        static_cast<double>(static_cast<Twork>(2) * rms * (rms + 1) + 1);
    if (roundUpThreshold < 2 * meanSquare)
    {
        rms += 1;
    }
    return rms;
}
307 :
308 : template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
309 : {
310 : CPLAssert(false);
311 : return 0;
312 : }
313 :
314 28 : template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
315 : {
316 : // It has been verified that given the correction on rms below, using
317 : // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
318 : // is equivalent, so use the former as it is used twice.
319 28 : const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
320 28 : const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
321 28 : GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
322 :
323 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
324 : // Naive version:
325 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
326 : // Optimized version for integer case and weight == 4
327 28 : if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
328 5 : rms += 1;
329 28 : return rms;
330 : }
331 :
332 : template <>
333 24 : inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
334 : {
335 24 : const double sumDivWeight = sumSquares * 0.25;
336 24 : GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
337 :
338 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
339 : // Naive version:
340 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
341 : // Optimized version for integer case and weight == 4
342 24 : if (static_cast<GUInt32>(rms) * (rms + 1) <
343 24 : static_cast<GUInt32>(sumDivWeight + 0.25))
344 4 : rms += 1;
345 24 : return rms;
346 : }
347 :
348 : #ifdef USE_SSE2
349 :
350 : /************************************************************************/
351 : /* QuadraticMeanByteSSE2OrAVX2() */
352 : /************************************************************************/
353 :
354 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
355 : #define sse2_packus_epi32 _mm_packus_epi32
356 : #else
357 516139 : inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
358 : {
359 516139 : const auto minus32768_32 = _mm_set1_epi32(-32768);
360 516139 : const auto minus32768_16 = _mm_set1_epi16(-32768);
361 516139 : a = _mm_add_epi32(a, minus32768_32);
362 516139 : b = _mm_add_epi32(b, minus32768_32);
363 516139 : a = _mm_packs_epi32(a, b);
364 516139 : a = _mm_sub_epi16(a, minus32768_16);
365 516139 : return a;
366 : }
367 : #endif
368 :
369 : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
370 : #define sse2_hadd_epi16 _mm_hadd_epi16
371 : #else
372 4715530 : inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
373 : {
374 : // Horizontal addition of adjacent pairs
375 4715530 : const auto mask = _mm_set1_epi32(0xFFFF);
376 : const auto horizLo =
377 14146600 : _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
378 : const auto horizHi =
379 14146600 : _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
380 :
381 : // Recombine low and high parts
382 4715530 : return _mm_packs_epi32(horizLo, horizHi);
383 : }
384 : #endif
385 :
// Width-agnostic aliases for the integer/float intrinsics used by the
// Byte kernels below: each name maps to the 256-bit (_mm256_*) intrinsic
// when __AVX2__ is defined, and to the 128-bit (_mm_*) intrinsic or its
// sse2_* stand-in otherwise.
#ifdef __AVX2__

#define set1_epi16 _mm256_set1_epi16
#define set1_epi32 _mm256_set1_epi32
#define setzero _mm256_setzero_si256
#define set1_ps _mm256_set1_ps
#define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
#define unpacklo_epi8 _mm256_unpacklo_epi8
#define unpackhi_epi8 _mm256_unpackhi_epi8
#define madd_epi16 _mm256_madd_epi16
#define add_epi32 _mm256_add_epi32
#define mul_ps _mm256_mul_ps
#define cvtepi32_ps _mm256_cvtepi32_ps
#define sqrt_ps _mm256_sqrt_ps
#define cvttps_epi32 _mm256_cvttps_epi32
#define packs_epi32 _mm256_packs_epi32
#define packus_epi32 _mm256_packus_epi32
#define srli_epi32 _mm256_srli_epi32
#define mullo_epi16 _mm256_mullo_epi16
#define srli_epi16 _mm256_srli_epi16
#define cmpgt_epi16 _mm256_cmpgt_epi16
#define add_epi16 _mm256_add_epi16
#define sub_epi16 _mm256_sub_epi16
#define packus_epi16 _mm256_packus_epi16

/* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
/* to get the lower 128-bit bits of what would be a true 256-bit vector */
/* register. */

// Swap the two middle 64-bit elements of x: (e0, e1, e2, e3) becomes
// (e0, e2, e1, e3).
inline __m256i FIXUP_LANES(__m256i x)
{
    return _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0));
}

// store_lo: unaligned store of the low 128 bits of y, after lane fix-up.
#define store_lo(x, y)                                                         \
    _mm_storeu_si128(reinterpret_cast<__m128i *>(x),                           \
                     _mm256_extracti128_si256(FIXUP_LANES(y), 0))
// storeu_int: unaligned store of the full 256 bits of y, after lane fix-up.
#define storeu_int(x, y)                                                       \
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(x), FIXUP_LANES(y))
#define hadd_epi16 _mm256_hadd_epi16
#else
#define set1_epi16 _mm_set1_epi16
#define set1_epi32 _mm_set1_epi32
#define setzero _mm_setzero_si128
#define set1_ps _mm_set1_ps
#define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
#define unpacklo_epi8 _mm_unpacklo_epi8
#define unpackhi_epi8 _mm_unpackhi_epi8
#define madd_epi16 _mm_madd_epi16
#define add_epi32 _mm_add_epi32
#define mul_ps _mm_mul_ps
#define cvtepi32_ps _mm_cvtepi32_ps
#define sqrt_ps _mm_sqrt_ps
#define cvttps_epi32 _mm_cvttps_epi32
#define packs_epi32 _mm_packs_epi32
#define packus_epi32 sse2_packus_epi32
#define srli_epi32 _mm_srli_epi32
#define mullo_epi16 _mm_mullo_epi16
#define srli_epi16 _mm_srli_epi16
#define cmpgt_epi16 _mm_cmpgt_epi16
#define add_epi16 _mm_add_epi16
#define sub_epi16 _mm_sub_epi16
#define packus_epi16 _mm_packus_epi16
// store_lo: store of the low 64 bits of y.
#define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
// storeu_int: unaligned store of the full 128 bits of y.
#define storeu_int(x, y) _mm_storeu_si128(reinterpret_cast<__m128i *>(x), (y))
#define hadd_epi16 sse2_hadd_epi16
#endif
453 :
/** Vectorized 2x2 quadratic mean (RMS) on byte-sized samples.
 *
 * Each destination pixel is computed from 2 adjacent samples of the line
 * pointed to by pSrcScanlineShiftedInOut and the 2 samples at the same
 * positions nChunkXSize samples further. Only whole groups of DEST_ELTS
 * destination pixels are processed; the remaining pixels are left to the
 * caller.
 *
 * @param nDstXWidth               Number of destination pixels requested.
 * @param nChunkXSize              Offset, in samples, from the first source
 *                                 line to the second one.
 * @param pSrcScanlineShiftedInOut In: first source sample to read.
 *                                 Out: first source sample not consumed.
 * @param pDstScanline             Destination samples, written from index 0.
 * @return the number of destination pixels written.
 */
template <class T>
static int
#if defined(__GNUC__)
    __attribute__((noinline))
#endif
    QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
                                const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
                                T *CPL_RESTRICT pDstScanline)
{
    // Optimized implementation for RMS on Byte by
    // processing by group of 8 output pixels, so as to use
    // a single _mm_sqrt_ps() call for 4 output pixels
    // (those counts are for 128-bit registers)
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;

    int iDstPixel = 0;
    const auto one16 = set1_epi16(1);
    const auto one32 = set1_epi32(1);
    const auto zero = setzero();
    const auto minus32768 = set1_epi16(-32768);

    // Number of destination pixels produced per iteration: half the
    // register size in bytes.
    constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
    {
        // Load 2 * DEST_ELTS bytes from each line
        auto firstLine = loadu_int(pSrcScanlineShifted);
        auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
        // Extend those Bytes as UInt16s
        auto firstLineLo = unpacklo_epi8(firstLine, zero);
        auto firstLineHi = unpackhi_epi8(firstLine, zero);
        auto secondLineLo = unpacklo_epi8(secondLine, zero);
        auto secondLineHi = unpackhi_epi8(secondLine, zero);

        // Multiplication of 16 bit values and horizontal
        // addition of 32 bit results
        // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
        firstLineLo = madd_epi16(firstLineLo, firstLineLo);
        firstLineHi = madd_epi16(firstLineHi, firstLineHi);
        secondLineLo = madd_epi16(secondLineLo, secondLineLo);
        secondLineHi = madd_epi16(secondLineHi, secondLineHi);

        // Vertical addition
        const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
        const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);

        // (sumSquares + 1) / 4, as 32 bit integers
        const auto sumSquaresPlusOneDiv4Lo =
            srli_epi32(add_epi32(sumSquaresLo, one32), 2);
        const auto sumSquaresPlusOneDiv4Hi =
            srli_epi32(add_epi32(sumSquaresHi, one32), 2);

        // Take square root and truncate/floor to int32
        const auto rmsLo =
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
        const auto rmsHi =
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));

        // Merge back low and high registers with each RMS value
        // as a 16 bit value.
        auto rms = packs_epi32(rmsLo, rmsHi);

        // Round to upper value if it minimizes the
        // error |rms^2 - sumSquares/4|
        // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
        //    rms += 1;
        // which is equivalent to:
        // if( rms * (rms + 1) < (sumSquares+1) / 4 )
        //    rms += 1;
        // And both left and right parts fit on 16 (unsigned) bits
        const auto sumSquaresPlusOneDiv4 =
            packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
        // cmpgt_epi16 operates on signed int16, but here
        // we have unsigned values, so shift them by -32768 before
        const auto mask = cmpgt_epi16(
            add_epi16(sumSquaresPlusOneDiv4, minus32768),
            add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
        // The value of the mask will be -1 when the correction needs to be
        // applied
        rms = sub_epi16(rms, mask);

        // Pack each 16 bit RMS value to 8 bits
        rms = packus_epi16(rms, rms /* could be anything */);
        store_lo(&pDstScanline[iDstPixel], rms);
        // 2 source samples per line are consumed for each output pixel
        pSrcScanlineShifted += 2 * DEST_ELTS;
    }

    // Tell the caller where to resume reading the source line
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
    return iDstPixel;
}
541 :
542 : /************************************************************************/
543 : /* AverageByteSSE2OrAVX2() */
544 : /************************************************************************/
545 :
546 : static int
547 111734 : AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
548 : const GByte *&CPL_RESTRICT pSrcScanlineShiftedInOut,
549 : GByte *CPL_RESTRICT pDstScanline)
550 : {
551 : // Optimized implementation for average on Byte by
552 : // processing by group of 16 output pixels for SSE2, or 32 for AVX2
553 :
554 111734 : const auto zero = setzero();
555 111734 : const auto two16 = set1_epi16(2);
556 111734 : const GByte *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
557 :
558 111734 : constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;
559 111734 : int iDstPixel = 0;
560 2469500 : for (; iDstPixel < nDstXWidth - (2 * DEST_ELTS - 1);
561 2357770 : iDstPixel += 2 * DEST_ELTS)
562 : {
563 : decltype(setzero()) average0;
564 : {
565 : // Load 2 * DEST_ELTS bytes from each line
566 2357770 : const auto firstLine = loadu_int(pSrcScanlineShifted);
567 : const auto secondLine =
568 4715530 : loadu_int(pSrcScanlineShifted + nChunkXSize);
569 : // Extend those Bytes as UInt16s
570 2357770 : const auto firstLineLo = unpacklo_epi8(firstLine, zero);
571 2357770 : const auto firstLineHi = unpackhi_epi8(firstLine, zero);
572 2357770 : const auto secondLineLo = unpacklo_epi8(secondLine, zero);
573 2357770 : const auto secondLineHi = unpackhi_epi8(secondLine, zero);
574 :
575 : // Vertical addition
576 2357770 : const auto sumLo = add_epi16(firstLineLo, secondLineLo);
577 2357770 : const auto sumHi = add_epi16(firstLineHi, secondLineHi);
578 :
579 : // Horizontal addition of adjacent pairs, and recombine low and high
580 : // parts
581 2357770 : const auto sum = hadd_epi16(sumLo, sumHi);
582 :
583 : // average = (sum + 2) / 4
584 2357770 : average0 = srli_epi16(add_epi16(sum, two16), 2);
585 :
586 2357770 : pSrcScanlineShifted += 2 * DEST_ELTS;
587 : }
588 :
589 : decltype(setzero()) average1;
590 : {
591 : // Load 2 * DEST_ELTS bytes from each line
592 2357770 : const auto firstLine = loadu_int(pSrcScanlineShifted);
593 : const auto secondLine =
594 4715530 : loadu_int(pSrcScanlineShifted + nChunkXSize);
595 : // Extend those Bytes as UInt16s
596 2357770 : const auto firstLineLo = unpacklo_epi8(firstLine, zero);
597 2357770 : const auto firstLineHi = unpackhi_epi8(firstLine, zero);
598 2357770 : const auto secondLineLo = unpacklo_epi8(secondLine, zero);
599 2357770 : const auto secondLineHi = unpackhi_epi8(secondLine, zero);
600 :
601 : // Vertical addition
602 2357770 : const auto sumLo = add_epi16(firstLineLo, secondLineLo);
603 2357770 : const auto sumHi = add_epi16(firstLineHi, secondLineHi);
604 :
605 : // Horizontal addition of adjacent pairs, and recombine low and high
606 : // parts
607 2357770 : const auto sum = hadd_epi16(sumLo, sumHi);
608 :
609 : // average = (sum + 2) / 4
610 2357770 : average1 = srli_epi16(add_epi16(sum, two16), 2);
611 :
612 2357770 : pSrcScanlineShifted += 2 * DEST_ELTS;
613 : }
614 :
615 : // Pack each 16 bit average value to 8 bits
616 2357770 : const auto average = packus_epi16(average0, average1);
617 2357770 : storeu_int(&pDstScanline[iDstPixel], average);
618 : }
619 :
620 111734 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
621 111734 : return iDstPixel;
622 : }
623 :
624 : /************************************************************************/
625 : /* QuadraticMeanUInt16SSE2() */
626 : /************************************************************************/
627 :
628 : #ifdef __SSE3__
629 : #define sse2_hadd_pd _mm_hadd_pd
630 : #else
631 185 : inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
632 : {
633 : auto aLo_bLo =
634 740 : _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
635 : auto aHi_bHi =
636 740 : _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
637 185 : return _mm_add_pd(aLo_bLo, aHi_bHi); // (aLo + aHi, bLo + bHi)
638 : }
639 : #endif
640 :
641 120 : inline __m128d SQUARE_PD(__m128d x)
642 : {
643 120 : return _mm_mul_pd(x, x);
644 : }
645 :
646 : #ifdef __AVX2__
647 :
648 : inline __m256d SQUARE_PD(__m256d x)
649 : {
650 : return _mm256_mul_pd(x, x);
651 : }
652 :
653 : inline __m256d FIXUP_LANES(__m256d x)
654 : {
655 : return _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
656 : }
657 :
658 : inline __m256 FIXUP_LANES(__m256 x)
659 : {
660 : return _mm256_castpd_ps(FIXUP_LANES(_mm256_castps_pd(x)));
661 : }
662 :
663 : #endif
664 :
/** Vectorized 2x2 quadratic mean (RMS) on UInt16 samples.
 *
 * Each destination pixel is computed from 2 adjacent samples of the line
 * pointed to by pSrcScanlineShiftedInOut and the 2 samples at the same
 * positions nChunkXSize samples further. Only whole groups of DEST_ELTS (4)
 * destination pixels are processed; the remaining pixels are left to the
 * caller.
 *
 * Two code paths are used per group: an all-integer one when the 16 loaded
 * samples all fit on 14 bits, and a double precision one otherwise.
 *
 * @param nDstXWidth               Number of destination pixels requested.
 * @param nChunkXSize              Offset, in samples, from the first source
 *                                 line to the second one.
 * @param pSrcScanlineShiftedInOut In: first source sample to read.
 *                                 Out: first source sample not consumed.
 * @param pDstScanline             Destination samples, written from index 0.
 * @return the number of destination pixels written.
 */
static int
QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
                        const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
                        uint16_t *CPL_RESTRICT pDstScanline)
{
    // Optimized implementation for RMS on UInt16 by
    // processing by group of 4 output pixels.
    const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;

    int iDstPixel = 0;
    const auto zero = _mm_setzero_si128();

#ifdef __AVX2__
    const auto zeroDot25 = _mm256_set1_pd(0.25);
    const auto zeroDot5 = _mm256_set1_pd(0.5);

    // The first four 0's could be anything, as we only take the bottom
    // 128 bits.
    const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
#else
    const auto zeroDot25 = _mm_set1_pd(0.25);
    const auto zeroDot5 = _mm_set1_pd(0.5);
#endif

    // Number of destination pixels produced per iteration: half the number
    // of UInt16 values held by a 128-bit register.
    constexpr int DEST_ELTS =
        static_cast<int>(sizeof(zero) / sizeof(uint16_t)) / 2;
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
    {
        // Load 8 UInt16 from each line
        const auto firstLine = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
        const auto secondLine =
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(
                pSrcScanlineShifted + nChunkXSize));

        // Detect if all of the source values fit in 14 bits.
        // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
        // and we can do a much faster implementation.
        const auto maskTmp =
            _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
        // Gather the 8 per-sample "bits 14-15 set" indicators in a single
        // 64 bit integer, which is zero when all samples fit in 14 bits.
#if defined(__i386__) || defined(_M_IX86)
        uint64_t nMaskFitsIn14Bits = 0;
        _mm_storel_epi64(
            reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
            _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
#else
        const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
            _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
#endif
        if (nMaskFitsIn14Bits == 0)
        {
            // Fast path: everything is done with 32 bit integers and
            // single precision floats.

            // Multiplication of 16 bit values and horizontal
            // addition of 32 bit results
            const auto firstLineHSumSquare =
                _mm_madd_epi16(firstLine, firstLine);
            const auto secondLineHSumSquare =
                _mm_madd_epi16(secondLine, secondLine);
            // Vertical addition
            const auto sumSquares =
                _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
            // In theory we should take sqrt(sumSquares * 0.25f)
            // but given the rounding we do, this is equivalent to
            // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
            // sumSquares <= 4 * 16383^2
            const auto one32 = _mm_set1_epi32(1);
            const auto sumSquaresPlusOneDiv4 =
                _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
            // Take square root and truncate/floor to int32
            auto rms = _mm_cvttps_epi32(
                _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));

            // Round to upper value if it minimizes the
            // error |rms^2 - sumSquares/4|
            // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
            //    rms += 1;
            // which is equivalent to:
            // if( rms * rms + rms < (sumSquares+1) / 4 )
            //    rms += 1;
            // (_mm_madd_epi16(rms, rms) is used here to compute rms * rms
            // in each 32 bit lane)
            auto mask =
                _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
                                _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
            // The value of the mask will be -1 when the correction needs to
            // be applied
            rms = _mm_sub_epi32(rms, mask);
            // Pack each 32 bit RMS value to 16 bits
            rms = _mm_packs_epi32(rms, rms /* could be anything */);
            _mm_storel_epi64(
                reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
            pSrcScanlineShifted += 2 * DEST_ELTS;
            continue;
        }

        // General path: squares and sums are computed in double precision.

        // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
        // to 32 bit would result in 4 multiplications instead of 8, but
        // mullo/mulhi have a worse throughput than mul_pd.

        // Extend those UInt16s as UInt32s
        const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
        const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
        const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
        const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);

#ifdef __AVX2__
        // Multiplication of 32 bit values previously converted to 64 bit double
        const auto firstLineLoDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineLo));
        const auto firstLineHiDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineHi));
        const auto secondLineLoDbl =
            SQUARE_PD(_mm256_cvtepi32_pd(secondLineLo));
        const auto secondLineHiDbl =
            SQUARE_PD(_mm256_cvtepi32_pd(secondLineHi));

        // Vertical addition of squares
        const auto sumSquaresLo =
            _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
        const auto sumSquaresHi =
            _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);

        // Horizontal addition of squares
        const auto sumSquares =
            FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));

        const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);

        // Take square root and truncate/floor to int32
        auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
        // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
        // if( 0.5 < sumDivWeight - (rms * rms + rms) )
        //     rms += 1;
        const auto rmsDouble = _mm256_cvtepi32_pd(rms);
        const auto right = _mm256_sub_pd(
            sumDivWeight, _mm256_add_pd(SQUARE_PD(rmsDouble), rmsDouble));

        auto mask =
            _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
        // Extract 32-bit from each of the 4 64-bit masks
        // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
        // _MM_SHUFFLE(2,0,2,0)));
        mask = _mm256_permutevar8x32_ps(mask, permutation);
        const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));

        // Apply the correction
        rms = _mm_sub_epi32(rms, maskI);

        // Pack each 32 bit RMS value to 16 bits
        rms = _mm_packus_epi32(rms, rms /* could be anything */);
#else
        // Multiplication of 32 bit values previously converted to 64 bit double
        // (_mm_cvtepi32_pd converts the 2 lowest 32 bit values, hence the
        // 8 byte shift to reach the 2 highest ones)
        const auto firstLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineLo));
        const auto firstLineLoHi =
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
        const auto firstLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineHi));
        const auto firstLineHiHi =
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));

        const auto secondLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineLo));
        const auto secondLineLoHi =
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
        const auto secondLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineHi));
        const auto secondLineHiHi =
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));

        // Vertical addition of squares
        const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
        const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
        const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
        const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);

        // Horizontal addition of squares
        const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
        const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);

        const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
        const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
        // Take square root and truncate/floor to int32
        const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
        const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));

        // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
        // if( 0.5 < sumDivWeight - (rms * rms + rms) )
        //     rms += 1;
        const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
        const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
        const auto rightLo = _mm_sub_pd(
            sumDivWeightLo, _mm_add_pd(SQUARE_PD(rmsLoDouble), rmsLoDouble));
        const auto rightHi = _mm_sub_pd(
            sumDivWeightHi, _mm_add_pd(SQUARE_PD(rmsHiDouble), rmsHiDouble));

        const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
        const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
        // The value of the mask will be -1 when the correction needs to be
        // applied
        // (one 32 bit half of each of the four 64 bit masks is kept)
        const auto mask = _mm_castps_si128(_mm_shuffle_ps(
            maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));

        // Gather the two low values of rmsLo and rmsHi in a single register
        auto rms = _mm_castps_si128(
            _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
        // Apply the correction
        rms = _mm_sub_epi32(rms, mask);

        // Pack each 32 bit RMS value to 16 bits
        rms = sse2_packus_epi32(rms, rms /* could be anything */);
#endif

        _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
                         rms);
        // 2 source samples per line are consumed for each output pixel
        pSrcScanlineShifted += 2 * DEST_ELTS;
    }

    // Tell the caller where to resume reading the source line
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
    return iDstPixel;
}
871 :
872 : /************************************************************************/
873 : /* AverageUInt16SSE2() */
874 : /************************************************************************/
875 :
876 : static int
877 13 : AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
878 : const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
879 : uint16_t *CPL_RESTRICT pDstScanline)
880 : {
881 : // Optimized implementation for average on UInt16 by
882 : // processing by group of 8 output pixels.
883 :
884 13 : const auto mask = _mm_set1_epi32(0xFFFF);
885 13 : const auto two = _mm_set1_epi32(2);
886 13 : const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
887 :
888 13 : int iDstPixel = 0;
889 13 : constexpr int DEST_ELTS = static_cast<int>(sizeof(mask) / sizeof(uint16_t));
890 25 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
891 : {
892 : __m128i averageLow;
893 : // Load 8 UInt16 from each line
894 : {
895 12 : const auto firstLine = _mm_loadu_si128(
896 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
897 : const auto secondLine =
898 12 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
899 12 : pSrcScanlineShifted + nChunkXSize));
900 :
901 : // Horizontal addition and extension to 32 bit
902 36 : const auto horizAddFirstLine = _mm_add_epi32(
903 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
904 : const auto horizAddSecondLine =
905 36 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
906 : _mm_srli_epi32(secondLine, 16));
907 :
908 : // Vertical addition and average computation
909 : // average = (sum + 2) >> 2
910 24 : const auto sum = _mm_add_epi32(
911 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
912 12 : averageLow = _mm_srli_epi32(sum, 2);
913 : }
914 : // Load 8 UInt16 from each line
915 : __m128i averageHigh;
916 : {
917 : const auto firstLine =
918 12 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
919 12 : pSrcScanlineShifted + DEST_ELTS));
920 : const auto secondLine =
921 12 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
922 12 : pSrcScanlineShifted + DEST_ELTS + nChunkXSize));
923 :
924 : // Horizontal addition and extension to 32 bit
925 36 : const auto horizAddFirstLine = _mm_add_epi32(
926 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
927 : const auto horizAddSecondLine =
928 36 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
929 : _mm_srli_epi32(secondLine, 16));
930 :
931 : // Vertical addition and average computation
932 : // average = (sum + 2) >> 2
933 24 : const auto sum = _mm_add_epi32(
934 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
935 12 : averageHigh = _mm_srli_epi32(sum, 2);
936 : }
937 :
938 : // Pack each 32 bit average value to 16 bits
939 12 : auto average = sse2_packus_epi32(averageLow, averageHigh);
940 12 : _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
941 : average);
942 12 : pSrcScanlineShifted += 2 * DEST_ELTS;
943 : }
944 :
945 13 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
946 13 : return iDstPixel;
947 : }
948 :
949 : /************************************************************************/
950 : /* QuadraticMeanFloatSSE2() */
951 : /************************************************************************/
952 :
953 : #if !defined(ARM_V7)
954 :
#ifdef __SSE3__
// SSE3 has a native horizontal add instruction.
#define sse2_hadd_ps _mm_hadd_ps
#else
/** SSE2 fallback for the horizontal add of two vectors of 4 floats.
 *
 * @return (a0 + a1, a2 + a3, b0 + b1, b2 + b3)
 */
inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
{
    // (a0, a2, b0, b2) + (a1, a3, b1, b3)
    return _mm_add_ps(_mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)),
                      _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)));
}
#endif
965 :
// Width-agnostic aliases for the single precision intrinsics used by
// QuadraticMeanFloatSSE2(): they map to the 256 bit AVX variants when AVX2 is
// available, and to the 128 bit SSE variants otherwise.
#ifdef __AVX2__
#define set1_ps _mm256_set1_ps
#define loadu_ps _mm256_loadu_ps
#define andnot_ps _mm256_andnot_ps
#define and_ps _mm256_and_ps
#define max_ps _mm256_max_ps
#define shuffle_ps _mm256_shuffle_ps
#define div_ps _mm256_div_ps
#define cmpeq_ps(x, y) _mm256_cmp_ps((x), (y), _CMP_EQ_OQ)
#define mul_ps _mm256_mul_ps
#define add_ps _mm256_add_ps
#define hadd_ps _mm256_hadd_ps
#define sqrt_ps _mm256_sqrt_ps
#define or_ps _mm256_or_ps
#define unpacklo_ps _mm256_unpacklo_ps
#define unpackhi_ps _mm256_unpackhi_ps
#define storeu_ps _mm256_storeu_ps
#define blendv_ps _mm256_blendv_ps

/** Return the element-wise square of x. */
inline __m256 SQUARE_PS(__m256 x)
{
    return _mm256_mul_ps(x, x);
}

#else

#define set1_ps _mm_set1_ps
#define loadu_ps _mm_loadu_ps
#define andnot_ps _mm_andnot_ps
#define and_ps _mm_and_ps
#define max_ps _mm_max_ps
#define shuffle_ps _mm_shuffle_ps
#define div_ps _mm_div_ps
#define cmpeq_ps _mm_cmpeq_ps
#define mul_ps _mm_mul_ps
#define add_ps _mm_add_ps
#define hadd_ps sse2_hadd_ps
#define sqrt_ps _mm_sqrt_ps
#define or_ps _mm_or_ps
#define unpacklo_ps _mm_unpacklo_ps
#define unpackhi_ps _mm_unpackhi_ps
#define storeu_ps _mm_storeu_ps

/** Select, for each lane, b where mask is set and a otherwise.
 *
 * The mask is expected to come from a comparison, that is to have each lane
 * either all ones or all zeros: the bitwise fallback relies on it.
 */
inline __m128 blendv_ps(__m128 a, __m128 b, __m128 mask)
{
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    return _mm_blendv_ps(a, b, mask);
#else
    const __m128 keptFromA = _mm_andnot_ps(mask, a);
    const __m128 takenFromB = _mm_and_ps(mask, b);
    return _mm_or_ps(keptFromA, takenFromB);
#endif
}

/** Return the element-wise square of x. */
inline __m128 SQUARE_PS(__m128 x)
{
    return _mm_mul_ps(x, x);
}

/** No-op for 128 bit vectors: there is a single lane, hence nothing to
 * reorder. */
inline __m128 FIXUP_LANES(__m128 x)
{
    return x;
}

#endif
1029 :
1030 : static int
1031 : #if defined(__GNUC__)
1032 : __attribute__((noinline))
1033 : #endif
1034 66 : QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
1035 : const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1036 : float *CPL_RESTRICT pDstScanline)
1037 : {
1038 : // Optimized implementation for RMS on Float32 by
1039 : // processing by group of output pixels.
1040 66 : const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1041 :
1042 66 : int iDstPixel = 0;
1043 66 : const auto minus_zero = set1_ps(-0.0f);
1044 66 : const auto zeroDot25 = set1_ps(0.25f);
1045 66 : const auto one = set1_ps(1.0f);
1046 66 : const auto infv = set1_ps(std::numeric_limits<float>::infinity());
1047 66 : constexpr int DEST_ELTS = static_cast<int>(sizeof(one) / sizeof(float));
1048 :
1049 198 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1050 : {
1051 : // Load 2*DEST_ELTS Float32 from each line
1052 132 : auto firstLineLo = loadu_ps(pSrcScanlineShifted);
1053 132 : auto firstLineHi = loadu_ps(pSrcScanlineShifted + DEST_ELTS);
1054 132 : auto secondLineLo = loadu_ps(pSrcScanlineShifted + nChunkXSize);
1055 : auto secondLineHi =
1056 264 : loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize);
1057 :
1058 : // Take the absolute value
1059 132 : firstLineLo = andnot_ps(minus_zero, firstLineLo);
1060 132 : firstLineHi = andnot_ps(minus_zero, firstLineHi);
1061 132 : secondLineLo = andnot_ps(minus_zero, secondLineLo);
1062 132 : secondLineHi = andnot_ps(minus_zero, secondLineHi);
1063 :
1064 : auto firstLineEven =
1065 132 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1066 : auto firstLineOdd =
1067 132 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1068 : auto secondLineEven =
1069 132 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1070 : auto secondLineOdd =
1071 132 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1072 :
1073 : // Compute the maximum of each DEST_ELTS value to RMS-average
1074 396 : const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
1075 : max_ps(secondLineEven, secondLineEven));
1076 :
1077 : // Normalize each value by the maximum of the DEST_ELTS ones.
1078 : // This step is important to avoid that the square evaluates to infinity
1079 : // for sufficiently big input.
1080 132 : auto invMax = div_ps(one, maxV);
1081 : // Deal with 0 being the maximum to correct division by zero
1082 : // note: comparing to -0 leads to identical results as to comparing with
1083 : // 0
1084 264 : invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
1085 :
1086 132 : firstLineEven = mul_ps(firstLineEven, invMax);
1087 132 : firstLineOdd = mul_ps(firstLineOdd, invMax);
1088 132 : secondLineEven = mul_ps(secondLineEven, invMax);
1089 132 : secondLineOdd = mul_ps(secondLineOdd, invMax);
1090 :
1091 : // Compute squares
1092 132 : firstLineEven = SQUARE_PS(firstLineEven);
1093 132 : firstLineOdd = SQUARE_PS(firstLineOdd);
1094 132 : secondLineEven = SQUARE_PS(secondLineEven);
1095 132 : secondLineOdd = SQUARE_PS(secondLineOdd);
1096 :
1097 396 : const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
1098 : add_ps(secondLineEven, secondLineOdd));
1099 :
1100 396 : auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
1101 :
1102 : // Deal with infinity being the maximum
1103 132 : const auto maskIsInf = cmpeq_ps(maxV, infv);
1104 132 : rms = blendv_ps(rms, infv, maskIsInf);
1105 :
1106 132 : rms = FIXUP_LANES(rms);
1107 :
1108 132 : storeu_ps(&pDstScanline[iDstPixel], rms);
1109 132 : pSrcScanlineShifted += DEST_ELTS * 2;
1110 : }
1111 :
1112 66 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1113 66 : return iDstPixel;
1114 : }
1115 :
1116 : /************************************************************************/
1117 : /* AverageFloatSSE2() */
1118 : /************************************************************************/
1119 :
1120 50 : static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
1121 : const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1122 : float *CPL_RESTRICT pDstScanline)
1123 : {
1124 : // Optimized implementation for average on Float32 by
1125 : // processing by group of output pixels.
1126 50 : const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1127 :
1128 50 : int iDstPixel = 0;
1129 50 : const auto zeroDot25 = _mm_set1_ps(0.25f);
1130 50 : constexpr int DEST_ELTS =
1131 : static_cast<int>(sizeof(zeroDot25) / sizeof(float));
1132 :
1133 132 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1134 : {
1135 : // Load 2 * DEST_ELTS Float32 from each line
1136 : const auto firstLineLo =
1137 82 : _mm_mul_ps(_mm_loadu_ps(pSrcScanlineShifted), zeroDot25);
1138 164 : const auto firstLineHi = _mm_mul_ps(
1139 : _mm_loadu_ps(pSrcScanlineShifted + DEST_ELTS), zeroDot25);
1140 82 : const auto secondLineLo = _mm_mul_ps(
1141 82 : _mm_loadu_ps(pSrcScanlineShifted + nChunkXSize), zeroDot25);
1142 164 : const auto secondLineHi = _mm_mul_ps(
1143 82 : _mm_loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize),
1144 : zeroDot25);
1145 :
1146 : // Vertical addition
1147 82 : const auto tmpLo = _mm_add_ps(firstLineLo, secondLineLo);
1148 82 : const auto tmpHi = _mm_add_ps(firstLineHi, secondLineHi);
1149 :
1150 : // Horizontal addition
1151 82 : const auto average = sse2_hadd_ps(tmpLo, tmpHi);
1152 :
1153 82 : _mm_storeu_ps(&pDstScanline[iDstPixel], average);
1154 82 : pSrcScanlineShifted += DEST_ELTS * 2;
1155 : }
1156 :
1157 50 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1158 50 : return iDstPixel;
1159 : }
1160 :
1161 : /************************************************************************/
1162 : /* AverageDoubleSSE2() */
1163 : /************************************************************************/
1164 :
1165 : static int
1166 50 : AverageDoubleSSE2(int nDstXWidth, int nChunkXSize,
1167 : const double *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1168 : double *CPL_RESTRICT pDstScanline)
1169 : {
1170 : // Optimized implementation for average on Float64 by
1171 : // processing by group of output pixels.
1172 50 : const double *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1173 :
1174 50 : int iDstPixel = 0;
1175 50 : const auto zeroDot25 = _mm_set1_pd(0.25);
1176 50 : constexpr int DEST_ELTS =
1177 : static_cast<int>(sizeof(zeroDot25) / sizeof(double));
1178 :
1179 211 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1180 : {
1181 : // Load 4 * DEST_ELTS Float64 from each line
1182 161 : const auto firstLine0 = _mm_mul_pd(
1183 : _mm_loadu_pd(pSrcScanlineShifted + 0 * DEST_ELTS), zeroDot25);
1184 322 : const auto firstLine1 = _mm_mul_pd(
1185 : _mm_loadu_pd(pSrcScanlineShifted + 1 * DEST_ELTS), zeroDot25);
1186 161 : const auto secondLine0 = _mm_mul_pd(
1187 161 : _mm_loadu_pd(pSrcScanlineShifted + 0 * DEST_ELTS + nChunkXSize),
1188 : zeroDot25);
1189 322 : const auto secondLine1 = _mm_mul_pd(
1190 161 : _mm_loadu_pd(pSrcScanlineShifted + 1 * DEST_ELTS + nChunkXSize),
1191 : zeroDot25);
1192 :
1193 : // Vertical addition
1194 161 : const auto tmp0 = _mm_add_pd(firstLine0, secondLine0);
1195 161 : const auto tmp1 = _mm_add_pd(firstLine1, secondLine1);
1196 :
1197 : // Horizontal addition
1198 161 : const auto average0 = sse2_hadd_pd(tmp0, tmp1);
1199 :
1200 161 : _mm_storeu_pd(&pDstScanline[iDstPixel + 0], average0);
1201 161 : pSrcScanlineShifted += DEST_ELTS * 2;
1202 : }
1203 :
1204 50 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1205 50 : return iDstPixel;
1206 : }
1207 :
1208 : #endif
1209 :
1210 : #endif
1211 :
1212 : /************************************************************************/
1213 : /* GDALResampleChunk_AverageOrRMS() */
1214 : /************************************************************************/
1215 :
1216 : template <class T, class Tsum, GDALDataType eWrkDataType, bool bQuadraticMean>
1217 : static CPLErr
1218 2394 : GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
1219 : const T *pChunk, void **ppDstBuffer)
1220 : {
1221 2394 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1222 2394 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1223 2394 : const double dfSrcXDelta = args.dfSrcXDelta;
1224 2394 : const double dfSrcYDelta = args.dfSrcYDelta;
1225 2394 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1226 2394 : const int nChunkXOff = args.nChunkXOff;
1227 2394 : const int nChunkYOff = args.nChunkYOff;
1228 2394 : const int nChunkXSize = args.nChunkXSize;
1229 2394 : const int nChunkYSize = args.nChunkYSize;
1230 2394 : const int nDstXOff = args.nDstXOff;
1231 2394 : const int nDstXOff2 = args.nDstXOff2;
1232 2394 : const int nDstYOff = args.nDstYOff;
1233 2394 : const int nDstYOff2 = args.nDstYOff2;
1234 2394 : const char *pszResampling = args.pszResampling;
1235 2394 : bool bHasNoData = args.bHasNoData;
1236 2394 : const double dfNoDataValue = args.dfNoDataValue;
1237 2394 : const GDALColorTable *const poColorTable =
1238 : !bQuadraticMean &&
1239 : // AVERAGE_BIT2GRAYSCALE
1240 2317 : CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"))
1241 : ? nullptr
1242 : : args.poColorTable;
1243 2394 : const bool bPropagateNoData = args.bPropagateNoData;
1244 :
1245 2394 : T tNoDataValue = (!bHasNoData) ? 0 : static_cast<T>(dfNoDataValue);
1246 2394 : const T tReplacementVal =
1247 178 : bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
1248 58 : args.eOvrDataType, dfNoDataValue))
1249 : : 0;
1250 :
1251 2394 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
1252 2394 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1253 2394 : const int nDstXWidth = nDstXOff2 - nDstXOff;
1254 :
1255 : /* -------------------------------------------------------------------- */
1256 : /* Allocate buffers. */
1257 : /* -------------------------------------------------------------------- */
1258 2394 : *ppDstBuffer = static_cast<T *>(
1259 2394 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
1260 : GDALGetDataTypeSizeBytes(eWrkDataType)));
1261 2394 : if (*ppDstBuffer == nullptr)
1262 : {
1263 0 : return CE_Failure;
1264 : }
1265 2394 : T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
1266 :
1267 : struct PrecomputedXValue
1268 : {
1269 : int nLeftXOffShifted;
1270 : int nRightXOffShifted;
1271 : double dfLeftWeight;
1272 : double dfRightWeight;
1273 : double dfTotalWeightFullLine;
1274 : };
1275 :
1276 : PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
1277 2394 : VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(PrecomputedXValue)));
1278 :
1279 2394 : if (pasSrcX == nullptr)
1280 : {
1281 0 : return CE_Failure;
1282 : }
1283 :
1284 2394 : std::vector<GDALColorEntry> colorEntries;
1285 :
1286 2394 : if (poColorTable)
1287 : {
1288 5 : int nTransparentIdx = -1;
1289 5 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1290 :
1291 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1292 : // it as nodata value
1293 6 : if (bHasNoData && dfNoDataValue >= 0.0 &&
1294 1 : tNoDataValue < colorEntries.size())
1295 1 : colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1296 :
1297 : // Or if we have no explicit nodata, but a color table entry that is
1298 : // transparent, consider it as the nodata value
1299 4 : else if (!bHasNoData && nTransparentIdx >= 0)
1300 : {
1301 0 : bHasNoData = true;
1302 0 : tNoDataValue = static_cast<T>(nTransparentIdx);
1303 : }
1304 : }
1305 :
1306 : /* ==================================================================== */
1307 : /* Precompute inner loop constants. */
1308 : /* ==================================================================== */
1309 2394 : bool bSrcXSpacingIsTwo = true;
1310 2394 : int nLastSrcXOff2 = -1;
1311 856928 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1312 : {
1313 854534 : const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
1314 : // Apply some epsilon to avoid numerical precision issues
1315 854534 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
1316 854534 : const double dfSrcXOff2 =
1317 854534 : dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
1318 854534 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
1319 :
1320 854534 : if (nSrcXOff < nChunkXOff)
1321 0 : nSrcXOff = nChunkXOff;
1322 854534 : if (nSrcXOff2 == nSrcXOff)
1323 0 : nSrcXOff2++;
1324 854534 : if (nSrcXOff2 > nChunkRightXOff)
1325 1 : nSrcXOff2 = nChunkRightXOff;
1326 :
1327 854534 : pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
1328 854534 : pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
1329 854534 : nSrcXOff2 - nChunkXOff;
1330 21 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
1331 854534 : (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
1332 854534 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
1333 854534 : 1 - (nSrcXOff2 - dfSrcXOff2);
1334 854534 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
1335 854534 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
1336 854534 : if (nSrcXOff + 1 < nSrcXOff2)
1337 : {
1338 854513 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1339 854513 : nSrcXOff2 - nSrcXOff - 2;
1340 854513 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1341 854513 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
1342 : }
1343 :
1344 854534 : if (nSrcXOff2 - nSrcXOff != 2 ||
1345 733041 : (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
1346 : {
1347 120637 : bSrcXSpacingIsTwo = false;
1348 : }
1349 854534 : nLastSrcXOff2 = nSrcXOff2;
1350 : }
1351 :
1352 : /* ==================================================================== */
1353 : /* Loop over destination scanlines. */
1354 : /* ==================================================================== */
1355 722579 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1356 : {
1357 720185 : const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
1358 720185 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
1359 720185 : if (nSrcYOff < nChunkYOff)
1360 0 : nSrcYOff = nChunkYOff;
1361 :
1362 720185 : const double dfSrcYOff2 =
1363 720185 : dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
1364 720185 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
1365 720185 : if (nSrcYOff2 == nSrcYOff)
1366 0 : ++nSrcYOff2;
1367 720185 : if (nSrcYOff2 > nChunkBottomYOff)
1368 3 : nSrcYOff2 = nChunkBottomYOff;
1369 :
1370 720185 : T *const pDstScanline =
1371 720185 : pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
1372 :
1373 : /* --------------------------------------------------------------------
1374 : */
1375 : /* Loop over destination pixels */
1376 : /* --------------------------------------------------------------------
1377 : */
1378 720185 : if (poColorTable == nullptr)
1379 : {
1380 720070 : if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
1381 : pabyChunkNodataMask == nullptr)
1382 : {
1383 : if constexpr (eWrkDataType == GDT_UInt8 ||
1384 : eWrkDataType == GDT_UInt16)
1385 : {
1386 : // Optimized case : no nodata, overview by a factor of 2 and
1387 : // regular x and y src spacing.
1388 117150 : const T *pSrcScanlineShifted =
1389 117150 : pChunk + pasSrcX[0].nLeftXOffShifted +
1390 117150 : static_cast<size_t>(nSrcYOff - nChunkYOff) *
1391 117150 : nChunkXSize;
1392 117150 : int iDstPixel = 0;
1393 : #ifdef USE_SSE2
1394 : if constexpr (eWrkDataType == GDT_UInt8)
1395 : {
1396 : if constexpr (bQuadraticMean)
1397 : {
1398 5389 : iDstPixel = QuadraticMeanByteSSE2OrAVX2(
1399 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1400 : pDstScanline);
1401 : }
1402 : else
1403 : {
1404 111734 : iDstPixel = AverageByteSSE2OrAVX2(
1405 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1406 : pDstScanline);
1407 : }
1408 : }
1409 : else
1410 : {
1411 : static_assert(eWrkDataType == GDT_UInt16);
1412 : if constexpr (bQuadraticMean)
1413 : {
1414 14 : iDstPixel = QuadraticMeanUInt16SSE2(
1415 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1416 : pDstScanline);
1417 : }
1418 : else
1419 : {
1420 13 : iDstPixel = AverageUInt16SSE2(
1421 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1422 : pDstScanline);
1423 : }
1424 : }
1425 : #endif
1426 291609 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1427 : {
1428 174459 : Tsum nTotal = 0;
1429 : T nVal;
1430 : if constexpr (bQuadraticMean)
1431 52 : nTotal =
1432 52 : SQUARE<Tsum>(pSrcScanlineShifted[0]) +
1433 52 : SQUARE<Tsum>(pSrcScanlineShifted[1]) +
1434 52 : SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
1435 52 : SQUARE<Tsum>(
1436 52 : pSrcScanlineShifted[1 + nChunkXSize]);
1437 : else
1438 174407 : nTotal = pSrcScanlineShifted[0] +
1439 174407 : pSrcScanlineShifted[1] +
1440 174407 : pSrcScanlineShifted[nChunkXSize] +
1441 174407 : pSrcScanlineShifted[1 + nChunkXSize];
1442 :
1443 174459 : constexpr int nTotalWeight = 4;
1444 : if constexpr (bQuadraticMean)
1445 52 : nVal = ComputeIntegerRMS_4values<T>(nTotal);
1446 : else
1447 174407 : nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
1448 : nTotalWeight);
1449 :
1450 : // No need to compare nVal against tNoDataValue as we
1451 : // are in a case where pabyChunkNodataMask == nullptr
1452 : // implies the absence of nodata value.
1453 174459 : pDstScanline[iDstPixel] = nVal;
1454 174459 : pSrcScanlineShifted += 2;
1455 : }
1456 : }
1457 : else
1458 : {
1459 : static_assert(eWrkDataType == GDT_Float32 ||
1460 : eWrkDataType == GDT_Float64);
1461 202 : const T *pSrcScanlineShifted =
1462 202 : pChunk + pasSrcX[0].nLeftXOffShifted +
1463 202 : static_cast<size_t>(nSrcYOff - nChunkYOff) *
1464 202 : nChunkXSize;
1465 202 : int iDstPixel = 0;
1466 : #if defined(USE_SSE2) && !defined(ARM_V7)
1467 : if constexpr (eWrkDataType == GDT_Float32)
1468 : {
1469 : static_assert(std::is_same_v<T, float>);
1470 : if constexpr (bQuadraticMean)
1471 : {
1472 66 : iDstPixel = QuadraticMeanFloatSSE2(
1473 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1474 : pDstScanline);
1475 : }
1476 : else
1477 : {
1478 50 : iDstPixel = AverageFloatSSE2(
1479 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1480 : pDstScanline);
1481 : }
1482 : }
1483 : else
1484 : {
1485 : if constexpr (!bQuadraticMean)
1486 : {
1487 50 : iDstPixel = AverageDoubleSSE2(
1488 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1489 : pDstScanline);
1490 : }
1491 : }
1492 : #endif
1493 :
1494 726 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1495 : {
1496 : T nVal;
1497 :
1498 : if constexpr (bQuadraticMean)
1499 : {
1500 : // Avoid issues with large values by renormalizing
1501 96 : const auto max = std::max(
1502 420 : {std::fabs(pSrcScanlineShifted[0]),
1503 420 : std::fabs(pSrcScanlineShifted[1]),
1504 420 : std::fabs(pSrcScanlineShifted[nChunkXSize]),
1505 420 : std::fabs(
1506 420 : pSrcScanlineShifted[1 + nChunkXSize])});
1507 420 : if (max == 0)
1508 : {
1509 8 : nVal = 0;
1510 : }
1511 412 : else if (std::isinf(max))
1512 : {
1513 : // If there is at least one infinity value,
1514 : // then just summing, and taking the abs
1515 : // value will give the expected result:
1516 : // * +inf if all values are +inf
1517 : // * +inf if all values are -inf
1518 : // * NaN otherwise
1519 82 : nVal = std::fabs(
1520 82 : pSrcScanlineShifted[0] +
1521 82 : pSrcScanlineShifted[1] +
1522 82 : pSrcScanlineShifted[nChunkXSize] +
1523 82 : pSrcScanlineShifted[1 + nChunkXSize]);
1524 : }
1525 : else
1526 : {
1527 330 : const auto inv_max = static_cast<T>(1.0) / max;
1528 330 : nVal =
1529 : max *
1530 330 : std::sqrt(
1531 : static_cast<T>(0.25) *
1532 330 : (SQUARE(pSrcScanlineShifted[0] *
1533 330 : inv_max) +
1534 330 : SQUARE(pSrcScanlineShifted[1] *
1535 330 : inv_max) +
1536 330 : SQUARE(
1537 330 : pSrcScanlineShifted[nChunkXSize] *
1538 330 : inv_max) +
1539 330 : SQUARE(
1540 330 : pSrcScanlineShifted[1 +
1541 : nChunkXSize] *
1542 : inv_max)));
1543 : }
1544 : }
1545 : else
1546 : {
1547 104 : constexpr auto weight = static_cast<T>(0.25);
1548 : // Multiply each value by weight to avoid
1549 : // potential overflow
1550 104 : nVal =
1551 104 : (weight * pSrcScanlineShifted[0] +
1552 104 : weight * pSrcScanlineShifted[1] +
1553 104 : weight * pSrcScanlineShifted[nChunkXSize] +
1554 104 : weight * pSrcScanlineShifted[1 + nChunkXSize]);
1555 : }
1556 :
1557 : // No need to compare nVal against tNoDataValue as we
1558 : // are in a case where pabyChunkNodataMask == nullptr
1559 : // implies the absence of nodata value.
1560 524 : pDstScanline[iDstPixel] = nVal;
1561 524 : pSrcScanlineShifted += 2;
1562 : }
1563 117352 : }
1564 : }
1565 : else
1566 : {
1567 17 : const double dfBottomWeight =
1568 602718 : (nSrcYOff + 1 == nSrcYOff2) ? 1.0
1569 602701 : : 1.0 - (dfSrcYOff - nSrcYOff);
1570 602718 : const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
1571 602718 : nSrcYOff -= nChunkYOff;
1572 602718 : nSrcYOff2 -= nChunkYOff;
1573 :
1574 602718 : double dfTotalWeightFullColumn = dfBottomWeight;
1575 602718 : if (nSrcYOff + 1 < nSrcYOff2)
1576 : {
1577 602701 : dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
1578 602701 : dfTotalWeightFullColumn += dfTopWeight;
1579 : }
1580 :
1581 18759673 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1582 : {
1583 18156933 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1584 18156933 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1585 :
1586 18156933 : double dfTotal = 0;
1587 18156933 : double dfTotalWeight = 0;
1588 18156933 : [[maybe_unused]] double dfMulFactor = 1.0;
1589 18156933 : [[maybe_unused]] double dfInvMulFactor = 1.0;
1590 18156933 : constexpr bool bUseMulFactor =
1591 : (eWrkDataType == GDT_Float32 ||
1592 : eWrkDataType == GDT_Float64);
1593 18156933 : if (pabyChunkNodataMask == nullptr)
1594 : {
1595 : if constexpr (bUseMulFactor)
1596 : {
1597 : if constexpr (bQuadraticMean)
1598 : {
1599 80 : T mulFactor = 0;
1600 80 : auto pChunkShifted =
1601 80 : pChunk +
1602 80 : static_cast<size_t>(nSrcYOff) * nChunkXSize;
1603 :
1604 240 : for (int iY = nSrcYOff; iY < nSrcYOff2;
1605 160 : ++iY, pChunkShifted += nChunkXSize)
1606 : {
1607 480 : for (int iX = nSrcXOff; iX < nSrcXOff2;
1608 : ++iX)
1609 640 : mulFactor = std::max(
1610 : mulFactor,
1611 320 : std::fabs(pChunkShifted[iX]));
1612 : }
1613 80 : dfMulFactor = double(mulFactor);
1614 142 : dfInvMulFactor =
1615 62 : dfMulFactor > 0 &&
1616 62 : std::isfinite(dfMulFactor)
1617 : ? 1.0 / dfMulFactor
1618 : : 1.0;
1619 : }
1620 : else
1621 : {
1622 139 : dfMulFactor = (nSrcYOff2 - nSrcYOff) *
1623 139 : (nSrcXOff2 - nSrcXOff);
1624 139 : dfInvMulFactor = 1.0 / dfMulFactor;
1625 : }
1626 : }
1627 :
1628 1746545 : auto pChunkShifted =
1629 227 : pChunk +
1630 1746545 : static_cast<size_t>(nSrcYOff) * nChunkXSize;
1631 1746545 : int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1632 1746545 : double dfWeightY = dfBottomWeight;
1633 3493539 : while (true)
1634 : {
1635 : double dfTotalLine;
1636 : if constexpr (bQuadraticMean)
1637 : {
1638 : // Left pixel
1639 : {
1640 216 : const T val = pChunkShifted[nSrcXOff];
1641 216 : dfTotalLine =
1642 216 : SQUARE(double(val) * dfInvMulFactor) *
1643 216 : pasSrcX[iDstPixel].dfLeftWeight;
1644 : }
1645 :
1646 216 : if (nSrcXOff + 1 < nSrcXOff2)
1647 : {
1648 : // Middle pixels
1649 216 : for (int iX = nSrcXOff + 1;
1650 536 : iX < nSrcXOff2 - 1; ++iX)
1651 : {
1652 320 : const T val = pChunkShifted[iX];
1653 320 : dfTotalLine += SQUARE(double(val) *
1654 : dfInvMulFactor);
1655 : }
1656 :
1657 : // Right pixel
1658 : {
1659 216 : const T val =
1660 216 : pChunkShifted[nSrcXOff2 - 1];
1661 216 : dfTotalLine +=
1662 216 : SQUARE(double(val) *
1663 216 : dfInvMulFactor) *
1664 216 : pasSrcX[iDstPixel].dfRightWeight;
1665 : }
1666 : }
1667 : }
1668 : else
1669 : {
1670 : // Left pixel
1671 : {
1672 5239868 : const T val = pChunkShifted[nSrcXOff];
1673 5239868 : dfTotalLine =
1674 5239868 : double(val) * dfInvMulFactor *
1675 5239868 : pasSrcX[iDstPixel].dfLeftWeight;
1676 : }
1677 :
1678 5239868 : if (nSrcXOff + 1 < nSrcXOff2)
1679 : {
1680 : // Middle pixels
1681 4239442 : for (int iX = nSrcXOff + 1;
1682 64183238 : iX < nSrcXOff2 - 1; ++iX)
1683 : {
1684 59943836 : const T val = pChunkShifted[iX];
1685 59943836 : dfTotalLine +=
1686 59943836 : double(val) * dfInvMulFactor;
1687 : }
1688 :
1689 : // Right pixel
1690 : {
1691 4239442 : const T val =
1692 4239442 : pChunkShifted[nSrcXOff2 - 1];
1693 4239442 : dfTotalLine +=
1694 4239442 : double(val) * dfInvMulFactor *
1695 4239442 : pasSrcX[iDstPixel].dfRightWeight;
1696 : }
1697 : }
1698 : }
1699 :
1700 5240084 : dfTotal += dfTotalLine * dfWeightY;
1701 5240084 : --nCounterY;
1702 5240084 : if (nCounterY < 0)
1703 1746545 : break;
1704 3493539 : pChunkShifted += nChunkXSize;
1705 3493539 : dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
1706 : }
1707 :
1708 1746545 : dfTotalWeight =
1709 1746545 : pasSrcX[iDstPixel].dfTotalWeightFullLine *
1710 : dfTotalWeightFullColumn;
1711 : }
1712 : else
1713 : {
1714 16410398 : size_t nCount = 0;
1715 71788794 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1716 : {
1717 55378396 : const auto pChunkShifted =
1718 55378396 : pChunk + static_cast<size_t>(iY) * nChunkXSize;
1719 :
1720 55378396 : double dfTotalLine = 0;
1721 55378396 : double dfTotalWeightLine = 0;
1722 : // Left pixel
1723 : {
1724 55378396 : const int iX = nSrcXOff;
1725 55378396 : const T val = pChunkShifted[iX];
1726 55378396 : if (pabyChunkNodataMask
1727 55378396 : [iX +
1728 55378396 : static_cast<size_t>(iY) * nChunkXSize])
1729 : {
1730 23518643 : nCount++;
1731 23518643 : const double dfWeightX =
1732 23518643 : pasSrcX[iDstPixel].dfLeftWeight;
1733 23518643 : dfTotalWeightLine = dfWeightX;
1734 : if constexpr (bQuadraticMean)
1735 60 : dfTotalLine =
1736 60 : SQUARE(double(val)) * dfWeightX;
1737 : else
1738 23518583 : dfTotalLine = double(val) * dfWeightX;
1739 : }
1740 : }
1741 :
1742 55378396 : if (nSrcXOff < nSrcXOff2 - 1)
1743 : {
1744 : // Middle pixels
1745 152899196 : for (int iX = nSrcXOff + 1; iX < nSrcXOff2 - 1;
1746 : ++iX)
1747 : {
1748 97521100 : const T val = pChunkShifted[iX];
1749 97521100 : if (pabyChunkNodataMask
1750 97521100 : [iX + static_cast<size_t>(iY) *
1751 97521100 : nChunkXSize])
1752 : {
1753 39728400 : nCount++;
1754 39728400 : dfTotalWeightLine += 1;
1755 : if constexpr (bQuadraticMean)
1756 0 : dfTotalLine += SQUARE(double(val));
1757 : else
1758 39728400 : dfTotalLine += double(val);
1759 : }
1760 : }
1761 :
1762 : // Right pixel
1763 : {
1764 55378396 : const int iX = nSrcXOff2 - 1;
1765 55378396 : const T val = pChunkShifted[iX];
1766 55378396 : if (pabyChunkNodataMask
1767 55378396 : [iX + static_cast<size_t>(iY) *
1768 55378396 : nChunkXSize])
1769 : {
1770 23517911 : nCount++;
1771 23517911 : const double dfWeightX =
1772 23517911 : pasSrcX[iDstPixel].dfRightWeight;
1773 23517911 : dfTotalWeightLine += dfWeightX;
1774 : if constexpr (bQuadraticMean)
1775 61 : dfTotalLine +=
1776 61 : SQUARE(double(val)) * dfWeightX;
1777 : else
1778 23517850 : dfTotalLine +=
1779 23517850 : double(val) * dfWeightX;
1780 : }
1781 : }
1782 : }
1783 :
1784 94346294 : const double dfWeightY =
1785 : (iY == nSrcYOff) ? dfBottomWeight
1786 38967998 : : (iY + 1 == nSrcYOff2) ? dfTopWeight
1787 : : 1.0;
1788 55378396 : dfTotal += dfTotalLine * dfWeightY;
1789 55378396 : dfTotalWeight += dfTotalWeightLine * dfWeightY;
1790 : }
1791 :
1792 16410398 : if (nCount == 0 ||
1793 8 : (bPropagateNoData &&
1794 : nCount <
1795 8 : static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
1796 8 : (nSrcXOff2 - nSrcXOff)))
1797 : {
1798 9609422 : pDstScanline[iDstPixel] = tNoDataValue;
1799 9609422 : continue;
1800 : }
1801 : }
1802 : if constexpr (eWrkDataType == GDT_UInt8)
1803 : {
1804 : T nVal;
1805 : if constexpr (bQuadraticMean)
1806 38 : nVal = ComputeIntegerRMS<T, int>(dfTotal,
1807 : dfTotalWeight);
1808 : else
1809 8547230 : nVal =
1810 8547230 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1811 8547268 : if (bHasNoData && nVal == tNoDataValue)
1812 0 : nVal = tReplacementVal;
1813 8547268 : pDstScanline[iDstPixel] = nVal;
1814 : }
1815 : else if constexpr (eWrkDataType == GDT_UInt16)
1816 : {
1817 : T nVal;
1818 : if constexpr (bQuadraticMean)
1819 4 : nVal = ComputeIntegerRMS<T, uint64_t>(
1820 : dfTotal, dfTotalWeight);
1821 : else
1822 4 : nVal =
1823 4 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1824 8 : if (bHasNoData && nVal == tNoDataValue)
1825 0 : nVal = tReplacementVal;
1826 8 : pDstScanline[iDstPixel] = nVal;
1827 : }
1828 : else
1829 : {
1830 : T nVal;
1831 : if constexpr (bQuadraticMean)
1832 : {
1833 : if constexpr (bUseMulFactor)
1834 81 : nVal = static_cast<T>(
1835 48 : dfMulFactor *
1836 81 : sqrt(dfTotal / dfTotalWeight));
1837 : else
1838 : nVal = static_cast<T>(
1839 : sqrt(dfTotal / dfTotalWeight));
1840 : }
1841 : else
1842 : {
1843 : if constexpr (bUseMulFactor)
1844 184 : nVal = static_cast<T>(
1845 184 : dfMulFactor * (dfTotal / dfTotalWeight));
1846 : else
1847 : nVal = static_cast<T>(dfTotal / dfTotalWeight);
1848 : }
1849 265 : if (bHasNoData && nVal == tNoDataValue)
1850 2 : nVal = tReplacementVal;
1851 265 : pDstScanline[iDstPixel] = nVal;
1852 : }
1853 : }
1854 : }
1855 : }
1856 : else
1857 : {
1858 115 : nSrcYOff -= nChunkYOff;
1859 115 : nSrcYOff2 -= nChunkYOff;
1860 :
1861 6590 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1862 : {
1863 6475 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1864 6475 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1865 :
1866 6475 : uint64_t nTotalR = 0;
1867 6475 : uint64_t nTotalG = 0;
1868 6475 : uint64_t nTotalB = 0;
1869 6475 : size_t nCount = 0;
1870 :
1871 19425 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1872 : {
1873 38850 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
1874 : {
1875 25900 : const T val =
1876 25900 : pChunk[iX + static_cast<size_t>(iY) * nChunkXSize];
1877 : // cppcheck-suppress unsignedLessThanZero
1878 25900 : if (val < 0 || val >= colorEntries.size())
1879 0 : continue;
1880 25900 : const size_t idx = static_cast<size_t>(val);
1881 25900 : const auto &entry = colorEntries[idx];
1882 25900 : if (entry.c4)
1883 : {
1884 : if constexpr (bQuadraticMean)
1885 : {
1886 800 : nTotalR += SQUARE<int>(entry.c1);
1887 800 : nTotalG += SQUARE<int>(entry.c2);
1888 800 : nTotalB += SQUARE<int>(entry.c3);
1889 800 : ++nCount;
1890 : }
1891 : else
1892 : {
1893 13328 : nTotalR += entry.c1;
1894 13328 : nTotalG += entry.c2;
1895 13328 : nTotalB += entry.c3;
1896 13328 : ++nCount;
1897 : }
1898 : }
1899 : }
1900 : }
1901 :
1902 6475 : if (nCount == 0 ||
1903 0 : (bPropagateNoData &&
1904 0 : nCount < static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
1905 0 : (nSrcXOff2 - nSrcXOff)))
1906 : {
1907 2838 : pDstScanline[iDstPixel] = tNoDataValue;
1908 : }
1909 : else
1910 : {
1911 : GDALColorEntry color;
1912 : if constexpr (bQuadraticMean)
1913 : {
1914 200 : color.c1 =
1915 200 : static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
1916 200 : color.c2 =
1917 200 : static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
1918 200 : color.c3 =
1919 200 : static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
1920 : }
1921 : else
1922 : {
1923 3437 : color.c1 =
1924 3437 : static_cast<short>((nTotalR + nCount / 2) / nCount);
1925 3437 : color.c2 =
1926 3437 : static_cast<short>((nTotalG + nCount / 2) / nCount);
1927 3437 : color.c3 =
1928 3437 : static_cast<short>((nTotalB + nCount / 2) / nCount);
1929 : }
1930 3637 : pDstScanline[iDstPixel] =
1931 3637 : static_cast<T>(BestColorEntry(colorEntries, color));
1932 : }
1933 : }
1934 : }
1935 : }
1936 :
1937 2394 : CPLFree(pasSrcX);
1938 :
1939 2394 : return CE_None;
1940 : }
1941 :
1942 : template <bool bQuadraticMean>
1943 : static CPLErr
1944 2394 : GDALResampleChunk_AverageOrRMSInternal(const GDALOverviewResampleArgs &args,
1945 : const void *pChunk, void **ppDstBuffer,
1946 : GDALDataType *peDstBufferDataType)
1947 : {
1948 2394 : *peDstBufferDataType = args.eWrkDataType;
1949 2394 : switch (args.eWrkDataType)
1950 : {
1951 2263 : case GDT_UInt8:
1952 : {
1953 : return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_UInt8,
1954 2263 : bQuadraticMean>(
1955 2263 : args, static_cast<const GByte *>(pChunk), ppDstBuffer);
1956 : }
1957 :
1958 11 : case GDT_UInt16:
1959 : {
1960 : if constexpr (bQuadraticMean)
1961 : {
1962 : // Use double as accumulation type, because UInt32 could overflow
1963 : return GDALResampleChunk_AverageOrRMS_T<
1964 6 : GUInt16, double, GDT_UInt16, bQuadraticMean>(
1965 6 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1966 : }
1967 : else
1968 : {
1969 : return GDALResampleChunk_AverageOrRMS_T<
1970 5 : GUInt16, GUInt32, GDT_UInt16, bQuadraticMean>(
1971 5 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1972 : }
1973 : }
1974 :
1975 73 : case GDT_Float32:
1976 : {
1977 : return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32,
1978 73 : bQuadraticMean>(
1979 73 : args, static_cast<const float *>(pChunk), ppDstBuffer);
1980 : }
1981 :
1982 47 : case GDT_Float64:
1983 : {
1984 : return GDALResampleChunk_AverageOrRMS_T<double, double, GDT_Float64,
1985 47 : bQuadraticMean>(
1986 47 : args, static_cast<const double *>(pChunk), ppDstBuffer);
1987 : }
1988 :
1989 0 : default:
1990 0 : break;
1991 : }
1992 :
1993 0 : CPLAssert(false);
1994 : return CE_Failure;
1995 : }
1996 :
1997 : static CPLErr
1998 2394 : GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
1999 : const void *pChunk, void **ppDstBuffer,
2000 : GDALDataType *peDstBufferDataType)
2001 : {
2002 2394 : if (EQUAL(args.pszResampling, "RMS"))
2003 77 : return GDALResampleChunk_AverageOrRMSInternal<true>(
2004 77 : args, pChunk, ppDstBuffer, peDstBufferDataType);
2005 : else
2006 2317 : return GDALResampleChunk_AverageOrRMSInternal<false>(
2007 2317 : args, pChunk, ppDstBuffer, peDstBufferDataType);
2008 : }
2009 :
2010 : /************************************************************************/
2011 : /* GDALResampleChunk_Gauss() */
2012 : /************************************************************************/
2013 :
2014 86 : static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
2015 : const void *pChunk, void **ppDstBuffer,
2016 : GDALDataType *peDstBufferDataType)
2017 :
2018 : {
2019 86 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2020 86 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2021 86 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2022 86 : const int nChunkXOff = args.nChunkXOff;
2023 86 : const int nChunkXSize = args.nChunkXSize;
2024 86 : const int nChunkYOff = args.nChunkYOff;
2025 86 : const int nChunkYSize = args.nChunkYSize;
2026 86 : const int nDstXOff = args.nDstXOff;
2027 86 : const int nDstXOff2 = args.nDstXOff2;
2028 86 : const int nDstYOff = args.nDstYOff;
2029 86 : const int nDstYOff2 = args.nDstYOff2;
2030 86 : const bool bHasNoData = args.bHasNoData;
2031 86 : double dfNoDataValue = args.dfNoDataValue;
2032 86 : const GDALColorTable *poColorTable = args.poColorTable;
2033 :
2034 86 : const double *const padfChunk = static_cast<const double *>(pChunk);
2035 :
2036 86 : *ppDstBuffer =
2037 86 : VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
2038 : GDALGetDataTypeSizeBytes(GDT_Float64));
2039 86 : if (*ppDstBuffer == nullptr)
2040 : {
2041 0 : return CE_Failure;
2042 : }
2043 86 : *peDstBufferDataType = GDT_Float64;
2044 86 : double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
2045 :
2046 : /* -------------------------------------------------------------------- */
2047 : /* Create the filter kernel and allocate scanline buffer. */
2048 : /* -------------------------------------------------------------------- */
2049 86 : int nGaussMatrixDim = 3;
2050 : const int *panGaussMatrix;
2051 86 : constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
2052 86 : constexpr int anGaussMatrix5x5[] = {1, 4, 6, 4, 1, 4, 16, 24, 16,
2053 : 4, 6, 24, 36, 24, 6, 4, 16, 24,
2054 : 16, 4, 1, 4, 6, 4, 1};
2055 86 : constexpr int anGaussMatrix7x7[] = {
2056 : 1, 6, 15, 20, 15, 6, 1, 6, 36, 90, 120, 90, 36,
2057 : 6, 15, 90, 225, 300, 225, 90, 15, 20, 120, 300, 400, 300,
2058 : 120, 20, 15, 90, 225, 300, 225, 90, 15, 6, 36, 90, 120,
2059 : 90, 36, 6, 1, 6, 15, 20, 15, 6, 1};
2060 :
2061 86 : const int nOXSize = args.nOvrXSize;
2062 86 : const int nOYSize = args.nOvrYSize;
2063 86 : const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
2064 :
2065 : // matrix for gauss filter
2066 86 : if (nResYFactor <= 2)
2067 : {
2068 85 : panGaussMatrix = anGaussMatrix3x3;
2069 85 : nGaussMatrixDim = 3;
2070 : }
2071 1 : else if (nResYFactor <= 4)
2072 : {
2073 0 : panGaussMatrix = anGaussMatrix5x5;
2074 0 : nGaussMatrixDim = 5;
2075 : }
2076 : else
2077 : {
2078 1 : panGaussMatrix = anGaussMatrix7x7;
2079 1 : nGaussMatrixDim = 7;
2080 : }
2081 :
2082 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
2083 : int *panGaussMatrixDup = static_cast<int *>(
2084 : CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
2085 : memcpy(panGaussMatrixDup, panGaussMatrix,
2086 : sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
2087 : panGaussMatrix = panGaussMatrixDup;
2088 : #endif
2089 :
2090 86 : if (!bHasNoData)
2091 79 : dfNoDataValue = 0.0;
2092 :
2093 86 : std::vector<GDALColorEntry> colorEntries;
2094 86 : int nTransparentIdx = -1;
2095 86 : if (poColorTable)
2096 2 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
2097 :
2098 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
2099 : // it as nodata value.
2100 92 : if (bHasNoData && dfNoDataValue >= 0.0 &&
2101 6 : dfNoDataValue < colorEntries.size())
2102 0 : colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
2103 :
2104 : // Or if we have no explicit nodata, but a color table entry that is
2105 : // transparent, consider it as the nodata value.
2106 86 : else if (!bHasNoData && nTransparentIdx >= 0)
2107 : {
2108 0 : dfNoDataValue = nTransparentIdx;
2109 : }
2110 :
2111 86 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2112 86 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2113 86 : const int nDstXWidth = nDstXOff2 - nDstXOff;
2114 :
2115 : /* ==================================================================== */
2116 : /* Loop over destination scanlines. */
2117 : /* ==================================================================== */
2118 16488 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2119 : {
2120 16402 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
2121 16402 : int nSrcYOff2 =
2122 16402 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
2123 :
2124 16402 : if (nSrcYOff < nChunkYOff)
2125 : {
2126 0 : nSrcYOff = nChunkYOff;
2127 0 : nSrcYOff2++;
2128 : }
2129 :
2130 16402 : const int iSizeY = nSrcYOff2 - nSrcYOff;
2131 16402 : nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
2132 16402 : nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
2133 :
2134 16402 : if (nSrcYOff2 > nChunkBottomYOff ||
2135 16359 : (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
2136 : {
2137 44 : nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
2138 : }
2139 :
2140 16402 : int nYShiftGaussMatrix = 0;
2141 16402 : if (nSrcYOff < nChunkYOff)
2142 : {
2143 0 : nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
2144 0 : nSrcYOff = nChunkYOff;
2145 : }
2146 :
2147 16402 : const double *const padfSrcScanline =
2148 16402 : padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
2149 16402 : const GByte *pabySrcScanlineNodataMask = nullptr;
2150 16402 : if (pabyChunkNodataMask != nullptr)
2151 152 : pabySrcScanlineNodataMask =
2152 152 : pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
2153 :
2154 : /* --------------------------------------------------------------------
2155 : */
2156 : /* Loop over destination pixels */
2157 : /* --------------------------------------------------------------------
2158 : */
2159 16402 : double *const padfDstScanline =
2160 16402 : padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
2161 4149980 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2162 : {
2163 4133580 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
2164 4133580 : int nSrcXOff2 =
2165 4133580 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
2166 :
2167 4133580 : if (nSrcXOff < nChunkXOff)
2168 : {
2169 0 : nSrcXOff = nChunkXOff;
2170 0 : nSrcXOff2++;
2171 : }
2172 :
2173 4133580 : const int iSizeX = nSrcXOff2 - nSrcXOff;
2174 4133580 : nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
2175 4133580 : nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
2176 :
2177 4133580 : if (nSrcXOff2 > nChunkRightXOff ||
2178 4127930 : (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
2179 : {
2180 5650 : nSrcXOff2 =
2181 5650 : std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
2182 : }
2183 :
2184 4133580 : int nXShiftGaussMatrix = 0;
2185 4133580 : if (nSrcXOff < nChunkXOff)
2186 : {
2187 0 : nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
2188 0 : nSrcXOff = nChunkXOff;
2189 : }
2190 :
2191 4133580 : if (poColorTable == nullptr)
2192 : {
2193 4133380 : double dfTotal = 0.0;
2194 4133380 : GInt64 nCount = 0;
2195 4133380 : const int *panLineWeight =
2196 4133380 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
2197 : nXShiftGaussMatrix;
2198 :
2199 16527900 : for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
2200 12394500 : ++iY, ++j, panLineWeight += nGaussMatrixDim)
2201 : {
2202 49561300 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2203 : {
2204 37166800 : const double val =
2205 37166800 : padfSrcScanline[iX - nChunkXOff +
2206 37166800 : static_cast<GPtrDiff_t>(iY -
2207 37166800 : nSrcYOff) *
2208 37166800 : nChunkXSize];
2209 37166800 : if (pabySrcScanlineNodataMask == nullptr ||
2210 32872 : pabySrcScanlineNodataMask[iX - nChunkXOff +
2211 32872 : static_cast<GPtrDiff_t>(
2212 32872 : iY - nSrcYOff) *
2213 32872 : nChunkXSize])
2214 : {
2215 37146100 : const int nWeight = panLineWeight[i];
2216 37146100 : dfTotal += val * nWeight;
2217 37146100 : nCount += nWeight;
2218 : }
2219 : }
2220 : }
2221 :
2222 4133380 : if (nCount == 0)
2223 : {
2224 2217 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2225 : }
2226 : else
2227 : {
2228 4131160 : padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
2229 : }
2230 : }
2231 : else
2232 : {
2233 200 : GInt64 nTotalR = 0;
2234 200 : GInt64 nTotalG = 0;
2235 200 : GInt64 nTotalB = 0;
2236 200 : GInt64 nTotalWeight = 0;
2237 200 : const int *panLineWeight =
2238 200 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
2239 : nXShiftGaussMatrix;
2240 :
2241 780 : for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
2242 580 : ++iY, ++j, panLineWeight += nGaussMatrixDim)
2243 : {
2244 2262 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2245 : {
2246 1682 : const double val =
2247 1682 : padfSrcScanline[iX - nChunkXOff +
2248 1682 : static_cast<GPtrDiff_t>(iY -
2249 1682 : nSrcYOff) *
2250 1682 : nChunkXSize];
2251 1682 : if (val < 0 || val >= colorEntries.size())
2252 0 : continue;
2253 :
2254 1682 : size_t idx = static_cast<size_t>(val);
2255 1682 : if (colorEntries[idx].c4)
2256 : {
2257 1682 : const int nWeight = panLineWeight[i];
2258 1682 : nTotalR +=
2259 1682 : static_cast<GInt64>(colorEntries[idx].c1) *
2260 1682 : nWeight;
2261 1682 : nTotalG +=
2262 1682 : static_cast<GInt64>(colorEntries[idx].c2) *
2263 1682 : nWeight;
2264 1682 : nTotalB +=
2265 1682 : static_cast<GInt64>(colorEntries[idx].c3) *
2266 1682 : nWeight;
2267 1682 : nTotalWeight += nWeight;
2268 : }
2269 : }
2270 : }
2271 :
2272 200 : if (nTotalWeight == 0)
2273 : {
2274 0 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2275 : }
2276 : else
2277 : {
2278 : GDALColorEntry color;
2279 :
2280 200 : color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
2281 : nTotalWeight);
2282 200 : color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
2283 : nTotalWeight);
2284 200 : color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
2285 : nTotalWeight);
2286 200 : padfDstScanline[iDstPixel - nDstXOff] =
2287 200 : BestColorEntry(colorEntries, color);
2288 : }
2289 : }
2290 : }
2291 : }
2292 :
2293 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
2294 : CPLFree(panGaussMatrixDup);
2295 : #endif
2296 :
2297 86 : return CE_None;
2298 : }
2299 :
2300 : /************************************************************************/
2301 : /* GDALResampleChunk_Mode() */
2302 : /************************************************************************/
2303 :
/**
 * Generic equality test used by mode resampling to decide whether two
 * pixel values belong to the same bucket: plain operator==.
 */
template <class T> static inline bool IsSame(T a, T b)
{
    const bool bEqual = (a == b);
    return bEqual;
}
2308 :
2309 60 : template <> bool IsSame<GFloat16>(GFloat16 a, GFloat16 b)
2310 : {
2311 60 : return a == b || (CPLIsNan(a) && CPLIsNan(b));
2312 : }
2313 :
2314 4902 : template <> bool IsSame<float>(float a, float b)
2315 : {
2316 4902 : return a == b || (std::isnan(a) && std::isnan(b));
2317 : }
2318 :
2319 1020 : template <> bool IsSame<double>(double a, double b)
2320 : {
2321 1020 : return a == b || (std::isnan(a) && std::isnan(b));
2322 : }
2323 :
// File-local value type made of two GFloat16 components. It is compared
// with IsSame<ComplexFloat16>() and is one of the element types handled by
// GDALResampleChunk_ModeT(), which uses (NaN, NaN) as its nodata value.
2324 : namespace
2325 : {
2326 : struct ComplexFloat16
2327 : {
// NOTE(review): presumably the real part - confirm against callers.
2328 : GFloat16 r;
// NOTE(review): presumably the imaginary part - confirm against callers.
2329 : GFloat16 i;
2330 : };
2331 : } // namespace
2332 :
2333 60 : template <> bool IsSame<ComplexFloat16>(ComplexFloat16 a, ComplexFloat16 b)
2334 : {
2335 90 : return (a.r == b.r && a.i == b.i) ||
2336 90 : (CPLIsNan(a.r) && CPLIsNan(a.i) && CPLIsNan(b.r) && CPLIsNan(b.i));
2337 : }
2338 :
2339 : template <>
2340 60 : bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
2341 : {
2342 120 : return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
2343 120 : std::isnan(b.real()) && std::isnan(b.imag()));
2344 : }
2345 :
2346 : template <>
2347 60 : bool IsSame<std::complex<double>>(std::complex<double> a,
2348 : std::complex<double> b)
2349 : {
2350 120 : return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
2351 120 : std::isnan(b.real()) && std::isnan(b.imag()));
2352 : }
2353 :
2354 : template <class T>
2355 176 : static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
2356 : const T *pChunk, T *const pDstBuffer)
2357 :
2358 : {
2359 176 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2360 176 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2361 176 : const double dfSrcXDelta = args.dfSrcXDelta;
2362 176 : const double dfSrcYDelta = args.dfSrcYDelta;
2363 176 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2364 176 : const int nChunkXOff = args.nChunkXOff;
2365 176 : const int nChunkXSize = args.nChunkXSize;
2366 176 : const int nChunkYOff = args.nChunkYOff;
2367 176 : const int nChunkYSize = args.nChunkYSize;
2368 176 : const int nDstXOff = args.nDstXOff;
2369 176 : const int nDstXOff2 = args.nDstXOff2;
2370 176 : const int nDstYOff = args.nDstYOff;
2371 176 : const int nDstYOff2 = args.nDstYOff2;
2372 176 : const bool bHasNoData = args.bHasNoData;
2373 176 : const GDALColorTable *poColorTable = args.poColorTable;
2374 176 : const int nDstXSize = nDstXOff2 - nDstXOff;
2375 :
2376 8 : T tNoDataValue;
2377 : if constexpr (std::is_same<T, ComplexFloat16>::value)
2378 : {
2379 4 : tNoDataValue.r = cpl::NumericLimits<GFloat16>::quiet_NaN();
2380 4 : tNoDataValue.i = cpl::NumericLimits<GFloat16>::quiet_NaN();
2381 : }
2382 : else if constexpr (std::is_same<T, std::complex<float>>::value ||
2383 : std::is_same<T, std::complex<double>>::value)
2384 : {
2385 : using BaseT = typename T::value_type;
2386 8 : tNoDataValue =
2387 : std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
2388 : std::numeric_limits<BaseT>::quiet_NaN());
2389 : }
2390 164 : else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
2391 163 : tNoDataValue = 0;
2392 : else
2393 1 : tNoDataValue = static_cast<T>(args.dfNoDataValue);
2394 :
2395 : using CountType = uint32_t;
2396 176 : CountType nMaxNumPx = 0;
2397 176 : T *paVals = nullptr;
2398 176 : CountType *panCounts = nullptr;
2399 :
2400 176 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2401 176 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2402 352 : std::vector<int> anVals(256, 0);
2403 :
2404 : /* ==================================================================== */
2405 : /* Loop over destination scanlines. */
2406 : /* ==================================================================== */
2407 7679 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2408 : {
2409 7503 : const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
2410 7503 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
2411 : #ifdef only_pixels_with_more_than_10_pct_participation
2412 : // When oversampling, don't take into account pixels that have a tiny
2413 : // participation in the resulting pixel
2414 : if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2415 : nSrcYOff < nChunkBottomYOff)
2416 : nSrcYOff++;
2417 : #endif
2418 7503 : if (nSrcYOff < nChunkYOff)
2419 0 : nSrcYOff = nChunkYOff;
2420 :
2421 7503 : const double dfSrcYOff2 =
2422 7503 : dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
2423 7503 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
2424 : #ifdef only_pixels_with_more_than_10_pct_participation
2425 : // When oversampling, don't take into account pixels that have a tiny
2426 : // participation in the resulting pixel
2427 : if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2428 : nSrcYOff2 > nChunkYOff)
2429 : nSrcYOff2--;
2430 : #endif
2431 7503 : if (nSrcYOff2 == nSrcYOff)
2432 0 : ++nSrcYOff2;
2433 7503 : if (nSrcYOff2 > nChunkBottomYOff)
2434 0 : nSrcYOff2 = nChunkBottomYOff;
2435 :
2436 7503 : const T *const paSrcScanline =
2437 253 : pChunk +
2438 7503 : (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
2439 7503 : const GByte *pabySrcScanlineNodataMask = nullptr;
2440 7503 : if (pabyChunkNodataMask != nullptr)
2441 1810 : pabySrcScanlineNodataMask =
2442 : pabyChunkNodataMask +
2443 1810 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
2444 :
2445 7503 : T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
2446 : /* --------------------------------------------------------------------
2447 : */
2448 : /* Loop over destination pixels */
2449 : /* --------------------------------------------------------------------
2450 : */
2451 4260400 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2452 : {
2453 4252893 : const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
2454 : // Apply some epsilon to avoid numerical precision issues
2455 4252893 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
2456 : #ifdef only_pixels_with_more_than_10_pct_participation
2457 : // When oversampling, don't take into account pixels that have a
2458 : // tiny participation in the resulting pixel
2459 : if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2460 : nSrcXOff < nChunkRightXOff)
2461 : nSrcXOff++;
2462 : #endif
2463 4252893 : if (nSrcXOff < nChunkXOff)
2464 0 : nSrcXOff = nChunkXOff;
2465 :
2466 4252893 : const double dfSrcXOff2 =
2467 4252893 : dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
2468 4252893 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
2469 : #ifdef only_pixels_with_more_than_10_pct_participation
2470 : // When oversampling, don't take into account pixels that have a
2471 : // tiny participation in the resulting pixel
2472 : if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2473 : nSrcXOff2 > nChunkXOff)
2474 : nSrcXOff2--;
2475 : #endif
2476 4252893 : if (nSrcXOff2 == nSrcXOff)
2477 0 : nSrcXOff2++;
2478 4252893 : if (nSrcXOff2 > nChunkRightXOff)
2479 0 : nSrcXOff2 = nChunkRightXOff;
2480 :
2481 4252893 : bool bRegularProcessing = false;
2482 : if constexpr (!std::is_same<T, GByte>::value)
2483 1503 : bRegularProcessing = true;
2484 4251390 : else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
2485 0 : bRegularProcessing = true;
2486 :
2487 4252893 : if (bRegularProcessing)
2488 : {
2489 : // Sanity check to make sure the allocation of paVals and
2490 : // panCounts don't overflow.
2491 : static_assert(sizeof(CountType) <= sizeof(size_t));
2492 3006 : if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
2493 1503 : static_cast<CountType>(nSrcYOff2 - nSrcYOff) >
2494 1503 : (std::numeric_limits<CountType>::max() /
2495 3006 : std::max(sizeof(T), sizeof(CountType))) /
2496 1503 : static_cast<CountType>(nSrcXOff2 - nSrcXOff))
2497 : {
2498 0 : CPLError(CE_Failure, CPLE_NotSupported,
2499 : "Too big downsampling factor");
2500 0 : CPLFree(paVals);
2501 0 : CPLFree(panCounts);
2502 0 : return CE_Failure;
2503 : }
2504 1503 : const CountType nNumPx =
2505 1503 : static_cast<CountType>(nSrcYOff2 - nSrcYOff) *
2506 1503 : (nSrcXOff2 - nSrcXOff);
2507 1503 : CountType iMaxInd = 0;
2508 1503 : CountType iMaxVal = 0;
2509 :
2510 1503 : if (paVals == nullptr || nNumPx > nMaxNumPx)
2511 : {
2512 : T *paValsNew = static_cast<T *>(
2513 110 : VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
2514 : CountType *panCountsNew =
2515 110 : static_cast<CountType *>(VSI_REALLOC_VERBOSE(
2516 : panCounts, nNumPx * sizeof(CountType)));
2517 110 : if (paValsNew != nullptr)
2518 110 : paVals = paValsNew;
2519 110 : if (panCountsNew != nullptr)
2520 110 : panCounts = panCountsNew;
2521 110 : if (paValsNew == nullptr || panCountsNew == nullptr)
2522 : {
2523 0 : CPLFree(paVals);
2524 0 : CPLFree(panCounts);
2525 0 : return CE_Failure;
2526 : }
2527 110 : nMaxNumPx = nNumPx;
2528 : }
2529 :
2530 4629 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2531 : {
2532 3126 : const GPtrDiff_t iTotYOff =
2533 3126 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2534 3126 : nChunkXOff;
2535 9858 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2536 : {
2537 6732 : if (pabySrcScanlineNodataMask == nullptr ||
2538 16 : pabySrcScanlineNodataMask[iX + iTotYOff])
2539 : {
2540 6717 : const T val = paSrcScanline[iX + iTotYOff];
2541 6717 : CountType i = 0; // Used after for.
2542 :
2543 : // Check array for existing entry.
2544 10081 : for (; i < iMaxInd; ++i)
2545 : {
2546 6850 : if (IsSame(paVals[i], val))
2547 : {
2548 3486 : if (++panCounts[i] > panCounts[iMaxVal])
2549 : {
2550 246 : iMaxVal = i;
2551 : }
2552 3486 : break;
2553 : }
2554 : }
2555 :
2556 : // Add to arr if entry not already there.
2557 6717 : if (i == iMaxInd)
2558 : {
2559 3231 : paVals[iMaxInd] = val;
2560 3231 : panCounts[iMaxInd] = 1;
2561 :
2562 3231 : if (iMaxInd == 0)
2563 : {
2564 1500 : iMaxVal = iMaxInd;
2565 : }
2566 :
2567 3231 : ++iMaxInd;
2568 : }
2569 : }
2570 : }
2571 : }
2572 :
2573 1503 : if (iMaxInd == 0)
2574 3 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2575 : else
2576 1500 : paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
2577 : }
2578 : else if constexpr (std::is_same<T, GByte>::value)
2579 : // ( eSrcDataType == GDT_UInt8 && nEntryCount < 256 )
2580 : {
2581 : // So we go here for a paletted or non-paletted byte band.
2582 : // The input values are then between 0 and 255.
2583 4251390 : int nMaxVal = 0;
2584 4251390 : int iMaxInd = -1;
2585 :
2586 : // The cost of this zeroing might be high. Perhaps we should
2587 : // just use the above generic case, and go to this one if the
2588 : // number of source pixels is large enough
2589 4251390 : std::fill(anVals.begin(), anVals.end(), 0);
2590 :
2591 12777800 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2592 : {
2593 8526440 : const GPtrDiff_t iTotYOff =
2594 8526440 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2595 8526440 : nChunkXOff;
2596 25649600 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2597 : {
2598 17123100 : const T val = paSrcScanline[iX + iTotYOff];
2599 17123100 : if (!bHasNoData || val != tNoDataValue)
2600 : {
2601 17123100 : int nVal = static_cast<int>(val);
2602 17123100 : if (++anVals[nVal] > nMaxVal)
2603 : {
2604 : // Sum the density.
2605 : // Is it the most common value so far?
2606 17006400 : iMaxInd = nVal;
2607 17006400 : nMaxVal = anVals[nVal];
2608 : }
2609 : }
2610 : }
2611 : }
2612 :
2613 4251390 : if (iMaxInd == -1)
2614 0 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2615 : else
2616 4251390 : paDstScanline[iDstPixel - nDstXOff] =
2617 : static_cast<T>(iMaxInd);
2618 : }
2619 : }
2620 : }
2621 :
2622 176 : CPLFree(paVals);
2623 176 : CPLFree(panCounts);
2624 :
2625 176 : return CE_None;
2626 : }
2627 :
2628 176 : static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
2629 : const void *pChunk, void **ppDstBuffer,
2630 : GDALDataType *peDstBufferDataType)
2631 : {
2632 176 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
2633 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
2634 : GDALGetDataTypeSizeBytes(args.eWrkDataType));
2635 176 : if (*ppDstBuffer == nullptr)
2636 : {
2637 0 : return CE_Failure;
2638 : }
2639 :
2640 176 : CPLAssert(args.eSrcDataType == args.eWrkDataType);
2641 :
2642 176 : *peDstBufferDataType = args.eWrkDataType;
2643 176 : switch (args.eWrkDataType)
2644 : {
2645 : // For mode resampling, as no computation is done, only the
2646 : // size of the data type matters... except for Byte where we have
2647 : // special processing. And for floating point values
2648 66 : case GDT_UInt8:
2649 : {
2650 66 : return GDALResampleChunk_ModeT(args,
2651 : static_cast<const GByte *>(pChunk),
2652 66 : static_cast<GByte *>(*ppDstBuffer));
2653 : }
2654 :
2655 4 : case GDT_Int8:
2656 : {
2657 4 : return GDALResampleChunk_ModeT(args,
2658 : static_cast<const int8_t *>(pChunk),
2659 4 : static_cast<int8_t *>(*ppDstBuffer));
2660 : }
2661 :
2662 10 : case GDT_Int16:
2663 : case GDT_UInt16:
2664 : {
2665 10 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
2666 10 : return GDALResampleChunk_ModeT(
2667 : args, static_cast<const uint16_t *>(pChunk),
2668 10 : static_cast<uint16_t *>(*ppDstBuffer));
2669 : }
2670 :
2671 15 : case GDT_CInt16:
2672 : case GDT_Int32:
2673 : case GDT_UInt32:
2674 : {
2675 15 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
2676 15 : return GDALResampleChunk_ModeT(
2677 : args, static_cast<const uint32_t *>(pChunk),
2678 15 : static_cast<uint32_t *>(*ppDstBuffer));
2679 : }
2680 :
2681 12 : case GDT_CInt32:
2682 : case GDT_Int64:
2683 : case GDT_UInt64:
2684 : {
2685 12 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
2686 12 : return GDALResampleChunk_ModeT(
2687 : args, static_cast<const uint64_t *>(pChunk),
2688 12 : static_cast<uint64_t *>(*ppDstBuffer));
2689 : }
2690 :
2691 4 : case GDT_Float16:
2692 : {
2693 4 : return GDALResampleChunk_ModeT(
2694 : args, static_cast<const GFloat16 *>(pChunk),
2695 4 : static_cast<GFloat16 *>(*ppDstBuffer));
2696 : }
2697 :
2698 32 : case GDT_Float32:
2699 : {
2700 32 : return GDALResampleChunk_ModeT(args,
2701 : static_cast<const float *>(pChunk),
2702 32 : static_cast<float *>(*ppDstBuffer));
2703 : }
2704 :
2705 21 : case GDT_Float64:
2706 : {
2707 21 : return GDALResampleChunk_ModeT(args,
2708 : static_cast<const double *>(pChunk),
2709 21 : static_cast<double *>(*ppDstBuffer));
2710 : }
2711 :
2712 4 : case GDT_CFloat16:
2713 : {
2714 4 : return GDALResampleChunk_ModeT(
2715 : args, static_cast<const ComplexFloat16 *>(pChunk),
2716 4 : static_cast<ComplexFloat16 *>(*ppDstBuffer));
2717 : }
2718 :
2719 4 : case GDT_CFloat32:
2720 : {
2721 4 : return GDALResampleChunk_ModeT(
2722 : args, static_cast<const std::complex<float> *>(pChunk),
2723 4 : static_cast<std::complex<float> *>(*ppDstBuffer));
2724 : }
2725 :
2726 4 : case GDT_CFloat64:
2727 : {
2728 4 : return GDALResampleChunk_ModeT(
2729 : args, static_cast<const std::complex<double> *>(pChunk),
2730 4 : static_cast<std::complex<double> *>(*ppDstBuffer));
2731 : }
2732 :
2733 0 : case GDT_Unknown:
2734 : case GDT_TypeCount:
2735 0 : break;
2736 : }
2737 :
2738 0 : CPLAssert(false);
2739 : return CE_Failure;
2740 : }
2741 :
2742 : /************************************************************************/
2743 : /* GDALResampleConvolutionHorizontal() */
2744 : /************************************************************************/
2745 :
// Returns the weighted sum of nSrcPixelCount consecutive samples of pChunk,
// i.e. sum(pChunk[k] * padfWeights[k]) for k in [0, nSrcPixelCount).
//
// Two independent partial sums are maintained in the 4-way unrolled part
// (samples 0/1 of each group go to the first one, samples 2/3 to the second
// one); leftover samples are added to the first partial sum.
template <class T>
static inline double
GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                  int nSrcPixelCount)
{
    double dfSumA = 0.0;
    double dfSumB = 0.0;
    int iPix = 0;  // Shared by the unrolled loop and the remainder loop.
    // The hand-unrolled loop is disabled for the Intel compiler, which has
    // been reported to crash on it at -O2 and -O3 (seen with 2024.0.2.29):
    // https://github.com/OSGeo/gdal/issues/9508
#if !defined(__INTEL_CLANG_COMPILER)
    const int nUnrolledEnd = nSrcPixelCount - 3;
    while (iPix < nUnrolledEnd)
    {
        const T *pSrc = pChunk + iPix;
        const double *pW = padfWeights + iPix;
        dfSumA += double(pSrc[0]) * pW[0];
        dfSumA += double(pSrc[1]) * pW[1];
        dfSumB += double(pSrc[2]) * pW[2];
        dfSumB += double(pSrc[3]) * pW[3];
        iPix += 4;
    }
#endif
    while (iPix < nSrcPixelCount)
    {
        dfSumA += double(pChunk[iPix]) * padfWeights[iPix];
        ++iPix;
    }
    return dfSumA + dfSumB;
}
2772 :
2773 : template <class T>
2774 44576 : static inline void GDALResampleConvolutionHorizontalWithMask(
2775 : const T *pChunk, const GByte *pabyMask, const double *padfWeights,
2776 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2777 : {
2778 44576 : dfVal = 0;
2779 44576 : dfWeightSum = 0;
2780 44576 : int i = 0;
2781 98300 : for (; i < nSrcPixelCount - 3; i += 4)
2782 : {
2783 53724 : const double dfWeight0 = padfWeights[i] * pabyMask[i];
2784 53724 : const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
2785 53724 : const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
2786 53724 : const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
2787 53724 : dfVal += double(pChunk[i + 0]) * dfWeight0;
2788 53724 : dfVal += double(pChunk[i + 1]) * dfWeight1;
2789 53724 : dfVal += double(pChunk[i + 2]) * dfWeight2;
2790 53724 : dfVal += double(pChunk[i + 3]) * dfWeight3;
2791 53724 : dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
2792 : }
2793 61162 : for (; i < nSrcPixelCount; ++i)
2794 : {
2795 16586 : const double dfWeight = padfWeights[i] * pabyMask[i];
2796 16586 : dfVal += double(pChunk[i]) * dfWeight;
2797 16586 : dfWeightSum += dfWeight;
2798 : }
2799 44576 : }
2800 :
// Horizontal convolution applied to 3 rows at once with the same weights.
//
// On return dfRes1, dfRes2 and dfRes3 hold the weighted sum of the first
// nSrcPixelCount samples of pChunkRow1, pChunkRow2 and pChunkRow3
// respectively. Each row uses two partial sums in the 4-way unrolled part
// (samples 0/1 and samples 2/3 of each group); leftover samples go to the
// first partial sum of their row.
template <class T>
static inline void GDALResampleConvolutionHorizontal_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    double dfRow1A = 0.0;
    double dfRow1B = 0.0;
    double dfRow2A = 0.0;
    double dfRow2B = 0.0;
    double dfRow3A = 0.0;
    double dfRow3B = 0.0;

    int iPix = 0;  // Shared by the unrolled loop and the remainder loop.
    for (const int nUnrolledEnd = nSrcPixelCount - 3; iPix < nUnrolledEnd;
         iPix += 4)
    {
        const double dfW0 = padfWeights[iPix + 0];
        const double dfW1 = padfWeights[iPix + 1];
        const double dfW2 = padfWeights[iPix + 2];
        const double dfW3 = padfWeights[iPix + 3];

        dfRow1A += double(pChunkRow1[iPix + 0]) * dfW0;
        dfRow1A += double(pChunkRow1[iPix + 1]) * dfW1;
        dfRow1B += double(pChunkRow1[iPix + 2]) * dfW2;
        dfRow1B += double(pChunkRow1[iPix + 3]) * dfW3;

        dfRow2A += double(pChunkRow2[iPix + 0]) * dfW0;
        dfRow2A += double(pChunkRow2[iPix + 1]) * dfW1;
        dfRow2B += double(pChunkRow2[iPix + 2]) * dfW2;
        dfRow2B += double(pChunkRow2[iPix + 3]) * dfW3;

        dfRow3A += double(pChunkRow3[iPix + 0]) * dfW0;
        dfRow3A += double(pChunkRow3[iPix + 1]) * dfW1;
        dfRow3B += double(pChunkRow3[iPix + 2]) * dfW2;
        dfRow3B += double(pChunkRow3[iPix + 3]) * dfW3;
    }

    for (; iPix < nSrcPixelCount; ++iPix)
    {
        const double dfW = padfWeights[iPix];
        dfRow1A += double(pChunkRow1[iPix]) * dfW;
        dfRow2A += double(pChunkRow2[iPix]) * dfW;
        dfRow3A += double(pChunkRow3[iPix]) * dfW;
    }

    dfRes1 = dfRow1A + dfRow1B;
    dfRes2 = dfRow2A + dfRow2B;
    dfRes3 = dfRow3A + dfRow3B;
}
2839 :
2840 : template <class T>
2841 18980 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
2842 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2843 : const double *padfWeights, int nSrcPixelCount, double &dfRes1,
2844 : double &dfRes2, double &dfRes3)
2845 : {
2846 18980 : GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
2847 : padfWeights, nSrcPixelCount, dfRes1,
2848 : dfRes2, dfRes3);
2849 18980 : }
2850 :
2851 : template <class T>
2852 1256690 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
2853 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2854 : const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
2855 : {
2856 1256690 : GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
2857 : padfWeights, 4, dfRes1, dfRes2,
2858 : dfRes3);
2859 1256690 : }
2860 :
2861 : /************************************************************************/
2862 : /* GDALResampleConvolutionVertical() */
2863 : /************************************************************************/
2864 :
// Returns the weighted sum of nSrcLineCount samples of one column, i.e.
// sum(pChunk[k * nStride] * padfWeights[k]) for k in [0, nSrcLineCount).
//
// nStride is the distance, in elements of T, between two consecutive lines.
// Two partial sums are used in the 4-way unrolled part (lines 0/1 and lines
// 2/3 of each group); leftover lines are added to the first partial sum.
template <class T>
static inline double
GDALResampleConvolutionVertical(const T *pChunk, size_t nStride,
                                const double *padfWeights, int nSrcLineCount)
{
    double dfSumA = 0.0;
    double dfSumB = 0.0;
    int iLine = 0;
    size_t nOffset = 0;  // Always equal to iLine * nStride.

    const int nUnrolledEnd = nSrcLineCount - 3;
    while (iLine < nUnrolledEnd)
    {
        const T *pLine0 = pChunk + nOffset;
        const T *pLine1 = pLine0 + nStride;
        const T *pLine2 = pLine1 + nStride;
        const T *pLine3 = pLine2 + nStride;
        const double *pW = padfWeights + iLine;

        dfSumA += *pLine0 * pW[0];
        dfSumA += *pLine1 * pW[1];
        dfSumB += *pLine2 * pW[2];
        dfSumB += *pLine3 * pW[3];

        iLine += 4;
        nOffset += 4 * nStride;
    }

    while (iLine < nSrcLineCount)
    {
        dfSumA += pChunk[nOffset] * padfWeights[iLine];
        ++iLine;
        nOffset += nStride;
    }
    return dfSumA + dfSumB;
}
2887 :
// Vertical convolution of two adjacent columns with the same weights.
//
// dfRes1 receives sum(pChunk[k * nStride] * padfWeights[k]) and dfRes2
// receives sum(pChunk[k * nStride + 1] * padfWeights[k]), for k in
// [0, nSrcLineCount). nStride is the distance, in elements of T, between two
// consecutive lines. Each column uses two partial sums in the 4-way unrolled
// part (lines 0/1 and lines 2/3 of each group).
template <class T>
static inline void GDALResampleConvolutionVertical_2cols(
    const T *pChunk, size_t nStride, const double *padfWeights,
    int nSrcLineCount, double &dfRes1, double &dfRes2)
{
    double dfCol0A = 0.0;
    double dfCol0B = 0.0;
    double dfCol1A = 0.0;
    double dfCol1B = 0.0;
    int iLine = 0;
    size_t nOffset = 0;  // Always equal to iLine * nStride.

    const int nUnrolledEnd = nSrcLineCount - 3;
    while (iLine < nUnrolledEnd)
    {
        const T *pLine0 = pChunk + nOffset;
        const T *pLine1 = pLine0 + nStride;
        const T *pLine2 = pLine1 + nStride;
        const T *pLine3 = pLine2 + nStride;
        const double *pW = padfWeights + iLine;

        dfCol0A += pLine0[0] * pW[0];
        dfCol1A += pLine0[1] * pW[0];
        dfCol0A += pLine1[0] * pW[1];
        dfCol1A += pLine1[1] * pW[1];
        dfCol0B += pLine2[0] * pW[2];
        dfCol1B += pLine2[1] * pW[2];
        dfCol0B += pLine3[0] * pW[3];
        dfCol1B += pLine3[1] * pW[3];

        iLine += 4;
        nOffset += 4 * nStride;
    }

    while (iLine < nSrcLineCount)
    {
        dfCol0A += pChunk[nOffset + 0] * padfWeights[iLine];
        dfCol1A += pChunk[nOffset + 1] * padfWeights[iLine];
        ++iLine;
        nOffset += nStride;
    }

    dfRes1 = dfCol0A + dfCol0B;
    dfRes2 = dfCol1A + dfCol1B;
}
2918 :
2919 : #ifdef USE_SSE2
2920 :
2921 : #ifdef __AVX__
2922 : /************************************************************************/
2923 : /* GDALResampleConvolutionVertical_16cols<T> */
2924 : /************************************************************************/
2925 :
2926 : template <class T>
2927 : static inline void
2928 : GDALResampleConvolutionVertical_16cols(const T *pChunk, size_t nStride,
2929 : const double *padfWeights,
2930 : int nSrcLineCount, float *afDest)
2931 : {
2932 : int i = 0;
2933 : size_t j = 0;
2934 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2935 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2936 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2937 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2938 : for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
2939 : {
2940 : XMMReg4Double w0 =
2941 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2942 : XMMReg4Double w1 =
2943 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2944 : XMMReg4Double w2 =
2945 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2946 : XMMReg4Double w3 =
2947 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2948 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2949 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2950 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
2951 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
2952 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2953 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2954 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
2955 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
2956 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2957 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2958 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
2959 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
2960 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2961 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2962 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
2963 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
2964 : }
2965 : for (; i < nSrcLineCount; ++i, j += nStride)
2966 : {
2967 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2968 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2969 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2970 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
2971 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
2972 : }
2973 : v_acc0.Store4Val(afDest);
2974 : v_acc1.Store4Val(afDest + 4);
2975 : v_acc2.Store4Val(afDest + 8);
2976 : v_acc3.Store4Val(afDest + 12);
2977 : }
2978 :
2979 : template <class T>
2980 : static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
2981 : const double *, int,
2982 : double *)
2983 : {
2984 : // Cannot be reached
2985 : CPLAssert(false);
2986 : }
2987 :
2988 : #else
2989 :
2990 : /************************************************************************/
2991 : /* GDALResampleConvolutionVertical_8cols<T> */
2992 : /************************************************************************/
2993 :
2994 : template <class T>
2995 : static inline void
2996 25609800 : GDALResampleConvolutionVertical_8cols(const T *pChunk, size_t nStride,
2997 : const double *padfWeights,
2998 : int nSrcLineCount, float *afDest)
2999 : {
3000 25609800 : int i = 0;
3001 25609800 : size_t j = 0;
3002 25609800 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
3003 25609800 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3004 53417600 : for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
3005 : {
3006 27807800 : XMMReg4Double w0 =
3007 27807800 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
3008 27807800 : XMMReg4Double w1 =
3009 27807800 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
3010 27807800 : XMMReg4Double w2 =
3011 27807800 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
3012 27807800 : XMMReg4Double w3 =
3013 27807800 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
3014 27807800 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
3015 27807800 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
3016 27807800 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
3017 27807800 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
3018 27807800 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
3019 27807800 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
3020 27807800 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
3021 27807800 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
3022 : }
3023 37176700 : for (; i < nSrcLineCount; ++i, j += nStride)
3024 : {
3025 11566800 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
3026 11566800 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
3027 11566800 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
3028 : }
3029 25609800 : v_acc0.Store4Val(afDest);
3030 25609800 : v_acc1.Store4Val(afDest + 4);
3031 25609800 : }
3032 :
3033 : template <class T>
3034 : static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
3035 : const double *, int,
3036 : double *)
3037 : {
3038 : // Cannot be reached
3039 : CPLAssert(false);
3040 : }
3041 :
3042 : #endif // __AVX__
3043 :
3044 : /************************************************************************/
3045 : /* GDALResampleConvolutionHorizontalSSE2<T> */
3046 : /************************************************************************/
3047 :
3048 : template <class T>
3049 3137782 : static inline double GDALResampleConvolutionHorizontalSSE2(
3050 : const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3051 : {
3052 3137782 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3053 3137782 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3054 3137782 : int i = 0; // Used after for.
3055 3513158 : for (; i < nSrcPixelCount - 7; i += 8)
3056 : {
3057 : // Retrieve the pixel & accumulate
3058 375371 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
3059 375371 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
3060 375371 : const XMMReg4Double v_weight1 =
3061 375371 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3062 375371 : const XMMReg4Double v_weight2 =
3063 375371 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
3064 :
3065 375371 : v_acc1 += v_pixels1 * v_weight1;
3066 375371 : v_acc2 += v_pixels2 * v_weight2;
3067 : }
3068 :
3069 3137782 : v_acc1 += v_acc2;
3070 :
3071 3137782 : double dfVal = v_acc1.GetHorizSum();
3072 10317730 : for (; i < nSrcPixelCount; ++i)
3073 : {
3074 7179950 : dfVal += pChunk[i] * padfWeightsAligned[i];
3075 : }
3076 3137782 : return dfVal;
3077 : }
3078 :
3079 : /************************************************************************/
3080 : /* GDALResampleConvolutionHorizontal<GByte> */
3081 : /************************************************************************/
3082 :
3083 : template <>
3084 2588620 : inline double GDALResampleConvolutionHorizontal<GByte>(
3085 : const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3086 : {
3087 2588620 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
3088 2588620 : nSrcPixelCount);
3089 : }
3090 :
3091 : template <>
3092 549162 : inline double GDALResampleConvolutionHorizontal<GUInt16>(
3093 : const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3094 : {
3095 549162 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
3096 549162 : nSrcPixelCount);
3097 : }
3098 :
3099 : /************************************************************************/
3100 : /* GDALResampleConvolutionHorizontalWithMaskSSE2<T> */
3101 : /************************************************************************/
3102 :
3103 : template <class T>
3104 6408653 : static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
3105 : const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
3106 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
3107 : {
3108 6408653 : int i = 0; // Used after for.
3109 6408653 : XMMReg4Double v_acc = XMMReg4Double::Zero();
3110 6408653 : XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
3111 17785121 : for (; i < nSrcPixelCount - 3; i += 4)
3112 : {
3113 11376458 : const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
3114 11376458 : const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
3115 11376458 : XMMReg4Double v_weight =
3116 11376458 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3117 11376458 : v_weight *= v_mask;
3118 11376458 : v_acc += v_pixels * v_weight;
3119 11376458 : v_acc_weight += v_weight;
3120 : }
3121 :
3122 6408653 : dfVal = v_acc.GetHorizSum();
3123 6408653 : dfWeightSum = v_acc_weight.GetHorizSum();
3124 6614913 : for (; i < nSrcPixelCount; ++i)
3125 : {
3126 206258 : const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
3127 206258 : dfVal += pChunk[i] * dfWeight;
3128 206258 : dfWeightSum += dfWeight;
3129 : }
3130 6408653 : }
3131 :
3132 : /************************************************************************/
3133 : /* GDALResampleConvolutionHorizontalWithMask<GByte> */
3134 : /************************************************************************/
3135 :
3136 : template <>
3137 6408590 : inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
3138 : const GByte *pChunk, const GByte *pabyMask,
3139 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
3140 : double &dfWeightSum)
3141 : {
3142 6408590 : GDALResampleConvolutionHorizontalWithMaskSSE2(
3143 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
3144 : dfWeightSum);
3145 6408590 : }
3146 :
3147 : template <>
3148 63 : inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
3149 : const GUInt16 *pChunk, const GByte *pabyMask,
3150 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
3151 : double &dfWeightSum)
3152 : {
3153 63 : GDALResampleConvolutionHorizontalWithMaskSSE2(
3154 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
3155 : dfWeightSum);
3156 63 : }
3157 :
3158 : /************************************************************************/
3159 : /* GDALResampleConvolutionHorizontal_3rows_SSE2<T> */
3160 : /************************************************************************/
3161 :
3162 : template <class T>
3163 35128386 : static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
3164 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3165 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3166 : double &dfRes2, double &dfRes3)
3167 : {
3168 35128386 : XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
3169 35128386 : v_acc2 = XMMReg4Double::Zero(),
3170 35128386 : v_acc3 = XMMReg4Double::Zero();
3171 35128386 : int i = 0;
3172 70070156 : for (; i < nSrcPixelCount - 7; i += 8)
3173 : {
3174 : // Retrieve the pixel & accumulate.
3175 34941870 : XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3176 34941870 : XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
3177 34941870 : const XMMReg4Double v_weight1 =
3178 34941870 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3179 34941870 : const XMMReg4Double v_weight2 =
3180 34941870 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
3181 :
3182 34941870 : v_acc1 += v_pixels1 * v_weight1;
3183 34941870 : v_acc1 += v_pixels2 * v_weight2;
3184 :
3185 34941870 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3186 34941870 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
3187 34941870 : v_acc2 += v_pixels1 * v_weight1;
3188 34941870 : v_acc2 += v_pixels2 * v_weight2;
3189 :
3190 34941870 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3191 34941870 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
3192 34941870 : v_acc3 += v_pixels1 * v_weight1;
3193 34941870 : v_acc3 += v_pixels2 * v_weight2;
3194 : }
3195 :
3196 35128386 : dfRes1 = v_acc1.GetHorizSum();
3197 35128386 : dfRes2 = v_acc2.GetHorizSum();
3198 35128386 : dfRes3 = v_acc3.GetHorizSum();
3199 47367852 : for (; i < nSrcPixelCount; ++i)
3200 : {
3201 12239466 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3202 12239466 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3203 12239466 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3204 : }
3205 35128386 : }
3206 :
3207 : /************************************************************************/
3208 : /* GDALResampleConvolutionHorizontal_3rows<GByte> */
3209 : /************************************************************************/
3210 :
3211 : template <>
3212 35128300 : inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
3213 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3214 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3215 : double &dfRes2, double &dfRes3)
3216 : {
3217 35128300 : GDALResampleConvolutionHorizontal_3rows_SSE2(
3218 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3219 : dfRes1, dfRes2, dfRes3);
3220 35128300 : }
3221 :
3222 : template <>
3223 86 : inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
3224 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3225 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3226 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3227 : {
3228 86 : GDALResampleConvolutionHorizontal_3rows_SSE2(
3229 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3230 : dfRes1, dfRes2, dfRes3);
3231 86 : }
3232 :
3233 : /************************************************************************/
3234 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T> */
3235 : /************************************************************************/
3236 :
3237 : template <class T>
3238 7840250 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3239 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3240 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3241 : double &dfRes2, double &dfRes3)
3242 : {
3243 7840250 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3244 7840250 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3245 7840250 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
3246 7840250 : int i = 0; // Use after for.
3247 19104350 : for (; i < nSrcPixelCount - 3; i += 4)
3248 : {
3249 : // Retrieve the pixel & accumulate.
3250 11264100 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3251 11264100 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3252 11264100 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3253 11264100 : const XMMReg4Double v_weight =
3254 11264100 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3255 :
3256 11264100 : v_acc1 += v_pixels1 * v_weight;
3257 11264100 : v_acc2 += v_pixels2 * v_weight;
3258 11264100 : v_acc3 += v_pixels3 * v_weight;
3259 : }
3260 :
3261 7840250 : dfRes1 = v_acc1.GetHorizSum();
3262 7840250 : dfRes2 = v_acc2.GetHorizSum();
3263 7840250 : dfRes3 = v_acc3.GetHorizSum();
3264 :
3265 12290222 : for (; i < nSrcPixelCount; ++i)
3266 : {
3267 4449942 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3268 4449942 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3269 4449942 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3270 : }
3271 7840250 : }
3272 :
3273 : /************************************************************************/
3274 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte> */
3275 : /************************************************************************/
3276 :
3277 : template <>
3278 7773100 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
3279 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3280 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3281 : double &dfRes2, double &dfRes3)
3282 : {
3283 7773100 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3284 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3285 : dfRes1, dfRes2, dfRes3);
3286 7773100 : }
3287 :
3288 : template <>
3289 67150 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
3290 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3291 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3292 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3293 : {
3294 67150 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3295 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3296 : dfRes1, dfRes2, dfRes3);
3297 67150 : }
3298 :
3299 : /************************************************************************/
3300 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T> */
3301 : /************************************************************************/
3302 :
3303 : template <class T>
3304 13996690 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3305 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3306 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3307 : double &dfRes3)
3308 : {
3309 13996690 : const XMMReg4Double v_weight =
3310 : XMMReg4Double::Load4ValAligned(padfWeightsAligned);
3311 :
3312 : // Retrieve the pixel & accumulate.
3313 13996690 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
3314 13996690 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
3315 13996690 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
3316 :
3317 13996690 : XMMReg4Double v_acc1 = v_pixels1 * v_weight;
3318 13996690 : XMMReg4Double v_acc2 = v_pixels2 * v_weight;
3319 13996690 : XMMReg4Double v_acc3 = v_pixels3 * v_weight;
3320 :
3321 13996690 : dfRes1 = v_acc1.GetHorizSum();
3322 13996690 : dfRes2 = v_acc2.GetHorizSum();
3323 13996690 : dfRes3 = v_acc3.GetHorizSum();
3324 13996690 : }
3325 :
3326 : /************************************************************************/
3327 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte> */
3328 : /************************************************************************/
3329 :
3330 : template <>
3331 8283970 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
3332 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3333 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3334 : double &dfRes3)
3335 : {
3336 8283970 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3337 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3338 : dfRes3);
3339 8283970 : }
3340 :
3341 : template <>
3342 5712720 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
3343 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3344 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
3345 : double &dfRes2, double &dfRes3)
3346 : {
3347 5712720 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3348 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3349 : dfRes3);
3350 5712720 : }
3351 :
3352 : #endif // USE_SSE2
3353 :
3354 : /************************************************************************/
3355 : /* GDALResampleChunk_Convolution() */
3356 : /************************************************************************/
3357 :
3358 : template <class T, class Twork, GDALDataType eWrkDataType,
3359 : bool bKernelWithNegativeWeights, bool bNeedRescale>
3360 5093 : static CPLErr GDALResampleChunk_ConvolutionT(
3361 : const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
3362 : FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
3363 : int nKernelRadius, float fMaxVal)
3364 :
3365 : {
3366 5093 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
3367 5093 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
3368 5093 : const double dfSrcXDelta = args.dfSrcXDelta;
3369 5093 : const double dfSrcYDelta = args.dfSrcYDelta;
3370 5093 : constexpr int nBands = 1;
3371 5093 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
3372 5093 : const int nChunkXOff = args.nChunkXOff;
3373 5093 : const int nChunkXSize = args.nChunkXSize;
3374 5093 : const int nChunkYOff = args.nChunkYOff;
3375 5093 : const int nChunkYSize = args.nChunkYSize;
3376 5093 : const int nDstXOff = args.nDstXOff;
3377 5093 : const int nDstXOff2 = args.nDstXOff2;
3378 5093 : const int nDstYOff = args.nDstYOff;
3379 5093 : const int nDstYOff2 = args.nDstYOff2;
3380 5093 : const bool bHasNoData = args.bHasNoData;
3381 5093 : double dfNoDataValue = args.dfNoDataValue;
3382 :
3383 5093 : if (!bHasNoData)
3384 5018 : dfNoDataValue = 0.0;
3385 5093 : const auto dstDataType = args.eOvrDataType;
3386 5093 : const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
3387 5093 : const double dfReplacementVal =
3388 75 : bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
3389 : : dfNoDataValue;
3390 : // cppcheck-suppress unreadVariable
3391 5093 : const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
3392 5093 : const bool bNoDataValueInt64Valid =
3393 5093 : isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
3394 5093 : const auto nNodataValueInt64 =
3395 : bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
3396 5093 : constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
3397 :
3398 : // TODO: we should have some generic function to do this.
3399 5093 : Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
3400 5093 : Twork fDstMax = cpl::NumericLimits<Twork>::max();
3401 5093 : if (dstDataType == GDT_UInt8)
3402 : {
3403 4225 : fDstMin = std::numeric_limits<GByte>::min();
3404 4225 : fDstMax = std::numeric_limits<GByte>::max();
3405 : }
3406 868 : else if (dstDataType == GDT_Int8)
3407 : {
3408 1 : fDstMin = std::numeric_limits<GInt8>::min();
3409 1 : fDstMax = std::numeric_limits<GInt8>::max();
3410 : }
3411 867 : else if (dstDataType == GDT_UInt16)
3412 : {
3413 402 : fDstMin = std::numeric_limits<GUInt16>::min();
3414 402 : fDstMax = std::numeric_limits<GUInt16>::max();
3415 : }
3416 465 : else if (dstDataType == GDT_Int16)
3417 : {
3418 291 : fDstMin = std::numeric_limits<GInt16>::min();
3419 291 : fDstMax = std::numeric_limits<GInt16>::max();
3420 : }
3421 174 : else if (dstDataType == GDT_UInt32)
3422 : {
3423 1 : fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
3424 1 : fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
3425 : }
3426 173 : else if (dstDataType == GDT_Int32)
3427 : {
3428 : // cppcheck-suppress unreadVariable
3429 2 : fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
3430 : // cppcheck-suppress unreadVariable
3431 2 : fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
3432 : }
3433 171 : else if (dstDataType == GDT_UInt64)
3434 : {
3435 : // cppcheck-suppress unreadVariable
3436 1 : fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
3437 : // cppcheck-suppress unreadVariable
3438 : // (1 << 64) - 2048: largest uint64 value a double can hold
3439 1 : fDstMax = static_cast<Twork>(18446744073709549568ULL);
3440 : }
3441 170 : else if (dstDataType == GDT_Int64)
3442 : {
3443 : // cppcheck-suppress unreadVariable
3444 1 : fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
3445 : // cppcheck-suppress unreadVariable
3446 : // (1 << 63) - 1024: largest int64 that a double can hold
3447 1 : fDstMax = static_cast<Twork>(9223372036854774784LL);
3448 : }
3449 :
3450 36939169 : auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
3451 : bNoDataValueInt64Valid, nNodataValueInt64,
3452 : dfNoDataValue, dfReplacementVal](Twork fVal)
3453 : {
3454 15833200 : if (!bHasNoData)
3455 11612800 : return fVal;
3456 :
3457 : // Clamp value before comparing to nodata: this is only needed for
3458 : // kernels with negative weights (Lanczos)
3459 4220490 : Twork fClamped = fVal;
3460 4220490 : if (fClamped < fDstMin)
3461 15998 : fClamped = fDstMin;
3462 4204490 : else if (fClamped > fDstMax)
3463 16406 : fClamped = fDstMax;
3464 4220490 : if (isIntegerDT)
3465 : {
3466 4220480 : if (bNoDataValueInt64Valid)
3467 : {
3468 4220470 : const double fClampedRounded = double(std::round(fClamped));
3469 8440960 : if (fClampedRounded >=
3470 : static_cast<double>(static_cast<Twork>(
3471 8440960 : std::numeric_limits<int64_t>::min())) &&
3472 : fClampedRounded <= static_cast<double>(static_cast<Twork>(
3473 8440960 : 9223372036854774784LL)) &&
3474 4220470 : nNodataValueInt64 ==
3475 4220480 : static_cast<GInt64>(std::round(fClamped)))
3476 : {
3477 : // Do not use the nodata value
3478 14435 : return static_cast<Twork>(dfReplacementVal);
3479 : }
3480 : }
3481 : }
3482 7 : else if (dfNoDataValue == static_cast<double>(fClamped))
3483 : {
3484 : // Do not use the nodata value
3485 1 : return static_cast<Twork>(dfReplacementVal);
3486 : }
3487 4206050 : return fClamped;
3488 : };
3489 :
3490 : /* -------------------------------------------------------------------- */
3491 : /* Allocate work buffers. */
3492 : /* -------------------------------------------------------------------- */
3493 5093 : const int nDstXSize = nDstXOff2 - nDstXOff;
3494 5093 : Twork *pafWrkScanline = nullptr;
3495 5093 : if (dstDataType != eWrkDataType)
3496 : {
3497 : pafWrkScanline =
3498 4924 : static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
3499 4924 : if (pafWrkScanline == nullptr)
3500 0 : return CE_Failure;
3501 : }
3502 :
3503 5093 : const double dfXScale = 1.0 / dfXRatioDstToSrc;
3504 5093 : const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
3505 5093 : const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
3506 5093 : const double dfYScale = 1.0 / dfYRatioDstToSrc;
3507 5093 : const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
3508 5093 : const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
3509 :
3510 : // Temporary array to store result of horizontal filter.
3511 : double *const padfHorizontalFiltered = static_cast<double *>(
3512 5093 : VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
3513 :
3514 : // To store convolution coefficients.
3515 : double *const padfWeights =
3516 5093 : static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
3517 : static_cast<int>(
3518 : 2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) + 0.5) *
3519 : sizeof(double)));
3520 :
3521 5093 : GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
3522 5093 : if (pabyChunkNodataMask)
3523 : pabyChunkNodataMaskHorizontalFiltered =
3524 438 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
3525 5093 : if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
3526 438 : (pabyChunkNodataMask != nullptr &&
3527 : pabyChunkNodataMaskHorizontalFiltered == nullptr))
3528 : {
3529 0 : VSIFree(pafWrkScanline);
3530 0 : VSIFree(padfHorizontalFiltered);
3531 0 : VSIFreeAligned(padfWeights);
3532 0 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3533 0 : return CE_Failure;
3534 : }
3535 :
3536 : /* ==================================================================== */
3537 : /* First pass: horizontal filter */
3538 : /* ==================================================================== */
3539 5093 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
3540 : #ifdef USE_SSE2
3541 5093 : const bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
3542 : #endif
3543 3042748 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
3544 : {
3545 3037659 : const double dfSrcPixel =
3546 3037659 : (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
3547 3037659 : int nSrcPixelStart =
3548 3037659 : static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
3549 3037659 : if (nSrcPixelStart < nChunkXOff)
3550 57253 : nSrcPixelStart = nChunkXOff;
3551 3037659 : int nSrcPixelStop =
3552 3037659 : static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
3553 3037659 : if (nSrcPixelStop > nChunkRightXOff)
3554 57268 : nSrcPixelStop = nChunkRightXOff;
3555 : #if 0
3556 : if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
3557 : {
3558 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3559 : }
3560 : if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
3561 : {
3562 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3563 : }
3564 : #endif
3565 3037659 : const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
3566 3037659 : double dfWeightSum = 0.0;
3567 :
3568 : // Compute convolution coefficients.
3569 3037659 : int nSrcPixel = nSrcPixelStart;
3570 3037659 : double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
3571 4424858 : for (; nSrcPixel < nSrcPixelStop - 3; nSrcPixel += 4)
3572 : {
3573 1387200 : padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
3574 1387200 : dfX += dfXScaleWeight;
3575 1387200 : padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
3576 1387200 : dfX += dfXScaleWeight;
3577 1387200 : padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
3578 1387200 : dfX += dfXScaleWeight;
3579 1387200 : padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
3580 1387200 : dfX += dfXScaleWeight;
3581 1387200 : dfWeightSum +=
3582 1387200 : pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
3583 : }
3584 7028459 : for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
3585 : {
3586 3990800 : const double dfWeight = pfnFilterFunc(dfX);
3587 3990800 : padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3588 3990800 : dfWeightSum += dfWeight;
3589 : }
3590 :
3591 3037659 : const int nHeight = nChunkYSize * nBands;
3592 3037659 : if (pabyChunkNodataMask == nullptr)
3593 : {
3594 : // For floating-point data types, we must scale down a bit values
3595 : // if input values are close to +/- std::numeric_limits<T>::max()
3596 : #ifdef OLD_CPPCHECK
3597 : constexpr double mulFactor = 1;
3598 : #else
3599 2954736 : constexpr double mulFactor =
3600 : (bNeedRescale &&
3601 : (std::is_same_v<T, float> || std::is_same_v<T, double>))
3602 : ? 2
3603 : : 1;
3604 : #endif
3605 :
3606 2954736 : if (dfWeightSum != 0)
3607 : {
3608 2954736 : const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
3609 11886984 : for (int i = 0; i < nSrcPixelCount; ++i)
3610 : {
3611 8932253 : padfWeights[i] *= dfInvWeightSum;
3612 : }
3613 : }
3614 :
3615 178104060 : const auto ScaleValue = [
3616 : #ifdef _MSC_VER
3617 : mulFactor
3618 : #endif
3619 : ](double dfVal, [[maybe_unused]] const T *inputValues,
3620 : [[maybe_unused]] int nInputValues)
3621 : {
3622 178104000 : constexpr bool isFloat =
3623 : std::is_same_v<T, float> || std::is_same_v<T, double>;
3624 : if constexpr (isFloat)
3625 : {
3626 4070140 : if (std::isfinite(dfVal))
3627 : {
3628 : return std::clamp(dfVal,
3629 12204800 : -std::numeric_limits<double>::max() /
3630 : mulFactor,
3631 4068260 : std::numeric_limits<double>::max() /
3632 4068260 : mulFactor) *
3633 4068260 : mulFactor;
3634 : }
3635 : else if constexpr (bKernelWithNegativeWeights)
3636 : {
3637 936 : if (std::isnan(dfVal))
3638 : {
3639 : // Either one of the input value is NaN or they are +/-Inf
3640 936 : const bool isPositive = inputValues[0] >= 0;
3641 6008 : for (int i = 0; i < nInputValues; ++i)
3642 : {
3643 5384 : if (std::isnan(inputValues[i]))
3644 312 : return dfVal;
3645 : // cppcheck-suppress knownConditionTrueFalse
3646 5072 : if ((inputValues[i] >= 0) != isPositive)
3647 0 : return dfVal;
3648 : }
3649 : // All values are positive or negative infinity
3650 624 : return static_cast<double>(inputValues[0]);
3651 : }
3652 : }
3653 : }
3654 174035000 : return dfVal;
3655 : };
3656 :
3657 2954736 : int iSrcLineOff = 0;
3658 : #ifdef USE_SSE2
3659 2954736 : if (nSrcPixelCount == 4)
3660 : {
3661 15867185 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3662 : {
3663 15253386 : const size_t j =
3664 15253386 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3665 15253386 : (nSrcPixelStart - nChunkXOff);
3666 15253386 : double dfVal1 = 0.0;
3667 15253386 : double dfVal2 = 0.0;
3668 15253386 : double dfVal3 = 0.0;
3669 15253386 : GDALResampleConvolutionHorizontalPixelCount4_3rows(
3670 15253386 : pChunk + j, pChunk + j + nChunkXSize,
3671 15253386 : pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
3672 : dfVal2, dfVal3);
3673 30506746 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3674 15253386 : nDstXSize +
3675 15253386 : iDstPixel - nDstXOff] =
3676 15253386 : ScaleValue(dfVal1, pChunk + j, 4);
3677 30506746 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3678 15253386 : 1) *
3679 15253386 : nDstXSize +
3680 15253386 : iDstPixel - nDstXOff] =
3681 15253386 : ScaleValue(dfVal2, pChunk + j + nChunkXSize, 4);
3682 15253795 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3683 15253386 : 2) *
3684 15253386 : nDstXSize +
3685 15253386 : iDstPixel - nDstXOff] =
3686 15253386 : ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize, 4);
3687 : }
3688 : }
3689 2340929 : else if (bSrcPixelCountLess8)
3690 : {
3691 9927838 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3692 : {
3693 7859228 : const size_t j =
3694 7859228 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3695 7859228 : (nSrcPixelStart - nChunkXOff);
3696 7859228 : double dfVal1 = 0.0;
3697 7859228 : double dfVal2 = 0.0;
3698 7859228 : double dfVal3 = 0.0;
3699 7859228 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
3700 7859228 : pChunk + j, pChunk + j + nChunkXSize,
3701 7859228 : pChunk + j + 2 * nChunkXSize, padfWeights,
3702 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3703 15718416 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3704 7859228 : nDstXSize +
3705 7859228 : iDstPixel - nDstXOff] =
3706 7859228 : ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
3707 15718416 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3708 7859228 : 1) *
3709 7859228 : nDstXSize +
3710 7859228 : iDstPixel - nDstXOff] =
3711 7859228 : ScaleValue(dfVal2, pChunk + j + nChunkXSize,
3712 : nSrcPixelCount);
3713 7859316 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3714 7859228 : 2) *
3715 7859228 : nDstXSize +
3716 7859228 : iDstPixel - nDstXOff] =
3717 7859228 : ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
3718 : nSrcPixelCount);
3719 : }
3720 : }
3721 : else
3722 : #endif
3723 : {
3724 35466358 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3725 : {
3726 35194044 : const size_t j =
3727 35194044 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3728 35194044 : (nSrcPixelStart - nChunkXOff);
3729 35194044 : double dfVal1 = 0.0;
3730 35194044 : double dfVal2 = 0.0;
3731 35194044 : double dfVal3 = 0.0;
3732 35194044 : GDALResampleConvolutionHorizontal_3rows(
3733 35194044 : pChunk + j, pChunk + j + nChunkXSize,
3734 35194044 : pChunk + j + 2 * nChunkXSize, padfWeights,
3735 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3736 70388098 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3737 35194044 : nDstXSize +
3738 35194044 : iDstPixel - nDstXOff] =
3739 35194044 : ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
3740 70388098 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3741 35194044 : 1) *
3742 35194044 : nDstXSize +
3743 35194044 : iDstPixel - nDstXOff] =
3744 35194044 : ScaleValue(dfVal2, pChunk + j + nChunkXSize,
3745 : nSrcPixelCount);
3746 35259148 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3747 35194044 : 2) *
3748 35194044 : nDstXSize +
3749 35194044 : iDstPixel - nDstXOff] =
3750 35194044 : ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
3751 : nSrcPixelCount);
3752 : }
3753 : }
3754 6138566 : for (; iSrcLineOff < nHeight; ++iSrcLineOff)
3755 : {
3756 3183826 : const size_t j =
3757 3183826 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3758 3183826 : (nSrcPixelStart - nChunkXOff);
3759 3732986 : const double dfVal = GDALResampleConvolutionHorizontal(
3760 595200 : pChunk + j, padfWeights, nSrcPixelCount);
3761 3184275 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3762 3183826 : nDstXSize +
3763 3183826 : iDstPixel - nDstXOff] =
3764 3183826 : ScaleValue(dfVal, pChunk + j, nSrcPixelCount);
3765 : }
3766 : }
3767 : else
3768 : {
3769 19187467 : for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
3770 : {
3771 19104530 : const size_t j =
3772 19104530 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3773 19104530 : (nSrcPixelStart - nChunkXOff);
3774 :
3775 : if (bKernelWithNegativeWeights)
3776 : {
3777 18579412 : int nConsecutiveValid = 0;
3778 18579412 : int nMaxConsecutiveValid = 0;
3779 170140458 : for (int k = 0; k < nSrcPixelCount; k++)
3780 : {
3781 151560146 : if (pabyChunkNodataMask[j + k])
3782 43672053 : nConsecutiveValid++;
3783 107888793 : else if (nConsecutiveValid)
3784 : {
3785 107790 : nMaxConsecutiveValid = std::max(
3786 107790 : nMaxConsecutiveValid, nConsecutiveValid);
3787 107790 : nConsecutiveValid = 0;
3788 : }
3789 : }
3790 18579412 : nMaxConsecutiveValid =
3791 18579412 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
3792 18579412 : if (nMaxConsecutiveValid < nSrcPixelCount / 2)
3793 : {
3794 12651307 : const size_t nTempOffset =
3795 12651307 : static_cast<size_t>(iSrcLineOff) * nDstXSize +
3796 12651307 : iDstPixel - nDstXOff;
3797 12651307 : padfHorizontalFiltered[nTempOffset] = 0.0;
3798 12651307 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3799 12651307 : continue;
3800 : }
3801 : }
3802 :
3803 6453233 : double dfVal = 0.0;
3804 6453233 : GDALResampleConvolutionHorizontalWithMask(
3805 44639 : pChunk + j, pabyChunkNodataMask + j, padfWeights,
3806 : nSrcPixelCount, dfVal, dfWeightSum);
3807 6453233 : const size_t nTempOffset =
3808 6453233 : static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
3809 6453233 : nDstXOff;
3810 6453233 : if (dfWeightSum > 0.0)
3811 : {
3812 6408568 : padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
3813 6408568 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
3814 : }
3815 : else
3816 : {
3817 44663 : padfHorizontalFiltered[nTempOffset] = 0.0;
3818 44663 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3819 : }
3820 : }
3821 : }
3822 : }
3823 :
3824 : /* ==================================================================== */
3825 : /* Second pass: vertical filter */
3826 : /* ==================================================================== */
3827 5093 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3828 :
3829 394928 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3830 : {
3831 389835 : Twork *const pafDstScanline =
3832 : pafWrkScanline
3833 389835 : ? pafWrkScanline
3834 8797 : : static_cast<Twork *>(pDstBuffer) +
3835 8797 : static_cast<size_t>(iDstLine - nDstYOff) * nDstXSize;
3836 :
3837 389835 : const double dfSrcLine =
3838 389835 : (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
3839 389835 : int nSrcLineStart =
3840 389835 : static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
3841 389835 : int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
3842 389835 : if (nSrcLineStart < nChunkYOff)
3843 3388 : nSrcLineStart = nChunkYOff;
3844 389835 : if (nSrcLineStop > nChunkBottomYOff)
3845 3432 : nSrcLineStop = nChunkBottomYOff;
3846 : #if 0
3847 : if( nSrcLineStart < nChunkYOff &&
3848 : nChunkYOff > 0 )
3849 : {
3850 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3851 : }
3852 : if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3853 : {
3854 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3855 : }
3856 : #endif
3857 389835 : const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
3858 389835 : double dfWeightSum = 0.0;
3859 :
3860 : // Compute convolution coefficients.
3861 389835 : int nSrcLine = nSrcLineStart; // Used after for.
3862 389835 : double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
3863 998892 : for (; nSrcLine < nSrcLineStop - 3;
3864 609057 : nSrcLine += 4, dfY += 4 * dfYScaleWeight)
3865 : {
3866 609057 : padfWeights[nSrcLine - nSrcLineStart] = dfY;
3867 609057 : padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
3868 609057 : padfWeights[nSrcLine + 2 - nSrcLineStart] =
3869 609057 : dfY + 2 * dfYScaleWeight;
3870 609057 : padfWeights[nSrcLine + 3 - nSrcLineStart] =
3871 609057 : dfY + 3 * dfYScaleWeight;
3872 609057 : dfWeightSum +=
3873 609057 : pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
3874 : }
3875 427653 : for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
3876 : {
3877 37818 : const double dfWeight = pfnFilterFunc(dfY);
3878 37818 : padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
3879 37818 : dfWeightSum += dfWeight;
3880 : }
3881 :
3882 389835 : if (pabyChunkNodataMask == nullptr)
3883 : {
3884 : // For floating-point data types, we must scale down a bit values
3885 : // if input values are close to +/- std::numeric_limits<T>::max()
3886 : #ifdef OLD_CPPCHECK
3887 : constexpr double mulFactor = 1;
3888 : #else
3889 353911 : constexpr double mulFactor =
3890 : (bNeedRescale &&
3891 : (std::is_same_v<T, float> || std::is_same_v<T, double>))
3892 : ? 2
3893 : : 1;
3894 : #endif
3895 :
3896 353911 : if (dfWeightSum != 0)
3897 : {
3898 353911 : const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
3899 2579837 : for (int i = 0; i < nSrcLineCount; ++i)
3900 2225927 : padfWeights[i] *= dfInvWeightSum;
3901 : }
3902 :
3903 353911 : int iFilteredPixelOff = 0; // Used after for.
3904 : // j used after for.
3905 353911 : size_t j =
3906 353911 : (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
3907 : #ifdef USE_SSE2
3908 : if constexpr ((!bNeedRescale ||
3909 : !std::is_same_v<T, float>)&&eWrkDataType ==
3910 : GDT_Float32)
3911 : {
3912 : #ifdef __AVX__
3913 : for (; iFilteredPixelOff < nDstXSize - 15;
3914 : iFilteredPixelOff += 16, j += 16)
3915 : {
3916 : GDALResampleConvolutionVertical_16cols(
3917 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3918 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3919 : if (bHasNoData)
3920 : {
3921 : for (int k = 0; k < 16; k++)
3922 : {
3923 : pafDstScanline[iFilteredPixelOff + k] =
3924 : replaceValIfNodata(
3925 : pafDstScanline[iFilteredPixelOff + k]);
3926 : }
3927 : }
3928 : }
3929 : #else
3930 25954967 : for (; iFilteredPixelOff < nDstXSize - 7;
3931 : iFilteredPixelOff += 8, j += 8)
3932 : {
3933 25609808 : GDALResampleConvolutionVertical_8cols(
3934 25609808 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3935 25609808 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3936 25609808 : if (bHasNoData)
3937 : {
3938 123192 : for (int k = 0; k < 8; k++)
3939 : {
3940 109504 : pafDstScanline[iFilteredPixelOff + k] =
3941 109504 : replaceValIfNodata(
3942 109504 : pafDstScanline[iFilteredPixelOff + k]);
3943 : }
3944 : }
3945 : }
3946 : #endif
3947 :
3948 809343 : for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
3949 : {
3950 464251 : const Twork fVal =
3951 464251 : static_cast<Twork>(GDALResampleConvolutionVertical(
3952 464251 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3953 : nSrcLineCount));
3954 464251 : pafDstScanline[iFilteredPixelOff] =
3955 464251 : replaceValIfNodata(fVal);
3956 : }
3957 : }
3958 : else
3959 : #endif
3960 : {
3961 5862642 : const auto ScaleValue = [
3962 : #ifdef _MSC_VER
3963 : mulFactor
3964 : #endif
3965 : ](double dfVal, [[maybe_unused]] const double *inputValues,
3966 : [[maybe_unused]] int nStride,
3967 : [[maybe_unused]] int nInputValues)
3968 : {
3969 5862640 : constexpr bool isFloat =
3970 : std::is_same_v<T, float> || std::is_same_v<T, double>;
3971 : if constexpr (isFloat)
3972 : {
3973 5862640 : if (std::isfinite(dfVal))
3974 : {
3975 : return std::clamp(
3976 : dfVal,
3977 : static_cast<double>(
3978 17585400 : -std::numeric_limits<Twork>::max()) /
3979 : mulFactor,
3980 : static_cast<double>(
3981 5861800 : std::numeric_limits<Twork>::max()) /
3982 5861800 : mulFactor) *
3983 5861800 : mulFactor;
3984 : }
3985 : else if constexpr (bKernelWithNegativeWeights)
3986 : {
3987 480 : if (std::isnan(dfVal))
3988 : {
3989 : // Either one of the input value is NaN or they are +/-Inf
3990 480 : const bool isPositive = inputValues[0] >= 0;
3991 2520 : for (int i = 0; i < nInputValues; ++i)
3992 : {
3993 2200 : if (std::isnan(inputValues[i * nStride]))
3994 160 : return dfVal;
3995 : // cppcheck-suppress knownConditionTrueFalse
3996 2040 : if ((inputValues[i] >= 0) != isPositive)
3997 0 : return dfVal;
3998 : }
3999 : // All values are positive or negative infinity
4000 320 : return inputValues[0];
4001 : }
4002 : }
4003 : }
4004 :
4005 360 : return dfVal;
4006 : };
4007 :
4008 2939422 : for (; iFilteredPixelOff < nDstXSize - 1;
4009 : iFilteredPixelOff += 2, j += 2)
4010 : {
4011 2930610 : double dfVal1 = 0.0;
4012 2930610 : double dfVal2 = 0.0;
4013 2930610 : GDALResampleConvolutionVertical_2cols(
4014 2930610 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4015 : nSrcLineCount, dfVal1, dfVal2);
4016 5861220 : pafDstScanline[iFilteredPixelOff] =
4017 2930610 : replaceValIfNodata(static_cast<Twork>(
4018 2930610 : ScaleValue(dfVal1, padfHorizontalFiltered + j,
4019 : nDstXSize, nSrcLineCount)));
4020 2930610 : pafDstScanline[iFilteredPixelOff + 1] =
4021 2930610 : replaceValIfNodata(static_cast<Twork>(
4022 2930610 : ScaleValue(dfVal2, padfHorizontalFiltered + j + 1,
4023 : nDstXSize, nSrcLineCount)));
4024 : }
4025 8819 : if (iFilteredPixelOff < nDstXSize)
4026 : {
4027 1427 : const double dfVal = GDALResampleConvolutionVertical(
4028 1427 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4029 : nSrcLineCount);
4030 1427 : pafDstScanline[iFilteredPixelOff] =
4031 1427 : replaceValIfNodata(static_cast<Twork>(
4032 1427 : ScaleValue(dfVal, padfHorizontalFiltered + j,
4033 : nDstXSize, nSrcLineCount)));
4034 : }
4035 : }
4036 : }
4037 : else
4038 : {
4039 18367351 : for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
4040 : ++iFilteredPixelOff)
4041 : {
4042 18331457 : double dfVal = 0.0;
4043 18331457 : dfWeightSum = 0.0;
4044 18331457 : size_t j = (nSrcLineStart - nChunkYOff) *
4045 18331457 : static_cast<size_t>(nDstXSize) +
4046 18331457 : iFilteredPixelOff;
4047 : if (bKernelWithNegativeWeights)
4048 : {
4049 18087901 : int nConsecutiveValid = 0;
4050 18087901 : int nMaxConsecutiveValid = 0;
4051 127256321 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
4052 : {
4053 109168020 : const double dfWeight =
4054 109168020 : padfWeights[i] *
4055 : pabyChunkNodataMaskHorizontalFiltered[j];
4056 109168020 : if (pabyChunkNodataMaskHorizontalFiltered[j])
4057 : {
4058 46108037 : nConsecutiveValid++;
4059 : }
4060 63060183 : else if (nConsecutiveValid)
4061 : {
4062 204376 : nMaxConsecutiveValid = std::max(
4063 204376 : nMaxConsecutiveValid, nConsecutiveValid);
4064 204376 : nConsecutiveValid = 0;
4065 : }
4066 109168020 : dfVal += padfHorizontalFiltered[j] * dfWeight;
4067 109168020 : dfWeightSum += dfWeight;
4068 : }
4069 18087901 : nMaxConsecutiveValid =
4070 18087901 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
4071 18087901 : if (nMaxConsecutiveValid < nSrcLineCount / 2)
4072 : {
4073 8918591 : pafDstScanline[iFilteredPixelOff] =
4074 8918499 : static_cast<Twork>(dfNoDataValue);
4075 8918591 : continue;
4076 : }
4077 : }
4078 : else
4079 : {
4080 1237062 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
4081 : {
4082 993504 : const double dfWeight =
4083 993504 : padfWeights[i] *
4084 : pabyChunkNodataMaskHorizontalFiltered[j];
4085 993504 : dfVal += padfHorizontalFiltered[j] * dfWeight;
4086 993504 : dfWeightSum += dfWeight;
4087 : }
4088 : }
4089 9412886 : if (dfWeightSum > 0.0)
4090 : {
4091 9396847 : pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
4092 9396835 : static_cast<Twork>(dfVal / dfWeightSum));
4093 : }
4094 : else
4095 : {
4096 16045 : pafDstScanline[iFilteredPixelOff] =
4097 16021 : static_cast<Twork>(dfNoDataValue);
4098 : }
4099 : }
4100 : }
4101 :
4102 389835 : if (fMaxVal != 0.0f)
4103 : {
4104 : if constexpr (std::is_same_v<T, double>)
4105 : {
4106 0 : for (int i = 0; i < nDstXSize; ++i)
4107 : {
4108 0 : if (pafDstScanline[i] > static_cast<double>(fMaxVal))
4109 0 : pafDstScanline[i] = static_cast<double>(fMaxVal);
4110 : }
4111 : }
4112 : else
4113 : {
4114 192324 : for (int i = 0; i < nDstXSize; ++i)
4115 : {
4116 192088 : if (pafDstScanline[i] > fMaxVal)
4117 96022 : pafDstScanline[i] = fMaxVal;
4118 : }
4119 : }
4120 : }
4121 :
4122 389835 : if (pafWrkScanline)
4123 : {
4124 381038 : GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
4125 : static_cast<GByte *>(pDstBuffer) +
4126 381038 : static_cast<size_t>(iDstLine - nDstYOff) *
4127 381038 : nDstXSize * nDstDataTypeSize,
4128 : dstDataType, nDstDataTypeSize, nDstXSize);
4129 : }
4130 : }
4131 :
4132 5093 : VSIFree(pafWrkScanline);
4133 5093 : VSIFreeAligned(padfWeights);
4134 5093 : VSIFree(padfHorizontalFiltered);
4135 5093 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
4136 :
4137 5093 : return CE_None;
4138 : }
4139 :
4140 : template <bool bKernelWithNegativeWeights, bool bNeedRescale>
4141 : static CPLErr
4142 5093 : GDALResampleChunk_ConvolutionInternal(const GDALOverviewResampleArgs &args,
4143 : const void *pChunk, void **ppDstBuffer,
4144 : GDALDataType *peDstBufferDataType)
4145 : {
4146 : GDALResampleAlg eResample;
4147 5093 : if (EQUAL(args.pszResampling, "BILINEAR"))
4148 2660 : eResample = GRA_Bilinear;
4149 2433 : else if (EQUAL(args.pszResampling, "CUBIC"))
4150 2284 : eResample = GRA_Cubic;
4151 149 : else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
4152 59 : eResample = GRA_CubicSpline;
4153 90 : else if (EQUAL(args.pszResampling, "LANCZOS"))
4154 90 : eResample = GRA_Lanczos;
4155 : else
4156 : {
4157 0 : CPLAssert(false);
4158 : return CE_Failure;
4159 : }
4160 5093 : const int nKernelRadius = GWKGetFilterRadius(eResample);
4161 5093 : FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
4162 : const FilterFunc4ValuesType pfnFilterFunc4Values =
4163 5093 : GWKGetFilterFunc4Values(eResample);
4164 :
4165 5093 : float fMaxVal = 0.f;
4166 : // Cubic, etc... can have overshoots, so make sure we clamp values to the
4167 : // maximum value if NBITS is set.
4168 5093 : if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
4169 8 : (args.eOvrDataType == GDT_UInt8 || args.eOvrDataType == GDT_UInt16 ||
4170 0 : args.eOvrDataType == GDT_UInt32))
4171 : {
4172 8 : int nBits = args.nOvrNBITS;
4173 8 : if (nBits == GDALGetDataTypeSizeBits(args.eOvrDataType))
4174 1 : nBits = 0;
4175 8 : if (nBits > 0 && nBits < 32)
4176 7 : fMaxVal = static_cast<float>((1U << nBits) - 1);
4177 : }
4178 :
4179 5093 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
4180 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
4181 : GDALGetDataTypeSizeBytes(args.eOvrDataType));
4182 5093 : if (*ppDstBuffer == nullptr)
4183 : {
4184 0 : return CE_Failure;
4185 : }
4186 5093 : *peDstBufferDataType = args.eOvrDataType;
4187 :
4188 5093 : switch (args.eWrkDataType)
4189 : {
4190 4225 : case GDT_UInt8:
4191 : {
4192 : return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32,
4193 : bKernelWithNegativeWeights,
4194 4225 : bNeedRescale>(
4195 : args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
4196 4225 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4197 : }
4198 :
4199 402 : case GDT_UInt16:
4200 : {
4201 : return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32,
4202 : bKernelWithNegativeWeights,
4203 402 : bNeedRescale>(
4204 : args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
4205 402 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4206 : }
4207 :
4208 375 : case GDT_Float32:
4209 : {
4210 : return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32,
4211 : bKernelWithNegativeWeights,
4212 375 : bNeedRescale>(
4213 : args, static_cast<const float *>(pChunk), *ppDstBuffer,
4214 375 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4215 : }
4216 :
4217 91 : case GDT_Float64:
4218 : {
4219 : return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64,
4220 : bKernelWithNegativeWeights,
4221 91 : bNeedRescale>(
4222 : args, static_cast<const double *>(pChunk), *ppDstBuffer,
4223 91 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4224 : }
4225 :
4226 0 : default:
4227 0 : break;
4228 : }
4229 :
4230 0 : CPLAssert(false);
4231 : return CE_Failure;
4232 : }
4233 :
4234 : static CPLErr
4235 5093 : GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
4236 : const void *pChunk, void **ppDstBuffer,
4237 : GDALDataType *peDstBufferDataType)
4238 : {
4239 5093 : if (EQUAL(args.pszResampling, "CUBIC") ||
4240 2809 : EQUAL(args.pszResampling, "LANCZOS"))
4241 : return GDALResampleChunk_ConvolutionInternal<
4242 2374 : /* bKernelWithNegativeWeights=*/true, /* bNeedRescale = */ true>(
4243 2374 : args, pChunk, ppDstBuffer, peDstBufferDataType);
4244 2719 : else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
4245 59 : return GDALResampleChunk_ConvolutionInternal<false, true>(
4246 59 : args, pChunk, ppDstBuffer, peDstBufferDataType);
4247 : else
4248 2660 : return GDALResampleChunk_ConvolutionInternal<false, false>(
4249 2660 : args, pChunk, ppDstBuffer, peDstBufferDataType);
4250 : }
4251 :
4252 : /************************************************************************/
4253 : /* GDALResampleChunkC32R() */
4254 : /************************************************************************/
4255 :
4256 2 : static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
4257 : const float *pafChunk, const int nChunkYOff,
4258 : const int nChunkYSize, const int nDstYOff,
4259 : const int nDstYOff2, const int nOvrXSize,
4260 : const int nOvrYSize, void **ppDstBuffer,
4261 : GDALDataType *peDstBufferDataType,
4262 : const char *pszResampling)
4263 :
4264 : {
4265 : enum Method
4266 : {
4267 : NEAR,
4268 : AVERAGE,
4269 : AVERAGE_MAGPHASE,
4270 : RMS,
4271 : };
4272 :
4273 2 : Method eMethod = NEAR;
4274 2 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
4275 : {
4276 0 : eMethod = NEAR;
4277 : }
4278 2 : else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
4279 : {
4280 0 : eMethod = AVERAGE_MAGPHASE;
4281 : }
4282 2 : else if (EQUAL(pszResampling, "RMS"))
4283 : {
4284 2 : eMethod = RMS;
4285 : }
4286 0 : else if (STARTS_WITH_CI(pszResampling, "AVER"))
4287 : {
4288 0 : eMethod = AVERAGE;
4289 : }
4290 : else
4291 : {
4292 0 : CPLError(
4293 : CE_Failure, CPLE_NotSupported,
4294 : "Resampling method %s is not supported for complex data types. "
4295 : "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
4296 : pszResampling);
4297 0 : return CE_Failure;
4298 : }
4299 :
4300 2 : const int nOXSize = nOvrXSize;
4301 2 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
4302 : GDALGetDataTypeSizeBytes(GDT_CFloat32));
4303 2 : if (*ppDstBuffer == nullptr)
4304 : {
4305 0 : return CE_Failure;
4306 : }
4307 2 : float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
4308 2 : *peDstBufferDataType = GDT_CFloat32;
4309 :
4310 2 : const int nOYSize = nOvrYSize;
4311 2 : const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
4312 2 : const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
4313 :
4314 : /* ==================================================================== */
4315 : /* Loop over destination scanlines. */
4316 : /* ==================================================================== */
4317 8 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
4318 : {
4319 6 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
4320 6 : if (nSrcYOff < nChunkYOff)
4321 0 : nSrcYOff = nChunkYOff;
4322 :
4323 6 : int nSrcYOff2 =
4324 6 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
4325 6 : if (nSrcYOff2 == nSrcYOff)
4326 0 : nSrcYOff2++;
4327 :
4328 6 : if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
4329 : {
4330 2 : if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
4331 0 : nSrcYOff = nSrcHeight - 1;
4332 2 : nSrcYOff2 = nSrcHeight;
4333 : }
4334 6 : if (nSrcYOff2 > nChunkYOff + nChunkYSize)
4335 0 : nSrcYOff2 = nChunkYOff + nChunkYSize;
4336 :
4337 6 : const float *const pafSrcScanline =
4338 6 : pafChunk +
4339 6 : (static_cast<size_t>(nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
4340 6 : float *const pafDstScanline =
4341 6 : pafDstBuffer +
4342 6 : static_cast<size_t>(iDstLine - nDstYOff) * 2 * nOXSize;
4343 :
4344 : /* --------------------------------------------------------------------
4345 : */
4346 : /* Loop over destination pixels */
4347 : /* --------------------------------------------------------------------
4348 : */
4349 18 : for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
4350 : {
4351 12 : const size_t iDstPixelSZ = static_cast<size_t>(iDstPixel);
4352 12 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
4353 12 : int nSrcXOff2 =
4354 12 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
4355 12 : if (nSrcXOff2 == nSrcXOff)
4356 0 : nSrcXOff2++;
4357 12 : if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
4358 : {
4359 6 : if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
4360 0 : nSrcXOff = nSrcWidth - 1;
4361 6 : nSrcXOff2 = nSrcWidth;
4362 : }
4363 12 : const size_t nSrcXOffSZ = static_cast<size_t>(nSrcXOff);
4364 :
4365 12 : if (eMethod == NEAR)
4366 : {
4367 0 : pafDstScanline[iDstPixelSZ * 2] =
4368 0 : pafSrcScanline[nSrcXOffSZ * 2];
4369 0 : pafDstScanline[iDstPixelSZ * 2 + 1] =
4370 0 : pafSrcScanline[nSrcXOffSZ * 2 + 1];
4371 : }
4372 12 : else if (eMethod == AVERAGE_MAGPHASE)
4373 : {
4374 0 : double dfTotalR = 0.0;
4375 0 : double dfTotalI = 0.0;
4376 0 : double dfTotalM = 0.0;
4377 0 : size_t nCount = 0;
4378 :
4379 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4380 : {
4381 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4382 : {
4383 0 : const double dfR = double(
4384 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4385 0 : static_cast<size_t>(iY - nSrcYOff) *
4386 0 : nSrcWidth * 2]);
4387 0 : const double dfI = double(
4388 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4389 0 : static_cast<size_t>(iY - nSrcYOff) *
4390 0 : nSrcWidth * 2 +
4391 0 : 1]);
4392 0 : dfTotalR += dfR;
4393 0 : dfTotalI += dfI;
4394 0 : dfTotalM += std::hypot(dfR, dfI);
4395 0 : ++nCount;
4396 : }
4397 : }
4398 :
4399 0 : CPLAssert(nCount > 0);
4400 0 : if (nCount == 0)
4401 : {
4402 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4403 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4404 : }
4405 : else
4406 : {
4407 0 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4408 0 : dfTotalR / static_cast<double>(nCount));
4409 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4410 0 : dfTotalI / static_cast<double>(nCount));
4411 : const double dfM =
4412 0 : double(std::hypot(pafDstScanline[iDstPixelSZ * 2],
4413 0 : pafDstScanline[iDstPixelSZ * 2 + 1]));
4414 0 : const double dfDesiredM =
4415 0 : dfTotalM / static_cast<double>(nCount);
4416 0 : double dfRatio = 1.0;
4417 0 : if (dfM != 0.0)
4418 0 : dfRatio = dfDesiredM / dfM;
4419 :
4420 0 : pafDstScanline[iDstPixelSZ * 2] *=
4421 0 : static_cast<float>(dfRatio);
4422 0 : pafDstScanline[iDstPixelSZ * 2 + 1] *=
4423 0 : static_cast<float>(dfRatio);
4424 : }
4425 : }
4426 12 : else if (eMethod == RMS)
4427 : {
4428 12 : double dfTotalR = 0.0;
4429 12 : double dfTotalI = 0.0;
4430 12 : size_t nCount = 0;
4431 :
4432 36 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4433 : {
4434 72 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4435 : {
4436 48 : const double dfR = double(
4437 48 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4438 48 : static_cast<size_t>(iY - nSrcYOff) *
4439 48 : nSrcWidth * 2]);
4440 48 : const double dfI = double(
4441 48 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4442 48 : static_cast<size_t>(iY - nSrcYOff) *
4443 48 : nSrcWidth * 2 +
4444 48 : 1]);
4445 :
4446 48 : dfTotalR += SQUARE(dfR);
4447 48 : dfTotalI += SQUARE(dfI);
4448 :
4449 48 : ++nCount;
4450 : }
4451 : }
4452 :
4453 12 : CPLAssert(nCount > 0);
4454 12 : if (nCount == 0)
4455 : {
4456 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4457 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4458 : }
4459 : else
4460 : {
4461 : /* compute RMS */
4462 12 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4463 12 : sqrt(dfTotalR / static_cast<double>(nCount)));
4464 12 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4465 12 : sqrt(dfTotalI / static_cast<double>(nCount)));
4466 : }
4467 : }
4468 0 : else if (eMethod == AVERAGE)
4469 : {
4470 0 : double dfTotalR = 0.0;
4471 0 : double dfTotalI = 0.0;
4472 0 : size_t nCount = 0;
4473 :
4474 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4475 : {
4476 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4477 : {
4478 : // TODO(schwehr): Maybe use std::complex?
4479 0 : dfTotalR += double(
4480 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4481 0 : static_cast<size_t>(iY - nSrcYOff) *
4482 0 : nSrcWidth * 2]);
4483 0 : dfTotalI += double(
4484 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4485 0 : static_cast<size_t>(iY - nSrcYOff) *
4486 0 : nSrcWidth * 2 +
4487 0 : 1]);
4488 0 : ++nCount;
4489 : }
4490 : }
4491 :
4492 0 : CPLAssert(nCount > 0);
4493 0 : if (nCount == 0)
4494 : {
4495 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4496 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4497 : }
4498 : else
4499 : {
4500 0 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4501 0 : dfTotalR / static_cast<double>(nCount));
4502 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4503 0 : dfTotalI / static_cast<double>(nCount));
4504 : }
4505 : }
4506 : }
4507 : }
4508 :
4509 2 : return CE_None;
4510 : }
4511 :
4512 : /************************************************************************/
4513 : /* GDALRegenerateCascadingOverviews() */
4514 : /* */
4515 : /* Generate a list of overviews in order from largest to */
4516 : /* smallest, computing each from the next larger. */
4517 : /************************************************************************/
4518 :
4519 44 : static CPLErr GDALRegenerateCascadingOverviews(
4520 : GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
4521 : const char *pszResampling, GDALProgressFunc pfnProgress,
4522 : void *pProgressData, CSLConstList papszOptions)
4523 :
4524 : {
4525 : /* -------------------------------------------------------------------- */
4526 : /* First, we must put the overviews in order from largest to */
4527 : /* smallest. */
4528 : /* -------------------------------------------------------------------- */
4529 127 : for (int i = 0; i < nOverviews - 1; ++i)
4530 : {
4531 292 : for (int j = 0; j < nOverviews - i - 1; ++j)
4532 : {
4533 209 : if (papoOvrBands[j]->GetXSize() *
4534 209 : static_cast<float>(papoOvrBands[j]->GetYSize()) <
4535 209 : papoOvrBands[j + 1]->GetXSize() *
4536 209 : static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
4537 : {
4538 0 : GDALRasterBand *poTempBand = papoOvrBands[j];
4539 0 : papoOvrBands[j] = papoOvrBands[j + 1];
4540 0 : papoOvrBands[j + 1] = poTempBand;
4541 : }
4542 : }
4543 : }
4544 :
4545 : /* -------------------------------------------------------------------- */
4546 : /* Count total pixels so we can prepare appropriate scaled */
4547 : /* progress functions. */
4548 : /* -------------------------------------------------------------------- */
4549 44 : double dfTotalPixels = 0.0;
4550 :
4551 171 : for (int i = 0; i < nOverviews; ++i)
4552 : {
4553 127 : dfTotalPixels += papoOvrBands[i]->GetXSize() *
4554 127 : static_cast<double>(papoOvrBands[i]->GetYSize());
4555 : }
4556 :
4557 : /* -------------------------------------------------------------------- */
4558 : /* Generate all the bands. */
4559 : /* -------------------------------------------------------------------- */
4560 44 : double dfPixelsProcessed = 0.0;
4561 :
4562 88 : CPLStringList aosOptions(papszOptions);
4563 44 : aosOptions.SetNameValue("CASCADING", "YES");
4564 171 : for (int i = 0; i < nOverviews; ++i)
4565 : {
4566 127 : GDALRasterBand *poBaseBand = poSrcBand;
4567 127 : if (i != 0)
4568 83 : poBaseBand = papoOvrBands[i - 1];
4569 :
4570 127 : double dfPixels = papoOvrBands[i]->GetXSize() *
4571 127 : static_cast<double>(papoOvrBands[i]->GetYSize());
4572 :
4573 254 : void *pScaledProgressData = GDALCreateScaledProgress(
4574 : dfPixelsProcessed / dfTotalPixels,
4575 127 : (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
4576 : pProgressData);
4577 :
4578 254 : const CPLErr eErr = GDALRegenerateOverviewsEx(
4579 : poBaseBand, 1,
4580 127 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
4581 : pszResampling, GDALScaledProgress, pScaledProgressData,
4582 127 : aosOptions.List());
4583 127 : GDALDestroyScaledProgress(pScaledProgressData);
4584 :
4585 127 : if (eErr != CE_None)
4586 0 : return eErr;
4587 :
4588 127 : dfPixelsProcessed += dfPixels;
4589 :
4590 : // Only do the bit2grayscale promotion on the base band.
4591 127 : if (STARTS_WITH_CI(pszResampling,
4592 : "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
4593 8 : pszResampling = "AVERAGE";
4594 : }
4595 :
4596 44 : return CE_None;
4597 : }
4598 :
4599 : /************************************************************************/
4600 : /* GDALGetResampleFunction() */
4601 : /************************************************************************/
4602 :
4603 5466 : GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
4604 : int *pnRadius)
4605 : {
4606 5466 : if (pnRadius)
4607 5466 : *pnRadius = 0;
4608 5466 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
4609 518 : return GDALResampleChunk_Near;
4610 4948 : else if (STARTS_WITH_CI(pszResampling, "AVER") ||
4611 4373 : EQUAL(pszResampling, "RMS"))
4612 634 : return GDALResampleChunk_AverageOrRMS;
4613 4314 : else if (EQUAL(pszResampling, "GAUSS"))
4614 : {
4615 26 : if (pnRadius)
4616 26 : *pnRadius = 1;
4617 26 : return GDALResampleChunk_Gauss;
4618 : }
4619 4288 : else if (EQUAL(pszResampling, "MODE"))
4620 136 : return GDALResampleChunk_Mode;
4621 4152 : else if (EQUAL(pszResampling, "CUBIC"))
4622 : {
4623 1639 : if (pnRadius)
4624 1639 : *pnRadius = GWKGetFilterRadius(GRA_Cubic);
4625 1639 : return GDALResampleChunk_Convolution;
4626 : }
4627 2513 : else if (EQUAL(pszResampling, "CUBICSPLINE"))
4628 : {
4629 39 : if (pnRadius)
4630 39 : *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
4631 39 : return GDALResampleChunk_Convolution;
4632 : }
4633 2474 : else if (EQUAL(pszResampling, "LANCZOS"))
4634 : {
4635 44 : if (pnRadius)
4636 44 : *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
4637 44 : return GDALResampleChunk_Convolution;
4638 : }
4639 2430 : else if (EQUAL(pszResampling, "BILINEAR"))
4640 : {
4641 2430 : if (pnRadius)
4642 2430 : *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
4643 2430 : return GDALResampleChunk_Convolution;
4644 : }
4645 : else
4646 : {
4647 0 : CPLError(
4648 : CE_Failure, CPLE_AppDefined,
4649 : "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
4650 : pszResampling);
4651 0 : return nullptr;
4652 : }
4653 : }
4654 :
4655 : /************************************************************************/
4656 : /* GDALGetOvrWorkDataType() */
4657 : /************************************************************************/
4658 :
4659 5348 : GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
4660 : GDALDataType eSrcDataType)
4661 : {
4662 5348 : if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
4663 : {
4664 646 : return eSrcDataType;
4665 : }
4666 4702 : else if (eSrcDataType == GDT_UInt8 &&
4667 4167 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4668 3682 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4669 2279 : EQUAL(pszResampling, "CUBICSPLINE") ||
4670 2274 : EQUAL(pszResampling, "LANCZOS") ||
4671 2267 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4672 : {
4673 4160 : return GDT_UInt8;
4674 : }
4675 542 : else if (eSrcDataType == GDT_UInt16 &&
4676 131 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4677 126 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4678 8 : EQUAL(pszResampling, "CUBICSPLINE") ||
4679 6 : EQUAL(pszResampling, "LANCZOS") ||
4680 3 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4681 : {
4682 131 : return GDT_UInt16;
4683 : }
4684 411 : else if (EQUAL(pszResampling, "GAUSS"))
4685 20 : return GDT_Float64;
4686 :
4687 391 : if (eSrcDataType == GDT_UInt8 || eSrcDataType == GDT_Int8 ||
4688 390 : eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
4689 : eSrcDataType == GDT_Float32)
4690 : {
4691 257 : return GDT_Float32;
4692 : }
4693 134 : return GDT_Float64;
4694 : }
4695 :
4696 : namespace
4697 : {
4698 : // Structure to hold a pointer to free with CPLFree()
4699 : struct PointerHolder
4700 : {
4701 : void *ptr = nullptr;
4702 :
4703 5833 : explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
4704 : {
4705 5833 : }
4706 :
4707 5833 : ~PointerHolder()
4708 5833 : {
4709 5833 : CPLFree(ptr);
4710 5833 : }
4711 :
4712 : PointerHolder(const PointerHolder &) = delete;
4713 : PointerHolder &operator=(const PointerHolder &) = delete;
4714 : };
4715 : } // namespace
4716 :
4717 : /************************************************************************/
4718 : /* GDALRegenerateOverviews() */
4719 : /************************************************************************/
4720 :
4721 : /**
4722 : * \brief Generate downsampled overviews.
4723 : *
4724 : * This function will generate one or more overview images from a base image
4725 : * using the requested downsampling algorithm. Its primary use is for
4726 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4727 : * used to generate downsampled images in one file from another outside the
4728 : * overview architecture.
4729 : *
4730 : * The output bands need to exist in advance.
4731 : *
4732 : * The full set of resampling algorithms is documented in
4733 : * GDALDataset::BuildOverviews().
4734 : *
4735 : * This function will honour properly NODATA_VALUES tuples (special dataset
4736 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4737 : * considered as the nodata value and not each value of the triplet
4738 : * independently per band.
4739 : *
4740 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4741 : * to "ALL_CPUS" or a integer value to specify the number of threads to use for
4742 : * overview computation.
4743 : *
4744 : * @param hSrcBand the source (base level) band.
4745 : * @param nOverviewCount the number of downsampled bands being generated.
4746 : * @param pahOvrBands the list of downsampled bands to be generated.
4747 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4748 : * @param pfnProgress progress report function.
4749 : * @param pProgressData progress function callback data.
4750 : * @return CE_None on success or CE_Failure on failure.
4751 : */
4752 250 : CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
4753 : GDALRasterBandH *pahOvrBands,
4754 : const char *pszResampling,
4755 : GDALProgressFunc pfnProgress,
4756 : void *pProgressData)
4757 :
4758 : {
4759 250 : return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
4760 : pszResampling, pfnProgress, pProgressData,
4761 250 : nullptr);
4762 : }
4763 :
4764 : /************************************************************************/
4765 : /* GDALRegenerateOverviewsEx() */
4766 : /************************************************************************/
4767 :
4768 : constexpr int RADIUS_TO_DIAMETER = 2;
4769 :
4770 : /**
4771 : * \brief Generate downsampled overviews.
4772 : *
4773 : * This function will generate one or more overview images from a base image
4774 : * using the requested downsampling algorithm. Its primary use is for
4775 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4776 : * used to generate downsampled images in one file from another outside the
4777 : * overview architecture.
4778 : *
4779 : * The output bands need to exist in advance.
4780 : *
4781 : * The full set of resampling algorithms is documented in
4782 : * GDALDataset::BuildOverviews().
4783 : *
4784 : * This function will honour properly NODATA_VALUES tuples (special dataset
4785 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4786 : * considered as the nodata value and not each value of the triplet
4787 : * independently per band.
4788 : *
4789 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4790 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4791 : * overview computation.
4792 : *
4793 : * @param hSrcBand the source (base level) band.
4794 : * @param nOverviewCount the number of downsampled bands being generated.
4795 : * @param pahOvrBands the list of downsampled bands to be generated.
4796 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4797 : * @param pfnProgress progress report function.
4798 : * @param pProgressData progress function callback data.
4799 : * @param papszOptions NULL terminated list of options as key=value pairs, or
4800 : * NULL
4801 : * @return CE_None on success or CE_Failure on failure.
4802 : * @since GDAL 3.6
4803 : */
4804 914 : CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
4805 : GDALRasterBandH *pahOvrBands,
4806 : const char *pszResampling,
4807 : GDALProgressFunc pfnProgress,
4808 : void *pProgressData, CSLConstList papszOptions)
4809 :
4810 : {
4811 914 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
4812 914 : GDALRasterBand **papoOvrBands =
4813 : reinterpret_cast<GDALRasterBand **>(pahOvrBands);
4814 :
4815 914 : if (pfnProgress == nullptr)
4816 252 : pfnProgress = GDALDummyProgress;
4817 :
4818 914 : if (EQUAL(pszResampling, "NONE"))
4819 49 : return CE_None;
4820 :
4821 865 : int nKernelRadius = 0;
4822 : GDALResampleFunction pfnResampleFn =
4823 865 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
4824 :
4825 865 : if (pfnResampleFn == nullptr)
4826 0 : return CE_Failure;
4827 :
4828 : /* -------------------------------------------------------------------- */
4829 : /* Check color tables... */
4830 : /* -------------------------------------------------------------------- */
4831 865 : GDALColorTable *poColorTable = nullptr;
4832 :
4833 494 : if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
4834 1808 : EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
4835 460 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4836 : {
4837 9 : poColorTable = poSrcBand->GetColorTable();
4838 9 : if (poColorTable != nullptr)
4839 : {
4840 9 : if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
4841 : {
4842 0 : CPLError(CE_Warning, CPLE_AppDefined,
4843 : "Computing overviews on palette index raster bands "
4844 : "with a palette whose color interpretation is not RGB "
4845 : "will probably lead to unexpected results.");
4846 0 : poColorTable = nullptr;
4847 : }
4848 9 : else if (poColorTable->IsIdentity())
4849 : {
4850 0 : poColorTable = nullptr;
4851 : }
4852 : }
4853 : else
4854 : {
4855 0 : CPLError(CE_Warning, CPLE_AppDefined,
4856 : "Computing overviews on palette index raster bands "
4857 : "without a palette will probably lead to unexpected "
4858 : "results.");
4859 : }
4860 : }
4861 : // Not ready yet
4862 2514 : else if ((EQUAL(pszResampling, "CUBIC") ||
4863 802 : EQUAL(pszResampling, "CUBICSPLINE") ||
4864 802 : EQUAL(pszResampling, "LANCZOS") ||
4865 1738 : EQUAL(pszResampling, "BILINEAR")) &&
4866 80 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4867 : {
4868 0 : CPLError(CE_Warning, CPLE_AppDefined,
4869 : "Computing %s overviews on palette index raster bands "
4870 : "will probably lead to unexpected results.",
4871 : pszResampling);
4872 : }
4873 :
4874 : // If we have a nodata mask and we are doing something more complicated
4875 : // than nearest neighbouring, we have to fetch to nodata mask.
4876 :
4877 865 : GDALRasterBand *poMaskBand = nullptr;
4878 865 : bool bUseNoDataMask = false;
4879 865 : bool bCanUseCascaded = true;
4880 :
4881 865 : if (!STARTS_WITH_CI(pszResampling, "NEAR"))
4882 : {
4883 : // Special case if we are an alpha/mask band. We want it to be
4884 : // considered as the mask band to avoid alpha=0 to be taken into account
4885 : // in average computation.
4886 540 : if (poSrcBand->IsMaskBand())
4887 : {
4888 93 : poMaskBand = poSrcBand;
4889 93 : bUseNoDataMask = true;
4890 : }
4891 : else
4892 : {
4893 447 : poMaskBand = poSrcBand->GetMaskBand();
4894 447 : const int nMaskFlags = poSrcBand->GetMaskFlags();
4895 447 : bCanUseCascaded =
4896 447 : (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
4897 447 : bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
4898 : }
4899 : }
4900 :
4901 865 : int nHasNoData = 0;
4902 865 : const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
4903 865 : const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
4904 : const bool bPropagateNoData =
4905 865 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
4906 :
4907 973 : if (poSrcBand->GetBand() == 1 && bUseNoDataMask &&
4908 108 : CSLFetchNameValue(papszOptions, "CASCADING") == nullptr)
4909 : {
4910 192 : std::string osDetailMessage;
4911 96 : if (poSrcBand->HasConflictingMaskSources(&osDetailMessage, false))
4912 : {
4913 2 : CPLError(
4914 : CE_Warning, CPLE_AppDefined, "%s%s", osDetailMessage.c_str(),
4915 : bHasNoData
4916 : ? "Only the nodata value will be taken into account."
4917 : : "Only the first listed one will be taken into account.");
4918 : }
4919 : }
4920 :
4921 : /* -------------------------------------------------------------------- */
4922 : /* If we are operating on multiple overviews, and using */
4923 : /* averaging, lets do them in cascading order to reduce the */
4924 : /* amount of computation. */
4925 : /* -------------------------------------------------------------------- */
4926 :
4927 : // In case the mask made be computed from another band of the dataset,
4928 : // we can't use cascaded generation, as the computation of the overviews
4929 : // of the band used for the mask band may not have yet occurred (#3033).
4930 865 : if ((STARTS_WITH_CI(pszResampling, "AVER") ||
4931 494 : EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
4932 463 : EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
4933 409 : EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
4934 865 : EQUAL(pszResampling, "MODE")) &&
4935 44 : nOverviewCount > 1 && bCanUseCascaded)
4936 44 : return GDALRegenerateCascadingOverviews(
4937 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
4938 44 : pProgressData, papszOptions);
4939 :
4940 : /* -------------------------------------------------------------------- */
4941 : /* Setup one horizontal swath to read from the raw buffer. */
4942 : /* -------------------------------------------------------------------- */
4943 821 : int nFRXBlockSize = 0;
4944 821 : int nFRYBlockSize = 0;
4945 821 : poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
4946 :
4947 821 : const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
4948 1317 : const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
4949 1267 : EQUAL(pszResampling, "MODE") ||
4950 446 : !GDALDataTypeIsComplex(eSrcDataType);
4951 : const GDALDataType eWrkDataType =
4952 : bUseGenericResampleFn
4953 821 : ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
4954 821 : : GDT_CFloat32;
4955 :
4956 821 : const int nWidth = poSrcBand->GetXSize();
4957 821 : const int nHeight = poSrcBand->GetYSize();
4958 :
4959 821 : int nMaxOvrFactor = 1;
4960 1761 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
4961 : {
4962 940 : const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
4963 940 : const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
4964 940 : nMaxOvrFactor = std::max(
4965 : nMaxOvrFactor,
4966 940 : static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
4967 940 : nMaxOvrFactor = std::max(
4968 : nMaxOvrFactor,
4969 940 : static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
4970 : }
4971 :
4972 821 : int nFullResYChunk = nFRYBlockSize;
4973 821 : int nMaxChunkYSizeQueried = 0;
4974 :
4975 : const auto UpdateChunkHeightAndGetChunkSize =
4976 10784 : [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
4977 87233 : eWrkDataType, nWidth]()
4978 : {
4979 : // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
4980 : // + nFullResYChunk) / nMaxOvrFactor)
4981 10784 : if (nMaxOvrFactor > INT_MAX / RADIUS_TO_DIAMETER)
4982 : {
4983 1 : return GINTBIG_MAX;
4984 : }
4985 10783 : nFullResYChunk =
4986 10783 : std::max(nFullResYChunk, RADIUS_TO_DIAMETER * nMaxOvrFactor);
4987 10783 : if ((nKernelRadius > 0 &&
4988 970 : nMaxOvrFactor > INT_MAX / (RADIUS_TO_DIAMETER * nKernelRadius)) ||
4989 10783 : nFullResYChunk >
4990 10783 : INT_MAX - RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor)
4991 : {
4992 0 : return GINTBIG_MAX;
4993 : }
4994 10783 : nMaxChunkYSizeQueried =
4995 10783 : nFullResYChunk + RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor;
4996 10783 : if (GDALGetDataTypeSizeBytes(eWrkDataType) >
4997 10783 : std::numeric_limits<int64_t>::max() /
4998 10783 : (static_cast<int64_t>(nMaxChunkYSizeQueried) * nWidth))
4999 : {
5000 1 : return GINTBIG_MAX;
5001 : }
5002 10782 : return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
5003 10782 : nMaxChunkYSizeQueried * nWidth;
5004 821 : };
5005 :
5006 : const char *pszChunkYSize =
5007 821 : CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
5008 : #ifndef __COVERITY__
5009 : // Only configurable for debug / testing
5010 821 : if (pszChunkYSize)
5011 : {
5012 0 : nFullResYChunk = atoi(pszChunkYSize);
5013 : }
5014 : #endif
5015 :
5016 : // Only configurable for debug / testing
5017 : const int nChunkMaxSize =
5018 821 : atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
5019 :
5020 821 : auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
5021 821 : if (nChunkSize > nChunkMaxSize)
5022 : {
5023 15 : if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
5024 44 : !GDALDataTypeIsComplex(eSrcDataType) &&
5025 14 : (!STARTS_WITH_CI(pszResampling, "AVER") ||
5026 2 : EQUAL(pszResampling, "AVERAGE")))
5027 : {
5028 : // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
5029 : // which use a block based strategy, which is much less memory
5030 : // hungry.
5031 14 : return GDALRegenerateOverviewsMultiBand(
5032 : 1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
5033 14 : pfnProgress, pProgressData, papszOptions);
5034 : }
5035 1 : else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
5036 : {
5037 0 : return GDALRegenerateCascadingOverviews(
5038 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
5039 0 : pfnProgress, pProgressData, papszOptions);
5040 : }
5041 : }
5042 806 : else if (pszChunkYSize == nullptr)
5043 : {
5044 : // Try to get as close as possible to nChunkMaxSize
5045 10769 : while (nChunkSize < nChunkMaxSize / 2)
5046 : {
5047 9963 : nFullResYChunk *= 2;
5048 9963 : nChunkSize = UpdateChunkHeightAndGetChunkSize();
5049 : }
5050 : }
5051 :
5052 : // Structure describing a resampling job
5053 : struct OvrJob
5054 : {
5055 : // Buffers to free when job is finished
5056 : std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
5057 : std::shared_ptr<PointerHolder> oSrcBufferHolder{};
5058 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
5059 :
5060 : GDALRasterBand *poDstBand = nullptr;
5061 :
5062 : // Input parameters of pfnResampleFn
5063 : GDALResampleFunction pfnResampleFn = nullptr;
5064 : int nSrcWidth = 0;
5065 : int nSrcHeight = 0;
5066 : int nDstWidth = 0;
5067 : GDALOverviewResampleArgs args{};
5068 : const void *pChunk = nullptr;
5069 : bool bUseGenericResampleFn = false;
5070 :
5071 : // Output values of resampling function
5072 : CPLErr eErr = CE_Failure;
5073 : void *pDstBuffer = nullptr;
5074 : GDALDataType eDstBufferDataType = GDT_Unknown;
5075 :
5076 0 : void SetSrcMaskBufferHolder(
5077 : const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
5078 : {
5079 0 : oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
5080 0 : }
5081 :
5082 0 : void SetSrcBufferHolder(
5083 : const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
5084 : {
5085 0 : oSrcBufferHolder = oSrcBufferHolderIn;
5086 0 : }
5087 :
5088 909 : void NotifyFinished()
5089 : {
5090 1818 : std::lock_guard guard(mutex);
5091 909 : bFinished = true;
5092 909 : cv.notify_one();
5093 909 : }
5094 :
5095 0 : bool IsFinished()
5096 : {
5097 0 : std::lock_guard guard(mutex);
5098 0 : return bFinished;
5099 : }
5100 :
5101 0 : void WaitFinished()
5102 : {
5103 0 : std::unique_lock oGuard(mutex);
5104 0 : while (!bFinished)
5105 : {
5106 0 : cv.wait(oGuard);
5107 : }
5108 0 : }
5109 :
5110 : private:
5111 : // Synchronization
5112 : bool bFinished = false;
5113 : std::mutex mutex{};
5114 : std::condition_variable cv{};
5115 : };
5116 :
5117 : // Thread function to resample
5118 909 : const auto JobResampleFunc = [](void *pData)
5119 : {
5120 909 : OvrJob *poJob = static_cast<OvrJob *>(pData);
5121 :
5122 909 : if (poJob->bUseGenericResampleFn)
5123 : {
5124 907 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
5125 : &(poJob->pDstBuffer),
5126 : &(poJob->eDstBufferDataType));
5127 : }
5128 : else
5129 : {
5130 2 : poJob->eErr = GDALResampleChunkC32R(
5131 : poJob->nSrcWidth, poJob->nSrcHeight,
5132 2 : static_cast<const float *>(poJob->pChunk),
5133 : poJob->args.nChunkYOff, poJob->args.nChunkYSize,
5134 : poJob->args.nDstYOff, poJob->args.nDstYOff2,
5135 : poJob->args.nOvrXSize, poJob->args.nOvrYSize,
5136 : &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
5137 : poJob->args.pszResampling);
5138 : }
5139 :
5140 : poJob->oDstBufferHolder =
5141 909 : std::make_unique<PointerHolder>(poJob->pDstBuffer);
5142 :
5143 909 : poJob->NotifyFinished();
5144 909 : };
5145 :
5146 : // Function to write resample data to target band
5147 909 : const auto WriteJobData = [](const OvrJob *poJob)
5148 : {
5149 1818 : return poJob->poDstBand->RasterIO(
5150 909 : GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
5151 909 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
5152 909 : poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
5153 909 : poJob->eDstBufferDataType, 0, 0, nullptr);
5154 : };
5155 :
5156 : // Wait for completion of oldest job and serialize it
5157 : const auto WaitAndFinalizeOldestJob =
5158 0 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
5159 : {
5160 0 : auto poOldestJob = jobList.front().get();
5161 0 : poOldestJob->WaitFinished();
5162 0 : CPLErr l_eErr = poOldestJob->eErr;
5163 0 : if (l_eErr == CE_None)
5164 : {
5165 0 : l_eErr = WriteJobData(poOldestJob);
5166 : }
5167 :
5168 0 : jobList.pop_front();
5169 0 : return l_eErr;
5170 : };
5171 :
5172 : // Queue of jobs
5173 1614 : std::list<std::unique_ptr<OvrJob>> jobList;
5174 :
5175 807 : GByte *pabyChunkNodataMask = nullptr;
5176 807 : void *pChunk = nullptr;
5177 :
5178 807 : const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
5179 3228 : const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
5180 807 : ? CPLGetNumCPUs()
5181 807 : : atoi(pszThreads)));
5182 : auto poThreadPool =
5183 807 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5184 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5185 1614 : : std::unique_ptr<CPLJobQueue>(nullptr);
5186 :
5187 : /* -------------------------------------------------------------------- */
5188 : /* Loop over image operating on chunks. */
5189 : /* -------------------------------------------------------------------- */
5190 807 : int nChunkYOff = 0;
5191 807 : CPLErr eErr = CE_None;
5192 :
5193 1619 : for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
5194 812 : nChunkYOff += nFullResYChunk)
5195 : {
5196 812 : if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
5197 : pProgressData))
5198 : {
5199 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5200 0 : eErr = CE_Failure;
5201 : }
5202 :
5203 812 : if (nFullResYChunk + nChunkYOff > nHeight)
5204 804 : nFullResYChunk = nHeight - nChunkYOff;
5205 :
5206 812 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
5207 812 : int nChunkYSizeQueried =
5208 812 : nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
5209 812 : if (nChunkYOffQueried < 0)
5210 : {
5211 83 : nChunkYSizeQueried += nChunkYOffQueried;
5212 83 : nChunkYOffQueried = 0;
5213 : }
5214 812 : if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
5215 83 : nChunkYSizeQueried = nHeight - nChunkYOffQueried;
5216 :
5217 : // Avoid accumulating too many tasks and exhaust RAM
5218 : // Try to complete already finished jobs
5219 812 : while (eErr == CE_None && !jobList.empty())
5220 : {
5221 0 : auto poOldestJob = jobList.front().get();
5222 0 : if (!poOldestJob->IsFinished())
5223 0 : break;
5224 0 : eErr = poOldestJob->eErr;
5225 0 : if (eErr == CE_None)
5226 : {
5227 0 : eErr = WriteJobData(poOldestJob);
5228 : }
5229 :
5230 0 : jobList.pop_front();
5231 : }
5232 :
5233 : // And in case we have saturated the number of threads,
5234 : // wait for completion of tasks to go below the threshold.
5235 1624 : while (eErr == CE_None &&
5236 812 : jobList.size() >= static_cast<size_t>(nThreads))
5237 : {
5238 0 : eErr = WaitAndFinalizeOldestJob(jobList);
5239 : }
5240 :
5241 : // (Re)allocate buffers if needed
5242 812 : if (pChunk == nullptr)
5243 : {
5244 807 : pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
5245 : nMaxChunkYSizeQueried, nWidth);
5246 : }
5247 812 : if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
5248 : {
5249 : pabyChunkNodataMask = static_cast<GByte *>(
5250 287 : VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
5251 : }
5252 :
5253 812 : if (pChunk == nullptr ||
5254 287 : (bUseNoDataMask && pabyChunkNodataMask == nullptr))
5255 : {
5256 0 : CPLFree(pChunk);
5257 0 : CPLFree(pabyChunkNodataMask);
5258 0 : return CE_Failure;
5259 : }
5260 :
5261 : // Read chunk.
5262 812 : if (eErr == CE_None)
5263 812 : eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
5264 : nChunkYSizeQueried, pChunk, nWidth,
5265 : nChunkYSizeQueried, eWrkDataType, 0, 0,
5266 : nullptr);
5267 812 : if (eErr == CE_None && bUseNoDataMask)
5268 287 : eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
5269 : nChunkYSizeQueried, pabyChunkNodataMask,
5270 : nWidth, nChunkYSizeQueried, GDT_UInt8,
5271 : 0, 0, nullptr);
5272 :
5273 : // Special case to promote 1bit data to 8bit 0/255 values.
5274 812 : if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
5275 : {
5276 9 : if (eWrkDataType == GDT_Float32)
5277 : {
5278 0 : float *pafChunk = static_cast<float *>(pChunk);
5279 0 : for (size_t i = 0;
5280 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5281 : {
5282 0 : if (pafChunk[i] == 1.0f)
5283 0 : pafChunk[i] = 255.0f;
5284 : }
5285 : }
5286 9 : else if (eWrkDataType == GDT_UInt8)
5287 : {
5288 9 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
5289 168417 : for (size_t i = 0;
5290 168417 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5291 : {
5292 168408 : if (pabyChunk[i] == 1)
5293 127437 : pabyChunk[i] = 255;
5294 : }
5295 : }
5296 0 : else if (eWrkDataType == GDT_UInt16)
5297 : {
5298 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
5299 0 : for (size_t i = 0;
5300 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5301 : {
5302 0 : if (pasChunk[i] == 1)
5303 0 : pasChunk[i] = 255;
5304 : }
5305 : }
5306 0 : else if (eWrkDataType == GDT_Float64)
5307 : {
5308 0 : double *padfChunk = static_cast<double *>(pChunk);
5309 0 : for (size_t i = 0;
5310 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5311 : {
5312 0 : if (padfChunk[i] == 1.0)
5313 0 : padfChunk[i] = 255.0;
5314 : }
5315 : }
5316 : else
5317 : {
5318 0 : CPLAssert(false);
5319 : }
5320 : }
5321 803 : else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
5322 : {
5323 0 : if (eWrkDataType == GDT_Float32)
5324 : {
5325 0 : float *pafChunk = static_cast<float *>(pChunk);
5326 0 : for (size_t i = 0;
5327 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5328 : {
5329 0 : if (pafChunk[i] == 1.0f)
5330 0 : pafChunk[i] = 0.0f;
5331 0 : else if (pafChunk[i] == 0.0f)
5332 0 : pafChunk[i] = 255.0f;
5333 : }
5334 : }
5335 0 : else if (eWrkDataType == GDT_UInt8)
5336 : {
5337 0 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
5338 0 : for (size_t i = 0;
5339 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5340 : {
5341 0 : if (pabyChunk[i] == 1)
5342 0 : pabyChunk[i] = 0;
5343 0 : else if (pabyChunk[i] == 0)
5344 0 : pabyChunk[i] = 255;
5345 : }
5346 : }
5347 0 : else if (eWrkDataType == GDT_UInt16)
5348 : {
5349 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
5350 0 : for (size_t i = 0;
5351 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5352 : {
5353 0 : if (pasChunk[i] == 1)
5354 0 : pasChunk[i] = 0;
5355 0 : else if (pasChunk[i] == 0)
5356 0 : pasChunk[i] = 255;
5357 : }
5358 : }
5359 0 : else if (eWrkDataType == GDT_Float64)
5360 : {
5361 0 : double *padfChunk = static_cast<double *>(pChunk);
5362 0 : for (size_t i = 0;
5363 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5364 : {
5365 0 : if (padfChunk[i] == 1.0)
5366 0 : padfChunk[i] = 0.0;
5367 0 : else if (padfChunk[i] == 0.0)
5368 0 : padfChunk[i] = 255.0;
5369 : }
5370 : }
5371 : else
5372 : {
5373 0 : CPLAssert(false);
5374 : }
5375 : }
5376 :
5377 : auto oSrcBufferHolder =
5378 1624 : std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
5379 : auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
5380 1624 : poJobQueue ? pabyChunkNodataMask : nullptr);
5381 :
5382 1721 : for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
5383 : ++iOverview)
5384 : {
5385 909 : GDALRasterBand *poDstBand = papoOvrBands[iOverview];
5386 909 : const int nDstWidth = poDstBand->GetXSize();
5387 909 : const int nDstHeight = poDstBand->GetYSize();
5388 :
5389 909 : const double dfXRatioDstToSrc =
5390 909 : static_cast<double>(nWidth) / nDstWidth;
5391 909 : const double dfYRatioDstToSrc =
5392 909 : static_cast<double>(nHeight) / nDstHeight;
5393 :
5394 : /* --------------------------------------------------------------------
5395 : */
5396 : /* Figure out the line to start writing to, and the first line
5397 : */
5398 : /* to not write to. In theory this approach should ensure that
5399 : */
5400 : /* every output line will be written if all input chunks are */
5401 : /* processed. */
5402 : /* --------------------------------------------------------------------
5403 : */
5404 909 : int nDstYOff =
5405 909 : static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
5406 909 : if (nDstYOff == nDstHeight)
5407 0 : continue;
5408 909 : int nDstYOff2 = static_cast<int>(
5409 909 : 0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
5410 :
5411 909 : if (nChunkYOff + nFullResYChunk == nHeight)
5412 902 : nDstYOff2 = nDstHeight;
5413 : #if DEBUG_VERBOSE
5414 : CPLDebug("GDAL",
5415 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
5416 : nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
5417 : nDstWidth, nDstYOff2 - nDstYOff);
5418 : #endif
5419 :
5420 1818 : auto poJob = std::make_unique<OvrJob>();
5421 909 : poJob->pfnResampleFn = pfnResampleFn;
5422 909 : poJob->bUseGenericResampleFn = bUseGenericResampleFn;
5423 909 : poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
5424 909 : poJob->args.nOvrXSize = poDstBand->GetXSize();
5425 909 : poJob->args.nOvrYSize = poDstBand->GetYSize();
5426 : const char *pszNBITS =
5427 909 : poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
5428 909 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
5429 909 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
5430 909 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
5431 909 : poJob->args.eWrkDataType = eWrkDataType;
5432 909 : poJob->pChunk = pChunk;
5433 909 : poJob->args.pabyChunkNodataMask = pabyChunkNodataMask;
5434 909 : poJob->nSrcWidth = nWidth;
5435 909 : poJob->nSrcHeight = nHeight;
5436 909 : poJob->args.nChunkXOff = 0;
5437 909 : poJob->args.nChunkXSize = nWidth;
5438 909 : poJob->args.nChunkYOff = nChunkYOffQueried;
5439 909 : poJob->args.nChunkYSize = nChunkYSizeQueried;
5440 909 : poJob->nDstWidth = nDstWidth;
5441 909 : poJob->args.nDstXOff = 0;
5442 909 : poJob->args.nDstXOff2 = nDstWidth;
5443 909 : poJob->args.nDstYOff = nDstYOff;
5444 909 : poJob->args.nDstYOff2 = nDstYOff2;
5445 909 : poJob->poDstBand = poDstBand;
5446 909 : poJob->args.pszResampling = pszResampling;
5447 909 : poJob->args.bHasNoData = bHasNoData;
5448 909 : poJob->args.dfNoDataValue = dfNoDataValue;
5449 909 : poJob->args.poColorTable = poColorTable;
5450 909 : poJob->args.eSrcDataType = eSrcDataType;
5451 909 : poJob->args.bPropagateNoData = bPropagateNoData;
5452 :
5453 909 : if (poJobQueue)
5454 : {
5455 0 : poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
5456 0 : poJob->SetSrcBufferHolder(oSrcBufferHolder);
5457 0 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
5458 0 : jobList.emplace_back(std::move(poJob));
5459 : }
5460 : else
5461 : {
5462 909 : JobResampleFunc(poJob.get());
5463 909 : eErr = poJob->eErr;
5464 909 : if (eErr == CE_None)
5465 : {
5466 909 : eErr = WriteJobData(poJob.get());
5467 : }
5468 : }
5469 : }
5470 :
5471 812 : if (poJobQueue)
5472 : {
5473 0 : pChunk = nullptr;
5474 0 : pabyChunkNodataMask = nullptr;
5475 : }
5476 : }
5477 :
5478 807 : VSIFree(pChunk);
5479 807 : VSIFree(pabyChunkNodataMask);
5480 :
5481 : // Wait for all pending jobs to complete
5482 807 : while (!jobList.empty())
5483 : {
5484 0 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5485 0 : if (l_eErr != CE_None && eErr == CE_None)
5486 0 : eErr = l_eErr;
5487 : }
5488 :
5489 : /* -------------------------------------------------------------------- */
5490 : /* Renormalized overview mean / stddev if needed. */
5491 : /* -------------------------------------------------------------------- */
5492 807 : if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
5493 : {
5494 0 : GDALOverviewMagnitudeCorrection(
5495 : poSrcBand, nOverviewCount,
5496 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
5497 : GDALDummyProgress, nullptr);
5498 : }
5499 :
5500 : /* -------------------------------------------------------------------- */
5501 : /* It can be important to flush out data to overviews. */
5502 : /* -------------------------------------------------------------------- */
5503 1709 : for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
5504 : ++iOverview)
5505 : {
5506 902 : eErr = papoOvrBands[iOverview]->FlushCache(false);
5507 : }
5508 :
5509 807 : if (eErr == CE_None)
5510 807 : pfnProgress(1.0, nullptr, pProgressData);
5511 :
5512 807 : return eErr;
5513 : }
5514 :
5515 : /************************************************************************/
5516 : /* GDALRegenerateOverviewsMultiBand() */
5517 : /************************************************************************/
5518 :
5519 : /**
5520 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5521 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5522 : *
5523 : * This function will generate one or more overview images from a base
5524 : * image using the requested downsampling algorithm. Its primary use
5525 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
5526 : * can also be used to generate downsampled images in one file from another
5527 : * outside the overview architecture.
5528 : *
5529 : * The output bands need to exist in advance and share the same characteristics
5530 : * (type, dimensions)
5531 : *
5532 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5533 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE".
5534 : *
5535 : * It does not support color tables or complex data types.
5536 : *
5537 : * The pseudo-algorithm used by the function is :
5538 : * for each overview
5539 : * iterate on lines of the source by a step of deltay
5540 : * iterate on columns of the source by a step of deltax
5541 : * read the source data of size deltax * deltay for all the bands
5542 : * generate the corresponding overview block for all the bands
5543 : *
5544 : * This function will properly honour NODATA_VALUES tuples (special dataset
5545 : * metadata) so that only a given RGB triplet (in case of an RGB image) will be
5546 : * considered as the nodata value and not each value of the triplet
5547 : * independently per band.
5548 : *
5549 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
5550 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
5551 : * overview computation.
5552 : *
5553 : * @param nBands the number of bands, size of papoSrcBands and size of
5554 : * first dimension of papapoOverviewBands
5555 : * @param papoSrcBands the list of source bands to downsample
5556 : * @param nOverviews the number of downsampled overview levels being generated.
5557 : * @param papapoOverviewBands two-dimensional array of bands. First dimension
5558 : * is indexed by nBands. Second dimension is indexed by
5559 : * nOverviews.
5560 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5561 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
5562 : * @param pfnProgress progress report function.
5563 : * @param pProgressData progress function callback data.
5564 : * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
5565 : * key=value pairs, or NULL
5566 : * Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
5567 : * options can be specified to express that overviews should
5568 : * be regenerated only in the specified subset of the source
5569 : * dataset.
5570 : * @return CE_None on success or CE_Failure on failure.
5571 : */
5572 :
5573 383 : CPLErr GDALRegenerateOverviewsMultiBand(
5574 : int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
5575 : GDALRasterBand *const *const *papapoOverviewBands,
5576 : const char *pszResampling, GDALProgressFunc pfnProgress,
5577 : void *pProgressData, CSLConstList papszOptions)
5578 : {
5579 383 : CPL_IGNORE_RET_VAL(papszOptions);
5580 :
5581 383 : if (pfnProgress == nullptr)
5582 11 : pfnProgress = GDALDummyProgress;
5583 :
5584 383 : if (EQUAL(pszResampling, "NONE") || nBands == 0 || nOverviews == 0)
5585 3 : return CE_None;
5586 :
5587 : // Sanity checks.
5588 380 : if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
5589 187 : !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
5590 78 : !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
5591 22 : !EQUAL(pszResampling, "CUBICSPLINE") &&
5592 21 : !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
5593 5 : !EQUAL(pszResampling, "MODE"))
5594 : {
5595 0 : CPLError(CE_Failure, CPLE_NotSupported,
5596 : "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
5597 : "not supported",
5598 : pszResampling);
5599 0 : return CE_Failure;
5600 : }
5601 :
5602 380 : int nKernelRadius = 0;
5603 : GDALResampleFunction pfnResampleFn =
5604 380 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
5605 380 : if (pfnResampleFn == nullptr)
5606 0 : return CE_Failure;
5607 :
5608 380 : const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
5609 380 : const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
5610 380 : if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
5611 0 : return CE_None;
5612 380 : GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
5613 66221 : for (int iBand = 1; iBand < nBands; ++iBand)
5614 : {
5615 131682 : if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
5616 65841 : papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
5617 : {
5618 0 : CPLError(
5619 : CE_Failure, CPLE_NotSupported,
5620 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5621 : "have the same dimensions");
5622 0 : return CE_Failure;
5623 : }
5624 65841 : if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
5625 : {
5626 0 : CPLError(
5627 : CE_Failure, CPLE_NotSupported,
5628 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5629 : "have the same data type");
5630 0 : return CE_Failure;
5631 : }
5632 : }
5633 :
5634 1013 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5635 : {
5636 633 : const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
5637 633 : const int nDstWidth = poOvrFirstBand->GetXSize();
5638 633 : const int nDstHeight = poOvrFirstBand->GetYSize();
5639 66732 : for (int iBand = 1; iBand < nBands; ++iBand)
5640 : {
5641 66099 : const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
5642 132198 : if (poOvrBand->GetXSize() != nDstWidth ||
5643 66099 : poOvrBand->GetYSize() != nDstHeight)
5644 : {
5645 0 : CPLError(
5646 : CE_Failure, CPLE_NotSupported,
5647 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5648 : "of the same level must have the same dimensions");
5649 0 : return CE_Failure;
5650 : }
5651 66099 : if (poOvrBand->GetRasterDataType() != eDataType)
5652 : {
5653 0 : CPLError(
5654 : CE_Failure, CPLE_NotSupported,
5655 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5656 : "must have the same data type as the source bands");
5657 0 : return CE_Failure;
5658 : }
5659 : }
5660 : }
5661 :
5662 : // First pass to compute the total number of pixels to write.
5663 380 : double dfTotalPixelCount = 0;
5664 380 : const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
5665 380 : const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
5666 380 : const int nSrcXSize = atoi(CSLFetchNameValueDef(
5667 : papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
5668 380 : const int nSrcYSize = atoi(CSLFetchNameValueDef(
5669 : papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
5670 1013 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5671 : {
5672 633 : dfTotalPixelCount +=
5673 1266 : static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
5674 633 : papapoOverviewBands[0][iOverview]->GetXSize() *
5675 1266 : static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
5676 633 : papapoOverviewBands[0][iOverview]->GetYSize();
5677 : }
5678 :
5679 : const GDALDataType eWrkDataType =
5680 380 : GDALGetOvrWorkDataType(pszResampling, eDataType);
5681 : const int nWrkDataTypeSize =
5682 380 : std::max(1, GDALGetDataTypeSizeBytes(eWrkDataType));
5683 :
5684 380 : const bool bIsMask = papoSrcBands[0]->IsMaskBand();
5685 :
5686 : // If we have a nodata mask and we are doing something more complicated
5687 : // than nearest neighbouring, we have to fetch to nodata mask.
5688 : const bool bUseNoDataMask =
5689 561 : !STARTS_WITH_CI(pszResampling, "NEAR") &&
5690 181 : (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
5691 :
5692 760 : std::vector<bool> abHasNoData(nBands);
5693 760 : std::vector<double> adfNoDataValue(nBands);
5694 :
5695 66601 : for (int iBand = 0; iBand < nBands; ++iBand)
5696 : {
5697 66221 : int nHasNoData = 0;
5698 132442 : adfNoDataValue[iBand] =
5699 66221 : papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
5700 66221 : abHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
5701 : }
5702 :
5703 760 : std::string osDetailMessage;
5704 432 : if (bUseNoDataMask &&
5705 52 : papoSrcBands[0]->HasConflictingMaskSources(&osDetailMessage, false))
5706 : {
5707 9 : CPLError(CE_Warning, CPLE_AppDefined, "%s%s", osDetailMessage.c_str(),
5708 18 : abHasNoData[0]
5709 : ? "Only the nodata value will be taken into account."
5710 9 : : "Only the first listed one will be taken into account.");
5711 : }
5712 :
5713 : const bool bPropagateNoData =
5714 380 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5715 :
5716 380 : const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
5717 1520 : const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
5718 380 : ? CPLGetNumCPUs()
5719 380 : : atoi(pszThreads)));
5720 : auto poThreadPool =
5721 380 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5722 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5723 760 : : std::unique_ptr<CPLJobQueue>(nullptr);
5724 :
5725 : // Only configurable for debug / testing
5726 380 : const GIntBig nChunkMaxSize = []() -> GIntBig
5727 : {
5728 : const char *pszVal =
5729 380 : CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", nullptr);
5730 380 : if (pszVal)
5731 : {
5732 15 : GIntBig nRet = 0;
5733 15 : CPLParseMemorySize(pszVal, &nRet, nullptr);
5734 15 : return std::max<GIntBig>(100, nRet);
5735 : }
5736 365 : return 10 * 1024 * 1024;
5737 380 : }();
5738 :
5739 : // Only configurable for debug / testing
5740 380 : const GIntBig nChunkMaxSizeForTempFile = []() -> GIntBig
5741 : {
5742 380 : const char *pszVal = CPLGetConfigOption(
5743 : "GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE", nullptr);
5744 380 : if (pszVal)
5745 : {
5746 14 : GIntBig nRet = 0;
5747 14 : CPLParseMemorySize(pszVal, &nRet, nullptr);
5748 14 : return std::max<GIntBig>(100, nRet);
5749 : }
5750 366 : const auto nUsableRAM = CPLGetUsablePhysicalRAM();
5751 366 : if (nUsableRAM > 0)
5752 366 : return nUsableRAM / 10;
5753 : // Select a value to be able to at least downsample by 2 for a RGB
5754 : // 1024x1024 tiled output: (2 * 1024 + 2) * (2 * 1024 + 2) * 3 = 12 MB
5755 0 : return 100 * 1024 * 1024;
5756 380 : }();
5757 :
5758 : // Second pass to do the real job.
5759 380 : double dfCurPixelCount = 0;
5760 380 : CPLErr eErr = CE_None;
5761 1007 : for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
5762 : ++iOverview)
5763 : {
5764 632 : int iSrcOverview = -1; // -1 means the source bands.
5765 :
5766 : const int nDstTotalWidth =
5767 632 : papapoOverviewBands[0][iOverview]->GetXSize();
5768 : const int nDstTotalHeight =
5769 632 : papapoOverviewBands[0][iOverview]->GetYSize();
5770 :
5771 : // Compute the coordinates of the target region to refresh
5772 632 : constexpr double EPS = 1e-8;
5773 632 : const int nDstXOffStart = static_cast<int>(
5774 632 : static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
5775 : EPS);
5776 : const int nDstXOffEnd =
5777 1264 : std::min(static_cast<int>(
5778 632 : std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
5779 632 : nToplevelSrcWidth * nDstTotalWidth -
5780 : EPS)),
5781 632 : nDstTotalWidth);
5782 632 : const int nDstWidth = nDstXOffEnd - nDstXOffStart;
5783 632 : const int nDstYOffStart =
5784 632 : static_cast<int>(static_cast<double>(nSrcYOff) /
5785 632 : nToplevelSrcHeight * nDstTotalHeight +
5786 : EPS);
5787 : const int nDstYOffEnd =
5788 1264 : std::min(static_cast<int>(
5789 632 : std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
5790 632 : nToplevelSrcHeight * nDstTotalHeight -
5791 : EPS)),
5792 632 : nDstTotalHeight);
5793 632 : const int nDstHeight = nDstYOffEnd - nDstYOffStart;
5794 :
5795 : // Try to use previous level of overview as the source to compute
5796 : // the next level.
5797 632 : int nSrcWidth = nToplevelSrcWidth;
5798 632 : int nSrcHeight = nToplevelSrcHeight;
5799 884 : if (iOverview > 0 &&
5800 252 : papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
5801 : {
5802 244 : nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
5803 244 : nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
5804 244 : iSrcOverview = iOverview - 1;
5805 : }
5806 :
5807 632 : const double dfXRatioDstToSrc =
5808 632 : static_cast<double>(nSrcWidth) / nDstTotalWidth;
5809 632 : const double dfYRatioDstToSrc =
5810 632 : static_cast<double>(nSrcHeight) / nDstTotalHeight;
5811 :
5812 : const int nOvrFactor =
5813 1896 : std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
5814 632 : static_cast<int>(0.5 + dfYRatioDstToSrc)));
5815 :
5816 632 : int nDstChunkXSize = 0;
5817 632 : int nDstChunkYSize = 0;
5818 632 : papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
5819 : &nDstChunkYSize);
5820 :
5821 632 : constexpr int PIXEL_MARGIN = 2;
5822 : // Try to extend the chunk size so that the memory needed to acquire
5823 : // source pixels goes up to 10 MB.
5824 : // This can help for drivers that support multi-threaded reading
5825 632 : const int nFullResYChunk = static_cast<int>(std::min<double>(
5826 632 : nSrcHeight, PIXEL_MARGIN + nDstChunkYSize * dfYRatioDstToSrc));
5827 632 : const int nFullResYChunkQueried = static_cast<int>(std::min<int64_t>(
5828 1264 : nSrcHeight,
5829 1264 : nFullResYChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5830 632 : nKernelRadius * nOvrFactor));
5831 861 : while (nDstChunkXSize < nDstWidth)
5832 : {
5833 248 : constexpr int INCREASE_FACTOR = 2;
5834 :
5835 248 : const int nFullResXChunk = static_cast<int>(std::min<double>(
5836 496 : nSrcWidth, PIXEL_MARGIN + INCREASE_FACTOR * nDstChunkXSize *
5837 248 : dfXRatioDstToSrc));
5838 :
5839 : const int nFullResXChunkQueried =
5840 248 : static_cast<int>(std::min<int64_t>(
5841 496 : nSrcWidth,
5842 496 : nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5843 248 : nKernelRadius * nOvrFactor));
5844 :
5845 248 : if (nBands > nChunkMaxSize / nFullResXChunkQueried /
5846 248 : nFullResYChunkQueried / nWrkDataTypeSize)
5847 : {
5848 19 : break;
5849 : }
5850 :
5851 229 : nDstChunkXSize *= INCREASE_FACTOR;
5852 : }
5853 632 : nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
5854 :
5855 632 : const int nFullResXChunk = static_cast<int>(std::min<double>(
5856 632 : nSrcWidth, PIXEL_MARGIN + nDstChunkXSize * dfXRatioDstToSrc));
5857 632 : const int nFullResXChunkQueried = static_cast<int>(std::min<int64_t>(
5858 1264 : nSrcWidth,
5859 1264 : nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5860 632 : nKernelRadius * nOvrFactor));
5861 :
5862 : // Make sure that the RAM requirements to acquire the source data does
5863 : // not exceed nChunkMaxSizeForTempFile
5864 : // If so, reduce the destination chunk size, generate overviews in a
5865 : // temporary dataset, and copy that temporary dataset over the target
5866 : // overview bands (to avoid issues with lossy compression)
5867 : const bool bOverflowFullResXChunkYChunkQueried =
5868 632 : nBands > std::numeric_limits<int64_t>::max() /
5869 632 : nFullResXChunkQueried / nFullResYChunkQueried /
5870 632 : nWrkDataTypeSize;
5871 :
5872 632 : const auto nMemRequirement =
5873 : bOverflowFullResXChunkYChunkQueried
5874 632 : ? 0
5875 628 : : static_cast<GIntBig>(nFullResXChunkQueried) *
5876 628 : nFullResYChunkQueried * nBands * nWrkDataTypeSize;
5877 : // Use a temporary dataset with a smaller destination chunk size
5878 632 : const auto nOverShootFactor =
5879 : nMemRequirement / nChunkMaxSizeForTempFile;
5880 :
5881 632 : constexpr int MIN_OVERSHOOT_FACTOR = 4;
5882 : const auto nSqrtOverShootFactor = std::max<GIntBig>(
5883 1264 : MIN_OVERSHOOT_FACTOR, static_cast<GIntBig>(std::ceil(std::sqrt(
5884 632 : static_cast<double>(nOverShootFactor)))));
5885 632 : constexpr int DEFAULT_CHUNK_SIZE = 256;
5886 632 : constexpr int GTIFF_BLOCK_SIZE_MULTIPLE = 16;
5887 : const int nReducedDstChunkXSize =
5888 : bOverflowFullResXChunkYChunkQueried
5889 1260 : ? DEFAULT_CHUNK_SIZE
5890 1260 : : std::max(1, static_cast<int>(nDstChunkXSize /
5891 1260 : nSqrtOverShootFactor) &
5892 628 : ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
5893 : const int nReducedDstChunkYSize =
5894 : bOverflowFullResXChunkYChunkQueried
5895 1260 : ? DEFAULT_CHUNK_SIZE
5896 1260 : : std::max(1, static_cast<int>(nDstChunkYSize /
5897 1260 : nSqrtOverShootFactor) &
5898 628 : ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
5899 :
5900 632 : if (bOverflowFullResXChunkYChunkQueried ||
5901 : nMemRequirement > nChunkMaxSizeForTempFile)
5902 : {
5903 : const auto nDTSize =
5904 43 : std::max(1, GDALGetDataTypeSizeBytes(eDataType));
5905 : const bool bTmpDSMemRequirementOverflow =
5906 43 : nBands > std::numeric_limits<int64_t>::max() / nDstWidth /
5907 43 : nDstHeight / nDTSize;
5908 43 : const auto nTmpDSMemRequirement =
5909 : bTmpDSMemRequirementOverflow
5910 43 : ? 0
5911 41 : : static_cast<GIntBig>(nDstWidth) * nDstHeight * nBands *
5912 41 : nDTSize;
5913 :
5914 : // make sure that one band buffer doesn't overflow size_t
5915 : const bool bChunkSizeOverflow =
5916 43 : static_cast<size_t>(nDTSize) >
5917 43 : std::numeric_limits<size_t>::max() / nDstWidth / nDstHeight;
5918 43 : const size_t nChunkSize =
5919 : bChunkSizeOverflow
5920 43 : ? 0
5921 41 : : static_cast<size_t>(nDstWidth) * nDstHeight * nDTSize;
5922 :
5923 : const auto CreateVRT =
5924 41 : [nBands, nSrcWidth, nSrcHeight, nDstTotalWidth, nDstTotalHeight,
5925 : pszResampling, eWrkDataType, papoSrcBands, papapoOverviewBands,
5926 : iSrcOverview, &abHasNoData,
5927 393585 : &adfNoDataValue](int nVRTBlockXSize, int nVRTBlockYSize)
5928 : {
5929 : auto poVRTDS = std::make_unique<VRTDataset>(
5930 41 : nDstTotalWidth, nDstTotalHeight, nVRTBlockXSize,
5931 41 : nVRTBlockYSize);
5932 :
5933 65620 : for (int iBand = 0; iBand < nBands; ++iBand)
5934 : {
5935 131158 : auto poVRTSrc = std::make_unique<VRTSimpleSource>();
5936 65579 : poVRTSrc->SetResampling(pszResampling);
5937 65579 : poVRTDS->AddBand(eWrkDataType);
5938 : auto poVRTBand = static_cast<VRTSourcedRasterBand *>(
5939 65579 : poVRTDS->GetRasterBand(iBand + 1));
5940 :
5941 65579 : auto poSrcBand = papoSrcBands[iBand];
5942 65579 : if (iSrcOverview != -1)
5943 24 : poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
5944 65579 : poVRTBand->ConfigureSource(
5945 : poVRTSrc.get(), poSrcBand, false, 0, 0, nSrcWidth,
5946 : nSrcHeight, 0, 0, nDstTotalWidth, nDstTotalHeight);
5947 : // Add the source to the band
5948 65579 : poVRTBand->AddSource(poVRTSrc.release());
5949 65579 : if (abHasNoData[iBand])
5950 3 : poVRTBand->SetNoDataValue(adfNoDataValue[iBand]);
5951 : }
5952 :
5953 42 : if (papoSrcBands[0]->GetMaskFlags() == GMF_PER_DATASET &&
5954 1 : poVRTDS->CreateMaskBand(GMF_PER_DATASET) == CE_None)
5955 : {
5956 : VRTSourcedRasterBand *poMaskVRTBand =
5957 1 : cpl::down_cast<VRTSourcedRasterBand *>(
5958 1 : poVRTDS->GetRasterBand(1)->GetMaskBand());
5959 1 : auto poSrcBand = papoSrcBands[0];
5960 1 : if (iSrcOverview != -1)
5961 0 : poSrcBand = papapoOverviewBands[0][iSrcOverview];
5962 1 : poMaskVRTBand->AddMaskBandSource(
5963 1 : poSrcBand->GetMaskBand(), 0, 0, nSrcWidth, nSrcHeight,
5964 : 0, 0, nDstTotalWidth, nDstTotalHeight);
5965 : }
5966 :
5967 41 : return poVRTDS;
5968 43 : };
5969 :
5970 : // If the overview accommodates chunking, do so and recurse
5971 : // to avoid generating full size temporary files
5972 43 : if (!bOverflowFullResXChunkYChunkQueried &&
5973 39 : !bTmpDSMemRequirementOverflow && !bChunkSizeOverflow &&
5974 39 : (nDstChunkXSize < nDstWidth || nDstChunkYSize < nDstHeight))
5975 : {
5976 : // Create a VRT with the smaller chunk to do the scaling
5977 : auto poVRTDS =
5978 13 : CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
5979 :
5980 13 : std::vector<GDALRasterBand *> apoVRTBand(nBands);
5981 13 : std::vector<GDALRasterBand *> apoDstBand(nBands);
5982 65560 : for (int iBand = 0; iBand < nBands; ++iBand)
5983 : {
5984 65547 : apoDstBand[iBand] = papapoOverviewBands[iBand][iOverview];
5985 65547 : apoVRTBand[iBand] = poVRTDS->GetRasterBand(iBand + 1);
5986 : }
5987 :
5988 : // Use a flag to avoid reading from the overview being built
5989 : GDALRasterIOExtraArg sExtraArg;
5990 13 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5991 13 : if (iSrcOverview == -1)
5992 13 : sExtraArg.bUseOnlyThisScale = true;
5993 :
5994 : // A single band buffer for data transfer to the overview
5995 13 : std::vector<GByte> abyChunk;
5996 : try
5997 : {
5998 13 : abyChunk.resize(nChunkSize);
5999 : }
6000 0 : catch (const std::exception &)
6001 : {
6002 0 : CPLError(CE_Failure, CPLE_OutOfMemory,
6003 : "Out of memory allocating temporary buffer");
6004 0 : return CE_Failure;
6005 : }
6006 :
6007 : // Loop over output height, in chunks
6008 13 : for (int nDstYOff = nDstYOffStart;
6009 38 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6010 : /* */)
6011 : {
6012 : const int nDstYCount =
6013 25 : std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
6014 : // Loop over output width, in output chunks
6015 25 : for (int nDstXOff = nDstXOffStart;
6016 74 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6017 : /* */)
6018 : {
6019 : const int nDstXCount =
6020 49 : std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
6021 : // Read and transfer the chunk to the overview
6022 98 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
6023 : ++iBand)
6024 : {
6025 98 : eErr = apoVRTBand[iBand]->RasterIO(
6026 : GF_Read, nDstXOff, nDstYOff, nDstXCount,
6027 49 : nDstYCount, abyChunk.data(), nDstXCount,
6028 : nDstYCount, eDataType, 0, 0, &sExtraArg);
6029 49 : if (eErr == CE_None)
6030 : {
6031 96 : eErr = apoDstBand[iBand]->RasterIO(
6032 : GF_Write, nDstXOff, nDstYOff, nDstXCount,
6033 48 : nDstYCount, abyChunk.data(), nDstXCount,
6034 : nDstYCount, eDataType, 0, 0, nullptr);
6035 : }
6036 : }
6037 :
6038 49 : dfCurPixelCount +=
6039 49 : static_cast<double>(nDstXCount) * nDstYCount;
6040 :
6041 49 : nDstXOff += nDstXCount;
6042 : } // width
6043 :
6044 25 : if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount,
6045 : nullptr, pProgressData))
6046 : {
6047 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
6048 : "User terminated");
6049 0 : eErr = CE_Failure;
6050 : }
6051 :
6052 25 : nDstYOff += nDstYCount;
6053 : } // height
6054 :
6055 13 : if (CE_None != eErr)
6056 : {
6057 1 : CPLError(CE_Failure, CPLE_AppDefined,
6058 : "Error while writing overview");
6059 1 : return CE_Failure;
6060 : }
6061 :
6062 12 : pfnProgress(1.0, nullptr, pProgressData);
6063 : // Flush the overviews we just generated
6064 24 : for (int iBand = 0; iBand < nBands; ++iBand)
6065 12 : apoDstBand[iBand]->FlushCache(false);
6066 :
6067 12 : continue; // Next overview
6068 : } // chunking via temporary dataset
6069 :
6070 0 : std::unique_ptr<GDALDataset> poTmpDS;
6071 : // Config option mostly/only for autotest purposes
6072 : const char *pszGDAL_OVR_TEMP_DRIVER =
6073 30 : CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
6074 30 : if ((!bTmpDSMemRequirementOverflow &&
6075 4 : nTmpDSMemRequirement <= nChunkMaxSizeForTempFile &&
6076 4 : !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
6077 26 : EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
6078 : {
6079 10 : auto poTmpDrv = GetGDALDriverManager()->GetDriverByName("MEM");
6080 10 : if (!poTmpDrv)
6081 : {
6082 0 : eErr = CE_Failure;
6083 0 : break;
6084 : }
6085 10 : poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
6086 : nDstTotalHeight, nBands,
6087 10 : eDataType, nullptr));
6088 : }
6089 : else
6090 : {
6091 : // Create a temporary file for the overview
6092 : auto poTmpDrv =
6093 20 : GetGDALDriverManager()->GetDriverByName("GTiff");
6094 20 : if (!poTmpDrv)
6095 : {
6096 0 : eErr = CE_Failure;
6097 0 : break;
6098 : }
6099 40 : std::string osTmpFilename;
6100 20 : auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
6101 20 : if (poDstDS)
6102 : {
6103 20 : osTmpFilename = poDstDS->GetDescription();
6104 : VSIStatBufL sStatBuf;
6105 20 : if (!osTmpFilename.empty() &&
6106 0 : VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
6107 0 : osTmpFilename += "_tmp_ovr.tif";
6108 : }
6109 20 : if (osTmpFilename.empty())
6110 : {
6111 20 : osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
6112 20 : osTmpFilename += ".tif";
6113 : }
6114 20 : CPLDebug("GDAL", "Creating temporary file %s of %d x %d x %d",
6115 : osTmpFilename.c_str(), nDstWidth, nDstHeight, nBands);
6116 40 : CPLStringList aosCO;
6117 20 : if (0 == ((nReducedDstChunkXSize % GTIFF_BLOCK_SIZE_MULTIPLE) |
6118 20 : (nReducedDstChunkYSize % GTIFF_BLOCK_SIZE_MULTIPLE)))
6119 : {
6120 14 : aosCO.SetNameValue("TILED", "YES");
6121 : aosCO.SetNameValue("BLOCKXSIZE",
6122 14 : CPLSPrintf("%d", nReducedDstChunkXSize));
6123 : aosCO.SetNameValue("BLOCKYSIZE",
6124 14 : CPLSPrintf("%d", nReducedDstChunkYSize));
6125 : }
6126 20 : if (const char *pszCOList =
6127 20 : poTmpDrv->GetMetadataItem(GDAL_DMD_CREATIONOPTIONLIST))
6128 : {
6129 : aosCO.SetNameValue(
6130 20 : "COMPRESS", strstr(pszCOList, "ZSTD") ? "ZSTD" : "LZW");
6131 : }
6132 20 : poTmpDS.reset(poTmpDrv->Create(osTmpFilename.c_str(), nDstWidth,
6133 : nDstHeight, nBands, eDataType,
6134 20 : aosCO.List()));
6135 20 : if (poTmpDS)
6136 : {
6137 18 : poTmpDS->MarkSuppressOnClose();
6138 18 : VSIUnlink(osTmpFilename.c_str());
6139 : }
6140 : }
6141 30 : if (!poTmpDS)
6142 : {
6143 2 : eErr = CE_Failure;
6144 2 : break;
6145 : }
6146 :
6147 : // Create a full size VRT to do the resampling without edge effects
6148 : auto poVRTDS =
6149 28 : CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
6150 :
6151 : // Allocate a band buffer with the overview chunk size
6152 : std::unique_ptr<void, VSIFreeReleaser> pDstBuffer(
6153 : VSI_MALLOC3_VERBOSE(size_t(nWrkDataTypeSize), nDstChunkXSize,
6154 28 : nDstChunkYSize));
6155 28 : if (pDstBuffer == nullptr)
6156 : {
6157 0 : eErr = CE_Failure;
6158 0 : break;
6159 : }
6160 :
6161 : // Use a flag to avoid reading the overview being built
6162 : GDALRasterIOExtraArg sExtraArg;
6163 28 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
6164 28 : if (iSrcOverview == -1)
6165 4 : sExtraArg.bUseOnlyThisScale = true;
6166 :
6167 : // Scale and copy data from the VRT to the temp file
6168 28 : for (int nDstYOff = nDstYOffStart;
6169 914 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6170 : /* */)
6171 : {
6172 : const int nDstYCount =
6173 886 : std::min(nReducedDstChunkYSize, nDstYOffEnd - nDstYOff);
6174 886 : for (int nDstXOff = nDstXOffStart;
6175 201218 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6176 : /* */)
6177 : {
6178 : const int nDstXCount =
6179 200332 : std::min(nReducedDstChunkXSize, nDstXOffEnd - nDstXOff);
6180 400668 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
6181 : ++iBand)
6182 : {
6183 200336 : auto poSrcBand = poVRTDS->GetRasterBand(iBand + 1);
6184 200336 : eErr = poSrcBand->RasterIO(
6185 : GF_Read, nDstXOff, nDstYOff, nDstXCount, nDstYCount,
6186 : pDstBuffer.get(), nDstXCount, nDstYCount,
6187 : eWrkDataType, 0, 0, &sExtraArg);
6188 200336 : if (eErr == CE_None)
6189 : {
6190 : // Write to the temporary dataset, shifted
6191 200334 : auto poOvrBand = poTmpDS->GetRasterBand(iBand + 1);
6192 200334 : eErr = poOvrBand->RasterIO(
6193 : GF_Write, nDstXOff - nDstXOffStart,
6194 : nDstYOff - nDstYOffStart, nDstXCount,
6195 : nDstYCount, pDstBuffer.get(), nDstXCount,
6196 : nDstYCount, eWrkDataType, 0, 0, nullptr);
6197 : }
6198 : }
6199 200332 : nDstXOff += nDstXCount;
6200 : }
6201 886 : nDstYOff += nDstYCount;
6202 : }
6203 :
6204 : // Copy from the temporary to the overview
6205 28 : for (int nDstYOff = nDstYOffStart;
6206 54 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6207 : /* */)
6208 : {
6209 : const int nDstYCount =
6210 26 : std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
6211 26 : for (int nDstXOff = nDstXOffStart;
6212 52 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6213 : /* */)
6214 : {
6215 : const int nDstXCount =
6216 26 : std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
6217 56 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
6218 : ++iBand)
6219 : {
6220 30 : auto poSrcBand = poTmpDS->GetRasterBand(iBand + 1);
6221 30 : eErr = poSrcBand->RasterIO(
6222 : GF_Read, nDstXOff - nDstXOffStart,
6223 : nDstYOff - nDstYOffStart, nDstXCount, nDstYCount,
6224 : pDstBuffer.get(), nDstXCount, nDstYCount,
6225 : eWrkDataType, 0, 0, nullptr);
6226 30 : if (eErr == CE_None)
6227 : {
6228 : // Write to the destination overview bands
6229 30 : auto poOvrBand =
6230 30 : papapoOverviewBands[iBand][iOverview];
6231 30 : eErr = poOvrBand->RasterIO(
6232 : GF_Write, nDstXOff, nDstYOff, nDstXCount,
6233 : nDstYCount, pDstBuffer.get(), nDstXCount,
6234 : nDstYCount, eWrkDataType, 0, 0, nullptr);
6235 : }
6236 : }
6237 26 : nDstXOff += nDstXCount;
6238 : }
6239 26 : nDstYOff += nDstYCount;
6240 : }
6241 :
6242 28 : if (eErr != CE_None)
6243 : {
6244 2 : CPLError(CE_Failure, CPLE_AppDefined,
6245 : "Failed to write overview %d", iOverview);
6246 2 : return eErr;
6247 : }
6248 :
6249 : // Flush the data to overviews.
6250 56 : for (int iBand = 0; iBand < nBands; ++iBand)
6251 30 : papapoOverviewBands[iBand][iOverview]->FlushCache(false);
6252 :
6253 26 : continue;
6254 : }
6255 :
6256 : // Structure describing a resampling job
6257 : struct OvrJob
6258 : {
6259 : // Buffers to free when job is finished
6260 : std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
6261 : std::unique_ptr<PointerHolder> oSrcBufferHolder{};
6262 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
6263 :
6264 : GDALRasterBand *poDstBand = nullptr;
6265 :
6266 : // Input parameters of pfnResampleFn
6267 : GDALResampleFunction pfnResampleFn = nullptr;
6268 : GDALOverviewResampleArgs args{};
6269 : const void *pChunk = nullptr;
6270 :
6271 : // Output values of resampling function
6272 : CPLErr eErr = CE_Failure;
6273 : void *pDstBuffer = nullptr;
6274 : GDALDataType eDstBufferDataType = GDT_Unknown;
6275 :
6276 3268 : void NotifyFinished()
6277 : {
6278 6536 : std::lock_guard guard(mutex);
6279 3268 : bFinished = true;
6280 3268 : cv.notify_one();
6281 3268 : }
6282 :
6283 2 : bool IsFinished()
6284 : {
6285 2 : std::lock_guard guard(mutex);
6286 4 : return bFinished;
6287 : }
6288 :
6289 15 : void WaitFinished()
6290 : {
6291 30 : std::unique_lock oGuard(mutex);
6292 19 : while (!bFinished)
6293 : {
6294 4 : cv.wait(oGuard);
6295 : }
6296 15 : }
6297 :
6298 : private:
6299 : // Synchronization
6300 : bool bFinished = false;
6301 : std::mutex mutex{};
6302 : std::condition_variable cv{};
6303 : };
6304 :
6305 : // Thread function to resample
6306 3268 : const auto JobResampleFunc = [](void *pData)
6307 : {
6308 3268 : OvrJob *poJob = static_cast<OvrJob *>(pData);
6309 :
6310 3268 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
6311 : &(poJob->pDstBuffer),
6312 : &(poJob->eDstBufferDataType));
6313 :
6314 3268 : poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
6315 :
6316 3268 : poJob->NotifyFinished();
6317 3268 : };
6318 :
6319 : // Function to write resample data to target band
6320 3268 : const auto WriteJobData = [](const OvrJob *poJob)
6321 : {
6322 6536 : return poJob->poDstBand->RasterIO(
6323 3268 : GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
6324 3268 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
6325 3268 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
6326 3268 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
6327 3268 : poJob->args.nDstYOff2 - poJob->args.nDstYOff,
6328 3268 : poJob->eDstBufferDataType, 0, 0, nullptr);
6329 : };
6330 :
6331 : // Wait for completion of oldest job and serialize it
6332 : const auto WaitAndFinalizeOldestJob =
6333 15 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
6334 : {
6335 15 : auto poOldestJob = jobList.front().get();
6336 15 : poOldestJob->WaitFinished();
6337 15 : CPLErr l_eErr = poOldestJob->eErr;
6338 15 : if (l_eErr == CE_None)
6339 : {
6340 15 : l_eErr = WriteJobData(poOldestJob);
6341 : }
6342 :
6343 15 : jobList.pop_front();
6344 15 : return l_eErr;
6345 : };
6346 :
6347 : // Queue of jobs
6348 1178 : std::list<std::unique_ptr<OvrJob>> jobList;
6349 :
6350 1178 : std::vector<std::unique_ptr<void, VSIFreeReleaser>> apaChunk(nBands);
6351 : std::vector<std::unique_ptr<GByte, VSIFreeReleaser>>
6352 1178 : apabyChunkNoDataMask(nBands);
6353 :
6354 : // Iterate on destination overview, block by block.
6355 589 : for (int nDstYOff = nDstYOffStart;
6356 2078 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6357 1489 : nDstYOff += nDstChunkYSize)
6358 : {
6359 : int nDstYCount;
6360 1489 : if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
6361 1077 : nDstYCount = nDstChunkYSize;
6362 : else
6363 412 : nDstYCount = nDstYOffEnd - nDstYOff;
6364 :
6365 1489 : int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
6366 1489 : int nChunkYOff2 = static_cast<int>(
6367 1489 : ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
6368 1489 : if (nChunkYOff2 > nSrcHeight ||
6369 1489 : nDstYOff + nDstYCount == nDstTotalHeight)
6370 582 : nChunkYOff2 = nSrcHeight;
6371 1489 : int nYCount = nChunkYOff2 - nChunkYOff;
6372 1489 : CPLAssert(nYCount <= nFullResYChunk);
6373 :
6374 1489 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
6375 1489 : int nChunkYSizeQueried =
6376 1489 : nYCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
6377 1489 : if (nChunkYOffQueried < 0)
6378 : {
6379 136 : nChunkYSizeQueried += nChunkYOffQueried;
6380 136 : nChunkYOffQueried = 0;
6381 : }
6382 1489 : if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
6383 135 : nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
6384 1489 : CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
6385 :
6386 1489 : if (!pfnProgress(std::min(1.0, dfCurPixelCount / dfTotalPixelCount),
6387 : nullptr, pProgressData))
6388 : {
6389 1 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6390 1 : eErr = CE_Failure;
6391 : }
6392 :
6393 : // Iterate on destination overview, block by block.
6394 1489 : for (int nDstXOff = nDstXOffStart;
6395 3017 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6396 1528 : nDstXOff += nDstChunkXSize)
6397 : {
6398 1528 : int nDstXCount = 0;
6399 1528 : if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
6400 1511 : nDstXCount = nDstChunkXSize;
6401 : else
6402 17 : nDstXCount = nDstXOffEnd - nDstXOff;
6403 :
6404 1528 : dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
6405 :
6406 1528 : int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
6407 1528 : int nChunkXOff2 = static_cast<int>(
6408 1528 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
6409 1528 : if (nChunkXOff2 > nSrcWidth ||
6410 1528 : nDstXOff + nDstXCount == nDstTotalWidth)
6411 1453 : nChunkXOff2 = nSrcWidth;
6412 1528 : const int nXCount = nChunkXOff2 - nChunkXOff;
6413 1528 : CPLAssert(nXCount <= nFullResXChunk);
6414 :
6415 1528 : int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
6416 1528 : int nChunkXSizeQueried =
6417 1528 : nXCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
6418 1528 : if (nChunkXOffQueried < 0)
6419 : {
6420 191 : nChunkXSizeQueried += nChunkXOffQueried;
6421 191 : nChunkXOffQueried = 0;
6422 : }
6423 1528 : if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
6424 200 : nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
6425 1528 : CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
6426 : #if DEBUG_VERBOSE
6427 : CPLDebug("GDAL",
6428 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
6429 : nChunkXOffQueried, nChunkYOffQueried,
6430 : nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
6431 : nDstYOff, nDstXCount, nDstYCount);
6432 : #endif
6433 :
6434 : // Avoid accumulating too many tasks and exhaust RAM
6435 :
6436 : // Try to complete already finished jobs
6437 1529 : while (eErr == CE_None && !jobList.empty())
6438 : {
6439 2 : auto poOldestJob = jobList.front().get();
6440 2 : if (!poOldestJob->IsFinished())
6441 1 : break;
6442 1 : eErr = poOldestJob->eErr;
6443 1 : if (eErr == CE_None)
6444 : {
6445 1 : eErr = WriteJobData(poOldestJob);
6446 : }
6447 :
6448 1 : jobList.pop_front();
6449 : }
6450 :
6451 : // And in case we have saturated the number of threads,
6452 : // wait for completion of tasks to go below the threshold.
6453 3056 : while (eErr == CE_None &&
6454 1528 : jobList.size() >= static_cast<size_t>(nThreads))
6455 : {
6456 0 : eErr = WaitAndFinalizeOldestJob(jobList);
6457 : }
6458 :
6459 : // Read the source buffers for all the bands.
6460 4797 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6461 : {
6462 : // (Re)allocate buffers if needed
6463 3269 : if (apaChunk[iBand] == nullptr)
6464 : {
6465 1152 : apaChunk[iBand].reset(VSI_MALLOC3_VERBOSE(
6466 : nFullResXChunkQueried, nFullResYChunkQueried,
6467 : nWrkDataTypeSize));
6468 1152 : if (apaChunk[iBand] == nullptr)
6469 : {
6470 0 : eErr = CE_Failure;
6471 : }
6472 : }
6473 3586 : if (bUseNoDataMask &&
6474 317 : apabyChunkNoDataMask[iBand] == nullptr)
6475 : {
6476 266 : apabyChunkNoDataMask[iBand].reset(
6477 266 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
6478 : nFullResXChunkQueried, nFullResYChunkQueried)));
6479 266 : if (apabyChunkNoDataMask[iBand] == nullptr)
6480 : {
6481 0 : eErr = CE_Failure;
6482 : }
6483 : }
6484 :
6485 3269 : if (eErr == CE_None)
6486 : {
6487 3269 : GDALRasterBand *poSrcBand = nullptr;
6488 3269 : if (iSrcOverview == -1)
6489 2383 : poSrcBand = papoSrcBands[iBand];
6490 : else
6491 886 : poSrcBand =
6492 886 : papapoOverviewBands[iBand][iSrcOverview];
6493 3269 : eErr = poSrcBand->RasterIO(
6494 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6495 : nChunkXSizeQueried, nChunkYSizeQueried,
6496 3269 : apaChunk[iBand].get(), nChunkXSizeQueried,
6497 : nChunkYSizeQueried, eWrkDataType, 0, 0, nullptr);
6498 :
6499 3269 : if (bUseNoDataMask && eErr == CE_None)
6500 : {
6501 317 : auto poMaskBand = poSrcBand->IsMaskBand()
6502 317 : ? poSrcBand
6503 244 : : poSrcBand->GetMaskBand();
6504 317 : eErr = poMaskBand->RasterIO(
6505 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6506 : nChunkXSizeQueried, nChunkYSizeQueried,
6507 317 : apabyChunkNoDataMask[iBand].get(),
6508 : nChunkXSizeQueried, nChunkYSizeQueried,
6509 : GDT_UInt8, 0, 0, nullptr);
6510 : }
6511 : }
6512 : }
6513 :
6514 : // Compute the resulting overview block.
6515 4796 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6516 : {
6517 6536 : auto poJob = std::make_unique<OvrJob>();
6518 3268 : poJob->pfnResampleFn = pfnResampleFn;
6519 3268 : poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
6520 6536 : poJob->args.eOvrDataType =
6521 3268 : poJob->poDstBand->GetRasterDataType();
6522 3268 : poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
6523 3268 : poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
6524 3268 : const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
6525 3268 : "NBITS", "IMAGE_STRUCTURE");
6526 3268 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
6527 3268 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
6528 3268 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
6529 3268 : poJob->args.eWrkDataType = eWrkDataType;
6530 3268 : poJob->pChunk = apaChunk[iBand].get();
6531 3268 : poJob->args.pabyChunkNodataMask =
6532 3268 : apabyChunkNoDataMask[iBand].get();
6533 3268 : poJob->args.nChunkXOff = nChunkXOffQueried;
6534 3268 : poJob->args.nChunkXSize = nChunkXSizeQueried;
6535 3268 : poJob->args.nChunkYOff = nChunkYOffQueried;
6536 3268 : poJob->args.nChunkYSize = nChunkYSizeQueried;
6537 3268 : poJob->args.nDstXOff = nDstXOff;
6538 3268 : poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
6539 3268 : poJob->args.nDstYOff = nDstYOff;
6540 3268 : poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
6541 3268 : poJob->args.pszResampling = pszResampling;
6542 3268 : poJob->args.bHasNoData = abHasNoData[iBand];
6543 3268 : poJob->args.dfNoDataValue = adfNoDataValue[iBand];
6544 3268 : poJob->args.eSrcDataType = eDataType;
6545 3268 : poJob->args.bPropagateNoData = bPropagateNoData;
6546 :
6547 3268 : if (poJobQueue)
6548 : {
6549 32 : poJob->oSrcMaskBufferHolder.reset(new PointerHolder(
6550 16 : apabyChunkNoDataMask[iBand].release()));
6551 :
6552 32 : poJob->oSrcBufferHolder.reset(
6553 16 : new PointerHolder(apaChunk[iBand].release()));
6554 :
6555 16 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
6556 16 : jobList.emplace_back(std::move(poJob));
6557 : }
6558 : else
6559 : {
6560 3252 : JobResampleFunc(poJob.get());
6561 3252 : eErr = poJob->eErr;
6562 3252 : if (eErr == CE_None)
6563 : {
6564 3252 : eErr = WriteJobData(poJob.get());
6565 : }
6566 : }
6567 : }
6568 : }
6569 : }
6570 :
6571 : // Wait for all pending jobs to complete
6572 604 : while (!jobList.empty())
6573 : {
6574 15 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
6575 15 : if (l_eErr != CE_None && eErr == CE_None)
6576 0 : eErr = l_eErr;
6577 : }
6578 :
6579 : // Flush the data to overviews.
6580 1739 : for (int iBand = 0; iBand < nBands; ++iBand)
6581 : {
6582 1150 : if (papapoOverviewBands[iBand][iOverview]->FlushCache(false) !=
6583 : CE_None)
6584 0 : eErr = CE_Failure;
6585 : }
6586 : }
6587 :
6588 377 : if (eErr == CE_None)
6589 373 : pfnProgress(1.0, nullptr, pProgressData);
6590 :
6591 377 : return eErr;
6592 : }
6593 :
6594 : /************************************************************************/
6595 : /* GDALRegenerateOverviewsMultiBand() */
6596 : /************************************************************************/
6597 :
6598 : /**
6599 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
6600 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
6601 : *
6602 : * This function will generate one or more overview images from a base
6603 : * image using the requested downsampling algorithm. Its primary use
6604 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
6605 : * can also be used to generate downsampled images in one file from another
6606 : * outside the overview architecture.
6607 : *
6608 : * The output bands need to exist in advance and share the same characteristics
6609 : * (type, dimensions)
6610 : *
6611 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
6612 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
6613 : *
6614 : * It does not support color tables or complex data types.
6615 : *
6616 : * The pseudo-algorithm used by the function is:
6617 : * for each overview
6618 : * iterate on lines of the source by a step of deltay
6619 : * iterate on columns of the source by a step of deltax
6620 : * read the source data of size deltax * deltay for all the bands
6621 : * generate the corresponding overview block for all the bands
6622 : *
6623 : * This function will properly honour NODATA_VALUES tuples (special dataset
6624 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
6625 : * considered as the nodata value and not each value of the triplet
6626 : * independently per band.
6627 : *
6628 : * The GDAL_NUM_THREADS configuration option can be set
6629 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
6630 : * overview computation.
6631 : *
6632 : * @param apoSrcBands the list of source bands to downsample
6633 : * @param aapoOverviewBands two-dimensional array of bands. First dimension is
6634 : * indexed by bands. Second dimension is indexed by
6635 : * overview levels. All aapoOverviewBands[i] arrays
6636 : * must have the same size (i.e. same number of
6637 : * overviews). Must have the same size as apoSrcBands.
6638 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
6639 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
6640 : * @param pfnProgress progress report function.
6641 : * @param pProgressData progress function callback data.
6642 : * @param papszOptions NULL terminated list of options as
6643 : * key=value pairs, or NULL
6644 : * The XOFF, YOFF, XSIZE and YSIZE
6645 : * options can be specified to express that overviews should
6646 : * be regenerated only in the specified subset of the source
6647 : * dataset.
6648 : * @return CE_None on success (also when aapoOverviewBands is empty) or CE_Failure on failure.
6649 : * @since 3.10
6650 : */
6651 :
6652 19 : CPLErr GDALRegenerateOverviewsMultiBand(
6653 : const std::vector<GDALRasterBand *> &apoSrcBands,
6654 : const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
6655 : const char *pszResampling, GDALProgressFunc pfnProgress,
6656 : void *pProgressData, CSLConstList papszOptions)
6657 : {
6658 19 : CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
6659 29 : for (size_t i = 1; i < aapoOverviewBands.size(); ++i)
6660 : {
6661 10 : CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());
6662 : }
6663 : // Nothing to do without any band; this also guards aapoOverviewBands[0] below.
6664 19 : if (aapoOverviewBands.empty())
6665 0 : return CE_None;
6666 : // Copy each band's overview list into a CPLMalloc'ed array to get the GDALRasterBand*** layout passed to the call below.
6667 19 : std::vector<GDALRasterBand **> apapoOverviewBands;
6668 48 : for (auto &apoOverviewBands : aapoOverviewBands)
6669 : {
6670 : auto papoOverviewBands = static_cast<GDALRasterBand **>(
6671 29 : CPLMalloc(apoOverviewBands.size() * sizeof(GDALRasterBand *)));
6672 61 : for (size_t i = 0; i < apoOverviewBands.size(); ++i)
6673 : {
6674 32 : papoOverviewBands[i] = apoOverviewBands[i];
6675 : }
6676 29 : apapoOverviewBands.push_back(papoOverviewBands);
6677 : }
6678 38 : const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
6679 19 : static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
6680 19 : static_cast<int>(aapoOverviewBands[0].size()),
6681 19 : apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
6682 : papszOptions);
6683 48 : for (GDALRasterBand **papoOverviewBands : apapoOverviewBands)
6684 29 : CPLFree(papoOverviewBands); // release the temporary per-band arrays
6685 19 : return eErr;
6686 : }
6687 :
6688 : /************************************************************************/
6689 : /* GDALComputeBandStats() */
6690 : /************************************************************************/
6691 :
6692 : /** Compute the mean and standard deviation of the pixel values of a band.
6693 : * For complex data types, the magnitude of each value is used.
6694 : * @param hSrcBand Band to read; CE_Failure is returned if it is NULL.
6695 : * @param nSampleStep Step between the scanlines that are read (1 = every
6696 : * scanline). Values < 1 or >= the band height are treated as 1.
6697 : * @param pdfMean Receives the mean of the sampled values if not NULL.
6698 : * @param pdfStdDev Receives their standard deviation if not NULL.
6699 : * @param pfnProgress Progress callback, or NULL for no progress reporting.
6700 : * @param pProgressData Opaque user data passed to pfnProgress.
6701 : * @return CE_None on success, else CE_Failure or the RasterIO() error code.
6702 : */
6703 18 : CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
6704 : int nSampleStep, double *pdfMean,
6705 : double *pdfStdDev,
6706 : GDALProgressFunc pfnProgress,
6707 : void *pProgressData)
6708 :
6709 : {
6710 18 : VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
6711 :
6712 18 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
6713 :
6714 18 : if (pfnProgress == nullptr)
6715 18 : pfnProgress = GDALDummyProgress;
6716 :
6717 18 : const int nWidth = poSrcBand->GetXSize();
6718 18 : const int nHeight = poSrcBand->GetYSize();
6719 : // An out-of-range step falls back to reading every scanline.
6720 18 : if (nSampleStep >= nHeight || nSampleStep < 1)
6721 5 : nSampleStep = 1;
6722 : // One scanline buffer: floats, or (real, imaginary) float pairs for complex data.
6723 18 : GDALDataType eWrkType = GDT_Unknown;
6724 18 : float *pafData = nullptr;
6725 18 : GDALDataType eType = poSrcBand->GetRasterDataType();
6726 18 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6727 18 : if (bComplex)
6728 : {
6729 : pafData = static_cast<float *>(
6730 0 : VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6731 0 : eWrkType = GDT_CFloat32;
6732 : }
6733 : else
6734 : {
6735 : pafData =
6736 18 : static_cast<float *>(VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6737 18 : eWrkType = GDT_Float32;
6738 : }
6739 : // Give up on an empty band or if the scanline buffer could not be allocated.
6740 18 : if (nWidth == 0 || pafData == nullptr)
6741 : {
6742 0 : VSIFree(pafData);
6743 0 : return CE_Failure;
6744 : }
6745 :
6746 : /* -------------------------------------------------------------------- */
6747 : /* Loop over all sample lines. */
6748 : /* -------------------------------------------------------------------- */
6749 18 : double dfSum = 0.0;
6750 18 : double dfSum2 = 0.0;
6751 18 : int iLine = 0;
6752 18 : GIntBig nSamples = 0;
6753 :
6754 2143 : do
6755 : {
6756 2161 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6757 : pProgressData))
6758 : {
6759 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6760 0 : CPLFree(pafData);
6761 0 : return CE_Failure;
6762 : }
6763 :
6764 : const CPLErr eErr =
6765 2161 : poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
6766 : 1, eWrkType, 0, 0, nullptr);
6767 2161 : if (eErr != CE_None)
6768 : {
6769 1 : CPLFree(pafData);
6770 1 : return eErr;
6771 : }
6772 : // Accumulate the sum and the sum of squares of this scanline in double precision.
6773 725208 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6774 : {
6775 723048 : float fValue = 0.0f;
6776 :
6777 723048 : if (bComplex)
6778 : {
6779 : // Compute the magnitude of the complex value.
6780 : fValue =
6781 0 : std::hypot(pafData[static_cast<size_t>(iPixel) * 2],
6782 0 : pafData[static_cast<size_t>(iPixel) * 2 + 1]);
6783 : }
6784 : else
6785 : {
6786 723048 : fValue = pafData[iPixel];
6787 : }
6788 :
6789 723048 : dfSum += static_cast<double>(fValue);
6790 723048 : dfSum2 += static_cast<double>(fValue) * static_cast<double>(fValue);
6791 : }
6792 :
6793 2160 : nSamples += nWidth;
6794 2160 : iLine += nSampleStep;
6795 2160 : } while (iLine < nHeight);
6796 :
6797 17 : if (!pfnProgress(1.0, nullptr, pProgressData))
6798 : {
6799 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6800 0 : CPLFree(pafData);
6801 0 : return CE_Failure;
6802 : }
6803 :
6804 : /* -------------------------------------------------------------------- */
6805 : /* Produce the result values. */
6806 : /* -------------------------------------------------------------------- */
6807 17 : if (pdfMean != nullptr)
6808 17 : *pdfMean = dfSum / nSamples;
6809 :
6810 17 : if (pdfStdDev != nullptr)
6811 : {
6812 17 : const double dfMean = dfSum / nSamples;
6813 : // Variance = E[x^2] - E[x]^2. Rounding can make it slightly negative, so clamp at 0 to avoid sqrt() returning NaN. With this argument order std::max() still returns a NaN variance unchanged.
6814 17 : *pdfStdDev = sqrt(std::max((dfSum2 / nSamples) - (dfMean * dfMean), 0.0));
6815 : }
6816 :
6817 17 : CPLFree(pafData);
6818 :
6819 17 : return CE_None;
6820 : }
6821 :
6822 : /************************************************************************/
6823 : /* GDALOverviewMagnitudeCorrection() */
6824 : /* */
6825 : /* Correct the mean and standard deviation of the overviews of */
6826 : /* the given band to match the base layer approximately. */
6827 : /************************************************************************/
6828 :
6829 : /** Undocumented
6830 : * @param hBaseBand undocumented.
6831 : * @param nOverviewCount undocumented.
6832 : * @param pahOverviews undocumented.
6833 : * @param pfnProgress undocumented.
6834 : * @param pProgressData undocumented.
6835 : * @return undocumented
6836 : */
6837 0 : CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
6838 : int nOverviewCount,
6839 : GDALRasterBandH *pahOverviews,
6840 : GDALProgressFunc pfnProgress,
6841 : void *pProgressData)
6842 :
6843 : {
6844 0 : VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
6845 :
6846 : /* -------------------------------------------------------------------- */
6847 : /* Compute mean/stddev for source raster. */
6848 : /* -------------------------------------------------------------------- */
6849 0 : double dfOrigMean = 0.0;
6850 0 : double dfOrigStdDev = 0.0;
6851 : {
6852 : const CPLErr eErr =
6853 0 : GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
6854 : pfnProgress, pProgressData);
6855 :
6856 0 : if (eErr != CE_None)
6857 0 : return eErr;
6858 : }
6859 :
6860 : /* -------------------------------------------------------------------- */
6861 : /* Loop on overview bands. */
6862 : /* -------------------------------------------------------------------- */
6863 0 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
6864 : {
6865 : GDALRasterBand *poOverview =
6866 0 : GDALRasterBand::FromHandle(pahOverviews[iOverview]);
6867 : double dfOverviewMean, dfOverviewStdDev;
6868 :
6869 : const CPLErr eErr =
6870 0 : GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
6871 : &dfOverviewStdDev, pfnProgress, pProgressData);
6872 :
6873 0 : if (eErr != CE_None)
6874 0 : return eErr;
6875 :
6876 0 : double dfGain = 1.0;
6877 0 : if (dfOrigStdDev >= 0.0001)
6878 0 : dfGain = dfOrigStdDev / dfOverviewStdDev;
6879 :
6880 : /* --------------------------------------------------------------------
6881 : */
6882 : /* Apply gain and offset. */
6883 : /* --------------------------------------------------------------------
6884 : */
6885 0 : const int nWidth = poOverview->GetXSize();
6886 0 : const int nHeight = poOverview->GetYSize();
6887 :
6888 0 : GDALDataType eWrkType = GDT_Unknown;
6889 0 : float *pafData = nullptr;
6890 0 : const GDALDataType eType = poOverview->GetRasterDataType();
6891 0 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6892 0 : if (bComplex)
6893 : {
6894 : pafData = static_cast<float *>(
6895 0 : VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6896 0 : eWrkType = GDT_CFloat32;
6897 : }
6898 : else
6899 : {
6900 : pafData = static_cast<float *>(
6901 0 : VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6902 0 : eWrkType = GDT_Float32;
6903 : }
6904 :
6905 0 : if (pafData == nullptr)
6906 : {
6907 0 : return CE_Failure;
6908 : }
6909 :
6910 0 : for (int iLine = 0; iLine < nHeight; ++iLine)
6911 : {
6912 0 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6913 : pProgressData))
6914 : {
6915 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6916 0 : CPLFree(pafData);
6917 0 : return CE_Failure;
6918 : }
6919 :
6920 0 : if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
6921 : nWidth, 1, eWrkType, 0, 0,
6922 0 : nullptr) != CE_None)
6923 : {
6924 0 : CPLFree(pafData);
6925 0 : return CE_Failure;
6926 : }
6927 :
6928 0 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6929 : {
6930 0 : if (bComplex)
6931 : {
6932 0 : pafData[static_cast<size_t>(iPixel) * 2] *=
6933 0 : static_cast<float>(dfGain);
6934 0 : pafData[static_cast<size_t>(iPixel) * 2 + 1] *=
6935 0 : static_cast<float>(dfGain);
6936 : }
6937 : else
6938 : {
6939 0 : pafData[iPixel] = static_cast<float>(
6940 0 : (double(pafData[iPixel]) - dfOverviewMean) * dfGain +
6941 : dfOrigMean);
6942 : }
6943 : }
6944 :
6945 0 : if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
6946 : nWidth, 1, eWrkType, 0, 0,
6947 0 : nullptr) != CE_None)
6948 : {
6949 0 : CPLFree(pafData);
6950 0 : return CE_Failure;
6951 : }
6952 : }
6953 :
6954 0 : if (!pfnProgress(1.0, nullptr, pProgressData))
6955 : {
6956 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6957 0 : CPLFree(pafData);
6958 0 : return CE_Failure;
6959 : }
6960 :
6961 0 : CPLFree(pafData);
6962 : }
6963 :
6964 0 : return CE_None;
6965 : }
|