Line data Source code
1 :
2 : /******************************************************************************
3 : *
4 : * Project: GDAL Core
5 : * Purpose: Helper code to implement overview support in different drivers.
6 : * Author: Frank Warmerdam, warmerdam@pobox.com
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 2000, Frank Warmerdam
10 : * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * SPDX-License-Identifier: MIT
13 : ****************************************************************************/
14 :
15 : #include "cpl_port.h"
16 : #include "gdal_priv.h"
17 :
18 : #include <cmath>
19 : #include <cstddef>
20 : #include <cstdlib>
21 :
22 : #include <algorithm>
23 : #include <complex>
24 : #include <condition_variable>
25 : #include <limits>
26 : #include <list>
27 : #include <memory>
28 : #include <mutex>
29 : #include <vector>
30 :
31 : #include "cpl_conv.h"
32 : #include "cpl_error.h"
33 : #include "cpl_float.h"
34 : #include "cpl_progress.h"
35 : #include "cpl_vsi.h"
36 : #include "cpl_worker_thread_pool.h"
37 : #include "gdal.h"
38 : #include "gdal_thread_pool.h"
39 : #include "gdalwarper.h"
40 : #include "gdal_vrt.h"
41 : #include "vrtdataset.h"
42 :
43 : #ifdef USE_NEON_OPTIMIZATIONS
44 : #include "include_sse2neon.h"
45 :
46 : #if (!defined(__aarch64__) && !defined(_M_ARM64))
47 : #define ARM_V7
48 : #endif
49 :
50 : #define USE_SSE2
51 :
52 : #include "gdalsse_priv.h"
53 :
54 : // Restrict to 64bit processors because they are guaranteed to have SSE2,
55 : // or if __AVX2__ is defined.
56 : #elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
57 : #define USE_SSE2
58 :
59 : #include "gdalsse_priv.h"
60 :
61 : #ifdef __SSE3__
62 : #include <pmmintrin.h>
63 : #endif
64 : #ifdef __SSSE3__
65 : #include <tmmintrin.h>
66 : #endif
67 : #ifdef __SSE4_1__
68 : #include <smmintrin.h>
69 : #endif
70 : #ifdef __AVX2__
71 : #include <immintrin.h>
72 : #endif
73 :
74 : #endif
75 :
76 : // To be included after above USE_SSE2 and include gdalsse_priv.h
77 : // to avoid build issue on Windows x86
78 : #include "gdal_priv_templates.hpp"
79 :
80 : /************************************************************************/
81 : /* GDALResampleChunk_Near() */
82 : /************************************************************************/
83 :
84 : template <class T>
85 1237 : static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
86 : const T *pChunk, T **ppDstBuffer)
87 :
88 : {
89 1237 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
90 1237 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
91 1237 : const GDALDataType eWrkDataType = args.eWrkDataType;
92 1237 : const int nChunkXOff = args.nChunkXOff;
93 1237 : const int nChunkXSize = args.nChunkXSize;
94 1237 : const int nChunkYOff = args.nChunkYOff;
95 1237 : const int nDstXOff = args.nDstXOff;
96 1237 : const int nDstXOff2 = args.nDstXOff2;
97 1237 : const int nDstYOff = args.nDstYOff;
98 1237 : const int nDstYOff2 = args.nDstYOff2;
99 1237 : const int nDstXWidth = nDstXOff2 - nDstXOff;
100 :
101 : /* -------------------------------------------------------------------- */
102 : /* Allocate buffers. */
103 : /* -------------------------------------------------------------------- */
104 1237 : *ppDstBuffer = static_cast<T *>(
105 1237 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
106 : GDALGetDataTypeSizeBytes(eWrkDataType)));
107 1237 : if (*ppDstBuffer == nullptr)
108 : {
109 0 : return CE_Failure;
110 : }
111 1237 : T *const pDstBuffer = *ppDstBuffer;
112 :
113 : int *panSrcXOff =
114 1237 : static_cast<int *>(VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(int)));
115 :
116 1237 : if (panSrcXOff == nullptr)
117 : {
118 0 : return CE_Failure;
119 : }
120 :
121 : /* ==================================================================== */
122 : /* Precompute inner loop constants. */
123 : /* ==================================================================== */
124 840463 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
125 : {
126 839226 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
127 839226 : if (nSrcXOff < nChunkXOff)
128 0 : nSrcXOff = nChunkXOff;
129 :
130 839226 : panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
131 : }
132 :
133 : /* ==================================================================== */
134 : /* Loop over destination scanlines. */
135 : /* ==================================================================== */
136 142070 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
137 : {
138 140833 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
139 140833 : if (nSrcYOff < nChunkYOff)
140 0 : nSrcYOff = nChunkYOff;
141 :
142 140833 : const T *const pSrcScanline =
143 : pChunk +
144 140833 : (static_cast<size_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
145 137799 : nChunkXOff;
146 :
147 : /* --------------------------------------------------------------------
148 : */
149 : /* Loop over destination pixels */
150 : /* --------------------------------------------------------------------
151 : */
152 140833 : T *pDstScanline =
153 140833 : pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
154 120237794 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
155 : {
156 120096760 : pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
157 : }
158 : }
159 :
160 1237 : CPLFree(panSrcXOff);
161 :
162 1237 : return CE_None;
163 : }
164 :
165 1237 : static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
166 : const void *pChunk, void **ppDstBuffer,
167 : GDALDataType *peDstBufferDataType)
168 : {
169 1237 : *peDstBufferDataType = args.eWrkDataType;
170 1237 : switch (args.eWrkDataType)
171 : {
172 : // For nearest resampling, as no computation is done, only the
173 : // size of the data type matters.
174 1080 : case GDT_UInt8:
175 : case GDT_Int8:
176 : {
177 1080 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 1);
178 1080 : return GDALResampleChunk_NearT(
179 : args, static_cast<const uint8_t *>(pChunk),
180 1080 : reinterpret_cast<uint8_t **>(ppDstBuffer));
181 : }
182 :
183 52 : case GDT_Int16:
184 : case GDT_UInt16:
185 : case GDT_Float16:
186 : {
187 52 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
188 52 : return GDALResampleChunk_NearT(
189 : args, static_cast<const uint16_t *>(pChunk),
190 52 : reinterpret_cast<uint16_t **>(ppDstBuffer));
191 : }
192 :
193 57 : case GDT_CInt16:
194 : case GDT_CFloat16:
195 : case GDT_Int32:
196 : case GDT_UInt32:
197 : case GDT_Float32:
198 : {
199 57 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
200 57 : return GDALResampleChunk_NearT(
201 : args, static_cast<const uint32_t *>(pChunk),
202 57 : reinterpret_cast<uint32_t **>(ppDstBuffer));
203 : }
204 :
205 44 : case GDT_CInt32:
206 : case GDT_CFloat32:
207 : case GDT_Int64:
208 : case GDT_UInt64:
209 : case GDT_Float64:
210 : {
211 44 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
212 44 : return GDALResampleChunk_NearT(
213 : args, static_cast<const uint64_t *>(pChunk),
214 44 : reinterpret_cast<uint64_t **>(ppDstBuffer));
215 : }
216 :
217 4 : case GDT_CFloat64:
218 : {
219 4 : return GDALResampleChunk_NearT(
220 : args, static_cast<const std::complex<double> *>(pChunk),
221 4 : reinterpret_cast<std::complex<double> **>(ppDstBuffer));
222 : }
223 :
224 0 : case GDT_Unknown:
225 : case GDT_TypeCount:
226 0 : break;
227 : }
228 0 : CPLAssert(false);
229 : return CE_Failure;
230 : }
231 :
232 : namespace
233 : {
234 :
235 : // Find in the color table the entry whose RGB value is the closest
236 : // (using quadratic distance) to the test color, ignoring transparent entries.
237 3837 : int BestColorEntry(const std::vector<GDALColorEntry> &entries,
238 : const GDALColorEntry &test)
239 : {
240 3837 : int nMinDist = std::numeric_limits<int>::max();
241 3837 : size_t bestEntry = 0;
242 986109 : for (size_t i = 0; i < entries.size(); ++i)
243 : {
244 982272 : const GDALColorEntry &entry = entries[i];
245 : // Ignore transparent entries
246 982272 : if (entry.c4 == 0)
247 3237 : continue;
248 :
249 979035 : int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
250 979035 : ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
251 979035 : ((test.c3 - entry.c3) * (test.c3 - entry.c3));
252 979035 : if (nDist < nMinDist)
253 : {
254 15847 : nMinDist = nDist;
255 15847 : bestEntry = i;
256 : }
257 : }
258 3837 : return static_cast<int>(bestEntry);
259 : }
260 :
261 7 : std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
262 : int &transparentIdx)
263 : {
264 7 : std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
265 :
266 7 : transparentIdx = -1;
267 7 : int i = 0;
268 1799 : for (auto &entry : entries)
269 : {
270 1792 : table.GetColorEntryAsRGB(i, &entry);
271 1792 : if (transparentIdx < 0 && entry.c4 == 0)
272 1 : transparentIdx = i;
273 1792 : ++i;
274 : }
275 7 : return entries;
276 : }
277 :
278 : } // unnamed namespace
279 :
280 : /************************************************************************/
281 : /* SQUARE() */
282 : /************************************************************************/
283 :
// Returns val * val. The first operand is converted to Tsquare before the
// multiplication, so a Tsquare wider than T avoids overflowing T.
template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
{
    const Tsquare widened = static_cast<Tsquare>(val);
    return widened * val;
}
288 :
289 : /************************************************************************/
290 : /* ComputeIntegerRMS() */
291 : /************************************************************************/
// Returns the integer rms that minimizes abs(rms**2 - sumSquares / weight),
// i.e. sqrt(sumSquares / weight) rounded so that its square is as close as
// possible to the mean of the squares.
//
// T is the (integer) type of the result and Twork the integer type used to
// evaluate 2 * rms * (rms + 1) + 1, which must be wide enough to hold it.
template <class T, class Twork>
inline T ComputeIntegerRMS(double sumSquares, double weight)
{
    const double meanSquare = sumSquares / weight;

    // Candidate obtained by truncation: the answer is either it or the
    // next integer.
    const T floorRoot = static_cast<T>(std::sqrt(meanSquare));

    // (floorRoot+1)**2 is strictly closer to meanSquare than floorRoot**2
    // when meanSquare is above the middle of the two squares, that is when
    //   2 * floorRoot * (floorRoot + 1) + 1 < 2 * meanSquare
    const double twiceMiddle = static_cast<double>(
        static_cast<Twork>(2) * floorRoot * (floorRoot + 1) + 1);
    if (twiceMiddle < 2 * meanSquare)
        return static_cast<T>(floorRoot + 1);

    return floorRoot;
}
308 :
309 : template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
310 : {
311 : CPLAssert(false);
312 : return 0;
313 : }
314 :
315 28 : template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
316 : {
317 : // It has been verified that given the correction on rms below, using
318 : // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
319 : // is equivalent, so use the former as it is used twice.
320 28 : const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
321 28 : const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
322 28 : GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
323 :
324 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
325 : // Naive version:
326 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
327 : // Optimized version for integer case and weight == 4
328 28 : if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
329 5 : rms += 1;
330 28 : return rms;
331 : }
332 :
333 : template <>
334 24 : inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
335 : {
336 24 : const double sumDivWeight = sumSquares * 0.25;
337 24 : GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
338 :
339 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
340 : // Naive version:
341 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
342 : // Optimized version for integer case and weight == 4
343 24 : if (static_cast<GUInt32>(rms) * (rms + 1) <
344 24 : static_cast<GUInt32>(sumDivWeight + 0.25))
345 4 : rms += 1;
346 24 : return rms;
347 : }
348 :
349 : #ifdef USE_SSE2
350 :
351 : /************************************************************************/
352 : /* QuadraticMeanByteSSE2OrAVX2() */
353 : /************************************************************************/
354 :
355 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
356 : #define sse2_packus_epi32 _mm_packus_epi32
357 : #else
358 516139 : inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
359 : {
360 516139 : const auto minus32768_32 = _mm_set1_epi32(-32768);
361 516139 : const auto minus32768_16 = _mm_set1_epi16(-32768);
362 516139 : a = _mm_add_epi32(a, minus32768_32);
363 516139 : b = _mm_add_epi32(b, minus32768_32);
364 516139 : a = _mm_packs_epi32(a, b);
365 516139 : a = _mm_sub_epi16(a, minus32768_16);
366 516139 : return a;
367 : }
368 : #endif
369 :
370 : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
371 : #define sse2_hadd_epi16 _mm_hadd_epi16
372 : #else
373 4715530 : inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
374 : {
375 : // Horizontal addition of adjacent pairs
376 4715530 : const auto mask = _mm_set1_epi32(0xFFFF);
377 : const auto horizLo =
378 14146600 : _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
379 : const auto horizHi =
380 14146600 : _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
381 :
382 : // Recombine low and high parts
383 4715530 : return _mm_packs_epi32(horizLo, horizHi);
384 : }
385 : #endif
386 :
// Width-neutral aliases for the integer/float intrinsics used by the Byte
// kernels below: each name maps to the 256-bit AVX2 intrinsic when __AVX2__
// is defined and to its 128-bit SSE2 counterpart otherwise, so that the same
// kernel source can be compiled for both register widths.
387 : #ifdef __AVX2__
388 :
389 : #define set1_epi16 _mm256_set1_epi16
390 : #define set1_epi32 _mm256_set1_epi32
391 : #define setzero _mm256_setzero_si256
392 : #define set1_ps _mm256_set1_ps
393 : #define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
394 : #define unpacklo_epi8 _mm256_unpacklo_epi8
395 : #define unpackhi_epi8 _mm256_unpackhi_epi8
396 : #define madd_epi16 _mm256_madd_epi16
397 : #define add_epi32 _mm256_add_epi32
398 : #define mul_ps _mm256_mul_ps
399 : #define cvtepi32_ps _mm256_cvtepi32_ps
400 : #define sqrt_ps _mm256_sqrt_ps
401 : #define cvttps_epi32 _mm256_cvttps_epi32
402 : #define packs_epi32 _mm256_packs_epi32
403 : #define packus_epi32 _mm256_packus_epi32
404 : #define srli_epi32 _mm256_srli_epi32
405 : #define mullo_epi16 _mm256_mullo_epi16
406 : #define srli_epi16 _mm256_srli_epi16
407 : #define cmpgt_epi16 _mm256_cmpgt_epi16
408 : #define add_epi16 _mm256_add_epi16
409 : #define sub_epi16 _mm256_sub_epi16
410 : #define packus_epi16 _mm256_packus_epi16
411 :
412 : /* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
413 : /* to get the lower 128-bit bits of what would be a true 256-bit vector register
414 : */
415 :
// Swaps the two middle 64-bit elements of x: (0, 1, 2, 3) -> (0, 2, 1, 3).
416 : inline __m256i FIXUP_LANES(__m256i x)
417 : {
418 : return _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0));
419 : }
420 :
// store_lo(): unaligned store of the low 128 bits of the lane-fixed y.
// storeu_int(): unaligned store of the whole lane-fixed y.
421 : #define store_lo(x, y) \
422 : _mm_storeu_si128(reinterpret_cast<__m128i *>(x), \
423 : _mm256_extracti128_si256(FIXUP_LANES(y), 0))
424 : #define storeu_int(x, y) \
425 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(x), FIXUP_LANES(y))
426 : #define hadd_epi16 _mm256_hadd_epi16
// SSE2 flavour: 128-bit registers. packus_epi32 and hadd_epi16 go through
// the sse2_ wrappers, which fall back to plain SSE2 code when the native
// instruction is not available.
427 : #else
428 : #define set1_epi16 _mm_set1_epi16
429 : #define set1_epi32 _mm_set1_epi32
430 : #define setzero _mm_setzero_si128
431 : #define set1_ps _mm_set1_ps
432 : #define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
433 : #define unpacklo_epi8 _mm_unpacklo_epi8
434 : #define unpackhi_epi8 _mm_unpackhi_epi8
435 : #define madd_epi16 _mm_madd_epi16
436 : #define add_epi32 _mm_add_epi32
437 : #define mul_ps _mm_mul_ps
438 : #define cvtepi32_ps _mm_cvtepi32_ps
439 : #define sqrt_ps _mm_sqrt_ps
440 : #define cvttps_epi32 _mm_cvttps_epi32
441 : #define packs_epi32 _mm_packs_epi32
442 : #define packus_epi32 sse2_packus_epi32
443 : #define srli_epi32 _mm_srli_epi32
444 : #define mullo_epi16 _mm_mullo_epi16
445 : #define srli_epi16 _mm_srli_epi16
446 : #define cmpgt_epi16 _mm_cmpgt_epi16
447 : #define add_epi16 _mm_add_epi16
448 : #define sub_epi16 _mm_sub_epi16
449 : #define packus_epi16 _mm_packus_epi16
// store_lo(): stores the low 64 bits of y. storeu_int(): unaligned store of
// the whole 128-bit y.
450 : #define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
451 : #define storeu_int(x, y) _mm_storeu_si128(reinterpret_cast<__m128i *>(x), (y))
452 : #define hadd_epi16 sse2_hadd_epi16
453 : #endif
454 :
// Vectorized 2x2 quadratic mean (RMS) downsampling of one output scanline of
// 8-bit samples.
//
// Each output pixel is the integer RMS of 4 source bytes: 2 horizontally
// adjacent bytes of the line pointed to by pSrcScanlineShiftedInOut and the
// 2 bytes located nChunkXSize elements further (the next source line).
//
// nDstXWidth:               number of output pixels wanted.
// nChunkXSize:              distance, in elements, between the 2 source lines.
// pSrcScanlineShiftedInOut: on input, first source element to read; on
//                           output, advanced by 2 elements per output pixel
//                           that has been produced.
// pDstScanline:             output scanline.
//
// Returns the number of output pixels written. Only whole groups of
// DEST_ELTS pixels (half the byte size of a vector register) are processed;
// the remaining pixels are left untouched, presumably for a scalar loop in
// the caller (TODO confirm).
//
// NOTE(review): the loads and unpacks operate on bytes, so T is presumably
// always a 1-byte type - confirm against the callers.
455 : template <class T>
456 : static int
457 : #if defined(__GNUC__)
458 : __attribute__((noinline))
459 : #endif
460 5389 : QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
461 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
462 : T *CPL_RESTRICT pDstScanline)
463 : {
464 : // Optimized implementation for RMS on Byte by
465 : // processing by group of 8 output pixels, so as to use
466 : // a single _mm_sqrt_ps() call for 4 output pixels
467 5389 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
468 :
469 5389 : int iDstPixel = 0;
470 5389 : const auto one16 = set1_epi16(1);
471 5389 : const auto one32 = set1_epi32(1);
472 5389 : const auto zero = setzero();
473 5389 : const auto minus32768 = set1_epi16(-32768);
474 :
// Output pixels per iteration: 8 with 128-bit registers, 16 with 256-bit
// ones (each output pixel consumes 2 source bytes per line).
475 5389 : constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;
476 521504 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
477 : {
478 : // Load 2 * DEST_ELTS bytes from each line
479 516115 : auto firstLine = loadu_int(pSrcScanlineShifted);
480 1032230 : auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
481 : // Extend those Bytes as UInt16s
482 516115 : auto firstLineLo = unpacklo_epi8(firstLine, zero);
483 516115 : auto firstLineHi = unpackhi_epi8(firstLine, zero);
484 516115 : auto secondLineLo = unpacklo_epi8(secondLine, zero);
485 516115 : auto secondLineHi = unpackhi_epi8(secondLine, zero);
486 :
487 : // Multiplication of 16 bit values and horizontal
488 : // addition of 32 bit results
489 : // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
490 516115 : firstLineLo = madd_epi16(firstLineLo, firstLineLo);
491 516115 : firstLineHi = madd_epi16(firstLineHi, firstLineHi);
492 516115 : secondLineLo = madd_epi16(secondLineLo, secondLineLo);
493 516115 : secondLineHi = madd_epi16(secondLineHi, secondLineHi);
494 :
495 : // Vertical addition
496 516115 : const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
497 516115 : const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
498 :
// (sumSquares + 1) / 4, computed with a logical right shift by 2
499 : const auto sumSquaresPlusOneDiv4Lo =
500 1032230 : srli_epi32(add_epi32(sumSquaresLo, one32), 2);
501 : const auto sumSquaresPlusOneDiv4Hi =
502 1032230 : srli_epi32(add_epi32(sumSquaresHi, one32), 2);
503 :
504 : // Take square root and truncate/floor to int32
505 : const auto rmsLo =
506 1548340 : cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
507 : const auto rmsHi =
508 1548340 : cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
509 :
510 : // Merge back low and high registers with each RMS value
511 : // as a 16 bit value.
512 516115 : auto rms = packs_epi32(rmsLo, rmsHi);
513 :
514 : // Round to upper value if it minimizes the
515 : // error |rms^2 - sumSquares/4|
516 : // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
517 : // rms += 1;
518 : // which is equivalent to:
519 : // if( rms * (rms + 1) < (sumSquares+1) / 4 )
520 : // rms += 1;
521 : // And both left and right parts fit on 16 (unsigned) bits
522 : const auto sumSquaresPlusOneDiv4 =
523 516115 : packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
524 : // cmpgt_epi16 operates on signed int16, but here
525 : // we have unsigned values, so shift them by -32768 before
526 2580580 : const auto mask = cmpgt_epi16(
527 : add_epi16(sumSquaresPlusOneDiv4, minus32768),
528 : add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
529 : // The value of the mask will be -1 when the correction needs to be
530 : // applied
531 516115 : rms = sub_epi16(rms, mask);
532 :
533 : // Pack each 16 bit RMS value to 8 bits
534 516115 : rms = packus_epi16(rms, rms /* could be anything */);
535 516115 : store_lo(&pDstScanline[iDstPixel], rms);
536 516115 : pSrcScanlineShifted += 2 * DEST_ELTS;
537 : }
538 :
// Report how far the source has been consumed and how many output pixels
// have been produced.
539 5389 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
540 5389 : return iDstPixel;
541 : }
542 :
543 : /************************************************************************/
544 : /* AverageByteSSE2OrAVX2() */
545 : /************************************************************************/
546 :
// Vectorized 2x2 average downsampling of one output scanline of bytes.
//
// Each output pixel is (a + b + c + d + 2) / 4, where a and b are 2
// horizontally adjacent bytes of the line pointed to by
// pSrcScanlineShiftedInOut, and c and d the 2 bytes located nChunkXSize
// elements further (the next source line).
//
// nDstXWidth:               number of output pixels wanted.
// nChunkXSize:              distance, in elements, between the 2 source lines.
// pSrcScanlineShiftedInOut: on input, first source byte to read; on output,
//                           advanced by 2 bytes per output pixel that has
//                           been produced.
// pDstScanline:             output scanline.
//
// Returns the number of output pixels written. Only whole groups of
// 2 * DEST_ELTS pixels are processed; the remaining pixels are left
// untouched, presumably for a scalar loop in the caller (TODO confirm).
547 : static int
548 111734 : AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
549 : const GByte *&CPL_RESTRICT pSrcScanlineShiftedInOut,
550 : GByte *CPL_RESTRICT pDstScanline)
551 : {
552 : // Optimized implementation for average on Byte by
553 : // processing by group of 16 output pixels for SSE2, or 32 for AVX2
554 :
555 111734 : const auto zero = setzero();
556 111734 : const auto two16 = set1_epi16(2);
557 111734 : const GByte *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
558 :
// Output pixels produced from one register load per line: 8 with 128-bit
// registers, 16 with 256-bit ones. Each iteration does two such halves.
559 111734 : constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;
560 111734 : int iDstPixel = 0;
561 2469500 : for (; iDstPixel < nDstXWidth - (2 * DEST_ELTS - 1);
562 2357770 : iDstPixel += 2 * DEST_ELTS)
563 : {
// First half of the group: averages as 16 bit values
564 : decltype(setzero()) average0;
565 : {
566 : // Load 2 * DEST_ELTS bytes from each line
567 2357770 : const auto firstLine = loadu_int(pSrcScanlineShifted);
568 : const auto secondLine =
569 4715530 : loadu_int(pSrcScanlineShifted + nChunkXSize);
570 : // Extend those Bytes as UInt16s
571 2357770 : const auto firstLineLo = unpacklo_epi8(firstLine, zero);
572 2357770 : const auto firstLineHi = unpackhi_epi8(firstLine, zero);
573 2357770 : const auto secondLineLo = unpacklo_epi8(secondLine, zero);
574 2357770 : const auto secondLineHi = unpackhi_epi8(secondLine, zero);
575 :
576 : // Vertical addition
577 2357770 : const auto sumLo = add_epi16(firstLineLo, secondLineLo);
578 2357770 : const auto sumHi = add_epi16(firstLineHi, secondLineHi);
579 :
580 : // Horizontal addition of adjacent pairs, and recombine low and high
581 : // parts
582 2357770 : const auto sum = hadd_epi16(sumLo, sumHi);
583 :
584 : // average = (sum + 2) / 4
585 2357770 : average0 = srli_epi16(add_epi16(sum, two16), 2);
586 :
587 2357770 : pSrcScanlineShifted += 2 * DEST_ELTS;
588 : }
589 :
// Second half of the group: same computation on the following source bytes
590 : decltype(setzero()) average1;
591 : {
592 : // Load 2 * DEST_ELTS bytes from each line
593 2357770 : const auto firstLine = loadu_int(pSrcScanlineShifted);
594 : const auto secondLine =
595 4715530 : loadu_int(pSrcScanlineShifted + nChunkXSize);
596 : // Extend those Bytes as UInt16s
597 2357770 : const auto firstLineLo = unpacklo_epi8(firstLine, zero);
598 2357770 : const auto firstLineHi = unpackhi_epi8(firstLine, zero);
599 2357770 : const auto secondLineLo = unpacklo_epi8(secondLine, zero);
600 2357770 : const auto secondLineHi = unpackhi_epi8(secondLine, zero);
601 :
602 : // Vertical addition
603 2357770 : const auto sumLo = add_epi16(firstLineLo, secondLineLo);
604 2357770 : const auto sumHi = add_epi16(firstLineHi, secondLineHi);
605 :
606 : // Horizontal addition of adjacent pairs, and recombine low and high
607 : // parts
608 2357770 : const auto sum = hadd_epi16(sumLo, sumHi);
609 :
610 : // average = (sum + 2) / 4
611 2357770 : average1 = srli_epi16(add_epi16(sum, two16), 2);
612 :
613 2357770 : pSrcScanlineShifted += 2 * DEST_ELTS;
614 : }
615 :
616 : // Pack each 16 bit average value to 8 bits
617 2357770 : const auto average = packus_epi16(average0, average1);
618 2357770 : storeu_int(&pDstScanline[iDstPixel], average);
619 : }
620 :
// Report how far the source has been consumed and how many output pixels
// have been produced.
621 111734 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
622 111734 : return iDstPixel;
623 : }
624 :
625 : /************************************************************************/
626 : /* QuadraticMeanUInt16SSE2() */
627 : /************************************************************************/
628 :
629 : #ifdef __SSE3__
630 : #define sse2_hadd_pd _mm_hadd_pd
631 : #else
632 185 : inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
633 : {
634 : auto aLo_bLo =
635 740 : _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
636 : auto aHi_bHi =
637 740 : _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
638 185 : return _mm_add_pd(aLo_bLo, aHi_bHi); // (aLo + aHi, bLo + bHi)
639 : }
640 : #endif
641 :
642 120 : inline __m128d SQUARE_PD(__m128d x)
643 : {
644 120 : return _mm_mul_pd(x, x);
645 : }
646 :
#ifdef __AVX2__

// Element-wise square of the 4 doubles of x.
inline __m256d SQUARE_PD(__m256d x)
{
    const __m256d squared = _mm256_mul_pd(x, x);
    return squared;
}

// Swaps the two middle 64-bit elements of x: (0, 1, 2, 3) -> (0, 2, 1, 3).
inline __m256d FIXUP_LANES(__m256d x)
{
    const __m256d reordered =
        _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
    return reordered;
}

// Same reordering, the 8 floats of x being moved as 4 pairs.
inline __m256 FIXUP_LANES(__m256 x)
{
    const __m256d asDoubles = _mm256_castps_pd(x);
    return _mm256_castpd_ps(FIXUP_LANES(asDoubles));
}

#endif
665 :
// Vectorized 2x2 quadratic mean (RMS) downsampling of one output scanline of
// UInt16 samples.
//
// Each output pixel is the integer RMS of 4 source values: 2 horizontally
// adjacent values of the line pointed to by pSrcScanlineShiftedInOut and the
// 2 values located nChunkXSize elements further (the next source line).
//
// Two code paths are used for each group of 4 output pixels:
// - when all the 16 source values are below 2^14, the sums of squares are
//   computed with 32 bit integers and a single precision square root;
// - otherwise they are computed in double precision (256-bit registers when
//   __AVX2__ is defined, pairs of 128-bit registers otherwise).
//
// nDstXWidth:               number of output pixels wanted.
// nChunkXSize:              distance, in elements, between the 2 source lines.
// pSrcScanlineShiftedInOut: on input, first source element to read; on
//                           output, advanced by 2 elements per output pixel
//                           that has been produced.
// pDstScanline:             output scanline.
//
// Returns the number of output pixels written. Only whole groups of
// DEST_ELTS (4) pixels are processed; the remaining pixels are left
// untouched, presumably for a scalar loop in the caller (TODO confirm).
666 : static int
667 14 : QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
668 : const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
669 : uint16_t *CPL_RESTRICT pDstScanline)
670 : {
671 : // Optimized implementation for RMS on UInt16 by
672 : // processing by group of 4 output pixels.
673 14 : const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
674 :
675 14 : int iDstPixel = 0;
676 14 : const auto zero = _mm_setzero_si128();
677 :
678 : #ifdef __AVX2__
679 : const auto zeroDot25 = _mm256_set1_pd(0.25);
680 : const auto zeroDot5 = _mm256_set1_pd(0.5);
681 :
682 : // The first four 0's could be anything, as we only take the bottom
683 : // 128 bits.
684 : const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
685 : #else
686 14 : const auto zeroDot25 = _mm_set1_pd(0.25);
687 14 : const auto zeroDot5 = _mm_set1_pd(0.5);
688 : #endif
689 :
// A 128-bit register holds 8 UInt16, i.e. the source of 4 output pixels
690 14 : constexpr int DEST_ELTS =
691 : static_cast<int>(sizeof(zero) / sizeof(uint16_t)) / 2;
692 52 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
693 : {
694 : // Load 8 UInt16 from each line
695 38 : const auto firstLine = _mm_loadu_si128(
696 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
697 : const auto secondLine =
698 38 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
699 38 : pSrcScanlineShifted + nChunkXSize));
700 :
701 : // Detect if all of the source values fit in 14 bits.
702 : // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
703 : // and we can do a much faster implementation.
704 : const auto maskTmp =
705 76 : _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
// The 8 per-element results are packed to bytes and gathered in a 64 bit
// scalar, which is zero only if no source value has bit 14 or 15 set.
// 32 bit x86 goes through memory instead of _mm_cvtsi128_si64().
706 : #if defined(__i386__) || defined(_M_IX86)
707 : uint64_t nMaskFitsIn14Bits = 0;
708 : _mm_storel_epi64(
709 : reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
710 : _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
711 : #else
712 38 : const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
713 : _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
714 : #endif
715 38 : if (nMaskFitsIn14Bits == 0)
716 : {
717 : // Multiplication of 16 bit values and horizontal
718 : // addition of 32 bit results
719 : const auto firstLineHSumSquare =
720 26 : _mm_madd_epi16(firstLine, firstLine);
721 : const auto secondLineHSumSquare =
722 26 : _mm_madd_epi16(secondLine, secondLine);
723 : // Vertical addition
724 : const auto sumSquares =
725 26 : _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
726 : // In theory we should take sqrt(sumSquares * 0.25f)
727 : // but given the rounding we do, this is equivalent to
728 : // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
729 : // sumSquares <= 4 * 16383^2
730 26 : const auto one32 = _mm_set1_epi32(1);
731 : const auto sumSquaresPlusOneDiv4 =
732 52 : _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
733 : // Take square root and truncate/floor to int32
734 78 : auto rms = _mm_cvttps_epi32(
735 : _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
736 :
737 : // Round to upper value if it minimizes the
738 : // error |rms^2 - sumSquares/4|
739 : // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
740 : // rms += 1;
741 : // which is equivalent to:
742 : // if( rms * rms + rms < (sumSquares+1) / 4 )
743 : // rms += 1;
// _mm_madd_epi16(rms, rms) yields rms * rms in each 32 bit element here, as
// the upper 16 bits of every rms value are zero on this path.
744 : auto mask =
745 78 : _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
746 : _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
747 26 : rms = _mm_sub_epi32(rms, mask);
748 : // Pack each 32 bit RMS value to 16 bits
749 26 : rms = _mm_packs_epi32(rms, rms /* could be anything */);
750 : _mm_storel_epi64(
751 26 : reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
752 26 : pSrcScanlineShifted += 2 * DEST_ELTS;
753 26 : continue;
754 : }
755 :
// General path: at least one source value needs more than 14 bits
756 : // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
757 : // to 32 bit would result in 4 multiplications instead of 8, but
758 : // mullo/mulhi have a worse throughput than mul_pd.
759 :
760 : // Extend those UInt16s as UInt32s
761 12 : const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
762 12 : const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
763 12 : const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
764 12 : const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
765 :
766 : #ifdef __AVX2__
767 : // Multiplication of 32 bit values previously converted to 64 bit double
768 : const auto firstLineLoDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineLo));
769 : const auto firstLineHiDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineHi));
770 : const auto secondLineLoDbl =
771 : SQUARE_PD(_mm256_cvtepi32_pd(secondLineLo));
772 : const auto secondLineHiDbl =
773 : SQUARE_PD(_mm256_cvtepi32_pd(secondLineHi));
774 :
775 : // Vertical addition of squares
776 : const auto sumSquaresLo =
777 : _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
778 : const auto sumSquaresHi =
779 : _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
780 :
781 : // Horizontal addition of squares
782 : const auto sumSquares =
783 : FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
784 :
785 : const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
786 :
787 : // Take square root and truncate/floor to int32
788 : auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
// Same rounding rule as in the SSE2 branch below:
// if( 0.5 < sumDivWeight - (rms * rms + rms) ) rms += 1;
789 : const auto rmsDouble = _mm256_cvtepi32_pd(rms);
790 : const auto right = _mm256_sub_pd(
791 : sumDivWeight, _mm256_add_pd(SQUARE_PD(rmsDouble), rmsDouble));
792 :
793 : auto mask =
794 : _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
795 : // Extract 32-bit from each of the 4 64-bit masks
796 : // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
797 : // _MM_SHUFFLE(2,0,2,0)));
798 : mask = _mm256_permutevar8x32_ps(mask, permutation);
799 : const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
800 :
801 : // Apply the correction
802 : rms = _mm_sub_epi32(rms, maskI);
803 :
804 : // Pack each 32 bit RMS value to 16 bits
805 : rms = _mm_packus_epi32(rms, rms /* could be anything */);
806 : #else
807 : // Multiplication of 32 bit values previously converted to 64 bit double
// (_mm_cvtepi32_pd() converts the 2 lowest 32 bit values of its argument,
// hence the 8 byte shifts to reach the 2 upper ones)
808 12 : const auto firstLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineLo));
809 : const auto firstLineLoHi =
810 24 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
811 12 : const auto firstLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineHi));
812 : const auto firstLineHiHi =
813 24 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
814 :
815 12 : const auto secondLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineLo));
816 : const auto secondLineLoHi =
817 24 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
818 12 : const auto secondLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineHi));
819 : const auto secondLineHiHi =
820 24 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
821 :
822 : // Vertical addition of squares
823 12 : const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
824 12 : const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
825 12 : const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
826 12 : const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
827 :
828 : // Horizontal addition of squares
829 12 : const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
830 12 : const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
831 :
832 12 : const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
833 12 : const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
834 : // Take square root and truncate/floor to int32
835 24 : const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
836 24 : const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
837 :
838 : // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
839 : // if( 0.5 < sumDivWeight - (rms * rms + rms) )
840 : // rms += 1;
841 12 : const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
842 12 : const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
843 24 : const auto rightLo = _mm_sub_pd(
844 : sumDivWeightLo, _mm_add_pd(SQUARE_PD(rmsLoDouble), rmsLoDouble));
845 36 : const auto rightHi = _mm_sub_pd(
846 : sumDivWeightHi, _mm_add_pd(SQUARE_PD(rmsHiDouble), rmsHiDouble));
847 :
848 24 : const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
849 12 : const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
850 : // The value of the mask will be -1 when the correction needs to be
851 : // applied
// Keep one 32 bit half of each of the four 64 bit comparison results
852 24 : const auto mask = _mm_castps_si128(_mm_shuffle_ps(
853 : maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
854 :
// Gather the 2 + 2 int32 RMS values into a single register
855 48 : auto rms = _mm_castps_si128(
856 : _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
857 : // Apply the correction
858 12 : rms = _mm_sub_epi32(rms, mask);
859 :
860 : // Pack each 32 bit RMS value to 16 bits
861 12 : rms = sse2_packus_epi32(rms, rms /* could be anything */);
862 : #endif
863 :
// Only the low 64 bits (4 UInt16 results) are stored
864 12 : _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
865 : rms);
866 12 : pSrcScanlineShifted += 2 * DEST_ELTS;
867 : }
868 :
// Report how far the source has been consumed and how many output pixels
// have been produced.
869 14 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
870 14 : return iDstPixel;
871 : }
872 :
873 : /************************************************************************/
874 : /* AverageUInt16SSE2() */
875 : /************************************************************************/
876 :
877 : static int
878 13 : AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
879 : const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
880 : uint16_t *CPL_RESTRICT pDstScanline)
881 : {
882 : // Optimized implementation for average on UInt16 by
883 : // processing by group of 8 output pixels.
884 :
885 13 : const auto mask = _mm_set1_epi32(0xFFFF);
886 13 : const auto two = _mm_set1_epi32(2);
887 13 : const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
888 :
889 13 : int iDstPixel = 0;
890 13 : constexpr int DEST_ELTS = static_cast<int>(sizeof(mask) / sizeof(uint16_t));
891 25 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
892 : {
893 : __m128i averageLow;
894 : // Load 8 UInt16 from each line
895 : {
896 12 : const auto firstLine = _mm_loadu_si128(
897 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
898 : const auto secondLine =
899 12 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
900 12 : pSrcScanlineShifted + nChunkXSize));
901 :
902 : // Horizontal addition and extension to 32 bit
903 36 : const auto horizAddFirstLine = _mm_add_epi32(
904 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
905 : const auto horizAddSecondLine =
906 36 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
907 : _mm_srli_epi32(secondLine, 16));
908 :
909 : // Vertical addition and average computation
910 : // average = (sum + 2) >> 2
911 24 : const auto sum = _mm_add_epi32(
912 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
913 12 : averageLow = _mm_srli_epi32(sum, 2);
914 : }
915 : // Load 8 UInt16 from each line
916 : __m128i averageHigh;
917 : {
918 : const auto firstLine =
919 12 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
920 12 : pSrcScanlineShifted + DEST_ELTS));
921 : const auto secondLine =
922 12 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
923 12 : pSrcScanlineShifted + DEST_ELTS + nChunkXSize));
924 :
925 : // Horizontal addition and extension to 32 bit
926 36 : const auto horizAddFirstLine = _mm_add_epi32(
927 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
928 : const auto horizAddSecondLine =
929 36 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
930 : _mm_srli_epi32(secondLine, 16));
931 :
932 : // Vertical addition and average computation
933 : // average = (sum + 2) >> 2
934 24 : const auto sum = _mm_add_epi32(
935 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
936 12 : averageHigh = _mm_srli_epi32(sum, 2);
937 : }
938 :
939 : // Pack each 32 bit average value to 16 bits
940 12 : auto average = sse2_packus_epi32(averageLow, averageHigh);
941 12 : _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
942 : average);
943 12 : pSrcScanlineShifted += 2 * DEST_ELTS;
944 : }
945 :
946 13 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
947 13 : return iDstPixel;
948 : }
949 :
950 : /************************************************************************/
951 : /* QuadraticMeanFloatSSE2() */
952 : /************************************************************************/
953 :
954 : #if !defined(ARM_V7)
955 :
956 : #ifdef __SSE3__
957 : #define sse2_hadd_ps _mm_hadd_ps
958 : #else
// Replacement for _mm_hadd_ps() when SSE3 is not available.
// Returns (a0 + a1, a2 + a3, b0 + b1, b2 + b3).
inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
{
    // (a0, a2, b0, b2)
    const __m128 evenLanes = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
    // (a1, a3, b1, b3)
    const __m128 oddLanes = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
    return _mm_add_ps(evenLanes, oddLanes);
}
965 : #endif
966 :
967 : #ifdef __AVX2__
968 : #define set1_ps _mm256_set1_ps
969 : #define loadu_ps _mm256_loadu_ps
970 : #define andnot_ps _mm256_andnot_ps
971 : #define and_ps _mm256_and_ps
972 : #define max_ps _mm256_max_ps
973 : #define shuffle_ps _mm256_shuffle_ps
974 : #define div_ps _mm256_div_ps
975 : #define cmpeq_ps(x, y) _mm256_cmp_ps((x), (y), _CMP_EQ_OQ)
976 : #define mul_ps _mm256_mul_ps
977 : #define add_ps _mm256_add_ps
978 : #define hadd_ps _mm256_hadd_ps
979 : #define sqrt_ps _mm256_sqrt_ps
980 : #define or_ps _mm256_or_ps
981 : #define unpacklo_ps _mm256_unpacklo_ps
982 : #define unpackhi_ps _mm256_unpackhi_ps
983 : #define storeu_ps _mm256_storeu_ps
984 : #define blendv_ps _mm256_blendv_ps
985 :
// Lane-wise square of 8 packed single-precision values.
inline __m256 SQUARE_PS(__m256 x)
{
    const __m256 squared = _mm256_mul_ps(x, x);
    return squared;
}
990 :
991 : #else
992 :
993 : #define set1_ps _mm_set1_ps
994 : #define loadu_ps _mm_loadu_ps
995 : #define andnot_ps _mm_andnot_ps
996 : #define and_ps _mm_and_ps
997 : #define max_ps _mm_max_ps
998 : #define shuffle_ps _mm_shuffle_ps
999 : #define div_ps _mm_div_ps
1000 : #define cmpeq_ps _mm_cmpeq_ps
1001 : #define mul_ps _mm_mul_ps
1002 : #define add_ps _mm_add_ps
1003 : #define hadd_ps sse2_hadd_ps
1004 : #define sqrt_ps _mm_sqrt_ps
1005 : #define or_ps _mm_or_ps
1006 : #define unpacklo_ps _mm_unpacklo_ps
1007 : #define unpackhi_ps _mm_unpackhi_ps
1008 : #define storeu_ps _mm_storeu_ps
1009 :
// Per-lane select: returns b where mask is set and a elsewhere.
// The fallback path is a plain bitwise select, so mask lanes are expected to
// be all-ones or all-zeros, as produced by a comparison such as cmpeq_ps().
inline __m128 blendv_ps(__m128 a, __m128 b, __m128 mask)
{
#if !(defined(__SSE4_1__) || defined(__AVX__) ||                               \
      defined(USE_NEON_OPTIMIZATIONS))
    // No blend instruction available: (a & ~mask) | (b & mask)
    const __m128 keptFromA = _mm_andnot_ps(mask, a);
    const __m128 takenFromB = _mm_and_ps(mask, b);
    return _mm_or_ps(keptFromA, takenFromB);
#else
    return _mm_blendv_ps(a, b, mask);
#endif
}
1018 :
// Lane-wise square of 4 packed single-precision values.
inline __m128 SQUARE_PS(__m128 x)
{
    const __m128 squared = _mm_mul_ps(x, x);
    return squared;
}
1023 :
// Identity for 128-bit vectors. It exists so that QuadraticMeanFloatSSE2()
// can call FIXUP_LANES() whichever vector width is selected.
inline __m128 FIXUP_LANES(__m128 x)
{
    const __m128 sameOrder = x;
    return sameOrder;
}
1028 :
1029 : #endif
1030 :
1031 : static int
1032 : #if defined(__GNUC__)
1033 : __attribute__((noinline))
1034 : #endif
1035 66 : QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
1036 : const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1037 : float *CPL_RESTRICT pDstScanline)
1038 : {
1039 : // Optimized implementation for RMS on Float32 by
1040 : // processing by group of output pixels.
1041 66 : const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1042 :
1043 66 : int iDstPixel = 0;
1044 66 : const auto minus_zero = set1_ps(-0.0f);
1045 66 : const auto zeroDot25 = set1_ps(0.25f);
1046 66 : const auto one = set1_ps(1.0f);
1047 66 : const auto infv = set1_ps(std::numeric_limits<float>::infinity());
1048 66 : constexpr int DEST_ELTS = static_cast<int>(sizeof(one) / sizeof(float));
1049 :
1050 198 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1051 : {
1052 : // Load 2*DEST_ELTS Float32 from each line
1053 132 : auto firstLineLo = loadu_ps(pSrcScanlineShifted);
1054 132 : auto firstLineHi = loadu_ps(pSrcScanlineShifted + DEST_ELTS);
1055 132 : auto secondLineLo = loadu_ps(pSrcScanlineShifted + nChunkXSize);
1056 : auto secondLineHi =
1057 264 : loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize);
1058 :
1059 : // Take the absolute value
1060 132 : firstLineLo = andnot_ps(minus_zero, firstLineLo);
1061 132 : firstLineHi = andnot_ps(minus_zero, firstLineHi);
1062 132 : secondLineLo = andnot_ps(minus_zero, secondLineLo);
1063 132 : secondLineHi = andnot_ps(minus_zero, secondLineHi);
1064 :
1065 : auto firstLineEven =
1066 132 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1067 : auto firstLineOdd =
1068 132 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1069 : auto secondLineEven =
1070 132 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1071 : auto secondLineOdd =
1072 132 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1073 :
1074 : // Compute the maximum of each DEST_ELTS value to RMS-average
1075 396 : const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
1076 : max_ps(secondLineEven, secondLineEven));
1077 :
1078 : // Normalize each value by the maximum of the DEST_ELTS ones.
1079 : // This step is important to avoid that the square evaluates to infinity
1080 : // for sufficiently big input.
1081 132 : auto invMax = div_ps(one, maxV);
1082 : // Deal with 0 being the maximum to correct division by zero
1083 : // note: comparing to -0 leads to identical results as to comparing with
1084 : // 0
1085 264 : invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
1086 :
1087 132 : firstLineEven = mul_ps(firstLineEven, invMax);
1088 132 : firstLineOdd = mul_ps(firstLineOdd, invMax);
1089 132 : secondLineEven = mul_ps(secondLineEven, invMax);
1090 132 : secondLineOdd = mul_ps(secondLineOdd, invMax);
1091 :
1092 : // Compute squares
1093 132 : firstLineEven = SQUARE_PS(firstLineEven);
1094 132 : firstLineOdd = SQUARE_PS(firstLineOdd);
1095 132 : secondLineEven = SQUARE_PS(secondLineEven);
1096 132 : secondLineOdd = SQUARE_PS(secondLineOdd);
1097 :
1098 396 : const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
1099 : add_ps(secondLineEven, secondLineOdd));
1100 :
1101 396 : auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
1102 :
1103 : // Deal with infinity being the maximum
1104 132 : const auto maskIsInf = cmpeq_ps(maxV, infv);
1105 132 : rms = blendv_ps(rms, infv, maskIsInf);
1106 :
1107 132 : rms = FIXUP_LANES(rms);
1108 :
1109 132 : storeu_ps(&pDstScanline[iDstPixel], rms);
1110 132 : pSrcScanlineShifted += DEST_ELTS * 2;
1111 : }
1112 :
1113 66 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1114 66 : return iDstPixel;
1115 : }
1116 :
1117 : /************************************************************************/
1118 : /* AverageFloatSSE2() */
1119 : /************************************************************************/
1120 :
1121 50 : static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
1122 : const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1123 : float *CPL_RESTRICT pDstScanline)
1124 : {
1125 : // Optimized implementation for average on Float32 by
1126 : // processing by group of output pixels.
1127 50 : const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1128 :
1129 50 : int iDstPixel = 0;
1130 50 : const auto zeroDot25 = _mm_set1_ps(0.25f);
1131 50 : constexpr int DEST_ELTS =
1132 : static_cast<int>(sizeof(zeroDot25) / sizeof(float));
1133 :
1134 132 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1135 : {
1136 : // Load 2 * DEST_ELTS Float32 from each line
1137 : const auto firstLineLo =
1138 82 : _mm_mul_ps(_mm_loadu_ps(pSrcScanlineShifted), zeroDot25);
1139 164 : const auto firstLineHi = _mm_mul_ps(
1140 : _mm_loadu_ps(pSrcScanlineShifted + DEST_ELTS), zeroDot25);
1141 82 : const auto secondLineLo = _mm_mul_ps(
1142 82 : _mm_loadu_ps(pSrcScanlineShifted + nChunkXSize), zeroDot25);
1143 164 : const auto secondLineHi = _mm_mul_ps(
1144 82 : _mm_loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize),
1145 : zeroDot25);
1146 :
1147 : // Vertical addition
1148 82 : const auto tmpLo = _mm_add_ps(firstLineLo, secondLineLo);
1149 82 : const auto tmpHi = _mm_add_ps(firstLineHi, secondLineHi);
1150 :
1151 : // Horizontal addition
1152 82 : const auto average = sse2_hadd_ps(tmpLo, tmpHi);
1153 :
1154 82 : _mm_storeu_ps(&pDstScanline[iDstPixel], average);
1155 82 : pSrcScanlineShifted += DEST_ELTS * 2;
1156 : }
1157 :
1158 50 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1159 50 : return iDstPixel;
1160 : }
1161 :
1162 : /************************************************************************/
1163 : /* AverageDoubleSSE2() */
1164 : /************************************************************************/
1165 :
1166 : static int
1167 50 : AverageDoubleSSE2(int nDstXWidth, int nChunkXSize,
1168 : const double *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1169 : double *CPL_RESTRICT pDstScanline)
1170 : {
1171 : // Optimized implementation for average on Float64 by
1172 : // processing by group of output pixels.
1173 50 : const double *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1174 :
1175 50 : int iDstPixel = 0;
1176 50 : const auto zeroDot25 = _mm_set1_pd(0.25);
1177 50 : constexpr int DEST_ELTS =
1178 : static_cast<int>(sizeof(zeroDot25) / sizeof(double));
1179 :
1180 211 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1181 : {
1182 : // Load 4 * DEST_ELTS Float64 from each line
1183 161 : const auto firstLine0 = _mm_mul_pd(
1184 : _mm_loadu_pd(pSrcScanlineShifted + 0 * DEST_ELTS), zeroDot25);
1185 322 : const auto firstLine1 = _mm_mul_pd(
1186 : _mm_loadu_pd(pSrcScanlineShifted + 1 * DEST_ELTS), zeroDot25);
1187 161 : const auto secondLine0 = _mm_mul_pd(
1188 161 : _mm_loadu_pd(pSrcScanlineShifted + 0 * DEST_ELTS + nChunkXSize),
1189 : zeroDot25);
1190 322 : const auto secondLine1 = _mm_mul_pd(
1191 161 : _mm_loadu_pd(pSrcScanlineShifted + 1 * DEST_ELTS + nChunkXSize),
1192 : zeroDot25);
1193 :
1194 : // Vertical addition
1195 161 : const auto tmp0 = _mm_add_pd(firstLine0, secondLine0);
1196 161 : const auto tmp1 = _mm_add_pd(firstLine1, secondLine1);
1197 :
1198 : // Horizontal addition
1199 161 : const auto average0 = sse2_hadd_pd(tmp0, tmp1);
1200 :
1201 161 : _mm_storeu_pd(&pDstScanline[iDstPixel + 0], average0);
1202 161 : pSrcScanlineShifted += DEST_ELTS * 2;
1203 : }
1204 :
1205 50 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1206 50 : return iDstPixel;
1207 : }
1208 :
1209 : #endif
1210 :
1211 : #endif
1212 :
1213 : /************************************************************************/
1214 : /* GDALResampleChunk_AverageOrRMS() */
1215 : /************************************************************************/
1216 :
1217 : template <class T, class Tsum, GDALDataType eWrkDataType, bool bQuadraticMean>
1218 : static CPLErr
1219 2394 : GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
1220 : const T *pChunk, void **ppDstBuffer)
1221 : {
1222 2394 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1223 2394 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1224 2394 : const double dfSrcXDelta = args.dfSrcXDelta;
1225 2394 : const double dfSrcYDelta = args.dfSrcYDelta;
1226 2394 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1227 2394 : const int nChunkXOff = args.nChunkXOff;
1228 2394 : const int nChunkYOff = args.nChunkYOff;
1229 2394 : const int nChunkXSize = args.nChunkXSize;
1230 2394 : const int nChunkYSize = args.nChunkYSize;
1231 2394 : const int nDstXOff = args.nDstXOff;
1232 2394 : const int nDstXOff2 = args.nDstXOff2;
1233 2394 : const int nDstYOff = args.nDstYOff;
1234 2394 : const int nDstYOff2 = args.nDstYOff2;
1235 2394 : const char *pszResampling = args.pszResampling;
1236 2394 : bool bHasNoData = args.bHasNoData;
1237 2394 : const double dfNoDataValue = args.dfNoDataValue;
1238 2394 : const GDALColorTable *const poColorTable =
1239 : !bQuadraticMean &&
1240 : // AVERAGE_BIT2GRAYSCALE
1241 2317 : CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"))
1242 : ? nullptr
1243 : : args.poColorTable;
1244 2394 : const bool bPropagateNoData = args.bPropagateNoData;
1245 :
1246 2394 : T tNoDataValue = (!bHasNoData) ? 0 : static_cast<T>(dfNoDataValue);
1247 2394 : const T tReplacementVal =
1248 178 : bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
1249 58 : args.eOvrDataType, dfNoDataValue))
1250 : : 0;
1251 :
1252 2394 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
1253 2394 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1254 2394 : const int nDstXWidth = nDstXOff2 - nDstXOff;
1255 :
1256 : /* -------------------------------------------------------------------- */
1257 : /* Allocate buffers. */
1258 : /* -------------------------------------------------------------------- */
1259 2394 : *ppDstBuffer = static_cast<T *>(
1260 2394 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
1261 : GDALGetDataTypeSizeBytes(eWrkDataType)));
1262 2394 : if (*ppDstBuffer == nullptr)
1263 : {
1264 0 : return CE_Failure;
1265 : }
1266 2394 : T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
1267 :
1268 : struct PrecomputedXValue
1269 : {
1270 : int nLeftXOffShifted;
1271 : int nRightXOffShifted;
1272 : double dfLeftWeight;
1273 : double dfRightWeight;
1274 : double dfTotalWeightFullLine;
1275 : };
1276 :
1277 : PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
1278 2394 : VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(PrecomputedXValue)));
1279 :
1280 2394 : if (pasSrcX == nullptr)
1281 : {
1282 0 : return CE_Failure;
1283 : }
1284 :
1285 2394 : std::vector<GDALColorEntry> colorEntries;
1286 :
1287 2394 : if (poColorTable)
1288 : {
1289 5 : int nTransparentIdx = -1;
1290 5 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1291 :
1292 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1293 : // it as nodata value
1294 6 : if (bHasNoData && dfNoDataValue >= 0.0 &&
1295 1 : tNoDataValue < colorEntries.size())
1296 1 : colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1297 :
1298 : // Or if we have no explicit nodata, but a color table entry that is
1299 : // transparent, consider it as the nodata value
1300 4 : else if (!bHasNoData && nTransparentIdx >= 0)
1301 : {
1302 0 : bHasNoData = true;
1303 0 : tNoDataValue = static_cast<T>(nTransparentIdx);
1304 : }
1305 : }
1306 :
1307 : /* ==================================================================== */
1308 : /* Precompute inner loop constants. */
1309 : /* ==================================================================== */
1310 2394 : bool bSrcXSpacingIsTwo = true;
1311 2394 : int nLastSrcXOff2 = -1;
1312 856928 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1313 : {
1314 854534 : const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
1315 : // Apply some epsilon to avoid numerical precision issues
1316 854534 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
1317 854534 : const double dfSrcXOff2 =
1318 854534 : dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
1319 854534 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
1320 :
1321 854534 : if (nSrcXOff < nChunkXOff)
1322 0 : nSrcXOff = nChunkXOff;
1323 854534 : if (nSrcXOff2 == nSrcXOff)
1324 0 : nSrcXOff2++;
1325 854534 : if (nSrcXOff2 > nChunkRightXOff)
1326 1 : nSrcXOff2 = nChunkRightXOff;
1327 :
1328 854534 : pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
1329 854534 : pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
1330 854534 : nSrcXOff2 - nChunkXOff;
1331 21 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
1332 854534 : (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
1333 854534 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
1334 854534 : 1 - (nSrcXOff2 - dfSrcXOff2);
1335 854534 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
1336 854534 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
1337 854534 : if (nSrcXOff + 1 < nSrcXOff2)
1338 : {
1339 854513 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1340 854513 : nSrcXOff2 - nSrcXOff - 2;
1341 854513 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1342 854513 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
1343 : }
1344 :
1345 854534 : if (nSrcXOff2 - nSrcXOff != 2 ||
1346 733041 : (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
1347 : {
1348 120637 : bSrcXSpacingIsTwo = false;
1349 : }
1350 854534 : nLastSrcXOff2 = nSrcXOff2;
1351 : }
1352 :
1353 : /* ==================================================================== */
1354 : /* Loop over destination scanlines. */
1355 : /* ==================================================================== */
1356 722579 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1357 : {
1358 720185 : const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
1359 720185 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
1360 720185 : if (nSrcYOff < nChunkYOff)
1361 0 : nSrcYOff = nChunkYOff;
1362 :
1363 720185 : const double dfSrcYOff2 =
1364 720185 : dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
1365 720185 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
1366 720185 : if (nSrcYOff2 == nSrcYOff)
1367 0 : ++nSrcYOff2;
1368 720185 : if (nSrcYOff2 > nChunkBottomYOff)
1369 3 : nSrcYOff2 = nChunkBottomYOff;
1370 :
1371 720185 : T *const pDstScanline =
1372 720185 : pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
1373 :
1374 : /* --------------------------------------------------------------------
1375 : */
1376 : /* Loop over destination pixels */
1377 : /* --------------------------------------------------------------------
1378 : */
1379 720185 : if (poColorTable == nullptr)
1380 : {
1381 720070 : if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
1382 : pabyChunkNodataMask == nullptr)
1383 : {
1384 : if constexpr (eWrkDataType == GDT_UInt8 ||
1385 : eWrkDataType == GDT_UInt16)
1386 : {
1387 : // Optimized case : no nodata, overview by a factor of 2 and
1388 : // regular x and y src spacing.
1389 117150 : const T *pSrcScanlineShifted =
1390 117150 : pChunk + pasSrcX[0].nLeftXOffShifted +
1391 117150 : static_cast<size_t>(nSrcYOff - nChunkYOff) *
1392 117150 : nChunkXSize;
1393 117150 : int iDstPixel = 0;
1394 : #ifdef USE_SSE2
1395 : if constexpr (eWrkDataType == GDT_UInt8)
1396 : {
1397 : if constexpr (bQuadraticMean)
1398 : {
1399 5389 : iDstPixel = QuadraticMeanByteSSE2OrAVX2(
1400 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1401 : pDstScanline);
1402 : }
1403 : else
1404 : {
1405 111734 : iDstPixel = AverageByteSSE2OrAVX2(
1406 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1407 : pDstScanline);
1408 : }
1409 : }
1410 : else
1411 : {
1412 : static_assert(eWrkDataType == GDT_UInt16);
1413 : if constexpr (bQuadraticMean)
1414 : {
1415 14 : iDstPixel = QuadraticMeanUInt16SSE2(
1416 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1417 : pDstScanline);
1418 : }
1419 : else
1420 : {
1421 13 : iDstPixel = AverageUInt16SSE2(
1422 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1423 : pDstScanline);
1424 : }
1425 : }
1426 : #endif
1427 291609 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1428 : {
1429 174459 : Tsum nTotal = 0;
1430 : T nVal;
1431 : if constexpr (bQuadraticMean)
1432 52 : nTotal =
1433 52 : SQUARE<Tsum>(pSrcScanlineShifted[0]) +
1434 52 : SQUARE<Tsum>(pSrcScanlineShifted[1]) +
1435 52 : SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
1436 52 : SQUARE<Tsum>(
1437 52 : pSrcScanlineShifted[1 + nChunkXSize]);
1438 : else
1439 174407 : nTotal = pSrcScanlineShifted[0] +
1440 174407 : pSrcScanlineShifted[1] +
1441 174407 : pSrcScanlineShifted[nChunkXSize] +
1442 174407 : pSrcScanlineShifted[1 + nChunkXSize];
1443 :
1444 174459 : constexpr int nTotalWeight = 4;
1445 : if constexpr (bQuadraticMean)
1446 52 : nVal = ComputeIntegerRMS_4values<T>(nTotal);
1447 : else
1448 174407 : nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
1449 : nTotalWeight);
1450 :
1451 : // No need to compare nVal against tNoDataValue as we
1452 : // are in a case where pabyChunkNodataMask == nullptr
1453 : // implies the absence of nodata value.
1454 174459 : pDstScanline[iDstPixel] = nVal;
1455 174459 : pSrcScanlineShifted += 2;
1456 : }
1457 : }
1458 : else
1459 : {
1460 : static_assert(eWrkDataType == GDT_Float32 ||
1461 : eWrkDataType == GDT_Float64);
1462 202 : const T *pSrcScanlineShifted =
1463 202 : pChunk + pasSrcX[0].nLeftXOffShifted +
1464 202 : static_cast<size_t>(nSrcYOff - nChunkYOff) *
1465 202 : nChunkXSize;
1466 202 : int iDstPixel = 0;
1467 : #if defined(USE_SSE2) && !defined(ARM_V7)
1468 : if constexpr (eWrkDataType == GDT_Float32)
1469 : {
1470 : static_assert(std::is_same_v<T, float>);
1471 : if constexpr (bQuadraticMean)
1472 : {
1473 66 : iDstPixel = QuadraticMeanFloatSSE2(
1474 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1475 : pDstScanline);
1476 : }
1477 : else
1478 : {
1479 50 : iDstPixel = AverageFloatSSE2(
1480 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1481 : pDstScanline);
1482 : }
1483 : }
1484 : else
1485 : {
1486 : if constexpr (!bQuadraticMean)
1487 : {
1488 50 : iDstPixel = AverageDoubleSSE2(
1489 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1490 : pDstScanline);
1491 : }
1492 : }
1493 : #endif
1494 :
1495 726 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1496 : {
1497 : T nVal;
1498 :
1499 : if constexpr (bQuadraticMean)
1500 : {
1501 : // Avoid issues with large values by renormalizing
1502 96 : const auto max = std::max(
1503 420 : {std::fabs(pSrcScanlineShifted[0]),
1504 420 : std::fabs(pSrcScanlineShifted[1]),
1505 420 : std::fabs(pSrcScanlineShifted[nChunkXSize]),
1506 420 : std::fabs(
1507 420 : pSrcScanlineShifted[1 + nChunkXSize])});
1508 420 : if (max == 0)
1509 : {
1510 8 : nVal = 0;
1511 : }
1512 412 : else if (std::isinf(max))
1513 : {
1514 : // If there is at least one infinity value,
1515 : // then just summing, and taking the abs
1516 : // value will give the expected result:
1517 : // * +inf if all values are +inf
1518 : // * +inf if all values are -inf
1519 : // * NaN otherwise
1520 82 : nVal = std::fabs(
1521 82 : pSrcScanlineShifted[0] +
1522 82 : pSrcScanlineShifted[1] +
1523 82 : pSrcScanlineShifted[nChunkXSize] +
1524 82 : pSrcScanlineShifted[1 + nChunkXSize]);
1525 : }
1526 : else
1527 : {
1528 330 : const auto inv_max = static_cast<T>(1.0) / max;
1529 330 : nVal =
1530 : max *
1531 330 : std::sqrt(
1532 : static_cast<T>(0.25) *
1533 330 : (SQUARE(pSrcScanlineShifted[0] *
1534 330 : inv_max) +
1535 330 : SQUARE(pSrcScanlineShifted[1] *
1536 330 : inv_max) +
1537 330 : SQUARE(
1538 330 : pSrcScanlineShifted[nChunkXSize] *
1539 330 : inv_max) +
1540 330 : SQUARE(
1541 330 : pSrcScanlineShifted[1 +
1542 : nChunkXSize] *
1543 : inv_max)));
1544 : }
1545 : }
1546 : else
1547 : {
1548 104 : constexpr auto weight = static_cast<T>(0.25);
1549 : // Multiply each value by weight to avoid
1550 : // potential overflow
1551 104 : nVal =
1552 104 : (weight * pSrcScanlineShifted[0] +
1553 104 : weight * pSrcScanlineShifted[1] +
1554 104 : weight * pSrcScanlineShifted[nChunkXSize] +
1555 104 : weight * pSrcScanlineShifted[1 + nChunkXSize]);
1556 : }
1557 :
1558 : // No need to compare nVal against tNoDataValue as we
1559 : // are in a case where pabyChunkNodataMask == nullptr
1560 : // implies the absence of nodata value.
1561 524 : pDstScanline[iDstPixel] = nVal;
1562 524 : pSrcScanlineShifted += 2;
1563 : }
1564 117352 : }
1565 : }
1566 : else
1567 : {
1568 17 : const double dfBottomWeight =
1569 602718 : (nSrcYOff + 1 == nSrcYOff2) ? 1.0
1570 602701 : : 1.0 - (dfSrcYOff - nSrcYOff);
1571 602718 : const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
1572 602718 : nSrcYOff -= nChunkYOff;
1573 602718 : nSrcYOff2 -= nChunkYOff;
1574 :
1575 602718 : double dfTotalWeightFullColumn = dfBottomWeight;
1576 602718 : if (nSrcYOff + 1 < nSrcYOff2)
1577 : {
1578 602701 : dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
1579 602701 : dfTotalWeightFullColumn += dfTopWeight;
1580 : }
1581 :
1582 18759673 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1583 : {
1584 18156933 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1585 18156933 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1586 :
1587 18156933 : double dfTotal = 0;
1588 18156933 : double dfTotalWeight = 0;
1589 18156933 : [[maybe_unused]] double dfMulFactor = 1.0;
1590 18156933 : [[maybe_unused]] double dfInvMulFactor = 1.0;
1591 18156933 : constexpr bool bUseMulFactor =
1592 : (eWrkDataType == GDT_Float32 ||
1593 : eWrkDataType == GDT_Float64);
1594 18156933 : if (pabyChunkNodataMask == nullptr)
1595 : {
1596 : if constexpr (bUseMulFactor)
1597 : {
1598 : if constexpr (bQuadraticMean)
1599 : {
1600 80 : T mulFactor = 0;
1601 80 : auto pChunkShifted =
1602 80 : pChunk +
1603 80 : static_cast<size_t>(nSrcYOff) * nChunkXSize;
1604 :
1605 240 : for (int iY = nSrcYOff; iY < nSrcYOff2;
1606 160 : ++iY, pChunkShifted += nChunkXSize)
1607 : {
1608 480 : for (int iX = nSrcXOff; iX < nSrcXOff2;
1609 : ++iX)
1610 640 : mulFactor = std::max(
1611 : mulFactor,
1612 320 : std::fabs(pChunkShifted[iX]));
1613 : }
1614 80 : dfMulFactor = double(mulFactor);
1615 142 : dfInvMulFactor =
1616 62 : dfMulFactor > 0 &&
1617 62 : std::isfinite(dfMulFactor)
1618 : ? 1.0 / dfMulFactor
1619 : : 1.0;
1620 : }
1621 : else
1622 : {
1623 139 : dfMulFactor = (nSrcYOff2 - nSrcYOff) *
1624 139 : (nSrcXOff2 - nSrcXOff);
1625 139 : dfInvMulFactor = 1.0 / dfMulFactor;
1626 : }
1627 : }
1628 :
1629 1746545 : auto pChunkShifted =
1630 227 : pChunk +
1631 1746545 : static_cast<size_t>(nSrcYOff) * nChunkXSize;
1632 1746545 : int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1633 1746545 : double dfWeightY = dfBottomWeight;
1634 3493539 : while (true)
1635 : {
1636 : double dfTotalLine;
1637 : if constexpr (bQuadraticMean)
1638 : {
1639 : // Left pixel
1640 : {
1641 216 : const T val = pChunkShifted[nSrcXOff];
1642 216 : dfTotalLine =
1643 216 : SQUARE(double(val) * dfInvMulFactor) *
1644 216 : pasSrcX[iDstPixel].dfLeftWeight;
1645 : }
1646 :
1647 216 : if (nSrcXOff + 1 < nSrcXOff2)
1648 : {
1649 : // Middle pixels
1650 216 : for (int iX = nSrcXOff + 1;
1651 536 : iX < nSrcXOff2 - 1; ++iX)
1652 : {
1653 320 : const T val = pChunkShifted[iX];
1654 320 : dfTotalLine += SQUARE(double(val) *
1655 : dfInvMulFactor);
1656 : }
1657 :
1658 : // Right pixel
1659 : {
1660 216 : const T val =
1661 216 : pChunkShifted[nSrcXOff2 - 1];
1662 216 : dfTotalLine +=
1663 216 : SQUARE(double(val) *
1664 216 : dfInvMulFactor) *
1665 216 : pasSrcX[iDstPixel].dfRightWeight;
1666 : }
1667 : }
1668 : }
1669 : else
1670 : {
1671 : // Left pixel
1672 : {
1673 5239868 : const T val = pChunkShifted[nSrcXOff];
1674 5239868 : dfTotalLine =
1675 5239868 : double(val) * dfInvMulFactor *
1676 5239868 : pasSrcX[iDstPixel].dfLeftWeight;
1677 : }
1678 :
1679 5239868 : if (nSrcXOff + 1 < nSrcXOff2)
1680 : {
1681 : // Middle pixels
1682 4239442 : for (int iX = nSrcXOff + 1;
1683 64183238 : iX < nSrcXOff2 - 1; ++iX)
1684 : {
1685 59943836 : const T val = pChunkShifted[iX];
1686 59943836 : dfTotalLine +=
1687 59943836 : double(val) * dfInvMulFactor;
1688 : }
1689 :
1690 : // Right pixel
1691 : {
1692 4239442 : const T val =
1693 4239442 : pChunkShifted[nSrcXOff2 - 1];
1694 4239442 : dfTotalLine +=
1695 4239442 : double(val) * dfInvMulFactor *
1696 4239442 : pasSrcX[iDstPixel].dfRightWeight;
1697 : }
1698 : }
1699 : }
1700 :
1701 5240084 : dfTotal += dfTotalLine * dfWeightY;
1702 5240084 : --nCounterY;
1703 5240084 : if (nCounterY < 0)
1704 1746545 : break;
1705 3493539 : pChunkShifted += nChunkXSize;
1706 3493539 : dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
1707 : }
1708 :
1709 1746545 : dfTotalWeight =
1710 1746545 : pasSrcX[iDstPixel].dfTotalWeightFullLine *
1711 : dfTotalWeightFullColumn;
1712 : }
1713 : else
1714 : {
1715 16410398 : size_t nCount = 0;
1716 71788794 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1717 : {
1718 55378396 : const auto pChunkShifted =
1719 55378396 : pChunk + static_cast<size_t>(iY) * nChunkXSize;
1720 :
1721 55378396 : double dfTotalLine = 0;
1722 55378396 : double dfTotalWeightLine = 0;
1723 : // Left pixel
1724 : {
1725 55378396 : const int iX = nSrcXOff;
1726 55378396 : const T val = pChunkShifted[iX];
1727 55378396 : if (pabyChunkNodataMask
1728 55378396 : [iX +
1729 55378396 : static_cast<size_t>(iY) * nChunkXSize])
1730 : {
1731 23518643 : nCount++;
1732 23518643 : const double dfWeightX =
1733 23518643 : pasSrcX[iDstPixel].dfLeftWeight;
1734 23518643 : dfTotalWeightLine = dfWeightX;
1735 : if constexpr (bQuadraticMean)
1736 60 : dfTotalLine =
1737 60 : SQUARE(double(val)) * dfWeightX;
1738 : else
1739 23518583 : dfTotalLine = double(val) * dfWeightX;
1740 : }
1741 : }
1742 :
1743 55378396 : if (nSrcXOff < nSrcXOff2 - 1)
1744 : {
1745 : // Middle pixels
1746 152899196 : for (int iX = nSrcXOff + 1; iX < nSrcXOff2 - 1;
1747 : ++iX)
1748 : {
1749 97521100 : const T val = pChunkShifted[iX];
1750 97521100 : if (pabyChunkNodataMask
1751 97521100 : [iX + static_cast<size_t>(iY) *
1752 97521100 : nChunkXSize])
1753 : {
1754 39728400 : nCount++;
1755 39728400 : dfTotalWeightLine += 1;
1756 : if constexpr (bQuadraticMean)
1757 0 : dfTotalLine += SQUARE(double(val));
1758 : else
1759 39728400 : dfTotalLine += double(val);
1760 : }
1761 : }
1762 :
1763 : // Right pixel
1764 : {
1765 55378396 : const int iX = nSrcXOff2 - 1;
1766 55378396 : const T val = pChunkShifted[iX];
1767 55378396 : if (pabyChunkNodataMask
1768 55378396 : [iX + static_cast<size_t>(iY) *
1769 55378396 : nChunkXSize])
1770 : {
1771 23517911 : nCount++;
1772 23517911 : const double dfWeightX =
1773 23517911 : pasSrcX[iDstPixel].dfRightWeight;
1774 23517911 : dfTotalWeightLine += dfWeightX;
1775 : if constexpr (bQuadraticMean)
1776 61 : dfTotalLine +=
1777 61 : SQUARE(double(val)) * dfWeightX;
1778 : else
1779 23517850 : dfTotalLine +=
1780 23517850 : double(val) * dfWeightX;
1781 : }
1782 : }
1783 : }
1784 :
1785 94346294 : const double dfWeightY =
1786 : (iY == nSrcYOff) ? dfBottomWeight
1787 38967998 : : (iY + 1 == nSrcYOff2) ? dfTopWeight
1788 : : 1.0;
1789 55378396 : dfTotal += dfTotalLine * dfWeightY;
1790 55378396 : dfTotalWeight += dfTotalWeightLine * dfWeightY;
1791 : }
1792 :
1793 16410398 : if (nCount == 0 ||
1794 8 : (bPropagateNoData &&
1795 : nCount <
1796 8 : static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
1797 8 : (nSrcXOff2 - nSrcXOff)))
1798 : {
1799 9609422 : pDstScanline[iDstPixel] = tNoDataValue;
1800 9609422 : continue;
1801 : }
1802 : }
1803 : if constexpr (eWrkDataType == GDT_UInt8)
1804 : {
1805 : T nVal;
1806 : if constexpr (bQuadraticMean)
1807 38 : nVal = ComputeIntegerRMS<T, int>(dfTotal,
1808 : dfTotalWeight);
1809 : else
1810 8547230 : nVal =
1811 8547230 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1812 8547268 : if (bHasNoData && nVal == tNoDataValue)
1813 0 : nVal = tReplacementVal;
1814 8547268 : pDstScanline[iDstPixel] = nVal;
1815 : }
1816 : else if constexpr (eWrkDataType == GDT_UInt16)
1817 : {
1818 : T nVal;
1819 : if constexpr (bQuadraticMean)
1820 4 : nVal = ComputeIntegerRMS<T, uint64_t>(
1821 : dfTotal, dfTotalWeight);
1822 : else
1823 4 : nVal =
1824 4 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1825 8 : if (bHasNoData && nVal == tNoDataValue)
1826 0 : nVal = tReplacementVal;
1827 8 : pDstScanline[iDstPixel] = nVal;
1828 : }
1829 : else
1830 : {
1831 : T nVal;
1832 : if constexpr (bQuadraticMean)
1833 : {
1834 : if constexpr (bUseMulFactor)
1835 81 : nVal = static_cast<T>(
1836 48 : dfMulFactor *
1837 81 : sqrt(dfTotal / dfTotalWeight));
1838 : else
1839 : nVal = static_cast<T>(
1840 : sqrt(dfTotal / dfTotalWeight));
1841 : }
1842 : else
1843 : {
1844 : if constexpr (bUseMulFactor)
1845 184 : nVal = static_cast<T>(
1846 184 : dfMulFactor * (dfTotal / dfTotalWeight));
1847 : else
1848 : nVal = static_cast<T>(dfTotal / dfTotalWeight);
1849 : }
1850 265 : if (bHasNoData && nVal == tNoDataValue)
1851 2 : nVal = tReplacementVal;
1852 265 : pDstScanline[iDstPixel] = nVal;
1853 : }
1854 : }
1855 : }
1856 : }
1857 : else
1858 : {
1859 115 : nSrcYOff -= nChunkYOff;
1860 115 : nSrcYOff2 -= nChunkYOff;
1861 :
1862 6590 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1863 : {
1864 6475 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1865 6475 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1866 :
1867 6475 : uint64_t nTotalR = 0;
1868 6475 : uint64_t nTotalG = 0;
1869 6475 : uint64_t nTotalB = 0;
1870 6475 : size_t nCount = 0;
1871 :
1872 19425 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1873 : {
1874 38850 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
1875 : {
1876 25900 : const T val =
1877 25900 : pChunk[iX + static_cast<size_t>(iY) * nChunkXSize];
1878 : // cppcheck-suppress unsignedLessThanZero
1879 25900 : if (val < 0 || val >= colorEntries.size())
1880 0 : continue;
1881 25900 : const size_t idx = static_cast<size_t>(val);
1882 25900 : const auto &entry = colorEntries[idx];
1883 25900 : if (entry.c4)
1884 : {
1885 : if constexpr (bQuadraticMean)
1886 : {
1887 800 : nTotalR += SQUARE<int>(entry.c1);
1888 800 : nTotalG += SQUARE<int>(entry.c2);
1889 800 : nTotalB += SQUARE<int>(entry.c3);
1890 800 : ++nCount;
1891 : }
1892 : else
1893 : {
1894 13328 : nTotalR += entry.c1;
1895 13328 : nTotalG += entry.c2;
1896 13328 : nTotalB += entry.c3;
1897 13328 : ++nCount;
1898 : }
1899 : }
1900 : }
1901 : }
1902 :
1903 6475 : if (nCount == 0 ||
1904 0 : (bPropagateNoData &&
1905 0 : nCount < static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
1906 0 : (nSrcXOff2 - nSrcXOff)))
1907 : {
1908 2838 : pDstScanline[iDstPixel] = tNoDataValue;
1909 : }
1910 : else
1911 : {
1912 : GDALColorEntry color;
1913 : if constexpr (bQuadraticMean)
1914 : {
1915 200 : color.c1 =
1916 200 : static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
1917 200 : color.c2 =
1918 200 : static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
1919 200 : color.c3 =
1920 200 : static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
1921 : }
1922 : else
1923 : {
1924 3437 : color.c1 =
1925 3437 : static_cast<short>((nTotalR + nCount / 2) / nCount);
1926 3437 : color.c2 =
1927 3437 : static_cast<short>((nTotalG + nCount / 2) / nCount);
1928 3437 : color.c3 =
1929 3437 : static_cast<short>((nTotalB + nCount / 2) / nCount);
1930 : }
1931 3637 : pDstScanline[iDstPixel] =
1932 3637 : static_cast<T>(BestColorEntry(colorEntries, color));
1933 : }
1934 : }
1935 : }
1936 : }
1937 :
1938 2394 : CPLFree(pasSrcX);
1939 :
1940 2394 : return CE_None;
1941 : }
1942 :
// Dispatches average / RMS resampling of one chunk to the typed
// implementation GDALResampleChunk_AverageOrRMS_T<> selected by
// args.eWrkDataType.
//
// bQuadraticMean is forwarded unchanged as the last template argument of
// the typed implementation.
//
// args                 resampling parameters; only eWrkDataType is read here.
// pChunk               source samples, reinterpreted as GByte, GUInt16, float
//                      or double according to args.eWrkDataType.
// ppDstBuffer          forwarded as-is to the typed implementation.
// peDstBufferDataType  always set to args.eWrkDataType, including on the
//                      failure path.
//
// Returns whatever the typed implementation returns. For any working data
// type other than UInt8, UInt16, Float32 or Float64 it hits CPLAssert(false)
// and returns CE_Failure.
1943 : template <bool bQuadraticMean>
1944 : static CPLErr
1945 2394 : GDALResampleChunk_AverageOrRMSInternal(const GDALOverviewResampleArgs &args,
1946 : const void *pChunk, void **ppDstBuffer,
1947 : GDALDataType *peDstBufferDataType)
1948 : {
1949 2394 : *peDstBufferDataType = args.eWrkDataType;
1950 2394 : switch (args.eWrkDataType)
1951 : {
1952 2263 : case GDT_UInt8:
1953 : {
// 8-bit samples: second template argument is int.
1954 : return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_UInt8,
1955 2263 : bQuadraticMean>(
1956 2263 : args, static_cast<const GByte *>(pChunk), ppDstBuffer);
1957 : }
1958 :
1959 11 : case GDT_UInt16:
1960 : {
// The second template argument differs between RMS and plain average:
// double for RMS (see the comment below), GUInt32 otherwise.
1961 : if constexpr (bQuadraticMean)
1962 : {
1963 : // Use double as accumulation type, because UInt32 could overflow
1964 : return GDALResampleChunk_AverageOrRMS_T<
1965 6 : GUInt16, double, GDT_UInt16, bQuadraticMean>(
1966 6 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1967 : }
1968 : else
1969 : {
1970 : return GDALResampleChunk_AverageOrRMS_T<
1971 5 : GUInt16, GUInt32, GDT_UInt16, bQuadraticMean>(
1972 5 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1973 : }
1974 : }
1975 :
1976 73 : case GDT_Float32:
1977 : {
1978 : return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32,
1979 73 : bQuadraticMean>(
1980 73 : args, static_cast<const float *>(pChunk), ppDstBuffer);
1981 : }
1982 :
1983 47 : case GDT_Float64:
1984 : {
1985 : return GDALResampleChunk_AverageOrRMS_T<double, double, GDT_Float64,
1986 47 : bQuadraticMean>(
1987 47 : args, static_cast<const double *>(pChunk), ppDstBuffer);
1988 : }
1989 :
1990 0 : default:
1991 0 : break;
1992 : }
1993 :
// Only reached for a working data type not handled above.
1994 0 : CPLAssert(false);
1995 : return CE_Failure;
1996 : }
1997 :
// Entry point for the average and RMS resamplers.
//
// Selects the quadratic-mean (RMS) instantiation of
// GDALResampleChunk_AverageOrRMSInternal<> when args.pszResampling matches
// RMS under EQUAL(), and the plain-average instantiation for every other
// resampling name. All arguments are forwarded unchanged and the result of
// the selected instantiation is returned.
1998 : static CPLErr
1999 2394 : GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
2000 : const void *pChunk, void **ppDstBuffer,
2001 : GDALDataType *peDstBufferDataType)
2002 : {
2003 2394 : if (EQUAL(args.pszResampling, "RMS"))
2004 77 : return GDALResampleChunk_AverageOrRMSInternal<true>(
2005 77 : args, pChunk, ppDstBuffer, peDstBufferDataType);
2006 : else
2007 2317 : return GDALResampleChunk_AverageOrRMSInternal<false>(
2008 2317 : args, pChunk, ppDstBuffer, peDstBufferDataType);
2009 : }
2010 :
2011 : /************************************************************************/
2012 : /* GDALResampleChunk_Gauss() */
2013 : /************************************************************************/
2014 :
// Resamples one chunk with a fixed integer Gaussian-like kernel.
//
// args                 resampling parameters (ratios, chunk and destination
//                      windows, nodata, optional color table).
// pChunk               source samples; read as an array of double.
// ppDstBuffer          receives a newly allocated buffer of
//                      (nDstXOff2 - nDstXOff) x (nDstYOff2 - nDstYOff) doubles.
//                      Left null when the allocation fails.
// peDstBufferDataType  set to GDT_Float64 once the allocation succeeded.
//
// Returns CE_Failure if the destination buffer cannot be allocated, CE_None
// otherwise.
//
// For each destination pixel a kernel-sized source window is centred on the
// matching source footprint and clipped to the chunk. Without a color table
// the output is the weighted mean of the valid samples (or dfNoDataValue if
// none is valid). With a color table the weighted mean of the R, G and B
// components of the non-transparent entries is mapped back to a palette index
// through BestColorEntry().
2015 86 : static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
2016 : const void *pChunk, void **ppDstBuffer,
2017 : GDALDataType *peDstBufferDataType)
2018 :
2019 : {
2020 86 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2021 86 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2022 86 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2023 86 : const int nChunkXOff = args.nChunkXOff;
2024 86 : const int nChunkXSize = args.nChunkXSize;
2025 86 : const int nChunkYOff = args.nChunkYOff;
2026 86 : const int nChunkYSize = args.nChunkYSize;
2027 86 : const int nDstXOff = args.nDstXOff;
2028 86 : const int nDstXOff2 = args.nDstXOff2;
2029 86 : const int nDstYOff = args.nDstYOff;
2030 86 : const int nDstYOff2 = args.nDstYOff2;
2031 86 : const bool bHasNoData = args.bHasNoData;
2032 86 : double dfNoDataValue = args.dfNoDataValue;
2033 86 : const GDALColorTable *poColorTable = args.poColorTable;
2034 :
2035 86 : const double *const padfChunk = static_cast<const double *>(pChunk);
2036 :
2037 86 : *ppDstBuffer =
2038 86 : VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
2039 : GDALGetDataTypeSizeBytes(GDT_Float64));
2040 86 : if (*ppDstBuffer == nullptr)
2041 : {
2042 0 : return CE_Failure;
2043 : }
2044 86 : *peDstBufferDataType = GDT_Float64;
2045 86 : double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
2046 :
2047 : /* -------------------------------------------------------------------- */
2048 : /* Create the filter kernel and allocate scanline buffer. */
2049 : /* -------------------------------------------------------------------- */
// The three tables below are row-major square kernels of size 3, 5 and 7.
2050 86 : int nGaussMatrixDim = 3;
2051 : const int *panGaussMatrix;
2052 86 : constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
2053 86 : constexpr int anGaussMatrix5x5[] = {1, 4, 6, 4, 1, 4, 16, 24, 16,
2054 : 4, 6, 24, 36, 24, 6, 4, 16, 24,
2055 : 16, 4, 1, 4, 6, 4, 1};
2056 86 : constexpr int anGaussMatrix7x7[] = {
2057 : 1, 6, 15, 20, 15, 6, 1, 6, 36, 90, 120, 90, 36,
2058 : 6, 15, 90, 225, 300, 225, 90, 15, 20, 120, 300, 400, 300,
2059 : 120, 20, 15, 90, 225, 300, 225, 90, 15, 6, 36, 90, 120,
2060 : 90, 36, 6, 1, 6, 15, 20, 15, 6, 1};
2061 :
2062 86 : const int nOXSize = args.nOvrXSize;
2063 86 : const int nOYSize = args.nOvrYSize;
// NOTE(review): the kernel size is derived from the Y ratio only; the X
// ratio does not take part in the choice. Confirm this is intended for
// anisotropic downsampling.
2064 86 : const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
2065 :
2066 : // matrix for gauss filter
2067 86 : if (nResYFactor <= 2)
2068 : {
2069 85 : panGaussMatrix = anGaussMatrix3x3;
2070 85 : nGaussMatrixDim = 3;
2071 : }
2072 1 : else if (nResYFactor <= 4)
2073 : {
2074 0 : panGaussMatrix = anGaussMatrix5x5;
2075 0 : nGaussMatrixDim = 5;
2076 : }
2077 : else
2078 : {
2079 1 : panGaussMatrix = anGaussMatrix7x7;
2080 1 : nGaussMatrixDim = 7;
2081 : }
2082 :
// Debug aid: work on a heap copy of the selected kernel instead of the
// constexpr table (freed at the end of the function).
2083 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
2084 : int *panGaussMatrixDup = static_cast<int *>(
2085 : CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
2086 : memcpy(panGaussMatrixDup, panGaussMatrix,
2087 : sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
2088 : panGaussMatrix = panGaussMatrixDup;
2089 : #endif
2090 :
// Without an explicit nodata value, 0 is written where no sample is valid.
2091 86 : if (!bHasNoData)
2092 79 : dfNoDataValue = 0.0;
2093 :
2094 86 : std::vector<GDALColorEntry> colorEntries;
2095 86 : int nTransparentIdx = -1;
2096 86 : if (poColorTable)
2097 2 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
2098 :
// NOTE(review): the comment below names GDALFindBestEntry(), but this
// function calls BestColorEntry() further down. Confirm which helper the
// comment should refer to.
2099 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
2100 : // it as nodata value.
2101 92 : if (bHasNoData && dfNoDataValue >= 0.0 &&
2102 6 : dfNoDataValue < colorEntries.size())
2103 0 : colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
2104 :
2105 : // Or if we have no explicit nodata, but a color table entry that is
2106 : // transparent, consider it as the nodata value.
2107 86 : else if (!bHasNoData && nTransparentIdx >= 0)
2108 : {
2109 0 : dfNoDataValue = nTransparentIdx;
2110 : }
2111 :
2112 86 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2113 86 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2114 86 : const int nDstXWidth = nDstXOff2 - nDstXOff;
2115 :
2116 : /* ==================================================================== */
2117 : /* Loop over destination scanlines. */
2118 : /* ==================================================================== */
2119 16488 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2120 : {
// Source row footprint of this destination line, rounded to nearest.
2121 16402 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
2122 16402 : int nSrcYOff2 =
2123 16402 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
2124 :
2125 16402 : if (nSrcYOff < nChunkYOff)
2126 : {
2127 0 : nSrcYOff = nChunkYOff;
2128 0 : nSrcYOff2++;
2129 : }
2130 :
// Replace the footprint by a window of nGaussMatrixDim rows centred on it.
2131 16402 : const int iSizeY = nSrcYOff2 - nSrcYOff;
2132 16402 : nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
2133 16402 : nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
2134 :
// Clip the bottom of the window to the chunk.
2135 16402 : if (nSrcYOff2 > nChunkBottomYOff ||
2136 16359 : (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
2137 : {
2138 44 : nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
2139 : }
2140 :
// Clip the top of the window to the chunk and remember how many kernel rows
// were skipped, so the remaining rows keep their original weights.
2141 16402 : int nYShiftGaussMatrix = 0;
2142 16402 : if (nSrcYOff < nChunkYOff)
2143 : {
2144 0 : nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
2145 0 : nSrcYOff = nChunkYOff;
2146 : }
2147 :
// NOTE(review): the row offsets below are multiplied in int, whereas the
// inner loops cast to GPtrDiff_t before multiplying by nChunkXSize. Confirm
// that chunk dimensions cannot make these int products overflow.
2148 16402 : const double *const padfSrcScanline =
2149 16402 : padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
2150 16402 : const GByte *pabySrcScanlineNodataMask = nullptr;
2151 16402 : if (pabyChunkNodataMask != nullptr)
2152 152 : pabySrcScanlineNodataMask =
2153 152 : pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
2154 :
2155 : /* --------------------------------------------------------------------
2156 : */
2157 : /* Loop over destination pixels */
2158 : /* --------------------------------------------------------------------
2159 : */
2160 16402 : double *const padfDstScanline =
2161 16402 : padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
2162 4149980 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2163 : {
// Same window construction as for rows, applied to columns.
2164 4133580 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
2165 4133580 : int nSrcXOff2 =
2166 4133580 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
2167 :
2168 4133580 : if (nSrcXOff < nChunkXOff)
2169 : {
2170 0 : nSrcXOff = nChunkXOff;
2171 0 : nSrcXOff2++;
2172 : }
2173 :
2174 4133580 : const int iSizeX = nSrcXOff2 - nSrcXOff;
2175 4133580 : nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
2176 4133580 : nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
2177 :
2178 4133580 : if (nSrcXOff2 > nChunkRightXOff ||
2179 4127930 : (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
2180 : {
2181 5650 : nSrcXOff2 =
2182 5650 : std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
2183 : }
2184 :
2185 4133580 : int nXShiftGaussMatrix = 0;
2186 4133580 : if (nSrcXOff < nChunkXOff)
2187 : {
2188 0 : nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
2189 0 : nSrcXOff = nChunkXOff;
2190 : }
2191 :
2192 4133580 : if (poColorTable == nullptr)
2193 : {
// Non-paletted case: weighted mean of the samples flagged valid by the
// nodata mask (all samples when there is no mask). nCount accumulates the
// sum of the kernel weights used, not a number of pixels.
2194 4133380 : double dfTotal = 0.0;
2195 4133380 : GInt64 nCount = 0;
// First kernel weight that applies to the top-left pixel of the clipped
// window.
2196 4133380 : const int *panLineWeight =
2197 4133380 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
2198 : nXShiftGaussMatrix;
2199 :
// NOTE(review): j is incremented but never read in this loop.
2200 16527900 : for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
2201 12394500 : ++iY, ++j, panLineWeight += nGaussMatrixDim)
2202 : {
2203 49561300 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2204 : {
2205 37166800 : const double val =
2206 37166800 : padfSrcScanline[iX - nChunkXOff +
2207 37166800 : static_cast<GPtrDiff_t>(iY -
2208 37166800 : nSrcYOff) *
2209 37166800 : nChunkXSize];
2210 37166800 : if (pabySrcScanlineNodataMask == nullptr ||
2211 32872 : pabySrcScanlineNodataMask[iX - nChunkXOff +
2212 32872 : static_cast<GPtrDiff_t>(
2213 32872 : iY - nSrcYOff) *
2214 32872 : nChunkXSize])
2215 : {
2216 37146100 : const int nWeight = panLineWeight[i];
2217 37146100 : dfTotal += val * nWeight;
2218 37146100 : nCount += nWeight;
2219 : }
2220 : }
2221 : }
2222 :
2223 4133380 : if (nCount == 0)
2224 : {
2225 2217 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2226 : }
2227 : else
2228 : {
2229 4131160 : padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
2230 : }
2231 : }
2232 : else
2233 : {
// Paletted case: source values are palette indices. Indices outside
// colorEntries and entries whose c4 is 0 are ignored. The nodata mask is not
// consulted in this branch.
2234 200 : GInt64 nTotalR = 0;
2235 200 : GInt64 nTotalG = 0;
2236 200 : GInt64 nTotalB = 0;
2237 200 : GInt64 nTotalWeight = 0;
2238 200 : const int *panLineWeight =
2239 200 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
2240 : nXShiftGaussMatrix;
2241 :
// NOTE(review): j is incremented but never read in this loop either.
2242 780 : for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
2243 580 : ++iY, ++j, panLineWeight += nGaussMatrixDim)
2244 : {
2245 2262 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2246 : {
2247 1682 : const double val =
2248 1682 : padfSrcScanline[iX - nChunkXOff +
2249 1682 : static_cast<GPtrDiff_t>(iY -
2250 1682 : nSrcYOff) *
2251 1682 : nChunkXSize];
2252 1682 : if (val < 0 || val >= colorEntries.size())
2253 0 : continue;
2254 :
2255 1682 : size_t idx = static_cast<size_t>(val);
2256 1682 : if (colorEntries[idx].c4)
2257 : {
2258 1682 : const int nWeight = panLineWeight[i];
2259 1682 : nTotalR +=
2260 1682 : static_cast<GInt64>(colorEntries[idx].c1) *
2261 1682 : nWeight;
2262 1682 : nTotalG +=
2263 1682 : static_cast<GInt64>(colorEntries[idx].c2) *
2264 1682 : nWeight;
2265 1682 : nTotalB +=
2266 1682 : static_cast<GInt64>(colorEntries[idx].c3) *
2267 1682 : nWeight;
2268 1682 : nTotalWeight += nWeight;
2269 : }
2270 : }
2271 : }
2272 :
2273 200 : if (nTotalWeight == 0)
2274 : {
2275 0 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2276 : }
2277 : else
2278 : {
2279 : GDALColorEntry color;
2280 :
// Rounded integer division; only c1, c2 and c3 of color are assigned before
// the lookup.
2281 200 : color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
2282 : nTotalWeight);
2283 200 : color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
2284 : nTotalWeight);
2285 200 : color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
2286 : nTotalWeight);
2287 200 : padfDstScanline[iDstPixel - nDstXOff] =
2288 200 : BestColorEntry(colorEntries, color);
2289 : }
2290 : }
2291 : }
2292 : }
2293 :
2294 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
2295 : CPLFree(panGaussMatrixDup);
2296 : #endif
2297 :
2298 86 : return CE_None;
2299 : }
2300 :
2301 : /************************************************************************/
2302 : /* GDALResampleChunk_Mode() */
2303 : /************************************************************************/
2304 :
// Equality predicate used by the mode resampler to decide whether two
// samples hold the same value. Generic version: plain operator==.
2305 688 : template <class T> static inline bool IsSame(T a, T b)
2306 : {
2307 688 : return a == b;
2308 : }
2309 :
// GFloat16 specialization: in addition to operator==, two NaN values are
// reported as the same.
2310 60 : template <> bool IsSame<GFloat16>(GFloat16 a, GFloat16 b)
2311 : {
2312 60 : return a == b || (CPLIsNan(a) && CPLIsNan(b));
2313 : }
2314 :
// float specialization: in addition to operator==, two NaN values are
// reported as the same.
2315 4902 : template <> bool IsSame<float>(float a, float b)
2316 : {
2317 4902 : return a == b || (std::isnan(a) && std::isnan(b));
2318 : }
2319 :
// double specialization: in addition to operator==, two NaN values are
// reported as the same.
2320 1020 : template <> bool IsSame<double>(double a, double b)
2321 : {
2322 1020 : return a == b || (std::isnan(a) && std::isnan(b));
2323 : }
2324 :
2325 : namespace
2326 : {
// File-local pair of GFloat16 values used as the sample type for mode
// resampling. Presumably r and i are the real and imaginary parts of a
// complex sample; confirm against the caller that instantiates it.
2327 : struct ComplexFloat16
2328 : {
2329 : GFloat16 r;
2330 : GFloat16 i;
2331 : };
2332 : } // namespace
2333 :
// ComplexFloat16 specialization: same when both members compare equal, or
// when all four members of a and b are NaN. A value with NaN in only one
// member is therefore never the same as anything, itself included.
2334 60 : template <> bool IsSame<ComplexFloat16>(ComplexFloat16 a, ComplexFloat16 b)
2335 : {
2336 90 : return (a.r == b.r && a.i == b.i) ||
2337 90 : (CPLIsNan(a.r) && CPLIsNan(a.i) && CPLIsNan(b.r) && CPLIsNan(b.i));
2338 : }
2339 :
// std::complex<float> specialization: same when operator== holds, or when
// the real and imaginary parts of both a and b are all NaN.
2340 : template <>
2341 60 : bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
2342 : {
2343 120 : return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
2344 120 : std::isnan(b.real()) && std::isnan(b.imag()));
2345 : }
2346 :
// std::complex<double> specialization: same when operator== holds, or when
// the real and imaginary parts of both a and b are all NaN.
2347 : template <>
2348 60 : bool IsSame<std::complex<double>>(std::complex<double> a,
2349 : std::complex<double> b)
2350 : {
2351 120 : return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
2352 120 : std::isnan(b.real()) && std::isnan(b.imag()));
2353 : }
2354 :
// Mode (most frequent value) resampling of one chunk for sample type T.
//
// args        resampling parameters (ratios, source deltas, chunk and
//             destination windows, nodata, optional color table).
// pChunk      source samples of type T covering the chunk.
// pDstBuffer  caller-provided output, written as
//             (nDstYOff2 - nDstYOff) rows of (nDstXOff2 - nDstXOff) values.
//
// Returns CE_None on success. Returns CE_Failure when the source window of a
// destination pixel is too large for the counting buffers, or when those
// buffers cannot be (re)allocated.
//
// Two strategies are used:
//  - T other than GByte, or a color table with more than 256 entries: the
//    distinct values of the window are collected in paVals / panCounts with a
//    linear search, using IsSame() for comparison.
//  - GByte otherwise: a 256-bin histogram (anVals).
// In both cases a value replaces the current mode only when its count becomes
// strictly greater, so on ties the value that reached the count first wins.
2355 : template <class T>
2356 176 : static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
2357 : const T *pChunk, T *const pDstBuffer)
2358 :
2359 : {
2360 176 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2361 176 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2362 176 : const double dfSrcXDelta = args.dfSrcXDelta;
2363 176 : const double dfSrcYDelta = args.dfSrcYDelta;
2364 176 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2365 176 : const int nChunkXOff = args.nChunkXOff;
2366 176 : const int nChunkXSize = args.nChunkXSize;
2367 176 : const int nChunkYOff = args.nChunkYOff;
2368 176 : const int nChunkYSize = args.nChunkYSize;
2369 176 : const int nDstXOff = args.nDstXOff;
2370 176 : const int nDstXOff2 = args.nDstXOff2;
2371 176 : const int nDstYOff = args.nDstYOff;
2372 176 : const int nDstYOff2 = args.nDstYOff2;
2373 176 : const bool bHasNoData = args.bHasNoData;
2374 176 : const GDALColorTable *poColorTable = args.poColorTable;
2375 176 : const int nDstXSize = nDstXOff2 - nDstXOff;
2376 :
// Value written when no valid source sample is found: NaN in both parts for
// the complex types; 0 when there is no nodata value or it is not
// representable in T; the nodata value cast to T otherwise.
2377 8 : T tNoDataValue;
2378 : if constexpr (std::is_same<T, ComplexFloat16>::value)
2379 : {
2380 4 : tNoDataValue.r = cpl::NumericLimits<GFloat16>::quiet_NaN();
2381 4 : tNoDataValue.i = cpl::NumericLimits<GFloat16>::quiet_NaN();
2382 : }
2383 : else if constexpr (std::is_same<T, std::complex<float>>::value ||
2384 : std::is_same<T, std::complex<double>>::value)
2385 : {
2386 : using BaseT = typename T::value_type;
2387 8 : tNoDataValue =
2388 : std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
2389 : std::numeric_limits<BaseT>::quiet_NaN());
2390 : }
2391 164 : else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
2392 163 : tNoDataValue = 0;
2393 : else
2394 1 : tNoDataValue = static_cast<T>(args.dfNoDataValue);
2395 :
// Scratch buffers of the generic path. They are grown on demand and reused
// across destination pixels; nMaxNumPx is their current capacity in elements.
2396 : using CountType = uint32_t;
2397 176 : CountType nMaxNumPx = 0;
2398 176 : T *paVals = nullptr;
2399 176 : CountType *panCounts = nullptr;
2400 :
2401 176 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2402 176 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
// Histogram of the GByte path. It is constructed for every T, although only
// the GByte branch reads it.
2403 352 : std::vector<int> anVals(256, 0);
2404 :
2405 : /* ==================================================================== */
2406 : /* Loop over destination scanlines. */
2407 : /* ==================================================================== */
2408 7679 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2409 : {
// First source row of the window, with a small epsilon before truncation.
2410 7503 : const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
2411 7503 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
2412 : #ifdef only_pixels_with_more_than_10_pct_participation
2413 : // When oversampling, don't take into account pixels that have a tiny
2414 : // participation in the resulting pixel
2415 : if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2416 : nSrcYOff < nChunkBottomYOff)
2417 : nSrcYOff++;
2418 : #endif
2419 7503 : if (nSrcYOff < nChunkYOff)
2420 0 : nSrcYOff = nChunkYOff;
2421 :
// One past the last source row. The window is forced to hold at least one
// row and is clipped to the bottom of the chunk.
2422 7503 : const double dfSrcYOff2 =
2423 7503 : dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
2424 7503 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
2425 : #ifdef only_pixels_with_more_than_10_pct_participation
2426 : // When oversampling, don't take into account pixels that have a tiny
2427 : // participation in the resulting pixel
2428 : if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2429 : nSrcYOff2 > nChunkYOff)
2430 : nSrcYOff2--;
2431 : #endif
2432 7503 : if (nSrcYOff2 == nSrcYOff)
2433 0 : ++nSrcYOff2;
2434 7503 : if (nSrcYOff2 > nChunkBottomYOff)
2435 0 : nSrcYOff2 = nChunkBottomYOff;
2436 :
// Pointers to the first row of the window, in chunk-relative coordinates.
2437 7503 : const T *const paSrcScanline =
2438 253 : pChunk +
2439 7503 : (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
2440 7503 : const GByte *pabySrcScanlineNodataMask = nullptr;
2441 7503 : if (pabyChunkNodataMask != nullptr)
2442 1810 : pabySrcScanlineNodataMask =
2443 : pabyChunkNodataMask +
2444 1810 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
2445 :
2446 7503 : T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
2447 : /* --------------------------------------------------------------------
2448 : */
2449 : /* Loop over destination pixels */
2450 : /* --------------------------------------------------------------------
2451 : */
2452 4260400 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2453 : {
2454 4252893 : const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
2455 : // Apply some epsilon to avoid numerical precision issues
2456 4252893 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
2457 : #ifdef only_pixels_with_more_than_10_pct_participation
2458 : // When oversampling, don't take into account pixels that have a
2459 : // tiny participation in the resulting pixel
2460 : if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2461 : nSrcXOff < nChunkRightXOff)
2462 : nSrcXOff++;
2463 : #endif
2464 4252893 : if (nSrcXOff < nChunkXOff)
2465 0 : nSrcXOff = nChunkXOff;
2466 :
2467 4252893 : const double dfSrcXOff2 =
2468 4252893 : dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
2469 4252893 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
2470 : #ifdef only_pixels_with_more_than_10_pct_participation
2471 : // When oversampling, don't take into account pixels that have a
2472 : // tiny participation in the resulting pixel
2473 : if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2474 : nSrcXOff2 > nChunkXOff)
2475 : nSrcXOff2--;
2476 : #endif
2477 4252893 : if (nSrcXOff2 == nSrcXOff)
2478 0 : nSrcXOff2++;
2479 4252893 : if (nSrcXOff2 > nChunkRightXOff)
2480 0 : nSrcXOff2 = nChunkRightXOff;
2481 :
// Choose the strategy: generic search for every non-GByte T, and for GByte
// only when the color table has more than 256 entries.
2482 4252893 : bool bRegularProcessing = false;
2483 : if constexpr (!std::is_same<T, GByte>::value)
2484 1503 : bRegularProcessing = true;
2485 4251390 : else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
2486 0 : bRegularProcessing = true;
2487 :
2488 4252893 : if (bRegularProcessing)
2489 : {
2490 : // Sanity check to make sure the allocation of paVals and
2491 : // panCounts don't overflow.
2492 : static_assert(sizeof(CountType) <= sizeof(size_t));
2493 3006 : if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
2494 1503 : static_cast<CountType>(nSrcYOff2 - nSrcYOff) >
2495 1503 : (std::numeric_limits<CountType>::max() /
2496 3006 : std::max(sizeof(T), sizeof(CountType))) /
2497 1503 : static_cast<CountType>(nSrcXOff2 - nSrcXOff))
2498 : {
2499 0 : CPLError(CE_Failure, CPLE_NotSupported,
2500 : "Too big downsampling factor");
2501 0 : CPLFree(paVals);
2502 0 : CPLFree(panCounts);
2503 0 : return CE_Failure;
2504 : }
2505 1503 : const CountType nNumPx =
2506 1503 : static_cast<CountType>(nSrcYOff2 - nSrcYOff) *
2507 1503 : (nSrcXOff2 - nSrcXOff);
// iMaxInd is the number of distinct values stored so far; iMaxVal is the
// index of the current mode within paVals / panCounts.
2508 1503 : CountType iMaxInd = 0;
2509 1503 : CountType iMaxVal = 0;
2510 :
2511 1503 : if (paVals == nullptr || nNumPx > nMaxNumPx)
2512 : {
// Grow both buffers. Each pointer is updated as soon as its own reallocation
// succeeds, so the failure path frees whichever pointers are current.
2513 : T *paValsNew = static_cast<T *>(
2514 110 : VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
2515 : CountType *panCountsNew =
2516 110 : static_cast<CountType *>(VSI_REALLOC_VERBOSE(
2517 : panCounts, nNumPx * sizeof(CountType)));
2518 110 : if (paValsNew != nullptr)
2519 110 : paVals = paValsNew;
2520 110 : if (panCountsNew != nullptr)
2521 110 : panCounts = panCountsNew;
2522 110 : if (paValsNew == nullptr || panCountsNew == nullptr)
2523 : {
2524 0 : CPLFree(paVals);
2525 0 : CPLFree(panCounts);
2526 0 : return CE_Failure;
2527 : }
2528 110 : nMaxNumPx = nNumPx;
2529 : }
2530 :
2531 4629 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2532 : {
// Offset that turns an absolute source column iX into an index relative to
// paSrcScanline for row iY.
2533 3126 : const GPtrDiff_t iTotYOff =
2534 3126 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2535 3126 : nChunkXOff;
2536 9858 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2537 : {
// Samples whose nodata mask byte is 0 are skipped.
2538 6732 : if (pabySrcScanlineNodataMask == nullptr ||
2539 16 : pabySrcScanlineNodataMask[iX + iTotYOff])
2540 : {
2541 6717 : const T val = paSrcScanline[iX + iTotYOff];
2542 6717 : CountType i = 0; // Used after for.
2543 :
2544 : // Check array for existing entry.
2545 10081 : for (; i < iMaxInd; ++i)
2546 : {
2547 6850 : if (IsSame(paVals[i], val))
2548 : {
2549 3486 : if (++panCounts[i] > panCounts[iMaxVal])
2550 : {
2551 246 : iMaxVal = i;
2552 : }
2553 3486 : break;
2554 : }
2555 : }
2556 :
2557 : // Add to arr if entry not already there.
2558 6717 : if (i == iMaxInd)
2559 : {
2560 3231 : paVals[iMaxInd] = val;
2561 3231 : panCounts[iMaxInd] = 1;
2562 :
2563 3231 : if (iMaxInd == 0)
2564 : {
2565 1500 : iMaxVal = iMaxInd;
2566 : }
2567 :
2568 3231 : ++iMaxInd;
2569 : }
2570 : }
2571 : }
2572 : }
2573 :
// No valid sample in the window: emit the nodata value.
2574 1503 : if (iMaxInd == 0)
2575 3 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2576 : else
2577 1500 : paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
2578 : }
2579 : else if constexpr (std::is_same<T, GByte>::value)
// NOTE(review): the condition that routes here is a color table entry count
// that is not greater than 256, which includes exactly 256; the comment below
// says less than 256.
2580 : // ( eSrcDataType == GDT_UInt8 && nEntryCount < 256 )
2581 : {
2582 : // So we go here for a paletted or non-paletted byte band.
2583 : // The input values are then between 0 and 255.
2584 4251390 : int nMaxVal = 0;
2585 4251390 : int iMaxInd = -1;
2586 :
2587 : // The cost of this zeroing might be high. Perhaps we should
2588 : // just use the above generic case, and go to this one if the
2589 : // number of source pixels is large enough
2590 4251390 : std::fill(anVals.begin(), anVals.end(), 0);
2591 :
2592 12777800 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2593 : {
2594 8526440 : const GPtrDiff_t iTotYOff =
2595 8526440 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2596 8526440 : nChunkXOff;
2597 25649600 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2598 : {
2599 17123100 : const T val = paSrcScanline[iX + iTotYOff];
// This path filters on the nodata value itself rather than on the nodata
// mask.
// NOTE(review): when bHasNoData is true but the nodata value is not
// representable in T, tNoDataValue was set to 0 above, so source value 0 is
// skipped here. Confirm this is intended.
2600 17123100 : if (!bHasNoData || val != tNoDataValue)
2601 : {
2602 17123100 : int nVal = static_cast<int>(val);
2603 17123100 : if (++anVals[nVal] > nMaxVal)
2604 : {
2605 : // Sum the density.
2606 : // Is it the most common value so far?
2607 17006400 : iMaxInd = nVal;
2608 17006400 : nMaxVal = anVals[nVal];
2609 : }
2610 : }
2611 : }
2612 : }
2613 :
2614 4251390 : if (iMaxInd == -1)
2615 0 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2616 : else
2617 4251390 : paDstScanline[iDstPixel - nDstXOff] =
2618 : static_cast<T>(iMaxInd);
2619 : }
2620 : }
2621 : }
2622 :
2623 176 : CPLFree(paVals);
2624 176 : CPLFree(panCounts);
2625 :
2626 176 : return CE_None;
2627 : }
2628 :
2629 176 : static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
2630 : const void *pChunk, void **ppDstBuffer,
2631 : GDALDataType *peDstBufferDataType)
2632 : {
2633 176 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
2634 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
2635 : GDALGetDataTypeSizeBytes(args.eWrkDataType));
2636 176 : if (*ppDstBuffer == nullptr)
2637 : {
2638 0 : return CE_Failure;
2639 : }
2640 :
2641 176 : CPLAssert(args.eSrcDataType == args.eWrkDataType);
2642 :
2643 176 : *peDstBufferDataType = args.eWrkDataType;
2644 176 : switch (args.eWrkDataType)
2645 : {
2646 : // For mode resampling, as no computation is done, only the
2647 : // size of the data type matters... except for Byte where we have
2648 : // special processing. And for floating point values
2649 66 : case GDT_UInt8:
2650 : {
2651 66 : return GDALResampleChunk_ModeT(args,
2652 : static_cast<const GByte *>(pChunk),
2653 66 : static_cast<GByte *>(*ppDstBuffer));
2654 : }
2655 :
2656 4 : case GDT_Int8:
2657 : {
2658 4 : return GDALResampleChunk_ModeT(args,
2659 : static_cast<const int8_t *>(pChunk),
2660 4 : static_cast<int8_t *>(*ppDstBuffer));
2661 : }
2662 :
2663 10 : case GDT_Int16:
2664 : case GDT_UInt16:
2665 : {
2666 10 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
2667 10 : return GDALResampleChunk_ModeT(
2668 : args, static_cast<const uint16_t *>(pChunk),
2669 10 : static_cast<uint16_t *>(*ppDstBuffer));
2670 : }
2671 :
2672 15 : case GDT_CInt16:
2673 : case GDT_Int32:
2674 : case GDT_UInt32:
2675 : {
2676 15 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
2677 15 : return GDALResampleChunk_ModeT(
2678 : args, static_cast<const uint32_t *>(pChunk),
2679 15 : static_cast<uint32_t *>(*ppDstBuffer));
2680 : }
2681 :
2682 12 : case GDT_CInt32:
2683 : case GDT_Int64:
2684 : case GDT_UInt64:
2685 : {
2686 12 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
2687 12 : return GDALResampleChunk_ModeT(
2688 : args, static_cast<const uint64_t *>(pChunk),
2689 12 : static_cast<uint64_t *>(*ppDstBuffer));
2690 : }
2691 :
2692 4 : case GDT_Float16:
2693 : {
2694 4 : return GDALResampleChunk_ModeT(
2695 : args, static_cast<const GFloat16 *>(pChunk),
2696 4 : static_cast<GFloat16 *>(*ppDstBuffer));
2697 : }
2698 :
2699 32 : case GDT_Float32:
2700 : {
2701 32 : return GDALResampleChunk_ModeT(args,
2702 : static_cast<const float *>(pChunk),
2703 32 : static_cast<float *>(*ppDstBuffer));
2704 : }
2705 :
2706 21 : case GDT_Float64:
2707 : {
2708 21 : return GDALResampleChunk_ModeT(args,
2709 : static_cast<const double *>(pChunk),
2710 21 : static_cast<double *>(*ppDstBuffer));
2711 : }
2712 :
2713 4 : case GDT_CFloat16:
2714 : {
2715 4 : return GDALResampleChunk_ModeT(
2716 : args, static_cast<const ComplexFloat16 *>(pChunk),
2717 4 : static_cast<ComplexFloat16 *>(*ppDstBuffer));
2718 : }
2719 :
2720 4 : case GDT_CFloat32:
2721 : {
2722 4 : return GDALResampleChunk_ModeT(
2723 : args, static_cast<const std::complex<float> *>(pChunk),
2724 4 : static_cast<std::complex<float> *>(*ppDstBuffer));
2725 : }
2726 :
2727 4 : case GDT_CFloat64:
2728 : {
2729 4 : return GDALResampleChunk_ModeT(
2730 : args, static_cast<const std::complex<double> *>(pChunk),
2731 4 : static_cast<std::complex<double> *>(*ppDstBuffer));
2732 : }
2733 :
2734 0 : case GDT_Unknown:
2735 : case GDT_TypeCount:
2736 0 : break;
2737 : }
2738 :
2739 0 : CPLAssert(false);
2740 : return CE_Failure;
2741 : }
2742 :
2743 : /************************************************************************/
2744 : /* GDALResampleConvolutionHorizontal() */
2745 : /************************************************************************/
2746 :
/**
 * Weighted sum of nSrcPixelCount consecutive samples, i.e. the sum over k of
 * pChunk[k] * padfWeights[k], computed in double precision.
 *
 * The main loop consumes four samples per iteration and feeds two separate
 * partial sums; any remaining samples are added to the first partial sum.
 *
 * @param pChunk          First sample.
 * @param padfWeights     One weight per sample.
 * @param nSrcPixelCount  Number of samples (and weights) to combine.
 * @return the weighted sum (0.0 when nSrcPixelCount <= 0).
 */
template <class T>
static inline double
GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                  int nSrcPixelCount)
{
    double dfSumA = 0.0;
    double dfSumB = 0.0;
    int iPix = 0;  // Shared by the unrolled loop and the remainder loop.

    // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    // manually (untypical) unrolled loop in -O2 and -O3:
    // https://github.com/OSGeo/gdal/issues/9508
#if !defined(__INTEL_CLANG_COMPILER)
    const int nUnrolledEnd = nSrcPixelCount - 3;
    while (iPix < nUnrolledEnd)
    {
        const T *const pSrc = pChunk + iPix;
        const double *const pdfW = padfWeights + iPix;
        dfSumA += double(pSrc[0]) * pdfW[0];
        dfSumA += double(pSrc[1]) * pdfW[1];
        dfSumB += double(pSrc[2]) * pdfW[2];
        dfSumB += double(pSrc[3]) * pdfW[3];
        iPix += 4;
    }
#endif

    // Remaining samples (or all of them when the unrolled loop is disabled).
    while (iPix < nSrcPixelCount)
    {
        dfSumA += double(pChunk[iPix]) * padfWeights[iPix];
        ++iPix;
    }

    return dfSumA + dfSumB;
}
2773 :
2774 : template <class T>
2775 44576 : static inline void GDALResampleConvolutionHorizontalWithMask(
2776 : const T *pChunk, const GByte *pabyMask, const double *padfWeights,
2777 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2778 : {
2779 44576 : dfVal = 0;
2780 44576 : dfWeightSum = 0;
2781 44576 : int i = 0;
2782 98300 : for (; i < nSrcPixelCount - 3; i += 4)
2783 : {
2784 53724 : const double dfWeight0 = padfWeights[i] * pabyMask[i];
2785 53724 : const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
2786 53724 : const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
2787 53724 : const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
2788 53724 : dfVal += double(pChunk[i + 0]) * dfWeight0;
2789 53724 : dfVal += double(pChunk[i + 1]) * dfWeight1;
2790 53724 : dfVal += double(pChunk[i + 2]) * dfWeight2;
2791 53724 : dfVal += double(pChunk[i + 3]) * dfWeight3;
2792 53724 : dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
2793 : }
2794 61162 : for (; i < nSrcPixelCount; ++i)
2795 : {
2796 16586 : const double dfWeight = padfWeights[i] * pabyMask[i];
2797 16586 : dfVal += double(pChunk[i]) * dfWeight;
2798 16586 : dfWeightSum += dfWeight;
2799 : }
2800 44576 : }
2801 :
/**
 * Computes, for three rows at once, the weighted sum of nSrcPixelCount
 * consecutive samples using the same weights for every row.
 *
 * For each row, the main loop (four samples per iteration) feeds two
 * partial sums; remaining samples are added to the first one.
 *
 * @param pChunkRow1,pChunkRow2,pChunkRow3  First sample of each row.
 * @param padfWeights     One weight per sample, shared by the three rows.
 * @param nSrcPixelCount  Number of samples per row.
 * @param[out] dfRes1,dfRes2,dfRes3  Weighted sum of rows 1, 2 and 3.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontal_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    // "Lo" sums take samples 0 and 1 of each group of four (and the
    // remainder), "Hi" sums take samples 2 and 3.
    double dfRow1Lo = 0.0, dfRow1Hi = 0.0;
    double dfRow2Lo = 0.0, dfRow2Hi = 0.0;
    double dfRow3Lo = 0.0, dfRow3Hi = 0.0;

    int iPix = 0;  // Shared by both loops.
    for (const int nUnrolledEnd = nSrcPixelCount - 3; iPix < nUnrolledEnd;
         iPix += 4)
    {
        const double dfW0 = padfWeights[iPix + 0];
        const double dfW1 = padfWeights[iPix + 1];
        const double dfW2 = padfWeights[iPix + 2];
        const double dfW3 = padfWeights[iPix + 3];

        dfRow1Lo += double(pChunkRow1[iPix + 0]) * dfW0;
        dfRow1Lo += double(pChunkRow1[iPix + 1]) * dfW1;
        dfRow1Hi += double(pChunkRow1[iPix + 2]) * dfW2;
        dfRow1Hi += double(pChunkRow1[iPix + 3]) * dfW3;

        dfRow2Lo += double(pChunkRow2[iPix + 0]) * dfW0;
        dfRow2Lo += double(pChunkRow2[iPix + 1]) * dfW1;
        dfRow2Hi += double(pChunkRow2[iPix + 2]) * dfW2;
        dfRow2Hi += double(pChunkRow2[iPix + 3]) * dfW3;

        dfRow3Lo += double(pChunkRow3[iPix + 0]) * dfW0;
        dfRow3Lo += double(pChunkRow3[iPix + 1]) * dfW1;
        dfRow3Hi += double(pChunkRow3[iPix + 2]) * dfW2;
        dfRow3Hi += double(pChunkRow3[iPix + 3]) * dfW3;
    }

    for (; iPix < nSrcPixelCount; ++iPix)
    {
        const double dfW = padfWeights[iPix];
        dfRow1Lo += double(pChunkRow1[iPix]) * dfW;
        dfRow2Lo += double(pChunkRow2[iPix]) * dfW;
        dfRow3Lo += double(pChunkRow3[iPix]) * dfW;
    }

    dfRes1 = dfRow1Lo + dfRow1Hi;
    dfRes2 = dfRow2Lo + dfRow2Hi;
    dfRes3 = dfRow3Lo + dfRow3Hi;
}
2840 :
2841 : template <class T>
2842 18980 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
2843 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2844 : const double *padfWeights, int nSrcPixelCount, double &dfRes1,
2845 : double &dfRes2, double &dfRes3)
2846 : {
2847 18980 : GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
2848 : padfWeights, nSrcPixelCount, dfRes1,
2849 : dfRes2, dfRes3);
2850 18980 : }
2851 :
2852 : template <class T>
2853 1256690 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
2854 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2855 : const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
2856 : {
2857 1256690 : GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
2858 : padfWeights, 4, dfRes1, dfRes2,
2859 : dfRes3);
2860 1256690 : }
2861 :
2862 : /************************************************************************/
2863 : /* GDALResampleConvolutionVertical() */
2864 : /************************************************************************/
2865 :
/**
 * Weighted sum of nSrcLineCount samples taken every nStride elements,
 * i.e. the sum over k of pChunk[k * nStride] * padfWeights[k].
 *
 * Four lines are consumed per iteration of the main loop, feeding two
 * partial sums; remaining lines are added to the first partial sum.
 *
 * @param pChunk         Sample of the first line.
 * @param nStride        Distance, in elements, between two successive lines.
 * @param padfWeights    One weight per line.
 * @param nSrcLineCount  Number of lines to combine.
 * @return the weighted sum (0.0 when nSrcLineCount <= 0).
 */
template <class T>
static inline double
GDALResampleConvolutionVertical(const T *pChunk, size_t nStride,
                                const double *padfWeights, int nSrcLineCount)
{
    double dfSumA = 0.0;
    double dfSumB = 0.0;

    int iLine = 0;
    size_t nOffset = 0;  // Element offset of line iLine within pChunk.
    const size_t nFourLines = 4 * nStride;
    for (const int nUnrolledEnd = nSrcLineCount - 3; iLine < nUnrolledEnd;
         iLine += 4, nOffset += nFourLines)
    {
        dfSumA += pChunk[nOffset + 0 * nStride] * padfWeights[iLine + 0];
        dfSumA += pChunk[nOffset + 1 * nStride] * padfWeights[iLine + 1];
        dfSumB += pChunk[nOffset + 2 * nStride] * padfWeights[iLine + 2];
        dfSumB += pChunk[nOffset + 3 * nStride] * padfWeights[iLine + 3];
    }

    while (iLine < nSrcLineCount)
    {
        dfSumA += pChunk[nOffset] * padfWeights[iLine];
        ++iLine;
        nOffset += nStride;
    }

    return dfSumA + dfSumB;
}
2888 :
/**
 * Same vertical weighted sum as GDALResampleConvolutionVertical(), computed
 * for two adjacent columns (pChunk[...] and pChunk[... + 1]) in one pass.
 *
 * @param pChunk         Sample of the first line, first column.
 * @param nStride        Distance, in elements, between two successive lines.
 * @param padfWeights    One weight per line, shared by both columns.
 * @param nSrcLineCount  Number of lines to combine.
 * @param[out] dfRes1    Weighted sum of the first column.
 * @param[out] dfRes2    Weighted sum of the second column.
 */
template <class T>
static inline void GDALResampleConvolutionVertical_2cols(
    const T *pChunk, size_t nStride, const double *padfWeights,
    int nSrcLineCount, double &dfRes1, double &dfRes2)
{
    // Per column: "Lo" takes lines 0 and 1 of each group of four (and the
    // remainder), "Hi" takes lines 2 and 3.
    double dfCol0Lo = 0.0, dfCol0Hi = 0.0;
    double dfCol1Lo = 0.0, dfCol1Hi = 0.0;

    int iLine = 0;
    size_t nOffset = 0;  // Element offset of line iLine within pChunk.
    for (const int nUnrolledEnd = nSrcLineCount - 3; iLine < nUnrolledEnd;
         iLine += 4, nOffset += 4 * nStride)
    {
        const double dfW0 = padfWeights[iLine + 0];
        const double dfW1 = padfWeights[iLine + 1];
        const double dfW2 = padfWeights[iLine + 2];
        const double dfW3 = padfWeights[iLine + 3];

        dfCol0Lo += pChunk[nOffset + 0 + 0 * nStride] * dfW0;
        dfCol1Lo += pChunk[nOffset + 1 + 0 * nStride] * dfW0;
        dfCol0Lo += pChunk[nOffset + 0 + 1 * nStride] * dfW1;
        dfCol1Lo += pChunk[nOffset + 1 + 1 * nStride] * dfW1;
        dfCol0Hi += pChunk[nOffset + 0 + 2 * nStride] * dfW2;
        dfCol1Hi += pChunk[nOffset + 1 + 2 * nStride] * dfW2;
        dfCol0Hi += pChunk[nOffset + 0 + 3 * nStride] * dfW3;
        dfCol1Hi += pChunk[nOffset + 1 + 3 * nStride] * dfW3;
    }

    for (; iLine < nSrcLineCount; ++iLine, nOffset += nStride)
    {
        const double dfW = padfWeights[iLine];
        dfCol0Lo += pChunk[nOffset + 0] * dfW;
        dfCol1Lo += pChunk[nOffset + 1] * dfW;
    }

    dfRes1 = dfCol0Lo + dfCol0Hi;
    dfRes2 = dfCol1Lo + dfCol1Hi;
}
2919 :
2920 : #ifdef USE_SSE2
2921 :
2922 : #ifdef __AVX__
2923 : /************************************************************************/
2924 : /* GDALResampleConvolutionVertical_16cols<T> */
2925 : /************************************************************************/
2926 :
2927 : template <class T>
2928 : static inline void
2929 : GDALResampleConvolutionVertical_16cols(const T *pChunk, size_t nStride,
2930 : const double *padfWeights,
2931 : int nSrcLineCount, float *afDest)
2932 : {
2933 : int i = 0;
2934 : size_t j = 0;
2935 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2936 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2937 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2938 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2939 : for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
2940 : {
2941 : XMMReg4Double w0 =
2942 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2943 : XMMReg4Double w1 =
2944 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2945 : XMMReg4Double w2 =
2946 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2947 : XMMReg4Double w3 =
2948 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2949 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2950 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2951 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
2952 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
2953 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2954 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2955 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
2956 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
2957 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2958 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2959 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
2960 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
2961 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2962 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2963 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
2964 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
2965 : }
2966 : for (; i < nSrcLineCount; ++i, j += nStride)
2967 : {
2968 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2969 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2970 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2971 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
2972 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
2973 : }
2974 : v_acc0.Store4Val(afDest);
2975 : v_acc1.Store4Val(afDest + 4);
2976 : v_acc2.Store4Val(afDest + 8);
2977 : v_acc3.Store4Val(afDest + 12);
2978 : }
2979 :
2980 : template <class T>
2981 : static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
2982 : const double *, int,
2983 : double *)
2984 : {
2985 : // Cannot be reached
2986 : CPLAssert(false);
2987 : }
2988 :
2989 : #else
2990 :
2991 : /************************************************************************/
2992 : /* GDALResampleConvolutionVertical_8cols<T> */
2993 : /************************************************************************/
2994 :
2995 : template <class T>
2996 : static inline void
2997 25609800 : GDALResampleConvolutionVertical_8cols(const T *pChunk, size_t nStride,
2998 : const double *padfWeights,
2999 : int nSrcLineCount, float *afDest)
3000 : {
3001 25609800 : int i = 0;
3002 25609800 : size_t j = 0;
3003 25609800 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
3004 25609800 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3005 53417600 : for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
3006 : {
3007 27807800 : XMMReg4Double w0 =
3008 27807800 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
3009 27807800 : XMMReg4Double w1 =
3010 27807800 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
3011 27807800 : XMMReg4Double w2 =
3012 27807800 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
3013 27807800 : XMMReg4Double w3 =
3014 27807800 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
3015 27807800 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
3016 27807800 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
3017 27807800 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
3018 27807800 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
3019 27807800 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
3020 27807800 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
3021 27807800 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
3022 27807800 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
3023 : }
3024 37176700 : for (; i < nSrcLineCount; ++i, j += nStride)
3025 : {
3026 11566800 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
3027 11566800 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
3028 11566800 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
3029 : }
3030 25609800 : v_acc0.Store4Val(afDest);
3031 25609800 : v_acc1.Store4Val(afDest + 4);
3032 25609800 : }
3033 :
3034 : template <class T>
3035 : static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
3036 : const double *, int,
3037 : double *)
3038 : {
3039 : // Cannot be reached
3040 : CPLAssert(false);
3041 : }
3042 :
3043 : #endif // __AVX__
3044 :
3045 : /************************************************************************/
3046 : /* GDALResampleConvolutionHorizontalSSE2<T> */
3047 : /************************************************************************/
3048 :
3049 : template <class T>
3050 3137782 : static inline double GDALResampleConvolutionHorizontalSSE2(
3051 : const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3052 : {
3053 3137782 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3054 3137782 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3055 3137782 : int i = 0; // Used after for.
3056 3513158 : for (; i < nSrcPixelCount - 7; i += 8)
3057 : {
3058 : // Retrieve the pixel & accumulate
3059 375371 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
3060 375371 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
3061 375371 : const XMMReg4Double v_weight1 =
3062 375371 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3063 375371 : const XMMReg4Double v_weight2 =
3064 375371 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
3065 :
3066 375371 : v_acc1 += v_pixels1 * v_weight1;
3067 375371 : v_acc2 += v_pixels2 * v_weight2;
3068 : }
3069 :
3070 3137782 : v_acc1 += v_acc2;
3071 :
3072 3137782 : double dfVal = v_acc1.GetHorizSum();
3073 10317730 : for (; i < nSrcPixelCount; ++i)
3074 : {
3075 7179950 : dfVal += pChunk[i] * padfWeightsAligned[i];
3076 : }
3077 3137782 : return dfVal;
3078 : }
3079 :
3080 : /************************************************************************/
3081 : /* GDALResampleConvolutionHorizontal<GByte> */
3082 : /************************************************************************/
3083 :
3084 : template <>
3085 2588620 : inline double GDALResampleConvolutionHorizontal<GByte>(
3086 : const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3087 : {
3088 2588620 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
3089 2588620 : nSrcPixelCount);
3090 : }
3091 :
3092 : template <>
3093 549162 : inline double GDALResampleConvolutionHorizontal<GUInt16>(
3094 : const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3095 : {
3096 549162 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
3097 549162 : nSrcPixelCount);
3098 : }
3099 :
3100 : /************************************************************************/
3101 : /* GDALResampleConvolutionHorizontalWithMaskSSE2<T> */
3102 : /************************************************************************/
3103 :
3104 : template <class T>
3105 6408653 : static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
3106 : const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
3107 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
3108 : {
3109 6408653 : int i = 0; // Used after for.
3110 6408653 : XMMReg4Double v_acc = XMMReg4Double::Zero();
3111 6408653 : XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
3112 17785121 : for (; i < nSrcPixelCount - 3; i += 4)
3113 : {
3114 11376458 : const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
3115 11376458 : const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
3116 11376458 : XMMReg4Double v_weight =
3117 11376458 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3118 11376458 : v_weight *= v_mask;
3119 11376458 : v_acc += v_pixels * v_weight;
3120 11376458 : v_acc_weight += v_weight;
3121 : }
3122 :
3123 6408653 : dfVal = v_acc.GetHorizSum();
3124 6408653 : dfWeightSum = v_acc_weight.GetHorizSum();
3125 6614913 : for (; i < nSrcPixelCount; ++i)
3126 : {
3127 206258 : const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
3128 206258 : dfVal += pChunk[i] * dfWeight;
3129 206258 : dfWeightSum += dfWeight;
3130 : }
3131 6408653 : }
3132 :
3133 : /************************************************************************/
3134 : /* GDALResampleConvolutionHorizontalWithMask<GByte> */
3135 : /************************************************************************/
3136 :
3137 : template <>
3138 6408590 : inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
3139 : const GByte *pChunk, const GByte *pabyMask,
3140 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
3141 : double &dfWeightSum)
3142 : {
3143 6408590 : GDALResampleConvolutionHorizontalWithMaskSSE2(
3144 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
3145 : dfWeightSum);
3146 6408590 : }
3147 :
3148 : template <>
3149 63 : inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
3150 : const GUInt16 *pChunk, const GByte *pabyMask,
3151 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
3152 : double &dfWeightSum)
3153 : {
3154 63 : GDALResampleConvolutionHorizontalWithMaskSSE2(
3155 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
3156 : dfWeightSum);
3157 63 : }
3158 :
3159 : /************************************************************************/
3160 : /* GDALResampleConvolutionHorizontal_3rows_SSE2<T> */
3161 : /************************************************************************/
3162 :
3163 : template <class T>
3164 35128386 : static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
3165 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3166 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3167 : double &dfRes2, double &dfRes3)
3168 : {
3169 35128386 : XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
3170 35128386 : v_acc2 = XMMReg4Double::Zero(),
3171 35128386 : v_acc3 = XMMReg4Double::Zero();
3172 35128386 : int i = 0;
3173 70070156 : for (; i < nSrcPixelCount - 7; i += 8)
3174 : {
3175 : // Retrieve the pixel & accumulate.
3176 34941870 : XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3177 34941870 : XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
3178 34941870 : const XMMReg4Double v_weight1 =
3179 34941870 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3180 34941870 : const XMMReg4Double v_weight2 =
3181 34941870 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
3182 :
3183 34941870 : v_acc1 += v_pixels1 * v_weight1;
3184 34941870 : v_acc1 += v_pixels2 * v_weight2;
3185 :
3186 34941870 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3187 34941870 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
3188 34941870 : v_acc2 += v_pixels1 * v_weight1;
3189 34941870 : v_acc2 += v_pixels2 * v_weight2;
3190 :
3191 34941870 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3192 34941870 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
3193 34941870 : v_acc3 += v_pixels1 * v_weight1;
3194 34941870 : v_acc3 += v_pixels2 * v_weight2;
3195 : }
3196 :
3197 35128386 : dfRes1 = v_acc1.GetHorizSum();
3198 35128386 : dfRes2 = v_acc2.GetHorizSum();
3199 35128386 : dfRes3 = v_acc3.GetHorizSum();
3200 47367852 : for (; i < nSrcPixelCount; ++i)
3201 : {
3202 12239466 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3203 12239466 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3204 12239466 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3205 : }
3206 35128386 : }
3207 :
3208 : /************************************************************************/
3209 : /* GDALResampleConvolutionHorizontal_3rows<GByte> */
3210 : /************************************************************************/
3211 :
3212 : template <>
3213 35128300 : inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
3214 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3215 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3216 : double &dfRes2, double &dfRes3)
3217 : {
3218 35128300 : GDALResampleConvolutionHorizontal_3rows_SSE2(
3219 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3220 : dfRes1, dfRes2, dfRes3);
3221 35128300 : }
3222 :
3223 : template <>
3224 86 : inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
3225 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3226 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3227 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3228 : {
3229 86 : GDALResampleConvolutionHorizontal_3rows_SSE2(
3230 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3231 : dfRes1, dfRes2, dfRes3);
3232 86 : }
3233 :
3234 : /************************************************************************/
3235 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T> */
3236 : /************************************************************************/
3237 :
3238 : template <class T>
3239 7840250 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3240 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3241 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3242 : double &dfRes2, double &dfRes3)
3243 : {
3244 7840250 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3245 7840250 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3246 7840250 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
3247 7840250 : int i = 0; // Use after for.
3248 19104350 : for (; i < nSrcPixelCount - 3; i += 4)
3249 : {
3250 : // Retrieve the pixel & accumulate.
3251 11264100 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3252 11264100 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3253 11264100 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3254 11264100 : const XMMReg4Double v_weight =
3255 11264100 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3256 :
3257 11264100 : v_acc1 += v_pixels1 * v_weight;
3258 11264100 : v_acc2 += v_pixels2 * v_weight;
3259 11264100 : v_acc3 += v_pixels3 * v_weight;
3260 : }
3261 :
3262 7840250 : dfRes1 = v_acc1.GetHorizSum();
3263 7840250 : dfRes2 = v_acc2.GetHorizSum();
3264 7840250 : dfRes3 = v_acc3.GetHorizSum();
3265 :
3266 12290222 : for (; i < nSrcPixelCount; ++i)
3267 : {
3268 4449942 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3269 4449942 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3270 4449942 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3271 : }
3272 7840250 : }
3273 :
3274 : /************************************************************************/
3275 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte> */
3276 : /************************************************************************/
3277 :
3278 : template <>
3279 7773100 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
3280 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3281 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3282 : double &dfRes2, double &dfRes3)
3283 : {
3284 7773100 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3285 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3286 : dfRes1, dfRes2, dfRes3);
3287 7773100 : }
3288 :
3289 : template <>
3290 67150 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
3291 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3292 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3293 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3294 : {
3295 67150 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3296 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3297 : dfRes1, dfRes2, dfRes3);
3298 67150 : }
3299 :
3300 : /************************************************************************/
3301 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T> */
3302 : /************************************************************************/
3303 :
3304 : template <class T>
3305 13996690 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3306 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3307 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3308 : double &dfRes3)
3309 : {
3310 13996690 : const XMMReg4Double v_weight =
3311 : XMMReg4Double::Load4ValAligned(padfWeightsAligned);
3312 :
3313 : // Retrieve the pixel & accumulate.
3314 13996690 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
3315 13996690 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
3316 13996690 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
3317 :
3318 13996690 : XMMReg4Double v_acc1 = v_pixels1 * v_weight;
3319 13996690 : XMMReg4Double v_acc2 = v_pixels2 * v_weight;
3320 13996690 : XMMReg4Double v_acc3 = v_pixels3 * v_weight;
3321 :
3322 13996690 : dfRes1 = v_acc1.GetHorizSum();
3323 13996690 : dfRes2 = v_acc2.GetHorizSum();
3324 13996690 : dfRes3 = v_acc3.GetHorizSum();
3325 13996690 : }
3326 :
3327 : /************************************************************************/
3328 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte> */
3329 : /************************************************************************/
3330 :
3331 : template <>
3332 8283970 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
3333 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3334 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3335 : double &dfRes3)
3336 : {
3337 8283970 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3338 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3339 : dfRes3);
3340 8283970 : }
3341 :
3342 : template <>
3343 5712720 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
3344 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3345 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
3346 : double &dfRes2, double &dfRes3)
3347 : {
3348 5712720 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3349 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3350 : dfRes3);
3351 5712720 : }
3352 :
3353 : #endif // USE_SSE2
3354 :
3355 : /************************************************************************/
3356 : /* GDALResampleChunk_Convolution() */
3357 : /************************************************************************/
3358 :
3359 : template <class T, class Twork, GDALDataType eWrkDataType,
3360 : bool bKernelWithNegativeWeights, bool bNeedRescale>
3361 5093 : static CPLErr GDALResampleChunk_ConvolutionT(
3362 : const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
3363 : FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
3364 : int nKernelRadius, float fMaxVal)
3365 :
3366 : {
3367 5093 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
3368 5093 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
3369 5093 : const double dfSrcXDelta = args.dfSrcXDelta;
3370 5093 : const double dfSrcYDelta = args.dfSrcYDelta;
3371 5093 : constexpr int nBands = 1;
3372 5093 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
3373 5093 : const int nChunkXOff = args.nChunkXOff;
3374 5093 : const int nChunkXSize = args.nChunkXSize;
3375 5093 : const int nChunkYOff = args.nChunkYOff;
3376 5093 : const int nChunkYSize = args.nChunkYSize;
3377 5093 : const int nDstXOff = args.nDstXOff;
3378 5093 : const int nDstXOff2 = args.nDstXOff2;
3379 5093 : const int nDstYOff = args.nDstYOff;
3380 5093 : const int nDstYOff2 = args.nDstYOff2;
3381 5093 : const bool bHasNoData = args.bHasNoData;
3382 5093 : double dfNoDataValue = args.dfNoDataValue;
3383 :
3384 5093 : if (!bHasNoData)
3385 5018 : dfNoDataValue = 0.0;
3386 5093 : const auto dstDataType = args.eOvrDataType;
3387 5093 : const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
3388 5093 : const double dfReplacementVal =
3389 75 : bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
3390 : : dfNoDataValue;
3391 : // cppcheck-suppress unreadVariable
3392 5093 : const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
3393 5093 : const bool bNoDataValueInt64Valid =
3394 5093 : isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
3395 5093 : const auto nNodataValueInt64 =
3396 : bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
3397 5093 : constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
3398 :
3399 : // TODO: we should have some generic function to do this.
3400 5093 : Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
3401 5093 : Twork fDstMax = cpl::NumericLimits<Twork>::max();
3402 5093 : if (dstDataType == GDT_UInt8)
3403 : {
3404 4225 : fDstMin = std::numeric_limits<GByte>::min();
3405 4225 : fDstMax = std::numeric_limits<GByte>::max();
3406 : }
3407 868 : else if (dstDataType == GDT_Int8)
3408 : {
3409 1 : fDstMin = std::numeric_limits<GInt8>::min();
3410 1 : fDstMax = std::numeric_limits<GInt8>::max();
3411 : }
3412 867 : else if (dstDataType == GDT_UInt16)
3413 : {
3414 402 : fDstMin = std::numeric_limits<GUInt16>::min();
3415 402 : fDstMax = std::numeric_limits<GUInt16>::max();
3416 : }
3417 465 : else if (dstDataType == GDT_Int16)
3418 : {
3419 291 : fDstMin = std::numeric_limits<GInt16>::min();
3420 291 : fDstMax = std::numeric_limits<GInt16>::max();
3421 : }
3422 174 : else if (dstDataType == GDT_UInt32)
3423 : {
3424 1 : fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
3425 1 : fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
3426 : }
3427 173 : else if (dstDataType == GDT_Int32)
3428 : {
3429 : // cppcheck-suppress unreadVariable
3430 2 : fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
3431 : // cppcheck-suppress unreadVariable
3432 2 : fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
3433 : }
3434 171 : else if (dstDataType == GDT_UInt64)
3435 : {
3436 : // cppcheck-suppress unreadVariable
3437 1 : fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
3438 : // cppcheck-suppress unreadVariable
3439 : // (1 << 64) - 2048: largest uint64 value a double can hold
3440 1 : fDstMax = static_cast<Twork>(18446744073709549568ULL);
3441 : }
3442 170 : else if (dstDataType == GDT_Int64)
3443 : {
3444 : // cppcheck-suppress unreadVariable
3445 1 : fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
3446 : // cppcheck-suppress unreadVariable
3447 : // (1 << 63) - 1024: largest int64 that a double can hold
3448 1 : fDstMax = static_cast<Twork>(9223372036854774784LL);
3449 : }
3450 :
3451 36939169 : auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
3452 : bNoDataValueInt64Valid, nNodataValueInt64,
3453 : dfNoDataValue, dfReplacementVal](Twork fVal)
3454 : {
3455 15833200 : if (!bHasNoData)
3456 11612800 : return fVal;
3457 :
3458 : // Clamp value before comparing to nodata: this is only needed for
3459 : // kernels with negative weights (Lanczos)
3460 4220490 : Twork fClamped = fVal;
3461 4220490 : if (fClamped < fDstMin)
3462 15998 : fClamped = fDstMin;
3463 4204490 : else if (fClamped > fDstMax)
3464 16406 : fClamped = fDstMax;
3465 4220490 : if (isIntegerDT)
3466 : {
3467 4220480 : if (bNoDataValueInt64Valid)
3468 : {
3469 4220470 : const double fClampedRounded = double(std::round(fClamped));
3470 8440960 : if (fClampedRounded >=
3471 : static_cast<double>(static_cast<Twork>(
3472 8440960 : std::numeric_limits<int64_t>::min())) &&
3473 : fClampedRounded <= static_cast<double>(static_cast<Twork>(
3474 8440960 : 9223372036854774784LL)) &&
3475 4220470 : nNodataValueInt64 ==
3476 4220480 : static_cast<GInt64>(std::round(fClamped)))
3477 : {
3478 : // Do not use the nodata value
3479 14435 : return static_cast<Twork>(dfReplacementVal);
3480 : }
3481 : }
3482 : }
3483 7 : else if (dfNoDataValue == static_cast<double>(fClamped))
3484 : {
3485 : // Do not use the nodata value
3486 1 : return static_cast<Twork>(dfReplacementVal);
3487 : }
3488 4206050 : return fClamped;
3489 : };
3490 :
3491 : /* -------------------------------------------------------------------- */
3492 : /* Allocate work buffers. */
3493 : /* -------------------------------------------------------------------- */
3494 5093 : const int nDstXSize = nDstXOff2 - nDstXOff;
3495 5093 : Twork *pafWrkScanline = nullptr;
3496 5093 : if (dstDataType != eWrkDataType)
3497 : {
3498 : pafWrkScanline =
3499 4924 : static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
3500 4924 : if (pafWrkScanline == nullptr)
3501 0 : return CE_Failure;
3502 : }
3503 :
3504 5093 : const double dfXScale = 1.0 / dfXRatioDstToSrc;
3505 5093 : const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
3506 5093 : const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
3507 5093 : const double dfYScale = 1.0 / dfYRatioDstToSrc;
3508 5093 : const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
3509 5093 : const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
3510 :
3511 : // Temporary array to store result of horizontal filter.
3512 : double *const padfHorizontalFiltered = static_cast<double *>(
3513 5093 : VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
3514 :
3515 : // To store convolution coefficients.
3516 : double *const padfWeights =
3517 5093 : static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
3518 : static_cast<int>(
3519 : 2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) + 0.5) *
3520 : sizeof(double)));
3521 :
3522 5093 : GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
3523 5093 : if (pabyChunkNodataMask)
3524 : pabyChunkNodataMaskHorizontalFiltered =
3525 438 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
3526 5093 : if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
3527 438 : (pabyChunkNodataMask != nullptr &&
3528 : pabyChunkNodataMaskHorizontalFiltered == nullptr))
3529 : {
3530 0 : VSIFree(pafWrkScanline);
3531 0 : VSIFree(padfHorizontalFiltered);
3532 0 : VSIFreeAligned(padfWeights);
3533 0 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3534 0 : return CE_Failure;
3535 : }
3536 :
3537 : /* ==================================================================== */
3538 : /* First pass: horizontal filter */
3539 : /* ==================================================================== */
3540 5093 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
3541 : #ifdef USE_SSE2
3542 5093 : const bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
3543 : #endif
3544 3042748 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
3545 : {
3546 3037659 : const double dfSrcPixel =
3547 3037659 : (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
3548 3037659 : int nSrcPixelStart =
3549 3037659 : static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
3550 3037659 : if (nSrcPixelStart < nChunkXOff)
3551 57253 : nSrcPixelStart = nChunkXOff;
3552 3037659 : int nSrcPixelStop =
3553 3037659 : static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
3554 3037659 : if (nSrcPixelStop > nChunkRightXOff)
3555 57268 : nSrcPixelStop = nChunkRightXOff;
3556 : #if 0
3557 : if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
3558 : {
3559 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3560 : }
3561 : if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
3562 : {
3563 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3564 : }
3565 : #endif
3566 3037659 : const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
3567 3037659 : double dfWeightSum = 0.0;
3568 :
3569 : // Compute convolution coefficients.
3570 3037659 : int nSrcPixel = nSrcPixelStart;
3571 3037659 : double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
3572 4424858 : for (; nSrcPixel < nSrcPixelStop - 3; nSrcPixel += 4)
3573 : {
3574 1387200 : padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
3575 1387200 : dfX += dfXScaleWeight;
3576 1387200 : padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
3577 1387200 : dfX += dfXScaleWeight;
3578 1387200 : padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
3579 1387200 : dfX += dfXScaleWeight;
3580 1387200 : padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
3581 1387200 : dfX += dfXScaleWeight;
3582 1387200 : dfWeightSum +=
3583 1387200 : pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
3584 : }
3585 7028459 : for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
3586 : {
3587 3990800 : const double dfWeight = pfnFilterFunc(dfX);
3588 3990800 : padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3589 3990800 : dfWeightSum += dfWeight;
3590 : }
3591 :
3592 3037659 : const int nHeight = nChunkYSize * nBands;
3593 3037659 : if (pabyChunkNodataMask == nullptr)
3594 : {
3595 : // For floating-point data types, we must scale down a bit values
3596 : // if input values are close to +/- std::numeric_limits<T>::max()
3597 : #ifdef OLD_CPPCHECK
3598 : constexpr double mulFactor = 1;
3599 : #else
3600 2954736 : constexpr double mulFactor =
3601 : (bNeedRescale &&
3602 : (std::is_same_v<T, float> || std::is_same_v<T, double>))
3603 : ? 2
3604 : : 1;
3605 : #endif
3606 :
3607 2954736 : if (dfWeightSum != 0)
3608 : {
3609 2954736 : const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
3610 11886984 : for (int i = 0; i < nSrcPixelCount; ++i)
3611 : {
3612 8932253 : padfWeights[i] *= dfInvWeightSum;
3613 : }
3614 : }
3615 :
3616 178104060 : const auto ScaleValue = [
3617 : #ifdef _MSC_VER
3618 : mulFactor
3619 : #endif
3620 : ](double dfVal, [[maybe_unused]] const T *inputValues,
3621 : [[maybe_unused]] int nInputValues)
3622 : {
3623 178104000 : constexpr bool isFloat =
3624 : std::is_same_v<T, float> || std::is_same_v<T, double>;
3625 : if constexpr (isFloat)
3626 : {
3627 4070140 : if (std::isfinite(dfVal))
3628 : {
3629 : return std::clamp(dfVal,
3630 12204800 : -std::numeric_limits<double>::max() /
3631 : mulFactor,
3632 4068260 : std::numeric_limits<double>::max() /
3633 4068260 : mulFactor) *
3634 4068260 : mulFactor;
3635 : }
3636 : else if constexpr (bKernelWithNegativeWeights)
3637 : {
3638 936 : if (std::isnan(dfVal))
3639 : {
3640 : // Either one of the input value is NaN or they are +/-Inf
3641 936 : const bool isPositive = inputValues[0] >= 0;
3642 6008 : for (int i = 0; i < nInputValues; ++i)
3643 : {
3644 5384 : if (std::isnan(inputValues[i]))
3645 312 : return dfVal;
3646 : // cppcheck-suppress knownConditionTrueFalse
3647 5072 : if ((inputValues[i] >= 0) != isPositive)
3648 0 : return dfVal;
3649 : }
3650 : // All values are positive or negative infinity
3651 624 : return static_cast<double>(inputValues[0]);
3652 : }
3653 : }
3654 : }
3655 174035000 : return dfVal;
3656 : };
3657 :
3658 2954736 : int iSrcLineOff = 0;
3659 : #ifdef USE_SSE2
3660 2954736 : if (nSrcPixelCount == 4)
3661 : {
3662 15867185 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3663 : {
3664 15253386 : const size_t j =
3665 15253386 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3666 15253386 : (nSrcPixelStart - nChunkXOff);
3667 15253386 : double dfVal1 = 0.0;
3668 15253386 : double dfVal2 = 0.0;
3669 15253386 : double dfVal3 = 0.0;
3670 15253386 : GDALResampleConvolutionHorizontalPixelCount4_3rows(
3671 15253386 : pChunk + j, pChunk + j + nChunkXSize,
3672 15253386 : pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
3673 : dfVal2, dfVal3);
3674 30506746 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3675 15253386 : nDstXSize +
3676 15253386 : iDstPixel - nDstXOff] =
3677 15253386 : ScaleValue(dfVal1, pChunk + j, 4);
3678 30506746 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3679 15253386 : 1) *
3680 15253386 : nDstXSize +
3681 15253386 : iDstPixel - nDstXOff] =
3682 15253386 : ScaleValue(dfVal2, pChunk + j + nChunkXSize, 4);
3683 15253795 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3684 15253386 : 2) *
3685 15253386 : nDstXSize +
3686 15253386 : iDstPixel - nDstXOff] =
3687 15253386 : ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize, 4);
3688 : }
3689 : }
3690 2340929 : else if (bSrcPixelCountLess8)
3691 : {
3692 9927838 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3693 : {
3694 7859228 : const size_t j =
3695 7859228 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3696 7859228 : (nSrcPixelStart - nChunkXOff);
3697 7859228 : double dfVal1 = 0.0;
3698 7859228 : double dfVal2 = 0.0;
3699 7859228 : double dfVal3 = 0.0;
3700 7859228 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
3701 7859228 : pChunk + j, pChunk + j + nChunkXSize,
3702 7859228 : pChunk + j + 2 * nChunkXSize, padfWeights,
3703 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3704 15718416 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3705 7859228 : nDstXSize +
3706 7859228 : iDstPixel - nDstXOff] =
3707 7859228 : ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
3708 15718416 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3709 7859228 : 1) *
3710 7859228 : nDstXSize +
3711 7859228 : iDstPixel - nDstXOff] =
3712 7859228 : ScaleValue(dfVal2, pChunk + j + nChunkXSize,
3713 : nSrcPixelCount);
3714 7859316 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3715 7859228 : 2) *
3716 7859228 : nDstXSize +
3717 7859228 : iDstPixel - nDstXOff] =
3718 7859228 : ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
3719 : nSrcPixelCount);
3720 : }
3721 : }
3722 : else
3723 : #endif
3724 : {
3725 35466358 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3726 : {
3727 35194044 : const size_t j =
3728 35194044 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3729 35194044 : (nSrcPixelStart - nChunkXOff);
3730 35194044 : double dfVal1 = 0.0;
3731 35194044 : double dfVal2 = 0.0;
3732 35194044 : double dfVal3 = 0.0;
3733 35194044 : GDALResampleConvolutionHorizontal_3rows(
3734 35194044 : pChunk + j, pChunk + j + nChunkXSize,
3735 35194044 : pChunk + j + 2 * nChunkXSize, padfWeights,
3736 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3737 70388098 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3738 35194044 : nDstXSize +
3739 35194044 : iDstPixel - nDstXOff] =
3740 35194044 : ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
3741 70388098 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3742 35194044 : 1) *
3743 35194044 : nDstXSize +
3744 35194044 : iDstPixel - nDstXOff] =
3745 35194044 : ScaleValue(dfVal2, pChunk + j + nChunkXSize,
3746 : nSrcPixelCount);
3747 35259148 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3748 35194044 : 2) *
3749 35194044 : nDstXSize +
3750 35194044 : iDstPixel - nDstXOff] =
3751 35194044 : ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
3752 : nSrcPixelCount);
3753 : }
3754 : }
3755 6138566 : for (; iSrcLineOff < nHeight; ++iSrcLineOff)
3756 : {
3757 3183826 : const size_t j =
3758 3183826 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3759 3183826 : (nSrcPixelStart - nChunkXOff);
3760 3732986 : const double dfVal = GDALResampleConvolutionHorizontal(
3761 595200 : pChunk + j, padfWeights, nSrcPixelCount);
3762 3184275 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3763 3183826 : nDstXSize +
3764 3183826 : iDstPixel - nDstXOff] =
3765 3183826 : ScaleValue(dfVal, pChunk + j, nSrcPixelCount);
3766 : }
3767 : }
3768 : else
3769 : {
3770 19187467 : for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
3771 : {
3772 19104530 : const size_t j =
3773 19104530 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3774 19104530 : (nSrcPixelStart - nChunkXOff);
3775 :
3776 : if (bKernelWithNegativeWeights)
3777 : {
3778 18579412 : int nConsecutiveValid = 0;
3779 18579412 : int nMaxConsecutiveValid = 0;
3780 170140458 : for (int k = 0; k < nSrcPixelCount; k++)
3781 : {
3782 151560146 : if (pabyChunkNodataMask[j + k])
3783 43672053 : nConsecutiveValid++;
3784 107888793 : else if (nConsecutiveValid)
3785 : {
3786 107790 : nMaxConsecutiveValid = std::max(
3787 107790 : nMaxConsecutiveValid, nConsecutiveValid);
3788 107790 : nConsecutiveValid = 0;
3789 : }
3790 : }
3791 18579412 : nMaxConsecutiveValid =
3792 18579412 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
3793 18579412 : if (nMaxConsecutiveValid < nSrcPixelCount / 2)
3794 : {
3795 12651307 : const size_t nTempOffset =
3796 12651307 : static_cast<size_t>(iSrcLineOff) * nDstXSize +
3797 12651307 : iDstPixel - nDstXOff;
3798 12651307 : padfHorizontalFiltered[nTempOffset] = 0.0;
3799 12651307 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3800 12651307 : continue;
3801 : }
3802 : }
3803 :
3804 6453233 : double dfVal = 0.0;
3805 6453233 : GDALResampleConvolutionHorizontalWithMask(
3806 44639 : pChunk + j, pabyChunkNodataMask + j, padfWeights,
3807 : nSrcPixelCount, dfVal, dfWeightSum);
3808 6453233 : const size_t nTempOffset =
3809 6453233 : static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
3810 6453233 : nDstXOff;
3811 6453233 : if (dfWeightSum > 0.0)
3812 : {
3813 6408568 : padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
3814 6408568 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
3815 : }
3816 : else
3817 : {
3818 44663 : padfHorizontalFiltered[nTempOffset] = 0.0;
3819 44663 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3820 : }
3821 : }
3822 : }
3823 : }
3824 :
3825 : /* ==================================================================== */
3826 : /* Second pass: vertical filter */
3827 : /* ==================================================================== */
3828 5093 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3829 :
3830 394928 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3831 : {
3832 389835 : Twork *const pafDstScanline =
3833 : pafWrkScanline
3834 389835 : ? pafWrkScanline
3835 8797 : : static_cast<Twork *>(pDstBuffer) +
3836 8797 : static_cast<size_t>(iDstLine - nDstYOff) * nDstXSize;
3837 :
3838 389835 : const double dfSrcLine =
3839 389835 : (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
3840 389835 : int nSrcLineStart =
3841 389835 : static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
3842 389835 : int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
3843 389835 : if (nSrcLineStart < nChunkYOff)
3844 3388 : nSrcLineStart = nChunkYOff;
3845 389835 : if (nSrcLineStop > nChunkBottomYOff)
3846 3432 : nSrcLineStop = nChunkBottomYOff;
3847 : #if 0
3848 : if( nSrcLineStart < nChunkYOff &&
3849 : nChunkYOff > 0 )
3850 : {
3851 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3852 : }
3853 : if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3854 : {
3855 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3856 : }
3857 : #endif
3858 389835 : const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
3859 389835 : double dfWeightSum = 0.0;
3860 :
3861 : // Compute convolution coefficients.
3862 389835 : int nSrcLine = nSrcLineStart; // Used after for.
3863 389835 : double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
3864 998892 : for (; nSrcLine < nSrcLineStop - 3;
3865 609057 : nSrcLine += 4, dfY += 4 * dfYScaleWeight)
3866 : {
3867 609057 : padfWeights[nSrcLine - nSrcLineStart] = dfY;
3868 609057 : padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
3869 609057 : padfWeights[nSrcLine + 2 - nSrcLineStart] =
3870 609057 : dfY + 2 * dfYScaleWeight;
3871 609057 : padfWeights[nSrcLine + 3 - nSrcLineStart] =
3872 609057 : dfY + 3 * dfYScaleWeight;
3873 609057 : dfWeightSum +=
3874 609057 : pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
3875 : }
3876 427653 : for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
3877 : {
3878 37818 : const double dfWeight = pfnFilterFunc(dfY);
3879 37818 : padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
3880 37818 : dfWeightSum += dfWeight;
3881 : }
3882 :
3883 389835 : if (pabyChunkNodataMask == nullptr)
3884 : {
3885 : // For floating-point data types, we must scale down a bit values
3886 : // if input values are close to +/- std::numeric_limits<T>::max()
3887 : #ifdef OLD_CPPCHECK
3888 : constexpr double mulFactor = 1;
3889 : #else
3890 353911 : constexpr double mulFactor =
3891 : (bNeedRescale &&
3892 : (std::is_same_v<T, float> || std::is_same_v<T, double>))
3893 : ? 2
3894 : : 1;
3895 : #endif
3896 :
3897 353911 : if (dfWeightSum != 0)
3898 : {
3899 353911 : const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
3900 2579837 : for (int i = 0; i < nSrcLineCount; ++i)
3901 2225927 : padfWeights[i] *= dfInvWeightSum;
3902 : }
3903 :
3904 353911 : int iFilteredPixelOff = 0; // Used after for.
3905 : // j used after for.
3906 353911 : size_t j =
3907 353911 : (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
3908 : #ifdef USE_SSE2
3909 : if constexpr ((!bNeedRescale || !std::is_same_v<T, float>) &&
3910 : eWrkDataType == GDT_Float32)
3911 : {
3912 : #ifdef __AVX__
3913 : for (; iFilteredPixelOff < nDstXSize - 15;
3914 : iFilteredPixelOff += 16, j += 16)
3915 : {
3916 : GDALResampleConvolutionVertical_16cols(
3917 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3918 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3919 : if (bHasNoData)
3920 : {
3921 : for (int k = 0; k < 16; k++)
3922 : {
3923 : pafDstScanline[iFilteredPixelOff + k] =
3924 : replaceValIfNodata(
3925 : pafDstScanline[iFilteredPixelOff + k]);
3926 : }
3927 : }
3928 : }
3929 : #else
3930 25954967 : for (; iFilteredPixelOff < nDstXSize - 7;
3931 : iFilteredPixelOff += 8, j += 8)
3932 : {
3933 25609808 : GDALResampleConvolutionVertical_8cols(
3934 25609808 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3935 25609808 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3936 25609808 : if (bHasNoData)
3937 : {
3938 123192 : for (int k = 0; k < 8; k++)
3939 : {
3940 109504 : pafDstScanline[iFilteredPixelOff + k] =
3941 109504 : replaceValIfNodata(
3942 109504 : pafDstScanline[iFilteredPixelOff + k]);
3943 : }
3944 : }
3945 : }
3946 : #endif
3947 :
3948 809343 : for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
3949 : {
3950 464251 : const Twork fVal =
3951 464251 : static_cast<Twork>(GDALResampleConvolutionVertical(
3952 464251 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3953 : nSrcLineCount));
3954 464251 : pafDstScanline[iFilteredPixelOff] =
3955 464251 : replaceValIfNodata(fVal);
3956 : }
3957 : }
3958 : else
3959 : #endif
3960 : {
3961 5862642 : const auto ScaleValue = [
3962 : #ifdef _MSC_VER
3963 : mulFactor
3964 : #endif
3965 : ](double dfVal, [[maybe_unused]] const double *inputValues,
3966 : [[maybe_unused]] int nStride,
3967 : [[maybe_unused]] int nInputValues)
3968 : {
3969 5862640 : constexpr bool isFloat =
3970 : std::is_same_v<T, float> || std::is_same_v<T, double>;
3971 : if constexpr (isFloat)
3972 : {
3973 5862640 : if (std::isfinite(dfVal))
3974 : {
3975 : return std::clamp(
3976 : dfVal,
3977 : static_cast<double>(
3978 17585400 : -std::numeric_limits<Twork>::max()) /
3979 : mulFactor,
3980 : static_cast<double>(
3981 5861800 : std::numeric_limits<Twork>::max()) /
3982 5861800 : mulFactor) *
3983 5861800 : mulFactor;
3984 : }
3985 : else if constexpr (bKernelWithNegativeWeights)
3986 : {
3987 480 : if (std::isnan(dfVal))
3988 : {
3989 : // Either one of the input value is NaN or they are +/-Inf
3990 480 : const bool isPositive = inputValues[0] >= 0;
3991 2520 : for (int i = 0; i < nInputValues; ++i)
3992 : {
3993 2200 : if (std::isnan(inputValues[i * nStride]))
3994 160 : return dfVal;
3995 : // cppcheck-suppress knownConditionTrueFalse
3996 2040 : if ((inputValues[i] >= 0) != isPositive)
3997 0 : return dfVal;
3998 : }
3999 : // All values are positive or negative infinity
4000 320 : return inputValues[0];
4001 : }
4002 : }
4003 : }
4004 :
4005 360 : return dfVal;
4006 : };
4007 :
4008 2939422 : for (; iFilteredPixelOff < nDstXSize - 1;
4009 : iFilteredPixelOff += 2, j += 2)
4010 : {
4011 2930610 : double dfVal1 = 0.0;
4012 2930610 : double dfVal2 = 0.0;
4013 2930610 : GDALResampleConvolutionVertical_2cols(
4014 2930610 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4015 : nSrcLineCount, dfVal1, dfVal2);
4016 5861220 : pafDstScanline[iFilteredPixelOff] =
4017 2930610 : replaceValIfNodata(static_cast<Twork>(
4018 2930610 : ScaleValue(dfVal1, padfHorizontalFiltered + j,
4019 : nDstXSize, nSrcLineCount)));
4020 2930610 : pafDstScanline[iFilteredPixelOff + 1] =
4021 2930610 : replaceValIfNodata(static_cast<Twork>(
4022 2930610 : ScaleValue(dfVal2, padfHorizontalFiltered + j + 1,
4023 : nDstXSize, nSrcLineCount)));
4024 : }
4025 8819 : if (iFilteredPixelOff < nDstXSize)
4026 : {
4027 1427 : const double dfVal = GDALResampleConvolutionVertical(
4028 1427 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4029 : nSrcLineCount);
4030 1427 : pafDstScanline[iFilteredPixelOff] =
4031 1427 : replaceValIfNodata(static_cast<Twork>(
4032 1427 : ScaleValue(dfVal, padfHorizontalFiltered + j,
4033 : nDstXSize, nSrcLineCount)));
4034 : }
4035 : }
4036 : }
4037 : else
4038 : {
4039 18367351 : for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
4040 : ++iFilteredPixelOff)
4041 : {
4042 18331457 : double dfVal = 0.0;
4043 18331457 : dfWeightSum = 0.0;
4044 18331457 : size_t j = (nSrcLineStart - nChunkYOff) *
4045 18331457 : static_cast<size_t>(nDstXSize) +
4046 18331457 : iFilteredPixelOff;
4047 : if (bKernelWithNegativeWeights)
4048 : {
4049 18087901 : int nConsecutiveValid = 0;
4050 18087901 : int nMaxConsecutiveValid = 0;
4051 127256321 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
4052 : {
4053 109168020 : const double dfWeight =
4054 109168020 : padfWeights[i] *
4055 : pabyChunkNodataMaskHorizontalFiltered[j];
4056 109168020 : if (pabyChunkNodataMaskHorizontalFiltered[j])
4057 : {
4058 46108037 : nConsecutiveValid++;
4059 : }
4060 63060183 : else if (nConsecutiveValid)
4061 : {
4062 204376 : nMaxConsecutiveValid = std::max(
4063 204376 : nMaxConsecutiveValid, nConsecutiveValid);
4064 204376 : nConsecutiveValid = 0;
4065 : }
4066 109168020 : dfVal += padfHorizontalFiltered[j] * dfWeight;
4067 109168020 : dfWeightSum += dfWeight;
4068 : }
4069 18087901 : nMaxConsecutiveValid =
4070 18087901 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
4071 18087901 : if (nMaxConsecutiveValid < nSrcLineCount / 2)
4072 : {
4073 8918591 : pafDstScanline[iFilteredPixelOff] =
4074 8918499 : static_cast<Twork>(dfNoDataValue);
4075 8918591 : continue;
4076 : }
4077 : }
4078 : else
4079 : {
4080 1237062 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
4081 : {
4082 993504 : const double dfWeight =
4083 993504 : padfWeights[i] *
4084 : pabyChunkNodataMaskHorizontalFiltered[j];
4085 993504 : dfVal += padfHorizontalFiltered[j] * dfWeight;
4086 993504 : dfWeightSum += dfWeight;
4087 : }
4088 : }
4089 9412886 : if (dfWeightSum > 0.0)
4090 : {
4091 9396847 : pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
4092 9396835 : static_cast<Twork>(dfVal / dfWeightSum));
4093 : }
4094 : else
4095 : {
4096 16045 : pafDstScanline[iFilteredPixelOff] =
4097 16021 : static_cast<Twork>(dfNoDataValue);
4098 : }
4099 : }
4100 : }
4101 :
4102 389835 : if (fMaxVal != 0.0f)
4103 : {
4104 : if constexpr (std::is_same_v<T, double>)
4105 : {
4106 0 : for (int i = 0; i < nDstXSize; ++i)
4107 : {
4108 0 : if (pafDstScanline[i] > static_cast<double>(fMaxVal))
4109 0 : pafDstScanline[i] = static_cast<double>(fMaxVal);
4110 : }
4111 : }
4112 : else
4113 : {
4114 192324 : for (int i = 0; i < nDstXSize; ++i)
4115 : {
4116 192088 : if (pafDstScanline[i] > fMaxVal)
4117 96022 : pafDstScanline[i] = fMaxVal;
4118 : }
4119 : }
4120 : }
4121 :
4122 389835 : if (pafWrkScanline)
4123 : {
4124 381038 : GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
4125 : static_cast<GByte *>(pDstBuffer) +
4126 381038 : static_cast<size_t>(iDstLine - nDstYOff) *
4127 381038 : nDstXSize * nDstDataTypeSize,
4128 : dstDataType, nDstDataTypeSize, nDstXSize);
4129 : }
4130 : }
4131 :
4132 5093 : VSIFree(pafWrkScanline);
4133 5093 : VSIFreeAligned(padfWeights);
4134 5093 : VSIFree(padfHorizontalFiltered);
4135 5093 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
4136 :
4137 5093 : return CE_None;
4138 : }
4139 :
4140 : template <bool bKernelWithNegativeWeights, bool bNeedRescale>
4141 : static CPLErr
4142 5093 : GDALResampleChunk_ConvolutionInternal(const GDALOverviewResampleArgs &args,
4143 : const void *pChunk, void **ppDstBuffer,
4144 : GDALDataType *peDstBufferDataType)
4145 : {
4146 : GDALResampleAlg eResample;
4147 5093 : if (EQUAL(args.pszResampling, "BILINEAR"))
4148 2660 : eResample = GRA_Bilinear;
4149 2433 : else if (EQUAL(args.pszResampling, "CUBIC"))
4150 2284 : eResample = GRA_Cubic;
4151 149 : else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
4152 59 : eResample = GRA_CubicSpline;
4153 90 : else if (EQUAL(args.pszResampling, "LANCZOS"))
4154 90 : eResample = GRA_Lanczos;
4155 : else
4156 : {
4157 0 : CPLAssert(false);
4158 : return CE_Failure;
4159 : }
4160 5093 : const int nKernelRadius = GWKGetFilterRadius(eResample);
4161 5093 : FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
4162 : const FilterFunc4ValuesType pfnFilterFunc4Values =
4163 5093 : GWKGetFilterFunc4Values(eResample);
4164 :
4165 5093 : float fMaxVal = 0.f;
4166 : // Cubic, etc... can have overshoots, so make sure we clamp values to the
4167 : // maximum value if NBITS is set.
4168 5093 : if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
4169 8 : (args.eOvrDataType == GDT_UInt8 || args.eOvrDataType == GDT_UInt16 ||
4170 0 : args.eOvrDataType == GDT_UInt32))
4171 : {
4172 8 : int nBits = args.nOvrNBITS;
4173 8 : if (nBits == GDALGetDataTypeSizeBits(args.eOvrDataType))
4174 1 : nBits = 0;
4175 8 : if (nBits > 0 && nBits < 32)
4176 7 : fMaxVal = static_cast<float>((1U << nBits) - 1);
4177 : }
4178 :
4179 5093 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
4180 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
4181 : GDALGetDataTypeSizeBytes(args.eOvrDataType));
4182 5093 : if (*ppDstBuffer == nullptr)
4183 : {
4184 0 : return CE_Failure;
4185 : }
4186 5093 : *peDstBufferDataType = args.eOvrDataType;
4187 :
4188 5093 : switch (args.eWrkDataType)
4189 : {
4190 4225 : case GDT_UInt8:
4191 : {
4192 : return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32,
4193 : bKernelWithNegativeWeights,
4194 4225 : bNeedRescale>(
4195 : args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
4196 4225 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4197 : }
4198 :
4199 402 : case GDT_UInt16:
4200 : {
4201 : return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32,
4202 : bKernelWithNegativeWeights,
4203 402 : bNeedRescale>(
4204 : args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
4205 402 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4206 : }
4207 :
4208 375 : case GDT_Float32:
4209 : {
4210 : return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32,
4211 : bKernelWithNegativeWeights,
4212 375 : bNeedRescale>(
4213 : args, static_cast<const float *>(pChunk), *ppDstBuffer,
4214 375 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4215 : }
4216 :
4217 91 : case GDT_Float64:
4218 : {
4219 : return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64,
4220 : bKernelWithNegativeWeights,
4221 91 : bNeedRescale>(
4222 : args, static_cast<const double *>(pChunk), *ppDstBuffer,
4223 91 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4224 : }
4225 :
4226 0 : default:
4227 0 : break;
4228 : }
4229 :
4230 0 : CPLAssert(false);
4231 : return CE_Failure;
4232 : }
4233 :
4234 : static CPLErr
4235 5093 : GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
4236 : const void *pChunk, void **ppDstBuffer,
4237 : GDALDataType *peDstBufferDataType)
4238 : {
4239 5093 : if (EQUAL(args.pszResampling, "CUBIC") ||
4240 2809 : EQUAL(args.pszResampling, "LANCZOS"))
4241 : return GDALResampleChunk_ConvolutionInternal<
4242 2374 : /* bKernelWithNegativeWeights=*/true, /* bNeedRescale = */ true>(
4243 2374 : args, pChunk, ppDstBuffer, peDstBufferDataType);
4244 2719 : else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
4245 59 : return GDALResampleChunk_ConvolutionInternal<false, true>(
4246 59 : args, pChunk, ppDstBuffer, peDstBufferDataType);
4247 : else
4248 2660 : return GDALResampleChunk_ConvolutionInternal<false, false>(
4249 2660 : args, pChunk, ppDstBuffer, peDstBufferDataType);
4250 : }
4251 :
4252 : /************************************************************************/
4253 : /* GDALResampleChunkC32R() */
4254 : /************************************************************************/
4255 :
4256 2 : static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
4257 : const float *pafChunk, const int nChunkYOff,
4258 : const int nChunkYSize, const int nDstYOff,
4259 : const int nDstYOff2, const int nOvrXSize,
4260 : const int nOvrYSize, void **ppDstBuffer,
4261 : GDALDataType *peDstBufferDataType,
4262 : const char *pszResampling)
4263 :
4264 : {
4265 : enum Method
4266 : {
4267 : NEAR,
4268 : AVERAGE,
4269 : AVERAGE_MAGPHASE,
4270 : RMS,
4271 : };
4272 :
4273 2 : Method eMethod = NEAR;
4274 2 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
4275 : {
4276 0 : eMethod = NEAR;
4277 : }
4278 2 : else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
4279 : {
4280 0 : eMethod = AVERAGE_MAGPHASE;
4281 : }
4282 2 : else if (EQUAL(pszResampling, "RMS"))
4283 : {
4284 2 : eMethod = RMS;
4285 : }
4286 0 : else if (STARTS_WITH_CI(pszResampling, "AVER"))
4287 : {
4288 0 : eMethod = AVERAGE;
4289 : }
4290 : else
4291 : {
4292 0 : CPLError(
4293 : CE_Failure, CPLE_NotSupported,
4294 : "Resampling method %s is not supported for complex data types. "
4295 : "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
4296 : pszResampling);
4297 0 : return CE_Failure;
4298 : }
4299 :
4300 2 : const int nOXSize = nOvrXSize;
4301 2 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
4302 : GDALGetDataTypeSizeBytes(GDT_CFloat32));
4303 2 : if (*ppDstBuffer == nullptr)
4304 : {
4305 0 : return CE_Failure;
4306 : }
4307 2 : float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
4308 2 : *peDstBufferDataType = GDT_CFloat32;
4309 :
4310 2 : const int nOYSize = nOvrYSize;
4311 2 : const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
4312 2 : const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
4313 :
4314 : /* ==================================================================== */
4315 : /* Loop over destination scanlines. */
4316 : /* ==================================================================== */
4317 8 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
4318 : {
4319 6 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
4320 6 : if (nSrcYOff < nChunkYOff)
4321 0 : nSrcYOff = nChunkYOff;
4322 :
4323 6 : int nSrcYOff2 =
4324 6 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
4325 6 : if (nSrcYOff2 == nSrcYOff)
4326 0 : nSrcYOff2++;
4327 :
4328 6 : if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
4329 : {
4330 2 : if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
4331 0 : nSrcYOff = nSrcHeight - 1;
4332 2 : nSrcYOff2 = nSrcHeight;
4333 : }
4334 6 : if (nSrcYOff2 > nChunkYOff + nChunkYSize)
4335 0 : nSrcYOff2 = nChunkYOff + nChunkYSize;
4336 :
4337 6 : const float *const pafSrcScanline =
4338 6 : pafChunk +
4339 6 : (static_cast<size_t>(nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
4340 6 : float *const pafDstScanline =
4341 6 : pafDstBuffer +
4342 6 : static_cast<size_t>(iDstLine - nDstYOff) * 2 * nOXSize;
4343 :
4344 : /* --------------------------------------------------------------------
4345 : */
4346 : /* Loop over destination pixels */
4347 : /* --------------------------------------------------------------------
4348 : */
4349 18 : for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
4350 : {
4351 12 : const size_t iDstPixelSZ = static_cast<size_t>(iDstPixel);
4352 12 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
4353 12 : int nSrcXOff2 =
4354 12 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
4355 12 : if (nSrcXOff2 == nSrcXOff)
4356 0 : nSrcXOff2++;
4357 12 : if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
4358 : {
4359 6 : if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
4360 0 : nSrcXOff = nSrcWidth - 1;
4361 6 : nSrcXOff2 = nSrcWidth;
4362 : }
4363 12 : const size_t nSrcXOffSZ = static_cast<size_t>(nSrcXOff);
4364 :
4365 12 : if (eMethod == NEAR)
4366 : {
4367 0 : pafDstScanline[iDstPixelSZ * 2] =
4368 0 : pafSrcScanline[nSrcXOffSZ * 2];
4369 0 : pafDstScanline[iDstPixelSZ * 2 + 1] =
4370 0 : pafSrcScanline[nSrcXOffSZ * 2 + 1];
4371 : }
4372 12 : else if (eMethod == AVERAGE_MAGPHASE)
4373 : {
4374 0 : double dfTotalR = 0.0;
4375 0 : double dfTotalI = 0.0;
4376 0 : double dfTotalM = 0.0;
4377 0 : size_t nCount = 0;
4378 :
4379 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4380 : {
4381 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4382 : {
4383 0 : const double dfR = double(
4384 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4385 0 : static_cast<size_t>(iY - nSrcYOff) *
4386 0 : nSrcWidth * 2]);
4387 0 : const double dfI = double(
4388 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4389 0 : static_cast<size_t>(iY - nSrcYOff) *
4390 0 : nSrcWidth * 2 +
4391 0 : 1]);
4392 0 : dfTotalR += dfR;
4393 0 : dfTotalI += dfI;
4394 0 : dfTotalM += std::hypot(dfR, dfI);
4395 0 : ++nCount;
4396 : }
4397 : }
4398 :
4399 0 : CPLAssert(nCount > 0);
4400 0 : if (nCount == 0)
4401 : {
4402 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4403 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4404 : }
4405 : else
4406 : {
4407 0 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4408 0 : dfTotalR / static_cast<double>(nCount));
4409 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4410 0 : dfTotalI / static_cast<double>(nCount));
4411 : const double dfM =
4412 0 : double(std::hypot(pafDstScanline[iDstPixelSZ * 2],
4413 0 : pafDstScanline[iDstPixelSZ * 2 + 1]));
4414 0 : const double dfDesiredM =
4415 0 : dfTotalM / static_cast<double>(nCount);
4416 0 : double dfRatio = 1.0;
4417 0 : if (dfM != 0.0)
4418 0 : dfRatio = dfDesiredM / dfM;
4419 :
4420 0 : pafDstScanline[iDstPixelSZ * 2] *=
4421 0 : static_cast<float>(dfRatio);
4422 0 : pafDstScanline[iDstPixelSZ * 2 + 1] *=
4423 0 : static_cast<float>(dfRatio);
4424 : }
4425 : }
4426 12 : else if (eMethod == RMS)
4427 : {
4428 12 : double dfTotalR = 0.0;
4429 12 : double dfTotalI = 0.0;
4430 12 : size_t nCount = 0;
4431 :
4432 36 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4433 : {
4434 72 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4435 : {
4436 48 : const double dfR = double(
4437 48 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4438 48 : static_cast<size_t>(iY - nSrcYOff) *
4439 48 : nSrcWidth * 2]);
4440 48 : const double dfI = double(
4441 48 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4442 48 : static_cast<size_t>(iY - nSrcYOff) *
4443 48 : nSrcWidth * 2 +
4444 48 : 1]);
4445 :
4446 48 : dfTotalR += SQUARE(dfR);
4447 48 : dfTotalI += SQUARE(dfI);
4448 :
4449 48 : ++nCount;
4450 : }
4451 : }
4452 :
4453 12 : CPLAssert(nCount > 0);
4454 12 : if (nCount == 0)
4455 : {
4456 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4457 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4458 : }
4459 : else
4460 : {
4461 : /* compute RMS */
4462 12 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4463 12 : sqrt(dfTotalR / static_cast<double>(nCount)));
4464 12 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4465 12 : sqrt(dfTotalI / static_cast<double>(nCount)));
4466 : }
4467 : }
4468 0 : else if (eMethod == AVERAGE)
4469 : {
4470 0 : double dfTotalR = 0.0;
4471 0 : double dfTotalI = 0.0;
4472 0 : size_t nCount = 0;
4473 :
4474 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4475 : {
4476 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4477 : {
4478 : // TODO(schwehr): Maybe use std::complex?
4479 0 : dfTotalR += double(
4480 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4481 0 : static_cast<size_t>(iY - nSrcYOff) *
4482 0 : nSrcWidth * 2]);
4483 0 : dfTotalI += double(
4484 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4485 0 : static_cast<size_t>(iY - nSrcYOff) *
4486 0 : nSrcWidth * 2 +
4487 0 : 1]);
4488 0 : ++nCount;
4489 : }
4490 : }
4491 :
4492 0 : CPLAssert(nCount > 0);
4493 0 : if (nCount == 0)
4494 : {
4495 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4496 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4497 : }
4498 : else
4499 : {
4500 0 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4501 0 : dfTotalR / static_cast<double>(nCount));
4502 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4503 0 : dfTotalI / static_cast<double>(nCount));
4504 : }
4505 : }
4506 : }
4507 : }
4508 :
4509 2 : return CE_None;
4510 : }
4511 :
4512 : /************************************************************************/
4513 : /* GDALRegenerateCascadingOverviews() */
4514 : /* */
4515 : /* Generate a list of overviews in order from largest to */
4516 : /* smallest, computing each from the next larger. */
4517 : /************************************************************************/
4518 :
4519 44 : static CPLErr GDALRegenerateCascadingOverviews(
4520 : GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
4521 : const char *pszResampling, GDALProgressFunc pfnProgress,
4522 : void *pProgressData, CSLConstList papszOptions)
4523 :
4524 : {
4525 : /* -------------------------------------------------------------------- */
4526 : /* First, we must put the overviews in order from largest to */
4527 : /* smallest. */
4528 : /* -------------------------------------------------------------------- */
4529 127 : for (int i = 0; i < nOverviews - 1; ++i)
4530 : {
4531 292 : for (int j = 0; j < nOverviews - i - 1; ++j)
4532 : {
4533 209 : if (papoOvrBands[j]->GetXSize() *
4534 209 : static_cast<float>(papoOvrBands[j]->GetYSize()) <
4535 209 : papoOvrBands[j + 1]->GetXSize() *
4536 209 : static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
4537 : {
4538 0 : GDALRasterBand *poTempBand = papoOvrBands[j];
4539 0 : papoOvrBands[j] = papoOvrBands[j + 1];
4540 0 : papoOvrBands[j + 1] = poTempBand;
4541 : }
4542 : }
4543 : }
4544 :
4545 : /* -------------------------------------------------------------------- */
4546 : /* Count total pixels so we can prepare appropriate scaled */
4547 : /* progress functions. */
4548 : /* -------------------------------------------------------------------- */
4549 44 : double dfTotalPixels = 0.0;
4550 :
4551 171 : for (int i = 0; i < nOverviews; ++i)
4552 : {
4553 127 : dfTotalPixels += papoOvrBands[i]->GetXSize() *
4554 127 : static_cast<double>(papoOvrBands[i]->GetYSize());
4555 : }
4556 :
4557 : /* -------------------------------------------------------------------- */
4558 : /* Generate all the bands. */
4559 : /* -------------------------------------------------------------------- */
4560 44 : double dfPixelsProcessed = 0.0;
4561 :
4562 88 : CPLStringList aosOptions(papszOptions);
4563 44 : aosOptions.SetNameValue("CASCADING", "YES");
4564 171 : for (int i = 0; i < nOverviews; ++i)
4565 : {
4566 127 : GDALRasterBand *poBaseBand = poSrcBand;
4567 127 : if (i != 0)
4568 83 : poBaseBand = papoOvrBands[i - 1];
4569 :
4570 127 : double dfPixels = papoOvrBands[i]->GetXSize() *
4571 127 : static_cast<double>(papoOvrBands[i]->GetYSize());
4572 :
4573 254 : void *pScaledProgressData = GDALCreateScaledProgress(
4574 : dfPixelsProcessed / dfTotalPixels,
4575 127 : (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
4576 : pProgressData);
4577 :
4578 254 : const CPLErr eErr = GDALRegenerateOverviewsEx(
4579 : poBaseBand, 1,
4580 127 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
4581 : pszResampling, GDALScaledProgress, pScaledProgressData,
4582 127 : aosOptions.List());
4583 127 : GDALDestroyScaledProgress(pScaledProgressData);
4584 :
4585 127 : if (eErr != CE_None)
4586 0 : return eErr;
4587 :
4588 127 : dfPixelsProcessed += dfPixels;
4589 :
4590 : // Only do the bit2grayscale promotion on the base band.
4591 127 : if (STARTS_WITH_CI(pszResampling,
4592 : "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
4593 8 : pszResampling = "AVERAGE";
4594 : }
4595 :
4596 44 : return CE_None;
4597 : }
4598 :
4599 : /************************************************************************/
4600 : /* GDALGetResampleFunction() */
4601 : /************************************************************************/
4602 :
4603 5467 : GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
4604 : int *pnRadius)
4605 : {
4606 5467 : if (pnRadius)
4607 5467 : *pnRadius = 0;
4608 5467 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
4609 519 : return GDALResampleChunk_Near;
4610 4948 : else if (STARTS_WITH_CI(pszResampling, "AVER") ||
4611 4373 : EQUAL(pszResampling, "RMS"))
4612 634 : return GDALResampleChunk_AverageOrRMS;
4613 4314 : else if (EQUAL(pszResampling, "GAUSS"))
4614 : {
4615 26 : if (pnRadius)
4616 26 : *pnRadius = 1;
4617 26 : return GDALResampleChunk_Gauss;
4618 : }
4619 4288 : else if (EQUAL(pszResampling, "MODE"))
4620 136 : return GDALResampleChunk_Mode;
4621 4152 : else if (EQUAL(pszResampling, "CUBIC"))
4622 : {
4623 1639 : if (pnRadius)
4624 1639 : *pnRadius = GWKGetFilterRadius(GRA_Cubic);
4625 1639 : return GDALResampleChunk_Convolution;
4626 : }
4627 2513 : else if (EQUAL(pszResampling, "CUBICSPLINE"))
4628 : {
4629 39 : if (pnRadius)
4630 39 : *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
4631 39 : return GDALResampleChunk_Convolution;
4632 : }
4633 2474 : else if (EQUAL(pszResampling, "LANCZOS"))
4634 : {
4635 44 : if (pnRadius)
4636 44 : *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
4637 44 : return GDALResampleChunk_Convolution;
4638 : }
4639 2430 : else if (EQUAL(pszResampling, "BILINEAR"))
4640 : {
4641 2430 : if (pnRadius)
4642 2430 : *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
4643 2430 : return GDALResampleChunk_Convolution;
4644 : }
4645 : else
4646 : {
4647 0 : CPLError(
4648 : CE_Failure, CPLE_AppDefined,
4649 : "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
4650 : pszResampling);
4651 0 : return nullptr;
4652 : }
4653 : }
4654 :
4655 : /************************************************************************/
4656 : /* GDALGetOvrWorkDataType() */
4657 : /************************************************************************/
4658 :
4659 5349 : GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
4660 : GDALDataType eSrcDataType)
4661 : {
4662 5349 : if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
4663 : {
4664 647 : return eSrcDataType;
4665 : }
4666 4702 : else if (eSrcDataType == GDT_UInt8 &&
4667 4167 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4668 3682 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4669 2279 : EQUAL(pszResampling, "CUBICSPLINE") ||
4670 2274 : EQUAL(pszResampling, "LANCZOS") ||
4671 2267 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4672 : {
4673 4160 : return GDT_UInt8;
4674 : }
4675 542 : else if (eSrcDataType == GDT_UInt16 &&
4676 131 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4677 126 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4678 8 : EQUAL(pszResampling, "CUBICSPLINE") ||
4679 6 : EQUAL(pszResampling, "LANCZOS") ||
4680 3 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4681 : {
4682 131 : return GDT_UInt16;
4683 : }
4684 411 : else if (EQUAL(pszResampling, "GAUSS"))
4685 20 : return GDT_Float64;
4686 :
4687 391 : if (eSrcDataType == GDT_UInt8 || eSrcDataType == GDT_Int8 ||
4688 390 : eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
4689 : eSrcDataType == GDT_Float32)
4690 : {
4691 257 : return GDT_Float32;
4692 : }
4693 134 : return GDT_Float64;
4694 : }
4695 :
4696 : namespace
4697 : {
4698 : // Structure to hold a pointer to free with CPLFree()
4699 : struct PointerHolder
4700 : {
4701 : void *ptr = nullptr;
4702 :
4703 4178 : template <class T> explicit PointerHolder(T *&ptrIn) : ptr(ptrIn)
4704 : {
4705 4178 : ptrIn = nullptr;
4706 4178 : }
4707 :
4708 : template <class T>
4709 32 : explicit PointerHolder(std::unique_ptr<T, VSIFreeReleaser> ptrIn)
4710 32 : : ptr(ptrIn.release())
4711 : {
4712 32 : }
4713 :
4714 4210 : ~PointerHolder()
4715 4210 : {
4716 4210 : CPLFree(ptr);
4717 4210 : }
4718 :
4719 : PointerHolder(const PointerHolder &) = delete;
4720 : PointerHolder &operator=(const PointerHolder &) = delete;
4721 : };
4722 : } // namespace
4723 :
4724 : /************************************************************************/
4725 : /* GDALRegenerateOverviews() */
4726 : /************************************************************************/
4727 :
4728 : /**
4729 : * \brief Generate downsampled overviews.
4730 : *
4731 : * This function will generate one or more overview images from a base image
4732 : * using the requested downsampling algorithm. Its primary use is for
4733 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4734 : * used to generate downsampled images in one file from another outside the
4735 : * overview architecture.
4736 : *
4737 : * The output bands need to exist in advance.
4738 : *
4739 : * The full set of resampling algorithms is documented in
4740 : * GDALDataset::BuildOverviews().
4741 : *
4742 : * This function will honour properly NODATA_VALUES tuples (special dataset
4743 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4744 : * considered as the nodata value and not each value of the triplet
4745 : * independently per band.
4746 : *
4747 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4748 : * to "ALL_CPUS" or a integer value to specify the number of threads to use for
4749 : * overview computation.
4750 : *
4751 : * @param hSrcBand the source (base level) band.
4752 : * @param nOverviewCount the number of downsampled bands being generated.
4753 : * @param pahOvrBands the list of downsampled bands to be generated.
4754 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4755 : * @param pfnProgress progress report function.
4756 : * @param pProgressData progress function callback data.
4757 : * @return CE_None on success or CE_Failure on failure.
4758 : */
4759 250 : CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
4760 : GDALRasterBandH *pahOvrBands,
4761 : const char *pszResampling,
4762 : GDALProgressFunc pfnProgress,
4763 : void *pProgressData)
4764 :
4765 : {
4766 250 : return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
4767 : pszResampling, pfnProgress, pProgressData,
4768 250 : nullptr);
4769 : }
4770 :
4771 : /************************************************************************/
4772 : /* GDALRegenerateOverviewsEx() */
4773 : /************************************************************************/
4774 :
// Multiplier turning a radius into a diameter. Used when sizing the source
// chunk height from the overview factor and the resampling kernel radius.
constexpr int RADIUS_TO_DIAMETER = 2;
4776 :
4777 : /**
4778 : * \brief Generate downsampled overviews.
4779 : *
4780 : * This function will generate one or more overview images from a base image
4781 : * using the requested downsampling algorithm. Its primary use is for
4782 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4783 : * used to generate downsampled images in one file from another outside the
4784 : * overview architecture.
4785 : *
4786 : * The output bands need to exist in advance.
4787 : *
4788 : * The full set of resampling algorithms is documented in
4789 : * GDALDataset::BuildOverviews().
4790 : *
4791 : * This function will honour properly NODATA_VALUES tuples (special dataset
4792 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4793 : * considered as the nodata value and not each value of the triplet
4794 : * independently per band.
4795 : *
4796 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4797 : * to "ALL_CPUS" or a integer value to specify the number of threads to use for
4798 : * overview computation.
4799 : *
4800 : * @param hSrcBand the source (base level) band.
4801 : * @param nOverviewCount the number of downsampled bands being generated.
4802 : * @param pahOvrBands the list of downsampled bands to be generated.
4803 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4804 : * @param pfnProgress progress report function.
4805 : * @param pProgressData progress function callback data.
4806 : * @param papszOptions NULL terminated list of options as key=value pairs, or
4807 : * NULL
4808 : * @return CE_None on success or CE_Failure on failure.
4809 : * @since GDAL 3.6
4810 : */
4811 915 : CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
4812 : GDALRasterBandH *pahOvrBands,
4813 : const char *pszResampling,
4814 : GDALProgressFunc pfnProgress,
4815 : void *pProgressData, CSLConstList papszOptions)
4816 :
4817 : {
4818 915 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
4819 915 : GDALRasterBand **papoOvrBands =
4820 : reinterpret_cast<GDALRasterBand **>(pahOvrBands);
4821 :
4822 915 : if (pfnProgress == nullptr)
4823 252 : pfnProgress = GDALDummyProgress;
4824 :
4825 915 : if (EQUAL(pszResampling, "NONE"))
4826 49 : return CE_None;
4827 :
4828 866 : int nKernelRadius = 0;
4829 : GDALResampleFunction pfnResampleFn =
4830 866 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
4831 :
4832 866 : if (pfnResampleFn == nullptr)
4833 0 : return CE_Failure;
4834 :
4835 : /* -------------------------------------------------------------------- */
4836 : /* Check color tables... */
4837 : /* -------------------------------------------------------------------- */
4838 866 : GDALColorTable *poColorTable = nullptr;
4839 :
4840 495 : if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
4841 1810 : EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
4842 460 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4843 : {
4844 9 : poColorTable = poSrcBand->GetColorTable();
4845 9 : if (poColorTable != nullptr)
4846 : {
4847 9 : if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
4848 : {
4849 0 : CPLError(CE_Warning, CPLE_AppDefined,
4850 : "Computing overviews on palette index raster bands "
4851 : "with a palette whose color interpretation is not RGB "
4852 : "will probably lead to unexpected results.");
4853 0 : poColorTable = nullptr;
4854 : }
4855 9 : else if (poColorTable->IsIdentity())
4856 : {
4857 0 : poColorTable = nullptr;
4858 : }
4859 : }
4860 : else
4861 : {
4862 0 : CPLError(CE_Warning, CPLE_AppDefined,
4863 : "Computing overviews on palette index raster bands "
4864 : "without a palette will probably lead to unexpected "
4865 : "results.");
4866 : }
4867 : }
4868 : // Not ready yet
4869 2517 : else if ((EQUAL(pszResampling, "CUBIC") ||
4870 803 : EQUAL(pszResampling, "CUBICSPLINE") ||
4871 803 : EQUAL(pszResampling, "LANCZOS") ||
4872 1740 : EQUAL(pszResampling, "BILINEAR")) &&
4873 80 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4874 : {
4875 0 : CPLError(CE_Warning, CPLE_AppDefined,
4876 : "Computing %s overviews on palette index raster bands "
4877 : "will probably lead to unexpected results.",
4878 : pszResampling);
4879 : }
4880 :
4881 : // If we have a nodata mask and we are doing something more complicated
4882 : // than nearest neighbouring, we have to fetch to nodata mask.
4883 :
4884 866 : GDALRasterBand *poMaskBand = nullptr;
4885 866 : bool bUseNoDataMask = false;
4886 866 : bool bCanUseCascaded = true;
4887 :
4888 866 : if (!STARTS_WITH_CI(pszResampling, "NEAR"))
4889 : {
4890 : // Special case if we are an alpha/mask band. We want it to be
4891 : // considered as the mask band to avoid alpha=0 to be taken into account
4892 : // in average computation.
4893 540 : if (poSrcBand->IsMaskBand())
4894 : {
4895 93 : poMaskBand = poSrcBand;
4896 93 : bUseNoDataMask = true;
4897 : }
4898 : else
4899 : {
4900 447 : poMaskBand = poSrcBand->GetMaskBand();
4901 447 : const int nMaskFlags = poSrcBand->GetMaskFlags();
4902 447 : bCanUseCascaded =
4903 447 : (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
4904 447 : bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
4905 : }
4906 : }
4907 :
4908 866 : int nHasNoData = 0;
4909 866 : const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
4910 866 : const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
4911 : const bool bPropagateNoData =
4912 866 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
4913 :
4914 974 : if (poSrcBand->GetBand() == 1 && bUseNoDataMask &&
4915 108 : CSLFetchNameValue(papszOptions, "CASCADING") == nullptr)
4916 : {
4917 192 : std::string osDetailMessage;
4918 96 : if (poSrcBand->HasConflictingMaskSources(&osDetailMessage, false))
4919 : {
4920 2 : CPLError(
4921 : CE_Warning, CPLE_AppDefined, "%s%s", osDetailMessage.c_str(),
4922 : bHasNoData
4923 : ? "Only the nodata value will be taken into account."
4924 : : "Only the first listed one will be taken into account.");
4925 : }
4926 : }
4927 :
4928 : /* -------------------------------------------------------------------- */
4929 : /* If we are operating on multiple overviews, and using */
4930 : /* averaging, lets do them in cascading order to reduce the */
4931 : /* amount of computation. */
4932 : /* -------------------------------------------------------------------- */
4933 :
4934 : // In case the mask made be computed from another band of the dataset,
4935 : // we can't use cascaded generation, as the computation of the overviews
4936 : // of the band used for the mask band may not have yet occurred (#3033).
4937 866 : if ((STARTS_WITH_CI(pszResampling, "AVER") ||
4938 495 : EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
4939 464 : EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
4940 410 : EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
4941 866 : EQUAL(pszResampling, "MODE")) &&
4942 44 : nOverviewCount > 1 && bCanUseCascaded)
4943 44 : return GDALRegenerateCascadingOverviews(
4944 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
4945 44 : pProgressData, papszOptions);
4946 :
4947 : /* -------------------------------------------------------------------- */
4948 : /* Setup one horizontal swath to read from the raw buffer. */
4949 : /* -------------------------------------------------------------------- */
4950 822 : int nFRXBlockSize = 0;
4951 822 : int nFRYBlockSize = 0;
4952 822 : poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
4953 :
4954 822 : const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
4955 1318 : const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
4956 1268 : EQUAL(pszResampling, "MODE") ||
4957 446 : !GDALDataTypeIsComplex(eSrcDataType);
4958 : const GDALDataType eWrkDataType =
4959 : bUseGenericResampleFn
4960 822 : ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
4961 822 : : GDT_CFloat32;
4962 :
4963 822 : const int nWidth = poSrcBand->GetXSize();
4964 822 : const int nHeight = poSrcBand->GetYSize();
4965 :
4966 822 : int nMaxOvrFactor = 1;
4967 1763 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
4968 : {
4969 941 : const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
4970 941 : const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
4971 941 : nMaxOvrFactor = std::max(
4972 : nMaxOvrFactor,
4973 941 : static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
4974 941 : nMaxOvrFactor = std::max(
4975 : nMaxOvrFactor,
4976 941 : static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
4977 : }
4978 :
4979 822 : int nFullResYChunk = nFRYBlockSize;
4980 822 : int nMaxChunkYSizeQueried = 0;
4981 :
4982 : const auto UpdateChunkHeightAndGetChunkSize =
4983 10802 : [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
4984 87377 : eWrkDataType, nWidth]()
4985 : {
4986 : // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
4987 : // + nFullResYChunk) / nMaxOvrFactor)
4988 10802 : if (nMaxOvrFactor > INT_MAX / RADIUS_TO_DIAMETER)
4989 : {
4990 1 : return GINTBIG_MAX;
4991 : }
4992 10801 : nFullResYChunk =
4993 10801 : std::max(nFullResYChunk, RADIUS_TO_DIAMETER * nMaxOvrFactor);
4994 10801 : if ((nKernelRadius > 0 &&
4995 970 : nMaxOvrFactor > INT_MAX / (RADIUS_TO_DIAMETER * nKernelRadius)) ||
4996 10801 : nFullResYChunk >
4997 10801 : INT_MAX - RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor)
4998 : {
4999 0 : return GINTBIG_MAX;
5000 : }
5001 10801 : nMaxChunkYSizeQueried =
5002 10801 : nFullResYChunk + RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor;
5003 10801 : if (GDALGetDataTypeSizeBytes(eWrkDataType) >
5004 10801 : std::numeric_limits<int64_t>::max() /
5005 10801 : (static_cast<int64_t>(nMaxChunkYSizeQueried) * nWidth))
5006 : {
5007 1 : return GINTBIG_MAX;
5008 : }
5009 10800 : return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
5010 10800 : nMaxChunkYSizeQueried * nWidth;
5011 822 : };
5012 :
5013 : const char *pszChunkYSize =
5014 822 : CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
5015 : #ifndef __COVERITY__
5016 : // Only configurable for debug / testing
5017 822 : if (pszChunkYSize)
5018 : {
5019 0 : nFullResYChunk = atoi(pszChunkYSize);
5020 : }
5021 : #endif
5022 :
5023 : // Only configurable for debug / testing
5024 : const int nChunkMaxSize =
5025 822 : atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
5026 :
5027 822 : auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
5028 822 : if (nChunkSize > nChunkMaxSize)
5029 : {
5030 15 : if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
5031 44 : !GDALDataTypeIsComplex(eSrcDataType) &&
5032 14 : (!STARTS_WITH_CI(pszResampling, "AVER") ||
5033 2 : EQUAL(pszResampling, "AVERAGE")))
5034 : {
5035 : // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
5036 : // which use a block based strategy, which is much less memory
5037 : // hungry.
5038 14 : return GDALRegenerateOverviewsMultiBand(
5039 : 1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
5040 14 : pfnProgress, pProgressData, papszOptions);
5041 : }
5042 1 : else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
5043 : {
5044 0 : return GDALRegenerateCascadingOverviews(
5045 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
5046 0 : pfnProgress, pProgressData, papszOptions);
5047 : }
5048 : }
5049 807 : else if (pszChunkYSize == nullptr)
5050 : {
5051 : // Try to get as close as possible to nChunkMaxSize
5052 10787 : while (nChunkSize < nChunkMaxSize / 2)
5053 : {
5054 9980 : nFullResYChunk *= 2;
5055 9980 : nChunkSize = UpdateChunkHeightAndGetChunkSize();
5056 : }
5057 : }
5058 :
5059 : // Structure describing one resampling job: inputs for the resampling function, its outputs, and a completion flag guarded by a mutex / condition variable
5060 : struct OvrJob
5061 : {
5062 : // Buffers to free when job is finished. The two source holders are shared_ptr because one holder per chunk is assigned to every job created for that chunk.
5063 : std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
5064 : std::shared_ptr<PointerHolder> oSrcBufferHolder{};
5065 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
5066 :
5067 : GDALRasterBand *poDstBand = nullptr; // overview band receiving the resampled rows
5068 :
5069 : // Input parameters of pfnResampleFn
5070 : GDALResampleFunction pfnResampleFn = nullptr;
5071 : int nSrcWidth = 0;
5072 : int nSrcHeight = 0;
5073 : int nDstWidth = 0;
5074 : GDALOverviewResampleArgs args{};
5075 : const void *pChunk = nullptr; // source pixels handed to the resampling function
5076 : bool bUseGenericResampleFn = false; // true: call pfnResampleFn, false: call GDALResampleChunkC32R
5077 :
5078 : // Output values of resampling function
5079 : CPLErr eErr = CE_Failure; // overwritten with the resampling function's return value
5080 : void *pDstBuffer = nullptr;
5081 : GDALDataType eDstBufferDataType = GDT_Unknown;
5082 : // Attach the holder of the source nodata-mask buffer to this job (copies the shared_ptr)
5083 0 : void SetSrcMaskBufferHolder(
5084 : const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
5085 : {
5086 0 : oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
5087 0 : }
5088 : // Attach the holder of the source pixel buffer to this job (copies the shared_ptr)
5089 0 : void SetSrcBufferHolder(
5090 : const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
5091 : {
5092 0 : oSrcBufferHolder = oSrcBufferHolderIn;
5093 0 : }
5094 : // Mark the job as finished and wake one thread blocked in WaitFinished()
5095 910 : void NotifyFinished()
5096 : {
5097 1820 : std::lock_guard guard(mutex);
5098 910 : bFinished = true;
5099 910 : cv.notify_one();
5100 910 : }
5101 : // Non-blocking check: returns the current value of the finished flag
5102 0 : bool IsFinished()
5103 : {
5104 0 : std::lock_guard guard(mutex);
5105 0 : return bFinished;
5106 : }
5107 : // Block until NotifyFinished() has been called; the flag is re-checked after every wakeup
5108 0 : void WaitFinished()
5109 : {
5110 0 : std::unique_lock oGuard(mutex);
5111 0 : while (!bFinished)
5112 : {
5113 0 : cv.wait(oGuard);
5114 : }
5115 0 : }
5116 :
5117 : private:
5118 : // Synchronization: bFinished is only read or written while holding mutex
5119 : bool bFinished = false;
5120 : std::mutex mutex{};
5121 : std::condition_variable cv{};
5122 : };
5123 :
5124 : // Thread function to resample: pData is an OvrJob*. Fills the job's eErr, pDstBuffer and eDstBufferDataType, then flags the job as finished. Also called directly when no job queue is used.
5125 910 : const auto JobResampleFunc = [](void *pData)
5126 : {
5127 910 : OvrJob *poJob = static_cast<OvrJob *>(pData);
5128 : // Generic path goes through the function pointer; the other path calls GDALResampleChunkC32R with the chunk viewed as float data
5129 910 : if (poJob->bUseGenericResampleFn)
5130 : {
5131 908 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
5132 : &(poJob->pDstBuffer),
5133 : &(poJob->eDstBufferDataType));
5134 : }
5135 : else
5136 : {
5137 2 : poJob->eErr = GDALResampleChunkC32R(
5138 : poJob->nSrcWidth, poJob->nSrcHeight,
5139 2 : static_cast<const float *>(poJob->pChunk),
5140 : poJob->args.nChunkYOff, poJob->args.nChunkYSize,
5141 : poJob->args.nDstYOff, poJob->args.nDstYOff2,
5142 : poJob->args.nOvrXSize, poJob->args.nOvrYSize,
5143 : &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
5144 : poJob->args.pszResampling);
5145 : }
5146 : // Wrap the output buffer in a PointerHolder owned by the job (done whatever the value of eErr)
5147 910 : auto pDstBuffer = poJob->pDstBuffer;
5148 910 : poJob->oDstBufferHolder = std::make_unique<PointerHolder>(pDstBuffer);
5149 : // Signal completion last, once eErr and the output fields are set
5150 910 : poJob->NotifyFinished();
5151 910 : };
5152 :
5153 : // Function to write resampled data to target band: writes rows [nDstYOff, nDstYOff2) over the width nDstWidth from the job's pDstBuffer and returns the RasterIO() result
5154 910 : const auto WriteJobData = [](const OvrJob *poJob)
5155 : {
5156 1820 : return poJob->poDstBand->RasterIO(
5157 910 : GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
5158 910 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
5159 910 : poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
5160 910 : poJob->eDstBufferDataType, 0, 0, nullptr);
5161 : };
5162 :
5163 : // Wait for completion of oldest job and serialize it. jobList must not be empty (front() is used unchecked). Returns the job's resampling error, or the write result when resampling succeeded.
5164 : const auto WaitAndFinalizeOldestJob =
5165 0 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
5166 : {
5167 0 : auto poOldestJob = jobList.front().get();
5168 0 : poOldestJob->WaitFinished();
5169 0 : CPLErr l_eErr = poOldestJob->eErr;
5170 0 : if (l_eErr == CE_None)
5171 : {
5172 0 : l_eErr = WriteJobData(poOldestJob);
5173 : }
5174 : // Remove (and destroy) the job whether or not it succeeded
5175 0 : jobList.pop_front();
5176 0 : return l_eErr;
5177 : };
5178 :
5179 : // Queue of jobs
5180 1616 : std::list<std::unique_ptr<OvrJob>> jobList;
5181 :
5182 808 : GByte *pabyChunkNodataMask = nullptr;
5183 808 : void *pChunk = nullptr;
5184 :
5185 808 : const int nThreads = GDALGetNumThreads(GDAL_DEFAULT_MAX_THREAD_COUNT,
5186 : /* bDefaultToAllCPUs=*/false);
5187 : auto poThreadPool =
5188 808 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5189 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5190 1616 : : std::unique_ptr<CPLJobQueue>(nullptr);
5191 :
5192 : /* -------------------------------------------------------------------- */
5193 : /* Loop over image operating on chunks. */
5194 : /* -------------------------------------------------------------------- */
5195 808 : int nChunkYOff = 0;
5196 808 : CPLErr eErr = CE_None;
5197 :
5198 1621 : for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
5199 813 : nChunkYOff += nFullResYChunk)
5200 : {
5201 813 : if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
5202 : pProgressData))
5203 : {
5204 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5205 0 : eErr = CE_Failure;
5206 : }
5207 :
5208 813 : if (nFullResYChunk + nChunkYOff > nHeight)
5209 805 : nFullResYChunk = nHeight - nChunkYOff;
5210 :
5211 813 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
5212 813 : int nChunkYSizeQueried =
5213 813 : nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
5214 813 : if (nChunkYOffQueried < 0)
5215 : {
5216 83 : nChunkYSizeQueried += nChunkYOffQueried;
5217 83 : nChunkYOffQueried = 0;
5218 : }
5219 813 : if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
5220 83 : nChunkYSizeQueried = nHeight - nChunkYOffQueried;
5221 :
5222 : // Avoid accumulating too many tasks and exhaust RAM
5223 : // Try to complete already finished jobs
5224 813 : while (eErr == CE_None && !jobList.empty())
5225 : {
5226 0 : auto poOldestJob = jobList.front().get();
5227 0 : if (!poOldestJob->IsFinished())
5228 0 : break;
5229 0 : eErr = poOldestJob->eErr;
5230 0 : if (eErr == CE_None)
5231 : {
5232 0 : eErr = WriteJobData(poOldestJob);
5233 : }
5234 :
5235 0 : jobList.pop_front();
5236 : }
5237 :
5238 : // And in case we have saturated the number of threads,
5239 : // wait for completion of tasks to go below the threshold.
5240 1626 : while (eErr == CE_None &&
5241 813 : jobList.size() >= static_cast<size_t>(nThreads))
5242 : {
5243 0 : eErr = WaitAndFinalizeOldestJob(jobList);
5244 : }
5245 :
5246 : // (Re)allocate buffers if needed
5247 813 : if (pChunk == nullptr)
5248 : {
5249 808 : pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
5250 : nMaxChunkYSizeQueried, nWidth);
5251 : }
5252 813 : if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
5253 : {
5254 287 : pabyChunkNodataMask = static_cast<GByte *>(
5255 287 : VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
5256 : }
5257 :
5258 813 : if (pChunk == nullptr ||
5259 287 : (bUseNoDataMask && pabyChunkNodataMask == nullptr))
5260 : {
5261 0 : CPLFree(pChunk);
5262 0 : CPLFree(pabyChunkNodataMask);
5263 0 : return CE_Failure;
5264 : }
5265 :
5266 : // Read chunk.
5267 813 : if (eErr == CE_None)
5268 813 : eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
5269 : nChunkYSizeQueried, pChunk, nWidth,
5270 : nChunkYSizeQueried, eWrkDataType, 0, 0,
5271 : nullptr);
5272 813 : if (eErr == CE_None && bUseNoDataMask)
5273 287 : eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
5274 : nChunkYSizeQueried, pabyChunkNodataMask,
5275 : nWidth, nChunkYSizeQueried, GDT_UInt8,
5276 : 0, 0, nullptr);
5277 :
5278 : // Special case to promote 1bit data to 8bit 0/255 values.
5279 813 : if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
5280 : {
5281 9 : if (eWrkDataType == GDT_Float32)
5282 : {
5283 0 : float *pafChunk = static_cast<float *>(pChunk);
5284 0 : for (size_t i = 0;
5285 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5286 : {
5287 0 : if (pafChunk[i] == 1.0f)
5288 0 : pafChunk[i] = 255.0f;
5289 : }
5290 : }
5291 9 : else if (eWrkDataType == GDT_UInt8)
5292 : {
5293 9 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
5294 168417 : for (size_t i = 0;
5295 168417 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5296 : {
5297 168408 : if (pabyChunk[i] == 1)
5298 127437 : pabyChunk[i] = 255;
5299 : }
5300 : }
5301 0 : else if (eWrkDataType == GDT_UInt16)
5302 : {
5303 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
5304 0 : for (size_t i = 0;
5305 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5306 : {
5307 0 : if (pasChunk[i] == 1)
5308 0 : pasChunk[i] = 255;
5309 : }
5310 : }
5311 0 : else if (eWrkDataType == GDT_Float64)
5312 : {
5313 0 : double *padfChunk = static_cast<double *>(pChunk);
5314 0 : for (size_t i = 0;
5315 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5316 : {
5317 0 : if (padfChunk[i] == 1.0)
5318 0 : padfChunk[i] = 255.0;
5319 : }
5320 : }
5321 : else
5322 : {
5323 0 : CPLAssert(false);
5324 : }
5325 : }
5326 804 : else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
5327 : {
5328 0 : if (eWrkDataType == GDT_Float32)
5329 : {
5330 0 : float *pafChunk = static_cast<float *>(pChunk);
5331 0 : for (size_t i = 0;
5332 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5333 : {
5334 0 : if (pafChunk[i] == 1.0f)
5335 0 : pafChunk[i] = 0.0f;
5336 0 : else if (pafChunk[i] == 0.0f)
5337 0 : pafChunk[i] = 255.0f;
5338 : }
5339 : }
5340 0 : else if (eWrkDataType == GDT_UInt8)
5341 : {
5342 0 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
5343 0 : for (size_t i = 0;
5344 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5345 : {
5346 0 : if (pabyChunk[i] == 1)
5347 0 : pabyChunk[i] = 0;
5348 0 : else if (pabyChunk[i] == 0)
5349 0 : pabyChunk[i] = 255;
5350 : }
5351 : }
5352 0 : else if (eWrkDataType == GDT_UInt16)
5353 : {
5354 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
5355 0 : for (size_t i = 0;
5356 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5357 : {
5358 0 : if (pasChunk[i] == 1)
5359 0 : pasChunk[i] = 0;
5360 0 : else if (pasChunk[i] == 0)
5361 0 : pasChunk[i] = 255;
5362 : }
5363 : }
5364 0 : else if (eWrkDataType == GDT_Float64)
5365 : {
5366 0 : double *padfChunk = static_cast<double *>(pChunk);
5367 0 : for (size_t i = 0;
5368 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5369 : {
5370 0 : if (padfChunk[i] == 1.0)
5371 0 : padfChunk[i] = 0.0;
5372 0 : else if (padfChunk[i] == 0.0)
5373 0 : padfChunk[i] = 255.0;
5374 : }
5375 : }
5376 : else
5377 : {
5378 0 : CPLAssert(false);
5379 : }
5380 : }
5381 :
5382 813 : auto pChunkRaw = pChunk;
5383 813 : auto pabyChunkNodataMaskRaw = pabyChunkNodataMask;
5384 813 : std::shared_ptr<PointerHolder> oSrcBufferHolder;
5385 813 : std::shared_ptr<PointerHolder> oSrcMaskBufferHolder;
5386 813 : if (poJobQueue)
5387 : {
5388 0 : oSrcBufferHolder = std::make_shared<PointerHolder>(pChunk);
5389 : oSrcMaskBufferHolder =
5390 0 : std::make_shared<PointerHolder>(pabyChunkNodataMask);
5391 : }
5392 :
5393 1723 : for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
5394 : ++iOverview)
5395 : {
5396 910 : GDALRasterBand *poDstBand = papoOvrBands[iOverview];
5397 910 : const int nDstWidth = poDstBand->GetXSize();
5398 910 : const int nDstHeight = poDstBand->GetYSize();
5399 :
5400 910 : const double dfXRatioDstToSrc =
5401 910 : static_cast<double>(nWidth) / nDstWidth;
5402 910 : const double dfYRatioDstToSrc =
5403 910 : static_cast<double>(nHeight) / nDstHeight;
5404 :
5405 : /* --------------------------------------------------------------------
5406 : */
5407 : /* Figure out the line to start writing to, and the first line
5408 : */
5409 : /* to not write to. In theory this approach should ensure that
5410 : */
5411 : /* every output line will be written if all input chunks are */
5412 : /* processed. */
5413 : /* --------------------------------------------------------------------
5414 : */
5415 910 : int nDstYOff =
5416 910 : static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
5417 910 : if (nDstYOff == nDstHeight)
5418 0 : continue;
5419 910 : int nDstYOff2 = static_cast<int>(
5420 910 : 0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
5421 :
5422 910 : if (nChunkYOff + nFullResYChunk == nHeight)
5423 903 : nDstYOff2 = nDstHeight;
5424 : #if DEBUG_VERBOSE
5425 : CPLDebug("GDAL",
5426 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
5427 : nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
5428 : nDstWidth, nDstYOff2 - nDstYOff);
5429 : #endif
5430 :
5431 1820 : auto poJob = std::make_unique<OvrJob>();
5432 910 : poJob->pfnResampleFn = pfnResampleFn;
5433 910 : poJob->bUseGenericResampleFn = bUseGenericResampleFn;
5434 910 : poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
5435 910 : poJob->args.nOvrXSize = poDstBand->GetXSize();
5436 910 : poJob->args.nOvrYSize = poDstBand->GetYSize();
5437 : const char *pszNBITS =
5438 910 : poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
5439 910 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
5440 910 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
5441 910 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
5442 910 : poJob->args.eWrkDataType = eWrkDataType;
5443 910 : poJob->pChunk = pChunkRaw;
5444 910 : poJob->args.pabyChunkNodataMask = pabyChunkNodataMaskRaw;
5445 910 : poJob->nSrcWidth = nWidth;
5446 910 : poJob->nSrcHeight = nHeight;
5447 910 : poJob->args.nChunkXOff = 0;
5448 910 : poJob->args.nChunkXSize = nWidth;
5449 910 : poJob->args.nChunkYOff = nChunkYOffQueried;
5450 910 : poJob->args.nChunkYSize = nChunkYSizeQueried;
5451 910 : poJob->nDstWidth = nDstWidth;
5452 910 : poJob->args.nDstXOff = 0;
5453 910 : poJob->args.nDstXOff2 = nDstWidth;
5454 910 : poJob->args.nDstYOff = nDstYOff;
5455 910 : poJob->args.nDstYOff2 = nDstYOff2;
5456 910 : poJob->poDstBand = poDstBand;
5457 910 : poJob->args.pszResampling = pszResampling;
5458 910 : poJob->args.bHasNoData = bHasNoData;
5459 910 : poJob->args.dfNoDataValue = dfNoDataValue;
5460 910 : poJob->args.poColorTable = poColorTable;
5461 910 : poJob->args.eSrcDataType = eSrcDataType;
5462 910 : poJob->args.bPropagateNoData = bPropagateNoData;
5463 :
5464 910 : if (poJobQueue)
5465 : {
5466 0 : poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
5467 0 : poJob->SetSrcBufferHolder(oSrcBufferHolder);
5468 0 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
5469 0 : jobList.emplace_back(std::move(poJob));
5470 : }
5471 : else
5472 : {
5473 910 : JobResampleFunc(poJob.get());
5474 910 : eErr = poJob->eErr;
5475 910 : if (eErr == CE_None)
5476 : {
5477 910 : eErr = WriteJobData(poJob.get());
5478 : }
5479 : }
5480 : }
5481 : }
5482 :
5483 808 : VSIFree(pChunk);
5484 808 : VSIFree(pabyChunkNodataMask);
5485 :
5486 : // Wait for all pending jobs to complete
5487 808 : while (!jobList.empty())
5488 : {
5489 0 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5490 0 : if (l_eErr != CE_None && eErr == CE_None)
5491 0 : eErr = l_eErr;
5492 : }
5493 :
5494 : /* -------------------------------------------------------------------- */
5495 : /* Renormalize overview mean / stddev if needed. */
5496 : /* -------------------------------------------------------------------- */
5497 808 : if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
5498 : {
5499 0 : GDALOverviewMagnitudeCorrection(
5500 : poSrcBand, nOverviewCount,
5501 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
5502 : GDALDummyProgress, nullptr);
5503 : }
5504 :
5505 : /* -------------------------------------------------------------------- */
5506 : /* It can be important to flush out data to overviews. */
5507 : /* -------------------------------------------------------------------- */
5508 1711 : for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
5509 : ++iOverview)
5510 : {
5511 903 : eErr = papoOvrBands[iOverview]->FlushCache(false);
5512 : }
5513 :
5514 808 : if (eErr == CE_None)
5515 808 : pfnProgress(1.0, nullptr, pProgressData);
5516 :
5517 808 : return eErr;
5518 : }
5519 :
5520 : /************************************************************************/
5521 : /* GDALRegenerateOverviewsMultiBand() */
5522 : /************************************************************************/
5523 :
5524 : /**
5525 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5526 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5527 : *
5528 : * This function will generate one or more overview images from a base
5529 : * image using the requested downsampling algorithm. Its primary use
5530 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
5531 : * can also be used to generate downsampled images in one file from another
5532 : * outside the overview architecture.
5533 : *
5534 : * The output bands need to exist in advance and share the same characteristics
5535 : * (type, dimensions)
5536 : *
5537 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5538 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE"
5539 : *
5540 : * It does not support color tables or complex data types.
5541 : *
5542 : * The pseudo-algorithm used by the function is :
5543 : * for each overview
5544 : * iterate on lines of the source by a step of deltay
5545 : * iterate on columns of the source by a step of deltax
5546 : * read the source data of size deltax * deltay for all the bands
5547 : * generate the corresponding overview block for all the bands
5548 : *
5549 : * This function will properly honour NODATA_VALUES tuples (special dataset
5550 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5551 : * considered as the nodata value and not each value of the triplet
5552 : * independently per band.
5553 : *
5554 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
5555 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
5556 : * overview computation.
5557 : *
5558 : * @param nBands the number of bands, size of papoSrcBands and size of
5559 : * first dimension of papapoOverviewBands
5560 : * @param papoSrcBands the list of source bands to downsample
5561 : * @param nOverviews the number of downsampled overview levels being generated.
5562 : * @param papapoOverviewBands two-dimensional array of bands. First dimension is
5563 : * indexed by nBands. Second dimension is indexed by
5564 : * nOverviews.
5565 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5566 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
5567 : * @param pfnProgress progress report function.
5568 : * @param pProgressData progress function callback data.
5569 : * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
5570 : * key=value pairs, or NULL
5571 : * Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
5572 : * options can be specified to express that overviews should
5573 : * be regenerated only in the specified subset of the source
5574 : * dataset.
5575 : * @return CE_None on success or CE_Failure on failure.
5576 : */
5577 :
5578 383 : CPLErr GDALRegenerateOverviewsMultiBand(
5579 : int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
5580 : GDALRasterBand *const *const *papapoOverviewBands,
5581 : const char *pszResampling, GDALProgressFunc pfnProgress,
5582 : void *pProgressData, CSLConstList papszOptions)
5583 : {
5584 383 : CPL_IGNORE_RET_VAL(papszOptions);
5585 :
5586 383 : if (pfnProgress == nullptr)
5587 11 : pfnProgress = GDALDummyProgress;
5588 :
5589 383 : if (EQUAL(pszResampling, "NONE") || nBands == 0 || nOverviews == 0)
5590 3 : return CE_None;
5591 :
5592 : // Sanity checks.
5593 380 : if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
5594 187 : !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
5595 78 : !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
5596 22 : !EQUAL(pszResampling, "CUBICSPLINE") &&
5597 21 : !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
5598 5 : !EQUAL(pszResampling, "MODE"))
5599 : {
5600 0 : CPLError(CE_Failure, CPLE_NotSupported,
5601 : "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
5602 : "not supported",
5603 : pszResampling);
5604 0 : return CE_Failure;
5605 : }
5606 :
5607 380 : int nKernelRadius = 0;
5608 : GDALResampleFunction pfnResampleFn =
5609 380 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
5610 380 : if (pfnResampleFn == nullptr)
5611 0 : return CE_Failure;
5612 :
5613 380 : const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
5614 380 : const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
5615 380 : if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
5616 0 : return CE_None;
5617 380 : GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
5618 66221 : for (int iBand = 1; iBand < nBands; ++iBand)
5619 : {
5620 131682 : if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
5621 65841 : papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
5622 : {
5623 0 : CPLError(
5624 : CE_Failure, CPLE_NotSupported,
5625 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5626 : "have the same dimensions");
5627 0 : return CE_Failure;
5628 : }
5629 65841 : if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
5630 : {
5631 0 : CPLError(
5632 : CE_Failure, CPLE_NotSupported,
5633 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5634 : "have the same data type");
5635 0 : return CE_Failure;
5636 : }
5637 : }
5638 :
5639 1013 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5640 : {
5641 633 : const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
5642 633 : const int nDstWidth = poOvrFirstBand->GetXSize();
5643 633 : const int nDstHeight = poOvrFirstBand->GetYSize();
5644 66732 : for (int iBand = 1; iBand < nBands; ++iBand)
5645 : {
5646 66099 : const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
5647 132198 : if (poOvrBand->GetXSize() != nDstWidth ||
5648 66099 : poOvrBand->GetYSize() != nDstHeight)
5649 : {
5650 0 : CPLError(
5651 : CE_Failure, CPLE_NotSupported,
5652 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5653 : "of the same level must have the same dimensions");
5654 0 : return CE_Failure;
5655 : }
5656 66099 : if (poOvrBand->GetRasterDataType() != eDataType)
5657 : {
5658 0 : CPLError(
5659 : CE_Failure, CPLE_NotSupported,
5660 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5661 : "must have the same data type as the source bands");
5662 0 : return CE_Failure;
5663 : }
5664 : }
5665 : }
5666 :
5667 : // First pass to compute the total number of pixels to write.
5668 380 : double dfTotalPixelCount = 0;
5669 380 : const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
5670 380 : const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
5671 380 : const int nSrcXSize = atoi(CSLFetchNameValueDef(
5672 : papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
5673 380 : const int nSrcYSize = atoi(CSLFetchNameValueDef(
5674 : papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
5675 1013 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5676 : {
5677 633 : dfTotalPixelCount +=
5678 1266 : static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
5679 633 : papapoOverviewBands[0][iOverview]->GetXSize() *
5680 1266 : static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
5681 633 : papapoOverviewBands[0][iOverview]->GetYSize();
5682 : }
5683 :
5684 : const GDALDataType eWrkDataType =
5685 380 : GDALGetOvrWorkDataType(pszResampling, eDataType);
5686 : const int nWrkDataTypeSize =
5687 380 : std::max(1, GDALGetDataTypeSizeBytes(eWrkDataType));
5688 :
5689 380 : const bool bIsMask = papoSrcBands[0]->IsMaskBand();
5690 :
5691 : // If we have a nodata mask and we are doing something more complicated
5692 : // than nearest neighbouring, we have to fetch to nodata mask.
5693 : const bool bUseNoDataMask =
5694 561 : !STARTS_WITH_CI(pszResampling, "NEAR") &&
5695 181 : (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
5696 :
5697 760 : std::vector<bool> abHasNoData(nBands);
5698 760 : std::vector<double> adfNoDataValue(nBands);
5699 :
5700 66601 : for (int iBand = 0; iBand < nBands; ++iBand)
5701 : {
5702 66221 : int nHasNoData = 0;
5703 132442 : adfNoDataValue[iBand] =
5704 66221 : papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
5705 66221 : abHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
5706 : }
5707 :
5708 760 : std::string osDetailMessage;
5709 432 : if (bUseNoDataMask &&
5710 52 : papoSrcBands[0]->HasConflictingMaskSources(&osDetailMessage, false))
5711 : {
5712 9 : CPLError(CE_Warning, CPLE_AppDefined, "%s%s", osDetailMessage.c_str(),
5713 18 : abHasNoData[0]
5714 : ? "Only the nodata value will be taken into account."
5715 9 : : "Only the first listed one will be taken into account.");
5716 : }
5717 :
5718 : const bool bPropagateNoData =
5719 380 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5720 :
5721 380 : const int nThreads = GDALGetNumThreads(GDAL_DEFAULT_MAX_THREAD_COUNT,
5722 : /* bDefaultToAllCPUs=*/false);
5723 : auto poThreadPool =
5724 380 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5725 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5726 760 : : std::unique_ptr<CPLJobQueue>(nullptr);
5727 :
5728 : // Only configurable for debug / testing. Byte budget for one chunk of source pixels: GDAL_OVR_CHUNK_MAX_SIZE when set (floored at 100), otherwise 10 * 1024 * 1024.
5729 380 : const GIntBig nChunkMaxSize = []() -> GIntBig
5730 : {
5731 : const char *pszVal =
5732 380 : CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", nullptr);
5733 380 : if (pszVal)
5734 : {
5735 15 : GIntBig nRet = 0;
5736 15 : CPLParseMemorySize(pszVal, &nRet, nullptr); // NOTE(review): return value not checked; if nRet is left at 0 the floor below yields 100 - confirm intended
5737 15 : return std::max<GIntBig>(100, nRet);
5738 : }
5739 365 : return 10 * 1024 * 1024;
5740 380 : }();
5741 :
5742 : // Only configurable for debug / testing. Limit on the source-pixel memory requirement above which a temporary dataset is used: GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE when set (floored at 100), otherwise a tenth of the usable physical RAM when that is known.
5743 380 : const GIntBig nChunkMaxSizeForTempFile = []() -> GIntBig
5744 : {
5745 380 : const char *pszVal = CPLGetConfigOption(
5746 : "GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE", nullptr);
5747 380 : if (pszVal)
5748 : {
5749 14 : GIntBig nRet = 0;
5750 14 : CPLParseMemorySize(pszVal, &nRet, nullptr); // NOTE(review): return value not checked; if nRet is left at 0 the floor below yields 100 - confirm intended
5751 14 : return std::max<GIntBig>(100, nRet);
5752 : }
5753 366 : const auto nUsableRAM = CPLGetUsablePhysicalRAM();
5754 366 : if (nUsableRAM > 0)
5755 366 : return nUsableRAM / 10;
5756 : // Select a value to be able to at least downsample by 2 for a RGB
5757 : // 1024x1024 tiled output: (2 * 1024 + 2) * (2 * 1024 + 2) * 3 = 12 MB
5758 0 : return 100 * 1024 * 1024; // fallback when CPLGetUsablePhysicalRAM() does not return a positive value
5759 380 : }();
5760 :
5761 : // Second pass to do the real job.
5762 380 : double dfCurPixelCount = 0;
5763 380 : CPLErr eErr = CE_None;
5764 1007 : for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
5765 : ++iOverview)
5766 : {
5767 632 : int iSrcOverview = -1; // -1 means the source bands.
5768 :
5769 : const int nDstTotalWidth =
5770 632 : papapoOverviewBands[0][iOverview]->GetXSize();
5771 : const int nDstTotalHeight =
5772 632 : papapoOverviewBands[0][iOverview]->GetYSize();
5773 :
5774 : // Compute the coordinates of the target region to refresh
5775 632 : constexpr double EPS = 1e-8;
5776 632 : const int nDstXOffStart = static_cast<int>(
5777 632 : static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
5778 : EPS);
5779 : const int nDstXOffEnd =
5780 1264 : std::min(static_cast<int>(
5781 632 : std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
5782 632 : nToplevelSrcWidth * nDstTotalWidth -
5783 : EPS)),
5784 632 : nDstTotalWidth);
5785 632 : const int nDstWidth = nDstXOffEnd - nDstXOffStart;
5786 632 : const int nDstYOffStart =
5787 632 : static_cast<int>(static_cast<double>(nSrcYOff) /
5788 632 : nToplevelSrcHeight * nDstTotalHeight +
5789 : EPS);
5790 : const int nDstYOffEnd =
5791 1264 : std::min(static_cast<int>(
5792 632 : std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
5793 632 : nToplevelSrcHeight * nDstTotalHeight -
5794 : EPS)),
5795 632 : nDstTotalHeight);
5796 632 : const int nDstHeight = nDstYOffEnd - nDstYOffStart;
5797 :
5798 : // Try to use previous level of overview as the source to compute
5799 : // the next level.
5800 632 : int nSrcWidth = nToplevelSrcWidth;
5801 632 : int nSrcHeight = nToplevelSrcHeight;
5802 884 : if (iOverview > 0 &&
5803 252 : papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
5804 : {
5805 244 : nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
5806 244 : nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
5807 244 : iSrcOverview = iOverview - 1;
5808 : }
5809 :
5810 632 : const double dfXRatioDstToSrc =
5811 632 : static_cast<double>(nSrcWidth) / nDstTotalWidth;
5812 632 : const double dfYRatioDstToSrc =
5813 632 : static_cast<double>(nSrcHeight) / nDstTotalHeight;
5814 :
5815 : const int nOvrFactor =
5816 1896 : std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
5817 632 : static_cast<int>(0.5 + dfYRatioDstToSrc)));
5818 :
5819 632 : int nDstChunkXSize = 0;
5820 632 : int nDstChunkYSize = 0;
5821 632 : papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
5822 : &nDstChunkYSize);
5823 :
5824 632 : constexpr int PIXEL_MARGIN = 2;
5825 : // Try to extend the chunk size so that the memory needed to acquire
5826 : // source pixels goes up to 10 MB.
5827 : // This can help for drivers that support multi-threaded reading
5828 632 : const int nFullResYChunk = static_cast<int>(std::min<double>(
5829 632 : nSrcHeight, PIXEL_MARGIN + nDstChunkYSize * dfYRatioDstToSrc));
5830 632 : const int nFullResYChunkQueried = static_cast<int>(std::min<int64_t>(
5831 1264 : nSrcHeight,
5832 1264 : nFullResYChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5833 632 : nKernelRadius * nOvrFactor));
5834 861 : while (nDstChunkXSize < nDstWidth)
5835 : {
5836 248 : constexpr int INCREASE_FACTOR = 2;
5837 :
5838 248 : const int nFullResXChunk = static_cast<int>(std::min<double>(
5839 496 : nSrcWidth, PIXEL_MARGIN + INCREASE_FACTOR * nDstChunkXSize *
5840 248 : dfXRatioDstToSrc));
5841 :
5842 : const int nFullResXChunkQueried =
5843 248 : static_cast<int>(std::min<int64_t>(
5844 496 : nSrcWidth,
5845 496 : nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5846 248 : nKernelRadius * nOvrFactor));
5847 :
5848 248 : if (nBands > nChunkMaxSize / nFullResXChunkQueried /
5849 248 : nFullResYChunkQueried / nWrkDataTypeSize)
5850 : {
5851 19 : break;
5852 : }
5853 :
5854 229 : nDstChunkXSize *= INCREASE_FACTOR;
5855 : }
5856 632 : nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
5857 :
5858 632 : const int nFullResXChunk = static_cast<int>(std::min<double>(
5859 632 : nSrcWidth, PIXEL_MARGIN + nDstChunkXSize * dfXRatioDstToSrc));
5860 632 : const int nFullResXChunkQueried = static_cast<int>(std::min<int64_t>(
5861 1264 : nSrcWidth,
5862 1264 : nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5863 632 : nKernelRadius * nOvrFactor));
5864 :
5865 : // Make sure that the RAM requirements to acquire the source data does
5866 : // not exceed nChunkMaxSizeForTempFile
5867 : // If so, reduce the destination chunk size, generate overviews in a
5868 : // temporary dataset, and copy that temporary dataset over the target
5869 : // overview bands (to avoid issues with lossy compression)
5870 : const bool bOverflowFullResXChunkYChunkQueried =
5871 632 : nBands > std::numeric_limits<int64_t>::max() /
5872 632 : nFullResXChunkQueried / nFullResYChunkQueried /
5873 632 : nWrkDataTypeSize;
5874 :
5875 632 : const auto nMemRequirement =
5876 : bOverflowFullResXChunkYChunkQueried
5877 632 : ? 0
5878 628 : : static_cast<GIntBig>(nFullResXChunkQueried) *
5879 628 : nFullResYChunkQueried * nBands * nWrkDataTypeSize;
5880 : // Use a temporary dataset with a smaller destination chunk size
5881 632 : const auto nOverShootFactor =
5882 : nMemRequirement / nChunkMaxSizeForTempFile;
5883 :
5884 632 : constexpr int MIN_OVERSHOOT_FACTOR = 4;
5885 : const auto nSqrtOverShootFactor = std::max<GIntBig>(
5886 1264 : MIN_OVERSHOOT_FACTOR, static_cast<GIntBig>(std::ceil(std::sqrt(
5887 632 : static_cast<double>(nOverShootFactor)))));
5888 632 : constexpr int DEFAULT_CHUNK_SIZE = 256;
5889 632 : constexpr int GTIFF_BLOCK_SIZE_MULTIPLE = 16;
5890 : const int nReducedDstChunkXSize =
5891 : bOverflowFullResXChunkYChunkQueried
5892 1260 : ? DEFAULT_CHUNK_SIZE
5893 1260 : : std::max(1, static_cast<int>(nDstChunkXSize /
5894 1260 : nSqrtOverShootFactor) &
5895 628 : ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
5896 : const int nReducedDstChunkYSize =
5897 : bOverflowFullResXChunkYChunkQueried
5898 1260 : ? DEFAULT_CHUNK_SIZE
5899 1260 : : std::max(1, static_cast<int>(nDstChunkYSize /
5900 1260 : nSqrtOverShootFactor) &
5901 628 : ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
5902 :
5903 632 : if (bOverflowFullResXChunkYChunkQueried ||
5904 : nMemRequirement > nChunkMaxSizeForTempFile)
5905 : {
5906 : const auto nDTSize =
5907 43 : std::max(1, GDALGetDataTypeSizeBytes(eDataType));
5908 : const bool bTmpDSMemRequirementOverflow =
5909 43 : nBands > std::numeric_limits<int64_t>::max() / nDstWidth /
5910 43 : nDstHeight / nDTSize;
5911 43 : const auto nTmpDSMemRequirement =
5912 : bTmpDSMemRequirementOverflow
5913 43 : ? 0
5914 41 : : static_cast<GIntBig>(nDstWidth) * nDstHeight * nBands *
5915 41 : nDTSize;
5916 :
5917 : // make sure that one band buffer doesn't overflow size_t
5918 : const bool bChunkSizeOverflow =
5919 43 : static_cast<size_t>(nDTSize) >
5920 43 : std::numeric_limits<size_t>::max() / nDstWidth / nDstHeight;
5921 43 : const size_t nChunkSize =
5922 : bChunkSizeOverflow
5923 43 : ? 0
5924 41 : : static_cast<size_t>(nDstWidth) * nDstHeight * nDTSize;
5925 :
5926 : const auto CreateVRT =
5927 41 : [nBands, nSrcWidth, nSrcHeight, nDstTotalWidth, nDstTotalHeight,
5928 : pszResampling, eWrkDataType, papoSrcBands, papapoOverviewBands,
5929 : iSrcOverview, &abHasNoData,
5930 393585 : &adfNoDataValue](int nVRTBlockXSize, int nVRTBlockYSize)
5931 : {
5932 : auto poVRTDS = std::make_unique<VRTDataset>(
5933 41 : nDstTotalWidth, nDstTotalHeight, nVRTBlockXSize,
5934 41 : nVRTBlockYSize);
5935 :
5936 65620 : for (int iBand = 0; iBand < nBands; ++iBand)
5937 : {
5938 131158 : auto poVRTSrc = std::make_unique<VRTSimpleSource>();
5939 65579 : poVRTSrc->SetResampling(pszResampling);
5940 65579 : poVRTDS->AddBand(eWrkDataType);
5941 : auto poVRTBand = static_cast<VRTSourcedRasterBand *>(
5942 65579 : poVRTDS->GetRasterBand(iBand + 1));
5943 :
5944 65579 : auto poSrcBand = papoSrcBands[iBand];
5945 65579 : if (iSrcOverview != -1)
5946 24 : poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
5947 65579 : poVRTBand->ConfigureSource(
5948 : poVRTSrc.get(), poSrcBand, false, 0, 0, nSrcWidth,
5949 : nSrcHeight, 0, 0, nDstTotalWidth, nDstTotalHeight);
5950 : // Add the source to the band
5951 65579 : poVRTBand->AddSource(poVRTSrc.release());
5952 65579 : if (abHasNoData[iBand])
5953 3 : poVRTBand->SetNoDataValue(adfNoDataValue[iBand]);
5954 : }
5955 :
5956 42 : if (papoSrcBands[0]->GetMaskFlags() == GMF_PER_DATASET &&
5957 1 : poVRTDS->CreateMaskBand(GMF_PER_DATASET) == CE_None)
5958 : {
5959 : VRTSourcedRasterBand *poMaskVRTBand =
5960 1 : cpl::down_cast<VRTSourcedRasterBand *>(
5961 1 : poVRTDS->GetRasterBand(1)->GetMaskBand());
5962 1 : auto poSrcBand = papoSrcBands[0];
5963 1 : if (iSrcOverview != -1)
5964 0 : poSrcBand = papapoOverviewBands[0][iSrcOverview];
5965 1 : poMaskVRTBand->AddMaskBandSource(
5966 1 : poSrcBand->GetMaskBand(), 0, 0, nSrcWidth, nSrcHeight,
5967 : 0, 0, nDstTotalWidth, nDstTotalHeight);
5968 : }
5969 :
5970 41 : return poVRTDS;
5971 43 : };
5972 :
5973 : // If the overview accommodates chunking, do so and recurse
5974 : // to avoid generating full size temporary files
5975 43 : if (!bOverflowFullResXChunkYChunkQueried &&
5976 39 : !bTmpDSMemRequirementOverflow && !bChunkSizeOverflow &&
5977 39 : (nDstChunkXSize < nDstWidth || nDstChunkYSize < nDstHeight))
5978 : {
5979 : // Create a VRT with the smaller chunk to do the scaling
5980 : auto poVRTDS =
5981 13 : CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
5982 :
5983 13 : std::vector<GDALRasterBand *> apoVRTBand(nBands);
5984 13 : std::vector<GDALRasterBand *> apoDstBand(nBands);
5985 65560 : for (int iBand = 0; iBand < nBands; ++iBand)
5986 : {
5987 65547 : apoDstBand[iBand] = papapoOverviewBands[iBand][iOverview];
5988 65547 : apoVRTBand[iBand] = poVRTDS->GetRasterBand(iBand + 1);
5989 : }
5990 :
5991 : // Use a flag to avoid reading from the overview being built
5992 : GDALRasterIOExtraArg sExtraArg;
5993 13 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5994 13 : if (iSrcOverview == -1)
5995 13 : sExtraArg.bUseOnlyThisScale = true;
5996 :
5997 : // A single band buffer for data transfer to the overview
5998 13 : std::vector<GByte> abyChunk;
5999 : try
6000 : {
6001 13 : abyChunk.resize(nChunkSize);
6002 : }
6003 0 : catch (const std::exception &)
6004 : {
6005 0 : CPLError(CE_Failure, CPLE_OutOfMemory,
6006 : "Out of memory allocating temporary buffer");
6007 0 : return CE_Failure;
6008 : }
6009 :
6010 : // Loop over output height, in chunks
6011 13 : for (int nDstYOff = nDstYOffStart;
6012 38 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6013 : /* */)
6014 : {
6015 : const int nDstYCount =
6016 25 : std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
6017 : // Loop over output width, in output chunks
6018 25 : for (int nDstXOff = nDstXOffStart;
6019 74 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6020 : /* */)
6021 : {
6022 : const int nDstXCount =
6023 49 : std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
6024 : // Read and transfer the chunk to the overview
6025 98 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
6026 : ++iBand)
6027 : {
6028 98 : eErr = apoVRTBand[iBand]->RasterIO(
6029 : GF_Read, nDstXOff, nDstYOff, nDstXCount,
6030 49 : nDstYCount, abyChunk.data(), nDstXCount,
6031 : nDstYCount, eDataType, 0, 0, &sExtraArg);
6032 49 : if (eErr == CE_None)
6033 : {
6034 96 : eErr = apoDstBand[iBand]->RasterIO(
6035 : GF_Write, nDstXOff, nDstYOff, nDstXCount,
6036 48 : nDstYCount, abyChunk.data(), nDstXCount,
6037 : nDstYCount, eDataType, 0, 0, nullptr);
6038 : }
6039 : }
6040 :
6041 49 : dfCurPixelCount +=
6042 49 : static_cast<double>(nDstXCount) * nDstYCount;
6043 :
6044 49 : nDstXOff += nDstXCount;
6045 : } // width
6046 :
6047 25 : if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount,
6048 : nullptr, pProgressData))
6049 : {
6050 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
6051 : "User terminated");
6052 0 : eErr = CE_Failure;
6053 : }
6054 :
6055 25 : nDstYOff += nDstYCount;
6056 : } // height
6057 :
6058 13 : if (CE_None != eErr)
6059 : {
6060 1 : CPLError(CE_Failure, CPLE_AppDefined,
6061 : "Error while writing overview");
6062 1 : return CE_Failure;
6063 : }
6064 :
6065 12 : pfnProgress(1.0, nullptr, pProgressData);
6066 : // Flush the overviews we just generated
6067 24 : for (int iBand = 0; iBand < nBands; ++iBand)
6068 12 : apoDstBand[iBand]->FlushCache(false);
6069 :
6070 12 : continue; // Next overview
6071 : } // chunking via temporary dataset
6072 :
6073 0 : std::unique_ptr<GDALDataset> poTmpDS;
6074 : // Config option mostly/only for autotest purposes
6075 : const char *pszGDAL_OVR_TEMP_DRIVER =
6076 30 : CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
6077 30 : if ((!bTmpDSMemRequirementOverflow &&
6078 4 : nTmpDSMemRequirement <= nChunkMaxSizeForTempFile &&
6079 4 : !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
6080 26 : EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
6081 : {
6082 10 : auto poTmpDrv = GetGDALDriverManager()->GetDriverByName("MEM");
6083 10 : if (!poTmpDrv)
6084 : {
6085 0 : eErr = CE_Failure;
6086 0 : break;
6087 : }
6088 10 : poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
6089 : nDstTotalHeight, nBands,
6090 10 : eDataType, nullptr));
6091 : }
6092 : else
6093 : {
6094 : // Create a temporary file for the overview
6095 : auto poTmpDrv =
6096 20 : GetGDALDriverManager()->GetDriverByName("GTiff");
6097 20 : if (!poTmpDrv)
6098 : {
6099 0 : eErr = CE_Failure;
6100 0 : break;
6101 : }
6102 40 : std::string osTmpFilename;
6103 20 : auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
6104 20 : if (poDstDS)
6105 : {
6106 20 : osTmpFilename = poDstDS->GetDescription();
6107 : VSIStatBufL sStatBuf;
6108 20 : if (!osTmpFilename.empty() &&
6109 0 : VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
6110 0 : osTmpFilename += "_tmp_ovr.tif";
6111 : }
6112 20 : if (osTmpFilename.empty())
6113 : {
6114 20 : osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
6115 20 : osTmpFilename += ".tif";
6116 : }
6117 20 : CPLDebug("GDAL", "Creating temporary file %s of %d x %d x %d",
6118 : osTmpFilename.c_str(), nDstWidth, nDstHeight, nBands);
6119 40 : CPLStringList aosCO;
6120 20 : if (0 == ((nReducedDstChunkXSize % GTIFF_BLOCK_SIZE_MULTIPLE) |
6121 20 : (nReducedDstChunkYSize % GTIFF_BLOCK_SIZE_MULTIPLE)))
6122 : {
6123 14 : aosCO.SetNameValue("TILED", "YES");
6124 : aosCO.SetNameValue("BLOCKXSIZE",
6125 14 : CPLSPrintf("%d", nReducedDstChunkXSize));
6126 : aosCO.SetNameValue("BLOCKYSIZE",
6127 14 : CPLSPrintf("%d", nReducedDstChunkYSize));
6128 : }
6129 20 : if (const char *pszCOList =
6130 20 : poTmpDrv->GetMetadataItem(GDAL_DMD_CREATIONOPTIONLIST))
6131 : {
6132 : aosCO.SetNameValue(
6133 20 : "COMPRESS", strstr(pszCOList, "ZSTD") ? "ZSTD" : "LZW");
6134 : }
6135 20 : poTmpDS.reset(poTmpDrv->Create(osTmpFilename.c_str(), nDstWidth,
6136 : nDstHeight, nBands, eDataType,
6137 20 : aosCO.List()));
6138 20 : if (poTmpDS)
6139 : {
6140 18 : poTmpDS->MarkSuppressOnClose();
6141 18 : VSIUnlink(osTmpFilename.c_str());
6142 : }
6143 : }
6144 30 : if (!poTmpDS)
6145 : {
6146 2 : eErr = CE_Failure;
6147 2 : break;
6148 : }
6149 :
6150 : // Create a full size VRT to do the resampling without edge effects
6151 : auto poVRTDS =
6152 28 : CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
6153 :
6154 : // Allocate a band buffer with the overview chunk size
6155 : std::unique_ptr<void, VSIFreeReleaser> pDstBuffer(
6156 : VSI_MALLOC3_VERBOSE(size_t(nWrkDataTypeSize), nDstChunkXSize,
6157 28 : nDstChunkYSize));
6158 28 : if (pDstBuffer == nullptr)
6159 : {
6160 0 : eErr = CE_Failure;
6161 0 : break;
6162 : }
6163 :
6164 : // Use a flag to avoid reading the overview being built
6165 : GDALRasterIOExtraArg sExtraArg;
6166 28 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
6167 28 : if (iSrcOverview == -1)
6168 4 : sExtraArg.bUseOnlyThisScale = true;
6169 :
6170 : // Scale and copy data from the VRT to the temp file
6171 28 : for (int nDstYOff = nDstYOffStart;
6172 914 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6173 : /* */)
6174 : {
6175 : const int nDstYCount =
6176 886 : std::min(nReducedDstChunkYSize, nDstYOffEnd - nDstYOff);
6177 886 : for (int nDstXOff = nDstXOffStart;
6178 201218 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6179 : /* */)
6180 : {
6181 : const int nDstXCount =
6182 200332 : std::min(nReducedDstChunkXSize, nDstXOffEnd - nDstXOff);
6183 400668 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
6184 : ++iBand)
6185 : {
6186 200336 : auto poSrcBand = poVRTDS->GetRasterBand(iBand + 1);
6187 200336 : eErr = poSrcBand->RasterIO(
6188 : GF_Read, nDstXOff, nDstYOff, nDstXCount, nDstYCount,
6189 : pDstBuffer.get(), nDstXCount, nDstYCount,
6190 : eWrkDataType, 0, 0, &sExtraArg);
6191 200336 : if (eErr == CE_None)
6192 : {
6193 : // Write to the temporary dataset, shifted
6194 200334 : auto poOvrBand = poTmpDS->GetRasterBand(iBand + 1);
6195 200334 : eErr = poOvrBand->RasterIO(
6196 : GF_Write, nDstXOff - nDstXOffStart,
6197 : nDstYOff - nDstYOffStart, nDstXCount,
6198 : nDstYCount, pDstBuffer.get(), nDstXCount,
6199 : nDstYCount, eWrkDataType, 0, 0, nullptr);
6200 : }
6201 : }
6202 200332 : nDstXOff += nDstXCount;
6203 : }
6204 886 : nDstYOff += nDstYCount;
6205 : }
6206 :
6207 : // Copy from the temporary to the overview
6208 28 : for (int nDstYOff = nDstYOffStart;
6209 54 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6210 : /* */)
6211 : {
6212 : const int nDstYCount =
6213 26 : std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
6214 26 : for (int nDstXOff = nDstXOffStart;
6215 52 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6216 : /* */)
6217 : {
6218 : const int nDstXCount =
6219 26 : std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
6220 56 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
6221 : ++iBand)
6222 : {
6223 30 : auto poSrcBand = poTmpDS->GetRasterBand(iBand + 1);
6224 30 : eErr = poSrcBand->RasterIO(
6225 : GF_Read, nDstXOff - nDstXOffStart,
6226 : nDstYOff - nDstYOffStart, nDstXCount, nDstYCount,
6227 : pDstBuffer.get(), nDstXCount, nDstYCount,
6228 : eWrkDataType, 0, 0, nullptr);
6229 30 : if (eErr == CE_None)
6230 : {
6231 : // Write to the destination overview bands
6232 30 : auto poOvrBand =
6233 30 : papapoOverviewBands[iBand][iOverview];
6234 30 : eErr = poOvrBand->RasterIO(
6235 : GF_Write, nDstXOff, nDstYOff, nDstXCount,
6236 : nDstYCount, pDstBuffer.get(), nDstXCount,
6237 : nDstYCount, eWrkDataType, 0, 0, nullptr);
6238 : }
6239 : }
6240 26 : nDstXOff += nDstXCount;
6241 : }
6242 26 : nDstYOff += nDstYCount;
6243 : }
6244 :
6245 28 : if (eErr != CE_None)
6246 : {
6247 2 : CPLError(CE_Failure, CPLE_AppDefined,
6248 : "Failed to write overview %d", iOverview);
6249 2 : return eErr;
6250 : }
6251 :
6252 : // Flush the data to overviews.
6253 56 : for (int iBand = 0; iBand < nBands; ++iBand)
6254 30 : papapoOverviewBands[iBand][iOverview]->FlushCache(false);
6255 :
6256 26 : continue;
6257 : }
6258 :
6259 : // Structure describing a resampling job
6260 : struct OvrJob
6261 : {
6262 : // Buffers to free when job is finished
6263 : std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
6264 : std::unique_ptr<PointerHolder> oSrcBufferHolder{};
6265 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
6266 :
6267 : GDALRasterBand *poDstBand = nullptr;
6268 :
6269 : // Input parameters of pfnResampleFn
6270 : GDALResampleFunction pfnResampleFn = nullptr;
6271 : GDALOverviewResampleArgs args{};
6272 : const void *pChunk = nullptr;
6273 :
6274 : // Output values of resampling function
6275 : CPLErr eErr = CE_Failure;
6276 : void *pDstBuffer = nullptr;
6277 : GDALDataType eDstBufferDataType = GDT_Unknown;
6278 :
6279 3268 : void NotifyFinished()
6280 : {
6281 6536 : std::lock_guard guard(mutex);
6282 3268 : bFinished = true;
6283 3268 : cv.notify_one();
6284 3268 : }
6285 :
6286 2 : bool IsFinished()
6287 : {
6288 2 : std::lock_guard guard(mutex);
6289 4 : return bFinished;
6290 : }
6291 :
6292 16 : void WaitFinished()
6293 : {
6294 32 : std::unique_lock oGuard(mutex);
6295 21 : while (!bFinished)
6296 : {
6297 5 : cv.wait(oGuard);
6298 : }
6299 16 : }
6300 :
6301 : private:
6302 : // Synchronization
6303 : bool bFinished = false;
6304 : std::mutex mutex{};
6305 : std::condition_variable cv{};
6306 : };
6307 :
6308 : // Thread function to resample
6309 3268 : const auto JobResampleFunc = [](void *pData)
6310 : {
6311 3268 : OvrJob *poJob = static_cast<OvrJob *>(pData);
6312 :
6313 3268 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
6314 : &(poJob->pDstBuffer),
6315 : &(poJob->eDstBufferDataType));
6316 :
6317 3268 : auto pDstBuffer = poJob->pDstBuffer;
6318 : poJob->oDstBufferHolder =
6319 3268 : std::make_unique<PointerHolder>(pDstBuffer);
6320 :
6321 3268 : poJob->NotifyFinished();
6322 3268 : };
6323 :
6324 : // Function to write resample data to target band
6325 3268 : const auto WriteJobData = [](const OvrJob *poJob)
6326 : {
6327 6536 : return poJob->poDstBand->RasterIO(
6328 3268 : GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
6329 3268 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
6330 3268 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
6331 3268 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
6332 3268 : poJob->args.nDstYOff2 - poJob->args.nDstYOff,
6333 3268 : poJob->eDstBufferDataType, 0, 0, nullptr);
6334 : };
6335 :
6336 : // Wait for completion of oldest job and serialize it
6337 : const auto WaitAndFinalizeOldestJob =
6338 16 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
6339 : {
6340 16 : auto poOldestJob = jobList.front().get();
6341 16 : poOldestJob->WaitFinished();
6342 16 : CPLErr l_eErr = poOldestJob->eErr;
6343 16 : if (l_eErr == CE_None)
6344 : {
6345 16 : l_eErr = WriteJobData(poOldestJob);
6346 : }
6347 :
6348 16 : jobList.pop_front();
6349 16 : return l_eErr;
6350 : };
6351 :
6352 : // Queue of jobs
6353 1178 : std::list<std::unique_ptr<OvrJob>> jobList;
6354 :
6355 1178 : std::vector<std::unique_ptr<void, VSIFreeReleaser>> apaChunk(nBands);
6356 : std::vector<std::unique_ptr<GByte, VSIFreeReleaser>>
6357 1178 : apabyChunkNoDataMask(nBands);
6358 :
6359 : // Iterate on destination overview, block by block.
6360 589 : for (int nDstYOff = nDstYOffStart;
6361 2078 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6362 1489 : nDstYOff += nDstChunkYSize)
6363 : {
6364 : int nDstYCount;
6365 1489 : if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
6366 1077 : nDstYCount = nDstChunkYSize;
6367 : else
6368 412 : nDstYCount = nDstYOffEnd - nDstYOff;
6369 :
6370 1489 : int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
6371 1489 : int nChunkYOff2 = static_cast<int>(
6372 1489 : ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
6373 1489 : if (nChunkYOff2 > nSrcHeight ||
6374 1489 : nDstYOff + nDstYCount == nDstTotalHeight)
6375 582 : nChunkYOff2 = nSrcHeight;
6376 1489 : int nYCount = nChunkYOff2 - nChunkYOff;
6377 1489 : CPLAssert(nYCount <= nFullResYChunk);
6378 :
6379 1489 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
6380 1489 : int nChunkYSizeQueried =
6381 1489 : nYCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
6382 1489 : if (nChunkYOffQueried < 0)
6383 : {
6384 136 : nChunkYSizeQueried += nChunkYOffQueried;
6385 136 : nChunkYOffQueried = 0;
6386 : }
6387 1489 : if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
6388 135 : nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
6389 1489 : CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
6390 :
6391 1489 : if (!pfnProgress(std::min(1.0, dfCurPixelCount / dfTotalPixelCount),
6392 : nullptr, pProgressData))
6393 : {
6394 1 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6395 1 : eErr = CE_Failure;
6396 : }
6397 :
6398 : // Iterate on destination overview, block by block.
6399 1489 : for (int nDstXOff = nDstXOffStart;
6400 3017 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6401 1528 : nDstXOff += nDstChunkXSize)
6402 : {
6403 1528 : int nDstXCount = 0;
6404 1528 : if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
6405 1511 : nDstXCount = nDstChunkXSize;
6406 : else
6407 17 : nDstXCount = nDstXOffEnd - nDstXOff;
6408 :
6409 1528 : dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
6410 :
6411 1528 : int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
6412 1528 : int nChunkXOff2 = static_cast<int>(
6413 1528 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
6414 1528 : if (nChunkXOff2 > nSrcWidth ||
6415 1528 : nDstXOff + nDstXCount == nDstTotalWidth)
6416 1453 : nChunkXOff2 = nSrcWidth;
6417 1528 : const int nXCount = nChunkXOff2 - nChunkXOff;
6418 1528 : CPLAssert(nXCount <= nFullResXChunk);
6419 :
6420 1528 : int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
6421 1528 : int nChunkXSizeQueried =
6422 1528 : nXCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
6423 1528 : if (nChunkXOffQueried < 0)
6424 : {
6425 191 : nChunkXSizeQueried += nChunkXOffQueried;
6426 191 : nChunkXOffQueried = 0;
6427 : }
6428 1528 : if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
6429 200 : nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
6430 1528 : CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
6431 : #if DEBUG_VERBOSE
6432 : CPLDebug("GDAL",
6433 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
6434 : nChunkXOffQueried, nChunkYOffQueried,
6435 : nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
6436 : nDstYOff, nDstXCount, nDstYCount);
6437 : #endif
6438 :
6439 : // Avoid accumulating too many tasks and exhaust RAM
6440 :
6441 : // Try to complete already finished jobs
6442 1528 : while (eErr == CE_None && !jobList.empty())
6443 : {
6444 2 : auto poOldestJob = jobList.front().get();
6445 2 : if (!poOldestJob->IsFinished())
6446 2 : break;
6447 0 : eErr = poOldestJob->eErr;
6448 0 : if (eErr == CE_None)
6449 : {
6450 0 : eErr = WriteJobData(poOldestJob);
6451 : }
6452 :
6453 0 : jobList.pop_front();
6454 : }
6455 :
6456 : // And in case we have saturated the number of threads,
6457 : // wait for completion of tasks to go below the threshold.
6458 3056 : while (eErr == CE_None &&
6459 1528 : jobList.size() >= static_cast<size_t>(nThreads))
6460 : {
6461 0 : eErr = WaitAndFinalizeOldestJob(jobList);
6462 : }
6463 :
6464 : // Read the source buffers for all the bands.
6465 4797 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6466 : {
6467 : // (Re)allocate buffers if needed
6468 3269 : if (apaChunk[iBand] == nullptr)
6469 : {
6470 1152 : apaChunk[iBand].reset(VSI_MALLOC3_VERBOSE(
6471 : nFullResXChunkQueried, nFullResYChunkQueried,
6472 : nWrkDataTypeSize));
6473 1152 : if (apaChunk[iBand] == nullptr)
6474 : {
6475 0 : eErr = CE_Failure;
6476 : }
6477 : }
6478 3586 : if (bUseNoDataMask &&
6479 317 : apabyChunkNoDataMask[iBand] == nullptr)
6480 : {
6481 266 : apabyChunkNoDataMask[iBand].reset(
6482 266 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
6483 : nFullResXChunkQueried, nFullResYChunkQueried)));
6484 266 : if (apabyChunkNoDataMask[iBand] == nullptr)
6485 : {
6486 0 : eErr = CE_Failure;
6487 : }
6488 : }
6489 :
6490 3269 : if (eErr == CE_None)
6491 : {
6492 3269 : GDALRasterBand *poSrcBand = nullptr;
6493 3269 : if (iSrcOverview == -1)
6494 2383 : poSrcBand = papoSrcBands[iBand];
6495 : else
6496 886 : poSrcBand =
6497 886 : papapoOverviewBands[iBand][iSrcOverview];
6498 3269 : eErr = poSrcBand->RasterIO(
6499 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6500 : nChunkXSizeQueried, nChunkYSizeQueried,
6501 3269 : apaChunk[iBand].get(), nChunkXSizeQueried,
6502 : nChunkYSizeQueried, eWrkDataType, 0, 0, nullptr);
6503 :
6504 3269 : if (bUseNoDataMask && eErr == CE_None)
6505 : {
6506 317 : auto poMaskBand = poSrcBand->IsMaskBand()
6507 317 : ? poSrcBand
6508 244 : : poSrcBand->GetMaskBand();
6509 317 : eErr = poMaskBand->RasterIO(
6510 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6511 : nChunkXSizeQueried, nChunkYSizeQueried,
6512 317 : apabyChunkNoDataMask[iBand].get(),
6513 : nChunkXSizeQueried, nChunkYSizeQueried,
6514 : GDT_UInt8, 0, 0, nullptr);
6515 : }
6516 : }
6517 : }
6518 :
6519 : // Compute the resulting overview block.
6520 4796 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6521 : {
6522 6536 : auto poJob = std::make_unique<OvrJob>();
6523 3268 : poJob->pfnResampleFn = pfnResampleFn;
6524 3268 : poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
6525 6536 : poJob->args.eOvrDataType =
6526 3268 : poJob->poDstBand->GetRasterDataType();
6527 3268 : poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
6528 3268 : poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
6529 3268 : const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
6530 3268 : "NBITS", "IMAGE_STRUCTURE");
6531 3268 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
6532 3268 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
6533 3268 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
6534 3268 : poJob->args.eWrkDataType = eWrkDataType;
6535 3268 : poJob->pChunk = apaChunk[iBand].get();
6536 3268 : poJob->args.pabyChunkNodataMask =
6537 3268 : apabyChunkNoDataMask[iBand].get();
6538 3268 : poJob->args.nChunkXOff = nChunkXOffQueried;
6539 3268 : poJob->args.nChunkXSize = nChunkXSizeQueried;
6540 3268 : poJob->args.nChunkYOff = nChunkYOffQueried;
6541 3268 : poJob->args.nChunkYSize = nChunkYSizeQueried;
6542 3268 : poJob->args.nDstXOff = nDstXOff;
6543 3268 : poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
6544 3268 : poJob->args.nDstYOff = nDstYOff;
6545 3268 : poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
6546 3268 : poJob->args.pszResampling = pszResampling;
6547 3268 : poJob->args.bHasNoData = abHasNoData[iBand];
6548 3268 : poJob->args.dfNoDataValue = adfNoDataValue[iBand];
6549 3268 : poJob->args.eSrcDataType = eDataType;
6550 3268 : poJob->args.bPropagateNoData = bPropagateNoData;
6551 :
6552 3268 : if (poJobQueue)
6553 : {
6554 16 : poJob->oSrcMaskBufferHolder =
6555 32 : std::make_unique<PointerHolder>(
6556 32 : std::move(apabyChunkNoDataMask[iBand]));
6557 :
6558 16 : poJob->oSrcBufferHolder =
6559 32 : std::make_unique<PointerHolder>(
6560 32 : std::move(apaChunk[iBand]));
6561 :
6562 16 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
6563 16 : jobList.emplace_back(std::move(poJob));
6564 : }
6565 : else
6566 : {
6567 3252 : JobResampleFunc(poJob.get());
6568 3252 : eErr = poJob->eErr;
6569 3252 : if (eErr == CE_None)
6570 : {
6571 3252 : eErr = WriteJobData(poJob.get());
6572 : }
6573 : }
6574 : }
6575 : }
6576 : }
6577 :
6578 : // Wait for all pending jobs to complete
6579 605 : while (!jobList.empty())
6580 : {
6581 16 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
6582 16 : if (l_eErr != CE_None && eErr == CE_None)
6583 0 : eErr = l_eErr;
6584 : }
6585 :
6586 : // Flush the data to overviews.
6587 1739 : for (int iBand = 0; iBand < nBands; ++iBand)
6588 : {
6589 1150 : if (papapoOverviewBands[iBand][iOverview]->FlushCache(false) !=
6590 : CE_None)
6591 0 : eErr = CE_Failure;
6592 : }
6593 : }
6594 :
6595 377 : if (eErr == CE_None)
6596 373 : pfnProgress(1.0, nullptr, pProgressData);
6597 :
6598 377 : return eErr;
6599 : }
6600 :
6601 : /************************************************************************/
6602 : /* GDALRegenerateOverviewsMultiBand() */
6603 : /************************************************************************/
6604 :
6605 : /**
6606 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
6607 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
6608 : *
6609 : * This function will generate one or more overview images from a base
6610 : * image using the requested downsampling algorithm. Its primary use
6611 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
6612 : * can also be used to generate downsampled images in one file from another
6613 : * outside the overview architecture.
6614 : *
6615 : * The output bands need to exist in advance and share the same characteristics
6616 : * (type, dimensions)
6617 : *
6618 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
6619 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
6620 : *
6621 : * It does not support color tables or complex data types.
6622 : *
6623 : * The pseudo-algorithm used by the function is :
6624 : * for each overview
6625 : * iterate on lines of the source by a step of deltay
6626 : * iterate on columns of the source by a step of deltax
6627 : * read the source data of size deltax * deltay for all the bands
6628 : * generate the corresponding overview block for all the bands
6629 : *
6630 : * This function will honour properly NODATA_VALUES tuples (special dataset
6631 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
6632 : * considered as the nodata value and not each value of the triplet
6633 : * independently per band.
6634 : *
6635 : * The GDAL_NUM_THREADS configuration option can be set
6636 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
6637 : * overview computation.
6638 : *
6639 : * @param apoSrcBands the list of source bands to downsample
6640 : * @param aapoOverviewBands bidimensional array of bands. First dimension is
6641 : * indexed by bands. Second dimension is indexed by
6642 : * overview levels. All aapoOverviewBands[i] arrays
6643 : * must have the same size (i.e. same number of
6644 : * overviews)
6645 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
6646 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
6647 : * @param pfnProgress progress report function.
6648 : * @param pProgressData progress function callback data.
6649 : * @param papszOptions NULL terminated list of options as
6650 : * key=value pairs, or NULL
6651 : * The XOFF, YOFF, XSIZE and YSIZE
6652 : * options can be specified to express that overviews should
6653 : * be regenerated only in the specified subset of the source
6654 : * dataset.
6655 : * @return CE_None on success or CE_Failure on failure.
6656 : * @since 3.10
6657 : */
6658 :
6659 19 : CPLErr GDALRegenerateOverviewsMultiBand(
6660 : const std::vector<GDALRasterBand *> &apoSrcBands,
6661 : const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
6662 : const char *pszResampling, GDALProgressFunc pfnProgress,
6663 : void *pProgressData, CSLConstList papszOptions)
6664 : {
6665 19 : CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
6666 29 : for (size_t i = 1; i < aapoOverviewBands.size(); ++i)
6667 : {
6668 10 : CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());
6669 : }
6670 :
6671 19 : if (aapoOverviewBands.empty())
6672 0 : return CE_None;
6673 :
6674 19 : std::vector<GDALRasterBand **> apapoOverviewBands;
6675 48 : for (auto &apoOverviewBands : aapoOverviewBands)
6676 : {
6677 : auto papoOverviewBands = static_cast<GDALRasterBand **>(
6678 29 : CPLMalloc(apoOverviewBands.size() * sizeof(GDALRasterBand *)));
6679 61 : for (size_t i = 0; i < apoOverviewBands.size(); ++i)
6680 : {
6681 32 : papoOverviewBands[i] = apoOverviewBands[i];
6682 : }
6683 29 : apapoOverviewBands.push_back(papoOverviewBands);
6684 : }
6685 38 : const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
6686 19 : static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
6687 19 : static_cast<int>(aapoOverviewBands[0].size()),
6688 19 : apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
6689 : papszOptions);
6690 48 : for (GDALRasterBand **papoOverviewBands : apapoOverviewBands)
6691 29 : CPLFree(papoOverviewBands);
6692 19 : return eErr;
6693 : }
6694 :
6695 : /************************************************************************/
6696 : /* GDALComputeBandStats() */
6697 : /************************************************************************/
6698 :
6699 : /** Undocumented
6700 : * @param hSrcBand undocumented.
6701 : * @param nSampleStep Step between scanlines used to compute statistics.
6702 : * When nSampleStep is equal to 1, all scanlines will
6703 : * be processed.
6704 : * @param pdfMean undocumented.
6705 : * @param pdfStdDev undocumented.
6706 : * @param pfnProgress undocumented.
6707 : * @param pProgressData undocumented.
6708 : * @return undocumented
6709 : */
6710 18 : CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
6711 : int nSampleStep, double *pdfMean,
6712 : double *pdfStdDev,
6713 : GDALProgressFunc pfnProgress,
6714 : void *pProgressData)
6715 :
6716 : {
6717 18 : VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
6718 :
6719 18 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
6720 :
6721 18 : if (pfnProgress == nullptr)
6722 18 : pfnProgress = GDALDummyProgress;
6723 :
6724 18 : const int nWidth = poSrcBand->GetXSize();
6725 18 : const int nHeight = poSrcBand->GetYSize();
6726 :
6727 18 : if (nSampleStep >= nHeight || nSampleStep < 1)
6728 5 : nSampleStep = 1;
6729 :
6730 18 : GDALDataType eWrkType = GDT_Unknown;
6731 18 : float *pafData = nullptr;
6732 18 : GDALDataType eType = poSrcBand->GetRasterDataType();
6733 18 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6734 18 : if (bComplex)
6735 : {
6736 : pafData = static_cast<float *>(
6737 0 : VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6738 0 : eWrkType = GDT_CFloat32;
6739 : }
6740 : else
6741 : {
6742 : pafData =
6743 18 : static_cast<float *>(VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6744 18 : eWrkType = GDT_Float32;
6745 : }
6746 :
6747 18 : if (nWidth == 0 || pafData == nullptr)
6748 : {
6749 0 : VSIFree(pafData);
6750 0 : return CE_Failure;
6751 : }
6752 :
6753 : /* -------------------------------------------------------------------- */
6754 : /* Loop over all sample lines. */
6755 : /* -------------------------------------------------------------------- */
6756 18 : double dfSum = 0.0;
6757 18 : double dfSum2 = 0.0;
6758 18 : int iLine = 0;
6759 18 : GIntBig nSamples = 0;
6760 :
6761 2143 : do
6762 : {
6763 2161 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6764 : pProgressData))
6765 : {
6766 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6767 0 : CPLFree(pafData);
6768 0 : return CE_Failure;
6769 : }
6770 :
6771 : const CPLErr eErr =
6772 2161 : poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
6773 : 1, eWrkType, 0, 0, nullptr);
6774 2161 : if (eErr != CE_None)
6775 : {
6776 1 : CPLFree(pafData);
6777 1 : return eErr;
6778 : }
6779 :
6780 725208 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6781 : {
6782 723048 : float fValue = 0.0f;
6783 :
6784 723048 : if (bComplex)
6785 : {
6786 : // Compute the magnitude of the complex value.
6787 : fValue =
6788 0 : std::hypot(pafData[static_cast<size_t>(iPixel) * 2],
6789 0 : pafData[static_cast<size_t>(iPixel) * 2 + 1]);
6790 : }
6791 : else
6792 : {
6793 723048 : fValue = pafData[iPixel];
6794 : }
6795 :
6796 723048 : dfSum += static_cast<double>(fValue);
6797 723048 : dfSum2 += static_cast<double>(fValue) * static_cast<double>(fValue);
6798 : }
6799 :
6800 2160 : nSamples += nWidth;
6801 2160 : iLine += nSampleStep;
6802 2160 : } while (iLine < nHeight);
6803 :
6804 17 : if (!pfnProgress(1.0, nullptr, pProgressData))
6805 : {
6806 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6807 0 : CPLFree(pafData);
6808 0 : return CE_Failure;
6809 : }
6810 :
6811 : /* -------------------------------------------------------------------- */
6812 : /* Produce the result values. */
6813 : /* -------------------------------------------------------------------- */
6814 17 : if (pdfMean != nullptr)
6815 17 : *pdfMean = dfSum / nSamples;
6816 :
6817 17 : if (pdfStdDev != nullptr)
6818 : {
6819 17 : const double dfMean = dfSum / nSamples;
6820 :
6821 17 : *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
6822 : }
6823 :
6824 17 : CPLFree(pafData);
6825 :
6826 17 : return CE_None;
6827 : }
6828 :
6829 : /************************************************************************/
6830 : /* GDALOverviewMagnitudeCorrection() */
6831 : /* */
6832 : /* Correct the mean and standard deviation of the overviews of */
6833 : /* the given band to match the base layer approximately. */
6834 : /************************************************************************/
6835 :
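/** Rescale overview bands so that their statistics approximate the base band.
 *
 * The mean and standard deviation of the base band are computed (sampling
 * one line out of two), then each overview is read line by line, adjusted
 * and written back in place. The gain is the ratio of the base standard
 * deviation to the overview standard deviation, or 1 when the base standard
 * deviation is below 0.0001. Non-complex pixels become
 * (value - overview mean) * gain + base mean; complex pixels only have both
 * components multiplied by the gain.
 *
 * @param hBaseBand the band whose statistics are used as the reference.
 * @param nOverviewCount number of bands in pahOverviews.
 * @param pahOverviews the overview bands to correct; their pixels are
 * rewritten.
 * @param pfnProgress progress callback; returning FALSE aborts the operation.
 * @param pProgressData user data passed to pfnProgress.
 * @return CE_None on success, or an error code if computing statistics,
 * reading, writing or allocating fails, or if the user interrupts.
 */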
6844 0 : CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
6845 : int nOverviewCount,
6846 : GDALRasterBandH *pahOverviews,
6847 : GDALProgressFunc pfnProgress,
6848 : void *pProgressData)
6849 :
6850 : {
6851 0 : VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
6852 :
6853 : /* -------------------------------------------------------------------- */
6854 : /* Compute mean/stddev for source raster. */
6855 : /* -------------------------------------------------------------------- */
6856 0 : double dfOrigMean = 0.0;
6857 0 : double dfOrigStdDev = 0.0;
6858 : {
6859 : const CPLErr eErr =
6860 0 : GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
6861 : pfnProgress, pProgressData);
6862 :
6863 0 : if (eErr != CE_None)
6864 0 : return eErr;
6865 : }
6866 :
6867 : /* -------------------------------------------------------------------- */
6868 : /* Loop on overview bands. */
6869 : /* -------------------------------------------------------------------- */
6870 0 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
6871 : {
6872 : GDALRasterBand *poOverview =
6873 0 : GDALRasterBand::FromHandle(pahOverviews[iOverview]);
6874 : double dfOverviewMean, dfOverviewStdDev;
6875 :
6876 : const CPLErr eErr =
6877 0 : GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
6878 : &dfOverviewStdDev, pfnProgress, pProgressData);
6879 :
6880 0 : if (eErr != CE_None)
6881 0 : return eErr;
6882 :
6883 0 : double dfGain = 1.0;
6884 0 : if (dfOrigStdDev >= 0.0001)
6885 0 : dfGain = dfOrigStdDev / dfOverviewStdDev;
6886 :
6887 : /* --------------------------------------------------------------------
6888 : */
6889 : /* Apply gain and offset. */
6890 : /* --------------------------------------------------------------------
6891 : */
6892 0 : const int nWidth = poOverview->GetXSize();
6893 0 : const int nHeight = poOverview->GetYSize();
6894 :
6895 0 : GDALDataType eWrkType = GDT_Unknown;
6896 0 : float *pafData = nullptr;
6897 0 : const GDALDataType eType = poOverview->GetRasterDataType();
6898 0 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6899 0 : if (bComplex)
6900 : {
6901 : pafData = static_cast<float *>(
6902 0 : VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6903 0 : eWrkType = GDT_CFloat32;
6904 : }
6905 : else
6906 : {
6907 : pafData = static_cast<float *>(
6908 0 : VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6909 0 : eWrkType = GDT_Float32;
6910 : }
6911 :
6912 0 : if (pafData == nullptr)
6913 : {
6914 0 : return CE_Failure;
6915 : }
6916 :
6917 0 : for (int iLine = 0; iLine < nHeight; ++iLine)
6918 : {
6919 0 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6920 : pProgressData))
6921 : {
6922 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6923 0 : CPLFree(pafData);
6924 0 : return CE_Failure;
6925 : }
6926 :
6927 0 : if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
6928 : nWidth, 1, eWrkType, 0, 0,
6929 0 : nullptr) != CE_None)
6930 : {
6931 0 : CPLFree(pafData);
6932 0 : return CE_Failure;
6933 : }
6934 :
6935 0 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6936 : {
6937 0 : if (bComplex)
6938 : {
6939 0 : pafData[static_cast<size_t>(iPixel) * 2] *=
6940 0 : static_cast<float>(dfGain);
6941 0 : pafData[static_cast<size_t>(iPixel) * 2 + 1] *=
6942 0 : static_cast<float>(dfGain);
6943 : }
6944 : else
6945 : {
6946 0 : pafData[iPixel] = static_cast<float>(
6947 0 : (double(pafData[iPixel]) - dfOverviewMean) * dfGain +
6948 : dfOrigMean);
6949 : }
6950 : }
6951 :
6952 0 : if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
6953 : nWidth, 1, eWrkType, 0, 0,
6954 0 : nullptr) != CE_None)
6955 : {
6956 0 : CPLFree(pafData);
6957 0 : return CE_Failure;
6958 : }
6959 : }
6960 :
6961 0 : if (!pfnProgress(1.0, nullptr, pProgressData))
6962 : {
6963 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6964 0 : CPLFree(pafData);
6965 0 : return CE_Failure;
6966 : }
6967 :
6968 0 : CPLFree(pafData);
6969 : }
6970 :
6971 0 : return CE_None;
6972 : }
|