Line data Source code
1 :
2 : /******************************************************************************
3 : *
4 : * Project: GDAL Core
5 : * Purpose: Helper code to implement overview support in different drivers.
6 : * Author: Frank Warmerdam, warmerdam@pobox.com
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 2000, Frank Warmerdam
10 : * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * SPDX-License-Identifier: MIT
13 : ****************************************************************************/
14 :
15 : #include "cpl_port.h"
16 : #include "gdal_priv.h"
17 :
18 : #include <cmath>
19 : #include <cstddef>
20 : #include <cstdlib>
21 :
22 : #include <algorithm>
23 : #include <complex>
24 : #include <condition_variable>
25 : #include <limits>
26 : #include <list>
27 : #include <memory>
28 : #include <mutex>
29 : #include <vector>
30 :
31 : #include "cpl_conv.h"
32 : #include "cpl_error.h"
33 : #include "cpl_float.h"
34 : #include "cpl_progress.h"
35 : #include "cpl_vsi.h"
36 : #include "gdal.h"
37 : #include "gdal_thread_pool.h"
38 : #include "gdalwarper.h"
39 : #include "gdal_vrt.h"
40 : #include "vrtdataset.h"
41 :
42 : #ifdef USE_NEON_OPTIMIZATIONS
43 : #include "include_sse2neon.h"
44 :
45 : #if (!defined(__aarch64__) && !defined(_M_ARM64))
46 : #define ARM_V7
47 : #endif
48 :
49 : #define USE_SSE2
50 :
51 : #include "gdalsse_priv.h"
52 :
53 : // Restrict to 64bit processors because they are guaranteed to have SSE2,
54 : // or if __AVX2__ is defined.
55 : #elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
56 : #define USE_SSE2
57 :
58 : #include "gdalsse_priv.h"
59 :
60 : #ifdef __SSE3__
61 : #include <pmmintrin.h>
62 : #endif
63 : #ifdef __SSSE3__
64 : #include <tmmintrin.h>
65 : #endif
66 : #ifdef __SSE4_1__
67 : #include <smmintrin.h>
68 : #endif
69 : #ifdef __AVX2__
70 : #include <immintrin.h>
71 : #endif
72 :
73 : #endif
74 :
75 : // To be included after above USE_SSE2 and include gdalsse_priv.h
76 : // to avoid build issue on Windows x86
77 : #include "gdal_priv_templates.hpp"
78 :
79 : /************************************************************************/
80 : /* GDALResampleChunk_Near() */
81 : /************************************************************************/
82 :
83 : template <class T>
84 1245 : static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
85 : const T *pChunk, T **ppDstBuffer)
86 :
87 : {
88 1245 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
89 1245 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
90 1245 : const GDALDataType eWrkDataType = args.eWrkDataType;
91 1245 : const int nChunkXOff = args.nChunkXOff;
92 1245 : const int nChunkXSize = args.nChunkXSize;
93 1245 : const int nChunkYOff = args.nChunkYOff;
94 1245 : const int nDstXOff = args.nDstXOff;
95 1245 : const int nDstXOff2 = args.nDstXOff2;
96 1245 : const int nDstYOff = args.nDstYOff;
97 1245 : const int nDstYOff2 = args.nDstYOff2;
98 1245 : const int nDstXWidth = nDstXOff2 - nDstXOff;
99 :
100 : /* -------------------------------------------------------------------- */
101 : /* Allocate buffers. */
102 : /* -------------------------------------------------------------------- */
103 1245 : *ppDstBuffer = static_cast<T *>(
104 1245 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
105 : GDALGetDataTypeSizeBytes(eWrkDataType)));
106 1245 : if (*ppDstBuffer == nullptr)
107 : {
108 0 : return CE_Failure;
109 : }
110 1245 : T *const pDstBuffer = *ppDstBuffer;
111 :
112 : int *panSrcXOff =
113 1245 : static_cast<int *>(VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(int)));
114 :
115 1245 : if (panSrcXOff == nullptr)
116 : {
117 0 : return CE_Failure;
118 : }
119 :
120 : /* ==================================================================== */
121 : /* Precompute inner loop constants. */
122 : /* ==================================================================== */
123 842563 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
124 : {
125 841318 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
126 841318 : if (nSrcXOff < nChunkXOff)
127 0 : nSrcXOff = nChunkXOff;
128 :
129 841318 : panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
130 : }
131 :
132 : /* ==================================================================== */
133 : /* Loop over destination scanlines. */
134 : /* ==================================================================== */
135 142379 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
136 : {
137 141134 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
138 141134 : if (nSrcYOff < nChunkYOff)
139 0 : nSrcYOff = nChunkYOff;
140 :
141 141134 : const T *const pSrcScanline =
142 : pChunk +
143 141134 : (static_cast<size_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
144 138100 : nChunkXOff;
145 :
146 : /* --------------------------------------------------------------------
147 : */
148 : /* Loop over destination pixels */
149 : /* --------------------------------------------------------------------
150 : */
151 141134 : T *pDstScanline =
152 141134 : pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
153 119889794 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
154 : {
155 119748760 : pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
156 : }
157 : }
158 :
159 1245 : CPLFree(panSrcXOff);
160 :
161 1245 : return CE_None;
162 : }
163 :
164 1245 : static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
165 : const void *pChunk, void **ppDstBuffer,
166 : GDALDataType *peDstBufferDataType)
167 : {
168 1245 : *peDstBufferDataType = args.eWrkDataType;
169 1245 : switch (args.eWrkDataType)
170 : {
171 : // For nearest resampling, as no computation is done, only the
172 : // size of the data type matters.
173 1088 : case GDT_Byte:
174 : case GDT_Int8:
175 : {
176 1088 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 1);
177 1088 : return GDALResampleChunk_NearT(
178 : args, static_cast<const uint8_t *>(pChunk),
179 1088 : reinterpret_cast<uint8_t **>(ppDstBuffer));
180 : }
181 :
182 52 : case GDT_Int16:
183 : case GDT_UInt16:
184 : case GDT_Float16:
185 : {
186 52 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
187 52 : return GDALResampleChunk_NearT(
188 : args, static_cast<const uint16_t *>(pChunk),
189 52 : reinterpret_cast<uint16_t **>(ppDstBuffer));
190 : }
191 :
192 57 : case GDT_CInt16:
193 : case GDT_CFloat16:
194 : case GDT_Int32:
195 : case GDT_UInt32:
196 : case GDT_Float32:
197 : {
198 57 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
199 57 : return GDALResampleChunk_NearT(
200 : args, static_cast<const uint32_t *>(pChunk),
201 57 : reinterpret_cast<uint32_t **>(ppDstBuffer));
202 : }
203 :
204 44 : case GDT_CInt32:
205 : case GDT_CFloat32:
206 : case GDT_Int64:
207 : case GDT_UInt64:
208 : case GDT_Float64:
209 : {
210 44 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
211 44 : return GDALResampleChunk_NearT(
212 : args, static_cast<const uint64_t *>(pChunk),
213 44 : reinterpret_cast<uint64_t **>(ppDstBuffer));
214 : }
215 :
216 4 : case GDT_CFloat64:
217 : {
218 4 : return GDALResampleChunk_NearT(
219 : args, static_cast<const std::complex<double> *>(pChunk),
220 4 : reinterpret_cast<std::complex<double> **>(ppDstBuffer));
221 : }
222 :
223 0 : case GDT_Unknown:
224 : case GDT_TypeCount:
225 0 : break;
226 : }
227 0 : CPLAssert(false);
228 : return CE_Failure;
229 : }
230 :
231 : namespace
232 : {
233 :
234 : // Find in the color table the entry whose RGB value is the closest
235 : // (using quadratic distance) to the test color, ignoring transparent entries.
236 3837 : int BestColorEntry(const std::vector<GDALColorEntry> &entries,
237 : const GDALColorEntry &test)
238 : {
239 3837 : int nMinDist = std::numeric_limits<int>::max();
240 3837 : size_t bestEntry = 0;
241 986109 : for (size_t i = 0; i < entries.size(); ++i)
242 : {
243 982272 : const GDALColorEntry &entry = entries[i];
244 : // Ignore transparent entries
245 982272 : if (entry.c4 == 0)
246 3237 : continue;
247 :
248 979035 : int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
249 979035 : ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
250 979035 : ((test.c3 - entry.c3) * (test.c3 - entry.c3));
251 979035 : if (nDist < nMinDist)
252 : {
253 15847 : nMinDist = nDist;
254 15847 : bestEntry = i;
255 : }
256 : }
257 3837 : return static_cast<int>(bestEntry);
258 : }
259 :
260 7 : std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
261 : int &transparentIdx)
262 : {
263 7 : std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
264 :
265 7 : transparentIdx = -1;
266 7 : int i = 0;
267 1799 : for (auto &entry : entries)
268 : {
269 1792 : table.GetColorEntryAsRGB(i, &entry);
270 1792 : if (transparentIdx < 0 && entry.c4 == 0)
271 1 : transparentIdx = i;
272 1792 : ++i;
273 : }
274 7 : return entries;
275 : }
276 :
277 : } // unnamed namespace
278 :
279 : /************************************************************************/
280 : /* SQUARE() */
281 : /************************************************************************/
282 :
// Return val * val, with the left operand first converted to Tsquare so that
// the product can be computed on a type wider than T when requested.
template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
{
    const Tsquare widened = static_cast<Tsquare>(val);
    return widened * val;
}
287 :
288 : /************************************************************************/
289 : /* ComputeIntegerRMS() */
290 : /************************************************************************/
291 : // Compute rms = sqrt(sumSquares / weight) in such a way that it is the
292 : // integer that minimizes abs(rms**2 - sumSquares / weight)
293 : template <class T, class Twork>
294 42 : inline T ComputeIntegerRMS(double sumSquares, double weight)
295 : {
296 42 : const double sumDivWeight = sumSquares / weight;
297 42 : T rms = static_cast<T>(sqrt(sumDivWeight));
298 :
299 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
300 : // Naive version:
301 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
302 42 : if (static_cast<double>(static_cast<Twork>(2) * rms * (rms + 1) + 1) <
303 42 : 2 * sumDivWeight)
304 6 : rms += 1;
305 42 : return rms;
306 : }
307 :
308 : template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
309 : {
310 : CPLAssert(false);
311 : return 0;
312 : }
313 :
314 28 : template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
315 : {
316 : // It has been verified that given the correction on rms below, using
317 : // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
318 : // is equivalent, so use the former as it is used twice.
319 28 : const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
320 28 : const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
321 28 : GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
322 :
323 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
324 : // Naive version:
325 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
326 : // Optimized version for integer case and weight == 4
327 28 : if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
328 5 : rms += 1;
329 28 : return rms;
330 : }
331 :
332 : template <>
333 24 : inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
334 : {
335 24 : const double sumDivWeight = sumSquares * 0.25;
336 24 : GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
337 :
338 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
339 : // Naive version:
340 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
341 : // Optimized version for integer case and weight == 4
342 24 : if (static_cast<GUInt32>(rms) * (rms + 1) <
343 24 : static_cast<GUInt32>(sumDivWeight + 0.25))
344 4 : rms += 1;
345 24 : return rms;
346 : }
347 :
348 : #ifdef USE_SSE2
349 :
350 : /************************************************************************/
351 : /* QuadraticMeanByteSSE2OrAVX2() */
352 : /************************************************************************/
353 :
354 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
355 : #define sse2_packus_epi32 _mm_packus_epi32
356 : #else
357 516139 : inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
358 : {
359 516139 : const auto minus32768_32 = _mm_set1_epi32(-32768);
360 516139 : const auto minus32768_16 = _mm_set1_epi16(-32768);
361 516139 : a = _mm_add_epi32(a, minus32768_32);
362 516139 : b = _mm_add_epi32(b, minus32768_32);
363 516139 : a = _mm_packs_epi32(a, b);
364 516139 : a = _mm_sub_epi16(a, minus32768_16);
365 516139 : return a;
366 : }
367 : #endif
368 :
369 : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
370 : #define sse2_hadd_epi16 _mm_hadd_epi16
371 : #else
372 4715530 : inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
373 : {
374 : // Horizontal addition of adjacent pairs
375 4715530 : const auto mask = _mm_set1_epi32(0xFFFF);
376 : const auto horizLo =
377 14146600 : _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
378 : const auto horizHi =
379 14146600 : _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
380 :
381 : // Recombine low and high parts
382 4715530 : return _mm_packs_epi32(horizLo, horizHi);
383 : }
384 : #endif
385 :
386 : #ifdef __AVX2__
387 :
// Width-agnostic aliases, AVX2 flavour: each short name expands to the
// 256-bit _mm256_* intrinsic of the same name, so that the kernels written
// with these aliases operate on __m256i / __m256 registers in this build.
388 : #define set1_epi16 _mm256_set1_epi16
389 : #define set1_epi32 _mm256_set1_epi32
390 : #define setzero _mm256_setzero_si256
391 : #define set1_ps _mm256_set1_ps
// Unaligned load of one full vector from address x
392 : #define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
393 : #define unpacklo_epi8 _mm256_unpacklo_epi8
394 : #define unpackhi_epi8 _mm256_unpackhi_epi8
395 : #define madd_epi16 _mm256_madd_epi16
396 : #define add_epi32 _mm256_add_epi32
397 : #define mul_ps _mm256_mul_ps
398 : #define cvtepi32_ps _mm256_cvtepi32_ps
399 : #define sqrt_ps _mm256_sqrt_ps
400 : #define cvttps_epi32 _mm256_cvttps_epi32
401 : #define packs_epi32 _mm256_packs_epi32
402 : #define packus_epi32 _mm256_packus_epi32
403 : #define srli_epi32 _mm256_srli_epi32
404 : #define mullo_epi16 _mm256_mullo_epi16
405 : #define srli_epi16 _mm256_srli_epi16
406 : #define cmpgt_epi16 _mm256_cmpgt_epi16
407 : #define add_epi16 _mm256_add_epi16
408 : #define sub_epi16 _mm256_sub_epi16
409 : #define packus_epi16 _mm256_packus_epi16
410 :
411 : /* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
412 : /* to get the lower 128-bit bits of what would be a true 256-bit vector register
413 : */
414 :
415 : inline __m256i FIXUP_LANES(__m256i x)
416 : {
417 : return _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0));
418 : }
419 :
// AVX2 flavour of the store helpers. Both first pass y through FIXUP_LANES().
// store_lo(x, y): unaligned store to x of the low 128 bits of FIXUP_LANES(y).
420 : #define store_lo(x, y) \
421 : _mm_storeu_si128(reinterpret_cast<__m128i *>(x), \
422 : _mm256_extracti128_si256(FIXUP_LANES(y), 0))
// storeu_int(x, y): unaligned store to x of the full 256 bits of
// FIXUP_LANES(y).
423 : #define storeu_int(x, y) \
424 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(x), FIXUP_LANES(y))
425 : #define hadd_epi16 _mm256_hadd_epi16
426 : #else
// Width-agnostic aliases, 128-bit flavour (used when __AVX2__ is not
// defined): each short name expands to the corresponding _mm_* intrinsic,
// except packus_epi32 and hadd_epi16 which go through the sse2_* names
// (themselves either the native intrinsic or an SSE2 emulation).
427 : #define set1_epi16 _mm_set1_epi16
428 : #define set1_epi32 _mm_set1_epi32
429 : #define setzero _mm_setzero_si128
430 : #define set1_ps _mm_set1_ps
// Unaligned load of one full vector from address x
431 : #define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
432 : #define unpacklo_epi8 _mm_unpacklo_epi8
433 : #define unpackhi_epi8 _mm_unpackhi_epi8
434 : #define madd_epi16 _mm_madd_epi16
435 : #define add_epi32 _mm_add_epi32
436 : #define mul_ps _mm_mul_ps
437 : #define cvtepi32_ps _mm_cvtepi32_ps
438 : #define sqrt_ps _mm_sqrt_ps
439 : #define cvttps_epi32 _mm_cvttps_epi32
440 : #define packs_epi32 _mm_packs_epi32
441 : #define packus_epi32 sse2_packus_epi32
442 : #define srli_epi32 _mm_srli_epi32
443 : #define mullo_epi16 _mm_mullo_epi16
444 : #define srli_epi16 _mm_srli_epi16
445 : #define cmpgt_epi16 _mm_cmpgt_epi16
446 : #define add_epi16 _mm_add_epi16
447 : #define sub_epi16 _mm_sub_epi16
448 : #define packus_epi16 _mm_packus_epi16
// store_lo(x, y): store to x the low 64 bits of y.
449 : #define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
// storeu_int(x, y): unaligned store to x of the full 128 bits of y.
450 : #define storeu_int(x, y) _mm_storeu_si128(reinterpret_cast<__m128i *>(x), (y))
451 : #define hadd_epi16 sse2_hadd_epi16
452 : #endif
453 :
454 : template <class T>
455 : static int
456 : #if defined(__GNUC__)
457 : __attribute__((noinline))
458 : #endif
459 5389 : QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
460 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
461 : T *CPL_RESTRICT pDstScanline)
462 : {
463 : // Optimized implementation for RMS on Byte by
464 : // processing by group of 8 output pixels, so as to use
465 : // a single _mm_sqrt_ps() call for 4 output pixels
466 5389 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
467 :
468 5389 : int iDstPixel = 0;
469 5389 : const auto one16 = set1_epi16(1);
470 5389 : const auto one32 = set1_epi32(1);
471 5389 : const auto zero = setzero();
472 5389 : const auto minus32768 = set1_epi16(-32768);
473 :
474 5389 : constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;
475 521504 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
476 : {
477 : // Load 2 * DEST_ELTS bytes from each line
478 516115 : auto firstLine = loadu_int(pSrcScanlineShifted);
479 1032230 : auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
480 : // Extend those Bytes as UInt16s
481 516115 : auto firstLineLo = unpacklo_epi8(firstLine, zero);
482 516115 : auto firstLineHi = unpackhi_epi8(firstLine, zero);
483 516115 : auto secondLineLo = unpacklo_epi8(secondLine, zero);
484 516115 : auto secondLineHi = unpackhi_epi8(secondLine, zero);
485 :
486 : // Multiplication of 16 bit values and horizontal
487 : // addition of 32 bit results
488 : // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
489 516115 : firstLineLo = madd_epi16(firstLineLo, firstLineLo);
490 516115 : firstLineHi = madd_epi16(firstLineHi, firstLineHi);
491 516115 : secondLineLo = madd_epi16(secondLineLo, secondLineLo);
492 516115 : secondLineHi = madd_epi16(secondLineHi, secondLineHi);
493 :
494 : // Vertical addition
495 516115 : const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
496 516115 : const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
497 :
498 : const auto sumSquaresPlusOneDiv4Lo =
499 1032230 : srli_epi32(add_epi32(sumSquaresLo, one32), 2);
500 : const auto sumSquaresPlusOneDiv4Hi =
501 1032230 : srli_epi32(add_epi32(sumSquaresHi, one32), 2);
502 :
503 : // Take square root and truncate/floor to int32
504 : const auto rmsLo =
505 1548340 : cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
506 : const auto rmsHi =
507 1548340 : cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
508 :
509 : // Merge back low and high registers with each RMS value
510 : // as a 16 bit value.
511 516115 : auto rms = packs_epi32(rmsLo, rmsHi);
512 :
513 : // Round to upper value if it minimizes the
514 : // error |rms^2 - sumSquares/4|
515 : // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
516 : // rms += 1;
517 : // which is equivalent to:
518 : // if( rms * (rms + 1) < (sumSquares+1) / 4 )
519 : // rms += 1;
520 : // And both left and right parts fit on 16 (unsigned) bits
521 : const auto sumSquaresPlusOneDiv4 =
522 516115 : packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
523 : // cmpgt_epi16 operates on signed int16, but here
524 : // we have unsigned values, so shift them by -32768 before
525 2580580 : const auto mask = cmpgt_epi16(
526 : add_epi16(sumSquaresPlusOneDiv4, minus32768),
527 : add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
528 : // The value of the mask will be -1 when the correction needs to be
529 : // applied
530 516115 : rms = sub_epi16(rms, mask);
531 :
532 : // Pack each 16 bit RMS value to 8 bits
533 516115 : rms = packus_epi16(rms, rms /* could be anything */);
534 516115 : store_lo(&pDstScanline[iDstPixel], rms);
535 516115 : pSrcScanlineShifted += 2 * DEST_ELTS;
536 : }
537 :
538 5389 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
539 5389 : return iDstPixel;
540 : }
541 :
542 : /************************************************************************/
543 : /* AverageByteSSE2OrAVX2() */
544 : /************************************************************************/
545 :
546 : static int
547 111734 : AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
548 : const GByte *&CPL_RESTRICT pSrcScanlineShiftedInOut,
549 : GByte *CPL_RESTRICT pDstScanline)
550 : {
551 : // Optimized implementation for average on Byte by
552 : // processing by group of 16 output pixels for SSE2, or 32 for AVX2
553 :
554 111734 : const auto zero = setzero();
555 111734 : const auto two16 = set1_epi16(2);
556 111734 : const GByte *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
557 :
558 111734 : constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;
559 111734 : int iDstPixel = 0;
560 2469500 : for (; iDstPixel < nDstXWidth - (2 * DEST_ELTS - 1);
561 2357770 : iDstPixel += 2 * DEST_ELTS)
562 : {
563 : decltype(setzero()) average0;
564 : {
565 : // Load 2 * DEST_ELTS bytes from each line
566 2357770 : const auto firstLine = loadu_int(pSrcScanlineShifted);
567 : const auto secondLine =
568 4715530 : loadu_int(pSrcScanlineShifted + nChunkXSize);
569 : // Extend those Bytes as UInt16s
570 2357770 : const auto firstLineLo = unpacklo_epi8(firstLine, zero);
571 2357770 : const auto firstLineHi = unpackhi_epi8(firstLine, zero);
572 2357770 : const auto secondLineLo = unpacklo_epi8(secondLine, zero);
573 2357770 : const auto secondLineHi = unpackhi_epi8(secondLine, zero);
574 :
575 : // Vertical addition
576 2357770 : const auto sumLo = add_epi16(firstLineLo, secondLineLo);
577 2357770 : const auto sumHi = add_epi16(firstLineHi, secondLineHi);
578 :
579 : // Horizontal addition of adjacent pairs, and recombine low and high
580 : // parts
581 2357770 : const auto sum = hadd_epi16(sumLo, sumHi);
582 :
583 : // average = (sum + 2) / 4
584 2357770 : average0 = srli_epi16(add_epi16(sum, two16), 2);
585 :
586 2357770 : pSrcScanlineShifted += 2 * DEST_ELTS;
587 : }
588 :
589 : decltype(setzero()) average1;
590 : {
591 : // Load 2 * DEST_ELTS bytes from each line
592 2357770 : const auto firstLine = loadu_int(pSrcScanlineShifted);
593 : const auto secondLine =
594 4715530 : loadu_int(pSrcScanlineShifted + nChunkXSize);
595 : // Extend those Bytes as UInt16s
596 2357770 : const auto firstLineLo = unpacklo_epi8(firstLine, zero);
597 2357770 : const auto firstLineHi = unpackhi_epi8(firstLine, zero);
598 2357770 : const auto secondLineLo = unpacklo_epi8(secondLine, zero);
599 2357770 : const auto secondLineHi = unpackhi_epi8(secondLine, zero);
600 :
601 : // Vertical addition
602 2357770 : const auto sumLo = add_epi16(firstLineLo, secondLineLo);
603 2357770 : const auto sumHi = add_epi16(firstLineHi, secondLineHi);
604 :
605 : // Horizontal addition of adjacent pairs, and recombine low and high
606 : // parts
607 2357770 : const auto sum = hadd_epi16(sumLo, sumHi);
608 :
609 : // average = (sum + 2) / 4
610 2357770 : average1 = srli_epi16(add_epi16(sum, two16), 2);
611 :
612 2357770 : pSrcScanlineShifted += 2 * DEST_ELTS;
613 : }
614 :
615 : // Pack each 16 bit average value to 8 bits
616 2357770 : const auto average = packus_epi16(average0, average1);
617 2357770 : storeu_int(&pDstScanline[iDstPixel], average);
618 : }
619 :
620 111734 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
621 111734 : return iDstPixel;
622 : }
623 :
624 : /************************************************************************/
625 : /* QuadraticMeanUInt16SSE2() */
626 : /************************************************************************/
627 :
628 : #ifdef __SSE3__
629 : #define sse2_hadd_pd _mm_hadd_pd
630 : #else
631 185 : inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
632 : {
633 : auto aLo_bLo =
634 740 : _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
635 : auto aHi_bHi =
636 740 : _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
637 185 : return _mm_add_pd(aLo_bLo, aHi_bHi); // (aLo + aHi, bLo + bHi)
638 : }
639 : #endif
640 :
641 120 : inline __m128d SQUARE_PD(__m128d x)
642 : {
643 120 : return _mm_mul_pd(x, x);
644 : }
645 :
646 : #ifdef __AVX2__
647 :
648 : inline __m256d SQUARE_PD(__m256d x)
649 : {
650 : return _mm256_mul_pd(x, x);
651 : }
652 :
653 : inline __m256d FIXUP_LANES(__m256d x)
654 : {
655 : return _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
656 : }
657 :
658 : inline __m256 FIXUP_LANES(__m256 x)
659 : {
660 : return _mm256_castpd_ps(FIXUP_LANES(_mm256_castps_pd(x)));
661 : }
662 :
663 : #endif
664 :
665 : static int
666 14 : QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
667 : const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
668 : uint16_t *CPL_RESTRICT pDstScanline)
669 : {
670 : // Optimized implementation for RMS on UInt16 by
671 : // processing by group of 4 output pixels.
672 14 : const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
673 :
674 14 : int iDstPixel = 0;
675 14 : const auto zero = _mm_setzero_si128();
676 :
677 : #ifdef __AVX2__
678 : const auto zeroDot25 = _mm256_set1_pd(0.25);
679 : const auto zeroDot5 = _mm256_set1_pd(0.5);
680 :
681 : // The first four 0's could be anything, as we only take the bottom
682 : // 128 bits.
683 : const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
684 : #else
685 14 : const auto zeroDot25 = _mm_set1_pd(0.25);
686 14 : const auto zeroDot5 = _mm_set1_pd(0.5);
687 : #endif
688 :
689 14 : constexpr int DEST_ELTS =
690 : static_cast<int>(sizeof(zero) / sizeof(uint16_t)) / 2;
691 52 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
692 : {
693 : // Load 8 UInt16 from each line
694 38 : const auto firstLine = _mm_loadu_si128(
695 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
696 : const auto secondLine =
697 38 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
698 38 : pSrcScanlineShifted + nChunkXSize));
699 :
700 : // Detect if all of the source values fit in 14 bits.
701 : // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
702 : // and we can do a much faster implementation.
703 : const auto maskTmp =
704 76 : _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
705 : #if defined(__i386__) || defined(_M_IX86)
706 : uint64_t nMaskFitsIn14Bits = 0;
707 : _mm_storel_epi64(
708 : reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
709 : _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
710 : #else
711 38 : const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
712 : _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
713 : #endif
714 38 : if (nMaskFitsIn14Bits == 0)
715 : {
716 : // Multiplication of 16 bit values and horizontal
717 : // addition of 32 bit results
718 : const auto firstLineHSumSquare =
719 26 : _mm_madd_epi16(firstLine, firstLine);
720 : const auto secondLineHSumSquare =
721 26 : _mm_madd_epi16(secondLine, secondLine);
722 : // Vertical addition
723 : const auto sumSquares =
724 26 : _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
725 : // In theory we should take sqrt(sumSquares * 0.25f)
726 : // but given the rounding we do, this is equivalent to
727 : // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
728 : // sumSquares <= 4 * 16383^2
729 26 : const auto one32 = _mm_set1_epi32(1);
730 : const auto sumSquaresPlusOneDiv4 =
731 52 : _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
732 : // Take square root and truncate/floor to int32
733 78 : auto rms = _mm_cvttps_epi32(
734 : _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
735 :
736 : // Round to upper value if it minimizes the
737 : // error |rms^2 - sumSquares/4|
738 : // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
739 : // rms += 1;
740 : // which is equivalent to:
741 : // if( rms * rms + rms < (sumSquares+1) / 4 )
742 : // rms += 1;
743 : auto mask =
744 78 : _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
745 : _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
746 26 : rms = _mm_sub_epi32(rms, mask);
747 : // Pack each 32 bit RMS value to 16 bits
748 26 : rms = _mm_packs_epi32(rms, rms /* could be anything */);
749 : _mm_storel_epi64(
750 26 : reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
751 26 : pSrcScanlineShifted += 2 * DEST_ELTS;
752 26 : continue;
753 : }
754 :
755 : // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
756 : // to 32 bit would result in 4 multiplications instead of 8, but
757 : // mullo/mulhi have a worse throughput than mul_pd.
758 :
759 : // Extend those UInt16s as UInt32s
760 12 : const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
761 12 : const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
762 12 : const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
763 12 : const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
764 :
765 : #ifdef __AVX2__
766 : // Multiplication of 32 bit values previously converted to 64 bit double
767 : const auto firstLineLoDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineLo));
768 : const auto firstLineHiDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineHi));
769 : const auto secondLineLoDbl =
770 : SQUARE_PD(_mm256_cvtepi32_pd(secondLineLo));
771 : const auto secondLineHiDbl =
772 : SQUARE_PD(_mm256_cvtepi32_pd(secondLineHi));
773 :
774 : // Vertical addition of squares
775 : const auto sumSquaresLo =
776 : _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
777 : const auto sumSquaresHi =
778 : _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
779 :
780 : // Horizontal addition of squares
781 : const auto sumSquares =
782 : FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
783 :
784 : const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
785 :
786 : // Take square root and truncate/floor to int32
787 : auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
788 : const auto rmsDouble = _mm256_cvtepi32_pd(rms);
789 : const auto right = _mm256_sub_pd(
790 : sumDivWeight, _mm256_add_pd(SQUARE_PD(rmsDouble), rmsDouble));
791 :
792 : auto mask =
793 : _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
794 : // Extract 32-bit from each of the 4 64-bit masks
795 : // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
796 : // _MM_SHUFFLE(2,0,2,0)));
797 : mask = _mm256_permutevar8x32_ps(mask, permutation);
798 : const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
799 :
800 : // Apply the correction
801 : rms = _mm_sub_epi32(rms, maskI);
802 :
803 : // Pack each 32 bit RMS value to 16 bits
804 : rms = _mm_packus_epi32(rms, rms /* could be anything */);
805 : #else
806 : // Multiplication of 32 bit values previously converted to 64 bit double
807 12 : const auto firstLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineLo));
808 : const auto firstLineLoHi =
809 24 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
810 12 : const auto firstLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineHi));
811 : const auto firstLineHiHi =
812 24 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
813 :
814 12 : const auto secondLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineLo));
815 : const auto secondLineLoHi =
816 24 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
817 12 : const auto secondLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineHi));
818 : const auto secondLineHiHi =
819 24 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
820 :
821 : // Vertical addition of squares
822 12 : const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
823 12 : const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
824 12 : const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
825 12 : const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
826 :
827 : // Horizontal addition of squares
828 12 : const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
829 12 : const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
830 :
831 12 : const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
832 12 : const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
833 : // Take square root and truncate/floor to int32
834 24 : const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
835 24 : const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
836 :
837 : // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
838 : // if( 0.5 < sumDivWeight - (rms * rms + rms) )
839 : // rms += 1;
840 12 : const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
841 12 : const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
842 24 : const auto rightLo = _mm_sub_pd(
843 : sumDivWeightLo, _mm_add_pd(SQUARE_PD(rmsLoDouble), rmsLoDouble));
844 36 : const auto rightHi = _mm_sub_pd(
845 : sumDivWeightHi, _mm_add_pd(SQUARE_PD(rmsHiDouble), rmsHiDouble));
846 :
847 24 : const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
848 12 : const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
849 : // The value of the mask will be -1 when the correction needs to be
850 : // applied
851 24 : const auto mask = _mm_castps_si128(_mm_shuffle_ps(
852 : maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
853 :
854 48 : auto rms = _mm_castps_si128(
855 : _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
856 : // Apply the correction
857 12 : rms = _mm_sub_epi32(rms, mask);
858 :
859 : // Pack each 32 bit RMS value to 16 bits
860 12 : rms = sse2_packus_epi32(rms, rms /* could be anything */);
861 : #endif
862 :
863 12 : _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
864 : rms);
865 12 : pSrcScanlineShifted += 2 * DEST_ELTS;
866 : }
867 :
868 14 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
869 14 : return iDstPixel;
870 : }
871 :
872 : /************************************************************************/
873 : /* AverageUInt16SSE2() */
874 : /************************************************************************/
875 :
876 : static int
877 13 : AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
878 : const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
879 : uint16_t *CPL_RESTRICT pDstScanline)
880 : {
881 : // Optimized implementation for average on UInt16 by
882 : // processing by group of 8 output pixels.
883 :
884 13 : const auto mask = _mm_set1_epi32(0xFFFF);
885 13 : const auto two = _mm_set1_epi32(2);
886 13 : const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
887 :
888 13 : int iDstPixel = 0;
889 13 : constexpr int DEST_ELTS = static_cast<int>(sizeof(mask) / sizeof(uint16_t));
890 25 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
891 : {
892 : __m128i averageLow;
893 : // Load 8 UInt16 from each line
894 : {
895 12 : const auto firstLine = _mm_loadu_si128(
896 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
897 : const auto secondLine =
898 12 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
899 12 : pSrcScanlineShifted + nChunkXSize));
900 :
901 : // Horizontal addition and extension to 32 bit
902 36 : const auto horizAddFirstLine = _mm_add_epi32(
903 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
904 : const auto horizAddSecondLine =
905 36 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
906 : _mm_srli_epi32(secondLine, 16));
907 :
908 : // Vertical addition and average computation
909 : // average = (sum + 2) >> 2
910 24 : const auto sum = _mm_add_epi32(
911 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
912 12 : averageLow = _mm_srli_epi32(sum, 2);
913 : }
914 : // Load 8 UInt16 from each line
915 : __m128i averageHigh;
916 : {
917 : const auto firstLine =
918 12 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
919 12 : pSrcScanlineShifted + DEST_ELTS));
920 : const auto secondLine =
921 12 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
922 12 : pSrcScanlineShifted + DEST_ELTS + nChunkXSize));
923 :
924 : // Horizontal addition and extension to 32 bit
925 36 : const auto horizAddFirstLine = _mm_add_epi32(
926 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
927 : const auto horizAddSecondLine =
928 36 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
929 : _mm_srli_epi32(secondLine, 16));
930 :
931 : // Vertical addition and average computation
932 : // average = (sum + 2) >> 2
933 24 : const auto sum = _mm_add_epi32(
934 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
935 12 : averageHigh = _mm_srli_epi32(sum, 2);
936 : }
937 :
938 : // Pack each 32 bit average value to 16 bits
939 12 : auto average = sse2_packus_epi32(averageLow, averageHigh);
940 12 : _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
941 : average);
942 12 : pSrcScanlineShifted += 2 * DEST_ELTS;
943 : }
944 :
945 13 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
946 13 : return iDstPixel;
947 : }
948 :
949 : /************************************************************************/
950 : /* QuadraticMeanFloatSSE2() */
951 : /************************************************************************/
952 :
953 : #if !defined(ARM_V7)
954 :
955 : #ifdef __SSE3__
956 : #define sse2_hadd_ps _mm_hadd_ps
957 : #else
/** SSE2 replacement for _mm_hadd_ps(): returns
 * (a0 + a1, a2 + a3, b0 + b1, b2 + b3).
 */
inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
{
    // (a0, a2, b0, b2)
    const __m128 evenLanes = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
    // (a1, a3, b1, b3)
    const __m128 oddLanes = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
    return _mm_add_ps(evenLanes, oddLanes);
}
964 : #endif
965 :
966 : #ifdef __AVX2__
967 : #define set1_ps _mm256_set1_ps
968 : #define loadu_ps _mm256_loadu_ps
969 : #define andnot_ps _mm256_andnot_ps
970 : #define and_ps _mm256_and_ps
971 : #define max_ps _mm256_max_ps
972 : #define shuffle_ps _mm256_shuffle_ps
973 : #define div_ps _mm256_div_ps
974 : #define cmpeq_ps(x, y) _mm256_cmp_ps(x, y, _CMP_EQ_OQ)
975 : #define mul_ps _mm256_mul_ps
976 : #define add_ps _mm256_add_ps
977 : #define hadd_ps _mm256_hadd_ps
978 : #define sqrt_ps _mm256_sqrt_ps
979 : #define or_ps _mm256_or_ps
980 : #define unpacklo_ps _mm256_unpacklo_ps
981 : #define unpackhi_ps _mm256_unpackhi_ps
982 : #define storeu_ps _mm256_storeu_ps
983 : #define blendv_ps _mm256_blendv_ps
984 :
/** Returns the lane-wise square of the 8 packed floats of x. */
inline __m256 SQUARE_PS(__m256 x)
{
    const __m256 squared = _mm256_mul_ps(x, x);
    return squared;
}
989 :
990 : #else
991 :
992 : #define set1_ps _mm_set1_ps
993 : #define loadu_ps _mm_loadu_ps
994 : #define andnot_ps _mm_andnot_ps
995 : #define and_ps _mm_and_ps
996 : #define max_ps _mm_max_ps
997 : #define shuffle_ps _mm_shuffle_ps
998 : #define div_ps _mm_div_ps
999 : #define cmpeq_ps _mm_cmpeq_ps
1000 : #define mul_ps _mm_mul_ps
1001 : #define add_ps _mm_add_ps
1002 : #define hadd_ps sse2_hadd_ps
1003 : #define sqrt_ps _mm_sqrt_ps
1004 : #define or_ps _mm_or_ps
1005 : #define unpacklo_ps _mm_unpacklo_ps
1006 : #define unpackhi_ps _mm_unpackhi_ps
1007 : #define storeu_ps _mm_storeu_ps
1008 :
/** Per-lane selection between a and b driven by mask: lanes whose mask is
 * all ones take the value from b, lanes whose mask is all zeros keep the
 * value from a.
 */
inline __m128 blendv_ps(__m128 a, __m128 b, __m128 mask)
{
#if !(defined(__SSE4_1__) || defined(__AVX__) ||                               \
      defined(USE_NEON_OPTIMIZATIONS))
    // Plain SSE2: (~mask & a) | (mask & b)
    const __m128 keptFromA = _mm_andnot_ps(mask, a);
    const __m128 takenFromB = _mm_and_ps(mask, b);
    return _mm_or_ps(keptFromA, takenFromB);
#else
    // Native blend instruction available
    return _mm_blendv_ps(a, b, mask);
#endif
}
1017 :
/** Returns the lane-wise square of the 4 packed floats of x. */
inline __m128 SQUARE_PS(__m128 x)
{
    const __m128 squared = _mm_mul_ps(x, x);
    return squared;
}
1022 :
/** 128-bit flavour of FIXUP_LANES(): returns its argument unchanged.
 *
 * NOTE(review): presumably the 256-bit flavour (not visible in this part of
 * the file) re-orders elements after per-128-bit-lane shuffles, which has no
 * equivalent for a single 128-bit register -- confirm.
 */
inline __m128 FIXUP_LANES(__m128 x)
{
    return x;
}
1027 :
1028 : #endif
1029 :
1030 : static int
1031 : #if defined(__GNUC__)
1032 : __attribute__((noinline))
1033 : #endif
1034 66 : QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
1035 : const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1036 : float *CPL_RESTRICT pDstScanline)
1037 : {
1038 : // Optimized implementation for RMS on Float32 by
1039 : // processing by group of output pixels.
1040 66 : const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1041 :
1042 66 : int iDstPixel = 0;
1043 66 : const auto minus_zero = set1_ps(-0.0f);
1044 66 : const auto zeroDot25 = set1_ps(0.25f);
1045 66 : const auto one = set1_ps(1.0f);
1046 66 : const auto infv = set1_ps(std::numeric_limits<float>::infinity());
1047 66 : constexpr int DEST_ELTS = static_cast<int>(sizeof(one) / sizeof(float));
1048 :
1049 198 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1050 : {
1051 : // Load 2*DEST_ELTS Float32 from each line
1052 132 : auto firstLineLo = loadu_ps(pSrcScanlineShifted);
1053 132 : auto firstLineHi = loadu_ps(pSrcScanlineShifted + DEST_ELTS);
1054 132 : auto secondLineLo = loadu_ps(pSrcScanlineShifted + nChunkXSize);
1055 : auto secondLineHi =
1056 264 : loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize);
1057 :
1058 : // Take the absolute value
1059 132 : firstLineLo = andnot_ps(minus_zero, firstLineLo);
1060 132 : firstLineHi = andnot_ps(minus_zero, firstLineHi);
1061 132 : secondLineLo = andnot_ps(minus_zero, secondLineLo);
1062 132 : secondLineHi = andnot_ps(minus_zero, secondLineHi);
1063 :
1064 : auto firstLineEven =
1065 132 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1066 : auto firstLineOdd =
1067 132 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1068 : auto secondLineEven =
1069 132 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1070 : auto secondLineOdd =
1071 132 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1072 :
1073 : // Compute the maximum of each DEST_ELTS value to RMS-average
1074 396 : const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
1075 : max_ps(secondLineEven, secondLineEven));
1076 :
1077 : // Normalize each value by the maximum of the DEST_ELTS ones.
1078 : // This step is important to avoid that the square evaluates to infinity
1079 : // for sufficiently big input.
1080 132 : auto invMax = div_ps(one, maxV);
1081 : // Deal with 0 being the maximum to correct division by zero
1082 : // note: comparing to -0 leads to identical results as to comparing with
1083 : // 0
1084 264 : invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
1085 :
1086 132 : firstLineEven = mul_ps(firstLineEven, invMax);
1087 132 : firstLineOdd = mul_ps(firstLineOdd, invMax);
1088 132 : secondLineEven = mul_ps(secondLineEven, invMax);
1089 132 : secondLineOdd = mul_ps(secondLineOdd, invMax);
1090 :
1091 : // Compute squares
1092 132 : firstLineEven = SQUARE_PS(firstLineEven);
1093 132 : firstLineOdd = SQUARE_PS(firstLineOdd);
1094 132 : secondLineEven = SQUARE_PS(secondLineEven);
1095 132 : secondLineOdd = SQUARE_PS(secondLineOdd);
1096 :
1097 396 : const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
1098 : add_ps(secondLineEven, secondLineOdd));
1099 :
1100 396 : auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
1101 :
1102 : // Deal with infinity being the maximum
1103 132 : const auto maskIsInf = cmpeq_ps(maxV, infv);
1104 132 : rms = blendv_ps(rms, infv, maskIsInf);
1105 :
1106 132 : rms = FIXUP_LANES(rms);
1107 :
1108 132 : storeu_ps(&pDstScanline[iDstPixel], rms);
1109 132 : pSrcScanlineShifted += DEST_ELTS * 2;
1110 : }
1111 :
1112 66 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1113 66 : return iDstPixel;
1114 : }
1115 :
1116 : /************************************************************************/
1117 : /* AverageFloatSSE2() */
1118 : /************************************************************************/
1119 :
1120 46 : static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
1121 : const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1122 : float *CPL_RESTRICT pDstScanline)
1123 : {
1124 : // Optimized implementation for average on Float32 by
1125 : // processing by group of output pixels.
1126 46 : const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1127 :
1128 46 : int iDstPixel = 0;
1129 46 : const auto zeroDot25 = _mm_set1_ps(0.25f);
1130 46 : constexpr int DEST_ELTS =
1131 : static_cast<int>(sizeof(zeroDot25) / sizeof(float));
1132 :
1133 128 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1134 : {
1135 : // Load 2 * DEST_ELTS Float32 from each line
1136 : const auto firstLineLo =
1137 82 : _mm_mul_ps(_mm_loadu_ps(pSrcScanlineShifted), zeroDot25);
1138 164 : const auto firstLineHi = _mm_mul_ps(
1139 : _mm_loadu_ps(pSrcScanlineShifted + DEST_ELTS), zeroDot25);
1140 82 : const auto secondLineLo = _mm_mul_ps(
1141 82 : _mm_loadu_ps(pSrcScanlineShifted + nChunkXSize), zeroDot25);
1142 164 : const auto secondLineHi = _mm_mul_ps(
1143 82 : _mm_loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize),
1144 : zeroDot25);
1145 :
1146 : // Vertical addition
1147 82 : const auto tmpLo = _mm_add_ps(firstLineLo, secondLineLo);
1148 82 : const auto tmpHi = _mm_add_ps(firstLineHi, secondLineHi);
1149 :
1150 : // Horizontal addition
1151 82 : const auto average = sse2_hadd_ps(tmpLo, tmpHi);
1152 :
1153 82 : _mm_storeu_ps(&pDstScanline[iDstPixel], average);
1154 82 : pSrcScanlineShifted += DEST_ELTS * 2;
1155 : }
1156 :
1157 46 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1158 46 : return iDstPixel;
1159 : }
1160 :
1161 : /************************************************************************/
1162 : /* AverageDoubleSSE2() */
1163 : /************************************************************************/
1164 :
1165 : static int
1166 50 : AverageDoubleSSE2(int nDstXWidth, int nChunkXSize,
1167 : const double *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1168 : double *CPL_RESTRICT pDstScanline)
1169 : {
1170 : // Optimized implementation for average on Float64 by
1171 : // processing by group of output pixels.
1172 50 : const double *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1173 :
1174 50 : int iDstPixel = 0;
1175 50 : const auto zeroDot25 = _mm_set1_pd(0.25);
1176 50 : constexpr int DEST_ELTS =
1177 : static_cast<int>(sizeof(zeroDot25) / sizeof(double));
1178 :
1179 211 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1180 : {
1181 : // Load 4 * DEST_ELTS Float64 from each line
1182 161 : const auto firstLine0 = _mm_mul_pd(
1183 : _mm_loadu_pd(pSrcScanlineShifted + 0 * DEST_ELTS), zeroDot25);
1184 322 : const auto firstLine1 = _mm_mul_pd(
1185 : _mm_loadu_pd(pSrcScanlineShifted + 1 * DEST_ELTS), zeroDot25);
1186 161 : const auto secondLine0 = _mm_mul_pd(
1187 161 : _mm_loadu_pd(pSrcScanlineShifted + 0 * DEST_ELTS + nChunkXSize),
1188 : zeroDot25);
1189 322 : const auto secondLine1 = _mm_mul_pd(
1190 161 : _mm_loadu_pd(pSrcScanlineShifted + 1 * DEST_ELTS + nChunkXSize),
1191 : zeroDot25);
1192 :
1193 : // Vertical addition
1194 161 : const auto tmp0 = _mm_add_pd(firstLine0, secondLine0);
1195 161 : const auto tmp1 = _mm_add_pd(firstLine1, secondLine1);
1196 :
1197 : // Horizontal addition
1198 161 : const auto average0 = sse2_hadd_pd(tmp0, tmp1);
1199 :
1200 161 : _mm_storeu_pd(&pDstScanline[iDstPixel + 0], average0);
1201 161 : pSrcScanlineShifted += DEST_ELTS * 2;
1202 : }
1203 :
1204 50 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1205 50 : return iDstPixel;
1206 : }
1207 :
1208 : #endif
1209 :
1210 : #endif
1211 :
1212 : /************************************************************************/
1213 : /* GDALResampleChunk_AverageOrRMS() */
1214 : /************************************************************************/
1215 :
1216 : template <class T, class Tsum, GDALDataType eWrkDataType, bool bQuadraticMean>
1217 : static CPLErr
1218 2388 : GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
1219 : const T *pChunk, void **ppDstBuffer)
1220 : {
1221 2388 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1222 2388 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1223 2388 : const double dfSrcXDelta = args.dfSrcXDelta;
1224 2388 : const double dfSrcYDelta = args.dfSrcYDelta;
1225 2388 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1226 2388 : const int nChunkXOff = args.nChunkXOff;
1227 2388 : const int nChunkYOff = args.nChunkYOff;
1228 2388 : const int nChunkXSize = args.nChunkXSize;
1229 2388 : const int nChunkYSize = args.nChunkYSize;
1230 2388 : const int nDstXOff = args.nDstXOff;
1231 2388 : const int nDstXOff2 = args.nDstXOff2;
1232 2388 : const int nDstYOff = args.nDstYOff;
1233 2388 : const int nDstYOff2 = args.nDstYOff2;
1234 2388 : const char *pszResampling = args.pszResampling;
1235 2388 : bool bHasNoData = args.bHasNoData;
1236 2388 : const double dfNoDataValue = args.dfNoDataValue;
1237 2388 : const GDALColorTable *const poColorTable =
1238 : !bQuadraticMean &&
1239 : // AVERAGE_BIT2GRAYSCALE
1240 2311 : CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"))
1241 : ? nullptr
1242 : : args.poColorTable;
1243 2388 : const bool bPropagateNoData = args.bPropagateNoData;
1244 :
1245 2388 : T tNoDataValue = (!bHasNoData) ? 0 : static_cast<T>(dfNoDataValue);
1246 2388 : const T tReplacementVal =
1247 174 : bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
1248 56 : args.eOvrDataType, dfNoDataValue))
1249 : : 0;
1250 :
1251 2388 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
1252 2388 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1253 2388 : const int nDstXWidth = nDstXOff2 - nDstXOff;
1254 :
1255 : /* -------------------------------------------------------------------- */
1256 : /* Allocate buffers. */
1257 : /* -------------------------------------------------------------------- */
1258 2388 : *ppDstBuffer = static_cast<T *>(
1259 2388 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
1260 : GDALGetDataTypeSizeBytes(eWrkDataType)));
1261 2388 : if (*ppDstBuffer == nullptr)
1262 : {
1263 0 : return CE_Failure;
1264 : }
1265 2388 : T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
1266 :
1267 : struct PrecomputedXValue
1268 : {
1269 : int nLeftXOffShifted;
1270 : int nRightXOffShifted;
1271 : double dfLeftWeight;
1272 : double dfRightWeight;
1273 : double dfTotalWeightFullLine;
1274 : };
1275 :
1276 : PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
1277 2388 : VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(PrecomputedXValue)));
1278 :
1279 2388 : if (pasSrcX == nullptr)
1280 : {
1281 0 : return CE_Failure;
1282 : }
1283 :
1284 2388 : std::vector<GDALColorEntry> colorEntries;
1285 :
1286 2388 : if (poColorTable)
1287 : {
1288 5 : int nTransparentIdx = -1;
1289 5 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1290 :
1291 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1292 : // it as nodata value
1293 6 : if (bHasNoData && dfNoDataValue >= 0.0 &&
1294 1 : tNoDataValue < colorEntries.size())
1295 1 : colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1296 :
1297 : // Or if we have no explicit nodata, but a color table entry that is
1298 : // transparent, consider it as the nodata value
1299 4 : else if (!bHasNoData && nTransparentIdx >= 0)
1300 : {
1301 0 : bHasNoData = true;
1302 0 : tNoDataValue = static_cast<T>(nTransparentIdx);
1303 : }
1304 : }
1305 :
1306 : /* ==================================================================== */
1307 : /* Precompute inner loop constants. */
1308 : /* ==================================================================== */
1309 2388 : bool bSrcXSpacingIsTwo = true;
1310 2388 : int nLastSrcXOff2 = -1;
1311 856888 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1312 : {
1313 854500 : const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
1314 : // Apply some epsilon to avoid numerical precision issues
1315 854500 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
1316 854500 : const double dfSrcXOff2 =
1317 854500 : dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
1318 854500 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
1319 :
1320 854500 : if (nSrcXOff < nChunkXOff)
1321 0 : nSrcXOff = nChunkXOff;
1322 854500 : if (nSrcXOff2 == nSrcXOff)
1323 0 : nSrcXOff2++;
1324 854500 : if (nSrcXOff2 > nChunkRightXOff)
1325 1 : nSrcXOff2 = nChunkRightXOff;
1326 :
1327 854500 : pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
1328 854500 : pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
1329 854500 : nSrcXOff2 - nChunkXOff;
1330 21 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
1331 854500 : (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
1332 854500 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
1333 854500 : 1 - (nSrcXOff2 - dfSrcXOff2);
1334 854500 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
1335 854500 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
1336 854500 : if (nSrcXOff + 1 < nSrcXOff2)
1337 : {
1338 854479 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1339 854479 : nSrcXOff2 - nSrcXOff - 2;
1340 854479 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1341 854479 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
1342 : }
1343 :
1344 854500 : if (nSrcXOff2 - nSrcXOff != 2 ||
1345 733021 : (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
1346 : {
1347 120627 : bSrcXSpacingIsTwo = false;
1348 : }
1349 854500 : nLastSrcXOff2 = nSrcXOff2;
1350 : }
1351 :
1352 : /* ==================================================================== */
1353 : /* Loop over destination scanlines. */
1354 : /* ==================================================================== */
1355 722538 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1356 : {
1357 720150 : const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
1358 720150 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
1359 720150 : if (nSrcYOff < nChunkYOff)
1360 0 : nSrcYOff = nChunkYOff;
1361 :
1362 720150 : const double dfSrcYOff2 =
1363 720150 : dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
1364 720150 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
1365 720150 : if (nSrcYOff2 == nSrcYOff)
1366 0 : ++nSrcYOff2;
1367 720150 : if (nSrcYOff2 > nChunkBottomYOff)
1368 3 : nSrcYOff2 = nChunkBottomYOff;
1369 :
1370 720150 : T *const pDstScanline =
1371 720150 : pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
1372 :
1373 : /* --------------------------------------------------------------------
1374 : */
1375 : /* Loop over destination pixels */
1376 : /* --------------------------------------------------------------------
1377 : */
1378 720150 : if (poColorTable == nullptr)
1379 : {
1380 720035 : if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
1381 : pabyChunkNodataMask == nullptr)
1382 : {
1383 : if constexpr (eWrkDataType == GDT_Byte ||
1384 : eWrkDataType == GDT_UInt16)
1385 : {
1386 : // Optimized case : no nodata, overview by a factor of 2 and
1387 : // regular x and y src spacing.
1388 117150 : const T *pSrcScanlineShifted =
1389 117150 : pChunk + pasSrcX[0].nLeftXOffShifted +
1390 117150 : static_cast<size_t>(nSrcYOff - nChunkYOff) *
1391 117150 : nChunkXSize;
1392 117150 : int iDstPixel = 0;
1393 : #ifdef USE_SSE2
1394 : if constexpr (eWrkDataType == GDT_Byte)
1395 : {
1396 : if constexpr (bQuadraticMean)
1397 : {
1398 5389 : iDstPixel = QuadraticMeanByteSSE2OrAVX2(
1399 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1400 : pDstScanline);
1401 : }
1402 : else
1403 : {
1404 111734 : iDstPixel = AverageByteSSE2OrAVX2(
1405 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1406 : pDstScanline);
1407 : }
1408 : }
1409 : else
1410 : {
1411 : static_assert(eWrkDataType == GDT_UInt16);
1412 : if constexpr (bQuadraticMean)
1413 : {
1414 14 : iDstPixel = QuadraticMeanUInt16SSE2(
1415 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1416 : pDstScanline);
1417 : }
1418 : else
1419 : {
1420 13 : iDstPixel = AverageUInt16SSE2(
1421 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1422 : pDstScanline);
1423 : }
1424 : }
1425 : #endif
1426 291609 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1427 : {
1428 174459 : Tsum nTotal = 0;
1429 : T nVal;
1430 : if constexpr (bQuadraticMean)
1431 52 : nTotal =
1432 52 : SQUARE<Tsum>(pSrcScanlineShifted[0]) +
1433 52 : SQUARE<Tsum>(pSrcScanlineShifted[1]) +
1434 52 : SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
1435 52 : SQUARE<Tsum>(
1436 52 : pSrcScanlineShifted[1 + nChunkXSize]);
1437 : else
1438 174407 : nTotal = pSrcScanlineShifted[0] +
1439 174407 : pSrcScanlineShifted[1] +
1440 174407 : pSrcScanlineShifted[nChunkXSize] +
1441 174407 : pSrcScanlineShifted[1 + nChunkXSize];
1442 :
1443 174459 : constexpr int nTotalWeight = 4;
1444 : if constexpr (bQuadraticMean)
1445 52 : nVal = ComputeIntegerRMS_4values<T>(nTotal);
1446 : else
1447 174407 : nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
1448 : nTotalWeight);
1449 :
1450 : // No need to compare nVal against tNoDataValue as we
1451 : // are in a case where pabyChunkNodataMask == nullptr
1452 : // implies the absence of nodata value.
1453 174459 : pDstScanline[iDstPixel] = nVal;
1454 174459 : pSrcScanlineShifted += 2;
1455 : }
1456 : }
1457 : else
1458 : {
1459 : static_assert(eWrkDataType == GDT_Float32 ||
1460 : eWrkDataType == GDT_Float64);
1461 198 : const T *pSrcScanlineShifted =
1462 198 : pChunk + pasSrcX[0].nLeftXOffShifted +
1463 198 : static_cast<size_t>(nSrcYOff - nChunkYOff) *
1464 198 : nChunkXSize;
1465 198 : int iDstPixel = 0;
1466 : #if defined(USE_SSE2) && !defined(ARM_V7)
1467 : if constexpr (eWrkDataType == GDT_Float32)
1468 : {
1469 : static_assert(std::is_same_v<T, float>);
1470 : if constexpr (bQuadraticMean)
1471 : {
1472 66 : iDstPixel = QuadraticMeanFloatSSE2(
1473 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1474 : pDstScanline);
1475 : }
1476 : else
1477 : {
1478 46 : iDstPixel = AverageFloatSSE2(
1479 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1480 : pDstScanline);
1481 : }
1482 : }
1483 : else
1484 : {
1485 : if constexpr (!bQuadraticMean)
1486 : {
1487 50 : iDstPixel = AverageDoubleSSE2(
1488 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1489 : pDstScanline);
1490 : }
1491 : }
1492 : #endif
1493 :
1494 714 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1495 : {
1496 : T nVal;
1497 :
1498 : if constexpr (bQuadraticMean)
1499 : {
1500 : // Avoid issues with large values by renormalizing
1501 96 : const auto max = std::max(
1502 420 : {std::fabs(pSrcScanlineShifted[0]),
1503 420 : std::fabs(pSrcScanlineShifted[1]),
1504 420 : std::fabs(pSrcScanlineShifted[nChunkXSize]),
1505 420 : std::fabs(
1506 420 : pSrcScanlineShifted[1 + nChunkXSize])});
1507 420 : if (max == 0)
1508 : {
1509 8 : nVal = 0;
1510 : }
1511 412 : else if (std::isinf(max))
1512 : {
1513 : // If there is at least one infinity value,
1514 : // then just summing, and taking the abs
1515 : // value will give the expected result:
1516 : // * +inf if all values are +inf
1517 : // * +inf if all values are -inf
1518 : // * NaN otherwise
1519 82 : nVal = std::fabs(
1520 82 : pSrcScanlineShifted[0] +
1521 82 : pSrcScanlineShifted[1] +
1522 82 : pSrcScanlineShifted[nChunkXSize] +
1523 82 : pSrcScanlineShifted[1 + nChunkXSize]);
1524 : }
1525 : else
1526 : {
1527 330 : const auto inv_max = static_cast<T>(1.0) / max;
1528 330 : nVal =
1529 : max *
1530 330 : std::sqrt(
1531 : static_cast<T>(0.25) *
1532 330 : (SQUARE(pSrcScanlineShifted[0] *
1533 330 : inv_max) +
1534 330 : SQUARE(pSrcScanlineShifted[1] *
1535 330 : inv_max) +
1536 330 : SQUARE(
1537 330 : pSrcScanlineShifted[nChunkXSize] *
1538 330 : inv_max) +
1539 330 : SQUARE(
1540 330 : pSrcScanlineShifted[1 +
1541 : nChunkXSize] *
1542 : inv_max)));
1543 : }
1544 : }
1545 : else
1546 : {
1547 96 : constexpr auto weight = static_cast<T>(0.25);
1548 : // Multiply each value by weight to avoid
1549 : // potential overflow
1550 96 : nVal =
1551 96 : (weight * pSrcScanlineShifted[0] +
1552 96 : weight * pSrcScanlineShifted[1] +
1553 96 : weight * pSrcScanlineShifted[nChunkXSize] +
1554 96 : weight * pSrcScanlineShifted[1 + nChunkXSize]);
1555 : }
1556 :
1557 : // No need to compare nVal against tNoDataValue as we
1558 : // are in a case where pabyChunkNodataMask == nullptr
1559 : // implies the absence of nodata value.
1560 516 : pDstScanline[iDstPixel] = nVal;
1561 516 : pSrcScanlineShifted += 2;
1562 : }
1563 117348 : }
1564 : }
1565 : else
1566 : {
1567 18 : const double dfBottomWeight =
1568 602687 : (nSrcYOff + 1 == nSrcYOff2) ? 1.0
1569 602669 : : 1.0 - (dfSrcYOff - nSrcYOff);
1570 602687 : const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
1571 602687 : nSrcYOff -= nChunkYOff;
1572 602687 : nSrcYOff2 -= nChunkYOff;
1573 :
1574 602687 : double dfTotalWeightFullColumn = dfBottomWeight;
1575 602687 : if (nSrcYOff + 1 < nSrcYOff2)
1576 : {
1577 602669 : dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
1578 602669 : dfTotalWeightFullColumn += dfTopWeight;
1579 : }
1580 :
1581 18752173 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1582 : {
1583 18149533 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1584 18149533 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1585 :
1586 18149533 : double dfTotal = 0;
1587 18149533 : double dfTotalWeight = 0;
1588 18149533 : [[maybe_unused]] double dfMulFactor = 1.0;
1589 18149533 : [[maybe_unused]] double dfInvMulFactor = 1.0;
1590 18149533 : constexpr bool bUseMulFactor =
1591 : (eWrkDataType == GDT_Float32 ||
1592 : eWrkDataType == GDT_Float64);
1593 18149533 : if (pabyChunkNodataMask == nullptr)
1594 : {
1595 : if constexpr (bUseMulFactor)
1596 : {
1597 : if constexpr (bQuadraticMean)
1598 : {
1599 80 : T mulFactor = 0;
1600 80 : auto pChunkShifted =
1601 80 : pChunk +
1602 80 : static_cast<size_t>(nSrcYOff) * nChunkXSize;
1603 :
1604 240 : for (int iY = nSrcYOff; iY < nSrcYOff2;
1605 160 : ++iY, pChunkShifted += nChunkXSize)
1606 : {
1607 480 : for (int iX = nSrcXOff; iX < nSrcXOff2;
1608 : ++iX)
1609 640 : mulFactor = std::max(
1610 : mulFactor,
1611 320 : std::fabs(pChunkShifted[iX]));
1612 : }
1613 80 : dfMulFactor = double(mulFactor);
1614 142 : dfInvMulFactor =
1615 62 : dfMulFactor > 0 &&
1616 62 : std::isfinite(dfMulFactor)
1617 : ? 1.0 / dfMulFactor
1618 : : 1.0;
1619 : }
1620 : else
1621 : {
1622 139 : dfMulFactor = (nSrcYOff2 - nSrcYOff) *
1623 139 : (nSrcXOff2 - nSrcXOff);
1624 139 : dfInvMulFactor = 1.0 / dfMulFactor;
1625 : }
1626 : }
1627 :
1628 1746545 : auto pChunkShifted =
1629 227 : pChunk +
1630 1746545 : static_cast<size_t>(nSrcYOff) * nChunkXSize;
1631 1746545 : int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1632 1746545 : double dfWeightY = dfBottomWeight;
1633 3493539 : while (true)
1634 : {
1635 : double dfTotalLine;
1636 : if constexpr (bQuadraticMean)
1637 : {
1638 : // Left pixel
1639 : {
1640 216 : const T val = pChunkShifted[nSrcXOff];
1641 216 : dfTotalLine =
1642 216 : SQUARE(double(val) * dfInvMulFactor) *
1643 216 : pasSrcX[iDstPixel].dfLeftWeight;
1644 : }
1645 :
1646 216 : if (nSrcXOff + 1 < nSrcXOff2)
1647 : {
1648 : // Middle pixels
1649 216 : for (int iX = nSrcXOff + 1;
1650 536 : iX < nSrcXOff2 - 1; ++iX)
1651 : {
1652 320 : const T val = pChunkShifted[iX];
1653 320 : dfTotalLine += SQUARE(double(val) *
1654 : dfInvMulFactor);
1655 : }
1656 :
1657 : // Right pixel
1658 : {
1659 216 : const T val =
1660 216 : pChunkShifted[nSrcXOff2 - 1];
1661 216 : dfTotalLine +=
1662 216 : SQUARE(double(val) *
1663 216 : dfInvMulFactor) *
1664 216 : pasSrcX[iDstPixel].dfRightWeight;
1665 : }
1666 : }
1667 : }
1668 : else
1669 : {
1670 : // Left pixel
1671 : {
1672 5239868 : const T val = pChunkShifted[nSrcXOff];
1673 5239868 : dfTotalLine =
1674 5239868 : double(val) * dfInvMulFactor *
1675 5239868 : pasSrcX[iDstPixel].dfLeftWeight;
1676 : }
1677 :
1678 5239868 : if (nSrcXOff + 1 < nSrcXOff2)
1679 : {
1680 : // Middle pixels
1681 4239442 : for (int iX = nSrcXOff + 1;
1682 64183238 : iX < nSrcXOff2 - 1; ++iX)
1683 : {
1684 59943836 : const T val = pChunkShifted[iX];
1685 59943836 : dfTotalLine +=
1686 59943836 : double(val) * dfInvMulFactor;
1687 : }
1688 :
1689 : // Right pixel
1690 : {
1691 4239442 : const T val =
1692 4239442 : pChunkShifted[nSrcXOff2 - 1];
1693 4239442 : dfTotalLine +=
1694 4239442 : double(val) * dfInvMulFactor *
1695 4239442 : pasSrcX[iDstPixel].dfRightWeight;
1696 : }
1697 : }
1698 : }
1699 :
1700 5240084 : dfTotal += dfTotalLine * dfWeightY;
1701 5240084 : --nCounterY;
1702 5240084 : if (nCounterY < 0)
1703 1746545 : break;
1704 3493539 : pChunkShifted += nChunkXSize;
1705 3493539 : dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
1706 : }
1707 :
1708 1746545 : dfTotalWeight =
1709 1746545 : pasSrcX[iDstPixel].dfTotalWeightFullLine *
1710 : dfTotalWeightFullColumn;
1711 : }
1712 : else
1713 : {
1714 16402998 : size_t nCount = 0;
1715 71753694 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1716 : {
1717 55350696 : const auto pChunkShifted =
1718 55350696 : pChunk + static_cast<size_t>(iY) * nChunkXSize;
1719 :
1720 55350696 : double dfTotalLine = 0;
1721 55350696 : double dfTotalWeightLine = 0;
1722 : // Left pixel
1723 : {
1724 55350696 : const int iX = nSrcXOff;
1725 55350696 : const T val = pChunkShifted[iX];
1726 55350696 : if (pabyChunkNodataMask
1727 55350696 : [iX +
1728 55350696 : static_cast<size_t>(iY) * nChunkXSize])
1729 : {
1730 23510843 : nCount++;
1731 23510843 : const double dfWeightX =
1732 23510843 : pasSrcX[iDstPixel].dfLeftWeight;
1733 23510843 : dfTotalWeightLine = dfWeightX;
1734 : if constexpr (bQuadraticMean)
1735 60 : dfTotalLine =
1736 60 : SQUARE(double(val)) * dfWeightX;
1737 : else
1738 23510783 : dfTotalLine = double(val) * dfWeightX;
1739 : }
1740 : }
1741 :
1742 55350696 : if (nSrcXOff < nSrcXOff2 - 1)
1743 : {
1744 : // Middle pixels
1745 152871196 : for (int iX = nSrcXOff + 1; iX < nSrcXOff2 - 1;
1746 : ++iX)
1747 : {
1748 97519600 : const T val = pChunkShifted[iX];
1749 97519600 : if (pabyChunkNodataMask
1750 97519600 : [iX + static_cast<size_t>(iY) *
1751 97519600 : nChunkXSize])
1752 : {
1753 39728100 : nCount++;
1754 39728100 : dfTotalWeightLine += 1;
1755 : if constexpr (bQuadraticMean)
1756 0 : dfTotalLine += SQUARE(double(val));
1757 : else
1758 39728100 : dfTotalLine += double(val);
1759 : }
1760 : }
1761 :
1762 : // Right pixel
1763 : {
1764 55351596 : const int iX = nSrcXOff2 - 1;
1765 55351596 : const T val = pChunkShifted[iX];
1766 55351596 : if (pabyChunkNodataMask
1767 55351596 : [iX + static_cast<size_t>(iY) *
1768 55351596 : nChunkXSize])
1769 : {
1770 23510111 : nCount++;
1771 23510111 : const double dfWeightX =
1772 23510111 : pasSrcX[iDstPixel].dfRightWeight;
1773 23510111 : dfTotalWeightLine += dfWeightX;
1774 : if constexpr (bQuadraticMean)
1775 61 : dfTotalLine +=
1776 61 : SQUARE(double(val)) * dfWeightX;
1777 : else
1778 23510050 : dfTotalLine +=
1779 23510050 : double(val) * dfWeightX;
1780 : }
1781 : }
1782 : }
1783 :
1784 94310394 : const double dfWeightY =
1785 : (iY == nSrcYOff) ? dfBottomWeight
1786 38959698 : : (iY + 1 == nSrcYOff2) ? dfTopWeight
1787 : : 1.0;
1788 55350696 : dfTotal += dfTotalLine * dfWeightY;
1789 55350696 : dfTotalWeight += dfTotalWeightLine * dfWeightY;
1790 : }
1791 :
1792 16402998 : if (nCount == 0 ||
1793 8 : (bPropagateNoData &&
1794 : nCount <
1795 8 : static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
1796 8 : (nSrcXOff2 - nSrcXOff)))
1797 : {
1798 9607412 : pDstScanline[iDstPixel] = tNoDataValue;
1799 9607412 : continue;
1800 : }
1801 : }
1802 : if constexpr (eWrkDataType == GDT_Byte)
1803 : {
1804 : T nVal;
1805 : if constexpr (bQuadraticMean)
1806 38 : nVal = ComputeIntegerRMS<T, int>(dfTotal,
1807 : dfTotalWeight);
1808 : else
1809 8541810 : nVal =
1810 8541810 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1811 8541848 : if (bHasNoData && nVal == tNoDataValue)
1812 0 : nVal = tReplacementVal;
1813 8541848 : pDstScanline[iDstPixel] = nVal;
1814 : }
1815 : else if constexpr (eWrkDataType == GDT_UInt16)
1816 : {
1817 : T nVal;
1818 : if constexpr (bQuadraticMean)
1819 4 : nVal = ComputeIntegerRMS<T, uint64_t>(
1820 : dfTotal, dfTotalWeight);
1821 : else
1822 4 : nVal =
1823 4 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1824 8 : if (bHasNoData && nVal == tNoDataValue)
1825 0 : nVal = tReplacementVal;
1826 8 : pDstScanline[iDstPixel] = nVal;
1827 : }
1828 : else
1829 : {
1830 : T nVal;
1831 : if constexpr (bQuadraticMean)
1832 : {
1833 : if constexpr (bUseMulFactor)
1834 81 : nVal = static_cast<T>(
1835 48 : dfMulFactor *
1836 81 : sqrt(dfTotal / dfTotalWeight));
1837 : else
1838 : nVal = static_cast<T>(
1839 : sqrt(dfTotal / dfTotalWeight));
1840 : }
1841 : else
1842 : {
1843 : if constexpr (bUseMulFactor)
1844 184 : nVal = static_cast<T>(
1845 184 : dfMulFactor * (dfTotal / dfTotalWeight));
1846 : else
1847 : nVal = static_cast<T>(dfTotal / dfTotalWeight);
1848 : }
1849 265 : if (bHasNoData && nVal == tNoDataValue)
1850 2 : nVal = tReplacementVal;
1851 265 : pDstScanline[iDstPixel] = nVal;
1852 : }
1853 : }
1854 : }
1855 : }
1856 : else
1857 : {
1858 115 : nSrcYOff -= nChunkYOff;
1859 115 : nSrcYOff2 -= nChunkYOff;
1860 :
1861 6590 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1862 : {
1863 6475 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1864 6475 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1865 :
1866 6475 : uint64_t nTotalR = 0;
1867 6475 : uint64_t nTotalG = 0;
1868 6475 : uint64_t nTotalB = 0;
1869 6475 : size_t nCount = 0;
1870 :
1871 19425 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1872 : {
1873 38850 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
1874 : {
1875 25900 : const T val =
1876 25900 : pChunk[iX + static_cast<size_t>(iY) * nChunkXSize];
1877 : // cppcheck-suppress unsignedLessThanZero
1878 25900 : if (val < 0 || val >= colorEntries.size())
1879 0 : continue;
1880 25900 : const size_t idx = static_cast<size_t>(val);
1881 25900 : const auto &entry = colorEntries[idx];
1882 25900 : if (entry.c4)
1883 : {
1884 : if constexpr (bQuadraticMean)
1885 : {
1886 800 : nTotalR += SQUARE<int>(entry.c1);
1887 800 : nTotalG += SQUARE<int>(entry.c2);
1888 800 : nTotalB += SQUARE<int>(entry.c3);
1889 800 : ++nCount;
1890 : }
1891 : else
1892 : {
1893 13328 : nTotalR += entry.c1;
1894 13328 : nTotalG += entry.c2;
1895 13328 : nTotalB += entry.c3;
1896 13328 : ++nCount;
1897 : }
1898 : }
1899 : }
1900 : }
1901 :
1902 6475 : if (nCount == 0 ||
1903 0 : (bPropagateNoData &&
1904 0 : nCount < static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
1905 0 : (nSrcXOff2 - nSrcXOff)))
1906 : {
1907 2838 : pDstScanline[iDstPixel] = tNoDataValue;
1908 : }
1909 : else
1910 : {
1911 : GDALColorEntry color;
1912 : if constexpr (bQuadraticMean)
1913 : {
1914 200 : color.c1 =
1915 200 : static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
1916 200 : color.c2 =
1917 200 : static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
1918 200 : color.c3 =
1919 200 : static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
1920 : }
1921 : else
1922 : {
1923 3437 : color.c1 =
1924 3437 : static_cast<short>((nTotalR + nCount / 2) / nCount);
1925 3437 : color.c2 =
1926 3437 : static_cast<short>((nTotalG + nCount / 2) / nCount);
1927 3437 : color.c3 =
1928 3437 : static_cast<short>((nTotalB + nCount / 2) / nCount);
1929 : }
1930 3637 : pDstScanline[iDstPixel] =
1931 3637 : static_cast<T>(BestColorEntry(colorEntries, color));
1932 : }
1933 : }
1934 : }
1935 : }
1936 :
1937 2388 : CPLFree(pasSrcX);
1938 :
1939 2388 : return CE_None;
1940 : }
1941 :
1942 : template <bool bQuadraticMean>
1943 : static CPLErr
1944 2388 : GDALResampleChunk_AverageOrRMSInternal(const GDALOverviewResampleArgs &args,
1945 : const void *pChunk, void **ppDstBuffer,
1946 : GDALDataType *peDstBufferDataType)
1947 : {
1948 2388 : *peDstBufferDataType = args.eWrkDataType;
1949 2388 : switch (args.eWrkDataType)
1950 : {
1951 2259 : case GDT_Byte:
1952 : {
1953 : return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_Byte,
1954 2259 : bQuadraticMean>(
1955 2259 : args, static_cast<const GByte *>(pChunk), ppDstBuffer);
1956 : }
1957 :
1958 11 : case GDT_UInt16:
1959 : {
1960 : if constexpr (bQuadraticMean)
1961 : {
1962 : // Use double as accumulation type, because UInt32 could overflow
1963 : return GDALResampleChunk_AverageOrRMS_T<
1964 6 : GUInt16, double, GDT_UInt16, bQuadraticMean>(
1965 6 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1966 : }
1967 : else
1968 : {
1969 : return GDALResampleChunk_AverageOrRMS_T<
1970 5 : GUInt16, GUInt32, GDT_UInt16, bQuadraticMean>(
1971 5 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1972 : }
1973 : }
1974 :
1975 71 : case GDT_Float32:
1976 : {
1977 : return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32,
1978 71 : bQuadraticMean>(
1979 71 : args, static_cast<const float *>(pChunk), ppDstBuffer);
1980 : }
1981 :
1982 47 : case GDT_Float64:
1983 : {
1984 : return GDALResampleChunk_AverageOrRMS_T<double, double, GDT_Float64,
1985 47 : bQuadraticMean>(
1986 47 : args, static_cast<const double *>(pChunk), ppDstBuffer);
1987 : }
1988 :
1989 0 : default:
1990 0 : break;
1991 : }
1992 :
1993 0 : CPLAssert(false);
1994 : return CE_Failure;
1995 : }
1996 :
1997 : static CPLErr
1998 2388 : GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
1999 : const void *pChunk, void **ppDstBuffer,
2000 : GDALDataType *peDstBufferDataType)
2001 : {
2002 2388 : if (EQUAL(args.pszResampling, "RMS"))
2003 77 : return GDALResampleChunk_AverageOrRMSInternal<true>(
2004 77 : args, pChunk, ppDstBuffer, peDstBufferDataType);
2005 : else
2006 2311 : return GDALResampleChunk_AverageOrRMSInternal<false>(
2007 2311 : args, pChunk, ppDstBuffer, peDstBufferDataType);
2008 : }
2009 :
2010 : /************************************************************************/
2011 : /* GDALResampleChunk_Gauss() */
2012 : /************************************************************************/
2013 :
2014 86 : static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
2015 : const void *pChunk, void **ppDstBuffer,
2016 : GDALDataType *peDstBufferDataType)
2017 :
2018 : {
2019 86 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2020 86 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2021 86 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2022 86 : const int nChunkXOff = args.nChunkXOff;
2023 86 : const int nChunkXSize = args.nChunkXSize;
2024 86 : const int nChunkYOff = args.nChunkYOff;
2025 86 : const int nChunkYSize = args.nChunkYSize;
2026 86 : const int nDstXOff = args.nDstXOff;
2027 86 : const int nDstXOff2 = args.nDstXOff2;
2028 86 : const int nDstYOff = args.nDstYOff;
2029 86 : const int nDstYOff2 = args.nDstYOff2;
2030 86 : const bool bHasNoData = args.bHasNoData;
2031 86 : double dfNoDataValue = args.dfNoDataValue;
2032 86 : const GDALColorTable *poColorTable = args.poColorTable;
2033 :
2034 86 : const double *const padfChunk = static_cast<const double *>(pChunk);
2035 :
2036 86 : *ppDstBuffer =
2037 86 : VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
2038 : GDALGetDataTypeSizeBytes(GDT_Float64));
2039 86 : if (*ppDstBuffer == nullptr)
2040 : {
2041 0 : return CE_Failure;
2042 : }
2043 86 : *peDstBufferDataType = GDT_Float64;
2044 86 : double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
2045 :
2046 : /* -------------------------------------------------------------------- */
2047 : /* Create the filter kernel and allocate scanline buffer. */
2048 : /* -------------------------------------------------------------------- */
2049 86 : int nGaussMatrixDim = 3;
2050 : const int *panGaussMatrix;
2051 86 : constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
2052 86 : constexpr int anGaussMatrix5x5[] = {1, 4, 6, 4, 1, 4, 16, 24, 16,
2053 : 4, 6, 24, 36, 24, 6, 4, 16, 24,
2054 : 16, 4, 1, 4, 6, 4, 1};
2055 86 : constexpr int anGaussMatrix7x7[] = {
2056 : 1, 6, 15, 20, 15, 6, 1, 6, 36, 90, 120, 90, 36,
2057 : 6, 15, 90, 225, 300, 225, 90, 15, 20, 120, 300, 400, 300,
2058 : 120, 20, 15, 90, 225, 300, 225, 90, 15, 6, 36, 90, 120,
2059 : 90, 36, 6, 1, 6, 15, 20, 15, 6, 1};
2060 :
2061 86 : const int nOXSize = args.nOvrXSize;
2062 86 : const int nOYSize = args.nOvrYSize;
2063 86 : const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
2064 :
2065 : // matrix for gauss filter
2066 86 : if (nResYFactor <= 2)
2067 : {
2068 85 : panGaussMatrix = anGaussMatrix3x3;
2069 85 : nGaussMatrixDim = 3;
2070 : }
2071 1 : else if (nResYFactor <= 4)
2072 : {
2073 0 : panGaussMatrix = anGaussMatrix5x5;
2074 0 : nGaussMatrixDim = 5;
2075 : }
2076 : else
2077 : {
2078 1 : panGaussMatrix = anGaussMatrix7x7;
2079 1 : nGaussMatrixDim = 7;
2080 : }
2081 :
2082 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
2083 : int *panGaussMatrixDup = static_cast<int *>(
2084 : CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
2085 : memcpy(panGaussMatrixDup, panGaussMatrix,
2086 : sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
2087 : panGaussMatrix = panGaussMatrixDup;
2088 : #endif
2089 :
2090 86 : if (!bHasNoData)
2091 79 : dfNoDataValue = 0.0;
2092 :
2093 86 : std::vector<GDALColorEntry> colorEntries;
2094 86 : int nTransparentIdx = -1;
2095 86 : if (poColorTable)
2096 2 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
2097 :
2098 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
2099 : // it as nodata value.
2100 92 : if (bHasNoData && dfNoDataValue >= 0.0 &&
2101 6 : dfNoDataValue < colorEntries.size())
2102 0 : colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
2103 :
2104 : // Or if we have no explicit nodata, but a color table entry that is
2105 : // transparent, consider it as the nodata value.
2106 86 : else if (!bHasNoData && nTransparentIdx >= 0)
2107 : {
2108 0 : dfNoDataValue = nTransparentIdx;
2109 : }
2110 :
2111 86 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2112 86 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2113 86 : const int nDstXWidth = nDstXOff2 - nDstXOff;
2114 :
2115 : /* ==================================================================== */
2116 : /* Loop over destination scanlines. */
2117 : /* ==================================================================== */
2118 16488 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2119 : {
2120 16402 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
2121 16402 : int nSrcYOff2 =
2122 16402 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
2123 :
2124 16402 : if (nSrcYOff < nChunkYOff)
2125 : {
2126 0 : nSrcYOff = nChunkYOff;
2127 0 : nSrcYOff2++;
2128 : }
2129 :
2130 16402 : const int iSizeY = nSrcYOff2 - nSrcYOff;
2131 16402 : nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
2132 16402 : nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
2133 :
2134 16402 : if (nSrcYOff2 > nChunkBottomYOff ||
2135 16359 : (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
2136 : {
2137 44 : nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
2138 : }
2139 :
2140 16402 : int nYShiftGaussMatrix = 0;
2141 16402 : if (nSrcYOff < nChunkYOff)
2142 : {
2143 0 : nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
2144 0 : nSrcYOff = nChunkYOff;
2145 : }
2146 :
2147 16402 : const double *const padfSrcScanline =
2148 16402 : padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
2149 16402 : const GByte *pabySrcScanlineNodataMask = nullptr;
2150 16402 : if (pabyChunkNodataMask != nullptr)
2151 152 : pabySrcScanlineNodataMask =
2152 152 : pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
2153 :
2154 : /* --------------------------------------------------------------------
2155 : */
2156 : /* Loop over destination pixels */
2157 : /* --------------------------------------------------------------------
2158 : */
2159 16402 : double *const padfDstScanline =
2160 16402 : padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
2161 4149980 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2162 : {
2163 4133580 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
2164 4133580 : int nSrcXOff2 =
2165 4133580 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
2166 :
2167 4133580 : if (nSrcXOff < nChunkXOff)
2168 : {
2169 0 : nSrcXOff = nChunkXOff;
2170 0 : nSrcXOff2++;
2171 : }
2172 :
2173 4133580 : const int iSizeX = nSrcXOff2 - nSrcXOff;
2174 4133580 : nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
2175 4133580 : nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
2176 :
2177 4133580 : if (nSrcXOff2 > nChunkRightXOff ||
2178 4127930 : (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
2179 : {
2180 5650 : nSrcXOff2 =
2181 5650 : std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
2182 : }
2183 :
2184 4133580 : int nXShiftGaussMatrix = 0;
2185 4133580 : if (nSrcXOff < nChunkXOff)
2186 : {
2187 0 : nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
2188 0 : nSrcXOff = nChunkXOff;
2189 : }
2190 :
2191 4133580 : if (poColorTable == nullptr)
2192 : {
2193 4133380 : double dfTotal = 0.0;
2194 4133380 : GInt64 nCount = 0;
2195 4133380 : const int *panLineWeight =
2196 4133380 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
2197 : nXShiftGaussMatrix;
2198 :
2199 16527900 : for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
2200 12394500 : ++iY, ++j, panLineWeight += nGaussMatrixDim)
2201 : {
2202 49561300 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2203 : {
2204 37166800 : const double val =
2205 37166800 : padfSrcScanline[iX - nChunkXOff +
2206 37166800 : static_cast<GPtrDiff_t>(iY -
2207 37166800 : nSrcYOff) *
2208 37166800 : nChunkXSize];
2209 37166800 : if (pabySrcScanlineNodataMask == nullptr ||
2210 32872 : pabySrcScanlineNodataMask[iX - nChunkXOff +
2211 32872 : static_cast<GPtrDiff_t>(
2212 32872 : iY - nSrcYOff) *
2213 32872 : nChunkXSize])
2214 : {
2215 37146100 : const int nWeight = panLineWeight[i];
2216 37146100 : dfTotal += val * nWeight;
2217 37146100 : nCount += nWeight;
2218 : }
2219 : }
2220 : }
2221 :
2222 4133380 : if (nCount == 0)
2223 : {
2224 2217 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2225 : }
2226 : else
2227 : {
2228 4131160 : padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
2229 : }
2230 : }
2231 : else
2232 : {
2233 200 : GInt64 nTotalR = 0;
2234 200 : GInt64 nTotalG = 0;
2235 200 : GInt64 nTotalB = 0;
2236 200 : GInt64 nTotalWeight = 0;
2237 200 : const int *panLineWeight =
2238 200 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
2239 : nXShiftGaussMatrix;
2240 :
2241 780 : for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
2242 580 : ++iY, ++j, panLineWeight += nGaussMatrixDim)
2243 : {
2244 2262 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2245 : {
2246 1682 : const double val =
2247 1682 : padfSrcScanline[iX - nChunkXOff +
2248 1682 : static_cast<GPtrDiff_t>(iY -
2249 1682 : nSrcYOff) *
2250 1682 : nChunkXSize];
2251 1682 : if (val < 0 || val >= colorEntries.size())
2252 0 : continue;
2253 :
2254 1682 : size_t idx = static_cast<size_t>(val);
2255 1682 : if (colorEntries[idx].c4)
2256 : {
2257 1682 : const int nWeight = panLineWeight[i];
2258 1682 : nTotalR +=
2259 1682 : static_cast<GInt64>(colorEntries[idx].c1) *
2260 1682 : nWeight;
2261 1682 : nTotalG +=
2262 1682 : static_cast<GInt64>(colorEntries[idx].c2) *
2263 1682 : nWeight;
2264 1682 : nTotalB +=
2265 1682 : static_cast<GInt64>(colorEntries[idx].c3) *
2266 1682 : nWeight;
2267 1682 : nTotalWeight += nWeight;
2268 : }
2269 : }
2270 : }
2271 :
2272 200 : if (nTotalWeight == 0)
2273 : {
2274 0 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2275 : }
2276 : else
2277 : {
2278 : GDALColorEntry color;
2279 :
2280 200 : color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
2281 : nTotalWeight);
2282 200 : color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
2283 : nTotalWeight);
2284 200 : color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
2285 : nTotalWeight);
2286 200 : padfDstScanline[iDstPixel - nDstXOff] =
2287 200 : BestColorEntry(colorEntries, color);
2288 : }
2289 : }
2290 : }
2291 : }
2292 :
2293 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
2294 : CPLFree(panGaussMatrixDup);
2295 : #endif
2296 :
2297 86 : return CE_None;
2298 : }
2299 :
2300 : /************************************************************************/
2301 : /* GDALResampleChunk_Mode() */
2302 : /************************************************************************/
2303 :
/** Return whether two values are to be counted as the same value by the
 * mode computation. This generic version is plain operator== equality.
 */
template <class T> static inline bool IsSame(T a, T b)
{
    const bool bEqual = (a == b);
    return bEqual;
}
2308 :
2309 60 : template <> bool IsSame<GFloat16>(GFloat16 a, GFloat16 b)
2310 : {
2311 60 : return a == b || (CPLIsNan(a) && CPLIsNan(b));
2312 : }
2313 :
2314 4902 : template <> bool IsSame<float>(float a, float b)
2315 : {
2316 4902 : return a == b || (std::isnan(a) && std::isnan(b));
2317 : }
2318 :
2319 1020 : template <> bool IsSame<double>(double a, double b)
2320 : {
2321 1020 : return a == b || (std::isnan(a) && std::isnan(b));
2322 : }
2323 :
2324 : namespace
2325 : {
2326 : struct ComplexFloat16
2327 : {
2328 : GFloat16 r;
2329 : GFloat16 i;
2330 : };
2331 : } // namespace
2332 :
2333 60 : template <> bool IsSame<ComplexFloat16>(ComplexFloat16 a, ComplexFloat16 b)
2334 : {
2335 90 : return (a.r == b.r && a.i == b.i) ||
2336 90 : (CPLIsNan(a.r) && CPLIsNan(a.i) && CPLIsNan(b.r) && CPLIsNan(b.i));
2337 : }
2338 :
2339 : template <>
2340 60 : bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
2341 : {
2342 120 : return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
2343 120 : std::isnan(b.real()) && std::isnan(b.imag()));
2344 : }
2345 :
2346 : template <>
2347 60 : bool IsSame<std::complex<double>>(std::complex<double> a,
2348 : std::complex<double> b)
2349 : {
2350 120 : return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
2351 120 : std::isnan(b.real()) && std::isnan(b.imag()));
2352 : }
2353 :
2354 : template <class T>
2355 176 : static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
2356 : const T *pChunk, T *const pDstBuffer)
2357 :
2358 : {
2359 176 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2360 176 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2361 176 : const double dfSrcXDelta = args.dfSrcXDelta;
2362 176 : const double dfSrcYDelta = args.dfSrcYDelta;
2363 176 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2364 176 : const int nChunkXOff = args.nChunkXOff;
2365 176 : const int nChunkXSize = args.nChunkXSize;
2366 176 : const int nChunkYOff = args.nChunkYOff;
2367 176 : const int nChunkYSize = args.nChunkYSize;
2368 176 : const int nDstXOff = args.nDstXOff;
2369 176 : const int nDstXOff2 = args.nDstXOff2;
2370 176 : const int nDstYOff = args.nDstYOff;
2371 176 : const int nDstYOff2 = args.nDstYOff2;
2372 176 : const bool bHasNoData = args.bHasNoData;
2373 176 : const GDALColorTable *poColorTable = args.poColorTable;
2374 176 : const int nDstXSize = nDstXOff2 - nDstXOff;
2375 :
2376 8 : T tNoDataValue;
2377 : if constexpr (std::is_same<T, ComplexFloat16>::value)
2378 : {
2379 4 : tNoDataValue.r = cpl::NumericLimits<GFloat16>::quiet_NaN();
2380 4 : tNoDataValue.i = cpl::NumericLimits<GFloat16>::quiet_NaN();
2381 : }
2382 : else if constexpr (std::is_same<T, std::complex<float>>::value ||
2383 : std::is_same<T, std::complex<double>>::value)
2384 : {
2385 : using BaseT = typename T::value_type;
2386 8 : tNoDataValue =
2387 : std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
2388 : std::numeric_limits<BaseT>::quiet_NaN());
2389 : }
2390 164 : else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
2391 163 : tNoDataValue = 0;
2392 : else
2393 1 : tNoDataValue = static_cast<T>(args.dfNoDataValue);
2394 :
2395 : using CountType = uint32_t;
2396 176 : CountType nMaxNumPx = 0;
2397 176 : T *paVals = nullptr;
2398 176 : CountType *panCounts = nullptr;
2399 :
2400 176 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2401 176 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2402 352 : std::vector<int> anVals(256, 0);
2403 :
2404 : /* ==================================================================== */
2405 : /* Loop over destination scanlines. */
2406 : /* ==================================================================== */
2407 7679 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2408 : {
2409 7503 : const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
2410 7503 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
2411 : #ifdef only_pixels_with_more_than_10_pct_participation
2412 : // When oversampling, don't take into account pixels that have a tiny
2413 : // participation in the resulting pixel
2414 : if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2415 : nSrcYOff < nChunkBottomYOff)
2416 : nSrcYOff++;
2417 : #endif
2418 7503 : if (nSrcYOff < nChunkYOff)
2419 0 : nSrcYOff = nChunkYOff;
2420 :
2421 7503 : const double dfSrcYOff2 =
2422 7503 : dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
2423 7503 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
2424 : #ifdef only_pixels_with_more_than_10_pct_participation
2425 : // When oversampling, don't take into account pixels that have a tiny
2426 : // participation in the resulting pixel
2427 : if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2428 : nSrcYOff2 > nChunkYOff)
2429 : nSrcYOff2--;
2430 : #endif
2431 7503 : if (nSrcYOff2 == nSrcYOff)
2432 0 : ++nSrcYOff2;
2433 7503 : if (nSrcYOff2 > nChunkBottomYOff)
2434 0 : nSrcYOff2 = nChunkBottomYOff;
2435 :
2436 7503 : const T *const paSrcScanline =
2437 253 : pChunk +
2438 7503 : (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
2439 7503 : const GByte *pabySrcScanlineNodataMask = nullptr;
2440 7503 : if (pabyChunkNodataMask != nullptr)
2441 1810 : pabySrcScanlineNodataMask =
2442 : pabyChunkNodataMask +
2443 1810 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
2444 :
2445 7503 : T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
2446 : /* --------------------------------------------------------------------
2447 : */
2448 : /* Loop over destination pixels */
2449 : /* --------------------------------------------------------------------
2450 : */
2451 4260400 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2452 : {
2453 4252893 : const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
2454 : // Apply some epsilon to avoid numerical precision issues
2455 4252893 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
2456 : #ifdef only_pixels_with_more_than_10_pct_participation
2457 : // When oversampling, don't take into account pixels that have a
2458 : // tiny participation in the resulting pixel
2459 : if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2460 : nSrcXOff < nChunkRightXOff)
2461 : nSrcXOff++;
2462 : #endif
2463 4252893 : if (nSrcXOff < nChunkXOff)
2464 0 : nSrcXOff = nChunkXOff;
2465 :
2466 4252893 : const double dfSrcXOff2 =
2467 4252893 : dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
2468 4252893 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
2469 : #ifdef only_pixels_with_more_than_10_pct_participation
2470 : // When oversampling, don't take into account pixels that have a
2471 : // tiny participation in the resulting pixel
2472 : if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2473 : nSrcXOff2 > nChunkXOff)
2474 : nSrcXOff2--;
2475 : #endif
2476 4252893 : if (nSrcXOff2 == nSrcXOff)
2477 0 : nSrcXOff2++;
2478 4252893 : if (nSrcXOff2 > nChunkRightXOff)
2479 0 : nSrcXOff2 = nChunkRightXOff;
2480 :
2481 4252893 : bool bRegularProcessing = false;
2482 : if constexpr (!std::is_same<T, GByte>::value)
2483 1503 : bRegularProcessing = true;
2484 4251390 : else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
2485 0 : bRegularProcessing = true;
2486 :
2487 4252893 : if (bRegularProcessing)
2488 : {
2489 : // Sanity check to make sure the allocation of paVals and
2490 : // panCounts don't overflow.
2491 : static_assert(sizeof(CountType) <= sizeof(size_t));
2492 3006 : if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
2493 1503 : static_cast<CountType>(nSrcYOff2 - nSrcYOff) >
2494 1503 : (std::numeric_limits<CountType>::max() /
2495 3006 : std::max(sizeof(T), sizeof(CountType))) /
2496 1503 : static_cast<CountType>(nSrcXOff2 - nSrcXOff))
2497 : {
2498 0 : CPLError(CE_Failure, CPLE_NotSupported,
2499 : "Too big downsampling factor");
2500 0 : CPLFree(paVals);
2501 0 : CPLFree(panCounts);
2502 0 : return CE_Failure;
2503 : }
2504 1503 : const CountType nNumPx =
2505 1503 : static_cast<CountType>(nSrcYOff2 - nSrcYOff) *
2506 1503 : (nSrcXOff2 - nSrcXOff);
2507 1503 : CountType iMaxInd = 0;
2508 1503 : CountType iMaxVal = 0;
2509 :
2510 1503 : if (paVals == nullptr || nNumPx > nMaxNumPx)
2511 : {
2512 : T *paValsNew = static_cast<T *>(
2513 110 : VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
2514 : CountType *panCountsNew =
2515 110 : static_cast<CountType *>(VSI_REALLOC_VERBOSE(
2516 : panCounts, nNumPx * sizeof(CountType)));
2517 110 : if (paValsNew != nullptr)
2518 110 : paVals = paValsNew;
2519 110 : if (panCountsNew != nullptr)
2520 110 : panCounts = panCountsNew;
2521 110 : if (paValsNew == nullptr || panCountsNew == nullptr)
2522 : {
2523 0 : CPLFree(paVals);
2524 0 : CPLFree(panCounts);
2525 0 : return CE_Failure;
2526 : }
2527 110 : nMaxNumPx = nNumPx;
2528 : }
2529 :
2530 4629 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2531 : {
2532 3126 : const GPtrDiff_t iTotYOff =
2533 3126 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2534 3126 : nChunkXOff;
2535 9858 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2536 : {
2537 6732 : if (pabySrcScanlineNodataMask == nullptr ||
2538 16 : pabySrcScanlineNodataMask[iX + iTotYOff])
2539 : {
2540 6717 : const T val = paSrcScanline[iX + iTotYOff];
2541 6717 : CountType i = 0; // Used after for.
2542 :
2543 : // Check array for existing entry.
2544 10081 : for (; i < iMaxInd; ++i)
2545 : {
2546 6850 : if (IsSame(paVals[i], val))
2547 : {
2548 3486 : if (++panCounts[i] > panCounts[iMaxVal])
2549 : {
2550 246 : iMaxVal = i;
2551 : }
2552 3486 : break;
2553 : }
2554 : }
2555 :
2556 : // Add to arr if entry not already there.
2557 6717 : if (i == iMaxInd)
2558 : {
2559 3231 : paVals[iMaxInd] = val;
2560 3231 : panCounts[iMaxInd] = 1;
2561 :
2562 3231 : if (iMaxInd == 0)
2563 : {
2564 1500 : iMaxVal = iMaxInd;
2565 : }
2566 :
2567 3231 : ++iMaxInd;
2568 : }
2569 : }
2570 : }
2571 : }
2572 :
2573 1503 : if (iMaxInd == 0)
2574 3 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2575 : else
2576 1500 : paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
2577 : }
2578 : else if constexpr (std::is_same<T, GByte>::value)
2579 : // ( eSrcDataType == GDT_Byte && nEntryCount < 256 )
2580 : {
2581 : // So we go here for a paletted or non-paletted byte band.
2582 : // The input values are then between 0 and 255.
2583 4251390 : int nMaxVal = 0;
2584 4251390 : int iMaxInd = -1;
2585 :
2586 : // The cost of this zeroing might be high. Perhaps we should
2587 : // just use the above generic case, and go to this one if the
2588 : // number of source pixels is large enough
2589 4251390 : std::fill(anVals.begin(), anVals.end(), 0);
2590 :
2591 12777800 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2592 : {
2593 8526440 : const GPtrDiff_t iTotYOff =
2594 8526440 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2595 8526440 : nChunkXOff;
2596 25649600 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2597 : {
2598 17123100 : const T val = paSrcScanline[iX + iTotYOff];
2599 17123100 : if (!bHasNoData || val != tNoDataValue)
2600 : {
2601 17123100 : int nVal = static_cast<int>(val);
2602 17123100 : if (++anVals[nVal] > nMaxVal)
2603 : {
2604 : // Sum the density.
2605 : // Is it the most common value so far?
2606 17006400 : iMaxInd = nVal;
2607 17006400 : nMaxVal = anVals[nVal];
2608 : }
2609 : }
2610 : }
2611 : }
2612 :
2613 4251390 : if (iMaxInd == -1)
2614 0 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2615 : else
2616 4251390 : paDstScanline[iDstPixel - nDstXOff] =
2617 : static_cast<T>(iMaxInd);
2618 : }
2619 : }
2620 : }
2621 :
2622 176 : CPLFree(paVals);
2623 176 : CPLFree(panCounts);
2624 :
2625 176 : return CE_None;
2626 : }
2627 :
2628 176 : static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
2629 : const void *pChunk, void **ppDstBuffer,
2630 : GDALDataType *peDstBufferDataType)
2631 : {
2632 176 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
2633 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
2634 : GDALGetDataTypeSizeBytes(args.eWrkDataType));
2635 176 : if (*ppDstBuffer == nullptr)
2636 : {
2637 0 : return CE_Failure;
2638 : }
2639 :
2640 176 : CPLAssert(args.eSrcDataType == args.eWrkDataType);
2641 :
2642 176 : *peDstBufferDataType = args.eWrkDataType;
2643 176 : switch (args.eWrkDataType)
2644 : {
2645 : // For mode resampling, as no computation is done, only the
2646 : // size of the data type matters... except for Byte where we have
2647 : // special processing. And for floating point values
2648 66 : case GDT_Byte:
2649 : {
2650 66 : return GDALResampleChunk_ModeT(args,
2651 : static_cast<const GByte *>(pChunk),
2652 66 : static_cast<GByte *>(*ppDstBuffer));
2653 : }
2654 :
2655 4 : case GDT_Int8:
2656 : {
2657 4 : return GDALResampleChunk_ModeT(args,
2658 : static_cast<const int8_t *>(pChunk),
2659 4 : static_cast<int8_t *>(*ppDstBuffer));
2660 : }
2661 :
2662 10 : case GDT_Int16:
2663 : case GDT_UInt16:
2664 : {
2665 10 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
2666 10 : return GDALResampleChunk_ModeT(
2667 : args, static_cast<const uint16_t *>(pChunk),
2668 10 : static_cast<uint16_t *>(*ppDstBuffer));
2669 : }
2670 :
2671 15 : case GDT_CInt16:
2672 : case GDT_Int32:
2673 : case GDT_UInt32:
2674 : {
2675 15 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
2676 15 : return GDALResampleChunk_ModeT(
2677 : args, static_cast<const uint32_t *>(pChunk),
2678 15 : static_cast<uint32_t *>(*ppDstBuffer));
2679 : }
2680 :
2681 12 : case GDT_CInt32:
2682 : case GDT_Int64:
2683 : case GDT_UInt64:
2684 : {
2685 12 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
2686 12 : return GDALResampleChunk_ModeT(
2687 : args, static_cast<const uint64_t *>(pChunk),
2688 12 : static_cast<uint64_t *>(*ppDstBuffer));
2689 : }
2690 :
2691 4 : case GDT_Float16:
2692 : {
2693 4 : return GDALResampleChunk_ModeT(
2694 : args, static_cast<const GFloat16 *>(pChunk),
2695 4 : static_cast<GFloat16 *>(*ppDstBuffer));
2696 : }
2697 :
2698 32 : case GDT_Float32:
2699 : {
2700 32 : return GDALResampleChunk_ModeT(args,
2701 : static_cast<const float *>(pChunk),
2702 32 : static_cast<float *>(*ppDstBuffer));
2703 : }
2704 :
2705 21 : case GDT_Float64:
2706 : {
2707 21 : return GDALResampleChunk_ModeT(args,
2708 : static_cast<const double *>(pChunk),
2709 21 : static_cast<double *>(*ppDstBuffer));
2710 : }
2711 :
2712 4 : case GDT_CFloat16:
2713 : {
2714 4 : return GDALResampleChunk_ModeT(
2715 : args, static_cast<const ComplexFloat16 *>(pChunk),
2716 4 : static_cast<ComplexFloat16 *>(*ppDstBuffer));
2717 : }
2718 :
2719 4 : case GDT_CFloat32:
2720 : {
2721 4 : return GDALResampleChunk_ModeT(
2722 : args, static_cast<const std::complex<float> *>(pChunk),
2723 4 : static_cast<std::complex<float> *>(*ppDstBuffer));
2724 : }
2725 :
2726 4 : case GDT_CFloat64:
2727 : {
2728 4 : return GDALResampleChunk_ModeT(
2729 : args, static_cast<const std::complex<double> *>(pChunk),
2730 4 : static_cast<std::complex<double> *>(*ppDstBuffer));
2731 : }
2732 :
2733 0 : case GDT_Unknown:
2734 : case GDT_TypeCount:
2735 0 : break;
2736 : }
2737 :
2738 0 : CPLAssert(false);
2739 : return CE_Failure;
2740 : }
2741 :
2742 : /************************************************************************/
2743 : /* GDALResampleConvolutionHorizontal() */
2744 : /************************************************************************/
2745 :
// Weighted sum of a run of source pixels:
//     sum over k in [0, nSrcPixelCount) of pChunk[k] * padfWeights[k]
//
// The main loop consumes four pixels per iteration and feeds two independent
// partial sums (pixels 0-1 and pixels 2-3 of each group). Left-over pixels
// are added to the first partial sum. The two partial sums are added once at
// the end. A count <= 0 returns 0.
template <class T>
static inline double
GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                  int nSrcPixelCount)
{
    double dfAccFirstPair = 0.0;
    double dfAccSecondPair = 0.0;
    int iPixel = 0;  // Shared by both loops.
    // The hand-unrolled loop is compiled out for the Intel compiler:
    // version 2024.0.2.29 (maybe others) crashes on it at -O2 and -O3.
    // https://github.com/OSGeo/gdal/issues/9508
#if !defined(__INTEL_CLANG_COMPILER)
    while (iPixel < nSrcPixelCount - 3)
    {
        const T *const pSrc = pChunk + iPixel;
        const double *const pWeight = padfWeights + iPixel;
        dfAccFirstPair += static_cast<double>(pSrc[0]) * pWeight[0];
        dfAccFirstPair += static_cast<double>(pSrc[1]) * pWeight[1];
        dfAccSecondPair += static_cast<double>(pSrc[2]) * pWeight[2];
        dfAccSecondPair += static_cast<double>(pSrc[3]) * pWeight[3];
        iPixel += 4;
    }
#endif
    while (iPixel < nSrcPixelCount)
    {
        dfAccFirstPair +=
            static_cast<double>(pChunk[iPixel]) * padfWeights[iPixel];
        ++iPixel;
    }
    return dfAccFirstPair + dfAccSecondPair;
}
2772 :
2773 : template <class T>
2774 44576 : static inline void GDALResampleConvolutionHorizontalWithMask(
2775 : const T *pChunk, const GByte *pabyMask, const double *padfWeights,
2776 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2777 : {
2778 44576 : dfVal = 0;
2779 44576 : dfWeightSum = 0;
2780 44576 : int i = 0;
2781 98300 : for (; i < nSrcPixelCount - 3; i += 4)
2782 : {
2783 53724 : const double dfWeight0 = padfWeights[i] * pabyMask[i];
2784 53724 : const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
2785 53724 : const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
2786 53724 : const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
2787 53724 : dfVal += double(pChunk[i + 0]) * dfWeight0;
2788 53724 : dfVal += double(pChunk[i + 1]) * dfWeight1;
2789 53724 : dfVal += double(pChunk[i + 2]) * dfWeight2;
2790 53724 : dfVal += double(pChunk[i + 3]) * dfWeight3;
2791 53724 : dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
2792 : }
2793 61162 : for (; i < nSrcPixelCount; ++i)
2794 : {
2795 16586 : const double dfWeight = padfWeights[i] * pabyMask[i];
2796 16586 : dfVal += double(pChunk[i]) * dfWeight;
2797 16586 : dfWeightSum += dfWeight;
2798 : }
2799 44576 : }
2800 :
// Same weighted sum as GDALResampleConvolutionHorizontal(), evaluated for
// three rows at once with a shared weight vector:
//     dfResN = sum over k in [0, nSrcPixelCount) of pChunkRowN[k] * padfWeights[k]
//
// Each row keeps two partial sums (pixels 0-1 and pixels 2-3 of every group
// of four); left-over pixels go to the first one. A count <= 0 sets the three
// results to 0.
template <class T>
static inline void GDALResampleConvolutionHorizontal_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    double dfRow1A = 0.0, dfRow1B = 0.0;
    double dfRow2A = 0.0, dfRow2B = 0.0;
    double dfRow3A = 0.0, dfRow3B = 0.0;

    int iPixel = 0;  // Shared by both loops.

    while (iPixel < nSrcPixelCount - 3)
    {
        const double *const pWeight = padfWeights + iPixel;
        const T *const pRow1 = pChunkRow1 + iPixel;
        const T *const pRow2 = pChunkRow2 + iPixel;
        const T *const pRow3 = pChunkRow3 + iPixel;

        dfRow1A += static_cast<double>(pRow1[0]) * pWeight[0];
        dfRow1A += static_cast<double>(pRow1[1]) * pWeight[1];
        dfRow1B += static_cast<double>(pRow1[2]) * pWeight[2];
        dfRow1B += static_cast<double>(pRow1[3]) * pWeight[3];

        dfRow2A += static_cast<double>(pRow2[0]) * pWeight[0];
        dfRow2A += static_cast<double>(pRow2[1]) * pWeight[1];
        dfRow2B += static_cast<double>(pRow2[2]) * pWeight[2];
        dfRow2B += static_cast<double>(pRow2[3]) * pWeight[3];

        dfRow3A += static_cast<double>(pRow3[0]) * pWeight[0];
        dfRow3A += static_cast<double>(pRow3[1]) * pWeight[1];
        dfRow3B += static_cast<double>(pRow3[2]) * pWeight[2];
        dfRow3B += static_cast<double>(pRow3[3]) * pWeight[3];

        iPixel += 4;
    }

    while (iPixel < nSrcPixelCount)
    {
        const double dfWeight = padfWeights[iPixel];
        dfRow1A += static_cast<double>(pChunkRow1[iPixel]) * dfWeight;
        dfRow2A += static_cast<double>(pChunkRow2[iPixel]) * dfWeight;
        dfRow3A += static_cast<double>(pChunkRow3[iPixel]) * dfWeight;
        ++iPixel;
    }

    dfRes1 = dfRow1A + dfRow1B;
    dfRes2 = dfRow2A + dfRow2B;
    dfRes3 = dfRow3A + dfRow3B;
}
2839 :
// Generic (non-specialized) version of the 3-row horizontal convolution
// variant whose name targets pixel counts below 8.
// It simply forwards to GDALResampleConvolutionHorizontal_3rows() with the
// same arguments; when USE_SSE2 is defined, explicit specializations for
// GByte and GUInt16 later in this file replace it with a SIMD implementation.
2840 : template <class T>
2841 18980 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
2842 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2843 : const double *padfWeights, int nSrcPixelCount, double &dfRes1,
2844 : double &dfRes2, double &dfRes3)
2845 : {
2846 18980 : GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
2847 : padfWeights, nSrcPixelCount, dfRes1,
2848 : dfRes2, dfRes3);
2849 18980 : }
2850 :
// Generic (non-specialized) version of the 3-row horizontal convolution for
// exactly four source pixels per row.
// Forwards to GDALResampleConvolutionHorizontal_3rows() with a fixed pixel
// count of 4; when USE_SSE2 is defined, explicit specializations for GByte
// and GUInt16 later in this file replace it with a SIMD implementation.
2851 : template <class T>
2852 1256690 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
2853 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2854 : const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
2855 : {
2856 1256690 : GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
2857 : padfWeights, 4, dfRes1, dfRes2,
2858 : dfRes3);
2859 1256690 : }
2860 :
2861 : /************************************************************************/
2862 : /* GDALResampleConvolutionVertical() */
2863 : /************************************************************************/
2864 :
// Weighted sum down one column of a row-major buffer:
//     sum over k in [0, nSrcLineCount) of pChunk[k * nStride] * padfWeights[k]
//
// nStride is the distance, in elements of T, between two consecutive lines.
// The main loop consumes four lines per iteration and feeds two independent
// partial sums (lines 0-1 and lines 2-3 of each group); left-over lines are
// added to the first one. A count <= 0 returns 0.
template <class T>
static inline double
GDALResampleConvolutionVertical(const T *pChunk, size_t nStride,
                                const double *padfWeights, int nSrcLineCount)
{
    double dfAccFirstPair = 0.0;
    double dfAccSecondPair = 0.0;
    int iLine = 0;
    size_t nOffset = 0;  // Always iLine * nStride.

    while (iLine < nSrcLineCount - 3)
    {
        const double *const pWeight = padfWeights + iLine;
        dfAccFirstPair += pChunk[nOffset + 0 * nStride] * pWeight[0];
        dfAccFirstPair += pChunk[nOffset + 1 * nStride] * pWeight[1];
        dfAccSecondPair += pChunk[nOffset + 2 * nStride] * pWeight[2];
        dfAccSecondPair += pChunk[nOffset + 3 * nStride] * pWeight[3];
        iLine += 4;
        nOffset += 4 * nStride;
    }

    while (iLine < nSrcLineCount)
    {
        dfAccFirstPair += pChunk[nOffset] * padfWeights[iLine];
        ++iLine;
        nOffset += nStride;
    }

    return dfAccFirstPair + dfAccSecondPair;
}
2887 :
// Vertical weighted sum for two adjacent columns at once:
//     dfRes1 = sum over k of pChunk[k * nStride]     * padfWeights[k]
//     dfRes2 = sum over k of pChunk[k * nStride + 1] * padfWeights[k]
// for k in [0, nSrcLineCount).
//
// nStride is the distance, in elements of T, between two consecutive lines.
// Each column keeps two partial sums (lines 0-1 and lines 2-3 of every group
// of four); left-over lines go to the first one. A count <= 0 sets both
// results to 0.
template <class T>
static inline void GDALResampleConvolutionVertical_2cols(
    const T *pChunk, size_t nStride, const double *padfWeights,
    int nSrcLineCount, double &dfRes1, double &dfRes2)
{
    double dfCol0A = 0.0, dfCol0B = 0.0;
    double dfCol1A = 0.0, dfCol1B = 0.0;
    int iLine = 0;
    size_t nOffset = 0;  // Always iLine * nStride.

    while (iLine < nSrcLineCount - 3)
    {
        const double dfW0 = padfWeights[iLine + 0];
        const double dfW1 = padfWeights[iLine + 1];
        const double dfW2 = padfWeights[iLine + 2];
        const double dfW3 = padfWeights[iLine + 3];

        dfCol0A += pChunk[nOffset + 0 + 0 * nStride] * dfW0;
        dfCol1A += pChunk[nOffset + 1 + 0 * nStride] * dfW0;
        dfCol0A += pChunk[nOffset + 0 + 1 * nStride] * dfW1;
        dfCol1A += pChunk[nOffset + 1 + 1 * nStride] * dfW1;
        dfCol0B += pChunk[nOffset + 0 + 2 * nStride] * dfW2;
        dfCol1B += pChunk[nOffset + 1 + 2 * nStride] * dfW2;
        dfCol0B += pChunk[nOffset + 0 + 3 * nStride] * dfW3;
        dfCol1B += pChunk[nOffset + 1 + 3 * nStride] * dfW3;

        iLine += 4;
        nOffset += 4 * nStride;
    }

    while (iLine < nSrcLineCount)
    {
        const double dfW = padfWeights[iLine];
        dfCol0A += pChunk[nOffset + 0] * dfW;
        dfCol1A += pChunk[nOffset + 1] * dfW;
        ++iLine;
        nOffset += nStride;
    }

    dfRes1 = dfCol0A + dfCol0B;
    dfRes2 = dfCol1A + dfCol1B;
}
2918 :
2919 : #ifdef USE_SSE2
2920 :
2921 : #ifdef __AVX__
2922 : /************************************************************************/
2923 : /* GDALResampleConvolutionVertical_16cols<T> */
2924 : /************************************************************************/
2925 :
// Vertical weighted sum for 16 adjacent columns at once (variant compiled
// when __AVX__ is defined).
//
// For each column c in [0, 16) this accumulates
//     pChunk[k * nStride + c] * padfWeights[k]   for k in [0, nSrcLineCount)
// and stores the 16 results to afDest[0..15].
//
//   pChunk         First sample of the first line; 16 samples are read per line.
//   nStride        Distance, in elements of T, between consecutive lines.
//   padfWeights    One weight per source line.
//   nSrcLineCount  Number of source lines to combine.
//   afDest         Receives 16 floats.
//
// Four XMMReg4Double accumulators are used, one per group of four columns
// (offsets 0, 4, 8 and 12).
// NOTE(review): Load1ValHighAndLow presumably replicates one weight into
// every lane, and Store4Val(float*) presumably narrows to float - confirm
// in gdalsse_priv.h.
2926 : template <class T>
2927 : static inline void
2928 : GDALResampleConvolutionVertical_16cols(const T *pChunk, size_t nStride,
2929 : const double *padfWeights,
2930 : int nSrcLineCount, float *afDest)
2931 : {
2932 : int i = 0;
2933 : size_t j = 0;
2934 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2935 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2936 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2937 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
// Main loop: four source lines per iteration; j tracks i * nStride.
2938 : for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
2939 : {
2940 : XMMReg4Double w0 =
2941 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2942 : XMMReg4Double w1 =
2943 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2944 : XMMReg4Double w2 =
2945 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2946 : XMMReg4Double w3 =
2947 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2948 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2949 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2950 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
2951 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
2952 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2953 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2954 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
2955 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
2956 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2957 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2958 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
2959 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
2960 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2961 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2962 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
2963 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
2964 : }
// Tail: remaining 0 to 3 source lines, one at a time.
2965 : for (; i < nSrcLineCount; ++i, j += nStride)
2966 : {
2967 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2968 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2969 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2970 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
2971 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
2972 : }
2973 : v_acc0.Store4Val(afDest);
2974 : v_acc1.Store4Val(afDest + 4);
2975 : v_acc2.Store4Val(afDest + 8);
2976 : v_acc3.Store4Val(afDest + 12);
2977 : }
2978 :
// Overload of GDALResampleConvolutionVertical_16cols() taking a double*
// destination. It performs no computation: the body only asserts, since this
// overload is not expected to be called.
// NOTE(review): presumably it exists only so that templated callers
// instantiated with a double work buffer still compile - confirm against
// the caller.
2979 : template <class T>
2980 : static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
2981 : const double *, int,
2982 : double *)
2983 : {
2984 : // Cannot be reached
2985 : CPLAssert(false);
2986 : }
2987 :
2988 : #else
2989 :
2990 : /************************************************************************/
2991 : /* GDALResampleConvolutionVertical_8cols<T> */
2992 : /************************************************************************/
2993 :
// Vertical weighted sum for 8 adjacent columns at once (variant compiled
// when USE_SSE2 is defined and __AVX__ is not).
//
// For each column c in [0, 8) this accumulates
//     pChunk[k * nStride + c] * padfWeights[k]   for k in [0, nSrcLineCount)
// and stores the 8 results to afDest[0..7].
//
//   pChunk         First sample of the first line; 8 samples are read per line.
//   nStride        Distance, in elements of T, between consecutive lines.
//   padfWeights    One weight per source line.
//   nSrcLineCount  Number of source lines to combine.
//   afDest         Receives 8 floats.
//
// Two XMMReg4Double accumulators are used, one per group of four columns
// (offsets 0 and 4).
// NOTE(review): Load1ValHighAndLow presumably replicates one weight into
// every lane, and Store4Val(float*) presumably narrows to float - confirm
// in gdalsse_priv.h.
2994 : template <class T>
2995 : static inline void
2996 24813600 : GDALResampleConvolutionVertical_8cols(const T *pChunk, size_t nStride,
2997 : const double *padfWeights,
2998 : int nSrcLineCount, float *afDest)
2999 : {
3000 24813600 : int i = 0;
3001 24813600 : size_t j = 0;
3002 24813600 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
3003 24821200 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
// Main loop: four source lines per iteration; j tracks i * nStride.
3004 51151600 : for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
3005 : {
3006 26329600 : XMMReg4Double w0 =
3007 26329600 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
3008 26318400 : XMMReg4Double w1 =
3009 26318400 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
3010 26330100 : XMMReg4Double w2 =
3011 26330100 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
3012 26327400 : XMMReg4Double w3 =
3013 26327400 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
3014 26343700 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
3015 26319400 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
3016 26308500 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
3017 26293200 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
3018 26295300 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
3019 26295000 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
3020 26298700 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
3021 26319100 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
3022 : }
// Tail: remaining 0 to 3 source lines, one at a time.
3023 36381700 : for (; i < nSrcLineCount; ++i, j += nStride)
3024 : {
3025 11559600 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
3026 11559600 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
3027 11559600 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
3028 : }
3029 24822000 : v_acc0.Store4Val(afDest);
3030 24805900 : v_acc1.Store4Val(afDest + 4);
3031 24832800 : }
3032 :
// Overload of GDALResampleConvolutionVertical_8cols() taking a double*
// destination. It performs no computation: the body only asserts, since this
// overload is not expected to be called.
// NOTE(review): presumably it exists only so that templated callers
// instantiated with a double work buffer still compile - confirm against
// the caller.
3033 : template <class T>
3034 : static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
3035 : const double *, int,
3036 : double *)
3037 : {
3038 : // Cannot be reached
3039 : CPLAssert(false);
3040 : }
3041 :
3042 : #endif // __AVX__
3043 :
3044 : /************************************************************************/
3045 : /* GDALResampleConvolutionHorizontalSSE2<T> */
3046 : /************************************************************************/
3047 :
// SIMD weighted sum of a run of source pixels:
//     sum over k in [0, nSrcPixelCount) of pChunk[k] * padfWeightsAligned[k]
//
//   pChunk              Source pixels.
//   padfWeightsAligned  One weight per pixel. Read with Load4ValAligned, so
//                       it presumably must meet that load's alignment
//                       requirement - confirm against the caller.
//   nSrcPixelCount      Number of pixels to combine.
//
// The vector loop handles eight pixels per iteration with two accumulators;
// the remaining pixels (up to 7) are added by a scalar loop after the
// horizontal reduction.
3048 : template <class T>
3049 3112884 : static inline double GDALResampleConvolutionHorizontalSSE2(
3050 : const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3051 : {
3052 3112884 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3053 3112453 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3054 3112656 : int i = 0; // Used after for.
3055 3463145 : for (; i < nSrcPixelCount - 7; i += 8)
3056 : {
3057 : // Retrieve the pixel & accumulate
3058 350909 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
3059 350906 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
3060 350909 : const XMMReg4Double v_weight1 =
3061 350909 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3062 350907 : const XMMReg4Double v_weight2 =
3063 350907 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
3064 :
3065 350909 : v_acc1 += v_pixels1 * v_weight1;
3066 350904 : v_acc2 += v_pixels2 * v_weight2;
3067 : }
3068 :
// Merge the two vector accumulators, then reduce to a single scalar.
3069 3112239 : v_acc1 += v_acc2;
3070 :
3071 3112688 : double dfVal = v_acc1.GetHorizSum();
// Scalar tail for the pixels not covered by the vector loop.
3072 10288540 : for (; i < nSrcPixelCount; ++i)
3073 : {
3074 7175910 : dfVal += pChunk[i] * padfWeightsAligned[i];
3075 : }
3076 3112629 : return dfVal;
3077 : }
3078 :
3079 : /************************************************************************/
3080 : /* GDALResampleConvolutionHorizontal<GByte> */
3081 : /************************************************************************/
3082 :
// Explicit specialization for GByte pixels (only when USE_SSE2 is defined):
// forwards to GDALResampleConvolutionHorizontalSSE2() instead of using the
// generic scalar template.
3083 : template <>
3084 2563970 : inline double GDALResampleConvolutionHorizontal<GByte>(
3085 : const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3086 : {
3087 2563970 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
3088 2563970 : nSrcPixelCount);
3089 : }
3090 :
// Explicit specialization for GUInt16 pixels (only when USE_SSE2 is defined):
// forwards to GDALResampleConvolutionHorizontalSSE2() instead of using the
// generic scalar template.
3091 : template <>
3092 547702 : inline double GDALResampleConvolutionHorizontal<GUInt16>(
3093 : const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3094 : {
3095 547702 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
3096 548996 : nSrcPixelCount);
3097 : }
3098 :
3099 : /************************************************************************/
3100 : /* GDALResampleConvolutionHorizontalWithMaskSSE2<T> */
3101 : /************************************************************************/
3102 :
// SIMD masked weighted sum of a run of source pixels.
//
// Every weight is multiplied by the matching mask byte before use. On return:
//     dfVal       = sum of pChunk[k] * (padfWeightsAligned[k] * pabyMask[k])
//     dfWeightSum = sum of              padfWeightsAligned[k] * pabyMask[k]
// for k in [0, nSrcPixelCount).
//
//   pChunk              Source pixels.
//   pabyMask            One mask byte per pixel, used as a numeric factor.
//   padfWeightsAligned  One weight per pixel. Read with Load4ValAligned, so
//                       it presumably must meet that load's alignment
//                       requirement - confirm against the caller.
//   nSrcPixelCount      Number of pixels to combine.
//   dfVal, dfWeightSum  Outputs; both are overwritten.
//
// The vector loop handles four pixels per iteration; the remaining pixels
// (up to 3) are added by a scalar loop after the horizontal reductions.
3103 : template <class T>
3104 7042823 : static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
3105 : const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
3106 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
3107 : {
3108 7042823 : int i = 0; // Used after for.
3109 7042823 : XMMReg4Double v_acc = XMMReg4Double::Zero();
3110 7043103 : XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
3111 19720821 : for (; i < nSrcPixelCount - 3; i += 4)
3112 : {
3113 12675158 : const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
3114 12668458 : const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
3115 12668858 : XMMReg4Double v_weight =
3116 12668858 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
// Apply the mask to the weights first so both sums see the same factor.
3117 12671858 : v_weight *= v_mask;
3118 12669458 : v_acc += v_pixels * v_weight;
3119 12671158 : v_acc_weight += v_weight;
3120 : }
3121 :
3122 7045673 : dfVal = v_acc.GetHorizSum();
3123 7054993 : dfWeightSum = v_acc_weight.GetHorizSum();
// Scalar tail for the pixels not covered by the vector loop.
3124 7287433 : for (; i < nSrcPixelCount; ++i)
3125 : {
3126 231090 : const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
3127 231090 : dfVal += pChunk[i] * dfWeight;
3128 231090 : dfWeightSum += dfWeight;
3129 : }
3130 7056343 : }
3131 :
3132 : /************************************************************************/
3133 : /* GDALResampleConvolutionHorizontalWithMask<GByte> */
3134 : /************************************************************************/
3135 :
// Explicit specialization for GByte pixels (only when USE_SSE2 is defined):
// forwards to GDALResampleConvolutionHorizontalWithMaskSSE2() instead of
// using the generic scalar template.
3136 : template <>
3137 7051180 : inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
3138 : const GByte *pChunk, const GByte *pabyMask,
3139 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
3140 : double &dfWeightSum)
3141 : {
3142 7051180 : GDALResampleConvolutionHorizontalWithMaskSSE2(
3143 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
3144 : dfWeightSum);
3145 7057360 : }
3146 :
// Explicit specialization for GUInt16 pixels (only when USE_SSE2 is defined):
// forwards to GDALResampleConvolutionHorizontalWithMaskSSE2() instead of
// using the generic scalar template.
3147 : template <>
3148 63 : inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
3149 : const GUInt16 *pChunk, const GByte *pabyMask,
3150 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
3151 : double &dfWeightSum)
3152 : {
3153 63 : GDALResampleConvolutionHorizontalWithMaskSSE2(
3154 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
3155 : dfWeightSum);
3156 63 : }
3157 :
3158 : /************************************************************************/
3159 : /* GDALResampleConvolutionHorizontal_3rows_SSE2<T> */
3160 : /************************************************************************/
3161 :
// SIMD weighted sum for three rows at once, sharing one weight vector:
//     dfResN = sum over k in [0, nSrcPixelCount) of
//              pChunkRowN[k] * padfWeightsAligned[k]
//
//   pChunkRow1..3       The three source rows.
//   padfWeightsAligned  One weight per pixel. Read with Load4ValAligned, so
//                       it presumably must meet that load's alignment
//                       requirement - confirm against the caller.
//   nSrcPixelCount      Number of pixels to combine per row.
//   dfRes1..3           Outputs; all three are overwritten.
//
// The vector loop handles eight pixels per iteration, loading each pair of
// weight vectors once and reusing it for the three rows. The remaining
// pixels (up to 7) are added by a scalar loop after the horizontal
// reductions.
3162 : template <class T>
3163 32036886 : static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
3164 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3165 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3166 : double &dfRes2, double &dfRes3)
3167 : {
3168 32036886 : XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
3169 32022986 : v_acc2 = XMMReg4Double::Zero(),
3170 32031986 : v_acc3 = XMMReg4Double::Zero();
3171 32042886 : int i = 0;
3172 63897056 : for (; i < nSrcPixelCount - 7; i += 8)
3173 : {
3174 : // Retrieve the pixel & accumulate.
3175 31881770 : XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3176 31880970 : XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
3177 31888770 : const XMMReg4Double v_weight1 =
3178 31888770 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3179 31842970 : const XMMReg4Double v_weight2 =
3180 31842970 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
3181 :
3182 31857570 : v_acc1 += v_pixels1 * v_weight1;
3183 31843070 : v_acc1 += v_pixels2 * v_weight2;
3184 :
// Row 2, reusing the pixel registers and the weights loaded above.
3185 31844770 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3186 31855970 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
3187 31868870 : v_acc2 += v_pixels1 * v_weight1;
3188 31836070 : v_acc2 += v_pixels2 * v_weight2;
3189 :
// Row 3.
3190 31841370 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3191 31871170 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
3192 31871370 : v_acc3 += v_pixels1 * v_weight1;
3193 31832870 : v_acc3 += v_pixels2 * v_weight2;
3194 : }
3195 :
3196 32015286 : dfRes1 = v_acc1.GetHorizSum();
3197 32038086 : dfRes2 = v_acc2.GetHorizSum();
3198 32040986 : dfRes3 = v_acc3.GetHorizSum();
// Scalar tail for the pixels not covered by the vector loop.
3199 44201152 : for (; i < nSrcPixelCount; ++i)
3200 : {
3201 12156866 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3202 12156866 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3203 12156866 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3204 : }
3205 32044286 : }
3206 :
3207 : /************************************************************************/
3208 : /* GDALResampleConvolutionHorizontal_3rows<GByte> */
3209 : /************************************************************************/
3210 :
// Explicit specialization for GByte pixels (only when USE_SSE2 is defined):
// forwards to GDALResampleConvolutionHorizontal_3rows_SSE2() instead of using
// the generic scalar template.
3211 : template <>
3212 32055500 : inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
3213 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3214 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3215 : double &dfRes2, double &dfRes3)
3216 : {
3217 32055500 : GDALResampleConvolutionHorizontal_3rows_SSE2(
3218 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3219 : dfRes1, dfRes2, dfRes3);
3220 32020000 : }
3221 :
// Explicit specialization for GUInt16 pixels (only when USE_SSE2 is defined):
// forwards to GDALResampleConvolutionHorizontal_3rows_SSE2() instead of using
// the generic scalar template.
3222 : template <>
3223 86 : inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
3224 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3225 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3226 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3227 : {
3228 86 : GDALResampleConvolutionHorizontal_3rows_SSE2(
3229 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3230 : dfRes1, dfRes2, dfRes3);
3231 86 : }
3232 :
3233 : /************************************************************************/
3234 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T> */
3235 : /************************************************************************/
3236 :
// SIMD weighted sum for three rows at once, sharing one weight vector, using
// a four-pixel vector step instead of the eight-pixel step of
// GDALResampleConvolutionHorizontal_3rows_SSE2():
//     dfResN = sum over k in [0, nSrcPixelCount) of
//              pChunkRowN[k] * padfWeightsAligned[k]
//
//   pChunkRow1..3       The three source rows.
//   padfWeightsAligned  One weight per pixel. Read with Load4ValAligned, so
//                       it presumably must meet that load's alignment
//                       requirement - confirm against the caller.
//   nSrcPixelCount      Number of pixels to combine per row. The name
//                       suggests callers use this for counts below 8 -
//                       confirm against the caller; the loops themselves
//                       accept any count.
//   dfRes1..3           Outputs; all three are overwritten.
3237 : template <class T>
3238 7122126 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3239 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3240 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3241 : double &dfRes2, double &dfRes3)
3242 : {
3243 7122126 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3244 7111515 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3245 7113506 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
3246 7116117 : int i = 0; // Used after for.
3247 16899139 : for (; i < nSrcPixelCount - 3; i += 4)
3248 : {
3249 : // Retrieve the pixel & accumulate.
3250 9791660 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3251 9799940 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3252 9776680 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3253 9812570 : const XMMReg4Double v_weight =
3254 9812570 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3255 :
3256 9778350 : v_acc1 += v_pixels1 * v_weight;
3257 9753840 : v_acc2 += v_pixels2 * v_weight;
3258 9762090 : v_acc3 += v_pixels3 * v_weight;
3259 : }
3260 :
3261 7107469 : dfRes1 = v_acc1.GetHorizSum();
3262 7114146 : dfRes2 = v_acc2.GetHorizSum();
3263 7112909 : dfRes3 = v_acc3.GetHorizSum();
3264 :
// Scalar tail for the remaining 0 to 3 pixels.
3265 11551389 : for (; i < nSrcPixelCount; ++i)
3266 : {
3267 4438504 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3268 4438504 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3269 4438504 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3270 : }
3271 7112865 : }
3272 :
3273 : /************************************************************************/
3274 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte> */
3275 : /************************************************************************/
3276 :
// Explicit specialization for GByte pixels (only when USE_SSE2 is defined):
// forwards to GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2()
// instead of using the generic template.
3277 : template <>
3278 7057950 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
3279 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3280 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3281 : double &dfRes2, double &dfRes3)
3282 : {
3283 7057950 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3284 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3285 : dfRes1, dfRes2, dfRes3);
3286 7045740 : }
3287 :
// Explicit specialization for GUInt16 pixels (only when USE_SSE2 is defined):
// forwards to GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2()
// instead of using the generic template.
3288 : template <>
3289 67039 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
3290 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3291 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3292 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3293 : {
3294 67039 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3295 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3296 : dfRes1, dfRes2, dfRes3);
3297 67109 : }
3298 :
3299 : /************************************************************************/
3300 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T> */
3301 : /************************************************************************/
3302 :
// SIMD weighted sum for three rows of exactly four pixels each, sharing one
// vector of four weights:
//     dfResN = pChunkRowN[0..3] . padfWeightsAligned[0..3]
//
//   pChunkRow1..3       The three source rows; four samples are read from each.
//   padfWeightsAligned  Four weights. Read with Load4ValAligned, so it
//                       presumably must meet that load's alignment
//                       requirement - confirm against the caller.
//   dfRes1..3           Outputs; all three are overwritten.
//
// There is no loop and no scalar tail: one vector multiply per row followed
// by a horizontal reduction.
3303 : template <class T>
3304 13862460 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3305 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3306 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3307 : double &dfRes3)
3308 : {
3309 13862460 : const XMMReg4Double v_weight =
3310 : XMMReg4Double::Load4ValAligned(padfWeightsAligned);
3311 :
3312 : // Retrieve the pixel & accumulate.
3313 13920490 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
3314 13938010 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
3315 13867750 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
3316 :
3317 13938240 : XMMReg4Double v_acc1 = v_pixels1 * v_weight;
3318 13906500 : XMMReg4Double v_acc2 = v_pixels2 * v_weight;
3319 13902380 : XMMReg4Double v_acc3 = v_pixels3 * v_weight;
3320 :
3321 13899710 : dfRes1 = v_acc1.GetHorizSum();
3322 13895130 : dfRes2 = v_acc2.GetHorizSum();
3323 13928400 : dfRes3 = v_acc3.GetHorizSum();
3324 13910420 : }
3325 :
3326 : /************************************************************************/
3327 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte> */
3328 : /************************************************************************/
3329 :
// Explicit specialization for GByte pixels (only when USE_SSE2 is defined):
// forwards to GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2()
// instead of using the generic template.
3330 : template <>
3331 8268080 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
3332 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3333 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3334 : double &dfRes3)
3335 : {
3336 8268080 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3337 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3338 : dfRes3);
3339 8242110 : }
3340 :
// Explicit specialization for GUInt16 pixels (only when USE_SSE2 is defined):
// forwards to GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2()
// instead of using the generic template.
3341 : template <>
3342 5680720 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
3343 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3344 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
3345 : double &dfRes2, double &dfRes3)
3346 : {
3347 5680720 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3348 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3349 : dfRes3);
3350 5651960 : }
3351 :
3352 : #endif // USE_SSE2
3353 :
3354 : /************************************************************************/
3355 : /* GDALResampleChunk_Convolution() */
3356 : /************************************************************************/
3357 :
3358 : template <class T, class Twork, GDALDataType eWrkDataType,
3359 : bool bKernelWithNegativeWeights, bool bNeedRescale>
3360 5030 : static CPLErr GDALResampleChunk_ConvolutionT(
3361 : const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
3362 : FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
3363 : int nKernelRadius, float fMaxVal)
3364 :
3365 : {
3366 5030 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
3367 5030 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
3368 5030 : const double dfSrcXDelta = args.dfSrcXDelta;
3369 5030 : const double dfSrcYDelta = args.dfSrcYDelta;
3370 5030 : constexpr int nBands = 1;
3371 5030 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
3372 5030 : const int nChunkXOff = args.nChunkXOff;
3373 5030 : const int nChunkXSize = args.nChunkXSize;
3374 5030 : const int nChunkYOff = args.nChunkYOff;
3375 5030 : const int nChunkYSize = args.nChunkYSize;
3376 5030 : const int nDstXOff = args.nDstXOff;
3377 5030 : const int nDstXOff2 = args.nDstXOff2;
3378 5030 : const int nDstYOff = args.nDstYOff;
3379 5030 : const int nDstYOff2 = args.nDstYOff2;
3380 5030 : const bool bHasNoData = args.bHasNoData;
3381 5030 : double dfNoDataValue = args.dfNoDataValue;
3382 :
3383 5030 : if (!bHasNoData)
3384 4955 : dfNoDataValue = 0.0;
3385 5030 : const auto dstDataType = args.eOvrDataType;
3386 5030 : const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
3387 5030 : const double dfReplacementVal =
3388 75 : bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
3389 : : dfNoDataValue;
3390 : // cppcheck-suppress unreadVariable
3391 5030 : const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
3392 5021 : const bool bNoDataValueInt64Valid =
3393 5028 : isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
3394 5021 : const auto nNodataValueInt64 =
3395 : bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
3396 5021 : constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
3397 :
3398 : // TODO: we should have some generic function to do this.
3399 5021 : Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
3400 5021 : Twork fDstMax = cpl::NumericLimits<Twork>::max();
3401 5021 : if (dstDataType == GDT_Byte)
3402 : {
3403 4164 : fDstMin = std::numeric_limits<GByte>::min();
3404 4161 : fDstMax = std::numeric_limits<GByte>::max();
3405 : }
3406 859 : else if (dstDataType == GDT_Int8)
3407 : {
3408 1 : fDstMin = std::numeric_limits<GInt8>::min();
3409 1 : fDstMax = std::numeric_limits<GInt8>::max();
3410 : }
3411 858 : else if (dstDataType == GDT_UInt16)
3412 : {
3413 396 : fDstMin = std::numeric_limits<GUInt16>::min();
3414 393 : fDstMax = std::numeric_limits<GUInt16>::max();
3415 : }
3416 465 : else if (dstDataType == GDT_Int16)
3417 : {
3418 291 : fDstMin = std::numeric_limits<GInt16>::min();
3419 291 : fDstMax = std::numeric_limits<GInt16>::max();
3420 : }
3421 174 : else if (dstDataType == GDT_UInt32)
3422 : {
3423 1 : fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
3424 1 : fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
3425 : }
3426 173 : else if (dstDataType == GDT_Int32)
3427 : {
3428 : // cppcheck-suppress unreadVariable
3429 2 : fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
3430 : // cppcheck-suppress unreadVariable
3431 2 : fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
3432 : }
3433 171 : else if (dstDataType == GDT_UInt64)
3434 : {
3435 : // cppcheck-suppress unreadVariable
3436 1 : fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
3437 : // cppcheck-suppress unreadVariable
3438 : // (1 << 64) - 2048: largest uint64 value a double can hold
3439 1 : fDstMax = static_cast<Twork>(18446744073709549568ULL);
3440 : }
3441 170 : else if (dstDataType == GDT_Int64)
3442 : {
3443 : // cppcheck-suppress unreadVariable
3444 1 : fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
3445 : // cppcheck-suppress unreadVariable
3446 : // (1 << 63) - 1024: largest int64 that a double can hold
3447 1 : fDstMax = static_cast<Twork>(9223372036854774784LL);
3448 : }
3449 :
3450 37229031 : auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
3451 : bNoDataValueInt64Valid, nNodataValueInt64,
3452 : dfNoDataValue, dfReplacementVal](Twork fVal)
3453 : {
3454 16155000 : if (!bHasNoData)
3455 11940300 : return fVal;
3456 :
3457 : // Clamp value before comparing to nodata: this is only needed for
3458 : // kernels with negative weights (Lanczos)
3459 4214720 : Twork fClamped = fVal;
3460 4214720 : if (fClamped < fDstMin)
3461 15998 : fClamped = fDstMin;
3462 4198730 : else if (fClamped > fDstMax)
3463 16406 : fClamped = fDstMax;
3464 4214720 : if (isIntegerDT)
3465 : {
3466 4184520 : if (bNoDataValueInt64Valid)
3467 : {
3468 4192860 : const double fClampedRounded = double(std::round(fClamped));
3469 8426020 : if (fClampedRounded >=
3470 : static_cast<double>(static_cast<Twork>(
3471 8425320 : std::numeric_limits<int64_t>::min())) &&
3472 : fClampedRounded <= static_cast<double>(static_cast<Twork>(
3473 8424440 : 9223372036854774784LL)) &&
3474 4211640 : nNodataValueInt64 ==
3475 4212460 : static_cast<GInt64>(std::round(fClamped)))
3476 : {
3477 : // Do not use the nodata value
3478 14435 : return static_cast<Twork>(dfReplacementVal);
3479 : }
3480 : }
3481 : }
3482 30202 : else if (dfNoDataValue == static_cast<double>(fClamped))
3483 : {
3484 : // Do not use the nodata value
3485 1 : return static_cast<Twork>(dfReplacementVal);
3486 : }
3487 4220020 : return fClamped;
3488 : };
3489 :
3490 : /* -------------------------------------------------------------------- */
3491 : /* Allocate work buffers. */
3492 : /* -------------------------------------------------------------------- */
3493 5024 : const int nDstXSize = nDstXOff2 - nDstXOff;
3494 5024 : Twork *pafWrkScanline = nullptr;
3495 5024 : if (dstDataType != eWrkDataType)
3496 : {
3497 : pafWrkScanline =
3498 4856 : static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
3499 4861 : if (pafWrkScanline == nullptr)
3500 0 : return CE_Failure;
3501 : }
3502 :
3503 5029 : const double dfXScale = 1.0 / dfXRatioDstToSrc;
3504 5029 : const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
3505 5029 : const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
3506 5029 : const double dfYScale = 1.0 / dfYRatioDstToSrc;
3507 5029 : const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
3508 5029 : const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
3509 :
3510 : // Temporary array to store result of horizontal filter.
3511 : double *const padfHorizontalFiltered = static_cast<double *>(
3512 5029 : VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
3513 :
3514 : // To store convolution coefficients.
3515 : double *const padfWeights =
3516 5032 : static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
3517 : static_cast<int>(
3518 : 2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) + 0.5) *
3519 : sizeof(double)));
3520 :
3521 5031 : GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
3522 5031 : if (pabyChunkNodataMask)
3523 : pabyChunkNodataMaskHorizontalFiltered =
3524 462 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
3525 5031 : if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
3526 462 : (pabyChunkNodataMask != nullptr &&
3527 : pabyChunkNodataMaskHorizontalFiltered == nullptr))
3528 : {
3529 0 : VSIFree(pafWrkScanline);
3530 0 : VSIFree(padfHorizontalFiltered);
3531 0 : VSIFreeAligned(padfWeights);
3532 0 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3533 0 : return CE_Failure;
3534 : }
3535 :
3536 : /* ==================================================================== */
3537 : /* First pass: horizontal filter */
3538 : /* ==================================================================== */
3539 5031 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
3540 : #ifdef USE_SSE2
3541 5031 : const bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
3542 : #endif
3543 3025812 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
3544 : {
3545 3020784 : const double dfSrcPixel =
3546 3020784 : (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
3547 3020784 : int nSrcPixelStart =
3548 3020784 : static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
3549 3020784 : if (nSrcPixelStart < nChunkXOff)
3550 57238 : nSrcPixelStart = nChunkXOff;
3551 3020784 : int nSrcPixelStop =
3552 3020784 : static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
3553 3020784 : if (nSrcPixelStop > nChunkRightXOff)
3554 57258 : nSrcPixelStop = nChunkRightXOff;
3555 : #if 0
3556 : if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
3557 : {
3558 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3559 : }
3560 : if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
3561 : {
3562 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3563 : }
3564 : #endif
3565 3020784 : const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
3566 3020784 : double dfWeightSum = 0.0;
3567 :
3568 : // Compute convolution coefficients.
3569 3020784 : int nSrcPixel = nSrcPixelStart;
3570 3020784 : double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
3571 4375084 : for (; nSrcPixel < nSrcPixelStop - 3; nSrcPixel += 4)
3572 : {
3573 1354097 : padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
3574 1354097 : dfX += dfXScaleWeight;
3575 1354097 : padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
3576 1354097 : dfX += dfXScaleWeight;
3577 1354097 : padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
3578 1354097 : dfX += dfXScaleWeight;
3579 1354097 : padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
3580 1354097 : dfX += dfXScaleWeight;
3581 1354302 : dfWeightSum +=
3582 1354097 : pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
3583 : }
3584 7011598 : for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
3585 : {
3586 3990747 : const double dfWeight = pfnFilterFunc(dfX);
3587 3990609 : padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3588 3990609 : dfWeightSum += dfWeight;
3589 : }
3590 :
3591 3020851 : const int nHeight = nChunkYSize * nBands;
3592 3020851 : if (pabyChunkNodataMask == nullptr)
3593 : {
3594 : // For floating-point data types, we must scale down a bit values
3595 : // if input values are close to +/- std::numeric_limits<T>::max()
3596 : #ifdef OLD_CPPCHECK
3597 : constexpr double mulFactor = 1;
3598 : #else
3599 2933099 : constexpr double mulFactor =
3600 : (bNeedRescale &&
3601 : (std::is_same_v<T, float> || std::is_same_v<T, double>))
3602 : ? 2
3603 : : 1;
3604 : #endif
3605 :
3606 2933099 : if (dfWeightSum != 0)
3607 : {
3608 2933104 : const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
3609 11692164 : for (int i = 0; i < nSrcPixelCount; ++i)
3610 : {
3611 8759063 : padfWeights[i] *= dfInvWeightSum;
3612 : }
3613 : }
3614 :
3615 166101460 : const auto ScaleValue = [
3616 : #ifdef _MSC_VER
3617 : mulFactor
3618 : #endif
3619 : ](double dfVal, [[maybe_unused]] const T *inputValues,
3620 : [[maybe_unused]] int nInputValues)
3621 : {
3622 166101000 : constexpr bool isFloat =
3623 : std::is_same_v<T, float> || std::is_same_v<T, double>;
3624 : if constexpr (isFloat)
3625 : {
3626 4070140 : if (std::isfinite(dfVal))
3627 : {
3628 : return std::clamp(dfVal,
3629 12204800 : -std::numeric_limits<double>::max() /
3630 : mulFactor,
3631 4068260 : std::numeric_limits<double>::max() /
3632 4068260 : mulFactor) *
3633 4068260 : mulFactor;
3634 : }
3635 : else if constexpr (bKernelWithNegativeWeights)
3636 : {
3637 936 : if (std::isnan(dfVal))
3638 : {
3639 : // Either one of the input value is NaN or they are +/-Inf
3640 936 : const bool isPositive = inputValues[0] >= 0;
3641 6008 : for (int i = 0; i < nInputValues; ++i)
3642 : {
3643 5384 : if (std::isnan(inputValues[i]))
3644 312 : return dfVal;
3645 : // cppcheck-suppress knownConditionTrueFalse
3646 5072 : if ((inputValues[i] >= 0) != isPositive)
3647 0 : return dfVal;
3648 : }
3649 : // All values are positive or negative infinity
3650 624 : return static_cast<double>(inputValues[0]);
3651 : }
3652 : }
3653 : }
3654 162032000 : return dfVal;
3655 : };
3656 :
3657 2933099 : int iSrcLineOff = 0;
3658 : #ifdef USE_SSE2
3659 2933099 : if (nSrcPixelCount == 4)
3660 : {
3661 15788535 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3662 : {
3663 15169866 : const size_t j =
3664 15169866 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3665 15169866 : (nSrcPixelStart - nChunkXOff);
3666 15169866 : double dfVal1 = 0.0;
3667 15169866 : double dfVal2 = 0.0;
3668 15169866 : double dfVal3 = 0.0;
3669 15169866 : GDALResampleConvolutionHorizontalPixelCount4_3rows(
3670 15169866 : pChunk + j, pChunk + j + nChunkXSize,
3671 15169866 : pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
3672 : dfVal2, dfVal3);
3673 30346746 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3674 15159066 : nDstXSize +
3675 15159066 : iDstPixel - nDstXOff] =
3676 15159066 : ScaleValue(dfVal1, pChunk + j, 4);
3677 30360746 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3678 15187746 : 1) *
3679 15187746 : nDstXSize +
3680 15187746 : iDstPixel - nDstXOff] =
3681 15187746 : ScaleValue(dfVal2, pChunk + j + nChunkXSize, 4);
3682 15175245 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3683 15172966 : 2) *
3684 15172966 : nDstXSize +
3685 15172966 : iDstPixel - nDstXOff] =
3686 15172966 : ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize, 4);
3687 : }
3688 : }
3689 2319407 : else if (bSrcPixelCountLess8)
3690 : {
3691 9200102 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3692 : {
3693 7137561 : const size_t j =
3694 7137561 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3695 7137561 : (nSrcPixelStart - nChunkXOff);
3696 7137561 : double dfVal1 = 0.0;
3697 7137561 : double dfVal2 = 0.0;
3698 7137561 : double dfVal3 = 0.0;
3699 7137561 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
3700 7137561 : pChunk + j, pChunk + j + nChunkXSize,
3701 7137561 : pChunk + j + 2 * nChunkXSize, padfWeights,
3702 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3703 14268458 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3704 7134556 : nDstXSize +
3705 7134556 : iDstPixel - nDstXOff] =
3706 7134556 : ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
3707 14268527 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3708 7133902 : 1) *
3709 7133902 : nDstXSize +
3710 7133902 : iDstPixel - nDstXOff] =
3711 7133902 : ScaleValue(dfVal2, pChunk + j + nChunkXSize,
3712 : nSrcPixelCount);
3713 7135683 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3714 7134625 : 2) *
3715 7134625 : nDstXSize +
3716 7134625 : iDstPixel - nDstXOff] =
3717 7134625 : ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
3718 : nSrcPixelCount);
3719 : }
3720 : }
3721 : else
3722 : #endif
3723 : {
3724 32386265 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3725 : {
3726 32112644 : const size_t j =
3727 32112644 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3728 32112644 : (nSrcPixelStart - nChunkXOff);
3729 32112644 : double dfVal1 = 0.0;
3730 32112644 : double dfVal2 = 0.0;
3731 32112644 : double dfVal3 = 0.0;
3732 32112644 : GDALResampleConvolutionHorizontal_3rows(
3733 32112644 : pChunk + j, pChunk + j + nChunkXSize,
3734 32112644 : pChunk + j + 2 * nChunkXSize, padfWeights,
3735 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3736 64216398 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3737 32110144 : nDstXSize +
3738 32110144 : iDstPixel - nDstXOff] =
3739 32110144 : ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
3740 64229498 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3741 32106244 : 1) *
3742 32106244 : nDstXSize +
3743 32106244 : iDstPixel - nDstXOff] =
3744 32106244 : ScaleValue(dfVal2, pChunk + j + nChunkXSize,
3745 : nSrcPixelCount);
3746 32196480 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3747 32123244 : 2) *
3748 32123244 : nDstXSize +
3749 32123244 : iDstPixel - nDstXOff] =
3750 32123244 : ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
3751 : nSrcPixelCount);
3752 : }
3753 : }
3754 6091994 : for (; iSrcLineOff < nHeight; ++iSrcLineOff)
3755 : {
3756 3158930 : const size_t j =
3757 3158930 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3758 3158930 : (nSrcPixelStart - nChunkXOff);
3759 3707934 : const double dfVal = GDALResampleConvolutionHorizontal(
3760 594956 : pChunk + j, padfWeights, nSrcPixelCount);
3761 3159492 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3762 3159019 : nDstXSize +
3763 3159019 : iDstPixel - nDstXOff] =
3764 3159019 : ScaleValue(dfVal, pChunk + j, nSrcPixelCount);
3765 : }
3766 : }
3767 : else
3768 : {
3769 20501636 : for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
3770 : {
3771 20413730 : const size_t j =
3772 20413730 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3773 20413730 : (nSrcPixelStart - nChunkXOff);
3774 :
3775 : if (bKernelWithNegativeWeights)
3776 : {
3777 19888612 : int nConsecutiveValid = 0;
3778 19888612 : int nMaxConsecutiveValid = 0;
3779 181872458 : for (int k = 0; k < nSrcPixelCount; k++)
3780 : {
3781 161985146 : if (pabyChunkNodataMask[j + k])
3782 48863253 : nConsecutiveValid++;
3783 113121793 : else if (nConsecutiveValid)
3784 : {
3785 106953 : nMaxConsecutiveValid = std::max(
3786 107790 : nMaxConsecutiveValid, nConsecutiveValid);
3787 106953 : nConsecutiveValid = 0;
3788 : }
3789 : }
3790 19888212 : nMaxConsecutiveValid =
3791 19887812 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
3792 19888212 : if (nMaxConsecutiveValid < nSrcPixelCount / 2)
3793 : {
3794 13314907 : const size_t nTempOffset =
3795 13314907 : static_cast<size_t>(iSrcLineOff) * nDstXSize +
3796 13314907 : iDstPixel - nDstXOff;
3797 13314907 : padfHorizontalFiltered[nTempOffset] = 0.0;
3798 13314907 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3799 13314907 : continue;
3800 : }
3801 : }
3802 :
3803 7098433 : double dfVal = 0.0;
3804 7098433 : GDALResampleConvolutionHorizontalWithMask(
3805 44639 : pChunk + j, pabyChunkNodataMask + j, padfWeights,
3806 : nSrcPixelCount, dfVal, dfWeightSum);
3807 7098835 : const size_t nTempOffset =
3808 7098835 : static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
3809 7098835 : nDstXOff;
3810 7098835 : if (dfWeightSum > 0.0)
3811 : {
3812 7043358 : padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
3813 7043358 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
3814 : }
3815 : else
3816 : {
3817 55532 : padfHorizontalFiltered[nTempOffset] = 0.0;
3818 55532 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3819 : }
3820 : }
3821 : }
3822 : }
3823 :
3824 : /* ==================================================================== */
3825 : /* Second pass: vertical filter */
3826 : /* ==================================================================== */
3827 5032 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3828 :
3829 376391 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3830 : {
3831 371359 : Twork *const pafDstScanline =
3832 : pafWrkScanline
3833 371359 : ? pafWrkScanline
3834 8797 : : static_cast<Twork *>(pDstBuffer) +
3835 8797 : static_cast<size_t>(iDstLine - nDstYOff) * nDstXSize;
3836 :
3837 371359 : const double dfSrcLine =
3838 371359 : (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
3839 371359 : int nSrcLineStart =
3840 371359 : static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
3841 371359 : int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
3842 371359 : if (nSrcLineStart < nChunkYOff)
3843 3361 : nSrcLineStart = nChunkYOff;
3844 371359 : if (nSrcLineStop > nChunkBottomYOff)
3845 3405 : nSrcLineStop = nChunkBottomYOff;
3846 : #if 0
3847 : if( nSrcLineStart < nChunkYOff &&
3848 : nChunkYOff > 0 )
3849 : {
3850 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3851 : }
3852 : if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3853 : {
3854 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3855 : }
3856 : #endif
3857 371359 : const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
3858 371359 : double dfWeightSum = 0.0;
3859 :
3860 : // Compute convolution coefficients.
3861 371359 : int nSrcLine = nSrcLineStart; // Used after for.
3862 371359 : double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
3863 943548 : for (; nSrcLine < nSrcLineStop - 3;
3864 572189 : nSrcLine += 4, dfY += 4 * dfYScaleWeight)
3865 : {
3866 572191 : padfWeights[nSrcLine - nSrcLineStart] = dfY;
3867 572191 : padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
3868 572191 : padfWeights[nSrcLine + 2 - nSrcLineStart] =
3869 572191 : dfY + 2 * dfYScaleWeight;
3870 572191 : padfWeights[nSrcLine + 3 - nSrcLineStart] =
3871 572191 : dfY + 3 * dfYScaleWeight;
3872 572189 : dfWeightSum +=
3873 572191 : pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
3874 : }
3875 409056 : for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
3876 : {
3877 37706 : const double dfWeight = pfnFilterFunc(dfY);
3878 37699 : padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
3879 37699 : dfWeightSum += dfWeight;
3880 : }
3881 :
3882 371350 : if (pabyChunkNodataMask == nullptr)
3883 : {
3884 : // For floating-point data types, we must scale down a bit values
3885 : // if input values are close to +/- std::numeric_limits<T>::max()
3886 : #ifdef OLD_CPPCHECK
3887 : constexpr double mulFactor = 1;
3888 : #else
3889 332360 : constexpr double mulFactor =
3890 : (bNeedRescale &&
3891 : (std::is_same_v<T, float> || std::is_same_v<T, double>))
3892 : ? 2
3893 : : 1;
3894 : #endif
3895 :
3896 332360 : if (dfWeightSum != 0)
3897 : {
3898 332360 : const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
3899 2386153 : for (int i = 0; i < nSrcLineCount; ++i)
3900 2053785 : padfWeights[i] *= dfInvWeightSum;
3901 : }
3902 :
3903 332360 : int iFilteredPixelOff = 0; // Used after for.
3904 : // j used after for.
3905 332360 : size_t j =
3906 332360 : (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
3907 : #ifdef USE_SSE2
3908 : if constexpr ((!bNeedRescale ||
3909 : !std::is_same_v<T, float>)&&eWrkDataType ==
3910 : GDT_Float32)
3911 : {
3912 : #ifdef __AVX__
3913 : for (; iFilteredPixelOff < nDstXSize - 15;
3914 : iFilteredPixelOff += 16, j += 16)
3915 : {
3916 : GDALResampleConvolutionVertical_16cols(
3917 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3918 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3919 : if (bHasNoData)
3920 : {
3921 : for (int k = 0; k < 16; k++)
3922 : {
3923 : pafDstScanline[iFilteredPixelOff + k] =
3924 : replaceValIfNodata(
3925 : pafDstScanline[iFilteredPixelOff + k]);
3926 : }
3927 : }
3928 : }
3929 : #else
3930 25148897 : for (; iFilteredPixelOff < nDstXSize - 7;
3931 : iFilteredPixelOff += 8, j += 8)
3932 : {
3933 24854008 : GDALResampleConvolutionVertical_8cols(
3934 24854008 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3935 24854008 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3936 24825338 : if (bHasNoData)
3937 : {
3938 123192 : for (int k = 0; k < 8; k++)
3939 : {
3940 109504 : pafDstScanline[iFilteredPixelOff + k] =
3941 109504 : replaceValIfNodata(
3942 109504 : pafDstScanline[iFilteredPixelOff + k]);
3943 : }
3944 : }
3945 : }
3946 : #endif
3947 :
3948 758986 : for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
3949 : {
3950 464121 : const Twork fVal =
3951 464085 : static_cast<Twork>(GDALResampleConvolutionVertical(
3952 464085 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3953 : nSrcLineCount));
3954 464076 : pafDstScanline[iFilteredPixelOff] =
3955 464121 : replaceValIfNodata(fVal);
3956 : }
3957 : }
3958 : else
3959 : #endif
3960 : {
3961 5862642 : const auto ScaleValue = [
3962 : #ifdef _MSC_VER
3963 : mulFactor
3964 : #endif
3965 : ](double dfVal, [[maybe_unused]] const double *inputValues,
3966 : [[maybe_unused]] int nStride,
3967 : [[maybe_unused]] int nInputValues)
3968 : {
3969 5862640 : constexpr bool isFloat =
3970 : std::is_same_v<T, float> || std::is_same_v<T, double>;
3971 : if constexpr (isFloat)
3972 : {
3973 5862640 : if (std::isfinite(dfVal))
3974 : {
3975 : return std::clamp(
3976 : dfVal,
3977 : static_cast<double>(
3978 17585400 : -std::numeric_limits<Twork>::max()) /
3979 : mulFactor,
3980 : static_cast<double>(
3981 5861800 : std::numeric_limits<Twork>::max()) /
3982 5861800 : mulFactor) *
3983 5861800 : mulFactor;
3984 : }
3985 : else if constexpr (bKernelWithNegativeWeights)
3986 : {
3987 480 : if (std::isnan(dfVal))
3988 : {
3989 : // Either one of the input value is NaN or they are +/-Inf
3990 480 : const bool isPositive = inputValues[0] >= 0;
3991 2520 : for (int i = 0; i < nInputValues; ++i)
3992 : {
3993 2200 : if (std::isnan(inputValues[i * nStride]))
3994 160 : return dfVal;
3995 : // cppcheck-suppress knownConditionTrueFalse
3996 2040 : if ((inputValues[i] >= 0) != isPositive)
3997 0 : return dfVal;
3998 : }
3999 : // All values are positive or negative infinity
4000 320 : return inputValues[0];
4001 : }
4002 : }
4003 : }
4004 :
4005 360 : return dfVal;
4006 : };
4007 :
4008 2939422 : for (; iFilteredPixelOff < nDstXSize - 1;
4009 : iFilteredPixelOff += 2, j += 2)
4010 : {
4011 2930610 : double dfVal1 = 0.0;
4012 2930610 : double dfVal2 = 0.0;
4013 2930610 : GDALResampleConvolutionVertical_2cols(
4014 2930610 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4015 : nSrcLineCount, dfVal1, dfVal2);
4016 5861220 : pafDstScanline[iFilteredPixelOff] =
4017 2930610 : replaceValIfNodata(static_cast<Twork>(
4018 2930610 : ScaleValue(dfVal1, padfHorizontalFiltered + j,
4019 : nDstXSize, nSrcLineCount)));
4020 2930610 : pafDstScanline[iFilteredPixelOff + 1] =
4021 2930610 : replaceValIfNodata(static_cast<Twork>(
4022 2930610 : ScaleValue(dfVal2, padfHorizontalFiltered + j + 1,
4023 : nDstXSize, nSrcLineCount)));
4024 : }
4025 8819 : if (iFilteredPixelOff < nDstXSize)
4026 : {
4027 1427 : const double dfVal = GDALResampleConvolutionVertical(
4028 1427 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4029 : nSrcLineCount);
4030 1427 : pafDstScanline[iFilteredPixelOff] =
4031 1427 : replaceValIfNodata(static_cast<Twork>(
4032 1427 : ScaleValue(dfVal, padfHorizontalFiltered + j,
4033 : nDstXSize, nSrcLineCount)));
4034 : }
4035 : }
4036 : }
4037 : else
4038 : {
4039 19012048 : for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
4040 : ++iFilteredPixelOff)
4041 : {
4042 18962057 : double dfVal = 0.0;
4043 18962057 : dfWeightSum = 0.0;
4044 18962057 : size_t j = (nSrcLineStart - nChunkYOff) *
4045 18962057 : static_cast<size_t>(nDstXSize) +
4046 18962057 : iFilteredPixelOff;
4047 : if (bKernelWithNegativeWeights)
4048 : {
4049 18718501 : int nConsecutiveValid = 0;
4050 18718501 : int nMaxConsecutiveValid = 0;
4051 133044321 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
4052 : {
4053 114303020 : const double dfWeight =
4054 114303020 : padfWeights[i] *
4055 : pabyChunkNodataMaskHorizontalFiltered[j];
4056 114303020 : if (pabyChunkNodataMaskHorizontalFiltered[j])
4057 : {
4058 48584037 : nConsecutiveValid++;
4059 : }
4060 65719283 : else if (nConsecutiveValid)
4061 : {
4062 226934 : nMaxConsecutiveValid = std::max(
4063 204376 : nMaxConsecutiveValid, nConsecutiveValid);
4064 226934 : nConsecutiveValid = 0;
4065 : }
4066 114326020 : dfVal += padfHorizontalFiltered[j] * dfWeight;
4067 114326020 : dfWeightSum += dfWeight;
4068 : }
4069 18740601 : nMaxConsecutiveValid =
4070 18741001 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
4071 18740601 : if (nMaxConsecutiveValid < nSrcLineCount / 2)
4072 : {
4073 9246271 : pafDstScanline[iFilteredPixelOff] =
4074 9246179 : static_cast<Twork>(dfNoDataValue);
4075 9246271 : continue;
4076 : }
4077 : }
4078 : else
4079 : {
4080 1237062 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
4081 : {
4082 993504 : const double dfWeight =
4083 993504 : padfWeights[i] *
4084 : pabyChunkNodataMaskHorizontalFiltered[j];
4085 993504 : dfVal += padfHorizontalFiltered[j] * dfWeight;
4086 993504 : dfWeightSum += dfWeight;
4087 : }
4088 : }
4089 9737866 : if (dfWeightSum > 0.0)
4090 : {
4091 9710707 : pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
4092 9721825 : static_cast<Twork>(dfVal / dfWeightSum));
4093 : }
4094 : else
4095 : {
4096 16036 : pafDstScanline[iFilteredPixelOff] =
4097 16012 : static_cast<Twork>(dfNoDataValue);
4098 : }
4099 : }
4100 : }
4101 :
4102 353681 : if (fMaxVal != 0.0f)
4103 : {
4104 : if constexpr (std::is_same_v<T, double>)
4105 : {
4106 0 : for (int i = 0; i < nDstXSize; ++i)
4107 : {
4108 0 : if (pafDstScanline[i] > static_cast<double>(fMaxVal))
4109 0 : pafDstScanline[i] = static_cast<double>(fMaxVal);
4110 : }
4111 : }
4112 : else
4113 : {
4114 192324 : for (int i = 0; i < nDstXSize; ++i)
4115 : {
4116 192088 : if (pafDstScanline[i] > fMaxVal)
4117 96022 : pafDstScanline[i] = fMaxVal;
4118 : }
4119 : }
4120 : }
4121 :
4122 353681 : if (pafWrkScanline)
4123 : {
4124 362563 : GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
4125 : static_cast<GByte *>(pDstBuffer) +
4126 362563 : static_cast<size_t>(iDstLine - nDstYOff) *
4127 362563 : nDstXSize * nDstDataTypeSize,
4128 : dstDataType, nDstDataTypeSize, nDstXSize);
4129 : }
4130 : }
4131 :
4132 5032 : VSIFree(pafWrkScanline);
4133 5032 : VSIFreeAligned(padfWeights);
4134 5032 : VSIFree(padfHorizontalFiltered);
4135 5032 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
4136 :
4137 5032 : return CE_None;
4138 : }
4139 :
4140 : template <bool bKernelWithNegativeWeights, bool bNeedRescale>
4141 : static CPLErr
4142 5032 : GDALResampleChunk_ConvolutionInternal(const GDALOverviewResampleArgs &args,
4143 : const void *pChunk, void **ppDstBuffer,
4144 : GDALDataType *peDstBufferDataType)
4145 : {
4146 : GDALResampleAlg eResample;
4147 5032 : if (EQUAL(args.pszResampling, "BILINEAR"))
4148 2660 : eResample = GRA_Bilinear;
4149 2372 : else if (EQUAL(args.pszResampling, "CUBIC"))
4150 2219 : eResample = GRA_Cubic;
4151 153 : else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
4152 59 : eResample = GRA_CubicSpline;
4153 94 : else if (EQUAL(args.pszResampling, "LANCZOS"))
4154 90 : eResample = GRA_Lanczos;
4155 : else
4156 : {
4157 4 : CPLAssert(false);
4158 : return CE_Failure;
4159 : }
4160 5028 : const int nKernelRadius = GWKGetFilterRadius(eResample);
4161 5026 : FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
4162 : const FilterFunc4ValuesType pfnFilterFunc4Values =
4163 5028 : GWKGetFilterFunc4Values(eResample);
4164 :
4165 5025 : float fMaxVal = 0.f;
4166 : // Cubic, etc... can have overshoots, so make sure we clamp values to the
4167 : // maximum value if NBITS is set.
4168 5025 : if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
4169 8 : (args.eOvrDataType == GDT_Byte || args.eOvrDataType == GDT_UInt16 ||
4170 0 : args.eOvrDataType == GDT_UInt32))
4171 : {
4172 8 : int nBits = args.nOvrNBITS;
4173 8 : if (nBits == GDALGetDataTypeSizeBits(args.eOvrDataType))
4174 1 : nBits = 0;
4175 8 : if (nBits > 0 && nBits < 32)
4176 7 : fMaxVal = static_cast<float>((1U << nBits) - 1);
4177 : }
4178 :
4179 5025 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
4180 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
4181 : GDALGetDataTypeSizeBytes(args.eOvrDataType));
4182 5032 : if (*ppDstBuffer == nullptr)
4183 : {
4184 0 : return CE_Failure;
4185 : }
4186 5032 : *peDstBufferDataType = args.eOvrDataType;
4187 :
4188 5032 : switch (args.eWrkDataType)
4189 : {
4190 4164 : case GDT_Byte:
4191 : {
4192 : return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32,
4193 : bKernelWithNegativeWeights,
4194 4164 : bNeedRescale>(
4195 : args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
4196 4164 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4197 : }
4198 :
4199 402 : case GDT_UInt16:
4200 : {
4201 : return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32,
4202 : bKernelWithNegativeWeights,
4203 402 : bNeedRescale>(
4204 : args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
4205 402 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4206 : }
4207 :
4208 375 : case GDT_Float32:
4209 : {
4210 : return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32,
4211 : bKernelWithNegativeWeights,
4212 375 : bNeedRescale>(
4213 : args, static_cast<const float *>(pChunk), *ppDstBuffer,
4214 375 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4215 : }
4216 :
4217 91 : case GDT_Float64:
4218 : {
4219 : return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64,
4220 : bKernelWithNegativeWeights,
4221 91 : bNeedRescale>(
4222 : args, static_cast<const double *>(pChunk), *ppDstBuffer,
4223 91 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4224 : }
4225 :
4226 0 : default:
4227 0 : break;
4228 : }
4229 :
4230 0 : CPLAssert(false);
4231 : return CE_Failure;
4232 : }
4233 :
// Entry point for the convolution-based resamplers. Selects the
// GDALResampleChunk_ConvolutionInternal<> instantiation that matches
// args.pszResampling and forwards all arguments to it unchanged:
//   - "CUBIC" / "LANCZOS": bKernelWithNegativeWeights = true,
//                          bNeedRescale = true
//   - "CUBICSPLINE":       bKernelWithNegativeWeights = false,
//                          bNeedRescale = true
//   - any other name:      both template flags false (BILINEAR is the only
//                          other name GDALGetResampleFunction() maps to this
//                          function)
// Returns the CPLErr produced by the selected instantiation.
4234 : static CPLErr
4235 5032 : GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
4236 : const void *pChunk, void **ppDstBuffer,
4237 : GDALDataType *peDstBufferDataType)
4238 : {
4239 5032 : if (EQUAL(args.pszResampling, "CUBIC") ||
4240 2809 : EQUAL(args.pszResampling, "LANCZOS"))
4241 : return GDALResampleChunk_ConvolutionInternal<
4242 2313 : /* bKernelWithNegativeWeights=*/true, /* bNeedRescale = */ true>(
4243 2313 : args, pChunk, ppDstBuffer, peDstBufferDataType);
4244 2719 : else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
// <bKernelWithNegativeWeights = false, bNeedRescale = true>
4245 59 : return GDALResampleChunk_ConvolutionInternal<false, true>(
4246 59 : args, pChunk, ppDstBuffer, peDstBufferDataType);
4247 : else
// <bKernelWithNegativeWeights = false, bNeedRescale = false>
4248 2660 : return GDALResampleChunk_ConvolutionInternal<false, false>(
4249 2660 : args, pChunk, ppDstBuffer, peDstBufferDataType);
4250 : }
4251 :
4252 : /************************************************************************/
4253 : /* GDALResampleChunkC32R() */
4254 : /************************************************************************/
4255 :
// Resample a chunk of complex data, stored as interleaved (real, imaginary)
// float pairs, into a newly allocated GDT_CFloat32 buffer.
//
// nSrcWidth, nSrcHeight  Dimensions of the full-resolution source. nSrcWidth
//                        is also used as the row stride (in complex samples)
//                        of pafChunk.
// pafChunk               Source samples; its first row is source line
//                        nChunkYOff.
// nChunkYOff, nChunkYSize
//                        First source line held in pafChunk and the number of
//                        lines it holds.
// nDstYOff, nDstYOff2    Destination lines to produce: [nDstYOff, nDstYOff2).
// nOvrXSize, nOvrYSize   Dimensions of the overview being generated.
// ppDstBuffer            Output: receives a buffer allocated with
//                        VSI_MALLOC3_VERBOSE holding
//                        nOvrXSize * (nDstYOff2 - nDstYOff) complex samples.
//                        The function does not free it; in this file the
//                        calling job hands it to a PointerHolder.
// peDstBufferDataType    Output: set to GDT_CFloat32.
// pszResampling          Method name: NEAR*, AVERAGE_MAGPHASE, RMS or AVER*.
//
// Returns CE_None on success. Returns CE_Failure if the method is not
// supported (neither output parameter is written) or if the allocation fails.
4256 2 : static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
4257 : const float *pafChunk, const int nChunkYOff,
4258 : const int nChunkYSize, const int nDstYOff,
4259 : const int nDstYOff2, const int nOvrXSize,
4260 : const int nOvrYSize, void **ppDstBuffer,
4261 : GDALDataType *peDstBufferDataType,
4262 : const char *pszResampling)
4263 :
4264 : {
4265 : enum Method
4266 : {
4267 : NEAR,
4268 : AVERAGE,
4269 : AVERAGE_MAGPHASE,
4270 : RMS,
4271 : };
4272 :
// Map the resampling name onto a Method. The exact "AVERAGE_MAGPHASE" test
// must come before the generic "AVER" prefix test, which would match it too.
4273 2 : Method eMethod = NEAR;
4274 2 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
4275 : {
4276 0 : eMethod = NEAR;
4277 : }
4278 2 : else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
4279 : {
4280 0 : eMethod = AVERAGE_MAGPHASE;
4281 : }
4282 2 : else if (EQUAL(pszResampling, "RMS"))
4283 : {
4284 2 : eMethod = RMS;
4285 : }
4286 0 : else if (STARTS_WITH_CI(pszResampling, "AVER"))
4287 : {
4288 0 : eMethod = AVERAGE;
4289 : }
4290 : else
4291 : {
4292 0 : CPLError(
4293 : CE_Failure, CPLE_NotSupported,
4294 : "Resampling method %s is not supported for complex data types. "
4295 : "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
4296 : pszResampling);
4297 0 : return CE_Failure;
4298 : }
4299 :
// Allocate one complex sample per destination pixel for the requested lines.
4300 2 : const int nOXSize = nOvrXSize;
4301 2 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
4302 : GDALGetDataTypeSizeBytes(GDT_CFloat32));
4303 2 : if (*ppDstBuffer == nullptr)
4304 : {
4305 0 : return CE_Failure;
4306 : }
4307 2 : float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
4308 2 : *peDstBufferDataType = GDT_CFloat32;
4309 :
// Number of source pixels/lines covered by one overview pixel/line.
4310 2 : const int nOYSize = nOvrYSize;
4311 2 : const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
4312 2 : const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
4313 :
4314 : /* ==================================================================== */
4315 : /* Loop over destination scanlines. */
4316 : /* ==================================================================== */
4317 8 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
4318 : {
// Source line window [nSrcYOff, nSrcYOff2) for this destination line. The
// start is clamped to the first line of the chunk.
4319 6 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
4320 6 : if (nSrcYOff < nChunkYOff)
4321 0 : nSrcYOff = nChunkYOff;
4322 :
4323 6 : int nSrcYOff2 =
4324 6 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
// Widen the window by one line when the two bounds coincide.
4325 6 : if (nSrcYOff2 == nSrcYOff)
4326 0 : nSrcYOff2++;
4327 :
// Window overshooting the source, or last overview line: make the window
// end at the bottom of the source.
4328 6 : if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
4329 : {
4330 2 : if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
4331 0 : nSrcYOff = nSrcHeight - 1;
4332 2 : nSrcYOff2 = nSrcHeight;
4333 : }
// Never read past the last line held in the chunk.
4334 6 : if (nSrcYOff2 > nChunkYOff + nChunkYSize)
4335 0 : nSrcYOff2 = nChunkYOff + nChunkYSize;
4336 :
// pafSrcScanline points at source line nSrcYOff inside the chunk. Rows are
// nSrcWidth complex samples (2 floats each) apart.
4337 6 : const float *const pafSrcScanline =
4338 6 : pafChunk +
4339 6 : (static_cast<size_t>(nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
4340 6 : float *const pafDstScanline =
4341 6 : pafDstBuffer +
4342 6 : static_cast<size_t>(iDstLine - nDstYOff) * 2 * nOXSize;
4343 :
4344 : /* --------------------------------------------------------------------
4345 : */
4346 : /* Loop over destination pixels */
4347 : /* --------------------------------------------------------------------
4348 : */
4349 18 : for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
4350 : {
// Source column window [nSrcXOff, nSrcXOff2), built like the line window
// above.
4351 12 : const size_t iDstPixelSZ = static_cast<size_t>(iDstPixel);
4352 12 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
4353 12 : int nSrcXOff2 =
4354 12 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
4355 12 : if (nSrcXOff2 == nSrcXOff)
4356 0 : nSrcXOff2++;
4357 12 : if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
4358 : {
4359 6 : if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
4360 0 : nSrcXOff = nSrcWidth - 1;
4361 6 : nSrcXOff2 = nSrcWidth;
4362 : }
4363 12 : const size_t nSrcXOffSZ = static_cast<size_t>(nSrcXOff);
4364 :
// Nearest: copy the complex sample at the top-left corner of the window.
4365 12 : if (eMethod == NEAR)
4366 : {
4367 0 : pafDstScanline[iDstPixelSZ * 2] =
4368 0 : pafSrcScanline[nSrcXOffSZ * 2];
4369 0 : pafDstScanline[iDstPixelSZ * 2 + 1] =
4370 0 : pafSrcScanline[nSrcXOffSZ * 2 + 1];
4371 : }
// AVERAGE_MAGPHASE: average the real and imaginary parts, then rescale the
// result so that its magnitude equals the mean of the source magnitudes.
4372 12 : else if (eMethod == AVERAGE_MAGPHASE)
4373 : {
4374 0 : double dfTotalR = 0.0;
4375 0 : double dfTotalI = 0.0;
4376 0 : double dfTotalM = 0.0;
4377 0 : size_t nCount = 0;
4378 :
4379 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4380 : {
4381 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4382 : {
4383 0 : const double dfR = double(
4384 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4385 0 : static_cast<size_t>(iY - nSrcYOff) *
4386 0 : nSrcWidth * 2]);
4387 0 : const double dfI = double(
4388 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4389 0 : static_cast<size_t>(iY - nSrcYOff) *
4390 0 : nSrcWidth * 2 +
4391 0 : 1]);
4392 0 : dfTotalR += dfR;
4393 0 : dfTotalI += dfI;
4394 0 : dfTotalM += std::hypot(dfR, dfI);
4395 0 : ++nCount;
4396 : }
4397 : }
4398 :
// Fallback if the window turned out to be empty: write a zero sample.
4399 0 : CPLAssert(nCount > 0);
4400 0 : if (nCount == 0)
4401 : {
4402 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4403 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4404 : }
4405 : else
4406 : {
4407 0 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4408 0 : dfTotalR / static_cast<double>(nCount));
4409 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4410 0 : dfTotalI / static_cast<double>(nCount));
// dfM is the magnitude of the averaged sample, dfDesiredM the mean magnitude
// of the source samples.
4411 : const double dfM =
4412 0 : double(std::hypot(pafDstScanline[iDstPixelSZ * 2],
4413 0 : pafDstScanline[iDstPixelSZ * 2 + 1]));
4414 0 : const double dfDesiredM =
4415 0 : dfTotalM / static_cast<double>(nCount);
// A zero magnitude leaves the ratio at 1, which avoids a division by zero.
4416 0 : double dfRatio = 1.0;
4417 0 : if (dfM != 0.0)
4418 0 : dfRatio = dfDesiredM / dfM;
4419 :
4420 0 : pafDstScanline[iDstPixelSZ * 2] *=
4421 0 : static_cast<float>(dfRatio);
4422 0 : pafDstScanline[iDstPixelSZ * 2 + 1] *=
4423 0 : static_cast<float>(dfRatio);
4424 : }
4425 : }
// RMS: root mean square, computed independently on the real and imaginary
// components of the window.
4426 12 : else if (eMethod == RMS)
4427 : {
4428 12 : double dfTotalR = 0.0;
4429 12 : double dfTotalI = 0.0;
4430 12 : size_t nCount = 0;
4431 :
4432 36 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4433 : {
4434 72 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4435 : {
4436 48 : const double dfR = double(
4437 48 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4438 48 : static_cast<size_t>(iY - nSrcYOff) *
4439 48 : nSrcWidth * 2]);
4440 48 : const double dfI = double(
4441 48 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4442 48 : static_cast<size_t>(iY - nSrcYOff) *
4443 48 : nSrcWidth * 2 +
4444 48 : 1]);
4445 :
4446 48 : dfTotalR += SQUARE(dfR);
4447 48 : dfTotalI += SQUARE(dfI);
4448 :
4449 48 : ++nCount;
4450 : }
4451 : }
4452 :
// Fallback if the window turned out to be empty: write a zero sample.
4453 12 : CPLAssert(nCount > 0);
4454 12 : if (nCount == 0)
4455 : {
4456 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4457 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4458 : }
4459 : else
4460 : {
4461 : /* compute RMS */
4462 12 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4463 12 : sqrt(dfTotalR / static_cast<double>(nCount)));
4464 12 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4465 12 : sqrt(dfTotalI / static_cast<double>(nCount)));
4466 : }
4467 : }
// AVERAGE: plain arithmetic mean of the real and imaginary components.
4468 0 : else if (eMethod == AVERAGE)
4469 : {
4470 0 : double dfTotalR = 0.0;
4471 0 : double dfTotalI = 0.0;
4472 0 : size_t nCount = 0;
4473 :
4474 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4475 : {
4476 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4477 : {
4478 : // TODO(schwehr): Maybe use std::complex?
4479 0 : dfTotalR += double(
4480 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4481 0 : static_cast<size_t>(iY - nSrcYOff) *
4482 0 : nSrcWidth * 2]);
4483 0 : dfTotalI += double(
4484 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4485 0 : static_cast<size_t>(iY - nSrcYOff) *
4486 0 : nSrcWidth * 2 +
4487 0 : 1]);
4488 0 : ++nCount;
4489 : }
4490 : }
4491 :
// Fallback if the window turned out to be empty: write a zero sample.
4492 0 : CPLAssert(nCount > 0);
4493 0 : if (nCount == 0)
4494 : {
4495 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4496 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4497 : }
4498 : else
4499 : {
4500 0 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4501 0 : dfTotalR / static_cast<double>(nCount));
4502 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4503 0 : dfTotalI / static_cast<double>(nCount));
4504 : }
4505 : }
4506 : }
4507 : }
4508 :
4509 2 : return CE_None;
4510 : }
4511 :
4512 : /************************************************************************/
4513 : /* GDALRegenerateCascadingOverviews() */
4514 : /* */
4515 : /* Generate a list of overviews in order from largest to */
4516 : /* smallest, computing each from the next larger. */
4517 : /************************************************************************/
4518 :
// Regenerate nOverviews overview bands one after another, each computed from
// the next larger one (the first from poSrcBand).
//
// poSrcBand       Full-resolution band used as the base of the first overview.
// nOverviews      Number of entries in papoOvrBands.
// papoOvrBands    Overview bands; the array is reordered IN PLACE from the
//                 largest to the smallest pixel count.
// pszResampling   Resampling method name passed to GDALRegenerateOverviewsEx().
// pfnProgress, pProgressData
//                 Progress callback; each level reports on a sub-range that
//                 is proportional to its share of the total pixel count.
// papszOptions    Forwarded unchanged to GDALRegenerateOverviewsEx().
//
// Returns CE_None, or the first error returned by GDALRegenerateOverviewsEx()
// (remaining levels are then not processed).
4519 44 : static CPLErr GDALRegenerateCascadingOverviews(
4520 : GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
4521 : const char *pszResampling, GDALProgressFunc pfnProgress,
4522 : void *pProgressData, CSLConstList papszOptions)
4523 :
4524 : {
4525 : /* -------------------------------------------------------------------- */
4526 : /* First, we must put the overviews in order from largest to */
4527 : /* smallest. */
4528 : /* -------------------------------------------------------------------- */
// Bubble sort on the pixel count (width * height, evaluated in float).
// Neighbours are swapped when the earlier band is smaller than the later one.
4529 127 : for (int i = 0; i < nOverviews - 1; ++i)
4530 : {
4531 292 : for (int j = 0; j < nOverviews - i - 1; ++j)
4532 : {
4533 209 : if (papoOvrBands[j]->GetXSize() *
4534 209 : static_cast<float>(papoOvrBands[j]->GetYSize()) <
4535 209 : papoOvrBands[j + 1]->GetXSize() *
4536 209 : static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
4537 : {
4538 0 : GDALRasterBand *poTempBand = papoOvrBands[j];
4539 0 : papoOvrBands[j] = papoOvrBands[j + 1];
4540 0 : papoOvrBands[j + 1] = poTempBand;
4541 : }
4542 : }
4543 : }
4544 :
4545 : /* -------------------------------------------------------------------- */
4546 : /* Count total pixels so we can prepare appropriate scaled */
4547 : /* progress functions. */
4548 : /* -------------------------------------------------------------------- */
4549 44 : double dfTotalPixels = 0.0;
4550 :
4551 171 : for (int i = 0; i < nOverviews; ++i)
4552 : {
4553 127 : dfTotalPixels += papoOvrBands[i]->GetXSize() *
4554 127 : static_cast<double>(papoOvrBands[i]->GetYSize());
4555 : }
4556 :
4557 : /* -------------------------------------------------------------------- */
4558 : /* Generate all the bands. */
4559 : /* -------------------------------------------------------------------- */
4560 44 : double dfPixelsProcessed = 0.0;
4561 :
4562 171 : for (int i = 0; i < nOverviews; ++i)
4563 : {
// Level 0 is computed from the source band, every other level from the
// overview generated just before it.
4564 127 : GDALRasterBand *poBaseBand = poSrcBand;
4565 127 : if (i != 0)
4566 83 : poBaseBand = papoOvrBands[i - 1];
4567 :
4568 127 : double dfPixels = papoOvrBands[i]->GetXSize() *
4569 127 : static_cast<double>(papoOvrBands[i]->GetYSize());
4570 :
// Progress sub-range of this level within the overall [0, 1] range.
4571 254 : void *pScaledProgressData = GDALCreateScaledProgress(
4572 : dfPixelsProcessed / dfTotalPixels,
4573 127 : (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
4574 : pProgressData);
4575 :
// Regenerate exactly one overview (entry i of the sorted array).
4576 254 : const CPLErr eErr = GDALRegenerateOverviewsEx(
4577 : poBaseBand, 1,
4578 127 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
4579 : pszResampling, GDALScaledProgress, pScaledProgressData,
4580 : papszOptions);
4581 127 : GDALDestroyScaledProgress(pScaledProgressData);
4582 :
4583 127 : if (eErr != CE_None)
4584 0 : return eErr;
4585 :
4586 127 : dfPixelsProcessed += dfPixels;
4587 :
4588 : // Only do the bit2grayscale promotion on the base band.
// Subsequent levels therefore use plain "AVERAGE".
4589 127 : if (STARTS_WITH_CI(pszResampling,
4590 : "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
4591 8 : pszResampling = "AVERAGE";
4592 : }
4593 :
4594 44 : return CE_None;
4595 : }
4596 :
4597 : /************************************************************************/
4598 : /* GDALGetResampleFunction() */
4599 : /************************************************************************/
4600 :
// Return the chunk resampling function associated with a resampling name.
//
// pszResampling  Resampling method name. Names starting with "NEAR" or "AVER"
//                are matched by prefix; the others by full name.
// pnRadius       Optional output (may be null). Set to 0 first, then to 1 for
//                GAUSS, or to GWKGetFilterRadius() of the matching kernel for
//                CUBIC, CUBICSPLINE, LANCZOS and BILINEAR.
//
// Returns the resampling function, or nullptr after emitting a CPLError() when
// the name is not recognized.
4601 5409 : GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
4602 : int *pnRadius)
4603 : {
4604 5409 : if (pnRadius)
4605 5407 : *pnRadius = 0;
4606 5409 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
4607 512 : return GDALResampleChunk_Near;
// All AVER* variants and RMS share the same implementation.
4608 4897 : else if (STARTS_WITH_CI(pszResampling, "AVER") ||
4609 4324 : EQUAL(pszResampling, "RMS"))
4610 634 : return GDALResampleChunk_AverageOrRMS;
4611 4263 : else if (EQUAL(pszResampling, "GAUSS"))
4612 : {
4613 26 : if (pnRadius)
4614 26 : *pnRadius = 1;
4615 26 : return GDALResampleChunk_Gauss;
4616 : }
4617 4237 : else if (EQUAL(pszResampling, "MODE"))
4618 136 : return GDALResampleChunk_Mode;
// The four kernels below all go through GDALResampleChunk_Convolution and
// differ only in the reported radius.
4619 4101 : else if (EQUAL(pszResampling, "CUBIC"))
4620 : {
4621 1593 : if (pnRadius)
4622 1593 : *pnRadius = GWKGetFilterRadius(GRA_Cubic);
4623 1585 : return GDALResampleChunk_Convolution;
4624 : }
4625 2508 : else if (EQUAL(pszResampling, "CUBICSPLINE"))
4626 : {
4627 39 : if (pnRadius)
4628 39 : *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
4629 39 : return GDALResampleChunk_Convolution;
4630 : }
4631 2469 : else if (EQUAL(pszResampling, "LANCZOS"))
4632 : {
4633 44 : if (pnRadius)
4634 44 : *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
4635 44 : return GDALResampleChunk_Convolution;
4636 : }
4637 2425 : else if (EQUAL(pszResampling, "BILINEAR"))
4638 : {
4639 2430 : if (pnRadius)
4640 2430 : *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
4641 2430 : return GDALResampleChunk_Convolution;
4642 : }
4643 : else
4644 : {
4645 0 : CPLError(
4646 : CE_Failure, CPLE_AppDefined,
4647 : "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
4648 : pszResampling);
4649 0 : return nullptr;
4650 : }
4651 : }
4652 :
4653 : /************************************************************************/
4654 : /* GDALGetOvrWorkDataType() */
4655 : /************************************************************************/
4656 :
// Return the data type of the working buffer used for a given resampling
// method and source data type.
//
//  - NEAR* and MODE:              the source data type itself.
//  - Byte or UInt16 source with AVER*, RMS, CUBIC, CUBICSPLINE, LANCZOS or
//    BILINEAR:                    the source data type itself.
//  - GAUSS (otherwise):           GDT_Float64.
//  - Remaining cases:             GDT_Float32 for Byte, Int8, UInt16, Int16
//                                 and Float32 sources, GDT_Float64 otherwise.
4657 5284 : GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
4658 : GDALDataType eSrcDataType)
4659 : {
4660 5284 : if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
4661 : {
4662 633 : return eSrcDataType;
4663 : }
// NOTE(review): the trailing "MODE" test in the two branches below can never
// be true, since MODE has already returned above.
4664 4651 : else if (eSrcDataType == GDT_Byte &&
4665 4121 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4666 3642 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4667 2279 : EQUAL(pszResampling, "CUBICSPLINE") ||
4668 2274 : EQUAL(pszResampling, "LANCZOS") ||
4669 2267 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4670 : {
4671 4113 : return GDT_Byte;
4672 : }
4673 538 : else if (eSrcDataType == GDT_UInt16 &&
4674 128 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4675 123 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4676 8 : EQUAL(pszResampling, "CUBICSPLINE") ||
4677 6 : EQUAL(pszResampling, "LANCZOS") ||
4678 3 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4679 : {
4680 126 : return GDT_UInt16;
4681 : }
4682 412 : else if (EQUAL(pszResampling, "GAUSS"))
4683 20 : return GDT_Float64;
4684 :
// Generic fallback: Float32 for the listed source types, Float64 for every
// other source type.
4685 392 : if (eSrcDataType == GDT_Byte || eSrcDataType == GDT_Int8 ||
4686 388 : eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
4687 : eSrcDataType == GDT_Float32)
4688 : {
4689 258 : return GDT_Float32;
4690 : }
4691 134 : return GDT_Float64;
4692 : }
4693 :
4694 : namespace
4695 : {
4696 : // Structure to hold a pointer to free with CPLFree()
// The pointer is stored by the constructor and passed to CPLFree() by the
// destructor. Copying is disabled.
4697 : struct PointerHolder
4698 : {
// Pointer passed to CPLFree() on destruction.
4699 : void *ptr = nullptr;
4700 :
// Stores ptrIn; nothing else is done with it until destruction.
4701 5840 : explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
4702 : {
4703 5840 : }
4704 :
// Passes the held pointer to CPLFree().
4705 5840 : ~PointerHolder()
4706 5840 : {
4707 5840 : CPLFree(ptr);
4708 5840 : }
4709 :
// Copy construction and copy assignment are deleted.
4710 : PointerHolder(const PointerHolder &) = delete;
4711 : PointerHolder &operator=(const PointerHolder &) = delete;
4712 : };
4713 : } // namespace
4714 :
4715 : /************************************************************************/
4716 : /* GDALRegenerateOverviews() */
4717 : /************************************************************************/
4718 :
4719 : /**
4720 : * \brief Generate downsampled overviews.
4721 : *
4722 : * This function will generate one or more overview images from a base image
4723 : * using the requested downsampling algorithm. Its primary use is for
4724 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4725 : * used to generate downsampled images in one file from another outside the
4726 : * overview architecture.
4727 : *
4728 : * The output bands need to exist in advance.
4729 : *
4730 : * The full set of resampling algorithms is documented in
4731 : * GDALDataset::BuildOverviews().
4732 : *
4733 : * This function will honour properly NODATA_VALUES tuples (special dataset
4734 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4735 : * considered as the nodata value and not each value of the triplet
4736 : * independently per band.
4737 : *
4738 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4739 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4740 : * overview computation.
4741 : *
4742 : * @param hSrcBand the source (base level) band.
4743 : * @param nOverviewCount the number of downsampled bands being generated.
4744 : * @param pahOvrBands the list of downsampled bands to be generated.
4745 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4746 : * @param pfnProgress progress report function.
4747 : * @param pProgressData progress function callback data.
4748 : * @return CE_None on success or CE_Failure on failure.
4749 : */
4750 250 : CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
4751 : GDALRasterBandH *pahOvrBands,
4752 : const char *pszResampling,
4753 : GDALProgressFunc pfnProgress,
4754 : void *pProgressData)
4755 :
4756 : {
// Thin wrapper: forwards every argument to GDALRegenerateOverviewsEx() with
// a null options list.
4757 250 : return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
4758 : pszResampling, pfnProgress, pProgressData,
4759 250 : nullptr);
4760 : }
4761 :
4762 : /************************************************************************/
4763 : /* GDALRegenerateOverviewsEx() */
4764 : /************************************************************************/
4765 :
// Factor of 2 applied to the kernel radius and to the maximum overview factor
// when GDALRegenerateOverviewsEx() sizes the height of a source chunk.
4766 : constexpr int RADIUS_TO_DIAMETER = 2;
4767 :
4768 : /**
4769 : * \brief Generate downsampled overviews.
4770 : *
4771 : * This function will generate one or more overview images from a base image
4772 : * using the requested downsampling algorithm. Its primary use is for
4773 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4774 : * used to generate downsampled images in one file from another outside the
4775 : * overview architecture.
4776 : *
4777 : * The output bands need to exist in advance.
4778 : *
4779 : * The full set of resampling algorithms is documented in
4780 : * GDALDataset::BuildOverviews().
4781 : *
4782 : * This function will honour properly NODATA_VALUES tuples (special dataset
4783 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4784 : * considered as the nodata value and not each value of the triplet
4785 : * independently per band.
4786 : *
4787 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4788 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4789 : * overview computation.
4790 : *
4791 : * @param hSrcBand the source (base level) band.
4792 : * @param nOverviewCount the number of downsampled bands being generated.
4793 : * @param pahOvrBands the list of downsampled bands to be generated.
4794 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4795 : * @param pfnProgress progress report function.
4796 : * @param pProgressData progress function callback data.
4797 : * @param papszOptions NULL terminated list of options as key=value pairs, or
4798 : * NULL
4799 : * @return CE_None on success or CE_Failure on failure.
4800 : * @since GDAL 3.6
4801 : */
4802 903 : CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
4803 : GDALRasterBandH *pahOvrBands,
4804 : const char *pszResampling,
4805 : GDALProgressFunc pfnProgress,
4806 : void *pProgressData, CSLConstList papszOptions)
4807 :
4808 : {
4809 903 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
4810 903 : GDALRasterBand **papoOvrBands =
4811 : reinterpret_cast<GDALRasterBand **>(pahOvrBands);
4812 :
4813 903 : if (pfnProgress == nullptr)
4814 252 : pfnProgress = GDALDummyProgress;
4815 :
4816 903 : if (EQUAL(pszResampling, "NONE"))
4817 49 : return CE_None;
4818 :
4819 854 : int nKernelRadius = 0;
4820 : GDALResampleFunction pfnResampleFn =
4821 854 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
4822 :
4823 854 : if (pfnResampleFn == nullptr)
4824 0 : return CE_Failure;
4825 :
4826 : /* -------------------------------------------------------------------- */
4827 : /* Check color tables... */
4828 : /* -------------------------------------------------------------------- */
4829 854 : GDALColorTable *poColorTable = nullptr;
4830 :
4831 487 : if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
4832 1786 : EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
4833 456 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4834 : {
4835 9 : poColorTable = poSrcBand->GetColorTable();
4836 9 : if (poColorTable != nullptr)
4837 : {
4838 9 : if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
4839 : {
4840 0 : CPLError(CE_Warning, CPLE_AppDefined,
4841 : "Computing overviews on palette index raster bands "
4842 : "with a palette whose color interpretation is not RGB "
4843 : "will probably lead to unexpected results.");
4844 0 : poColorTable = nullptr;
4845 : }
4846 9 : else if (poColorTable->IsIdentity())
4847 : {
4848 0 : poColorTable = nullptr;
4849 : }
4850 : }
4851 : else
4852 : {
4853 0 : CPLError(CE_Warning, CPLE_AppDefined,
4854 : "Computing overviews on palette index raster bands "
4855 : "without a palette will probably lead to unexpected "
4856 : "results.");
4857 : }
4858 : }
4859 : // Not ready yet
4860 2481 : else if ((EQUAL(pszResampling, "CUBIC") ||
4861 791 : EQUAL(pszResampling, "CUBICSPLINE") ||
4862 791 : EQUAL(pszResampling, "LANCZOS") ||
4863 1716 : EQUAL(pszResampling, "BILINEAR")) &&
4864 80 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4865 : {
4866 0 : CPLError(CE_Warning, CPLE_AppDefined,
4867 : "Computing %s overviews on palette index raster bands "
4868 : "will probably lead to unexpected results.",
4869 : pszResampling);
4870 : }
4871 :
4872 : // If we have a nodata mask and we are doing something more complicated
4873 : // than nearest neighbouring, we have to fetch the nodata mask.
4874 :
4875 854 : GDALRasterBand *poMaskBand = nullptr;
4876 854 : bool bUseNoDataMask = false;
4877 854 : bool bCanUseCascaded = true;
4878 :
4879 854 : if (!STARTS_WITH_CI(pszResampling, "NEAR"))
4880 : {
4881 : // Special case if we are an alpha/mask band. We want it to be
4882 : // considered as the mask band to avoid alpha=0 to be taken into account
4883 : // in average computation.
4884 536 : if (poSrcBand->IsMaskBand())
4885 : {
4886 91 : poMaskBand = poSrcBand;
4887 91 : bUseNoDataMask = true;
4888 : }
4889 : else
4890 : {
4891 445 : poMaskBand = poSrcBand->GetMaskBand();
4892 445 : const int nMaskFlags = poSrcBand->GetMaskFlags();
4893 445 : bCanUseCascaded =
4894 445 : (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
4895 445 : bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
4896 : }
4897 : }
4898 :
4899 : /* -------------------------------------------------------------------- */
4900 : /* If we are operating on multiple overviews, and using */
4901 : /* averaging, lets do them in cascading order to reduce the */
4902 : /* amount of computation. */
4903 : /* -------------------------------------------------------------------- */
4904 :
4905 : // In case the mask may be computed from another band of the dataset,
4906 : // we can't use cascaded generation, as the computation of the overviews
4907 : // of the band used for the mask band may not have yet occurred (#3033).
4908 854 : if ((STARTS_WITH_CI(pszResampling, "AVER") ||
4909 487 : EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
4910 456 : EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
4911 402 : EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
4912 854 : EQUAL(pszResampling, "MODE")) &&
4913 44 : nOverviewCount > 1 && bCanUseCascaded)
4914 44 : return GDALRegenerateCascadingOverviews(
4915 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
4916 44 : pProgressData, papszOptions);
4917 :
4918 : /* -------------------------------------------------------------------- */
4919 : /* Setup one horizontal swath to read from the raw buffer. */
4920 : /* -------------------------------------------------------------------- */
4921 810 : int nFRXBlockSize = 0;
4922 810 : int nFRYBlockSize = 0;
4923 810 : poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
4924 :
4925 810 : const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
4926 1302 : const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
4927 1252 : EQUAL(pszResampling, "MODE") ||
4928 442 : !GDALDataTypeIsComplex(eSrcDataType);
4929 : const GDALDataType eWrkDataType =
4930 : bUseGenericResampleFn
4931 810 : ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
4932 810 : : GDT_CFloat32;
4933 :
4934 810 : const int nWidth = poSrcBand->GetXSize();
4935 810 : const int nHeight = poSrcBand->GetYSize();
4936 :
4937 810 : int nMaxOvrFactor = 1;
4938 1737 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
4939 : {
4940 927 : const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
4941 927 : const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
4942 927 : nMaxOvrFactor = std::max(
4943 : nMaxOvrFactor,
4944 927 : static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
4945 927 : nMaxOvrFactor = std::max(
4946 : nMaxOvrFactor,
4947 927 : static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
4948 : }
4949 :
4950 810 : int nFullResYChunk = nFRYBlockSize;
4951 810 : int nMaxChunkYSizeQueried = 0;
4952 :
4953 : const auto UpdateChunkHeightAndGetChunkSize =
4954 10629 : [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
4955 85993 : eWrkDataType, nWidth]()
4956 : {
4957 : // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
4958 : // + nFullResYChunk) / nMaxOvrFactor)
4959 10629 : if (nMaxOvrFactor > INT_MAX / RADIUS_TO_DIAMETER)
4960 : {
4961 1 : return GINTBIG_MAX;
4962 : }
4963 10628 : nFullResYChunk =
4964 10628 : std::max(nFullResYChunk, RADIUS_TO_DIAMETER * nMaxOvrFactor);
4965 10628 : if ((nKernelRadius > 0 &&
4966 970 : nMaxOvrFactor > INT_MAX / (RADIUS_TO_DIAMETER * nKernelRadius)) ||
4967 10628 : nFullResYChunk >
4968 10628 : INT_MAX - RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor)
4969 : {
4970 0 : return GINTBIG_MAX;
4971 : }
4972 10628 : nMaxChunkYSizeQueried =
4973 10628 : nFullResYChunk + RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor;
4974 10628 : if (GDALGetDataTypeSizeBytes(eWrkDataType) >
4975 10628 : std::numeric_limits<int64_t>::max() /
4976 10628 : (static_cast<int64_t>(nMaxChunkYSizeQueried) * nWidth))
4977 : {
4978 1 : return GINTBIG_MAX;
4979 : }
4980 10627 : return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
4981 10627 : nMaxChunkYSizeQueried * nWidth;
4982 810 : };
4983 :
4984 : const char *pszChunkYSize =
4985 810 : CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
4986 : #ifndef __COVERITY__
4987 : // Only configurable for debug / testing
4988 810 : if (pszChunkYSize)
4989 : {
4990 0 : nFullResYChunk = atoi(pszChunkYSize);
4991 : }
4992 : #endif
4993 :
4994 : // Only configurable for debug / testing
4995 : const int nChunkMaxSize =
4996 810 : atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
4997 :
4998 810 : auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
4999 810 : if (nChunkSize > nChunkMaxSize)
5000 : {
5001 15 : if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
5002 44 : !GDALDataTypeIsComplex(eSrcDataType) &&
5003 14 : (!STARTS_WITH_CI(pszResampling, "AVER") ||
5004 2 : EQUAL(pszResampling, "AVERAGE")))
5005 : {
5006 : // If this is tiled, then use GDALRegenerateOverviewsMultiBand(),
5007 : // which use a block based strategy, which is much less memory
5008 : // hungry.
5009 14 : return GDALRegenerateOverviewsMultiBand(
5010 : 1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
5011 14 : pfnProgress, pProgressData, papszOptions);
5012 : }
5013 1 : else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
5014 : {
5015 0 : return GDALRegenerateCascadingOverviews(
5016 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
5017 0 : pfnProgress, pProgressData, papszOptions);
5018 : }
5019 : }
5020 795 : else if (pszChunkYSize == nullptr)
5021 : {
5022 : // Try to get as close as possible to nChunkMaxSize
5023 10614 : while (nChunkSize < nChunkMaxSize / 2)
5024 : {
5025 9819 : nFullResYChunk *= 2;
5026 9819 : nChunkSize = UpdateChunkHeightAndGetChunkSize();
5027 : }
5028 : }
5029 :
5030 796 : int nHasNoData = 0;
5031 796 : const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
5032 796 : const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
5033 : const bool bPropagateNoData =
5034 796 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5035 :
5036 : // Structure describing a resampling job
5037 : struct OvrJob
5038 : {
5039 : // Buffers to free when job is finished
5040 : std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
5041 : std::shared_ptr<PointerHolder> oSrcBufferHolder{};
5042 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
5043 :
5044 : GDALRasterBand *poDstBand = nullptr;
5045 :
5046 : // Input parameters of pfnResampleFn
5047 : GDALResampleFunction pfnResampleFn = nullptr;
5048 : int nSrcWidth = 0;
5049 : int nSrcHeight = 0;
5050 : int nDstWidth = 0;
5051 : GDALOverviewResampleArgs args{};
5052 : const void *pChunk = nullptr;
5053 : bool bUseGenericResampleFn = false;
5054 :
5055 : // Output values of resampling function
5056 : CPLErr eErr = CE_Failure;
5057 : void *pDstBuffer = nullptr;
5058 : GDALDataType eDstBufferDataType = GDT_Unknown;
5059 :
5060 0 : void SetSrcMaskBufferHolder(
5061 : const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
5062 : {
5063 0 : oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
5064 0 : }
5065 :
5066 0 : void SetSrcBufferHolder(
5067 : const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
5068 : {
5069 0 : oSrcBufferHolder = oSrcBufferHolderIn;
5070 0 : }
5071 :
5072 896 : void NotifyFinished()
5073 : {
5074 1792 : std::lock_guard guard(mutex);
5075 896 : bFinished = true;
5076 896 : cv.notify_one();
5077 896 : }
5078 :
5079 0 : bool IsFinished()
5080 : {
5081 0 : std::lock_guard guard(mutex);
5082 0 : return bFinished;
5083 : }
5084 :
5085 0 : void WaitFinished()
5086 : {
5087 0 : std::unique_lock oGuard(mutex);
5088 0 : while (!bFinished)
5089 : {
5090 0 : cv.wait(oGuard);
5091 : }
5092 0 : }
5093 :
5094 : private:
5095 : // Synchronization
5096 : bool bFinished = false;
5097 : std::mutex mutex{};
5098 : std::condition_variable cv{};
5099 : };
5100 :
5101 : // Thread function to resample
5102 896 : const auto JobResampleFunc = [](void *pData)
5103 : {
5104 896 : OvrJob *poJob = static_cast<OvrJob *>(pData);
5105 :
5106 896 : if (poJob->bUseGenericResampleFn)
5107 : {
5108 894 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
5109 : &(poJob->pDstBuffer),
5110 : &(poJob->eDstBufferDataType));
5111 : }
5112 : else
5113 : {
5114 2 : poJob->eErr = GDALResampleChunkC32R(
5115 : poJob->nSrcWidth, poJob->nSrcHeight,
5116 2 : static_cast<const float *>(poJob->pChunk),
5117 : poJob->args.nChunkYOff, poJob->args.nChunkYSize,
5118 : poJob->args.nDstYOff, poJob->args.nDstYOff2,
5119 : poJob->args.nOvrXSize, poJob->args.nOvrYSize,
5120 : &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
5121 : poJob->args.pszResampling);
5122 : }
5123 :
5124 : poJob->oDstBufferHolder =
5125 896 : std::make_unique<PointerHolder>(poJob->pDstBuffer);
5126 :
5127 896 : poJob->NotifyFinished();
5128 896 : };
5129 :
5130 : // Function to write resample data to target band
5131 896 : const auto WriteJobData = [](const OvrJob *poJob)
5132 : {
5133 1792 : return poJob->poDstBand->RasterIO(
5134 896 : GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
5135 896 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
5136 896 : poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
5137 896 : poJob->eDstBufferDataType, 0, 0, nullptr);
5138 : };
5139 :
5140 : // Wait for completion of oldest job and serialize it
5141 : const auto WaitAndFinalizeOldestJob =
5142 0 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
5143 : {
5144 0 : auto poOldestJob = jobList.front().get();
5145 0 : poOldestJob->WaitFinished();
5146 0 : CPLErr l_eErr = poOldestJob->eErr;
5147 0 : if (l_eErr == CE_None)
5148 : {
5149 0 : l_eErr = WriteJobData(poOldestJob);
5150 : }
5151 :
5152 0 : jobList.pop_front();
5153 0 : return l_eErr;
5154 : };
5155 :
5156 : // Queue of jobs
5157 1592 : std::list<std::unique_ptr<OvrJob>> jobList;
5158 :
5159 796 : GByte *pabyChunkNodataMask = nullptr;
5160 796 : void *pChunk = nullptr;
5161 :
5162 796 : const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
5163 3184 : const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
5164 796 : ? CPLGetNumCPUs()
5165 796 : : atoi(pszThreads)));
5166 : auto poThreadPool =
5167 796 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5168 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5169 1592 : : std::unique_ptr<CPLJobQueue>(nullptr);
5170 :
5171 : /* -------------------------------------------------------------------- */
5172 : /* Loop over image operating on chunks. */
5173 : /* -------------------------------------------------------------------- */
5174 796 : int nChunkYOff = 0;
5175 796 : CPLErr eErr = CE_None;
5176 :
5177 1597 : for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
5178 801 : nChunkYOff += nFullResYChunk)
5179 : {
5180 801 : if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
5181 : pProgressData))
5182 : {
5183 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5184 0 : eErr = CE_Failure;
5185 : }
5186 :
5187 801 : if (nFullResYChunk + nChunkYOff > nHeight)
5188 793 : nFullResYChunk = nHeight - nChunkYOff;
5189 :
5190 801 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
5191 801 : int nChunkYSizeQueried =
5192 801 : nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
5193 801 : if (nChunkYOffQueried < 0)
5194 : {
5195 83 : nChunkYSizeQueried += nChunkYOffQueried;
5196 83 : nChunkYOffQueried = 0;
5197 : }
5198 801 : if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
5199 83 : nChunkYSizeQueried = nHeight - nChunkYOffQueried;
5200 :
5201 : // Avoid accumulating too many tasks and exhausting RAM
5202 : // Try to complete already finished jobs
5203 801 : while (eErr == CE_None && !jobList.empty())
5204 : {
5205 0 : auto poOldestJob = jobList.front().get();
5206 0 : if (!poOldestJob->IsFinished())
5207 0 : break;
5208 0 : eErr = poOldestJob->eErr;
5209 0 : if (eErr == CE_None)
5210 : {
5211 0 : eErr = WriteJobData(poOldestJob);
5212 : }
5213 :
5214 0 : jobList.pop_front();
5215 : }
5216 :
5217 : // And in case we have saturated the number of threads,
5218 : // wait for completion of tasks to go below the threshold.
5219 1602 : while (eErr == CE_None &&
5220 801 : jobList.size() >= static_cast<size_t>(nThreads))
5221 : {
5222 0 : eErr = WaitAndFinalizeOldestJob(jobList);
5223 : }
5224 :
5225 : // (Re)allocate buffers if needed
5226 801 : if (pChunk == nullptr)
5227 : {
5228 796 : pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
5229 : nMaxChunkYSizeQueried, nWidth);
5230 : }
5231 801 : if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
5232 : {
5233 : pabyChunkNodataMask = static_cast<GByte *>(
5234 283 : VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
5235 : }
5236 :
5237 801 : if (pChunk == nullptr ||
5238 283 : (bUseNoDataMask && pabyChunkNodataMask == nullptr))
5239 : {
5240 0 : CPLFree(pChunk);
5241 0 : CPLFree(pabyChunkNodataMask);
5242 0 : return CE_Failure;
5243 : }
5244 :
5245 : // Read chunk.
5246 801 : if (eErr == CE_None)
5247 801 : eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
5248 : nChunkYSizeQueried, pChunk, nWidth,
5249 : nChunkYSizeQueried, eWrkDataType, 0, 0,
5250 : nullptr);
5251 801 : if (eErr == CE_None && bUseNoDataMask)
5252 283 : eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
5253 : nChunkYSizeQueried, pabyChunkNodataMask,
5254 : nWidth, nChunkYSizeQueried, GDT_Byte, 0,
5255 : 0, nullptr);
5256 :
5257 : // Special case to promote 1bit data to 8bit 0/255 values.
5258 801 : if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
5259 : {
5260 9 : if (eWrkDataType == GDT_Float32)
5261 : {
5262 0 : float *pafChunk = static_cast<float *>(pChunk);
5263 0 : for (size_t i = 0;
5264 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5265 : {
5266 0 : if (pafChunk[i] == 1.0f)
5267 0 : pafChunk[i] = 255.0f;
5268 : }
5269 : }
5270 9 : else if (eWrkDataType == GDT_Byte)
5271 : {
5272 9 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
5273 168417 : for (size_t i = 0;
5274 168417 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5275 : {
5276 168408 : if (pabyChunk[i] == 1)
5277 127437 : pabyChunk[i] = 255;
5278 : }
5279 : }
5280 0 : else if (eWrkDataType == GDT_UInt16)
5281 : {
5282 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
5283 0 : for (size_t i = 0;
5284 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5285 : {
5286 0 : if (pasChunk[i] == 1)
5287 0 : pasChunk[i] = 255;
5288 : }
5289 : }
5290 0 : else if (eWrkDataType == GDT_Float64)
5291 : {
5292 0 : double *padfChunk = static_cast<double *>(pChunk);
5293 0 : for (size_t i = 0;
5294 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5295 : {
5296 0 : if (padfChunk[i] == 1.0)
5297 0 : padfChunk[i] = 255.0;
5298 : }
5299 : }
5300 : else
5301 : {
5302 0 : CPLAssert(false);
5303 : }
5304 : }
5305 792 : else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
5306 : {
5307 0 : if (eWrkDataType == GDT_Float32)
5308 : {
5309 0 : float *pafChunk = static_cast<float *>(pChunk);
5310 0 : for (size_t i = 0;
5311 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5312 : {
5313 0 : if (pafChunk[i] == 1.0f)
5314 0 : pafChunk[i] = 0.0f;
5315 0 : else if (pafChunk[i] == 0.0f)
5316 0 : pafChunk[i] = 255.0f;
5317 : }
5318 : }
5319 0 : else if (eWrkDataType == GDT_Byte)
5320 : {
5321 0 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
5322 0 : for (size_t i = 0;
5323 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5324 : {
5325 0 : if (pabyChunk[i] == 1)
5326 0 : pabyChunk[i] = 0;
5327 0 : else if (pabyChunk[i] == 0)
5328 0 : pabyChunk[i] = 255;
5329 : }
5330 : }
5331 0 : else if (eWrkDataType == GDT_UInt16)
5332 : {
5333 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
5334 0 : for (size_t i = 0;
5335 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5336 : {
5337 0 : if (pasChunk[i] == 1)
5338 0 : pasChunk[i] = 0;
5339 0 : else if (pasChunk[i] == 0)
5340 0 : pasChunk[i] = 255;
5341 : }
5342 : }
5343 0 : else if (eWrkDataType == GDT_Float64)
5344 : {
5345 0 : double *padfChunk = static_cast<double *>(pChunk);
5346 0 : for (size_t i = 0;
5347 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5348 : {
5349 0 : if (padfChunk[i] == 1.0)
5350 0 : padfChunk[i] = 0.0;
5351 0 : else if (padfChunk[i] == 0.0)
5352 0 : padfChunk[i] = 255.0;
5353 : }
5354 : }
5355 : else
5356 : {
5357 0 : CPLAssert(false);
5358 : }
5359 : }
5360 :
5361 : auto oSrcBufferHolder =
5362 1602 : std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
5363 : auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
5364 1602 : poJobQueue ? pabyChunkNodataMask : nullptr);
5365 :
5366 1697 : for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
5367 : ++iOverview)
5368 : {
5369 896 : GDALRasterBand *poDstBand = papoOvrBands[iOverview];
5370 896 : const int nDstWidth = poDstBand->GetXSize();
5371 896 : const int nDstHeight = poDstBand->GetYSize();
5372 :
5373 896 : const double dfXRatioDstToSrc =
5374 896 : static_cast<double>(nWidth) / nDstWidth;
5375 896 : const double dfYRatioDstToSrc =
5376 896 : static_cast<double>(nHeight) / nDstHeight;
5377 :
5378 : /* --------------------------------------------------------------------
5379 : */
5380 : /* Figure out the line to start writing to, and the first line
5381 : */
5382 : /* to not write to. In theory this approach should ensure that
5383 : */
5384 : /* every output line will be written if all input chunks are */
5385 : /* processed. */
5386 : /* --------------------------------------------------------------------
5387 : */
5388 896 : int nDstYOff =
5389 896 : static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
5390 896 : if (nDstYOff == nDstHeight)
5391 0 : continue;
5392 896 : int nDstYOff2 = static_cast<int>(
5393 896 : 0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
5394 :
5395 896 : if (nChunkYOff + nFullResYChunk == nHeight)
5396 889 : nDstYOff2 = nDstHeight;
5397 : #if DEBUG_VERBOSE
5398 : CPLDebug("GDAL",
5399 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
5400 : nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
5401 : nDstWidth, nDstYOff2 - nDstYOff);
5402 : #endif
5403 :
5404 1792 : auto poJob = std::make_unique<OvrJob>();
5405 896 : poJob->pfnResampleFn = pfnResampleFn;
5406 896 : poJob->bUseGenericResampleFn = bUseGenericResampleFn;
5407 896 : poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
5408 896 : poJob->args.nOvrXSize = poDstBand->GetXSize();
5409 896 : poJob->args.nOvrYSize = poDstBand->GetYSize();
5410 : const char *pszNBITS =
5411 896 : poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
5412 896 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
5413 896 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
5414 896 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
5415 896 : poJob->args.eWrkDataType = eWrkDataType;
5416 896 : poJob->pChunk = pChunk;
5417 896 : poJob->args.pabyChunkNodataMask = pabyChunkNodataMask;
5418 896 : poJob->nSrcWidth = nWidth;
5419 896 : poJob->nSrcHeight = nHeight;
5420 896 : poJob->args.nChunkXOff = 0;
5421 896 : poJob->args.nChunkXSize = nWidth;
5422 896 : poJob->args.nChunkYOff = nChunkYOffQueried;
5423 896 : poJob->args.nChunkYSize = nChunkYSizeQueried;
5424 896 : poJob->nDstWidth = nDstWidth;
5425 896 : poJob->args.nDstXOff = 0;
5426 896 : poJob->args.nDstXOff2 = nDstWidth;
5427 896 : poJob->args.nDstYOff = nDstYOff;
5428 896 : poJob->args.nDstYOff2 = nDstYOff2;
5429 896 : poJob->poDstBand = poDstBand;
5430 896 : poJob->args.pszResampling = pszResampling;
5431 896 : poJob->args.bHasNoData = bHasNoData;
5432 896 : poJob->args.dfNoDataValue = dfNoDataValue;
5433 896 : poJob->args.poColorTable = poColorTable;
5434 896 : poJob->args.eSrcDataType = eSrcDataType;
5435 896 : poJob->args.bPropagateNoData = bPropagateNoData;
5436 :
5437 896 : if (poJobQueue)
5438 : {
5439 0 : poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
5440 0 : poJob->SetSrcBufferHolder(oSrcBufferHolder);
5441 0 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
5442 0 : jobList.emplace_back(std::move(poJob));
5443 : }
5444 : else
5445 : {
5446 896 : JobResampleFunc(poJob.get());
5447 896 : eErr = poJob->eErr;
5448 896 : if (eErr == CE_None)
5449 : {
5450 896 : eErr = WriteJobData(poJob.get());
5451 : }
5452 : }
5453 : }
5454 :
5455 801 : if (poJobQueue)
5456 : {
5457 0 : pChunk = nullptr;
5458 0 : pabyChunkNodataMask = nullptr;
5459 : }
5460 : }
5461 :
5462 796 : VSIFree(pChunk);
5463 796 : VSIFree(pabyChunkNodataMask);
5464 :
5465 : // Wait for all pending jobs to complete
5466 796 : while (!jobList.empty())
5467 : {
5468 0 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5469 0 : if (l_eErr != CE_None && eErr == CE_None)
5470 0 : eErr = l_eErr;
5471 : }
5472 :
5473 : /* -------------------------------------------------------------------- */
5474 : /* Renormalized overview mean / stddev if needed. */
5475 : /* -------------------------------------------------------------------- */
5476 796 : if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
5477 : {
5478 0 : GDALOverviewMagnitudeCorrection(
5479 : poSrcBand, nOverviewCount,
5480 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
5481 : GDALDummyProgress, nullptr);
5482 : }
5483 :
5484 : /* -------------------------------------------------------------------- */
5485 : /* It can be important to flush out data to overviews. */
5486 : /* -------------------------------------------------------------------- */
5487 1685 : for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
5488 : ++iOverview)
5489 : {
5490 889 : eErr = papoOvrBands[iOverview]->FlushCache(false);
5491 : }
5492 :
5493 796 : if (eErr == CE_None)
5494 796 : pfnProgress(1.0, nullptr, pProgressData);
5495 :
5496 796 : return eErr;
5497 : }
5498 :
5499 : /************************************************************************/
5500 : /* GDALRegenerateOverviewsMultiBand() */
5501 : /************************************************************************/
5502 :
5503 : /**
5504 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5505 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5506 : *
5507 : * This function will generate one or more overview images from a base
5508 : * image using the requested downsampling algorithm. Its primary use
5509 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
5510 : * can also be used to generate downsampled images in one file from another
5511 : * outside the overview architecture.
5512 : *
5513 : * The output bands need to exist in advance and share the same characteristics
5514 : * (type, dimensions)
5515 : *
5516 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5517 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE"
5518 : *
5519 : * It does not support color tables or complex data types.
5520 : *
5521 : * The pseudo-algorithm used by the function is :
5522 : * for each overview
5523 : * iterate on lines of the source by a step of deltay
5524 : * iterate on columns of the source by a step of deltax
5525 : * read the source data of size deltax * deltay for all the bands
5526 : * generate the corresponding overview block for all the bands
5527 : *
5528 : * This function will honour properly NODATA_VALUES tuples (special dataset
5529 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5530 : * considered as the nodata value and not each value of the triplet
5531 : * independently per band.
5532 : *
5533 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
5534 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
5535 : * overview computation.
5536 : *
5537 : * @param nBands the number of bands, size of papoSrcBands and size of
5538 : * first dimension of papapoOverviewBands
5539 : * @param papoSrcBands the list of source bands to downsample
5540 : * @param nOverviews the number of downsampled overview levels being generated.
5541 : * @param papapoOverviewBands two-dimensional array of bands. First dimension is
5542 : * indexed by nBands. Second dimension is indexed by
5543 : * nOverviews.
5544 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5545 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
5546 : * @param pfnProgress progress report function.
5547 : * @param pProgressData progress function callback data.
5548 : * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
5549 : * key=value pairs, or NULL
5550 : * Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
5551 : * options can be specified to express that overviews should
5552 : * be regenerated only in the specified subset of the source
5553 : * dataset.
5554 : * @return CE_None on success or CE_Failure on failure.
5555 : */
5556 :
5557 388 : CPLErr GDALRegenerateOverviewsMultiBand(
5558 : int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
5559 : GDALRasterBand *const *const *papapoOverviewBands,
5560 : const char *pszResampling, GDALProgressFunc pfnProgress,
5561 : void *pProgressData, CSLConstList papszOptions)
5562 : {
5563 388 : CPL_IGNORE_RET_VAL(papszOptions);
5564 :
5565 388 : if (pfnProgress == nullptr)
5566 11 : pfnProgress = GDALDummyProgress;
5567 :
5568 388 : if (EQUAL(pszResampling, "NONE") || nBands == 0 || nOverviews == 0)
5569 3 : return CE_None;
5570 :
5571 : // Sanity checks.
5572 385 : if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
5573 191 : !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
5574 82 : !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
5575 22 : !EQUAL(pszResampling, "CUBICSPLINE") &&
5576 21 : !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
5577 5 : !EQUAL(pszResampling, "MODE"))
5578 : {
5579 0 : CPLError(CE_Failure, CPLE_NotSupported,
5580 : "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
5581 : "not supported",
5582 : pszResampling);
5583 0 : return CE_Failure;
5584 : }
5585 :
5586 385 : int nKernelRadius = 0;
5587 : GDALResampleFunction pfnResampleFn =
5588 385 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
5589 385 : if (pfnResampleFn == nullptr)
5590 0 : return CE_Failure;
5591 :
5592 385 : const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
5593 385 : const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
5594 385 : if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
5595 0 : return CE_None;
5596 385 : GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
5597 66232 : for (int iBand = 1; iBand < nBands; ++iBand)
5598 : {
5599 131694 : if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
5600 65847 : papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
5601 : {
5602 0 : CPLError(
5603 : CE_Failure, CPLE_NotSupported,
5604 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5605 : "have the same dimensions");
5606 0 : return CE_Failure;
5607 : }
5608 65847 : if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
5609 : {
5610 0 : CPLError(
5611 : CE_Failure, CPLE_NotSupported,
5612 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5613 : "have the same data type");
5614 0 : return CE_Failure;
5615 : }
5616 : }
5617 :
5618 1031 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5619 : {
5620 646 : const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
5621 646 : const int nDstWidth = poOvrFirstBand->GetXSize();
5622 646 : const int nDstHeight = poOvrFirstBand->GetYSize();
5623 66759 : for (int iBand = 1; iBand < nBands; ++iBand)
5624 : {
5625 66113 : const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
5626 132226 : if (poOvrBand->GetXSize() != nDstWidth ||
5627 66113 : poOvrBand->GetYSize() != nDstHeight)
5628 : {
5629 0 : CPLError(
5630 : CE_Failure, CPLE_NotSupported,
5631 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5632 : "of the same level must have the same dimensions");
5633 0 : return CE_Failure;
5634 : }
5635 66113 : if (poOvrBand->GetRasterDataType() != eDataType)
5636 : {
5637 0 : CPLError(
5638 : CE_Failure, CPLE_NotSupported,
5639 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5640 : "must have the same data type as the source bands");
5641 0 : return CE_Failure;
5642 : }
5643 : }
5644 : }
5645 :
5646 : // First pass to compute the total number of pixels to write.
5647 385 : double dfTotalPixelCount = 0;
5648 385 : const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
5649 385 : const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
5650 385 : const int nSrcXSize = atoi(CSLFetchNameValueDef(
5651 : papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
5652 385 : const int nSrcYSize = atoi(CSLFetchNameValueDef(
5653 : papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
5654 1031 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5655 : {
5656 646 : dfTotalPixelCount +=
5657 1292 : static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
5658 646 : papapoOverviewBands[0][iOverview]->GetXSize() *
5659 1292 : static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
5660 646 : papapoOverviewBands[0][iOverview]->GetYSize();
5661 : }
5662 :
5663 : const GDALDataType eWrkDataType =
5664 385 : GDALGetOvrWorkDataType(pszResampling, eDataType);
5665 : const int nWrkDataTypeSize =
5666 385 : std::max(1, GDALGetDataTypeSizeBytes(eWrkDataType));
5667 :
5668 385 : const bool bIsMask = papoSrcBands[0]->IsMaskBand();
5669 :
5670 : // If we have a nodata mask and we are doing something more complicated
5671 : // than nearest neighbouring, we have to fetch to nodata mask.
5672 : const bool bUseNoDataMask =
5673 568 : !STARTS_WITH_CI(pszResampling, "NEAR") &&
5674 183 : (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
5675 :
5676 770 : std::vector<bool> abHasNoData(nBands);
5677 770 : std::vector<double> adfNoDataValue(nBands);
5678 :
5679 66617 : for (int iBand = 0; iBand < nBands; ++iBand)
5680 : {
5681 66232 : int nHasNoData = 0;
5682 132464 : adfNoDataValue[iBand] =
5683 66232 : papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
5684 66232 : abHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
5685 : }
5686 : const bool bPropagateNoData =
5687 385 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5688 :
5689 385 : const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
5690 1540 : const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
5691 385 : ? CPLGetNumCPUs()
5692 385 : : atoi(pszThreads)));
5693 : auto poThreadPool =
5694 385 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5695 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5696 770 : : std::unique_ptr<CPLJobQueue>(nullptr);
5697 :
5698 : // Only configurable for debug / testing
5699 385 : const GIntBig nChunkMaxSize = []() -> GIntBig
5700 : {
5701 : const char *pszVal =
5702 385 : CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", nullptr);
5703 385 : if (pszVal)
5704 : {
5705 15 : GIntBig nRet = 0;
5706 15 : CPLParseMemorySize(pszVal, &nRet, nullptr);
5707 15 : return std::max<GIntBig>(100, nRet);
5708 : }
5709 370 : return 10 * 1024 * 1024;
5710 385 : }();
5711 :
5712 : // Only configurable for debug / testing
5713 385 : const GIntBig nChunkMaxSizeForTempFile = []() -> GIntBig
5714 : {
5715 385 : const char *pszVal = CPLGetConfigOption(
5716 : "GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE", nullptr);
5717 385 : if (pszVal)
5718 : {
5719 14 : GIntBig nRet = 0;
5720 14 : CPLParseMemorySize(pszVal, &nRet, nullptr);
5721 14 : return std::max<GIntBig>(100, nRet);
5722 : }
5723 371 : const auto nUsableRAM = CPLGetUsablePhysicalRAM();
5724 371 : if (nUsableRAM > 0)
5725 371 : return nUsableRAM / 10;
5726 : // Select a value to be able to at least downsample by 2 for a RGB
5727 : // 1024x1024 tiled output: (2 * 1024 + 2) * (2 * 1024 + 2) * 3 = 12 MB
5728 0 : return 100 * 1024 * 1024;
5729 385 : }();
5730 :
5731 : // Second pass to do the real job.
5732 385 : double dfCurPixelCount = 0;
5733 385 : CPLErr eErr = CE_None;
5734 1025 : for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
5735 : ++iOverview)
5736 : {
5737 645 : int iSrcOverview = -1; // -1 means the source bands.
5738 :
5739 : const int nDstTotalWidth =
5740 645 : papapoOverviewBands[0][iOverview]->GetXSize();
5741 : const int nDstTotalHeight =
5742 645 : papapoOverviewBands[0][iOverview]->GetYSize();
5743 :
5744 : // Compute the coordinates of the target region to refresh
5745 645 : constexpr double EPS = 1e-8;
5746 645 : const int nDstXOffStart = static_cast<int>(
5747 645 : static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
5748 : EPS);
5749 : const int nDstXOffEnd =
5750 1290 : std::min(static_cast<int>(
5751 645 : std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
5752 645 : nToplevelSrcWidth * nDstTotalWidth -
5753 : EPS)),
5754 645 : nDstTotalWidth);
5755 645 : const int nDstWidth = nDstXOffEnd - nDstXOffStart;
5756 645 : const int nDstYOffStart =
5757 645 : static_cast<int>(static_cast<double>(nSrcYOff) /
5758 645 : nToplevelSrcHeight * nDstTotalHeight +
5759 : EPS);
5760 : const int nDstYOffEnd =
5761 1290 : std::min(static_cast<int>(
5762 645 : std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
5763 645 : nToplevelSrcHeight * nDstTotalHeight -
5764 : EPS)),
5765 645 : nDstTotalHeight);
5766 645 : const int nDstHeight = nDstYOffEnd - nDstYOffStart;
5767 :
5768 : // Try to use previous level of overview as the source to compute
5769 : // the next level.
5770 645 : int nSrcWidth = nToplevelSrcWidth;
5771 645 : int nSrcHeight = nToplevelSrcHeight;
5772 905 : if (iOverview > 0 &&
5773 260 : papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
5774 : {
5775 252 : nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
5776 252 : nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
5777 252 : iSrcOverview = iOverview - 1;
5778 : }
5779 :
5780 645 : const double dfXRatioDstToSrc =
5781 645 : static_cast<double>(nSrcWidth) / nDstTotalWidth;
5782 645 : const double dfYRatioDstToSrc =
5783 645 : static_cast<double>(nSrcHeight) / nDstTotalHeight;
5784 :
5785 : const int nOvrFactor =
5786 1935 : std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
5787 645 : static_cast<int>(0.5 + dfYRatioDstToSrc)));
5788 :
5789 645 : int nDstChunkXSize = 0;
5790 645 : int nDstChunkYSize = 0;
5791 645 : papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
5792 : &nDstChunkYSize);
5793 :
5794 645 : constexpr int PIXEL_MARGIN = 2;
5795 : // Try to extend the chunk size so that the memory needed to acquire
5796 : // source pixels goes up to 10 MB.
5797 : // This can help for drivers that support multi-threaded reading
5798 645 : const int nFullResYChunk = static_cast<int>(std::min<double>(
5799 645 : nSrcHeight, PIXEL_MARGIN + nDstChunkYSize * dfYRatioDstToSrc));
5800 645 : const int nFullResYChunkQueried = static_cast<int>(std::min<int64_t>(
5801 1290 : nSrcHeight,
5802 1290 : nFullResYChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5803 645 : nKernelRadius * nOvrFactor));
5804 881 : while (nDstChunkXSize < nDstWidth)
5805 : {
5806 255 : constexpr int INCREASE_FACTOR = 2;
5807 :
5808 255 : const int nFullResXChunk = static_cast<int>(std::min<double>(
5809 510 : nSrcWidth, PIXEL_MARGIN + INCREASE_FACTOR * nDstChunkXSize *
5810 255 : dfXRatioDstToSrc));
5811 :
5812 : const int nFullResXChunkQueried =
5813 255 : static_cast<int>(std::min<int64_t>(
5814 510 : nSrcWidth,
5815 510 : nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5816 255 : nKernelRadius * nOvrFactor));
5817 :
5818 255 : if (nBands > nChunkMaxSize / nFullResXChunkQueried /
5819 255 : nFullResYChunkQueried / nWrkDataTypeSize)
5820 : {
5821 19 : break;
5822 : }
5823 :
5824 236 : nDstChunkXSize *= INCREASE_FACTOR;
5825 : }
5826 645 : nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
5827 :
5828 645 : const int nFullResXChunk = static_cast<int>(std::min<double>(
5829 645 : nSrcWidth, PIXEL_MARGIN + nDstChunkXSize * dfXRatioDstToSrc));
5830 645 : const int nFullResXChunkQueried = static_cast<int>(std::min<int64_t>(
5831 1290 : nSrcWidth,
5832 1290 : nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5833 645 : nKernelRadius * nOvrFactor));
5834 :
5835 : // Make sure that the RAM requirements to acquire the source data does
5836 : // not exceed nChunkMaxSizeForTempFile
5837 : // If so, reduce the destination chunk size, generate overviews in a
5838 : // temporary dataset, and copy that temporary dataset over the target
5839 : // overview bands (to avoid issues with lossy compression)
5840 : const bool bOverflowFullResXChunkYChunkQueried =
5841 645 : nBands > std::numeric_limits<int64_t>::max() /
5842 645 : nFullResXChunkQueried / nFullResYChunkQueried /
5843 645 : nWrkDataTypeSize;
5844 :
5845 645 : const auto nMemRequirement =
5846 : bOverflowFullResXChunkYChunkQueried
5847 645 : ? 0
5848 641 : : static_cast<GIntBig>(nFullResXChunkQueried) *
5849 641 : nFullResYChunkQueried * nBands * nWrkDataTypeSize;
5850 : // Use a temporary dataset with a smaller destination chunk size
5851 645 : const auto nOverShootFactor =
5852 : nMemRequirement / nChunkMaxSizeForTempFile;
5853 :
5854 645 : constexpr int MIN_OVERSHOOT_FACTOR = 4;
5855 : const auto nSqrtOverShootFactor = std::max<GIntBig>(
5856 1290 : MIN_OVERSHOOT_FACTOR, static_cast<GIntBig>(std::ceil(std::sqrt(
5857 645 : static_cast<double>(nOverShootFactor)))));
5858 645 : constexpr int DEFAULT_CHUNK_SIZE = 256;
5859 645 : constexpr int GTIFF_BLOCK_SIZE_MULTIPLE = 16;
5860 : const int nReducedDstChunkXSize =
5861 : bOverflowFullResXChunkYChunkQueried
5862 1286 : ? DEFAULT_CHUNK_SIZE
5863 1286 : : std::max(1, static_cast<int>(nDstChunkXSize /
5864 1286 : nSqrtOverShootFactor) &
5865 641 : ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
5866 : const int nReducedDstChunkYSize =
5867 : bOverflowFullResXChunkYChunkQueried
5868 1286 : ? DEFAULT_CHUNK_SIZE
5869 1286 : : std::max(1, static_cast<int>(nDstChunkYSize /
5870 1286 : nSqrtOverShootFactor) &
5871 641 : ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
5872 :
5873 645 : if (bOverflowFullResXChunkYChunkQueried ||
5874 : nMemRequirement > nChunkMaxSizeForTempFile)
5875 : {
5876 : const auto nDTSize =
5877 43 : std::max(1, GDALGetDataTypeSizeBytes(eDataType));
5878 : const bool bTmpDSMemRequirementOverflow =
5879 43 : nBands > std::numeric_limits<int64_t>::max() / nDstWidth /
5880 43 : nDstHeight / nDTSize;
5881 43 : const auto nTmpDSMemRequirement =
5882 : bTmpDSMemRequirementOverflow
5883 43 : ? 0
5884 41 : : static_cast<GIntBig>(nDstWidth) * nDstHeight * nBands *
5885 41 : nDTSize;
5886 :
5887 : // make sure that one band buffer doesn't overflow size_t
5888 : const bool bChunkSizeOverflow =
5889 43 : static_cast<size_t>(nDTSize) >
5890 43 : std::numeric_limits<size_t>::max() / nDstWidth / nDstHeight;
5891 43 : const size_t nChunkSize =
5892 : bChunkSizeOverflow
5893 43 : ? 0
5894 41 : : static_cast<size_t>(nDstWidth) * nDstHeight * nDTSize;
5895 :
5896 : const auto CreateVRT =
5897 41 : [nBands, nSrcWidth, nSrcHeight, nDstTotalWidth, nDstTotalHeight,
5898 : pszResampling, eWrkDataType, papoSrcBands, papapoOverviewBands,
5899 : iSrcOverview, &abHasNoData,
5900 393585 : &adfNoDataValue](int nVRTBlockXSize, int nVRTBlockYSize)
5901 : {
5902 : auto poVRTDS = std::make_unique<VRTDataset>(
5903 41 : nDstTotalWidth, nDstTotalHeight, nVRTBlockXSize,
5904 41 : nVRTBlockYSize);
5905 :
5906 65620 : for (int iBand = 0; iBand < nBands; ++iBand)
5907 : {
5908 131158 : auto poVRTSrc = std::make_unique<VRTSimpleSource>();
5909 65579 : poVRTSrc->SetResampling(pszResampling);
5910 65579 : poVRTDS->AddBand(eWrkDataType);
5911 : auto poVRTBand = static_cast<VRTSourcedRasterBand *>(
5912 65579 : poVRTDS->GetRasterBand(iBand + 1));
5913 :
5914 65579 : auto poSrcBand = papoSrcBands[iBand];
5915 65579 : if (iSrcOverview != -1)
5916 24 : poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
5917 65579 : poVRTBand->ConfigureSource(
5918 : poVRTSrc.get(), poSrcBand, false, 0, 0, nSrcWidth,
5919 : nSrcHeight, 0, 0, nDstTotalWidth, nDstTotalHeight);
5920 : // Add the source to the band
5921 65579 : poVRTBand->AddSource(poVRTSrc.release());
5922 65579 : if (abHasNoData[iBand])
5923 3 : poVRTBand->SetNoDataValue(adfNoDataValue[iBand]);
5924 : }
5925 :
5926 42 : if (papoSrcBands[0]->GetMaskFlags() == GMF_PER_DATASET &&
5927 1 : poVRTDS->CreateMaskBand(GMF_PER_DATASET) == CE_None)
5928 : {
5929 : VRTSourcedRasterBand *poMaskVRTBand =
5930 1 : cpl::down_cast<VRTSourcedRasterBand *>(
5931 1 : poVRTDS->GetRasterBand(1)->GetMaskBand());
5932 1 : auto poSrcBand = papoSrcBands[0];
5933 1 : if (iSrcOverview != -1)
5934 0 : poSrcBand = papapoOverviewBands[0][iSrcOverview];
5935 1 : poMaskVRTBand->AddMaskBandSource(
5936 1 : poSrcBand->GetMaskBand(), 0, 0, nSrcWidth, nSrcHeight,
5937 : 0, 0, nDstTotalWidth, nDstTotalHeight);
5938 : }
5939 :
5940 41 : return poVRTDS;
5941 43 : };
5942 :
5943 : // If the overview accommodates chunking, do so and recurse
5944 : // to avoid generating full size temporary files
5945 43 : if (!bOverflowFullResXChunkYChunkQueried &&
5946 39 : !bTmpDSMemRequirementOverflow && !bChunkSizeOverflow &&
5947 39 : (nDstChunkXSize < nDstWidth || nDstChunkYSize < nDstHeight))
5948 : {
5949 : // Create a VRT with the smaller chunk to do the scaling
5950 : auto poVRTDS =
5951 13 : CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
5952 :
5953 13 : std::vector<GDALRasterBand *> apoVRTBand(nBands);
5954 13 : std::vector<GDALRasterBand *> apoDstBand(nBands);
5955 65560 : for (int iBand = 0; iBand < nBands; ++iBand)
5956 : {
5957 65547 : apoDstBand[iBand] = papapoOverviewBands[iBand][iOverview];
5958 65547 : apoVRTBand[iBand] = poVRTDS->GetRasterBand(iBand + 1);
5959 : }
5960 :
5961 : // Use a flag to avoid reading from the overview being built
5962 : GDALRasterIOExtraArg sExtraArg;
5963 13 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5964 13 : if (iSrcOverview == -1)
5965 13 : sExtraArg.bUseOnlyThisScale = true;
5966 :
5967 : // A single band buffer for data transfer to the overview
5968 13 : std::vector<GByte> abyChunk;
5969 : try
5970 : {
5971 13 : abyChunk.resize(nChunkSize);
5972 : }
5973 0 : catch (const std::exception &)
5974 : {
5975 0 : CPLError(CE_Failure, CPLE_OutOfMemory,
5976 : "Out of memory allocating temporary buffer");
5977 0 : return CE_Failure;
5978 : }
5979 :
5980 : // Loop over output height, in chunks
5981 13 : for (int nDstYOff = nDstYOffStart;
5982 38 : nDstYOff < nDstYOffEnd && eErr == CE_None;
5983 : /* */)
5984 : {
5985 : const int nDstYCount =
5986 25 : std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
5987 : // Loop over output width, in output chunks
5988 25 : for (int nDstXOff = nDstXOffStart;
5989 74 : nDstXOff < nDstXOffEnd && eErr == CE_None;
5990 : /* */)
5991 : {
5992 : const int nDstXCount =
5993 49 : std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
5994 : // Read and transfer the chunk to the overview
5995 98 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
5996 : ++iBand)
5997 : {
5998 98 : eErr = apoVRTBand[iBand]->RasterIO(
5999 : GF_Read, nDstXOff, nDstYOff, nDstXCount,
6000 49 : nDstYCount, abyChunk.data(), nDstXCount,
6001 : nDstYCount, eDataType, 0, 0, &sExtraArg);
6002 49 : if (eErr == CE_None)
6003 : {
6004 96 : eErr = apoDstBand[iBand]->RasterIO(
6005 : GF_Write, nDstXOff, nDstYOff, nDstXCount,
6006 48 : nDstYCount, abyChunk.data(), nDstXCount,
6007 : nDstYCount, eDataType, 0, 0, nullptr);
6008 : }
6009 : }
6010 :
6011 49 : dfCurPixelCount +=
6012 49 : static_cast<double>(nDstXCount) * nDstYCount;
6013 :
6014 49 : nDstXOff += nDstXCount;
6015 : } // width
6016 :
6017 25 : if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount,
6018 : nullptr, pProgressData))
6019 : {
6020 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
6021 : "User terminated");
6022 0 : eErr = CE_Failure;
6023 : }
6024 :
6025 25 : nDstYOff += nDstYCount;
6026 : } // height
6027 :
6028 13 : if (CE_None != eErr)
6029 : {
6030 1 : CPLError(CE_Failure, CPLE_AppDefined,
6031 : "Error while writing overview");
6032 1 : return CE_Failure;
6033 : }
6034 :
6035 12 : pfnProgress(1.0, nullptr, pProgressData);
6036 : // Flush the overviews we just generated
6037 24 : for (int iBand = 0; iBand < nBands; ++iBand)
6038 12 : apoDstBand[iBand]->FlushCache(false);
6039 :
6040 12 : continue; // Next overview
6041 : } // chunking via temporary dataset
6042 :
6043 0 : std::unique_ptr<GDALDataset> poTmpDS;
6044 : // Config option mostly/only for autotest purposes
6045 : const char *pszGDAL_OVR_TEMP_DRIVER =
6046 30 : CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
6047 30 : if ((!bTmpDSMemRequirementOverflow &&
6048 4 : nTmpDSMemRequirement <= nChunkMaxSizeForTempFile &&
6049 4 : !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
6050 26 : EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
6051 : {
6052 10 : auto poTmpDrv = GetGDALDriverManager()->GetDriverByName("MEM");
6053 10 : if (!poTmpDrv)
6054 : {
6055 0 : eErr = CE_Failure;
6056 0 : break;
6057 : }
6058 10 : poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
6059 : nDstTotalHeight, nBands,
6060 10 : eDataType, nullptr));
6061 : }
6062 : else
6063 : {
6064 : // Create a temporary file for the overview
6065 : auto poTmpDrv =
6066 20 : GetGDALDriverManager()->GetDriverByName("GTiff");
6067 20 : if (!poTmpDrv)
6068 : {
6069 0 : eErr = CE_Failure;
6070 0 : break;
6071 : }
6072 40 : std::string osTmpFilename;
6073 20 : auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
6074 20 : if (poDstDS)
6075 : {
6076 20 : osTmpFilename = poDstDS->GetDescription();
6077 : VSIStatBufL sStatBuf;
6078 20 : if (!osTmpFilename.empty() &&
6079 0 : VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
6080 0 : osTmpFilename += "_tmp_ovr.tif";
6081 : }
6082 20 : if (osTmpFilename.empty())
6083 : {
6084 20 : osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
6085 20 : osTmpFilename += ".tif";
6086 : }
6087 20 : CPLDebug("GDAL", "Creating temporary file %s of %d x %d x %d",
6088 : osTmpFilename.c_str(), nDstWidth, nDstHeight, nBands);
6089 40 : CPLStringList aosCO;
6090 20 : if (0 == ((nReducedDstChunkXSize % GTIFF_BLOCK_SIZE_MULTIPLE) |
6091 20 : (nReducedDstChunkYSize % GTIFF_BLOCK_SIZE_MULTIPLE)))
6092 : {
6093 14 : aosCO.SetNameValue("TILED", "YES");
6094 : aosCO.SetNameValue("BLOCKXSIZE",
6095 14 : CPLSPrintf("%d", nReducedDstChunkXSize));
6096 : aosCO.SetNameValue("BLOCKYSIZE",
6097 14 : CPLSPrintf("%d", nReducedDstChunkYSize));
6098 : }
6099 20 : if (const char *pszCOList =
6100 20 : poTmpDrv->GetMetadataItem(GDAL_DMD_CREATIONOPTIONLIST))
6101 : {
6102 : aosCO.SetNameValue(
6103 20 : "COMPRESS", strstr(pszCOList, "ZSTD") ? "ZSTD" : "LZW");
6104 : }
6105 20 : poTmpDS.reset(poTmpDrv->Create(osTmpFilename.c_str(), nDstWidth,
6106 : nDstHeight, nBands, eDataType,
6107 20 : aosCO.List()));
6108 20 : if (poTmpDS)
6109 : {
6110 18 : poTmpDS->MarkSuppressOnClose();
6111 18 : VSIUnlink(osTmpFilename.c_str());
6112 : }
6113 : }
6114 30 : if (!poTmpDS)
6115 : {
6116 2 : eErr = CE_Failure;
6117 2 : break;
6118 : }
6119 :
6120 : // Create a full size VRT to do the resampling without edge effects
6121 : auto poVRTDS =
6122 28 : CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
6123 :
6124 : // Allocate a band buffer with the overview chunk size
6125 : std::unique_ptr<void, VSIFreeReleaser> pDstBuffer(
6126 : VSI_MALLOC3_VERBOSE(size_t(nWrkDataTypeSize), nDstChunkXSize,
6127 28 : nDstChunkYSize));
6128 28 : if (pDstBuffer == nullptr)
6129 : {
6130 0 : eErr = CE_Failure;
6131 0 : break;
6132 : }
6133 :
6134 : // Use a flag to avoid reading the overview being built
6135 : GDALRasterIOExtraArg sExtraArg;
6136 28 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
6137 28 : if (iSrcOverview == -1)
6138 4 : sExtraArg.bUseOnlyThisScale = true;
6139 :
6140 : // Scale and copy data from the VRT to the temp file
6141 28 : for (int nDstYOff = nDstYOffStart;
6142 914 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6143 : /* */)
6144 : {
6145 : const int nDstYCount =
6146 886 : std::min(nReducedDstChunkYSize, nDstYOffEnd - nDstYOff);
6147 886 : for (int nDstXOff = nDstXOffStart;
6148 201218 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6149 : /* */)
6150 : {
6151 : const int nDstXCount =
6152 200332 : std::min(nReducedDstChunkXSize, nDstXOffEnd - nDstXOff);
6153 400668 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
6154 : ++iBand)
6155 : {
6156 200336 : auto poSrcBand = poVRTDS->GetRasterBand(iBand + 1);
6157 200336 : eErr = poSrcBand->RasterIO(
6158 : GF_Read, nDstXOff, nDstYOff, nDstXCount, nDstYCount,
6159 : pDstBuffer.get(), nDstXCount, nDstYCount,
6160 : eWrkDataType, 0, 0, &sExtraArg);
6161 200336 : if (eErr == CE_None)
6162 : {
6163 : // Write to the temporary dataset, shifted
6164 200334 : auto poOvrBand = poTmpDS->GetRasterBand(iBand + 1);
6165 200334 : eErr = poOvrBand->RasterIO(
6166 : GF_Write, nDstXOff - nDstXOffStart,
6167 : nDstYOff - nDstYOffStart, nDstXCount,
6168 : nDstYCount, pDstBuffer.get(), nDstXCount,
6169 : nDstYCount, eWrkDataType, 0, 0, nullptr);
6170 : }
6171 : }
6172 200332 : nDstXOff += nDstXCount;
6173 : }
6174 886 : nDstYOff += nDstYCount;
6175 : }
6176 :
6177 : // Copy from the temporary to the overview
6178 28 : for (int nDstYOff = nDstYOffStart;
6179 54 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6180 : /* */)
6181 : {
6182 : const int nDstYCount =
6183 26 : std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
6184 26 : for (int nDstXOff = nDstXOffStart;
6185 52 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6186 : /* */)
6187 : {
6188 : const int nDstXCount =
6189 26 : std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
6190 56 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
6191 : ++iBand)
6192 : {
6193 30 : auto poSrcBand = poTmpDS->GetRasterBand(iBand + 1);
6194 30 : eErr = poSrcBand->RasterIO(
6195 : GF_Read, nDstXOff - nDstXOffStart,
6196 : nDstYOff - nDstYOffStart, nDstXCount, nDstYCount,
6197 : pDstBuffer.get(), nDstXCount, nDstYCount,
6198 : eWrkDataType, 0, 0, nullptr);
6199 30 : if (eErr == CE_None)
6200 : {
6201 : // Write to the destination overview bands
6202 30 : auto poOvrBand =
6203 30 : papapoOverviewBands[iBand][iOverview];
6204 30 : eErr = poOvrBand->RasterIO(
6205 : GF_Write, nDstXOff, nDstYOff, nDstXCount,
6206 : nDstYCount, pDstBuffer.get(), nDstXCount,
6207 : nDstYCount, eWrkDataType, 0, 0, nullptr);
6208 : }
6209 : }
6210 26 : nDstXOff += nDstXCount;
6211 : }
6212 26 : nDstYOff += nDstYCount;
6213 : }
6214 :
6215 28 : if (eErr != CE_None)
6216 : {
6217 2 : CPLError(CE_Failure, CPLE_AppDefined,
6218 : "Failed to write overview %d", iOverview);
6219 2 : return eErr;
6220 : }
6221 :
6222 : // Flush the data to overviews.
6223 56 : for (int iBand = 0; iBand < nBands; ++iBand)
6224 30 : papapoOverviewBands[iBand][iOverview]->FlushCache(false);
6225 :
6226 26 : continue;
6227 : }
6228 :
6229 : // Structure describing a resampling job
6230 : struct OvrJob
6231 : {
6232 : // Buffers to free when job is finished
6233 : std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
6234 : std::unique_ptr<PointerHolder> oSrcBufferHolder{};
6235 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
6236 :
6237 : GDALRasterBand *poDstBand = nullptr;
6238 :
6239 : // Input parameters of pfnResampleFn
6240 : GDALResampleFunction pfnResampleFn = nullptr;
6241 : GDALOverviewResampleArgs args{};
6242 : const void *pChunk = nullptr;
6243 :
6244 : // Output values of resampling function
6245 : CPLErr eErr = CE_Failure;
6246 : void *pDstBuffer = nullptr;
6247 : GDALDataType eDstBufferDataType = GDT_Unknown;
6248 :
6249 3310 : void NotifyFinished()
6250 : {
6251 6620 : std::lock_guard guard(mutex);
6252 3310 : bFinished = true;
6253 3310 : cv.notify_one();
6254 3310 : }
6255 :
6256 2 : bool IsFinished()
6257 : {
6258 2 : std::lock_guard guard(mutex);
6259 4 : return bFinished;
6260 : }
6261 :
6262 16 : void WaitFinished()
6263 : {
6264 32 : std::unique_lock oGuard(mutex);
6265 21 : while (!bFinished)
6266 : {
6267 5 : cv.wait(oGuard);
6268 : }
6269 16 : }
6270 :
6271 : private:
6272 : // Synchronization
6273 : bool bFinished = false;
6274 : std::mutex mutex{};
6275 : std::condition_variable cv{};
6276 : };
6277 :
6278 : // Thread function to resample
6279 3310 : const auto JobResampleFunc = [](void *pData)
6280 : {
6281 3310 : OvrJob *poJob = static_cast<OvrJob *>(pData);
6282 :
6283 3310 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
6284 : &(poJob->pDstBuffer),
6285 : &(poJob->eDstBufferDataType));
6286 :
6287 3310 : poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
6288 :
6289 3310 : poJob->NotifyFinished();
6290 3310 : };
6291 :
6292 : // Function to write resample data to target band
6293 3310 : const auto WriteJobData = [](const OvrJob *poJob)
6294 : {
6295 6620 : return poJob->poDstBand->RasterIO(
6296 3310 : GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
6297 3310 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
6298 3310 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
6299 3310 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
6300 3310 : poJob->args.nDstYOff2 - poJob->args.nDstYOff,
6301 3310 : poJob->eDstBufferDataType, 0, 0, nullptr);
6302 : };
6303 :
6304 : // Wait for completion of oldest job and serialize it
6305 : const auto WaitAndFinalizeOldestJob =
6306 16 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
6307 : {
6308 16 : auto poOldestJob = jobList.front().get();
6309 16 : poOldestJob->WaitFinished();
6310 16 : CPLErr l_eErr = poOldestJob->eErr;
6311 16 : if (l_eErr == CE_None)
6312 : {
6313 16 : l_eErr = WriteJobData(poOldestJob);
6314 : }
6315 :
6316 16 : jobList.pop_front();
6317 16 : return l_eErr;
6318 : };
6319 :
6320 : // Queue of jobs
6321 1204 : std::list<std::unique_ptr<OvrJob>> jobList;
6322 :
6323 1204 : std::vector<std::unique_ptr<void, VSIFreeReleaser>> apaChunk(nBands);
6324 : std::vector<std::unique_ptr<GByte, VSIFreeReleaser>>
6325 1204 : apabyChunkNoDataMask(nBands);
6326 :
6327 : // Iterate on destination overview, block by block.
6328 602 : for (int nDstYOff = nDstYOffStart;
6329 2111 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6330 1509 : nDstYOff += nDstChunkYSize)
6331 : {
6332 : int nDstYCount;
6333 1509 : if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
6334 1099 : nDstYCount = nDstChunkYSize;
6335 : else
6336 410 : nDstYCount = nDstYOffEnd - nDstYOff;
6337 :
6338 1509 : int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
6339 1509 : int nChunkYOff2 = static_cast<int>(
6340 1509 : ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
6341 1509 : if (nChunkYOff2 > nSrcHeight ||
6342 1509 : nDstYOff + nDstYCount == nDstTotalHeight)
6343 595 : nChunkYOff2 = nSrcHeight;
6344 1509 : int nYCount = nChunkYOff2 - nChunkYOff;
6345 1509 : CPLAssert(nYCount <= nFullResYChunk);
6346 :
6347 1509 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
6348 1509 : int nChunkYSizeQueried =
6349 1509 : nYCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
6350 1509 : if (nChunkYOffQueried < 0)
6351 : {
6352 144 : nChunkYSizeQueried += nChunkYOffQueried;
6353 144 : nChunkYOffQueried = 0;
6354 : }
6355 1509 : if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
6356 143 : nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
6357 1509 : CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
6358 :
6359 1509 : if (!pfnProgress(std::min(1.0, dfCurPixelCount / dfTotalPixelCount),
6360 : nullptr, pProgressData))
6361 : {
6362 1 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6363 1 : eErr = CE_Failure;
6364 : }
6365 :
6366 : // Iterate on destination overview, block by block.
6367 1509 : for (int nDstXOff = nDstXOffStart;
6368 3057 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6369 1548 : nDstXOff += nDstChunkXSize)
6370 : {
6371 1548 : int nDstXCount = 0;
6372 1548 : if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
6373 1531 : nDstXCount = nDstChunkXSize;
6374 : else
6375 17 : nDstXCount = nDstXOffEnd - nDstXOff;
6376 :
6377 1548 : dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
6378 :
6379 1548 : int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
6380 1548 : int nChunkXOff2 = static_cast<int>(
6381 1548 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
6382 1548 : if (nChunkXOff2 > nSrcWidth ||
6383 1548 : nDstXOff + nDstXCount == nDstTotalWidth)
6384 1473 : nChunkXOff2 = nSrcWidth;
6385 1548 : const int nXCount = nChunkXOff2 - nChunkXOff;
6386 1548 : CPLAssert(nXCount <= nFullResXChunk);
6387 :
6388 1548 : int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
6389 1548 : int nChunkXSizeQueried =
6390 1548 : nXCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
6391 1548 : if (nChunkXOffQueried < 0)
6392 : {
6393 203 : nChunkXSizeQueried += nChunkXOffQueried;
6394 203 : nChunkXOffQueried = 0;
6395 : }
6396 1548 : if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
6397 212 : nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
6398 1548 : CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
6399 : #if DEBUG_VERBOSE
6400 : CPLDebug("GDAL",
6401 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
6402 : nChunkXOffQueried, nChunkYOffQueried,
6403 : nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
6404 : nDstYOff, nDstXCount, nDstYCount);
6405 : #endif
6406 :
6407 : // Avoid accumulating too many tasks and exhaust RAM
6408 :
6409 : // Try to complete already finished jobs
6410 1548 : while (eErr == CE_None && !jobList.empty())
6411 : {
6412 2 : auto poOldestJob = jobList.front().get();
6413 2 : if (!poOldestJob->IsFinished())
6414 2 : break;
6415 0 : eErr = poOldestJob->eErr;
6416 0 : if (eErr == CE_None)
6417 : {
6418 0 : eErr = WriteJobData(poOldestJob);
6419 : }
6420 :
6421 0 : jobList.pop_front();
6422 : }
6423 :
6424 : // And in case we have saturated the number of threads,
6425 : // wait for completion of tasks to go below the threshold.
6426 3096 : while (eErr == CE_None &&
6427 1548 : jobList.size() >= static_cast<size_t>(nThreads))
6428 : {
6429 0 : eErr = WaitAndFinalizeOldestJob(jobList);
6430 : }
6431 :
6432 : // Read the source buffers for all the bands.
6433 4859 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6434 : {
6435 : // (Re)allocate buffers if needed
6436 3311 : if (apaChunk[iBand] == nullptr)
6437 : {
6438 1179 : apaChunk[iBand].reset(VSI_MALLOC3_VERBOSE(
6439 : nFullResXChunkQueried, nFullResYChunkQueried,
6440 : nWrkDataTypeSize));
6441 1179 : if (apaChunk[iBand] == nullptr)
6442 : {
6443 0 : eErr = CE_Failure;
6444 : }
6445 : }
6446 3652 : if (bUseNoDataMask &&
6447 341 : apabyChunkNoDataMask[iBand] == nullptr)
6448 : {
6449 282 : apabyChunkNoDataMask[iBand].reset(
6450 282 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
6451 : nFullResXChunkQueried, nFullResYChunkQueried)));
6452 282 : if (apabyChunkNoDataMask[iBand] == nullptr)
6453 : {
6454 0 : eErr = CE_Failure;
6455 : }
6456 : }
6457 :
6458 3311 : if (eErr == CE_None)
6459 : {
6460 3311 : GDALRasterBand *poSrcBand = nullptr;
6461 3311 : if (iSrcOverview == -1)
6462 2409 : poSrcBand = papoSrcBands[iBand];
6463 : else
6464 902 : poSrcBand =
6465 902 : papapoOverviewBands[iBand][iSrcOverview];
6466 3311 : eErr = poSrcBand->RasterIO(
6467 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6468 : nChunkXSizeQueried, nChunkYSizeQueried,
6469 3311 : apaChunk[iBand].get(), nChunkXSizeQueried,
6470 : nChunkYSizeQueried, eWrkDataType, 0, 0, nullptr);
6471 :
6472 3311 : if (bUseNoDataMask && eErr == CE_None)
6473 : {
6474 341 : auto poMaskBand = poSrcBand->IsMaskBand()
6475 341 : ? poSrcBand
6476 262 : : poSrcBand->GetMaskBand();
6477 341 : eErr = poMaskBand->RasterIO(
6478 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6479 : nChunkXSizeQueried, nChunkYSizeQueried,
6480 341 : apabyChunkNoDataMask[iBand].get(),
6481 : nChunkXSizeQueried, nChunkYSizeQueried,
6482 : GDT_Byte, 0, 0, nullptr);
6483 : }
6484 : }
6485 : }
6486 :
6487 : // Compute the resulting overview block.
6488 4858 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6489 : {
6490 6620 : auto poJob = std::make_unique<OvrJob>();
6491 3310 : poJob->pfnResampleFn = pfnResampleFn;
6492 3310 : poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
6493 6620 : poJob->args.eOvrDataType =
6494 3310 : poJob->poDstBand->GetRasterDataType();
6495 3310 : poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
6496 3310 : poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
6497 3310 : const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
6498 3310 : "NBITS", "IMAGE_STRUCTURE");
6499 3310 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
6500 3310 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
6501 3310 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
6502 3310 : poJob->args.eWrkDataType = eWrkDataType;
6503 3310 : poJob->pChunk = apaChunk[iBand].get();
6504 3310 : poJob->args.pabyChunkNodataMask =
6505 3310 : apabyChunkNoDataMask[iBand].get();
6506 3310 : poJob->args.nChunkXOff = nChunkXOffQueried;
6507 3310 : poJob->args.nChunkXSize = nChunkXSizeQueried;
6508 3310 : poJob->args.nChunkYOff = nChunkYOffQueried;
6509 3310 : poJob->args.nChunkYSize = nChunkYSizeQueried;
6510 3310 : poJob->args.nDstXOff = nDstXOff;
6511 3310 : poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
6512 3310 : poJob->args.nDstYOff = nDstYOff;
6513 3310 : poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
6514 3310 : poJob->args.pszResampling = pszResampling;
6515 3310 : poJob->args.bHasNoData = abHasNoData[iBand];
6516 3310 : poJob->args.dfNoDataValue = adfNoDataValue[iBand];
6517 3310 : poJob->args.eSrcDataType = eDataType;
6518 3310 : poJob->args.bPropagateNoData = bPropagateNoData;
6519 :
6520 3310 : if (poJobQueue)
6521 : {
6522 32 : poJob->oSrcMaskBufferHolder.reset(new PointerHolder(
6523 16 : apabyChunkNoDataMask[iBand].release()));
6524 :
6525 32 : poJob->oSrcBufferHolder.reset(
6526 16 : new PointerHolder(apaChunk[iBand].release()));
6527 :
6528 16 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
6529 16 : jobList.emplace_back(std::move(poJob));
6530 : }
6531 : else
6532 : {
6533 3294 : JobResampleFunc(poJob.get());
6534 3294 : eErr = poJob->eErr;
6535 3294 : if (eErr == CE_None)
6536 : {
6537 3294 : eErr = WriteJobData(poJob.get());
6538 : }
6539 : }
6540 : }
6541 : }
6542 : }
6543 :
6544 : // Wait for all pending jobs to complete
6545 618 : while (!jobList.empty())
6546 : {
6547 16 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
6548 16 : if (l_eErr != CE_None && eErr == CE_None)
6549 0 : eErr = l_eErr;
6550 : }
6551 :
6552 : // Flush the data to overviews.
6553 1779 : for (int iBand = 0; iBand < nBands; ++iBand)
6554 : {
6555 1177 : if (papapoOverviewBands[iBand][iOverview]->FlushCache(false) !=
6556 : CE_None)
6557 0 : eErr = CE_Failure;
6558 : }
6559 : }
6560 :
6561 382 : if (eErr == CE_None)
6562 378 : pfnProgress(1.0, nullptr, pProgressData);
6563 :
6564 382 : return eErr;
6565 : }
6566 :
6567 : /************************************************************************/
6568 : /* GDALRegenerateOverviewsMultiBand() */
6569 : /************************************************************************/
6570 :
6571 : /**
6572 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
6573 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
6574 : *
6575 : * This function will generate one or more overview images from a base
6576 : * image using the requested downsampling algorithm. Its primary use
6577 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
6578 : * can also be used to generate downsampled images in one file from another
6579 : * outside the overview architecture.
6580 : *
6581 : * The output bands need to exist in advance and share the same characteristics
6582 : * (type, dimensions)
6583 : *
6584 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
6585 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
6586 : *
6587 : * It does not support color tables or complex data types.
6588 : *
6589 : * The pseudo-algorithm used by the function is :
6590 : * for each overview
6591 : * iterate on lines of the source by a step of deltay
6592 : * iterate on columns of the source by a step of deltax
6593 : * read the source data of size deltax * deltay for all the bands
6594 : * generate the corresponding overview block for all the bands
6595 : *
6596 : * This function will honour properly NODATA_VALUES tuples (special dataset
6597 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
6598 : * considered as the nodata value and not each value of the triplet
6599 : * independently per band.
6600 : *
6601 : * The GDAL_NUM_THREADS configuration option can be set
6602 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
6603 : * overview computation.
6604 : *
6605 : * @param apoSrcBands the list of source bands to downsample
6606 : * @param aapoOverviewBands two-dimensional array of bands. First dimension is
6607 : * indexed by bands. Second dimension is indexed by
6608 : * overview levels. All aapoOverviewBands[i] arrays
6609 : * must have the same size (i.e. same number of
6610 : * overviews)
6611 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
6612 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
6613 : * @param pfnProgress progress report function.
6614 : * @param pProgressData progress function callback data.
6615 : * @param papszOptions NULL terminated list of options as
6616 : * key=value pairs, or NULL
6617 : * The XOFF, YOFF, XSIZE and YSIZE
6618 : * options can be specified to express that overviews should
6619 : * be regenerated only in the specified subset of the source
6620 : * dataset.
6621 : * @return CE_None on success or CE_Failure on failure.
6622 : * @since 3.10
6623 : */
6624 :
6625 19 : CPLErr GDALRegenerateOverviewsMultiBand(
6626 : const std::vector<GDALRasterBand *> &apoSrcBands,
6627 : const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
6628 : const char *pszResampling, GDALProgressFunc pfnProgress,
6629 : void *pProgressData, CSLConstList papszOptions)
6630 : {
6631 19 : CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
6632 29 : for (size_t i = 1; i < aapoOverviewBands.size(); ++i)
6633 : {
6634 10 : CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());
6635 : }
6636 :
6637 19 : if (aapoOverviewBands.empty())
6638 0 : return CE_None;
6639 :
6640 19 : std::vector<GDALRasterBand **> apapoOverviewBands;
6641 48 : for (auto &apoOverviewBands : aapoOverviewBands)
6642 : {
6643 : auto papoOverviewBands = static_cast<GDALRasterBand **>(
6644 29 : CPLMalloc(apoOverviewBands.size() * sizeof(GDALRasterBand *)));
6645 61 : for (size_t i = 0; i < apoOverviewBands.size(); ++i)
6646 : {
6647 32 : papoOverviewBands[i] = apoOverviewBands[i];
6648 : }
6649 29 : apapoOverviewBands.push_back(papoOverviewBands);
6650 : }
6651 38 : const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
6652 19 : static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
6653 19 : static_cast<int>(aapoOverviewBands[0].size()),
6654 19 : apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
6655 : papszOptions);
6656 48 : for (GDALRasterBand **papoOverviewBands : apapoOverviewBands)
6657 29 : CPLFree(papoOverviewBands);
6658 19 : return eErr;
6659 : }
6660 :
6661 : /************************************************************************/
6662 : /* GDALComputeBandStats() */
6663 : /************************************************************************/
6664 :
6665 : /** Undocumented
6666 : * @param hSrcBand undocumented.
6667 : * @param nSampleStep Step between scanlines used to compute statistics.
6668 : * When nSampleStep is equal to 1, all scanlines will
6669 : * be processed.
6670 : * @param pdfMean undocumented.
6671 : * @param pdfStdDev undocumented.
6672 : * @param pfnProgress undocumented.
6673 : * @param pProgressData undocumented.
6674 : * @return undocumented
6675 : */
6676 18 : CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
6677 : int nSampleStep, double *pdfMean,
6678 : double *pdfStdDev,
6679 : GDALProgressFunc pfnProgress,
6680 : void *pProgressData)
6681 :
6682 : {
6683 18 : VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
6684 :
6685 18 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
6686 :
6687 18 : if (pfnProgress == nullptr)
6688 18 : pfnProgress = GDALDummyProgress;
6689 :
6690 18 : const int nWidth = poSrcBand->GetXSize();
6691 18 : const int nHeight = poSrcBand->GetYSize();
6692 :
6693 18 : if (nSampleStep >= nHeight || nSampleStep < 1)
6694 5 : nSampleStep = 1;
6695 :
6696 18 : GDALDataType eWrkType = GDT_Unknown;
6697 18 : float *pafData = nullptr;
6698 18 : GDALDataType eType = poSrcBand->GetRasterDataType();
6699 18 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6700 18 : if (bComplex)
6701 : {
6702 : pafData = static_cast<float *>(
6703 0 : VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6704 0 : eWrkType = GDT_CFloat32;
6705 : }
6706 : else
6707 : {
6708 : pafData =
6709 18 : static_cast<float *>(VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6710 18 : eWrkType = GDT_Float32;
6711 : }
6712 :
6713 18 : if (nWidth == 0 || pafData == nullptr)
6714 : {
6715 0 : VSIFree(pafData);
6716 0 : return CE_Failure;
6717 : }
6718 :
6719 : /* -------------------------------------------------------------------- */
6720 : /* Loop over all sample lines. */
6721 : /* -------------------------------------------------------------------- */
6722 18 : double dfSum = 0.0;
6723 18 : double dfSum2 = 0.0;
6724 18 : int iLine = 0;
6725 18 : GIntBig nSamples = 0;
6726 :
6727 2143 : do
6728 : {
6729 2161 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6730 : pProgressData))
6731 : {
6732 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6733 0 : CPLFree(pafData);
6734 0 : return CE_Failure;
6735 : }
6736 :
6737 : const CPLErr eErr =
6738 2161 : poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
6739 : 1, eWrkType, 0, 0, nullptr);
6740 2161 : if (eErr != CE_None)
6741 : {
6742 1 : CPLFree(pafData);
6743 1 : return eErr;
6744 : }
6745 :
6746 725208 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6747 : {
6748 723048 : float fValue = 0.0f;
6749 :
6750 723048 : if (bComplex)
6751 : {
6752 : // Compute the magnitude of the complex value.
6753 : fValue =
6754 0 : std::hypot(pafData[static_cast<size_t>(iPixel) * 2],
6755 0 : pafData[static_cast<size_t>(iPixel) * 2 + 1]);
6756 : }
6757 : else
6758 : {
6759 723048 : fValue = pafData[iPixel];
6760 : }
6761 :
6762 723048 : dfSum += static_cast<double>(fValue);
6763 723048 : dfSum2 += static_cast<double>(fValue) * static_cast<double>(fValue);
6764 : }
6765 :
6766 2160 : nSamples += nWidth;
6767 2160 : iLine += nSampleStep;
6768 2160 : } while (iLine < nHeight);
6769 :
6770 17 : if (!pfnProgress(1.0, nullptr, pProgressData))
6771 : {
6772 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6773 0 : CPLFree(pafData);
6774 0 : return CE_Failure;
6775 : }
6776 :
6777 : /* -------------------------------------------------------------------- */
6778 : /* Produce the result values. */
6779 : /* -------------------------------------------------------------------- */
6780 17 : if (pdfMean != nullptr)
6781 17 : *pdfMean = dfSum / nSamples;
6782 :
6783 17 : if (pdfStdDev != nullptr)
6784 : {
6785 17 : const double dfMean = dfSum / nSamples;
6786 :
6787 17 : *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
6788 : }
6789 :
6790 17 : CPLFree(pafData);
6791 :
6792 17 : return CE_None;
6793 : }
6794 :
6795 : /************************************************************************/
6796 : /* GDALOverviewMagnitudeCorrection() */
6797 : /* */
6798 : /* Correct the mean and standard deviation of the overviews of */
6799 : /* the given band to match the base layer approximately. */
6800 : /************************************************************************/
6801 :
6802 : /** Undocumented
6803 : * @param hBaseBand undocumented.
6804 : * @param nOverviewCount undocumented.
6805 : * @param pahOverviews undocumented.
6806 : * @param pfnProgress undocumented.
6807 : * @param pProgressData undocumented.
6808 : * @return undocumented
6809 : */
6810 0 : CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
6811 : int nOverviewCount,
6812 : GDALRasterBandH *pahOverviews,
6813 : GDALProgressFunc pfnProgress,
6814 : void *pProgressData)
6815 :
6816 : {
6817 0 : VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
6818 :
6819 : /* -------------------------------------------------------------------- */
6820 : /* Compute mean/stddev for source raster. */
6821 : /* -------------------------------------------------------------------- */
6822 0 : double dfOrigMean = 0.0;
6823 0 : double dfOrigStdDev = 0.0;
6824 : {
6825 : const CPLErr eErr =
6826 0 : GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
6827 : pfnProgress, pProgressData);
6828 :
6829 0 : if (eErr != CE_None)
6830 0 : return eErr;
6831 : }
6832 :
6833 : /* -------------------------------------------------------------------- */
6834 : /* Loop on overview bands. */
6835 : /* -------------------------------------------------------------------- */
6836 0 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
6837 : {
6838 : GDALRasterBand *poOverview =
6839 0 : GDALRasterBand::FromHandle(pahOverviews[iOverview]);
6840 : double dfOverviewMean, dfOverviewStdDev;
6841 :
6842 : const CPLErr eErr =
6843 0 : GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
6844 : &dfOverviewStdDev, pfnProgress, pProgressData);
6845 :
6846 0 : if (eErr != CE_None)
6847 0 : return eErr;
6848 :
6849 0 : double dfGain = 1.0;
6850 0 : if (dfOrigStdDev >= 0.0001)
6851 0 : dfGain = dfOrigStdDev / dfOverviewStdDev;
6852 :
6853 : /* --------------------------------------------------------------------
6854 : */
6855 : /* Apply gain and offset. */
6856 : /* --------------------------------------------------------------------
6857 : */
6858 0 : const int nWidth = poOverview->GetXSize();
6859 0 : const int nHeight = poOverview->GetYSize();
6860 :
6861 0 : GDALDataType eWrkType = GDT_Unknown;
6862 0 : float *pafData = nullptr;
6863 0 : const GDALDataType eType = poOverview->GetRasterDataType();
6864 0 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6865 0 : if (bComplex)
6866 : {
6867 : pafData = static_cast<float *>(
6868 0 : VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6869 0 : eWrkType = GDT_CFloat32;
6870 : }
6871 : else
6872 : {
6873 : pafData = static_cast<float *>(
6874 0 : VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6875 0 : eWrkType = GDT_Float32;
6876 : }
6877 :
6878 0 : if (pafData == nullptr)
6879 : {
6880 0 : return CE_Failure;
6881 : }
6882 :
6883 0 : for (int iLine = 0; iLine < nHeight; ++iLine)
6884 : {
6885 0 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6886 : pProgressData))
6887 : {
6888 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6889 0 : CPLFree(pafData);
6890 0 : return CE_Failure;
6891 : }
6892 :
6893 0 : if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
6894 : nWidth, 1, eWrkType, 0, 0,
6895 0 : nullptr) != CE_None)
6896 : {
6897 0 : CPLFree(pafData);
6898 0 : return CE_Failure;
6899 : }
6900 :
6901 0 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6902 : {
6903 0 : if (bComplex)
6904 : {
6905 0 : pafData[static_cast<size_t>(iPixel) * 2] *=
6906 0 : static_cast<float>(dfGain);
6907 0 : pafData[static_cast<size_t>(iPixel) * 2 + 1] *=
6908 0 : static_cast<float>(dfGain);
6909 : }
6910 : else
6911 : {
6912 0 : pafData[iPixel] = static_cast<float>(
6913 0 : (double(pafData[iPixel]) - dfOverviewMean) * dfGain +
6914 : dfOrigMean);
6915 : }
6916 : }
6917 :
6918 0 : if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
6919 : nWidth, 1, eWrkType, 0, 0,
6920 0 : nullptr) != CE_None)
6921 : {
6922 0 : CPLFree(pafData);
6923 0 : return CE_Failure;
6924 : }
6925 : }
6926 :
6927 0 : if (!pfnProgress(1.0, nullptr, pProgressData))
6928 : {
6929 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6930 0 : CPLFree(pafData);
6931 0 : return CE_Failure;
6932 : }
6933 :
6934 0 : CPLFree(pafData);
6935 : }
6936 :
6937 0 : return CE_None;
6938 : }
|