Line data Source code
1 :
2 : /******************************************************************************
3 : *
4 : * Project: GDAL Core
5 : * Purpose: Helper code to implement overview support in different drivers.
6 : * Author: Frank Warmerdam, warmerdam@pobox.com
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 2000, Frank Warmerdam
10 : * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * SPDX-License-Identifier: MIT
13 : ****************************************************************************/
14 :
15 : #include "cpl_port.h"
16 : #include "gdal_priv.h"
17 :
18 : #include <cmath>
19 : #include <cstddef>
20 : #include <cstdlib>
21 :
22 : #include <algorithm>
23 : #include <complex>
24 : #include <condition_variable>
25 : #include <limits>
26 : #include <list>
27 : #include <memory>
28 : #include <mutex>
29 : #include <vector>
30 :
31 : #include "cpl_conv.h"
32 : #include "cpl_error.h"
33 : #include "cpl_float.h"
34 : #include "cpl_progress.h"
35 : #include "cpl_vsi.h"
36 : #include "cpl_worker_thread_pool.h"
37 : #include "gdal.h"
38 : #include "gdal_thread_pool.h"
39 : #include "gdalwarper.h"
40 : #include "gdal_vrt.h"
41 : #include "vrtdataset.h"
42 :
43 : #ifdef USE_NEON_OPTIMIZATIONS
44 : #include "include_sse2neon.h"
45 :
46 : #if (!defined(__aarch64__) && !defined(_M_ARM64))
47 : #define ARM_V7
48 : #endif
49 :
50 : #define USE_SSE2
51 :
52 : #include "gdalsse_priv.h"
53 :
54 : // Restrict to 64bit processors because they are guaranteed to have SSE2,
55 : // or if __AVX2__ is defined.
56 : #elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
57 : #define USE_SSE2
58 :
59 : #include "gdalsse_priv.h"
60 :
61 : #ifdef __SSE3__
62 : #include <pmmintrin.h>
63 : #endif
64 : #ifdef __SSSE3__
65 : #include <tmmintrin.h>
66 : #endif
67 : #ifdef __SSE4_1__
68 : #include <smmintrin.h>
69 : #endif
70 : #ifdef __AVX2__
71 : #include <immintrin.h>
72 : #endif
73 :
74 : #endif
75 :
76 : // To be included after above USE_SSE2 and include gdalsse_priv.h
77 : // to avoid build issue on Windows x86
78 : #include "gdal_priv_templates.hpp"
79 :
80 : /************************************************************************/
81 : /* GDALResampleChunk_Near() */
82 : /************************************************************************/
83 :
84 : template <class T>
85 1251 : static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
86 : const T *pChunk, T **ppDstBuffer)
87 :
88 : {
89 1251 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
90 1251 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
91 1251 : const GDALDataType eWrkDataType = args.eWrkDataType;
92 1251 : const int nChunkXOff = args.nChunkXOff;
93 1251 : const int nChunkXSize = args.nChunkXSize;
94 1251 : const int nChunkYOff = args.nChunkYOff;
95 1251 : const int nDstXOff = args.nDstXOff;
96 1251 : const int nDstXOff2 = args.nDstXOff2;
97 1251 : const int nDstYOff = args.nDstYOff;
98 1251 : const int nDstYOff2 = args.nDstYOff2;
99 1251 : const int nDstXWidth = nDstXOff2 - nDstXOff;
100 :
101 : /* -------------------------------------------------------------------- */
102 : /* Allocate buffers. */
103 : /* -------------------------------------------------------------------- */
104 1251 : *ppDstBuffer = static_cast<T *>(
105 1251 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
106 : GDALGetDataTypeSizeBytes(eWrkDataType)));
107 1251 : if (*ppDstBuffer == nullptr)
108 : {
109 0 : return CE_Failure;
110 : }
111 1251 : T *const pDstBuffer = *ppDstBuffer;
112 :
113 : int *panSrcXOff =
114 1251 : static_cast<int *>(VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(int)));
115 :
116 1251 : if (panSrcXOff == nullptr)
117 : {
118 0 : return CE_Failure;
119 : }
120 :
121 : /* ==================================================================== */
122 : /* Precompute inner loop constants. */
123 : /* ==================================================================== */
124 840888 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
125 : {
126 839637 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
127 839637 : if (nSrcXOff < nChunkXOff)
128 0 : nSrcXOff = nChunkXOff;
129 :
130 839637 : panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
131 : }
132 :
133 : /* ==================================================================== */
134 : /* Loop over destination scanlines. */
135 : /* ==================================================================== */
136 142463 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
137 : {
138 141212 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
139 141212 : if (nSrcYOff < nChunkYOff)
140 0 : nSrcYOff = nChunkYOff;
141 :
142 141212 : const T *const pSrcScanline =
143 : pChunk +
144 141212 : (static_cast<size_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
145 137819 : nChunkXOff;
146 :
147 : /* --------------------------------------------------------------------
148 : */
149 : /* Loop over destination pixels */
150 : /* --------------------------------------------------------------------
151 : */
152 141212 : T *pDstScanline =
153 141212 : pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
154 120252393 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
155 : {
156 120111000 : pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
157 : }
158 : }
159 :
160 1251 : CPLFree(panSrcXOff);
161 :
162 1251 : return CE_None;
163 : }
164 :
165 1251 : static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
166 : const void *pChunk, void **ppDstBuffer,
167 : GDALDataType *peDstBufferDataType)
168 : {
169 1251 : *peDstBufferDataType = args.eWrkDataType;
170 1251 : switch (args.eWrkDataType)
171 : {
172 : // For nearest resampling, as no computation is done, only the
173 : // size of the data type matters.
174 1083 : case GDT_UInt8:
175 : case GDT_Int8:
176 : {
177 1083 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 1);
178 1083 : return GDALResampleChunk_NearT(
179 : args, static_cast<const uint8_t *>(pChunk),
180 1083 : reinterpret_cast<uint8_t **>(ppDstBuffer));
181 : }
182 :
183 52 : case GDT_Int16:
184 : case GDT_UInt16:
185 : case GDT_Float16:
186 : {
187 52 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
188 52 : return GDALResampleChunk_NearT(
189 : args, static_cast<const uint16_t *>(pChunk),
190 52 : reinterpret_cast<uint16_t **>(ppDstBuffer));
191 : }
192 :
193 68 : case GDT_CInt16:
194 : case GDT_CFloat16:
195 : case GDT_Int32:
196 : case GDT_UInt32:
197 : case GDT_Float32:
198 : {
199 68 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
200 68 : return GDALResampleChunk_NearT(
201 : args, static_cast<const uint32_t *>(pChunk),
202 68 : reinterpret_cast<uint32_t **>(ppDstBuffer));
203 : }
204 :
205 44 : case GDT_CInt32:
206 : case GDT_CFloat32:
207 : case GDT_Int64:
208 : case GDT_UInt64:
209 : case GDT_Float64:
210 : {
211 44 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
212 44 : return GDALResampleChunk_NearT(
213 : args, static_cast<const uint64_t *>(pChunk),
214 44 : reinterpret_cast<uint64_t **>(ppDstBuffer));
215 : }
216 :
217 4 : case GDT_CFloat64:
218 : {
219 4 : return GDALResampleChunk_NearT(
220 : args, static_cast<const std::complex<double> *>(pChunk),
221 4 : reinterpret_cast<std::complex<double> **>(ppDstBuffer));
222 : }
223 :
224 0 : case GDT_Unknown:
225 : case GDT_TypeCount:
226 0 : break;
227 : }
228 0 : CPLAssert(false);
229 : return CE_Failure;
230 : }
231 :
232 : namespace
233 : {
234 :
235 : // Find in the color table the entry whose RGB value is the closest
236 : // (using quadratic distance) to the test color, ignoring transparent entries.
237 3837 : int BestColorEntry(const std::vector<GDALColorEntry> &entries,
238 : const GDALColorEntry &test)
239 : {
240 3837 : int nMinDist = std::numeric_limits<int>::max();
241 3837 : size_t bestEntry = 0;
242 986109 : for (size_t i = 0; i < entries.size(); ++i)
243 : {
244 982272 : const GDALColorEntry &entry = entries[i];
245 : // Ignore transparent entries
246 982272 : if (entry.c4 == 0)
247 3237 : continue;
248 :
249 979035 : int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
250 979035 : ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
251 979035 : ((test.c3 - entry.c3) * (test.c3 - entry.c3));
252 979035 : if (nDist < nMinDist)
253 : {
254 15847 : nMinDist = nDist;
255 15847 : bestEntry = i;
256 : }
257 : }
258 3837 : return static_cast<int>(bestEntry);
259 : }
260 :
261 7 : std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
262 : int &transparentIdx)
263 : {
264 7 : std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
265 :
266 7 : transparentIdx = -1;
267 7 : int i = 0;
268 1799 : for (auto &entry : entries)
269 : {
270 1792 : table.GetColorEntryAsRGB(i, &entry);
271 1792 : if (transparentIdx < 0 && entry.c4 == 0)
272 1 : transparentIdx = i;
273 1792 : ++i;
274 : }
275 7 : return entries;
276 : }
277 :
278 : } // unnamed namespace
279 :
280 : /************************************************************************/
281 : /* SQUARE() */
282 : /************************************************************************/
283 :
// Return val * val, the left operand being first converted to Tsquare so
// that the product can be evaluated in a wider type than T.
template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
{
    const Tsquare widened = static_cast<Tsquare>(val);
    return widened * val;
}
288 :
289 : /************************************************************************/
290 : /* ComputeIntegerRMS() */
291 : /************************************************************************/
292 : // Compute rms = sqrt(sumSquares / weight) in such a way that it is the
293 : // integer that minimizes abs(rms**2 - sumSquares / weight)
// Return the integer r that minimizes abs(r * r - sumSquares / weight).
// The truncated square root is taken first, then bumped by one when
// (r + 1)^2 is closer to the mean square than r^2, i.e. when
// 2 * r * (r + 1) + 1 < 2 * sumSquares / weight.
// Twork is the type used to evaluate 2 * r * (r + 1) + 1.
template <class T, class Twork>
inline T ComputeIntegerRMS(double sumSquares, double weight)
{
    const double meanSquare = sumSquares / weight;
    const T truncatedRoot = static_cast<T>(std::sqrt(meanSquare));

    const auto midpointTimesTwo =
        static_cast<Twork>(2) * truncatedRoot * (truncatedRoot + 1) + 1;
    const bool bRoundUp =
        static_cast<double>(midpointTimesTwo) < 2 * meanSquare;

    return bRoundUp ? static_cast<T>(truncatedRoot + 1) : truncatedRoot;
}
308 :
309 : template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
310 : {
311 : CPLAssert(false);
312 : return 0;
313 : }
314 :
315 28 : template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
316 : {
317 : // It has been verified that given the correction on rms below, using
318 : // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
319 : // is equivalent, so use the former as it is used twice.
320 28 : const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
321 28 : const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
322 28 : GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
323 :
324 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
325 : // Naive version:
326 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
327 : // Optimized version for integer case and weight == 4
328 28 : if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
329 5 : rms += 1;
330 28 : return rms;
331 : }
332 :
333 : template <>
334 24 : inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
335 : {
336 24 : const double sumDivWeight = sumSquares * 0.25;
337 24 : GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
338 :
339 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
340 : // Naive version:
341 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
342 : // Optimized version for integer case and weight == 4
343 24 : if (static_cast<GUInt32>(rms) * (rms + 1) <
344 24 : static_cast<GUInt32>(sumDivWeight + 0.25))
345 4 : rms += 1;
346 24 : return rms;
347 : }
348 :
349 : #ifdef USE_SSE2
350 :
351 : /************************************************************************/
352 : /* QuadraticMeanByteSSE2OrAVX2() */
353 : /************************************************************************/
354 :
355 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
356 : #define sse2_packus_epi32 _mm_packus_epi32
357 : #else
358 516139 : inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
359 : {
360 516139 : const auto minus32768_32 = _mm_set1_epi32(-32768);
361 516139 : const auto minus32768_16 = _mm_set1_epi16(-32768);
362 516139 : a = _mm_add_epi32(a, minus32768_32);
363 516139 : b = _mm_add_epi32(b, minus32768_32);
364 516139 : a = _mm_packs_epi32(a, b);
365 516139 : a = _mm_sub_epi16(a, minus32768_16);
366 516139 : return a;
367 : }
368 : #endif
369 :
370 : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
371 : #define sse2_hadd_epi16 _mm_hadd_epi16
372 : #else
373 5064270 : inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
374 : {
375 : // Horizontal addition of adjacent pairs
376 5064270 : const auto mask = _mm_set1_epi32(0xFFFF);
377 : const auto horizLo =
378 15192800 : _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
379 : const auto horizHi =
380 15192800 : _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
381 :
382 : // Recombine low and high parts
383 5064270 : return _mm_packs_epi32(horizLo, horizHi);
384 : }
385 : #endif
386 :
// Register-width agnostic aliases for the integer/float intrinsics used by
// the Byte kernels below. When __AVX2__ is defined they map to the 256-bit
// _mm256_* intrinsics, otherwise to the 128-bit _mm_* ones (or to the sse2_*
// fallbacks defined above), so that the same kernel source can be compiled
// for both register widths.
#ifdef __AVX2__

// 256-bit (AVX2) flavour
#define set1_epi16 _mm256_set1_epi16
#define set1_epi32 _mm256_set1_epi32
#define setzero _mm256_setzero_si256
#define set1_ps _mm256_set1_ps
#define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
#define unpacklo_epi8 _mm256_unpacklo_epi8
#define unpackhi_epi8 _mm256_unpackhi_epi8
#define madd_epi16 _mm256_madd_epi16
#define add_epi32 _mm256_add_epi32
#define mul_ps _mm256_mul_ps
#define cvtepi32_ps _mm256_cvtepi32_ps
#define sqrt_ps _mm256_sqrt_ps
#define cvttps_epi32 _mm256_cvttps_epi32
#define packs_epi32 _mm256_packs_epi32
#define packus_epi32 _mm256_packus_epi32
#define srli_epi32 _mm256_srli_epi32
#define mullo_epi16 _mm256_mullo_epi16
#define srli_epi16 _mm256_srli_epi16
#define cmpgt_epi16 _mm256_cmpgt_epi16
#define add_epi16 _mm256_add_epi16
#define sub_epi16 _mm256_sub_epi16
#define packus_epi16 _mm256_packus_epi16

/* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
/* to get the lower 128-bit bits of what would be a true 256-bit vector register
 */

// Reorder the four 64-bit elements of x from (0, 1, 2, 3) to (0, 2, 1, 3).
inline __m256i FIXUP_LANES(__m256i x)
{
    return _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0));
}

// store_lo(): store the lower 128 bits of the lane-fixed value y at x.
#define store_lo(x, y)                                                         \
    _mm_storeu_si128(reinterpret_cast<__m128i *>(x),                           \
                     _mm256_extracti128_si256(FIXUP_LANES(y), 0))
// storeu_int(): store the whole lane-fixed value y at x (unaligned store).
#define storeu_int(x, y)                                                       \
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(x), FIXUP_LANES(y))
#define hadd_epi16 _mm256_hadd_epi16
#else
// 128-bit (SSE2, or NEON through sse2neon) flavour
#define set1_epi16 _mm_set1_epi16
#define set1_epi32 _mm_set1_epi32
#define setzero _mm_setzero_si128
#define set1_ps _mm_set1_ps
#define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
#define unpacklo_epi8 _mm_unpacklo_epi8
#define unpackhi_epi8 _mm_unpackhi_epi8
#define madd_epi16 _mm_madd_epi16
#define add_epi32 _mm_add_epi32
#define mul_ps _mm_mul_ps
#define cvtepi32_ps _mm_cvtepi32_ps
#define sqrt_ps _mm_sqrt_ps
#define cvttps_epi32 _mm_cvttps_epi32
#define packs_epi32 _mm_packs_epi32
#define packus_epi32 sse2_packus_epi32
#define srli_epi32 _mm_srli_epi32
#define mullo_epi16 _mm_mullo_epi16
#define srli_epi16 _mm_srli_epi16
#define cmpgt_epi16 _mm_cmpgt_epi16
#define add_epi16 _mm_add_epi16
#define sub_epi16 _mm_sub_epi16
#define packus_epi16 _mm_packus_epi16
// store_lo(): store the lower 64 bits of y at x.
#define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
// storeu_int(): store the whole 128-bit value y at x (unaligned store).
#define storeu_int(x, y) _mm_storeu_si128(reinterpret_cast<__m128i *>(x), (y))
#define hadd_epi16 sse2_hadd_epi16
#endif
454 :
/**
 * SIMD computation of the quadratic mean (RMS) of 2x2 blocks of bytes.
 *
 * Each iteration reads 2 * DEST_ELTS consecutive bytes from the current
 * source line and from the following one (nChunkXSize elements further) and
 * writes DEST_ELTS output pixels. Only complete groups of DEST_ELTS output
 * pixels are processed; the remainder is left to the caller.
 *
 * NOTE(review): the loads and stores assume T is a 1-byte type - confirm
 * against the instantiations.
 *
 * @param nDstXWidth               Number of output pixels of the line.
 * @param nChunkXSize              Distance, in elements, between the two
 *                                 source lines.
 * @param pSrcScanlineShiftedInOut In: first source pixel to read. Out: first
 *                                 source pixel not consumed.
 * @param pDstScanline             Output line.
 * @return Number of output pixels written.
 */
template <class T>
static int
#if defined(__GNUC__)
    __attribute__((noinline))
#endif
    QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
                                const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
                                T *CPL_RESTRICT pDstScanline)
{
    // Optimized implementation for RMS on Byte by
    // processing by group of 8 output pixels, so as to use
    // a single _mm_sqrt_ps() call for 4 output pixels
    // (with AVX2 registers those figures are doubled)
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;

    int iDstPixel = 0;
    const auto one16 = set1_epi16(1);
    const auto one32 = set1_epi32(1);
    const auto zero = setzero();
    const auto minus32768 = set1_epi16(-32768);

    // Number of output pixels per iteration: half the register size in bytes
    constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
    {
        // Load 2 * DEST_ELTS bytes from each line
        auto firstLine = loadu_int(pSrcScanlineShifted);
        auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
        // Extend those Bytes as UInt16s
        auto firstLineLo = unpacklo_epi8(firstLine, zero);
        auto firstLineHi = unpackhi_epi8(firstLine, zero);
        auto secondLineLo = unpacklo_epi8(secondLine, zero);
        auto secondLineHi = unpackhi_epi8(secondLine, zero);

        // Multiplication of 16 bit values and horizontal
        // addition of 32 bit results
        // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
        firstLineLo = madd_epi16(firstLineLo, firstLineLo);
        firstLineHi = madd_epi16(firstLineHi, firstLineHi);
        secondLineLo = madd_epi16(secondLineLo, secondLineLo);
        secondLineHi = madd_epi16(secondLineHi, secondLineHi);

        // Vertical addition
        const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
        const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);

        // (sumSquares + 1) / 4, computed with a logical right shift
        const auto sumSquaresPlusOneDiv4Lo =
            srli_epi32(add_epi32(sumSquaresLo, one32), 2);
        const auto sumSquaresPlusOneDiv4Hi =
            srli_epi32(add_epi32(sumSquaresHi, one32), 2);

        // Take square root and truncate/floor to int32
        const auto rmsLo =
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
        const auto rmsHi =
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));

        // Merge back low and high registers with each RMS value
        // as a 16 bit value.
        auto rms = packs_epi32(rmsLo, rmsHi);

        // Round to upper value if it minimizes the
        // error |rms^2 - sumSquares/4|
        // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
        //    rms += 1;
        // which is equivalent to:
        // if( rms * (rms + 1) < (sumSquares+1) / 4 )
        //    rms += 1;
        // And both left and right parts fit on 16 (unsigned) bits
        const auto sumSquaresPlusOneDiv4 =
            packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
        // cmpgt_epi16 operates on signed int16, but here
        // we have unsigned values, so shift them by -32768 before
        const auto mask = cmpgt_epi16(
            add_epi16(sumSquaresPlusOneDiv4, minus32768),
            add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
        // The value of the mask will be -1 when the correction needs to be
        // applied
        rms = sub_epi16(rms, mask);

        // Pack each 16 bit RMS value to 8 bits
        rms = packus_epi16(rms, rms /* could be anything */);
        store_lo(&pDstScanline[iDstPixel], rms);
        // Two source pixels of each line are consumed per output pixel
        pSrcScanlineShifted += 2 * DEST_ELTS;
    }

    // Report to the caller where the non-SIMD processing has to resume
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
    return iDstPixel;
}
542 :
543 : /************************************************************************/
544 : /* AverageByteSSE2OrAVX2() */
545 : /************************************************************************/
546 :
547 : static int
548 123976 : AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
549 : const GByte *&CPL_RESTRICT pSrcScanlineShiftedInOut,
550 : GByte *CPL_RESTRICT pDstScanline)
551 : {
552 : // Optimized implementation for average on Byte by
553 : // processing by group of 16 output pixels for SSE2, or 32 for AVX2
554 :
555 123976 : const auto zero = setzero();
556 123976 : const auto two16 = set1_epi16(2);
557 123976 : const GByte *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
558 :
559 123976 : constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;
560 123976 : int iDstPixel = 0;
561 2656110 : for (; iDstPixel < nDstXWidth - (2 * DEST_ELTS - 1);
562 2532130 : iDstPixel += 2 * DEST_ELTS)
563 : {
564 : decltype(setzero()) average0;
565 : {
566 : // Load 2 * DEST_ELTS bytes from each line
567 2532130 : const auto firstLine = loadu_int(pSrcScanlineShifted);
568 : const auto secondLine =
569 5064270 : loadu_int(pSrcScanlineShifted + nChunkXSize);
570 : // Extend those Bytes as UInt16s
571 2532130 : const auto firstLineLo = unpacklo_epi8(firstLine, zero);
572 2532130 : const auto firstLineHi = unpackhi_epi8(firstLine, zero);
573 2532130 : const auto secondLineLo = unpacklo_epi8(secondLine, zero);
574 2532130 : const auto secondLineHi = unpackhi_epi8(secondLine, zero);
575 :
576 : // Vertical addition
577 2532130 : const auto sumLo = add_epi16(firstLineLo, secondLineLo);
578 2532130 : const auto sumHi = add_epi16(firstLineHi, secondLineHi);
579 :
580 : // Horizontal addition of adjacent pairs, and recombine low and high
581 : // parts
582 2532130 : const auto sum = hadd_epi16(sumLo, sumHi);
583 :
584 : // average = (sum + 2) / 4
585 2532130 : average0 = srli_epi16(add_epi16(sum, two16), 2);
586 :
587 2532130 : pSrcScanlineShifted += 2 * DEST_ELTS;
588 : }
589 :
590 : decltype(setzero()) average1;
591 : {
592 : // Load 2 * DEST_ELTS bytes from each line
593 2532130 : const auto firstLine = loadu_int(pSrcScanlineShifted);
594 : const auto secondLine =
595 5064270 : loadu_int(pSrcScanlineShifted + nChunkXSize);
596 : // Extend those Bytes as UInt16s
597 2532130 : const auto firstLineLo = unpacklo_epi8(firstLine, zero);
598 2532130 : const auto firstLineHi = unpackhi_epi8(firstLine, zero);
599 2532130 : const auto secondLineLo = unpacklo_epi8(secondLine, zero);
600 2532130 : const auto secondLineHi = unpackhi_epi8(secondLine, zero);
601 :
602 : // Vertical addition
603 2532130 : const auto sumLo = add_epi16(firstLineLo, secondLineLo);
604 2532130 : const auto sumHi = add_epi16(firstLineHi, secondLineHi);
605 :
606 : // Horizontal addition of adjacent pairs, and recombine low and high
607 : // parts
608 2532130 : const auto sum = hadd_epi16(sumLo, sumHi);
609 :
610 : // average = (sum + 2) / 4
611 2532130 : average1 = srli_epi16(add_epi16(sum, two16), 2);
612 :
613 2532130 : pSrcScanlineShifted += 2 * DEST_ELTS;
614 : }
615 :
616 : // Pack each 16 bit average value to 8 bits
617 2532130 : const auto average = packus_epi16(average0, average1);
618 2532130 : storeu_int(&pDstScanline[iDstPixel], average);
619 : }
620 :
621 123976 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
622 123976 : return iDstPixel;
623 : }
624 :
625 : /************************************************************************/
626 : /* QuadraticMeanUInt16SSE2() */
627 : /************************************************************************/
628 :
629 : #ifdef __SSE3__
630 : #define sse2_hadd_pd _mm_hadd_pd
631 : #else
632 185 : inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
633 : {
634 : auto aLo_bLo =
635 740 : _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
636 : auto aHi_bHi =
637 740 : _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
638 185 : return _mm_add_pd(aLo_bLo, aHi_bHi); // (aLo + aHi, bLo + bHi)
639 : }
640 : #endif
641 :
642 120 : inline __m128d SQUARE_PD(__m128d x)
643 : {
644 120 : return _mm_mul_pd(x, x);
645 : }
646 :
#ifdef __AVX2__

// Square each of the four doubles of x.
inline __m256d SQUARE_PD(__m256d x)
{
    const __m256d squared = _mm256_mul_pd(x, x);
    return squared;
}

// Reorder the four doubles of x from (0, 1, 2, 3) to (0, 2, 1, 3).
inline __m256d FIXUP_LANES(__m256d x)
{
    const __m256d reordered =
        _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
    return reordered;
}

// Same reordering as the __m256d overload, applied to 64-bit groups of a
// register of floats.
inline __m256 FIXUP_LANES(__m256 x)
{
    const __m256d asDoubles = _mm256_castps_pd(x);
    return _mm256_castpd_ps(FIXUP_LANES(asDoubles));
}

#endif
665 :
/**
 * SIMD computation of the quadratic mean (RMS) of 2x2 blocks of UInt16.
 *
 * Each iteration reads 8 consecutive values from the current source line and
 * from the following one (nChunkXSize elements further) and writes 4 output
 * pixels. Two code paths exist per iteration: an integer/float one when all
 * 16 loaded values are below 2^14, and a double precision one otherwise.
 * Only complete groups of 4 output pixels are processed; the remainder is
 * left to the caller.
 *
 * @param nDstXWidth               Number of output pixels of the line.
 * @param nChunkXSize              Distance, in elements, between the two
 *                                 source lines.
 * @param pSrcScanlineShiftedInOut In: first source pixel to read. Out: first
 *                                 source pixel not consumed.
 * @param pDstScanline             Output line.
 * @return Number of output pixels written.
 */
static int
QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
                        const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
                        uint16_t *CPL_RESTRICT pDstScanline)
{
    // Optimized implementation for RMS on UInt16 by
    // processing by group of 4 output pixels.
    const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;

    int iDstPixel = 0;
    const auto zero = _mm_setzero_si128();

#ifdef __AVX2__
    const auto zeroDot25 = _mm256_set1_pd(0.25);
    const auto zeroDot5 = _mm256_set1_pd(0.5);

    // The first four 0's could be anything, as we only take the bottom
    // 128 bits.
    const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
#else
    const auto zeroDot25 = _mm_set1_pd(0.25);
    const auto zeroDot5 = _mm_set1_pd(0.5);
#endif

    // Number of output pixels per iteration: half the number of UInt16
    // values held by a 128-bit register, i.e. 4
    constexpr int DEST_ELTS =
        static_cast<int>(sizeof(zero) / sizeof(uint16_t)) / 2;
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
    {
        // Load 8 UInt16 from each line
        const auto firstLine = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
        const auto secondLine =
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(
                pSrcScanlineShifted + nChunkXSize));

        // Detect if all of the source values fit in 14 bits.
        // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
        // and we can do a much faster implementation.
        const auto maskTmp =
            _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
        // Gather the 8 per-value flags in a 64-bit scalar. On 32-bit x86 the
        // value is stored to memory instead of using _mm_cvtsi128_si64().
#if defined(__i386__) || defined(_M_IX86)
        uint64_t nMaskFitsIn14Bits = 0;
        _mm_storel_epi64(
            reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
            _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
#else
        const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
            _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
#endif
        if (nMaskFitsIn14Bits == 0)
        {
            // Fast path: all values are below 2^14

            // Multiplication of 16 bit values and horizontal
            // addition of 32 bit results
            const auto firstLineHSumSquare =
                _mm_madd_epi16(firstLine, firstLine);
            const auto secondLineHSumSquare =
                _mm_madd_epi16(secondLine, secondLine);
            // Vertical addition
            const auto sumSquares =
                _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
            // In theory we should take sqrt(sumSquares * 0.25f)
            // but given the rounding we do, this is equivalent to
            // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
            // sumSquares <= 4 * 16383^2
            const auto one32 = _mm_set1_epi32(1);
            const auto sumSquaresPlusOneDiv4 =
                _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
            // Take square root and truncate/floor to int32
            auto rms = _mm_cvttps_epi32(
                _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));

            // Round to upper value if it minimizes the
            // error |rms^2 - sumSquares/4|
            // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
            //    rms += 1;
            // which is equivalent to:
            // if( rms * rms + rms < (sumSquares+1) / 4 )
            //    rms += 1;
            // Each 32-bit rms value is below 2^14 here, so its upper 16 bits
            // are zero and _mm_madd_epi16(rms, rms) yields rms * rms
            auto mask =
                _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
                                _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
            // The mask is -1 where the correction applies, hence the
            // subtraction
            rms = _mm_sub_epi32(rms, mask);
            // Pack each 32 bit RMS value to 16 bits
            rms = _mm_packs_epi32(rms, rms /* could be anything */);
            _mm_storel_epi64(
                reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
            pSrcScanlineShifted += 2 * DEST_ELTS;
            continue;
        }

        // General path: at least one value is >= 2^14, so the squares are
        // accumulated in double precision

        // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
        // to 32 bit would result in 4 multiplications instead of 8, but
        // mullo/mulhi have a worse throughput than mul_pd.

        // Extend those UInt16s as UInt32s
        const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
        const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
        const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
        const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);

#ifdef __AVX2__
        // Multiplication of 32 bit values previously converted to 64 bit double
        const auto firstLineLoDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineLo));
        const auto firstLineHiDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineHi));
        const auto secondLineLoDbl =
            SQUARE_PD(_mm256_cvtepi32_pd(secondLineLo));
        const auto secondLineHiDbl =
            SQUARE_PD(_mm256_cvtepi32_pd(secondLineHi));

        // Vertical addition of squares
        const auto sumSquaresLo =
            _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
        const auto sumSquaresHi =
            _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);

        // Horizontal addition of squares
        const auto sumSquares =
            FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));

        const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);

        // Take square root and truncate/floor to int32
        auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
        // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
        // if( 0.5 < sumDivWeight - (rms * rms + rms) )
        //     rms += 1;
        const auto rmsDouble = _mm256_cvtepi32_pd(rms);
        const auto right = _mm256_sub_pd(
            sumDivWeight, _mm256_add_pd(SQUARE_PD(rmsDouble), rmsDouble));

        auto mask =
            _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
        // Extract 32-bit from each of the 4 64-bit masks
        // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
        // _MM_SHUFFLE(2,0,2,0)));
        mask = _mm256_permutevar8x32_ps(mask, permutation);
        const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));

        // Apply the correction
        rms = _mm_sub_epi32(rms, maskI);

        // Pack each 32 bit RMS value to 16 bits
        rms = _mm_packus_epi32(rms, rms /* could be anything */);
#else
        // Multiplication of 32 bit values previously converted to 64 bit double
        // (_mm_cvtepi32_pd() converts the 2 lower 32 bit values, so the
        // register is shifted by 8 bytes to reach the 2 upper ones)
        const auto firstLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineLo));
        const auto firstLineLoHi =
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
        const auto firstLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineHi));
        const auto firstLineHiHi =
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));

        const auto secondLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineLo));
        const auto secondLineLoHi =
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
        const auto secondLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineHi));
        const auto secondLineHiHi =
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));

        // Vertical addition of squares
        const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
        const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
        const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
        const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);

        // Horizontal addition of squares
        const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
        const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);

        const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
        const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
        // Take square root and truncate/floor to int32
        const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
        const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));

        // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
        // if( 0.5 < sumDivWeight - (rms * rms + rms) )
        //     rms += 1;
        const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
        const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
        const auto rightLo = _mm_sub_pd(
            sumDivWeightLo, _mm_add_pd(SQUARE_PD(rmsLoDouble), rmsLoDouble));
        const auto rightHi = _mm_sub_pd(
            sumDivWeightHi, _mm_add_pd(SQUARE_PD(rmsHiDouble), rmsHiDouble));

        const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
        const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
        // The value of the mask will be -1 when the correction needs to be
        // applied
        // (one 32 bit word is kept out of each of the four 64 bit masks)
        const auto mask = _mm_castps_si128(_mm_shuffle_ps(
            maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));

        // Gather the 2 values of rmsLo and the 2 values of rmsHi in a
        // single register
        auto rms = _mm_castps_si128(
            _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
        // Apply the correction
        rms = _mm_sub_epi32(rms, mask);

        // Pack each 32 bit RMS value to 16 bits
        rms = sse2_packus_epi32(rms, rms /* could be anything */);
#endif

        _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
                         rms);
        // Two source pixels of each line are consumed per output pixel
        pSrcScanlineShifted += 2 * DEST_ELTS;
    }

    // Report to the caller where the non-SIMD processing has to resume
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
    return iDstPixel;
}
872 :
873 : /************************************************************************/
874 : /* AverageUInt16SSE2() */
875 : /************************************************************************/
876 :
// 2x2 box average of UInt16 samples using 128-bit integer SIMD.
//
// Each output pixel is the rounded mean of two horizontally adjacent samples
// taken from the line at pSrcScanlineShiftedInOut and from the line
// nChunkXSize elements further.
//
// nDstXWidth: number of destination pixels wanted for this scanline.
// nChunkXSize: element offset between the two source lines that are combined.
// pSrcScanlineShiftedInOut: in: first source sample to read; out: advanced by
//     2 source samples per destination pixel actually produced.
// pDstScanline: destination scanline, written from index 0.
//
// Returns the number of destination pixels written (a multiple of 8). Only
// whole groups of 8 are processed here; the visible caller finishes the
// remaining pixels with a scalar loop starting at the returned index.
877 : static int
878 13 : AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
879 : const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
880 : uint16_t *CPL_RESTRICT pDstScanline)
881 : {
882 : // Optimized implementation for average on UInt16 by
883 : // processing by group of 8 output pixels.
884 :
// mask keeps the low 16 bits of every 32-bit lane, i.e. the first sample of
// each horizontal pair; the second sample is obtained with a 16-bit right
// shift of the same lane.
885 13 : const auto mask = _mm_set1_epi32(0xFFFF);
// Rounding term of the (sum + 2) >> 2 average.
886 13 : const auto two = _mm_set1_epi32(2);
887 13 : const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
888 :
889 13 : int iDstPixel = 0;
// Number of UInt16 values in one 128-bit register (8).
890 13 : constexpr int DEST_ELTS = static_cast<int>(sizeof(mask) / sizeof(uint16_t));
// Iterate only while a full group of DEST_ELTS destination pixels remains.
891 25 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
892 : {
893 : __m128i averageLow;
894 : // Load the first 8 UInt16 of each line (source of 4 output pixels)
895 : {
896 12 : const auto firstLine = _mm_loadu_si128(
897 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
898 : const auto secondLine =
899 12 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
900 12 : pSrcScanlineShifted + nChunkXSize));
901 :
902 : // Horizontal addition of each pair of samples, widened to 32 bit
903 36 : const auto horizAddFirstLine = _mm_add_epi32(
904 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
905 : const auto horizAddSecondLine =
906 36 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
907 : _mm_srli_epi32(secondLine, 16));
908 :
909 : // Vertical addition and average computation
910 : // average = (sum + 2) >> 2 (4 * 65535 + 2 fits in a 32-bit lane)
911 24 : const auto sum = _mm_add_epi32(
912 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
913 12 : averageLow = _mm_srli_epi32(sum, 2);
914 : }
915 : // Load the next 8 UInt16 of each line (source of 4 more output pixels)
916 : __m128i averageHigh;
917 : {
918 : const auto firstLine =
919 12 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
920 12 : pSrcScanlineShifted + DEST_ELTS));
921 : const auto secondLine =
922 12 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
923 12 : pSrcScanlineShifted + DEST_ELTS + nChunkXSize));
924 :
925 : // Horizontal addition of each pair of samples, widened to 32 bit
926 36 : const auto horizAddFirstLine = _mm_add_epi32(
927 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
928 : const auto horizAddSecondLine =
929 36 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
930 : _mm_srli_epi32(secondLine, 16));
931 :
932 : // Vertical addition and average computation
933 : // average = (sum + 2) >> 2
934 24 : const auto sum = _mm_add_epi32(
935 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
936 12 : averageHigh = _mm_srli_epi32(sum, 2);
937 : }
938 :
939 : // Pack the 4 + 4 32-bit averages into 8 16-bit values
940 12 : auto average = sse2_packus_epi32(averageLow, averageHigh);
941 12 : _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
942 : average);
// Two source samples were consumed per destination pixel.
943 12 : pSrcScanlineShifted += 2 * DEST_ELTS;
944 : }
945 :
// Report back how far the source pointer advanced.
946 13 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
947 13 : return iDstPixel;
948 : }
949 :
950 : /************************************************************************/
951 : /* QuadraticMeanFloatSSE2() */
952 : /************************************************************************/
953 :
954 : #if !defined(ARM_V7)
955 :
956 : #ifdef __SSE3__
957 : #define sse2_hadd_ps _mm_hadd_ps
958 : #else
// Horizontal pairwise addition of two float vectors, used in place of
// _mm_hadd_ps when __SSE3__ is not defined.
// Returns (a0 + a1, a2 + a3, b0 + b1, b2 + b3).
959 82 : inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
960 : {
// Gather the even-indexed lanes: (a0, a2, b0, b2)
961 82 : auto aEven_bEven = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
// Gather the odd-indexed lanes: (a1, a3, b1, b3)
962 82 : auto aOdd_bOdd = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
963 82 : return _mm_add_ps(aEven_bEven, aOdd_bOdd); // (aEven + aOdd, bEven + bOdd)
964 : }
965 : #endif
966 :
967 : #ifdef __AVX2__
// Width-neutral aliases for the single-precision intrinsics used by
// QuadraticMeanFloatSSE2(). In this __AVX2__ branch they map to the 256-bit
// _mm256_* intrinsics (8 floats per register).
968 : #define set1_ps _mm256_set1_ps
969 : #define loadu_ps _mm256_loadu_ps
970 : #define andnot_ps _mm256_andnot_ps
971 : #define and_ps _mm256_and_ps
972 : #define max_ps _mm256_max_ps
973 : #define shuffle_ps _mm256_shuffle_ps
974 : #define div_ps _mm256_div_ps
// AVX has no dedicated "compare equal" intrinsic: use the generic compare
// with the _CMP_EQ_OQ predicate.
975 : #define cmpeq_ps(x, y) _mm256_cmp_ps((x), (y), _CMP_EQ_OQ)
976 : #define mul_ps _mm256_mul_ps
977 : #define add_ps _mm256_add_ps
978 : #define hadd_ps _mm256_hadd_ps
979 : #define sqrt_ps _mm256_sqrt_ps
980 : #define or_ps _mm256_or_ps
981 : #define unpacklo_ps _mm256_unpacklo_ps
982 : #define unpackhi_ps _mm256_unpackhi_ps
983 : #define storeu_ps _mm256_storeu_ps
984 : #define blendv_ps _mm256_blendv_ps
985 :
// Lane-wise square of 8 packed floats.
986 : inline __m256 SQUARE_PS(__m256 x)
987 : {
988 : return _mm256_mul_ps(x, x);
989 : }
990 :
991 : #else
992 :
// Width-neutral aliases for the single-precision intrinsics used by
// QuadraticMeanFloatSSE2(). In this non-AVX2 branch they map to the 128-bit
// _mm_* intrinsics (4 floats per register).
993 : #define set1_ps _mm_set1_ps
994 : #define loadu_ps _mm_loadu_ps
995 : #define andnot_ps _mm_andnot_ps
996 : #define and_ps _mm_and_ps
997 : #define max_ps _mm_max_ps
998 : #define shuffle_ps _mm_shuffle_ps
999 : #define div_ps _mm_div_ps
1000 : #define cmpeq_ps _mm_cmpeq_ps
1001 : #define mul_ps _mm_mul_ps
1002 : #define add_ps _mm_add_ps
// sse2_hadd_ps is either _mm_hadd_ps or its SSE2 emulation.
1003 : #define hadd_ps sse2_hadd_ps
1004 : #define sqrt_ps _mm_sqrt_ps
1005 : #define or_ps _mm_or_ps
1006 : #define unpacklo_ps _mm_unpacklo_ps
1007 : #define unpackhi_ps _mm_unpackhi_ps
1008 : #define storeu_ps _mm_storeu_ps
1009 :
// Lane-wise select: returns b where mask is set and a elsewhere.
//
// The fallback below is a bitwise select, so it only matches _mm_blendv_ps
// when every lane of mask is all-ones or all-zeros, which is what a
// comparison intrinsic produces (the visible caller passes a cmpeq_ps
// result).
1010 132 : inline __m128 blendv_ps(__m128 a, __m128 b, __m128 mask)
1011 : {
1012 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
1013 : return _mm_blendv_ps(a, b, mask);
1014 : #else
// (~mask & a) | (mask & b)
1015 396 : return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b));
1016 : #endif
1017 : }
1018 :
// Lane-wise square of 4 packed floats.
1019 528 : inline __m128 SQUARE_PS(__m128 x)
1020 : {
1021 528 : return _mm_mul_ps(x, x);
1022 : }
1023 :
// Identity for 128-bit vectors: returns x unchanged.
// NOTE(review): presumably the 256-bit build uses a FIXUP_LANES overload,
// defined outside this section, that reorders the lanes after per-128-bit
// shuffles - confirm.
1024 132 : inline __m128 FIXUP_LANES(__m128 x)
1025 : {
1026 132 : return x;
1027 : }
1028 :
1029 : #endif
1030 :
// 2x2 quadratic mean (RMS) of Float32 samples using SIMD.
//
// Each output pixel is sqrt((a^2 + b^2 + c^2 + d^2) / 4) where a, b are two
// horizontally adjacent samples of the line at pSrcScanlineShiftedInOut and
// c, d the samples at the same positions nChunkXSize elements further.
// The four values are divided by their largest absolute value before being
// squared, and the result is scaled back afterwards.
//
// nDstXWidth: number of destination pixels wanted for this scanline.
// nChunkXSize: element offset between the two source lines that are combined.
// pSrcScanlineShiftedInOut: in: first source sample to read; out: advanced by
//     2 source samples per destination pixel actually produced.
// pDstScanline: destination scanline, written from index 0.
//
// Returns the number of destination pixels written (a multiple of DEST_ELTS).
// The visible caller finishes the remaining pixels with a scalar loop
// starting at the returned index.
1031 : static int
1032 : #if defined(__GNUC__)
1033 : __attribute__((noinline))
1034 : #endif
1035 66 : QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
1036 : const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1037 : float *CPL_RESTRICT pDstScanline)
1038 : {
1039 : // Optimized implementation for RMS on Float32 by
1040 : // processing by group of DEST_ELTS output pixels.
1041 66 : const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1042 :
1043 66 : int iDstPixel = 0;
// -0.0f has only the sign bit set: used as a sign-bit mask.
1044 66 : const auto minus_zero = set1_ps(-0.0f);
1045 66 : const auto zeroDot25 = set1_ps(0.25f);
1046 66 : const auto one = set1_ps(1.0f);
1047 66 : const auto infv = set1_ps(std::numeric_limits<float>::infinity());
// Number of floats per register: depends on which set1_ps alias is active.
1048 66 : constexpr int DEST_ELTS = static_cast<int>(sizeof(one) / sizeof(float));
1049 :
// Iterate only while a full group of DEST_ELTS destination pixels remains.
1050 198 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1051 : {
1052 : // Load 2*DEST_ELTS Float32 from each line
1053 132 : auto firstLineLo = loadu_ps(pSrcScanlineShifted);
1054 132 : auto firstLineHi = loadu_ps(pSrcScanlineShifted + DEST_ELTS);
1055 132 : auto secondLineLo = loadu_ps(pSrcScanlineShifted + nChunkXSize);
1056 : auto secondLineHi =
1057 264 : loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize);
1058 :
1059 : // Take the absolute value (clear the sign bit)
1060 132 : firstLineLo = andnot_ps(minus_zero, firstLineLo);
1061 132 : firstLineHi = andnot_ps(minus_zero, firstLineHi);
1062 132 : secondLineLo = andnot_ps(minus_zero, secondLineLo);
1063 132 : secondLineHi = andnot_ps(minus_zero, secondLineHi);
1064 :
// De-interleave: separate the even-indexed and odd-indexed samples of each
// line so that both samples of a horizontal pair sit in the same lane.
// NOTE(review): with 256-bit registers the shuffle works per 128-bit half,
// so the lane order is presumably restored by FIXUP_LANES below - confirm.
1065 : auto firstLineEven =
1066 132 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1067 : auto firstLineOdd =
1068 132 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1069 : auto secondLineEven =
1070 132 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1071 : auto secondLineOdd =
1072 132 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1073 :
1074 : // For each output pixel, compute the maximum of its 4 (absolute) source values
1075 396 : const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
1076 : max_ps(secondLineEven, secondLineOdd));
1077 :
1078 : // Normalize each value by the maximum of its group of 4 values.
1079 : // This step is important to avoid that the square evaluates to infinity
1080 : // for sufficiently big input.
1081 132 : auto invMax = div_ps(one, maxV);
1082 : // Where the maximum is 0, force invMax to 0 instead of 1 / 0.
1083 : // Note: comparing to -0 leads to identical results as comparing with
1084 : // 0
1085 264 : invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
1086 :
1087 132 : firstLineEven = mul_ps(firstLineEven, invMax);
1088 132 : firstLineOdd = mul_ps(firstLineOdd, invMax);
1089 132 : secondLineEven = mul_ps(secondLineEven, invMax);
1090 132 : secondLineOdd = mul_ps(secondLineOdd, invMax);
1091 :
1092 : // Compute squares
1093 132 : firstLineEven = SQUARE_PS(firstLineEven);
1094 132 : firstLineOdd = SQUARE_PS(firstLineOdd);
1095 132 : secondLineEven = SQUARE_PS(secondLineEven);
1096 132 : secondLineOdd = SQUARE_PS(secondLineOdd);
1097 :
1098 396 : const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
1099 : add_ps(secondLineEven, secondLineOdd));
1100 :
// rms = max * sqrt(sum of normalized squares / 4)
1101 396 : auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
1102 :
1103 : // Where the maximum is infinite, force the result to +infinity.
// NOTE(review): the sign was dropped above, so a group mixing +inf and -inf
// yields +inf here, while the scalar tail loop of the caller sums the signed
// values and would give NaN for such a group - confirm which is intended.
1104 132 : const auto maskIsInf = cmpeq_ps(maxV, infv);
1105 132 : rms = blendv_ps(rms, infv, maskIsInf);
1106 :
1107 132 : rms = FIXUP_LANES(rms);
1108 :
1109 132 : storeu_ps(&pDstScanline[iDstPixel], rms);
// Two source samples were consumed per destination pixel.
1110 132 : pSrcScanlineShifted += DEST_ELTS * 2;
1111 : }
1112 :
// Report back how far the source pointer advanced.
1113 66 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1114 66 : return iDstPixel;
1115 : }
1116 :
1117 : /************************************************************************/
1118 : /* AverageFloatSSE2() */
1119 : /************************************************************************/
1120 :
// 2x2 box average of Float32 samples using 128-bit SIMD.
//
// Each output pixel is 0.25 * a + 0.25 * b + 0.25 * c + 0.25 * d where a, b
// are two horizontally adjacent samples of the line at
// pSrcScanlineShiftedInOut and c, d the samples at the same positions
// nChunkXSize elements further.
//
// nDstXWidth: number of destination pixels wanted for this scanline.
// nChunkXSize: element offset between the two source lines that are combined.
// pSrcScanlineShiftedInOut: in: first source sample to read; out: advanced by
//     2 source samples per destination pixel actually produced.
// pDstScanline: destination scanline, written from index 0.
//
// Returns the number of destination pixels written (a multiple of 4). The
// visible caller finishes the remaining pixels with a scalar loop starting
// at the returned index.
1121 50 : static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
1122 : const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1123 : float *CPL_RESTRICT pDstScanline)
1124 : {
1125 : // Optimized implementation for average on Float32 by
1126 : // processing by group of 4 output pixels.
1127 50 : const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1128 :
1129 50 : int iDstPixel = 0;
// This function uses the 128-bit _mm_* intrinsics directly, not the
// width-neutral *_ps aliases.
1130 50 : const auto zeroDot25 = _mm_set1_ps(0.25f);
// Number of floats in one 128-bit register (4).
1131 50 : constexpr int DEST_ELTS =
1132 : static_cast<int>(sizeof(zeroDot25) / sizeof(float));
1133 :
// Iterate only while a full group of DEST_ELTS destination pixels remains.
1134 132 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1135 : {
1136 : // Load 2 * DEST_ELTS Float32 from each line, scaling each value by
// 0.25 before any addition (presumably to avoid overflow of the sum, as
// the scalar code path of the caller does - confirm)
1137 : const auto firstLineLo =
1138 82 : _mm_mul_ps(_mm_loadu_ps(pSrcScanlineShifted), zeroDot25);
1139 164 : const auto firstLineHi = _mm_mul_ps(
1140 : _mm_loadu_ps(pSrcScanlineShifted + DEST_ELTS), zeroDot25);
1141 82 : const auto secondLineLo = _mm_mul_ps(
1142 82 : _mm_loadu_ps(pSrcScanlineShifted + nChunkXSize), zeroDot25);
1143 164 : const auto secondLineHi = _mm_mul_ps(
1144 82 : _mm_loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize),
1145 : zeroDot25);
1146 :
1147 : // Vertical addition
1148 82 : const auto tmpLo = _mm_add_ps(firstLineLo, secondLineLo);
1149 82 : const auto tmpHi = _mm_add_ps(firstLineHi, secondLineHi);
1150 :
1151 : // Horizontal addition of each pair of adjacent lanes
1152 82 : const auto average = sse2_hadd_ps(tmpLo, tmpHi);
1153 :
1154 82 : _mm_storeu_ps(&pDstScanline[iDstPixel], average);
// Two source samples were consumed per destination pixel.
1155 82 : pSrcScanlineShifted += DEST_ELTS * 2;
1156 : }
1157 :
// Report back how far the source pointer advanced.
1158 50 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1159 50 : return iDstPixel;
1160 : }
1161 :
1162 : /************************************************************************/
1163 : /* AverageDoubleSSE2() */
1164 : /************************************************************************/
1165 :
// 2x2 box average of Float64 samples using 128-bit SIMD.
//
// Each output pixel is 0.25 * a + 0.25 * b + 0.25 * c + 0.25 * d where a, b
// are two horizontally adjacent samples of the line at
// pSrcScanlineShiftedInOut and c, d the samples at the same positions
// nChunkXSize elements further.
//
// nDstXWidth: number of destination pixels wanted for this scanline.
// nChunkXSize: element offset between the two source lines that are combined.
// pSrcScanlineShiftedInOut: in: first source sample to read; out: advanced by
//     2 source samples per destination pixel actually produced.
// pDstScanline: destination scanline, written from index 0.
//
// Returns the number of destination pixels written (a multiple of 2). The
// visible caller finishes the remaining pixels with a scalar loop starting
// at the returned index.
1166 : static int
1167 50 : AverageDoubleSSE2(int nDstXWidth, int nChunkXSize,
1168 : const double *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1169 : double *CPL_RESTRICT pDstScanline)
1170 : {
1171 : // Optimized implementation for average on Float64 by
1172 : // processing by group of 2 output pixels.
1173 50 : const double *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1174 :
1175 50 : int iDstPixel = 0;
1176 50 : const auto zeroDot25 = _mm_set1_pd(0.25);
// Number of doubles in one 128-bit register (2).
1177 50 : constexpr int DEST_ELTS =
1178 : static_cast<int>(sizeof(zeroDot25) / sizeof(double));
1179 :
// Iterate only while a full group of DEST_ELTS destination pixels remains.
1180 211 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1181 : {
1182 : // Load 2 * DEST_ELTS Float64 (4 values) from each line, scaled by 0.25
1183 161 : const auto firstLine0 = _mm_mul_pd(
1184 : _mm_loadu_pd(pSrcScanlineShifted + 0 * DEST_ELTS), zeroDot25);
1185 322 : const auto firstLine1 = _mm_mul_pd(
1186 : _mm_loadu_pd(pSrcScanlineShifted + 1 * DEST_ELTS), zeroDot25);
1187 161 : const auto secondLine0 = _mm_mul_pd(
1188 161 : _mm_loadu_pd(pSrcScanlineShifted + 0 * DEST_ELTS + nChunkXSize),
1189 : zeroDot25);
1190 322 : const auto secondLine1 = _mm_mul_pd(
1191 161 : _mm_loadu_pd(pSrcScanlineShifted + 1 * DEST_ELTS + nChunkXSize),
1192 : zeroDot25);
1193 :
1194 : // Vertical addition
1195 161 : const auto tmp0 = _mm_add_pd(firstLine0, secondLine0);
1196 161 : const auto tmp1 = _mm_add_pd(firstLine1, secondLine1);
1197 :
1198 : // Horizontal addition (sse2_hadd_pd is defined outside this section)
1199 161 : const auto average0 = sse2_hadd_pd(tmp0, tmp1);
1200 :
1201 161 : _mm_storeu_pd(&pDstScanline[iDstPixel + 0], average0);
// Two source samples were consumed per destination pixel.
1202 161 : pSrcScanlineShifted += DEST_ELTS * 2;
1203 : }
1204 :
// Report back how far the source pointer advanced.
1205 50 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1206 50 : return iDstPixel;
1207 : }
1208 :
1209 : #endif
1210 :
1211 : #endif
1212 :
1213 : /************************************************************************/
1214 : /* GDALResampleChunk_AverageOrRMS() */
1215 : /************************************************************************/
1216 :
1217 : template <class T, class Tsum, GDALDataType eWrkDataType, bool bQuadraticMean>
1218 : static CPLErr
1219 7362 : GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
1220 : const T *pChunk, void **ppDstBuffer)
1221 : {
1222 7362 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1223 7362 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1224 7362 : const double dfSrcXDelta = args.dfSrcXDelta;
1225 7362 : const double dfSrcYDelta = args.dfSrcYDelta;
1226 7362 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1227 7362 : const int nChunkXOff = args.nChunkXOff;
1228 7362 : const int nChunkYOff = args.nChunkYOff;
1229 7362 : const int nChunkXSize = args.nChunkXSize;
1230 7362 : const int nChunkYSize = args.nChunkYSize;
1231 7362 : const int nDstXOff = args.nDstXOff;
1232 7362 : const int nDstXOff2 = args.nDstXOff2;
1233 7362 : const int nDstYOff = args.nDstYOff;
1234 7362 : const int nDstYOff2 = args.nDstYOff2;
1235 7362 : const char *pszResampling = args.pszResampling;
1236 7362 : bool bHasNoData = args.bHasNoData;
1237 7362 : const double dfNoDataValue = args.dfNoDataValue;
1238 7362 : const GDALColorTable *const poColorTable =
1239 : !bQuadraticMean &&
1240 : // AVERAGE_BIT2GRAYSCALE
1241 7279 : CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"))
1242 : ? nullptr
1243 : : args.poColorTable;
1244 7362 : const bool bPropagateNoData = args.bPropagateNoData;
1245 :
1246 7362 : T tNoDataValue = (!bHasNoData) ? 0 : static_cast<T>(dfNoDataValue);
1247 7362 : const T tReplacementVal =
1248 206 : bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
1249 72 : args.eOvrDataType, dfNoDataValue))
1250 : : 0;
1251 :
1252 7362 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
1253 7362 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1254 7362 : const int nDstXWidth = nDstXOff2 - nDstXOff;
1255 :
1256 : /* -------------------------------------------------------------------- */
1257 : /* Allocate buffers. */
1258 : /* -------------------------------------------------------------------- */
1259 7362 : *ppDstBuffer = static_cast<T *>(
1260 7362 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
1261 : GDALGetDataTypeSizeBytes(eWrkDataType)));
1262 7362 : if (*ppDstBuffer == nullptr)
1263 : {
1264 0 : return CE_Failure;
1265 : }
1266 7362 : T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
1267 :
1268 : struct PrecomputedXValue
1269 : {
1270 : int nLeftXOffShifted;
1271 : int nRightXOffShifted;
1272 : double dfLeftWeight;
1273 : double dfRightWeight;
1274 : double dfTotalWeightFullLine;
1275 : };
1276 :
1277 : PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
1278 7362 : VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(PrecomputedXValue)));
1279 :
1280 7362 : if (pasSrcX == nullptr)
1281 : {
1282 0 : return CE_Failure;
1283 : }
1284 :
1285 7362 : std::vector<GDALColorEntry> colorEntries;
1286 :
1287 7362 : if (poColorTable)
1288 : {
1289 5 : int nTransparentIdx = -1;
1290 5 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1291 :
1292 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1293 : // it as nodata value
1294 6 : if (bHasNoData && dfNoDataValue >= 0.0 &&
1295 1 : tNoDataValue < colorEntries.size())
1296 1 : colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1297 :
1298 : // Or if we have no explicit nodata, but a color table entry that is
1299 : // transparent, consider it as the nodata value
1300 4 : else if (!bHasNoData && nTransparentIdx >= 0)
1301 : {
1302 0 : bHasNoData = true;
1303 0 : tNoDataValue = static_cast<T>(nTransparentIdx);
1304 : }
1305 : }
1306 :
1307 : /* ==================================================================== */
1308 : /* Precompute inner loop constants. */
1309 : /* ==================================================================== */
1310 7362 : bool bSrcXSpacingIsTwo = true;
1311 7362 : int nLastSrcXOff2 = -1;
1312 1689160 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1313 : {
1314 1681805 : const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
1315 : // Apply some epsilon to avoid numerical precision issues
1316 1681805 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
1317 1681805 : const double dfSrcXOff2 =
1318 1681805 : dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
1319 1681805 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
1320 :
1321 1681805 : if (nSrcXOff < nChunkXOff)
1322 0 : nSrcXOff = nChunkXOff;
1323 1681805 : if (nSrcXOff2 == nSrcXOff)
1324 0 : nSrcXOff2++;
1325 1681805 : if (nSrcXOff2 > nChunkRightXOff)
1326 1 : nSrcXOff2 = nChunkRightXOff;
1327 :
1328 1681805 : pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
1329 1681805 : pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
1330 1681805 : nSrcXOff2 - nChunkXOff;
1331 21 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
1332 1681805 : (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
1333 1681805 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
1334 1681805 : 1 - (nSrcXOff2 - dfSrcXOff2);
1335 1681805 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
1336 1681805 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
1337 1681805 : if (nSrcXOff + 1 < nSrcXOff2)
1338 : {
1339 1681779 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1340 1681779 : nSrcXOff2 - nSrcXOff - 2;
1341 1681779 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1342 1681779 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
1343 : }
1344 :
1345 1681805 : if (nSrcXOff2 - nSrcXOff != 2 ||
1346 1583882 : (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
1347 : {
1348 91989 : bSrcXSpacingIsTwo = false;
1349 : }
1350 1681805 : nLastSrcXOff2 = nSrcXOff2;
1351 : }
1352 :
1353 : /* ==================================================================== */
1354 : /* Loop over destination scanlines. */
1355 : /* ==================================================================== */
1356 705422 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1357 : {
1358 698060 : const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
1359 698060 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
1360 698060 : if (nSrcYOff < nChunkYOff)
1361 0 : nSrcYOff = nChunkYOff;
1362 :
1363 698060 : const double dfSrcYOff2 =
1364 698060 : dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
1365 698060 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
1366 698060 : if (nSrcYOff2 == nSrcYOff)
1367 0 : ++nSrcYOff2;
1368 698060 : if (nSrcYOff2 > nChunkBottomYOff)
1369 3 : nSrcYOff2 = nChunkBottomYOff;
1370 :
1371 698060 : T *const pDstScanline =
1372 698060 : pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
1373 :
1374 : /* --------------------------------------------------------------------
1375 : */
1376 : /* Loop over destination pixels */
1377 : /* --------------------------------------------------------------------
1378 : */
1379 698060 : if (poColorTable == nullptr)
1380 : {
1381 697945 : if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
1382 : pabyChunkNodataMask == nullptr)
1383 : {
1384 : if constexpr (eWrkDataType == GDT_UInt8 ||
1385 : eWrkDataType == GDT_UInt16)
1386 : {
1387 : // Optimized case : no nodata, overview by a factor of 2 and
1388 : // regular x and y src spacing.
1389 129392 : const T *pSrcScanlineShifted =
1390 129392 : pChunk + pasSrcX[0].nLeftXOffShifted +
1391 129392 : static_cast<size_t>(nSrcYOff - nChunkYOff) *
1392 129392 : nChunkXSize;
1393 129392 : int iDstPixel = 0;
1394 : #ifdef USE_SSE2
1395 : if constexpr (eWrkDataType == GDT_UInt8)
1396 : {
1397 : if constexpr (bQuadraticMean)
1398 : {
1399 5389 : iDstPixel = QuadraticMeanByteSSE2OrAVX2(
1400 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1401 : pDstScanline);
1402 : }
1403 : else
1404 : {
1405 123976 : iDstPixel = AverageByteSSE2OrAVX2(
1406 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1407 : pDstScanline);
1408 : }
1409 : }
1410 : else
1411 : {
1412 : static_assert(eWrkDataType == GDT_UInt16);
1413 : if constexpr (bQuadraticMean)
1414 : {
1415 14 : iDstPixel = QuadraticMeanUInt16SSE2(
1416 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1417 : pDstScanline);
1418 : }
1419 : else
1420 : {
1421 13 : iDstPixel = AverageUInt16SSE2(
1422 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1423 : pDstScanline);
1424 : }
1425 : }
1426 : #endif
1427 303851 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1428 : {
1429 174459 : Tsum nTotal = 0;
1430 : T nVal;
1431 : if constexpr (bQuadraticMean)
1432 52 : nTotal =
1433 52 : SQUARE<Tsum>(pSrcScanlineShifted[0]) +
1434 52 : SQUARE<Tsum>(pSrcScanlineShifted[1]) +
1435 52 : SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
1436 52 : SQUARE<Tsum>(
1437 52 : pSrcScanlineShifted[1 + nChunkXSize]);
1438 : else
1439 174407 : nTotal = pSrcScanlineShifted[0] +
1440 174407 : pSrcScanlineShifted[1] +
1441 174407 : pSrcScanlineShifted[nChunkXSize] +
1442 174407 : pSrcScanlineShifted[1 + nChunkXSize];
1443 :
1444 174459 : constexpr int nTotalWeight = 4;
1445 : if constexpr (bQuadraticMean)
1446 52 : nVal = ComputeIntegerRMS_4values<T>(nTotal);
1447 : else
1448 174407 : nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
1449 : nTotalWeight);
1450 :
1451 : // No need to compare nVal against tNoDataValue as we
1452 : // are in a case where pabyChunkNodataMask == nullptr
1453 : // implies the absence of nodata value.
1454 174459 : pDstScanline[iDstPixel] = nVal;
1455 174459 : pSrcScanlineShifted += 2;
1456 : }
1457 : }
1458 : else
1459 : {
1460 : static_assert(eWrkDataType == GDT_Float32 ||
1461 : eWrkDataType == GDT_Float64);
1462 202 : const T *pSrcScanlineShifted =
1463 202 : pChunk + pasSrcX[0].nLeftXOffShifted +
1464 202 : static_cast<size_t>(nSrcYOff - nChunkYOff) *
1465 202 : nChunkXSize;
1466 202 : int iDstPixel = 0;
1467 : #if defined(USE_SSE2) && !defined(ARM_V7)
1468 : if constexpr (eWrkDataType == GDT_Float32)
1469 : {
1470 : static_assert(std::is_same_v<T, float>);
1471 : if constexpr (bQuadraticMean)
1472 : {
1473 66 : iDstPixel = QuadraticMeanFloatSSE2(
1474 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1475 : pDstScanline);
1476 : }
1477 : else
1478 : {
1479 50 : iDstPixel = AverageFloatSSE2(
1480 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1481 : pDstScanline);
1482 : }
1483 : }
1484 : else
1485 : {
1486 : if constexpr (!bQuadraticMean)
1487 : {
1488 50 : iDstPixel = AverageDoubleSSE2(
1489 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1490 : pDstScanline);
1491 : }
1492 : }
1493 : #endif
1494 :
1495 726 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1496 : {
1497 : T nVal;
1498 :
1499 : if constexpr (bQuadraticMean)
1500 : {
1501 : // Avoid issues with large values by renormalizing
1502 96 : const auto max = std::max(
1503 420 : {std::fabs(pSrcScanlineShifted[0]),
1504 420 : std::fabs(pSrcScanlineShifted[1]),
1505 420 : std::fabs(pSrcScanlineShifted[nChunkXSize]),
1506 420 : std::fabs(
1507 420 : pSrcScanlineShifted[1 + nChunkXSize])});
1508 420 : if (max == 0)
1509 : {
1510 8 : nVal = 0;
1511 : }
1512 412 : else if (std::isinf(max))
1513 : {
1514 : // If there is at least one infinity value,
1515 : // then just summing, and taking the abs
1516 : // value will give the expected result:
1517 : // * +inf if all values are +inf
1518 : // * +inf if all values are -inf
1519 : // * NaN otherwise
1520 82 : nVal = std::fabs(
1521 82 : pSrcScanlineShifted[0] +
1522 82 : pSrcScanlineShifted[1] +
1523 82 : pSrcScanlineShifted[nChunkXSize] +
1524 82 : pSrcScanlineShifted[1 + nChunkXSize]);
1525 : }
1526 : else
1527 : {
1528 330 : const auto inv_max = static_cast<T>(1.0) / max;
1529 330 : nVal =
1530 : max *
1531 330 : std::sqrt(
1532 : static_cast<T>(0.25) *
1533 330 : (SQUARE(pSrcScanlineShifted[0] *
1534 330 : inv_max) +
1535 330 : SQUARE(pSrcScanlineShifted[1] *
1536 330 : inv_max) +
1537 330 : SQUARE(
1538 330 : pSrcScanlineShifted[nChunkXSize] *
1539 330 : inv_max) +
1540 330 : SQUARE(
1541 330 : pSrcScanlineShifted[1 +
1542 : nChunkXSize] *
1543 : inv_max)));
1544 : }
1545 : }
1546 : else
1547 : {
1548 104 : constexpr auto weight = static_cast<T>(0.25);
1549 : // Multiply each value by weight to avoid
1550 : // potential overflow
1551 104 : nVal =
1552 104 : (weight * pSrcScanlineShifted[0] +
1553 104 : weight * pSrcScanlineShifted[1] +
1554 104 : weight * pSrcScanlineShifted[nChunkXSize] +
1555 104 : weight * pSrcScanlineShifted[1 + nChunkXSize]);
1556 : }
1557 :
1558 : // No need to compare nVal against tNoDataValue as we
1559 : // are in a case where pabyChunkNodataMask == nullptr
1560 : // implies the absence of nodata value.
1561 524 : pDstScanline[iDstPixel] = nVal;
1562 524 : pSrcScanlineShifted += 2;
1563 : }
1564 129594 : }
1565 : }
1566 : else
1567 : {
1568 17 : const double dfBottomWeight =
1569 568351 : (nSrcYOff + 1 == nSrcYOff2) ? 1.0
1570 568334 : : 1.0 - (dfSrcYOff - nSrcYOff);
1571 568351 : const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
1572 568351 : nSrcYOff -= nChunkYOff;
1573 568351 : nSrcYOff2 -= nChunkYOff;
1574 :
1575 568351 : double dfTotalWeightFullColumn = dfBottomWeight;
1576 568351 : if (nSrcYOff + 1 < nSrcYOff2)
1577 : {
1578 568334 : dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
1579 568334 : dfTotalWeightFullColumn += dfTopWeight;
1580 : }
1581 :
1582 9784185 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1583 : {
1584 9215839 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1585 9215839 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1586 :
1587 9215839 : double dfTotal = 0;
1588 9215839 : double dfTotalWeight = 0;
1589 9215839 : [[maybe_unused]] double dfMulFactor = 1.0;
1590 9215839 : [[maybe_unused]] double dfInvMulFactor = 1.0;
1591 9215839 : constexpr bool bUseMulFactor =
1592 : (eWrkDataType == GDT_Float32 ||
1593 : eWrkDataType == GDT_Float64);
1594 9215839 : if (pabyChunkNodataMask == nullptr)
1595 : {
1596 : if constexpr (bUseMulFactor)
1597 : {
1598 : if constexpr (bQuadraticMean)
1599 : {
1600 80 : T mulFactor = 0;
1601 80 : auto pChunkShifted =
1602 80 : pChunk +
1603 80 : static_cast<size_t>(nSrcYOff) * nChunkXSize;
1604 :
1605 240 : for (int iY = nSrcYOff; iY < nSrcYOff2;
1606 160 : ++iY, pChunkShifted += nChunkXSize)
1607 : {
1608 480 : for (int iX = nSrcXOff; iX < nSrcXOff2;
1609 : ++iX)
1610 640 : mulFactor = std::max(
1611 : mulFactor,
1612 320 : std::fabs(pChunkShifted[iX]));
1613 : }
1614 80 : dfMulFactor = double(mulFactor);
1615 142 : dfInvMulFactor =
1616 62 : dfMulFactor > 0 &&
1617 62 : std::isfinite(dfMulFactor)
1618 : ? 1.0 / dfMulFactor
1619 : : 1.0;
1620 : }
1621 : else
1622 : {
1623 139 : dfMulFactor = (nSrcYOff2 - nSrcYOff) *
1624 139 : (nSrcXOff2 - nSrcXOff);
1625 139 : dfInvMulFactor = 1.0 / dfMulFactor;
1626 : }
1627 : }
1628 :
1629 1746545 : auto pChunkShifted =
1630 227 : pChunk +
1631 1746545 : static_cast<size_t>(nSrcYOff) * nChunkXSize;
1632 1746545 : int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1633 1746545 : double dfWeightY = dfBottomWeight;
1634 3493539 : while (true)
1635 : {
1636 : double dfTotalLine;
1637 : if constexpr (bQuadraticMean)
1638 : {
1639 : // Left pixel
1640 : {
1641 216 : const T val = pChunkShifted[nSrcXOff];
1642 216 : dfTotalLine =
1643 216 : SQUARE(double(val) * dfInvMulFactor) *
1644 216 : pasSrcX[iDstPixel].dfLeftWeight;
1645 : }
1646 :
1647 216 : if (nSrcXOff + 1 < nSrcXOff2)
1648 : {
1649 : // Middle pixels
1650 216 : for (int iX = nSrcXOff + 1;
1651 536 : iX < nSrcXOff2 - 1; ++iX)
1652 : {
1653 320 : const T val = pChunkShifted[iX];
1654 320 : dfTotalLine += SQUARE(double(val) *
1655 : dfInvMulFactor);
1656 : }
1657 :
1658 : // Right pixel
1659 : {
1660 216 : const T val =
1661 216 : pChunkShifted[nSrcXOff2 - 1];
1662 216 : dfTotalLine +=
1663 216 : SQUARE(double(val) *
1664 216 : dfInvMulFactor) *
1665 216 : pasSrcX[iDstPixel].dfRightWeight;
1666 : }
1667 : }
1668 : }
1669 : else
1670 : {
1671 : // Left pixel
1672 : {
1673 5239868 : const T val = pChunkShifted[nSrcXOff];
1674 5239868 : dfTotalLine =
1675 5239868 : double(val) * dfInvMulFactor *
1676 5239868 : pasSrcX[iDstPixel].dfLeftWeight;
1677 : }
1678 :
1679 5239868 : if (nSrcXOff + 1 < nSrcXOff2)
1680 : {
1681 : // Middle pixels
1682 4239442 : for (int iX = nSrcXOff + 1;
1683 64183238 : iX < nSrcXOff2 - 1; ++iX)
1684 : {
1685 59943836 : const T val = pChunkShifted[iX];
1686 59943836 : dfTotalLine +=
1687 59943836 : double(val) * dfInvMulFactor;
1688 : }
1689 :
1690 : // Right pixel
1691 : {
1692 4239442 : const T val =
1693 4239442 : pChunkShifted[nSrcXOff2 - 1];
1694 4239442 : dfTotalLine +=
1695 4239442 : double(val) * dfInvMulFactor *
1696 4239442 : pasSrcX[iDstPixel].dfRightWeight;
1697 : }
1698 : }
1699 : }
1700 :
1701 5240084 : dfTotal += dfTotalLine * dfWeightY;
1702 5240084 : --nCounterY;
1703 5240084 : if (nCounterY < 0)
1704 1746545 : break;
1705 3493539 : pChunkShifted += nChunkXSize;
1706 3493539 : dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
1707 : }
1708 :
1709 1746545 : dfTotalWeight =
1710 1746545 : pasSrcX[iDstPixel].dfTotalWeightFullLine *
1711 : dfTotalWeightFullColumn;
1712 : }
1713 : else
1714 : {
1715 7469294 : size_t nCount = 0;
1716 30285576 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1717 : {
1718 22816292 : const auto pChunkShifted =
1719 22816292 : pChunk + static_cast<size_t>(iY) * nChunkXSize;
1720 :
1721 22816292 : double dfTotalLine = 0;
1722 22816292 : double dfTotalWeightLine = 0;
1723 : // Left pixel
1724 : {
1725 22816292 : const int iX = nSrcXOff;
1726 22816292 : const T val = pChunkShifted[iX];
1727 22816292 : if (pabyChunkNodataMask
1728 22816292 : [iX +
1729 22816292 : static_cast<size_t>(iY) * nChunkXSize])
1730 : {
1731 17325139 : nCount++;
1732 17325139 : const double dfWeightX =
1733 17325139 : pasSrcX[iDstPixel].dfLeftWeight;
1734 17325139 : dfTotalWeightLine = dfWeightX;
1735 : if constexpr (bQuadraticMean)
1736 508 : dfTotalLine =
1737 508 : SQUARE(double(val)) * dfWeightX;
1738 : else
1739 17324631 : dfTotalLine = double(val) * dfWeightX;
1740 : }
1741 : }
1742 :
1743 22816292 : if (nSrcXOff < nSrcXOff2 - 1)
1744 : {
1745 : // Middle pixels
1746 61618372 : for (int iX = nSrcXOff + 1; iX < nSrcXOff2 - 1;
1747 : ++iX)
1748 : {
1749 38802080 : const T val = pChunkShifted[iX];
1750 38802080 : if (pabyChunkNodataMask
1751 38802080 : [iX + static_cast<size_t>(iY) *
1752 38802080 : nChunkXSize])
1753 : {
1754 28038780 : nCount++;
1755 28038780 : dfTotalWeightLine += 1;
1756 : if constexpr (bQuadraticMean)
1757 640 : dfTotalLine += SQUARE(double(val));
1758 : else
1759 28038140 : dfTotalLine += double(val);
1760 : }
1761 : }
1762 :
1763 : // Right pixel
1764 : {
1765 22816292 : const int iX = nSrcXOff2 - 1;
1766 22816292 : const T val = pChunkShifted[iX];
1767 22816292 : if (pabyChunkNodataMask
1768 22816292 : [iX + static_cast<size_t>(iY) *
1769 22816292 : nChunkXSize])
1770 : {
1771 17324495 : nCount++;
1772 17324495 : const double dfWeightX =
1773 17324495 : pasSrcX[iDstPixel].dfRightWeight;
1774 17324495 : dfTotalWeightLine += dfWeightX;
1775 : if constexpr (bQuadraticMean)
1776 503 : dfTotalLine +=
1777 503 : SQUARE(double(val)) * dfWeightX;
1778 : else
1779 17323992 : dfTotalLine +=
1780 17323992 : double(val) * dfWeightX;
1781 : }
1782 : }
1783 : }
1784 :
1785 38163300 : const double dfWeightY =
1786 : (iY == nSrcYOff) ? dfBottomWeight
1787 15347008 : : (iY + 1 == nSrcYOff2) ? dfTopWeight
1788 : : 1.0;
1789 22816292 : dfTotal += dfTotalLine * dfWeightY;
1790 22816292 : dfTotalWeight += dfTotalWeightLine * dfWeightY;
1791 : }
1792 :
1793 7469294 : if (nCount == 0 ||
1794 8 : (bPropagateNoData &&
1795 : nCount <
1796 8 : static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
1797 8 : (nSrcXOff2 - nSrcXOff)))
1798 : {
1799 2307682 : pDstScanline[iDstPixel] = tNoDataValue;
1800 2307682 : continue;
1801 : }
1802 : }
1803 : if constexpr (eWrkDataType == GDT_UInt8)
1804 : {
1805 : T nVal;
1806 : if constexpr (bQuadraticMean)
1807 38 : nVal = ComputeIntegerRMS<T, int>(dfTotal,
1808 : dfTotalWeight);
1809 : else
1810 6901260 : nVal =
1811 6901260 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1812 6901298 : if (bHasNoData && nVal == tNoDataValue)
1813 0 : nVal = tReplacementVal;
1814 6901298 : pDstScanline[iDstPixel] = nVal;
1815 : }
1816 : else if constexpr (eWrkDataType == GDT_UInt16)
1817 : {
1818 : T nVal;
1819 : if constexpr (bQuadraticMean)
1820 4 : nVal = ComputeIntegerRMS<T, uint64_t>(
1821 : dfTotal, dfTotalWeight);
1822 : else
1823 4 : nVal =
1824 4 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1825 8 : if (bHasNoData && nVal == tNoDataValue)
1826 0 : nVal = tReplacementVal;
1827 8 : pDstScanline[iDstPixel] = nVal;
1828 : }
1829 : else
1830 : {
1831 : T nVal;
1832 : if constexpr (bQuadraticMean)
1833 : {
1834 : if constexpr (bUseMulFactor)
1835 249 : nVal = static_cast<T>(
1836 132 : dfMulFactor *
1837 249 : sqrt(dfTotal / dfTotalWeight));
1838 : else
1839 : nVal = static_cast<T>(
1840 : sqrt(dfTotal / dfTotalWeight));
1841 : }
1842 : else
1843 : {
1844 : if constexpr (bUseMulFactor)
1845 6602 : nVal = static_cast<T>(
1846 6602 : dfMulFactor * (dfTotal / dfTotalWeight));
1847 : else
1848 : nVal = static_cast<T>(dfTotal / dfTotalWeight);
1849 : }
1850 6851 : if (bHasNoData && nVal == tNoDataValue)
1851 2 : nVal = tReplacementVal;
1852 6851 : pDstScanline[iDstPixel] = nVal;
1853 : }
1854 : }
1855 : }
1856 : }
1857 : else
1858 : {
1859 115 : nSrcYOff -= nChunkYOff;
1860 115 : nSrcYOff2 -= nChunkYOff;
1861 :
1862 6590 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1863 : {
1864 6475 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1865 6475 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1866 :
1867 6475 : uint64_t nTotalR = 0;
1868 6475 : uint64_t nTotalG = 0;
1869 6475 : uint64_t nTotalB = 0;
1870 6475 : size_t nCount = 0;
1871 :
1872 19425 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1873 : {
1874 38850 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
1875 : {
1876 25900 : const T val =
1877 25900 : pChunk[iX + static_cast<size_t>(iY) * nChunkXSize];
1878 : // cppcheck-suppress unsignedLessThanZero
1879 25900 : if (val < 0 || val >= colorEntries.size())
1880 0 : continue;
1881 25900 : const size_t idx = static_cast<size_t>(val);
1882 25900 : const auto &entry = colorEntries[idx];
1883 25900 : if (entry.c4)
1884 : {
1885 : if constexpr (bQuadraticMean)
1886 : {
1887 800 : nTotalR += SQUARE<int>(entry.c1);
1888 800 : nTotalG += SQUARE<int>(entry.c2);
1889 800 : nTotalB += SQUARE<int>(entry.c3);
1890 800 : ++nCount;
1891 : }
1892 : else
1893 : {
1894 13328 : nTotalR += entry.c1;
1895 13328 : nTotalG += entry.c2;
1896 13328 : nTotalB += entry.c3;
1897 13328 : ++nCount;
1898 : }
1899 : }
1900 : }
1901 : }
1902 :
1903 6475 : if (nCount == 0 ||
1904 0 : (bPropagateNoData &&
1905 0 : nCount < static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
1906 0 : (nSrcXOff2 - nSrcXOff)))
1907 : {
1908 2838 : pDstScanline[iDstPixel] = tNoDataValue;
1909 : }
1910 : else
1911 : {
1912 : GDALColorEntry color;
1913 : if constexpr (bQuadraticMean)
1914 : {
1915 200 : color.c1 =
1916 200 : static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
1917 200 : color.c2 =
1918 200 : static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
1919 200 : color.c3 =
1920 200 : static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
1921 : }
1922 : else
1923 : {
1924 3437 : color.c1 =
1925 3437 : static_cast<short>((nTotalR + nCount / 2) / nCount);
1926 3437 : color.c2 =
1927 3437 : static_cast<short>((nTotalG + nCount / 2) / nCount);
1928 3437 : color.c3 =
1929 3437 : static_cast<short>((nTotalB + nCount / 2) / nCount);
1930 : }
1931 3637 : pDstScanline[iDstPixel] =
1932 3637 : static_cast<T>(BestColorEntry(colorEntries, color));
1933 : }
1934 : }
1935 : }
1936 : }
1937 :
1938 7362 : CPLFree(pasSrcX);
1939 :
1940 7362 : return CE_None;
1941 : }
1942 :
1943 : template <bool bQuadraticMean>
1944 : static CPLErr
1945 7362 : GDALResampleChunk_AverageOrRMSInternal(const GDALOverviewResampleArgs &args,
1946 : const void *pChunk, void **ppDstBuffer,
1947 : GDALDataType *peDstBufferDataType)
1948 : {
1949 7362 : *peDstBufferDataType = args.eWrkDataType;
1950 7362 : switch (args.eWrkDataType)
1951 : {
1952 7217 : case GDT_UInt8:
1953 : {
1954 : return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_UInt8,
1955 7217 : bQuadraticMean>(
1956 7217 : args, static_cast<const GByte *>(pChunk), ppDstBuffer);
1957 : }
1958 :
1959 11 : case GDT_UInt16:
1960 : {
1961 : if constexpr (bQuadraticMean)
1962 : {
1963 : // Use double as accumulation type, because UInt32 could overflow
1964 : return GDALResampleChunk_AverageOrRMS_T<
1965 6 : GUInt16, double, GDT_UInt16, bQuadraticMean>(
1966 6 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1967 : }
1968 : else
1969 : {
1970 : return GDALResampleChunk_AverageOrRMS_T<
1971 5 : GUInt16, GUInt32, GDT_UInt16, bQuadraticMean>(
1972 5 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1973 : }
1974 : }
1975 :
1976 81 : case GDT_Float32:
1977 : {
1978 : return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32,
1979 81 : bQuadraticMean>(
1980 81 : args, static_cast<const float *>(pChunk), ppDstBuffer);
1981 : }
1982 :
1983 53 : case GDT_Float64:
1984 : {
1985 : return GDALResampleChunk_AverageOrRMS_T<double, double, GDT_Float64,
1986 53 : bQuadraticMean>(
1987 53 : args, static_cast<const double *>(pChunk), ppDstBuffer);
1988 : }
1989 :
1990 0 : default:
1991 0 : break;
1992 : }
1993 :
1994 0 : CPLAssert(false);
1995 : return CE_Failure;
1996 : }
1997 :
1998 : static CPLErr
1999 7362 : GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
2000 : const void *pChunk, void **ppDstBuffer,
2001 : GDALDataType *peDstBufferDataType)
2002 : {
2003 7362 : if (EQUAL(args.pszResampling, "RMS"))
2004 83 : return GDALResampleChunk_AverageOrRMSInternal<true>(
2005 83 : args, pChunk, ppDstBuffer, peDstBufferDataType);
2006 : else
2007 7279 : return GDALResampleChunk_AverageOrRMSInternal<false>(
2008 7279 : args, pChunk, ppDstBuffer, peDstBufferDataType);
2009 : }
2010 :
2011 : /************************************************************************/
2012 : /* GDALResampleChunk_Gauss() */
2013 : /************************************************************************/
2014 :
2015 86 : static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
2016 : const void *pChunk, void **ppDstBuffer,
2017 : GDALDataType *peDstBufferDataType)
2018 :
2019 : {
2020 86 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2021 86 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2022 86 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2023 86 : const int nChunkXOff = args.nChunkXOff;
2024 86 : const int nChunkXSize = args.nChunkXSize;
2025 86 : const int nChunkYOff = args.nChunkYOff;
2026 86 : const int nChunkYSize = args.nChunkYSize;
2027 86 : const int nDstXOff = args.nDstXOff;
2028 86 : const int nDstXOff2 = args.nDstXOff2;
2029 86 : const int nDstYOff = args.nDstYOff;
2030 86 : const int nDstYOff2 = args.nDstYOff2;
2031 86 : const bool bHasNoData = args.bHasNoData;
2032 86 : double dfNoDataValue = args.dfNoDataValue;
2033 86 : const GDALColorTable *poColorTable = args.poColorTable;
2034 :
2035 86 : const double *const padfChunk = static_cast<const double *>(pChunk);
2036 :
2037 86 : *ppDstBuffer =
2038 86 : VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
2039 : GDALGetDataTypeSizeBytes(GDT_Float64));
2040 86 : if (*ppDstBuffer == nullptr)
2041 : {
2042 0 : return CE_Failure;
2043 : }
2044 86 : *peDstBufferDataType = GDT_Float64;
2045 86 : double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
2046 :
2047 : /* -------------------------------------------------------------------- */
2048 : /* Create the filter kernel and allocate scanline buffer. */
2049 : /* -------------------------------------------------------------------- */
2050 86 : int nGaussMatrixDim = 3;
2051 : const int *panGaussMatrix;
2052 86 : constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
2053 86 : constexpr int anGaussMatrix5x5[] = {1, 4, 6, 4, 1, 4, 16, 24, 16,
2054 : 4, 6, 24, 36, 24, 6, 4, 16, 24,
2055 : 16, 4, 1, 4, 6, 4, 1};
2056 86 : constexpr int anGaussMatrix7x7[] = {
2057 : 1, 6, 15, 20, 15, 6, 1, 6, 36, 90, 120, 90, 36,
2058 : 6, 15, 90, 225, 300, 225, 90, 15, 20, 120, 300, 400, 300,
2059 : 120, 20, 15, 90, 225, 300, 225, 90, 15, 6, 36, 90, 120,
2060 : 90, 36, 6, 1, 6, 15, 20, 15, 6, 1};
2061 :
2062 86 : const int nOXSize = args.nOvrXSize;
2063 86 : const int nOYSize = args.nOvrYSize;
2064 86 : const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
2065 :
2066 : // matrix for gauss filter
2067 86 : if (nResYFactor <= 2)
2068 : {
2069 85 : panGaussMatrix = anGaussMatrix3x3;
2070 85 : nGaussMatrixDim = 3;
2071 : }
2072 1 : else if (nResYFactor <= 4)
2073 : {
2074 0 : panGaussMatrix = anGaussMatrix5x5;
2075 0 : nGaussMatrixDim = 5;
2076 : }
2077 : else
2078 : {
2079 1 : panGaussMatrix = anGaussMatrix7x7;
2080 1 : nGaussMatrixDim = 7;
2081 : }
2082 :
2083 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
2084 : int *panGaussMatrixDup = static_cast<int *>(
2085 : CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
2086 : memcpy(panGaussMatrixDup, panGaussMatrix,
2087 : sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
2088 : panGaussMatrix = panGaussMatrixDup;
2089 : #endif
2090 :
2091 86 : if (!bHasNoData)
2092 79 : dfNoDataValue = 0.0;
2093 :
2094 86 : std::vector<GDALColorEntry> colorEntries;
2095 86 : int nTransparentIdx = -1;
2096 86 : if (poColorTable)
2097 2 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
2098 :
2099 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
2100 : // it as nodata value.
2101 92 : if (bHasNoData && dfNoDataValue >= 0.0 &&
2102 6 : dfNoDataValue < colorEntries.size())
2103 0 : colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
2104 :
2105 : // Or if we have no explicit nodata, but a color table entry that is
2106 : // transparent, consider it as the nodata value.
2107 86 : else if (!bHasNoData && nTransparentIdx >= 0)
2108 : {
2109 0 : dfNoDataValue = nTransparentIdx;
2110 : }
2111 :
2112 86 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2113 86 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2114 86 : const int nDstXWidth = nDstXOff2 - nDstXOff;
2115 :
2116 : /* ==================================================================== */
2117 : /* Loop over destination scanlines. */
2118 : /* ==================================================================== */
2119 16488 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2120 : {
2121 16402 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
2122 16402 : int nSrcYOff2 =
2123 16402 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
2124 :
2125 16402 : if (nSrcYOff < nChunkYOff)
2126 : {
2127 0 : nSrcYOff = nChunkYOff;
2128 0 : nSrcYOff2++;
2129 : }
2130 :
2131 16402 : const int iSizeY = nSrcYOff2 - nSrcYOff;
2132 16402 : nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
2133 16402 : nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
2134 :
2135 16402 : if (nSrcYOff2 > nChunkBottomYOff ||
2136 16359 : (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
2137 : {
2138 44 : nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
2139 : }
2140 :
2141 16402 : int nYShiftGaussMatrix = 0;
2142 16402 : if (nSrcYOff < nChunkYOff)
2143 : {
2144 0 : nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
2145 0 : nSrcYOff = nChunkYOff;
2146 : }
2147 :
2148 16402 : const double *const padfSrcScanline =
2149 16402 : padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
2150 16402 : const GByte *pabySrcScanlineNodataMask = nullptr;
2151 16402 : if (pabyChunkNodataMask != nullptr)
2152 152 : pabySrcScanlineNodataMask =
2153 152 : pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
2154 :
2155 : /* --------------------------------------------------------------------
2156 : */
2157 : /* Loop over destination pixels */
2158 : /* --------------------------------------------------------------------
2159 : */
2160 16402 : double *const padfDstScanline =
2161 16402 : padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
2162 4149980 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2163 : {
2164 4133580 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
2165 4133580 : int nSrcXOff2 =
2166 4133580 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
2167 :
2168 4133580 : if (nSrcXOff < nChunkXOff)
2169 : {
2170 0 : nSrcXOff = nChunkXOff;
2171 0 : nSrcXOff2++;
2172 : }
2173 :
2174 4133580 : const int iSizeX = nSrcXOff2 - nSrcXOff;
2175 4133580 : nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
2176 4133580 : nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
2177 :
2178 4133580 : if (nSrcXOff2 > nChunkRightXOff ||
2179 4127930 : (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
2180 : {
2181 5650 : nSrcXOff2 =
2182 5650 : std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
2183 : }
2184 :
2185 4133580 : int nXShiftGaussMatrix = 0;
2186 4133580 : if (nSrcXOff < nChunkXOff)
2187 : {
2188 0 : nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
2189 0 : nSrcXOff = nChunkXOff;
2190 : }
2191 :
2192 4133580 : if (poColorTable == nullptr)
2193 : {
2194 4133380 : double dfTotal = 0.0;
2195 4133380 : GInt64 nCount = 0;
2196 4133380 : const int *panLineWeight =
2197 4133380 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
2198 : nXShiftGaussMatrix;
2199 :
2200 16527900 : for (int iY = nSrcYOff; iY < nSrcYOff2;
2201 12394500 : ++iY, panLineWeight += nGaussMatrixDim)
2202 : {
2203 49561300 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2204 : {
2205 37166800 : const double val =
2206 37166800 : padfSrcScanline[iX - nChunkXOff +
2207 37166800 : static_cast<GPtrDiff_t>(iY -
2208 37166800 : nSrcYOff) *
2209 37166800 : nChunkXSize];
2210 37166800 : if (pabySrcScanlineNodataMask == nullptr ||
2211 32872 : pabySrcScanlineNodataMask[iX - nChunkXOff +
2212 32872 : static_cast<GPtrDiff_t>(
2213 32872 : iY - nSrcYOff) *
2214 32872 : nChunkXSize])
2215 : {
2216 37146100 : const int nWeight = panLineWeight[i];
2217 37146100 : dfTotal += val * nWeight;
2218 37146100 : nCount += nWeight;
2219 : }
2220 : }
2221 : }
2222 :
2223 4133380 : if (nCount == 0)
2224 : {
2225 2217 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2226 : }
2227 : else
2228 : {
2229 4131160 : padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
2230 : }
2231 : }
2232 : else
2233 : {
2234 200 : GInt64 nTotalR = 0;
2235 200 : GInt64 nTotalG = 0;
2236 200 : GInt64 nTotalB = 0;
2237 200 : GInt64 nTotalWeight = 0;
2238 200 : const int *panLineWeight =
2239 200 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
2240 : nXShiftGaussMatrix;
2241 :
2242 780 : for (int iY = nSrcYOff; iY < nSrcYOff2;
2243 580 : ++iY, panLineWeight += nGaussMatrixDim)
2244 : {
2245 2262 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2246 : {
2247 1682 : const double val =
2248 1682 : padfSrcScanline[iX - nChunkXOff +
2249 1682 : static_cast<GPtrDiff_t>(iY -
2250 1682 : nSrcYOff) *
2251 1682 : nChunkXSize];
2252 1682 : if (val < 0 || val >= colorEntries.size())
2253 0 : continue;
2254 :
2255 1682 : size_t idx = static_cast<size_t>(val);
2256 1682 : if (colorEntries[idx].c4)
2257 : {
2258 1682 : const int nWeight = panLineWeight[i];
2259 1682 : nTotalR +=
2260 1682 : static_cast<GInt64>(colorEntries[idx].c1) *
2261 1682 : nWeight;
2262 1682 : nTotalG +=
2263 1682 : static_cast<GInt64>(colorEntries[idx].c2) *
2264 1682 : nWeight;
2265 1682 : nTotalB +=
2266 1682 : static_cast<GInt64>(colorEntries[idx].c3) *
2267 1682 : nWeight;
2268 1682 : nTotalWeight += nWeight;
2269 : }
2270 : }
2271 : }
2272 :
2273 200 : if (nTotalWeight == 0)
2274 : {
2275 0 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2276 : }
2277 : else
2278 : {
2279 : GDALColorEntry color;
2280 :
2281 200 : color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
2282 : nTotalWeight);
2283 200 : color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
2284 : nTotalWeight);
2285 200 : color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
2286 : nTotalWeight);
2287 200 : padfDstScanline[iDstPixel - nDstXOff] =
2288 200 : BestColorEntry(colorEntries, color);
2289 : }
2290 : }
2291 : }
2292 : }
2293 :
2294 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
2295 : CPLFree(panGaussMatrixDup);
2296 : #endif
2297 :
2298 86 : return CE_None;
2299 : }
2300 :
2301 : /************************************************************************/
2302 : /* GDALResampleChunk_Mode() */
2303 : /************************************************************************/
2304 :
/** Generic value comparison for mode resampling: plain operator==. */
template <class T> static inline bool IsSame(T lhs, T rhs)
{
    const bool bEqual = (lhs == rhs);
    return bEqual;
}
2309 :
2310 60 : template <> bool IsSame<GFloat16>(GFloat16 a, GFloat16 b)
2311 : {
2312 60 : return a == b || (CPLIsNan(a) && CPLIsNan(b));
2313 : }
2314 :
2315 5583 : template <> bool IsSame<float>(float a, float b)
2316 : {
2317 5583 : return a == b || (std::isnan(a) && std::isnan(b));
2318 : }
2319 :
2320 1701 : template <> bool IsSame<double>(double a, double b)
2321 : {
2322 1701 : return a == b || (std::isnan(a) && std::isnan(b));
2323 : }
2324 :
2325 : namespace
2326 : {
2327 : struct ComplexFloat16
2328 : {
2329 : GFloat16 r;
2330 : GFloat16 i;
2331 : };
2332 : } // namespace
2333 :
2334 60 : template <> bool IsSame<ComplexFloat16>(ComplexFloat16 a, ComplexFloat16 b)
2335 : {
2336 90 : return (a.r == b.r && a.i == b.i) ||
2337 90 : (CPLIsNan(a.r) && CPLIsNan(a.i) && CPLIsNan(b.r) && CPLIsNan(b.i));
2338 : }
2339 :
2340 : template <>
2341 60 : bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
2342 : {
2343 120 : return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
2344 120 : std::isnan(b.real()) && std::isnan(b.imag()));
2345 : }
2346 :
2347 : template <>
2348 60 : bool IsSame<std::complex<double>>(std::complex<double> a,
2349 : std::complex<double> b)
2350 : {
2351 120 : return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
2352 120 : std::isnan(b.real()) && std::isnan(b.imag()));
2353 : }
2354 :
2355 : template <class T>
2356 182 : static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
2357 : const T *pChunk, T *const pDstBuffer)
2358 :
2359 : {
2360 182 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2361 182 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2362 182 : const double dfSrcXDelta = args.dfSrcXDelta;
2363 182 : const double dfSrcYDelta = args.dfSrcYDelta;
2364 182 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2365 182 : const int nChunkXOff = args.nChunkXOff;
2366 182 : const int nChunkXSize = args.nChunkXSize;
2367 182 : const int nChunkYOff = args.nChunkYOff;
2368 182 : const int nChunkYSize = args.nChunkYSize;
2369 182 : const int nDstXOff = args.nDstXOff;
2370 182 : const int nDstXOff2 = args.nDstXOff2;
2371 182 : const int nDstYOff = args.nDstYOff;
2372 182 : const int nDstYOff2 = args.nDstYOff2;
2373 182 : const bool bHasNoData = args.bHasNoData;
2374 182 : const GDALColorTable *poColorTable = args.poColorTable;
2375 182 : const int nDstXSize = nDstXOff2 - nDstXOff;
2376 :
2377 8 : T tNoDataValue;
2378 : if constexpr (std::is_same<T, ComplexFloat16>::value)
2379 : {
2380 4 : tNoDataValue.r = cpl::NumericLimits<GFloat16>::quiet_NaN();
2381 4 : tNoDataValue.i = cpl::NumericLimits<GFloat16>::quiet_NaN();
2382 : }
2383 : else if constexpr (std::is_same<T, std::complex<float>>::value ||
2384 : std::is_same<T, std::complex<double>>::value)
2385 : {
2386 : using BaseT = typename T::value_type;
2387 8 : tNoDataValue =
2388 : std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
2389 : std::numeric_limits<BaseT>::quiet_NaN());
2390 : }
2391 170 : else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
2392 169 : tNoDataValue = 0;
2393 : else
2394 1 : tNoDataValue = static_cast<T>(args.dfNoDataValue);
2395 :
2396 : using CountType = uint32_t;
2397 182 : CountType nMaxNumPx = 0;
2398 182 : T *paVals = nullptr;
2399 182 : CountType *panCounts = nullptr;
2400 :
2401 182 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2402 182 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2403 364 : std::vector<int> anVals(256, 0);
2404 :
2405 : /* ==================================================================== */
2406 : /* Loop over destination scanlines. */
2407 : /* ==================================================================== */
2408 7713 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2409 : {
2410 7531 : const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
2411 7531 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
2412 : #ifdef only_pixels_with_more_than_10_pct_participation
2413 : // When oversampling, don't take into account pixels that have a tiny
2414 : // participation in the resulting pixel
2415 : if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2416 : nSrcYOff < nChunkBottomYOff)
2417 : nSrcYOff++;
2418 : #endif
2419 7531 : if (nSrcYOff < nChunkYOff)
2420 0 : nSrcYOff = nChunkYOff;
2421 :
2422 7531 : const double dfSrcYOff2 =
2423 7531 : dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
2424 7531 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
2425 : #ifdef only_pixels_with_more_than_10_pct_participation
2426 : // When oversampling, don't take into account pixels that have a tiny
2427 : // participation in the resulting pixel
2428 : if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2429 : nSrcYOff2 > nChunkYOff)
2430 : nSrcYOff2--;
2431 : #endif
2432 7531 : if (nSrcYOff2 == nSrcYOff)
2433 0 : ++nSrcYOff2;
2434 7531 : if (nSrcYOff2 > nChunkBottomYOff)
2435 0 : nSrcYOff2 = nChunkBottomYOff;
2436 :
2437 7531 : const T *const paSrcScanline =
2438 281 : pChunk +
2439 7531 : (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
2440 7531 : const GByte *pabySrcScanlineNodataMask = nullptr;
2441 7531 : if (pabyChunkNodataMask != nullptr)
2442 1838 : pabySrcScanlineNodataMask =
2443 : pabyChunkNodataMask +
2444 1838 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
2445 :
2446 7531 : T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
2447 : /* --------------------------------------------------------------------
2448 : */
2449 : /* Loop over destination pixels */
2450 : /* --------------------------------------------------------------------
2451 : */
2452 4260596 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2453 : {
2454 4253061 : const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
2455 : // Apply some epsilon to avoid numerical precision issues
2456 4253061 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
2457 : #ifdef only_pixels_with_more_than_10_pct_participation
2458 : // When oversampling, don't take into account pixels that have a
2459 : // tiny participation in the resulting pixel
2460 : if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2461 : nSrcXOff < nChunkRightXOff)
2462 : nSrcXOff++;
2463 : #endif
2464 4253061 : if (nSrcXOff < nChunkXOff)
2465 0 : nSrcXOff = nChunkXOff;
2466 :
2467 4253061 : const double dfSrcXOff2 =
2468 4253061 : dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
2469 4253061 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
2470 : #ifdef only_pixels_with_more_than_10_pct_participation
2471 : // When oversampling, don't take into account pixels that have a
2472 : // tiny participation in the resulting pixel
2473 : if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2474 : nSrcXOff2 > nChunkXOff)
2475 : nSrcXOff2--;
2476 : #endif
2477 4253061 : if (nSrcXOff2 == nSrcXOff)
2478 0 : nSrcXOff2++;
2479 4253061 : if (nSrcXOff2 > nChunkRightXOff)
2480 0 : nSrcXOff2 = nChunkRightXOff;
2481 :
2482 4253061 : bool bRegularProcessing = false;
2483 : if constexpr (!std::is_same<T, GByte>::value)
2484 1671 : bRegularProcessing = true;
2485 4251390 : else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
2486 0 : bRegularProcessing = true;
2487 :
2488 4253061 : if (bRegularProcessing)
2489 : {
2490 : // Sanity check to make sure the allocation of paVals and
2491 : // panCounts don't overflow.
2492 : static_assert(sizeof(CountType) <= sizeof(size_t));
2493 3342 : if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
2494 1671 : static_cast<CountType>(nSrcYOff2 - nSrcYOff) >
2495 1671 : (std::numeric_limits<CountType>::max() /
2496 3342 : std::max(sizeof(T), sizeof(CountType))) /
2497 1671 : static_cast<CountType>(nSrcXOff2 - nSrcXOff))
2498 : {
2499 0 : CPLError(CE_Failure, CPLE_NotSupported,
2500 : "Too big downsampling factor");
2501 0 : CPLFree(paVals);
2502 0 : CPLFree(panCounts);
2503 0 : return CE_Failure;
2504 : }
2505 1671 : const CountType nNumPx =
2506 1671 : static_cast<CountType>(nSrcYOff2 - nSrcYOff) *
2507 1671 : (nSrcXOff2 - nSrcXOff);
2508 1671 : CountType iMaxInd = 0;
2509 1671 : CountType iMaxVal = 0;
2510 :
2511 1671 : if (paVals == nullptr || nNumPx > nMaxNumPx)
2512 : {
2513 : T *paValsNew = static_cast<T *>(
2514 116 : VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
2515 : CountType *panCountsNew =
2516 116 : static_cast<CountType *>(VSI_REALLOC_VERBOSE(
2517 : panCounts, nNumPx * sizeof(CountType)));
2518 116 : if (paValsNew != nullptr)
2519 116 : paVals = paValsNew;
2520 116 : if (panCountsNew != nullptr)
2521 116 : panCounts = panCountsNew;
2522 116 : if (paValsNew == nullptr || panCountsNew == nullptr)
2523 : {
2524 0 : CPLFree(paVals);
2525 0 : CPLFree(panCounts);
2526 0 : return CE_Failure;
2527 : }
2528 116 : nMaxNumPx = nNumPx;
2529 : }
2530 :
2531 5245 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2532 : {
2533 3574 : const GPtrDiff_t iTotYOff =
2534 3574 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2535 3574 : nChunkXOff;
2536 11842 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2537 : {
2538 8268 : if (pabySrcScanlineNodataMask == nullptr ||
2539 1552 : pabySrcScanlineNodataMask[iX + iTotYOff])
2540 : {
2541 8247 : const T val = paSrcScanline[iX + iTotYOff];
2542 8247 : CountType i = 0; // Used after for.
2543 :
2544 : // Check array for existing entry.
2545 11611 : for (; i < iMaxInd; ++i)
2546 : {
2547 8212 : if (IsSame(paVals[i], val))
2548 : {
2549 4848 : if (++panCounts[i] > panCounts[iMaxVal])
2550 : {
2551 246 : iMaxVal = i;
2552 : }
2553 4848 : break;
2554 : }
2555 : }
2556 :
2557 : // Add to arr if entry not already there.
2558 8247 : if (i == iMaxInd)
2559 : {
2560 3399 : paVals[iMaxInd] = val;
2561 3399 : panCounts[iMaxInd] = 1;
2562 :
2563 3399 : if (iMaxInd == 0)
2564 : {
2565 1668 : iMaxVal = iMaxInd;
2566 : }
2567 :
2568 3399 : ++iMaxInd;
2569 : }
2570 : }
2571 : }
2572 : }
2573 :
2574 1671 : if (iMaxInd == 0)
2575 3 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2576 : else
2577 1668 : paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
2578 : }
2579 : else if constexpr (std::is_same<T, GByte>::value)
2580 : // ( eSrcDataType == GDT_UInt8 && nEntryCount < 256 )
2581 : {
2582 : // So we go here for a paletted or non-paletted byte band.
2583 : // The input values are then between 0 and 255.
2584 4251390 : int nMaxVal = 0;
2585 4251390 : int iMaxInd = -1;
2586 :
2587 : // The cost of this zeroing might be high. Perhaps we should
2588 : // just use the above generic case, and go to this one if the
2589 : // number of source pixels is large enough
2590 4251390 : std::fill(anVals.begin(), anVals.end(), 0);
2591 :
2592 12777800 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2593 : {
2594 8526440 : const GPtrDiff_t iTotYOff =
2595 8526440 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2596 8526440 : nChunkXOff;
2597 25649600 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2598 : {
2599 17123100 : const T val = paSrcScanline[iX + iTotYOff];
2600 17123100 : if (!bHasNoData || val != tNoDataValue)
2601 : {
2602 17123100 : int nVal = static_cast<int>(val);
2603 17123100 : if (++anVals[nVal] > nMaxVal)
2604 : {
2605 : // Sum the density.
2606 : // Is it the most common value so far?
2607 17006400 : iMaxInd = nVal;
2608 17006400 : nMaxVal = anVals[nVal];
2609 : }
2610 : }
2611 : }
2612 : }
2613 :
2614 4251390 : if (iMaxInd == -1)
2615 0 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2616 : else
2617 4251390 : paDstScanline[iDstPixel - nDstXOff] =
2618 : static_cast<T>(iMaxInd);
2619 : }
2620 : }
2621 : }
2622 :
2623 182 : CPLFree(paVals);
2624 182 : CPLFree(panCounts);
2625 :
2626 182 : return CE_None;
2627 : }
2628 :
2629 182 : static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
2630 : const void *pChunk, void **ppDstBuffer,
2631 : GDALDataType *peDstBufferDataType)
2632 : {
2633 182 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
2634 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
2635 : GDALGetDataTypeSizeBytes(args.eWrkDataType));
2636 182 : if (*ppDstBuffer == nullptr)
2637 : {
2638 0 : return CE_Failure;
2639 : }
2640 :
2641 182 : CPLAssert(args.eSrcDataType == args.eWrkDataType);
2642 :
2643 182 : *peDstBufferDataType = args.eWrkDataType;
2644 182 : switch (args.eWrkDataType)
2645 : {
2646 : // For mode resampling, as no computation is done, only the
2647 : // size of the data type matters... except for Byte where we have
2648 : // special processing. And for floating point values
2649 66 : case GDT_UInt8:
2650 : {
2651 66 : return GDALResampleChunk_ModeT(args,
2652 : static_cast<const GByte *>(pChunk),
2653 66 : static_cast<GByte *>(*ppDstBuffer));
2654 : }
2655 :
2656 4 : case GDT_Int8:
2657 : {
2658 4 : return GDALResampleChunk_ModeT(args,
2659 : static_cast<const int8_t *>(pChunk),
2660 4 : static_cast<int8_t *>(*ppDstBuffer));
2661 : }
2662 :
2663 10 : case GDT_Int16:
2664 : case GDT_UInt16:
2665 : {
2666 10 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
2667 10 : return GDALResampleChunk_ModeT(
2668 : args, static_cast<const uint16_t *>(pChunk),
2669 10 : static_cast<uint16_t *>(*ppDstBuffer));
2670 : }
2671 :
2672 15 : case GDT_CInt16:
2673 : case GDT_Int32:
2674 : case GDT_UInt32:
2675 : {
2676 15 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
2677 15 : return GDALResampleChunk_ModeT(
2678 : args, static_cast<const uint32_t *>(pChunk),
2679 15 : static_cast<uint32_t *>(*ppDstBuffer));
2680 : }
2681 :
2682 12 : case GDT_CInt32:
2683 : case GDT_Int64:
2684 : case GDT_UInt64:
2685 : {
2686 12 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
2687 12 : return GDALResampleChunk_ModeT(
2688 : args, static_cast<const uint64_t *>(pChunk),
2689 12 : static_cast<uint64_t *>(*ppDstBuffer));
2690 : }
2691 :
2692 4 : case GDT_Float16:
2693 : {
2694 4 : return GDALResampleChunk_ModeT(
2695 : args, static_cast<const GFloat16 *>(pChunk),
2696 4 : static_cast<GFloat16 *>(*ppDstBuffer));
2697 : }
2698 :
2699 35 : case GDT_Float32:
2700 : {
2701 35 : return GDALResampleChunk_ModeT(args,
2702 : static_cast<const float *>(pChunk),
2703 35 : static_cast<float *>(*ppDstBuffer));
2704 : }
2705 :
2706 24 : case GDT_Float64:
2707 : {
2708 24 : return GDALResampleChunk_ModeT(args,
2709 : static_cast<const double *>(pChunk),
2710 24 : static_cast<double *>(*ppDstBuffer));
2711 : }
2712 :
2713 4 : case GDT_CFloat16:
2714 : {
2715 4 : return GDALResampleChunk_ModeT(
2716 : args, static_cast<const ComplexFloat16 *>(pChunk),
2717 4 : static_cast<ComplexFloat16 *>(*ppDstBuffer));
2718 : }
2719 :
2720 4 : case GDT_CFloat32:
2721 : {
2722 4 : return GDALResampleChunk_ModeT(
2723 : args, static_cast<const std::complex<float> *>(pChunk),
2724 4 : static_cast<std::complex<float> *>(*ppDstBuffer));
2725 : }
2726 :
2727 4 : case GDT_CFloat64:
2728 : {
2729 4 : return GDALResampleChunk_ModeT(
2730 : args, static_cast<const std::complex<double> *>(pChunk),
2731 4 : static_cast<std::complex<double> *>(*ppDstBuffer));
2732 : }
2733 :
2734 0 : case GDT_Unknown:
2735 : case GDT_TypeCount:
2736 0 : break;
2737 : }
2738 :
2739 0 : CPLAssert(false);
2740 : return CE_Failure;
2741 : }
2742 :
2743 : /************************************************************************/
2744 : /* GDALResampleConvolutionHorizontal() */
2745 : /************************************************************************/
2746 :
// Returns the weighted sum of nSrcPixelCount consecutive pixels:
//   sum(double(pChunk[k]) * padfWeights[k]) for k in [0, nSrcPixelCount).
//
// Two accumulators are used in the unrolled part: the first receives taps
// 0 and 1 of each group of four (and all leftover taps), the second
// receives taps 2 and 3. Their sum is returned.
template <class T>
static inline double
GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                  int nSrcPixelCount)
{
    double dfAccFront = 0.0;
    double dfAccBack = 0.0;
    int iPixel = 0;  // Also drives the leftover loop below.
    // The manually unrolled loop is disabled for the Intel compiler:
    // version 2024.0.2.29 (maybe others) crashes on it in -O2 and -O3.
    // See https://github.com/OSGeo/gdal/issues/9508
#if !defined(__INTEL_CLANG_COMPILER)
    while (iPixel < nSrcPixelCount - 3)
    {
        const T *const pSrc = pChunk + iPixel;
        const double *const pdfW = padfWeights + iPixel;
        dfAccFront += double(pSrc[0]) * pdfW[0];
        dfAccFront += double(pSrc[1]) * pdfW[1];
        dfAccBack += double(pSrc[2]) * pdfW[2];
        dfAccBack += double(pSrc[3]) * pdfW[3];
        iPixel += 4;
    }
#endif
    // Leftover taps (or all of them when the unrolled loop is disabled).
    while (iPixel < nSrcPixelCount)
    {
        dfAccFront += double(pChunk[iPixel]) * padfWeights[iPixel];
        ++iPixel;
    }
    return dfAccFront + dfAccBack;
}
2773 :
2774 : template <class T, bool bHasNaN>
2775 46368 : static inline void GDALResampleConvolutionHorizontalWithMask(
2776 : const T *pChunk, const GByte *pabyMask, const double *padfWeights,
2777 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2778 : {
2779 46368 : dfVal = 0;
2780 46368 : dfWeightSum = 0;
2781 46368 : int i = 0;
2782 103804 : for (; i < nSrcPixelCount - 3; i += 4)
2783 : {
2784 57436 : double dfWeight0 = padfWeights[i + 0] * pabyMask[i + 0];
2785 57436 : double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
2786 57436 : double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
2787 57436 : double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
2788 :
2789 229744 : const auto MulNaNAware = [](double v, double &w, double &val)
2790 : {
2791 : if constexpr (bHasNaN)
2792 : {
2793 14848 : if (std::isnan(v))
2794 : {
2795 76 : w = 0;
2796 76 : return;
2797 : }
2798 : }
2799 14772 : val += v * w;
2800 : };
2801 :
2802 57436 : MulNaNAware(double(pChunk[i + 0]), dfWeight0, dfVal);
2803 57436 : MulNaNAware(double(pChunk[i + 1]), dfWeight1, dfVal);
2804 57436 : MulNaNAware(double(pChunk[i + 2]), dfWeight2, dfVal);
2805 57436 : MulNaNAware(double(pChunk[i + 3]), dfWeight3, dfVal);
2806 57436 : dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
2807 : }
2808 64874 : for (; i < nSrcPixelCount; ++i)
2809 : {
2810 18506 : const double dfWeight = padfWeights[i] * pabyMask[i];
2811 : if constexpr (bHasNaN)
2812 : {
2813 1920 : if (!std::isnan(pChunk[i]))
2814 : {
2815 1920 : dfVal += double(pChunk[i]) * dfWeight;
2816 1920 : dfWeightSum += dfWeight;
2817 : }
2818 : }
2819 : else
2820 : {
2821 16586 : dfVal += double(pChunk[i]) * dfWeight;
2822 16586 : dfWeightSum += dfWeight;
2823 : }
2824 : }
2825 46368 : }
2826 :
// Applies the same nSrcPixelCount weights to three rows at once.
//
// dfRes1, dfRes2 and dfRes3 receive the weighted sum of pChunkRow1,
// pChunkRow2 and pChunkRow3 respectively. When bHasNaN is true, a NaN
// pixel contributes 0 to its row's sum (the weights are not renormalized
// here).
template <class T, bool bHasNaN>
static inline void GDALResampleConvolutionHorizontal_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    // pixel * weight, or 0 for a NaN pixel when bHasNaN is set.
    const auto WeightedOrZero = [](double dfPixel, double dfWeight) -> double
    {
        if constexpr (bHasNaN)
        {
            if (std::isnan(dfPixel))
                return 0.0;
        }
        return dfPixel * dfWeight;
    };

    // Per row: the "front" accumulator takes taps 0 and 1 of each group of
    // four plus all leftover taps, the "back" accumulator takes taps 2
    // and 3.
    double dfRow1Front = 0.0;
    double dfRow1Back = 0.0;
    double dfRow2Front = 0.0;
    double dfRow2Back = 0.0;
    double dfRow3Front = 0.0;
    double dfRow3Back = 0.0;

    int iTap = 0;  // Also drives the leftover loop below.

    // Accumulates the four taps starting at iTap for one row.
    const auto AddFourTaps =
        [&](const T *pRow, double &dfFront, double &dfBack)
    {
        const T *const pSrc = pRow + iTap;
        const double *const pdfW = padfWeights + iTap;
        dfFront += WeightedOrZero(double(pSrc[0]), pdfW[0]);
        dfFront += WeightedOrZero(double(pSrc[1]), pdfW[1]);
        dfBack += WeightedOrZero(double(pSrc[2]), pdfW[2]);
        dfBack += WeightedOrZero(double(pSrc[3]), pdfW[3]);
    };

    for (; iTap < nSrcPixelCount - 3; iTap += 4)
    {
        AddFourTaps(pChunkRow1, dfRow1Front, dfRow1Back);
        AddFourTaps(pChunkRow2, dfRow2Front, dfRow2Back);
        AddFourTaps(pChunkRow3, dfRow3Front, dfRow3Back);
    }

    for (; iTap < nSrcPixelCount; ++iTap)
    {
        const double dfW = padfWeights[iTap];
        dfRow1Front += WeightedOrZero(double(pChunkRow1[iTap]), dfW);
        dfRow2Front += WeightedOrZero(double(pChunkRow2[iTap]), dfW);
        dfRow3Front += WeightedOrZero(double(pChunkRow3[iTap]), dfW);
    }

    dfRes1 = dfRow1Front + dfRow1Back;
    dfRes2 = dfRow2Front + dfRow2Back;
    dfRes3 = dfRow3Front + dfRow3Back;
}
2876 :
2877 : template <class T, bool bHasNaN>
2878 18980 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
2879 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2880 : const double *padfWeights, int nSrcPixelCount, double &dfRes1,
2881 : double &dfRes2, double &dfRes3)
2882 : {
2883 18980 : GDALResampleConvolutionHorizontal_3rows<T, bHasNaN>(
2884 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeights, nSrcPixelCount, dfRes1,
2885 : dfRes2, dfRes3);
2886 18980 : }
2887 :
2888 : template <class T, bool bHasNaN>
2889 1256690 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
2890 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2891 : const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
2892 : {
2893 1256690 : GDALResampleConvolutionHorizontal_3rows<T, bHasNaN>(
2894 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeights, 4, dfRes1, dfRes2,
2895 : dfRes3);
2896 1256690 : }
2897 :
2898 : /************************************************************************/
2899 : /* GDALResampleConvolutionVertical() */
2900 : /************************************************************************/
2901 :
// Returns the weighted sum of nSrcLineCount values taken down one column:
//   sum(pChunk[k * nStride] * padfWeights[k]) for k in [0, nSrcLineCount).
//
// nStride is the distance, in elements of T, between two consecutive
// values of the column.
template <class T>
static inline double
GDALResampleConvolutionVertical(const T *pChunk, size_t nStride,
                                const double *padfWeights, int nSrcLineCount)
{
    const size_t nStride2 = 2 * nStride;
    const size_t nStride3 = 3 * nStride;
    const size_t nStride4 = 4 * nStride;

    // The first accumulator takes lines 0 and 1 of each group of four and
    // all leftover lines; the second one takes lines 2 and 3.
    double dfAccFront = 0.0;
    double dfAccBack = 0.0;

    int iLine = 0;
    size_t nOffset = 0;  // Element offset of line iLine within pChunk.
    while (iLine < nSrcLineCount - 3)
    {
        const double *const pdfW = padfWeights + iLine;
        dfAccFront += pChunk[nOffset] * pdfW[0];
        dfAccFront += pChunk[nOffset + nStride] * pdfW[1];
        dfAccBack += pChunk[nOffset + nStride2] * pdfW[2];
        dfAccBack += pChunk[nOffset + nStride3] * pdfW[3];
        iLine += 4;
        nOffset += nStride4;
    }
    while (iLine < nSrcLineCount)
    {
        dfAccFront += pChunk[nOffset] * padfWeights[iLine];
        ++iLine;
        nOffset += nStride;
    }
    return dfAccFront + dfAccBack;
}
2924 :
// Vertical weighted sum for two adjacent columns at once.
//
// dfRes1 receives sum(pChunk[k * nStride] * padfWeights[k]) and dfRes2
// receives sum(pChunk[k * nStride + 1] * padfWeights[k]), for k in
// [0, nSrcLineCount). nStride is expressed in elements of T.
template <class T>
static inline void GDALResampleConvolutionVertical_2cols(
    const T *pChunk, size_t nStride, const double *padfWeights,
    int nSrcLineCount, double &dfRes1, double &dfRes2)
{
    // Per column: the "front" accumulator takes lines 0 and 1 of each group
    // of four plus all leftover lines, the "back" one takes lines 2 and 3.
    double dfCol0Front = 0.0;
    double dfCol0Back = 0.0;
    double dfCol1Front = 0.0;
    double dfCol1Back = 0.0;

    int iLine = 0;
    size_t nOffset = 0;  // Element offset of line iLine within pChunk.
    while (iLine < nSrcLineCount - 3)
    {
        const size_t nLine0 = nOffset;
        const size_t nLine1 = nOffset + 1 * nStride;
        const size_t nLine2 = nOffset + 2 * nStride;
        const size_t nLine3 = nOffset + 3 * nStride;
        const double *const pdfW = padfWeights + iLine;

        dfCol0Front += pChunk[nLine0] * pdfW[0];
        dfCol1Front += pChunk[nLine0 + 1] * pdfW[0];
        dfCol0Front += pChunk[nLine1] * pdfW[1];
        dfCol1Front += pChunk[nLine1 + 1] * pdfW[1];
        dfCol0Back += pChunk[nLine2] * pdfW[2];
        dfCol1Back += pChunk[nLine2 + 1] * pdfW[2];
        dfCol0Back += pChunk[nLine3] * pdfW[3];
        dfCol1Back += pChunk[nLine3 + 1] * pdfW[3];

        iLine += 4;
        nOffset += 4 * nStride;
    }
    while (iLine < nSrcLineCount)
    {
        const double dfW = padfWeights[iLine];
        dfCol0Front += pChunk[nOffset] * dfW;
        dfCol1Front += pChunk[nOffset + 1] * dfW;
        ++iLine;
        nOffset += nStride;
    }
    dfRes1 = dfCol0Front + dfCol0Back;
    dfRes2 = dfCol1Front + dfCol1Back;
}
2955 :
2956 : #ifdef USE_SSE2
2957 :
2958 : #ifdef __AVX__
2959 : /************************************************************************/
2960 : /* GDALResampleConvolutionVertical_16cols<T> */
2961 : /************************************************************************/
2962 :
2963 : template <class T>
2964 : static inline void
2965 : GDALResampleConvolutionVertical_16cols(const T *pChunk, size_t nStride,
2966 : const double *padfWeights,
2967 : int nSrcLineCount, float *afDest)
2968 : {
2969 : int i = 0;
2970 : size_t j = 0;
2971 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2972 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2973 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2974 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2975 : for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
2976 : {
2977 : XMMReg4Double w0 =
2978 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2979 : XMMReg4Double w1 =
2980 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2981 : XMMReg4Double w2 =
2982 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2983 : XMMReg4Double w3 =
2984 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2985 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2986 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2987 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
2988 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
2989 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2990 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2991 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
2992 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
2993 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2994 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2995 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
2996 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
2997 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2998 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2999 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
3000 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
3001 : }
3002 : for (; i < nSrcLineCount; ++i, j += nStride)
3003 : {
3004 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
3005 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
3006 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
3007 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
3008 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
3009 : }
3010 : v_acc0.Store4Val(afDest);
3011 : v_acc1.Store4Val(afDest + 4);
3012 : v_acc2.Store4Val(afDest + 8);
3013 : v_acc3.Store4Val(afDest + 12);
3014 : }
3015 :
3016 : template <class T>
3017 : static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
3018 : const double *, int,
3019 : double *)
3020 : {
3021 : // Cannot be reached
3022 : CPLAssert(false);
3023 : }
3024 :
3025 : #else
3026 :
3027 : /************************************************************************/
3028 : /* GDALResampleConvolutionVertical_8cols<T> */
3029 : /************************************************************************/
3030 :
3031 : template <class T>
3032 : static inline void
3033 25689200 : GDALResampleConvolutionVertical_8cols(const T *pChunk, size_t nStride,
3034 : const double *padfWeights,
3035 : int nSrcLineCount, float *afDest)
3036 : {
3037 25689200 : int i = 0;
3038 25689200 : size_t j = 0;
3039 25689200 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
3040 25689200 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3041 53654900 : for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
3042 : {
3043 27965700 : XMMReg4Double w0 =
3044 27965700 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
3045 27965700 : XMMReg4Double w1 =
3046 27965700 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
3047 27965700 : XMMReg4Double w2 =
3048 27965700 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
3049 27965700 : XMMReg4Double w3 =
3050 27965700 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
3051 27965700 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
3052 27965700 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
3053 27965700 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
3054 27965700 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
3055 27965700 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
3056 27965700 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
3057 27965700 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
3058 27965700 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
3059 : }
3060 37257700 : for (; i < nSrcLineCount; ++i, j += nStride)
3061 : {
3062 11568400 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
3063 11568400 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
3064 11568400 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
3065 : }
3066 25689200 : v_acc0.Store4Val(afDest);
3067 25689200 : v_acc1.Store4Val(afDest + 4);
3068 25689200 : }
3069 :
3070 : template <class T>
3071 : static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
3072 : const double *, int,
3073 : double *)
3074 : {
3075 : // Cannot be reached
3076 : CPLAssert(false);
3077 : }
3078 :
3079 : #endif // __AVX__
3080 :
3081 : /************************************************************************/
3082 : /* GDALResampleConvolutionHorizontalSSE2<T> */
3083 : /************************************************************************/
3084 :
3085 : template <class T>
3086 3141452 : static inline double GDALResampleConvolutionHorizontalSSE2(
3087 : const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3088 : {
3089 3141452 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3090 3141452 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3091 3141452 : int i = 0; // Used after for.
3092 3520408 : for (; i < nSrcPixelCount - 7; i += 8)
3093 : {
3094 : // Retrieve the pixel & accumulate
3095 378952 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
3096 378952 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
3097 378952 : const XMMReg4Double v_weight1 =
3098 378952 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3099 378952 : const XMMReg4Double v_weight2 =
3100 378952 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
3101 :
3102 378952 : v_acc1 += v_pixels1 * v_weight1;
3103 378952 : v_acc2 += v_pixels2 * v_weight2;
3104 : }
3105 :
3106 3141452 : v_acc1 += v_acc2;
3107 :
3108 3141452 : double dfVal = v_acc1.GetHorizSum();
3109 10321830 : for (; i < nSrcPixelCount; ++i)
3110 : {
3111 7180380 : dfVal += pChunk[i] * padfWeightsAligned[i];
3112 : }
3113 3141452 : return dfVal;
3114 : }
3115 :
3116 : /************************************************************************/
3117 : /* GDALResampleConvolutionHorizontal<GByte> */
3118 : /************************************************************************/
3119 :
3120 : template <>
3121 2592290 : inline double GDALResampleConvolutionHorizontal<GByte>(
3122 : const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3123 : {
3124 2592290 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
3125 2592290 : nSrcPixelCount);
3126 : }
3127 :
3128 : template <>
3129 549162 : inline double GDALResampleConvolutionHorizontal<GUInt16>(
3130 : const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3131 : {
3132 549162 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
3133 549162 : nSrcPixelCount);
3134 : }
3135 :
3136 : /************************************************************************/
3137 : /* GDALResampleConvolutionHorizontalWithMaskSSE2<T> */
3138 : /************************************************************************/
3139 :
3140 : template <class T>
3141 6408653 : static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
3142 : const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
3143 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
3144 : {
3145 6408653 : int i = 0; // Used after for.
3146 6408653 : XMMReg4Double v_acc = XMMReg4Double::Zero();
3147 6408653 : XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
3148 17785121 : for (; i < nSrcPixelCount - 3; i += 4)
3149 : {
3150 11376458 : const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
3151 11376458 : const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
3152 11376458 : XMMReg4Double v_weight =
3153 11376458 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3154 11376458 : v_weight *= v_mask;
3155 11376458 : v_acc += v_pixels * v_weight;
3156 11376458 : v_acc_weight += v_weight;
3157 : }
3158 :
3159 6408653 : dfVal = v_acc.GetHorizSum();
3160 6408653 : dfWeightSum = v_acc_weight.GetHorizSum();
3161 6614913 : for (; i < nSrcPixelCount; ++i)
3162 : {
3163 206258 : const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
3164 206258 : dfVal += pChunk[i] * dfWeight;
3165 206258 : dfWeightSum += dfWeight;
3166 : }
3167 6408653 : }
3168 :
3169 : /************************************************************************/
3170 : /* GDALResampleConvolutionHorizontalWithMask<GByte> */
3171 : /************************************************************************/
3172 :
3173 : template <>
3174 6408590 : inline void GDALResampleConvolutionHorizontalWithMask<GByte, false>(
3175 : const GByte *pChunk, const GByte *pabyMask,
3176 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
3177 : double &dfWeightSum)
3178 : {
3179 6408590 : GDALResampleConvolutionHorizontalWithMaskSSE2(
3180 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
3181 : dfWeightSum);
3182 6408590 : }
3183 :
3184 : template <>
3185 63 : inline void GDALResampleConvolutionHorizontalWithMask<GUInt16, false>(
3186 : const GUInt16 *pChunk, const GByte *pabyMask,
3187 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
3188 : double &dfWeightSum)
3189 : {
3190 63 : GDALResampleConvolutionHorizontalWithMaskSSE2(
3191 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
3192 : dfWeightSum);
3193 63 : }
3194 :
3195 : /************************************************************************/
3196 : /* GDALResampleConvolutionHorizontal_3rows_SSE2<T> */
3197 : /************************************************************************/
3198 :
3199 : template <class T>
3200 35560186 : static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
3201 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3202 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3203 : double &dfRes2, double &dfRes3)
3204 : {
3205 35560186 : XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
3206 35560186 : v_acc2 = XMMReg4Double::Zero(),
3207 35560186 : v_acc3 = XMMReg4Double::Zero();
3208 35560186 : int i = 0;
3209 70929556 : for (; i < nSrcPixelCount - 7; i += 8)
3210 : {
3211 : // Retrieve the pixel & accumulate.
3212 35369370 : XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3213 35369370 : XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
3214 35369370 : const XMMReg4Double v_weight1 =
3215 35369370 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3216 35369370 : const XMMReg4Double v_weight2 =
3217 35369370 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
3218 :
3219 35369370 : v_acc1 += v_pixels1 * v_weight1;
3220 35369370 : v_acc1 += v_pixels2 * v_weight2;
3221 :
3222 35369370 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3223 35369370 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
3224 35369370 : v_acc2 += v_pixels1 * v_weight1;
3225 35369370 : v_acc2 += v_pixels2 * v_weight2;
3226 :
3227 35369370 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3228 35369370 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
3229 35369370 : v_acc3 += v_pixels1 * v_weight1;
3230 35369370 : v_acc3 += v_pixels2 * v_weight2;
3231 : }
3232 :
3233 35560186 : dfRes1 = v_acc1.GetHorizSum();
3234 35560186 : dfRes2 = v_acc2.GetHorizSum();
3235 35560186 : dfRes3 = v_acc3.GetHorizSum();
3236 47825952 : for (; i < nSrcPixelCount; ++i)
3237 : {
3238 12265766 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3239 12265766 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3240 12265766 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3241 : }
3242 35560186 : }
3243 :
3244 : /************************************************************************/
3245 : /* GDALResampleConvolutionHorizontal_3rows<GByte> */
3246 : /************************************************************************/
3247 :
3248 : template <>
3249 35560100 : inline void GDALResampleConvolutionHorizontal_3rows<GByte, false>(
3250 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3251 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3252 : double &dfRes2, double &dfRes3)
3253 : {
3254 35560100 : GDALResampleConvolutionHorizontal_3rows_SSE2(
3255 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3256 : dfRes1, dfRes2, dfRes3);
3257 35560100 : }
3258 :
3259 : template <>
3260 86 : inline void GDALResampleConvolutionHorizontal_3rows<GUInt16, false>(
3261 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3262 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3263 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3264 : {
3265 86 : GDALResampleConvolutionHorizontal_3rows_SSE2(
3266 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3267 : dfRes1, dfRes2, dfRes3);
3268 86 : }
3269 :
3270 : /************************************************************************/
3271 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T> */
3272 : /************************************************************************/
3273 :
3274 : template <class T>
3275 7840250 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3276 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3277 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3278 : double &dfRes2, double &dfRes3)
3279 : {
3280 7840250 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3281 7840250 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3282 7840250 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
3283 7840250 : int i = 0; // Use after for.
3284 19104350 : for (; i < nSrcPixelCount - 3; i += 4)
3285 : {
3286 : // Retrieve the pixel & accumulate.
3287 11264100 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3288 11264100 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3289 11264100 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3290 11264100 : const XMMReg4Double v_weight =
3291 11264100 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3292 :
3293 11264100 : v_acc1 += v_pixels1 * v_weight;
3294 11264100 : v_acc2 += v_pixels2 * v_weight;
3295 11264100 : v_acc3 += v_pixels3 * v_weight;
3296 : }
3297 :
3298 7840250 : dfRes1 = v_acc1.GetHorizSum();
3299 7840250 : dfRes2 = v_acc2.GetHorizSum();
3300 7840250 : dfRes3 = v_acc3.GetHorizSum();
3301 :
3302 12290222 : for (; i < nSrcPixelCount; ++i)
3303 : {
3304 4449942 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3305 4449942 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3306 4449942 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3307 : }
3308 7840250 : }
3309 :
3310 : /************************************************************************/
3311 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte> */
3312 : /************************************************************************/
3313 :
3314 : template <>
3315 : inline void
3316 7773100 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte, false>(
3317 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3318 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3319 : double &dfRes2, double &dfRes3)
3320 : {
3321 7773100 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3322 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3323 : dfRes1, dfRes2, dfRes3);
3324 7773100 : }
3325 :
3326 : template <>
3327 : inline void
3328 67150 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16, false>(
3329 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3330 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3331 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3332 : {
3333 67150 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3334 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3335 : dfRes1, dfRes2, dfRes3);
3336 67150 : }
3337 :
3338 : /************************************************************************/
3339 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T> */
3340 : /************************************************************************/
3341 :
3342 : template <class T>
3343 13996740 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3344 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3345 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3346 : double &dfRes3)
3347 : {
3348 13996740 : const XMMReg4Double v_weight =
3349 : XMMReg4Double::Load4ValAligned(padfWeightsAligned);
3350 :
3351 : // Retrieve the pixel & accumulate.
3352 13996740 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
3353 13996740 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
3354 13996740 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
3355 :
3356 13996740 : XMMReg4Double v_acc1 = v_pixels1 * v_weight;
3357 13996740 : XMMReg4Double v_acc2 = v_pixels2 * v_weight;
3358 13996740 : XMMReg4Double v_acc3 = v_pixels3 * v_weight;
3359 :
3360 13996740 : dfRes1 = v_acc1.GetHorizSum();
3361 13996740 : dfRes2 = v_acc2.GetHorizSum();
3362 13996740 : dfRes3 = v_acc3.GetHorizSum();
3363 13996740 : }
3364 :
3365 : /************************************************************************/
3366 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte> */
3367 : /************************************************************************/
3368 :
3369 : template <>
3370 8284020 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte, false>(
3371 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3372 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3373 : double &dfRes3)
3374 : {
3375 8284020 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3376 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3377 : dfRes3);
3378 8284020 : }
3379 :
// Explicit specialization for GUInt16 input with the second template argument
// set to false (callers in this file pass true only for floating-point
// chunks known to contain NaN).
// It forwards all arguments unchanged to
// GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(), which fills
// dfRes1, dfRes2 and dfRes3, one result per input row.
// Located in the section closed by the "#endif // USE_SSE2" that follows.
3380 : template <>
3381 5712720 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16, false>(
3382 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3383 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
3384 : double &dfRes2, double &dfRes3)
3385 : {
3386 5712720 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3387 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3388 : dfRes3);
3389 5712720 : }
3390 :
3391 : #endif // USE_SSE2
3392 :
3393 : /************************************************************************/
3394 : /* GDALResampleChunk_Convolution() */
3395 : /************************************************************************/
3396 :
// Resamples one chunk of a single band (nBands is fixed to 1) with a
// separable convolution kernel, in two passes:
//   1. horizontal pass: every source line of the chunk is filtered into
//      padfHorizontalFiltered, a nChunkYSize x nDstXSize array of doubles
//      (plus a parallel validity mask when a nodata mask is supplied);
//   2. vertical pass: each destination line is computed from that array,
//      passed through nodata replacement and the optional fMaxVal clamp,
//      then written to pDstBuffer.
//
// Template parameters:
//   T                          type of the source samples in pChunk.
//   Twork                      type in which destination values are computed.
//   eWrkDataType               GDAL data type passed to GDALCopyWords64() as
//                              the type of the Twork scanline.
//   bKernelWithNegativeWeights enables the "longest run of valid samples"
//                              test on the masked path and the handling of
//                              NaN results caused by infinite inputs.
//   bNeedRescale               for float/double T, weights are divided by 2
//                              and results multiplied back by 2 so that
//                              intermediate sums do not overflow.
//
// Parameters:
//   args                 chunk/destination window geometry, ratios, nodata
//                        settings and optional nodata mask.
//   pChunk               nChunkXSize * nChunkYSize source samples.
//   pDstBuffer           receives (nDstYOff2 - nDstYOff) lines of
//                        (nDstXOff2 - nDstXOff) values of args.eOvrDataType.
//   pfnFilterFunc        evaluates the kernel for one abscissa.
//   pfnFilterFunc4Values called on four abscissas stored in padfWeights; its
//                        return value is accumulated as their weight sum.
//                        Presumably it replaces the abscissas in place by the
//                        weights (they are used as weights afterwards) --
//                        confirm against its definition.
//   nKernelRadius        kernel radius at scale 1.
//   fMaxVal              when non-zero, upper bound applied to every output.
//
// Returns CE_None on success, CE_Failure if a work buffer cannot be allocated.
3397 : template <class T, class Twork, GDALDataType eWrkDataType,
3398 : bool bKernelWithNegativeWeights, bool bNeedRescale>
3399 5148 : static CPLErr GDALResampleChunk_ConvolutionT(
3400 : const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
3401 : FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
3402 : int nKernelRadius, float fMaxVal)
3403 :
3404 : {
3405 5148 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
3406 5148 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
3407 5148 : const double dfSrcXDelta = args.dfSrcXDelta;
3408 5148 : const double dfSrcYDelta = args.dfSrcYDelta;
3409 5148 : constexpr int nBands = 1;
3410 5148 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
3411 5148 : const int nChunkXOff = args.nChunkXOff;
3412 5148 : const int nChunkXSize = args.nChunkXSize;
3413 5148 : const int nChunkYOff = args.nChunkYOff;
3414 5148 : const int nChunkYSize = args.nChunkYSize;
3415 5148 : const int nDstXOff = args.nDstXOff;
3416 5148 : const int nDstXOff2 = args.nDstXOff2;
3417 5148 : const int nDstYOff = args.nDstYOff;
3418 5148 : const int nDstYOff2 = args.nDstYOff2;
3419 5148 : const bool bHasNoData = args.bHasNoData;
3420 5148 : double dfNoDataValue = args.dfNoDataValue;
3421 :
// Without a nodata value, 0 is what gets written for pixels that cannot be
// computed on the masked path below.
3422 5148 : if (!bHasNoData)
3423 5049 : dfNoDataValue = 0.0;
3424 5148 : const auto dstDataType = args.eOvrDataType;
3425 5148 : const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
// Value substituted for a computed result that would otherwise be equal to
// the nodata value.
3426 5148 : const double dfReplacementVal =
3427 99 : bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
3428 : : dfNoDataValue;
3429 : // cppcheck-suppress unreadVariable
3430 5148 : const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
3431 5148 : const bool bNoDataValueInt64Valid =
3432 5148 : isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
3433 5148 : const auto nNodataValueInt64 =
3434 : bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
3435 5148 : constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
3436 :
3437 : // TODO: we should have some generic function to do this.
// Valid range of the destination type, expressed in the working type. Types
// not listed below keep the full range of Twork.
3438 5148 : Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
3439 5148 : Twork fDstMax = cpl::NumericLimits<Twork>::max();
3440 5148 : if (dstDataType == GDT_UInt8)
3441 : {
3442 4218 : fDstMin = std::numeric_limits<GByte>::min();
3443 4218 : fDstMax = std::numeric_limits<GByte>::max();
3444 : }
3445 930 : else if (dstDataType == GDT_Int8)
3446 : {
3447 1 : fDstMin = std::numeric_limits<GInt8>::min();
3448 1 : fDstMax = std::numeric_limits<GInt8>::max();
3449 : }
3450 929 : else if (dstDataType == GDT_UInt16)
3451 : {
3452 402 : fDstMin = std::numeric_limits<GUInt16>::min();
3453 402 : fDstMax = std::numeric_limits<GUInt16>::max();
3454 : }
3455 527 : else if (dstDataType == GDT_Int16)
3456 : {
3457 292 : fDstMin = std::numeric_limits<GInt16>::min();
3458 292 : fDstMax = std::numeric_limits<GInt16>::max();
3459 : }
3460 235 : else if (dstDataType == GDT_UInt32)
3461 : {
3462 1 : fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
3463 1 : fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
3464 : }
3465 234 : else if (dstDataType == GDT_Int32)
3466 : {
3467 : // cppcheck-suppress unreadVariable
3468 6 : fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
3469 : // cppcheck-suppress unreadVariable
3470 6 : fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
3471 : }
3472 228 : else if (dstDataType == GDT_UInt64)
3473 : {
3474 : // cppcheck-suppress unreadVariable
3475 1 : fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
3476 : // cppcheck-suppress unreadVariable
3477 : // (1 << 64) - 2048: largest uint64 value a double can hold
3478 1 : fDstMax = static_cast<Twork>(18446744073709549568ULL);
3479 : }
3480 227 : else if (dstDataType == GDT_Int64)
3481 : {
3482 : // cppcheck-suppress unreadVariable
3483 1 : fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
3484 : // cppcheck-suppress unreadVariable
3485 : // (1 << 63) - 1024: largest int64 that a double can hold
3486 1 : fDstMax = static_cast<Twork>(9223372036854774784LL);
3487 : }
3488 :
// The chunk is scanned for NaN only when a nodata mask is supplied, so
// bHasNaN is always false on the mask-less paths below.
// NOTE(review): the mask-less path nevertheless tests bHasNaN to pick the
// <T, true> kernels; confirm whether restricting the scan to the masked case
// is intended.
3489 5148 : bool bHasNaN = false;
3490 490 : if (pabyChunkNodataMask)
3491 : {
3492 : if constexpr (std::is_floating_point_v<T>)
3493 : {
3494 120140 : for (size_t i = 0;
3495 120140 : i < static_cast<size_t>(nChunkXSize) * nChunkYSize; ++i)
3496 : {
3497 120122 : if (std::isnan(pChunk[i]))
3498 : {
3499 24 : bHasNaN = true;
3500 24 : break;
3501 : }
3502 : }
3503 : }
3504 : }
3505 :
// Post-processing applied to every computed value. Without nodata, the value
// is returned untouched. Otherwise it is clamped to [fDstMin, fDstMax] and,
// if the clamped value (rounded for integer outputs) equals the nodata value,
// dfReplacementVal is returned instead.
3506 36948368 : auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
3507 : bNoDataValueInt64Valid, nNodataValueInt64,
3508 : dfNoDataValue, dfReplacementVal](Twork fVal)
3509 : {
3510 15839600 : if (!bHasNoData)
3511 11618500 : return fVal;
3512 :
3513 : // Clamp value before comparing to nodata: this is only needed for
3514 : // kernels with negative weights (Lanczos)
3515 4221160 : Twork fClamped = fVal;
3516 4221160 : if (fClamped < fDstMin)
3517 15998 : fClamped = fDstMin;
3518 4205160 : else if (fClamped > fDstMax)
3519 16406 : fClamped = fDstMax;
3520 4221160 : if (isIntegerDT)
3521 : {
3522 4220480 : if (bNoDataValueInt64Valid)
3523 : {
// The range test guards the conversion of the rounded value to GInt64.
3524 4220470 : const double fClampedRounded = double(std::round(fClamped));
3525 8440960 : if (fClampedRounded >=
3526 : static_cast<double>(static_cast<Twork>(
3527 8440960 : std::numeric_limits<int64_t>::min())) &&
3528 : fClampedRounded <= static_cast<double>(static_cast<Twork>(
3529 8440960 : 9223372036854774784LL)) &&
3530 4220470 : nNodataValueInt64 ==
3531 4220480 : static_cast<GInt64>(std::round(fClamped)))
3532 : {
3533 : // Do not use the nodata value
3534 14435 : return static_cast<Twork>(dfReplacementVal);
3535 : }
3536 : }
3537 : }
3538 679 : else if (dfNoDataValue == static_cast<double>(fClamped))
3539 : {
3540 : // Do not use the nodata value
3541 1 : return static_cast<Twork>(dfReplacementVal);
3542 : }
3543 4206720 : return fClamped;
3544 : };
3545 :
3546 : /* -------------------------------------------------------------------- */
3547 : /* Allocate work buffers. */
3548 : /* -------------------------------------------------------------------- */
3549 5148 : const int nDstXSize = nDstXOff2 - nDstXOff;
// A separate Twork scanline is only needed when the output type differs from
// the working type; otherwise results go straight into pDstBuffer.
3550 5148 : Twork *pafWrkScanline = nullptr;
3551 5148 : if (dstDataType != eWrkDataType)
3552 : {
3553 : pafWrkScanline =
3554 4936 : static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
3555 4936 : if (pafWrkScanline == nullptr)
3556 0 : return CE_Failure;
3557 : }
3558 :
// When downsampling (scale < 1) the kernel abscissas are multiplied by the
// scale and the kernel support is widened by the inverse factor; when
// upsampling the kernel is used at its nominal size.
3559 5148 : const double dfXScale = 1.0 / dfXRatioDstToSrc;
3560 5148 : const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
3561 5148 : const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
3562 5148 : const double dfYScale = 1.0 / dfYRatioDstToSrc;
3563 5148 : const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
3564 5148 : const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
3565 :
3566 : // Temporary array to store result of horizontal filter.
3567 : double *const padfHorizontalFiltered = static_cast<double *>(
3568 5148 : VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
3569 :
3570 : // To store convolution coefficients.
// Shared by both passes, hence sized from the larger of the two radii.
3571 : double *const padfWeights =
3572 5148 : static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
3573 : static_cast<int>(
3574 : 2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) + 0.5) *
3575 : sizeof(double)));
3576 :
// Validity flag (0/1) of each horizontally filtered value; only allocated
// when the input comes with a nodata mask.
3577 5148 : GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
3578 5148 : if (pabyChunkNodataMask)
3579 : pabyChunkNodataMaskHorizontalFiltered =
3580 462 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
3581 5148 : if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
3582 462 : (pabyChunkNodataMask != nullptr &&
3583 : pabyChunkNodataMaskHorizontalFiltered == nullptr))
3584 : {
3585 0 : VSIFree(pafWrkScanline);
3586 0 : VSIFree(padfHorizontalFiltered);
3587 0 : VSIFreeAligned(padfWeights);
3588 0 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3589 0 : return CE_Failure;
3590 : }
3591 :
3592 : /* ==================================================================== */
3593 : /* First pass: horizontal filter */
3594 : /* ==================================================================== */
3595 5148 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
3596 : #ifdef USE_SSE2
3597 5148 : const bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
3598 : #endif
3599 3046832 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
3600 : {
// Centre of the destination pixel in source coordinates, and the range of
// source pixels under the kernel, clipped to the chunk.
3601 3041688 : const double dfSrcPixel =
3602 3041688 : (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
3603 3041688 : int nSrcPixelStart =
3604 3041688 : static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
3605 3041688 : if (nSrcPixelStart < nChunkXOff)
3606 57361 : nSrcPixelStart = nChunkXOff;
3607 3041688 : int nSrcPixelStop =
3608 3041688 : static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
3609 3041688 : if (nSrcPixelStop > nChunkRightXOff)
3610 57376 : nSrcPixelStop = nChunkRightXOff;
3611 : #if 0
3612 : if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
3613 : {
3614 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3615 : }
3616 : if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
3617 : {
3618 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3619 : }
3620 : #endif
3621 3041688 : const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
3622 3041688 : double dfWeightSum = 0.0;
3623 :
3624 : // Compute convolution coefficients.
// Abscissas are handled four at a time through pfnFilterFunc4Values, the
// remaining ones individually through pfnFilterFunc.
3625 3041688 : int nSrcPixel = nSrcPixelStart;
3626 3041688 : double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
3627 4436866 : for (; nSrcPixel < nSrcPixelStop - 3; nSrcPixel += 4)
3628 : {
3629 1395184 : padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
3630 1395184 : dfX += dfXScaleWeight;
3631 1395184 : padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
3632 1395184 : dfX += dfXScaleWeight;
3633 1395184 : padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
3634 1395184 : dfX += dfXScaleWeight;
3635 1395184 : padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
3636 1395184 : dfX += dfXScaleWeight;
3637 1395184 : dfWeightSum +=
3638 1395184 : pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
3639 : }
3640 7032688 : for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
3641 : {
3642 3991000 : const double dfWeight = pfnFilterFunc(dfX);
3643 3991000 : padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3644 3991000 : dfWeightSum += dfWeight;
3645 : }
3646 :
3647 3041688 : const int nHeight = nChunkYSize * nBands;
3648 3041688 : if (pabyChunkNodataMask == nullptr)
3649 : {
3650 : // For floating-point data types, values must be scaled down a bit
3651 : // when inputs are close to +/- std::numeric_limits<T>::max()
3652 : #ifdef OLD_CPPCHECK
3653 : constexpr double mulFactor = 1;
3654 : #else
3655 2958653 : constexpr double mulFactor =
3656 : (bNeedRescale &&
3657 : (std::is_same_v<T, float> || std::is_same_v<T, double>))
3658 : ? 2
3659 : : 1;
3660 : #endif
3661 :
// Normalize the weights once (no mask: they are the same for every line).
// The extra 1 / mulFactor is undone by ScaleValue below.
3662 2958653 : if (dfWeightSum != 0)
3663 : {
3664 2958653 : const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
3665 11921984 : for (int i = 0; i < nSrcPixelCount; ++i)
3666 : {
3667 8963341 : padfWeights[i] *= dfInvWeightSum;
3668 : }
3669 : }
3670 :
// Undoes the mulFactor scaling for floating-point T. A NaN result obtained
// with a negative-weight kernel from inputs that are all infinities of the
// same sign is replaced by that infinity.
3671 179403230 : const auto ScaleValue = [
3672 : #ifdef _MSC_VER
3673 : mulFactor
3674 : #endif
3675 : ](double dfVal, [[maybe_unused]] const T *inputValues,
3676 : [[maybe_unused]] int nInputValues)
3677 : {
3678 179403000 : constexpr bool isFloat =
3679 : std::is_same_v<T, float> || std::is_same_v<T, double>;
3680 : if constexpr (isFloat)
3681 : {
3682 4070140 : if (std::isfinite(dfVal))
3683 : {
3684 : return std::clamp(dfVal,
3685 12204800 : -std::numeric_limits<double>::max() /
3686 : mulFactor,
3687 4068260 : std::numeric_limits<double>::max() /
3688 4068260 : mulFactor) *
3689 4068260 : mulFactor;
3690 : }
3691 : else if constexpr (bKernelWithNegativeWeights)
3692 : {
3693 936 : if (std::isnan(dfVal))
3694 : {
3695 : // Either one of the input values is NaN or they are +/-Inf
3696 936 : const bool isPositive = inputValues[0] >= 0;
3697 6008 : for (int i = 0; i < nInputValues; ++i)
3698 : {
3699 5384 : if (std::isnan(inputValues[i]))
3700 312 : return dfVal;
3701 : // cppcheck-suppress knownConditionTrueFalse
3702 5072 : if ((inputValues[i] >= 0) != isPositive)
3703 0 : return dfVal;
3704 : }
3705 : // All values are positive or negative infinity
3706 624 : return static_cast<double>(inputValues[0]);
3707 : }
3708 : }
3709 : }
3710 175334000 : return dfVal;
3711 : };
3712 :
// Source lines are processed three at a time with the kernel variant suited
// to the pixel count; leftover lines are handled one by one afterwards.
3713 2958653 : int iSrcLineOff = 0;
3714 : #ifdef USE_SSE2
3715 2958653 : if (nSrcPixelCount == 4)
3716 : {
3717 15867269 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3718 : {
3719 15253428 : const size_t j =
3720 15253428 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3721 15253428 : (nSrcPixelStart - nChunkXOff);
3722 15253428 : double dfVal1 = 0.0;
3723 15253428 : double dfVal2 = 0.0;
3724 15253428 : double dfVal3 = 0.0;
3725 : if constexpr (std::is_floating_point_v<T>)
3726 : {
3727 1256690 : if (bHasNaN)
3728 : {
3729 : GDALResampleConvolutionHorizontalPixelCount4_3rows<
3730 0 : T, true>(pChunk + j, pChunk + j + nChunkXSize,
3731 0 : pChunk + j + 2 * nChunkXSize,
3732 : padfWeights, dfVal1, dfVal2, dfVal3);
3733 : }
3734 : else
3735 : {
3736 : GDALResampleConvolutionHorizontalPixelCount4_3rows<
3737 1256690 : T, false>(pChunk + j, pChunk + j + nChunkXSize,
3738 1256690 : pChunk + j + 2 * nChunkXSize,
3739 : padfWeights, dfVal1, dfVal2, dfVal3);
3740 : }
3741 : }
3742 : else
3743 : {
3744 : GDALResampleConvolutionHorizontalPixelCount4_3rows<
3745 13996738 : T, false>(pChunk + j, pChunk + j + nChunkXSize,
3746 13996738 : pChunk + j + 2 * nChunkXSize, padfWeights,
3747 : dfVal1, dfVal2, dfVal3);
3748 : }
3749 30506830 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3750 15253428 : nDstXSize +
3751 15253428 : iDstPixel - nDstXOff] =
3752 15253428 : ScaleValue(dfVal1, pChunk + j, 4);
3753 30506830 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3754 15253428 : 1) *
3755 15253428 : nDstXSize +
3756 15253428 : iDstPixel - nDstXOff] =
3757 15253428 : ScaleValue(dfVal2, pChunk + j + nChunkXSize, 4);
3758 15253837 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3759 15253428 : 2) *
3760 15253428 : nDstXSize +
3761 15253428 : iDstPixel - nDstXOff] =
3762 15253428 : ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize, 4);
3763 : }
3764 : }
3765 2344804 : else if (bSrcPixelCountLess8)
3766 : {
3767 9927838 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3768 : {
3769 7859228 : const size_t j =
3770 7859228 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3771 7859228 : (nSrcPixelStart - nChunkXOff);
3772 7859228 : double dfVal1 = 0.0;
3773 7859228 : double dfVal2 = 0.0;
3774 7859228 : double dfVal3 = 0.0;
3775 : if constexpr (std::is_floating_point_v<T>)
3776 : {
3777 18980 : if (bHasNaN)
3778 : {
3779 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows<
3780 0 : T, true>(pChunk + j, pChunk + j + nChunkXSize,
3781 0 : pChunk + j + 2 * nChunkXSize,
3782 : padfWeights, nSrcPixelCount, dfVal1,
3783 : dfVal2, dfVal3);
3784 : }
3785 : else
3786 : {
3787 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows<
3788 18980 : T, false>(pChunk + j, pChunk + j + nChunkXSize,
3789 18980 : pChunk + j + 2 * nChunkXSize,
3790 : padfWeights, nSrcPixelCount, dfVal1,
3791 : dfVal2, dfVal3);
3792 : }
3793 : }
3794 : else
3795 : {
3796 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows<
3797 7840248 : T, false>(pChunk + j, pChunk + j + nChunkXSize,
3798 7840248 : pChunk + j + 2 * nChunkXSize, padfWeights,
3799 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3800 : }
3801 15718416 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3802 7859228 : nDstXSize +
3803 7859228 : iDstPixel - nDstXOff] =
3804 7859228 : ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
3805 15718416 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3806 7859228 : 1) *
3807 7859228 : nDstXSize +
3808 7859228 : iDstPixel - nDstXOff] =
3809 7859228 : ScaleValue(dfVal2, pChunk + j + nChunkXSize,
3810 : nSrcPixelCount);
3811 7859316 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3812 7859228 : 2) *
3813 7859228 : nDstXSize +
3814 7859228 : iDstPixel - nDstXOff] =
3815 7859228 : ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
3816 : nSrcPixelCount);
3817 : }
3818 : }
3819 : else
3820 : #endif
3821 : {
3822 35902058 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3823 : {
3824 35625944 : const size_t j =
3825 35625944 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3826 35625944 : (nSrcPixelStart - nChunkXOff);
3827 35625944 : double dfVal1 = 0.0;
3828 35625944 : double dfVal2 = 0.0;
3829 35625944 : double dfVal3 = 0.0;
3830 : if constexpr (std::is_floating_point_v<T>)
3831 : {
3832 65696 : if (bHasNaN)
3833 : {
3834 0 : GDALResampleConvolutionHorizontal_3rows<T, true>(
3835 0 : pChunk + j, pChunk + j + nChunkXSize,
3836 0 : pChunk + j + 2 * nChunkXSize, padfWeights,
3837 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3838 : }
3839 : else
3840 : {
3841 65696 : GDALResampleConvolutionHorizontal_3rows<T, false>(
3842 65696 : pChunk + j, pChunk + j + nChunkXSize,
3843 65696 : pChunk + j + 2 * nChunkXSize, padfWeights,
3844 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3845 : }
3846 : }
3847 : else
3848 : {
3849 35560248 : GDALResampleConvolutionHorizontal_3rows<T, false>(
3850 35560248 : pChunk + j, pChunk + j + nChunkXSize,
3851 35560248 : pChunk + j + 2 * nChunkXSize, padfWeights,
3852 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3853 : }
3854 71251798 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3855 35625944 : nDstXSize +
3856 35625944 : iDstPixel - nDstXOff] =
3857 35625944 : ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
3858 71251798 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3859 35625944 : 1) *
3860 35625944 : nDstXSize +
3861 35625944 : iDstPixel - nDstXOff] =
3862 35625944 : ScaleValue(dfVal2, pChunk + j + nChunkXSize,
3863 : nSrcPixelCount);
3864 35691048 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3865 35625944 : 2) *
3866 35625944 : nDstXSize +
3867 35625944 : iDstPixel - nDstXOff] =
3868 35625944 : ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
3869 : nSrcPixelCount);
3870 : }
3871 : }
// Remaining lines (fewer than three) are processed one at a time.
3872 6146150 : for (; iSrcLineOff < nHeight; ++iSrcLineOff)
3873 : {
3874 3187493 : const size_t j =
3875 3187493 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3876 3187493 : (nSrcPixelStart - nChunkXOff);
3877 3736653 : const double dfVal = GDALResampleConvolutionHorizontal(
3878 595200 : pChunk + j, padfWeights, nSrcPixelCount);
3879 3187942 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3880 3187493 : nDstXSize +
3881 3187493 : iDstPixel - nDstXOff] =
3882 3187493 : ScaleValue(dfVal, pChunk + j, nSrcPixelCount);
3883 : }
3884 : }
3885 : else
3886 : {
// Masked path: weights cannot be normalized once for all lines, because
// the set of valid source pixels differs from line to line.
3887 19189371 : for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
3888 : {
3889 19106322 : const size_t j =
3890 19106322 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3891 19106322 : (nSrcPixelStart - nChunkXOff);
3892 :
// With negative weights, the result is marked invalid when the longest run
// of consecutive valid source pixels is shorter than half the pixel count.
3893 : if (bKernelWithNegativeWeights)
3894 : {
3895 18580308 : int nConsecutiveValid = 0;
3896 18580308 : int nMaxConsecutiveValid = 0;
3897 170151146 : for (int k = 0; k < nSrcPixelCount; k++)
3898 : {
3899 151569938 : if (pabyChunkNodataMask[j + k])
3900 43681801 : nConsecutiveValid++;
3901 107888837 : else if (nConsecutiveValid)
3902 : {
3903 107830 : nMaxConsecutiveValid = std::max(
3904 107830 : nMaxConsecutiveValid, nConsecutiveValid);
3905 107830 : nConsecutiveValid = 0;
3906 : }
3907 : }
3908 18580308 : nMaxConsecutiveValid =
3909 18580308 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
3910 18580308 : if (nMaxConsecutiveValid < nSrcPixelCount / 2)
3911 : {
3912 12651307 : const size_t nTempOffset =
3913 12651307 : static_cast<size_t>(iSrcLineOff) * nDstXSize +
3914 12651307 : iDstPixel - nDstXOff;
3915 12651307 : padfHorizontalFiltered[nTempOffset] = 0.0;
3916 12651307 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3917 12651307 : continue;
3918 : }
3919 : }
3920 :
// dfWeightSum is passed by reference and used right after the call as the
// normalization factor for this line.
3921 6455025 : double dfVal = 0.0;
3922 : if constexpr (std::is_floating_point_v<T>)
3923 : {
3924 46368 : if (bHasNaN)
3925 : {
3926 1792 : GDALResampleConvolutionHorizontalWithMask<T, true>(
3927 1792 : pChunk + j, pabyChunkNodataMask + j, padfWeights,
3928 : nSrcPixelCount, dfVal, dfWeightSum);
3929 : }
3930 : else
3931 : {
3932 44576 : GDALResampleConvolutionHorizontalWithMask<T, false>(
3933 44576 : pChunk + j, pabyChunkNodataMask + j, padfWeights,
3934 : nSrcPixelCount, dfVal, dfWeightSum);
3935 : }
3936 : }
3937 : else
3938 : {
3939 6408657 : GDALResampleConvolutionHorizontalWithMask<T, false>(
3940 63 : pChunk + j, pabyChunkNodataMask + j, padfWeights,
3941 : nSrcPixelCount, dfVal, dfWeightSum);
3942 : }
3943 6455025 : const size_t nTempOffset =
3944 6455025 : static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
3945 6455025 : nDstXOff;
3946 6455025 : if (dfWeightSum > 0.0)
3947 : {
3948 6410360 : padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
3949 6410360 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
3950 : }
3951 : else
3952 : {
3953 44663 : padfHorizontalFiltered[nTempOffset] = 0.0;
3954 44663 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3955 : }
3956 : }
3957 : }
3958 : }
3959 :
3960 : /* ==================================================================== */
3961 : /* Second pass: vertical filter */
3962 : /* ==================================================================== */
3963 5148 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3964 :
3965 396762 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3966 : {
// Write into the temporary scanline when a type conversion is needed,
// otherwise directly into the matching line of pDstBuffer.
3967 391614 : Twork *const pafDstScanline =
3968 : pafWrkScanline
3969 391614 : ? pafWrkScanline
3970 14028 : : static_cast<Twork *>(pDstBuffer) +
3971 14028 : static_cast<size_t>(iDstLine - nDstYOff) * nDstXSize;
3972 :
// Centre of the destination line in source coordinates, and the range of
// source lines under the kernel, clipped to the chunk.
3973 391614 : const double dfSrcLine =
3974 391614 : (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
3975 391614 : int nSrcLineStart =
3976 391614 : static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
3977 391614 : int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
3978 391614 : if (nSrcLineStart < nChunkYOff)
3979 3486 : nSrcLineStart = nChunkYOff;
3980 391614 : if (nSrcLineStop > nChunkBottomYOff)
3981 3530 : nSrcLineStop = nChunkBottomYOff;
3982 : #if 0
3983 : if( nSrcLineStart < nChunkYOff &&
3984 : nChunkYOff > 0 )
3985 : {
3986 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3987 : }
3988 : if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3989 : {
3990 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3991 : }
3992 : #endif
3993 391614 : const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
3994 391614 : double dfWeightSum = 0.0;
3995 :
3996 : // Compute convolution coefficients.
3997 391614 : int nSrcLine = nSrcLineStart; // Used after for.
3998 391614 : double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
3999 1004175 : for (; nSrcLine < nSrcLineStop - 3;
4000 612561 : nSrcLine += 4, dfY += 4 * dfYScaleWeight)
4001 : {
4002 612561 : padfWeights[nSrcLine - nSrcLineStart] = dfY;
4003 612561 : padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
4004 612561 : padfWeights[nSrcLine + 2 - nSrcLineStart] =
4005 612561 : dfY + 2 * dfYScaleWeight;
4006 612561 : padfWeights[nSrcLine + 3 - nSrcLineStart] =
4007 612561 : dfY + 3 * dfYScaleWeight;
4008 612561 : dfWeightSum +=
4009 612561 : pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
4010 : }
4011 429592 : for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
4012 : {
4013 37978 : const double dfWeight = pfnFilterFunc(dfY);
4014 37978 : padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
4015 37978 : dfWeightSum += dfWeight;
4016 : }
4017 :
4018 391614 : if (pabyChunkNodataMask == nullptr)
4019 : {
4020 : // For floating-point data types, values must be scaled down a bit
4021 : // when inputs are close to +/- std::numeric_limits<T>::max()
4022 : #ifdef OLD_CPPCHECK
4023 : constexpr double mulFactor = 1;
4024 : #else
4025 355578 : constexpr double mulFactor =
4026 : (bNeedRescale &&
4027 : (std::is_same_v<T, float> || std::is_same_v<T, double>))
4028 : ? 2
4029 : : 1;
4030 : #endif
4031 :
4032 355578 : if (dfWeightSum != 0)
4033 : {
4034 355578 : const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
4035 2594627 : for (int i = 0; i < nSrcLineCount; ++i)
4036 2239055 : padfWeights[i] *= dfInvWeightSum;
4037 : }
4038 :
4039 355578 : int iFilteredPixelOff = 0; // Used after for.
4040 : // j used after for.
4041 355578 : size_t j =
4042 355578 : (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
4043 : #ifdef USE_SSE2
4044 : if constexpr ((!bNeedRescale || !std::is_same_v<T, float>) &&
4045 : eWrkDataType == GDT_Float32)
4046 : {
4047 : #ifdef __AVX__
4048 : for (; iFilteredPixelOff < nDstXSize - 15;
4049 : iFilteredPixelOff += 16, j += 16)
4050 : {
4051 : GDALResampleConvolutionVertical_16cols(
4052 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4053 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
4054 : if (bHasNoData)
4055 : {
4056 : for (int k = 0; k < 16; k++)
4057 : {
4058 : pafDstScanline[iFilteredPixelOff + k] =
4059 : replaceValIfNodata(
4060 : pafDstScanline[iFilteredPixelOff + k]);
4061 : }
4062 : }
4063 : }
4064 : #else
4065 26036009 : for (; iFilteredPixelOff < nDstXSize - 7;
4066 : iFilteredPixelOff += 8, j += 8)
4067 : {
4068 25689208 : GDALResampleConvolutionVertical_8cols(
4069 25689208 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4070 25689208 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
4071 25689208 : if (bHasNoData)
4072 : {
4073 123192 : for (int k = 0; k < 8; k++)
4074 : {
4075 109504 : pafDstScanline[iFilteredPixelOff + k] =
4076 109504 : replaceValIfNodata(
4077 109504 : pafDstScanline[iFilteredPixelOff + k]);
4078 : }
4079 : }
4080 : }
4081 : #endif
4082 :
4083 816719 : for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
4084 : {
4085 469960 : const Twork fVal =
4086 469960 : static_cast<Twork>(GDALResampleConvolutionVertical(
4087 469960 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4088 : nSrcLineCount));
4089 469960 : pafDstScanline[iFilteredPixelOff] =
4090 469960 : replaceValIfNodata(fVal);
4091 : }
4092 : }
4093 : else
4094 : #endif
4095 : {
// Vertical counterpart of the horizontal ScaleValue: inputValues points to
// the first contributing value of one column, and the following ones are
// nStride doubles apart.
4096 5862642 : const auto ScaleValue = [
4097 : #ifdef _MSC_VER
4098 : mulFactor
4099 : #endif
4100 : ](double dfVal, [[maybe_unused]] const double *inputValues,
4101 : [[maybe_unused]] int nStride,
4102 : [[maybe_unused]] int nInputValues)
4103 : {
4104 5862640 : constexpr bool isFloat =
4105 : std::is_same_v<T, float> || std::is_same_v<T, double>;
4106 : if constexpr (isFloat)
4107 : {
4108 5862640 : if (std::isfinite(dfVal))
4109 : {
4110 : return std::clamp(
4111 : dfVal,
4112 : static_cast<double>(
4113 17585400 : -std::numeric_limits<Twork>::max()) /
4114 : mulFactor,
4115 : static_cast<double>(
4116 5861800 : std::numeric_limits<Twork>::max()) /
4117 5861800 : mulFactor) *
4118 5861800 : mulFactor;
4119 : }
4120 : else if constexpr (bKernelWithNegativeWeights)
4121 : {
4122 480 : if (std::isnan(dfVal))
4123 : {
4124 : // Either one of the input values is NaN or they are +/-Inf
4125 480 : const bool isPositive = inputValues[0] >= 0;
4126 2520 : for (int i = 0; i < nInputValues; ++i)
4127 : {
4128 2200 : if (std::isnan(inputValues[i * nStride]))
4129 160 : return dfVal;
// The sign test must use the same strided index as the NaN test above:
// inputValues[i] would read neighbouring columns of the first row.
4130 : // cppcheck-suppress knownConditionTrueFalse
4131 2040 : if ((inputValues[i * nStride] >= 0) != isPositive)
4132 0 : return dfVal;
4133 : }
4134 : // All values are positive or negative infinity
4135 320 : return inputValues[0];
4136 : }
4137 : }
4138 : }
4139 :
4140 360 : return dfVal;
4141 : };
4142 :
4143 2939422 : for (; iFilteredPixelOff < nDstXSize - 1;
4144 : iFilteredPixelOff += 2, j += 2)
4145 : {
4146 2930610 : double dfVal1 = 0.0;
4147 2930610 : double dfVal2 = 0.0;
4148 2930610 : GDALResampleConvolutionVertical_2cols(
4149 2930610 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4150 : nSrcLineCount, dfVal1, dfVal2);
4151 5861220 : pafDstScanline[iFilteredPixelOff] =
4152 2930610 : replaceValIfNodata(static_cast<Twork>(
4153 2930610 : ScaleValue(dfVal1, padfHorizontalFiltered + j,
4154 : nDstXSize, nSrcLineCount)));
4155 2930610 : pafDstScanline[iFilteredPixelOff + 1] =
4156 2930610 : replaceValIfNodata(static_cast<Twork>(
4157 2930610 : ScaleValue(dfVal2, padfHorizontalFiltered + j + 1,
4158 : nDstXSize, nSrcLineCount)));
4159 : }
4160 8819 : if (iFilteredPixelOff < nDstXSize)
4161 : {
4162 1427 : const double dfVal = GDALResampleConvolutionVertical(
4163 1427 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
4164 : nSrcLineCount);
4165 1427 : pafDstScanline[iFilteredPixelOff] =
4166 1427 : replaceValIfNodata(static_cast<Twork>(
4167 1427 : ScaleValue(dfVal, padfHorizontalFiltered + j,
4168 : nDstXSize, nSrcLineCount)));
4169 : }
4170 : }
4171 : }
4172 : else
4173 : {
// Masked path: each weight is multiplied by the 0/1 validity flag of the
// horizontally filtered value, and the sum is renormalized per pixel.
4174 18368135 : for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
4175 : ++iFilteredPixelOff)
4176 : {
4177 18332129 : double dfVal = 0.0;
4178 18332129 : dfWeightSum = 0.0;
4179 18332129 : size_t j = (nSrcLineStart - nChunkYOff) *
4180 18332129 : static_cast<size_t>(nDstXSize) +
4181 18332129 : iFilteredPixelOff;
4182 : if (bKernelWithNegativeWeights)
4183 : {
4184 18088237 : int nConsecutiveValid = 0;
4185 18088237 : int nMaxConsecutiveValid = 0;
4186 127259921 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
4187 : {
4188 109171284 : const double dfWeight =
4189 109171284 : padfWeights[i] *
4190 : pabyChunkNodataMaskHorizontalFiltered[j];
4191 109171284 : if (pabyChunkNodataMaskHorizontalFiltered[j])
4192 : {
4193 46111301 : nConsecutiveValid++;
4194 : }
4195 63060183 : else if (nConsecutiveValid)
4196 : {
4197 204376 : nMaxConsecutiveValid = std::max(
4198 204376 : nMaxConsecutiveValid, nConsecutiveValid);
4199 204376 : nConsecutiveValid = 0;
4200 : }
4201 109171284 : dfVal += padfHorizontalFiltered[j] * dfWeight;
4202 109171284 : dfWeightSum += dfWeight;
4203 : }
4204 18088237 : nMaxConsecutiveValid =
4205 18088237 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
// Same validity rule as in the horizontal pass: too short a run of valid
// inputs yields the nodata value.
4206 18088237 : if (nMaxConsecutiveValid < nSrcLineCount / 2)
4207 : {
4208 8918591 : pafDstScanline[iFilteredPixelOff] =
4209 8918499 : static_cast<Twork>(dfNoDataValue);
4210 8918591 : continue;
4211 : }
4212 : }
4213 : else
4214 : {
4215 1239606 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
4216 : {
4217 995712 : const double dfWeight =
4218 995712 : padfWeights[i] *
4219 : pabyChunkNodataMaskHorizontalFiltered[j];
4220 995712 : dfVal += padfHorizontalFiltered[j] * dfWeight;
4221 995712 : dfWeightSum += dfWeight;
4222 : }
4223 : }
4224 9413558 : if (dfWeightSum > 0.0)
4225 : {
4226 9397519 : pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
4227 9397171 : static_cast<Twork>(dfVal / dfWeightSum));
4228 : }
4229 : else
4230 : {
4231 16045 : pafDstScanline[iFilteredPixelOff] =
4232 16021 : static_cast<Twork>(dfNoDataValue);
4233 : }
4234 : }
4235 : }
4236 :
// Optional upper clamp; a zero fMaxVal means no clamping.
4237 391614 : if (fMaxVal != 0.0f)
4238 : {
4239 : if constexpr (std::is_same_v<T, double>)
4240 : {
4241 0 : for (int i = 0; i < nDstXSize; ++i)
4242 : {
4243 0 : if (pafDstScanline[i] > static_cast<double>(fMaxVal))
4244 0 : pafDstScanline[i] = static_cast<double>(fMaxVal);
4245 : }
4246 : }
4247 : else
4248 : {
4249 192324 : for (int i = 0; i < nDstXSize; ++i)
4250 : {
4251 192088 : if (pafDstScanline[i] > fMaxVal)
4252 96022 : pafDstScanline[i] = fMaxVal;
4253 : }
4254 : }
4255 : }
4256 :
// Convert the working scanline to the destination type at the byte offset
// of the current destination line.
4257 391614 : if (pafWrkScanline)
4258 : {
4259 377586 : GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
4260 : static_cast<GByte *>(pDstBuffer) +
4261 377586 : static_cast<size_t>(iDstLine - nDstYOff) *
4262 377586 : nDstXSize * nDstDataTypeSize,
4263 : dstDataType, nDstDataTypeSize, nDstXSize);
4264 : }
4265 : }
4266 :
4267 5148 : VSIFree(pafWrkScanline);
4268 5148 : VSIFreeAligned(padfWeights);
4269 5148 : VSIFree(padfHorizontalFiltered);
4270 5148 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
4271 :
4272 5148 : return CE_None;
4273 : }
4274 :
4275 : template <bool bKernelWithNegativeWeights, bool bNeedRescale>
4276 : static CPLErr
4277 5148 : GDALResampleChunk_ConvolutionInternal(const GDALOverviewResampleArgs &args,
4278 : const void *pChunk, void **ppDstBuffer,
4279 : GDALDataType *peDstBufferDataType)
4280 : {
4281 : GDALResampleAlg eResample;
4282 5148 : if (EQUAL(args.pszResampling, "BILINEAR"))
4283 2666 : eResample = GRA_Bilinear;
4284 2482 : else if (EQUAL(args.pszResampling, "CUBIC"))
4285 2300 : eResample = GRA_Cubic;
4286 182 : else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
4287 86 : eResample = GRA_CubicSpline;
4288 96 : else if (EQUAL(args.pszResampling, "LANCZOS"))
4289 96 : eResample = GRA_Lanczos;
4290 : else
4291 : {
4292 0 : CPLAssert(false);
4293 : return CE_Failure;
4294 : }
4295 5148 : const int nKernelRadius = GWKGetFilterRadius(eResample);
4296 5148 : FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
4297 : const FilterFunc4ValuesType pfnFilterFunc4Values =
4298 5148 : GWKGetFilterFunc4Values(eResample);
4299 :
4300 5148 : float fMaxVal = 0.f;
4301 : // Cubic, etc... can have overshoots, so make sure we clamp values to the
4302 : // maximum value if NBITS is set.
4303 5148 : if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
4304 8 : (args.eOvrDataType == GDT_UInt8 || args.eOvrDataType == GDT_UInt16 ||
4305 0 : args.eOvrDataType == GDT_UInt32))
4306 : {
4307 8 : int nBits = args.nOvrNBITS;
4308 8 : if (nBits == GDALGetDataTypeSizeBits(args.eOvrDataType))
4309 1 : nBits = 0;
4310 8 : if (nBits > 0 && nBits < 32)
4311 7 : fMaxVal = static_cast<float>((1U << nBits) - 1);
4312 : }
4313 :
4314 5148 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
4315 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
4316 : GDALGetDataTypeSizeBytes(args.eOvrDataType));
4317 5148 : if (*ppDstBuffer == nullptr)
4318 : {
4319 0 : return CE_Failure;
4320 : }
4321 5148 : *peDstBufferDataType = args.eOvrDataType;
4322 :
4323 5148 : switch (args.eWrkDataType)
4324 : {
4325 4256 : case GDT_UInt8:
4326 : {
4327 : return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32,
4328 : bKernelWithNegativeWeights,
4329 4256 : bNeedRescale>(
4330 : args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
4331 4256 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4332 : }
4333 :
4334 402 : case GDT_UInt16:
4335 : {
4336 : return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32,
4337 : bKernelWithNegativeWeights,
4338 402 : bNeedRescale>(
4339 : args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
4340 402 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4341 : }
4342 :
4343 387 : case GDT_Float32:
4344 : {
4345 : return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32,
4346 : bKernelWithNegativeWeights,
4347 387 : bNeedRescale>(
4348 : args, static_cast<const float *>(pChunk), *ppDstBuffer,
4349 387 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4350 : }
4351 :
4352 103 : case GDT_Float64:
4353 : {
4354 : return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64,
4355 : bKernelWithNegativeWeights,
4356 103 : bNeedRescale>(
4357 : args, static_cast<const double *>(pChunk), *ppDstBuffer,
4358 103 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4359 : }
4360 :
4361 0 : default:
4362 0 : break;
4363 : }
4364 :
4365 0 : CPLAssert(false);
4366 : return CE_Failure;
4367 : }
4368 :
4369 : static CPLErr
4370 5148 : GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
4371 : const void *pChunk, void **ppDstBuffer,
4372 : GDALDataType *peDstBufferDataType)
4373 : {
4374 5148 : if (EQUAL(args.pszResampling, "CUBIC") ||
4375 2848 : EQUAL(args.pszResampling, "LANCZOS"))
4376 : return GDALResampleChunk_ConvolutionInternal<
4377 2396 : /* bKernelWithNegativeWeights=*/true, /* bNeedRescale = */ true>(
4378 2396 : args, pChunk, ppDstBuffer, peDstBufferDataType);
4379 2752 : else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
4380 86 : return GDALResampleChunk_ConvolutionInternal<false, true>(
4381 86 : args, pChunk, ppDstBuffer, peDstBufferDataType);
4382 : else
4383 2666 : return GDALResampleChunk_ConvolutionInternal<false, false>(
4384 2666 : args, pChunk, ppDstBuffer, peDstBufferDataType);
4385 : }
4386 :
4387 : /************************************************************************/
4388 : /* GDALResampleChunkC32R() */
4389 : /************************************************************************/
4390 :
4391 2 : static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
4392 : const float *pafChunk, const int nChunkYOff,
4393 : const int nChunkYSize, const int nDstYOff,
4394 : const int nDstYOff2, const int nOvrXSize,
4395 : const int nOvrYSize, void **ppDstBuffer,
4396 : GDALDataType *peDstBufferDataType,
4397 : const char *pszResampling)
4398 :
4399 : {
4400 : enum Method
4401 : {
4402 : NEAR,
4403 : AVERAGE,
4404 : AVERAGE_MAGPHASE,
4405 : RMS,
4406 : };
4407 :
4408 2 : Method eMethod = NEAR;
4409 2 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
4410 : {
4411 0 : eMethod = NEAR;
4412 : }
4413 2 : else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
4414 : {
4415 0 : eMethod = AVERAGE_MAGPHASE;
4416 : }
4417 2 : else if (EQUAL(pszResampling, "RMS"))
4418 : {
4419 2 : eMethod = RMS;
4420 : }
4421 0 : else if (STARTS_WITH_CI(pszResampling, "AVER"))
4422 : {
4423 0 : eMethod = AVERAGE;
4424 : }
4425 : else
4426 : {
4427 0 : CPLError(
4428 : CE_Failure, CPLE_NotSupported,
4429 : "Resampling method %s is not supported for complex data types. "
4430 : "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
4431 : pszResampling);
4432 0 : return CE_Failure;
4433 : }
4434 :
4435 2 : const int nOXSize = nOvrXSize;
4436 2 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
4437 : GDALGetDataTypeSizeBytes(GDT_CFloat32));
4438 2 : if (*ppDstBuffer == nullptr)
4439 : {
4440 0 : return CE_Failure;
4441 : }
4442 2 : float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
4443 2 : *peDstBufferDataType = GDT_CFloat32;
4444 :
4445 2 : const int nOYSize = nOvrYSize;
4446 2 : const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
4447 2 : const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
4448 :
4449 : /* ==================================================================== */
4450 : /* Loop over destination scanlines. */
4451 : /* ==================================================================== */
4452 8 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
4453 : {
4454 6 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
4455 6 : if (nSrcYOff < nChunkYOff)
4456 0 : nSrcYOff = nChunkYOff;
4457 :
4458 6 : int nSrcYOff2 =
4459 6 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
4460 6 : if (nSrcYOff2 == nSrcYOff)
4461 0 : nSrcYOff2++;
4462 :
4463 6 : if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
4464 : {
4465 2 : if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
4466 0 : nSrcYOff = nSrcHeight - 1;
4467 2 : nSrcYOff2 = nSrcHeight;
4468 : }
4469 6 : if (nSrcYOff2 > nChunkYOff + nChunkYSize)
4470 0 : nSrcYOff2 = nChunkYOff + nChunkYSize;
4471 :
4472 6 : const float *const pafSrcScanline =
4473 6 : pafChunk +
4474 6 : (static_cast<size_t>(nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
4475 6 : float *const pafDstScanline =
4476 6 : pafDstBuffer +
4477 6 : static_cast<size_t>(iDstLine - nDstYOff) * 2 * nOXSize;
4478 :
4479 : /* --------------------------------------------------------------------
4480 : */
4481 : /* Loop over destination pixels */
4482 : /* --------------------------------------------------------------------
4483 : */
4484 18 : for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
4485 : {
4486 12 : const size_t iDstPixelSZ = static_cast<size_t>(iDstPixel);
4487 12 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
4488 12 : int nSrcXOff2 =
4489 12 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
4490 12 : if (nSrcXOff2 == nSrcXOff)
4491 0 : nSrcXOff2++;
4492 12 : if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
4493 : {
4494 6 : if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
4495 0 : nSrcXOff = nSrcWidth - 1;
4496 6 : nSrcXOff2 = nSrcWidth;
4497 : }
4498 12 : const size_t nSrcXOffSZ = static_cast<size_t>(nSrcXOff);
4499 :
4500 12 : if (eMethod == NEAR)
4501 : {
4502 0 : pafDstScanline[iDstPixelSZ * 2] =
4503 0 : pafSrcScanline[nSrcXOffSZ * 2];
4504 0 : pafDstScanline[iDstPixelSZ * 2 + 1] =
4505 0 : pafSrcScanline[nSrcXOffSZ * 2 + 1];
4506 : }
4507 12 : else if (eMethod == AVERAGE_MAGPHASE)
4508 : {
4509 0 : double dfTotalR = 0.0;
4510 0 : double dfTotalI = 0.0;
4511 0 : double dfTotalM = 0.0;
4512 0 : size_t nCount = 0;
4513 :
4514 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4515 : {
4516 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4517 : {
4518 0 : const double dfR = double(
4519 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4520 0 : static_cast<size_t>(iY - nSrcYOff) *
4521 0 : nSrcWidth * 2]);
4522 0 : const double dfI = double(
4523 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4524 0 : static_cast<size_t>(iY - nSrcYOff) *
4525 0 : nSrcWidth * 2 +
4526 0 : 1]);
4527 0 : dfTotalR += dfR;
4528 0 : dfTotalI += dfI;
4529 0 : dfTotalM += std::hypot(dfR, dfI);
4530 0 : ++nCount;
4531 : }
4532 : }
4533 :
4534 0 : CPLAssert(nCount > 0);
4535 0 : if (nCount == 0)
4536 : {
4537 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4538 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4539 : }
4540 : else
4541 : {
4542 0 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4543 0 : dfTotalR / static_cast<double>(nCount));
4544 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4545 0 : dfTotalI / static_cast<double>(nCount));
4546 : const double dfM =
4547 0 : double(std::hypot(pafDstScanline[iDstPixelSZ * 2],
4548 0 : pafDstScanline[iDstPixelSZ * 2 + 1]));
4549 0 : const double dfDesiredM =
4550 0 : dfTotalM / static_cast<double>(nCount);
4551 0 : double dfRatio = 1.0;
4552 0 : if (dfM != 0.0)
4553 0 : dfRatio = dfDesiredM / dfM;
4554 :
4555 0 : pafDstScanline[iDstPixelSZ * 2] *=
4556 0 : static_cast<float>(dfRatio);
4557 0 : pafDstScanline[iDstPixelSZ * 2 + 1] *=
4558 0 : static_cast<float>(dfRatio);
4559 : }
4560 : }
4561 12 : else if (eMethod == RMS)
4562 : {
4563 12 : double dfTotalR = 0.0;
4564 12 : double dfTotalI = 0.0;
4565 12 : size_t nCount = 0;
4566 :
4567 36 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4568 : {
4569 72 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4570 : {
4571 48 : const double dfR = double(
4572 48 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4573 48 : static_cast<size_t>(iY - nSrcYOff) *
4574 48 : nSrcWidth * 2]);
4575 48 : const double dfI = double(
4576 48 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4577 48 : static_cast<size_t>(iY - nSrcYOff) *
4578 48 : nSrcWidth * 2 +
4579 48 : 1]);
4580 :
4581 48 : dfTotalR += SQUARE(dfR);
4582 48 : dfTotalI += SQUARE(dfI);
4583 :
4584 48 : ++nCount;
4585 : }
4586 : }
4587 :
4588 12 : CPLAssert(nCount > 0);
4589 12 : if (nCount == 0)
4590 : {
4591 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4592 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4593 : }
4594 : else
4595 : {
4596 : /* compute RMS */
4597 12 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4598 12 : sqrt(dfTotalR / static_cast<double>(nCount)));
4599 12 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4600 12 : sqrt(dfTotalI / static_cast<double>(nCount)));
4601 : }
4602 : }
4603 0 : else if (eMethod == AVERAGE)
4604 : {
4605 0 : double dfTotalR = 0.0;
4606 0 : double dfTotalI = 0.0;
4607 0 : size_t nCount = 0;
4608 :
4609 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4610 : {
4611 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4612 : {
4613 : // TODO(schwehr): Maybe use std::complex?
4614 0 : dfTotalR += double(
4615 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4616 0 : static_cast<size_t>(iY - nSrcYOff) *
4617 0 : nSrcWidth * 2]);
4618 0 : dfTotalI += double(
4619 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4620 0 : static_cast<size_t>(iY - nSrcYOff) *
4621 0 : nSrcWidth * 2 +
4622 0 : 1]);
4623 0 : ++nCount;
4624 : }
4625 : }
4626 :
4627 0 : CPLAssert(nCount > 0);
4628 0 : if (nCount == 0)
4629 : {
4630 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4631 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4632 : }
4633 : else
4634 : {
4635 0 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4636 0 : dfTotalR / static_cast<double>(nCount));
4637 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4638 0 : dfTotalI / static_cast<double>(nCount));
4639 : }
4640 : }
4641 : }
4642 : }
4643 :
4644 2 : return CE_None;
4645 : }
4646 :
4647 : /************************************************************************/
4648 : /* GDALRegenerateCascadingOverviews() */
4649 : /* */
4650 : /* Generate a list of overviews in order from largest to */
4651 : /* smallest, computing each from the next larger. */
4652 : /************************************************************************/
4653 :
4654 44 : static CPLErr GDALRegenerateCascadingOverviews(
4655 : GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
4656 : const char *pszResampling, GDALProgressFunc pfnProgress,
4657 : void *pProgressData, CSLConstList papszOptions)
4658 :
4659 : {
4660 : /* -------------------------------------------------------------------- */
4661 : /* First, we must put the overviews in order from largest to */
4662 : /* smallest. */
4663 : /* -------------------------------------------------------------------- */
4664 127 : for (int i = 0; i < nOverviews - 1; ++i)
4665 : {
4666 292 : for (int j = 0; j < nOverviews - i - 1; ++j)
4667 : {
4668 209 : if (papoOvrBands[j]->GetXSize() *
4669 209 : static_cast<float>(papoOvrBands[j]->GetYSize()) <
4670 209 : papoOvrBands[j + 1]->GetXSize() *
4671 209 : static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
4672 : {
4673 0 : GDALRasterBand *poTempBand = papoOvrBands[j];
4674 0 : papoOvrBands[j] = papoOvrBands[j + 1];
4675 0 : papoOvrBands[j + 1] = poTempBand;
4676 : }
4677 : }
4678 : }
4679 :
4680 : /* -------------------------------------------------------------------- */
4681 : /* Count total pixels so we can prepare appropriate scaled */
4682 : /* progress functions. */
4683 : /* -------------------------------------------------------------------- */
4684 44 : double dfTotalPixels = 0.0;
4685 :
4686 171 : for (int i = 0; i < nOverviews; ++i)
4687 : {
4688 127 : dfTotalPixels += papoOvrBands[i]->GetXSize() *
4689 127 : static_cast<double>(papoOvrBands[i]->GetYSize());
4690 : }
4691 :
4692 : /* -------------------------------------------------------------------- */
4693 : /* Generate all the bands. */
4694 : /* -------------------------------------------------------------------- */
4695 44 : double dfPixelsProcessed = 0.0;
4696 :
4697 88 : CPLStringList aosOptions(papszOptions);
4698 44 : aosOptions.SetNameValue("CASCADING", "YES");
4699 171 : for (int i = 0; i < nOverviews; ++i)
4700 : {
4701 127 : GDALRasterBand *poBaseBand = poSrcBand;
4702 127 : if (i != 0)
4703 83 : poBaseBand = papoOvrBands[i - 1];
4704 :
4705 127 : double dfPixels = papoOvrBands[i]->GetXSize() *
4706 127 : static_cast<double>(papoOvrBands[i]->GetYSize());
4707 :
4708 254 : void *pScaledProgressData = GDALCreateScaledProgress(
4709 : dfPixelsProcessed / dfTotalPixels,
4710 127 : (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
4711 : pProgressData);
4712 :
4713 254 : const CPLErr eErr = GDALRegenerateOverviewsEx(
4714 : poBaseBand, 1,
4715 127 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
4716 : pszResampling, GDALScaledProgress, pScaledProgressData,
4717 127 : aosOptions.List());
4718 127 : GDALDestroyScaledProgress(pScaledProgressData);
4719 :
4720 127 : if (eErr != CE_None)
4721 0 : return eErr;
4722 :
4723 127 : dfPixelsProcessed += dfPixels;
4724 :
4725 : // Only do the bit2grayscale promotion on the base band.
4726 127 : if (STARTS_WITH_CI(pszResampling,
4727 : "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
4728 8 : pszResampling = "AVERAGE";
4729 : }
4730 :
4731 44 : return CE_None;
4732 : }
4733 :
4734 : /************************************************************************/
4735 : /* GDALGetResampleFunction() */
4736 : /************************************************************************/
4737 :
4738 16187 : GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
4739 : int *pnRadius)
4740 : {
4741 16187 : if (pnRadius)
4742 16187 : *pnRadius = 0;
4743 16187 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
4744 533 : return GDALResampleChunk_Near;
4745 15654 : else if (STARTS_WITH_CI(pszResampling, "AVER") ||
4746 4426 : EQUAL(pszResampling, "RMS"))
4747 11293 : return GDALResampleChunk_AverageOrRMS;
4748 4361 : else if (EQUAL(pszResampling, "GAUSS"))
4749 : {
4750 26 : if (pnRadius)
4751 26 : *pnRadius = 1;
4752 26 : return GDALResampleChunk_Gauss;
4753 : }
4754 4335 : else if (EQUAL(pszResampling, "MODE"))
4755 142 : return GDALResampleChunk_Mode;
4756 4193 : else if (EQUAL(pszResampling, "CUBIC"))
4757 : {
4758 1647 : if (pnRadius)
4759 1647 : *pnRadius = GWKGetFilterRadius(GRA_Cubic);
4760 1647 : return GDALResampleChunk_Convolution;
4761 : }
4762 2546 : else if (EQUAL(pszResampling, "CUBICSPLINE"))
4763 : {
4764 60 : if (pnRadius)
4765 60 : *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
4766 60 : return GDALResampleChunk_Convolution;
4767 : }
4768 2486 : else if (EQUAL(pszResampling, "LANCZOS"))
4769 : {
4770 50 : if (pnRadius)
4771 50 : *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
4772 50 : return GDALResampleChunk_Convolution;
4773 : }
4774 2436 : else if (EQUAL(pszResampling, "BILINEAR"))
4775 : {
4776 2436 : if (pnRadius)
4777 2436 : *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
4778 2436 : return GDALResampleChunk_Convolution;
4779 : }
4780 : else
4781 : {
4782 0 : CPLError(
4783 : CE_Failure, CPLE_AppDefined,
4784 : "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
4785 : pszResampling);
4786 0 : return nullptr;
4787 : }
4788 : }
4789 :
4790 : /************************************************************************/
4791 : /* GDALGetOvrWorkDataType() */
4792 : /************************************************************************/
4793 :
4794 16069 : GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
4795 : GDALDataType eSrcDataType)
4796 : {
4797 16069 : if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
4798 : {
4799 667 : return eSrcDataType;
4800 : }
4801 15402 : else if (eSrcDataType == GDT_UInt8 &&
4802 14829 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4803 3699 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4804 2294 : EQUAL(pszResampling, "CUBICSPLINE") ||
4805 2274 : EQUAL(pszResampling, "LANCZOS") ||
4806 2267 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4807 : {
4808 14822 : return GDT_UInt8;
4809 : }
4810 580 : else if (eSrcDataType == GDT_UInt16 &&
4811 131 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4812 126 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4813 8 : EQUAL(pszResampling, "CUBICSPLINE") ||
4814 6 : EQUAL(pszResampling, "LANCZOS") ||
4815 3 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4816 : {
4817 131 : return GDT_UInt16;
4818 : }
4819 449 : else if (EQUAL(pszResampling, "GAUSS"))
4820 20 : return GDT_Float64;
4821 :
4822 429 : if (eSrcDataType == GDT_UInt8 || eSrcDataType == GDT_Int8 ||
4823 428 : eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
4824 : eSrcDataType == GDT_Float32)
4825 : {
4826 277 : return GDT_Float32;
4827 : }
4828 152 : return GDT_Float64;
4829 : }
4830 :
4831 : namespace
4832 : {
4833 : // Structure to hold a pointer to free with CPLFree()
4834 : struct PointerHolder
4835 : {
4836 : void *ptr = nullptr;
4837 :
4838 4054 : template <class T> explicit PointerHolder(T *&ptrIn) : ptr(ptrIn)
4839 : {
4840 4054 : ptrIn = nullptr;
4841 4054 : }
4842 :
4843 : template <class T>
4844 32 : explicit PointerHolder(std::unique_ptr<T, VSIFreeReleaser> ptrIn)
4845 32 : : ptr(ptrIn.release())
4846 : {
4847 32 : }
4848 :
4849 4086 : ~PointerHolder()
4850 4086 : {
4851 4086 : CPLFree(ptr);
4852 4086 : }
4853 :
4854 : PointerHolder(const PointerHolder &) = delete;
4855 : PointerHolder &operator=(const PointerHolder &) = delete;
4856 : };
4857 : } // namespace
4858 :
4859 : /************************************************************************/
4860 : /* GDALRegenerateOverviews() */
4861 : /************************************************************************/
4862 :
4863 : /**
4864 : * \brief Generate downsampled overviews.
4865 : *
4866 : * This function will generate one or more overview images from a base image
4867 : * using the requested downsampling algorithm. Its primary use is for
4868 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4869 : * used to generate downsampled images in one file from another outside the
4870 : * overview architecture.
4871 : *
4872 : * The output bands need to exist in advance.
4873 : *
4874 : * The full set of resampling algorithms is documented in
4875 : * GDALDataset::BuildOverviews().
4876 : *
4877 : * This function will honour properly NODATA_VALUES tuples (special dataset
4878 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4879 : * considered as the nodata value and not each value of the triplet
4880 : * independently per band.
4881 : *
4882 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4883 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4884 : * overview computation.
4885 : *
4886 : * @param hSrcBand the source (base level) band.
4887 : * @param nOverviewCount the number of downsampled bands being generated.
4888 : * @param pahOvrBands the list of downsampled bands to be generated.
4889 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4890 : * @param pfnProgress progress report function.
4891 : * @param pProgressData progress function callback data.
4892 : * @return CE_None on success or CE_Failure on failure.
4893 : */
4894 113 : CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
4895 : GDALRasterBandH *pahOvrBands,
4896 : const char *pszResampling,
4897 : GDALProgressFunc pfnProgress,
4898 : void *pProgressData)
4899 :
4900 : {
4901 113 : return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
4902 : pszResampling, pfnProgress, pProgressData,
4903 113 : nullptr);
4904 : }
4905 :
4906 : /************************************************************************/
4907 : /* GDALRegenerateOverviewsEx() */
4908 : /************************************************************************/
4909 :
// Factor applied to a radius to obtain the corresponding diameter. Used by
// GDALRegenerateOverviewsEx() when deriving the height of the source chunks
// from the kernel radius and the overview factor.
constexpr int RADIUS_TO_DIAMETER = 2;
4911 :
4912 : /**
4913 : * \brief Generate downsampled overviews.
4914 : *
4915 : * This function will generate one or more overview images from a base image
4916 : * using the requested downsampling algorithm. Its primary use is for
4917 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4918 : * used to generate downsampled images in one file from another outside the
4919 : * overview architecture.
4920 : *
4921 : * The output bands need to exist in advance.
4922 : *
4923 : * The full set of resampling algorithms is documented in
4924 : * GDALDataset::BuildOverviews().
4925 : *
4926 : * This function will honour properly NODATA_VALUES tuples (special dataset
4927 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4928 : * considered as the nodata value and not each value of the triplet
4929 : * independently per band.
4930 : *
4931 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4932 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4933 : * overview computation.
4934 : *
4935 : * @param hSrcBand the source (base level) band.
4936 : * @param nOverviewCount the number of downsampled bands being generated.
4937 : * @param pahOvrBands the list of downsampled bands to be generated.
4938 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4939 : * @param pfnProgress progress report function.
4940 : * @param pProgressData progress function callback data.
4941 : * @param papszOptions NULL terminated list of options as key=value pairs, or
4942 : * NULL
4943 : * @return CE_None on success or CE_Failure on failure.
4944 : * @since GDAL 3.6
4945 : */
4946 780 : CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
4947 : GDALRasterBandH *pahOvrBands,
4948 : const char *pszResampling,
4949 : GDALProgressFunc pfnProgress,
4950 : void *pProgressData, CSLConstList papszOptions)
4951 :
4952 : {
4953 780 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
4954 780 : GDALRasterBand **papoOvrBands =
4955 : reinterpret_cast<GDALRasterBand **>(pahOvrBands);
4956 :
4957 780 : if (pfnProgress == nullptr)
4958 102 : pfnProgress = GDALDummyProgress;
4959 :
4960 780 : if (EQUAL(pszResampling, "NONE"))
4961 50 : return CE_None;
4962 :
4963 730 : int nKernelRadius = 0;
4964 : GDALResampleFunction pfnResampleFn =
4965 730 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
4966 :
4967 730 : if (pfnResampleFn == nullptr)
4968 0 : return CE_Failure;
4969 :
4970 : /* -------------------------------------------------------------------- */
4971 : /* Check color tables... */
4972 : /* -------------------------------------------------------------------- */
4973 730 : GDALColorTable *poColorTable = nullptr;
4974 :
4975 507 : if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
4976 1538 : EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
4977 312 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4978 : {
4979 9 : poColorTable = poSrcBand->GetColorTable();
4980 9 : if (poColorTable != nullptr)
4981 : {
4982 9 : if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
4983 : {
4984 0 : CPLError(CE_Warning, CPLE_AppDefined,
4985 : "Computing overviews on palette index raster bands "
4986 : "with a palette whose color interpretation is not RGB "
4987 : "will probably lead to unexpected results.");
4988 0 : poColorTable = nullptr;
4989 : }
4990 9 : else if (poColorTable->IsIdentity())
4991 : {
4992 0 : poColorTable = nullptr;
4993 : }
4994 : }
4995 : else
4996 : {
4997 0 : CPLError(CE_Warning, CPLE_AppDefined,
4998 : "Computing overviews on palette index raster bands "
4999 : "without a palette will probably lead to unexpected "
5000 : "results.");
5001 : }
5002 : }
5003 : // Not ready yet
5004 2109 : else if ((EQUAL(pszResampling, "CUBIC") ||
5005 667 : EQUAL(pszResampling, "CUBICSPLINE") ||
5006 667 : EQUAL(pszResampling, "LANCZOS") ||
5007 1468 : EQUAL(pszResampling, "BILINEAR")) &&
5008 80 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
5009 : {
5010 0 : CPLError(CE_Warning, CPLE_AppDefined,
5011 : "Computing %s overviews on palette index raster bands "
5012 : "will probably lead to unexpected results.",
5013 : pszResampling);
5014 : }
5015 :
5016 : // If we have a nodata mask and we are doing something more complicated
5017 : // than nearest neighbouring, we have to fetch the nodata mask.
5018 :
5019 730 : GDALRasterBand *poMaskBand = nullptr;
5020 730 : bool bUseNoDataMask = false;
5021 730 : bool bCanUseCascaded = true;
5022 :
5023 730 : if (!STARTS_WITH_CI(pszResampling, "NEAR"))
5024 : {
5025 : // Special case if we are an alpha/mask band. We want it to be
5026 : // considered as the mask band to avoid alpha=0 to be taken into account
5027 : // in average computation.
5028 392 : if (poSrcBand->IsMaskBand())
5029 : {
5030 51 : poMaskBand = poSrcBand;
5031 51 : bUseNoDataMask = true;
5032 : }
5033 : else
5034 : {
5035 341 : poMaskBand = poSrcBand->GetMaskBand();
5036 341 : const int nMaskFlags = poSrcBand->GetMaskFlags();
5037 341 : bCanUseCascaded =
5038 341 : (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
5039 341 : bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
5040 : }
5041 : }
5042 :
5043 730 : int nHasNoData = 0;
5044 730 : const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
5045 730 : const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
5046 : const bool bPropagateNoData =
5047 730 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5048 :
5049 798 : if (poSrcBand->GetBand() == 1 && bUseNoDataMask &&
5050 68 : CSLFetchNameValue(papszOptions, "CASCADING") == nullptr)
5051 : {
5052 112 : std::string osDetailMessage;
5053 56 : if (poSrcBand->HasConflictingMaskSources(&osDetailMessage, false))
5054 : {
5055 2 : CPLError(
5056 : CE_Warning, CPLE_AppDefined, "%s%s", osDetailMessage.c_str(),
5057 : bHasNoData
5058 : ? "Only the nodata value will be taken into account."
5059 : : "Only the first listed one will be taken into account.");
5060 : }
5061 : }
5062 :
5063 : /* -------------------------------------------------------------------- */
5064 : /* If we are operating on multiple overviews, and using */
5065 : /* averaging, lets do them in cascading order to reduce the */
5066 : /* amount of computation. */
5067 : /* -------------------------------------------------------------------- */
5068 :
5069 : // In case the mask may be computed from another band of the dataset,
5070 : // we can't use cascaded generation, as the computation of the overviews
5071 : // of the band used for the mask band may not have yet occurred (#3033).
5072 730 : if ((STARTS_WITH_CI(pszResampling, "AVER") ||
5073 507 : EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
5074 476 : EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
5075 422 : EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
5076 730 : EQUAL(pszResampling, "MODE")) &&
5077 44 : nOverviewCount > 1 && bCanUseCascaded)
5078 44 : return GDALRegenerateCascadingOverviews(
5079 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
5080 44 : pProgressData, papszOptions);
5081 :
5082 : /* -------------------------------------------------------------------- */
5083 : /* Setup one horizontal swath to read from the raw buffer. */
5084 : /* -------------------------------------------------------------------- */
5085 686 : int nFRXBlockSize = 0;
5086 686 : int nFRYBlockSize = 0;
5087 686 : poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
5088 :
5089 686 : const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
5090 1034 : const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
5091 984 : EQUAL(pszResampling, "MODE") ||
5092 298 : !GDALDataTypeIsComplex(eSrcDataType);
5093 : const GDALDataType eWrkDataType =
5094 : bUseGenericResampleFn
5095 686 : ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
5096 686 : : GDT_CFloat32;
5097 :
5098 686 : const int nWidth = poSrcBand->GetXSize();
5099 686 : const int nHeight = poSrcBand->GetYSize();
5100 :
5101 686 : int nMaxOvrFactor = 1;
5102 1491 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
5103 : {
5104 805 : const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
5105 805 : const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
5106 805 : nMaxOvrFactor = std::max(
5107 : nMaxOvrFactor,
5108 805 : static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
5109 805 : nMaxOvrFactor = std::max(
5110 : nMaxOvrFactor,
5111 805 : static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
5112 : }
5113 :
5114 686 : int nFullResYChunk = nFRYBlockSize;
5115 686 : int nMaxChunkYSizeQueried = 0;
5116 :
5117 : const auto UpdateChunkHeightAndGetChunkSize =
5118 9220 : [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
5119 74721 : eWrkDataType, nWidth]()
5120 : {
5121 : // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
5122 : // + nFullResYChunk) / nMaxOvrFactor)
5123 9220 : if (nMaxOvrFactor > INT_MAX / RADIUS_TO_DIAMETER)
5124 : {
5125 1 : return GINTBIG_MAX;
5126 : }
5127 9219 : nFullResYChunk =
5128 9219 : std::max(nFullResYChunk, RADIUS_TO_DIAMETER * nMaxOvrFactor);
5129 9219 : if ((nKernelRadius > 0 &&
5130 970 : nMaxOvrFactor > INT_MAX / (RADIUS_TO_DIAMETER * nKernelRadius)) ||
5131 9219 : nFullResYChunk >
5132 9219 : INT_MAX - RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor)
5133 : {
5134 0 : return GINTBIG_MAX;
5135 : }
5136 9219 : nMaxChunkYSizeQueried =
5137 9219 : nFullResYChunk + RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor;
5138 9219 : if (GDALGetDataTypeSizeBytes(eWrkDataType) >
5139 9219 : std::numeric_limits<int64_t>::max() /
5140 9219 : (static_cast<int64_t>(nMaxChunkYSizeQueried) * nWidth))
5141 : {
5142 1 : return GINTBIG_MAX;
5143 : }
5144 9218 : return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
5145 9218 : nMaxChunkYSizeQueried * nWidth;
5146 686 : };
5147 :
5148 : const char *pszChunkYSize =
5149 686 : CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
5150 : #ifndef __COVERITY__
5151 : // Only configurable for debug / testing
5152 686 : if (pszChunkYSize)
5153 : {
5154 0 : nFullResYChunk = atoi(pszChunkYSize);
5155 : }
5156 : #endif
5157 :
5158 : // Only configurable for debug / testing
5159 : const int nChunkMaxSize =
5160 686 : atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
5161 :
5162 686 : auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
5163 686 : if (nChunkSize > nChunkMaxSize)
5164 : {
5165 15 : if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
5166 44 : !GDALDataTypeIsComplex(eSrcDataType) &&
5167 14 : (!STARTS_WITH_CI(pszResampling, "AVER") ||
5168 2 : EQUAL(pszResampling, "AVERAGE")))
5169 : {
5170 : // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
5171 : // which use a block based strategy, which is much less memory
5172 : // hungry.
5173 14 : return GDALRegenerateOverviewsMultiBand(
5174 : 1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
5175 14 : pfnProgress, pProgressData, papszOptions);
5176 : }
5177 1 : else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
5178 : {
5179 0 : return GDALRegenerateCascadingOverviews(
5180 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
5181 0 : pfnProgress, pProgressData, papszOptions);
5182 : }
5183 : }
5184 671 : else if (pszChunkYSize == nullptr)
5185 : {
5186 : // Try to get as close as possible to nChunkMaxSize
5187 9205 : while (nChunkSize < nChunkMaxSize / 2)
5188 : {
5189 8534 : nFullResYChunk *= 2;
5190 8534 : nChunkSize = UpdateChunkHeightAndGetChunkSize();
5191 : }
5192 : }
5193 :
5194 : // Structure describing a resampling job
5195 : struct OvrJob
5196 : {
5197 : // Buffers to free when job is finished
5198 : std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
5199 : std::shared_ptr<PointerHolder> oSrcBufferHolder{};
5200 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
5201 :
5202 : GDALRasterBand *poDstBand = nullptr;
5203 :
5204 : // Input parameters of pfnResampleFn
5205 : GDALResampleFunction pfnResampleFn = nullptr;
5206 : int nSrcWidth = 0;
5207 : int nSrcHeight = 0;
5208 : int nDstWidth = 0;
5209 : GDALOverviewResampleArgs args{};
5210 : const void *pChunk = nullptr;
5211 : bool bUseGenericResampleFn = false;
5212 :
5213 : // Output values of resampling function
5214 : CPLErr eErr = CE_Failure;
5215 : void *pDstBuffer = nullptr;
5216 : GDALDataType eDstBufferDataType = GDT_Unknown;
5217 :
5218 0 : void SetSrcMaskBufferHolder(
5219 : const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
5220 : {
5221 0 : oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
5222 0 : }
5223 :
5224 0 : void SetSrcBufferHolder(
5225 : const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
5226 : {
5227 0 : oSrcBufferHolder = oSrcBufferHolderIn;
5228 0 : }
5229 :
5230 774 : void NotifyFinished()
5231 : {
5232 1548 : std::lock_guard guard(mutex);
5233 774 : bFinished = true;
5234 774 : cv.notify_one();
5235 774 : }
5236 :
5237 0 : bool IsFinished()
5238 : {
5239 0 : std::lock_guard guard(mutex);
5240 0 : return bFinished;
5241 : }
5242 :
5243 0 : void WaitFinished()
5244 : {
5245 0 : std::unique_lock oGuard(mutex);
5246 0 : while (!bFinished)
5247 : {
5248 0 : cv.wait(oGuard);
5249 : }
5250 0 : }
5251 :
5252 : private:
5253 : // Synchronization
5254 : bool bFinished = false;
5255 : std::mutex mutex{};
5256 : std::condition_variable cv{};
5257 : };
5258 :
5259 : // Thread function to resample
5260 774 : const auto JobResampleFunc = [](void *pData)
5261 : {
5262 774 : OvrJob *poJob = static_cast<OvrJob *>(pData);
5263 :
5264 774 : if (poJob->bUseGenericResampleFn)
5265 : {
5266 772 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
5267 : &(poJob->pDstBuffer),
5268 : &(poJob->eDstBufferDataType));
5269 : }
5270 : else
5271 : {
5272 2 : poJob->eErr = GDALResampleChunkC32R(
5273 : poJob->nSrcWidth, poJob->nSrcHeight,
5274 2 : static_cast<const float *>(poJob->pChunk),
5275 : poJob->args.nChunkYOff, poJob->args.nChunkYSize,
5276 : poJob->args.nDstYOff, poJob->args.nDstYOff2,
5277 : poJob->args.nOvrXSize, poJob->args.nOvrYSize,
5278 : &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
5279 : poJob->args.pszResampling);
5280 : }
5281 :
5282 774 : auto pDstBuffer = poJob->pDstBuffer;
5283 774 : poJob->oDstBufferHolder = std::make_unique<PointerHolder>(pDstBuffer);
5284 :
5285 774 : poJob->NotifyFinished();
5286 774 : };
5287 :
5288 : // Function to write resample data to target band
5289 774 : const auto WriteJobData = [](const OvrJob *poJob)
5290 : {
5291 1548 : return poJob->poDstBand->RasterIO(
5292 774 : GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
5293 774 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
5294 774 : poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
5295 774 : poJob->eDstBufferDataType, 0, 0, nullptr);
5296 : };
5297 :
5298 : // Wait for completion of oldest job and serialize it
5299 : const auto WaitAndFinalizeOldestJob =
5300 0 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
5301 : {
5302 0 : auto poOldestJob = jobList.front().get();
5303 0 : poOldestJob->WaitFinished();
5304 0 : CPLErr l_eErr = poOldestJob->eErr;
5305 0 : if (l_eErr == CE_None)
5306 : {
5307 0 : l_eErr = WriteJobData(poOldestJob);
5308 : }
5309 :
5310 0 : jobList.pop_front();
5311 0 : return l_eErr;
5312 : };
5313 :
5314 : // Queue of jobs
5315 1344 : std::list<std::unique_ptr<OvrJob>> jobList;
5316 :
5317 672 : GByte *pabyChunkNodataMask = nullptr;
5318 672 : void *pChunk = nullptr;
5319 :
5320 672 : const int nThreads = GDALGetNumThreads(GDAL_DEFAULT_MAX_THREAD_COUNT,
5321 : /* bDefaultToAllCPUs=*/false);
5322 : auto poThreadPool =
5323 672 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5324 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5325 1344 : : std::unique_ptr<CPLJobQueue>(nullptr);
5326 :
5327 : /* -------------------------------------------------------------------- */
5328 : /* Loop over image operating on chunks. */
5329 : /* -------------------------------------------------------------------- */
5330 672 : int nChunkYOff = 0;
5331 672 : CPLErr eErr = CE_None;
5332 :
5333 1349 : for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
5334 677 : nChunkYOff += nFullResYChunk)
5335 : {
5336 677 : if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
5337 : pProgressData))
5338 : {
5339 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5340 0 : eErr = CE_Failure;
5341 : }
5342 :
5343 677 : if (nFullResYChunk + nChunkYOff > nHeight)
5344 669 : nFullResYChunk = nHeight - nChunkYOff;
5345 :
5346 677 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
5347 677 : int nChunkYSizeQueried =
5348 677 : nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
5349 677 : if (nChunkYOffQueried < 0)
5350 : {
5351 83 : nChunkYSizeQueried += nChunkYOffQueried;
5352 83 : nChunkYOffQueried = 0;
5353 : }
5354 677 : if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
5355 83 : nChunkYSizeQueried = nHeight - nChunkYOffQueried;
5356 :
5357 : // Avoid accumulating too many tasks and exhaust RAM
5358 : // Try to complete already finished jobs
5359 677 : while (eErr == CE_None && !jobList.empty())
5360 : {
5361 0 : auto poOldestJob = jobList.front().get();
5362 0 : if (!poOldestJob->IsFinished())
5363 0 : break;
5364 0 : eErr = poOldestJob->eErr;
5365 0 : if (eErr == CE_None)
5366 : {
5367 0 : eErr = WriteJobData(poOldestJob);
5368 : }
5369 :
5370 0 : jobList.pop_front();
5371 : }
5372 :
5373 : // And in case we have saturated the number of threads,
5374 : // wait for completion of tasks to go below the threshold.
5375 1354 : while (eErr == CE_None &&
5376 677 : jobList.size() >= static_cast<size_t>(nThreads))
5377 : {
5378 0 : eErr = WaitAndFinalizeOldestJob(jobList);
5379 : }
5380 :
5381 : // (Re)allocate buffers if needed
5382 677 : if (pChunk == nullptr)
5383 : {
5384 672 : pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
5385 : nMaxChunkYSizeQueried, nWidth);
5386 : }
5387 677 : if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
5388 : {
5389 139 : pabyChunkNodataMask = static_cast<GByte *>(
5390 139 : VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
5391 : }
5392 :
5393 677 : if (pChunk == nullptr ||
5394 139 : (bUseNoDataMask && pabyChunkNodataMask == nullptr))
5395 : {
5396 0 : CPLFree(pChunk);
5397 0 : CPLFree(pabyChunkNodataMask);
5398 0 : return CE_Failure;
5399 : }
5400 :
5401 : // Read chunk.
5402 677 : if (eErr == CE_None)
5403 677 : eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
5404 : nChunkYSizeQueried, pChunk, nWidth,
5405 : nChunkYSizeQueried, eWrkDataType, 0, 0,
5406 : nullptr);
5407 677 : if (eErr == CE_None && bUseNoDataMask)
5408 139 : eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
5409 : nChunkYSizeQueried, pabyChunkNodataMask,
5410 : nWidth, nChunkYSizeQueried, GDT_UInt8,
5411 : 0, 0, nullptr);
5412 :
5413 : // Special case to promote 1bit data to 8bit 0/255 values.
5414 677 : if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
5415 : {
5416 9 : if (eWrkDataType == GDT_Float32)
5417 : {
5418 0 : float *pafChunk = static_cast<float *>(pChunk);
5419 0 : for (size_t i = 0;
5420 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5421 : {
5422 0 : if (pafChunk[i] == 1.0f)
5423 0 : pafChunk[i] = 255.0f;
5424 : }
5425 : }
5426 9 : else if (eWrkDataType == GDT_UInt8)
5427 : {
5428 9 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
5429 168417 : for (size_t i = 0;
5430 168417 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5431 : {
5432 168408 : if (pabyChunk[i] == 1)
5433 127437 : pabyChunk[i] = 255;
5434 : }
5435 : }
5436 0 : else if (eWrkDataType == GDT_UInt16)
5437 : {
5438 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
5439 0 : for (size_t i = 0;
5440 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5441 : {
5442 0 : if (pasChunk[i] == 1)
5443 0 : pasChunk[i] = 255;
5444 : }
5445 : }
5446 0 : else if (eWrkDataType == GDT_Float64)
5447 : {
5448 0 : double *padfChunk = static_cast<double *>(pChunk);
5449 0 : for (size_t i = 0;
5450 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5451 : {
5452 0 : if (padfChunk[i] == 1.0)
5453 0 : padfChunk[i] = 255.0;
5454 : }
5455 : }
5456 : else
5457 : {
5458 0 : CPLAssert(false);
5459 : }
5460 : }
5461 668 : else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
5462 : {
5463 0 : if (eWrkDataType == GDT_Float32)
5464 : {
5465 0 : float *pafChunk = static_cast<float *>(pChunk);
5466 0 : for (size_t i = 0;
5467 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5468 : {
5469 0 : if (pafChunk[i] == 1.0f)
5470 0 : pafChunk[i] = 0.0f;
5471 0 : else if (pafChunk[i] == 0.0f)
5472 0 : pafChunk[i] = 255.0f;
5473 : }
5474 : }
5475 0 : else if (eWrkDataType == GDT_UInt8)
5476 : {
5477 0 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
5478 0 : for (size_t i = 0;
5479 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5480 : {
5481 0 : if (pabyChunk[i] == 1)
5482 0 : pabyChunk[i] = 0;
5483 0 : else if (pabyChunk[i] == 0)
5484 0 : pabyChunk[i] = 255;
5485 : }
5486 : }
5487 0 : else if (eWrkDataType == GDT_UInt16)
5488 : {
5489 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
5490 0 : for (size_t i = 0;
5491 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5492 : {
5493 0 : if (pasChunk[i] == 1)
5494 0 : pasChunk[i] = 0;
5495 0 : else if (pasChunk[i] == 0)
5496 0 : pasChunk[i] = 255;
5497 : }
5498 : }
5499 0 : else if (eWrkDataType == GDT_Float64)
5500 : {
5501 0 : double *padfChunk = static_cast<double *>(pChunk);
5502 0 : for (size_t i = 0;
5503 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5504 : {
5505 0 : if (padfChunk[i] == 1.0)
5506 0 : padfChunk[i] = 0.0;
5507 0 : else if (padfChunk[i] == 0.0)
5508 0 : padfChunk[i] = 255.0;
5509 : }
5510 : }
5511 : else
5512 : {
5513 0 : CPLAssert(false);
5514 : }
5515 : }
5516 :
5517 677 : auto pChunkRaw = pChunk;
5518 677 : auto pabyChunkNodataMaskRaw = pabyChunkNodataMask;
5519 677 : std::shared_ptr<PointerHolder> oSrcBufferHolder;
5520 677 : std::shared_ptr<PointerHolder> oSrcMaskBufferHolder;
5521 677 : if (poJobQueue)
5522 : {
5523 0 : oSrcBufferHolder = std::make_shared<PointerHolder>(pChunk);
5524 : oSrcMaskBufferHolder =
5525 0 : std::make_shared<PointerHolder>(pabyChunkNodataMask);
5526 : }
5527 :
5528 1451 : for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
5529 : ++iOverview)
5530 : {
5531 774 : GDALRasterBand *poDstBand = papoOvrBands[iOverview];
5532 774 : const int nDstWidth = poDstBand->GetXSize();
5533 774 : const int nDstHeight = poDstBand->GetYSize();
5534 :
5535 774 : const double dfXRatioDstToSrc =
5536 774 : static_cast<double>(nWidth) / nDstWidth;
5537 774 : const double dfYRatioDstToSrc =
5538 774 : static_cast<double>(nHeight) / nDstHeight;
5539 :
5540 : /* --------------------------------------------------------------------
5541 : */
5542 : /* Figure out the line to start writing to, and the first line
5543 : */
5544 : /* to not write to. In theory this approach should ensure that
5545 : */
5546 : /* every output line will be written if all input chunks are */
5547 : /* processed. */
5548 : /* --------------------------------------------------------------------
5549 : */
5550 774 : int nDstYOff =
5551 774 : static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
5552 774 : if (nDstYOff == nDstHeight)
5553 0 : continue;
5554 774 : int nDstYOff2 = static_cast<int>(
5555 774 : 0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
5556 :
5557 774 : if (nChunkYOff + nFullResYChunk == nHeight)
5558 767 : nDstYOff2 = nDstHeight;
5559 : #if DEBUG_VERBOSE
5560 : CPLDebug("GDAL",
5561 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
5562 : nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
5563 : nDstWidth, nDstYOff2 - nDstYOff);
5564 : #endif
5565 :
5566 1548 : auto poJob = std::make_unique<OvrJob>();
5567 774 : poJob->pfnResampleFn = pfnResampleFn;
5568 774 : poJob->bUseGenericResampleFn = bUseGenericResampleFn;
5569 774 : poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
5570 774 : poJob->args.nOvrXSize = poDstBand->GetXSize();
5571 774 : poJob->args.nOvrYSize = poDstBand->GetYSize();
5572 : const char *pszNBITS =
5573 774 : poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
5574 774 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
5575 774 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
5576 774 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
5577 774 : poJob->args.eWrkDataType = eWrkDataType;
5578 774 : poJob->pChunk = pChunkRaw;
5579 774 : poJob->args.pabyChunkNodataMask = pabyChunkNodataMaskRaw;
5580 774 : poJob->nSrcWidth = nWidth;
5581 774 : poJob->nSrcHeight = nHeight;
5582 774 : poJob->args.nChunkXOff = 0;
5583 774 : poJob->args.nChunkXSize = nWidth;
5584 774 : poJob->args.nChunkYOff = nChunkYOffQueried;
5585 774 : poJob->args.nChunkYSize = nChunkYSizeQueried;
5586 774 : poJob->nDstWidth = nDstWidth;
5587 774 : poJob->args.nDstXOff = 0;
5588 774 : poJob->args.nDstXOff2 = nDstWidth;
5589 774 : poJob->args.nDstYOff = nDstYOff;
5590 774 : poJob->args.nDstYOff2 = nDstYOff2;
5591 774 : poJob->poDstBand = poDstBand;
5592 774 : poJob->args.pszResampling = pszResampling;
5593 774 : poJob->args.bHasNoData = bHasNoData;
5594 774 : poJob->args.dfNoDataValue = dfNoDataValue;
5595 774 : poJob->args.poColorTable = poColorTable;
5596 774 : poJob->args.eSrcDataType = eSrcDataType;
5597 774 : poJob->args.bPropagateNoData = bPropagateNoData;
5598 :
5599 774 : if (poJobQueue)
5600 : {
5601 0 : poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
5602 0 : poJob->SetSrcBufferHolder(oSrcBufferHolder);
5603 0 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
5604 0 : jobList.emplace_back(std::move(poJob));
5605 : }
5606 : else
5607 : {
5608 774 : JobResampleFunc(poJob.get());
5609 774 : eErr = poJob->eErr;
5610 774 : if (eErr == CE_None)
5611 : {
5612 774 : eErr = WriteJobData(poJob.get());
5613 : }
5614 : }
5615 : }
5616 : }
5617 :
5618 672 : VSIFree(pChunk);
5619 672 : VSIFree(pabyChunkNodataMask);
5620 :
5621 : // Wait for all pending jobs to complete
5622 672 : while (!jobList.empty())
5623 : {
5624 0 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5625 0 : if (l_eErr != CE_None && eErr == CE_None)
5626 0 : eErr = l_eErr;
5627 : }
5628 :
5629 : /* -------------------------------------------------------------------- */
5630 : /* Renormalize overview mean / stddev if needed. */
5631 : /* -------------------------------------------------------------------- */
5632 672 : if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
5633 : {
5634 0 : GDALOverviewMagnitudeCorrection(
5635 : poSrcBand, nOverviewCount,
5636 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
5637 : GDALDummyProgress, nullptr);
5638 : }
5639 :
5640 : /* -------------------------------------------------------------------- */
5641 : /* It can be important to flush out data to overviews. */
5642 : /* -------------------------------------------------------------------- */
5643 1439 : for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
5644 : ++iOverview)
5645 : {
5646 767 : eErr = papoOvrBands[iOverview]->FlushCache(false);
5647 : }
5648 :
5649 672 : if (eErr == CE_None)
5650 672 : pfnProgress(1.0, nullptr, pProgressData);
5651 :
5652 672 : return eErr;
5653 : }
5654 :
5655 : /************************************************************************/
5656 : /* GDALRegenerateOverviewsMultiBand() */
5657 : /************************************************************************/
5658 :
5659 : /**
5660 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5661 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5662 : *
5663 : * This function will generate one or more overview images from a base
5664 : * image using the requested downsampling algorithm. Its primary use
5665 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
5666 : * can also be used to generate downsampled images in one file from another
5667 : * outside the overview architecture.
5668 : *
5669 : * The output bands need to exist in advance and share the same characteristics
5670 : * (type, dimensions)
5671 : *
5672 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5673 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
5674 : *
5675 : * It does not support color tables or complex data types.
5676 : *
5677 : * The pseudo-algorithm used by the function is :
5678 : * for each overview
5679 : * iterate on lines of the source by a step of deltay
5680 : * iterate on columns of the source by a step of deltax
5681 : * read the source data of size deltax * deltay for all the bands
5682 : * generate the corresponding overview block for all the bands
5683 : *
5684 : * This function will honour properly NODATA_VALUES tuples (special dataset
5685 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5686 : * considered as the nodata value and not each value of the triplet
5687 : * independently per band.
5688 : *
5689 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
5690 : * to "ALL_CPUS" or a integer value to specify the number of threads to use for
5691 : * overview computation.
5692 : *
5693 : * @param nBands the number of bands, size of papoSrcBands and size of
5694 : * first dimension of papapoOverviewBands
5695 : * @param papoSrcBands the list of source bands to downsample
5696 : * @param nOverviews the number of downsampled overview levels being generated.
5697 : * @param papapoOverviewBands bidimension array of bands. First dimension is
5698 : * indexed by nBands. Second dimension is indexed by
5699 : * nOverviews.
5700 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5701 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
5702 : * @param pfnProgress progress report function.
5703 : * @param pProgressData progress function callback data.
5704 : * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
5705 : * key=value pairs, or NULL
5706 : * Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
5707 : * options can be specified to express that overviews should
5708 : * be regenerated only in the specified subset of the source
5709 : * dataset.
5710 : * @return CE_None on success or CE_Failure on failure.
5711 : */
5712 :
5713 387 : CPLErr GDALRegenerateOverviewsMultiBand(
5714 : int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
5715 : GDALRasterBand *const *const *papapoOverviewBands,
5716 : const char *pszResampling, GDALProgressFunc pfnProgress,
5717 : void *pProgressData, CSLConstList papszOptions)
5718 : {
5719 387 : CPL_IGNORE_RET_VAL(papszOptions);
5720 :
5721 387 : if (pfnProgress == nullptr)
5722 11 : pfnProgress = GDALDummyProgress;
5723 :
5724 387 : if (EQUAL(pszResampling, "NONE") || nBands == 0 || nOverviews == 0)
5725 3 : return CE_None;
5726 :
5727 : // Sanity checks.
5728 384 : if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
5729 189 : !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
5730 80 : !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
5731 22 : !EQUAL(pszResampling, "CUBICSPLINE") &&
5732 21 : !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
5733 5 : !EQUAL(pszResampling, "MODE"))
5734 : {
5735 0 : CPLError(CE_Failure, CPLE_NotSupported,
5736 : "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
5737 : "not supported",
5738 : pszResampling);
5739 0 : return CE_Failure;
5740 : }
5741 :
5742 384 : int nKernelRadius = 0;
5743 : GDALResampleFunction pfnResampleFn =
5744 384 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
5745 384 : if (pfnResampleFn == nullptr)
5746 0 : return CE_Failure;
5747 :
5748 384 : const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
5749 384 : const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
5750 384 : if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
5751 0 : return CE_None;
5752 384 : GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
5753 66225 : for (int iBand = 1; iBand < nBands; ++iBand)
5754 : {
5755 131682 : if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
5756 65841 : papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
5757 : {
5758 0 : CPLError(
5759 : CE_Failure, CPLE_NotSupported,
5760 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5761 : "have the same dimensions");
5762 0 : return CE_Failure;
5763 : }
5764 65841 : if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
5765 : {
5766 0 : CPLError(
5767 : CE_Failure, CPLE_NotSupported,
5768 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5769 : "have the same data type");
5770 0 : return CE_Failure;
5771 : }
5772 : }
5773 :
5774 1024 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5775 : {
5776 640 : const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
5777 640 : const int nDstWidth = poOvrFirstBand->GetXSize();
5778 640 : const int nDstHeight = poOvrFirstBand->GetYSize();
5779 66739 : for (int iBand = 1; iBand < nBands; ++iBand)
5780 : {
5781 66099 : const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
5782 132198 : if (poOvrBand->GetXSize() != nDstWidth ||
5783 66099 : poOvrBand->GetYSize() != nDstHeight)
5784 : {
5785 0 : CPLError(
5786 : CE_Failure, CPLE_NotSupported,
5787 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5788 : "of the same level must have the same dimensions");
5789 0 : return CE_Failure;
5790 : }
5791 66099 : if (poOvrBand->GetRasterDataType() != eDataType)
5792 : {
5793 0 : CPLError(
5794 : CE_Failure, CPLE_NotSupported,
5795 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5796 : "must have the same data type as the source bands");
5797 0 : return CE_Failure;
5798 : }
5799 : }
5800 : }
5801 :
5802 : // First pass to compute the total number of pixels to write.
5803 384 : double dfTotalPixelCount = 0;
5804 384 : const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
5805 384 : const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
5806 384 : const int nSrcXSize = atoi(CSLFetchNameValueDef(
5807 : papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
5808 384 : const int nSrcYSize = atoi(CSLFetchNameValueDef(
5809 : papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
5810 1024 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5811 : {
5812 640 : dfTotalPixelCount +=
5813 1280 : static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
5814 640 : papapoOverviewBands[0][iOverview]->GetXSize() *
5815 1280 : static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
5816 640 : papapoOverviewBands[0][iOverview]->GetYSize();
5817 : }
5818 :
5819 : const GDALDataType eWrkDataType =
5820 384 : GDALGetOvrWorkDataType(pszResampling, eDataType);
5821 : const int nWrkDataTypeSize =
5822 384 : std::max(1, GDALGetDataTypeSizeBytes(eWrkDataType));
5823 :
5824 384 : const bool bIsMask = papoSrcBands[0]->IsMaskBand();
5825 :
5826 : // If we have a nodata mask and we are doing something more complicated
5827 : // than nearest neighbouring, we have to fetch to nodata mask.
5828 : const bool bUseNoDataMask =
5829 567 : !STARTS_WITH_CI(pszResampling, "NEAR") &&
5830 183 : (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
5831 :
5832 768 : std::vector<bool> abHasNoData(nBands);
5833 768 : std::vector<double> adfNoDataValue(nBands);
5834 :
5835 66609 : for (int iBand = 0; iBand < nBands; ++iBand)
5836 : {
5837 66225 : int nHasNoData = 0;
5838 132450 : adfNoDataValue[iBand] =
5839 66225 : papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
5840 66225 : abHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
5841 : }
5842 :
5843 768 : std::string osDetailMessage;
5844 436 : if (bUseNoDataMask &&
5845 52 : papoSrcBands[0]->HasConflictingMaskSources(&osDetailMessage, false))
5846 : {
5847 9 : CPLError(CE_Warning, CPLE_AppDefined, "%s%s", osDetailMessage.c_str(),
5848 18 : abHasNoData[0]
5849 : ? "Only the nodata value will be taken into account."
5850 9 : : "Only the first listed one will be taken into account.");
5851 : }
5852 :
5853 : const bool bPropagateNoData =
5854 384 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5855 :
5856 384 : const int nThreads = GDALGetNumThreads(GDAL_DEFAULT_MAX_THREAD_COUNT,
5857 : /* bDefaultToAllCPUs=*/false);
5858 : auto poThreadPool =
5859 384 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5860 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5861 768 : : std::unique_ptr<CPLJobQueue>(nullptr);
5862 :
5863 : // Only configurable for debug / testing
5864 384 : const GIntBig nChunkMaxSize = []() -> GIntBig
5865 : {
5866 : const char *pszVal =
5867 384 : CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", nullptr);
5868 384 : if (pszVal)
5869 : {
5870 15 : GIntBig nRet = 0;
5871 15 : CPLParseMemorySize(pszVal, &nRet, nullptr);
5872 15 : return std::max<GIntBig>(100, nRet);
5873 : }
5874 369 : return 10 * 1024 * 1024;
5875 384 : }();
5876 :
5877 : // Only configurable for debug / testing
5878 384 : const GIntBig nChunkMaxSizeForTempFile = []() -> GIntBig
5879 : {
5880 384 : const char *pszVal = CPLGetConfigOption(
5881 : "GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE", nullptr);
5882 384 : if (pszVal)
5883 : {
5884 14 : GIntBig nRet = 0;
5885 14 : CPLParseMemorySize(pszVal, &nRet, nullptr);
5886 14 : return std::max<GIntBig>(100, nRet);
5887 : }
5888 370 : const auto nUsableRAM = CPLGetUsablePhysicalRAM();
5889 370 : if (nUsableRAM > 0)
5890 370 : return nUsableRAM / 10;
5891 : // Select a value to be able to at least downsample by 2 for a RGB
5892 : // 1024x1024 tiled output: (2 * 1024 + 2) * (2 * 1024 + 2) * 3 = 12 MB
5893 0 : return 100 * 1024 * 1024;
5894 384 : }();
5895 :
5896 : // Second pass to do the real job.
5897 384 : double dfCurPixelCount = 0;
5898 384 : CPLErr eErr = CE_None;
5899 1018 : for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
5900 : ++iOverview)
5901 : {
5902 639 : int iSrcOverview = -1; // -1 means the source bands.
5903 :
5904 : const int nDstTotalWidth =
5905 639 : papapoOverviewBands[0][iOverview]->GetXSize();
5906 : const int nDstTotalHeight =
5907 639 : papapoOverviewBands[0][iOverview]->GetYSize();
5908 :
5909 : // Compute the coordinates of the target region to refresh
5910 639 : constexpr double EPS = 1e-8;
5911 639 : const int nDstXOffStart = static_cast<int>(
5912 639 : static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
5913 : EPS);
5914 : const int nDstXOffEnd =
5915 1278 : std::min(static_cast<int>(
5916 639 : std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
5917 639 : nToplevelSrcWidth * nDstTotalWidth -
5918 : EPS)),
5919 639 : nDstTotalWidth);
5920 639 : const int nDstWidth = nDstXOffEnd - nDstXOffStart;
5921 639 : const int nDstYOffStart =
5922 639 : static_cast<int>(static_cast<double>(nSrcYOff) /
5923 639 : nToplevelSrcHeight * nDstTotalHeight +
5924 : EPS);
5925 : const int nDstYOffEnd =
5926 1278 : std::min(static_cast<int>(
5927 639 : std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
5928 639 : nToplevelSrcHeight * nDstTotalHeight -
5929 : EPS)),
5930 639 : nDstTotalHeight);
5931 639 : const int nDstHeight = nDstYOffEnd - nDstYOffStart;
5932 :
5933 : // Try to use previous level of overview as the source to compute
5934 : // the next level.
5935 639 : int nSrcWidth = nToplevelSrcWidth;
5936 639 : int nSrcHeight = nToplevelSrcHeight;
5937 894 : if (iOverview > 0 &&
5938 255 : papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
5939 : {
5940 247 : nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
5941 247 : nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
5942 247 : iSrcOverview = iOverview - 1;
5943 : }
5944 :
5945 639 : const double dfXRatioDstToSrc =
5946 639 : static_cast<double>(nSrcWidth) / nDstTotalWidth;
5947 639 : const double dfYRatioDstToSrc =
5948 639 : static_cast<double>(nSrcHeight) / nDstTotalHeight;
5949 :
5950 : const int nOvrFactor =
5951 1917 : std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
5952 639 : static_cast<int>(0.5 + dfYRatioDstToSrc)));
5953 :
5954 639 : int nDstChunkXSize = 0;
5955 639 : int nDstChunkYSize = 0;
5956 639 : papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
5957 : &nDstChunkYSize);
5958 :
5959 639 : constexpr int PIXEL_MARGIN = 2;
5960 : // Try to extend the chunk size so that the memory needed to acquire
5961 : // source pixels goes up to 10 MB.
5962 : // This can help for drivers that support multi-threaded reading
5963 639 : const int nFullResYChunk = static_cast<int>(std::min<double>(
5964 639 : nSrcHeight, PIXEL_MARGIN + nDstChunkYSize * dfYRatioDstToSrc));
5965 639 : const int nFullResYChunkQueried = static_cast<int>(std::min<int64_t>(
5966 1278 : nSrcHeight,
5967 1278 : nFullResYChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5968 639 : nKernelRadius * nOvrFactor));
5969 872 : while (nDstChunkXSize < nDstWidth)
5970 : {
5971 252 : constexpr int INCREASE_FACTOR = 2;
5972 :
5973 252 : const int nFullResXChunk = static_cast<int>(std::min<double>(
5974 504 : nSrcWidth, PIXEL_MARGIN + INCREASE_FACTOR * nDstChunkXSize *
5975 252 : dfXRatioDstToSrc));
5976 :
5977 : const int nFullResXChunkQueried =
5978 252 : static_cast<int>(std::min<int64_t>(
5979 504 : nSrcWidth,
5980 504 : nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5981 252 : nKernelRadius * nOvrFactor));
5982 :
5983 252 : if (nBands > nChunkMaxSize / nFullResXChunkQueried /
5984 252 : nFullResYChunkQueried / nWrkDataTypeSize)
5985 : {
5986 19 : break;
5987 : }
5988 :
5989 233 : nDstChunkXSize *= INCREASE_FACTOR;
5990 : }
5991 639 : nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
5992 :
5993 639 : const int nFullResXChunk = static_cast<int>(std::min<double>(
5994 639 : nSrcWidth, PIXEL_MARGIN + nDstChunkXSize * dfXRatioDstToSrc));
5995 639 : const int nFullResXChunkQueried = static_cast<int>(std::min<int64_t>(
5996 1278 : nSrcWidth,
5997 1278 : nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5998 639 : nKernelRadius * nOvrFactor));
5999 :
6000 : // Make sure that the RAM requirements to acquire the source data does
6001 : // not exceed nChunkMaxSizeForTempFile
6002 : // If so, reduce the destination chunk size, generate overviews in a
6003 : // temporary dataset, and copy that temporary dataset over the target
6004 : // overview bands (to avoid issues with lossy compression)
6005 : const bool bOverflowFullResXChunkYChunkQueried =
6006 639 : nBands > std::numeric_limits<int64_t>::max() /
6007 639 : nFullResXChunkQueried / nFullResYChunkQueried /
6008 639 : nWrkDataTypeSize;
6009 :
6010 639 : const auto nMemRequirement =
6011 : bOverflowFullResXChunkYChunkQueried
6012 639 : ? 0
6013 635 : : static_cast<GIntBig>(nFullResXChunkQueried) *
6014 635 : nFullResYChunkQueried * nBands * nWrkDataTypeSize;
6015 : // Use a temporary dataset with a smaller destination chunk size
6016 639 : const auto nOverShootFactor =
6017 : nMemRequirement / nChunkMaxSizeForTempFile;
6018 :
6019 639 : constexpr int MIN_OVERSHOOT_FACTOR = 4;
6020 : const auto nSqrtOverShootFactor = std::max<GIntBig>(
6021 1278 : MIN_OVERSHOOT_FACTOR, static_cast<GIntBig>(std::ceil(std::sqrt(
6022 639 : static_cast<double>(nOverShootFactor)))));
6023 639 : constexpr int DEFAULT_CHUNK_SIZE = 256;
6024 639 : constexpr int GTIFF_BLOCK_SIZE_MULTIPLE = 16;
6025 : const int nReducedDstChunkXSize =
6026 : bOverflowFullResXChunkYChunkQueried
6027 1274 : ? DEFAULT_CHUNK_SIZE
6028 1274 : : std::max(1, static_cast<int>(nDstChunkXSize /
6029 1274 : nSqrtOverShootFactor) &
6030 635 : ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
6031 : const int nReducedDstChunkYSize =
6032 : bOverflowFullResXChunkYChunkQueried
6033 1274 : ? DEFAULT_CHUNK_SIZE
6034 1274 : : std::max(1, static_cast<int>(nDstChunkYSize /
6035 1274 : nSqrtOverShootFactor) &
6036 635 : ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
6037 :
6038 639 : if (bOverflowFullResXChunkYChunkQueried ||
6039 : nMemRequirement > nChunkMaxSizeForTempFile)
6040 : {
6041 : const auto nDTSize =
6042 43 : std::max(1, GDALGetDataTypeSizeBytes(eDataType));
6043 : const bool bTmpDSMemRequirementOverflow =
6044 43 : nBands > std::numeric_limits<int64_t>::max() / nDstWidth /
6045 43 : nDstHeight / nDTSize;
6046 43 : const auto nTmpDSMemRequirement =
6047 : bTmpDSMemRequirementOverflow
6048 43 : ? 0
6049 41 : : static_cast<GIntBig>(nDstWidth) * nDstHeight * nBands *
6050 41 : nDTSize;
6051 :
6052 : // make sure that one band buffer doesn't overflow size_t
6053 : const bool bChunkSizeOverflow =
6054 43 : static_cast<size_t>(nDTSize) >
6055 43 : std::numeric_limits<size_t>::max() / nDstWidth / nDstHeight;
6056 43 : const size_t nChunkSize =
6057 : bChunkSizeOverflow
6058 43 : ? 0
6059 41 : : static_cast<size_t>(nDstWidth) * nDstHeight * nDTSize;
6060 :
6061 : const auto CreateVRT =
6062 41 : [nBands, nSrcWidth, nSrcHeight, nDstTotalWidth, nDstTotalHeight,
6063 : pszResampling, eWrkDataType, papoSrcBands, papapoOverviewBands,
6064 : iSrcOverview, &abHasNoData,
6065 393585 : &adfNoDataValue](int nVRTBlockXSize, int nVRTBlockYSize)
6066 : {
6067 : auto poVRTDS = std::make_unique<VRTDataset>(
6068 41 : nDstTotalWidth, nDstTotalHeight, nVRTBlockXSize,
6069 41 : nVRTBlockYSize);
6070 :
6071 65620 : for (int iBand = 0; iBand < nBands; ++iBand)
6072 : {
6073 131158 : auto poVRTSrc = std::make_unique<VRTSimpleSource>();
6074 65579 : poVRTSrc->SetResampling(pszResampling);
6075 65579 : poVRTDS->AddBand(eWrkDataType);
6076 : auto poVRTBand = static_cast<VRTSourcedRasterBand *>(
6077 65579 : poVRTDS->GetRasterBand(iBand + 1));
6078 :
6079 65579 : auto poSrcBand = papoSrcBands[iBand];
6080 65579 : if (iSrcOverview != -1)
6081 24 : poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
6082 65579 : poVRTBand->ConfigureSource(
6083 : poVRTSrc.get(), poSrcBand, false, 0, 0, nSrcWidth,
6084 : nSrcHeight, 0, 0, nDstTotalWidth, nDstTotalHeight);
6085 : // Add the source to the band
6086 65579 : poVRTBand->AddSource(poVRTSrc.release());
6087 65579 : if (abHasNoData[iBand])
6088 3 : poVRTBand->SetNoDataValue(adfNoDataValue[iBand]);
6089 : }
6090 :
6091 42 : if (papoSrcBands[0]->GetMaskFlags() == GMF_PER_DATASET &&
6092 1 : poVRTDS->CreateMaskBand(GMF_PER_DATASET) == CE_None)
6093 : {
6094 : VRTSourcedRasterBand *poMaskVRTBand =
6095 1 : cpl::down_cast<VRTSourcedRasterBand *>(
6096 1 : poVRTDS->GetRasterBand(1)->GetMaskBand());
6097 1 : auto poSrcBand = papoSrcBands[0];
6098 1 : if (iSrcOverview != -1)
6099 0 : poSrcBand = papapoOverviewBands[0][iSrcOverview];
6100 1 : poMaskVRTBand->AddMaskBandSource(
6101 1 : poSrcBand->GetMaskBand(), 0, 0, nSrcWidth, nSrcHeight,
6102 : 0, 0, nDstTotalWidth, nDstTotalHeight);
6103 : }
6104 :
6105 41 : return poVRTDS;
6106 43 : };
6107 :
6108 : // If the overview accommodates chunking, do so and recurse
6109 : // to avoid generating full size temporary files
6110 43 : if (!bOverflowFullResXChunkYChunkQueried &&
6111 39 : !bTmpDSMemRequirementOverflow && !bChunkSizeOverflow &&
6112 39 : (nDstChunkXSize < nDstWidth || nDstChunkYSize < nDstHeight))
6113 : {
6114 : // Create a VRT with the smaller chunk to do the scaling
6115 : auto poVRTDS =
6116 13 : CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
6117 :
6118 13 : std::vector<GDALRasterBand *> apoVRTBand(nBands);
6119 13 : std::vector<GDALRasterBand *> apoDstBand(nBands);
6120 65560 : for (int iBand = 0; iBand < nBands; ++iBand)
6121 : {
6122 65547 : apoDstBand[iBand] = papapoOverviewBands[iBand][iOverview];
6123 65547 : apoVRTBand[iBand] = poVRTDS->GetRasterBand(iBand + 1);
6124 : }
6125 :
6126 : // Use a flag to avoid reading from the overview being built
6127 : GDALRasterIOExtraArg sExtraArg;
6128 13 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
6129 13 : if (iSrcOverview == -1)
6130 13 : sExtraArg.bUseOnlyThisScale = true;
6131 :
6132 : // A single band buffer for data transfer to the overview
6133 13 : std::vector<GByte> abyChunk;
6134 : try
6135 : {
6136 13 : abyChunk.resize(nChunkSize);
6137 : }
6138 0 : catch (const std::exception &)
6139 : {
6140 0 : CPLError(CE_Failure, CPLE_OutOfMemory,
6141 : "Out of memory allocating temporary buffer");
6142 0 : return CE_Failure;
6143 : }
6144 :
6145 : // Loop over output height, in chunks
6146 13 : for (int nDstYOff = nDstYOffStart;
6147 38 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6148 : /* */)
6149 : {
6150 : const int nDstYCount =
6151 25 : std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
6152 : // Loop over output width, in output chunks
6153 25 : for (int nDstXOff = nDstXOffStart;
6154 74 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6155 : /* */)
6156 : {
6157 : const int nDstXCount =
6158 49 : std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
6159 : // Read and transfer the chunk to the overview
6160 98 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
6161 : ++iBand)
6162 : {
6163 98 : eErr = apoVRTBand[iBand]->RasterIO(
6164 : GF_Read, nDstXOff, nDstYOff, nDstXCount,
6165 49 : nDstYCount, abyChunk.data(), nDstXCount,
6166 : nDstYCount, eDataType, 0, 0, &sExtraArg);
6167 49 : if (eErr == CE_None)
6168 : {
6169 96 : eErr = apoDstBand[iBand]->RasterIO(
6170 : GF_Write, nDstXOff, nDstYOff, nDstXCount,
6171 48 : nDstYCount, abyChunk.data(), nDstXCount,
6172 : nDstYCount, eDataType, 0, 0, nullptr);
6173 : }
6174 : }
6175 :
6176 49 : dfCurPixelCount +=
6177 49 : static_cast<double>(nDstXCount) * nDstYCount;
6178 :
6179 49 : nDstXOff += nDstXCount;
6180 : } // width
6181 :
6182 25 : if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount,
6183 : nullptr, pProgressData))
6184 : {
6185 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
6186 : "User terminated");
6187 0 : eErr = CE_Failure;
6188 : }
6189 :
6190 25 : nDstYOff += nDstYCount;
6191 : } // height
6192 :
6193 13 : if (CE_None != eErr)
6194 : {
6195 1 : CPLError(CE_Failure, CPLE_AppDefined,
6196 : "Error while writing overview");
6197 1 : return CE_Failure;
6198 : }
6199 :
6200 12 : pfnProgress(1.0, nullptr, pProgressData);
6201 : // Flush the overviews we just generated
6202 24 : for (int iBand = 0; iBand < nBands; ++iBand)
6203 12 : apoDstBand[iBand]->FlushCache(false);
6204 :
6205 12 : continue; // Next overview
6206 : } // chunking via temporary dataset
6207 :
6208 0 : std::unique_ptr<GDALDataset> poTmpDS;
6209 : // Config option mostly/only for autotest purposes
6210 : const char *pszGDAL_OVR_TEMP_DRIVER =
6211 30 : CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
6212 30 : if ((!bTmpDSMemRequirementOverflow &&
6213 4 : nTmpDSMemRequirement <= nChunkMaxSizeForTempFile &&
6214 4 : !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
6215 26 : EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
6216 : {
6217 10 : auto poTmpDrv = GetGDALDriverManager()->GetDriverByName("MEM");
6218 10 : if (!poTmpDrv)
6219 : {
6220 0 : eErr = CE_Failure;
6221 0 : break;
6222 : }
6223 10 : poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
6224 : nDstTotalHeight, nBands,
6225 10 : eDataType, nullptr));
6226 : }
6227 : else
6228 : {
6229 : // Create a temporary file for the overview
6230 : auto poTmpDrv =
6231 20 : GetGDALDriverManager()->GetDriverByName("GTiff");
6232 20 : if (!poTmpDrv)
6233 : {
6234 0 : eErr = CE_Failure;
6235 0 : break;
6236 : }
6237 40 : std::string osTmpFilename;
6238 20 : auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
6239 20 : if (poDstDS)
6240 : {
6241 20 : osTmpFilename = poDstDS->GetDescription();
6242 : VSIStatBufL sStatBuf;
6243 20 : if (!osTmpFilename.empty() &&
6244 0 : VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
6245 0 : osTmpFilename += "_tmp_ovr.tif";
6246 : }
6247 20 : if (osTmpFilename.empty())
6248 : {
6249 20 : osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
6250 20 : osTmpFilename += ".tif";
6251 : }
6252 20 : CPLDebug("GDAL", "Creating temporary file %s of %d x %d x %d",
6253 : osTmpFilename.c_str(), nDstWidth, nDstHeight, nBands);
6254 40 : CPLStringList aosCO;
6255 20 : if (0 == ((nReducedDstChunkXSize % GTIFF_BLOCK_SIZE_MULTIPLE) |
6256 20 : (nReducedDstChunkYSize % GTIFF_BLOCK_SIZE_MULTIPLE)))
6257 : {
6258 14 : aosCO.SetNameValue("TILED", "YES");
6259 : aosCO.SetNameValue("BLOCKXSIZE",
6260 14 : CPLSPrintf("%d", nReducedDstChunkXSize));
6261 : aosCO.SetNameValue("BLOCKYSIZE",
6262 14 : CPLSPrintf("%d", nReducedDstChunkYSize));
6263 : }
6264 20 : if (const char *pszCOList =
6265 20 : poTmpDrv->GetMetadataItem(GDAL_DMD_CREATIONOPTIONLIST))
6266 : {
6267 : aosCO.SetNameValue(
6268 20 : "COMPRESS", strstr(pszCOList, "ZSTD") ? "ZSTD" : "LZW");
6269 : }
6270 20 : poTmpDS.reset(poTmpDrv->Create(osTmpFilename.c_str(), nDstWidth,
6271 : nDstHeight, nBands, eDataType,
6272 20 : aosCO.List()));
6273 20 : if (poTmpDS)
6274 : {
6275 18 : poTmpDS->MarkSuppressOnClose();
6276 18 : VSIUnlink(osTmpFilename.c_str());
6277 : }
6278 : }
6279 30 : if (!poTmpDS)
6280 : {
6281 2 : eErr = CE_Failure;
6282 2 : break;
6283 : }
6284 :
6285 : // Create a full size VRT to do the resampling without edge effects
6286 : auto poVRTDS =
6287 28 : CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
6288 :
6289 : // Allocate a band buffer with the overview chunk size
6290 : std::unique_ptr<void, VSIFreeReleaser> pDstBuffer(
6291 : VSI_MALLOC3_VERBOSE(size_t(nWrkDataTypeSize), nDstChunkXSize,
6292 28 : nDstChunkYSize));
6293 28 : if (pDstBuffer == nullptr)
6294 : {
6295 0 : eErr = CE_Failure;
6296 0 : break;
6297 : }
6298 :
6299 : // Use a flag to avoid reading the overview being built
6300 : GDALRasterIOExtraArg sExtraArg;
6301 28 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
6302 28 : if (iSrcOverview == -1)
6303 4 : sExtraArg.bUseOnlyThisScale = true;
6304 :
6305 : // Scale and copy data from the VRT to the temp file
6306 28 : for (int nDstYOff = nDstYOffStart;
6307 914 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6308 : /* */)
6309 : {
6310 : const int nDstYCount =
6311 886 : std::min(nReducedDstChunkYSize, nDstYOffEnd - nDstYOff);
6312 886 : for (int nDstXOff = nDstXOffStart;
6313 201218 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6314 : /* */)
6315 : {
6316 : const int nDstXCount =
6317 200332 : std::min(nReducedDstChunkXSize, nDstXOffEnd - nDstXOff);
6318 400668 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
6319 : ++iBand)
6320 : {
6321 200336 : auto poSrcBand = poVRTDS->GetRasterBand(iBand + 1);
6322 200336 : eErr = poSrcBand->RasterIO(
6323 : GF_Read, nDstXOff, nDstYOff, nDstXCount, nDstYCount,
6324 : pDstBuffer.get(), nDstXCount, nDstYCount,
6325 : eWrkDataType, 0, 0, &sExtraArg);
6326 200336 : if (eErr == CE_None)
6327 : {
6328 : // Write to the temporary dataset, shifted
6329 200334 : auto poOvrBand = poTmpDS->GetRasterBand(iBand + 1);
6330 200334 : eErr = poOvrBand->RasterIO(
6331 : GF_Write, nDstXOff - nDstXOffStart,
6332 : nDstYOff - nDstYOffStart, nDstXCount,
6333 : nDstYCount, pDstBuffer.get(), nDstXCount,
6334 : nDstYCount, eWrkDataType, 0, 0, nullptr);
6335 : }
6336 : }
6337 200332 : nDstXOff += nDstXCount;
6338 : }
6339 886 : nDstYOff += nDstYCount;
6340 : }
6341 :
6342 : // Copy from the temporary to the overview
6343 28 : for (int nDstYOff = nDstYOffStart;
6344 54 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6345 : /* */)
6346 : {
6347 : const int nDstYCount =
6348 26 : std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
6349 26 : for (int nDstXOff = nDstXOffStart;
6350 52 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6351 : /* */)
6352 : {
6353 : const int nDstXCount =
6354 26 : std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
6355 56 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
6356 : ++iBand)
6357 : {
6358 30 : auto poSrcBand = poTmpDS->GetRasterBand(iBand + 1);
6359 30 : eErr = poSrcBand->RasterIO(
6360 : GF_Read, nDstXOff - nDstXOffStart,
6361 : nDstYOff - nDstYOffStart, nDstXCount, nDstYCount,
6362 : pDstBuffer.get(), nDstXCount, nDstYCount,
6363 : eWrkDataType, 0, 0, nullptr);
6364 30 : if (eErr == CE_None)
6365 : {
6366 : // Write to the destination overview bands
6367 30 : auto poOvrBand =
6368 30 : papapoOverviewBands[iBand][iOverview];
6369 30 : eErr = poOvrBand->RasterIO(
6370 : GF_Write, nDstXOff, nDstYOff, nDstXCount,
6371 : nDstYCount, pDstBuffer.get(), nDstXCount,
6372 : nDstYCount, eWrkDataType, 0, 0, nullptr);
6373 : }
6374 : }
6375 26 : nDstXOff += nDstXCount;
6376 : }
6377 26 : nDstYOff += nDstYCount;
6378 : }
6379 :
6380 28 : if (eErr != CE_None)
6381 : {
6382 2 : CPLError(CE_Failure, CPLE_AppDefined,
6383 : "Failed to write overview %d", iOverview);
6384 2 : return eErr;
6385 : }
6386 :
6387 : // Flush the data to overviews.
6388 56 : for (int iBand = 0; iBand < nBands; ++iBand)
6389 30 : papapoOverviewBands[iBand][iOverview]->FlushCache(false);
6390 :
6391 26 : continue;
6392 : }
6393 :
6394 : // Structure describing a resampling job
6395 : struct OvrJob
6396 : {
6397 : // Buffers to free when job is finished
6398 : std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
6399 : std::unique_ptr<PointerHolder> oSrcBufferHolder{};
6400 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
6401 :
6402 : GDALRasterBand *poDstBand = nullptr;
6403 :
6404 : // Input parameters of pfnResampleFn
6405 : GDALResampleFunction pfnResampleFn = nullptr;
6406 : GDALOverviewResampleArgs args{};
6407 : const void *pChunk = nullptr;
6408 :
6409 : // Output values of resampling function
6410 : CPLErr eErr = CE_Failure;
6411 : void *pDstBuffer = nullptr;
6412 : GDALDataType eDstBufferDataType = GDT_Unknown;
6413 :
6414 3280 : void NotifyFinished()
6415 : {
6416 6560 : std::lock_guard guard(mutex);
6417 3280 : bFinished = true;
6418 3280 : cv.notify_one();
6419 3280 : }
6420 :
6421 2 : bool IsFinished()
6422 : {
6423 2 : std::lock_guard guard(mutex);
6424 4 : return bFinished;
6425 : }
6426 :
6427 14 : void WaitFinished()
6428 : {
6429 28 : std::unique_lock oGuard(mutex);
6430 22 : while (!bFinished)
6431 : {
6432 8 : cv.wait(oGuard);
6433 : }
6434 14 : }
6435 :
6436 : private:
6437 : // Synchronization
6438 : bool bFinished = false;
6439 : std::mutex mutex{};
6440 : std::condition_variable cv{};
6441 : };
6442 :
6443 : // Thread function to resample
6444 3280 : const auto JobResampleFunc = [](void *pData)
6445 : {
6446 3280 : OvrJob *poJob = static_cast<OvrJob *>(pData);
6447 :
6448 3280 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
6449 : &(poJob->pDstBuffer),
6450 : &(poJob->eDstBufferDataType));
6451 :
6452 3280 : auto pDstBuffer = poJob->pDstBuffer;
6453 : poJob->oDstBufferHolder =
6454 3280 : std::make_unique<PointerHolder>(pDstBuffer);
6455 :
6456 3280 : poJob->NotifyFinished();
6457 3280 : };
6458 :
6459 : // Function to write resample data to target band
6460 3280 : const auto WriteJobData = [](const OvrJob *poJob)
6461 : {
6462 6560 : return poJob->poDstBand->RasterIO(
6463 3280 : GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
6464 3280 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
6465 3280 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
6466 3280 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
6467 3280 : poJob->args.nDstYOff2 - poJob->args.nDstYOff,
6468 3280 : poJob->eDstBufferDataType, 0, 0, nullptr);
6469 : };
6470 :
6471 : // Wait for completion of oldest job and serialize it
6472 : const auto WaitAndFinalizeOldestJob =
6473 14 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
6474 : {
6475 14 : auto poOldestJob = jobList.front().get();
6476 14 : poOldestJob->WaitFinished();
6477 14 : CPLErr l_eErr = poOldestJob->eErr;
6478 14 : if (l_eErr == CE_None)
6479 : {
6480 14 : l_eErr = WriteJobData(poOldestJob);
6481 : }
6482 :
6483 14 : jobList.pop_front();
6484 14 : return l_eErr;
6485 : };
6486 :
6487 : // Queue of jobs
6488 1192 : std::list<std::unique_ptr<OvrJob>> jobList;
6489 :
6490 1192 : std::vector<std::unique_ptr<void, VSIFreeReleaser>> apaChunk(nBands);
6491 : std::vector<std::unique_ptr<GByte, VSIFreeReleaser>>
6492 1192 : apabyChunkNoDataMask(nBands);
6493 :
6494 : // Iterate on destination overview, block by block.
6495 596 : for (int nDstYOff = nDstYOffStart;
6496 2097 : nDstYOff < nDstYOffEnd && eErr == CE_None;
6497 1501 : nDstYOff += nDstChunkYSize)
6498 : {
6499 : int nDstYCount;
6500 1501 : if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
6501 1082 : nDstYCount = nDstChunkYSize;
6502 : else
6503 419 : nDstYCount = nDstYOffEnd - nDstYOff;
6504 :
6505 1501 : int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
6506 1501 : int nChunkYOff2 = static_cast<int>(
6507 1501 : ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
6508 1501 : if (nChunkYOff2 > nSrcHeight ||
6509 1501 : nDstYOff + nDstYCount == nDstTotalHeight)
6510 589 : nChunkYOff2 = nSrcHeight;
6511 1501 : int nYCount = nChunkYOff2 - nChunkYOff;
6512 1501 : CPLAssert(nYCount <= nFullResYChunk);
6513 :
6514 1501 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
6515 1501 : int nChunkYSizeQueried =
6516 1501 : nYCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
6517 1501 : if (nChunkYOffQueried < 0)
6518 : {
6519 141 : nChunkYSizeQueried += nChunkYOffQueried;
6520 141 : nChunkYOffQueried = 0;
6521 : }
6522 1501 : if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
6523 140 : nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
6524 1501 : CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
6525 :
6526 1501 : if (!pfnProgress(std::min(1.0, dfCurPixelCount / dfTotalPixelCount),
6527 : nullptr, pProgressData))
6528 : {
6529 1 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6530 1 : eErr = CE_Failure;
6531 : }
6532 :
6533 : // Iterate on destination overview, block by block.
6534 1501 : for (int nDstXOff = nDstXOffStart;
6535 3041 : nDstXOff < nDstXOffEnd && eErr == CE_None;
6536 1540 : nDstXOff += nDstChunkXSize)
6537 : {
6538 1540 : int nDstXCount = 0;
6539 1540 : if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
6540 1523 : nDstXCount = nDstChunkXSize;
6541 : else
6542 17 : nDstXCount = nDstXOffEnd - nDstXOff;
6543 :
6544 1540 : dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
6545 :
6546 1540 : int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
6547 1540 : int nChunkXOff2 = static_cast<int>(
6548 1540 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
6549 1540 : if (nChunkXOff2 > nSrcWidth ||
6550 1540 : nDstXOff + nDstXCount == nDstTotalWidth)
6551 1465 : nChunkXOff2 = nSrcWidth;
6552 1540 : const int nXCount = nChunkXOff2 - nChunkXOff;
6553 1540 : CPLAssert(nXCount <= nFullResXChunk);
6554 :
6555 1540 : int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
6556 1540 : int nChunkXSizeQueried =
6557 1540 : nXCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
6558 1540 : if (nChunkXOffQueried < 0)
6559 : {
6560 201 : nChunkXSizeQueried += nChunkXOffQueried;
6561 201 : nChunkXOffQueried = 0;
6562 : }
6563 1540 : if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
6564 210 : nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
6565 1540 : CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
6566 : #if DEBUG_VERBOSE
6567 : CPLDebug("GDAL",
6568 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
6569 : nChunkXOffQueried, nChunkYOffQueried,
6570 : nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
6571 : nDstYOff, nDstXCount, nDstYCount);
6572 : #endif
6573 :
6574 : // Avoid accumulating too many tasks and exhaust RAM
6575 :
6576 : // Try to complete already finished jobs
6577 1542 : while (eErr == CE_None && !jobList.empty())
6578 : {
6579 2 : auto poOldestJob = jobList.front().get();
6580 2 : if (!poOldestJob->IsFinished())
6581 0 : break;
6582 2 : eErr = poOldestJob->eErr;
6583 2 : if (eErr == CE_None)
6584 : {
6585 2 : eErr = WriteJobData(poOldestJob);
6586 : }
6587 :
6588 2 : jobList.pop_front();
6589 : }
6590 :
6591 : // And in case we have saturated the number of threads,
6592 : // wait for completion of tasks to go below the threshold.
6593 3080 : while (eErr == CE_None &&
6594 1540 : jobList.size() >= static_cast<size_t>(nThreads))
6595 : {
6596 0 : eErr = WaitAndFinalizeOldestJob(jobList);
6597 : }
6598 :
6599 : // Read the source buffers for all the bands.
6600 4821 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6601 : {
6602 : // (Re)allocate buffers if needed
6603 3281 : if (apaChunk[iBand] == nullptr)
6604 : {
6605 1159 : apaChunk[iBand].reset(VSI_MALLOC3_VERBOSE(
6606 : nFullResXChunkQueried, nFullResYChunkQueried,
6607 : nWrkDataTypeSize));
6608 1159 : if (apaChunk[iBand] == nullptr)
6609 : {
6610 0 : eErr = CE_Failure;
6611 : }
6612 : }
6613 3598 : if (bUseNoDataMask &&
6614 317 : apabyChunkNoDataMask[iBand] == nullptr)
6615 : {
6616 266 : apabyChunkNoDataMask[iBand].reset(
6617 266 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
6618 : nFullResXChunkQueried, nFullResYChunkQueried)));
6619 266 : if (apabyChunkNoDataMask[iBand] == nullptr)
6620 : {
6621 0 : eErr = CE_Failure;
6622 : }
6623 : }
6624 :
6625 3281 : if (eErr == CE_None)
6626 : {
6627 3281 : GDALRasterBand *poSrcBand = nullptr;
6628 3281 : if (iSrcOverview == -1)
6629 2391 : poSrcBand = papoSrcBands[iBand];
6630 : else
6631 890 : poSrcBand =
6632 890 : papapoOverviewBands[iBand][iSrcOverview];
6633 3281 : eErr = poSrcBand->RasterIO(
6634 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6635 : nChunkXSizeQueried, nChunkYSizeQueried,
6636 3281 : apaChunk[iBand].get(), nChunkXSizeQueried,
6637 : nChunkYSizeQueried, eWrkDataType, 0, 0, nullptr);
6638 :
6639 3281 : if (bUseNoDataMask && eErr == CE_None)
6640 : {
6641 317 : auto poMaskBand = poSrcBand->IsMaskBand()
6642 317 : ? poSrcBand
6643 244 : : poSrcBand->GetMaskBand();
6644 317 : eErr = poMaskBand->RasterIO(
6645 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6646 : nChunkXSizeQueried, nChunkYSizeQueried,
6647 317 : apabyChunkNoDataMask[iBand].get(),
6648 : nChunkXSizeQueried, nChunkYSizeQueried,
6649 : GDT_UInt8, 0, 0, nullptr);
6650 : }
6651 : }
6652 : }
6653 :
6654 : // Compute the resulting overview block.
6655 4820 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6656 : {
6657 6560 : auto poJob = std::make_unique<OvrJob>();
6658 3280 : poJob->pfnResampleFn = pfnResampleFn;
6659 3280 : poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
6660 6560 : poJob->args.eOvrDataType =
6661 3280 : poJob->poDstBand->GetRasterDataType();
6662 3280 : poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
6663 3280 : poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
6664 3280 : const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
6665 3280 : "NBITS", "IMAGE_STRUCTURE");
6666 3280 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
6667 3280 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
6668 3280 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
6669 3280 : poJob->args.eWrkDataType = eWrkDataType;
6670 3280 : poJob->pChunk = apaChunk[iBand].get();
6671 3280 : poJob->args.pabyChunkNodataMask =
6672 3280 : apabyChunkNoDataMask[iBand].get();
6673 3280 : poJob->args.nChunkXOff = nChunkXOffQueried;
6674 3280 : poJob->args.nChunkXSize = nChunkXSizeQueried;
6675 3280 : poJob->args.nChunkYOff = nChunkYOffQueried;
6676 3280 : poJob->args.nChunkYSize = nChunkYSizeQueried;
6677 3280 : poJob->args.nDstXOff = nDstXOff;
6678 3280 : poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
6679 3280 : poJob->args.nDstYOff = nDstYOff;
6680 3280 : poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
6681 3280 : poJob->args.pszResampling = pszResampling;
6682 3280 : poJob->args.bHasNoData = abHasNoData[iBand];
6683 3280 : poJob->args.dfNoDataValue = adfNoDataValue[iBand];
6684 3280 : poJob->args.eSrcDataType = eDataType;
6685 3280 : poJob->args.bPropagateNoData = bPropagateNoData;
6686 :
6687 3280 : if (poJobQueue)
6688 : {
6689 16 : poJob->oSrcMaskBufferHolder =
6690 32 : std::make_unique<PointerHolder>(
6691 32 : std::move(apabyChunkNoDataMask[iBand]));
6692 :
6693 16 : poJob->oSrcBufferHolder =
6694 32 : std::make_unique<PointerHolder>(
6695 32 : std::move(apaChunk[iBand]));
6696 :
6697 16 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
6698 16 : jobList.emplace_back(std::move(poJob));
6699 : }
6700 : else
6701 : {
6702 3264 : JobResampleFunc(poJob.get());
6703 3264 : eErr = poJob->eErr;
6704 3264 : if (eErr == CE_None)
6705 : {
6706 3264 : eErr = WriteJobData(poJob.get());
6707 : }
6708 : }
6709 : }
6710 : }
6711 : }
6712 :
6713 : // Wait for all pending jobs to complete
6714 610 : while (!jobList.empty())
6715 : {
6716 14 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
6717 14 : if (l_eErr != CE_None && eErr == CE_None)
6718 0 : eErr = l_eErr;
6719 : }
6720 :
6721 : // Flush the data to overviews.
6722 1753 : for (int iBand = 0; iBand < nBands; ++iBand)
6723 : {
6724 1157 : if (papapoOverviewBands[iBand][iOverview]->FlushCache(false) !=
6725 : CE_None)
6726 0 : eErr = CE_Failure;
6727 : }
6728 : }
6729 :
6730 381 : if (eErr == CE_None)
6731 377 : pfnProgress(1.0, nullptr, pProgressData);
6732 :
6733 381 : return eErr;
6734 : }
6735 :
6736 : /************************************************************************/
6737 : /* GDALRegenerateOverviewsMultiBand() */
6738 : /************************************************************************/
6739 :
6740 : /**
6741 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
6742 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
6743 : *
6744 : * This function will generate one or more overview images from a base
6745 : * image using the requested downsampling algorithm. Its primary use
6746 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
6747 : * can also be used to generate downsampled images in one file from another
6748 : * outside the overview architecture.
6749 : *
6750 : * The output bands need to exist in advance and share the same characteristics
6751 : * (type, dimensions)
6752 : *
6753 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
6754 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
6755 : *
6756 : * It does not support color tables or complex data types.
6757 : *
6758 : * The pseudo-algorithm used by the function is :
6759 : * for each overview
6760 : * iterate on lines of the source by a step of deltay
6761 : * iterate on columns of the source by a step of deltax
6762 : * read the source data of size deltax * deltay for all the bands
6763 : * generate the corresponding overview block for all the bands
6764 : *
6765 : * This function will honour properly NODATA_VALUES tuples (special dataset
6766 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
6767 : * considered as the nodata value and not each value of the triplet
6768 : * independently per band.
6769 : *
6770 : * The GDAL_NUM_THREADS configuration option can be set
6771 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
6772 : * overview computation.
6773 : *
6774 : * @param apoSrcBands the list of source bands to downsample
6775 : * @param aapoOverviewBands bidimensional array of bands. First dimension is
6776 : * indexed by bands. Second dimension is indexed by
6777 : * overview levels. All aapoOverviewBands[i] arrays
6778 : * must have the same size (i.e. same number of
6779 : * overviews)
6780 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
6781 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
6782 : * @param pfnProgress progress report function.
6783 : * @param pProgressData progress function callback data.
6784 : * @param papszOptions NULL terminated list of options as
6785 : * key=value pairs, or NULL
6786 : * The XOFF, YOFF, XSIZE and YSIZE
6787 : * options can be specified to express that overviews should
6788 : * be regenerated only in the specified subset of the source
6789 : * dataset.
6790 : * @return CE_None on success or CE_Failure on failure.
6791 : * @since 3.10
6792 : */
6793 :
6794 19 : CPLErr GDALRegenerateOverviewsMultiBand(
6795 : const std::vector<GDALRasterBand *> &apoSrcBands,
6796 : const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
6797 : const char *pszResampling, GDALProgressFunc pfnProgress,
6798 : void *pProgressData, CSLConstList papszOptions)
6799 : {
6800 19 : CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
6801 29 : for (size_t i = 1; i < aapoOverviewBands.size(); ++i)
6802 : {
6803 10 : CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());
6804 : }
6805 :
6806 19 : if (aapoOverviewBands.empty())
6807 0 : return CE_None;
6808 :
6809 19 : std::vector<GDALRasterBand **> apapoOverviewBands;
6810 48 : for (auto &apoOverviewBands : aapoOverviewBands)
6811 : {
6812 : auto papoOverviewBands = static_cast<GDALRasterBand **>(
6813 29 : CPLMalloc(apoOverviewBands.size() * sizeof(GDALRasterBand *)));
6814 61 : for (size_t i = 0; i < apoOverviewBands.size(); ++i)
6815 : {
6816 32 : papoOverviewBands[i] = apoOverviewBands[i];
6817 : }
6818 29 : apapoOverviewBands.push_back(papoOverviewBands);
6819 : }
6820 38 : const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
6821 19 : static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
6822 19 : static_cast<int>(aapoOverviewBands[0].size()),
6823 19 : apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
6824 : papszOptions);
6825 48 : for (GDALRasterBand **papoOverviewBands : apapoOverviewBands)
6826 29 : CPLFree(papoOverviewBands);
6827 19 : return eErr;
6828 : }
6829 :
6830 : /************************************************************************/
6831 : /* GDALComputeBandStats() */
6832 : /************************************************************************/
6833 :
6834 : /** Undocumented
6835 : * @param hSrcBand undocumented.
6836 : * @param nSampleStep Step between scanlines used to compute statistics.
6837 : * When nSampleStep is equal to 1, all scanlines will
6838 : * be processed.
6839 : * @param pdfMean undocumented.
6840 : * @param pdfStdDev undocumented.
6841 : * @param pfnProgress undocumented.
6842 : * @param pProgressData undocumented.
6843 : * @return undocumented
6844 : */
6845 18 : CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
6846 : int nSampleStep, double *pdfMean,
6847 : double *pdfStdDev,
6848 : GDALProgressFunc pfnProgress,
6849 : void *pProgressData)
6850 :
6851 : {
6852 18 : VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
6853 :
6854 18 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
6855 :
6856 18 : if (pfnProgress == nullptr)
6857 18 : pfnProgress = GDALDummyProgress;
6858 :
6859 18 : const int nWidth = poSrcBand->GetXSize();
6860 18 : const int nHeight = poSrcBand->GetYSize();
6861 :
6862 18 : if (nSampleStep >= nHeight || nSampleStep < 1)
6863 5 : nSampleStep = 1;
6864 :
6865 18 : GDALDataType eWrkType = GDT_Unknown;
6866 18 : float *pafData = nullptr;
6867 18 : GDALDataType eType = poSrcBand->GetRasterDataType();
6868 18 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6869 18 : if (bComplex)
6870 : {
6871 : pafData = static_cast<float *>(
6872 0 : VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6873 0 : eWrkType = GDT_CFloat32;
6874 : }
6875 : else
6876 : {
6877 : pafData =
6878 18 : static_cast<float *>(VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6879 18 : eWrkType = GDT_Float32;
6880 : }
6881 :
6882 18 : if (nWidth == 0 || pafData == nullptr)
6883 : {
6884 0 : VSIFree(pafData);
6885 0 : return CE_Failure;
6886 : }
6887 :
6888 : /* -------------------------------------------------------------------- */
6889 : /* Loop over all sample lines. */
6890 : /* -------------------------------------------------------------------- */
6891 18 : double dfSum = 0.0;
6892 18 : double dfSum2 = 0.0;
6893 18 : int iLine = 0;
6894 18 : GIntBig nSamples = 0;
6895 :
6896 2143 : do
6897 : {
6898 2161 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6899 : pProgressData))
6900 : {
6901 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6902 0 : CPLFree(pafData);
6903 0 : return CE_Failure;
6904 : }
6905 :
6906 : const CPLErr eErr =
6907 2161 : poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
6908 : 1, eWrkType, 0, 0, nullptr);
6909 2161 : if (eErr != CE_None)
6910 : {
6911 1 : CPLFree(pafData);
6912 1 : return eErr;
6913 : }
6914 :
6915 725208 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6916 : {
6917 723048 : float fValue = 0.0f;
6918 :
6919 723048 : if (bComplex)
6920 : {
6921 : // Compute the magnitude of the complex value.
6922 : fValue =
6923 0 : std::hypot(pafData[static_cast<size_t>(iPixel) * 2],
6924 0 : pafData[static_cast<size_t>(iPixel) * 2 + 1]);
6925 : }
6926 : else
6927 : {
6928 723048 : fValue = pafData[iPixel];
6929 : }
6930 :
6931 723048 : dfSum += static_cast<double>(fValue);
6932 723048 : dfSum2 += static_cast<double>(fValue) * static_cast<double>(fValue);
6933 : }
6934 :
6935 2160 : nSamples += nWidth;
6936 2160 : iLine += nSampleStep;
6937 2160 : } while (iLine < nHeight);
6938 :
6939 17 : if (!pfnProgress(1.0, nullptr, pProgressData))
6940 : {
6941 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6942 0 : CPLFree(pafData);
6943 0 : return CE_Failure;
6944 : }
6945 :
6946 : /* -------------------------------------------------------------------- */
6947 : /* Produce the result values. */
6948 : /* -------------------------------------------------------------------- */
6949 17 : if (pdfMean != nullptr)
6950 17 : *pdfMean = dfSum / nSamples;
6951 :
6952 17 : if (pdfStdDev != nullptr)
6953 : {
6954 17 : const double dfMean = dfSum / nSamples;
6955 :
6956 17 : *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
6957 : }
6958 :
6959 17 : CPLFree(pafData);
6960 :
6961 17 : return CE_None;
6962 : }
6963 :
6964 : /************************************************************************/
6965 : /* GDALOverviewMagnitudeCorrection() */
6966 : /* */
6967 : /* Correct the mean and standard deviation of the overviews of */
6968 : /* the given band to match the base layer approximately. */
6969 : /************************************************************************/
6970 :
6971 : /** Undocumented
6972 : * @param hBaseBand undocumented.
6973 : * @param nOverviewCount undocumented.
6974 : * @param pahOverviews undocumented.
6975 : * @param pfnProgress undocumented.
6976 : * @param pProgressData undocumented.
6977 : * @return undocumented
6978 : */
6979 0 : CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
6980 : int nOverviewCount,
6981 : GDALRasterBandH *pahOverviews,
6982 : GDALProgressFunc pfnProgress,
6983 : void *pProgressData)
6984 :
6985 : {
6986 0 : VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
6987 :
6988 : /* -------------------------------------------------------------------- */
6989 : /* Compute mean/stddev for source raster. */
6990 : /* -------------------------------------------------------------------- */
6991 0 : double dfOrigMean = 0.0;
6992 0 : double dfOrigStdDev = 0.0;
6993 : {
6994 : const CPLErr eErr =
6995 0 : GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
6996 : pfnProgress, pProgressData);
6997 :
6998 0 : if (eErr != CE_None)
6999 0 : return eErr;
7000 : }
7001 :
7002 : /* -------------------------------------------------------------------- */
7003 : /* Loop on overview bands. */
7004 : /* -------------------------------------------------------------------- */
7005 0 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
7006 : {
7007 : GDALRasterBand *poOverview =
7008 0 : GDALRasterBand::FromHandle(pahOverviews[iOverview]);
7009 : double dfOverviewMean, dfOverviewStdDev;
7010 :
7011 : const CPLErr eErr =
7012 0 : GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
7013 : &dfOverviewStdDev, pfnProgress, pProgressData);
7014 :
7015 0 : if (eErr != CE_None)
7016 0 : return eErr;
7017 :
7018 0 : double dfGain = 1.0;
7019 0 : if (dfOrigStdDev >= 0.0001)
7020 0 : dfGain = dfOrigStdDev / dfOverviewStdDev;
7021 :
7022 : /* --------------------------------------------------------------------
7023 : */
7024 : /* Apply gain and offset. */
7025 : /* --------------------------------------------------------------------
7026 : */
7027 0 : const int nWidth = poOverview->GetXSize();
7028 0 : const int nHeight = poOverview->GetYSize();
7029 :
7030 0 : GDALDataType eWrkType = GDT_Unknown;
7031 0 : float *pafData = nullptr;
7032 0 : const GDALDataType eType = poOverview->GetRasterDataType();
7033 0 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
7034 0 : if (bComplex)
7035 : {
7036 : pafData = static_cast<float *>(
7037 0 : VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
7038 0 : eWrkType = GDT_CFloat32;
7039 : }
7040 : else
7041 : {
7042 : pafData = static_cast<float *>(
7043 0 : VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
7044 0 : eWrkType = GDT_Float32;
7045 : }
7046 :
7047 0 : if (pafData == nullptr)
7048 : {
7049 0 : return CE_Failure;
7050 : }
7051 :
7052 0 : for (int iLine = 0; iLine < nHeight; ++iLine)
7053 : {
7054 0 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
7055 : pProgressData))
7056 : {
7057 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
7058 0 : CPLFree(pafData);
7059 0 : return CE_Failure;
7060 : }
7061 :
7062 0 : if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
7063 : nWidth, 1, eWrkType, 0, 0,
7064 0 : nullptr) != CE_None)
7065 : {
7066 0 : CPLFree(pafData);
7067 0 : return CE_Failure;
7068 : }
7069 :
7070 0 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
7071 : {
7072 0 : if (bComplex)
7073 : {
7074 0 : pafData[static_cast<size_t>(iPixel) * 2] *=
7075 0 : static_cast<float>(dfGain);
7076 0 : pafData[static_cast<size_t>(iPixel) * 2 + 1] *=
7077 0 : static_cast<float>(dfGain);
7078 : }
7079 : else
7080 : {
7081 0 : pafData[iPixel] = static_cast<float>(
7082 0 : (double(pafData[iPixel]) - dfOverviewMean) * dfGain +
7083 : dfOrigMean);
7084 : }
7085 : }
7086 :
7087 0 : if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
7088 : nWidth, 1, eWrkType, 0, 0,
7089 0 : nullptr) != CE_None)
7090 : {
7091 0 : CPLFree(pafData);
7092 0 : return CE_Failure;
7093 : }
7094 : }
7095 :
7096 0 : if (!pfnProgress(1.0, nullptr, pProgressData))
7097 : {
7098 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
7099 0 : CPLFree(pafData);
7100 0 : return CE_Failure;
7101 : }
7102 :
7103 0 : CPLFree(pafData);
7104 : }
7105 :
7106 0 : return CE_None;
7107 : }
|