Line data Source code
1 :
2 : /******************************************************************************
3 : *
4 : * Project: GDAL Core
5 : * Purpose: Helper code to implement overview support in different drivers.
6 : * Author: Frank Warmerdam, warmerdam@pobox.com
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 2000, Frank Warmerdam
10 : * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * SPDX-License-Identifier: MIT
13 : ****************************************************************************/
14 :
15 : #include "cpl_port.h"
16 : #include "gdal_priv.h"
17 :
18 : #include <cmath>
19 : #include <cstddef>
20 : #include <cstdlib>
21 :
22 : #include <algorithm>
23 : #include <complex>
24 : #include <condition_variable>
25 : #include <limits>
26 : #include <list>
27 : #include <memory>
28 : #include <mutex>
29 : #include <vector>
30 :
31 : #include "cpl_conv.h"
32 : #include "cpl_error.h"
33 : #include "cpl_float.h"
34 : #include "cpl_progress.h"
35 : #include "cpl_vsi.h"
36 : #include "gdal.h"
37 : #include "gdal_thread_pool.h"
38 : #include "gdalwarper.h"
39 : #include "gdal_vrt.h"
40 : #include "vrtdataset.h"
41 :
42 : #ifdef USE_NEON_OPTIMIZATIONS
43 : #include "include_sse2neon.h"
44 : #define USE_SSE2
45 :
46 : #include "gdalsse_priv.h"
47 :
48 : // Restrict to 64bit processors because they are guaranteed to have SSE2,
49 : // or if __AVX2__ is defined.
50 : #elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
51 : #define USE_SSE2
52 :
53 : #include "gdalsse_priv.h"
54 :
55 : #ifdef __SSE3__
56 : #include <pmmintrin.h>
57 : #endif
58 : #ifdef __SSSE3__
59 : #include <tmmintrin.h>
60 : #endif
61 : #ifdef __SSE4_1__
62 : #include <smmintrin.h>
63 : #endif
64 : #ifdef __AVX2__
65 : #include <immintrin.h>
66 : #endif
67 :
68 : #endif
69 :
70 : // To be included after above USE_SSE2 and include gdalsse_priv.h
71 : // to avoid build issue on Windows x86
72 : #include "gdal_priv_templates.hpp"
73 :
74 : /************************************************************************/
75 : /* GDALResampleChunk_Near() */
76 : /************************************************************************/
77 :
78 : template <class T>
79 1233 : static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
80 : const T *pChunk, T **ppDstBuffer)
81 :
82 : {
83 1233 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
84 1233 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
85 1233 : const GDALDataType eWrkDataType = args.eWrkDataType;
86 1233 : const int nChunkXOff = args.nChunkXOff;
87 1233 : const int nChunkXSize = args.nChunkXSize;
88 1233 : const int nChunkYOff = args.nChunkYOff;
89 1233 : const int nDstXOff = args.nDstXOff;
90 1233 : const int nDstXOff2 = args.nDstXOff2;
91 1233 : const int nDstYOff = args.nDstYOff;
92 1233 : const int nDstYOff2 = args.nDstYOff2;
93 1233 : const int nDstXWidth = nDstXOff2 - nDstXOff;
94 :
95 : /* -------------------------------------------------------------------- */
96 : /* Allocate buffers. */
97 : /* -------------------------------------------------------------------- */
98 1233 : *ppDstBuffer = static_cast<T *>(
99 1233 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
100 : GDALGetDataTypeSizeBytes(eWrkDataType)));
101 1233 : if (*ppDstBuffer == nullptr)
102 : {
103 0 : return CE_Failure;
104 : }
105 1233 : T *const pDstBuffer = *ppDstBuffer;
106 :
107 : int *panSrcXOff =
108 1233 : static_cast<int *>(VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(int)));
109 :
110 1233 : if (panSrcXOff == nullptr)
111 : {
112 0 : return CE_Failure;
113 : }
114 :
115 : /* ==================================================================== */
116 : /* Precompute inner loop constants. */
117 : /* ==================================================================== */
118 842009 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
119 : {
120 840776 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
121 840776 : if (nSrcXOff < nChunkXOff)
122 0 : nSrcXOff = nChunkXOff;
123 :
124 840776 : panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
125 : }
126 :
127 : /* ==================================================================== */
128 : /* Loop over destination scanlines. */
129 : /* ==================================================================== */
130 141825 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
131 : {
132 140592 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
133 140592 : if (nSrcYOff < nChunkYOff)
134 0 : nSrcYOff = nChunkYOff;
135 :
136 140592 : const T *const pSrcScanline =
137 : pChunk +
138 140592 : (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
139 138074 : nChunkXOff;
140 :
141 : /* --------------------------------------------------------------------
142 : */
143 : /* Loop over destination pixels */
144 : /* --------------------------------------------------------------------
145 : */
146 140592 : T *pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
147 119627130 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
148 : {
149 119486612 : pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
150 : }
151 : }
152 :
153 1233 : CPLFree(panSrcXOff);
154 :
155 1233 : return CE_None;
156 : }
157 :
158 1233 : static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
159 : const void *pChunk, void **ppDstBuffer,
160 : GDALDataType *peDstBufferDataType)
161 : {
162 1233 : *peDstBufferDataType = args.eWrkDataType;
163 1233 : switch (args.eWrkDataType)
164 : {
165 : // For nearest resampling, as no computation is done, only the
166 : // size of the data type matters.
167 1081 : case GDT_Byte:
168 : case GDT_Int8:
169 : {
170 1081 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 1);
171 1081 : return GDALResampleChunk_NearT(
172 : args, static_cast<const uint8_t *>(pChunk),
173 1081 : reinterpret_cast<uint8_t **>(ppDstBuffer));
174 : }
175 :
176 50 : case GDT_Int16:
177 : case GDT_UInt16:
178 : case GDT_Float16:
179 : {
180 50 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
181 50 : return GDALResampleChunk_NearT(
182 : args, static_cast<const uint16_t *>(pChunk),
183 50 : reinterpret_cast<uint16_t **>(ppDstBuffer));
184 : }
185 :
186 55 : case GDT_CInt16:
187 : case GDT_CFloat16:
188 : case GDT_Int32:
189 : case GDT_UInt32:
190 : case GDT_Float32:
191 : {
192 55 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
193 55 : return GDALResampleChunk_NearT(
194 : args, static_cast<const uint32_t *>(pChunk),
195 55 : reinterpret_cast<uint32_t **>(ppDstBuffer));
196 : }
197 :
198 43 : case GDT_CInt32:
199 : case GDT_CFloat32:
200 : case GDT_Int64:
201 : case GDT_UInt64:
202 : case GDT_Float64:
203 : {
204 43 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
205 43 : return GDALResampleChunk_NearT(
206 : args, static_cast<const uint64_t *>(pChunk),
207 43 : reinterpret_cast<uint64_t **>(ppDstBuffer));
208 : }
209 :
210 4 : case GDT_CFloat64:
211 : {
212 4 : return GDALResampleChunk_NearT(
213 : args, static_cast<const std::complex<double> *>(pChunk),
214 4 : reinterpret_cast<std::complex<double> **>(ppDstBuffer));
215 : }
216 :
217 0 : case GDT_Unknown:
218 : case GDT_TypeCount:
219 0 : break;
220 : }
221 0 : CPLAssert(false);
222 : return CE_Failure;
223 : }
224 :
225 : namespace
226 : {
227 :
228 : // Find in the color table the entry whose RGB value is the closest
229 : // (using quadratic distance) to the test color, ignoring transparent entries.
230 3837 : int BestColorEntry(const std::vector<GDALColorEntry> &entries,
231 : const GDALColorEntry &test)
232 : {
233 3837 : int nMinDist = std::numeric_limits<int>::max();
234 3837 : size_t bestEntry = 0;
235 986109 : for (size_t i = 0; i < entries.size(); ++i)
236 : {
237 982272 : const GDALColorEntry &entry = entries[i];
238 : // Ignore transparent entries
239 982272 : if (entry.c4 == 0)
240 3237 : continue;
241 :
242 979035 : int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
243 979035 : ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
244 979035 : ((test.c3 - entry.c3) * (test.c3 - entry.c3));
245 979035 : if (nDist < nMinDist)
246 : {
247 15847 : nMinDist = nDist;
248 15847 : bestEntry = i;
249 : }
250 : }
251 3837 : return static_cast<int>(bestEntry);
252 : }
253 :
254 7 : std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
255 : int &transparentIdx)
256 : {
257 7 : std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
258 :
259 7 : transparentIdx = -1;
260 7 : int i = 0;
261 1799 : for (auto &entry : entries)
262 : {
263 1792 : table.GetColorEntryAsRGB(i, &entry);
264 1792 : if (transparentIdx < 0 && entry.c4 == 0)
265 1 : transparentIdx = i;
266 1792 : ++i;
267 : }
268 7 : return entries;
269 : }
270 :
271 : } // unnamed namespace
272 :
273 : /************************************************************************/
274 : /* SQUARE() */
275 : /************************************************************************/
276 :
// Return val * val. The first operand is converted to Tsquare before the
// multiplication, so a wider Tsquare can be requested to hold the product.
template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
{
    const Tsquare widened = static_cast<Tsquare>(val);
    return widened * val;
}
281 :
282 : /************************************************************************/
283 : /* ComputeIntegerRMS() */
284 : /************************************************************************/
// Integer root-mean-square: returns the integer r that minimizes
// abs(r**2 - sumSquares / weight).
//
// T is the returned type, Twork the type in which 2 * r * (r + 1) + 1 is
// evaluated (it must be wide enough to hold that value).
template <class T, class Twork>
inline T ComputeIntegerRMS(double sumSquares, double weight)
{
    const double meanSquare = sumSquares / weight;

    // Start from the truncated square root...
    const T floorRoot = static_cast<T>(std::sqrt(meanSquare));

    // ... and decide whether floorRoot**2 or (floorRoot+1)**2 is the closest
    // to meanSquare. The naive formulation would be:
    //   weight * (r+1)**2 - sumSquares < sumSquares - weight * r**2
    // which simplifies to 2 * r * (r + 1) + 1 < 2 * meanSquare.
    const auto twiceProductPlusOne =
        static_cast<Twork>(2) * floorRoot * (floorRoot + 1) + 1;
    const bool roundUp =
        static_cast<double>(twiceProductPlusOne) < 2 * meanSquare;

    return roundUp ? static_cast<T>(floorRoot + 1) : floorRoot;
}
301 :
302 0 : template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
303 : {
304 0 : CPLAssert(false);
305 : return 0;
306 : }
307 :
308 24 : template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
309 : {
310 : // It has been verified that given the correction on rms below, using
311 : // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
312 : // is equivalent, so use the former as it is used twice.
313 24 : const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
314 24 : const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
315 24 : GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
316 :
317 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
318 : // Naive version:
319 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
320 : // Optimized version for integer case and weight == 4
321 24 : if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
322 5 : rms += 1;
323 24 : return rms;
324 : }
325 :
326 : template <>
327 20 : inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
328 : {
329 20 : const double sumDivWeight = sumSquares * 0.25;
330 20 : GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
331 :
332 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
333 : // Naive version:
334 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
335 : // Optimized version for integer case and weight == 4
336 20 : if (static_cast<GUInt32>(rms) * (rms + 1) <
337 20 : static_cast<GUInt32>(sumDivWeight + 0.25))
338 4 : rms += 1;
339 20 : return rms;
340 : }
341 :
342 : #ifdef USE_SSE2
343 :
344 : /************************************************************************/
345 : /* QuadraticMeanByteSSE2OrAVX2() */
346 : /************************************************************************/
347 :
348 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
349 : #define sse2_packus_epi32 _mm_packus_epi32
350 : #else
351 516119 : inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
352 : {
353 516119 : const auto minus32768_32 = _mm_set1_epi32(-32768);
354 516119 : const auto minus32768_16 = _mm_set1_epi16(-32768);
355 516119 : a = _mm_add_epi32(a, minus32768_32);
356 516119 : b = _mm_add_epi32(b, minus32768_32);
357 516119 : a = _mm_packs_epi32(a, b);
358 516119 : a = _mm_sub_epi16(a, minus32768_16);
359 516119 : return a;
360 : }
361 : #endif
362 :
363 : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
364 : #define sse2_hadd_epi16 _mm_hadd_epi16
365 : #else
366 4669030 : inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
367 : {
368 : // Horizontal addition of adjacent pairs
369 4669030 : const auto mask = _mm_set1_epi32(0xFFFF);
370 : const auto horizLo =
371 14007100 : _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
372 : const auto horizHi =
373 14007100 : _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
374 :
375 : // Recombine low and high parts
376 4669030 : return _mm_packs_epi32(horizLo, horizHi);
377 : }
378 : #endif
379 :
// Vector-width-agnostic aliases used by the Byte kernels below
// (QuadraticMeanByteSSE2OrAVX2() and AverageByteSSE2OrAVX2()).
// With __AVX2__ each alias maps to the 256-bit intrinsic, otherwise to the
// 128-bit one (or to the sse2_* emulation helpers defined above).
// DEST_ELTS is the number of destination elements produced per loop
// iteration by those kernels: 16 with AVX2, 8 otherwise.
// store_lo(x, y) writes to x the part of register y that holds the results:
// 64 bits in the 128-bit case, 128 bits in the AVX2 case.
// zeroupper() expands to _mm256_zeroupper() with AVX2 and to a no-op
// otherwise.
380 : #ifdef __AVX2__
381 :
382 : #define DEST_ELTS 16
383 : #define set1_epi16 _mm256_set1_epi16
384 : #define set1_epi32 _mm256_set1_epi32
385 : #define setzero _mm256_setzero_si256
386 : #define set1_ps _mm256_set1_ps
387 : #define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
388 : #define unpacklo_epi8 _mm256_unpacklo_epi8
389 : #define unpackhi_epi8 _mm256_unpackhi_epi8
390 : #define madd_epi16 _mm256_madd_epi16
391 : #define add_epi32 _mm256_add_epi32
392 : #define mul_ps _mm256_mul_ps
393 : #define cvtepi32_ps _mm256_cvtepi32_ps
394 : #define sqrt_ps _mm256_sqrt_ps
395 : #define cvttps_epi32 _mm256_cvttps_epi32
396 : #define packs_epi32 _mm256_packs_epi32
397 : #define packus_epi32 _mm256_packus_epi32
398 : #define srli_epi32 _mm256_srli_epi32
399 : #define mullo_epi16 _mm256_mullo_epi16
400 : #define srli_epi16 _mm256_srli_epi16
401 : #define cmpgt_epi16 _mm256_cmpgt_epi16
402 : #define add_epi16 _mm256_add_epi16
403 : #define sub_epi16 _mm256_sub_epi16
404 : #define packus_epi16 _mm256_packus_epi16
405 : /* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
406 : /* to get the lower 128-bit bits of what would be a true 256-bit vector register
407 : */
408 : #define store_lo(x, y) \
409 : _mm_storeu_si128(reinterpret_cast<__m128i *>(x), \
410 : _mm256_extracti128_si256( \
411 : _mm256_permute4x64_epi64((y), 0 | (2 << 2)), 0))
412 : #define hadd_epi16 _mm256_hadd_epi16
413 : #define zeroupper() _mm256_zeroupper()
414 : #else
415 : #define DEST_ELTS 8
416 : #define set1_epi16 _mm_set1_epi16
417 : #define set1_epi32 _mm_set1_epi32
418 : #define setzero _mm_setzero_si128
419 : #define set1_ps _mm_set1_ps
420 : #define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
421 : #define unpacklo_epi8 _mm_unpacklo_epi8
422 : #define unpackhi_epi8 _mm_unpackhi_epi8
423 : #define madd_epi16 _mm_madd_epi16
424 : #define add_epi32 _mm_add_epi32
425 : #define mul_ps _mm_mul_ps
426 : #define cvtepi32_ps _mm_cvtepi32_ps
427 : #define sqrt_ps _mm_sqrt_ps
428 : #define cvttps_epi32 _mm_cvttps_epi32
429 : #define packs_epi32 _mm_packs_epi32
430 : #define packus_epi32 sse2_packus_epi32
431 : #define srli_epi32 _mm_srli_epi32
432 : #define mullo_epi16 _mm_mullo_epi16
433 : #define srli_epi16 _mm_srli_epi16
434 : #define cmpgt_epi16 _mm_cmpgt_epi16
435 : #define add_epi16 _mm_add_epi16
436 : #define sub_epi16 _mm_sub_epi16
437 : #define packus_epi16 _mm_packus_epi16
438 : #define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
439 : #define hadd_epi16 sse2_hadd_epi16
440 : #define zeroupper() (void)0
441 : #endif
442 :
443 : #if defined(__GNUC__) && defined(__AVX2__)
444 : // Disabling inlining works around a bug with gcc 9.3 (Ubuntu 20.04) in
445 : // -O2 -mavx2 mode in QuadraticMeanFloatSSE2(),
446 : // where the register that contains minus_zero is correctly
447 : // loaded the first time the function is called (looking at the disassembly,
448 : // one sees it is loaded much earlier than the function), but gets corrupted
449 : // (zeroed) in following iterations.
450 : // It appears the bug is due to the explicit zeroupper() call at the end of
451 : // the function.
452 : // The bug is at least solved in gcc 10.2.
453 : // Inlining doesn't bring much here to performance.
454 : // This is also needed with gcc 9.3 on QuadraticMeanByteSSE2OrAVX2() in
455 : // -O3 -mavx2 mode
456 : #define NOINLINE __attribute__((noinline))
457 : #else
458 : #define NOINLINE
459 : #endif
460 :
461 : template <class T>
462 : static int NOINLINE
463 5385 : QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
464 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
465 : T *CPL_RESTRICT pDstScanline)
466 : {
467 : // Optimized implementation for RMS on Byte by
468 : // processing by group of 8 output pixels, so as to use
469 : // a single _mm_sqrt_ps() call for 4 output pixels
470 5385 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
471 :
472 5385 : int iDstPixel = 0;
473 5385 : const auto one16 = set1_epi16(1);
474 5385 : const auto one32 = set1_epi32(1);
475 5385 : const auto zero = setzero();
476 5385 : const auto minus32768 = set1_epi16(-32768);
477 :
478 521496 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
479 : {
480 : // Load 2 * DEST_ELTS bytes from each line
481 516111 : auto firstLine = loadu_int(pSrcScanlineShifted);
482 1032220 : auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
483 : // Extend those Bytes as UInt16s
484 516111 : auto firstLineLo = unpacklo_epi8(firstLine, zero);
485 516111 : auto firstLineHi = unpackhi_epi8(firstLine, zero);
486 516111 : auto secondLineLo = unpacklo_epi8(secondLine, zero);
487 516111 : auto secondLineHi = unpackhi_epi8(secondLine, zero);
488 :
489 : // Multiplication of 16 bit values and horizontal
490 : // addition of 32 bit results
491 : // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
492 516111 : firstLineLo = madd_epi16(firstLineLo, firstLineLo);
493 516111 : firstLineHi = madd_epi16(firstLineHi, firstLineHi);
494 516111 : secondLineLo = madd_epi16(secondLineLo, secondLineLo);
495 516111 : secondLineHi = madd_epi16(secondLineHi, secondLineHi);
496 :
497 : // Vertical addition
498 516111 : const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
499 516111 : const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
500 :
501 : const auto sumSquaresPlusOneDiv4Lo =
502 1032220 : srli_epi32(add_epi32(sumSquaresLo, one32), 2);
503 : const auto sumSquaresPlusOneDiv4Hi =
504 1032220 : srli_epi32(add_epi32(sumSquaresHi, one32), 2);
505 :
506 : // Take square root and truncate/floor to int32
507 : const auto rmsLo =
508 1548330 : cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
509 : const auto rmsHi =
510 1548330 : cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
511 :
512 : // Merge back low and high registers with each RMS value
513 : // as a 16 bit value.
514 516111 : auto rms = packs_epi32(rmsLo, rmsHi);
515 :
516 : // Round to upper value if it minimizes the
517 : // error |rms^2 - sumSquares/4|
518 : // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
519 : // rms += 1;
520 : // which is equivalent to:
521 : // if( rms * (rms + 1) < (sumSquares+1) / 4 )
522 : // rms += 1;
523 : // And both left and right parts fit on 16 (unsigned) bits
524 : const auto sumSquaresPlusOneDiv4 =
525 516111 : packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
526 : // cmpgt_epi16 operates on signed int16, but here
527 : // we have unsigned values, so shift them by -32768 before
528 2580560 : auto mask = cmpgt_epi16(
529 : add_epi16(sumSquaresPlusOneDiv4, minus32768),
530 : add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
531 : // The value of the mask will be -1 when the correction needs to be
532 : // applied
533 516111 : rms = sub_epi16(rms, mask);
534 :
535 : // Pack each 16 bit RMS value to 8 bits
536 516111 : rms = packus_epi16(rms, rms /* could be anything */);
537 516111 : store_lo(&pDstScanline[iDstPixel], rms);
538 516111 : pSrcScanlineShifted += 2 * DEST_ELTS;
539 : }
540 : zeroupper();
541 :
542 5385 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
543 5385 : return iDstPixel;
544 : }
545 :
546 : /************************************************************************/
547 : /* AverageByteSSE2OrAVX2() */
548 : /************************************************************************/
549 :
550 : template <class T>
551 : static int
552 111280 : AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
553 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
554 : T *CPL_RESTRICT pDstScanline)
555 : {
556 : // Optimized implementation for average on Byte by
557 : // processing by group of 8 output pixels.
558 :
559 111280 : const auto zero = setzero();
560 111280 : const auto two16 = set1_epi16(2);
561 111280 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
562 :
563 111280 : int iDstPixel = 0;
564 4780310 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
565 : {
566 : // Load 2 * DEST_ELTS bytes from each line
567 4669030 : const auto firstLine = loadu_int(pSrcScanlineShifted);
568 9338050 : const auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
569 : // Extend those Bytes as UInt16s
570 4669030 : const auto firstLineLo = unpacklo_epi8(firstLine, zero);
571 4669030 : const auto firstLineHi = unpackhi_epi8(firstLine, zero);
572 4669030 : const auto secondLineLo = unpacklo_epi8(secondLine, zero);
573 4669030 : const auto secondLineHi = unpackhi_epi8(secondLine, zero);
574 :
575 : // Vertical addition
576 4669030 : const auto sumLo = add_epi16(firstLineLo, secondLineLo);
577 4669030 : const auto sumHi = add_epi16(firstLineHi, secondLineHi);
578 :
579 : // Horizontal addition of adjacent pairs, and recombine low and high
580 : // parts
581 4669030 : const auto sum = hadd_epi16(sumLo, sumHi);
582 :
583 : // average = (sum + 2) / 4
584 9338050 : auto average = srli_epi16(add_epi16(sum, two16), 2);
585 :
586 : // Pack each 16 bit average value to 8 bits
587 4669030 : average = packus_epi16(average, average /* could be anything */);
588 4669030 : store_lo(&pDstScanline[iDstPixel], average);
589 4669030 : pSrcScanlineShifted += 2 * DEST_ELTS;
590 : }
591 : zeroupper();
592 :
593 111280 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
594 111280 : return iDstPixel;
595 : }
596 :
597 : /************************************************************************/
598 : /* QuadraticMeanUInt16SSE2() */
599 : /************************************************************************/
600 :
601 : #ifdef __SSE3__
602 : #define sse2_hadd_pd _mm_hadd_pd
603 : #else
604 8 : inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
605 : {
606 : auto aLo_bLo =
607 32 : _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
608 : auto aHi_bHi =
609 32 : _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
610 8 : return _mm_add_pd(aLo_bLo, aHi_bHi); // (aLo + aHi, bLo + bHi)
611 : }
612 : #endif
613 :
614 40 : inline __m128d SQUARE_PD(__m128d x)
615 : {
616 40 : return _mm_mul_pd(x, x);
617 : }
618 :
619 : #ifdef __AVX2__
620 :
621 : inline __m256d SQUARE_PD(__m256d x)
622 : {
623 : return _mm256_mul_pd(x, x);
624 : }
625 :
626 : inline __m256d FIXUP_LANES(__m256d x)
627 : {
628 : return _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
629 : }
630 :
631 : inline __m256 FIXUP_LANES(__m256 x)
632 : {
633 : return _mm256_castpd_ps(FIXUP_LANES(_mm256_castps_pd(x)));
634 : }
635 :
636 : #endif
637 :
// Vectorized 2x2 quadratic mean (RMS) on 16-bit samples.
//
// Each iteration loads 8 source values from each of two source lines that
// are nChunkXSize elements apart, and stores 4 results with a 64-bit store.
// Each result is the integer that minimizes |rms^2 - sumSquares / 4| for the
// corresponding 2x2 block.
//
// Two code paths are used per group of 4 outputs:
// - a fast one, entirely on 32-bit integers/floats, when all 16 source
//   values of the group fit on 14 bits;
// - a generic one, that computes the squares and the square root on doubles
//   (with an AVX2 flavour and a SSE2 flavour).
//
// nDstXWidth: number of destination pixels on the line.
// nChunkXSize: distance, in elements, between the two source lines.
// pSrcScanlineShiftedInOut: in, first source pixel of the upper line; out,
//   advanced past the source pixels consumed (8 per iteration).
// pDstScanline: destination line.
// Returns the number of destination pixels written (a multiple of 4). The
// remaining pixels of the line, if any, are not processed here.
//
// NOTE(review): the loads/stores assume that T is a 16-bit unsigned type, as
// the function name suggests -- confirm against the callers.
638 : template <class T>
639 : static int
640 10 : QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
641 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
642 : T *CPL_RESTRICT pDstScanline)
643 : {
644 : // Optimized implementation for RMS on UInt16 by
645 : // processing by group of 4 output pixels.
646 10 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
647 :
648 10 : int iDstPixel = 0;
649 10 : const auto zero = _mm_setzero_si128();
650 :
651 : #ifdef __AVX2__
652 : const auto zeroDot25 = _mm256_set1_pd(0.25);
653 : const auto zeroDot5 = _mm256_set1_pd(0.5);
654 :
655 : // The first four 0's could be anything, as we only take the bottom
656 : // 128 bits.
657 : const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
658 : #else
659 10 : const auto zeroDot25 = _mm_set1_pd(0.25);
660 10 : const auto zeroDot5 = _mm_set1_pd(0.5);
661 : #endif
662 :
663 40 : for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
664 : {
665 : // Load 8 UInt16 from each line
666 30 : const auto firstLine = _mm_loadu_si128(
667 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
668 : const auto secondLine =
669 30 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
670 30 : pSrcScanlineShifted + nChunkXSize));
671 :
672 : // Detect if all of the source values fit in 14 bits.
673 : // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
674 : // and we can do a much faster implementation.
// maskTmp holds, for each 16-bit lane, the 2 upper bits of
// (firstLine | secondLine). After packing those 8 lanes to 8 bytes, the
// low 64 bits are zero if and only if no value has a bit set above bit 13.
675 : const auto maskTmp =
676 60 : _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
677 : #if defined(__i386__) || defined(_M_IX86)
678 : uint64_t nMaskFitsIn14Bits = 0;
679 : _mm_storel_epi64(
680 : reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
681 : _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
682 : #else
683 30 : const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
684 : _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
685 : #endif
686 30 : if (nMaskFitsIn14Bits == 0)
687 : {
688 : // Multiplication of 16 bit values and horizontal
689 : // addition of 32 bit results
690 : const auto firstLineHSumSquare =
691 26 : _mm_madd_epi16(firstLine, firstLine);
692 : const auto secondLineHSumSquare =
693 26 : _mm_madd_epi16(secondLine, secondLine);
694 : // Vertical addition
695 : const auto sumSquares =
696 26 : _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
697 : // In theory we should take sqrt(sumSquares * 0.25f)
698 : // but given the rounding we do, this is equivalent to
699 : // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
700 : // sumSquares <= 4 * 16383^2
701 26 : const auto one32 = _mm_set1_epi32(1);
702 : const auto sumSquaresPlusOneDiv4 =
703 52 : _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
704 : // Take square root and truncate/floor to int32
705 78 : auto rms = _mm_cvttps_epi32(
706 : _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
707 :
708 : // Round to upper value if it minimizes the
709 : // error |rms^2 - sumSquares/4|
710 : // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
711 : // rms += 1;
712 : // which is equivalent to:
713 : // if( rms * rms + rms < (sumSquares+1) / 4 )
714 : // rms += 1;
// _mm_madd_epi16(rms, rms) yields rms * rms in each 32-bit lane here, as
// the upper 16 bits of each lane of rms are zero in this branch.
715 : auto mask =
716 78 : _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
717 : _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
// The mask is -1 where the correction applies: subtracting it adds 1.
718 26 : rms = _mm_sub_epi32(rms, mask);
719 : // Pack each 32 bit RMS value to 16 bits
720 26 : rms = _mm_packs_epi32(rms, rms /* could be anything */);
721 : _mm_storel_epi64(
722 26 : reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
723 26 : pSrcScanlineShifted += 8;
724 26 : continue;
725 : }
726 :
727 : // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
728 : // to 32 bit would result in 4 multiplications instead of 8, but
729 : // mullo/mulhi have a worse throughput than mul_pd.
730 :
731 : // Extend those UInt16s as UInt32s
732 4 : const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
733 4 : const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
734 4 : const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
735 4 : const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
736 :
737 : #ifdef __AVX2__
738 : // Multiplication of 32 bit values previously converted to 64 bit double
739 : const auto firstLineLoDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineLo));
740 : const auto firstLineHiDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineHi));
741 : const auto secondLineLoDbl =
742 : SQUARE_PD(_mm256_cvtepi32_pd(secondLineLo));
743 : const auto secondLineHiDbl =
744 : SQUARE_PD(_mm256_cvtepi32_pd(secondLineHi));
745 :
746 : // Vertical addition of squares
747 : const auto sumSquaresLo =
748 : _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
749 : const auto sumSquaresHi =
750 : _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
751 :
752 : // Horizontal addition of squares
// _mm256_hadd_pd() works within each 128-bit lane: FIXUP_LANES() puts the
// 4 sums back in pixel order.
753 : const auto sumSquares =
754 : FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
755 :
756 : const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
757 :
758 : // Take square root and truncate/floor to int32
759 : auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
760 : const auto rmsDouble = _mm256_cvtepi32_pd(rms);
// Same rounding rule as in the SSE2 branch below:
// if( 0.5 < sumDivWeight - (rms * rms + rms) ) rms += 1;
761 : const auto right = _mm256_sub_pd(
762 : sumDivWeight, _mm256_add_pd(SQUARE_PD(rmsDouble), rmsDouble));
763 :
764 : auto mask =
765 : _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
766 : // Extract 32-bit from each of the 4 64-bit masks
767 : // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
768 : // _MM_SHUFFLE(2,0,2,0)));
769 : mask = _mm256_permutevar8x32_ps(mask, permutation);
770 : const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
771 :
772 : // Apply the correction
773 : rms = _mm_sub_epi32(rms, maskI);
774 :
775 : // Pack each 32 bit RMS value to 16 bits
776 : rms = _mm_packus_epi32(rms, rms /* could be anything */);
777 : #else
778 : // Multiplication of 32 bit values previously converted to 64 bit double
// _mm_cvtepi32_pd() converts the 2 lower int32 only: the 2 upper ones are
// first moved down with _mm_srli_si128(x, 8).
779 4 : const auto firstLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineLo));
780 : const auto firstLineLoHi =
781 8 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
782 4 : const auto firstLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineHi));
783 : const auto firstLineHiHi =
784 8 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
785 :
786 4 : const auto secondLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineLo));
787 : const auto secondLineLoHi =
788 8 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
789 4 : const auto secondLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineHi));
790 : const auto secondLineHiHi =
791 8 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
792 :
793 : // Vertical addition of squares
794 4 : const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
795 4 : const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
796 4 : const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
797 4 : const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
798 :
799 : // Horizontal addition of squares
800 4 : const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
801 4 : const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
802 :
803 4 : const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
804 4 : const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
805 : // Take square root and truncate/floor to int32
806 8 : const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
807 8 : const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
808 :
809 : // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
810 : // if( 0.5 < sumDivWeight - (rms * rms + rms) )
811 : // rms += 1;
812 4 : const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
813 4 : const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
814 8 : const auto rightLo = _mm_sub_pd(
815 : sumDivWeightLo, _mm_add_pd(SQUARE_PD(rmsLoDouble), rmsLoDouble));
816 12 : const auto rightHi = _mm_sub_pd(
817 : sumDivWeightHi, _mm_add_pd(SQUARE_PD(rmsHiDouble), rmsHiDouble));
818 :
819 8 : const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
820 4 : const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
821 : // The value of the mask will be -1 when the correction needs to be
822 : // applied
// Keep one 32-bit word of each of the four 64-bit comparison results.
823 8 : const auto mask = _mm_castps_si128(_mm_shuffle_ps(
824 : maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
825 :
// Gather the 2 + 2 int32 roots into a single register.
826 16 : auto rms = _mm_castps_si128(
827 : _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
828 : // Apply the correction
829 4 : rms = _mm_sub_epi32(rms, mask);
830 :
831 : // Pack each 32 bit RMS value to 16 bits
832 4 : rms = sse2_packus_epi32(rms, rms /* could be anything */);
833 : #endif
834 :
835 4 : _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
836 : rms);
837 4 : pSrcScanlineShifted += 8;
838 : }
839 :
840 : zeroupper();
841 :
842 10 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
843 10 : return iDstPixel;
844 : }
845 :
846 : /************************************************************************/
847 : /* AverageUInt16SSE2() */
848 : /************************************************************************/
849 :
850 : template <class T>
851 9 : static int AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
852 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
853 : T *CPL_RESTRICT pDstScanline)
854 : {
855 : // Optimized implementation for average on UInt16 by
856 : // processing by group of 8 output pixels.
857 :
858 9 : const auto mask = _mm_set1_epi32(0xFFFF);
859 9 : const auto two = _mm_set1_epi32(2);
860 9 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
861 :
862 9 : int iDstPixel = 0;
863 13 : for (; iDstPixel < nDstXWidth - 7; iDstPixel += 8)
864 : {
865 : __m128i averageLow;
866 : // Load 8 UInt16 from each line
867 : {
868 4 : const auto firstLine = _mm_loadu_si128(
869 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
870 : const auto secondLine =
871 4 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
872 4 : pSrcScanlineShifted + nChunkXSize));
873 :
874 : // Horizontal addition and extension to 32 bit
875 12 : const auto horizAddFirstLine = _mm_add_epi32(
876 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
877 : const auto horizAddSecondLine =
878 12 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
879 : _mm_srli_epi32(secondLine, 16));
880 :
881 : // Vertical addition and average computation
882 : // average = (sum + 2) >> 2
883 8 : const auto sum = _mm_add_epi32(
884 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
885 4 : averageLow = _mm_srli_epi32(sum, 2);
886 : }
887 : // Load 8 UInt16 from each line
888 : __m128i averageHigh;
889 : {
890 4 : const auto firstLine = _mm_loadu_si128(
891 4 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted + 8));
892 : const auto secondLine =
893 4 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
894 4 : pSrcScanlineShifted + 8 + nChunkXSize));
895 :
896 : // Horizontal addition and extension to 32 bit
897 12 : const auto horizAddFirstLine = _mm_add_epi32(
898 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
899 : const auto horizAddSecondLine =
900 12 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
901 : _mm_srli_epi32(secondLine, 16));
902 :
903 : // Vertical addition and average computation
904 : // average = (sum + 2) >> 2
905 8 : const auto sum = _mm_add_epi32(
906 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
907 4 : averageHigh = _mm_srli_epi32(sum, 2);
908 : }
909 :
910 : // Pack each 32 bit average value to 16 bits
911 4 : auto average = sse2_packus_epi32(averageLow, averageHigh);
912 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
913 : average);
914 4 : pSrcScanlineShifted += 16;
915 : }
916 :
917 9 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
918 9 : return iDstPixel;
919 : }
920 :
921 : /************************************************************************/
922 : /* QuadraticMeanFloatSSE2() */
923 : /************************************************************************/
924 :
925 : #ifdef __AVX2__
926 : #define RMS_FLOAT_ELTS 8
927 : #define set1_ps _mm256_set1_ps
928 : #define loadu_ps _mm256_loadu_ps
929 : #define andnot_ps _mm256_andnot_ps
930 : #define and_ps _mm256_and_ps
931 : #define max_ps _mm256_max_ps
932 : #define shuffle_ps _mm256_shuffle_ps
933 : #define div_ps _mm256_div_ps
934 : #define cmpeq_ps(x, y) _mm256_cmp_ps(x, y, _CMP_EQ_OQ)
935 : #define mul_ps _mm256_mul_ps
936 : #define add_ps _mm256_add_ps
937 : #define hadd_ps _mm256_hadd_ps
938 : #define sqrt_ps _mm256_sqrt_ps
939 : #define or_ps _mm256_or_ps
940 : #define unpacklo_ps _mm256_unpacklo_ps
941 : #define unpackhi_ps _mm256_unpackhi_ps
942 : #define storeu_ps _mm256_storeu_ps
943 :
// Element-wise square of the 8 packed floats of x (256-bit variant).
inline __m256 SQUARE_PS(__m256 x)
{
    const __m256 squared = _mm256_mul_ps(x, x);
    return squared;
}
948 :
949 : #else
950 :
951 : #ifdef __SSE3__
952 : #define sse2_hadd_ps _mm_hadd_ps
953 : #else
// Fallback for _mm_hadd_ps() when SSE3 is not available:
// returns (a0 + a1, a2 + a3, b0 + b1, b2 + b3).
inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
{
    // (a0, a2, b0, b2)
    const __m128 evenLanes = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
    // (a1, a3, b1, b3)
    const __m128 oddLanes = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
    return _mm_add_ps(evenLanes, oddLanes);
}
960 : #endif
961 :
962 : #define RMS_FLOAT_ELTS 4
963 : #define set1_ps _mm_set1_ps
964 : #define loadu_ps _mm_loadu_ps
965 : #define andnot_ps _mm_andnot_ps
966 : #define and_ps _mm_and_ps
967 : #define max_ps _mm_max_ps
968 : #define shuffle_ps _mm_shuffle_ps
969 : #define div_ps _mm_div_ps
970 : #define cmpeq_ps _mm_cmpeq_ps
971 : #define mul_ps _mm_mul_ps
972 : #define add_ps _mm_add_ps
973 : #define hadd_ps sse2_hadd_ps
974 : #define sqrt_ps _mm_sqrt_ps
975 : #define or_ps _mm_or_ps
976 : #define unpacklo_ps _mm_unpacklo_ps
977 : #define unpackhi_ps _mm_unpackhi_ps
978 : #define storeu_ps _mm_storeu_ps
979 :
// Element-wise square of the 4 packed floats of x (128-bit variant).
inline __m128 SQUARE_PS(__m128 x)
{
    const __m128 squared = _mm_mul_ps(x, x);
    return squared;
}
984 :
// 128-bit variant of FIXUP_LANES(): the vector is handed back untouched, so
// callers can invoke FIXUP_LANES() unconditionally.
inline __m128 FIXUP_LANES(__m128 x)
{
    // Nothing to reorder in a 128-bit vector.
    return x;
}
989 :
990 : #endif
991 :
992 : template <class T>
993 : static int NOINLINE
994 34 : QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
995 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
996 : T *CPL_RESTRICT pDstScanline)
997 : {
998 : // Optimized implementation for RMS on Float32 by
999 : // processing by group of RMS_FLOAT_ELTS output pixels.
1000 34 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1001 :
1002 34 : int iDstPixel = 0;
1003 34 : const auto minus_zero = set1_ps(-0.0f);
1004 34 : const auto zeroDot25 = set1_ps(0.25f);
1005 34 : const auto one = set1_ps(1.0f);
1006 68 : const auto infv = set1_ps(std::numeric_limits<float>::infinity());
1007 :
1008 102 : for (; iDstPixel < nDstXWidth - (RMS_FLOAT_ELTS - 1);
1009 : iDstPixel += RMS_FLOAT_ELTS)
1010 : {
1011 : // Load 2*RMS_FLOAT_ELTS Float32 from each line
1012 : auto firstLineLo =
1013 68 : loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
1014 68 : auto firstLineHi = loadu_ps(reinterpret_cast<float const *>(
1015 68 : pSrcScanlineShifted + RMS_FLOAT_ELTS));
1016 68 : auto secondLineLo = loadu_ps(
1017 68 : reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
1018 68 : auto secondLineHi = loadu_ps(reinterpret_cast<float const *>(
1019 68 : pSrcScanlineShifted + RMS_FLOAT_ELTS + nChunkXSize));
1020 :
1021 : // Take the absolute value
1022 68 : firstLineLo = andnot_ps(minus_zero, firstLineLo);
1023 68 : firstLineHi = andnot_ps(minus_zero, firstLineHi);
1024 68 : secondLineLo = andnot_ps(minus_zero, secondLineLo);
1025 68 : secondLineHi = andnot_ps(minus_zero, secondLineHi);
1026 :
1027 : auto firstLineEven =
1028 68 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1029 : auto firstLineOdd =
1030 68 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1031 : auto secondLineEven =
1032 68 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1033 : auto secondLineOdd =
1034 68 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1035 :
1036 : // Compute the maximum of each RMS_FLOAT_ELTS value to RMS-average
1037 204 : const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
1038 : max_ps(secondLineEven, secondLineEven));
1039 :
1040 : // Normalize each value by the maximum of the RMS_FLOAT_ELTS ones.
1041 : // This step is important to avoid that the square evaluates to infinity
1042 : // for sufficiently big input.
1043 68 : auto invMax = div_ps(one, maxV);
1044 : // Deal with 0 being the maximum to correct division by zero
1045 : // note: comparing to -0 leads to identical results as to comparing with
1046 : // 0
1047 136 : invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
1048 :
1049 68 : firstLineEven = mul_ps(firstLineEven, invMax);
1050 68 : firstLineOdd = mul_ps(firstLineOdd, invMax);
1051 68 : secondLineEven = mul_ps(secondLineEven, invMax);
1052 68 : secondLineOdd = mul_ps(secondLineOdd, invMax);
1053 :
1054 : // Compute squares
1055 68 : firstLineEven = SQUARE_PS(firstLineEven);
1056 68 : firstLineOdd = SQUARE_PS(firstLineOdd);
1057 68 : secondLineEven = SQUARE_PS(secondLineEven);
1058 68 : secondLineOdd = SQUARE_PS(secondLineOdd);
1059 :
1060 204 : const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
1061 : add_ps(secondLineEven, secondLineOdd));
1062 :
1063 204 : auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
1064 :
1065 : // Deal with infinity being the maximum
1066 68 : const auto maskIsInf = cmpeq_ps(maxV, infv);
1067 136 : rms = or_ps(andnot_ps(maskIsInf, rms), and_ps(maskIsInf, infv));
1068 :
1069 68 : rms = FIXUP_LANES(rms);
1070 :
1071 : // coverity[incompatible_cast]
1072 68 : storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]), rms);
1073 68 : pSrcScanlineShifted += RMS_FLOAT_ELTS * 2;
1074 : }
1075 :
1076 : zeroupper();
1077 :
1078 34 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1079 34 : return iDstPixel;
1080 : }
1081 :
1082 : /************************************************************************/
1083 : /* AverageFloatSSE2() */
1084 : /************************************************************************/
1085 :
1086 : template <class T>
1087 14 : static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
1088 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1089 : T *CPL_RESTRICT pDstScanline)
1090 : {
1091 : // Optimized implementation for average on Float32 by
1092 : // processing by group of 4 output pixels.
1093 14 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1094 :
1095 14 : int iDstPixel = 0;
1096 14 : const auto zeroDot25 = _mm_set1_ps(0.25f);
1097 :
1098 32 : for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
1099 : {
1100 : // Load 8 Float32 from each line
1101 : const auto firstLineLo =
1102 18 : _mm_loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
1103 18 : const auto firstLineHi = _mm_loadu_ps(
1104 18 : reinterpret_cast<float const *>(pSrcScanlineShifted + 4));
1105 18 : const auto secondLineLo = _mm_loadu_ps(
1106 18 : reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
1107 18 : const auto secondLineHi = _mm_loadu_ps(reinterpret_cast<float const *>(
1108 18 : pSrcScanlineShifted + 4 + nChunkXSize));
1109 :
1110 : // Vertical addition
1111 18 : const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
1112 18 : const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
1113 :
1114 : // Horizontal addition
1115 : const auto A =
1116 18 : _mm_shuffle_ps(sumLo, sumHi, 0 | (2 << 2) | (0 << 4) | (2 << 6));
1117 : const auto B =
1118 18 : _mm_shuffle_ps(sumLo, sumHi, 1 | (3 << 2) | (1 << 4) | (3 << 6));
1119 18 : const auto sum = _mm_add_ps(A, B);
1120 :
1121 18 : const auto average = _mm_mul_ps(sum, zeroDot25);
1122 :
1123 : // coverity[incompatible_cast]
1124 18 : _mm_storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]),
1125 : average);
1126 18 : pSrcScanlineShifted += 8;
1127 : }
1128 :
1129 14 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1130 14 : return iDstPixel;
1131 : }
1132 :
1133 : #endif
1134 :
1135 : /************************************************************************/
1136 : /* GDALResampleChunk_AverageOrRMS() */
1137 : /************************************************************************/
1138 :
1139 : template <class T, class Tsum, GDALDataType eWrkDataType>
1140 : static CPLErr
1141 2319 : GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
1142 : const T *pChunk, void **ppDstBuffer)
1143 : {
1144 2319 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1145 2319 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1146 2319 : const double dfSrcXDelta = args.dfSrcXDelta;
1147 2319 : const double dfSrcYDelta = args.dfSrcYDelta;
1148 2319 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1149 2319 : const int nChunkXOff = args.nChunkXOff;
1150 2319 : const int nChunkYOff = args.nChunkYOff;
1151 2319 : const int nChunkXSize = args.nChunkXSize;
1152 2319 : const int nChunkYSize = args.nChunkYSize;
1153 2319 : const int nDstXOff = args.nDstXOff;
1154 2319 : const int nDstXOff2 = args.nDstXOff2;
1155 2319 : const int nDstYOff = args.nDstYOff;
1156 2319 : const int nDstYOff2 = args.nDstYOff2;
1157 2319 : const char *pszResampling = args.pszResampling;
1158 2319 : bool bHasNoData = args.bHasNoData;
1159 2319 : const double dfNoDataValue = args.dfNoDataValue;
1160 2319 : const GDALColorTable *poColorTable = args.poColorTable;
1161 2319 : const bool bPropagateNoData = args.bPropagateNoData;
1162 :
1163 : // AVERAGE_BIT2GRAYSCALE
1164 : const bool bBit2Grayscale =
1165 2319 : CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"));
1166 2319 : const bool bQuadraticMean = CPL_TO_BOOL(EQUAL(pszResampling, "RMS"));
1167 2319 : if (bBit2Grayscale)
1168 9 : poColorTable = nullptr;
1169 :
1170 : T tNoDataValue;
1171 2319 : if (!bHasNoData)
1172 2263 : tNoDataValue = 0;
1173 : else
1174 56 : tNoDataValue = static_cast<T>(dfNoDataValue);
1175 2319 : const T tReplacementVal =
1176 114 : bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
1177 56 : args.eOvrDataType, dfNoDataValue))
1178 : : 0;
1179 :
1180 2319 : int nChunkRightXOff = nChunkXOff + nChunkXSize;
1181 2319 : int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1182 2319 : int nDstXWidth = nDstXOff2 - nDstXOff;
1183 :
1184 : /* -------------------------------------------------------------------- */
1185 : /* Allocate buffers. */
1186 : /* -------------------------------------------------------------------- */
1187 2319 : *ppDstBuffer = static_cast<T *>(
1188 2319 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
1189 : GDALGetDataTypeSizeBytes(eWrkDataType)));
1190 2319 : if (*ppDstBuffer == nullptr)
1191 : {
1192 0 : return CE_Failure;
1193 : }
1194 2319 : T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
1195 :
1196 : struct PrecomputedXValue
1197 : {
1198 : int nLeftXOffShifted;
1199 : int nRightXOffShifted;
1200 : double dfLeftWeight;
1201 : double dfRightWeight;
1202 : double dfTotalWeightFullLine;
1203 : };
1204 :
1205 : PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
1206 2319 : VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(PrecomputedXValue)));
1207 :
1208 2319 : if (pasSrcX == nullptr)
1209 : {
1210 0 : return CE_Failure;
1211 : }
1212 :
1213 2319 : int nTransparentIdx = -1;
1214 2319 : std::vector<GDALColorEntry> colorEntries;
1215 2319 : if (poColorTable)
1216 5 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1217 :
1218 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1219 : // it as nodata value
1220 2349 : if (bHasNoData && dfNoDataValue >= 0.0f &&
1221 30 : tNoDataValue < colorEntries.size())
1222 1 : colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1223 :
1224 : // Or if we have no explicit nodata, but a color table entry that is
1225 : // transparent, consider it as the nodata value
1226 2318 : else if (!bHasNoData && nTransparentIdx >= 0)
1227 : {
1228 0 : bHasNoData = true;
1229 0 : tNoDataValue = static_cast<T>(nTransparentIdx);
1230 : }
1231 :
1232 : /* ==================================================================== */
1233 : /* Precompute inner loop constants. */
1234 : /* ==================================================================== */
1235 2319 : bool bSrcXSpacingIsTwo = true;
1236 2319 : int nLastSrcXOff2 = -1;
1237 852277 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1238 : {
1239 849958 : double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
1240 : // Apply some epsilon to avoid numerical precision issues
1241 849958 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
1242 849958 : double dfSrcXOff2 = dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
1243 849958 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
1244 :
1245 849958 : if (nSrcXOff < nChunkXOff)
1246 0 : nSrcXOff = nChunkXOff;
1247 849958 : if (nSrcXOff2 == nSrcXOff)
1248 0 : nSrcXOff2++;
1249 849958 : if (nSrcXOff2 > nChunkRightXOff)
1250 1 : nSrcXOff2 = nChunkRightXOff;
1251 :
1252 849958 : pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
1253 849958 : pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
1254 849958 : nSrcXOff2 - nChunkXOff;
1255 21 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
1256 849958 : (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
1257 849958 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
1258 849958 : 1 - (nSrcXOff2 - dfSrcXOff2);
1259 849958 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
1260 849958 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
1261 849958 : if (nSrcXOff + 1 < nSrcXOff2)
1262 : {
1263 849937 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1264 849937 : nSrcXOff2 - nSrcXOff - 2;
1265 849937 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1266 849937 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
1267 : }
1268 :
1269 849958 : if (nSrcXOff2 - nSrcXOff != 2 ||
1270 728548 : (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
1271 : {
1272 120599 : bSrcXSpacingIsTwo = false;
1273 : }
1274 849958 : nLastSrcXOff2 = nSrcXOff2;
1275 : }
1276 :
1277 : /* ==================================================================== */
1278 : /* Loop over destination scanlines. */
1279 : /* ==================================================================== */
1280 721820 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1281 : {
1282 719501 : double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
1283 719501 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
1284 719501 : if (nSrcYOff < nChunkYOff)
1285 0 : nSrcYOff = nChunkYOff;
1286 :
1287 719501 : double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
1288 719501 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
1289 719501 : if (nSrcYOff2 == nSrcYOff)
1290 0 : ++nSrcYOff2;
1291 719501 : if (nSrcYOff2 > nChunkBottomYOff)
1292 3 : nSrcYOff2 = nChunkBottomYOff;
1293 :
1294 719501 : T *const pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
1295 :
1296 : /* --------------------------------------------------------------------
1297 : */
1298 : /* Loop over destination pixels */
1299 : /* --------------------------------------------------------------------
1300 : */
1301 719501 : if (poColorTable == nullptr)
1302 : {
1303 719386 : if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
1304 : pabyChunkNodataMask == nullptr)
1305 : {
1306 : if (eWrkDataType == GDT_Byte || eWrkDataType == GDT_UInt16)
1307 : {
1308 : // Optimized case : no nodata, overview by a factor of 2 and
1309 : // regular x and y src spacing.
1310 116684 : const T *pSrcScanlineShifted =
1311 116684 : pChunk + pasSrcX[0].nLeftXOffShifted +
1312 116684 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
1313 116684 : nChunkXSize;
1314 116684 : int iDstPixel = 0;
1315 : #ifdef USE_SSE2
1316 116665 : if (bQuadraticMean && eWrkDataType == GDT_Byte)
1317 : {
1318 5385 : iDstPixel = QuadraticMeanByteSSE2OrAVX2(
1319 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1320 : pDstScanline);
1321 : }
1322 111299 : else if (bQuadraticMean /* && eWrkDataType == GDT_UInt16 */)
1323 : {
1324 10 : iDstPixel = QuadraticMeanUInt16SSE2(
1325 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1326 : pDstScanline);
1327 : }
1328 : else if (/* !bQuadraticMean && */ eWrkDataType == GDT_Byte)
1329 : {
1330 111280 : iDstPixel = AverageByteSSE2OrAVX2(
1331 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1332 : pDstScanline);
1333 : }
1334 : else /* if( !bQuadraticMean && eWrkDataType == GDT_UInt16 )
1335 : */
1336 : {
1337 9 : iDstPixel = AverageUInt16SSE2(nDstXWidth, nChunkXSize,
1338 : pSrcScanlineShifted,
1339 : pDstScanline);
1340 : }
1341 : #endif
1342 279043 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1343 : {
1344 162359 : Tsum nTotal = 0;
1345 : T nVal;
1346 162359 : if (bQuadraticMean)
1347 44 : nTotal =
1348 44 : SQUARE<Tsum>(pSrcScanlineShifted[0]) +
1349 44 : SQUARE<Tsum>(pSrcScanlineShifted[1]) +
1350 44 : SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
1351 44 : SQUARE<Tsum>(
1352 44 : pSrcScanlineShifted[1 + nChunkXSize]);
1353 : else
1354 162315 : nTotal = pSrcScanlineShifted[0] +
1355 162315 : pSrcScanlineShifted[1] +
1356 162315 : pSrcScanlineShifted[nChunkXSize] +
1357 162315 : pSrcScanlineShifted[1 + nChunkXSize];
1358 :
1359 162359 : constexpr int nTotalWeight = 4;
1360 162359 : if (bQuadraticMean)
1361 44 : nVal = ComputeIntegerRMS_4values<T>(nTotal);
1362 : else
1363 162315 : nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
1364 : nTotalWeight);
1365 :
1366 : // No need to compare nVal against tNoDataValue as we
1367 : // are in a case where pabyChunkNodataMask == nullptr
1368 : // implies the absence of nodata value.
1369 162359 : pDstScanline[iDstPixel] = nVal;
1370 162359 : pSrcScanlineShifted += 2;
1371 : }
1372 : }
1373 : else
1374 : {
1375 : CPLAssert(eWrkDataType == GDT_Float32 ||
1376 : eWrkDataType == GDT_Float64);
1377 70 : const T *pSrcScanlineShifted =
1378 70 : pChunk + pasSrcX[0].nLeftXOffShifted +
1379 70 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
1380 70 : nChunkXSize;
1381 70 : int iDstPixel = 0;
1382 : #ifdef USE_SSE2
1383 : if (eWrkDataType == GDT_Float32)
1384 : {
1385 48 : if (bQuadraticMean)
1386 : {
1387 34 : iDstPixel = QuadraticMeanFloatSSE2(
1388 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1389 : pDstScanline);
1390 : }
1391 : else
1392 : {
1393 14 : iDstPixel = AverageFloatSSE2(
1394 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1395 : pDstScanline);
1396 : }
1397 : }
1398 : #endif
1399 :
1400 268 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1401 : {
1402 : T nVal;
1403 198 : if (bQuadraticMean)
1404 : {
1405 : // Cast to double to avoid overflows
1406 : // (using std::hypot() is much slower)
1407 100 : nVal = static_cast<T>(std::sqrt(
1408 : 0.25 *
1409 100 : (SQUARE<double>(pSrcScanlineShifted[0]) +
1410 100 : SQUARE<double>(pSrcScanlineShifted[1]) +
1411 100 : SQUARE<double>(
1412 200 : pSrcScanlineShifted[nChunkXSize]) +
1413 100 : SQUARE<double>(
1414 100 : pSrcScanlineShifted[1 + nChunkXSize]))));
1415 : }
1416 : else
1417 : {
1418 98 : nVal = static_cast<T>(
1419 98 : 0.25f * (pSrcScanlineShifted[0] +
1420 98 : pSrcScanlineShifted[1] +
1421 98 : pSrcScanlineShifted[nChunkXSize] +
1422 98 : pSrcScanlineShifted[1 + nChunkXSize]));
1423 : }
1424 :
1425 : // No need to compare nVal against tNoDataValue as we
1426 : // are in a case where pabyChunkNodataMask == nullptr
1427 : // implies the absence of nodata value.
1428 198 : pDstScanline[iDstPixel] = nVal;
1429 198 : pSrcScanlineShifted += 2;
1430 : }
1431 116754 : }
1432 : }
1433 : else
1434 : {
1435 17 : const double dfBottomWeight =
1436 602632 : (nSrcYOff + 1 == nSrcYOff2) ? 1.0
1437 602615 : : 1.0 - (dfSrcYOff - nSrcYOff);
1438 602632 : const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
1439 602632 : nSrcYOff -= nChunkYOff;
1440 602632 : nSrcYOff2 -= nChunkYOff;
1441 :
1442 602632 : double dfTotalWeightFullColumn = dfBottomWeight;
1443 602632 : if (nSrcYOff + 1 < nSrcYOff2)
1444 : {
1445 602615 : dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
1446 602615 : dfTotalWeightFullColumn += dfTopWeight;
1447 : }
1448 :
1449 18754460 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1450 : {
1451 18147483 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1452 18147483 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1453 :
1454 18147483 : double dfTotal = 0;
1455 18147483 : double dfTotalWeight = 0;
1456 18147483 : if (pabyChunkNodataMask == nullptr)
1457 : {
1458 1746435 : auto pChunkShifted =
1459 115 : pChunk +
1460 1746435 : static_cast<GPtrDiff_t>(nSrcYOff) * nChunkXSize;
1461 1746435 : int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1462 1746435 : double dfWeightY = dfBottomWeight;
1463 3493427 : while (true)
1464 : {
1465 : double dfTotalLine;
1466 5239852 : if (bQuadraticMean)
1467 : {
1468 : // Left pixel
1469 : {
1470 104 : const T val = pChunkShifted[nSrcXOff];
1471 104 : dfTotalLine =
1472 104 : SQUARE<double>(val) *
1473 104 : pasSrcX[iDstPixel].dfLeftWeight;
1474 : }
1475 :
1476 104 : if (nSrcXOff + 1 < nSrcXOff2)
1477 : {
1478 : // Middle pixels
1479 104 : for (int iX = nSrcXOff + 1;
1480 424 : iX + 1 < nSrcXOff2; ++iX)
1481 : {
1482 320 : const T val = pChunkShifted[iX];
1483 320 : dfTotalLine += SQUARE<double>(val);
1484 : }
1485 :
1486 : // Right pixel
1487 : {
1488 104 : const T val =
1489 104 : pChunkShifted[nSrcXOff2 - 1];
1490 104 : dfTotalLine +=
1491 104 : SQUARE<double>(val) *
1492 104 : pasSrcX[iDstPixel].dfRightWeight;
1493 : }
1494 : }
1495 : }
1496 : else
1497 : {
1498 : // Left pixel
1499 : {
1500 5239756 : const T val = pChunkShifted[nSrcXOff];
1501 5239756 : dfTotalLine =
1502 5239756 : val * pasSrcX[iDstPixel].dfLeftWeight;
1503 : }
1504 :
1505 5239756 : if (nSrcXOff + 1 < nSrcXOff2)
1506 : {
1507 : // Middle pixels
1508 4239330 : for (int iX = nSrcXOff + 1;
1509 64183126 : iX + 1 < nSrcXOff2; ++iX)
1510 : {
1511 59943836 : const T val = pChunkShifted[iX];
1512 59943836 : dfTotalLine += val;
1513 : }
1514 :
1515 : // Right pixel
1516 : {
1517 4239330 : const T val =
1518 4239330 : pChunkShifted[nSrcXOff2 - 1];
1519 4239330 : dfTotalLine +=
1520 4239330 : val *
1521 4239330 : pasSrcX[iDstPixel].dfRightWeight;
1522 : }
1523 : }
1524 : }
1525 :
1526 5239852 : dfTotal += dfTotalLine * dfWeightY;
1527 5239852 : --nCounterY;
1528 5239852 : if (nCounterY < 0)
1529 1746435 : break;
1530 3493427 : pChunkShifted += nChunkXSize;
1531 3493427 : dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
1532 : }
1533 :
1534 1746435 : dfTotalWeight =
1535 1746435 : pasSrcX[iDstPixel].dfTotalWeightFullLine *
1536 : dfTotalWeightFullColumn;
1537 : }
1538 : else
1539 : {
1540 16401068 : GPtrDiff_t nCount = 0;
1541 71751504 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1542 : {
1543 55350336 : const auto pChunkShifted =
1544 136 : pChunk +
1545 55350336 : static_cast<GPtrDiff_t>(iY) * nChunkXSize;
1546 :
1547 55350336 : double dfTotalLine = 0;
1548 55350336 : double dfTotalWeightLine = 0;
1549 : // Left pixel
1550 : {
1551 55350336 : const int iX = nSrcXOff;
1552 55350336 : const T val = pChunkShifted[iX];
1553 55350336 : if (pabyChunkNodataMask[iX + iY * nChunkXSize])
1554 : {
1555 23508183 : nCount++;
1556 23508183 : const double dfWeightX =
1557 23508183 : pasSrcX[iDstPixel].dfLeftWeight;
1558 23508183 : dfTotalWeightLine = dfWeightX;
1559 23508183 : if (bQuadraticMean)
1560 60 : dfTotalLine =
1561 60 : SQUARE<double>(val) * dfWeightX;
1562 : else
1563 23508083 : dfTotalLine = val * dfWeightX;
1564 : }
1565 : }
1566 :
1567 55350336 : if (nSrcXOff + 1 < nSrcXOff2)
1568 : {
1569 : // Middle pixels
1570 152870136 : for (int iX = nSrcXOff + 1; iX + 1 < nSrcXOff2;
1571 : ++iX)
1572 : {
1573 97518100 : const T val = pChunkShifted[iX];
1574 97518100 : if (pabyChunkNodataMask[iX +
1575 97518100 : iY * nChunkXSize])
1576 : {
1577 39727100 : nCount++;
1578 39727100 : dfTotalWeightLine += 1;
1579 39727100 : if (bQuadraticMean)
1580 0 : dfTotalLine += SQUARE<double>(val);
1581 : else
1582 39727100 : dfTotalLine += val;
1583 : }
1584 : }
1585 :
1586 : // Right pixel
1587 : {
1588 55351936 : const int iX = nSrcXOff2 - 1;
1589 55351936 : const T val = pChunkShifted[iX];
1590 55351936 : if (pabyChunkNodataMask[iX +
1591 55351936 : iY * nChunkXSize])
1592 : {
1593 23509251 : nCount++;
1594 23509251 : const double dfWeightX =
1595 23509251 : pasSrcX[iDstPixel].dfRightWeight;
1596 23509251 : dfTotalWeightLine += dfWeightX;
1597 23509251 : if (bQuadraticMean)
1598 1 : dfTotalLine +=
1599 61 : SQUARE<double>(val) * dfWeightX;
1600 : else
1601 23509150 : dfTotalLine += val * dfWeightX;
1602 : }
1603 : }
1604 : }
1605 :
1606 94311604 : const double dfWeightY =
1607 : (iY == nSrcYOff) ? dfBottomWeight
1608 38961168 : : (iY + 1 == nSrcYOff2) ? dfTopWeight
1609 : : 1.0;
1610 55350436 : dfTotal += dfTotalLine * dfWeightY;
1611 55350436 : dfTotalWeight += dfTotalWeightLine * dfWeightY;
1612 : }
1613 :
1614 16401068 : if (nCount == 0 ||
1615 8 : (bPropagateNoData &&
1616 : nCount <
1617 8 : static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
1618 8 : (nSrcXOff2 - nSrcXOff)))
1619 : {
1620 9607202 : pDstScanline[iDstPixel] = tNoDataValue;
1621 9607202 : continue;
1622 : }
1623 : }
1624 : if (eWrkDataType == GDT_Byte)
1625 : {
1626 : T nVal;
1627 8540160 : if (bQuadraticMean)
1628 38 : nVal = ComputeIntegerRMS<T, int>(dfTotal,
1629 : dfTotalWeight);
1630 : else
1631 8540120 : nVal =
1632 8540120 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1633 8544440 : if (bHasNoData && nVal == tNoDataValue)
1634 0 : nVal = tReplacementVal;
1635 8544440 : pDstScanline[iDstPixel] = nVal;
1636 : }
1637 : else if (eWrkDataType == GDT_UInt16)
1638 : {
1639 : T nVal;
1640 8 : if (bQuadraticMean)
1641 4 : nVal = ComputeIntegerRMS<T, uint64_t>(
1642 : dfTotal, dfTotalWeight);
1643 : else
1644 4 : nVal =
1645 4 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1646 8 : if (bHasNoData && nVal == tNoDataValue)
1647 0 : nVal = tReplacementVal;
1648 8 : pDstScanline[iDstPixel] = nVal;
1649 : }
1650 : else
1651 : {
1652 : T nVal;
1653 153 : if (bQuadraticMean)
1654 20 : nVal =
1655 25 : static_cast<T>(sqrt(dfTotal / dfTotalWeight));
1656 : else
1657 128 : nVal = static_cast<T>(dfTotal / dfTotalWeight);
1658 153 : if (bHasNoData && nVal == tNoDataValue)
1659 2 : nVal = tReplacementVal;
1660 153 : pDstScanline[iDstPixel] = nVal;
1661 : }
1662 : }
1663 : }
1664 : }
1665 : else
1666 : {
1667 115 : nSrcYOff -= nChunkYOff;
1668 115 : nSrcYOff2 -= nChunkYOff;
1669 :
1670 2275 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1671 : {
1672 6475 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1673 6475 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1674 :
1675 6475 : GPtrDiff_t nTotalR = 0;
1676 6475 : GPtrDiff_t nTotalG = 0;
1677 6475 : GPtrDiff_t nTotalB = 0;
1678 6475 : GPtrDiff_t nCount = 0;
1679 :
1680 19425 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1681 : {
1682 38850 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
1683 : {
1684 25900 : const T val = pChunk[iX + static_cast<GPtrDiff_t>(iY) *
1685 25900 : nChunkXSize];
1686 : // cppcheck-suppress unsignedLessThanZero
1687 25900 : if (val < 0 || val >= colorEntries.size())
1688 0 : continue;
1689 25900 : size_t idx = static_cast<size_t>(val);
1690 25900 : const auto &entry = colorEntries[idx];
1691 25900 : if (entry.c4)
1692 : {
1693 14128 : if (bQuadraticMean)
1694 : {
1695 800 : nTotalR += SQUARE<int>(entry.c1);
1696 800 : nTotalG += SQUARE<int>(entry.c2);
1697 800 : nTotalB += SQUARE<int>(entry.c3);
1698 800 : ++nCount;
1699 : }
1700 : else
1701 : {
1702 13328 : nTotalR += entry.c1;
1703 13328 : nTotalG += entry.c2;
1704 13328 : nTotalB += entry.c3;
1705 13328 : ++nCount;
1706 : }
1707 : }
1708 : }
1709 : }
1710 :
1711 6475 : if (nCount == 0 ||
1712 0 : (bPropagateNoData &&
1713 0 : nCount < static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
1714 0 : (nSrcXOff2 - nSrcXOff)))
1715 : {
1716 2838 : pDstScanline[iDstPixel] = tNoDataValue;
1717 : }
1718 : else
1719 : {
1720 : GDALColorEntry color;
1721 3637 : if (bQuadraticMean)
1722 : {
1723 200 : color.c1 =
1724 200 : static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
1725 200 : color.c2 =
1726 200 : static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
1727 200 : color.c3 =
1728 200 : static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
1729 : }
1730 : else
1731 : {
1732 3437 : color.c1 =
1733 3437 : static_cast<short>((nTotalR + nCount / 2) / nCount);
1734 3437 : color.c2 =
1735 3437 : static_cast<short>((nTotalG + nCount / 2) / nCount);
1736 3437 : color.c3 =
1737 3437 : static_cast<short>((nTotalB + nCount / 2) / nCount);
1738 : }
1739 0 : pDstScanline[iDstPixel] =
1740 3637 : static_cast<T>(BestColorEntry(colorEntries, color));
1741 : }
1742 : }
1743 : }
1744 : }
1745 :
1746 2319 : CPLFree(pasSrcX);
1747 :
1748 2319 : return CE_None;
1749 : }
1750 :
1751 : static CPLErr
1752 2319 : GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
1753 : const void *pChunk, void **ppDstBuffer,
1754 : GDALDataType *peDstBufferDataType)
1755 : {
1756 2319 : *peDstBufferDataType = args.eWrkDataType;
1757 2319 : switch (args.eWrkDataType)
1758 : {
1759 2252 : case GDT_Byte:
1760 : {
1761 2252 : return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_Byte>(
1762 2252 : args, static_cast<const GByte *>(pChunk), ppDstBuffer);
1763 : }
1764 :
1765 9 : case GDT_UInt16:
1766 : {
1767 9 : if (EQUAL(args.pszResampling, "RMS"))
1768 : {
1769 : // Use double as accumulation type, because UInt32 could overflow
1770 : return GDALResampleChunk_AverageOrRMS_T<GUInt16, double,
1771 5 : GDT_UInt16>(
1772 5 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1773 : }
1774 : else
1775 : {
1776 : return GDALResampleChunk_AverageOrRMS_T<GUInt16, GUInt32,
1777 4 : GDT_UInt16>(
1778 4 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1779 : }
1780 : }
1781 :
1782 41 : case GDT_Float32:
1783 : {
1784 41 : return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32>(
1785 41 : args, static_cast<const float *>(pChunk), ppDstBuffer);
1786 : }
1787 :
1788 17 : case GDT_Float64:
1789 : {
1790 : return GDALResampleChunk_AverageOrRMS_T<double, double,
1791 17 : GDT_Float64>(
1792 17 : args, static_cast<const double *>(pChunk), ppDstBuffer);
1793 : }
1794 :
1795 0 : default:
1796 0 : break;
1797 : }
1798 :
1799 0 : CPLAssert(false);
1800 : return CE_Failure;
1801 : }
1802 :
1803 : /************************************************************************/
1804 : /* GDALResampleChunk_Gauss() */
1805 : /************************************************************************/
1806 :
1807 86 : static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
1808 : const void *pChunk, void **ppDstBuffer,
1809 : GDALDataType *peDstBufferDataType)
1810 :
1811 : {
1812 86 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1813 86 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1814 86 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1815 86 : const int nChunkXOff = args.nChunkXOff;
1816 86 : const int nChunkXSize = args.nChunkXSize;
1817 86 : const int nChunkYOff = args.nChunkYOff;
1818 86 : const int nChunkYSize = args.nChunkYSize;
1819 86 : const int nDstXOff = args.nDstXOff;
1820 86 : const int nDstXOff2 = args.nDstXOff2;
1821 86 : const int nDstYOff = args.nDstYOff;
1822 86 : const int nDstYOff2 = args.nDstYOff2;
1823 86 : const bool bHasNoData = args.bHasNoData;
1824 86 : double dfNoDataValue = args.dfNoDataValue;
1825 86 : const GDALColorTable *poColorTable = args.poColorTable;
1826 :
1827 86 : const double *const padfChunk = static_cast<const double *>(pChunk);
1828 :
1829 86 : *ppDstBuffer =
1830 86 : VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
1831 : GDALGetDataTypeSizeBytes(GDT_Float64));
1832 86 : if (*ppDstBuffer == nullptr)
1833 : {
1834 0 : return CE_Failure;
1835 : }
1836 86 : *peDstBufferDataType = GDT_Float64;
1837 86 : double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
1838 :
1839 : /* -------------------------------------------------------------------- */
1840 : /* Create the filter kernel and allocate scanline buffer. */
1841 : /* -------------------------------------------------------------------- */
1842 86 : int nGaussMatrixDim = 3;
1843 : const int *panGaussMatrix;
1844 86 : constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
1845 86 : constexpr int anGaussMatrix5x5[] = {1, 4, 6, 4, 1, 4, 16, 24, 16,
1846 : 4, 6, 24, 36, 24, 6, 4, 16, 24,
1847 : 16, 4, 1, 4, 6, 4, 1};
1848 86 : constexpr int anGaussMatrix7x7[] = {
1849 : 1, 6, 15, 20, 15, 6, 1, 6, 36, 90, 120, 90, 36,
1850 : 6, 15, 90, 225, 300, 225, 90, 15, 20, 120, 300, 400, 300,
1851 : 120, 20, 15, 90, 225, 300, 225, 90, 15, 6, 36, 90, 120,
1852 : 90, 36, 6, 1, 6, 15, 20, 15, 6, 1};
1853 :
1854 86 : const int nOXSize = args.nOvrXSize;
1855 86 : const int nOYSize = args.nOvrYSize;
1856 86 : const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
1857 :
1858 : // matrix for gauss filter
1859 86 : if (nResYFactor <= 2)
1860 : {
1861 85 : panGaussMatrix = anGaussMatrix3x3;
1862 85 : nGaussMatrixDim = 3;
1863 : }
1864 1 : else if (nResYFactor <= 4)
1865 : {
1866 0 : panGaussMatrix = anGaussMatrix5x5;
1867 0 : nGaussMatrixDim = 5;
1868 : }
1869 : else
1870 : {
1871 1 : panGaussMatrix = anGaussMatrix7x7;
1872 1 : nGaussMatrixDim = 7;
1873 : }
1874 :
1875 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
1876 : int *panGaussMatrixDup = static_cast<int *>(
1877 : CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
1878 : memcpy(panGaussMatrixDup, panGaussMatrix,
1879 : sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
1880 : panGaussMatrix = panGaussMatrixDup;
1881 : #endif
1882 :
1883 86 : if (!bHasNoData)
1884 79 : dfNoDataValue = 0.0;
1885 :
1886 86 : std::vector<GDALColorEntry> colorEntries;
1887 86 : int nTransparentIdx = -1;
1888 86 : if (poColorTable)
1889 2 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1890 :
1891 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1892 : // it as nodata value.
1893 92 : if (bHasNoData && dfNoDataValue >= 0.0f &&
1894 6 : dfNoDataValue < colorEntries.size())
1895 0 : colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
1896 :
1897 : // Or if we have no explicit nodata, but a color table entry that is
1898 : // transparent, consider it as the nodata value.
1899 86 : else if (!bHasNoData && nTransparentIdx >= 0)
1900 : {
1901 0 : dfNoDataValue = nTransparentIdx;
1902 : }
1903 :
1904 86 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
1905 86 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1906 86 : const int nDstXWidth = nDstXOff2 - nDstXOff;
1907 :
1908 : /* ==================================================================== */
1909 : /* Loop over destination scanlines. */
1910 : /* ==================================================================== */
1911 16488 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1912 : {
1913 16402 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
1914 16402 : int nSrcYOff2 =
1915 16402 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
1916 :
1917 16402 : if (nSrcYOff < nChunkYOff)
1918 : {
1919 0 : nSrcYOff = nChunkYOff;
1920 0 : nSrcYOff2++;
1921 : }
1922 :
1923 16402 : const int iSizeY = nSrcYOff2 - nSrcYOff;
1924 16402 : nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
1925 16402 : nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
1926 :
1927 16402 : if (nSrcYOff2 > nChunkBottomYOff ||
1928 16359 : (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
1929 : {
1930 44 : nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
1931 : }
1932 :
1933 16402 : int nYShiftGaussMatrix = 0;
1934 16402 : if (nSrcYOff < nChunkYOff)
1935 : {
1936 0 : nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
1937 0 : nSrcYOff = nChunkYOff;
1938 : }
1939 :
1940 16402 : const double *const padfSrcScanline =
1941 16402 : padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
1942 16402 : const GByte *pabySrcScanlineNodataMask = nullptr;
1943 16402 : if (pabyChunkNodataMask != nullptr)
1944 152 : pabySrcScanlineNodataMask =
1945 152 : pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
1946 :
1947 : /* --------------------------------------------------------------------
1948 : */
1949 : /* Loop over destination pixels */
1950 : /* --------------------------------------------------------------------
1951 : */
1952 16402 : double *const padfDstScanline =
1953 16402 : padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
1954 4149980 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1955 : {
1956 4133580 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
1957 4133580 : int nSrcXOff2 =
1958 4133580 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
1959 :
1960 4133580 : if (nSrcXOff < nChunkXOff)
1961 : {
1962 0 : nSrcXOff = nChunkXOff;
1963 0 : nSrcXOff2++;
1964 : }
1965 :
1966 4133580 : const int iSizeX = nSrcXOff2 - nSrcXOff;
1967 4133580 : nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
1968 4133580 : nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
1969 :
1970 4133580 : if (nSrcXOff2 > nChunkRightXOff ||
1971 4127930 : (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
1972 : {
1973 5650 : nSrcXOff2 =
1974 5650 : std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
1975 : }
1976 :
1977 4133580 : int nXShiftGaussMatrix = 0;
1978 4133580 : if (nSrcXOff < nChunkXOff)
1979 : {
1980 0 : nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
1981 0 : nSrcXOff = nChunkXOff;
1982 : }
1983 :
1984 4133580 : if (poColorTable == nullptr)
1985 : {
1986 4133380 : double dfTotal = 0.0;
1987 4133380 : GInt64 nCount = 0;
1988 4133380 : const int *panLineWeight =
1989 4133380 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
1990 : nXShiftGaussMatrix;
1991 :
1992 16527900 : for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
1993 12394500 : ++iY, ++j, panLineWeight += nGaussMatrixDim)
1994 : {
1995 49561300 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
1996 : {
1997 37166800 : const double val =
1998 37166800 : padfSrcScanline[iX - nChunkXOff +
1999 37166800 : static_cast<GPtrDiff_t>(iY -
2000 37166800 : nSrcYOff) *
2001 37166800 : nChunkXSize];
2002 37166800 : if (pabySrcScanlineNodataMask == nullptr ||
2003 32872 : pabySrcScanlineNodataMask[iX - nChunkXOff +
2004 32872 : static_cast<GPtrDiff_t>(
2005 32872 : iY - nSrcYOff) *
2006 32872 : nChunkXSize])
2007 : {
2008 37146100 : const int nWeight = panLineWeight[i];
2009 37146100 : dfTotal += val * nWeight;
2010 37146100 : nCount += nWeight;
2011 : }
2012 : }
2013 : }
2014 :
2015 4133380 : if (nCount == 0)
2016 : {
2017 2217 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2018 : }
2019 : else
2020 : {
2021 4131160 : padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
2022 : }
2023 : }
2024 : else
2025 : {
2026 200 : GInt64 nTotalR = 0;
2027 200 : GInt64 nTotalG = 0;
2028 200 : GInt64 nTotalB = 0;
2029 200 : GInt64 nTotalWeight = 0;
2030 200 : const int *panLineWeight =
2031 200 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
2032 : nXShiftGaussMatrix;
2033 :
2034 780 : for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
2035 580 : ++iY, ++j, panLineWeight += nGaussMatrixDim)
2036 : {
2037 2262 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2038 : {
2039 1682 : const double val =
2040 1682 : padfSrcScanline[iX - nChunkXOff +
2041 1682 : static_cast<GPtrDiff_t>(iY -
2042 1682 : nSrcYOff) *
2043 1682 : nChunkXSize];
2044 1682 : if (val < 0 || val >= colorEntries.size())
2045 0 : continue;
2046 :
2047 1682 : size_t idx = static_cast<size_t>(val);
2048 1682 : if (colorEntries[idx].c4)
2049 : {
2050 1682 : const int nWeight = panLineWeight[i];
2051 1682 : nTotalR +=
2052 1682 : static_cast<GInt64>(colorEntries[idx].c1) *
2053 1682 : nWeight;
2054 1682 : nTotalG +=
2055 1682 : static_cast<GInt64>(colorEntries[idx].c2) *
2056 1682 : nWeight;
2057 1682 : nTotalB +=
2058 1682 : static_cast<GInt64>(colorEntries[idx].c3) *
2059 1682 : nWeight;
2060 1682 : nTotalWeight += nWeight;
2061 : }
2062 : }
2063 : }
2064 :
2065 200 : if (nTotalWeight == 0)
2066 : {
2067 0 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2068 : }
2069 : else
2070 : {
2071 : GDALColorEntry color;
2072 :
2073 200 : color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
2074 : nTotalWeight);
2075 200 : color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
2076 : nTotalWeight);
2077 200 : color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
2078 : nTotalWeight);
2079 200 : padfDstScanline[iDstPixel - nDstXOff] =
2080 200 : BestColorEntry(colorEntries, color);
2081 : }
2082 : }
2083 : }
2084 : }
2085 :
2086 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
2087 : CPLFree(panGaussMatrixDup);
2088 : #endif
2089 :
2090 86 : return CE_None;
2091 : }
2092 :
2093 : /************************************************************************/
2094 : /* GDALResampleChunk_Mode() */
2095 : /************************************************************************/
2096 :
/** Tell whether two values must be counted as the same value.
 *
 * Generic version: plain equality.
 */
template <class T> static inline bool IsSame(T a, T b)
{
    return a == b;
}

/** float version: two NaN are considered identical, although NaN != NaN. */
template <> bool IsSame<float>(float a, float b)
{
    return a == b || (std::isnan(a) && std::isnan(b));
}

/** double version: two NaN are considered identical, although NaN != NaN. */
template <> bool IsSame<double>(double a, double b)
{
    return a == b || (std::isnan(a) && std::isnan(b));
}

/** complex<float> version: real and imaginary parts are compared with the
 * float rule, so that a value with a NaN in only one of its components is
 * still identical to itself.
 */
template <>
bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
{
    return IsSame<float>(a.real(), b.real()) &&
           IsSame<float>(a.imag(), b.imag());
}

/** complex<double> version: real and imaginary parts are compared with the
 * double rule, so that a value with a NaN in only one of its components is
 * still identical to itself.
 */
template <>
bool IsSame<std::complex<double>>(std::complex<double> a,
                                  std::complex<double> b)
{
    return IsSame<double>(a.real(), b.real()) &&
           IsSame<double>(a.imag(), b.imag());
}
2126 :
2127 : template <class T>
2128 136 : static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
2129 : const T *pChunk, T *const pDstBuffer)
2130 :
2131 : {
2132 136 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2133 136 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2134 136 : const double dfSrcXDelta = args.dfSrcXDelta;
2135 136 : const double dfSrcYDelta = args.dfSrcYDelta;
2136 136 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2137 136 : const int nChunkXOff = args.nChunkXOff;
2138 136 : const int nChunkXSize = args.nChunkXSize;
2139 136 : const int nChunkYOff = args.nChunkYOff;
2140 136 : const int nChunkYSize = args.nChunkYSize;
2141 136 : const int nDstXOff = args.nDstXOff;
2142 136 : const int nDstXOff2 = args.nDstXOff2;
2143 136 : const int nDstYOff = args.nDstYOff;
2144 136 : const int nDstYOff2 = args.nDstYOff2;
2145 136 : const bool bHasNoData = args.bHasNoData;
2146 136 : const GDALColorTable *poColorTable = args.poColorTable;
2147 136 : const int nDstXSize = nDstXOff2 - nDstXOff;
2148 :
2149 8 : T tNoDataValue;
2150 : if constexpr (std::is_same<T, std::complex<float>>::value ||
2151 : std::is_same<T, std::complex<double>>::value)
2152 : {
2153 : using BaseT = typename T::value_type;
2154 8 : tNoDataValue =
2155 : std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
2156 : std::numeric_limits<BaseT>::quiet_NaN());
2157 : }
2158 128 : else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
2159 127 : tNoDataValue = 0;
2160 : else
2161 1 : tNoDataValue = static_cast<T>(args.dfNoDataValue);
2162 :
2163 136 : size_t nMaxNumPx = 0;
2164 136 : T *paVals = nullptr;
2165 136 : int *panSums = nullptr;
2166 :
2167 136 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2168 136 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2169 272 : std::vector<int> anVals(256, 0);
2170 :
2171 : /* ==================================================================== */
2172 : /* Loop over destination scanlines. */
2173 : /* ==================================================================== */
2174 7531 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2175 : {
2176 7395 : double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
2177 7395 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
2178 : #ifdef only_pixels_with_more_than_10_pct_participation
2179 : // When oversampling, don't take into account pixels that have a tiny
2180 : // participation in the resulting pixel
2181 : if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2182 : nSrcYOff < nChunkBottomYOff)
2183 : nSrcYOff++;
2184 : #endif
2185 7395 : if (nSrcYOff < nChunkYOff)
2186 0 : nSrcYOff = nChunkYOff;
2187 :
2188 7395 : double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
2189 7395 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
2190 : #ifdef only_pixels_with_more_than_10_pct_participation
2191 : // When oversampling, don't take into account pixels that have a tiny
2192 : // participation in the resulting pixel
2193 : if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2194 : nSrcYOff2 > nChunkYOff)
2195 : nSrcYOff2--;
2196 : #endif
2197 7395 : if (nSrcYOff2 == nSrcYOff)
2198 0 : ++nSrcYOff2;
2199 7395 : if (nSrcYOff2 > nChunkBottomYOff)
2200 0 : nSrcYOff2 = nChunkBottomYOff;
2201 :
2202 7395 : const T *const paSrcScanline =
2203 149 : pChunk +
2204 7395 : (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
2205 7395 : const GByte *pabySrcScanlineNodataMask = nullptr;
2206 7395 : if (pabyChunkNodataMask != nullptr)
2207 1810 : pabySrcScanlineNodataMask =
2208 : pabyChunkNodataMask +
2209 1810 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
2210 :
2211 7395 : T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
2212 : /* --------------------------------------------------------------------
2213 : */
2214 : /* Loop over destination pixels */
2215 : /* --------------------------------------------------------------------
2216 : */
2217 4259580 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2218 : {
2219 4252187 : double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
2220 : // Apply some epsilon to avoid numerical precision issues
2221 4252187 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
2222 : #ifdef only_pixels_with_more_than_10_pct_participation
2223 : // When oversampling, don't take into account pixels that have a
2224 : // tiny participation in the resulting pixel
2225 : if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2226 : nSrcXOff < nChunkRightXOff)
2227 : nSrcXOff++;
2228 : #endif
2229 4252187 : if (nSrcXOff < nChunkXOff)
2230 0 : nSrcXOff = nChunkXOff;
2231 :
2232 4252187 : double dfSrcXOff2 =
2233 4252187 : dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
2234 4252187 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
2235 : #ifdef only_pixels_with_more_than_10_pct_participation
2236 : // When oversampling, don't take into account pixels that have a
2237 : // tiny participation in the resulting pixel
2238 : if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2239 : nSrcXOff2 > nChunkXOff)
2240 : nSrcXOff2--;
2241 : #endif
2242 4252187 : if (nSrcXOff2 == nSrcXOff)
2243 0 : nSrcXOff2++;
2244 4252187 : if (nSrcXOff2 > nChunkRightXOff)
2245 0 : nSrcXOff2 = nChunkRightXOff;
2246 :
2247 4252187 : bool bRegularProcessing = false;
2248 : if constexpr (!std::is_same<T, GByte>::value)
2249 827 : bRegularProcessing = true;
2250 4251360 : else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
2251 0 : bRegularProcessing = true;
2252 :
2253 4252187 : if (bRegularProcessing)
2254 : {
2255 : // Not sure how much sense it makes to run a majority
2256 : // filter on floating point data, but here it is for the sake
2257 : // of compatibility. It won't look right on RGB images by the
2258 : // nature of the filter.
2259 :
2260 827 : if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
2261 2481 : nSrcYOff2 - nSrcYOff > INT_MAX / (nSrcXOff2 - nSrcXOff) ||
2262 827 : static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
2263 827 : static_cast<size_t>(nSrcXOff2 - nSrcXOff) >
2264 827 : std::numeric_limits<size_t>::max() / sizeof(float))
2265 : {
2266 0 : CPLError(CE_Failure, CPLE_NotSupported,
2267 : "Too big downsampling factor");
2268 0 : CPLFree(paVals);
2269 0 : CPLFree(panSums);
2270 0 : return CE_Failure;
2271 : }
2272 827 : const size_t nNumPx =
2273 827 : static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
2274 827 : static_cast<size_t>(nSrcXOff2 - nSrcXOff);
2275 827 : size_t iMaxInd = 0;
2276 827 : size_t iMaxVal = 0;
2277 827 : bool biMaxValdValid = false;
2278 :
2279 827 : if (paVals == nullptr || nNumPx > nMaxNumPx)
2280 : {
2281 : T *paValsNew = static_cast<T *>(
2282 71 : VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
2283 : int *panSumsNew = static_cast<int *>(
2284 71 : VSI_REALLOC_VERBOSE(panSums, nNumPx * sizeof(int)));
2285 71 : if (paValsNew != nullptr)
2286 71 : paVals = paValsNew;
2287 71 : if (panSumsNew != nullptr)
2288 71 : panSums = panSumsNew;
2289 71 : if (paValsNew == nullptr || panSumsNew == nullptr)
2290 : {
2291 0 : CPLFree(paVals);
2292 0 : CPLFree(panSums);
2293 0 : return CE_Failure;
2294 : }
2295 71 : nMaxNumPx = nNumPx;
2296 : }
2297 :
2298 2585 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2299 : {
2300 1758 : const GPtrDiff_t iTotYOff =
2301 1758 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2302 1758 : nChunkXOff;
2303 5690 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2304 : {
2305 3932 : if (pabySrcScanlineNodataMask == nullptr ||
2306 16 : pabySrcScanlineNodataMask[iX + iTotYOff])
2307 : {
2308 3917 : const T val = paSrcScanline[iX + iTotYOff];
2309 3917 : size_t i = 0; // Used after for.
2310 :
2311 : // Check array for existing entry.
2312 14387 : for (; i < iMaxInd; ++i)
2313 17626 : if (IsSame(paVals[i], val) &&
2314 6910 : ++panSums[i] > panSums[iMaxVal])
2315 : {
2316 246 : iMaxVal = i;
2317 246 : biMaxValdValid = true;
2318 246 : break;
2319 : }
2320 :
2321 : // Add to arr if entry not already there.
2322 3917 : if (i == iMaxInd)
2323 : {
2324 3671 : paVals[iMaxInd] = val;
2325 3671 : panSums[iMaxInd] = 1;
2326 :
2327 3671 : if (!biMaxValdValid)
2328 : {
2329 824 : iMaxVal = iMaxInd;
2330 824 : biMaxValdValid = true;
2331 : }
2332 :
2333 3671 : ++iMaxInd;
2334 : }
2335 : }
2336 : }
2337 : }
2338 :
2339 827 : if (!biMaxValdValid)
2340 3 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2341 : else
2342 824 : paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
2343 : }
2344 : else if constexpr (std::is_same<T, GByte>::value)
2345 : // ( eSrcDataType == GDT_Byte && nEntryCount < 256 )
2346 : {
2347 : // So we go here for a paletted or non-paletted byte band.
2348 : // The input values are then between 0 and 255.
2349 4251360 : int nMaxVal = 0;
2350 4251360 : int iMaxInd = -1;
2351 :
2352 : // The cost of this zeroing might be high. Perhaps we should
2353 : // just use the above generic case, and go to this one if the
2354 : // number of source pixels is large enough
2355 4251360 : std::fill(anVals.begin(), anVals.end(), 0);
2356 :
2357 12777700 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2358 : {
2359 8526370 : const GPtrDiff_t iTotYOff =
2360 8526370 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2361 8526370 : nChunkXOff;
2362 25649400 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2363 : {
2364 17123000 : const T val = paSrcScanline[iX + iTotYOff];
2365 17123000 : if (!bHasNoData || val != tNoDataValue)
2366 : {
2367 17123000 : int nVal = static_cast<int>(val);
2368 17123000 : if (++anVals[nVal] > nMaxVal)
2369 : {
2370 : // Sum the density.
2371 : // Is it the most common value so far?
2372 17006300 : iMaxInd = nVal;
2373 17006300 : nMaxVal = anVals[nVal];
2374 : }
2375 : }
2376 : }
2377 : }
2378 :
2379 4251360 : if (iMaxInd == -1)
2380 0 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2381 : else
2382 4251360 : paDstScanline[iDstPixel - nDstXOff] =
2383 : static_cast<T>(iMaxInd);
2384 : }
2385 : }
2386 : }
2387 :
2388 136 : CPLFree(paVals);
2389 136 : CPLFree(panSums);
2390 :
2391 136 : return CE_None;
2392 : }
2393 :
2394 136 : static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
2395 : const void *pChunk, void **ppDstBuffer,
2396 : GDALDataType *peDstBufferDataType)
2397 : {
2398 136 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
2399 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
2400 : GDALGetDataTypeSizeBytes(args.eWrkDataType));
2401 136 : if (*ppDstBuffer == nullptr)
2402 : {
2403 0 : return CE_Failure;
2404 : }
2405 :
2406 136 : CPLAssert(args.eSrcDataType == args.eWrkDataType);
2407 :
2408 136 : *peDstBufferDataType = args.eWrkDataType;
2409 136 : switch (args.eWrkDataType)
2410 : {
2411 : // For mode resampling, as no computation is done, only the
2412 : // size of the data type matters... except for Byte where we have
2413 : // special processing. And for floating point values
2414 65 : case GDT_Byte:
2415 : {
2416 65 : return GDALResampleChunk_ModeT(args,
2417 : static_cast<const GByte *>(pChunk),
2418 65 : static_cast<GByte *>(*ppDstBuffer));
2419 : }
2420 :
2421 4 : case GDT_Int8:
2422 : {
2423 4 : return GDALResampleChunk_ModeT(args,
2424 : static_cast<const int8_t *>(pChunk),
2425 4 : static_cast<int8_t *>(*ppDstBuffer));
2426 : }
2427 :
2428 9 : case GDT_Int16:
2429 : case GDT_UInt16:
2430 : case GDT_Float16:
2431 : {
2432 9 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
2433 9 : return GDALResampleChunk_ModeT(
2434 : args, static_cast<const uint16_t *>(pChunk),
2435 9 : static_cast<uint16_t *>(*ppDstBuffer));
2436 : }
2437 :
2438 15 : case GDT_CInt16:
2439 : case GDT_CFloat16:
2440 : case GDT_Int32:
2441 : case GDT_UInt32:
2442 : {
2443 15 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
2444 15 : return GDALResampleChunk_ModeT(
2445 : args, static_cast<const uint32_t *>(pChunk),
2446 15 : static_cast<uint32_t *>(*ppDstBuffer));
2447 : }
2448 :
2449 17 : case GDT_Float32:
2450 : {
2451 17 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
2452 17 : return GDALResampleChunk_ModeT(args,
2453 : static_cast<const float *>(pChunk),
2454 17 : static_cast<float *>(*ppDstBuffer));
2455 : }
2456 :
2457 12 : case GDT_CInt32:
2458 : case GDT_Int64:
2459 : case GDT_UInt64:
2460 : {
2461 12 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
2462 12 : return GDALResampleChunk_ModeT(
2463 : args, static_cast<const uint64_t *>(pChunk),
2464 12 : static_cast<uint64_t *>(*ppDstBuffer));
2465 : }
2466 :
2467 6 : case GDT_Float64:
2468 : {
2469 6 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
2470 6 : return GDALResampleChunk_ModeT(args,
2471 : static_cast<const double *>(pChunk),
2472 6 : static_cast<double *>(*ppDstBuffer));
2473 : }
2474 :
2475 4 : case GDT_CFloat32:
2476 : {
2477 4 : return GDALResampleChunk_ModeT(
2478 : args, static_cast<const std::complex<float> *>(pChunk),
2479 4 : static_cast<std::complex<float> *>(*ppDstBuffer));
2480 : }
2481 :
2482 4 : case GDT_CFloat64:
2483 : {
2484 4 : return GDALResampleChunk_ModeT(
2485 : args, static_cast<const std::complex<double> *>(pChunk),
2486 4 : static_cast<std::complex<double> *>(*ppDstBuffer));
2487 : }
2488 :
2489 0 : case GDT_Unknown:
2490 : case GDT_TypeCount:
2491 0 : break;
2492 : }
2493 :
2494 0 : CPLAssert(false);
2495 : return CE_Failure;
2496 : }
2497 :
2498 : /************************************************************************/
2499 : /* GDALResampleConvolutionHorizontal() */
2500 : /************************************************************************/
2501 :
/** Weighted sum of nSrcPixelCount consecutive values.
 *
 * Computes sum(pChunk[k] * padfWeights[k]) for k in [0, nSrcPixelCount).
 * Two partial sums are used: within each group of 4 values, the first two
 * products go to one and the last two to the other; the remaining values
 * (fewer than 4) go to the first one.
 *
 * @param pChunk Source values.
 * @param padfWeights Weights, one per source value.
 * @param nSrcPixelCount Number of values; nothing is read when <= 0.
 * @return The weighted sum (0 when nSrcPixelCount <= 0).
 */
template <class T>
static inline double
GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                  int nSrcPixelCount)
{
    double dfSumLow = 0.0;   // Values 0 and 1 of each group + leftover values.
    double dfSumHigh = 0.0;  // Values 2 and 3 of each group.
    int iPixel = 0;
    // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    // manually (untypical) unrolled loop in -O2 and -O3:
    // https://github.com/OSGeo/gdal/issues/9508
#if !defined(__INTEL_CLANG_COMPILER)
    while (iPixel + 3 < nSrcPixelCount)
    {
        const T *const pSrc = pChunk + iPixel;
        const double *const pdfW = padfWeights + iPixel;
        dfSumLow += pSrc[0] * pdfW[0];
        dfSumLow += pSrc[1] * pdfW[1];
        dfSumHigh += pSrc[2] * pdfW[2];
        dfSumHigh += pSrc[3] * pdfW[3];
        iPixel += 4;
    }
#endif
    while (iPixel < nSrcPixelCount)
    {
        dfSumLow += pChunk[iPixel] * padfWeights[iPixel];
        ++iPixel;
    }
    return dfSumLow + dfSumHigh;
}
2528 :
2529 : template <class T>
2530 44576 : static inline void GDALResampleConvolutionHorizontalWithMask(
2531 : const T *pChunk, const GByte *pabyMask, const double *padfWeights,
2532 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2533 : {
2534 44576 : dfVal = 0;
2535 44576 : dfWeightSum = 0;
2536 44576 : int i = 0;
2537 98300 : for (; i + 3 < nSrcPixelCount; i += 4)
2538 : {
2539 53724 : const double dfWeight0 = padfWeights[i] * pabyMask[i];
2540 53724 : const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
2541 53724 : const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
2542 53724 : const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
2543 53724 : dfVal += pChunk[i] * dfWeight0;
2544 53724 : dfVal += pChunk[i + 1] * dfWeight1;
2545 53724 : dfVal += pChunk[i + 2] * dfWeight2;
2546 53724 : dfVal += pChunk[i + 3] * dfWeight3;
2547 53724 : dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
2548 : }
2549 61162 : for (; i < nSrcPixelCount; ++i)
2550 : {
2551 16586 : const double dfWeight = padfWeights[i] * pabyMask[i];
2552 16586 : dfVal += pChunk[i] * dfWeight;
2553 16586 : dfWeightSum += dfWeight;
2554 : }
2555 44576 : }
2556 :
/** Weighted sums of three rows sharing the same weights.
 *
 * For each row, computes sum(pChunkRowN[k] * padfWeights[k]) for k in
 * [0, nSrcPixelCount), with two partial sums per row: within each group of 4
 * values, the first two products go to one and the last two to the other;
 * the remaining values (fewer than 4) go to the first one.
 *
 * @param pChunkRow1 First row of source values.
 * @param pChunkRow2 Second row of source values.
 * @param pChunkRow3 Third row of source values.
 * @param padfWeights Weights, one per column.
 * @param nSrcPixelCount Number of columns; nothing is read when <= 0.
 * @param dfRes1 Output: weighted sum of the first row.
 * @param dfRes2 Output: weighted sum of the second row.
 * @param dfRes3 Output: weighted sum of the third row.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontal_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    // "Low" sums receive values 0 and 1 of each group and the leftover
    // values, "high" sums receive values 2 and 3 of each group.
    double dfRow1Low = 0.0;
    double dfRow1High = 0.0;
    double dfRow2Low = 0.0;
    double dfRow2High = 0.0;
    double dfRow3Low = 0.0;
    double dfRow3High = 0.0;

    int iPixel = 0;
    while (iPixel + 3 < nSrcPixelCount)
    {
        const double dfW0 = padfWeights[iPixel];
        const double dfW1 = padfWeights[iPixel + 1];
        const double dfW2 = padfWeights[iPixel + 2];
        const double dfW3 = padfWeights[iPixel + 3];

        dfRow1Low += pChunkRow1[iPixel] * dfW0;
        dfRow1Low += pChunkRow1[iPixel + 1] * dfW1;
        dfRow1High += pChunkRow1[iPixel + 2] * dfW2;
        dfRow1High += pChunkRow1[iPixel + 3] * dfW3;

        dfRow2Low += pChunkRow2[iPixel] * dfW0;
        dfRow2Low += pChunkRow2[iPixel + 1] * dfW1;
        dfRow2High += pChunkRow2[iPixel + 2] * dfW2;
        dfRow2High += pChunkRow2[iPixel + 3] * dfW3;

        dfRow3Low += pChunkRow3[iPixel] * dfW0;
        dfRow3Low += pChunkRow3[iPixel + 1] * dfW1;
        dfRow3High += pChunkRow3[iPixel + 2] * dfW2;
        dfRow3High += pChunkRow3[iPixel + 3] * dfW3;

        iPixel += 4;
    }

    while (iPixel < nSrcPixelCount)
    {
        const double dfW = padfWeights[iPixel];
        dfRow1Low += pChunkRow1[iPixel] * dfW;
        dfRow2Low += pChunkRow2[iPixel] * dfW;
        dfRow3Low += pChunkRow3[iPixel] * dfW;
        ++iPixel;
    }

    dfRes1 = dfRow1Low + dfRow1High;
    dfRes2 = dfRow2Low + dfRow2High;
    dfRes3 = dfRow3Low + dfRow3High;
}
2595 :
/** Three-row weighted sum entry point for short kernels.
 *
 * Generic version: plain forward to
 * GDALResampleConvolutionHorizontal_3rows() with the same arguments.
 * NOTE(review): the name suggests callers use it when nSrcPixelCount < 8 --
 * not enforced here, confirm against the call sites.
 *
 * @param pChunkRow1 First row of source values.
 * @param pChunkRow2 Second row of source values.
 * @param pChunkRow3 Third row of source values.
 * @param padfWeights Weights, one per column.
 * @param nSrcPixelCount Number of columns.
 * @param dfRes1 Output: weighted sum of the first row.
 * @param dfRes2 Output: weighted sum of the second row.
 * @param dfRes3 Output: weighted sum of the third row.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    // No dedicated short-kernel code for the generic type.
    GDALResampleConvolutionHorizontal_3rows(
        pChunkRow1, pChunkRow2, pChunkRow3,  // source rows
        padfWeights, nSrcPixelCount,         // kernel
        dfRes1, dfRes2, dfRes3);             // results
}
2606 :
// Generic version of the three-row horizontal convolution for a kernel of
// exactly four source pixels: forwards to
// GDALResampleConvolutionHorizontal_3rows() with a pixel count of 4.
template <class T>
static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
{
    constexpr int nFixedPixelCount = 4;
    GDALResampleConvolutionHorizontal_3rows(
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeights, nFixedPixelCount,
        dfRes1, dfRes2, dfRes3);
}
2616 :
2617 : /************************************************************************/
2618 : /* GDALResampleConvolutionVertical() */
2619 : /************************************************************************/
2620 :
// Vertical convolution of a single column.
//
// Returns the sum over k in [0, nSrcLineCount) of
// pChunk[k * nStride] * padfWeights[k], i.e. successive samples are nStride
// elements apart in pChunk.
//
// Lines are consumed four at a time with two partial sums (lines 0-1 of each
// group go to the first, lines 2-3 to the second); leftover lines go to the
// first partial sum. The two partial sums are added at the end.
template <class T>
static inline double
GDALResampleConvolutionVertical(const T *pChunk, int nStride,
                                const double *padfWeights, int nSrcLineCount)
{
    double dfSumA = 0.0;
    double dfSumB = 0.0;
    int iLine = 0;
    int nOffset = 0;  // Element offset of line iLine within pChunk.

    while (iLine + 3 < nSrcLineCount)
    {
        const T *const pLine = pChunk + nOffset;
        const double *const pdfW = padfWeights + iLine;

        dfSumA += pLine[0] * pdfW[0];
        dfSumA += pLine[nStride] * pdfW[1];
        dfSumB += pLine[2 * nStride] * pdfW[2];
        dfSumB += pLine[3 * nStride] * pdfW[3];

        iLine += 4;
        nOffset += 4 * nStride;
    }

    while (iLine < nSrcLineCount)
    {
        dfSumA += pChunk[nOffset] * padfWeights[iLine];
        ++iLine;
        nOffset += nStride;
    }

    return dfSumA + dfSumB;
}
2643 :
// Vertical convolution of two adjacent columns sharing one set of weights.
//
// dfRes1 receives the sum over k in [0, nSrcLineCount) of
// pChunk[k * nStride] * padfWeights[k]; dfRes2 receives the same sum for the
// next column, pChunk[k * nStride + 1]. Both outputs are overwritten.
//
// Each column uses two partial sums (lines 0-1 and lines 2-3 of every group
// of four; leftover lines go to the first partial sum), added at the end.
template <class T>
static inline void GDALResampleConvolutionVertical_2cols(
    const T *pChunk, int nStride, const double *padfWeights, int nSrcLineCount,
    double &dfRes1, double &dfRes2)
{
    double dfCol0A = 0.0;
    double dfCol0B = 0.0;
    double dfCol1A = 0.0;
    double dfCol1B = 0.0;
    int iLine = 0;
    int nOffset = 0;  // Element offset of line iLine within pChunk.

    while (iLine + 3 < nSrcLineCount)
    {
        const T *const pLine = pChunk + nOffset;
        const double *const pdfW = padfWeights + iLine;

        dfCol0A += pLine[0] * pdfW[0];
        dfCol1A += pLine[1] * pdfW[0];
        dfCol0A += pLine[nStride] * pdfW[1];
        dfCol1A += pLine[1 + nStride] * pdfW[1];
        dfCol0B += pLine[2 * nStride] * pdfW[2];
        dfCol1B += pLine[1 + 2 * nStride] * pdfW[2];
        dfCol0B += pLine[3 * nStride] * pdfW[3];
        dfCol1B += pLine[1 + 3 * nStride] * pdfW[3];

        iLine += 4;
        nOffset += 4 * nStride;
    }

    while (iLine < nSrcLineCount)
    {
        dfCol0A += pChunk[nOffset] * padfWeights[iLine];
        dfCol1A += pChunk[nOffset + 1] * padfWeights[iLine];
        ++iLine;
        nOffset += nStride;
    }

    dfRes1 = dfCol0A + dfCol0B;
    dfRes2 = dfCol1A + dfCol1B;
}
2674 :
2675 : #ifdef USE_SSE2
2676 :
2677 : #ifdef __AVX__
2678 : /************************************************************************/
2679 : /* GDALResampleConvolutionVertical_16cols<T> */
2680 : /************************************************************************/
2681 :
2682 : template <class T>
2683 : static inline void
2684 : GDALResampleConvolutionVertical_16cols(const T *pChunk, int nStride,
2685 : const double *padfWeights,
2686 : int nSrcLineCount, float *afDest)
2687 : {
2688 : int i = 0;
2689 : int j = 0;
2690 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2691 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2692 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2693 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2694 : for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
2695 : {
2696 : XMMReg4Double w0 =
2697 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2698 : XMMReg4Double w1 =
2699 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2700 : XMMReg4Double w2 =
2701 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2702 : XMMReg4Double w3 =
2703 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2704 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2705 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2706 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
2707 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
2708 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2709 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2710 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
2711 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
2712 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2713 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2714 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
2715 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
2716 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2717 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2718 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
2719 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
2720 : }
2721 : for (; i < nSrcLineCount; ++i, j += nStride)
2722 : {
2723 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2724 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2725 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2726 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
2727 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
2728 : }
2729 : v_acc0.Store4Val(afDest);
2730 : v_acc1.Store4Val(afDest + 4);
2731 : v_acc2.Store4Val(afDest + 8);
2732 : v_acc3.Store4Val(afDest + 12);
2733 : }
2734 :
2735 : template <class T>
2736 : static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
2737 : const double *, int,
2738 : double *)
2739 : {
2740 : // Cannot be reached
2741 : CPLAssert(false);
2742 : }
2743 :
2744 : #else
2745 :
2746 : /************************************************************************/
2747 : /* GDALResampleConvolutionVertical_8cols<T> */
2748 : /************************************************************************/
2749 :
2750 : template <class T>
2751 : static inline void
2752 21404100 : GDALResampleConvolutionVertical_8cols(const T *pChunk, int nStride,
2753 : const double *padfWeights,
2754 : int nSrcLineCount, float *afDest)
2755 : {
2756 21404100 : int i = 0;
2757 21404100 : int j = 0;
2758 21404100 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2759 21372700 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2760 40878800 : for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
2761 : {
2762 19489500 : XMMReg4Double w0 =
2763 19489500 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2764 19472200 : XMMReg4Double w1 =
2765 19472200 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2766 19477400 : XMMReg4Double w2 =
2767 19477400 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2768 19479600 : XMMReg4Double w3 =
2769 19479600 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2770 19482100 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2771 19463000 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2772 19459600 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2773 19442200 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2774 19469400 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2775 19470100 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2776 19457900 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2777 19453100 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2778 : }
2779 32915400 : for (; i < nSrcLineCount; ++i, j += nStride)
2780 : {
2781 11526100 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2782 11526100 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2783 11526100 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2784 : }
2785 21389300 : v_acc0.Store4Val(afDest);
2786 21381200 : v_acc1.Store4Val(afDest + 4);
2787 21407600 : }
2788 :
2789 : template <class T>
2790 : static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
2791 : const double *, int,
2792 : double *)
2793 : {
2794 : // Cannot be reached
2795 : CPLAssert(false);
2796 : }
2797 :
2798 : #endif // __AVX__
2799 :
2800 : /************************************************************************/
2801 : /* GDALResampleConvolutionHorizontalSSE2<T> */
2802 : /************************************************************************/
2803 :
2804 : template <class T>
2805 2987566 : static inline double GDALResampleConvolutionHorizontalSSE2(
2806 : const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2807 : {
2808 2987566 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2809 2987142 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2810 2986749 : int i = 0; // Used after for.
2811 3213386 : for (; i + 7 < nSrcPixelCount; i += 8)
2812 : {
2813 : // Retrieve the pixel & accumulate
2814 226604 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
2815 226606 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
2816 226604 : const XMMReg4Double v_weight1 =
2817 226604 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2818 226603 : const XMMReg4Double v_weight2 =
2819 226603 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
2820 :
2821 226606 : v_acc1 += v_pixels1 * v_weight1;
2822 226600 : v_acc2 += v_pixels2 * v_weight2;
2823 : }
2824 :
2825 2986784 : v_acc1 += v_acc2;
2826 :
2827 2987330 : double dfVal = v_acc1.GetHorizSum();
2828 10151620 : for (; i < nSrcPixelCount; ++i)
2829 : {
2830 7164480 : dfVal += pChunk[i] * padfWeightsAligned[i];
2831 : }
2832 2987141 : return dfVal;
2833 : }
2834 :
2835 : /************************************************************************/
2836 : /* GDALResampleConvolutionHorizontal<GByte> */
2837 : /************************************************************************/
2838 :
2839 : template <>
2840 2438940 : inline double GDALResampleConvolutionHorizontal<GByte>(
2841 : const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2842 : {
2843 2438940 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2844 2438930 : nSrcPixelCount);
2845 : }
2846 :
2847 : template <>
2848 548694 : inline double GDALResampleConvolutionHorizontal<GUInt16>(
2849 : const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2850 : {
2851 548694 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2852 548752 : nSrcPixelCount);
2853 : }
2854 :
2855 : /************************************************************************/
2856 : /* GDALResampleConvolutionHorizontalWithMaskSSE2<T> */
2857 : /************************************************************************/
2858 :
2859 : template <class T>
2860 7067643 : static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
2861 : const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
2862 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2863 : {
2864 7067643 : int i = 0; // Used after for.
2865 7067643 : XMMReg4Double v_acc = XMMReg4Double::Zero();
2866 7066693 : XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
2867 19752321 : for (; i + 3 < nSrcPixelCount; i += 4)
2868 : {
2869 12685358 : const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
2870 12670558 : const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
2871 12686658 : XMMReg4Double v_weight =
2872 12686658 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2873 12685158 : v_weight *= v_mask;
2874 12671358 : v_acc += v_pixels * v_weight;
2875 12685558 : v_acc_weight += v_weight;
2876 : }
2877 :
2878 7066933 : dfVal = v_acc.GetHorizSum();
2879 7066173 : dfWeightSum = v_acc_weight.GetHorizSum();
2880 7297133 : for (; i < nSrcPixelCount; ++i)
2881 : {
2882 231086 : const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
2883 231086 : dfVal += pChunk[i] * dfWeight;
2884 231086 : dfWeightSum += dfWeight;
2885 : }
2886 7066043 : }
2887 :
2888 : /************************************************************************/
2889 : /* GDALResampleConvolutionHorizontalWithMask<GByte> */
2890 : /************************************************************************/
2891 :
2892 : template <>
2893 7067210 : inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
2894 : const GByte *pChunk, const GByte *pabyMask,
2895 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2896 : double &dfWeightSum)
2897 : {
2898 7067210 : GDALResampleConvolutionHorizontalWithMaskSSE2(
2899 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2900 : dfWeightSum);
2901 7066510 : }
2902 :
2903 : template <>
2904 63 : inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
2905 : const GUInt16 *pChunk, const GByte *pabyMask,
2906 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2907 : double &dfWeightSum)
2908 : {
2909 63 : GDALResampleConvolutionHorizontalWithMaskSSE2(
2910 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2911 : dfWeightSum);
2912 63 : }
2913 :
2914 : /************************************************************************/
2915 : /* GDALResampleConvolutionHorizontal_3rows_SSE2<T> */
2916 : /************************************************************************/
2917 :
2918 : template <class T>
2919 16989430 : static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
2920 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2921 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2922 : double &dfRes2, double &dfRes3)
2923 : {
2924 16989430 : XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
2925 16971330 : v_acc2 = XMMReg4Double::Zero(),
2926 16984230 : v_acc3 = XMMReg4Double::Zero();
2927 16986330 : int i = 0;
2928 33843766 : for (; i + 7 < nSrcPixelCount; i += 8)
2929 : {
2930 : // Retrieve the pixel & accumulate.
2931 16870536 : XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
2932 16897636 : XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
2933 16901936 : const XMMReg4Double v_weight1 =
2934 16901936 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2935 16881736 : const XMMReg4Double v_weight2 =
2936 16881736 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
2937 :
2938 16891436 : v_acc1 += v_pixels1 * v_weight1;
2939 16871036 : v_acc1 += v_pixels2 * v_weight2;
2940 :
2941 16875936 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
2942 16877936 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
2943 16886436 : v_acc2 += v_pixels1 * v_weight1;
2944 16881636 : v_acc2 += v_pixels2 * v_weight2;
2945 :
2946 16878336 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
2947 16882136 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
2948 16892736 : v_acc3 += v_pixels1 * v_weight1;
2949 16871336 : v_acc3 += v_pixels2 * v_weight2;
2950 : }
2951 :
2952 16973230 : dfRes1 = v_acc1.GetHorizSum();
2953 16961030 : dfRes2 = v_acc2.GetHorizSum();
2954 16972230 : dfRes3 = v_acc3.GetHorizSum();
2955 28692526 : for (; i < nSrcPixelCount; ++i)
2956 : {
2957 11718796 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
2958 11718796 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
2959 11718796 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
2960 : }
2961 16973730 : }
2962 :
2963 : /************************************************************************/
2964 : /* GDALResampleConvolutionHorizontal_3rows<GByte> */
2965 : /************************************************************************/
2966 :
2967 : template <>
2968 17000100 : inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
2969 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
2970 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2971 : double &dfRes2, double &dfRes3)
2972 : {
2973 17000100 : GDALResampleConvolutionHorizontal_3rows_SSE2(
2974 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
2975 : dfRes1, dfRes2, dfRes3);
2976 16954300 : }
2977 :
2978 : template <>
2979 30 : inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
2980 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
2981 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
2982 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
2983 : {
2984 30 : GDALResampleConvolutionHorizontal_3rows_SSE2(
2985 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
2986 : dfRes1, dfRes2, dfRes3);
2987 30 : }
2988 :
2989 : /************************************************************************/
2990 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T> */
2991 : /************************************************************************/
2992 :
2993 : template <class T>
2994 3600705 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
2995 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2996 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2997 : double &dfRes2, double &dfRes3)
2998 : {
2999 3600705 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3000 3600607 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3001 3600648 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
3002 3600616 : int i = 0; // Use after for.
3003 6419107 : for (; i + 3 < nSrcPixelCount; i += 4)
3004 : {
3005 : // Retrieve the pixel & accumulate.
3006 2818480 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3007 2818480 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3008 2818480 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3009 2818480 : const XMMReg4Double v_weight =
3010 2818480 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3011 :
3012 2818480 : v_acc1 += v_pixels1 * v_weight;
3013 2818480 : v_acc2 += v_pixels2 * v_weight;
3014 2818480 : v_acc3 += v_pixels3 * v_weight;
3015 : }
3016 :
3017 3600627 : dfRes1 = v_acc1.GetHorizSum();
3018 3600574 : dfRes2 = v_acc2.GetHorizSum();
3019 3600605 : dfRes3 = v_acc3.GetHorizSum();
3020 :
3021 7983110 : for (; i < nSrcPixelCount; ++i)
3022 : {
3023 4382532 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3024 4382532 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3025 4382532 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3026 : }
3027 3600578 : }
3028 :
3029 : /************************************************************************/
3030 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte> */
3031 : /************************************************************************/
3032 :
3033 : template <>
3034 3533580 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
3035 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3036 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3037 : double &dfRes2, double &dfRes3)
3038 : {
3039 3533580 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3040 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3041 : dfRes1, dfRes2, dfRes3);
3042 3533720 : }
3043 :
3044 : template <>
3045 66920 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
3046 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3047 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3048 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3049 : {
3050 66920 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3051 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3052 : dfRes1, dfRes2, dfRes3);
3053 67044 : }
3054 :
3055 : /************************************************************************/
3056 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T> */
3057 : /************************************************************************/
3058 :
3059 : template <class T>
3060 13860200 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3061 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3062 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3063 : double &dfRes3)
3064 : {
3065 13860200 : const XMMReg4Double v_weight =
3066 : XMMReg4Double::Load4ValAligned(padfWeightsAligned);
3067 :
3068 : // Retrieve the pixel & accumulate.
3069 13864860 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
3070 13901430 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
3071 13882600 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
3072 :
3073 13876000 : XMMReg4Double v_acc1 = v_pixels1 * v_weight;
3074 13818110 : XMMReg4Double v_acc2 = v_pixels2 * v_weight;
3075 13867390 : XMMReg4Double v_acc3 = v_pixels3 * v_weight;
3076 :
3077 13807110 : dfRes1 = v_acc1.GetHorizSum();
3078 13860060 : dfRes2 = v_acc2.GetHorizSum();
3079 13873970 : dfRes3 = v_acc3.GetHorizSum();
3080 13875300 : }
3081 :
3082 : /************************************************************************/
3083 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte> */
3084 : /************************************************************************/
3085 :
3086 : template <>
3087 8262320 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
3088 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3089 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3090 : double &dfRes3)
3091 : {
3092 8262320 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3093 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3094 : dfRes3);
3095 8253720 : }
3096 :
3097 : template <>
3098 5601960 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
3099 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3100 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
3101 : double &dfRes2, double &dfRes3)
3102 : {
3103 5601960 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3104 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3105 : dfRes3);
3106 5626860 : }
3107 :
3108 : #endif // USE_SSE2
3109 :
3110 : /************************************************************************/
3111 : /* GDALResampleChunk_Convolution() */
3112 : /************************************************************************/
3113 :
3114 : template <class T, class Twork, GDALDataType eWrkDataType>
3115 4470 : static CPLErr GDALResampleChunk_ConvolutionT(
3116 : const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
3117 : FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
3118 : int nKernelRadius, bool bKernelWithNegativeWeights, float fMaxVal)
3119 :
3120 : {
3121 4470 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
3122 4470 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
3123 4470 : const double dfSrcXDelta = args.dfSrcXDelta;
3124 4470 : const double dfSrcYDelta = args.dfSrcYDelta;
3125 4470 : constexpr int nBands = 1;
3126 4470 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
3127 4470 : const int nChunkXOff = args.nChunkXOff;
3128 4470 : const int nChunkXSize = args.nChunkXSize;
3129 4470 : const int nChunkYOff = args.nChunkYOff;
3130 4470 : const int nChunkYSize = args.nChunkYSize;
3131 4470 : const int nDstXOff = args.nDstXOff;
3132 4470 : const int nDstXOff2 = args.nDstXOff2;
3133 4470 : const int nDstYOff = args.nDstYOff;
3134 4470 : const int nDstYOff2 = args.nDstYOff2;
3135 4470 : const bool bHasNoData = args.bHasNoData;
3136 4470 : double dfNoDataValue = args.dfNoDataValue;
3137 :
3138 4470 : if (!bHasNoData)
3139 4387 : dfNoDataValue = 0.0;
3140 4470 : const auto dstDataType = args.eOvrDataType;
3141 4470 : const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
3142 4467 : const double dfReplacementVal =
3143 75 : bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
3144 : : dfNoDataValue;
3145 : // cppcheck-suppress unreadVariable
3146 4467 : const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
3147 4455 : const bool bNoDataValueInt64Valid =
3148 4463 : isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
3149 4455 : const auto nNodataValueInt64 =
3150 : bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
3151 4455 : constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
3152 :
3153 : // TODO: we should have some generic function to do this.
3154 4455 : Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
3155 4455 : Twork fDstMax = cpl::NumericLimits<Twork>::max();
3156 4455 : if (dstDataType == GDT_Byte)
3157 : {
3158 3733 : fDstMin = std::numeric_limits<GByte>::min();
3159 3730 : fDstMax = std::numeric_limits<GByte>::max();
3160 : }
3161 725 : else if (dstDataType == GDT_Int8)
3162 : {
3163 1 : fDstMin = std::numeric_limits<GInt8>::min();
3164 1 : fDstMax = std::numeric_limits<GInt8>::max();
3165 : }
3166 724 : else if (dstDataType == GDT_UInt16)
3167 : {
3168 388 : fDstMin = std::numeric_limits<GUInt16>::min();
3169 385 : fDstMax = std::numeric_limits<GUInt16>::max();
3170 : }
3171 341 : else if (dstDataType == GDT_Int16)
3172 : {
3173 291 : fDstMin = std::numeric_limits<GInt16>::min();
3174 291 : fDstMax = std::numeric_limits<GInt16>::max();
3175 : }
3176 50 : else if (dstDataType == GDT_UInt32)
3177 : {
3178 1 : fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
3179 1 : fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
3180 : }
3181 49 : else if (dstDataType == GDT_Int32)
3182 : {
3183 : // cppcheck-suppress unreadVariable
3184 2 : fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
3185 : // cppcheck-suppress unreadVariable
3186 2 : fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
3187 : }
3188 47 : else if (dstDataType == GDT_UInt64)
3189 : {
3190 : // cppcheck-suppress unreadVariable
3191 1 : fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
3192 : // cppcheck-suppress unreadVariable
3193 1 : fDstMax = static_cast<Twork>(std::numeric_limits<uint64_t>::max());
3194 : }
3195 46 : else if (dstDataType == GDT_Int64)
3196 : {
3197 : // cppcheck-suppress unreadVariable
3198 1 : fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
3199 : // cppcheck-suppress unreadVariable
3200 1 : fDstMax = static_cast<Twork>(std::numeric_limits<int64_t>::max());
3201 : }
3202 :
3203 37021372 : auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
3204 : bNoDataValueInt64Valid, nNodataValueInt64,
3205 : dfNoDataValue, dfReplacementVal](Twork fVal)
3206 : {
3207 16036800 : if (!bHasNoData)
3208 11838800 : return fVal;
3209 :
3210 : // Clamp value before comparing to nodata: this is only needed for
3211 : // kernels with negative weights (Lanczos)
3212 4197970 : Twork fClamped = fVal;
3213 4197970 : if (fClamped < fDstMin)
3214 15998 : fClamped = fDstMin;
3215 4181970 : else if (fClamped > fDstMax)
3216 16406 : fClamped = fDstMax;
3217 4197970 : if (isIntegerDT)
3218 : {
3219 8381680 : if (bNoDataValueInt64Valid &&
3220 4197870 : nNodataValueInt64 == static_cast<GInt64>(std::round(fClamped)))
3221 : {
3222 : // Do not use the nodata value
3223 14435 : return static_cast<Twork>(dfReplacementVal);
3224 : }
3225 : }
3226 6165 : else if (dfNoDataValue == fClamped)
3227 : {
3228 : // Do not use the nodata value
3229 1 : return static_cast<Twork>(dfReplacementVal);
3230 : }
3231 4175550 : return fClamped;
3232 : };
3233 :
3234 : /* -------------------------------------------------------------------- */
3235 : /* Allocate work buffers. */
3236 : /* -------------------------------------------------------------------- */
3237 4465 : const int nDstXSize = nDstXOff2 - nDstXOff;
3238 4465 : Twork *pafWrkScanline = nullptr;
3239 4465 : if (dstDataType != eWrkDataType)
3240 : {
3241 : pafWrkScanline =
3242 4420 : static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
3243 4423 : if (pafWrkScanline == nullptr)
3244 0 : return CE_Failure;
3245 : }
3246 :
3247 4468 : const double dfXScale = 1.0 / dfXRatioDstToSrc;
3248 4468 : const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
3249 4468 : const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
3250 4468 : const double dfYScale = 1.0 / dfYRatioDstToSrc;
3251 4468 : const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
3252 4468 : const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
3253 :
3254 : // Temporary array to store result of horizontal filter.
3255 : double *padfHorizontalFiltered = static_cast<double *>(
3256 4468 : VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
3257 :
3258 : // To store convolution coefficients.
3259 4471 : double *padfWeights = static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
3260 : static_cast<int>(2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) +
3261 : 0.5) *
3262 : sizeof(double)));
3263 :
3264 4467 : GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
3265 4467 : if (pabyChunkNodataMask)
3266 : pabyChunkNodataMaskHorizontalFiltered =
3267 462 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
3268 4467 : if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
3269 462 : (pabyChunkNodataMask != nullptr &&
3270 : pabyChunkNodataMaskHorizontalFiltered == nullptr))
3271 : {
3272 4 : VSIFree(pafWrkScanline);
3273 0 : VSIFree(padfHorizontalFiltered);
3274 0 : VSIFreeAligned(padfWeights);
3275 0 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3276 0 : return CE_Failure;
3277 : }
3278 :
3279 : /* ==================================================================== */
3280 : /* First pass: horizontal filter */
3281 : /* ==================================================================== */
3282 4464 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
3283 : #ifdef USE_SSE2
3284 4464 : bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
3285 : #endif
3286 2919188 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
3287 : {
3288 2914710 : const double dfSrcPixel =
3289 2914710 : (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
3290 2914710 : int nSrcPixelStart =
3291 2914710 : static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
3292 2914710 : if (nSrcPixelStart < nChunkXOff)
3293 56693 : nSrcPixelStart = nChunkXOff;
3294 2914710 : int nSrcPixelStop =
3295 2914710 : static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
3296 2914710 : if (nSrcPixelStop > nChunkRightXOff)
3297 56714 : nSrcPixelStop = nChunkRightXOff;
3298 : #if 0
3299 : if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
3300 : {
3301 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3302 : }
3303 : if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
3304 : {
3305 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3306 : }
3307 : #endif
3308 2914710 : const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
3309 2914710 : double dfWeightSum = 0.0;
3310 :
3311 : // Compute convolution coefficients.
3312 2914710 : int nSrcPixel = nSrcPixelStart;
3313 2914710 : double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
3314 4057496 : for (; nSrcPixel + 3 < nSrcPixelStop; nSrcPixel += 4)
3315 : {
3316 1142685 : padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
3317 1142685 : dfX += dfXScaleWeight;
3318 1142685 : padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
3319 1142685 : dfX += dfXScaleWeight;
3320 1142685 : padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
3321 1142685 : dfX += dfXScaleWeight;
3322 1142685 : padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
3323 1142685 : dfX += dfXScaleWeight;
3324 1142780 : dfWeightSum +=
3325 1142685 : pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
3326 : }
3327 6902815 : for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
3328 : {
3329 3988570 : const double dfWeight = pfnFilterFunc(dfX);
3330 3988011 : padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3331 3988011 : dfWeightSum += dfWeight;
3332 : }
3333 :
3334 2914245 : const int nHeight = nChunkYSize * nBands;
3335 2914245 : if (pabyChunkNodataMask == nullptr)
3336 : {
3337 2826730 : if (dfWeightSum != 0)
3338 : {
3339 2826748 : const double dfInvWeightSum = 1.0 / dfWeightSum;
3340 10735547 : for (int i = 0; i < nSrcPixelCount; ++i)
3341 7908785 : padfWeights[i] *= dfInvWeightSum;
3342 : }
3343 2826730 : int iSrcLineOff = 0;
3344 : #ifdef USE_SSE2
3345 2826730 : if (nSrcPixelCount == 4)
3346 : {
3347 15737504 : for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3348 : {
3349 15113176 : const GPtrDiff_t j =
3350 15113176 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3351 15113176 : (nSrcPixelStart - nChunkXOff);
3352 15113176 : double dfVal1 = 0.0;
3353 15113176 : double dfVal2 = 0.0;
3354 15113176 : double dfVal3 = 0.0;
3355 15113176 : GDALResampleConvolutionHorizontalPixelCount4_3rows(
3356 15113176 : pChunk + j, pChunk + j + nChunkXSize,
3357 15113176 : pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
3358 : dfVal2, dfVal3);
3359 15124056 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3360 15124056 : nDstXSize +
3361 15124056 : iDstPixel - nDstXOff] = dfVal1;
3362 15124056 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3363 15124056 : 1) *
3364 15124056 : nDstXSize +
3365 15124056 : iDstPixel - nDstXOff] = dfVal2;
3366 15124056 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3367 15124056 : 2) *
3368 15124056 : nDstXSize +
3369 15124056 : iDstPixel - nDstXOff] = dfVal3;
3370 : }
3371 : }
3372 2213285 : else if (bSrcPixelCountLess8)
3373 : {
3374 5663461 : for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3375 : {
3376 3619333 : const GPtrDiff_t j =
3377 3619333 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3378 3619333 : (nSrcPixelStart - nChunkXOff);
3379 3619333 : double dfVal1 = 0.0;
3380 3619333 : double dfVal2 = 0.0;
3381 3619333 : double dfVal3 = 0.0;
3382 3619333 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
3383 3619333 : pChunk + j, pChunk + j + nChunkXSize,
3384 3619333 : pChunk + j + 2 * nChunkXSize, padfWeights,
3385 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3386 3619575 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3387 3619575 : nDstXSize +
3388 3619575 : iDstPixel - nDstXOff] = dfVal1;
3389 3619575 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3390 3619575 : 1) *
3391 3619575 : nDstXSize +
3392 3619575 : iDstPixel - nDstXOff] = dfVal2;
3393 3619575 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3394 3619575 : 2) *
3395 3619575 : nDstXSize +
3396 3619575 : iDstPixel - nDstXOff] = dfVal3;
3397 : }
3398 : }
3399 : else
3400 : #endif
3401 : {
3402 17218239 : for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3403 : {
3404 17042830 : const GPtrDiff_t j =
3405 17042830 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3406 17042830 : (nSrcPixelStart - nChunkXOff);
3407 17042830 : double dfVal1 = 0.0;
3408 17042830 : double dfVal2 = 0.0;
3409 17042830 : double dfVal3 = 0.0;
3410 17042830 : GDALResampleConvolutionHorizontal_3rows(
3411 17042830 : pChunk + j, pChunk + j + nChunkXSize,
3412 17042830 : pChunk + j + 2 * nChunkXSize, padfWeights,
3413 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3414 17048930 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3415 17048930 : nDstXSize +
3416 17048930 : iDstPixel - nDstXOff] = dfVal1;
3417 17048930 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3418 17048930 : 1) *
3419 17048930 : nDstXSize +
3420 17048930 : iDstPixel - nDstXOff] = dfVal2;
3421 17048930 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3422 17048930 : 2) *
3423 17048930 : nDstXSize +
3424 17048930 : iDstPixel - nDstXOff] = dfVal3;
3425 : }
3426 : }
3427 5876499 : for (; iSrcLineOff < nHeight; ++iSrcLineOff)
3428 : {
3429 3032466 : const GPtrDiff_t j =
3430 3032466 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3431 3032466 : (nSrcPixelStart - nChunkXOff);
3432 6020156 : const double dfVal = GDALResampleConvolutionHorizontal(
3433 3032466 : pChunk + j, padfWeights, nSrcPixelCount);
3434 3032579 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3435 3032579 : nDstXSize +
3436 3032579 : iDstPixel - nDstXOff] = dfVal;
3437 : }
3438 : }
3439 : else
3440 : {
3441 20497568 : for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
3442 : {
3443 20412846 : const GPtrDiff_t j =
3444 20412846 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3445 20412846 : (nSrcPixelStart - nChunkXOff);
3446 :
3447 20412846 : if (bKernelWithNegativeWeights)
3448 : {
3449 19899712 : int nConsecutiveValid = 0;
3450 19899712 : int nMaxConsecutiveValid = 0;
3451 181965458 : for (int k = 0; k < nSrcPixelCount; k++)
3452 : {
3453 162065146 : if (pabyChunkNodataMask[j + k])
3454 48858253 : nConsecutiveValid++;
3455 113206793 : else if (nConsecutiveValid)
3456 : {
3457 108870 : nMaxConsecutiveValid = std::max(
3458 107790 : nMaxConsecutiveValid, nConsecutiveValid);
3459 108870 : nConsecutiveValid = 0;
3460 : }
3461 : }
3462 19889412 : nMaxConsecutiveValid =
3463 19900812 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
3464 19889412 : if (nMaxConsecutiveValid < nSrcPixelCount / 2)
3465 : {
3466 13314907 : const size_t nTempOffset =
3467 13314907 : static_cast<size_t>(iSrcLineOff) * nDstXSize +
3468 13314907 : iDstPixel - nDstXOff;
3469 13314907 : padfHorizontalFiltered[nTempOffset] = 0.0;
3470 13314907 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3471 13314907 : continue;
3472 : }
3473 : }
3474 :
3475 7087729 : double dfVal = 0.0;
3476 7087729 : GDALResampleConvolutionHorizontalWithMask(
3477 7087729 : pChunk + j, pabyChunkNodataMask + j, padfWeights,
3478 : nSrcPixelCount, dfVal, dfWeightSum);
3479 7095096 : const size_t nTempOffset =
3480 7095096 : static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
3481 7095096 : nDstXOff;
3482 7095096 : if (dfWeightSum > 0.0)
3483 : {
3484 7067156 : padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
3485 7067156 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
3486 : }
3487 : else
3488 : {
3489 28002 : padfHorizontalFiltered[nTempOffset] = 0.0;
3490 28002 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3491 : }
3492 : }
3493 : }
3494 : }
3495 :
3496 : /* ==================================================================== */
3497 : /* Second pass: vertical filter */
3498 : /* ==================================================================== */
3499 4473 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3500 :
3501 266292 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3502 : {
3503 261819 : Twork *const pafDstScanline =
3504 261819 : pafWrkScanline ? pafWrkScanline
3505 8421 : : static_cast<Twork *>(pDstBuffer) +
3506 8421 : (iDstLine - nDstYOff) * nDstXSize;
3507 :
3508 261819 : const double dfSrcLine =
3509 261819 : (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
3510 261819 : int nSrcLineStart =
3511 261819 : static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
3512 261819 : int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
3513 261819 : if (nSrcLineStart < nChunkYOff)
3514 2815 : nSrcLineStart = nChunkYOff;
3515 261819 : if (nSrcLineStop > nChunkBottomYOff)
3516 2859 : nSrcLineStop = nChunkBottomYOff;
3517 : #if 0
3518 : if( nSrcLineStart < nChunkYOff &&
3519 : nChunkYOff > 0 )
3520 : {
3521 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3522 : }
3523 : if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3524 : {
3525 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3526 : }
3527 : #endif
3528 261819 : const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
3529 261819 : double dfWeightSum = 0.0;
3530 :
3531 : // Compute convolution coefficients.
3532 261819 : int nSrcLine = nSrcLineStart; // Used after for.
3533 261819 : double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
3534 616063 : for (; nSrcLine + 3 < nSrcLineStop;
3535 354244 : nSrcLine += 4, dfY += 4 * dfYScaleWeight)
3536 : {
3537 354243 : padfWeights[nSrcLine - nSrcLineStart] = dfY;
3538 354243 : padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
3539 354243 : padfWeights[nSrcLine + 2 - nSrcLineStart] =
3540 354243 : dfY + 2 * dfYScaleWeight;
3541 354243 : padfWeights[nSrcLine + 3 - nSrcLineStart] =
3542 354243 : dfY + 3 * dfYScaleWeight;
3543 354244 : dfWeightSum +=
3544 354243 : pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
3545 : }
3546 297506 : for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
3547 : {
3548 35694 : const double dfWeight = pfnFilterFunc(dfY);
3549 35686 : padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
3550 35686 : dfWeightSum += dfWeight;
3551 : }
3552 :
3553 261812 : if (pabyChunkNodataMask == nullptr)
3554 : {
3555 222820 : if (dfWeightSum != 0)
3556 : {
3557 222803 : const double dfInvWeightSum = 1.0 / dfWeightSum;
3558 1402834 : for (int i = 0; i < nSrcLineCount; ++i)
3559 1180031 : padfWeights[i] *= dfInvWeightSum;
3560 : }
3561 : }
3562 :
3563 261812 : if (pabyChunkNodataMask == nullptr)
3564 : {
3565 222804 : int iFilteredPixelOff = 0; // Used after for.
3566 : // j used after for.
3567 222804 : size_t j =
3568 222804 : (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
3569 : #ifdef USE_SSE2
3570 : if constexpr (eWrkDataType == GDT_Float32)
3571 : {
3572 : #ifdef __AVX__
3573 : for (; iFilteredPixelOff + 15 < nDstXSize;
3574 : iFilteredPixelOff += 16, j += 16)
3575 : {
3576 : GDALResampleConvolutionVertical_16cols(
3577 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3578 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3579 : if (bHasNoData)
3580 : {
3581 : for (int k = 0; k < 16; k++)
3582 : {
3583 : pafDstScanline[iFilteredPixelOff + k] =
3584 : replaceValIfNodata(
3585 : pafDstScanline[iFilteredPixelOff + k]);
3586 : }
3587 : }
3588 : }
3589 : #else
3590 21591058 : for (; iFilteredPixelOff + 7 < nDstXSize;
3591 : iFilteredPixelOff += 8, j += 8)
3592 : {
3593 21418218 : GDALResampleConvolutionVertical_8cols(
3594 21418218 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3595 21418218 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3596 21375448 : if (bHasNoData)
3597 : {
3598 123192 : for (int k = 0; k < 8; k++)
3599 : {
3600 109504 : pafDstScanline[iFilteredPixelOff + k] =
3601 109504 : replaceValIfNodata(
3602 109504 : pafDstScanline[iFilteredPixelOff + k]);
3603 : }
3604 : }
3605 : }
3606 : #endif
3607 :
3608 638007 : for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
3609 : {
3610 465244 : const Twork fVal =
3611 465187 : static_cast<Twork>(GDALResampleConvolutionVertical(
3612 465187 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3613 : nSrcLineCount));
3614 465180 : pafDstScanline[iFilteredPixelOff] =
3615 465244 : replaceValIfNodata(fVal);
3616 : }
3617 : }
3618 : else
3619 : #endif
3620 : {
3621 2887210 : for (; iFilteredPixelOff + 1 < nDstXSize;
3622 : iFilteredPixelOff += 2, j += 2)
3623 : {
3624 2880000 : double dfVal1 = 0.0;
3625 2880000 : double dfVal2 = 0.0;
3626 2880000 : GDALResampleConvolutionVertical_2cols(
3627 2880000 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3628 : nSrcLineCount, dfVal1, dfVal2);
3629 5760010 : pafDstScanline[iFilteredPixelOff] =
3630 2880000 : replaceValIfNodata(static_cast<Twork>(dfVal1));
3631 2880000 : pafDstScanline[iFilteredPixelOff + 1] =
3632 2880000 : replaceValIfNodata(static_cast<Twork>(dfVal2));
3633 : }
3634 7206 : if (iFilteredPixelOff < nDstXSize)
3635 : {
3636 2 : const double dfVal = GDALResampleConvolutionVertical(
3637 2 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3638 : nSrcLineCount);
3639 2 : pafDstScanline[iFilteredPixelOff] =
3640 2 : replaceValIfNodata(static_cast<Twork>(dfVal));
3641 : }
3642 : }
3643 : }
3644 : else
3645 : {
3646 18980239 : for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
3647 : ++iFilteredPixelOff)
3648 : {
3649 18969033 : double dfVal = 0.0;
3650 18969033 : dfWeightSum = 0.0;
3651 18969033 : size_t j = (nSrcLineStart - nChunkYOff) *
3652 18969033 : static_cast<size_t>(nDstXSize) +
3653 18969033 : iFilteredPixelOff;
3654 18969033 : if (bKernelWithNegativeWeights)
3655 : {
3656 18722001 : int nConsecutiveValid = 0;
3657 18722001 : int nMaxConsecutiveValid = 0;
3658 132756321 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3659 : {
3660 114033020 : const double dfWeight =
3661 114033020 : padfWeights[i] *
3662 : pabyChunkNodataMaskHorizontalFiltered[j];
3663 114033020 : if (pabyChunkNodataMaskHorizontalFiltered[j])
3664 : {
3665 48323137 : nConsecutiveValid++;
3666 : }
3667 65709483 : else if (nConsecutiveValid)
3668 : {
3669 205685 : nMaxConsecutiveValid = std::max(
3670 204376 : nMaxConsecutiveValid, nConsecutiveValid);
3671 205685 : nConsecutiveValid = 0;
3672 : }
3673 114034020 : dfVal += padfHorizontalFiltered[j] * dfWeight;
3674 114034020 : dfWeightSum += dfWeight;
3675 : }
3676 18693601 : nMaxConsecutiveValid =
3677 18723301 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
3678 18693601 : if (nMaxConsecutiveValid < nSrcLineCount / 2)
3679 : {
3680 9246271 : pafDstScanline[iFilteredPixelOff] =
3681 9246179 : static_cast<Twork>(dfNoDataValue);
3682 9246271 : continue;
3683 : }
3684 : }
3685 : else
3686 : {
3687 1240572 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3688 : {
3689 993504 : const double dfWeight =
3690 993504 : padfWeights[i] *
3691 : pabyChunkNodataMaskHorizontalFiltered[j];
3692 993504 : dfVal += padfHorizontalFiltered[j] * dfWeight;
3693 993504 : dfWeightSum += dfWeight;
3694 : }
3695 : }
3696 9694362 : if (dfWeightSum > 0.0)
3697 : {
3698 9705461 : pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
3699 9704899 : static_cast<Twork>(dfVal / dfWeightSum));
3700 : }
3701 : else
3702 : {
3703 41 : pafDstScanline[iFilteredPixelOff] =
3704 17 : static_cast<Twork>(dfNoDataValue);
3705 : }
3706 : }
3707 : }
3708 :
3709 191178 : if (fMaxVal != 0.0f)
3710 : {
3711 192324 : for (int i = 0; i < nDstXSize; ++i)
3712 : {
3713 192088 : if (pafDstScanline[i] > fMaxVal)
3714 96022 : pafDstScanline[i] = fMaxVal;
3715 : }
3716 : }
3717 :
3718 191178 : if (pafWrkScanline)
3719 : {
3720 253382 : GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
3721 : static_cast<GByte *>(pDstBuffer) +
3722 253382 : static_cast<size_t>(iDstLine - nDstYOff) *
3723 253382 : nDstXSize * nDstDataTypeSize,
3724 : dstDataType, nDstDataTypeSize, nDstXSize);
3725 : }
3726 : }
3727 :
3728 4473 : VSIFree(pafWrkScanline);
3729 4473 : VSIFreeAligned(padfWeights);
3730 4473 : VSIFree(padfHorizontalFiltered);
3731 4473 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3732 :
3733 4473 : return CE_None;
3734 : }
3735 :
3736 : static CPLErr
3737 4473 : GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
3738 : const void *pChunk, void **ppDstBuffer,
3739 : GDALDataType *peDstBufferDataType)
3740 : {
3741 : GDALResampleAlg eResample;
3742 4473 : bool bKernelWithNegativeWeights = false;
3743 4473 : if (EQUAL(args.pszResampling, "BILINEAR"))
3744 2628 : eResample = GRA_Bilinear;
3745 1845 : else if (EQUAL(args.pszResampling, "CUBIC"))
3746 : {
3747 1761 : eResample = GRA_Cubic;
3748 1761 : bKernelWithNegativeWeights = true;
3749 : }
3750 84 : else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
3751 23 : eResample = GRA_CubicSpline;
3752 61 : else if (EQUAL(args.pszResampling, "LANCZOS"))
3753 : {
3754 54 : eResample = GRA_Lanczos;
3755 54 : bKernelWithNegativeWeights = true;
3756 : }
3757 : else
3758 : {
3759 7 : CPLAssert(false);
3760 : return CE_Failure;
3761 : }
3762 4466 : const int nKernelRadius = GWKGetFilterRadius(eResample);
3763 4469 : FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
3764 : const FilterFunc4ValuesType pfnFilterFunc4Values =
3765 4470 : GWKGetFilterFunc4Values(eResample);
3766 :
3767 4466 : float fMaxVal = 0.f;
3768 : // Cubic, etc... can have overshoots, so make sure we clamp values to the
3769 : // maximum value if NBITS is set.
3770 4466 : if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
3771 8 : (args.eOvrDataType == GDT_Byte || args.eOvrDataType == GDT_UInt16 ||
3772 0 : args.eOvrDataType == GDT_UInt32))
3773 : {
3774 8 : int nBits = args.nOvrNBITS;
3775 8 : if (nBits == GDALGetDataTypeSize(args.eOvrDataType))
3776 1 : nBits = 0;
3777 8 : if (nBits > 0 && nBits < 32)
3778 7 : fMaxVal = static_cast<float>((1U << nBits) - 1);
3779 : }
3780 :
3781 4466 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
3782 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
3783 : GDALGetDataTypeSizeBytes(args.eOvrDataType));
3784 4468 : if (*ppDstBuffer == nullptr)
3785 : {
3786 0 : return CE_Failure;
3787 : }
3788 4468 : *peDstBufferDataType = args.eOvrDataType;
3789 :
3790 4468 : switch (args.eWrkDataType)
3791 : {
3792 3732 : case GDT_Byte:
3793 : {
3794 3732 : return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32>(
3795 : args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
3796 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3797 3735 : bKernelWithNegativeWeights, fMaxVal);
3798 : }
3799 :
3800 394 : case GDT_UInt16:
3801 : {
3802 394 : return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32>(
3803 : args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
3804 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3805 396 : bKernelWithNegativeWeights, fMaxVal);
3806 : }
3807 :
3808 313 : case GDT_Float32:
3809 : {
3810 313 : return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32>(
3811 : args, static_cast<const float *>(pChunk), *ppDstBuffer,
3812 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3813 313 : bKernelWithNegativeWeights, fMaxVal);
3814 : }
3815 :
3816 29 : case GDT_Float64:
3817 : {
3818 29 : return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64>(
3819 : args, static_cast<const double *>(pChunk), *ppDstBuffer,
3820 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3821 29 : bKernelWithNegativeWeights, fMaxVal);
3822 : }
3823 :
3824 0 : default:
3825 0 : break;
3826 : }
3827 :
3828 0 : CPLAssert(false);
3829 : return CE_Failure;
3830 : }
3831 :
3832 : /************************************************************************/
3833 : /* GDALResampleChunkC32R() */
3834 : /************************************************************************/
3835 :
3836 2 : static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
3837 : const float *pafChunk, const int nChunkYOff,
3838 : const int nChunkYSize, const int nDstYOff,
3839 : const int nDstYOff2, const int nOvrXSize,
3840 : const int nOvrYSize, void **ppDstBuffer,
3841 : GDALDataType *peDstBufferDataType,
3842 : const char *pszResampling)
3843 :
3844 : {
3845 : enum Method
3846 : {
3847 : NEAR,
3848 : AVERAGE,
3849 : AVERAGE_MAGPHASE,
3850 : RMS,
3851 : };
3852 :
3853 2 : Method eMethod = NEAR;
3854 2 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
3855 : {
3856 0 : eMethod = NEAR;
3857 : }
3858 2 : else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
3859 : {
3860 0 : eMethod = AVERAGE_MAGPHASE;
3861 : }
3862 2 : else if (EQUAL(pszResampling, "RMS"))
3863 : {
3864 2 : eMethod = RMS;
3865 : }
3866 0 : else if (STARTS_WITH_CI(pszResampling, "AVER"))
3867 : {
3868 0 : eMethod = AVERAGE;
3869 : }
3870 : else
3871 : {
3872 0 : CPLError(
3873 : CE_Failure, CPLE_NotSupported,
3874 : "Resampling method %s is not supported for complex data types. "
3875 : "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
3876 : pszResampling);
3877 0 : return CE_Failure;
3878 : }
3879 :
3880 2 : const int nOXSize = nOvrXSize;
3881 2 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
3882 : GDALGetDataTypeSizeBytes(GDT_CFloat32));
3883 2 : if (*ppDstBuffer == nullptr)
3884 : {
3885 0 : return CE_Failure;
3886 : }
3887 2 : float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
3888 2 : *peDstBufferDataType = GDT_CFloat32;
3889 :
3890 2 : const int nOYSize = nOvrYSize;
3891 2 : const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
3892 2 : const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
3893 :
3894 : /* ==================================================================== */
3895 : /* Loop over destination scanlines. */
3896 : /* ==================================================================== */
3897 8 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3898 : {
3899 6 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
3900 6 : if (nSrcYOff < nChunkYOff)
3901 0 : nSrcYOff = nChunkYOff;
3902 :
3903 6 : int nSrcYOff2 =
3904 6 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
3905 6 : if (nSrcYOff2 == nSrcYOff)
3906 0 : nSrcYOff2++;
3907 :
3908 6 : if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
3909 : {
3910 2 : if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
3911 0 : nSrcYOff = nSrcHeight - 1;
3912 2 : nSrcYOff2 = nSrcHeight;
3913 : }
3914 6 : if (nSrcYOff2 > nChunkYOff + nChunkYSize)
3915 0 : nSrcYOff2 = nChunkYOff + nChunkYSize;
3916 :
3917 6 : const float *const pafSrcScanline =
3918 6 : pafChunk + ((nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
3919 6 : float *const pafDstScanline =
3920 6 : pafDstBuffer + (iDstLine - nDstYOff) * 2 * nOXSize;
3921 :
3922 : /* --------------------------------------------------------------------
3923 : */
3924 : /* Loop over destination pixels */
3925 : /* --------------------------------------------------------------------
3926 : */
3927 18 : for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
3928 : {
3929 12 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
3930 12 : int nSrcXOff2 =
3931 12 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
3932 12 : if (nSrcXOff2 == nSrcXOff)
3933 0 : nSrcXOff2++;
3934 12 : if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
3935 : {
3936 6 : if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
3937 0 : nSrcXOff = nSrcWidth - 1;
3938 6 : nSrcXOff2 = nSrcWidth;
3939 : }
3940 :
3941 12 : if (eMethod == NEAR)
3942 : {
3943 0 : pafDstScanline[iDstPixel * 2] = pafSrcScanline[nSrcXOff * 2];
3944 0 : pafDstScanline[iDstPixel * 2 + 1] =
3945 0 : pafSrcScanline[nSrcXOff * 2 + 1];
3946 : }
3947 12 : else if (eMethod == AVERAGE_MAGPHASE)
3948 : {
3949 0 : double dfTotalR = 0.0;
3950 0 : double dfTotalI = 0.0;
3951 0 : double dfTotalM = 0.0;
3952 0 : int nCount = 0;
3953 :
3954 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
3955 : {
3956 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
3957 : {
3958 0 : const double dfR =
3959 0 : pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
3960 0 : iY - nSrcYOff) *
3961 0 : nSrcWidth * 2];
3962 0 : const double dfI =
3963 0 : pafSrcScanline[iX * 2 +
3964 0 : static_cast<GPtrDiff_t>(iY -
3965 0 : nSrcYOff) *
3966 0 : nSrcWidth * 2 +
3967 0 : 1];
3968 0 : dfTotalR += dfR;
3969 0 : dfTotalI += dfI;
3970 0 : dfTotalM += std::hypot(dfR, dfI);
3971 0 : ++nCount;
3972 : }
3973 : }
3974 :
3975 0 : CPLAssert(nCount > 0);
3976 0 : if (nCount == 0)
3977 : {
3978 0 : pafDstScanline[iDstPixel * 2] = 0.0;
3979 0 : pafDstScanline[iDstPixel * 2 + 1] = 0.0;
3980 : }
3981 : else
3982 : {
3983 0 : pafDstScanline[iDstPixel * 2] =
3984 0 : static_cast<float>(dfTotalR / nCount);
3985 0 : pafDstScanline[iDstPixel * 2 + 1] =
3986 0 : static_cast<float>(dfTotalI / nCount);
3987 : const double dfM =
3988 0 : std::hypot(pafDstScanline[iDstPixel * 2],
3989 0 : pafDstScanline[iDstPixel * 2 + 1]);
3990 0 : const double dfDesiredM = dfTotalM / nCount;
3991 0 : double dfRatio = 1.0;
3992 0 : if (dfM != 0.0)
3993 0 : dfRatio = dfDesiredM / dfM;
3994 :
3995 0 : pafDstScanline[iDstPixel * 2] *=
3996 0 : static_cast<float>(dfRatio);
3997 0 : pafDstScanline[iDstPixel * 2 + 1] *=
3998 0 : static_cast<float>(dfRatio);
3999 : }
4000 : }
4001 12 : else if (eMethod == RMS)
4002 : {
4003 12 : double dfTotalR = 0.0;
4004 12 : double dfTotalI = 0.0;
4005 12 : int nCount = 0;
4006 :
4007 36 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4008 : {
4009 72 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4010 : {
4011 48 : const double dfR =
4012 48 : pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
4013 48 : iY - nSrcYOff) *
4014 48 : nSrcWidth * 2];
4015 48 : const double dfI =
4016 48 : pafSrcScanline[iX * 2 +
4017 48 : static_cast<GPtrDiff_t>(iY -
4018 48 : nSrcYOff) *
4019 48 : nSrcWidth * 2 +
4020 48 : 1];
4021 :
4022 48 : dfTotalR += SQUARE(dfR);
4023 48 : dfTotalI += SQUARE(dfI);
4024 :
4025 48 : ++nCount;
4026 : }
4027 : }
4028 :
4029 12 : CPLAssert(nCount > 0);
4030 12 : if (nCount == 0)
4031 : {
4032 0 : pafDstScanline[iDstPixel * 2] = 0.0;
4033 0 : pafDstScanline[iDstPixel * 2 + 1] = 0.0;
4034 : }
4035 : else
4036 : {
4037 : /* compute RMS */
4038 12 : pafDstScanline[iDstPixel * 2] =
4039 12 : static_cast<float>(sqrt(dfTotalR / nCount));
4040 12 : pafDstScanline[iDstPixel * 2 + 1] =
4041 12 : static_cast<float>(sqrt(dfTotalI / nCount));
4042 : }
4043 : }
4044 0 : else if (eMethod == AVERAGE)
4045 : {
4046 0 : double dfTotalR = 0.0;
4047 0 : double dfTotalI = 0.0;
4048 0 : int nCount = 0;
4049 :
4050 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4051 : {
4052 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4053 : {
4054 : // TODO(schwehr): Maybe use std::complex?
4055 0 : dfTotalR +=
4056 0 : pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
4057 0 : iY - nSrcYOff) *
4058 0 : nSrcWidth * 2];
4059 0 : dfTotalI += pafSrcScanline[iX * 2 +
4060 0 : static_cast<GPtrDiff_t>(
4061 0 : iY - nSrcYOff) *
4062 0 : nSrcWidth * 2 +
4063 0 : 1];
4064 0 : ++nCount;
4065 : }
4066 : }
4067 :
4068 0 : CPLAssert(nCount > 0);
4069 0 : if (nCount == 0)
4070 : {
4071 0 : pafDstScanline[iDstPixel * 2] = 0.0;
4072 0 : pafDstScanline[iDstPixel * 2 + 1] = 0.0;
4073 : }
4074 : else
4075 : {
4076 0 : pafDstScanline[iDstPixel * 2] =
4077 0 : static_cast<float>(dfTotalR / nCount);
4078 0 : pafDstScanline[iDstPixel * 2 + 1] =
4079 0 : static_cast<float>(dfTotalI / nCount);
4080 : }
4081 : }
4082 : }
4083 : }
4084 :
4085 2 : return CE_None;
4086 : }
4087 :
4088 : /************************************************************************/
4089 : /* GDALRegenerateCascadingOverviews() */
4090 : /* */
4091 : /* Generate a list of overviews in order from largest to */
4092 : /* smallest, computing each from the next larger. */
4093 : /************************************************************************/
4094 :
4095 44 : static CPLErr GDALRegenerateCascadingOverviews(
4096 : GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
4097 : const char *pszResampling, GDALProgressFunc pfnProgress,
4098 : void *pProgressData, CSLConstList papszOptions)
4099 :
4100 : {
4101 : /* -------------------------------------------------------------------- */
4102 : /* First, we must put the overviews in order from largest to */
4103 : /* smallest. */
4104 : /* -------------------------------------------------------------------- */
4105 127 : for (int i = 0; i < nOverviews - 1; ++i)
4106 : {
4107 292 : for (int j = 0; j < nOverviews - i - 1; ++j)
4108 : {
4109 209 : if (papoOvrBands[j]->GetXSize() *
4110 209 : static_cast<float>(papoOvrBands[j]->GetYSize()) <
4111 209 : papoOvrBands[j + 1]->GetXSize() *
4112 209 : static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
4113 : {
4114 0 : GDALRasterBand *poTempBand = papoOvrBands[j];
4115 0 : papoOvrBands[j] = papoOvrBands[j + 1];
4116 0 : papoOvrBands[j + 1] = poTempBand;
4117 : }
4118 : }
4119 : }
4120 :
4121 : /* -------------------------------------------------------------------- */
4122 : /* Count total pixels so we can prepare appropriate scaled */
4123 : /* progress functions. */
4124 : /* -------------------------------------------------------------------- */
4125 44 : double dfTotalPixels = 0.0;
4126 :
4127 171 : for (int i = 0; i < nOverviews; ++i)
4128 : {
4129 127 : dfTotalPixels += papoOvrBands[i]->GetXSize() *
4130 127 : static_cast<double>(papoOvrBands[i]->GetYSize());
4131 : }
4132 :
4133 : /* -------------------------------------------------------------------- */
4134 : /* Generate all the bands. */
4135 : /* -------------------------------------------------------------------- */
4136 44 : double dfPixelsProcessed = 0.0;
4137 :
4138 171 : for (int i = 0; i < nOverviews; ++i)
4139 : {
4140 127 : GDALRasterBand *poBaseBand = poSrcBand;
4141 127 : if (i != 0)
4142 83 : poBaseBand = papoOvrBands[i - 1];
4143 :
4144 127 : double dfPixels = papoOvrBands[i]->GetXSize() *
4145 127 : static_cast<double>(papoOvrBands[i]->GetYSize());
4146 :
4147 254 : void *pScaledProgressData = GDALCreateScaledProgress(
4148 : dfPixelsProcessed / dfTotalPixels,
4149 127 : (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
4150 : pProgressData);
4151 :
4152 254 : const CPLErr eErr = GDALRegenerateOverviewsEx(
4153 : poBaseBand, 1,
4154 127 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
4155 : pszResampling, GDALScaledProgress, pScaledProgressData,
4156 : papszOptions);
4157 127 : GDALDestroyScaledProgress(pScaledProgressData);
4158 :
4159 127 : if (eErr != CE_None)
4160 0 : return eErr;
4161 :
4162 127 : dfPixelsProcessed += dfPixels;
4163 :
4164 : // Only do the bit2grayscale promotion on the base band.
4165 127 : if (STARTS_WITH_CI(pszResampling,
4166 : "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
4167 8 : pszResampling = "AVERAGE";
4168 : }
4169 :
4170 44 : return CE_None;
4171 : }
4172 :
4173 : /************************************************************************/
4174 : /* GDALGetResampleFunction() */
4175 : /************************************************************************/
4176 :
4177 4945 : GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
4178 : int *pnRadius)
4179 : {
4180 4945 : if (pnRadius)
4181 4944 : *pnRadius = 0;
4182 4945 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
4183 501 : return GDALResampleChunk_Near;
4184 4444 : else if (STARTS_WITH_CI(pszResampling, "AVER") ||
4185 3890 : EQUAL(pszResampling, "RMS"))
4186 574 : return GDALResampleChunk_AverageOrRMS;
4187 3870 : else if (EQUAL(pszResampling, "GAUSS"))
4188 : {
4189 26 : if (pnRadius)
4190 26 : *pnRadius = 1;
4191 26 : return GDALResampleChunk_Gauss;
4192 : }
4193 3844 : else if (EQUAL(pszResampling, "MODE"))
4194 96 : return GDALResampleChunk_Mode;
4195 3748 : else if (EQUAL(pszResampling, "CUBIC"))
4196 : {
4197 1341 : if (pnRadius)
4198 1338 : *pnRadius = GWKGetFilterRadius(GRA_Cubic);
4199 1337 : return GDALResampleChunk_Convolution;
4200 : }
4201 2407 : else if (EQUAL(pszResampling, "CUBICSPLINE"))
4202 : {
4203 3 : if (pnRadius)
4204 3 : *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
4205 3 : return GDALResampleChunk_Convolution;
4206 : }
4207 2404 : else if (EQUAL(pszResampling, "LANCZOS"))
4208 : {
4209 8 : if (pnRadius)
4210 8 : *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
4211 8 : return GDALResampleChunk_Convolution;
4212 : }
4213 2396 : else if (EQUAL(pszResampling, "BILINEAR"))
4214 : {
4215 2398 : if (pnRadius)
4216 2398 : *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
4217 2398 : return GDALResampleChunk_Convolution;
4218 : }
4219 : else
4220 : {
4221 0 : CPLError(
4222 : CE_Failure, CPLE_AppDefined,
4223 : "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
4224 : pszResampling);
4225 0 : return nullptr;
4226 : }
4227 : }
4228 :
4229 : /************************************************************************/
4230 : /* GDALGetOvrWorkDataType() */
4231 : /************************************************************************/
4232 :
4233 4819 : GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
4234 : GDALDataType eSrcDataType)
4235 : {
4236 4819 : if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
4237 : {
4238 595 : return eSrcDataType;
4239 : }
4240 4224 : else if (eSrcDataType == GDT_Byte &&
4241 3890 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4242 3422 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4243 2274 : EQUAL(pszResampling, "CUBICSPLINE") ||
4244 2271 : EQUAL(pszResampling, "LANCZOS") ||
4245 2266 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4246 : {
4247 3887 : return GDT_Byte;
4248 : }
4249 337 : else if (eSrcDataType == GDT_UInt16 &&
4250 120 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4251 113 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4252 3 : EQUAL(pszResampling, "CUBICSPLINE") ||
4253 3 : EQUAL(pszResampling, "LANCZOS") ||
4254 2 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4255 : {
4256 111 : return GDT_UInt16;
4257 : }
4258 226 : else if (EQUAL(pszResampling, "GAUSS"))
4259 20 : return GDT_Float64;
4260 :
4261 206 : if (eSrcDataType == GDT_Byte || eSrcDataType == GDT_Int8 ||
4262 204 : eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
4263 : eSrcDataType == GDT_Float32)
4264 : {
4265 164 : return GDT_Float32;
4266 : }
4267 42 : return GDT_Float64;
4268 : }
4269 :
4270 : namespace
4271 : {
4272 : // Structure to hold a pointer to free with CPLFree()
4273 : struct PointerHolder
4274 : {
4275 : void *ptr = nullptr;
4276 :
4277 5792 : explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
4278 : {
4279 5792 : }
4280 :
4281 5792 : ~PointerHolder()
4282 5792 : {
4283 5792 : CPLFree(ptr);
4284 5792 : }
4285 :
4286 : PointerHolder(const PointerHolder &) = delete;
4287 : PointerHolder &operator=(const PointerHolder &) = delete;
4288 : };
4289 : } // namespace
4290 :
4291 : /************************************************************************/
4292 : /* GDALRegenerateOverviews() */
4293 : /************************************************************************/
4294 :
4295 : /**
4296 : * \brief Generate downsampled overviews.
4297 : *
4298 : * This function will generate one or more overview images from a base image
4299 : * using the requested downsampling algorithm. Its primary use is for
4300 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4301 : * used to generate downsampled images in one file from another outside the
4302 : * overview architecture.
4303 : *
4304 : * The output bands need to exist in advance.
4305 : *
4306 : * The full set of resampling algorithms is documented in
4307 : * GDALDataset::BuildOverviews().
4308 : *
4309 : * This function will honour properly NODATA_VALUES tuples (special dataset
4310 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4311 : * considered as the nodata value and not each value of the triplet
4312 : * independently per band.
4313 : *
4314 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4315 : * to "ALL_CPUS" or a integer value to specify the number of threads to use for
4316 : * overview computation.
4317 : *
4318 : * @param hSrcBand the source (base level) band.
4319 : * @param nOverviewCount the number of downsampled bands being generated.
4320 : * @param pahOvrBands the list of downsampled bands to be generated.
4321 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4322 : * @param pfnProgress progress report function.
4323 : * @param pProgressData progress function callback data.
4324 : * @return CE_None on success or CE_Failure on failure.
4325 : */
4326 250 : CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
4327 : GDALRasterBandH *pahOvrBands,
4328 : const char *pszResampling,
4329 : GDALProgressFunc pfnProgress,
4330 : void *pProgressData)
4331 :
4332 : {
4333 250 : return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
4334 : pszResampling, pfnProgress, pProgressData,
4335 250 : nullptr);
4336 : }
4337 :
4338 : /************************************************************************/
4339 : /* GDALRegenerateOverviewsEx() */
4340 : /************************************************************************/
4341 :
4342 : constexpr int RADIUS_TO_DIAMETER = 2;
4343 :
4344 : /**
4345 : * \brief Generate downsampled overviews.
4346 : *
4347 : * This function will generate one or more overview images from a base image
4348 : * using the requested downsampling algorithm. Its primary use is for
4349 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4350 : * used to generate downsampled images in one file from another outside the
4351 : * overview architecture.
4352 : *
4353 : * The output bands need to exist in advance.
4354 : *
4355 : * The full set of resampling algorithms is documented in
4356 : * GDALDataset::BuildOverviews().
4357 : *
4358 : * This function will honour properly NODATA_VALUES tuples (special dataset
4359 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4360 : * considered as the nodata value and not each value of the triplet
4361 : * independently per band.
4362 : *
4363 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4364 : * to "ALL_CPUS" or a integer value to specify the number of threads to use for
4365 : * overview computation.
4366 : *
4367 : * @param hSrcBand the source (base level) band.
4368 : * @param nOverviewCount the number of downsampled bands being generated.
4369 : * @param pahOvrBands the list of downsampled bands to be generated.
4370 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4371 : * @param pfnProgress progress report function.
4372 : * @param pProgressData progress function callback data.
4373 : * @param papszOptions NULL terminated list of options as key=value pairs, or
4374 : * NULL
4375 : * @return CE_None on success or CE_Failure on failure.
4376 : * @since GDAL 3.6
4377 : */
4378 902 : CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
4379 : GDALRasterBandH *pahOvrBands,
4380 : const char *pszResampling,
4381 : GDALProgressFunc pfnProgress,
4382 : void *pProgressData, CSLConstList papszOptions)
4383 :
4384 : {
4385 902 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
4386 902 : GDALRasterBand **papoOvrBands =
4387 : reinterpret_cast<GDALRasterBand **>(pahOvrBands);
4388 :
4389 902 : if (pfnProgress == nullptr)
4390 252 : pfnProgress = GDALDummyProgress;
4391 :
4392 902 : if (EQUAL(pszResampling, "NONE"))
4393 64 : return CE_None;
4394 :
4395 838 : int nKernelRadius = 0;
4396 : GDALResampleFunction pfnResampleFn =
4397 838 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
4398 :
4399 838 : if (pfnResampleFn == nullptr)
4400 0 : return CE_Failure;
4401 :
4402 : /* -------------------------------------------------------------------- */
4403 : /* Check color tables... */
4404 : /* -------------------------------------------------------------------- */
4405 838 : GDALColorTable *poColorTable = nullptr;
4406 :
4407 471 : if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
4408 1750 : EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
4409 452 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4410 : {
4411 9 : poColorTable = poSrcBand->GetColorTable();
4412 9 : if (poColorTable != nullptr)
4413 : {
4414 9 : if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
4415 : {
4416 0 : CPLError(CE_Warning, CPLE_AppDefined,
4417 : "Computing overviews on palette index raster bands "
4418 : "with a palette whose color interpretation is not RGB "
4419 : "will probably lead to unexpected results.");
4420 0 : poColorTable = nullptr;
4421 : }
4422 9 : else if (poColorTable->IsIdentity())
4423 : {
4424 0 : poColorTable = nullptr;
4425 : }
4426 : }
4427 : else
4428 : {
4429 0 : CPLError(CE_Warning, CPLE_AppDefined,
4430 : "Computing overviews on palette index raster bands "
4431 : "without a palette will probably lead to unexpected "
4432 : "results.");
4433 : }
4434 : }
4435 : // Not ready yet
4436 2433 : else if ((EQUAL(pszResampling, "CUBIC") ||
4437 775 : EQUAL(pszResampling, "CUBICSPLINE") ||
4438 775 : EQUAL(pszResampling, "LANCZOS") ||
4439 1684 : EQUAL(pszResampling, "BILINEAR")) &&
4440 80 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4441 : {
4442 0 : CPLError(CE_Warning, CPLE_AppDefined,
4443 : "Computing %s overviews on palette index raster bands "
4444 : "will probably lead to unexpected results.",
4445 : pszResampling);
4446 : }
4447 :
4448 : // If we have a nodata mask and we are doing something more complicated
4449 : // than nearest neighbouring, we have to fetch to nodata mask.
4450 :
4451 838 : GDALRasterBand *poMaskBand = nullptr;
4452 838 : bool bUseNoDataMask = false;
4453 838 : bool bCanUseCascaded = true;
4454 :
4455 838 : if (!STARTS_WITH_CI(pszResampling, "NEAR"))
4456 : {
4457 : // Special case if we are an alpha/mask band. We want it to be
4458 : // considered as the mask band to avoid alpha=0 to be taken into account
4459 : // in average computation.
4460 532 : if (poSrcBand->IsMaskBand())
4461 : {
4462 91 : poMaskBand = poSrcBand;
4463 91 : bUseNoDataMask = true;
4464 : }
4465 : else
4466 : {
4467 441 : poMaskBand = poSrcBand->GetMaskBand();
4468 441 : const int nMaskFlags = poSrcBand->GetMaskFlags();
4469 441 : bCanUseCascaded =
4470 441 : (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
4471 441 : bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
4472 : }
4473 : }
4474 :
4475 : /* -------------------------------------------------------------------- */
4476 : /* If we are operating on multiple overviews, and using */
4477 : /* averaging, lets do them in cascading order to reduce the */
4478 : /* amount of computation. */
4479 : /* -------------------------------------------------------------------- */
4480 :
4481 : // In case the mask made be computed from another band of the dataset,
4482 : // we can't use cascaded generation, as the computation of the overviews
4483 : // of the band used for the mask band may not have yet occurred (#3033).
4484 838 : if ((STARTS_WITH_CI(pszResampling, "AVER") ||
4485 471 : EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
4486 440 : EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
4487 386 : EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
4488 838 : EQUAL(pszResampling, "MODE")) &&
4489 44 : nOverviewCount > 1 && bCanUseCascaded)
4490 44 : return GDALRegenerateCascadingOverviews(
4491 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
4492 44 : pProgressData, papszOptions);
4493 :
4494 : /* -------------------------------------------------------------------- */
4495 : /* Setup one horizontal swath to read from the raw buffer. */
4496 : /* -------------------------------------------------------------------- */
4497 794 : int nFRXBlockSize = 0;
4498 794 : int nFRYBlockSize = 0;
4499 794 : poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
4500 :
4501 794 : const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
4502 1282 : const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
4503 1236 : EQUAL(pszResampling, "MODE") ||
4504 442 : !GDALDataTypeIsComplex(eSrcDataType);
4505 : const GDALDataType eWrkDataType =
4506 : bUseGenericResampleFn
4507 794 : ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
4508 794 : : GDT_CFloat32;
4509 :
4510 794 : const int nWidth = poSrcBand->GetXSize();
4511 794 : const int nHeight = poSrcBand->GetYSize();
4512 :
4513 794 : int nMaxOvrFactor = 1;
4514 1705 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
4515 : {
4516 911 : const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
4517 911 : const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
4518 911 : nMaxOvrFactor = std::max(
4519 : nMaxOvrFactor,
4520 911 : static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
4521 911 : nMaxOvrFactor = std::max(
4522 : nMaxOvrFactor,
4523 911 : static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
4524 : }
4525 :
4526 794 : int nFullResYChunk = nFRYBlockSize;
4527 794 : int nMaxChunkYSizeQueried = 0;
4528 :
4529 : const auto UpdateChunkHeightAndGetChunkSize =
4530 10356 : [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
4531 83809 : eWrkDataType, nWidth]()
4532 : {
4533 : // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
4534 : // + nFullResYChunk) / nMaxOvrFactor)
4535 10356 : if (nMaxOvrFactor > INT_MAX / RADIUS_TO_DIAMETER)
4536 : {
4537 1 : return GINTBIG_MAX;
4538 : }
4539 10355 : nFullResYChunk =
4540 10355 : std::max(nFullResYChunk, RADIUS_TO_DIAMETER * nMaxOvrFactor);
4541 10355 : if ((nKernelRadius > 0 &&
4542 970 : nMaxOvrFactor > INT_MAX / (RADIUS_TO_DIAMETER * nKernelRadius)) ||
4543 10355 : nFullResYChunk >
4544 10355 : INT_MAX - RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor)
4545 : {
4546 0 : return GINTBIG_MAX;
4547 : }
4548 10355 : nMaxChunkYSizeQueried =
4549 10355 : nFullResYChunk + RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor;
4550 10355 : if (GDALGetDataTypeSizeBytes(eWrkDataType) >
4551 10355 : std::numeric_limits<int64_t>::max() /
4552 10355 : (static_cast<int64_t>(nMaxChunkYSizeQueried) * nWidth))
4553 : {
4554 1 : return GINTBIG_MAX;
4555 : }
4556 10354 : return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
4557 10354 : nMaxChunkYSizeQueried * nWidth;
4558 794 : };
4559 :
4560 : // Only configurable for debug / testing
4561 : const char *pszChunkYSize =
4562 794 : CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
4563 794 : if (pszChunkYSize)
4564 : {
4565 : // coverity[tainted_data]
4566 0 : nFullResYChunk = atoi(pszChunkYSize);
4567 : }
4568 :
4569 : // Only configurable for debug / testing
4570 : const int nChunkMaxSize =
4571 794 : atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
4572 :
4573 794 : auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
4574 794 : if (nChunkSize > nChunkMaxSize)
4575 : {
4576 15 : if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
4577 44 : !GDALDataTypeIsComplex(eSrcDataType) &&
4578 14 : (!STARTS_WITH_CI(pszResampling, "AVER") ||
4579 2 : EQUAL(pszResampling, "AVERAGE")))
4580 : {
4581 : // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
4582 : // which use a block based strategy, which is much less memory
4583 : // hungry.
4584 14 : return GDALRegenerateOverviewsMultiBand(
4585 : 1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
4586 14 : pfnProgress, pProgressData, papszOptions);
4587 : }
4588 1 : else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
4589 : {
4590 0 : return GDALRegenerateCascadingOverviews(
4591 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
4592 0 : pfnProgress, pProgressData, papszOptions);
4593 : }
4594 : }
4595 779 : else if (pszChunkYSize == nullptr)
4596 : {
4597 : // Try to get as close as possible to nChunkMaxSize
4598 10341 : while (nChunkSize < nChunkMaxSize / 2)
4599 : {
4600 9562 : nFullResYChunk *= 2;
4601 9562 : nChunkSize = UpdateChunkHeightAndGetChunkSize();
4602 : }
4603 : }
4604 :
4605 780 : int nHasNoData = 0;
4606 780 : const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
4607 780 : const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
4608 : const bool bPropagateNoData =
4609 780 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
4610 :
4611 : // Structure describing a resampling job
4612 : struct OvrJob
4613 : {
4614 : // Buffers to free when job is finished
4615 : std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
4616 : std::shared_ptr<PointerHolder> oSrcBufferHolder{};
4617 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
4618 :
4619 : GDALRasterBand *poDstBand = nullptr;
4620 :
4621 : // Input parameters of pfnResampleFn
4622 : GDALResampleFunction pfnResampleFn = nullptr;
4623 : int nSrcWidth = 0;
4624 : int nSrcHeight = 0;
4625 : int nDstWidth = 0;
4626 : GDALOverviewResampleArgs args{};
4627 : const void *pChunk = nullptr;
4628 : bool bUseGenericResampleFn = false;
4629 :
4630 : // Output values of resampling function
4631 : CPLErr eErr = CE_Failure;
4632 : void *pDstBuffer = nullptr;
4633 : GDALDataType eDstBufferDataType = GDT_Unknown;
4634 :
4635 : // Synchronization
4636 : bool bFinished = false;
4637 : std::mutex mutex{};
4638 : std::condition_variable cv{};
4639 :
4640 0 : void SetSrcMaskBufferHolder(
4641 : const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
4642 : {
4643 0 : oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
4644 0 : }
4645 :
4646 0 : void SetSrcBufferHolder(
4647 : const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
4648 : {
4649 0 : oSrcBufferHolder = oSrcBufferHolderIn;
4650 0 : }
4651 : };
4652 :
4653 : // Thread function to resample
4654 880 : const auto JobResampleFunc = [](void *pData)
4655 : {
4656 880 : OvrJob *poJob = static_cast<OvrJob *>(pData);
4657 :
4658 880 : if (poJob->bUseGenericResampleFn)
4659 : {
4660 878 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
4661 : &(poJob->pDstBuffer),
4662 : &(poJob->eDstBufferDataType));
4663 : }
4664 : else
4665 : {
4666 2 : poJob->eErr = GDALResampleChunkC32R(
4667 : poJob->nSrcWidth, poJob->nSrcHeight,
4668 2 : static_cast<const float *>(poJob->pChunk),
4669 : poJob->args.nChunkYOff, poJob->args.nChunkYSize,
4670 : poJob->args.nDstYOff, poJob->args.nDstYOff2,
4671 : poJob->args.nOvrXSize, poJob->args.nOvrYSize,
4672 : &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
4673 : poJob->args.pszResampling);
4674 : }
4675 :
4676 : poJob->oDstBufferHolder =
4677 880 : std::make_unique<PointerHolder>(poJob->pDstBuffer);
4678 :
4679 : {
4680 1760 : std::lock_guard<std::mutex> guard(poJob->mutex);
4681 880 : poJob->bFinished = true;
4682 880 : poJob->cv.notify_one();
4683 : }
4684 880 : };
4685 :
4686 : // Function to write resample data to target band
4687 880 : const auto WriteJobData = [](const OvrJob *poJob)
4688 : {
4689 1760 : return poJob->poDstBand->RasterIO(
4690 880 : GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
4691 880 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
4692 880 : poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
4693 880 : poJob->eDstBufferDataType, 0, 0, nullptr);
4694 : };
4695 :
4696 : // Wait for completion of oldest job and serialize it
4697 : const auto WaitAndFinalizeOldestJob =
4698 0 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
4699 : {
4700 0 : auto poOldestJob = jobList.front().get();
4701 : {
4702 0 : std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
4703 : // coverity[missing_lock:FALSE]
4704 0 : while (!poOldestJob->bFinished)
4705 : {
4706 0 : poOldestJob->cv.wait(oGuard);
4707 : }
4708 : }
4709 0 : CPLErr l_eErr = poOldestJob->eErr;
4710 0 : if (l_eErr == CE_None)
4711 : {
4712 0 : l_eErr = WriteJobData(poOldestJob);
4713 : }
4714 :
4715 0 : jobList.pop_front();
4716 0 : return l_eErr;
4717 : };
4718 :
4719 : // Queue of jobs
4720 1560 : std::list<std::unique_ptr<OvrJob>> jobList;
4721 :
4722 780 : GByte *pabyChunkNodataMask = nullptr;
4723 780 : void *pChunk = nullptr;
4724 :
4725 780 : const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
4726 3120 : const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
4727 780 : ? CPLGetNumCPUs()
4728 780 : : atoi(pszThreads)));
4729 : auto poThreadPool =
4730 780 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
4731 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
4732 1560 : : std::unique_ptr<CPLJobQueue>(nullptr);
4733 :
4734 : /* -------------------------------------------------------------------- */
4735 : /* Loop over image operating on chunks. */
4736 : /* -------------------------------------------------------------------- */
4737 780 : int nChunkYOff = 0;
4738 780 : CPLErr eErr = CE_None;
4739 :
4740 1565 : for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
4741 785 : nChunkYOff += nFullResYChunk)
4742 : {
4743 785 : if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
4744 : pProgressData))
4745 : {
4746 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
4747 0 : eErr = CE_Failure;
4748 : }
4749 :
4750 785 : if (nFullResYChunk + nChunkYOff > nHeight)
4751 778 : nFullResYChunk = nHeight - nChunkYOff;
4752 :
4753 785 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
4754 785 : int nChunkYSizeQueried =
4755 785 : nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
4756 785 : if (nChunkYOffQueried < 0)
4757 : {
4758 83 : nChunkYSizeQueried += nChunkYOffQueried;
4759 83 : nChunkYOffQueried = 0;
4760 : }
4761 785 : if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
4762 83 : nChunkYSizeQueried = nHeight - nChunkYOffQueried;
4763 :
4764 : // Avoid accumulating too many tasks and exhaust RAM
4765 : // Try to complete already finished jobs
4766 785 : while (eErr == CE_None && !jobList.empty())
4767 : {
4768 0 : auto poOldestJob = jobList.front().get();
4769 : {
4770 0 : std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
4771 0 : if (!poOldestJob->bFinished)
4772 : {
4773 0 : break;
4774 : }
4775 : }
4776 0 : eErr = poOldestJob->eErr;
4777 0 : if (eErr == CE_None)
4778 : {
4779 0 : eErr = WriteJobData(poOldestJob);
4780 : }
4781 :
4782 0 : jobList.pop_front();
4783 : }
4784 :
4785 : // And in case we have saturated the number of threads,
4786 : // wait for completion of tasks to go below the threshold.
4787 1570 : while (eErr == CE_None &&
4788 785 : jobList.size() >= static_cast<size_t>(nThreads))
4789 : {
4790 0 : eErr = WaitAndFinalizeOldestJob(jobList);
4791 : }
4792 :
4793 : // (Re)allocate buffers if needed
4794 785 : if (pChunk == nullptr)
4795 : {
4796 780 : pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
4797 : nMaxChunkYSizeQueried, nWidth);
4798 : }
4799 785 : if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
4800 : {
4801 : pabyChunkNodataMask = static_cast<GByte *>(
4802 283 : VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
4803 : }
4804 :
4805 785 : if (pChunk == nullptr ||
4806 283 : (bUseNoDataMask && pabyChunkNodataMask == nullptr))
4807 : {
4808 0 : CPLFree(pChunk);
4809 0 : CPLFree(pabyChunkNodataMask);
4810 0 : return CE_Failure;
4811 : }
4812 :
4813 : // Read chunk.
4814 785 : if (eErr == CE_None)
4815 785 : eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
4816 : nChunkYSizeQueried, pChunk, nWidth,
4817 : nChunkYSizeQueried, eWrkDataType, 0, 0,
4818 : nullptr);
4819 785 : if (eErr == CE_None && bUseNoDataMask)
4820 283 : eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
4821 : nChunkYSizeQueried, pabyChunkNodataMask,
4822 : nWidth, nChunkYSizeQueried, GDT_Byte, 0,
4823 : 0, nullptr);
4824 :
4825 : // Special case to promote 1bit data to 8bit 0/255 values.
4826 785 : if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
4827 : {
4828 9 : if (eWrkDataType == GDT_Float32)
4829 : {
4830 0 : float *pafChunk = static_cast<float *>(pChunk);
4831 0 : for (GPtrDiff_t i = 0;
4832 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4833 : i++)
4834 : {
4835 0 : if (pafChunk[i] == 1.0)
4836 0 : pafChunk[i] = 255.0;
4837 : }
4838 : }
4839 9 : else if (eWrkDataType == GDT_Byte)
4840 : {
4841 9 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
4842 168417 : for (GPtrDiff_t i = 0;
4843 168417 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4844 : i++)
4845 : {
4846 168408 : if (pabyChunk[i] == 1)
4847 127437 : pabyChunk[i] = 255;
4848 : }
4849 : }
4850 0 : else if (eWrkDataType == GDT_UInt16)
4851 : {
4852 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
4853 0 : for (GPtrDiff_t i = 0;
4854 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4855 : i++)
4856 : {
4857 0 : if (pasChunk[i] == 1)
4858 0 : pasChunk[i] = 255;
4859 : }
4860 : }
4861 0 : else if (eWrkDataType == GDT_Float64)
4862 : {
4863 0 : double *padfChunk = static_cast<double *>(pChunk);
4864 0 : for (GPtrDiff_t i = 0;
4865 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4866 : i++)
4867 : {
4868 0 : if (padfChunk[i] == 1.0)
4869 0 : padfChunk[i] = 255.0;
4870 : }
4871 : }
4872 : else
4873 : {
4874 0 : CPLAssert(false);
4875 : }
4876 : }
4877 776 : else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
4878 : {
4879 0 : if (eWrkDataType == GDT_Float32)
4880 : {
4881 0 : float *pafChunk = static_cast<float *>(pChunk);
4882 0 : for (GPtrDiff_t i = 0;
4883 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4884 : i++)
4885 : {
4886 0 : if (pafChunk[i] == 1.0)
4887 0 : pafChunk[i] = 0.0;
4888 0 : else if (pafChunk[i] == 0.0)
4889 0 : pafChunk[i] = 255.0;
4890 : }
4891 : }
4892 0 : else if (eWrkDataType == GDT_Byte)
4893 : {
4894 0 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
4895 0 : for (GPtrDiff_t i = 0;
4896 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4897 : i++)
4898 : {
4899 0 : if (pabyChunk[i] == 1)
4900 0 : pabyChunk[i] = 0;
4901 0 : else if (pabyChunk[i] == 0)
4902 0 : pabyChunk[i] = 255;
4903 : }
4904 : }
4905 0 : else if (eWrkDataType == GDT_UInt16)
4906 : {
4907 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
4908 0 : for (GPtrDiff_t i = 0;
4909 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4910 : i++)
4911 : {
4912 0 : if (pasChunk[i] == 1)
4913 0 : pasChunk[i] = 0;
4914 0 : else if (pasChunk[i] == 0)
4915 0 : pasChunk[i] = 255;
4916 : }
4917 : }
4918 0 : else if (eWrkDataType == GDT_Float64)
4919 : {
4920 0 : double *padfChunk = static_cast<double *>(pChunk);
4921 0 : for (GPtrDiff_t i = 0;
4922 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4923 : i++)
4924 : {
4925 0 : if (padfChunk[i] == 1.0)
4926 0 : padfChunk[i] = 0.0;
4927 0 : else if (padfChunk[i] == 0.0)
4928 0 : padfChunk[i] = 255.0;
4929 : }
4930 : }
4931 : else
4932 : {
4933 0 : CPLAssert(false);
4934 : }
4935 : }
4936 :
4937 : auto oSrcBufferHolder =
4938 1570 : std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
4939 : auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
4940 1570 : poJobQueue ? pabyChunkNodataMask : nullptr);
4941 :
4942 1665 : for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
4943 : ++iOverview)
4944 : {
4945 880 : GDALRasterBand *poDstBand = papoOvrBands[iOverview];
4946 880 : const int nDstWidth = poDstBand->GetXSize();
4947 880 : const int nDstHeight = poDstBand->GetYSize();
4948 :
4949 880 : const double dfXRatioDstToSrc =
4950 880 : static_cast<double>(nWidth) / nDstWidth;
4951 880 : const double dfYRatioDstToSrc =
4952 880 : static_cast<double>(nHeight) / nDstHeight;
4953 :
4954 : /* --------------------------------------------------------------------
4955 : */
4956 : /* Figure out the line to start writing to, and the first line
4957 : */
4958 : /* to not write to. In theory this approach should ensure that
4959 : */
4960 : /* every output line will be written if all input chunks are */
4961 : /* processed. */
4962 : /* --------------------------------------------------------------------
4963 : */
4964 880 : int nDstYOff =
4965 880 : static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
4966 880 : if (nDstYOff == nDstHeight)
4967 0 : continue;
4968 880 : int nDstYOff2 = static_cast<int>(
4969 880 : 0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
4970 :
4971 880 : if (nChunkYOff + nFullResYChunk == nHeight)
4972 873 : nDstYOff2 = nDstHeight;
4973 : #if DEBUG_VERBOSE
4974 : CPLDebug("GDAL",
4975 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
4976 : nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
4977 : nDstWidth, nDstYOff2 - nDstYOff);
4978 : #endif
4979 :
4980 1760 : auto poJob = std::make_unique<OvrJob>();
4981 880 : poJob->pfnResampleFn = pfnResampleFn;
4982 880 : poJob->bUseGenericResampleFn = bUseGenericResampleFn;
4983 880 : poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
4984 880 : poJob->args.nOvrXSize = poDstBand->GetXSize();
4985 880 : poJob->args.nOvrYSize = poDstBand->GetYSize();
4986 : const char *pszNBITS =
4987 880 : poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
4988 880 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
4989 880 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
4990 880 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
4991 880 : poJob->args.eWrkDataType = eWrkDataType;
4992 880 : poJob->pChunk = pChunk;
4993 880 : poJob->args.pabyChunkNodataMask = pabyChunkNodataMask;
4994 880 : poJob->nSrcWidth = nWidth;
4995 880 : poJob->nSrcHeight = nHeight;
4996 880 : poJob->args.nChunkXOff = 0;
4997 880 : poJob->args.nChunkXSize = nWidth;
4998 880 : poJob->args.nChunkYOff = nChunkYOffQueried;
4999 880 : poJob->args.nChunkYSize = nChunkYSizeQueried;
5000 880 : poJob->nDstWidth = nDstWidth;
5001 880 : poJob->args.nDstXOff = 0;
5002 880 : poJob->args.nDstXOff2 = nDstWidth;
5003 880 : poJob->args.nDstYOff = nDstYOff;
5004 880 : poJob->args.nDstYOff2 = nDstYOff2;
5005 880 : poJob->poDstBand = poDstBand;
5006 880 : poJob->args.pszResampling = pszResampling;
5007 880 : poJob->args.bHasNoData = bHasNoData;
5008 880 : poJob->args.dfNoDataValue = dfNoDataValue;
5009 880 : poJob->args.poColorTable = poColorTable;
5010 880 : poJob->args.eSrcDataType = eSrcDataType;
5011 880 : poJob->args.bPropagateNoData = bPropagateNoData;
5012 :
5013 880 : if (poJobQueue)
5014 : {
5015 0 : poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
5016 0 : poJob->SetSrcBufferHolder(oSrcBufferHolder);
5017 0 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
5018 0 : jobList.emplace_back(std::move(poJob));
5019 : }
5020 : else
5021 : {
5022 880 : JobResampleFunc(poJob.get());
5023 880 : eErr = poJob->eErr;
5024 880 : if (eErr == CE_None)
5025 : {
5026 880 : eErr = WriteJobData(poJob.get());
5027 : }
5028 : }
5029 : }
5030 :
5031 785 : if (poJobQueue)
5032 : {
5033 0 : pChunk = nullptr;
5034 0 : pabyChunkNodataMask = nullptr;
5035 : }
5036 : }
5037 :
5038 780 : VSIFree(pChunk);
5039 780 : VSIFree(pabyChunkNodataMask);
5040 :
5041 : // Wait for all pending jobs to complete
5042 780 : while (!jobList.empty())
5043 : {
5044 0 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5045 0 : if (l_eErr != CE_None && eErr == CE_None)
5046 0 : eErr = l_eErr;
5047 : }
5048 :
5049 : /* -------------------------------------------------------------------- */
5050 : /* Renormalized overview mean / stddev if needed. */
5051 : /* -------------------------------------------------------------------- */
5052 780 : if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
5053 : {
5054 0 : GDALOverviewMagnitudeCorrection(
5055 : poSrcBand, nOverviewCount,
5056 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
5057 : GDALDummyProgress, nullptr);
5058 : }
5059 :
5060 : /* -------------------------------------------------------------------- */
5061 : /* It can be important to flush out data to overviews. */
5062 : /* -------------------------------------------------------------------- */
5063 1653 : for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
5064 : ++iOverview)
5065 : {
5066 873 : eErr = papoOvrBands[iOverview]->FlushCache(false);
5067 : }
5068 :
5069 780 : if (eErr == CE_None)
5070 780 : pfnProgress(1.0, nullptr, pProgressData);
5071 :
5072 780 : return eErr;
5073 : }
5074 :
5075 : /************************************************************************/
5076 : /* GDALRegenerateOverviewsMultiBand() */
5077 : /************************************************************************/
5078 :
5079 : /**
5080 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5081 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5082 : *
5083 : * This function will generate one or more overview images from a base
5084 : * image using the requested downsampling algorithm. Its primary use
5085 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
5086 : * can also be used to generate downsampled images in one file from another
5087 : * outside the overview architecture.
5088 : *
5089 : * The output bands need to exist in advance and share the same characteristics
5090 : * (type, dimensions)
5091 : *
5092 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5093 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE".
5094 : *
5095 : * It does not support color tables or complex data types.
5096 : *
5097 : * The pseudo-algorithm used by the function is :
5098 : * for each overview
5099 : * iterate on lines of the source by a step of deltay
5100 : * iterate on columns of the source by a step of deltax
5101 : * read the source data of size deltax * deltay for all the bands
5102 : * generate the corresponding overview block for all the bands
5103 : *
5104 : * This function will properly honour NODATA_VALUES tuples (special dataset
5105 : * metadata) so that only a given RGB triplet (in case of an RGB image) will be
5106 : * considered as the nodata value and not each value of the triplet
5107 : * independently per band.
5108 : *
5109 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
5110 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
5111 : * overview computation.
5112 : *
5113 : * @param nBands the number of bands, size of papoSrcBands and size of
5114 : * first dimension of papapoOverviewBands
5115 : * @param papoSrcBands the list of source bands to downsample
5116 : * @param nOverviews the number of downsampled overview levels being generated.
5117 : * @param papapoOverviewBands two-dimensional array of bands. First dimension is
5118 : * indexed by nBands. Second dimension is indexed by
5119 : * nOverviews.
5120 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5121 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
5122 : * @param pfnProgress progress report function.
5123 : * @param pProgressData progress function callback data.
5124 : * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
5125 : * key=value pairs, or NULL
5126 : * Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
5127 : * options can be specified to express that overviews should
5128 : * be regenerated only in the specified subset of the source
5129 : * dataset.
5130 : * @return CE_None on success or CE_Failure on failure.
5131 : */
5132 :
5133 388 : CPLErr GDALRegenerateOverviewsMultiBand(
5134 : int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
5135 : GDALRasterBand *const *const *papapoOverviewBands,
5136 : const char *pszResampling, GDALProgressFunc pfnProgress,
5137 : void *pProgressData, CSLConstList papszOptions)
5138 : {
5139 388 : CPL_IGNORE_RET_VAL(papszOptions);
5140 :
5141 388 : if (pfnProgress == nullptr)
5142 11 : pfnProgress = GDALDummyProgress;
5143 :
5144 388 : if (EQUAL(pszResampling, "NONE"))
5145 2 : return CE_None;
5146 :
5147 : // Sanity checks.
5148 386 : if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
5149 191 : !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
5150 84 : !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
5151 22 : !EQUAL(pszResampling, "CUBICSPLINE") &&
5152 21 : !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
5153 5 : !EQUAL(pszResampling, "MODE"))
5154 : {
5155 0 : CPLError(CE_Failure, CPLE_NotSupported,
5156 : "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
5157 : "not supported",
5158 : pszResampling);
5159 0 : return CE_Failure;
5160 : }
5161 :
5162 386 : int nKernelRadius = 0;
5163 : GDALResampleFunction pfnResampleFn =
5164 386 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
5165 386 : if (pfnResampleFn == nullptr)
5166 0 : return CE_Failure;
5167 :
5168 386 : const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
5169 386 : const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
5170 386 : if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
5171 0 : return CE_None;
5172 386 : GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
5173 66233 : for (int iBand = 1; iBand < nBands; ++iBand)
5174 : {
5175 131694 : if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
5176 65847 : papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
5177 : {
5178 0 : CPLError(
5179 : CE_Failure, CPLE_NotSupported,
5180 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5181 : "have the same dimensions");
5182 0 : return CE_Failure;
5183 : }
5184 65847 : if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
5185 : {
5186 0 : CPLError(
5187 : CE_Failure, CPLE_NotSupported,
5188 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5189 : "have the same data type");
5190 0 : return CE_Failure;
5191 : }
5192 : }
5193 :
5194 1032 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5195 : {
5196 646 : const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
5197 646 : const int nDstWidth = poOvrFirstBand->GetXSize();
5198 646 : const int nDstHeight = poOvrFirstBand->GetYSize();
5199 66759 : for (int iBand = 1; iBand < nBands; ++iBand)
5200 : {
5201 66113 : const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
5202 132226 : if (poOvrBand->GetXSize() != nDstWidth ||
5203 66113 : poOvrBand->GetYSize() != nDstHeight)
5204 : {
5205 0 : CPLError(
5206 : CE_Failure, CPLE_NotSupported,
5207 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5208 : "of the same level must have the same dimensions");
5209 0 : return CE_Failure;
5210 : }
5211 66113 : if (poOvrBand->GetRasterDataType() != eDataType)
5212 : {
5213 0 : CPLError(
5214 : CE_Failure, CPLE_NotSupported,
5215 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5216 : "must have the same data type as the source bands");
5217 0 : return CE_Failure;
5218 : }
5219 : }
5220 : }
5221 :
5222 : // First pass to compute the total number of pixels to write.
5223 386 : double dfTotalPixelCount = 0;
5224 386 : const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
5225 386 : const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
5226 386 : const int nSrcXSize = atoi(CSLFetchNameValueDef(
5227 : papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
5228 386 : const int nSrcYSize = atoi(CSLFetchNameValueDef(
5229 : papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
5230 1032 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5231 : {
5232 646 : dfTotalPixelCount +=
5233 1292 : static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
5234 646 : papapoOverviewBands[0][iOverview]->GetXSize() *
5235 1292 : static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
5236 646 : papapoOverviewBands[0][iOverview]->GetYSize();
5237 : }
5238 :
5239 : const GDALDataType eWrkDataType =
5240 386 : GDALGetOvrWorkDataType(pszResampling, eDataType);
5241 386 : const int nWrkDataTypeSize = GDALGetDataTypeSizeBytes(eWrkDataType);
5242 :
5243 386 : const bool bIsMask = papoSrcBands[0]->IsMaskBand();
5244 :
5245 : // If we have a nodata mask and we are doing something more complicated
5246 : // than nearest neighbouring, we have to fetch to nodata mask.
5247 : const bool bUseNoDataMask =
5248 569 : !STARTS_WITH_CI(pszResampling, "NEAR") &&
5249 183 : (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
5250 :
5251 772 : std::vector<bool> abHasNoData(nBands);
5252 772 : std::vector<double> adfNoDataValue(nBands);
5253 :
5254 66619 : for (int iBand = 0; iBand < nBands; ++iBand)
5255 : {
5256 66233 : int nHasNoData = 0;
5257 132466 : adfNoDataValue[iBand] =
5258 66233 : papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
5259 66233 : abHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
5260 : }
5261 : const bool bPropagateNoData =
5262 386 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5263 :
5264 386 : const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
5265 1544 : const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
5266 386 : ? CPLGetNumCPUs()
5267 386 : : atoi(pszThreads)));
5268 : auto poThreadPool =
5269 386 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5270 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5271 772 : : std::unique_ptr<CPLJobQueue>(nullptr);
5272 :
5273 : // Only configurable for debug / testing
5274 386 : const GIntBig nChunkMaxSize = []() -> GIntBig
5275 : {
5276 : const char *pszVal =
5277 386 : CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", nullptr);
5278 386 : if (pszVal)
5279 : {
5280 15 : GIntBig nRet = 0;
5281 15 : CPLParseMemorySize(pszVal, &nRet, nullptr);
5282 15 : return std::max<GIntBig>(100, nRet);
5283 : }
5284 371 : return 10 * 1024 * 1024;
5285 386 : }();
5286 :
5287 : // Only configurable for debug / testing
5288 386 : const GIntBig nChunkMaxSizeForTempFile = []() -> GIntBig
5289 : {
5290 386 : const char *pszVal = CPLGetConfigOption(
5291 : "GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE", nullptr);
5292 386 : if (pszVal)
5293 : {
5294 14 : GIntBig nRet = 0;
5295 14 : CPLParseMemorySize(pszVal, &nRet, nullptr);
5296 14 : return std::max<GIntBig>(100, nRet);
5297 : }
5298 372 : const auto nUsableRAM = CPLGetUsablePhysicalRAM();
5299 372 : if (nUsableRAM > 0)
5300 372 : return nUsableRAM / 10;
5301 : // Select a value to be able to at least downsample by 2 for a RGB
5302 : // 1024x1024 tiled output: (2 * 1024 + 2) * (2 * 1024 + 2) * 3 = 12 MB
5303 0 : return 100 * 1024 * 1024;
5304 386 : }();
5305 :
5306 : // Second pass to do the real job.
5307 386 : double dfCurPixelCount = 0;
5308 386 : CPLErr eErr = CE_None;
5309 1026 : for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
5310 : ++iOverview)
5311 : {
5312 645 : int iSrcOverview = -1; // -1 means the source bands.
5313 :
5314 : const int nDstTotalWidth =
5315 645 : papapoOverviewBands[0][iOverview]->GetXSize();
5316 : const int nDstTotalHeight =
5317 645 : papapoOverviewBands[0][iOverview]->GetYSize();
5318 :
5319 : // Compute the coordinates of the target region to refresh
5320 645 : constexpr double EPS = 1e-8;
5321 645 : const int nDstXOffStart = static_cast<int>(
5322 645 : static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
5323 : EPS);
5324 : const int nDstXOffEnd =
5325 1290 : std::min(static_cast<int>(
5326 645 : std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
5327 645 : nToplevelSrcWidth * nDstTotalWidth -
5328 : EPS)),
5329 645 : nDstTotalWidth);
5330 645 : const int nDstWidth = nDstXOffEnd - nDstXOffStart;
5331 645 : const int nDstYOffStart =
5332 645 : static_cast<int>(static_cast<double>(nSrcYOff) /
5333 645 : nToplevelSrcHeight * nDstTotalHeight +
5334 : EPS);
5335 : const int nDstYOffEnd =
5336 1290 : std::min(static_cast<int>(
5337 645 : std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
5338 645 : nToplevelSrcHeight * nDstTotalHeight -
5339 : EPS)),
5340 645 : nDstTotalHeight);
5341 645 : const int nDstHeight = nDstYOffEnd - nDstYOffStart;
5342 :
5343 : // Try to use previous level of overview as the source to compute
5344 : // the next level.
5345 645 : int nSrcWidth = nToplevelSrcWidth;
5346 645 : int nSrcHeight = nToplevelSrcHeight;
5347 905 : if (iOverview > 0 &&
5348 260 : papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
5349 : {
5350 252 : nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
5351 252 : nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
5352 252 : iSrcOverview = iOverview - 1;
5353 : }
5354 :
5355 645 : const double dfXRatioDstToSrc =
5356 645 : static_cast<double>(nSrcWidth) / nDstTotalWidth;
5357 645 : const double dfYRatioDstToSrc =
5358 645 : static_cast<double>(nSrcHeight) / nDstTotalHeight;
5359 :
5360 : const int nOvrFactor =
5361 1935 : std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
5362 645 : static_cast<int>(0.5 + dfYRatioDstToSrc)));
5363 :
5364 645 : int nDstChunkXSize = 0;
5365 645 : int nDstChunkYSize = 0;
5366 645 : papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
5367 : &nDstChunkYSize);
5368 :
5369 645 : constexpr int PIXEL_MARGIN = 2;
5370 : // Try to extend the chunk size so that the memory needed to acquire
5371 : // source pixels goes up to 10 MB.
5372 : // This can help for drivers that support multi-threaded reading
5373 645 : const int nFullResYChunk = static_cast<int>(std::min<double>(
5374 645 : nSrcHeight, PIXEL_MARGIN + nDstChunkYSize * dfYRatioDstToSrc));
5375 645 : const int nFullResYChunkQueried = static_cast<int>(std::min<int64_t>(
5376 1290 : nSrcHeight,
5377 1290 : nFullResYChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5378 645 : nKernelRadius * nOvrFactor));
5379 881 : while (nDstChunkXSize < nDstWidth)
5380 : {
5381 255 : constexpr int INCREASE_FACTOR = 2;
5382 :
5383 255 : const int nFullResXChunk = static_cast<int>(std::min<double>(
5384 510 : nSrcWidth, PIXEL_MARGIN + INCREASE_FACTOR * nDstChunkXSize *
5385 255 : dfXRatioDstToSrc));
5386 :
5387 : const int nFullResXChunkQueried =
5388 255 : static_cast<int>(std::min<int64_t>(
5389 510 : nSrcWidth,
5390 510 : nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5391 255 : nKernelRadius * nOvrFactor));
5392 :
5393 255 : if (static_cast<GIntBig>(nFullResXChunkQueried) *
5394 255 : nFullResYChunkQueried >
5395 255 : nChunkMaxSize / (nBands * nWrkDataTypeSize))
5396 : {
5397 19 : break;
5398 : }
5399 :
5400 236 : nDstChunkXSize *= INCREASE_FACTOR;
5401 : }
5402 645 : nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
5403 :
5404 645 : const int nFullResXChunk = static_cast<int>(std::min<double>(
5405 645 : nSrcWidth, PIXEL_MARGIN + nDstChunkXSize * dfXRatioDstToSrc));
5406 645 : const int nFullResXChunkQueried = static_cast<int>(std::min<int64_t>(
5407 1290 : nSrcWidth,
5408 1290 : nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5409 645 : nKernelRadius * nOvrFactor));
5410 :
5411 : // Make sure that the RAM requirements to acquire the source data does
5412 : // not exceed nChunkMaxSizeForTempFile
5413 : // If so, reduce the destination chunk size, generate overviews in a
5414 : // temporary dataset, and copy that temporary dataset over the target
5415 : // overview bands (to avoid issues with lossy compression)
5416 : const bool bOverflowFullResXChunkYChunkQueried =
5417 1286 : nFullResYChunkQueried > INT_MAX / (nBands * nWrkDataTypeSize) ||
5418 641 : nFullResXChunkQueried >
5419 641 : std::numeric_limits<int64_t>::max() /
5420 641 : (nFullResYChunkQueried * nBands * nWrkDataTypeSize);
5421 :
5422 645 : const auto nMemRequirement =
5423 : bOverflowFullResXChunkYChunkQueried
5424 645 : ? 0
5425 641 : : static_cast<GIntBig>(nFullResXChunkQueried) *
5426 641 : nFullResYChunkQueried * nBands * nWrkDataTypeSize;
5427 : // Use a temporary dataset with a smaller destination chunk size
5428 645 : const auto nOverShootFactor =
5429 : nMemRequirement / nChunkMaxSizeForTempFile;
5430 :
5431 645 : constexpr int MIN_OVERSHOOT_FACTOR = 4;
5432 : const auto nSqrtOverShootFactor = std::max<GIntBig>(
5433 1290 : MIN_OVERSHOOT_FACTOR, static_cast<GIntBig>(std::ceil(std::sqrt(
5434 645 : static_cast<double>(nOverShootFactor)))));
5435 645 : constexpr int DEFAULT_CHUNK_SIZE = 256;
5436 645 : constexpr int GTIFF_BLOCK_SIZE_MULTIPLE = 16;
5437 : const int nReducedDstChunkXSize =
5438 : bOverflowFullResXChunkYChunkQueried
5439 1286 : ? DEFAULT_CHUNK_SIZE
5440 1286 : : std::max(1, static_cast<int>(nDstChunkXSize /
5441 1286 : nSqrtOverShootFactor) &
5442 641 : ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
5443 : const int nReducedDstChunkYSize =
5444 : bOverflowFullResXChunkYChunkQueried
5445 1286 : ? DEFAULT_CHUNK_SIZE
5446 1286 : : std::max(1, static_cast<int>(nDstChunkYSize /
5447 1286 : nSqrtOverShootFactor) &
5448 641 : ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
5449 :
5450 645 : if (bOverflowFullResXChunkYChunkQueried ||
5451 : nMemRequirement > nChunkMaxSizeForTempFile)
5452 : {
5453 43 : const auto nDTSize = GDALGetDataTypeSizeBytes(eDataType);
5454 : const bool bTmpDSMemRequirementOverflow =
5455 43 : nDTSize * nBands >
5456 43 : std::numeric_limits<int64_t>::max() /
5457 43 : (static_cast<int64_t>(nDstWidth) * nDstHeight);
5458 43 : const auto nTmpDSMemRequirement =
5459 : bTmpDSMemRequirementOverflow
5460 43 : ? 0
5461 41 : : static_cast<GIntBig>(nDstWidth) * nDstHeight * nBands *
5462 41 : nDTSize;
5463 :
5464 : // make sure that one band buffer doesn't overflow size_t
5465 : const bool bChunkSizeOverflow =
5466 43 : static_cast<size_t>(nDTSize) >
5467 43 : std::numeric_limits<size_t>::max() /
5468 43 : (static_cast<uint64_t>(nDstWidth) * nDstHeight);
5469 43 : const size_t nChunkSize =
5470 : bChunkSizeOverflow
5471 43 : ? 0
5472 41 : : static_cast<size_t>(nDstWidth) * nDstHeight * nDTSize;
5473 :
5474 : const auto CreateVRT =
5475 41 : [nBands, nSrcWidth, nSrcHeight, nDstTotalWidth, nDstTotalHeight,
5476 : pszResampling, eWrkDataType, papoSrcBands, papapoOverviewBands,
5477 : iSrcOverview, &abHasNoData,
5478 393585 : &adfNoDataValue](int nVRTBlockXSize, int nVRTBlockYSize)
5479 : {
5480 : auto poVRTDS = std::make_unique<VRTDataset>(
5481 41 : nDstTotalWidth, nDstTotalHeight, nVRTBlockXSize,
5482 41 : nVRTBlockYSize);
5483 :
5484 65620 : for (int iBand = 0; iBand < nBands; ++iBand)
5485 : {
5486 131158 : auto poVRTSrc = std::make_unique<VRTSimpleSource>();
5487 65579 : poVRTSrc->SetResampling(pszResampling);
5488 65579 : poVRTDS->AddBand(eWrkDataType);
5489 : auto poVRTBand = static_cast<VRTSourcedRasterBand *>(
5490 65579 : poVRTDS->GetRasterBand(iBand + 1));
5491 :
5492 65579 : auto poSrcBand = papoSrcBands[iBand];
5493 65579 : if (iSrcOverview != -1)
5494 24 : poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
5495 65579 : poVRTBand->ConfigureSource(
5496 : poVRTSrc.get(), poSrcBand, false, 0, 0, nSrcWidth,
5497 : nSrcHeight, 0, 0, nDstTotalWidth, nDstTotalHeight);
5498 : // Add the source to the band
5499 65579 : poVRTBand->AddSource(poVRTSrc.release());
5500 65579 : if (abHasNoData[iBand])
5501 3 : poVRTBand->SetNoDataValue(adfNoDataValue[iBand]);
5502 : }
5503 :
5504 42 : if (papoSrcBands[0]->GetMaskFlags() == GMF_PER_DATASET &&
5505 1 : poVRTDS->CreateMaskBand(GMF_PER_DATASET) == CE_None)
5506 : {
5507 : VRTSourcedRasterBand *poMaskVRTBand =
5508 1 : cpl::down_cast<VRTSourcedRasterBand *>(
5509 1 : poVRTDS->GetRasterBand(1)->GetMaskBand());
5510 1 : auto poSrcBand = papoSrcBands[0];
5511 1 : if (iSrcOverview != -1)
5512 0 : poSrcBand = papapoOverviewBands[0][iSrcOverview];
5513 1 : poMaskVRTBand->AddMaskBandSource(
5514 1 : poSrcBand->GetMaskBand(), 0, 0, nSrcWidth, nSrcHeight,
5515 : 0, 0, nDstTotalWidth, nDstTotalHeight);
5516 : }
5517 :
5518 41 : return poVRTDS;
5519 43 : };
5520 :
5521 : // If the overview accommodates chunking, do so and recurse
5522 : // to avoid generating full size temporary files
5523 43 : if (!bOverflowFullResXChunkYChunkQueried &&
5524 39 : !bTmpDSMemRequirementOverflow && !bChunkSizeOverflow &&
5525 39 : (nDstChunkXSize < nDstWidth || nDstChunkYSize < nDstHeight))
5526 : {
5527 : // Create a VRT with the smaller chunk to do the scaling
5528 : auto poVRTDS =
5529 13 : CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
5530 :
5531 13 : std::vector<GDALRasterBand *> apoVRTBand(nBands);
5532 13 : std::vector<GDALRasterBand *> apoDstBand(nBands);
5533 65560 : for (int iBand = 0; iBand < nBands; ++iBand)
5534 : {
5535 65547 : apoDstBand[iBand] = papapoOverviewBands[iBand][iOverview];
5536 65547 : apoVRTBand[iBand] = poVRTDS->GetRasterBand(iBand + 1);
5537 : }
5538 :
5539 : // Use a flag to avoid reading from the overview being built
5540 : GDALRasterIOExtraArg sExtraArg;
5541 13 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5542 13 : if (iSrcOverview == -1)
5543 13 : sExtraArg.bUseOnlyThisScale = true;
5544 :
5545 : // A single band buffer for data transfer to the overview
5546 13 : std::vector<GByte> abyChunk;
5547 : try
5548 : {
5549 13 : abyChunk.resize(nChunkSize);
5550 : }
5551 0 : catch (const std::exception &)
5552 : {
5553 0 : CPLError(CE_Failure, CPLE_OutOfMemory,
5554 : "Out of memory allocating temporary buffer");
5555 0 : return CE_Failure;
5556 : }
5557 :
5558 : // Loop over output height, in chunks
5559 13 : for (int nDstYOff = nDstYOffStart;
5560 38 : nDstYOff < nDstYOffEnd && eErr == CE_None;
5561 : /* */)
5562 : {
5563 : const int nDstYCount =
5564 25 : std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
5565 : // Loop over output width, in output chunks
5566 25 : for (int nDstXOff = nDstXOffStart;
5567 74 : nDstXOff < nDstXOffEnd && eErr == CE_None;
5568 : /* */)
5569 : {
5570 : const int nDstXCount =
5571 49 : std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
5572 : // Read and transfer the chunk to the overview
5573 98 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
5574 : ++iBand)
5575 : {
5576 98 : eErr = apoVRTBand[iBand]->RasterIO(
5577 : GF_Read, nDstXOff, nDstYOff, nDstXCount,
5578 49 : nDstYCount, abyChunk.data(), nDstXCount,
5579 : nDstYCount, eDataType, 0, 0, &sExtraArg);
5580 49 : if (eErr == CE_None)
5581 : {
5582 96 : eErr = apoDstBand[iBand]->RasterIO(
5583 : GF_Write, nDstXOff, nDstYOff, nDstXCount,
5584 48 : nDstYCount, abyChunk.data(), nDstXCount,
5585 : nDstYCount, eDataType, 0, 0, nullptr);
5586 : }
5587 : }
5588 :
5589 49 : dfCurPixelCount +=
5590 49 : static_cast<double>(nDstXCount) * nDstYCount;
5591 :
5592 49 : nDstXOff += nDstXCount;
5593 : } // width
5594 :
5595 25 : if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount,
5596 : nullptr, pProgressData))
5597 : {
5598 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
5599 : "User terminated");
5600 0 : eErr = CE_Failure;
5601 : }
5602 :
5603 25 : nDstYOff += nDstYCount;
5604 : } // height
5605 :
5606 13 : if (CE_None != eErr)
5607 : {
5608 1 : CPLError(CE_Failure, CPLE_AppDefined,
5609 : "Error while writing overview");
5610 1 : return CE_Failure;
5611 : }
5612 :
5613 12 : pfnProgress(1.0, nullptr, pProgressData);
5614 : // Flush the overviews we just generated
5615 24 : for (int iBand = 0; iBand < nBands; ++iBand)
5616 12 : apoDstBand[iBand]->FlushCache(false);
5617 :
5618 12 : continue; // Next overview
5619 : } // chunking via temporary dataset
5620 :
5621 0 : std::unique_ptr<GDALDataset> poTmpDS;
5622 : // Config option mostly/only for autotest purposes
5623 : const char *pszGDAL_OVR_TEMP_DRIVER =
5624 30 : CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
5625 30 : if ((!bTmpDSMemRequirementOverflow &&
5626 4 : nTmpDSMemRequirement <= nChunkMaxSizeForTempFile &&
5627 4 : !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
5628 26 : EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
5629 : {
5630 10 : auto poTmpDrv = GetGDALDriverManager()->GetDriverByName("MEM");
5631 10 : if (!poTmpDrv)
5632 : {
5633 0 : eErr = CE_Failure;
5634 0 : break;
5635 : }
5636 10 : poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
5637 : nDstTotalHeight, nBands,
5638 10 : eDataType, nullptr));
5639 : }
5640 : else
5641 : {
5642 : // Create a temporary file for the overview
5643 : auto poTmpDrv =
5644 20 : GetGDALDriverManager()->GetDriverByName("GTiff");
5645 20 : if (!poTmpDrv)
5646 : {
5647 0 : eErr = CE_Failure;
5648 0 : break;
5649 : }
5650 40 : std::string osTmpFilename;
5651 20 : auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
5652 20 : if (poDstDS)
5653 : {
5654 20 : osTmpFilename = poDstDS->GetDescription();
5655 : VSIStatBufL sStatBuf;
5656 20 : if (!osTmpFilename.empty() &&
5657 0 : VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
5658 0 : osTmpFilename += "_tmp_ovr.tif";
5659 : }
5660 20 : if (osTmpFilename.empty())
5661 : {
5662 20 : osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
5663 20 : osTmpFilename += ".tif";
5664 : }
5665 20 : CPLDebug("GDAL", "Creating temporary file %s of %d x %d x %d",
5666 : osTmpFilename.c_str(), nDstWidth, nDstHeight, nBands);
5667 40 : CPLStringList aosCO;
5668 20 : if (0 == ((nReducedDstChunkXSize % GTIFF_BLOCK_SIZE_MULTIPLE) |
5669 20 : (nReducedDstChunkYSize % GTIFF_BLOCK_SIZE_MULTIPLE)))
5670 : {
5671 14 : aosCO.SetNameValue("TILED", "YES");
5672 : aosCO.SetNameValue("BLOCKXSIZE",
5673 14 : CPLSPrintf("%d", nReducedDstChunkXSize));
5674 : aosCO.SetNameValue("BLOCKYSIZE",
5675 14 : CPLSPrintf("%d", nReducedDstChunkYSize));
5676 : }
5677 20 : if (const char *pszCOList =
5678 20 : poTmpDrv->GetMetadataItem(GDAL_DMD_CREATIONOPTIONLIST))
5679 : {
5680 : aosCO.SetNameValue(
5681 20 : "COMPRESS", strstr(pszCOList, "ZSTD") ? "ZSTD" : "LZW");
5682 : }
5683 20 : poTmpDS.reset(poTmpDrv->Create(osTmpFilename.c_str(), nDstWidth,
5684 : nDstHeight, nBands, eDataType,
5685 20 : aosCO.List()));
5686 20 : if (poTmpDS)
5687 : {
5688 18 : poTmpDS->MarkSuppressOnClose();
5689 18 : VSIUnlink(osTmpFilename.c_str());
5690 : }
5691 : }
5692 30 : if (!poTmpDS)
5693 : {
5694 2 : eErr = CE_Failure;
5695 2 : break;
5696 : }
5697 :
5698 : // Create a full size VRT to do the resampling without edge effects
5699 : auto poVRTDS =
5700 28 : CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
5701 :
5702 : // Allocate a band buffer with the overview chunk size
5703 : std::unique_ptr<void, VSIFreeReleaser> pDstBuffer(
5704 : VSI_MALLOC3_VERBOSE(size_t(nWrkDataTypeSize), nDstChunkXSize,
5705 28 : nDstChunkYSize));
5706 28 : if (pDstBuffer == nullptr)
5707 : {
5708 0 : eErr = CE_Failure;
5709 0 : break;
5710 : }
5711 :
5712 : // Use a flag to avoid reading the overview being built
5713 : GDALRasterIOExtraArg sExtraArg;
5714 28 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5715 28 : if (iSrcOverview == -1)
5716 4 : sExtraArg.bUseOnlyThisScale = true;
5717 :
5718 : // Scale and copy data from the VRT to the temp file
5719 28 : for (int nDstYOff = nDstYOffStart;
5720 914 : nDstYOff < nDstYOffEnd && eErr == CE_None;
5721 : /* */)
5722 : {
5723 : const int nDstYCount =
5724 886 : std::min(nReducedDstChunkYSize, nDstYOffEnd - nDstYOff);
5725 886 : for (int nDstXOff = nDstXOffStart;
5726 201218 : nDstXOff < nDstXOffEnd && eErr == CE_None;
5727 : /* */)
5728 : {
5729 : const int nDstXCount =
5730 200332 : std::min(nReducedDstChunkXSize, nDstXOffEnd - nDstXOff);
5731 400668 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
5732 : ++iBand)
5733 : {
5734 200336 : auto poSrcBand = poVRTDS->GetRasterBand(iBand + 1);
5735 200336 : eErr = poSrcBand->RasterIO(
5736 : GF_Read, nDstXOff, nDstYOff, nDstXCount, nDstYCount,
5737 : pDstBuffer.get(), nDstXCount, nDstYCount,
5738 : eWrkDataType, 0, 0, &sExtraArg);
5739 200336 : if (eErr == CE_None)
5740 : {
5741 : // Write to the temporary dataset, shifted
5742 200334 : auto poOvrBand = poTmpDS->GetRasterBand(iBand + 1);
5743 200334 : eErr = poOvrBand->RasterIO(
5744 : GF_Write, nDstXOff - nDstXOffStart,
5745 : nDstYOff - nDstYOffStart, nDstXCount,
5746 : nDstYCount, pDstBuffer.get(), nDstXCount,
5747 : nDstYCount, eWrkDataType, 0, 0, nullptr);
5748 : }
5749 : }
5750 200332 : nDstXOff += nDstXCount;
5751 : }
5752 886 : nDstYOff += nDstYCount;
5753 : }
5754 :
5755 : // Copy from the temporary to the overview
5756 28 : for (int nDstYOff = nDstYOffStart;
5757 54 : nDstYOff < nDstYOffEnd && eErr == CE_None;
5758 : /* */)
5759 : {
5760 : const int nDstYCount =
5761 26 : std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
5762 26 : for (int nDstXOff = nDstXOffStart;
5763 52 : nDstXOff < nDstXOffEnd && eErr == CE_None;
5764 : /* */)
5765 : {
5766 : const int nDstXCount =
5767 26 : std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
5768 56 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
5769 : ++iBand)
5770 : {
5771 30 : auto poSrcBand = poTmpDS->GetRasterBand(iBand + 1);
5772 30 : eErr = poSrcBand->RasterIO(
5773 : GF_Read, nDstXOff - nDstXOffStart,
5774 : nDstYOff - nDstYOffStart, nDstXCount, nDstYCount,
5775 : pDstBuffer.get(), nDstXCount, nDstYCount,
5776 : eWrkDataType, 0, 0, nullptr);
5777 30 : if (eErr == CE_None)
5778 : {
5779 : // Write to the destination overview bands
5780 30 : auto poOvrBand =
5781 30 : papapoOverviewBands[iBand][iOverview];
5782 30 : eErr = poOvrBand->RasterIO(
5783 : GF_Write, nDstXOff, nDstYOff, nDstXCount,
5784 : nDstYCount, pDstBuffer.get(), nDstXCount,
5785 : nDstYCount, eWrkDataType, 0, 0, nullptr);
5786 : }
5787 : }
5788 26 : nDstXOff += nDstXCount;
5789 : }
5790 26 : nDstYOff += nDstYCount;
5791 : }
5792 :
5793 28 : if (eErr != CE_None)
5794 : {
5795 2 : CPLError(CE_Failure, CPLE_AppDefined,
5796 : "Failed to write overview %d", iOverview);
5797 2 : return eErr;
5798 : }
5799 :
5800 : // Flush the data to overviews.
5801 56 : for (int iBand = 0; iBand < nBands; ++iBand)
5802 30 : papapoOverviewBands[iBand][iOverview]->FlushCache(false);
5803 :
5804 26 : continue;
5805 : }
5806 :
5807 : // Structure describing a resampling job
5808 : struct OvrJob
5809 : {
5810 : // Buffers to free when job is finished
5811 : std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
5812 : std::unique_ptr<PointerHolder> oSrcBufferHolder{};
5813 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
5814 :
5815 : GDALRasterBand *poDstBand = nullptr;
5816 :
5817 : // Input parameters of pfnResampleFn
5818 : GDALResampleFunction pfnResampleFn = nullptr;
5819 : GDALOverviewResampleArgs args{};
5820 : const void *pChunk = nullptr;
5821 :
5822 : // Output values of resampling function
5823 : CPLErr eErr = CE_Failure;
5824 : void *pDstBuffer = nullptr;
5825 : GDALDataType eDstBufferDataType = GDT_Unknown;
5826 :
5827 : // Synchronization
5828 : bool bFinished = false;
5829 : std::mutex mutex{};
5830 : std::condition_variable cv{};
5831 : };
5832 :
5833 : // Thread function to resample
5834 3310 : const auto JobResampleFunc = [](void *pData)
5835 : {
5836 3310 : OvrJob *poJob = static_cast<OvrJob *>(pData);
5837 :
5838 3310 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
5839 : &(poJob->pDstBuffer),
5840 : &(poJob->eDstBufferDataType));
5841 :
5842 3310 : poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
5843 :
5844 : {
5845 6620 : std::lock_guard<std::mutex> guard(poJob->mutex);
5846 3310 : poJob->bFinished = true;
5847 3310 : poJob->cv.notify_one();
5848 : }
5849 3310 : };
5850 :
5851 : // Function to write resample data to target band
5852 3310 : const auto WriteJobData = [](const OvrJob *poJob)
5853 : {
5854 6620 : return poJob->poDstBand->RasterIO(
5855 3310 : GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
5856 3310 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
5857 3310 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
5858 3310 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
5859 3310 : poJob->args.nDstYOff2 - poJob->args.nDstYOff,
5860 3310 : poJob->eDstBufferDataType, 0, 0, nullptr);
5861 : };
5862 :
5863 : // Wait for completion of oldest job and serialize it
5864 : const auto WaitAndFinalizeOldestJob =
5865 16 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
5866 : {
5867 16 : auto poOldestJob = jobList.front().get();
5868 : {
5869 32 : std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
5870 : // coverity[missing_lock:FALSE]
5871 22 : while (!poOldestJob->bFinished)
5872 : {
5873 6 : poOldestJob->cv.wait(oGuard);
5874 : }
5875 : }
5876 16 : CPLErr l_eErr = poOldestJob->eErr;
5877 16 : if (l_eErr == CE_None)
5878 : {
5879 16 : l_eErr = WriteJobData(poOldestJob);
5880 : }
5881 :
5882 16 : jobList.pop_front();
5883 16 : return l_eErr;
5884 : };
5885 :
5886 : // Queue of jobs
5887 1204 : std::list<std::unique_ptr<OvrJob>> jobList;
5888 :
5889 1204 : std::vector<std::unique_ptr<void, VSIFreeReleaser>> apaChunk(nBands);
5890 : std::vector<std::unique_ptr<GByte, VSIFreeReleaser>>
5891 1204 : apabyChunkNoDataMask(nBands);
5892 :
5893 : // Iterate on destination overview, block by block.
5894 602 : for (int nDstYOff = nDstYOffStart;
5895 2111 : nDstYOff < nDstYOffEnd && eErr == CE_None;
5896 1509 : nDstYOff += nDstChunkYSize)
5897 : {
5898 : int nDstYCount;
5899 1509 : if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
5900 1099 : nDstYCount = nDstChunkYSize;
5901 : else
5902 410 : nDstYCount = nDstYOffEnd - nDstYOff;
5903 :
5904 1509 : int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
5905 1509 : int nChunkYOff2 = static_cast<int>(
5906 1509 : ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
5907 1509 : if (nChunkYOff2 > nSrcHeight ||
5908 1509 : nDstYOff + nDstYCount == nDstTotalHeight)
5909 595 : nChunkYOff2 = nSrcHeight;
5910 1509 : int nYCount = nChunkYOff2 - nChunkYOff;
5911 1509 : CPLAssert(nYCount <= nFullResYChunk);
5912 :
5913 1509 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
5914 1509 : int nChunkYSizeQueried =
5915 1509 : nYCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
5916 1509 : if (nChunkYOffQueried < 0)
5917 : {
5918 148 : nChunkYSizeQueried += nChunkYOffQueried;
5919 148 : nChunkYOffQueried = 0;
5920 : }
5921 1509 : if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
5922 147 : nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
5923 1509 : CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
5924 :
5925 1509 : if (!pfnProgress(std::min(1.0, dfCurPixelCount / dfTotalPixelCount),
5926 : nullptr, pProgressData))
5927 : {
5928 1 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5929 1 : eErr = CE_Failure;
5930 : }
5931 :
5932 : // Iterate on destination overview, block by block.
5933 1509 : for (int nDstXOff = nDstXOffStart;
5934 3057 : nDstXOff < nDstXOffEnd && eErr == CE_None;
5935 1548 : nDstXOff += nDstChunkXSize)
5936 : {
5937 1548 : int nDstXCount = 0;
5938 1548 : if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
5939 1531 : nDstXCount = nDstChunkXSize;
5940 : else
5941 17 : nDstXCount = nDstXOffEnd - nDstXOff;
5942 :
5943 1548 : dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
5944 :
5945 1548 : int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
5946 1548 : int nChunkXOff2 = static_cast<int>(
5947 1548 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
5948 1548 : if (nChunkXOff2 > nSrcWidth ||
5949 1548 : nDstXOff + nDstXCount == nDstTotalWidth)
5950 1473 : nChunkXOff2 = nSrcWidth;
5951 1548 : const int nXCount = nChunkXOff2 - nChunkXOff;
5952 1548 : CPLAssert(nXCount <= nFullResXChunk);
5953 :
5954 1548 : int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
5955 1548 : int nChunkXSizeQueried =
5956 1548 : nXCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
5957 1548 : if (nChunkXOffQueried < 0)
5958 : {
5959 208 : nChunkXSizeQueried += nChunkXOffQueried;
5960 208 : nChunkXOffQueried = 0;
5961 : }
5962 1548 : if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
5963 217 : nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
5964 1548 : CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
5965 : #if DEBUG_VERBOSE
5966 : CPLDebug("GDAL",
5967 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
5968 : nChunkXOffQueried, nChunkYOffQueried,
5969 : nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
5970 : nDstYOff, nDstXCount, nDstYCount);
5971 : #endif
5972 :
5973 : // Avoid accumulating too many tasks and exhaust RAM
5974 :
5975 : // Try to complete already finished jobs
5976 1548 : while (eErr == CE_None && !jobList.empty())
5977 : {
5978 2 : auto poOldestJob = jobList.front().get();
5979 : {
5980 2 : std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
5981 2 : if (!poOldestJob->bFinished)
5982 : {
5983 2 : break;
5984 : }
5985 : }
5986 0 : eErr = poOldestJob->eErr;
5987 0 : if (eErr == CE_None)
5988 : {
5989 0 : eErr = WriteJobData(poOldestJob);
5990 : }
5991 :
5992 0 : jobList.pop_front();
5993 : }
5994 :
5995 : // And in case we have saturated the number of threads,
5996 : // wait for completion of tasks to go below the threshold.
5997 3096 : while (eErr == CE_None &&
5998 1548 : jobList.size() >= static_cast<size_t>(nThreads))
5999 : {
6000 0 : eErr = WaitAndFinalizeOldestJob(jobList);
6001 : }
6002 :
6003 : // Read the source buffers for all the bands.
6004 4859 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6005 : {
6006 : // (Re)allocate buffers if needed
6007 3311 : if (apaChunk[iBand] == nullptr)
6008 : {
6009 1179 : apaChunk[iBand].reset(VSI_MALLOC3_VERBOSE(
6010 : nFullResXChunkQueried, nFullResYChunkQueried,
6011 : nWrkDataTypeSize));
6012 1179 : if (apaChunk[iBand] == nullptr)
6013 : {
6014 0 : eErr = CE_Failure;
6015 : }
6016 : }
6017 3652 : if (bUseNoDataMask &&
6018 341 : apabyChunkNoDataMask[iBand] == nullptr)
6019 : {
6020 282 : apabyChunkNoDataMask[iBand].reset(
6021 282 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
6022 : nFullResXChunkQueried, nFullResYChunkQueried)));
6023 282 : if (apabyChunkNoDataMask[iBand] == nullptr)
6024 : {
6025 0 : eErr = CE_Failure;
6026 : }
6027 : }
6028 :
6029 3311 : if (eErr == CE_None)
6030 : {
6031 3311 : GDALRasterBand *poSrcBand = nullptr;
6032 3311 : if (iSrcOverview == -1)
6033 2409 : poSrcBand = papoSrcBands[iBand];
6034 : else
6035 902 : poSrcBand =
6036 902 : papapoOverviewBands[iBand][iSrcOverview];
6037 3311 : eErr = poSrcBand->RasterIO(
6038 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6039 : nChunkXSizeQueried, nChunkYSizeQueried,
6040 3311 : apaChunk[iBand].get(), nChunkXSizeQueried,
6041 : nChunkYSizeQueried, eWrkDataType, 0, 0, nullptr);
6042 :
6043 3311 : if (bUseNoDataMask && eErr == CE_None)
6044 : {
6045 341 : auto poMaskBand = poSrcBand->IsMaskBand()
6046 341 : ? poSrcBand
6047 262 : : poSrcBand->GetMaskBand();
6048 341 : eErr = poMaskBand->RasterIO(
6049 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6050 : nChunkXSizeQueried, nChunkYSizeQueried,
6051 341 : apabyChunkNoDataMask[iBand].get(),
6052 : nChunkXSizeQueried, nChunkYSizeQueried,
6053 : GDT_Byte, 0, 0, nullptr);
6054 : }
6055 : }
6056 : }
6057 :
6058 : // Compute the resulting overview block.
6059 4858 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6060 : {
6061 6620 : auto poJob = std::make_unique<OvrJob>();
6062 3310 : poJob->pfnResampleFn = pfnResampleFn;
6063 3310 : poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
6064 6620 : poJob->args.eOvrDataType =
6065 3310 : poJob->poDstBand->GetRasterDataType();
6066 3310 : poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
6067 3310 : poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
6068 3310 : const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
6069 3310 : "NBITS", "IMAGE_STRUCTURE");
6070 3310 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
6071 3310 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
6072 3310 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
6073 3310 : poJob->args.eWrkDataType = eWrkDataType;
6074 3310 : poJob->pChunk = apaChunk[iBand].get();
6075 3310 : poJob->args.pabyChunkNodataMask =
6076 3310 : apabyChunkNoDataMask[iBand].get();
6077 3310 : poJob->args.nChunkXOff = nChunkXOffQueried;
6078 3310 : poJob->args.nChunkXSize = nChunkXSizeQueried;
6079 3310 : poJob->args.nChunkYOff = nChunkYOffQueried;
6080 3310 : poJob->args.nChunkYSize = nChunkYSizeQueried;
6081 3310 : poJob->args.nDstXOff = nDstXOff;
6082 3310 : poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
6083 3310 : poJob->args.nDstYOff = nDstYOff;
6084 3310 : poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
6085 3310 : poJob->args.pszResampling = pszResampling;
6086 3310 : poJob->args.bHasNoData = abHasNoData[iBand];
6087 3310 : poJob->args.dfNoDataValue = adfNoDataValue[iBand];
6088 3310 : poJob->args.eSrcDataType = eDataType;
6089 3310 : poJob->args.bPropagateNoData = bPropagateNoData;
6090 :
6091 3310 : if (poJobQueue)
6092 : {
6093 32 : poJob->oSrcMaskBufferHolder.reset(new PointerHolder(
6094 16 : apabyChunkNoDataMask[iBand].release()));
6095 :
6096 32 : poJob->oSrcBufferHolder.reset(
6097 16 : new PointerHolder(apaChunk[iBand].release()));
6098 :
6099 16 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
6100 16 : jobList.emplace_back(std::move(poJob));
6101 : }
6102 : else
6103 : {
6104 3294 : JobResampleFunc(poJob.get());
6105 3294 : eErr = poJob->eErr;
6106 3294 : if (eErr == CE_None)
6107 : {
6108 3294 : eErr = WriteJobData(poJob.get());
6109 : }
6110 : }
6111 : }
6112 : }
6113 : }
6114 :
6115 : // Wait for all pending jobs to complete
6116 618 : while (!jobList.empty())
6117 : {
6118 16 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
6119 16 : if (l_eErr != CE_None && eErr == CE_None)
6120 0 : eErr = l_eErr;
6121 : }
6122 :
6123 : // Flush the data to overviews.
6124 1779 : for (int iBand = 0; iBand < nBands; ++iBand)
6125 : {
6126 1177 : if (papapoOverviewBands[iBand][iOverview]->FlushCache(false) !=
6127 : CE_None)
6128 0 : eErr = CE_Failure;
6129 : }
6130 : }
6131 :
6132 383 : if (eErr == CE_None)
6133 379 : pfnProgress(1.0, nullptr, pProgressData);
6134 :
6135 383 : return eErr;
6136 : }
6137 :
6138 : /************************************************************************/
6139 : /* GDALRegenerateOverviewsMultiBand() */
6140 : /************************************************************************/
6141 :
6142 : /**
6143 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
6144 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
6145 : *
6146 : * This function will generate one or more overview images from a base
6147 : * image using the requested downsampling algorithm. Its primary use
6148 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
6149 : * can also be used to generate downsampled images in one file from another
6150 : * outside the overview architecture.
6151 : *
6152 : * The output bands need to exist in advance and share the same characteristics
6153 : * (type, dimensions)
6154 : *
6155 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
6156 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
6157 : *
6158 : * It does not support color tables or complex data types.
6159 : *
6160 : * The pseudo-algorithm used by the function is :
6161 : * for each overview
6162 : * iterate on lines of the source by a step of deltay
6163 : * iterate on columns of the source by a step of deltax
6164 : * read the source data of size deltax * deltay for all the bands
6165 : * generate the corresponding overview block for all the bands
6166 : *
6167 : * This function will honour properly NODATA_VALUES tuples (special dataset
6168 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
6169 : * considered as the nodata value and not each value of the triplet
6170 : * independently per band.
6171 : *
6172 : * The GDAL_NUM_THREADS configuration option can be set
6173 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
6174 : * overview computation.
6175 : *
6176 : * @param apoSrcBands the list of source bands to downsample
6177 : * @param aapoOverviewBands two-dimensional array of bands. First dimension is
6178 : * indexed by bands. Second dimension is indexed by
6179 : * overview levels. All aapoOverviewBands[i] arrays
6180 : * must have the same size (i.e. same number of
6181 : * overviews)
6182 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
6183 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
6184 : * @param pfnProgress progress report function.
6185 : * @param pProgressData progress function callback data.
6186 : * @param papszOptions NULL terminated list of options as
6187 : * key=value pairs, or NULL
6188 : * The XOFF, YOFF, XSIZE and YSIZE
6189 : * options can be specified to express that overviews should
6190 : * be regenerated only in the specified subset of the source
6191 : * dataset.
6192 : * @return CE_None on success or CE_Failure on failure.
6193 : * @since 3.10
6194 : */
6195 :
6196 19 : CPLErr GDALRegenerateOverviewsMultiBand(
6197 : const std::vector<GDALRasterBand *> &apoSrcBands,
6198 : const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
6199 : const char *pszResampling, GDALProgressFunc pfnProgress,
6200 : void *pProgressData, CSLConstList papszOptions)
6201 : {
6202 19 : CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
6203 29 : for (size_t i = 1; i < aapoOverviewBands.size(); ++i)
6204 : {
6205 10 : CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());
6206 : }
6207 :
6208 19 : if (aapoOverviewBands.empty())
6209 0 : return CE_None;
6210 :
6211 19 : std::vector<GDALRasterBand **> apapoOverviewBands;
6212 48 : for (auto &apoOverviewBands : aapoOverviewBands)
6213 : {
6214 : auto papoOverviewBands = static_cast<GDALRasterBand **>(
6215 29 : CPLMalloc(apoOverviewBands.size() * sizeof(GDALRasterBand *)));
6216 61 : for (size_t i = 0; i < apoOverviewBands.size(); ++i)
6217 : {
6218 32 : papoOverviewBands[i] = apoOverviewBands[i];
6219 : }
6220 29 : apapoOverviewBands.push_back(papoOverviewBands);
6221 : }
6222 38 : const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
6223 19 : static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
6224 19 : static_cast<int>(aapoOverviewBands[0].size()),
6225 19 : apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
6226 : papszOptions);
6227 48 : for (GDALRasterBand **papoOverviewBands : apapoOverviewBands)
6228 29 : CPLFree(papoOverviewBands);
6229 19 : return eErr;
6230 : }
6231 :
6232 : /************************************************************************/
6233 : /* GDALComputeBandStats() */
6234 : /************************************************************************/
6235 :
6236 : /** Undocumented
6237 : * @param hSrcBand undocumented.
6238 : * @param nSampleStep Step between scanlines used to compute statistics.
6239 : * When nSampleStep is equal to 1, all scanlines will
6240 : * be processed.
6241 : * @param pdfMean undocumented.
6242 : * @param pdfStdDev undocumented.
6243 : * @param pfnProgress undocumented.
6244 : * @param pProgressData undocumented.
6245 : * @return undocumented
6246 : */
6247 18 : CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
6248 : int nSampleStep, double *pdfMean,
6249 : double *pdfStdDev,
6250 : GDALProgressFunc pfnProgress,
6251 : void *pProgressData)
6252 :
6253 : {
6254 18 : VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
6255 :
6256 18 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
6257 :
6258 18 : if (pfnProgress == nullptr)
6259 18 : pfnProgress = GDALDummyProgress;
6260 :
6261 18 : const int nWidth = poSrcBand->GetXSize();
6262 18 : const int nHeight = poSrcBand->GetYSize();
6263 :
6264 18 : if (nSampleStep >= nHeight || nSampleStep < 1)
6265 5 : nSampleStep = 1;
6266 :
6267 18 : GDALDataType eWrkType = GDT_Unknown;
6268 18 : float *pafData = nullptr;
6269 18 : GDALDataType eType = poSrcBand->GetRasterDataType();
6270 18 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6271 18 : if (bComplex)
6272 : {
6273 : pafData = static_cast<float *>(
6274 0 : VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6275 0 : eWrkType = GDT_CFloat32;
6276 : }
6277 : else
6278 : {
6279 : pafData =
6280 18 : static_cast<float *>(VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6281 18 : eWrkType = GDT_Float32;
6282 : }
6283 :
6284 18 : if (nWidth == 0 || pafData == nullptr)
6285 : {
6286 0 : VSIFree(pafData);
6287 0 : return CE_Failure;
6288 : }
6289 :
6290 : /* -------------------------------------------------------------------- */
6291 : /* Loop over all sample lines. */
6292 : /* -------------------------------------------------------------------- */
6293 18 : double dfSum = 0.0;
6294 18 : double dfSum2 = 0.0;
6295 18 : int iLine = 0;
6296 18 : GIntBig nSamples = 0;
6297 :
6298 2143 : do
6299 : {
6300 2161 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6301 : pProgressData))
6302 : {
6303 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6304 0 : CPLFree(pafData);
6305 0 : return CE_Failure;
6306 : }
6307 :
6308 : const CPLErr eErr =
6309 2161 : poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
6310 : 1, eWrkType, 0, 0, nullptr);
6311 2161 : if (eErr != CE_None)
6312 : {
6313 1 : CPLFree(pafData);
6314 1 : return eErr;
6315 : }
6316 :
6317 725208 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6318 : {
6319 723048 : float fValue = 0.0f;
6320 :
6321 723048 : if (bComplex)
6322 : {
6323 : // Compute the magnitude of the complex value.
6324 : fValue =
6325 0 : std::hypot(pafData[iPixel * 2], pafData[iPixel * 2 + 1]);
6326 : }
6327 : else
6328 : {
6329 723048 : fValue = pafData[iPixel];
6330 : }
6331 :
6332 723048 : dfSum += fValue;
6333 723048 : dfSum2 += static_cast<double>(fValue) * fValue;
6334 : }
6335 :
6336 2160 : nSamples += nWidth;
6337 2160 : iLine += nSampleStep;
6338 2160 : } while (iLine < nHeight);
6339 :
6340 17 : if (!pfnProgress(1.0, nullptr, pProgressData))
6341 : {
6342 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6343 0 : CPLFree(pafData);
6344 0 : return CE_Failure;
6345 : }
6346 :
6347 : /* -------------------------------------------------------------------- */
6348 : /* Produce the result values. */
6349 : /* -------------------------------------------------------------------- */
6350 17 : if (pdfMean != nullptr)
6351 17 : *pdfMean = dfSum / nSamples;
6352 :
6353 17 : if (pdfStdDev != nullptr)
6354 : {
6355 17 : const double dfMean = dfSum / nSamples;
6356 :
6357 17 : *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
6358 : }
6359 :
6360 17 : CPLFree(pafData);
6361 :
6362 17 : return CE_None;
6363 : }
6364 :
6365 : /************************************************************************/
6366 : /* GDALOverviewMagnitudeCorrection() */
6367 : /* */
6368 : /* Correct the mean and standard deviation of the overviews of */
6369 : /* the given band to match the base layer approximately. */
6370 : /************************************************************************/
6371 :
6372 : /** Undocumented
6373 : * @param hBaseBand undocumented.
6374 : * @param nOverviewCount undocumented.
6375 : * @param pahOverviews undocumented.
6376 : * @param pfnProgress undocumented.
6377 : * @param pProgressData undocumented.
6378 : * @return undocumented
6379 : */
6380 0 : CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
6381 : int nOverviewCount,
6382 : GDALRasterBandH *pahOverviews,
6383 : GDALProgressFunc pfnProgress,
6384 : void *pProgressData)
6385 :
6386 : {
6387 0 : VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
6388 :
6389 : /* -------------------------------------------------------------------- */
6390 : /* Compute mean/stddev for source raster. */
6391 : /* -------------------------------------------------------------------- */
6392 0 : double dfOrigMean = 0.0;
6393 0 : double dfOrigStdDev = 0.0;
6394 : {
6395 : const CPLErr eErr =
6396 0 : GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
6397 : pfnProgress, pProgressData);
6398 :
6399 0 : if (eErr != CE_None)
6400 0 : return eErr;
6401 : }
6402 :
6403 : /* -------------------------------------------------------------------- */
6404 : /* Loop on overview bands. */
6405 : /* -------------------------------------------------------------------- */
6406 0 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
6407 : {
6408 : GDALRasterBand *poOverview =
6409 0 : GDALRasterBand::FromHandle(pahOverviews[iOverview]);
6410 : double dfOverviewMean, dfOverviewStdDev;
6411 :
6412 : const CPLErr eErr =
6413 0 : GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
6414 : &dfOverviewStdDev, pfnProgress, pProgressData);
6415 :
6416 0 : if (eErr != CE_None)
6417 0 : return eErr;
6418 :
6419 0 : double dfGain = 1.0;
6420 0 : if (dfOrigStdDev >= 0.0001)
6421 0 : dfGain = dfOrigStdDev / dfOverviewStdDev;
6422 :
6423 : /* --------------------------------------------------------------------
6424 : */
6425 : /* Apply gain and offset. */
6426 : /* --------------------------------------------------------------------
6427 : */
6428 0 : const int nWidth = poOverview->GetXSize();
6429 0 : const int nHeight = poOverview->GetYSize();
6430 :
6431 0 : GDALDataType eWrkType = GDT_Unknown;
6432 0 : float *pafData = nullptr;
6433 0 : const GDALDataType eType = poOverview->GetRasterDataType();
6434 0 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6435 0 : if (bComplex)
6436 : {
6437 : pafData = static_cast<float *>(
6438 0 : VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6439 0 : eWrkType = GDT_CFloat32;
6440 : }
6441 : else
6442 : {
6443 : pafData = static_cast<float *>(
6444 0 : VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6445 0 : eWrkType = GDT_Float32;
6446 : }
6447 :
6448 0 : if (pafData == nullptr)
6449 : {
6450 0 : return CE_Failure;
6451 : }
6452 :
6453 0 : for (int iLine = 0; iLine < nHeight; ++iLine)
6454 : {
6455 0 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6456 : pProgressData))
6457 : {
6458 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6459 0 : CPLFree(pafData);
6460 0 : return CE_Failure;
6461 : }
6462 :
6463 0 : if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
6464 : nWidth, 1, eWrkType, 0, 0,
6465 0 : nullptr) != CE_None)
6466 : {
6467 0 : CPLFree(pafData);
6468 0 : return CE_Failure;
6469 : }
6470 :
6471 0 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6472 : {
6473 0 : if (bComplex)
6474 : {
6475 0 : pafData[iPixel * 2] *= static_cast<float>(dfGain);
6476 0 : pafData[iPixel * 2 + 1] *= static_cast<float>(dfGain);
6477 : }
6478 : else
6479 : {
6480 0 : pafData[iPixel] = static_cast<float>(
6481 0 : (pafData[iPixel] - dfOverviewMean) * dfGain +
6482 : dfOrigMean);
6483 : }
6484 : }
6485 :
6486 0 : if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
6487 : nWidth, 1, eWrkType, 0, 0,
6488 0 : nullptr) != CE_None)
6489 : {
6490 0 : CPLFree(pafData);
6491 0 : return CE_Failure;
6492 : }
6493 : }
6494 :
6495 0 : if (!pfnProgress(1.0, nullptr, pProgressData))
6496 : {
6497 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6498 0 : CPLFree(pafData);
6499 0 : return CE_Failure;
6500 : }
6501 :
6502 0 : CPLFree(pafData);
6503 : }
6504 :
6505 0 : return CE_None;
6506 : }
|