Line data Source code
1 :
2 : /******************************************************************************
3 : *
4 : * Project: GDAL Core
5 : * Purpose: Helper code to implement overview support in different drivers.
6 : * Author: Frank Warmerdam, warmerdam@pobox.com
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 2000, Frank Warmerdam
10 : * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * SPDX-License-Identifier: MIT
13 : ****************************************************************************/
14 :
15 : #include "cpl_port.h"
16 : #include "gdal_priv.h"
17 :
18 : #include <cmath>
19 : #include <cstddef>
20 : #include <cstdlib>
21 :
22 : #include <algorithm>
23 : #include <complex>
24 : #include <condition_variable>
25 : #include <limits>
26 : #include <list>
27 : #include <memory>
28 : #include <mutex>
29 : #include <vector>
30 :
31 : #include "cpl_conv.h"
32 : #include "cpl_error.h"
33 : #include "cpl_float.h"
34 : #include "cpl_progress.h"
35 : #include "cpl_vsi.h"
36 : #include "gdal.h"
37 : #include "gdal_thread_pool.h"
38 : #include "gdalwarper.h"
39 : #include "gdal_vrt.h"
40 : #include "vrtdataset.h"
41 :
42 : #ifdef USE_NEON_OPTIMIZATIONS
43 : #include "include_sse2neon.h"
44 : #define USE_SSE2
45 :
46 : #include "gdalsse_priv.h"
47 :
48 : // Restrict to 64bit processors because they are guaranteed to have SSE2,
49 : // or if __AVX2__ is defined.
50 : #elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
51 : #define USE_SSE2
52 :
53 : #include "gdalsse_priv.h"
54 :
55 : #ifdef __SSE3__
56 : #include <pmmintrin.h>
57 : #endif
58 : #ifdef __SSSE3__
59 : #include <tmmintrin.h>
60 : #endif
61 : #ifdef __SSE4_1__
62 : #include <smmintrin.h>
63 : #endif
64 : #ifdef __AVX2__
65 : #include <immintrin.h>
66 : #endif
67 :
68 : #endif
69 :
70 : // To be included after above USE_SSE2 and include gdalsse_priv.h
71 : // to avoid build issue on Windows x86
72 : #include "gdal_priv_templates.hpp"
73 :
74 : /************************************************************************/
75 : /* GDALResampleChunk_Near() */
76 : /************************************************************************/
77 :
78 : template <class T>
79 1233 : static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
80 : const T *pChunk, T **ppDstBuffer)
81 :
82 : {
83 1233 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
84 1233 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
85 1233 : const GDALDataType eWrkDataType = args.eWrkDataType;
86 1233 : const int nChunkXOff = args.nChunkXOff;
87 1233 : const int nChunkXSize = args.nChunkXSize;
88 1233 : const int nChunkYOff = args.nChunkYOff;
89 1233 : const int nDstXOff = args.nDstXOff;
90 1233 : const int nDstXOff2 = args.nDstXOff2;
91 1233 : const int nDstYOff = args.nDstYOff;
92 1233 : const int nDstYOff2 = args.nDstYOff2;
93 1233 : const int nDstXWidth = nDstXOff2 - nDstXOff;
94 :
95 : /* -------------------------------------------------------------------- */
96 : /* Allocate buffers. */
97 : /* -------------------------------------------------------------------- */
98 1233 : *ppDstBuffer = static_cast<T *>(
99 1233 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
100 : GDALGetDataTypeSizeBytes(eWrkDataType)));
101 1233 : if (*ppDstBuffer == nullptr)
102 : {
103 0 : return CE_Failure;
104 : }
105 1233 : T *const pDstBuffer = *ppDstBuffer;
106 :
107 : int *panSrcXOff =
108 1233 : static_cast<int *>(VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(int)));
109 :
110 1233 : if (panSrcXOff == nullptr)
111 : {
112 0 : return CE_Failure;
113 : }
114 :
115 : /* ==================================================================== */
116 : /* Precompute inner loop constants. */
117 : /* ==================================================================== */
118 842009 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
119 : {
120 840776 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
121 840776 : if (nSrcXOff < nChunkXOff)
122 0 : nSrcXOff = nChunkXOff;
123 :
124 840776 : panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
125 : }
126 :
127 : /* ==================================================================== */
128 : /* Loop over destination scanlines. */
129 : /* ==================================================================== */
130 141825 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
131 : {
132 140592 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
133 140592 : if (nSrcYOff < nChunkYOff)
134 0 : nSrcYOff = nChunkYOff;
135 :
136 140592 : const T *const pSrcScanline =
137 : pChunk +
138 140592 : (static_cast<size_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
139 138074 : nChunkXOff;
140 :
141 : /* --------------------------------------------------------------------
142 : */
143 : /* Loop over destination pixels */
144 : /* --------------------------------------------------------------------
145 : */
146 140592 : T *pDstScanline =
147 140592 : pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
148 119627130 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
149 : {
150 119486612 : pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
151 : }
152 : }
153 :
154 1233 : CPLFree(panSrcXOff);
155 :
156 1233 : return CE_None;
157 : }
158 :
159 1233 : static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
160 : const void *pChunk, void **ppDstBuffer,
161 : GDALDataType *peDstBufferDataType)
162 : {
163 1233 : *peDstBufferDataType = args.eWrkDataType;
164 1233 : switch (args.eWrkDataType)
165 : {
166 : // For nearest resampling, as no computation is done, only the
167 : // size of the data type matters.
168 1081 : case GDT_Byte:
169 : case GDT_Int8:
170 : {
171 1081 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 1);
172 1081 : return GDALResampleChunk_NearT(
173 : args, static_cast<const uint8_t *>(pChunk),
174 1081 : reinterpret_cast<uint8_t **>(ppDstBuffer));
175 : }
176 :
177 50 : case GDT_Int16:
178 : case GDT_UInt16:
179 : case GDT_Float16:
180 : {
181 50 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
182 50 : return GDALResampleChunk_NearT(
183 : args, static_cast<const uint16_t *>(pChunk),
184 50 : reinterpret_cast<uint16_t **>(ppDstBuffer));
185 : }
186 :
187 55 : case GDT_CInt16:
188 : case GDT_CFloat16:
189 : case GDT_Int32:
190 : case GDT_UInt32:
191 : case GDT_Float32:
192 : {
193 55 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
194 55 : return GDALResampleChunk_NearT(
195 : args, static_cast<const uint32_t *>(pChunk),
196 55 : reinterpret_cast<uint32_t **>(ppDstBuffer));
197 : }
198 :
199 43 : case GDT_CInt32:
200 : case GDT_CFloat32:
201 : case GDT_Int64:
202 : case GDT_UInt64:
203 : case GDT_Float64:
204 : {
205 43 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
206 43 : return GDALResampleChunk_NearT(
207 : args, static_cast<const uint64_t *>(pChunk),
208 43 : reinterpret_cast<uint64_t **>(ppDstBuffer));
209 : }
210 :
211 4 : case GDT_CFloat64:
212 : {
213 4 : return GDALResampleChunk_NearT(
214 : args, static_cast<const std::complex<double> *>(pChunk),
215 4 : reinterpret_cast<std::complex<double> **>(ppDstBuffer));
216 : }
217 :
218 0 : case GDT_Unknown:
219 : case GDT_TypeCount:
220 0 : break;
221 : }
222 0 : CPLAssert(false);
223 : return CE_Failure;
224 : }
225 :
226 : namespace
227 : {
228 :
229 : // Find in the color table the entry whose RGB value is the closest
230 : // (using quadratic distance) to the test color, ignoring transparent entries.
231 3837 : int BestColorEntry(const std::vector<GDALColorEntry> &entries,
232 : const GDALColorEntry &test)
233 : {
234 3837 : int nMinDist = std::numeric_limits<int>::max();
235 3837 : size_t bestEntry = 0;
236 986109 : for (size_t i = 0; i < entries.size(); ++i)
237 : {
238 982272 : const GDALColorEntry &entry = entries[i];
239 : // Ignore transparent entries
240 982272 : if (entry.c4 == 0)
241 3237 : continue;
242 :
243 979035 : int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
244 979035 : ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
245 979035 : ((test.c3 - entry.c3) * (test.c3 - entry.c3));
246 979035 : if (nDist < nMinDist)
247 : {
248 15847 : nMinDist = nDist;
249 15847 : bestEntry = i;
250 : }
251 : }
252 3837 : return static_cast<int>(bestEntry);
253 : }
254 :
255 7 : std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
256 : int &transparentIdx)
257 : {
258 7 : std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
259 :
260 7 : transparentIdx = -1;
261 7 : int i = 0;
262 1799 : for (auto &entry : entries)
263 : {
264 1792 : table.GetColorEntryAsRGB(i, &entry);
265 1792 : if (transparentIdx < 0 && entry.c4 == 0)
266 1 : transparentIdx = i;
267 1792 : ++i;
268 : }
269 7 : return entries;
270 : }
271 :
272 : } // unnamed namespace
273 :
274 : /************************************************************************/
275 : /* SQUARE() */
276 : /************************************************************************/
277 :
// Return val * val, the first operand being converted to Tsquare before the
// multiplication so that the product can be computed in a wider type than T.
template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
{
    const Tsquare widened = static_cast<Tsquare>(val);
    return widened * val;
}
282 :
283 : /************************************************************************/
284 : /* ComputeIntegerRMS() */
285 : /************************************************************************/
// Compute rms = sqrt(sumSquares / weight) in such a way that it is the
// integer that minimizes abs(rms**2 - sumSquares / weight).
//
// T is the (integer) type of the result. Twork is the integer type in which
// 2 * rms * (rms + 1) + 1 is evaluated: it must be wide enough for that
// product. weight is used as a divisor and is not checked against zero.
//
// Returns floor(sqrt(sumSquares / weight)), or that value plus one when the
// square of the latter is strictly closer to sumSquares / weight.
template <class T, class Twork>
inline T ComputeIntegerRMS(double sumSquares, double weight)
{
    const double sumDivWeight = sumSquares / weight;
    // std::sqrt (and not the unqualified C name) as in the
    // ComputeIntegerRMS_4values() specializations: only the std:: overloads
    // are guaranteed to be declared by <cmath>.
    T rms = static_cast<T>(std::sqrt(sumDivWeight));

    // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
    // Naive version:
    // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
    // which, once divided by weight and simplified, becomes
    // 2 * rms * (rms + 1) + 1 < 2 * sumSquares / weight
    if (static_cast<double>(static_cast<Twork>(2) * rms * (rms + 1) + 1) <
        2 * sumDivWeight)
        rms += 1;
    return rms;
}
302 :
303 0 : template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
304 : {
305 0 : CPLAssert(false);
306 : return 0;
307 : }
308 :
309 24 : template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
310 : {
311 : // It has been verified that given the correction on rms below, using
312 : // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
313 : // is equivalent, so use the former as it is used twice.
314 24 : const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
315 24 : const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
316 24 : GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
317 :
318 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
319 : // Naive version:
320 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
321 : // Optimized version for integer case and weight == 4
322 24 : if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
323 5 : rms += 1;
324 24 : return rms;
325 : }
326 :
327 : template <>
328 20 : inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
329 : {
330 20 : const double sumDivWeight = sumSquares * 0.25;
331 20 : GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
332 :
333 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
334 : // Naive version:
335 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
336 : // Optimized version for integer case and weight == 4
337 20 : if (static_cast<GUInt32>(rms) * (rms + 1) <
338 20 : static_cast<GUInt32>(sumDivWeight + 0.25))
339 4 : rms += 1;
340 20 : return rms;
341 : }
342 :
343 : #ifdef USE_SSE2
344 :
345 : /************************************************************************/
346 : /* QuadraticMeanByteSSE2OrAVX2() */
347 : /************************************************************************/
348 :
349 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
350 : #define sse2_packus_epi32 _mm_packus_epi32
351 : #else
352 516119 : inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
353 : {
354 516119 : const auto minus32768_32 = _mm_set1_epi32(-32768);
355 516119 : const auto minus32768_16 = _mm_set1_epi16(-32768);
356 516119 : a = _mm_add_epi32(a, minus32768_32);
357 516119 : b = _mm_add_epi32(b, minus32768_32);
358 516119 : a = _mm_packs_epi32(a, b);
359 516119 : a = _mm_sub_epi16(a, minus32768_16);
360 516119 : return a;
361 : }
362 : #endif
363 :
364 : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
365 : #define sse2_hadd_epi16 _mm_hadd_epi16
366 : #else
367 4667530 : inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
368 : {
369 : // Horizontal addition of adjacent pairs
370 4667530 : const auto mask = _mm_set1_epi32(0xFFFF);
371 : const auto horizLo =
372 14002600 : _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
373 : const auto horizHi =
374 14002600 : _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
375 :
376 : // Recombine low and high parts
377 4667530 : return _mm_packs_epi32(horizLo, horizHi);
378 : }
379 : #endif
380 :
381 : #ifdef __AVX2__
382 :
383 : #define DEST_ELTS 16
384 : #define set1_epi16 _mm256_set1_epi16
385 : #define set1_epi32 _mm256_set1_epi32
386 : #define setzero _mm256_setzero_si256
387 : #define set1_ps _mm256_set1_ps
388 : #define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
389 : #define unpacklo_epi8 _mm256_unpacklo_epi8
390 : #define unpackhi_epi8 _mm256_unpackhi_epi8
391 : #define madd_epi16 _mm256_madd_epi16
392 : #define add_epi32 _mm256_add_epi32
393 : #define mul_ps _mm256_mul_ps
394 : #define cvtepi32_ps _mm256_cvtepi32_ps
395 : #define sqrt_ps _mm256_sqrt_ps
396 : #define cvttps_epi32 _mm256_cvttps_epi32
397 : #define packs_epi32 _mm256_packs_epi32
398 : #define packus_epi32 _mm256_packus_epi32
399 : #define srli_epi32 _mm256_srli_epi32
400 : #define mullo_epi16 _mm256_mullo_epi16
401 : #define srli_epi16 _mm256_srli_epi16
402 : #define cmpgt_epi16 _mm256_cmpgt_epi16
403 : #define add_epi16 _mm256_add_epi16
404 : #define sub_epi16 _mm256_sub_epi16
405 : #define packus_epi16 _mm256_packus_epi16
406 :
407 : /* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
408 : /* to get the lower 128-bit bits of what would be a true 256-bit vector register
409 : */
410 :
411 : inline __m256i FIXUP_LANES(__m256i x)
412 : {
413 : return _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0));
414 : }
415 :
416 : #define store_lo(x, y) \
417 : _mm_storeu_si128(reinterpret_cast<__m128i *>(x), \
418 : _mm256_extracti128_si256(FIXUP_LANES(y), 0))
419 : #define storeu_int(x, y) \
420 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(x), FIXUP_LANES(y))
421 : #define hadd_epi16 _mm256_hadd_epi16
422 : #else
423 : #define DEST_ELTS 8
424 : #define set1_epi16 _mm_set1_epi16
425 : #define set1_epi32 _mm_set1_epi32
426 : #define setzero _mm_setzero_si128
427 : #define set1_ps _mm_set1_ps
428 : #define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
429 : #define unpacklo_epi8 _mm_unpacklo_epi8
430 : #define unpackhi_epi8 _mm_unpackhi_epi8
431 : #define madd_epi16 _mm_madd_epi16
432 : #define add_epi32 _mm_add_epi32
433 : #define mul_ps _mm_mul_ps
434 : #define cvtepi32_ps _mm_cvtepi32_ps
435 : #define sqrt_ps _mm_sqrt_ps
436 : #define cvttps_epi32 _mm_cvttps_epi32
437 : #define packs_epi32 _mm_packs_epi32
438 : #define packus_epi32 sse2_packus_epi32
439 : #define srli_epi32 _mm_srli_epi32
440 : #define mullo_epi16 _mm_mullo_epi16
441 : #define srli_epi16 _mm_srli_epi16
442 : #define cmpgt_epi16 _mm_cmpgt_epi16
443 : #define add_epi16 _mm_add_epi16
444 : #define sub_epi16 _mm_sub_epi16
445 : #define packus_epi16 _mm_packus_epi16
446 : #define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
447 : #define storeu_int(x, y) _mm_storeu_si128(reinterpret_cast<__m128i *>(x), (y))
448 : #define hadd_epi16 sse2_hadd_epi16
449 : #endif
450 :
451 : template <class T>
452 : static int
453 : #if defined(__GNUC__)
454 : __attribute__((noinline))
455 : #endif
456 5385 : QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
457 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
458 : T *CPL_RESTRICT pDstScanline)
459 : {
460 : // Optimized implementation for RMS on Byte by
461 : // processing by group of 8 output pixels, so as to use
462 : // a single _mm_sqrt_ps() call for 4 output pixels
463 5385 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
464 :
465 5385 : int iDstPixel = 0;
466 5385 : const auto one16 = set1_epi16(1);
467 5385 : const auto one32 = set1_epi32(1);
468 5385 : const auto zero = setzero();
469 5385 : const auto minus32768 = set1_epi16(-32768);
470 :
471 521496 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
472 : {
473 : // Load 2 * DEST_ELTS bytes from each line
474 516111 : auto firstLine = loadu_int(pSrcScanlineShifted);
475 1032220 : auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
476 : // Extend those Bytes as UInt16s
477 516111 : auto firstLineLo = unpacklo_epi8(firstLine, zero);
478 516111 : auto firstLineHi = unpackhi_epi8(firstLine, zero);
479 516111 : auto secondLineLo = unpacklo_epi8(secondLine, zero);
480 516111 : auto secondLineHi = unpackhi_epi8(secondLine, zero);
481 :
482 : // Multiplication of 16 bit values and horizontal
483 : // addition of 32 bit results
484 : // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
485 516111 : firstLineLo = madd_epi16(firstLineLo, firstLineLo);
486 516111 : firstLineHi = madd_epi16(firstLineHi, firstLineHi);
487 516111 : secondLineLo = madd_epi16(secondLineLo, secondLineLo);
488 516111 : secondLineHi = madd_epi16(secondLineHi, secondLineHi);
489 :
490 : // Vertical addition
491 516111 : const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
492 516111 : const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
493 :
494 : const auto sumSquaresPlusOneDiv4Lo =
495 1032220 : srli_epi32(add_epi32(sumSquaresLo, one32), 2);
496 : const auto sumSquaresPlusOneDiv4Hi =
497 1032220 : srli_epi32(add_epi32(sumSquaresHi, one32), 2);
498 :
499 : // Take square root and truncate/floor to int32
500 : const auto rmsLo =
501 1548330 : cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
502 : const auto rmsHi =
503 1548330 : cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
504 :
505 : // Merge back low and high registers with each RMS value
506 : // as a 16 bit value.
507 516111 : auto rms = packs_epi32(rmsLo, rmsHi);
508 :
509 : // Round to upper value if it minimizes the
510 : // error |rms^2 - sumSquares/4|
511 : // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
512 : // rms += 1;
513 : // which is equivalent to:
514 : // if( rms * (rms + 1) < (sumSquares+1) / 4 )
515 : // rms += 1;
516 : // And both left and right parts fit on 16 (unsigned) bits
517 : const auto sumSquaresPlusOneDiv4 =
518 516111 : packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
519 : // cmpgt_epi16 operates on signed int16, but here
520 : // we have unsigned values, so shift them by -32768 before
521 2580560 : auto mask = cmpgt_epi16(
522 : add_epi16(sumSquaresPlusOneDiv4, minus32768),
523 : add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
524 : // The value of the mask will be -1 when the correction needs to be
525 : // applied
526 516111 : rms = sub_epi16(rms, mask);
527 :
528 : // Pack each 16 bit RMS value to 8 bits
529 516111 : rms = packus_epi16(rms, rms /* could be anything */);
530 516111 : store_lo(&pDstScanline[iDstPixel], rms);
531 516111 : pSrcScanlineShifted += 2 * DEST_ELTS;
532 : }
533 :
534 5385 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
535 5385 : return iDstPixel;
536 : }
537 :
538 : /************************************************************************/
539 : /* AverageByteSSE2OrAVX2() */
540 : /************************************************************************/
541 :
542 : template <class T>
543 : static int
544 111280 : AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
545 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
546 : T *CPL_RESTRICT pDstScanline)
547 : {
548 : // Optimized implementation for average on Byte by
549 : // processing by group of 16 output pixels for SSE2, or 32 for AVX2
550 :
551 111280 : const auto zero = setzero();
552 111280 : const auto two16 = set1_epi16(2);
553 111280 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
554 :
555 111280 : int iDstPixel = 0;
556 2445050 : for (; iDstPixel < nDstXWidth - (2 * DEST_ELTS - 1);
557 : iDstPixel += 2 * DEST_ELTS)
558 : {
559 : decltype(setzero()) average0;
560 : {
561 : // Load 2 * DEST_ELTS bytes from each line
562 2333770 : const auto firstLine = loadu_int(pSrcScanlineShifted);
563 : const auto secondLine =
564 4667530 : loadu_int(pSrcScanlineShifted + nChunkXSize);
565 : // Extend those Bytes as UInt16s
566 2333770 : const auto firstLineLo = unpacklo_epi8(firstLine, zero);
567 2333770 : const auto firstLineHi = unpackhi_epi8(firstLine, zero);
568 2333770 : const auto secondLineLo = unpacklo_epi8(secondLine, zero);
569 2333770 : const auto secondLineHi = unpackhi_epi8(secondLine, zero);
570 :
571 : // Vertical addition
572 2333770 : const auto sumLo = add_epi16(firstLineLo, secondLineLo);
573 2333770 : const auto sumHi = add_epi16(firstLineHi, secondLineHi);
574 :
575 : // Horizontal addition of adjacent pairs, and recombine low and high
576 : // parts
577 2333770 : const auto sum = hadd_epi16(sumLo, sumHi);
578 :
579 : // average = (sum + 2) / 4
580 2333770 : average0 = srli_epi16(add_epi16(sum, two16), 2);
581 :
582 2333770 : pSrcScanlineShifted += 2 * DEST_ELTS;
583 : }
584 :
585 : decltype(setzero()) average1;
586 : {
587 : // Load 2 * DEST_ELTS bytes from each line
588 2333770 : const auto firstLine = loadu_int(pSrcScanlineShifted);
589 : const auto secondLine =
590 4667530 : loadu_int(pSrcScanlineShifted + nChunkXSize);
591 : // Extend those Bytes as UInt16s
592 2333770 : const auto firstLineLo = unpacklo_epi8(firstLine, zero);
593 2333770 : const auto firstLineHi = unpackhi_epi8(firstLine, zero);
594 2333770 : const auto secondLineLo = unpacklo_epi8(secondLine, zero);
595 2333770 : const auto secondLineHi = unpackhi_epi8(secondLine, zero);
596 :
597 : // Vertical addition
598 2333770 : const auto sumLo = add_epi16(firstLineLo, secondLineLo);
599 2333770 : const auto sumHi = add_epi16(firstLineHi, secondLineHi);
600 :
601 : // Horizontal addition of adjacent pairs, and recombine low and high
602 : // parts
603 2333770 : const auto sum = hadd_epi16(sumLo, sumHi);
604 :
605 : // average = (sum + 2) / 4
606 2333770 : average1 = srli_epi16(add_epi16(sum, two16), 2);
607 :
608 2333770 : pSrcScanlineShifted += 2 * DEST_ELTS;
609 : }
610 :
611 : // Pack each 16 bit average value to 8 bits
612 2333770 : const auto average = packus_epi16(average0, average1);
613 2333770 : storeu_int(&pDstScanline[iDstPixel], average);
614 : }
615 :
616 111280 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
617 111280 : return iDstPixel;
618 : }
619 :
620 : /************************************************************************/
621 : /* QuadraticMeanUInt16SSE2() */
622 : /************************************************************************/
623 :
624 : #ifdef __SSE3__
625 : #define sse2_hadd_pd _mm_hadd_pd
626 : #else
627 8 : inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
628 : {
629 : auto aLo_bLo =
630 32 : _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
631 : auto aHi_bHi =
632 32 : _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
633 8 : return _mm_add_pd(aLo_bLo, aHi_bHi); // (aLo + aHi, bLo + bHi)
634 : }
635 : #endif
636 :
637 40 : inline __m128d SQUARE_PD(__m128d x)
638 : {
639 40 : return _mm_mul_pd(x, x);
640 : }
641 :
642 : #ifdef __AVX2__
643 :
644 : inline __m256d SQUARE_PD(__m256d x)
645 : {
646 : return _mm256_mul_pd(x, x);
647 : }
648 :
649 : inline __m256d FIXUP_LANES(__m256d x)
650 : {
651 : return _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
652 : }
653 :
654 : inline __m256 FIXUP_LANES(__m256 x)
655 : {
656 : return _mm256_castpd_ps(FIXUP_LANES(_mm256_castps_pd(x)));
657 : }
658 :
659 : #endif
660 :
661 : template <class T>
662 : static int
663 10 : QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
664 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
665 : T *CPL_RESTRICT pDstScanline)
666 : {
667 : // Optimized implementation for RMS on UInt16 by
668 : // processing by group of 4 output pixels.
669 10 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
670 :
671 10 : int iDstPixel = 0;
672 10 : const auto zero = _mm_setzero_si128();
673 :
674 : #ifdef __AVX2__
675 : const auto zeroDot25 = _mm256_set1_pd(0.25);
676 : const auto zeroDot5 = _mm256_set1_pd(0.5);
677 :
678 : // The first four 0's could be anything, as we only take the bottom
679 : // 128 bits.
680 : const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
681 : #else
682 10 : const auto zeroDot25 = _mm_set1_pd(0.25);
683 10 : const auto zeroDot5 = _mm_set1_pd(0.5);
684 : #endif
685 :
686 40 : for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
687 : {
688 : // Load 8 UInt16 from each line
689 30 : const auto firstLine = _mm_loadu_si128(
690 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
691 : const auto secondLine =
692 30 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
693 30 : pSrcScanlineShifted + nChunkXSize));
694 :
695 : // Detect if all of the source values fit in 14 bits.
696 : // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
697 : // and we can do a much faster implementation.
698 : const auto maskTmp =
699 60 : _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
700 : #if defined(__i386__) || defined(_M_IX86)
701 : uint64_t nMaskFitsIn14Bits = 0;
702 : _mm_storel_epi64(
703 : reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
704 : _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
705 : #else
706 30 : const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
707 : _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
708 : #endif
709 30 : if (nMaskFitsIn14Bits == 0)
710 : {
711 : // Multiplication of 16 bit values and horizontal
712 : // addition of 32 bit results
713 : const auto firstLineHSumSquare =
714 26 : _mm_madd_epi16(firstLine, firstLine);
715 : const auto secondLineHSumSquare =
716 26 : _mm_madd_epi16(secondLine, secondLine);
717 : // Vertical addition
718 : const auto sumSquares =
719 26 : _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
720 : // In theory we should take sqrt(sumSquares * 0.25f)
721 : // but given the rounding we do, this is equivalent to
722 : // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
723 : // sumSquares <= 4 * 16383^2
724 26 : const auto one32 = _mm_set1_epi32(1);
725 : const auto sumSquaresPlusOneDiv4 =
726 52 : _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
727 : // Take square root and truncate/floor to int32
728 78 : auto rms = _mm_cvttps_epi32(
729 : _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
730 :
731 : // Round to upper value if it minimizes the
732 : // error |rms^2 - sumSquares/4|
733 : // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
734 : // rms += 1;
735 : // which is equivalent to:
736 : // if( rms * rms + rms < (sumSquares+1) / 4 )
737 : // rms += 1;
738 : auto mask =
739 78 : _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
740 : _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
741 26 : rms = _mm_sub_epi32(rms, mask);
742 : // Pack each 32 bit RMS value to 16 bits
743 26 : rms = _mm_packs_epi32(rms, rms /* could be anything */);
744 : _mm_storel_epi64(
745 26 : reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
746 26 : pSrcScanlineShifted += 8;
747 26 : continue;
748 : }
749 :
750 : // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
751 : // to 32 bit would result in 4 multiplications instead of 8, but
752 : // mullo/mulhi have a worse throughput than mul_pd.
753 :
754 : // Extend those UInt16s as UInt32s
755 4 : const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
756 4 : const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
757 4 : const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
758 4 : const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
759 :
760 : #ifdef __AVX2__
761 : // Multiplication of 32 bit values previously converted to 64 bit double
762 : const auto firstLineLoDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineLo));
763 : const auto firstLineHiDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineHi));
764 : const auto secondLineLoDbl =
765 : SQUARE_PD(_mm256_cvtepi32_pd(secondLineLo));
766 : const auto secondLineHiDbl =
767 : SQUARE_PD(_mm256_cvtepi32_pd(secondLineHi));
768 :
769 : // Vertical addition of squares
770 : const auto sumSquaresLo =
771 : _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
772 : const auto sumSquaresHi =
773 : _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
774 :
775 : // Horizontal addition of squares
776 : const auto sumSquares =
777 : FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
778 :
779 : const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
780 :
781 : // Take square root and truncate/floor to int32
782 : auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
783 : const auto rmsDouble = _mm256_cvtepi32_pd(rms);
784 : const auto right = _mm256_sub_pd(
785 : sumDivWeight, _mm256_add_pd(SQUARE_PD(rmsDouble), rmsDouble));
786 :
787 : auto mask =
788 : _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
789 : // Extract 32-bit from each of the 4 64-bit masks
790 : // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
791 : // _MM_SHUFFLE(2,0,2,0)));
792 : mask = _mm256_permutevar8x32_ps(mask, permutation);
793 : const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
794 :
795 : // Apply the correction
796 : rms = _mm_sub_epi32(rms, maskI);
797 :
798 : // Pack each 32 bit RMS value to 16 bits
799 : rms = _mm_packus_epi32(rms, rms /* could be anything */);
800 : #else
801 : // Multiplication of 32 bit values previously converted to 64 bit double
802 4 : const auto firstLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineLo));
803 : const auto firstLineLoHi =
804 8 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
805 4 : const auto firstLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineHi));
806 : const auto firstLineHiHi =
807 8 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
808 :
809 4 : const auto secondLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineLo));
810 : const auto secondLineLoHi =
811 8 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
812 4 : const auto secondLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineHi));
813 : const auto secondLineHiHi =
814 8 : SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
815 :
816 : // Vertical addition of squares
817 4 : const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
818 4 : const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
819 4 : const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
820 4 : const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
821 :
822 : // Horizontal addition of squares
823 4 : const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
824 4 : const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
825 :
826 4 : const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
827 4 : const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
828 : // Take square root and truncate/floor to int32
829 8 : const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
830 8 : const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
831 :
832 : // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
833 : // if( 0.5 < sumDivWeight - (rms * rms + rms) )
834 : // rms += 1;
835 4 : const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
836 4 : const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
837 8 : const auto rightLo = _mm_sub_pd(
838 : sumDivWeightLo, _mm_add_pd(SQUARE_PD(rmsLoDouble), rmsLoDouble));
839 12 : const auto rightHi = _mm_sub_pd(
840 : sumDivWeightHi, _mm_add_pd(SQUARE_PD(rmsHiDouble), rmsHiDouble));
841 :
842 8 : const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
843 4 : const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
844 : // The value of the mask will be -1 when the correction needs to be
845 : // applied
846 8 : const auto mask = _mm_castps_si128(_mm_shuffle_ps(
847 : maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
848 :
849 16 : auto rms = _mm_castps_si128(
850 : _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
851 : // Apply the correction
852 4 : rms = _mm_sub_epi32(rms, mask);
853 :
854 : // Pack each 32 bit RMS value to 16 bits
855 4 : rms = sse2_packus_epi32(rms, rms /* could be anything */);
856 : #endif
857 :
858 4 : _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
859 : rms);
860 4 : pSrcScanlineShifted += 8;
861 : }
862 :
863 10 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
864 10 : return iDstPixel;
865 : }
866 :
867 : /************************************************************************/
868 : /* AverageUInt16SSE2() */
869 : /************************************************************************/
870 :
871 : template <class T>
872 9 : static int AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
873 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
874 : T *CPL_RESTRICT pDstScanline)
875 : {
876 : // Optimized implementation for average on UInt16 by
877 : // processing by group of 8 output pixels.
878 :
879 9 : const auto mask = _mm_set1_epi32(0xFFFF);
880 9 : const auto two = _mm_set1_epi32(2);
881 9 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
882 :
883 9 : int iDstPixel = 0;
884 13 : for (; iDstPixel < nDstXWidth - 7; iDstPixel += 8)
885 : {
886 : __m128i averageLow;
887 : // Load 8 UInt16 from each line
888 : {
889 4 : const auto firstLine = _mm_loadu_si128(
890 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
891 : const auto secondLine =
892 4 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
893 4 : pSrcScanlineShifted + nChunkXSize));
894 :
895 : // Horizontal addition and extension to 32 bit
896 12 : const auto horizAddFirstLine = _mm_add_epi32(
897 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
898 : const auto horizAddSecondLine =
899 12 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
900 : _mm_srli_epi32(secondLine, 16));
901 :
902 : // Vertical addition and average computation
903 : // average = (sum + 2) >> 2
904 8 : const auto sum = _mm_add_epi32(
905 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
906 4 : averageLow = _mm_srli_epi32(sum, 2);
907 : }
908 : // Load 8 UInt16 from each line
909 : __m128i averageHigh;
910 : {
911 4 : const auto firstLine = _mm_loadu_si128(
912 4 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted + 8));
913 : const auto secondLine =
914 4 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
915 4 : pSrcScanlineShifted + 8 + nChunkXSize));
916 :
917 : // Horizontal addition and extension to 32 bit
918 12 : const auto horizAddFirstLine = _mm_add_epi32(
919 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
920 : const auto horizAddSecondLine =
921 12 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
922 : _mm_srli_epi32(secondLine, 16));
923 :
924 : // Vertical addition and average computation
925 : // average = (sum + 2) >> 2
926 8 : const auto sum = _mm_add_epi32(
927 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
928 4 : averageHigh = _mm_srli_epi32(sum, 2);
929 : }
930 :
931 : // Pack each 32 bit average value to 16 bits
932 4 : auto average = sse2_packus_epi32(averageLow, averageHigh);
933 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
934 : average);
935 4 : pSrcScanlineShifted += 16;
936 : }
937 :
938 9 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
939 9 : return iDstPixel;
940 : }
941 :
942 : /************************************************************************/
943 : /* QuadraticMeanFloatSSE2() */
944 : /************************************************************************/
945 :
#ifdef __SSE3__
// SSE3 provides the horizontal addition natively
#define sse2_hadd_ps _mm_hadd_ps
#else
// SSE2-only emulation of the SSE3 horizontal addition:
// returns (a0 + a1, a2 + a3, b0 + b1, b2 + b3)
inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
{
    // (a0, a2, b0, b2)
    const __m128 evenLanes = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
    // (a1, a3, b1, b3)
    const __m128 oddLanes = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
    return _mm_add_ps(evenLanes, oddLanes);
}
#endif
956 :
// Vector-width independent aliases used by QuadraticMeanFloatSSE2(): the same
// source is compiled against 256 bit registers when AVX2 is available, and
// against 128 bit registers otherwise.
#ifdef __AVX2__

// Number of Float32 lanes per register
#define RMS_FLOAT_ELTS 8

// Constants and memory accesses
#define set1_ps _mm256_set1_ps
#define loadu_ps _mm256_loadu_ps
#define storeu_ps _mm256_storeu_ps

// Arithmetic
#define add_ps _mm256_add_ps
#define hadd_ps _mm256_hadd_ps
#define mul_ps _mm256_mul_ps
#define div_ps _mm256_div_ps
#define sqrt_ps _mm256_sqrt_ps
#define max_ps _mm256_max_ps

// Bitwise logic and comparison
#define and_ps _mm256_and_ps
#define andnot_ps _mm256_andnot_ps
#define or_ps _mm256_or_ps
#define cmpeq_ps(x, y) _mm256_cmp_ps(x, y, _CMP_EQ_OQ)

// Lane rearrangement
#define shuffle_ps _mm256_shuffle_ps
#define unpacklo_ps _mm256_unpacklo_ps
#define unpackhi_ps _mm256_unpackhi_ps

// Lane-wise square
inline __m256 SQUARE_PS(__m256 x)
{
    return _mm256_mul_ps(x, x);
}

// NOTE(review): no FIXUP_LANES() counterpart is visible for this branch,
// whereas the 128 bit branch below defines one - confirm it is provided
// elsewhere for AVX2 builds.

#else

// Number of Float32 lanes per register
#define RMS_FLOAT_ELTS 4

// Constants and memory accesses
#define set1_ps _mm_set1_ps
#define loadu_ps _mm_loadu_ps
#define storeu_ps _mm_storeu_ps

// Arithmetic
#define add_ps _mm_add_ps
#define hadd_ps sse2_hadd_ps
#define mul_ps _mm_mul_ps
#define div_ps _mm_div_ps
#define sqrt_ps _mm_sqrt_ps
#define max_ps _mm_max_ps

// Bitwise logic and comparison
#define and_ps _mm_and_ps
#define andnot_ps _mm_andnot_ps
#define or_ps _mm_or_ps
#define cmpeq_ps _mm_cmpeq_ps

// Lane rearrangement
#define shuffle_ps _mm_shuffle_ps
#define unpacklo_ps _mm_unpacklo_ps
#define unpackhi_ps _mm_unpackhi_ps

// Lane-wise square
inline __m128 SQUARE_PS(__m128 x)
{
    return _mm_mul_ps(x, x);
}

// With 128 bit registers the lanes are already in their final order, so this
// is the identity.
inline __m128 FIXUP_LANES(__m128 x)
{
    return x;
}

#endif
1012 :
1013 : static int
1014 : #if defined(__GNUC__)
1015 : __attribute__((noinline))
1016 : #endif
1017 34 : QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
1018 : const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1019 : float *CPL_RESTRICT pDstScanline)
1020 : {
1021 : // Optimized implementation for RMS on Float32 by
1022 : // processing by group of RMS_FLOAT_ELTS output pixels.
1023 34 : const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1024 :
1025 34 : int iDstPixel = 0;
1026 34 : const auto minus_zero = set1_ps(-0.0f);
1027 34 : const auto zeroDot25 = set1_ps(0.25f);
1028 34 : const auto one = set1_ps(1.0f);
1029 68 : const auto infv = set1_ps(std::numeric_limits<float>::infinity());
1030 :
1031 102 : for (; iDstPixel < nDstXWidth - (RMS_FLOAT_ELTS - 1);
1032 68 : iDstPixel += RMS_FLOAT_ELTS)
1033 : {
1034 : // Load 2*RMS_FLOAT_ELTS Float32 from each line
1035 68 : auto firstLineLo = loadu_ps(pSrcScanlineShifted);
1036 68 : auto firstLineHi = loadu_ps(pSrcScanlineShifted + RMS_FLOAT_ELTS);
1037 68 : auto secondLineLo = loadu_ps(pSrcScanlineShifted + nChunkXSize);
1038 : auto secondLineHi =
1039 136 : loadu_ps(pSrcScanlineShifted + RMS_FLOAT_ELTS + nChunkXSize);
1040 :
1041 : // Take the absolute value
1042 68 : firstLineLo = andnot_ps(minus_zero, firstLineLo);
1043 68 : firstLineHi = andnot_ps(minus_zero, firstLineHi);
1044 68 : secondLineLo = andnot_ps(minus_zero, secondLineLo);
1045 68 : secondLineHi = andnot_ps(minus_zero, secondLineHi);
1046 :
1047 : auto firstLineEven =
1048 68 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1049 : auto firstLineOdd =
1050 68 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1051 : auto secondLineEven =
1052 68 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1053 : auto secondLineOdd =
1054 68 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1055 :
1056 : // Compute the maximum of each RMS_FLOAT_ELTS value to RMS-average
1057 204 : const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
1058 : max_ps(secondLineEven, secondLineEven));
1059 :
1060 : // Normalize each value by the maximum of the RMS_FLOAT_ELTS ones.
1061 : // This step is important to avoid that the square evaluates to infinity
1062 : // for sufficiently big input.
1063 68 : auto invMax = div_ps(one, maxV);
1064 : // Deal with 0 being the maximum to correct division by zero
1065 : // note: comparing to -0 leads to identical results as to comparing with
1066 : // 0
1067 136 : invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
1068 :
1069 68 : firstLineEven = mul_ps(firstLineEven, invMax);
1070 68 : firstLineOdd = mul_ps(firstLineOdd, invMax);
1071 68 : secondLineEven = mul_ps(secondLineEven, invMax);
1072 68 : secondLineOdd = mul_ps(secondLineOdd, invMax);
1073 :
1074 : // Compute squares
1075 68 : firstLineEven = SQUARE_PS(firstLineEven);
1076 68 : firstLineOdd = SQUARE_PS(firstLineOdd);
1077 68 : secondLineEven = SQUARE_PS(secondLineEven);
1078 68 : secondLineOdd = SQUARE_PS(secondLineOdd);
1079 :
1080 204 : const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
1081 : add_ps(secondLineEven, secondLineOdd));
1082 :
1083 204 : auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
1084 :
1085 : // Deal with infinity being the maximum
1086 68 : const auto maskIsInf = cmpeq_ps(maxV, infv);
1087 136 : rms = or_ps(andnot_ps(maskIsInf, rms), and_ps(maskIsInf, infv));
1088 :
1089 68 : rms = FIXUP_LANES(rms);
1090 :
1091 68 : storeu_ps(&pDstScanline[iDstPixel], rms);
1092 68 : pSrcScanlineShifted += RMS_FLOAT_ELTS * 2;
1093 : }
1094 :
1095 34 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1096 34 : return iDstPixel;
1097 : }
1098 :
1099 : /************************************************************************/
1100 : /* AverageFloatSSE2() */
1101 : /************************************************************************/
1102 :
1103 14 : static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
1104 : const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1105 : float *CPL_RESTRICT pDstScanline)
1106 : {
1107 : // Optimized implementation for average on Float32 by
1108 : // processing by group of 4 output pixels.
1109 14 : const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1110 :
1111 14 : int iDstPixel = 0;
1112 14 : const auto zeroDot25 = _mm_set1_ps(0.25f);
1113 :
1114 32 : for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
1115 : {
1116 : // Load 8 Float32 from each line
1117 18 : const auto firstLineLo = _mm_loadu_ps(pSrcScanlineShifted);
1118 18 : const auto firstLineHi = _mm_loadu_ps(pSrcScanlineShifted + 4);
1119 : const auto secondLineLo =
1120 18 : _mm_loadu_ps(pSrcScanlineShifted + nChunkXSize);
1121 : const auto secondLineHi =
1122 36 : _mm_loadu_ps(pSrcScanlineShifted + 4 + nChunkXSize);
1123 :
1124 : // Vertical addition
1125 18 : const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
1126 18 : const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
1127 :
1128 : // Horizontal addition
1129 18 : const auto sum = sse2_hadd_ps(sumLo, sumHi);
1130 :
1131 18 : const auto average = _mm_mul_ps(sum, zeroDot25);
1132 :
1133 18 : _mm_storeu_ps(&pDstScanline[iDstPixel], average);
1134 18 : pSrcScanlineShifted += 8;
1135 : }
1136 :
1137 14 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1138 14 : return iDstPixel;
1139 : }
1140 :
1141 : #endif
1142 :
1143 : /************************************************************************/
1144 : /* GDALResampleChunk_AverageOrRMS() */
1145 : /************************************************************************/
1146 :
1147 : template <class T, class Tsum, GDALDataType eWrkDataType>
1148 : static CPLErr
1149 2319 : GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
1150 : const T *pChunk, void **ppDstBuffer)
1151 : {
1152 2319 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1153 2319 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1154 2319 : const double dfSrcXDelta = args.dfSrcXDelta;
1155 2319 : const double dfSrcYDelta = args.dfSrcYDelta;
1156 2319 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1157 2319 : const int nChunkXOff = args.nChunkXOff;
1158 2319 : const int nChunkYOff = args.nChunkYOff;
1159 2319 : const int nChunkXSize = args.nChunkXSize;
1160 2319 : const int nChunkYSize = args.nChunkYSize;
1161 2319 : const int nDstXOff = args.nDstXOff;
1162 2319 : const int nDstXOff2 = args.nDstXOff2;
1163 2319 : const int nDstYOff = args.nDstYOff;
1164 2319 : const int nDstYOff2 = args.nDstYOff2;
1165 2319 : const char *pszResampling = args.pszResampling;
1166 2319 : bool bHasNoData = args.bHasNoData;
1167 2319 : const double dfNoDataValue = args.dfNoDataValue;
1168 2319 : const GDALColorTable *poColorTable = args.poColorTable;
1169 2319 : const bool bPropagateNoData = args.bPropagateNoData;
1170 :
1171 : // AVERAGE_BIT2GRAYSCALE
1172 : const bool bBit2Grayscale =
1173 2319 : CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"));
1174 2319 : const bool bQuadraticMean = CPL_TO_BOOL(EQUAL(pszResampling, "RMS"));
1175 2319 : if (bBit2Grayscale)
1176 9 : poColorTable = nullptr;
1177 :
1178 : T tNoDataValue;
1179 2319 : if (!bHasNoData)
1180 2263 : tNoDataValue = 0;
1181 : else
1182 56 : tNoDataValue = static_cast<T>(dfNoDataValue);
1183 2319 : const T tReplacementVal =
1184 114 : bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
1185 56 : args.eOvrDataType, dfNoDataValue))
1186 : : 0;
1187 :
1188 2319 : int nChunkRightXOff = nChunkXOff + nChunkXSize;
1189 2319 : int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1190 2319 : int nDstXWidth = nDstXOff2 - nDstXOff;
1191 :
1192 : /* -------------------------------------------------------------------- */
1193 : /* Allocate buffers. */
1194 : /* -------------------------------------------------------------------- */
1195 2319 : *ppDstBuffer = static_cast<T *>(
1196 2319 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
1197 : GDALGetDataTypeSizeBytes(eWrkDataType)));
1198 2319 : if (*ppDstBuffer == nullptr)
1199 : {
1200 0 : return CE_Failure;
1201 : }
1202 2319 : T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
1203 :
1204 : struct PrecomputedXValue
1205 : {
1206 : int nLeftXOffShifted;
1207 : int nRightXOffShifted;
1208 : double dfLeftWeight;
1209 : double dfRightWeight;
1210 : double dfTotalWeightFullLine;
1211 : };
1212 :
1213 : PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
1214 2319 : VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(PrecomputedXValue)));
1215 :
1216 2319 : if (pasSrcX == nullptr)
1217 : {
1218 0 : return CE_Failure;
1219 : }
1220 :
1221 2319 : int nTransparentIdx = -1;
1222 2319 : std::vector<GDALColorEntry> colorEntries;
1223 2319 : if (poColorTable)
1224 5 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1225 :
1226 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1227 : // it as nodata value
1228 2349 : if (bHasNoData && dfNoDataValue >= 0.0f &&
1229 30 : tNoDataValue < colorEntries.size())
1230 1 : colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1231 :
1232 : // Or if we have no explicit nodata, but a color table entry that is
1233 : // transparent, consider it as the nodata value
1234 2318 : else if (!bHasNoData && nTransparentIdx >= 0)
1235 : {
1236 0 : bHasNoData = true;
1237 0 : tNoDataValue = static_cast<T>(nTransparentIdx);
1238 : }
1239 :
1240 : /* ==================================================================== */
1241 : /* Precompute inner loop constants. */
1242 : /* ==================================================================== */
1243 2319 : bool bSrcXSpacingIsTwo = true;
1244 2319 : int nLastSrcXOff2 = -1;
1245 852325 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1246 : {
1247 850006 : double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
1248 : // Apply some epsilon to avoid numerical precision issues
1249 850006 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
1250 850006 : double dfSrcXOff2 = dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
1251 850006 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
1252 :
1253 850006 : if (nSrcXOff < nChunkXOff)
1254 0 : nSrcXOff = nChunkXOff;
1255 850006 : if (nSrcXOff2 == nSrcXOff)
1256 0 : nSrcXOff2++;
1257 850006 : if (nSrcXOff2 > nChunkRightXOff)
1258 1 : nSrcXOff2 = nChunkRightXOff;
1259 :
1260 850006 : pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
1261 850006 : pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
1262 850006 : nSrcXOff2 - nChunkXOff;
1263 21 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
1264 850006 : (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
1265 850006 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
1266 850006 : 1 - (nSrcXOff2 - dfSrcXOff2);
1267 850006 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
1268 850006 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
1269 850006 : if (nSrcXOff + 1 < nSrcXOff2)
1270 : {
1271 849985 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1272 849985 : nSrcXOff2 - nSrcXOff - 2;
1273 849985 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1274 849985 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
1275 : }
1276 :
1277 850006 : if (nSrcXOff2 - nSrcXOff != 2 ||
1278 728596 : (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
1279 : {
1280 120599 : bSrcXSpacingIsTwo = false;
1281 : }
1282 850006 : nLastSrcXOff2 = nSrcXOff2;
1283 : }
1284 :
1285 : /* ==================================================================== */
1286 : /* Loop over destination scanlines. */
1287 : /* ==================================================================== */
1288 721819 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1289 : {
1290 719500 : double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
1291 719500 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
1292 719500 : if (nSrcYOff < nChunkYOff)
1293 0 : nSrcYOff = nChunkYOff;
1294 :
1295 719500 : double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
1296 719500 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
1297 719500 : if (nSrcYOff2 == nSrcYOff)
1298 0 : ++nSrcYOff2;
1299 719500 : if (nSrcYOff2 > nChunkBottomYOff)
1300 3 : nSrcYOff2 = nChunkBottomYOff;
1301 :
1302 719500 : T *const pDstScanline =
1303 719500 : pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
1304 :
1305 : /* --------------------------------------------------------------------
1306 : */
1307 : /* Loop over destination pixels */
1308 : /* --------------------------------------------------------------------
1309 : */
1310 719500 : if (poColorTable == nullptr)
1311 : {
1312 719385 : if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
1313 : pabyChunkNodataMask == nullptr)
1314 : {
1315 : if constexpr (eWrkDataType == GDT_Byte ||
1316 : eWrkDataType == GDT_UInt16)
1317 : {
1318 : // Optimized case : no nodata, overview by a factor of 2 and
1319 : // regular x and y src spacing.
1320 116684 : const T *pSrcScanlineShifted =
1321 116684 : pChunk + pasSrcX[0].nLeftXOffShifted +
1322 116684 : static_cast<size_t>(nSrcYOff - nChunkYOff) *
1323 116684 : nChunkXSize;
1324 116684 : int iDstPixel = 0;
1325 : #ifdef USE_SSE2
1326 : if constexpr (eWrkDataType == GDT_Byte)
1327 : {
1328 116665 : if (bQuadraticMean)
1329 : {
1330 5385 : iDstPixel = QuadraticMeanByteSSE2OrAVX2(
1331 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1332 : pDstScanline);
1333 : }
1334 : else
1335 : {
1336 111280 : iDstPixel = AverageByteSSE2OrAVX2(
1337 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1338 : pDstScanline);
1339 : }
1340 : }
1341 : else
1342 : {
1343 : static_assert(eWrkDataType == GDT_UInt16);
1344 19 : if (bQuadraticMean)
1345 : {
1346 10 : iDstPixel = QuadraticMeanUInt16SSE2(
1347 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1348 : pDstScanline);
1349 : }
1350 : else
1351 : {
1352 9 : iDstPixel = AverageUInt16SSE2(
1353 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1354 : pDstScanline);
1355 : }
1356 : }
1357 : #endif
1358 291091 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1359 : {
1360 174407 : Tsum nTotal = 0;
1361 : T nVal;
1362 174407 : if (bQuadraticMean)
1363 44 : nTotal =
1364 44 : SQUARE<Tsum>(pSrcScanlineShifted[0]) +
1365 44 : SQUARE<Tsum>(pSrcScanlineShifted[1]) +
1366 44 : SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
1367 44 : SQUARE<Tsum>(
1368 44 : pSrcScanlineShifted[1 + nChunkXSize]);
1369 : else
1370 174363 : nTotal = pSrcScanlineShifted[0] +
1371 174363 : pSrcScanlineShifted[1] +
1372 174363 : pSrcScanlineShifted[nChunkXSize] +
1373 174363 : pSrcScanlineShifted[1 + nChunkXSize];
1374 :
1375 174407 : constexpr int nTotalWeight = 4;
1376 174407 : if (bQuadraticMean)
1377 44 : nVal = ComputeIntegerRMS_4values<T>(nTotal);
1378 : else
1379 174363 : nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
1380 : nTotalWeight);
1381 :
1382 : // No need to compare nVal against tNoDataValue as we
1383 : // are in a case where pabyChunkNodataMask == nullptr
1384 : // implies the absence of nodata value.
1385 174407 : pDstScanline[iDstPixel] = nVal;
1386 174407 : pSrcScanlineShifted += 2;
1387 : }
1388 : }
1389 : else
1390 : {
1391 : static_assert(eWrkDataType == GDT_Float32 ||
1392 : eWrkDataType == GDT_Float64);
1393 70 : const T *pSrcScanlineShifted =
1394 70 : pChunk + pasSrcX[0].nLeftXOffShifted +
1395 70 : static_cast<size_t>(nSrcYOff - nChunkYOff) *
1396 70 : nChunkXSize;
1397 70 : int iDstPixel = 0;
1398 : #ifdef USE_SSE2
1399 : if constexpr (eWrkDataType == GDT_Float32)
1400 : {
1401 : static_assert(std::is_same_v<T, float>);
1402 48 : if (bQuadraticMean)
1403 : {
1404 34 : iDstPixel = QuadraticMeanFloatSSE2(
1405 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1406 : pDstScanline);
1407 : }
1408 : else
1409 : {
1410 14 : iDstPixel = AverageFloatSSE2(
1411 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1412 : pDstScanline);
1413 : }
1414 : }
1415 : #endif
1416 :
1417 268 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1418 : {
1419 : T nVal;
1420 198 : if (bQuadraticMean)
1421 : {
1422 : // Cast to double to avoid overflows
1423 : // (using std::hypot() is much slower)
1424 100 : nVal = static_cast<T>(std::sqrt(
1425 : 0.25 *
1426 100 : (SQUARE<double>(pSrcScanlineShifted[0]) +
1427 100 : SQUARE<double>(pSrcScanlineShifted[1]) +
1428 100 : SQUARE<double>(
1429 200 : pSrcScanlineShifted[nChunkXSize]) +
1430 100 : SQUARE<double>(
1431 100 : pSrcScanlineShifted[1 + nChunkXSize]))));
1432 : }
1433 : else
1434 : {
1435 98 : nVal = static_cast<T>(
1436 98 : 0.25f * (pSrcScanlineShifted[0] +
1437 98 : pSrcScanlineShifted[1] +
1438 98 : pSrcScanlineShifted[nChunkXSize] +
1439 98 : pSrcScanlineShifted[1 + nChunkXSize]));
1440 : }
1441 :
1442 : // No need to compare nVal against tNoDataValue as we
1443 : // are in a case where pabyChunkNodataMask == nullptr
1444 : // implies the absence of nodata value.
1445 198 : pDstScanline[iDstPixel] = nVal;
1446 198 : pSrcScanlineShifted += 2;
1447 : }
1448 116754 : }
1449 : }
1450 : else
1451 : {
1452 17 : const double dfBottomWeight =
1453 602631 : (nSrcYOff + 1 == nSrcYOff2) ? 1.0
1454 602614 : : 1.0 - (dfSrcYOff - nSrcYOff);
1455 602631 : const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
1456 602631 : nSrcYOff -= nChunkYOff;
1457 602631 : nSrcYOff2 -= nChunkYOff;
1458 :
1459 602631 : double dfTotalWeightFullColumn = dfBottomWeight;
1460 602631 : if (nSrcYOff + 1 < nSrcYOff2)
1461 : {
1462 602615 : dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
1463 602615 : dfTotalWeightFullColumn += dfTopWeight;
1464 : }
1465 :
1466 18757660 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1467 : {
1468 18151383 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1469 18151383 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1470 :
1471 18151383 : double dfTotal = 0;
1472 18151383 : double dfTotalWeight = 0;
1473 18151383 : if (pabyChunkNodataMask == nullptr)
1474 : {
1475 1746435 : auto pChunkShifted =
1476 115 : pChunk +
1477 1746435 : static_cast<size_t>(nSrcYOff) * nChunkXSize;
1478 1746435 : int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1479 1746435 : double dfWeightY = dfBottomWeight;
1480 3493427 : while (true)
1481 : {
1482 : double dfTotalLine;
1483 5239852 : if (bQuadraticMean)
1484 : {
1485 : // Left pixel
1486 : {
1487 104 : const T val = pChunkShifted[nSrcXOff];
1488 104 : dfTotalLine =
1489 104 : SQUARE<double>(val) *
1490 104 : pasSrcX[iDstPixel].dfLeftWeight;
1491 : }
1492 :
1493 104 : if (nSrcXOff + 1 < nSrcXOff2)
1494 : {
1495 : // Middle pixels
1496 104 : for (int iX = nSrcXOff + 1;
1497 424 : iX < nSrcXOff2 - 1; ++iX)
1498 : {
1499 320 : const T val = pChunkShifted[iX];
1500 320 : dfTotalLine += SQUARE<double>(val);
1501 : }
1502 :
1503 : // Right pixel
1504 : {
1505 104 : const T val =
1506 104 : pChunkShifted[nSrcXOff2 - 1];
1507 104 : dfTotalLine +=
1508 104 : SQUARE<double>(val) *
1509 104 : pasSrcX[iDstPixel].dfRightWeight;
1510 : }
1511 : }
1512 : }
1513 : else
1514 : {
1515 : // Left pixel
1516 : {
1517 5239756 : const T val = pChunkShifted[nSrcXOff];
1518 5239756 : dfTotalLine =
1519 5239756 : val * pasSrcX[iDstPixel].dfLeftWeight;
1520 : }
1521 :
1522 5239756 : if (nSrcXOff + 1 < nSrcXOff2)
1523 : {
1524 : // Middle pixels
1525 4239330 : for (int iX = nSrcXOff + 1;
1526 64183126 : iX < nSrcXOff2 - 1; ++iX)
1527 : {
1528 59943836 : const T val = pChunkShifted[iX];
1529 59943836 : dfTotalLine += val;
1530 : }
1531 :
1532 : // Right pixel
1533 : {
1534 4239330 : const T val =
1535 4239330 : pChunkShifted[nSrcXOff2 - 1];
1536 4239330 : dfTotalLine +=
1537 4239330 : val *
1538 4239330 : pasSrcX[iDstPixel].dfRightWeight;
1539 : }
1540 : }
1541 : }
1542 :
1543 5239852 : dfTotal += dfTotalLine * dfWeightY;
1544 5239852 : --nCounterY;
1545 5239852 : if (nCounterY < 0)
1546 1746435 : break;
1547 3493427 : pChunkShifted += nChunkXSize;
1548 3493427 : dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
1549 : }
1550 :
1551 1746435 : dfTotalWeight =
1552 1746435 : pasSrcX[iDstPixel].dfTotalWeightFullLine *
1553 : dfTotalWeightFullColumn;
1554 : }
1555 : else
1556 : {
1557 16404968 : size_t nCount = 0;
1558 71769204 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1559 : {
1560 55362736 : const auto pChunkShifted =
1561 55362736 : pChunk + static_cast<size_t>(iY) * nChunkXSize;
1562 :
1563 55362736 : double dfTotalLine = 0;
1564 55362736 : double dfTotalWeightLine = 0;
1565 : // Left pixel
1566 : {
1567 55362736 : const int iX = nSrcXOff;
1568 55362736 : const T val = pChunkShifted[iX];
1569 55362736 : if (pabyChunkNodataMask
1570 55362736 : [iX +
1571 55362736 : static_cast<size_t>(iY) * nChunkXSize])
1572 : {
1573 23514583 : nCount++;
1574 23514583 : const double dfWeightX =
1575 23514583 : pasSrcX[iDstPixel].dfLeftWeight;
1576 23514583 : dfTotalWeightLine = dfWeightX;
1577 23514583 : if (bQuadraticMean)
1578 60 : dfTotalLine =
1579 60 : SQUARE<double>(val) * dfWeightX;
1580 : else
1581 23514583 : dfTotalLine = val * dfWeightX;
1582 : }
1583 : }
1584 :
1585 55362736 : if (nSrcXOff < nSrcXOff2 - 1)
1586 : {
1587 : // Middle pixels
1588 152883136 : for (int iX = nSrcXOff + 1; iX < nSrcXOff2 - 1;
1589 : ++iX)
1590 : {
1591 97520300 : const T val = pChunkShifted[iX];
1592 97520300 : if (pabyChunkNodataMask
1593 97520300 : [iX + static_cast<size_t>(iY) *
1594 97520300 : nChunkXSize])
1595 : {
1596 39728000 : nCount++;
1597 39728000 : dfTotalWeightLine += 1;
1598 39728000 : if (bQuadraticMean)
1599 0 : dfTotalLine += SQUARE<double>(val);
1600 : else
1601 39728000 : dfTotalLine += val;
1602 : }
1603 : }
1604 :
1605 : // Right pixel
1606 : {
1607 55362636 : const int iX = nSrcXOff2 - 1;
1608 55362636 : const T val = pChunkShifted[iX];
1609 55362636 : if (pabyChunkNodataMask
1610 55362636 : [iX + static_cast<size_t>(iY) *
1611 55362636 : nChunkXSize])
1612 : {
1613 23514151 : nCount++;
1614 23514151 : const double dfWeightX =
1615 23514151 : pasSrcX[iDstPixel].dfRightWeight;
1616 23514151 : dfTotalWeightLine += dfWeightX;
1617 23514151 : if (bQuadraticMean)
1618 782 : dfTotalLine +=
1619 61 : SQUARE<double>(val) * dfWeightX;
1620 : else
1621 23514050 : dfTotalLine += val * dfWeightX;
1622 : }
1623 : }
1624 : }
1625 :
1626 94328104 : const double dfWeightY =
1627 : (iY == nSrcYOff) ? dfBottomWeight
1628 38963768 : : (iY + 1 == nSrcYOff2) ? dfTopWeight
1629 : : 1.0;
1630 55364236 : dfTotal += dfTotalLine * dfWeightY;
1631 55364236 : dfTotalWeight += dfTotalWeightLine * dfWeightY;
1632 : }
1633 :
1634 16406468 : if (nCount == 0 ||
1635 8 : (bPropagateNoData &&
1636 : nCount <
1637 8 : static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
1638 8 : (nSrcXOff2 - nSrcXOff)))
1639 : {
1640 9608362 : pDstScanline[iDstPixel] = tNoDataValue;
1641 9608362 : continue;
1642 : }
1643 : }
1644 : if constexpr (eWrkDataType == GDT_Byte)
1645 : {
1646 : T nVal;
1647 8544340 : if (bQuadraticMean)
1648 38 : nVal = ComputeIntegerRMS<T, int>(dfTotal,
1649 : dfTotalWeight);
1650 : else
1651 8544300 : nVal =
1652 8544300 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1653 8546550 : if (bHasNoData && nVal == tNoDataValue)
1654 0 : nVal = tReplacementVal;
1655 8546550 : pDstScanline[iDstPixel] = nVal;
1656 : }
1657 : else if constexpr (eWrkDataType == GDT_UInt16)
1658 : {
1659 : T nVal;
1660 8 : if (bQuadraticMean)
1661 4 : nVal = ComputeIntegerRMS<T, uint64_t>(
1662 : dfTotal, dfTotalWeight);
1663 : else
1664 4 : nVal =
1665 4 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1666 8 : if (bHasNoData && nVal == tNoDataValue)
1667 0 : nVal = tReplacementVal;
1668 8 : pDstScanline[iDstPixel] = nVal;
1669 : }
1670 : else
1671 : {
1672 : T nVal;
1673 153 : if (bQuadraticMean)
1674 20 : nVal =
1675 25 : static_cast<T>(sqrt(dfTotal / dfTotalWeight));
1676 : else
1677 128 : nVal = static_cast<T>(dfTotal / dfTotalWeight);
1678 153 : if (bHasNoData && nVal == tNoDataValue)
1679 2 : nVal = tReplacementVal;
1680 153 : pDstScanline[iDstPixel] = nVal;
1681 : }
1682 : }
1683 : }
1684 : }
1685 : else
1686 : {
1687 115 : nSrcYOff -= nChunkYOff;
1688 115 : nSrcYOff2 -= nChunkYOff;
1689 :
1690 2878 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1691 : {
1692 6475 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1693 6475 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1694 :
1695 6475 : uint64_t nTotalR = 0;
1696 6475 : uint64_t nTotalG = 0;
1697 6475 : uint64_t nTotalB = 0;
1698 6475 : size_t nCount = 0;
1699 :
1700 19425 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1701 : {
1702 38850 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
1703 : {
1704 25900 : const T val =
1705 25900 : pChunk[iX + static_cast<size_t>(iY) * nChunkXSize];
1706 : // cppcheck-suppress unsignedLessThanZero
1707 25900 : if (val < 0 || val >= colorEntries.size())
1708 0 : continue;
1709 25900 : const size_t idx = static_cast<size_t>(val);
1710 25900 : const auto &entry = colorEntries[idx];
1711 25900 : if (entry.c4)
1712 : {
1713 14128 : if (bQuadraticMean)
1714 : {
1715 800 : nTotalR += SQUARE<int>(entry.c1);
1716 800 : nTotalG += SQUARE<int>(entry.c2);
1717 800 : nTotalB += SQUARE<int>(entry.c3);
1718 800 : ++nCount;
1719 : }
1720 : else
1721 : {
1722 13328 : nTotalR += entry.c1;
1723 13328 : nTotalG += entry.c2;
1724 13328 : nTotalB += entry.c3;
1725 13328 : ++nCount;
1726 : }
1727 : }
1728 : }
1729 : }
1730 :
1731 6475 : if (nCount == 0 ||
1732 0 : (bPropagateNoData &&
1733 0 : nCount < static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
1734 0 : (nSrcXOff2 - nSrcXOff)))
1735 : {
1736 2838 : pDstScanline[iDstPixel] = tNoDataValue;
1737 : }
1738 : else
1739 : {
1740 : GDALColorEntry color;
1741 3637 : if (bQuadraticMean)
1742 : {
1743 200 : color.c1 =
1744 200 : static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
1745 200 : color.c2 =
1746 200 : static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
1747 200 : color.c3 =
1748 200 : static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
1749 : }
1750 : else
1751 : {
1752 3437 : color.c1 =
1753 3437 : static_cast<short>((nTotalR + nCount / 2) / nCount);
1754 3437 : color.c2 =
1755 3437 : static_cast<short>((nTotalG + nCount / 2) / nCount);
1756 3437 : color.c3 =
1757 3437 : static_cast<short>((nTotalB + nCount / 2) / nCount);
1758 : }
1759 0 : pDstScanline[iDstPixel] =
1760 3637 : static_cast<T>(BestColorEntry(colorEntries, color));
1761 : }
1762 : }
1763 : }
1764 : }
1765 :
1766 2319 : CPLFree(pasSrcX);
1767 :
1768 2319 : return CE_None;
1769 : }
1770 :
1771 : static CPLErr
1772 2319 : GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
1773 : const void *pChunk, void **ppDstBuffer,
1774 : GDALDataType *peDstBufferDataType)
1775 : {
1776 2319 : *peDstBufferDataType = args.eWrkDataType;
1777 2319 : switch (args.eWrkDataType)
1778 : {
1779 2252 : case GDT_Byte:
1780 : {
1781 2252 : return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_Byte>(
1782 2252 : args, static_cast<const GByte *>(pChunk), ppDstBuffer);
1783 : }
1784 :
1785 9 : case GDT_UInt16:
1786 : {
1787 9 : if (EQUAL(args.pszResampling, "RMS"))
1788 : {
1789 : // Use double as accumulation type, because UInt32 could overflow
1790 : return GDALResampleChunk_AverageOrRMS_T<GUInt16, double,
1791 5 : GDT_UInt16>(
1792 5 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1793 : }
1794 : else
1795 : {
1796 : return GDALResampleChunk_AverageOrRMS_T<GUInt16, GUInt32,
1797 4 : GDT_UInt16>(
1798 4 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1799 : }
1800 : }
1801 :
1802 41 : case GDT_Float32:
1803 : {
1804 41 : return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32>(
1805 41 : args, static_cast<const float *>(pChunk), ppDstBuffer);
1806 : }
1807 :
1808 17 : case GDT_Float64:
1809 : {
1810 : return GDALResampleChunk_AverageOrRMS_T<double, double,
1811 17 : GDT_Float64>(
1812 17 : args, static_cast<const double *>(pChunk), ppDstBuffer);
1813 : }
1814 :
1815 0 : default:
1816 0 : break;
1817 : }
1818 :
1819 0 : CPLAssert(false);
1820 : return CE_Failure;
1821 : }
1822 :
1823 : /************************************************************************/
1824 : /* GDALResampleChunk_Gauss() */
1825 : /************************************************************************/
1826 :
1827 86 : static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
1828 : const void *pChunk, void **ppDstBuffer,
1829 : GDALDataType *peDstBufferDataType)
1830 :
1831 : {
1832 86 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1833 86 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1834 86 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1835 86 : const int nChunkXOff = args.nChunkXOff;
1836 86 : const int nChunkXSize = args.nChunkXSize;
1837 86 : const int nChunkYOff = args.nChunkYOff;
1838 86 : const int nChunkYSize = args.nChunkYSize;
1839 86 : const int nDstXOff = args.nDstXOff;
1840 86 : const int nDstXOff2 = args.nDstXOff2;
1841 86 : const int nDstYOff = args.nDstYOff;
1842 86 : const int nDstYOff2 = args.nDstYOff2;
1843 86 : const bool bHasNoData = args.bHasNoData;
1844 86 : double dfNoDataValue = args.dfNoDataValue;
1845 86 : const GDALColorTable *poColorTable = args.poColorTable;
1846 :
1847 86 : const double *const padfChunk = static_cast<const double *>(pChunk);
1848 :
1849 86 : *ppDstBuffer =
1850 86 : VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
1851 : GDALGetDataTypeSizeBytes(GDT_Float64));
1852 86 : if (*ppDstBuffer == nullptr)
1853 : {
1854 0 : return CE_Failure;
1855 : }
1856 86 : *peDstBufferDataType = GDT_Float64;
1857 86 : double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
1858 :
1859 : /* -------------------------------------------------------------------- */
1860 : /* Create the filter kernel and allocate scanline buffer. */
1861 : /* -------------------------------------------------------------------- */
1862 86 : int nGaussMatrixDim = 3;
1863 : const int *panGaussMatrix;
1864 86 : constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
1865 86 : constexpr int anGaussMatrix5x5[] = {1, 4, 6, 4, 1, 4, 16, 24, 16,
1866 : 4, 6, 24, 36, 24, 6, 4, 16, 24,
1867 : 16, 4, 1, 4, 6, 4, 1};
1868 86 : constexpr int anGaussMatrix7x7[] = {
1869 : 1, 6, 15, 20, 15, 6, 1, 6, 36, 90, 120, 90, 36,
1870 : 6, 15, 90, 225, 300, 225, 90, 15, 20, 120, 300, 400, 300,
1871 : 120, 20, 15, 90, 225, 300, 225, 90, 15, 6, 36, 90, 120,
1872 : 90, 36, 6, 1, 6, 15, 20, 15, 6, 1};
1873 :
1874 86 : const int nOXSize = args.nOvrXSize;
1875 86 : const int nOYSize = args.nOvrYSize;
1876 86 : const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
1877 :
1878 : // matrix for gauss filter
1879 86 : if (nResYFactor <= 2)
1880 : {
1881 85 : panGaussMatrix = anGaussMatrix3x3;
1882 85 : nGaussMatrixDim = 3;
1883 : }
1884 1 : else if (nResYFactor <= 4)
1885 : {
1886 0 : panGaussMatrix = anGaussMatrix5x5;
1887 0 : nGaussMatrixDim = 5;
1888 : }
1889 : else
1890 : {
1891 1 : panGaussMatrix = anGaussMatrix7x7;
1892 1 : nGaussMatrixDim = 7;
1893 : }
1894 :
1895 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
1896 : int *panGaussMatrixDup = static_cast<int *>(
1897 : CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
1898 : memcpy(panGaussMatrixDup, panGaussMatrix,
1899 : sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
1900 : panGaussMatrix = panGaussMatrixDup;
1901 : #endif
1902 :
1903 86 : if (!bHasNoData)
1904 79 : dfNoDataValue = 0.0;
1905 :
1906 86 : std::vector<GDALColorEntry> colorEntries;
1907 86 : int nTransparentIdx = -1;
1908 86 : if (poColorTable)
1909 2 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1910 :
1911 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1912 : // it as nodata value.
1913 92 : if (bHasNoData && dfNoDataValue >= 0.0f &&
1914 6 : dfNoDataValue < colorEntries.size())
1915 0 : colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
1916 :
1917 : // Or if we have no explicit nodata, but a color table entry that is
1918 : // transparent, consider it as the nodata value.
1919 86 : else if (!bHasNoData && nTransparentIdx >= 0)
1920 : {
1921 0 : dfNoDataValue = nTransparentIdx;
1922 : }
1923 :
1924 86 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
1925 86 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1926 86 : const int nDstXWidth = nDstXOff2 - nDstXOff;
1927 :
1928 : /* ==================================================================== */
1929 : /* Loop over destination scanlines. */
1930 : /* ==================================================================== */
1931 16488 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1932 : {
1933 16402 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
1934 16402 : int nSrcYOff2 =
1935 16402 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
1936 :
1937 16402 : if (nSrcYOff < nChunkYOff)
1938 : {
1939 0 : nSrcYOff = nChunkYOff;
1940 0 : nSrcYOff2++;
1941 : }
1942 :
1943 16402 : const int iSizeY = nSrcYOff2 - nSrcYOff;
1944 16402 : nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
1945 16402 : nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
1946 :
1947 16402 : if (nSrcYOff2 > nChunkBottomYOff ||
1948 16359 : (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
1949 : {
1950 44 : nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
1951 : }
1952 :
1953 16402 : int nYShiftGaussMatrix = 0;
1954 16402 : if (nSrcYOff < nChunkYOff)
1955 : {
1956 0 : nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
1957 0 : nSrcYOff = nChunkYOff;
1958 : }
1959 :
1960 16402 : const double *const padfSrcScanline =
1961 16402 : padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
1962 16402 : const GByte *pabySrcScanlineNodataMask = nullptr;
1963 16402 : if (pabyChunkNodataMask != nullptr)
1964 152 : pabySrcScanlineNodataMask =
1965 152 : pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
1966 :
1967 : /* --------------------------------------------------------------------
1968 : */
1969 : /* Loop over destination pixels */
1970 : /* --------------------------------------------------------------------
1971 : */
1972 16402 : double *const padfDstScanline =
1973 16402 : padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
1974 4149980 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1975 : {
1976 4133580 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
1977 4133580 : int nSrcXOff2 =
1978 4133580 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
1979 :
1980 4133580 : if (nSrcXOff < nChunkXOff)
1981 : {
1982 0 : nSrcXOff = nChunkXOff;
1983 0 : nSrcXOff2++;
1984 : }
1985 :
1986 4133580 : const int iSizeX = nSrcXOff2 - nSrcXOff;
1987 4133580 : nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
1988 4133580 : nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
1989 :
1990 4133580 : if (nSrcXOff2 > nChunkRightXOff ||
1991 4127930 : (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
1992 : {
1993 5650 : nSrcXOff2 =
1994 5650 : std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
1995 : }
1996 :
1997 4133580 : int nXShiftGaussMatrix = 0;
1998 4133580 : if (nSrcXOff < nChunkXOff)
1999 : {
2000 0 : nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
2001 0 : nSrcXOff = nChunkXOff;
2002 : }
2003 :
2004 4133580 : if (poColorTable == nullptr)
2005 : {
2006 4133380 : double dfTotal = 0.0;
2007 4133380 : GInt64 nCount = 0;
2008 4133380 : const int *panLineWeight =
2009 4133380 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
2010 : nXShiftGaussMatrix;
2011 :
2012 16527900 : for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
2013 12394500 : ++iY, ++j, panLineWeight += nGaussMatrixDim)
2014 : {
2015 49561300 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2016 : {
2017 37166800 : const double val =
2018 37166800 : padfSrcScanline[iX - nChunkXOff +
2019 37166800 : static_cast<GPtrDiff_t>(iY -
2020 37166800 : nSrcYOff) *
2021 37166800 : nChunkXSize];
2022 37166800 : if (pabySrcScanlineNodataMask == nullptr ||
2023 32872 : pabySrcScanlineNodataMask[iX - nChunkXOff +
2024 32872 : static_cast<GPtrDiff_t>(
2025 32872 : iY - nSrcYOff) *
2026 32872 : nChunkXSize])
2027 : {
2028 37146100 : const int nWeight = panLineWeight[i];
2029 37146100 : dfTotal += val * nWeight;
2030 37146100 : nCount += nWeight;
2031 : }
2032 : }
2033 : }
2034 :
2035 4133380 : if (nCount == 0)
2036 : {
2037 2217 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2038 : }
2039 : else
2040 : {
2041 4131160 : padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
2042 : }
2043 : }
2044 : else
2045 : {
2046 200 : GInt64 nTotalR = 0;
2047 200 : GInt64 nTotalG = 0;
2048 200 : GInt64 nTotalB = 0;
2049 200 : GInt64 nTotalWeight = 0;
2050 200 : const int *panLineWeight =
2051 200 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
2052 : nXShiftGaussMatrix;
2053 :
2054 780 : for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
2055 580 : ++iY, ++j, panLineWeight += nGaussMatrixDim)
2056 : {
2057 2262 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2058 : {
2059 1682 : const double val =
2060 1682 : padfSrcScanline[iX - nChunkXOff +
2061 1682 : static_cast<GPtrDiff_t>(iY -
2062 1682 : nSrcYOff) *
2063 1682 : nChunkXSize];
2064 1682 : if (val < 0 || val >= colorEntries.size())
2065 0 : continue;
2066 :
2067 1682 : size_t idx = static_cast<size_t>(val);
2068 1682 : if (colorEntries[idx].c4)
2069 : {
2070 1682 : const int nWeight = panLineWeight[i];
2071 1682 : nTotalR +=
2072 1682 : static_cast<GInt64>(colorEntries[idx].c1) *
2073 1682 : nWeight;
2074 1682 : nTotalG +=
2075 1682 : static_cast<GInt64>(colorEntries[idx].c2) *
2076 1682 : nWeight;
2077 1682 : nTotalB +=
2078 1682 : static_cast<GInt64>(colorEntries[idx].c3) *
2079 1682 : nWeight;
2080 1682 : nTotalWeight += nWeight;
2081 : }
2082 : }
2083 : }
2084 :
2085 200 : if (nTotalWeight == 0)
2086 : {
2087 0 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2088 : }
2089 : else
2090 : {
2091 : GDALColorEntry color;
2092 :
2093 200 : color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
2094 : nTotalWeight);
2095 200 : color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
2096 : nTotalWeight);
2097 200 : color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
2098 : nTotalWeight);
2099 200 : padfDstScanline[iDstPixel - nDstXOff] =
2100 200 : BestColorEntry(colorEntries, color);
2101 : }
2102 : }
2103 : }
2104 : }
2105 :
2106 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
2107 : CPLFree(panGaussMatrixDup);
2108 : #endif
2109 :
2110 86 : return CE_None;
2111 : }
2112 :
2113 : /************************************************************************/
2114 : /* GDALResampleChunk_Mode() */
2115 : /************************************************************************/
2116 :
/** Return whether two samples hold the same value, using operator==. */
template <class T> static inline bool IsSame(T a, T b)
{
    const bool bEqual = (a == b);
    return bEqual;
}

/** float variant: two NaN values are also considered the same. */
template <> bool IsSame<float>(float a, float b)
{
    if (a == b)
        return true;
    return std::isnan(a) && std::isnan(b);
}

/** double variant: two NaN values are also considered the same. */
template <> bool IsSame<double>(double a, double b)
{
    if (a == b)
        return true;
    return std::isnan(a) && std::isnan(b);
}

/** complex<float> variant: besides plain equality, two values whose real
 * and imaginary parts are all NaN are considered the same. */
template <>
bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
{
    if (a == b)
        return true;
    const bool bFirstAllNaN = std::isnan(a.real()) && std::isnan(a.imag());
    const bool bSecondAllNaN = std::isnan(b.real()) && std::isnan(b.imag());
    return bFirstAllNaN && bSecondAllNaN;
}

/** complex<double> variant: besides plain equality, two values whose real
 * and imaginary parts are all NaN are considered the same. */
template <>
bool IsSame<std::complex<double>>(std::complex<double> a,
                                  std::complex<double> b)
{
    if (a == b)
        return true;
    const bool bFirstAllNaN = std::isnan(a.real()) && std::isnan(a.imag());
    const bool bSecondAllNaN = std::isnan(b.real()) && std::isnan(b.imag());
    return bFirstAllNaN && bSecondAllNaN;
}
2146 :
2147 : template <class T>
2148 136 : static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
2149 : const T *pChunk, T *const pDstBuffer)
2150 :
2151 : {
2152 136 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2153 136 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2154 136 : const double dfSrcXDelta = args.dfSrcXDelta;
2155 136 : const double dfSrcYDelta = args.dfSrcYDelta;
2156 136 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2157 136 : const int nChunkXOff = args.nChunkXOff;
2158 136 : const int nChunkXSize = args.nChunkXSize;
2159 136 : const int nChunkYOff = args.nChunkYOff;
2160 136 : const int nChunkYSize = args.nChunkYSize;
2161 136 : const int nDstXOff = args.nDstXOff;
2162 136 : const int nDstXOff2 = args.nDstXOff2;
2163 136 : const int nDstYOff = args.nDstYOff;
2164 136 : const int nDstYOff2 = args.nDstYOff2;
2165 136 : const bool bHasNoData = args.bHasNoData;
2166 136 : const GDALColorTable *poColorTable = args.poColorTable;
2167 136 : const int nDstXSize = nDstXOff2 - nDstXOff;
2168 :
2169 8 : T tNoDataValue;
2170 : if constexpr (std::is_same<T, std::complex<float>>::value ||
2171 : std::is_same<T, std::complex<double>>::value)
2172 : {
2173 : using BaseT = typename T::value_type;
2174 8 : tNoDataValue =
2175 : std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
2176 : std::numeric_limits<BaseT>::quiet_NaN());
2177 : }
2178 128 : else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
2179 127 : tNoDataValue = 0;
2180 : else
2181 1 : tNoDataValue = static_cast<T>(args.dfNoDataValue);
2182 :
2183 136 : size_t nMaxNumPx = 0;
2184 136 : T *paVals = nullptr;
2185 136 : int *panSums = nullptr;
2186 :
2187 136 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2188 136 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2189 272 : std::vector<int> anVals(256, 0);
2190 :
2191 : /* ==================================================================== */
2192 : /* Loop over destination scanlines. */
2193 : /* ==================================================================== */
2194 7531 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2195 : {
2196 7395 : double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
2197 7395 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
2198 : #ifdef only_pixels_with_more_than_10_pct_participation
2199 : // When oversampling, don't take into account pixels that have a tiny
2200 : // participation in the resulting pixel
2201 : if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2202 : nSrcYOff < nChunkBottomYOff)
2203 : nSrcYOff++;
2204 : #endif
2205 7395 : if (nSrcYOff < nChunkYOff)
2206 0 : nSrcYOff = nChunkYOff;
2207 :
2208 7395 : double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
2209 7395 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
2210 : #ifdef only_pixels_with_more_than_10_pct_participation
2211 : // When oversampling, don't take into account pixels that have a tiny
2212 : // participation in the resulting pixel
2213 : if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2214 : nSrcYOff2 > nChunkYOff)
2215 : nSrcYOff2--;
2216 : #endif
2217 7395 : if (nSrcYOff2 == nSrcYOff)
2218 0 : ++nSrcYOff2;
2219 7395 : if (nSrcYOff2 > nChunkBottomYOff)
2220 0 : nSrcYOff2 = nChunkBottomYOff;
2221 :
2222 7395 : const T *const paSrcScanline =
2223 149 : pChunk +
2224 7395 : (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
2225 7395 : const GByte *pabySrcScanlineNodataMask = nullptr;
2226 7395 : if (pabyChunkNodataMask != nullptr)
2227 1810 : pabySrcScanlineNodataMask =
2228 : pabyChunkNodataMask +
2229 1810 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
2230 :
2231 7395 : T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
2232 : /* --------------------------------------------------------------------
2233 : */
2234 : /* Loop over destination pixels */
2235 : /* --------------------------------------------------------------------
2236 : */
2237 4259580 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2238 : {
2239 4252187 : double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
2240 : // Apply some epsilon to avoid numerical precision issues
2241 4252187 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
2242 : #ifdef only_pixels_with_more_than_10_pct_participation
2243 : // When oversampling, don't take into account pixels that have a
2244 : // tiny participation in the resulting pixel
2245 : if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2246 : nSrcXOff < nChunkRightXOff)
2247 : nSrcXOff++;
2248 : #endif
2249 4252187 : if (nSrcXOff < nChunkXOff)
2250 0 : nSrcXOff = nChunkXOff;
2251 :
2252 4252187 : double dfSrcXOff2 =
2253 4252187 : dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
2254 4252187 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
2255 : #ifdef only_pixels_with_more_than_10_pct_participation
2256 : // When oversampling, don't take into account pixels that have a
2257 : // tiny participation in the resulting pixel
2258 : if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2259 : nSrcXOff2 > nChunkXOff)
2260 : nSrcXOff2--;
2261 : #endif
2262 4252187 : if (nSrcXOff2 == nSrcXOff)
2263 0 : nSrcXOff2++;
2264 4252187 : if (nSrcXOff2 > nChunkRightXOff)
2265 0 : nSrcXOff2 = nChunkRightXOff;
2266 :
2267 4252187 : bool bRegularProcessing = false;
2268 : if constexpr (!std::is_same<T, GByte>::value)
2269 827 : bRegularProcessing = true;
2270 4251360 : else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
2271 0 : bRegularProcessing = true;
2272 :
2273 4252187 : if (bRegularProcessing)
2274 : {
2275 : // Not sure how much sense it makes to run a majority
2276 : // filter on floating point data, but here it is for the sake
2277 : // of compatibility. It won't look right on RGB images by the
2278 : // nature of the filter.
2279 :
2280 827 : if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
2281 2481 : nSrcYOff2 - nSrcYOff > INT_MAX / (nSrcXOff2 - nSrcXOff) ||
2282 827 : static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
2283 827 : static_cast<size_t>(nSrcXOff2 - nSrcXOff) >
2284 827 : std::numeric_limits<size_t>::max() / sizeof(float))
2285 : {
2286 0 : CPLError(CE_Failure, CPLE_NotSupported,
2287 : "Too big downsampling factor");
2288 0 : CPLFree(paVals);
2289 0 : CPLFree(panSums);
2290 0 : return CE_Failure;
2291 : }
2292 827 : const size_t nNumPx =
2293 827 : static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
2294 827 : static_cast<size_t>(nSrcXOff2 - nSrcXOff);
2295 827 : size_t iMaxInd = 0;
2296 827 : size_t iMaxVal = 0;
2297 827 : bool biMaxValdValid = false;
2298 :
2299 827 : if (paVals == nullptr || nNumPx > nMaxNumPx)
2300 : {
2301 : T *paValsNew = static_cast<T *>(
2302 71 : VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
2303 : int *panSumsNew = static_cast<int *>(
2304 71 : VSI_REALLOC_VERBOSE(panSums, nNumPx * sizeof(int)));
2305 71 : if (paValsNew != nullptr)
2306 71 : paVals = paValsNew;
2307 71 : if (panSumsNew != nullptr)
2308 71 : panSums = panSumsNew;
2309 71 : if (paValsNew == nullptr || panSumsNew == nullptr)
2310 : {
2311 0 : CPLFree(paVals);
2312 0 : CPLFree(panSums);
2313 0 : return CE_Failure;
2314 : }
2315 71 : nMaxNumPx = nNumPx;
2316 : }
2317 :
2318 2585 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2319 : {
2320 1758 : const GPtrDiff_t iTotYOff =
2321 1758 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2322 1758 : nChunkXOff;
2323 5690 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2324 : {
2325 3932 : if (pabySrcScanlineNodataMask == nullptr ||
2326 16 : pabySrcScanlineNodataMask[iX + iTotYOff])
2327 : {
2328 3917 : const T val = paSrcScanline[iX + iTotYOff];
2329 3917 : size_t i = 0; // Used after for.
2330 :
2331 : // Check array for existing entry.
2332 14387 : for (; i < iMaxInd; ++i)
2333 17626 : if (IsSame(paVals[i], val) &&
2334 6910 : ++panSums[i] > panSums[iMaxVal])
2335 : {
2336 246 : iMaxVal = i;
2337 246 : biMaxValdValid = true;
2338 246 : break;
2339 : }
2340 :
2341 : // Add to arr if entry not already there.
2342 3917 : if (i == iMaxInd)
2343 : {
2344 3671 : paVals[iMaxInd] = val;
2345 3671 : panSums[iMaxInd] = 1;
2346 :
2347 3671 : if (!biMaxValdValid)
2348 : {
2349 824 : iMaxVal = iMaxInd;
2350 824 : biMaxValdValid = true;
2351 : }
2352 :
2353 3671 : ++iMaxInd;
2354 : }
2355 : }
2356 : }
2357 : }
2358 :
2359 827 : if (!biMaxValdValid)
2360 3 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2361 : else
2362 824 : paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
2363 : }
2364 : else if constexpr (std::is_same<T, GByte>::value)
2365 : // ( eSrcDataType == GDT_Byte && nEntryCount < 256 )
2366 : {
2367 : // So we go here for a paletted or non-paletted byte band.
2368 : // The input values are then between 0 and 255.
2369 4251360 : int nMaxVal = 0;
2370 4251360 : int iMaxInd = -1;
2371 :
2372 : // The cost of this zeroing might be high. Perhaps we should
2373 : // just use the above generic case, and go to this one if the
2374 : // number of source pixels is large enough
2375 4251360 : std::fill(anVals.begin(), anVals.end(), 0);
2376 :
2377 12777700 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2378 : {
2379 8526370 : const GPtrDiff_t iTotYOff =
2380 8526370 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2381 8526370 : nChunkXOff;
2382 25649400 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2383 : {
2384 17123000 : const T val = paSrcScanline[iX + iTotYOff];
2385 17123000 : if (!bHasNoData || val != tNoDataValue)
2386 : {
2387 17123000 : int nVal = static_cast<int>(val);
2388 17123000 : if (++anVals[nVal] > nMaxVal)
2389 : {
2390 : // Sum the density.
2391 : // Is it the most common value so far?
2392 17006300 : iMaxInd = nVal;
2393 17006300 : nMaxVal = anVals[nVal];
2394 : }
2395 : }
2396 : }
2397 : }
2398 :
2399 4251360 : if (iMaxInd == -1)
2400 0 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2401 : else
2402 4251360 : paDstScanline[iDstPixel - nDstXOff] =
2403 : static_cast<T>(iMaxInd);
2404 : }
2405 : }
2406 : }
2407 :
2408 136 : CPLFree(paVals);
2409 136 : CPLFree(panSums);
2410 :
2411 136 : return CE_None;
2412 : }
2413 :
2414 136 : static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
2415 : const void *pChunk, void **ppDstBuffer,
2416 : GDALDataType *peDstBufferDataType)
2417 : {
2418 136 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
2419 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
2420 : GDALGetDataTypeSizeBytes(args.eWrkDataType));
2421 136 : if (*ppDstBuffer == nullptr)
2422 : {
2423 0 : return CE_Failure;
2424 : }
2425 :
2426 136 : CPLAssert(args.eSrcDataType == args.eWrkDataType);
2427 :
2428 136 : *peDstBufferDataType = args.eWrkDataType;
2429 136 : switch (args.eWrkDataType)
2430 : {
2431 : // For mode resampling, as no computation is done, only the
2432 : // size of the data type matters... except for Byte where we have
2433 : // special processing. And for floating point values
2434 65 : case GDT_Byte:
2435 : {
2436 65 : return GDALResampleChunk_ModeT(args,
2437 : static_cast<const GByte *>(pChunk),
2438 65 : static_cast<GByte *>(*ppDstBuffer));
2439 : }
2440 :
2441 4 : case GDT_Int8:
2442 : {
2443 4 : return GDALResampleChunk_ModeT(args,
2444 : static_cast<const int8_t *>(pChunk),
2445 4 : static_cast<int8_t *>(*ppDstBuffer));
2446 : }
2447 :
2448 9 : case GDT_Int16:
2449 : case GDT_UInt16:
2450 : case GDT_Float16:
2451 : {
2452 9 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
2453 9 : return GDALResampleChunk_ModeT(
2454 : args, static_cast<const uint16_t *>(pChunk),
2455 9 : static_cast<uint16_t *>(*ppDstBuffer));
2456 : }
2457 :
2458 15 : case GDT_CInt16:
2459 : case GDT_CFloat16:
2460 : case GDT_Int32:
2461 : case GDT_UInt32:
2462 : {
2463 15 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
2464 15 : return GDALResampleChunk_ModeT(
2465 : args, static_cast<const uint32_t *>(pChunk),
2466 15 : static_cast<uint32_t *>(*ppDstBuffer));
2467 : }
2468 :
2469 17 : case GDT_Float32:
2470 : {
2471 17 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
2472 17 : return GDALResampleChunk_ModeT(args,
2473 : static_cast<const float *>(pChunk),
2474 17 : static_cast<float *>(*ppDstBuffer));
2475 : }
2476 :
2477 12 : case GDT_CInt32:
2478 : case GDT_Int64:
2479 : case GDT_UInt64:
2480 : {
2481 12 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
2482 12 : return GDALResampleChunk_ModeT(
2483 : args, static_cast<const uint64_t *>(pChunk),
2484 12 : static_cast<uint64_t *>(*ppDstBuffer));
2485 : }
2486 :
2487 6 : case GDT_Float64:
2488 : {
2489 6 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
2490 6 : return GDALResampleChunk_ModeT(args,
2491 : static_cast<const double *>(pChunk),
2492 6 : static_cast<double *>(*ppDstBuffer));
2493 : }
2494 :
2495 4 : case GDT_CFloat32:
2496 : {
2497 4 : return GDALResampleChunk_ModeT(
2498 : args, static_cast<const std::complex<float> *>(pChunk),
2499 4 : static_cast<std::complex<float> *>(*ppDstBuffer));
2500 : }
2501 :
2502 4 : case GDT_CFloat64:
2503 : {
2504 4 : return GDALResampleChunk_ModeT(
2505 : args, static_cast<const std::complex<double> *>(pChunk),
2506 4 : static_cast<std::complex<double> *>(*ppDstBuffer));
2507 : }
2508 :
2509 0 : case GDT_Unknown:
2510 : case GDT_TypeCount:
2511 0 : break;
2512 : }
2513 :
2514 0 : CPLAssert(false);
2515 : return CE_Failure;
2516 : }
2517 :
2518 : /************************************************************************/
2519 : /* GDALResampleConvolutionHorizontal() */
2520 : /************************************************************************/
2521 :
/** Weighted sum of nSrcPixelCount consecutive samples:
 * sum(pChunk[k] * padfWeights[k]) for k in [0, nSrcPixelCount).
 *
 * Groups of four samples are accumulated into two partial sums (the first
 * two products in one, the last two in the other); remaining samples go to
 * the first partial sum. The result is the sum of both partial sums.
 */
template <class T>
static inline double
GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                  int nSrcPixelCount)
{
    double dfAccFirst = 0.0;
    double dfAccSecond = 0.0;
    int iSrc = 0;
    // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    // manually (untypical) unrolled loop in -O2 and -O3:
    // https://github.com/OSGeo/gdal/issues/9508
#if !defined(__INTEL_CLANG_COMPILER)
    while (iSrc < nSrcPixelCount - 3)
    {
        dfAccFirst += pChunk[iSrc] * padfWeights[iSrc];
        dfAccFirst += pChunk[iSrc + 1] * padfWeights[iSrc + 1];
        dfAccSecond += pChunk[iSrc + 2] * padfWeights[iSrc + 2];
        dfAccSecond += pChunk[iSrc + 3] * padfWeights[iSrc + 3];
        iSrc += 4;
    }
#endif
    // Samples left over by the unrolled loop (or all of them when it is
    // compiled out).
    while (iSrc < nSrcPixelCount)
    {
        dfAccFirst += pChunk[iSrc] * padfWeights[iSrc];
        ++iSrc;
    }
    return dfAccFirst + dfAccSecond;
}
2548 :
2549 : template <class T>
2550 44576 : static inline void GDALResampleConvolutionHorizontalWithMask(
2551 : const T *pChunk, const GByte *pabyMask, const double *padfWeights,
2552 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2553 : {
2554 44576 : dfVal = 0;
2555 44576 : dfWeightSum = 0;
2556 44576 : int i = 0;
2557 98300 : for (; i < nSrcPixelCount - 3; i += 4)
2558 : {
2559 53724 : const double dfWeight0 = padfWeights[i] * pabyMask[i];
2560 53724 : const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
2561 53724 : const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
2562 53724 : const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
2563 53724 : dfVal += pChunk[i] * dfWeight0;
2564 53724 : dfVal += pChunk[i + 1] * dfWeight1;
2565 53724 : dfVal += pChunk[i + 2] * dfWeight2;
2566 53724 : dfVal += pChunk[i + 3] * dfWeight3;
2567 53724 : dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
2568 : }
2569 61162 : for (; i < nSrcPixelCount; ++i)
2570 : {
2571 16586 : const double dfWeight = padfWeights[i] * pabyMask[i];
2572 16586 : dfVal += pChunk[i] * dfWeight;
2573 16586 : dfWeightSum += dfWeight;
2574 : }
2575 44576 : }
2576 :
/** Weighted sum of nSrcPixelCount consecutive samples, computed for three
 * rows at once with the same weights.
 *
 * For each row, groups of four samples are accumulated into two partial
 * sums (first two products in one, last two in the other), remaining
 * samples go to the first partial sum, and the row result is the sum of
 * both partial sums.
 *
 * @param[out] dfRes1 Result for pChunkRow1.
 * @param[out] dfRes2 Result for pChunkRow2.
 * @param[out] dfRes3 Result for pChunkRow3.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontal_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    // Two partial sums per row.
    double dfRow1A = 0.0;
    double dfRow1B = 0.0;
    double dfRow2A = 0.0;
    double dfRow2B = 0.0;
    double dfRow3A = 0.0;
    double dfRow3B = 0.0;

    int iSrc = 0;
    while (iSrc < nSrcPixelCount - 3)
    {
        dfRow1A += pChunkRow1[iSrc] * padfWeights[iSrc];
        dfRow1A += pChunkRow1[iSrc + 1] * padfWeights[iSrc + 1];
        dfRow1B += pChunkRow1[iSrc + 2] * padfWeights[iSrc + 2];
        dfRow1B += pChunkRow1[iSrc + 3] * padfWeights[iSrc + 3];

        dfRow2A += pChunkRow2[iSrc] * padfWeights[iSrc];
        dfRow2A += pChunkRow2[iSrc + 1] * padfWeights[iSrc + 1];
        dfRow2B += pChunkRow2[iSrc + 2] * padfWeights[iSrc + 2];
        dfRow2B += pChunkRow2[iSrc + 3] * padfWeights[iSrc + 3];

        dfRow3A += pChunkRow3[iSrc] * padfWeights[iSrc];
        dfRow3A += pChunkRow3[iSrc + 1] * padfWeights[iSrc + 1];
        dfRow3B += pChunkRow3[iSrc + 2] * padfWeights[iSrc + 2];
        dfRow3B += pChunkRow3[iSrc + 3] * padfWeights[iSrc + 3];

        iSrc += 4;
    }
    while (iSrc < nSrcPixelCount)
    {
        dfRow1A += pChunkRow1[iSrc] * padfWeights[iSrc];
        dfRow2A += pChunkRow2[iSrc] * padfWeights[iSrc];
        dfRow3A += pChunkRow3[iSrc] * padfWeights[iSrc];
        ++iSrc;
    }

    dfRes1 = dfRow1A + dfRow1B;
    dfRes2 = dfRow2A + dfRow2B;
    dfRes3 = dfRow3A + dfRow3B;
}
2615 :
2616 : template <class T>
2617 18828 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
2618 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2619 : const double *padfWeights, int nSrcPixelCount, double &dfRes1,
2620 : double &dfRes2, double &dfRes3)
2621 : {
2622 18828 : GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
2623 : padfWeights, nSrcPixelCount, dfRes1,
2624 : dfRes2, dfRes3);
2625 18828 : }
2626 :
2627 : template <class T>
2628 1256466 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
2629 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2630 : const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
2631 : {
2632 1256466 : GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
2633 : padfWeights, 4, dfRes1, dfRes2,
2634 : dfRes3);
2635 1256466 : }
2636 :
2637 : /************************************************************************/
2638 : /* GDALResampleConvolutionVertical() */
2639 : /************************************************************************/
2640 :
// Weighted sum of nSrcLineCount samples taken every nStride elements starting
// at pChunk (i.e. one column of a row-major buffer).
//
// pChunk        : first sample of the column.
// nStride       : distance, in elements, between two consecutive samples.
// padfWeights   : nSrcLineCount weights.
// Returns the sum of sample * weight.
//
// Two partial sums are used: within a group of four lines, lines 0-1 go to
// the first and lines 2-3 to the second; leftover lines go to the first.
// The order of the additions is part of the numerical result.
template <class T>
static inline double
GDALResampleConvolutionVertical(const T *pChunk, size_t nStride,
                                const double *padfWeights, int nSrcLineCount)
{
    double dfSumA = 0.0;
    double dfSumB = 0.0;

    int iLine = 0;
    size_t nOffset = 0;
    const size_t nGroupAdvance = 4 * nStride;

    // Groups of four lines.
    while (iLine < nSrcLineCount - 3)
    {
        const T *const pGroup = pChunk + nOffset;
        const double *const pdfW = padfWeights + iLine;

        dfSumA += pGroup[0] * pdfW[0];
        dfSumA += pGroup[nStride] * pdfW[1];
        dfSumB += pGroup[2 * nStride] * pdfW[2];
        dfSumB += pGroup[3 * nStride] * pdfW[3];

        iLine += 4;
        nOffset += nGroupAdvance;
    }

    // Remaining (at most three) lines.
    while (iLine < nSrcLineCount)
    {
        dfSumA += pChunk[nOffset] * padfWeights[iLine];
        ++iLine;
        nOffset += nStride;
    }

    return dfSumA + dfSumB;
}
2663 :
// Same computation as a vertical weighted sum, performed for two adjacent
// columns at once: column 0 starts at pChunk[0], column 1 at pChunk[1].
//
// pChunk        : first sample of column 0.
// nStride       : distance, in elements, between two consecutive lines.
// padfWeights   : nSrcLineCount weights, shared by both columns.
// dfRes1/dfRes2 : receive the weighted sum of column 0 / column 1.
//
// Each column uses two partial sums: within a group of four lines, lines 0-1
// go to "A" and lines 2-3 to "B"; leftover lines go to "A".
template <class T>
static inline void GDALResampleConvolutionVertical_2cols(
    const T *pChunk, size_t nStride, const double *padfWeights,
    int nSrcLineCount, double &dfRes1, double &dfRes2)
{
    double dfCol0A = 0.0;
    double dfCol0B = 0.0;
    double dfCol1A = 0.0;
    double dfCol1B = 0.0;

    int iLine = 0;
    size_t nOffset = 0;
    const size_t nGroupAdvance = 4 * nStride;

    // Groups of four lines.
    while (iLine < nSrcLineCount - 3)
    {
        const T *const pLine0 = pChunk + nOffset;
        const T *const pLine1 = pLine0 + nStride;
        const T *const pLine2 = pLine0 + 2 * nStride;
        const T *const pLine3 = pLine0 + 3 * nStride;
        const double dfW0 = padfWeights[iLine + 0];
        const double dfW1 = padfWeights[iLine + 1];
        const double dfW2 = padfWeights[iLine + 2];
        const double dfW3 = padfWeights[iLine + 3];

        dfCol0A += pLine0[0] * dfW0;
        dfCol1A += pLine0[1] * dfW0;
        dfCol0A += pLine1[0] * dfW1;
        dfCol1A += pLine1[1] * dfW1;
        dfCol0B += pLine2[0] * dfW2;
        dfCol1B += pLine2[1] * dfW2;
        dfCol0B += pLine3[0] * dfW3;
        dfCol1B += pLine3[1] * dfW3;

        iLine += 4;
        nOffset += nGroupAdvance;
    }

    // Remaining (at most three) lines.
    while (iLine < nSrcLineCount)
    {
        const double dfW = padfWeights[iLine];
        dfCol0A += pChunk[nOffset + 0] * dfW;
        dfCol1A += pChunk[nOffset + 1] * dfW;
        ++iLine;
        nOffset += nStride;
    }

    dfRes1 = dfCol0A + dfCol0B;
    dfRes2 = dfCol1A + dfCol1B;
}
2694 :
2695 : #ifdef USE_SSE2
2696 :
2697 : #ifdef __AVX__
2698 : /************************************************************************/
2699 : /* GDALResampleConvolutionVertical_16cols<T> */
2700 : /************************************************************************/
2701 :
2702 : template <class T>
2703 : static inline void
2704 : GDALResampleConvolutionVertical_16cols(const T *pChunk, size_t nStride,
2705 : const double *padfWeights,
2706 : int nSrcLineCount, float *afDest)
2707 : {
2708 : int i = 0;
2709 : size_t j = 0;
2710 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2711 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2712 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2713 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2714 : for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
2715 : {
2716 : XMMReg4Double w0 =
2717 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2718 : XMMReg4Double w1 =
2719 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2720 : XMMReg4Double w2 =
2721 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2722 : XMMReg4Double w3 =
2723 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2724 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2725 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2726 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
2727 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
2728 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2729 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2730 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
2731 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
2732 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2733 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2734 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
2735 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
2736 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2737 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2738 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
2739 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
2740 : }
2741 : for (; i < nSrcLineCount; ++i, j += nStride)
2742 : {
2743 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2744 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2745 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2746 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
2747 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
2748 : }
2749 : v_acc0.Store4Val(afDest);
2750 : v_acc1.Store4Val(afDest + 4);
2751 : v_acc2.Store4Val(afDest + 8);
2752 : v_acc3.Store4Val(afDest + 12);
2753 : }
2754 :
2755 : template <class T>
2756 : static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
2757 : const double *, int,
2758 : double *)
2759 : {
2760 : // Cannot be reached
2761 : CPLAssert(false);
2762 : }
2763 :
2764 : #else
2765 :
2766 : /************************************************************************/
2767 : /* GDALResampleConvolutionVertical_8cols<T> */
2768 : /************************************************************************/
2769 :
2770 : template <class T>
2771 : static inline void
2772 22764800 : GDALResampleConvolutionVertical_8cols(const T *pChunk, size_t nStride,
2773 : const double *padfWeights,
2774 : int nSrcLineCount, float *afDest)
2775 : {
2776 22764800 : int i = 0;
2777 22764800 : size_t j = 0;
2778 22764800 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2779 22753300 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2780 44995200 : for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
2781 : {
2782 22228600 : XMMReg4Double w0 =
2783 22228600 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2784 22204400 : XMMReg4Double w1 =
2785 22204400 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2786 22195800 : XMMReg4Double w2 =
2787 22195800 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2788 22217900 : XMMReg4Double w3 =
2789 22217900 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2790 22211200 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2791 22210300 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2792 22192500 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2793 22205500 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2794 22209700 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2795 22209000 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2796 22207200 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2797 22217600 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2798 : }
2799 34307000 : for (; i < nSrcLineCount; ++i, j += nStride)
2800 : {
2801 11540500 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2802 11540500 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2803 11540500 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2804 : }
2805 22766500 : v_acc0.Store4Val(afDest);
2806 22745600 : v_acc1.Store4Val(afDest + 4);
2807 22779400 : }
2808 :
2809 : template <class T>
2810 : static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
2811 : const double *, int,
2812 : double *)
2813 : {
2814 : // Cannot be reached
2815 : CPLAssert(false);
2816 : }
2817 :
2818 : #endif // __AVX__
2819 :
2820 : /************************************************************************/
2821 : /* GDALResampleConvolutionHorizontalSSE2<T> */
2822 : /************************************************************************/
2823 :
2824 : template <class T>
2825 3037035 : static inline double GDALResampleConvolutionHorizontalSSE2(
2826 : const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2827 : {
2828 3037035 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2829 3036859 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2830 3036899 : int i = 0; // Used after for.
2831 3312425 : for (; i < nSrcPixelCount - 7; i += 8)
2832 : {
2833 : // Retrieve the pixel & accumulate
2834 275504 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
2835 275506 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
2836 275506 : const XMMReg4Double v_weight1 =
2837 275506 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2838 275502 : const XMMReg4Double v_weight2 =
2839 275502 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
2840 :
2841 275503 : v_acc1 += v_pixels1 * v_weight1;
2842 275504 : v_acc2 += v_pixels2 * v_weight2;
2843 : }
2844 :
2845 3036923 : v_acc1 += v_acc2;
2846 :
2847 3036890 : double dfVal = v_acc1.GetHorizSum();
2848 10209620 : for (; i < nSrcPixelCount; ++i)
2849 : {
2850 7172770 : dfVal += pChunk[i] * padfWeightsAligned[i];
2851 : }
2852 3036858 : return dfVal;
2853 : }
2854 :
2855 : /************************************************************************/
2856 : /* GDALResampleConvolutionHorizontal<GByte> */
2857 : /************************************************************************/
2858 :
2859 : template <>
2860 2488100 : inline double GDALResampleConvolutionHorizontal<GByte>(
2861 : const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2862 : {
2863 2488100 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2864 2488130 : nSrcPixelCount);
2865 : }
2866 :
2867 : template <>
2868 548956 : inline double GDALResampleConvolutionHorizontal<GUInt16>(
2869 : const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2870 : {
2871 548956 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2872 548950 : nSrcPixelCount);
2873 : }
2874 :
2875 : /************************************************************************/
2876 : /* GDALResampleConvolutionHorizontalWithMaskSSE2<T> */
2877 : /************************************************************************/
2878 :
2879 : template <class T>
2880 7062423 : static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
2881 : const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
2882 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2883 : {
2884 7062423 : int i = 0; // Used after for.
2885 7062423 : XMMReg4Double v_acc = XMMReg4Double::Zero();
2886 7052503 : XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
2887 19726921 : for (; i < nSrcPixelCount - 3; i += 4)
2888 : {
2889 12681358 : const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
2890 12686158 : const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
2891 12687258 : XMMReg4Double v_weight =
2892 12687258 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2893 12685458 : v_weight *= v_mask;
2894 12681858 : v_acc += v_pixels * v_weight;
2895 12684258 : v_acc_weight += v_weight;
2896 : }
2897 :
2898 7045503 : dfVal = v_acc.GetHorizSum();
2899 7065433 : dfWeightSum = v_acc_weight.GetHorizSum();
2900 7296643 : for (; i < nSrcPixelCount; ++i)
2901 : {
2902 231077 : const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
2903 231077 : dfVal += pChunk[i] * dfWeight;
2904 231077 : dfWeightSum += dfWeight;
2905 : }
2906 7065563 : }
2907 :
2908 : /************************************************************************/
2909 : /* GDALResampleConvolutionHorizontalWithMask<GByte> */
2910 : /************************************************************************/
2911 :
2912 : template <>
2913 7067680 : inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
2914 : const GByte *pChunk, const GByte *pabyMask,
2915 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2916 : double &dfWeightSum)
2917 : {
2918 7067680 : GDALResampleConvolutionHorizontalWithMaskSSE2(
2919 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2920 : dfWeightSum);
2921 7058330 : }
2922 :
2923 : template <>
2924 63 : inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
2925 : const GUInt16 *pChunk, const GByte *pabyMask,
2926 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2927 : double &dfWeightSum)
2928 : {
2929 63 : GDALResampleConvolutionHorizontalWithMaskSSE2(
2930 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2931 : dfWeightSum);
2932 63 : }
2933 :
2934 : /************************************************************************/
2935 : /* GDALResampleConvolutionHorizontal_3rows_SSE2<T> */
2936 : /************************************************************************/
2937 :
2938 : template <class T>
2939 22991830 : static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
2940 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2941 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2942 : double &dfRes2, double &dfRes3)
2943 : {
2944 22991830 : XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
2945 22959530 : v_acc2 = XMMReg4Double::Zero(),
2946 22977230 : v_acc3 = XMMReg4Double::Zero();
2947 22973930 : int i = 0;
2948 45803666 : for (; i < nSrcPixelCount - 7; i += 8)
2949 : {
2950 : // Retrieve the pixel & accumulate.
2951 22841336 : XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
2952 22870536 : XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
2953 22870736 : const XMMReg4Double v_weight1 =
2954 22870736 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2955 22843236 : const XMMReg4Double v_weight2 =
2956 22843236 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
2957 :
2958 22857336 : v_acc1 += v_pixels1 * v_weight1;
2959 22836636 : v_acc1 += v_pixels2 * v_weight2;
2960 :
2961 22839936 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
2962 22845936 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
2963 22851336 : v_acc2 += v_pixels1 * v_weight1;
2964 22844636 : v_acc2 += v_pixels2 * v_weight2;
2965 :
2966 22848736 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
2967 22843136 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
2968 22851136 : v_acc3 += v_pixels1 * v_weight1;
2969 22848536 : v_acc3 += v_pixels2 * v_weight2;
2970 : }
2971 :
2972 22962330 : dfRes1 = v_acc1.GetHorizSum();
2973 22958930 : dfRes2 = v_acc2.GetHorizSum();
2974 22950030 : dfRes3 = v_acc3.GetHorizSum();
2975 34850126 : for (; i < nSrcPixelCount; ++i)
2976 : {
2977 11883796 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
2978 11883796 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
2979 11883796 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
2980 : }
2981 22966430 : }
2982 :
2983 : /************************************************************************/
2984 : /* GDALResampleConvolutionHorizontal_3rows<GByte> */
2985 : /************************************************************************/
2986 :
2987 : template <>
2988 22975600 : inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
2989 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
2990 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2991 : double &dfRes2, double &dfRes3)
2992 : {
2993 22975600 : GDALResampleConvolutionHorizontal_3rows_SSE2(
2994 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
2995 : dfRes1, dfRes2, dfRes3);
2996 22956000 : }
2997 :
2998 : template <>
2999 30 : inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
3000 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3001 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3002 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3003 : {
3004 30 : GDALResampleConvolutionHorizontal_3rows_SSE2(
3005 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3006 : dfRes1, dfRes2, dfRes3);
3007 30 : }
3008 :
3009 : /************************************************************************/
3010 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T> */
3011 : /************************************************************************/
3012 :
3013 : template <class T>
3014 5004922 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3015 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3016 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3017 : double &dfRes2, double &dfRes3)
3018 : {
3019 5004922 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3020 4997450 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3021 5003038 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
3022 5002819 : int i = 0; // Use after for.
3023 10581930 : for (; i < nSrcPixelCount - 3; i += 4)
3024 : {
3025 : // Retrieve the pixel & accumulate.
3026 5581700 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3027 5607630 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3028 5602430 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3029 5613620 : const XMMReg4Double v_weight =
3030 5613620 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3031 :
3032 5583600 : v_acc1 += v_pixels1 * v_weight;
3033 5587950 : v_acc2 += v_pixels2 * v_weight;
3034 5594870 : v_acc3 += v_pixels3 * v_weight;
3035 : }
3036 :
3037 5000200 : dfRes1 = v_acc1.GetHorizSum();
3038 5000936 : dfRes2 = v_acc2.GetHorizSum();
3039 5000057 : dfRes3 = v_acc3.GetHorizSum();
3040 :
3041 9406369 : for (; i < nSrcPixelCount; ++i)
3042 : {
3043 4405122 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3044 4405122 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3045 4405122 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3046 : }
3047 5001247 : }
3048 :
3049 : /************************************************************************/
3050 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte> */
3051 : /************************************************************************/
3052 :
3053 : template <>
3054 4934660 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
3055 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3056 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3057 : double &dfRes2, double &dfRes3)
3058 : {
3059 4934660 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3060 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3061 : dfRes1, dfRes2, dfRes3);
3062 4933680 : }
3063 :
3064 : template <>
3065 67024 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
3066 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3067 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3068 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3069 : {
3070 67024 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3071 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3072 : dfRes1, dfRes2, dfRes3);
3073 67089 : }
3074 :
3075 : /************************************************************************/
3076 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T> */
3077 : /************************************************************************/
3078 :
3079 : template <class T>
3080 13894130 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3081 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3082 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3083 : double &dfRes3)
3084 : {
3085 13894130 : const XMMReg4Double v_weight =
3086 : XMMReg4Double::Load4ValAligned(padfWeightsAligned);
3087 :
3088 : // Retrieve the pixel & accumulate.
3089 13926200 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
3090 13935150 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
3091 13938510 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
3092 :
3093 13955150 : XMMReg4Double v_acc1 = v_pixels1 * v_weight;
3094 13878870 : XMMReg4Double v_acc2 = v_pixels2 * v_weight;
3095 13899650 : XMMReg4Double v_acc3 = v_pixels3 * v_weight;
3096 :
3097 13899040 : dfRes1 = v_acc1.GetHorizSum();
3098 13905290 : dfRes2 = v_acc2.GetHorizSum();
3099 13921340 : dfRes3 = v_acc3.GetHorizSum();
3100 13908980 : }
3101 :
3102 : /************************************************************************/
3103 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte> */
3104 : /************************************************************************/
3105 :
3106 : template <>
3107 8241970 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
3108 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3109 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3110 : double &dfRes3)
3111 : {
3112 8241970 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3113 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3114 : dfRes3);
3115 8247550 : }
3116 :
3117 : template <>
3118 5676770 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
3119 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3120 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
3121 : double &dfRes2, double &dfRes3)
3122 : {
3123 5676770 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3124 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3125 : dfRes3);
3126 5670600 : }
3127 :
3128 : #endif // USE_SSE2
3129 :
3130 : /************************************************************************/
3131 : /* GDALResampleChunk_Convolution() */
3132 : /************************************************************************/
3133 :
3134 : template <class T, class Twork, GDALDataType eWrkDataType>
3135 4639 : static CPLErr GDALResampleChunk_ConvolutionT(
3136 : const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
3137 : FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
3138 : int nKernelRadius, bool bKernelWithNegativeWeights, float fMaxVal)
3139 :
3140 : {
3141 4639 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
3142 4639 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
3143 4639 : const double dfSrcXDelta = args.dfSrcXDelta;
3144 4639 : const double dfSrcYDelta = args.dfSrcYDelta;
3145 4639 : constexpr int nBands = 1;
3146 4639 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
3147 4639 : const int nChunkXOff = args.nChunkXOff;
3148 4639 : const int nChunkXSize = args.nChunkXSize;
3149 4639 : const int nChunkYOff = args.nChunkYOff;
3150 4639 : const int nChunkYSize = args.nChunkYSize;
3151 4639 : const int nDstXOff = args.nDstXOff;
3152 4639 : const int nDstXOff2 = args.nDstXOff2;
3153 4639 : const int nDstYOff = args.nDstYOff;
3154 4639 : const int nDstYOff2 = args.nDstYOff2;
3155 4639 : const bool bHasNoData = args.bHasNoData;
3156 4639 : double dfNoDataValue = args.dfNoDataValue;
3157 :
3158 4639 : if (!bHasNoData)
3159 4562 : dfNoDataValue = 0.0;
3160 4639 : const auto dstDataType = args.eOvrDataType;
3161 4639 : const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
3162 4639 : const double dfReplacementVal =
3163 75 : bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
3164 : : dfNoDataValue;
3165 : // cppcheck-suppress unreadVariable
3166 4639 : const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
3167 4638 : const bool bNoDataValueInt64Valid =
3168 4638 : isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
3169 4638 : const auto nNodataValueInt64 =
3170 : bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
3171 4638 : constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
3172 :
3173 : // TODO: we should have some generic function to do this.
3174 4638 : Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
3175 4638 : Twork fDstMax = cpl::NumericLimits<Twork>::max();
3176 4638 : if (dstDataType == GDT_Byte)
3177 : {
3178 3903 : fDstMin = std::numeric_limits<GByte>::min();
3179 3901 : fDstMax = std::numeric_limits<GByte>::max();
3180 : }
3181 737 : else if (dstDataType == GDT_Int8)
3182 : {
3183 1 : fDstMin = std::numeric_limits<GInt8>::min();
3184 1 : fDstMax = std::numeric_limits<GInt8>::max();
3185 : }
3186 736 : else if (dstDataType == GDT_UInt16)
3187 : {
3188 393 : fDstMin = std::numeric_limits<GUInt16>::min();
3189 387 : fDstMax = std::numeric_limits<GUInt16>::max();
3190 : }
3191 343 : else if (dstDataType == GDT_Int16)
3192 : {
3193 291 : fDstMin = std::numeric_limits<GInt16>::min();
3194 291 : fDstMax = std::numeric_limits<GInt16>::max();
3195 : }
3196 52 : else if (dstDataType == GDT_UInt32)
3197 : {
3198 1 : fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
3199 1 : fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
3200 : }
3201 51 : else if (dstDataType == GDT_Int32)
3202 : {
3203 : // cppcheck-suppress unreadVariable
3204 2 : fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
3205 : // cppcheck-suppress unreadVariable
3206 2 : fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
3207 : }
3208 49 : else if (dstDataType == GDT_UInt64)
3209 : {
3210 : // cppcheck-suppress unreadVariable
3211 1 : fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
3212 : // cppcheck-suppress unreadVariable
3213 : // (1 << 64) - 2048: largest uint64 value a double can hold
3214 1 : fDstMax = static_cast<Twork>(18446744073709549568ULL);
3215 : }
3216 48 : else if (dstDataType == GDT_Int64)
3217 : {
3218 : // cppcheck-suppress unreadVariable
3219 1 : fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
3220 : // cppcheck-suppress unreadVariable
3221 : // (1 << 63) - 1024: largest int64 that a double can hold
3222 1 : fDstMax = static_cast<Twork>(9223372036854774784LL);
3223 : }
3224 :
3225 36966174 : auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
3226 : bNoDataValueInt64Valid, nNodataValueInt64,
3227 : dfNoDataValue, dfReplacementVal](Twork fVal)
3228 : {
3229 16023700 : if (!bHasNoData)
3230 11838700 : return fVal;
3231 :
3232 : // Clamp value before comparing to nodata: this is only needed for
3233 : // kernels with negative weights (Lanczos)
3234 4185030 : Twork fClamped = fVal;
3235 4185030 : if (fClamped < fDstMin)
3236 15998 : fClamped = fDstMin;
3237 4169030 : else if (fClamped > fDstMax)
3238 16406 : fClamped = fDstMax;
3239 4185030 : if (isIntegerDT)
3240 : {
3241 4216630 : if (bNoDataValueInt64Valid)
3242 : {
3243 4214900 : const double fClampedRounded = std::round(fClamped);
3244 8417100 : if (fClampedRounded >=
3245 : static_cast<Twork>(
3246 8417760 : std::numeric_limits<int64_t>::min()) &&
3247 : fClampedRounded <=
3248 8412010 : static_cast<Twork>(9223372036854774784LL) &&
3249 4199200 : nNodataValueInt64 ==
3250 4202550 : static_cast<GInt64>(std::round(fClamped)))
3251 : {
3252 : // Do not use the nodata value
3253 14435 : return static_cast<Twork>(dfReplacementVal);
3254 : }
3255 : }
3256 : }
3257 0 : else if (dfNoDataValue == fClamped)
3258 : {
3259 : // Do not use the nodata value
3260 1 : return static_cast<Twork>(dfReplacementVal);
3261 : }
3262 4162560 : return fClamped;
3263 : };
3264 :
3265 : /* -------------------------------------------------------------------- */
3266 : /* Allocate work buffers. */
3267 : /* -------------------------------------------------------------------- */
3268 4635 : const int nDstXSize = nDstXOff2 - nDstXOff;
3269 4635 : Twork *pafWrkScanline = nullptr;
3270 4635 : if (dstDataType != eWrkDataType)
3271 : {
3272 : pafWrkScanline =
3273 4590 : static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
3274 4596 : if (pafWrkScanline == nullptr)
3275 0 : return CE_Failure;
3276 : }
3277 :
3278 4641 : const double dfXScale = 1.0 / dfXRatioDstToSrc;
3279 4641 : const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
3280 4641 : const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
3281 4641 : const double dfYScale = 1.0 / dfYRatioDstToSrc;
3282 4641 : const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
3283 4641 : const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
3284 :
3285 : // Temporary array to store result of horizontal filter.
3286 : double *padfHorizontalFiltered = static_cast<double *>(
3287 4641 : VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
3288 :
3289 : // To store convolution coefficients.
3290 4641 : double *padfWeights = static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
3291 : static_cast<int>(2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) +
3292 : 0.5) *
3293 : sizeof(double)));
3294 :
3295 4639 : GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
3296 4639 : if (pabyChunkNodataMask)
3297 : pabyChunkNodataMaskHorizontalFiltered =
3298 462 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
3299 4639 : if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
3300 462 : (pabyChunkNodataMask != nullptr &&
3301 : pabyChunkNodataMaskHorizontalFiltered == nullptr))
3302 : {
3303 1 : VSIFree(pafWrkScanline);
3304 0 : VSIFree(padfHorizontalFiltered);
3305 0 : VSIFreeAligned(padfWeights);
3306 0 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3307 0 : return CE_Failure;
3308 : }
3309 :
3310 : /* ==================================================================== */
3311 : /* First pass: horizontal filter */
3312 : /* ==================================================================== */
3313 4639 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
3314 : #ifdef USE_SSE2
3315 4639 : bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
3316 : #endif
3317 2962381 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
3318 : {
3319 2957733 : const double dfSrcPixel =
3320 2957733 : (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
3321 2957733 : int nSrcPixelStart =
3322 2957733 : static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
3323 2957733 : if (nSrcPixelStart < nChunkXOff)
3324 56807 : nSrcPixelStart = nChunkXOff;
3325 2957733 : int nSrcPixelStop =
3326 2957733 : static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
3327 2957733 : if (nSrcPixelStop > nChunkRightXOff)
3328 56826 : nSrcPixelStop = nChunkRightXOff;
3329 : #if 0
3330 : if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
3331 : {
3332 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3333 : }
3334 : if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
3335 : {
3336 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3337 : }
3338 : #endif
3339 2957733 : const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
3340 2957733 : double dfWeightSum = 0.0;
3341 :
3342 : // Compute convolution coefficients.
3343 2957733 : int nSrcPixel = nSrcPixelStart;
3344 2957733 : double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
3345 4186216 : for (; nSrcPixel < nSrcPixelStop - 3; nSrcPixel += 4)
3346 : {
3347 1228748 : padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
3348 1228748 : dfX += dfXScaleWeight;
3349 1228748 : padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
3350 1228748 : dfX += dfXScaleWeight;
3351 1228748 : padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
3352 1228748 : dfX += dfXScaleWeight;
3353 1228748 : padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
3354 1228748 : dfX += dfXScaleWeight;
3355 1228474 : dfWeightSum +=
3356 1228748 : pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
3357 : }
3358 6946891 : for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
3359 : {
3360 3989048 : const double dfWeight = pfnFilterFunc(dfX);
3361 3989420 : padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3362 3989420 : dfWeightSum += dfWeight;
3363 : }
3364 :
3365 2957843 : const int nHeight = nChunkYSize * nBands;
3366 2957843 : if (pabyChunkNodataMask == nullptr)
3367 : {
3368 2869857 : if (dfWeightSum != 0)
3369 : {
3370 2869845 : const double dfInvWeightSum = 1.0 / dfWeightSum;
3371 11124967 : for (int i = 0; i < nSrcPixelCount; ++i)
3372 8255125 : padfWeights[i] *= dfInvWeightSum;
3373 : }
3374 2869857 : int iSrcLineOff = 0;
3375 : #ifdef USE_SSE2
3376 2869857 : if (nSrcPixelCount == 4)
3377 : {
3378 15780864 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3379 : {
3380 15171416 : const size_t j =
3381 15171416 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3382 15171416 : (nSrcPixelStart - nChunkXOff);
3383 15171416 : double dfVal1 = 0.0;
3384 15171416 : double dfVal2 = 0.0;
3385 15171416 : double dfVal3 = 0.0;
3386 15171416 : GDALResampleConvolutionHorizontalPixelCount4_3rows(
3387 15171416 : pChunk + j, pChunk + j + nChunkXSize,
3388 15171416 : pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
3389 : dfVal2, dfVal3);
3390 15167306 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3391 15167306 : nDstXSize +
3392 15167306 : iDstPixel - nDstXOff] = dfVal1;
3393 15167306 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3394 15167306 : 1) *
3395 15167306 : nDstXSize +
3396 15167306 : iDstPixel - nDstXOff] = dfVal2;
3397 15167306 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3398 15167306 : 2) *
3399 15167306 : nDstXSize +
3400 15167306 : iDstPixel - nDstXOff] = dfVal3;
3401 : }
3402 : }
3403 2256292 : else if (bSrcPixelCountLess8)
3404 : {
3405 7072188 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3406 : {
3407 5017447 : const size_t j =
3408 5017447 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3409 5017447 : (nSrcPixelStart - nChunkXOff);
3410 5017447 : double dfVal1 = 0.0;
3411 5017447 : double dfVal2 = 0.0;
3412 5017447 : double dfVal3 = 0.0;
3413 5017447 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
3414 5017447 : pChunk + j, pChunk + j + nChunkXSize,
3415 5017447 : pChunk + j + 2 * nChunkXSize, padfWeights,
3416 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3417 5020102 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3418 5020102 : nDstXSize +
3419 5020102 : iDstPixel - nDstXOff] = dfVal1;
3420 5020102 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3421 5020102 : 1) *
3422 5020102 : nDstXSize +
3423 5020102 : iDstPixel - nDstXOff] = dfVal2;
3424 5020102 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3425 5020102 : 2) *
3426 5020102 : nDstXSize +
3427 5020102 : iDstPixel - nDstXOff] = dfVal3;
3428 : }
3429 : }
3430 : else
3431 : #endif
3432 : {
3433 23238126 : for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3434 : {
3435 23034430 : const size_t j =
3436 23034430 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3437 23034430 : (nSrcPixelStart - nChunkXOff);
3438 23034430 : double dfVal1 = 0.0;
3439 23034430 : double dfVal2 = 0.0;
3440 23034430 : double dfVal3 = 0.0;
3441 23034430 : GDALResampleConvolutionHorizontal_3rows(
3442 23034430 : pChunk + j, pChunk + j + nChunkXSize,
3443 23034430 : pChunk + j + 2 * nChunkXSize, padfWeights,
3444 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3445 23033930 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3446 23033930 : nDstXSize +
3447 23033930 : iDstPixel - nDstXOff] = dfVal1;
3448 23033930 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3449 23033930 : 1) *
3450 23033930 : nDstXSize +
3451 23033930 : iDstPixel - nDstXOff] = dfVal2;
3452 23033930 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3453 23033930 : 2) *
3454 23033930 : nDstXSize +
3455 23033930 : iDstPixel - nDstXOff] = dfVal3;
3456 : }
3457 : }
3458 5949777 : for (; iSrcLineOff < nHeight; ++iSrcLineOff)
3459 : {
3460 3081953 : const size_t j =
3461 3081953 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3462 3081953 : (nSrcPixelStart - nChunkXOff);
3463 3630906 : const double dfVal = GDALResampleConvolutionHorizontal(
3464 593853 : pChunk + j, padfWeights, nSrcPixelCount);
3465 3081973 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3466 3081973 : nDstXSize +
3467 3081973 : iDstPixel - nDstXOff] = dfVal;
3468 : }
3469 : }
3470 : else
3471 : {
3472 20503076 : for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
3473 : {
3474 20413146 : const size_t j =
3475 20413146 : static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3476 20413146 : (nSrcPixelStart - nChunkXOff);
3477 :
3478 20413146 : if (bKernelWithNegativeWeights)
3479 : {
3480 19899912 : int nConsecutiveValid = 0;
3481 19899912 : int nMaxConsecutiveValid = 0;
3482 181970458 : for (int k = 0; k < nSrcPixelCount; k++)
3483 : {
3484 162066146 : if (pabyChunkNodataMask[j + k])
3485 48904253 : nConsecutiveValid++;
3486 113162793 : else if (nConsecutiveValid)
3487 : {
3488 111293 : nMaxConsecutiveValid = std::max(
3489 107790 : nMaxConsecutiveValid, nConsecutiveValid);
3490 111293 : nConsecutiveValid = 0;
3491 : }
3492 : }
3493 19902112 : nMaxConsecutiveValid =
3494 19903412 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
3495 19902112 : if (nMaxConsecutiveValid < nSrcPixelCount / 2)
3496 : {
3497 13314907 : const size_t nTempOffset =
3498 13314907 : static_cast<size_t>(iSrcLineOff) * nDstXSize +
3499 13314907 : iDstPixel - nDstXOff;
3500 13314907 : padfHorizontalFiltered[nTempOffset] = 0.0;
3501 13314907 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3502 13314907 : continue;
3503 : }
3504 : }
3505 :
3506 7100439 : double dfVal = 0.0;
3507 7100439 : GDALResampleConvolutionHorizontalWithMask(
3508 44639 : pChunk + j, pabyChunkNodataMask + j, padfWeights,
3509 : nSrcPixelCount, dfVal, dfWeightSum);
3510 7100213 : const size_t nTempOffset =
3511 7100213 : static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
3512 7100213 : nDstXOff;
3513 7100213 : if (dfWeightSum > 0.0)
3514 : {
3515 7056846 : padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
3516 7056846 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
3517 : }
3518 : else
3519 : {
3520 43368 : padfHorizontalFiltered[nTempOffset] = 0.0;
3521 43368 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3522 : }
3523 : }
3524 : }
3525 : }
3526 :
3527 : /* ==================================================================== */
3528 : /* Second pass: vertical filter */
3529 : /* ==================================================================== */
3530 4641 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3531 :
3532 309474 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3533 : {
3534 304833 : Twork *const pafDstScanline =
3535 : pafWrkScanline
3536 304833 : ? pafWrkScanline
3537 8421 : : static_cast<Twork *>(pDstBuffer) +
3538 8421 : static_cast<size_t>(iDstLine - nDstYOff) * nDstXSize;
3539 :
3540 304833 : const double dfSrcLine =
3541 304833 : (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
3542 304833 : int nSrcLineStart =
3543 304833 : static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
3544 304833 : int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
3545 304833 : if (nSrcLineStart < nChunkYOff)
3546 2927 : nSrcLineStart = nChunkYOff;
3547 304833 : if (nSrcLineStop > nChunkBottomYOff)
3548 2971 : nSrcLineStop = nChunkBottomYOff;
3549 : #if 0
3550 : if( nSrcLineStart < nChunkYOff &&
3551 : nChunkYOff > 0 )
3552 : {
3553 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3554 : }
3555 : if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3556 : {
3557 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3558 : }
3559 : #endif
3560 304833 : const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
3561 304833 : double dfWeightSum = 0.0;
3562 :
3563 : // Compute convolution coefficients.
3564 304833 : int nSrcLine = nSrcLineStart; // Used after for.
3565 304833 : double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
3566 744873 : for (; nSrcLine < nSrcLineStop - 3;
3567 440040 : nSrcLine += 4, dfY += 4 * dfYScaleWeight)
3568 : {
3569 440038 : padfWeights[nSrcLine - nSrcLineStart] = dfY;
3570 440038 : padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
3571 440038 : padfWeights[nSrcLine + 2 - nSrcLineStart] =
3572 440038 : dfY + 2 * dfYScaleWeight;
3573 440038 : padfWeights[nSrcLine + 3 - nSrcLineStart] =
3574 440038 : dfY + 3 * dfYScaleWeight;
3575 440040 : dfWeightSum +=
3576 440038 : pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
3577 : }
3578 340967 : for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
3579 : {
3580 36142 : const double dfWeight = pfnFilterFunc(dfY);
3581 36132 : padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
3582 36132 : dfWeightSum += dfWeight;
3583 : }
3584 :
3585 304825 : if (pabyChunkNodataMask == nullptr)
3586 : {
3587 265832 : if (dfWeightSum != 0)
3588 : {
3589 265834 : const double dfInvWeightSum = 1.0 / dfWeightSum;
3590 1789495 : for (int i = 0; i < nSrcLineCount; ++i)
3591 1523661 : padfWeights[i] *= dfInvWeightSum;
3592 : }
3593 : }
3594 :
3595 304825 : if (pabyChunkNodataMask == nullptr)
3596 : {
3597 265831 : int iFilteredPixelOff = 0; // Used after for.
3598 : // j used after for.
3599 265831 : size_t j =
3600 265831 : (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
3601 : #ifdef USE_SSE2
3602 : if constexpr (eWrkDataType == GDT_Float32)
3603 : {
3604 : #ifdef __AVX__
3605 : for (; iFilteredPixelOff < nDstXSize - 15;
3606 : iFilteredPixelOff += 16, j += 16)
3607 : {
3608 : GDALResampleConvolutionVertical_16cols(
3609 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3610 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3611 : if (bHasNoData)
3612 : {
3613 : for (int k = 0; k < 16; k++)
3614 : {
3615 : pafDstScanline[iFilteredPixelOff + k] =
3616 : replaceValIfNodata(
3617 : pafDstScanline[iFilteredPixelOff + k]);
3618 : }
3619 : }
3620 : }
3621 : #else
3622 23017798 : for (; iFilteredPixelOff < nDstXSize - 7;
3623 : iFilteredPixelOff += 8, j += 8)
3624 : {
3625 22799608 : GDALResampleConvolutionVertical_8cols(
3626 22799608 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3627 22799608 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3628 22759178 : if (bHasNoData)
3629 : {
3630 123192 : for (int k = 0; k < 8; k++)
3631 : {
3632 109504 : pafDstScanline[iFilteredPixelOff + k] =
3633 109504 : replaceValIfNodata(
3634 109504 : pafDstScanline[iFilteredPixelOff + k]);
3635 : }
3636 : }
3637 : }
3638 : #endif
3639 :
3640 683426 : for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
3641 : {
3642 465258 : const Twork fVal =
3643 465241 : static_cast<Twork>(GDALResampleConvolutionVertical(
3644 465241 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3645 : nSrcLineCount));
3646 465239 : pafDstScanline[iFilteredPixelOff] =
3647 465258 : replaceValIfNodata(fVal);
3648 : }
3649 : }
3650 : else
3651 : #endif
3652 : {
3653 2887210 : for (; iFilteredPixelOff < nDstXSize - 1;
3654 : iFilteredPixelOff += 2, j += 2)
3655 : {
3656 2880000 : double dfVal1 = 0.0;
3657 2880000 : double dfVal2 = 0.0;
3658 2880000 : GDALResampleConvolutionVertical_2cols(
3659 2880000 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3660 : nSrcLineCount, dfVal1, dfVal2);
3661 5760010 : pafDstScanline[iFilteredPixelOff] =
3662 2880000 : replaceValIfNodata(static_cast<Twork>(dfVal1));
3663 2880000 : pafDstScanline[iFilteredPixelOff + 1] =
3664 2880000 : replaceValIfNodata(static_cast<Twork>(dfVal2));
3665 : }
3666 7206 : if (iFilteredPixelOff < nDstXSize)
3667 : {
3668 2 : const double dfVal = GDALResampleConvolutionVertical(
3669 2 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3670 : nSrcLineCount);
3671 2 : pafDstScanline[iFilteredPixelOff] =
3672 2 : replaceValIfNodata(static_cast<Twork>(dfVal));
3673 : }
3674 : }
3675 : }
3676 : else
3677 : {
3678 18979539 : for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
3679 : ++iFilteredPixelOff)
3680 : {
3681 18940633 : double dfVal = 0.0;
3682 18940633 : dfWeightSum = 0.0;
3683 18940633 : size_t j = (nSrcLineStart - nChunkYOff) *
3684 18940633 : static_cast<size_t>(nDstXSize) +
3685 18940633 : iFilteredPixelOff;
3686 18940633 : if (bKernelWithNegativeWeights)
3687 : {
3688 18700801 : int nConsecutiveValid = 0;
3689 18700801 : int nMaxConsecutiveValid = 0;
3690 133007321 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3691 : {
3692 114268020 : const double dfWeight =
3693 114268020 : padfWeights[i] *
3694 : pabyChunkNodataMaskHorizontalFiltered[j];
3695 114268020 : if (pabyChunkNodataMaskHorizontalFiltered[j])
3696 : {
3697 48650337 : nConsecutiveValid++;
3698 : }
3699 65617183 : else if (nConsecutiveValid)
3700 : {
3701 243325 : nMaxConsecutiveValid = std::max(
3702 204376 : nMaxConsecutiveValid, nConsecutiveValid);
3703 243325 : nConsecutiveValid = 0;
3704 : }
3705 114307020 : dfVal += padfHorizontalFiltered[j] * dfWeight;
3706 114307020 : dfWeightSum += dfWeight;
3707 : }
3708 18740901 : nMaxConsecutiveValid =
3709 18739801 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
3710 18740901 : if (nMaxConsecutiveValid < nSrcLineCount / 2)
3711 : {
3712 9246271 : pafDstScanline[iFilteredPixelOff] =
3713 9246179 : static_cast<Twork>(dfNoDataValue);
3714 9246271 : continue;
3715 : }
3716 : }
3717 : else
3718 : {
3719 1233322 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3720 : {
3721 993504 : const double dfWeight =
3722 993504 : padfWeights[i] *
3723 : pabyChunkNodataMaskHorizontalFiltered[j];
3724 993504 : dfVal += padfHorizontalFiltered[j] * dfWeight;
3725 993504 : dfWeightSum += dfWeight;
3726 : }
3727 : }
3728 9734482 : if (dfWeightSum > 0.0)
3729 : {
3730 9682091 : pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
3731 9722259 : static_cast<Twork>(dfVal / dfWeightSum));
3732 : }
3733 : else
3734 : {
3735 12205 : pafDstScanline[iFilteredPixelOff] =
3736 12181 : static_cast<Twork>(dfNoDataValue);
3737 : }
3738 : }
3739 : }
3740 :
3741 264321 : if (fMaxVal != 0.0f)
3742 : {
3743 192324 : for (int i = 0; i < nDstXSize; ++i)
3744 : {
3745 192088 : if (pafDstScanline[i] > fMaxVal)
3746 96022 : pafDstScanline[i] = fMaxVal;
3747 : }
3748 : }
3749 :
3750 264321 : if (pafWrkScanline)
3751 : {
3752 296414 : GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
3753 : static_cast<GByte *>(pDstBuffer) +
3754 296414 : static_cast<size_t>(iDstLine - nDstYOff) *
3755 296414 : nDstXSize * nDstDataTypeSize,
3756 : dstDataType, nDstDataTypeSize, nDstXSize);
3757 : }
3758 : }
3759 :
3760 4641 : VSIFree(pafWrkScanline);
3761 4641 : VSIFreeAligned(padfWeights);
3762 4641 : VSIFree(padfHorizontalFiltered);
3763 4641 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3764 :
3765 4641 : return CE_None;
3766 : }
3767 :
3768 : static CPLErr
3769 4640 : GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
3770 : const void *pChunk, void **ppDstBuffer,
3771 : GDALDataType *peDstBufferDataType)
3772 : {
3773 : GDALResampleAlg eResample;
3774 4640 : bool bKernelWithNegativeWeights = false;
3775 4640 : if (EQUAL(args.pszResampling, "BILINEAR"))
3776 2628 : eResample = GRA_Bilinear;
3777 2012 : else if (EQUAL(args.pszResampling, "CUBIC"))
3778 : {
3779 1935 : eResample = GRA_Cubic;
3780 1935 : bKernelWithNegativeWeights = true;
3781 : }
3782 77 : else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
3783 23 : eResample = GRA_CubicSpline;
3784 54 : else if (EQUAL(args.pszResampling, "LANCZOS"))
3785 : {
3786 54 : eResample = GRA_Lanczos;
3787 54 : bKernelWithNegativeWeights = true;
3788 : }
3789 : else
3790 : {
3791 0 : CPLAssert(false);
3792 : return CE_Failure;
3793 : }
3794 4640 : const int nKernelRadius = GWKGetFilterRadius(eResample);
3795 4636 : FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
3796 : const FilterFunc4ValuesType pfnFilterFunc4Values =
3797 4638 : GWKGetFilterFunc4Values(eResample);
3798 :
3799 4638 : float fMaxVal = 0.f;
3800 : // Cubic, etc... can have overshoots, so make sure we clamp values to the
3801 : // maximum value if NBITS is set.
3802 4638 : if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
3803 8 : (args.eOvrDataType == GDT_Byte || args.eOvrDataType == GDT_UInt16 ||
3804 0 : args.eOvrDataType == GDT_UInt32))
3805 : {
3806 8 : int nBits = args.nOvrNBITS;
3807 8 : if (nBits == GDALGetDataTypeSizeBits(args.eOvrDataType))
3808 1 : nBits = 0;
3809 8 : if (nBits > 0 && nBits < 32)
3810 7 : fMaxVal = static_cast<float>((1U << nBits) - 1);
3811 : }
3812 :
3813 4638 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
3814 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
3815 : GDALGetDataTypeSizeBytes(args.eOvrDataType));
3816 4640 : if (*ppDstBuffer == nullptr)
3817 : {
3818 0 : return CE_Failure;
3819 : }
3820 4640 : *peDstBufferDataType = args.eOvrDataType;
3821 :
3822 4640 : switch (args.eWrkDataType)
3823 : {
3824 3903 : case GDT_Byte:
3825 : {
3826 3903 : return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32>(
3827 : args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
3828 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3829 3903 : bKernelWithNegativeWeights, fMaxVal);
3830 : }
3831 :
3832 395 : case GDT_UInt16:
3833 : {
3834 395 : return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32>(
3835 : args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
3836 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3837 396 : bKernelWithNegativeWeights, fMaxVal);
3838 : }
3839 :
3840 313 : case GDT_Float32:
3841 : {
3842 313 : return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32>(
3843 : args, static_cast<const float *>(pChunk), *ppDstBuffer,
3844 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3845 313 : bKernelWithNegativeWeights, fMaxVal);
3846 : }
3847 :
3848 29 : case GDT_Float64:
3849 : {
3850 29 : return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64>(
3851 : args, static_cast<const double *>(pChunk), *ppDstBuffer,
3852 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3853 29 : bKernelWithNegativeWeights, fMaxVal);
3854 : }
3855 :
3856 0 : default:
3857 0 : break;
3858 : }
3859 :
3860 0 : CPLAssert(false);
3861 : return CE_Failure;
3862 : }
3863 :
3864 : /************************************************************************/
3865 : /* GDALResampleChunkC32R() */
3866 : /************************************************************************/
3867 :
3868 2 : static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
3869 : const float *pafChunk, const int nChunkYOff,
3870 : const int nChunkYSize, const int nDstYOff,
3871 : const int nDstYOff2, const int nOvrXSize,
3872 : const int nOvrYSize, void **ppDstBuffer,
3873 : GDALDataType *peDstBufferDataType,
3874 : const char *pszResampling)
3875 :
3876 : {
3877 : enum Method
3878 : {
3879 : NEAR,
3880 : AVERAGE,
3881 : AVERAGE_MAGPHASE,
3882 : RMS,
3883 : };
3884 :
3885 2 : Method eMethod = NEAR;
3886 2 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
3887 : {
3888 0 : eMethod = NEAR;
3889 : }
3890 2 : else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
3891 : {
3892 0 : eMethod = AVERAGE_MAGPHASE;
3893 : }
3894 2 : else if (EQUAL(pszResampling, "RMS"))
3895 : {
3896 2 : eMethod = RMS;
3897 : }
3898 0 : else if (STARTS_WITH_CI(pszResampling, "AVER"))
3899 : {
3900 0 : eMethod = AVERAGE;
3901 : }
3902 : else
3903 : {
3904 0 : CPLError(
3905 : CE_Failure, CPLE_NotSupported,
3906 : "Resampling method %s is not supported for complex data types. "
3907 : "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
3908 : pszResampling);
3909 0 : return CE_Failure;
3910 : }
3911 :
3912 2 : const int nOXSize = nOvrXSize;
3913 2 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
3914 : GDALGetDataTypeSizeBytes(GDT_CFloat32));
3915 2 : if (*ppDstBuffer == nullptr)
3916 : {
3917 0 : return CE_Failure;
3918 : }
3919 2 : float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
3920 2 : *peDstBufferDataType = GDT_CFloat32;
3921 :
3922 2 : const int nOYSize = nOvrYSize;
3923 2 : const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
3924 2 : const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
3925 :
3926 : /* ==================================================================== */
3927 : /* Loop over destination scanlines. */
3928 : /* ==================================================================== */
3929 8 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3930 : {
3931 6 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
3932 6 : if (nSrcYOff < nChunkYOff)
3933 0 : nSrcYOff = nChunkYOff;
3934 :
3935 6 : int nSrcYOff2 =
3936 6 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
3937 6 : if (nSrcYOff2 == nSrcYOff)
3938 0 : nSrcYOff2++;
3939 :
3940 6 : if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
3941 : {
3942 2 : if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
3943 0 : nSrcYOff = nSrcHeight - 1;
3944 2 : nSrcYOff2 = nSrcHeight;
3945 : }
3946 6 : if (nSrcYOff2 > nChunkYOff + nChunkYSize)
3947 0 : nSrcYOff2 = nChunkYOff + nChunkYSize;
3948 :
3949 6 : const float *const pafSrcScanline =
3950 6 : pafChunk +
3951 6 : (static_cast<size_t>(nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
3952 6 : float *const pafDstScanline =
3953 6 : pafDstBuffer +
3954 6 : static_cast<size_t>(iDstLine - nDstYOff) * 2 * nOXSize;
3955 :
3956 : /* --------------------------------------------------------------------
3957 : */
3958 : /* Loop over destination pixels */
3959 : /* --------------------------------------------------------------------
3960 : */
3961 18 : for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
3962 : {
3963 12 : const size_t iDstPixelSZ = static_cast<size_t>(iDstPixel);
3964 12 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
3965 12 : int nSrcXOff2 =
3966 12 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
3967 12 : if (nSrcXOff2 == nSrcXOff)
3968 0 : nSrcXOff2++;
3969 12 : if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
3970 : {
3971 6 : if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
3972 0 : nSrcXOff = nSrcWidth - 1;
3973 6 : nSrcXOff2 = nSrcWidth;
3974 : }
3975 12 : const size_t nSrcXOffSZ = static_cast<size_t>(nSrcXOff);
3976 :
3977 12 : if (eMethod == NEAR)
3978 : {
3979 0 : pafDstScanline[iDstPixelSZ * 2] =
3980 0 : pafSrcScanline[nSrcXOffSZ * 2];
3981 0 : pafDstScanline[iDstPixelSZ * 2 + 1] =
3982 0 : pafSrcScanline[nSrcXOffSZ * 2 + 1];
3983 : }
3984 12 : else if (eMethod == AVERAGE_MAGPHASE)
3985 : {
3986 0 : double dfTotalR = 0.0;
3987 0 : double dfTotalI = 0.0;
3988 0 : double dfTotalM = 0.0;
3989 0 : size_t nCount = 0;
3990 :
3991 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
3992 : {
3993 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
3994 : {
3995 0 : const double dfR =
3996 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
3997 0 : static_cast<size_t>(iY - nSrcYOff) *
3998 0 : nSrcWidth * 2];
3999 0 : const double dfI =
4000 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4001 0 : static_cast<size_t>(iY - nSrcYOff) *
4002 0 : nSrcWidth * 2 +
4003 0 : 1];
4004 0 : dfTotalR += dfR;
4005 0 : dfTotalI += dfI;
4006 0 : dfTotalM += std::hypot(dfR, dfI);
4007 0 : ++nCount;
4008 : }
4009 : }
4010 :
4011 0 : CPLAssert(nCount > 0);
4012 0 : if (nCount == 0)
4013 : {
4014 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4015 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4016 : }
4017 : else
4018 : {
4019 0 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4020 0 : dfTotalR / static_cast<double>(nCount));
4021 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4022 0 : dfTotalI / static_cast<double>(nCount));
4023 : const double dfM =
4024 0 : std::hypot(pafDstScanline[iDstPixelSZ * 2],
4025 0 : pafDstScanline[iDstPixelSZ * 2 + 1]);
4026 0 : const double dfDesiredM =
4027 0 : dfTotalM / static_cast<double>(nCount);
4028 0 : double dfRatio = 1.0;
4029 0 : if (dfM != 0.0)
4030 0 : dfRatio = dfDesiredM / dfM;
4031 :
4032 0 : pafDstScanline[iDstPixelSZ * 2] *=
4033 0 : static_cast<float>(dfRatio);
4034 0 : pafDstScanline[iDstPixelSZ * 2 + 1] *=
4035 0 : static_cast<float>(dfRatio);
4036 : }
4037 : }
4038 12 : else if (eMethod == RMS)
4039 : {
4040 12 : double dfTotalR = 0.0;
4041 12 : double dfTotalI = 0.0;
4042 12 : size_t nCount = 0;
4043 :
4044 36 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4045 : {
4046 72 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4047 : {
4048 48 : const double dfR =
4049 48 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4050 48 : static_cast<size_t>(iY - nSrcYOff) *
4051 48 : nSrcWidth * 2];
4052 48 : const double dfI =
4053 48 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4054 48 : static_cast<size_t>(iY - nSrcYOff) *
4055 48 : nSrcWidth * 2 +
4056 48 : 1];
4057 :
4058 48 : dfTotalR += SQUARE(dfR);
4059 48 : dfTotalI += SQUARE(dfI);
4060 :
4061 48 : ++nCount;
4062 : }
4063 : }
4064 :
4065 12 : CPLAssert(nCount > 0);
4066 12 : if (nCount == 0)
4067 : {
4068 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4069 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4070 : }
4071 : else
4072 : {
4073 : /* compute RMS */
4074 12 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4075 12 : sqrt(dfTotalR / static_cast<double>(nCount)));
4076 12 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4077 12 : sqrt(dfTotalI / static_cast<double>(nCount)));
4078 : }
4079 : }
4080 0 : else if (eMethod == AVERAGE)
4081 : {
4082 0 : double dfTotalR = 0.0;
4083 0 : double dfTotalI = 0.0;
4084 0 : size_t nCount = 0;
4085 :
4086 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4087 : {
4088 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4089 : {
4090 : // TODO(schwehr): Maybe use std::complex?
4091 0 : dfTotalR +=
4092 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4093 0 : static_cast<size_t>(iY - nSrcYOff) *
4094 0 : nSrcWidth * 2];
4095 0 : dfTotalI +=
4096 0 : pafSrcScanline[static_cast<size_t>(iX) * 2 +
4097 0 : static_cast<size_t>(iY - nSrcYOff) *
4098 0 : nSrcWidth * 2 +
4099 0 : 1];
4100 0 : ++nCount;
4101 : }
4102 : }
4103 :
4104 0 : CPLAssert(nCount > 0);
4105 0 : if (nCount == 0)
4106 : {
4107 0 : pafDstScanline[iDstPixelSZ * 2] = 0.0;
4108 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4109 : }
4110 : else
4111 : {
4112 0 : pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4113 0 : dfTotalR / static_cast<double>(nCount));
4114 0 : pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4115 0 : dfTotalI / static_cast<double>(nCount));
4116 : }
4117 : }
4118 : }
4119 : }
4120 :
4121 2 : return CE_None;
4122 : }
4123 :
4124 : /************************************************************************/
4125 : /* GDALRegenerateCascadingOverviews() */
4126 : /* */
4127 : /* Generate a list of overviews in order from largest to */
4128 : /* smallest, computing each from the next larger. */
4129 : /************************************************************************/
4130 :
4131 44 : static CPLErr GDALRegenerateCascadingOverviews(
4132 : GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
4133 : const char *pszResampling, GDALProgressFunc pfnProgress,
4134 : void *pProgressData, CSLConstList papszOptions)
4135 :
4136 : {
4137 : /* -------------------------------------------------------------------- */
4138 : /* First, we must put the overviews in order from largest to */
4139 : /* smallest. */
4140 : /* -------------------------------------------------------------------- */
4141 127 : for (int i = 0; i < nOverviews - 1; ++i)
4142 : {
4143 292 : for (int j = 0; j < nOverviews - i - 1; ++j)
4144 : {
4145 209 : if (papoOvrBands[j]->GetXSize() *
4146 209 : static_cast<float>(papoOvrBands[j]->GetYSize()) <
4147 209 : papoOvrBands[j + 1]->GetXSize() *
4148 209 : static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
4149 : {
4150 0 : GDALRasterBand *poTempBand = papoOvrBands[j];
4151 0 : papoOvrBands[j] = papoOvrBands[j + 1];
4152 0 : papoOvrBands[j + 1] = poTempBand;
4153 : }
4154 : }
4155 : }
4156 :
4157 : /* -------------------------------------------------------------------- */
4158 : /* Count total pixels so we can prepare appropriate scaled */
4159 : /* progress functions. */
4160 : /* -------------------------------------------------------------------- */
4161 44 : double dfTotalPixels = 0.0;
4162 :
4163 171 : for (int i = 0; i < nOverviews; ++i)
4164 : {
4165 127 : dfTotalPixels += papoOvrBands[i]->GetXSize() *
4166 127 : static_cast<double>(papoOvrBands[i]->GetYSize());
4167 : }
4168 :
4169 : /* -------------------------------------------------------------------- */
4170 : /* Generate all the bands. */
4171 : /* -------------------------------------------------------------------- */
4172 44 : double dfPixelsProcessed = 0.0;
4173 :
4174 171 : for (int i = 0; i < nOverviews; ++i)
4175 : {
4176 127 : GDALRasterBand *poBaseBand = poSrcBand;
4177 127 : if (i != 0)
4178 83 : poBaseBand = papoOvrBands[i - 1];
4179 :
4180 127 : double dfPixels = papoOvrBands[i]->GetXSize() *
4181 127 : static_cast<double>(papoOvrBands[i]->GetYSize());
4182 :
4183 254 : void *pScaledProgressData = GDALCreateScaledProgress(
4184 : dfPixelsProcessed / dfTotalPixels,
4185 127 : (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
4186 : pProgressData);
4187 :
4188 254 : const CPLErr eErr = GDALRegenerateOverviewsEx(
4189 : poBaseBand, 1,
4190 127 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
4191 : pszResampling, GDALScaledProgress, pScaledProgressData,
4192 : papszOptions);
4193 127 : GDALDestroyScaledProgress(pScaledProgressData);
4194 :
4195 127 : if (eErr != CE_None)
4196 0 : return eErr;
4197 :
4198 127 : dfPixelsProcessed += dfPixels;
4199 :
4200 : // Only do the bit2grayscale promotion on the base band.
4201 127 : if (STARTS_WITH_CI(pszResampling,
4202 : "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
4203 8 : pszResampling = "AVERAGE";
4204 : }
4205 :
4206 44 : return CE_None;
4207 : }
4208 :
4209 : /************************************************************************/
4210 : /* GDALGetResampleFunction() */
4211 : /************************************************************************/
4212 :
4213 5028 : GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
4214 : int *pnRadius)
4215 : {
4216 5028 : if (pnRadius)
4217 5028 : *pnRadius = 0;
4218 5028 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
4219 500 : return GDALResampleChunk_Near;
4220 4528 : else if (STARTS_WITH_CI(pszResampling, "AVER") ||
4221 3988 : EQUAL(pszResampling, "RMS"))
4222 565 : return GDALResampleChunk_AverageOrRMS;
4223 3963 : else if (EQUAL(pszResampling, "GAUSS"))
4224 : {
4225 26 : if (pnRadius)
4226 26 : *pnRadius = 1;
4227 26 : return GDALResampleChunk_Gauss;
4228 : }
4229 3937 : else if (EQUAL(pszResampling, "MODE"))
4230 96 : return GDALResampleChunk_Mode;
4231 3841 : else if (EQUAL(pszResampling, "CUBIC"))
4232 : {
4233 1432 : if (pnRadius)
4234 1432 : *pnRadius = GWKGetFilterRadius(GRA_Cubic);
4235 1431 : return GDALResampleChunk_Convolution;
4236 : }
4237 2409 : else if (EQUAL(pszResampling, "CUBICSPLINE"))
4238 : {
4239 3 : if (pnRadius)
4240 3 : *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
4241 3 : return GDALResampleChunk_Convolution;
4242 : }
4243 2406 : else if (EQUAL(pszResampling, "LANCZOS"))
4244 : {
4245 8 : if (pnRadius)
4246 8 : *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
4247 8 : return GDALResampleChunk_Convolution;
4248 : }
4249 2398 : else if (EQUAL(pszResampling, "BILINEAR"))
4250 : {
4251 2398 : if (pnRadius)
4252 2398 : *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
4253 2398 : return GDALResampleChunk_Convolution;
4254 : }
4255 : else
4256 : {
4257 0 : CPLError(
4258 : CE_Failure, CPLE_AppDefined,
4259 : "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
4260 : pszResampling);
4261 0 : return nullptr;
4262 : }
4263 : }
4264 :
4265 : /************************************************************************/
4266 : /* GDALGetOvrWorkDataType() */
4267 : /************************************************************************/
4268 :
4269 4910 : GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
4270 : GDALDataType eSrcDataType)
4271 : {
4272 4910 : if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
4273 : {
4274 591 : return eSrcDataType;
4275 : }
4276 4319 : else if (eSrcDataType == GDT_Byte &&
4277 3985 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4278 3507 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4279 2274 : EQUAL(pszResampling, "CUBICSPLINE") ||
4280 2271 : EQUAL(pszResampling, "LANCZOS") ||
4281 2266 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4282 : {
4283 3977 : return GDT_Byte;
4284 : }
4285 342 : else if (eSrcDataType == GDT_UInt16 &&
4286 122 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4287 113 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4288 3 : EQUAL(pszResampling, "CUBICSPLINE") ||
4289 3 : EQUAL(pszResampling, "LANCZOS") ||
4290 2 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4291 : {
4292 119 : return GDT_UInt16;
4293 : }
4294 223 : else if (EQUAL(pszResampling, "GAUSS"))
4295 20 : return GDT_Float64;
4296 :
4297 203 : if (eSrcDataType == GDT_Byte || eSrcDataType == GDT_Int8 ||
4298 204 : eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
4299 : eSrcDataType == GDT_Float32)
4300 : {
4301 161 : return GDT_Float32;
4302 : }
4303 42 : return GDT_Float64;
4304 : }
4305 :
4306 : namespace
4307 : {
4308 : // Structure to hold a pointer to free with CPLFree()
4309 : struct PointerHolder
4310 : {
4311 : void *ptr = nullptr;
4312 :
4313 5792 : explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
4314 : {
4315 5792 : }
4316 :
4317 5792 : ~PointerHolder()
4318 5792 : {
4319 5792 : CPLFree(ptr);
4320 5792 : }
4321 :
4322 : PointerHolder(const PointerHolder &) = delete;
4323 : PointerHolder &operator=(const PointerHolder &) = delete;
4324 : };
4325 : } // namespace
4326 :
4327 : /************************************************************************/
4328 : /* GDALRegenerateOverviews() */
4329 : /************************************************************************/
4330 :
4331 : /**
4332 : * \brief Generate downsampled overviews.
4333 : *
4334 : * This function will generate one or more overview images from a base image
4335 : * using the requested downsampling algorithm. Its primary use is for
4336 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4337 : * used to generate downsampled images in one file from another outside the
4338 : * overview architecture.
4339 : *
4340 : * The output bands need to exist in advance.
4341 : *
4342 : * The full set of resampling algorithms is documented in
4343 : * GDALDataset::BuildOverviews().
4344 : *
4345 : * This function will honour properly NODATA_VALUES tuples (special dataset
4346 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4347 : * considered as the nodata value and not each value of the triplet
4348 : * independently per band.
4349 : *
4350 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4351 : * to "ALL_CPUS" or a integer value to specify the number of threads to use for
4352 : * overview computation.
4353 : *
4354 : * @param hSrcBand the source (base level) band.
4355 : * @param nOverviewCount the number of downsampled bands being generated.
4356 : * @param pahOvrBands the list of downsampled bands to be generated.
4357 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4358 : * @param pfnProgress progress report function.
4359 : * @param pProgressData progress function callback data.
4360 : * @return CE_None on success or CE_Failure on failure.
4361 : */
4362 250 : CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
4363 : GDALRasterBandH *pahOvrBands,
4364 : const char *pszResampling,
4365 : GDALProgressFunc pfnProgress,
4366 : void *pProgressData)
4367 :
4368 : {
4369 250 : return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
4370 : pszResampling, pfnProgress, pProgressData,
4371 250 : nullptr);
4372 : }
4373 :
4374 : /************************************************************************/
4375 : /* GDALRegenerateOverviewsEx() */
4376 : /************************************************************************/
4377 :
// Multiplier turning a kernel radius into the matching diameter. It is used
// below to size the extra source lines read around a chunk.
4378 : constexpr int RADIUS_TO_DIAMETER = 2;
4379 :
4380 : /**
4381 : * \brief Generate downsampled overviews.
4382 : *
4383 : * This function will generate one or more overview images from a base image
4384 : * using the requested downsampling algorithm. Its primary use is for
4385 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4386 : * used to generate downsampled images in one file from another outside the
4387 : * overview architecture.
4388 : *
4389 : * The output bands need to exist in advance.
4390 : *
4391 : * The full set of resampling algorithms is documented in
4392 : * GDALDataset::BuildOverviews().
4393 : *
4394 : * This function will honour properly NODATA_VALUES tuples (special dataset
4395 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4396 : * considered as the nodata value and not each value of the triplet
4397 : * independently per band.
4398 : *
4399 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4400 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4401 : * overview computation.
4402 : *
4403 : * @param hSrcBand the source (base level) band.
4404 : * @param nOverviewCount the number of downsampled bands being generated.
4405 : * @param pahOvrBands the list of downsampled bands to be generated.
4406 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4407 : * @param pfnProgress progress report function.
4408 : * @param pProgressData progress function callback data.
4409 : * @param papszOptions NULL terminated list of options as key=value pairs, or
4410 : * NULL. Within this function they are only forwarded to
 * GDALRegenerateCascadingOverviews() and GDALRegenerateOverviewsMultiBand()
 * when one of those code paths is taken.
4411 : * @return CE_None on success or CE_Failure on failure. CE_None is returned
 * immediately, without doing anything, when pszResampling is "NONE".
4412 : * @since GDAL 3.6
4413 : */
4414 887 : CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
4415 : GDALRasterBandH *pahOvrBands,
4416 : const char *pszResampling,
4417 : GDALProgressFunc pfnProgress,
4418 : void *pProgressData, CSLConstList papszOptions)
4419 :
4420 : {
4421 887 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
4422 887 : GDALRasterBand **papoOvrBands =
4423 : reinterpret_cast<GDALRasterBand **>(pahOvrBands);
4424 :
// A missing progress callback is replaced by the no-op one so that it can be
// called unconditionally below.
4425 887 : if (pfnProgress == nullptr)
4426 252 : pfnProgress = GDALDummyProgress;
4427 :
// "NONE": nothing to compute, report success right away.
4428 887 : if (EQUAL(pszResampling, "NONE"))
4429 49 : return CE_None;
4430 :
4431 838 : int nKernelRadius = 0;
4432 : GDALResampleFunction pfnResampleFn =
4433 838 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
4434 :
4435 838 : if (pfnResampleFn == nullptr)
4436 0 : return CE_Failure;
4437 :
4438 : /* -------------------------------------------------------------------- */
4439 : /* Check color tables... */
4440 : /* -------------------------------------------------------------------- */
// poColorTable stays null unless the source band is a palette index band with
// a non-identity RGB palette and the method is AVER*, RMS, MODE or GAUSS.
4441 838 : GDALColorTable *poColorTable = nullptr;
4442 :
4443 471 : if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
4444 1750 : EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
4445 452 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4446 : {
4447 9 : poColorTable = poSrcBand->GetColorTable();
4448 9 : if (poColorTable != nullptr)
4449 : {
4450 9 : if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
4451 : {
4452 0 : CPLError(CE_Warning, CPLE_AppDefined,
4453 : "Computing overviews on palette index raster bands "
4454 : "with a palette whose color interpretation is not RGB "
4455 : "will probably lead to unexpected results.");
4456 0 : poColorTable = nullptr;
4457 : }
4458 9 : else if (poColorTable->IsIdentity())
4459 : {
4460 0 : poColorTable = nullptr;
4461 : }
4462 : }
4463 : else
4464 : {
4465 0 : CPLError(CE_Warning, CPLE_AppDefined,
4466 : "Computing overviews on palette index raster bands "
4467 : "without a palette will probably lead to unexpected "
4468 : "results.");
4469 : }
4470 : }
4471 : // Not ready yet
4472 2433 : else if ((EQUAL(pszResampling, "CUBIC") ||
4473 775 : EQUAL(pszResampling, "CUBICSPLINE") ||
4474 775 : EQUAL(pszResampling, "LANCZOS") ||
4475 1684 : EQUAL(pszResampling, "BILINEAR")) &&
4476 80 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4477 : {
4478 0 : CPLError(CE_Warning, CPLE_AppDefined,
4479 : "Computing %s overviews on palette index raster bands "
4480 : "will probably lead to unexpected results.",
4481 : pszResampling);
4482 : }
4483 :
4484 : // If we have a nodata mask and we are doing something more complicated
4485 : // than nearest neighbouring, we have to fetch the nodata mask.
4486 :
4487 838 : GDALRasterBand *poMaskBand = nullptr;
4488 838 : bool bUseNoDataMask = false;
4489 838 : bool bCanUseCascaded = true;
4490 :
4491 838 : if (!STARTS_WITH_CI(pszResampling, "NEAR"))
4492 : {
4493 : // Special case if we are an alpha/mask band. We want it to be
4494 : // considered as the mask band to avoid alpha=0 to be taken into account
4495 : // in average computation.
4496 532 : if (poSrcBand->IsMaskBand())
4497 : {
4498 91 : poMaskBand = poSrcBand;
4499 91 : bUseNoDataMask = true;
4500 : }
4501 : else
4502 : {
4503 441 : poMaskBand = poSrcBand->GetMaskBand();
4504 441 : const int nMaskFlags = poSrcBand->GetMaskFlags();
// Cascading is only allowed when the mask flags are exactly GMF_NODATA or
// GMF_ALL_VALID.
4505 441 : bCanUseCascaded =
4506 441 : (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
4507 441 : bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
4508 : }
4509 : }
4510 :
4511 : /* -------------------------------------------------------------------- */
4512 : /* If we are operating on multiple overviews, and using */
4513 : /* averaging, lets do them in cascading order to reduce the */
4514 : /* amount of computation. */
4515 : /* -------------------------------------------------------------------- */
4516 :
4517 : // In case the mask may be computed from another band of the dataset,
4518 : // we can't use cascaded generation, as the computation of the overviews
4519 : // of the band used for the mask band may not have yet occurred (#3033).
4520 838 : if ((STARTS_WITH_CI(pszResampling, "AVER") ||
4521 471 : EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
4522 440 : EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
4523 386 : EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
4524 838 : EQUAL(pszResampling, "MODE")) &&
4525 44 : nOverviewCount > 1 && bCanUseCascaded)
4526 44 : return GDALRegenerateCascadingOverviews(
4527 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
4528 44 : pProgressData, papszOptions);
4529 :
4530 : /* -------------------------------------------------------------------- */
4531 : /* Setup one horizontal swath to read from the raw buffer. */
4532 : /* -------------------------------------------------------------------- */
4533 794 : int nFRXBlockSize = 0;
4534 794 : int nFRYBlockSize = 0;
4535 794 : poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
4536 :
4537 794 : const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
// Complex source data goes through GDALResampleChunkC32R() with a
// GDT_CFloat32 working buffer, except for NEAR* and MODE which use the
// generic resampling function.
4538 1282 : const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
4539 1236 : EQUAL(pszResampling, "MODE") ||
4540 442 : !GDALDataTypeIsComplex(eSrcDataType);
4541 : const GDALDataType eWrkDataType =
4542 : bUseGenericResampleFn
4543 794 : ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
4544 794 : : GDT_CFloat32;
4545 :
4546 794 : const int nWidth = poSrcBand->GetXSize();
4547 794 : const int nHeight = poSrcBand->GetYSize();
4548 :
// Largest (rounded) source/overview size ratio over all requested overviews,
// in either dimension.
4549 794 : int nMaxOvrFactor = 1;
4550 1705 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
4551 : {
4552 911 : const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
4553 911 : const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
4554 911 : nMaxOvrFactor = std::max(
4555 : nMaxOvrFactor,
4556 911 : static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
4557 911 : nMaxOvrFactor = std::max(
4558 : nMaxOvrFactor,
4559 911 : static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
4560 : }
4561 :
4562 794 : int nFullResYChunk = nFRYBlockSize;
4563 794 : int nMaxChunkYSizeQueried = 0;
4564 :
// Raises nFullResYChunk to at least RADIUS_TO_DIAMETER * nMaxOvrFactor,
// updates nMaxChunkYSizeQueried (chunk height plus the kernel margin) and
// returns the size in bytes of the corresponding working buffer.
// GINTBIG_MAX is returned when one of the intermediate computations would
// overflow; the early overflow exits leave nMaxChunkYSizeQueried untouched.
4565 : const auto UpdateChunkHeightAndGetChunkSize =
4566 10354 : [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
4567 83793 : eWrkDataType, nWidth]()
4568 : {
4569 : // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
4570 : // + nFullResYChunk) / nMaxOvrFactor)
4571 10354 : if (nMaxOvrFactor > INT_MAX / RADIUS_TO_DIAMETER)
4572 : {
4573 1 : return GINTBIG_MAX;
4574 : }
4575 10353 : nFullResYChunk =
4576 10353 : std::max(nFullResYChunk, RADIUS_TO_DIAMETER * nMaxOvrFactor);
4577 10353 : if ((nKernelRadius > 0 &&
4578 970 : nMaxOvrFactor > INT_MAX / (RADIUS_TO_DIAMETER * nKernelRadius)) ||
4579 10353 : nFullResYChunk >
4580 10353 : INT_MAX - RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor)
4581 : {
4582 0 : return GINTBIG_MAX;
4583 : }
4584 10353 : nMaxChunkYSizeQueried =
4585 10353 : nFullResYChunk + RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor;
4586 10353 : if (GDALGetDataTypeSizeBytes(eWrkDataType) >
4587 10353 : std::numeric_limits<int64_t>::max() /
4588 10353 : (static_cast<int64_t>(nMaxChunkYSizeQueried) * nWidth))
4589 : {
4590 1 : return GINTBIG_MAX;
4591 : }
4592 10352 : return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
4593 10352 : nMaxChunkYSizeQueried * nWidth;
4594 794 : };
4595 :
4596 : const char *pszChunkYSize =
4597 794 : CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
4598 : #ifndef __COVERITY__
4599 : // Only configurable for debug / testing
4600 794 : if (pszChunkYSize)
4601 : {
4602 0 : nFullResYChunk = atoi(pszChunkYSize);
4603 : }
4604 : #endif
4605 :
4606 : // Only configurable for debug / testing
4607 : const int nChunkMaxSize =
4608 794 : atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
4609 :
4610 794 : auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
// The working buffer would exceed the maximum chunk size: switch to another
// strategy when one is applicable, otherwise carry on with this one.
4611 794 : if (nChunkSize > nChunkMaxSize)
4612 : {
4613 15 : if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
4614 44 : !GDALDataTypeIsComplex(eSrcDataType) &&
4615 14 : (!STARTS_WITH_CI(pszResampling, "AVER") ||
4616 2 : EQUAL(pszResampling, "AVERAGE")))
4617 : {
4618 : // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
4619 : // which uses a block based strategy, which is much less memory
4620 : // hungry.
4621 14 : return GDALRegenerateOverviewsMultiBand(
4622 : 1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
4623 14 : pfnProgress, pProgressData, papszOptions);
4624 : }
4625 1 : else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
4626 : {
4627 0 : return GDALRegenerateCascadingOverviews(
4628 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
4629 0 : pfnProgress, pProgressData, papszOptions);
4630 : }
4631 : }
// The chunk height is only grown automatically when GDAL_OVR_CHUNKYSIZE has
// not been set.
4632 779 : else if (pszChunkYSize == nullptr)
4633 : {
4634 : // Try to get as close as possible to nChunkMaxSize
4635 10339 : while (nChunkSize < nChunkMaxSize / 2)
4636 : {
4637 9560 : nFullResYChunk *= 2;
4638 9560 : nChunkSize = UpdateChunkHeightAndGetChunkSize();
4639 : }
4640 : }
4641 :
4642 780 : int nHasNoData = 0;
4643 780 : const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
4644 780 : const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
4645 : const bool bPropagateNoData =
4646 780 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
4647 :
4648 : // Structure describing a resampling job
4649 : struct OvrJob
4650 : {
4651 : // Buffers to free when job is finished
4652 : std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
4653 : std::shared_ptr<PointerHolder> oSrcBufferHolder{};
4654 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
4655 :
4656 : GDALRasterBand *poDstBand = nullptr;
4657 :
4658 : // Input parameters of pfnResampleFn
4659 : GDALResampleFunction pfnResampleFn = nullptr;
4660 : int nSrcWidth = 0;
4661 : int nSrcHeight = 0;
4662 : int nDstWidth = 0;
4663 : GDALOverviewResampleArgs args{};
4664 : const void *pChunk = nullptr;
4665 : bool bUseGenericResampleFn = false;
4666 :
4667 : // Output values of resampling function
4668 : CPLErr eErr = CE_Failure;
4669 : void *pDstBuffer = nullptr;
4670 : GDALDataType eDstBufferDataType = GDT_Unknown;
4671 :
4672 0 : void SetSrcMaskBufferHolder(
4673 : const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
4674 : {
4675 0 : oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
4676 0 : }
4677 :
4678 0 : void SetSrcBufferHolder(
4679 : const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
4680 : {
4681 0 : oSrcBufferHolder = oSrcBufferHolderIn;
4682 0 : }
4683 :
// Marks the job as finished and wakes up a thread blocked in WaitFinished().
4684 880 : void NotifyFinished()
4685 : {
4686 1760 : std::lock_guard guard(mutex);
4687 880 : bFinished = true;
4688 880 : cv.notify_one();
4689 880 : }
4690 :
// Non-blocking query of the finished flag.
4691 0 : bool IsFinished()
4692 : {
4693 0 : std::lock_guard guard(mutex);
4694 0 : return bFinished;
4695 : }
4696 :
// Blocks until NotifyFinished() has been called.
4697 0 : void WaitFinished()
4698 : {
4699 0 : std::unique_lock oGuard(mutex);
4700 0 : while (!bFinished)
4701 : {
4702 0 : cv.wait(oGuard);
4703 : }
4704 0 : }
4705 :
4706 : private:
4707 : // Synchronization
4708 : bool bFinished = false;
4709 : std::mutex mutex{};
4710 : std::condition_variable cv{};
4711 : };
4712 :
4713 : // Thread function to resample
// pData is an OvrJob*. The resampled buffer is wrapped in oDstBufferHolder so
// that it is released together with the job.
4714 880 : const auto JobResampleFunc = [](void *pData)
4715 : {
4716 880 : OvrJob *poJob = static_cast<OvrJob *>(pData);
4717 :
4718 880 : if (poJob->bUseGenericResampleFn)
4719 : {
4720 878 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
4721 : &(poJob->pDstBuffer),
4722 : &(poJob->eDstBufferDataType));
4723 : }
4724 : else
4725 : {
4726 2 : poJob->eErr = GDALResampleChunkC32R(
4727 : poJob->nSrcWidth, poJob->nSrcHeight,
4728 2 : static_cast<const float *>(poJob->pChunk),
4729 : poJob->args.nChunkYOff, poJob->args.nChunkYSize,
4730 : poJob->args.nDstYOff, poJob->args.nDstYOff2,
4731 : poJob->args.nOvrXSize, poJob->args.nOvrYSize,
4732 : &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
4733 : poJob->args.pszResampling);
4734 : }
4735 :
4736 : poJob->oDstBufferHolder =
4737 880 : std::make_unique<PointerHolder>(poJob->pDstBuffer);
4738 :
4739 880 : poJob->NotifyFinished();
4740 880 : };
4741 :
4742 : // Function to write resample data to target band
// Writes the full-width band of lines [nDstYOff, nDstYOff2) of the job.
4743 880 : const auto WriteJobData = [](const OvrJob *poJob)
4744 : {
4745 1760 : return poJob->poDstBand->RasterIO(
4746 880 : GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
4747 880 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
4748 880 : poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
4749 880 : poJob->eDstBufferDataType, 0, 0, nullptr);
4750 : };
4751 :
4752 : // Wait for completion of oldest job and serialize it
// The job is removed from the list whether or not it succeeded; its error
// code (or the one of the write) is returned.
4753 : const auto WaitAndFinalizeOldestJob =
4754 0 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
4755 : {
4756 0 : auto poOldestJob = jobList.front().get();
4757 0 : poOldestJob->WaitFinished();
4758 0 : CPLErr l_eErr = poOldestJob->eErr;
4759 0 : if (l_eErr == CE_None)
4760 : {
4761 0 : l_eErr = WriteJobData(poOldestJob);
4762 : }
4763 :
4764 0 : jobList.pop_front();
4765 0 : return l_eErr;
4766 : };
4767 :
4768 : // Queue of jobs
4769 1560 : std::list<std::unique_ptr<OvrJob>> jobList;
4770 :
4771 780 : GByte *pabyChunkNodataMask = nullptr;
4772 780 : void *pChunk = nullptr;
4773 :
// Thread count from GDAL_NUM_THREADS, clamped to [1, 128]. A thread pool and
// job queue are only requested when more than one thread is asked for;
// otherwise poJobQueue is null and jobs are run inline.
4774 780 : const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
4775 3120 : const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
4776 780 : ? CPLGetNumCPUs()
4777 780 : : atoi(pszThreads)));
4778 : auto poThreadPool =
4779 780 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
4780 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
4781 1560 : : std::unique_ptr<CPLJobQueue>(nullptr);
4782 :
4783 : /* -------------------------------------------------------------------- */
4784 : /* Loop over image operating on chunks. */
4785 : /* -------------------------------------------------------------------- */
4786 780 : int nChunkYOff = 0;
4787 780 : CPLErr eErr = CE_None;
4788 :
4789 1565 : for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
4790 785 : nChunkYOff += nFullResYChunk)
4791 : {
4792 785 : if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
4793 : pProgressData))
4794 : {
4795 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
4796 0 : eErr = CE_Failure;
4797 : }
4798 :
// Shrink the chunk so that it does not extend past the last source line.
4799 785 : if (nFullResYChunk + nChunkYOff > nHeight)
4800 778 : nFullResYChunk = nHeight - nChunkYOff;
4801 :
// Source window actually read: the chunk extended by
// nKernelRadius * nMaxOvrFactor lines on each side, clamped to the raster
// extent.
4802 785 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
4803 785 : int nChunkYSizeQueried =
4804 785 : nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
4805 785 : if (nChunkYOffQueried < 0)
4806 : {
4807 83 : nChunkYSizeQueried += nChunkYOffQueried;
4808 83 : nChunkYOffQueried = 0;
4809 : }
4810 785 : if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
4811 83 : nChunkYSizeQueried = nHeight - nChunkYOffQueried;
4812 :
4813 : // Avoid accumulating too many tasks and exhaust RAM
4814 : // Try to complete already finished jobs
4815 785 : while (eErr == CE_None && !jobList.empty())
4816 : {
4817 0 : auto poOldestJob = jobList.front().get();
4818 0 : if (!poOldestJob->IsFinished())
4819 0 : break;
4820 0 : eErr = poOldestJob->eErr;
4821 0 : if (eErr == CE_None)
4822 : {
4823 0 : eErr = WriteJobData(poOldestJob);
4824 : }
4825 :
4826 0 : jobList.pop_front();
4827 : }
4828 :
4829 : // And in case we have saturated the number of threads,
4830 : // wait for completion of tasks to go below the threshold.
4831 1570 : while (eErr == CE_None &&
4832 785 : jobList.size() >= static_cast<size_t>(nThreads))
4833 : {
4834 0 : eErr = WaitAndFinalizeOldestJob(jobList);
4835 : }
4836 :
4837 : // (Re)allocate buffers if needed
// The pointers are null on the first iteration and, when a job queue is
// used, after each iteration (see the reset at the end of the loop body).
4838 785 : if (pChunk == nullptr)
4839 : {
4840 780 : pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
4841 : nMaxChunkYSizeQueried, nWidth);
4842 : }
4843 785 : if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
4844 : {
4845 : pabyChunkNodataMask = static_cast<GByte *>(
4846 283 : VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
4847 : }
4848 :
// Allocation failure: release what was obtained and give up.
4849 785 : if (pChunk == nullptr ||
4850 283 : (bUseNoDataMask && pabyChunkNodataMask == nullptr))
4851 : {
4852 0 : CPLFree(pChunk);
4853 0 : CPLFree(pabyChunkNodataMask);
4854 0 : return CE_Failure;
4855 : }
4856 :
4857 : // Read chunk.
4858 785 : if (eErr == CE_None)
4859 785 : eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
4860 : nChunkYSizeQueried, pChunk, nWidth,
4861 : nChunkYSizeQueried, eWrkDataType, 0, 0,
4862 : nullptr);
4863 785 : if (eErr == CE_None && bUseNoDataMask)
4864 283 : eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
4865 : nChunkYSizeQueried, pabyChunkNodataMask,
4866 : nWidth, nChunkYSizeQueried, GDT_Byte, 0,
4867 : 0, nullptr);
4868 :
4869 : // Special case to promote 1bit data to 8bit 0/255 values.
// Values equal to 1 become 255, in place, for each supported working type.
4870 785 : if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
4871 : {
4872 9 : if (eWrkDataType == GDT_Float32)
4873 : {
4874 0 : float *pafChunk = static_cast<float *>(pChunk);
4875 0 : for (size_t i = 0;
4876 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
4877 : {
4878 0 : if (pafChunk[i] == 1.0)
4879 0 : pafChunk[i] = 255.0;
4880 : }
4881 : }
4882 9 : else if (eWrkDataType == GDT_Byte)
4883 : {
4884 9 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
4885 168417 : for (size_t i = 0;
4886 168417 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
4887 : {
4888 168408 : if (pabyChunk[i] == 1)
4889 127437 : pabyChunk[i] = 255;
4890 : }
4891 : }
4892 0 : else if (eWrkDataType == GDT_UInt16)
4893 : {
4894 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
4895 0 : for (size_t i = 0;
4896 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
4897 : {
4898 0 : if (pasChunk[i] == 1)
4899 0 : pasChunk[i] = 255;
4900 : }
4901 : }
4902 0 : else if (eWrkDataType == GDT_Float64)
4903 : {
4904 0 : double *padfChunk = static_cast<double *>(pChunk);
4905 0 : for (size_t i = 0;
4906 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
4907 : {
4908 0 : if (padfChunk[i] == 1.0)
4909 0 : padfChunk[i] = 255.0;
4910 : }
4911 : }
4912 : else
4913 : {
4914 0 : CPLAssert(false);
4915 : }
4916 : }
// Same promotion with inverted polarity: 1 becomes 0 and 0 becomes 255.
4917 776 : else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
4918 : {
4919 0 : if (eWrkDataType == GDT_Float32)
4920 : {
4921 0 : float *pafChunk = static_cast<float *>(pChunk);
4922 0 : for (size_t i = 0;
4923 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
4924 : {
4925 0 : if (pafChunk[i] == 1.0)
4926 0 : pafChunk[i] = 0.0;
4927 0 : else if (pafChunk[i] == 0.0)
4928 0 : pafChunk[i] = 255.0;
4929 : }
4930 : }
4931 0 : else if (eWrkDataType == GDT_Byte)
4932 : {
4933 0 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
4934 0 : for (size_t i = 0;
4935 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
4936 : {
4937 0 : if (pabyChunk[i] == 1)
4938 0 : pabyChunk[i] = 0;
4939 0 : else if (pabyChunk[i] == 0)
4940 0 : pabyChunk[i] = 255;
4941 : }
4942 : }
4943 0 : else if (eWrkDataType == GDT_UInt16)
4944 : {
4945 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
4946 0 : for (size_t i = 0;
4947 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
4948 : {
4949 0 : if (pasChunk[i] == 1)
4950 0 : pasChunk[i] = 0;
4951 0 : else if (pasChunk[i] == 0)
4952 0 : pasChunk[i] = 255;
4953 : }
4954 : }
4955 0 : else if (eWrkDataType == GDT_Float64)
4956 : {
4957 0 : double *padfChunk = static_cast<double *>(pChunk);
4958 0 : for (size_t i = 0;
4959 0 : i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
4960 : {
4961 0 : if (padfChunk[i] == 1.0)
4962 0 : padfChunk[i] = 0.0;
4963 0 : else if (padfChunk[i] == 0.0)
4964 0 : padfChunk[i] = 255.0;
4965 : }
4966 : }
4967 : else
4968 : {
4969 0 : CPLAssert(false);
4970 : }
4971 : }
4972 :
// With a job queue, the source buffers are handed to shared holders that the
// jobs of this chunk keep alive, so they get freed once the last of those
// jobs is destroyed. Without a job queue the holders wrap nullptr and the
// buffers are reused by the next iteration and freed after the loop.
4973 : auto oSrcBufferHolder =
4974 1570 : std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
4975 : auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
4976 1570 : poJobQueue ? pabyChunkNodataMask : nullptr);
4977 :
4978 1665 : for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
4979 : ++iOverview)
4980 : {
4981 880 : GDALRasterBand *poDstBand = papoOvrBands[iOverview];
4982 880 : const int nDstWidth = poDstBand->GetXSize();
4983 880 : const int nDstHeight = poDstBand->GetYSize();
4984 :
4985 880 : const double dfXRatioDstToSrc =
4986 880 : static_cast<double>(nWidth) / nDstWidth;
4987 880 : const double dfYRatioDstToSrc =
4988 880 : static_cast<double>(nHeight) / nDstHeight;
4989 :
4990 : /* --------------------------------------------------------------------
4991 : */
4992 : /* Figure out the line to start writing to, and the first line
4993 : */
4994 : /* to not write to. In theory this approach should ensure that
4995 : */
4996 : /* every output line will be written if all input chunks are */
4997 : /* processed. */
4998 : /* --------------------------------------------------------------------
4999 : */
5000 880 : int nDstYOff =
5001 880 : static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
// The first output line is already past the end of this overview: skip it.
5002 880 : if (nDstYOff == nDstHeight)
5003 0 : continue;
5004 880 : int nDstYOff2 = static_cast<int>(
5005 880 : 0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
5006 :
// Last source chunk: force the output window to end at the last overview
// line.
5007 880 : if (nChunkYOff + nFullResYChunk == nHeight)
5008 873 : nDstYOff2 = nDstHeight;
5009 : #if DEBUG_VERBOSE
5010 : CPLDebug("GDAL",
5011 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
5012 : nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
5013 : nDstWidth, nDstYOff2 - nDstYOff);
5014 : #endif
5015 :
5016 1760 : auto poJob = std::make_unique<OvrJob>();
5017 880 : poJob->pfnResampleFn = pfnResampleFn;
5018 880 : poJob->bUseGenericResampleFn = bUseGenericResampleFn;
5019 880 : poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
5020 880 : poJob->args.nOvrXSize = poDstBand->GetXSize();
5021 880 : poJob->args.nOvrYSize = poDstBand->GetYSize();
5022 : const char *pszNBITS =
5023 880 : poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
5024 880 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
5025 880 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
5026 880 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
5027 880 : poJob->args.eWrkDataType = eWrkDataType;
5028 880 : poJob->pChunk = pChunk;
5029 880 : poJob->args.pabyChunkNodataMask = pabyChunkNodataMask;
5030 880 : poJob->nSrcWidth = nWidth;
5031 880 : poJob->nSrcHeight = nHeight;
5032 880 : poJob->args.nChunkXOff = 0;
5033 880 : poJob->args.nChunkXSize = nWidth;
5034 880 : poJob->args.nChunkYOff = nChunkYOffQueried;
5035 880 : poJob->args.nChunkYSize = nChunkYSizeQueried;
5036 880 : poJob->nDstWidth = nDstWidth;
5037 880 : poJob->args.nDstXOff = 0;
5038 880 : poJob->args.nDstXOff2 = nDstWidth;
5039 880 : poJob->args.nDstYOff = nDstYOff;
5040 880 : poJob->args.nDstYOff2 = nDstYOff2;
5041 880 : poJob->poDstBand = poDstBand;
5042 880 : poJob->args.pszResampling = pszResampling;
5043 880 : poJob->args.bHasNoData = bHasNoData;
5044 880 : poJob->args.dfNoDataValue = dfNoDataValue;
5045 880 : poJob->args.poColorTable = poColorTable;
5046 880 : poJob->args.eSrcDataType = eSrcDataType;
5047 880 : poJob->args.bPropagateNoData = bPropagateNoData;
5048 :
// Threaded mode: submit the job and keep it in jobList so that results are
// written in submission order. Otherwise resample and write right away.
5049 880 : if (poJobQueue)
5050 : {
5051 0 : poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
5052 0 : poJob->SetSrcBufferHolder(oSrcBufferHolder);
5053 0 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
5054 0 : jobList.emplace_back(std::move(poJob));
5055 : }
5056 : else
5057 : {
5058 880 : JobResampleFunc(poJob.get());
5059 880 : eErr = poJob->eErr;
5060 880 : if (eErr == CE_None)
5061 : {
5062 880 : eErr = WriteJobData(poJob.get());
5063 : }
5064 : }
5065 : }
5066 :
// The buffers now belong to the holders created above: forget them so that
// fresh ones are allocated for the next chunk.
5067 785 : if (poJobQueue)
5068 : {
5069 0 : pChunk = nullptr;
5070 0 : pabyChunkNodataMask = nullptr;
5071 : }
5072 : }
5073 :
5074 780 : VSIFree(pChunk);
5075 780 : VSIFree(pabyChunkNodataMask);
5076 :
5077 : // Wait for all pending jobs to complete
// Only the first error encountered is kept.
5078 780 : while (!jobList.empty())
5079 : {
5080 0 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5081 0 : if (l_eErr != CE_None && eErr == CE_None)
5082 0 : eErr = l_eErr;
5083 : }
5084 :
5085 : /* -------------------------------------------------------------------- */
5086 : /* Renormalized overview mean / stddev if needed. */
5087 : /* -------------------------------------------------------------------- */
// NOTE(review): the return value of GDALOverviewMagnitudeCorrection() is not
// checked here - confirm this is intended.
5088 780 : if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
5089 : {
5090 0 : GDALOverviewMagnitudeCorrection(
5091 : poSrcBand, nOverviewCount,
5092 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
5093 : GDALDummyProgress, nullptr);
5094 : }
5095 :
5096 : /* -------------------------------------------------------------------- */
5097 : /* It can be important to flush out data to overviews. */
5098 : /* -------------------------------------------------------------------- */
5099 1653 : for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
5100 : ++iOverview)
5101 : {
5102 873 : eErr = papoOvrBands[iOverview]->FlushCache(false);
5103 : }
5104 :
5105 780 : if (eErr == CE_None)
5106 780 : pfnProgress(1.0, nullptr, pProgressData);
5107 :
5108 780 : return eErr;
5109 : }
5110 :
5111 : /************************************************************************/
5112 : /* GDALRegenerateOverviewsMultiBand() */
5113 : /************************************************************************/
5114 :
5115 : /**
5116 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5117 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5118 : *
5119 : * This function will generate one or more overview images from a base
5120 : * image using the requested downsampling algorithm. Its primary use
5121 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
5122 : * can also be used to generate downsampled images in one file from another
5123 : * outside the overview architecture.
5124 : *
5125 : * The output bands need to exist in advance and share the same characteristics
5126 : * (type, dimensions)
5127 : *
5128 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5129 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE"
5130 : *
5131 : * It does not support color tables or complex data types.
5132 : *
5133 : * The pseudo-algorithm used by the function is :
5134 : * for each overview
5135 : * iterate on lines of the source by a step of deltay
5136 : * iterate on columns of the source by a step of deltax
5137 : * read the source data of size deltax * deltay for all the bands
5138 : * generate the corresponding overview block for all the bands
5139 : *
5140 : * This function will properly honour NODATA_VALUES tuples (special dataset
5141 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5142 : * considered as the nodata value and not each value of the triplet
5143 : * independently per band.
5144 : *
5145 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
5146 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
5147 : * overview computation.
5148 : *
5149 : * @param nBands the number of bands, size of papoSrcBands and size of
5150 : * first dimension of papapoOverviewBands
5151 : * @param papoSrcBands the list of source bands to downsample
5152 : * @param nOverviews the number of downsampled overview levels being generated.
5153 : * @param papapoOverviewBands bidimensional array of bands. First dimension is
5154 : * indexed by nBands. Second dimension is indexed by
5155 : * nOverviews.
5156 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5157 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
5158 : * @param pfnProgress progress report function.
5159 : * @param pProgressData progress function callback data.
5160 : * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
5161 : * key=value pairs, or NULL
5162 : * Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
5163 : * options can be specified to express that overviews should
5164 : * be regenerated only in the specified subset of the source
5165 : * dataset.
5166 : * @return CE_None on success or CE_Failure on failure.
5167 : */
5168 :
5169 388 : CPLErr GDALRegenerateOverviewsMultiBand(
5170 : int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
5171 : GDALRasterBand *const *const *papapoOverviewBands,
5172 : const char *pszResampling, GDALProgressFunc pfnProgress,
5173 : void *pProgressData, CSLConstList papszOptions)
5174 : {
5175 388 : CPL_IGNORE_RET_VAL(papszOptions);
5176 :
5177 388 : if (pfnProgress == nullptr)
5178 11 : pfnProgress = GDALDummyProgress;
5179 :
5180 388 : if (EQUAL(pszResampling, "NONE") || nBands == 0 || nOverviews == 0)
5181 3 : return CE_None;
5182 :
5183 : // Sanity checks.
5184 385 : if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
5185 191 : !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
5186 84 : !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
5187 22 : !EQUAL(pszResampling, "CUBICSPLINE") &&
5188 21 : !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
5189 5 : !EQUAL(pszResampling, "MODE"))
5190 : {
5191 0 : CPLError(CE_Failure, CPLE_NotSupported,
5192 : "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
5193 : "not supported",
5194 : pszResampling);
5195 0 : return CE_Failure;
5196 : }
5197 :
5198 385 : int nKernelRadius = 0;
5199 : GDALResampleFunction pfnResampleFn =
5200 385 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
5201 385 : if (pfnResampleFn == nullptr)
5202 0 : return CE_Failure;
5203 :
5204 385 : const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
5205 385 : const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
5206 385 : if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
5207 0 : return CE_None;
5208 385 : GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
5209 66232 : for (int iBand = 1; iBand < nBands; ++iBand)
5210 : {
5211 131694 : if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
5212 65847 : papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
5213 : {
5214 0 : CPLError(
5215 : CE_Failure, CPLE_NotSupported,
5216 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5217 : "have the same dimensions");
5218 0 : return CE_Failure;
5219 : }
5220 65847 : if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
5221 : {
5222 0 : CPLError(
5223 : CE_Failure, CPLE_NotSupported,
5224 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5225 : "have the same data type");
5226 0 : return CE_Failure;
5227 : }
5228 : }
5229 :
5230 1031 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5231 : {
5232 646 : const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
5233 646 : const int nDstWidth = poOvrFirstBand->GetXSize();
5234 646 : const int nDstHeight = poOvrFirstBand->GetYSize();
5235 66759 : for (int iBand = 1; iBand < nBands; ++iBand)
5236 : {
5237 66113 : const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
5238 132226 : if (poOvrBand->GetXSize() != nDstWidth ||
5239 66113 : poOvrBand->GetYSize() != nDstHeight)
5240 : {
5241 0 : CPLError(
5242 : CE_Failure, CPLE_NotSupported,
5243 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5244 : "of the same level must have the same dimensions");
5245 0 : return CE_Failure;
5246 : }
5247 66113 : if (poOvrBand->GetRasterDataType() != eDataType)
5248 : {
5249 0 : CPLError(
5250 : CE_Failure, CPLE_NotSupported,
5251 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5252 : "must have the same data type as the source bands");
5253 0 : return CE_Failure;
5254 : }
5255 : }
5256 : }
5257 :
5258 : // First pass to compute the total number of pixels to write.
5259 385 : double dfTotalPixelCount = 0;
5260 385 : const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
5261 385 : const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
5262 385 : const int nSrcXSize = atoi(CSLFetchNameValueDef(
5263 : papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
5264 385 : const int nSrcYSize = atoi(CSLFetchNameValueDef(
5265 : papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
5266 1031 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5267 : {
5268 646 : dfTotalPixelCount +=
5269 1292 : static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
5270 646 : papapoOverviewBands[0][iOverview]->GetXSize() *
5271 1292 : static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
5272 646 : papapoOverviewBands[0][iOverview]->GetYSize();
5273 : }
5274 :
5275 : const GDALDataType eWrkDataType =
5276 385 : GDALGetOvrWorkDataType(pszResampling, eDataType);
5277 : const int nWrkDataTypeSize =
5278 385 : std::max(1, GDALGetDataTypeSizeBytes(eWrkDataType));
5279 :
5280 385 : const bool bIsMask = papoSrcBands[0]->IsMaskBand();
5281 :
5282 : // If we have a nodata mask and we are doing something more complicated
5283 : // than nearest neighbouring, we have to fetch to nodata mask.
5284 : const bool bUseNoDataMask =
5285 568 : !STARTS_WITH_CI(pszResampling, "NEAR") &&
5286 183 : (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
5287 :
5288 770 : std::vector<bool> abHasNoData(nBands);
5289 770 : std::vector<double> adfNoDataValue(nBands);
5290 :
5291 66617 : for (int iBand = 0; iBand < nBands; ++iBand)
5292 : {
5293 66232 : int nHasNoData = 0;
5294 132464 : adfNoDataValue[iBand] =
5295 66232 : papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
5296 66232 : abHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
5297 : }
5298 : const bool bPropagateNoData =
5299 385 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5300 :
5301 385 : const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
5302 1540 : const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
5303 385 : ? CPLGetNumCPUs()
5304 385 : : atoi(pszThreads)));
5305 : auto poThreadPool =
5306 385 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5307 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5308 770 : : std::unique_ptr<CPLJobQueue>(nullptr);
5309 :
5310 : // Only configurable for debug / testing
5311 385 : const GIntBig nChunkMaxSize = []() -> GIntBig
5312 : {
5313 : const char *pszVal =
5314 385 : CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", nullptr);
5315 385 : if (pszVal)
5316 : {
5317 15 : GIntBig nRet = 0;
5318 15 : CPLParseMemorySize(pszVal, &nRet, nullptr);
5319 15 : return std::max<GIntBig>(100, nRet);
5320 : }
5321 370 : return 10 * 1024 * 1024;
5322 385 : }();
5323 :
5324 : // Only configurable for debug / testing
5325 385 : const GIntBig nChunkMaxSizeForTempFile = []() -> GIntBig
5326 : {
5327 385 : const char *pszVal = CPLGetConfigOption(
5328 : "GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE", nullptr);
5329 385 : if (pszVal)
5330 : {
5331 14 : GIntBig nRet = 0;
5332 14 : CPLParseMemorySize(pszVal, &nRet, nullptr);
5333 14 : return std::max<GIntBig>(100, nRet);
5334 : }
5335 371 : const auto nUsableRAM = CPLGetUsablePhysicalRAM();
5336 371 : if (nUsableRAM > 0)
5337 371 : return nUsableRAM / 10;
5338 : // Select a value to be able to at least downsample by 2 for a RGB
5339 : // 1024x1024 tiled output: (2 * 1024 + 2) * (2 * 1024 + 2) * 3 = 12 MB
5340 0 : return 100 * 1024 * 1024;
5341 385 : }();
5342 :
5343 : // Second pass to do the real job.
5344 385 : double dfCurPixelCount = 0;
5345 385 : CPLErr eErr = CE_None;
5346 1025 : for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
5347 : ++iOverview)
5348 : {
5349 645 : int iSrcOverview = -1; // -1 means the source bands.
5350 :
5351 : const int nDstTotalWidth =
5352 645 : papapoOverviewBands[0][iOverview]->GetXSize();
5353 : const int nDstTotalHeight =
5354 645 : papapoOverviewBands[0][iOverview]->GetYSize();
5355 :
5356 : // Compute the coordinates of the target region to refresh
5357 645 : constexpr double EPS = 1e-8;
5358 645 : const int nDstXOffStart = static_cast<int>(
5359 645 : static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
5360 : EPS);
5361 : const int nDstXOffEnd =
5362 1290 : std::min(static_cast<int>(
5363 645 : std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
5364 645 : nToplevelSrcWidth * nDstTotalWidth -
5365 : EPS)),
5366 645 : nDstTotalWidth);
5367 645 : const int nDstWidth = nDstXOffEnd - nDstXOffStart;
5368 645 : const int nDstYOffStart =
5369 645 : static_cast<int>(static_cast<double>(nSrcYOff) /
5370 645 : nToplevelSrcHeight * nDstTotalHeight +
5371 : EPS);
5372 : const int nDstYOffEnd =
5373 1290 : std::min(static_cast<int>(
5374 645 : std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
5375 645 : nToplevelSrcHeight * nDstTotalHeight -
5376 : EPS)),
5377 645 : nDstTotalHeight);
5378 645 : const int nDstHeight = nDstYOffEnd - nDstYOffStart;
5379 :
5380 : // Try to use previous level of overview as the source to compute
5381 : // the next level.
5382 645 : int nSrcWidth = nToplevelSrcWidth;
5383 645 : int nSrcHeight = nToplevelSrcHeight;
5384 905 : if (iOverview > 0 &&
5385 260 : papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
5386 : {
5387 252 : nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
5388 252 : nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
5389 252 : iSrcOverview = iOverview - 1;
5390 : }
5391 :
5392 645 : const double dfXRatioDstToSrc =
5393 645 : static_cast<double>(nSrcWidth) / nDstTotalWidth;
5394 645 : const double dfYRatioDstToSrc =
5395 645 : static_cast<double>(nSrcHeight) / nDstTotalHeight;
5396 :
5397 : const int nOvrFactor =
5398 1935 : std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
5399 645 : static_cast<int>(0.5 + dfYRatioDstToSrc)));
5400 :
5401 645 : int nDstChunkXSize = 0;
5402 645 : int nDstChunkYSize = 0;
5403 645 : papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
5404 : &nDstChunkYSize);
5405 :
5406 645 : constexpr int PIXEL_MARGIN = 2;
5407 : // Try to extend the chunk size so that the memory needed to acquire
5408 : // source pixels goes up to 10 MB.
5409 : // This can help for drivers that support multi-threaded reading
5410 645 : const int nFullResYChunk = static_cast<int>(std::min<double>(
5411 645 : nSrcHeight, PIXEL_MARGIN + nDstChunkYSize * dfYRatioDstToSrc));
5412 645 : const int nFullResYChunkQueried = static_cast<int>(std::min<int64_t>(
5413 1290 : nSrcHeight,
5414 1290 : nFullResYChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5415 645 : nKernelRadius * nOvrFactor));
5416 881 : while (nDstChunkXSize < nDstWidth)
5417 : {
5418 255 : constexpr int INCREASE_FACTOR = 2;
5419 :
5420 255 : const int nFullResXChunk = static_cast<int>(std::min<double>(
5421 510 : nSrcWidth, PIXEL_MARGIN + INCREASE_FACTOR * nDstChunkXSize *
5422 255 : dfXRatioDstToSrc));
5423 :
5424 : const int nFullResXChunkQueried =
5425 255 : static_cast<int>(std::min<int64_t>(
5426 510 : nSrcWidth,
5427 510 : nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5428 255 : nKernelRadius * nOvrFactor));
5429 :
5430 255 : if (nBands > nChunkMaxSize / nFullResXChunkQueried /
5431 255 : nFullResYChunkQueried / nWrkDataTypeSize)
5432 : {
5433 19 : break;
5434 : }
5435 :
5436 236 : nDstChunkXSize *= INCREASE_FACTOR;
5437 : }
5438 645 : nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
5439 :
5440 645 : const int nFullResXChunk = static_cast<int>(std::min<double>(
5441 645 : nSrcWidth, PIXEL_MARGIN + nDstChunkXSize * dfXRatioDstToSrc));
5442 645 : const int nFullResXChunkQueried = static_cast<int>(std::min<int64_t>(
5443 1290 : nSrcWidth,
5444 1290 : nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5445 645 : nKernelRadius * nOvrFactor));
5446 :
5447 : // Make sure that the RAM requirements to acquire the source data does
5448 : // not exceed nChunkMaxSizeForTempFile
5449 : // If so, reduce the destination chunk size, generate overviews in a
5450 : // temporary dataset, and copy that temporary dataset over the target
5451 : // overview bands (to avoid issues with lossy compression)
5452 : const bool bOverflowFullResXChunkYChunkQueried =
5453 645 : nBands > std::numeric_limits<int64_t>::max() /
5454 645 : nFullResXChunkQueried / nFullResYChunkQueried /
5455 645 : nWrkDataTypeSize;
5456 :
5457 645 : const auto nMemRequirement =
5458 : bOverflowFullResXChunkYChunkQueried
5459 645 : ? 0
5460 641 : : static_cast<GIntBig>(nFullResXChunkQueried) *
5461 641 : nFullResYChunkQueried * nBands * nWrkDataTypeSize;
5462 : // Use a temporary dataset with a smaller destination chunk size
5463 645 : const auto nOverShootFactor =
5464 : nMemRequirement / nChunkMaxSizeForTempFile;
5465 :
5466 645 : constexpr int MIN_OVERSHOOT_FACTOR = 4;
5467 : const auto nSqrtOverShootFactor = std::max<GIntBig>(
5468 1290 : MIN_OVERSHOOT_FACTOR, static_cast<GIntBig>(std::ceil(std::sqrt(
5469 645 : static_cast<double>(nOverShootFactor)))));
5470 645 : constexpr int DEFAULT_CHUNK_SIZE = 256;
5471 645 : constexpr int GTIFF_BLOCK_SIZE_MULTIPLE = 16;
5472 : const int nReducedDstChunkXSize =
5473 : bOverflowFullResXChunkYChunkQueried
5474 1286 : ? DEFAULT_CHUNK_SIZE
5475 1286 : : std::max(1, static_cast<int>(nDstChunkXSize /
5476 1286 : nSqrtOverShootFactor) &
5477 641 : ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
5478 : const int nReducedDstChunkYSize =
5479 : bOverflowFullResXChunkYChunkQueried
5480 1286 : ? DEFAULT_CHUNK_SIZE
5481 1286 : : std::max(1, static_cast<int>(nDstChunkYSize /
5482 1286 : nSqrtOverShootFactor) &
5483 641 : ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
5484 :
5485 645 : if (bOverflowFullResXChunkYChunkQueried ||
5486 : nMemRequirement > nChunkMaxSizeForTempFile)
5487 : {
5488 : const auto nDTSize =
5489 43 : std::max(1, GDALGetDataTypeSizeBytes(eDataType));
5490 : const bool bTmpDSMemRequirementOverflow =
5491 43 : nBands > std::numeric_limits<int64_t>::max() / nDstWidth /
5492 43 : nDstHeight / nDTSize;
5493 43 : const auto nTmpDSMemRequirement =
5494 : bTmpDSMemRequirementOverflow
5495 43 : ? 0
5496 41 : : static_cast<GIntBig>(nDstWidth) * nDstHeight * nBands *
5497 41 : nDTSize;
5498 :
5499 : // make sure that one band buffer doesn't overflow size_t
5500 : const bool bChunkSizeOverflow =
5501 43 : static_cast<size_t>(nDTSize) >
5502 43 : std::numeric_limits<size_t>::max() / nDstWidth / nDstHeight;
5503 43 : const size_t nChunkSize =
5504 : bChunkSizeOverflow
5505 43 : ? 0
5506 41 : : static_cast<size_t>(nDstWidth) * nDstHeight * nDTSize;
5507 :
5508 : const auto CreateVRT =
5509 41 : [nBands, nSrcWidth, nSrcHeight, nDstTotalWidth, nDstTotalHeight,
5510 : pszResampling, eWrkDataType, papoSrcBands, papapoOverviewBands,
5511 : iSrcOverview, &abHasNoData,
5512 393585 : &adfNoDataValue](int nVRTBlockXSize, int nVRTBlockYSize)
5513 : {
5514 : auto poVRTDS = std::make_unique<VRTDataset>(
5515 41 : nDstTotalWidth, nDstTotalHeight, nVRTBlockXSize,
5516 41 : nVRTBlockYSize);
5517 :
5518 65620 : for (int iBand = 0; iBand < nBands; ++iBand)
5519 : {
5520 131158 : auto poVRTSrc = std::make_unique<VRTSimpleSource>();
5521 65579 : poVRTSrc->SetResampling(pszResampling);
5522 65579 : poVRTDS->AddBand(eWrkDataType);
5523 : auto poVRTBand = static_cast<VRTSourcedRasterBand *>(
5524 65579 : poVRTDS->GetRasterBand(iBand + 1));
5525 :
5526 65579 : auto poSrcBand = papoSrcBands[iBand];
5527 65579 : if (iSrcOverview != -1)
5528 24 : poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
5529 65579 : poVRTBand->ConfigureSource(
5530 : poVRTSrc.get(), poSrcBand, false, 0, 0, nSrcWidth,
5531 : nSrcHeight, 0, 0, nDstTotalWidth, nDstTotalHeight);
5532 : // Add the source to the band
5533 65579 : poVRTBand->AddSource(poVRTSrc.release());
5534 65579 : if (abHasNoData[iBand])
5535 3 : poVRTBand->SetNoDataValue(adfNoDataValue[iBand]);
5536 : }
5537 :
5538 42 : if (papoSrcBands[0]->GetMaskFlags() == GMF_PER_DATASET &&
5539 1 : poVRTDS->CreateMaskBand(GMF_PER_DATASET) == CE_None)
5540 : {
5541 : VRTSourcedRasterBand *poMaskVRTBand =
5542 1 : cpl::down_cast<VRTSourcedRasterBand *>(
5543 1 : poVRTDS->GetRasterBand(1)->GetMaskBand());
5544 1 : auto poSrcBand = papoSrcBands[0];
5545 1 : if (iSrcOverview != -1)
5546 0 : poSrcBand = papapoOverviewBands[0][iSrcOverview];
5547 1 : poMaskVRTBand->AddMaskBandSource(
5548 1 : poSrcBand->GetMaskBand(), 0, 0, nSrcWidth, nSrcHeight,
5549 : 0, 0, nDstTotalWidth, nDstTotalHeight);
5550 : }
5551 :
5552 41 : return poVRTDS;
5553 43 : };
5554 :
5555 : // If the overview accommodates chunking, do so and recurse
5556 : // to avoid generating full size temporary files
5557 43 : if (!bOverflowFullResXChunkYChunkQueried &&
5558 39 : !bTmpDSMemRequirementOverflow && !bChunkSizeOverflow &&
5559 39 : (nDstChunkXSize < nDstWidth || nDstChunkYSize < nDstHeight))
5560 : {
5561 : // Create a VRT with the smaller chunk to do the scaling
5562 : auto poVRTDS =
5563 13 : CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
5564 :
5565 13 : std::vector<GDALRasterBand *> apoVRTBand(nBands);
5566 13 : std::vector<GDALRasterBand *> apoDstBand(nBands);
5567 65560 : for (int iBand = 0; iBand < nBands; ++iBand)
5568 : {
5569 65547 : apoDstBand[iBand] = papapoOverviewBands[iBand][iOverview];
5570 65547 : apoVRTBand[iBand] = poVRTDS->GetRasterBand(iBand + 1);
5571 : }
5572 :
5573 : // Use a flag to avoid reading from the overview being built
5574 : GDALRasterIOExtraArg sExtraArg;
5575 13 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5576 13 : if (iSrcOverview == -1)
5577 13 : sExtraArg.bUseOnlyThisScale = true;
5578 :
5579 : // A single band buffer for data transfer to the overview
5580 13 : std::vector<GByte> abyChunk;
5581 : try
5582 : {
5583 13 : abyChunk.resize(nChunkSize);
5584 : }
5585 0 : catch (const std::exception &)
5586 : {
5587 0 : CPLError(CE_Failure, CPLE_OutOfMemory,
5588 : "Out of memory allocating temporary buffer");
5589 0 : return CE_Failure;
5590 : }
5591 :
5592 : // Loop over output height, in chunks
5593 13 : for (int nDstYOff = nDstYOffStart;
5594 38 : nDstYOff < nDstYOffEnd && eErr == CE_None;
5595 : /* */)
5596 : {
5597 : const int nDstYCount =
5598 25 : std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
5599 : // Loop over output width, in output chunks
5600 25 : for (int nDstXOff = nDstXOffStart;
5601 74 : nDstXOff < nDstXOffEnd && eErr == CE_None;
5602 : /* */)
5603 : {
5604 : const int nDstXCount =
5605 49 : std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
5606 : // Read and transfer the chunk to the overview
5607 98 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
5608 : ++iBand)
5609 : {
5610 98 : eErr = apoVRTBand[iBand]->RasterIO(
5611 : GF_Read, nDstXOff, nDstYOff, nDstXCount,
5612 49 : nDstYCount, abyChunk.data(), nDstXCount,
5613 : nDstYCount, eDataType, 0, 0, &sExtraArg);
5614 49 : if (eErr == CE_None)
5615 : {
5616 96 : eErr = apoDstBand[iBand]->RasterIO(
5617 : GF_Write, nDstXOff, nDstYOff, nDstXCount,
5618 48 : nDstYCount, abyChunk.data(), nDstXCount,
5619 : nDstYCount, eDataType, 0, 0, nullptr);
5620 : }
5621 : }
5622 :
5623 49 : dfCurPixelCount +=
5624 49 : static_cast<double>(nDstXCount) * nDstYCount;
5625 :
5626 49 : nDstXOff += nDstXCount;
5627 : } // width
5628 :
5629 25 : if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount,
5630 : nullptr, pProgressData))
5631 : {
5632 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
5633 : "User terminated");
5634 0 : eErr = CE_Failure;
5635 : }
5636 :
5637 25 : nDstYOff += nDstYCount;
5638 : } // height
5639 :
5640 13 : if (CE_None != eErr)
5641 : {
5642 1 : CPLError(CE_Failure, CPLE_AppDefined,
5643 : "Error while writing overview");
5644 1 : return CE_Failure;
5645 : }
5646 :
5647 12 : pfnProgress(1.0, nullptr, pProgressData);
5648 : // Flush the overviews we just generated
5649 24 : for (int iBand = 0; iBand < nBands; ++iBand)
5650 12 : apoDstBand[iBand]->FlushCache(false);
5651 :
5652 12 : continue; // Next overview
5653 : } // chunking via temporary dataset
5654 :
5655 0 : std::unique_ptr<GDALDataset> poTmpDS;
5656 : // Config option mostly/only for autotest purposes
5657 : const char *pszGDAL_OVR_TEMP_DRIVER =
5658 30 : CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
5659 30 : if ((!bTmpDSMemRequirementOverflow &&
5660 4 : nTmpDSMemRequirement <= nChunkMaxSizeForTempFile &&
5661 4 : !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
5662 26 : EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
5663 : {
5664 10 : auto poTmpDrv = GetGDALDriverManager()->GetDriverByName("MEM");
5665 10 : if (!poTmpDrv)
5666 : {
5667 0 : eErr = CE_Failure;
5668 0 : break;
5669 : }
5670 10 : poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
5671 : nDstTotalHeight, nBands,
5672 10 : eDataType, nullptr));
5673 : }
5674 : else
5675 : {
5676 : // Create a temporary file for the overview
5677 : auto poTmpDrv =
5678 20 : GetGDALDriverManager()->GetDriverByName("GTiff");
5679 20 : if (!poTmpDrv)
5680 : {
5681 0 : eErr = CE_Failure;
5682 0 : break;
5683 : }
5684 40 : std::string osTmpFilename;
5685 20 : auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
5686 20 : if (poDstDS)
5687 : {
5688 20 : osTmpFilename = poDstDS->GetDescription();
5689 : VSIStatBufL sStatBuf;
5690 20 : if (!osTmpFilename.empty() &&
5691 0 : VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
5692 0 : osTmpFilename += "_tmp_ovr.tif";
5693 : }
5694 20 : if (osTmpFilename.empty())
5695 : {
5696 20 : osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
5697 20 : osTmpFilename += ".tif";
5698 : }
5699 20 : CPLDebug("GDAL", "Creating temporary file %s of %d x %d x %d",
5700 : osTmpFilename.c_str(), nDstWidth, nDstHeight, nBands);
5701 40 : CPLStringList aosCO;
5702 20 : if (0 == ((nReducedDstChunkXSize % GTIFF_BLOCK_SIZE_MULTIPLE) |
5703 20 : (nReducedDstChunkYSize % GTIFF_BLOCK_SIZE_MULTIPLE)))
5704 : {
5705 14 : aosCO.SetNameValue("TILED", "YES");
5706 : aosCO.SetNameValue("BLOCKXSIZE",
5707 14 : CPLSPrintf("%d", nReducedDstChunkXSize));
5708 : aosCO.SetNameValue("BLOCKYSIZE",
5709 14 : CPLSPrintf("%d", nReducedDstChunkYSize));
5710 : }
5711 20 : if (const char *pszCOList =
5712 20 : poTmpDrv->GetMetadataItem(GDAL_DMD_CREATIONOPTIONLIST))
5713 : {
5714 : aosCO.SetNameValue(
5715 20 : "COMPRESS", strstr(pszCOList, "ZSTD") ? "ZSTD" : "LZW");
5716 : }
5717 20 : poTmpDS.reset(poTmpDrv->Create(osTmpFilename.c_str(), nDstWidth,
5718 : nDstHeight, nBands, eDataType,
5719 20 : aosCO.List()));
5720 20 : if (poTmpDS)
5721 : {
5722 18 : poTmpDS->MarkSuppressOnClose();
5723 18 : VSIUnlink(osTmpFilename.c_str());
5724 : }
5725 : }
5726 30 : if (!poTmpDS)
5727 : {
5728 2 : eErr = CE_Failure;
5729 2 : break;
5730 : }
5731 :
5732 : // Create a full size VRT to do the resampling without edge effects
5733 : auto poVRTDS =
5734 28 : CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
5735 :
5736 : // Allocate a band buffer with the overview chunk size
5737 : std::unique_ptr<void, VSIFreeReleaser> pDstBuffer(
5738 : VSI_MALLOC3_VERBOSE(size_t(nWrkDataTypeSize), nDstChunkXSize,
5739 28 : nDstChunkYSize));
5740 28 : if (pDstBuffer == nullptr)
5741 : {
5742 0 : eErr = CE_Failure;
5743 0 : break;
5744 : }
5745 :
5746 : // Use a flag to avoid reading the overview being built
5747 : GDALRasterIOExtraArg sExtraArg;
5748 28 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5749 28 : if (iSrcOverview == -1)
5750 4 : sExtraArg.bUseOnlyThisScale = true;
5751 :
5752 : // Scale and copy data from the VRT to the temp file
5753 28 : for (int nDstYOff = nDstYOffStart;
5754 914 : nDstYOff < nDstYOffEnd && eErr == CE_None;
5755 : /* */)
5756 : {
5757 : const int nDstYCount =
5758 886 : std::min(nReducedDstChunkYSize, nDstYOffEnd - nDstYOff);
5759 886 : for (int nDstXOff = nDstXOffStart;
5760 201218 : nDstXOff < nDstXOffEnd && eErr == CE_None;
5761 : /* */)
5762 : {
5763 : const int nDstXCount =
5764 200332 : std::min(nReducedDstChunkXSize, nDstXOffEnd - nDstXOff);
5765 400668 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
5766 : ++iBand)
5767 : {
5768 200336 : auto poSrcBand = poVRTDS->GetRasterBand(iBand + 1);
5769 200336 : eErr = poSrcBand->RasterIO(
5770 : GF_Read, nDstXOff, nDstYOff, nDstXCount, nDstYCount,
5771 : pDstBuffer.get(), nDstXCount, nDstYCount,
5772 : eWrkDataType, 0, 0, &sExtraArg);
5773 200336 : if (eErr == CE_None)
5774 : {
5775 : // Write to the temporary dataset, shifted
5776 200334 : auto poOvrBand = poTmpDS->GetRasterBand(iBand + 1);
5777 200334 : eErr = poOvrBand->RasterIO(
5778 : GF_Write, nDstXOff - nDstXOffStart,
5779 : nDstYOff - nDstYOffStart, nDstXCount,
5780 : nDstYCount, pDstBuffer.get(), nDstXCount,
5781 : nDstYCount, eWrkDataType, 0, 0, nullptr);
5782 : }
5783 : }
5784 200332 : nDstXOff += nDstXCount;
5785 : }
5786 886 : nDstYOff += nDstYCount;
5787 : }
5788 :
5789 : // Copy from the temporary to the overview
5790 28 : for (int nDstYOff = nDstYOffStart;
5791 54 : nDstYOff < nDstYOffEnd && eErr == CE_None;
5792 : /* */)
5793 : {
5794 : const int nDstYCount =
5795 26 : std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
5796 26 : for (int nDstXOff = nDstXOffStart;
5797 52 : nDstXOff < nDstXOffEnd && eErr == CE_None;
5798 : /* */)
5799 : {
5800 : const int nDstXCount =
5801 26 : std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
5802 56 : for (int iBand = 0; iBand < nBands && eErr == CE_None;
5803 : ++iBand)
5804 : {
5805 30 : auto poSrcBand = poTmpDS->GetRasterBand(iBand + 1);
5806 30 : eErr = poSrcBand->RasterIO(
5807 : GF_Read, nDstXOff - nDstXOffStart,
5808 : nDstYOff - nDstYOffStart, nDstXCount, nDstYCount,
5809 : pDstBuffer.get(), nDstXCount, nDstYCount,
5810 : eWrkDataType, 0, 0, nullptr);
5811 30 : if (eErr == CE_None)
5812 : {
5813 : // Write to the destination overview bands
5814 30 : auto poOvrBand =
5815 30 : papapoOverviewBands[iBand][iOverview];
5816 30 : eErr = poOvrBand->RasterIO(
5817 : GF_Write, nDstXOff, nDstYOff, nDstXCount,
5818 : nDstYCount, pDstBuffer.get(), nDstXCount,
5819 : nDstYCount, eWrkDataType, 0, 0, nullptr);
5820 : }
5821 : }
5822 26 : nDstXOff += nDstXCount;
5823 : }
5824 26 : nDstYOff += nDstYCount;
5825 : }
5826 :
5827 28 : if (eErr != CE_None)
5828 : {
5829 2 : CPLError(CE_Failure, CPLE_AppDefined,
5830 : "Failed to write overview %d", iOverview);
5831 2 : return eErr;
5832 : }
5833 :
5834 : // Flush the data to overviews.
5835 56 : for (int iBand = 0; iBand < nBands; ++iBand)
5836 30 : papapoOverviewBands[iBand][iOverview]->FlushCache(false);
5837 :
5838 26 : continue;
5839 : }
5840 :
5841 : // Structure describing a resampling job
5842 : struct OvrJob
5843 : {
5844 : // Buffers to free when job is finished
5845 : std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
5846 : std::unique_ptr<PointerHolder> oSrcBufferHolder{};
5847 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
5848 :
5849 : GDALRasterBand *poDstBand = nullptr;
5850 :
5851 : // Input parameters of pfnResampleFn
5852 : GDALResampleFunction pfnResampleFn = nullptr;
5853 : GDALOverviewResampleArgs args{};
5854 : const void *pChunk = nullptr;
5855 :
5856 : // Output values of resampling function
5857 : CPLErr eErr = CE_Failure;
5858 : void *pDstBuffer = nullptr;
5859 : GDALDataType eDstBufferDataType = GDT_Unknown;
5860 :
5861 3310 : void NotifyFinished()
5862 : {
5863 6620 : std::lock_guard guard(mutex);
5864 3310 : bFinished = true;
5865 3310 : cv.notify_one();
5866 3310 : }
5867 :
5868 2 : bool IsFinished()
5869 : {
5870 2 : std::lock_guard guard(mutex);
5871 4 : return bFinished;
5872 : }
5873 :
5874 14 : void WaitFinished()
5875 : {
5876 28 : std::unique_lock oGuard(mutex);
5877 18 : while (!bFinished)
5878 : {
5879 4 : cv.wait(oGuard);
5880 : }
5881 14 : }
5882 :
5883 : private:
5884 : // Synchronization
5885 : bool bFinished = false;
5886 : std::mutex mutex{};
5887 : std::condition_variable cv{};
5888 : };
5889 :
5890 : // Thread function to resample
5891 3310 : const auto JobResampleFunc = [](void *pData)
5892 : {
5893 3310 : OvrJob *poJob = static_cast<OvrJob *>(pData);
5894 :
5895 3310 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
5896 : &(poJob->pDstBuffer),
5897 : &(poJob->eDstBufferDataType));
5898 :
5899 3310 : poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
5900 :
5901 3310 : poJob->NotifyFinished();
5902 3310 : };
5903 :
5904 : // Function to write resample data to target band
5905 3310 : const auto WriteJobData = [](const OvrJob *poJob)
5906 : {
5907 6620 : return poJob->poDstBand->RasterIO(
5908 3310 : GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
5909 3310 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
5910 3310 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
5911 3310 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
5912 3310 : poJob->args.nDstYOff2 - poJob->args.nDstYOff,
5913 3310 : poJob->eDstBufferDataType, 0, 0, nullptr);
5914 : };
5915 :
5916 : // Wait for completion of oldest job and serialize it
5917 : const auto WaitAndFinalizeOldestJob =
5918 14 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
5919 : {
5920 14 : auto poOldestJob = jobList.front().get();
5921 14 : poOldestJob->WaitFinished();
5922 14 : CPLErr l_eErr = poOldestJob->eErr;
5923 14 : if (l_eErr == CE_None)
5924 : {
5925 14 : l_eErr = WriteJobData(poOldestJob);
5926 : }
5927 :
5928 14 : jobList.pop_front();
5929 14 : return l_eErr;
5930 : };
5931 :
5932 : // Queue of jobs
5933 1204 : std::list<std::unique_ptr<OvrJob>> jobList;
5934 :
5935 1204 : std::vector<std::unique_ptr<void, VSIFreeReleaser>> apaChunk(nBands);
5936 : std::vector<std::unique_ptr<GByte, VSIFreeReleaser>>
5937 1204 : apabyChunkNoDataMask(nBands);
5938 :
5939 : // Iterate on destination overview, block by block.
5940 602 : for (int nDstYOff = nDstYOffStart;
5941 2111 : nDstYOff < nDstYOffEnd && eErr == CE_None;
5942 1509 : nDstYOff += nDstChunkYSize)
5943 : {
5944 : int nDstYCount;
5945 1509 : if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
5946 1099 : nDstYCount = nDstChunkYSize;
5947 : else
5948 410 : nDstYCount = nDstYOffEnd - nDstYOff;
5949 :
5950 1509 : int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
5951 1509 : int nChunkYOff2 = static_cast<int>(
5952 1509 : ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
5953 1509 : if (nChunkYOff2 > nSrcHeight ||
5954 1509 : nDstYOff + nDstYCount == nDstTotalHeight)
5955 595 : nChunkYOff2 = nSrcHeight;
5956 1509 : int nYCount = nChunkYOff2 - nChunkYOff;
5957 1509 : CPLAssert(nYCount <= nFullResYChunk);
5958 :
5959 1509 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
5960 1509 : int nChunkYSizeQueried =
5961 1509 : nYCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
5962 1509 : if (nChunkYOffQueried < 0)
5963 : {
5964 148 : nChunkYSizeQueried += nChunkYOffQueried;
5965 148 : nChunkYOffQueried = 0;
5966 : }
5967 1509 : if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
5968 147 : nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
5969 1509 : CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
5970 :
5971 1509 : if (!pfnProgress(std::min(1.0, dfCurPixelCount / dfTotalPixelCount),
5972 : nullptr, pProgressData))
5973 : {
5974 1 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5975 1 : eErr = CE_Failure;
5976 : }
5977 :
5978 : // Iterate on destination overview, block by block.
5979 1509 : for (int nDstXOff = nDstXOffStart;
5980 3057 : nDstXOff < nDstXOffEnd && eErr == CE_None;
5981 1548 : nDstXOff += nDstChunkXSize)
5982 : {
5983 1548 : int nDstXCount = 0;
5984 1548 : if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
5985 1531 : nDstXCount = nDstChunkXSize;
5986 : else
5987 17 : nDstXCount = nDstXOffEnd - nDstXOff;
5988 :
5989 1548 : dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
5990 :
5991 1548 : int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
5992 1548 : int nChunkXOff2 = static_cast<int>(
5993 1548 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
5994 1548 : if (nChunkXOff2 > nSrcWidth ||
5995 1548 : nDstXOff + nDstXCount == nDstTotalWidth)
5996 1473 : nChunkXOff2 = nSrcWidth;
5997 1548 : const int nXCount = nChunkXOff2 - nChunkXOff;
5998 1548 : CPLAssert(nXCount <= nFullResXChunk);
5999 :
6000 1548 : int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
6001 1548 : int nChunkXSizeQueried =
6002 1548 : nXCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
6003 1548 : if (nChunkXOffQueried < 0)
6004 : {
6005 208 : nChunkXSizeQueried += nChunkXOffQueried;
6006 208 : nChunkXOffQueried = 0;
6007 : }
6008 1548 : if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
6009 217 : nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
6010 1548 : CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
6011 : #if DEBUG_VERBOSE
6012 : CPLDebug("GDAL",
6013 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
6014 : nChunkXOffQueried, nChunkYOffQueried,
6015 : nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
6016 : nDstYOff, nDstXCount, nDstYCount);
6017 : #endif
6018 :
6019 : // Avoid accumulating too many tasks and exhaust RAM
6020 :
6021 : // Try to complete already finished jobs
6022 1550 : while (eErr == CE_None && !jobList.empty())
6023 : {
6024 2 : auto poOldestJob = jobList.front().get();
6025 2 : if (!poOldestJob->IsFinished())
6026 0 : break;
6027 2 : eErr = poOldestJob->eErr;
6028 2 : if (eErr == CE_None)
6029 : {
6030 2 : eErr = WriteJobData(poOldestJob);
6031 : }
6032 :
6033 2 : jobList.pop_front();
6034 : }
6035 :
6036 : // And in case we have saturated the number of threads,
6037 : // wait for completion of tasks to go below the threshold.
6038 3096 : while (eErr == CE_None &&
6039 1548 : jobList.size() >= static_cast<size_t>(nThreads))
6040 : {
6041 0 : eErr = WaitAndFinalizeOldestJob(jobList);
6042 : }
6043 :
6044 : // Read the source buffers for all the bands.
6045 4859 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6046 : {
6047 : // (Re)allocate buffers if needed
6048 3311 : if (apaChunk[iBand] == nullptr)
6049 : {
6050 1179 : apaChunk[iBand].reset(VSI_MALLOC3_VERBOSE(
6051 : nFullResXChunkQueried, nFullResYChunkQueried,
6052 : nWrkDataTypeSize));
6053 1179 : if (apaChunk[iBand] == nullptr)
6054 : {
6055 0 : eErr = CE_Failure;
6056 : }
6057 : }
6058 3652 : if (bUseNoDataMask &&
6059 341 : apabyChunkNoDataMask[iBand] == nullptr)
6060 : {
6061 282 : apabyChunkNoDataMask[iBand].reset(
6062 282 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
6063 : nFullResXChunkQueried, nFullResYChunkQueried)));
6064 282 : if (apabyChunkNoDataMask[iBand] == nullptr)
6065 : {
6066 0 : eErr = CE_Failure;
6067 : }
6068 : }
6069 :
6070 3311 : if (eErr == CE_None)
6071 : {
6072 3311 : GDALRasterBand *poSrcBand = nullptr;
6073 3311 : if (iSrcOverview == -1)
6074 2409 : poSrcBand = papoSrcBands[iBand];
6075 : else
6076 902 : poSrcBand =
6077 902 : papapoOverviewBands[iBand][iSrcOverview];
6078 3311 : eErr = poSrcBand->RasterIO(
6079 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6080 : nChunkXSizeQueried, nChunkYSizeQueried,
6081 3311 : apaChunk[iBand].get(), nChunkXSizeQueried,
6082 : nChunkYSizeQueried, eWrkDataType, 0, 0, nullptr);
6083 :
6084 3311 : if (bUseNoDataMask && eErr == CE_None)
6085 : {
6086 341 : auto poMaskBand = poSrcBand->IsMaskBand()
6087 341 : ? poSrcBand
6088 262 : : poSrcBand->GetMaskBand();
6089 341 : eErr = poMaskBand->RasterIO(
6090 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6091 : nChunkXSizeQueried, nChunkYSizeQueried,
6092 341 : apabyChunkNoDataMask[iBand].get(),
6093 : nChunkXSizeQueried, nChunkYSizeQueried,
6094 : GDT_Byte, 0, 0, nullptr);
6095 : }
6096 : }
6097 : }
6098 :
6099 : // Compute the resulting overview block.
6100 4858 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6101 : {
6102 6620 : auto poJob = std::make_unique<OvrJob>();
6103 3310 : poJob->pfnResampleFn = pfnResampleFn;
6104 3310 : poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
6105 6620 : poJob->args.eOvrDataType =
6106 3310 : poJob->poDstBand->GetRasterDataType();
6107 3310 : poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
6108 3310 : poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
6109 3310 : const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
6110 3310 : "NBITS", "IMAGE_STRUCTURE");
6111 3310 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
6112 3310 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
6113 3310 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
6114 3310 : poJob->args.eWrkDataType = eWrkDataType;
6115 3310 : poJob->pChunk = apaChunk[iBand].get();
6116 3310 : poJob->args.pabyChunkNodataMask =
6117 3310 : apabyChunkNoDataMask[iBand].get();
6118 3310 : poJob->args.nChunkXOff = nChunkXOffQueried;
6119 3310 : poJob->args.nChunkXSize = nChunkXSizeQueried;
6120 3310 : poJob->args.nChunkYOff = nChunkYOffQueried;
6121 3310 : poJob->args.nChunkYSize = nChunkYSizeQueried;
6122 3310 : poJob->args.nDstXOff = nDstXOff;
6123 3310 : poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
6124 3310 : poJob->args.nDstYOff = nDstYOff;
6125 3310 : poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
6126 3310 : poJob->args.pszResampling = pszResampling;
6127 3310 : poJob->args.bHasNoData = abHasNoData[iBand];
6128 3310 : poJob->args.dfNoDataValue = adfNoDataValue[iBand];
6129 3310 : poJob->args.eSrcDataType = eDataType;
6130 3310 : poJob->args.bPropagateNoData = bPropagateNoData;
6131 :
6132 3310 : if (poJobQueue)
6133 : {
6134 32 : poJob->oSrcMaskBufferHolder.reset(new PointerHolder(
6135 16 : apabyChunkNoDataMask[iBand].release()));
6136 :
6137 32 : poJob->oSrcBufferHolder.reset(
6138 16 : new PointerHolder(apaChunk[iBand].release()));
6139 :
6140 16 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
6141 16 : jobList.emplace_back(std::move(poJob));
6142 : }
6143 : else
6144 : {
6145 3294 : JobResampleFunc(poJob.get());
6146 3294 : eErr = poJob->eErr;
6147 3294 : if (eErr == CE_None)
6148 : {
6149 3294 : eErr = WriteJobData(poJob.get());
6150 : }
6151 : }
6152 : }
6153 : }
6154 : }
6155 :
6156 : // Wait for all pending jobs to complete
6157 616 : while (!jobList.empty())
6158 : {
6159 14 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
6160 14 : if (l_eErr != CE_None && eErr == CE_None)
6161 0 : eErr = l_eErr;
6162 : }
6163 :
6164 : // Flush the data to overviews.
6165 1779 : for (int iBand = 0; iBand < nBands; ++iBand)
6166 : {
6167 1177 : if (papapoOverviewBands[iBand][iOverview]->FlushCache(false) !=
6168 : CE_None)
6169 0 : eErr = CE_Failure;
6170 : }
6171 : }
6172 :
6173 382 : if (eErr == CE_None)
6174 378 : pfnProgress(1.0, nullptr, pProgressData);
6175 :
6176 382 : return eErr;
6177 : }
6178 :
6179 : /************************************************************************/
6180 : /* GDALRegenerateOverviewsMultiBand() */
6181 : /************************************************************************/
6182 :
6183 : /**
6184 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
6185 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
6186 : *
6187 : * This function will generate one or more overview images from a base
6188 : * image using the requested downsampling algorithm. Its primary use
6189 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
6190 : * can also be used to generate downsampled images in one file from another
6191 : * outside the overview architecture.
6192 : *
6193 : * The output bands need to exist in advance and share the same characteristics
6194 : * (type, dimensions)
6195 : *
6196 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
6197 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
6198 : *
6199 : * It does not support color tables or complex data types.
6200 : *
6201 : * The pseudo-algorithm used by the function is :
6202 : * for each overview
6203 : * iterate on lines of the source by a step of deltay
6204 : * iterate on columns of the source by a step of deltax
6205 : * read the source data of size deltax * deltay for all the bands
6206 : * generate the corresponding overview block for all the bands
6207 : *
6208 : * This function will honour properly NODATA_VALUES tuples (special dataset
6209 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
6210 : * considered as the nodata value and not each value of the triplet
6211 : * independently per band.
6212 : *
6213 : * The GDAL_NUM_THREADS configuration option can be set
6214 : * to "ALL_CPUS" or a integer value to specify the number of threads to use for
6215 : * overview computation.
6216 : *
6217 : * @param apoSrcBands the list of source bands to downsample
6218 : * @param aapoOverviewBands bidimension array of bands. First dimension is
6219 : * indexed by bands. Second dimension is indexed by
6220 : * overview levels. All aapoOverviewBands[i] arrays
6221 : * must have the same size (i.e. same number of
6222 : * overviews)
6223 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
6224 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
6225 : * @param pfnProgress progress report function.
6226 : * @param pProgressData progress function callback data.
6227 : * @param papszOptions NULL terminated list of options as
6228 : * key=value pairs, or NULL
6229 : * The XOFF, YOFF, XSIZE and YSIZE
6230 : * options can be specified to express that overviews should
6231 : * be regenerated only in the specified subset of the source
6232 : * dataset.
6233 : * @return CE_None on success or CE_Failure on failure.
6234 : * @since 3.10
6235 : */
6236 :
6237 19 : CPLErr GDALRegenerateOverviewsMultiBand(
6238 : const std::vector<GDALRasterBand *> &apoSrcBands,
6239 : const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
6240 : const char *pszResampling, GDALProgressFunc pfnProgress,
6241 : void *pProgressData, CSLConstList papszOptions)
6242 : {
6243 19 : CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
6244 29 : for (size_t i = 1; i < aapoOverviewBands.size(); ++i)
6245 : {
6246 10 : CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());
6247 : }
6248 :
6249 19 : if (aapoOverviewBands.empty())
6250 0 : return CE_None;
6251 :
6252 19 : std::vector<GDALRasterBand **> apapoOverviewBands;
6253 48 : for (auto &apoOverviewBands : aapoOverviewBands)
6254 : {
6255 : auto papoOverviewBands = static_cast<GDALRasterBand **>(
6256 29 : CPLMalloc(apoOverviewBands.size() * sizeof(GDALRasterBand *)));
6257 61 : for (size_t i = 0; i < apoOverviewBands.size(); ++i)
6258 : {
6259 32 : papoOverviewBands[i] = apoOverviewBands[i];
6260 : }
6261 29 : apapoOverviewBands.push_back(papoOverviewBands);
6262 : }
6263 38 : const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
6264 19 : static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
6265 19 : static_cast<int>(aapoOverviewBands[0].size()),
6266 19 : apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
6267 : papszOptions);
6268 48 : for (GDALRasterBand **papoOverviewBands : apapoOverviewBands)
6269 29 : CPLFree(papoOverviewBands);
6270 19 : return eErr;
6271 : }
6272 :
6273 : /************************************************************************/
6274 : /* GDALComputeBandStats() */
6275 : /************************************************************************/
6276 :
6277 : /** Undocumented
6278 : * @param hSrcBand undocumented.
6279 : * @param nSampleStep Step between scanlines used to compute statistics.
6280 : * When nSampleStep is equal to 1, all scanlines will
6281 : * be processed.
6282 : * @param pdfMean undocumented.
6283 : * @param pdfStdDev undocumented.
6284 : * @param pfnProgress undocumented.
6285 : * @param pProgressData undocumented.
6286 : * @return undocumented
6287 : */
6288 18 : CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
6289 : int nSampleStep, double *pdfMean,
6290 : double *pdfStdDev,
6291 : GDALProgressFunc pfnProgress,
6292 : void *pProgressData)
6293 :
6294 : {
6295 18 : VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
6296 :
6297 18 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
6298 :
6299 18 : if (pfnProgress == nullptr)
6300 18 : pfnProgress = GDALDummyProgress;
6301 :
6302 18 : const int nWidth = poSrcBand->GetXSize();
6303 18 : const int nHeight = poSrcBand->GetYSize();
6304 :
6305 18 : if (nSampleStep >= nHeight || nSampleStep < 1)
6306 5 : nSampleStep = 1;
6307 :
6308 18 : GDALDataType eWrkType = GDT_Unknown;
6309 18 : float *pafData = nullptr;
6310 18 : GDALDataType eType = poSrcBand->GetRasterDataType();
6311 18 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6312 18 : if (bComplex)
6313 : {
6314 : pafData = static_cast<float *>(
6315 0 : VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6316 0 : eWrkType = GDT_CFloat32;
6317 : }
6318 : else
6319 : {
6320 : pafData =
6321 18 : static_cast<float *>(VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6322 18 : eWrkType = GDT_Float32;
6323 : }
6324 :
6325 18 : if (nWidth == 0 || pafData == nullptr)
6326 : {
6327 0 : VSIFree(pafData);
6328 0 : return CE_Failure;
6329 : }
6330 :
6331 : /* -------------------------------------------------------------------- */
6332 : /* Loop over all sample lines. */
6333 : /* -------------------------------------------------------------------- */
6334 18 : double dfSum = 0.0;
6335 18 : double dfSum2 = 0.0;
6336 18 : int iLine = 0;
6337 18 : GIntBig nSamples = 0;
6338 :
6339 2143 : do
6340 : {
6341 2161 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6342 : pProgressData))
6343 : {
6344 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6345 0 : CPLFree(pafData);
6346 0 : return CE_Failure;
6347 : }
6348 :
6349 : const CPLErr eErr =
6350 2161 : poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
6351 : 1, eWrkType, 0, 0, nullptr);
6352 2161 : if (eErr != CE_None)
6353 : {
6354 1 : CPLFree(pafData);
6355 1 : return eErr;
6356 : }
6357 :
6358 725208 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6359 : {
6360 723048 : float fValue = 0.0f;
6361 :
6362 723048 : if (bComplex)
6363 : {
6364 : // Compute the magnitude of the complex value.
6365 : fValue =
6366 0 : std::hypot(pafData[static_cast<size_t>(iPixel) * 2],
6367 0 : pafData[static_cast<size_t>(iPixel) * 2 + 1]);
6368 : }
6369 : else
6370 : {
6371 723048 : fValue = pafData[iPixel];
6372 : }
6373 :
6374 723048 : dfSum += fValue;
6375 723048 : dfSum2 += static_cast<double>(fValue) * fValue;
6376 : }
6377 :
6378 2160 : nSamples += nWidth;
6379 2160 : iLine += nSampleStep;
6380 2160 : } while (iLine < nHeight);
6381 :
6382 17 : if (!pfnProgress(1.0, nullptr, pProgressData))
6383 : {
6384 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6385 0 : CPLFree(pafData);
6386 0 : return CE_Failure;
6387 : }
6388 :
6389 : /* -------------------------------------------------------------------- */
6390 : /* Produce the result values. */
6391 : /* -------------------------------------------------------------------- */
6392 17 : if (pdfMean != nullptr)
6393 17 : *pdfMean = dfSum / nSamples;
6394 :
6395 17 : if (pdfStdDev != nullptr)
6396 : {
6397 17 : const double dfMean = dfSum / nSamples;
6398 :
6399 17 : *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
6400 : }
6401 :
6402 17 : CPLFree(pafData);
6403 :
6404 17 : return CE_None;
6405 : }
6406 :
6407 : /************************************************************************/
6408 : /* GDALOverviewMagnitudeCorrection() */
6409 : /* */
6410 : /* Correct the mean and standard deviation of the overviews of */
6411 : /* the given band to match the base layer approximately. */
6412 : /************************************************************************/
6413 :
6414 : /** Undocumented
6415 : * @param hBaseBand undocumented.
6416 : * @param nOverviewCount undocumented.
6417 : * @param pahOverviews undocumented.
6418 : * @param pfnProgress undocumented.
6419 : * @param pProgressData undocumented.
6420 : * @return undocumented
6421 : */
6422 0 : CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
6423 : int nOverviewCount,
6424 : GDALRasterBandH *pahOverviews,
6425 : GDALProgressFunc pfnProgress,
6426 : void *pProgressData)
6427 :
6428 : {
6429 0 : VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
6430 :
6431 : /* -------------------------------------------------------------------- */
6432 : /* Compute mean/stddev for source raster. */
6433 : /* -------------------------------------------------------------------- */
6434 0 : double dfOrigMean = 0.0;
6435 0 : double dfOrigStdDev = 0.0;
6436 : {
6437 : const CPLErr eErr =
6438 0 : GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
6439 : pfnProgress, pProgressData);
6440 :
6441 0 : if (eErr != CE_None)
6442 0 : return eErr;
6443 : }
6444 :
6445 : /* -------------------------------------------------------------------- */
6446 : /* Loop on overview bands. */
6447 : /* -------------------------------------------------------------------- */
6448 0 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
6449 : {
6450 : GDALRasterBand *poOverview =
6451 0 : GDALRasterBand::FromHandle(pahOverviews[iOverview]);
6452 : double dfOverviewMean, dfOverviewStdDev;
6453 :
6454 : const CPLErr eErr =
6455 0 : GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
6456 : &dfOverviewStdDev, pfnProgress, pProgressData);
6457 :
6458 0 : if (eErr != CE_None)
6459 0 : return eErr;
6460 :
6461 0 : double dfGain = 1.0;
6462 0 : if (dfOrigStdDev >= 0.0001)
6463 0 : dfGain = dfOrigStdDev / dfOverviewStdDev;
6464 :
6465 : /* --------------------------------------------------------------------
6466 : */
6467 : /* Apply gain and offset. */
6468 : /* --------------------------------------------------------------------
6469 : */
6470 0 : const int nWidth = poOverview->GetXSize();
6471 0 : const int nHeight = poOverview->GetYSize();
6472 :
6473 0 : GDALDataType eWrkType = GDT_Unknown;
6474 0 : float *pafData = nullptr;
6475 0 : const GDALDataType eType = poOverview->GetRasterDataType();
6476 0 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6477 0 : if (bComplex)
6478 : {
6479 : pafData = static_cast<float *>(
6480 0 : VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6481 0 : eWrkType = GDT_CFloat32;
6482 : }
6483 : else
6484 : {
6485 : pafData = static_cast<float *>(
6486 0 : VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6487 0 : eWrkType = GDT_Float32;
6488 : }
6489 :
6490 0 : if (pafData == nullptr)
6491 : {
6492 0 : return CE_Failure;
6493 : }
6494 :
6495 0 : for (int iLine = 0; iLine < nHeight; ++iLine)
6496 : {
6497 0 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6498 : pProgressData))
6499 : {
6500 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6501 0 : CPLFree(pafData);
6502 0 : return CE_Failure;
6503 : }
6504 :
6505 0 : if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
6506 : nWidth, 1, eWrkType, 0, 0,
6507 0 : nullptr) != CE_None)
6508 : {
6509 0 : CPLFree(pafData);
6510 0 : return CE_Failure;
6511 : }
6512 :
6513 0 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6514 : {
6515 0 : if (bComplex)
6516 : {
6517 0 : pafData[static_cast<size_t>(iPixel) * 2] *=
6518 0 : static_cast<float>(dfGain);
6519 0 : pafData[static_cast<size_t>(iPixel) * 2 + 1] *=
6520 0 : static_cast<float>(dfGain);
6521 : }
6522 : else
6523 : {
6524 0 : pafData[iPixel] = static_cast<float>(
6525 0 : (pafData[iPixel] - dfOverviewMean) * dfGain +
6526 : dfOrigMean);
6527 : }
6528 : }
6529 :
6530 0 : if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
6531 : nWidth, 1, eWrkType, 0, 0,
6532 0 : nullptr) != CE_None)
6533 : {
6534 0 : CPLFree(pafData);
6535 0 : return CE_Failure;
6536 : }
6537 : }
6538 :
6539 0 : if (!pfnProgress(1.0, nullptr, pProgressData))
6540 : {
6541 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6542 0 : CPLFree(pafData);
6543 0 : return CE_Failure;
6544 : }
6545 :
6546 0 : CPLFree(pafData);
6547 : }
6548 :
6549 0 : return CE_None;
6550 : }
|