Line data Source code
1 :
2 : /******************************************************************************
3 : *
4 : * Project: GDAL Core
5 : * Purpose: Helper code to implement overview support in different drivers.
6 : * Author: Frank Warmerdam, warmerdam@pobox.com
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 2000, Frank Warmerdam
10 : * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * SPDX-License-Identifier: MIT
13 : ****************************************************************************/
14 :
15 : #include "cpl_port.h"
16 : #include "gdal_priv.h"
17 :
18 : #include <cmath>
19 : #include <cstddef>
20 : #include <cstdlib>
21 :
22 : #include <algorithm>
23 : #include <complex>
24 : #include <condition_variable>
25 : #include <limits>
26 : #include <list>
27 : #include <memory>
28 : #include <mutex>
29 : #include <vector>
30 :
31 : #include "cpl_conv.h"
32 : #include "cpl_error.h"
33 : #include "cpl_float.h"
34 : #include "cpl_progress.h"
35 : #include "cpl_vsi.h"
36 : #include "gdal.h"
37 : #include "gdal_thread_pool.h"
38 : #include "gdalwarper.h"
39 :
40 : #ifdef USE_NEON_OPTIMIZATIONS
41 : #include "include_sse2neon.h"
42 : #define USE_SSE2
43 :
44 : #include "gdalsse_priv.h"
45 :
46 : // Restrict to 64bit processors because they are guaranteed to have SSE2,
47 : // or if __AVX2__ is defined.
48 : #elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
49 : #define USE_SSE2
50 :
51 : #include "gdalsse_priv.h"
52 :
53 : #ifdef __SSE3__
54 : #include <pmmintrin.h>
55 : #endif
56 : #ifdef __SSSE3__
57 : #include <tmmintrin.h>
58 : #endif
59 : #ifdef __SSE4_1__
60 : #include <smmintrin.h>
61 : #endif
62 : #ifdef __AVX2__
63 : #include <immintrin.h>
64 : #endif
65 :
66 : #endif
67 :
68 : // To be included after above USE_SSE2 and include gdalsse_priv.h
69 : // to avoid build issue on Windows x86
70 : #include "gdal_priv_templates.hpp"
71 :
72 : /************************************************************************/
73 : /* GDALResampleChunk_Near() */
74 : /************************************************************************/
75 :
76 : template <class T>
77 6077 : static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
78 : const T *pChunk, T **ppDstBuffer)
79 :
80 : {
81 6077 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
82 6077 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
83 6077 : const GDALDataType eWrkDataType = args.eWrkDataType;
84 6077 : const int nChunkXOff = args.nChunkXOff;
85 6077 : const int nChunkXSize = args.nChunkXSize;
86 6077 : const int nChunkYOff = args.nChunkYOff;
87 6077 : const int nDstXOff = args.nDstXOff;
88 6077 : const int nDstXOff2 = args.nDstXOff2;
89 6077 : const int nDstYOff = args.nDstYOff;
90 6077 : const int nDstYOff2 = args.nDstYOff2;
91 6077 : const int nDstXWidth = nDstXOff2 - nDstXOff;
92 :
93 : /* -------------------------------------------------------------------- */
94 : /* Allocate buffers. */
95 : /* -------------------------------------------------------------------- */
96 6077 : *ppDstBuffer = static_cast<T *>(
97 6077 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
98 : GDALGetDataTypeSizeBytes(eWrkDataType)));
99 6077 : if (*ppDstBuffer == nullptr)
100 : {
101 0 : return CE_Failure;
102 : }
103 6077 : T *const pDstBuffer = *ppDstBuffer;
104 :
105 : int *panSrcXOff =
106 6077 : static_cast<int *>(VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(int)));
107 :
108 6077 : if (panSrcXOff == nullptr)
109 : {
110 0 : VSIFree(panSrcXOff);
111 0 : return CE_Failure;
112 : }
113 :
114 : /* ==================================================================== */
115 : /* Precompute inner loop constants. */
116 : /* ==================================================================== */
117 586452 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
118 : {
119 580375 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
120 580375 : if (nSrcXOff < nChunkXOff)
121 0 : nSrcXOff = nChunkXOff;
122 :
123 580375 : panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
124 : }
125 :
126 : /* ==================================================================== */
127 : /* Loop over destination scanlines. */
128 : /* ==================================================================== */
129 216573 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
130 : {
131 210496 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
132 210496 : if (nSrcYOff < nChunkYOff)
133 0 : nSrcYOff = nChunkYOff;
134 :
135 210496 : const T *const pSrcScanline =
136 : pChunk +
137 210496 : (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
138 208026 : nChunkXOff;
139 :
140 : /* --------------------------------------------------------------------
141 : */
142 : /* Loop over destination pixels */
143 : /* --------------------------------------------------------------------
144 : */
145 210496 : T *pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
146 119221034 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
147 : {
148 119010564 : pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
149 : }
150 : }
151 :
152 6077 : CPLFree(panSrcXOff);
153 :
154 6077 : return CE_None;
155 : }
156 :
157 6077 : static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
158 : const void *pChunk, void **ppDstBuffer,
159 : GDALDataType *peDstBufferDataType)
160 : {
161 6077 : *peDstBufferDataType = args.eWrkDataType;
162 6077 : switch (args.eWrkDataType)
163 : {
164 : // For nearest resampling, as no computation is done, only the
165 : // size of the data type matters.
166 5949 : case GDT_Byte:
167 : case GDT_Int8:
168 : {
169 5949 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 1);
170 5949 : return GDALResampleChunk_NearT(
171 : args, static_cast<const uint8_t *>(pChunk),
172 5949 : reinterpret_cast<uint8_t **>(ppDstBuffer));
173 : }
174 :
175 26 : case GDT_Int16:
176 : case GDT_UInt16:
177 : case GDT_Float16:
178 : {
179 26 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
180 26 : return GDALResampleChunk_NearT(
181 : args, static_cast<const uint16_t *>(pChunk),
182 26 : reinterpret_cast<uint16_t **>(ppDstBuffer));
183 : }
184 :
185 55 : case GDT_CInt16:
186 : case GDT_CFloat16:
187 : case GDT_Int32:
188 : case GDT_UInt32:
189 : case GDT_Float32:
190 : {
191 55 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
192 55 : return GDALResampleChunk_NearT(
193 : args, static_cast<const uint32_t *>(pChunk),
194 55 : reinterpret_cast<uint32_t **>(ppDstBuffer));
195 : }
196 :
197 43 : case GDT_CInt32:
198 : case GDT_CFloat32:
199 : case GDT_Int64:
200 : case GDT_UInt64:
201 : case GDT_Float64:
202 : {
203 43 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
204 43 : return GDALResampleChunk_NearT(
205 : args, static_cast<const uint64_t *>(pChunk),
206 43 : reinterpret_cast<uint64_t **>(ppDstBuffer));
207 : }
208 :
209 4 : case GDT_CFloat64:
210 : {
211 4 : return GDALResampleChunk_NearT(
212 : args, static_cast<const std::complex<double> *>(pChunk),
213 4 : reinterpret_cast<std::complex<double> **>(ppDstBuffer));
214 : }
215 :
216 0 : case GDT_Unknown:
217 : case GDT_TypeCount:
218 0 : break;
219 : }
220 0 : CPLAssert(false);
221 : return CE_Failure;
222 : }
223 :
224 : namespace
225 : {
226 :
227 : // Find in the color table the entry whose RGB value is the closest
228 : // (using quadratic distance) to the test color, ignoring transparent entries.
229 3837 : int BestColorEntry(const std::vector<GDALColorEntry> &entries,
230 : const GDALColorEntry &test)
231 : {
232 3837 : int nMinDist = std::numeric_limits<int>::max();
233 3837 : size_t bestEntry = 0;
234 986109 : for (size_t i = 0; i < entries.size(); ++i)
235 : {
236 982272 : const GDALColorEntry &entry = entries[i];
237 : // Ignore transparent entries
238 982272 : if (entry.c4 == 0)
239 3237 : continue;
240 :
241 979035 : int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
242 979035 : ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
243 979035 : ((test.c3 - entry.c3) * (test.c3 - entry.c3));
244 979035 : if (nDist < nMinDist)
245 : {
246 15847 : nMinDist = nDist;
247 15847 : bestEntry = i;
248 : }
249 : }
250 3837 : return static_cast<int>(bestEntry);
251 : }
252 :
253 7 : std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
254 : int &transparentIdx)
255 : {
256 7 : std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
257 :
258 7 : transparentIdx = -1;
259 7 : int i = 0;
260 1799 : for (auto &entry : entries)
261 : {
262 1792 : table.GetColorEntryAsRGB(i, &entry);
263 1792 : if (transparentIdx < 0 && entry.c4 == 0)
264 1 : transparentIdx = i;
265 1792 : ++i;
266 : }
267 7 : return entries;
268 : }
269 :
270 : } // unnamed namespace
271 :
272 : /************************************************************************/
273 : /* SQUARE() */
274 : /************************************************************************/
275 :
// Return val * val. One operand is converted to Tsquare before the
// multiplication, so a wider Tsquare can be requested to avoid overflowing T.
template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
{
    const Tsquare widened = static_cast<Tsquare>(val);
    return widened * val;
}
280 :
281 : /************************************************************************/
282 : /* ComputeIntegerRMS() */
283 : /************************************************************************/
284 : // Compute rms = sqrt(sumSquares / weight) in such a way that it is the
285 : // integer that minimizes abs(rms**2 - sumSquares / weight)
// T is the integer result type; Twork is the integer type in which
// 2 * rms * (rms + 1) + 1 is evaluated, so it must be wide enough to hold
// that value.
// sumSquares is the weighted sum of squared values and weight the sum of
// weights; the result is floor(sqrt(sumSquares / weight)), incremented by one
// when the next integer has a square closer to sumSquares / weight.
template <class T, class Twork>
inline T ComputeIntegerRMS(double sumSquares, double weight)
{
    const double sumDivWeight = sumSquares / weight;
    // std::sqrt rather than unqualified sqrt: <cmath> only guarantees the
    // std:: overloads, and this matches the other RMS helpers of this file.
    T rms = static_cast<T>(std::sqrt(sumDivWeight));

    // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
    // Naive version:
    // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
    // which, divided by weight and rearranged, is
    // 2 * rms * (rms + 1) + 1 < 2 * sumSquares / weight
    if (static_cast<double>(static_cast<Twork>(2) * rms * (rms + 1) + 1) <
        2 * sumDivWeight)
        rms += 1;
    return rms;
}
300 :
301 0 : template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
302 : {
303 0 : CPLAssert(false);
304 : return 0;
305 : }
306 :
307 24 : template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
308 : {
309 : // It has been verified that given the correction on rms below, using
310 : // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
311 : // is equivalent, so use the former as it is used twice.
312 24 : const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
313 24 : const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
314 24 : GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
315 :
316 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
317 : // Naive version:
318 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
319 : // Optimized version for integer case and weight == 4
320 24 : if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
321 5 : rms += 1;
322 24 : return rms;
323 : }
324 :
325 : template <>
326 20 : inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
327 : {
328 20 : const double sumDivWeight = sumSquares * 0.25;
329 20 : GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
330 :
331 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
332 : // Naive version:
333 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
334 : // Optimized version for integer case and weight == 4
335 20 : if (static_cast<GUInt32>(rms) * (rms + 1) <
336 20 : static_cast<GUInt32>(sumDivWeight + 0.25))
337 4 : rms += 1;
338 20 : return rms;
339 : }
340 :
341 : #ifdef USE_SSE2
342 :
343 : /************************************************************************/
344 : /* QuadraticMeanByteSSE2OrAVX2() */
345 : /************************************************************************/
346 :
347 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
348 : #define sse2_packus_epi32 _mm_packus_epi32
349 : #else
350 516119 : inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
351 : {
352 516119 : const auto minus32768_32 = _mm_set1_epi32(-32768);
353 516119 : const auto minus32768_16 = _mm_set1_epi16(-32768);
354 516119 : a = _mm_add_epi32(a, minus32768_32);
355 516119 : b = _mm_add_epi32(b, minus32768_32);
356 516119 : a = _mm_packs_epi32(a, b);
357 516119 : a = _mm_sub_epi16(a, minus32768_16);
358 516119 : return a;
359 : }
360 : #endif
361 :
362 : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
363 : #define sse2_hadd_epi16 _mm_hadd_epi16
364 : #else
365 4660830 : inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
366 : {
367 : // Horizontal addition of adjacent pairs
368 4660830 : const auto mask = _mm_set1_epi32(0xFFFF);
369 : const auto horizLo =
370 13982500 : _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
371 : const auto horizHi =
372 13982500 : _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
373 :
374 : // Recombine low and high parts
375 4660830 : return _mm_packs_epi32(horizLo, horizHi);
376 : }
377 : #endif
378 :
// Width-neutral aliases for the intrinsics used by the Byte kernels below.
// With __AVX2__ they map to the 256-bit _mm256_* intrinsics, otherwise to
// the 128-bit _mm_* ones (or to the sse2_* stand-ins defined above).
// DEST_ELTS is the number of destination pixels those kernels produce per
// loop iteration; they consume 2 * DEST_ELTS source bytes per line.
379 : #ifdef __AVX2__
380 :
381 : #define DEST_ELTS 16
382 : #define set1_epi16 _mm256_set1_epi16
383 : #define set1_epi32 _mm256_set1_epi32
384 : #define setzero _mm256_setzero_si256
385 : #define set1_ps _mm256_set1_ps
386 : #define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
387 : #define unpacklo_epi8 _mm256_unpacklo_epi8
388 : #define unpackhi_epi8 _mm256_unpackhi_epi8
389 : #define madd_epi16 _mm256_madd_epi16
390 : #define add_epi32 _mm256_add_epi32
391 : #define mul_ps _mm256_mul_ps
392 : #define cvtepi32_ps _mm256_cvtepi32_ps
393 : #define sqrt_ps _mm256_sqrt_ps
394 : #define cvttps_epi32 _mm256_cvttps_epi32
395 : #define packs_epi32 _mm256_packs_epi32
396 : #define packus_epi32 _mm256_packus_epi32
397 : #define srli_epi32 _mm256_srli_epi32
398 : #define mullo_epi16 _mm256_mullo_epi16
399 : #define srli_epi16 _mm256_srli_epi16
400 : #define cmpgt_epi16 _mm256_cmpgt_epi16
401 : #define add_epi16 _mm256_add_epi16
402 : #define sub_epi16 _mm256_sub_epi16
403 : #define packus_epi16 _mm256_packus_epi16
404 : /* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
405 : /* to get the lower 128-bit bits of what would be a true 256-bit vector register
406 : */
407 : #define store_lo(x, y) \
408 : _mm_storeu_si128(reinterpret_cast<__m128i *>(x), \
409 : _mm256_extracti128_si256( \
410 : _mm256_permute4x64_epi64((y), 0 | (2 << 2)), 0))
411 : #define hadd_epi16 _mm256_hadd_epi16
412 : #define zeroupper() _mm256_zeroupper()
413 : #else
// 128-bit flavour: store_lo() writes the low 64 bits of the register and
// zeroupper() expands to a no-op.
414 : #define DEST_ELTS 8
415 : #define set1_epi16 _mm_set1_epi16
416 : #define set1_epi32 _mm_set1_epi32
417 : #define setzero _mm_setzero_si128
418 : #define set1_ps _mm_set1_ps
419 : #define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
420 : #define unpacklo_epi8 _mm_unpacklo_epi8
421 : #define unpackhi_epi8 _mm_unpackhi_epi8
422 : #define madd_epi16 _mm_madd_epi16
423 : #define add_epi32 _mm_add_epi32
424 : #define mul_ps _mm_mul_ps
425 : #define cvtepi32_ps _mm_cvtepi32_ps
426 : #define sqrt_ps _mm_sqrt_ps
427 : #define cvttps_epi32 _mm_cvttps_epi32
428 : #define packs_epi32 _mm_packs_epi32
429 : #define packus_epi32 sse2_packus_epi32
430 : #define srli_epi32 _mm_srli_epi32
431 : #define mullo_epi16 _mm_mullo_epi16
432 : #define srli_epi16 _mm_srli_epi16
433 : #define cmpgt_epi16 _mm_cmpgt_epi16
434 : #define add_epi16 _mm_add_epi16
435 : #define sub_epi16 _mm_sub_epi16
436 : #define packus_epi16 _mm_packus_epi16
437 : #define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
438 : #define hadd_epi16 sse2_hadd_epi16
439 : #define zeroupper() (void)0
440 : #endif
441 :
442 : #if defined(__GNUC__) && defined(__AVX2__)
443 : // Disabling inlining works around a bug with gcc 9.3 (Ubuntu 20.04) in
444 : // -O2 -mavx2 mode in QuadraticMeanFloatSSE2(),
445 : // where the register that contains minus_zero is correctly
446 : // loaded the first time the function is called (looking at the disassembly,
447 : // one sees it is loaded much earlier than the function), but gets corrupted
448 : // (zeroed) in following iterations.
449 : // It appears the bug is due to the explicit zeroupper() call at the end of
450 : // the function.
451 : // The bug is at least solved in gcc 10.2.
452 : // Inlining doesn't bring much here to performance.
453 : // This is also needed with gcc 9.3 on QuadraticMeanByteSSE2OrAVX2() in
454 : // -O3 -mavx2 mode
455 : #define NOINLINE __attribute__((noinline))
456 : #else
457 : #define NOINLINE
458 : #endif
459 :
// Vectorized quadratic mean (RMS) of 2x2 blocks of byte samples.
//
// Each destination pixel is computed from two horizontally adjacent samples
// of the line at pSrcScanlineShiftedInOut and the two samples at the same
// columns nChunkXSize elements further (the next line).
//
// nDstXWidth: number of destination pixels wanted.
// nChunkXSize: distance, in elements, between the two source lines.
// pSrcScanlineShiftedInOut: first source sample to read; on return it has
//     been advanced by 2 source samples per destination pixel written.
// pDstScanline: destination line.
// Returns the number of destination pixels written, a multiple of DEST_ELTS
// that may be smaller than nDstXWidth.
// NOTE(review): the loads read raw bytes, so T is presumably a 1-byte type;
// the leftover pixels are presumably handled by the caller - confirm.
460 : template <class T>
461 : static int NOINLINE
462 5385 : QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
463 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
464 : T *CPL_RESTRICT pDstScanline)
465 : {
466 : // Optimized implementation for RMS on Byte by
467 : // processing by group of DEST_ELTS output pixels (8 with SSE2, 16 with
468 : // AVX2), so that each sqrt_ps() call covers 4 (SSE2) or 8 (AVX2) of them
469 5385 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
470 :
471 5385 : int iDstPixel = 0;
472 5385 : const auto one16 = set1_epi16(1);
473 5385 : const auto one32 = set1_epi32(1);
474 5385 : const auto zero = setzero();
475 5385 : const auto minus32768 = set1_epi16(-32768);
476 :
// Stops as soon as fewer than DEST_ELTS destination pixels remain.
477 521496 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
478 : {
479 : // Load 2 * DEST_ELTS bytes from each line
480 516111 : auto firstLine = loadu_int(pSrcScanlineShifted);
481 1032220 : auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
482 : // Extend those Bytes as UInt16s
483 516111 : auto firstLineLo = unpacklo_epi8(firstLine, zero);
484 516111 : auto firstLineHi = unpackhi_epi8(firstLine, zero);
485 516111 : auto secondLineLo = unpacklo_epi8(secondLine, zero);
486 516111 : auto secondLineHi = unpackhi_epi8(secondLine, zero);
487 :
488 : // Multiplication of 16 bit values and horizontal
489 : // addition of 32 bit results
490 : // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
491 516111 : firstLineLo = madd_epi16(firstLineLo, firstLineLo);
492 516111 : firstLineHi = madd_epi16(firstLineHi, firstLineHi);
493 516111 : secondLineLo = madd_epi16(secondLineLo, secondLineLo);
494 516111 : secondLineHi = madd_epi16(secondLineHi, secondLineHi);
495 :
496 : // Vertical addition
497 516111 : const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
498 516111 : const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
499 :
// (sumSquares + 1) / 4, computed with a logical right shift by 2
500 : const auto sumSquaresPlusOneDiv4Lo =
501 1032220 : srli_epi32(add_epi32(sumSquaresLo, one32), 2);
502 : const auto sumSquaresPlusOneDiv4Hi =
503 1032220 : srli_epi32(add_epi32(sumSquaresHi, one32), 2);
504 :
505 : // Take square root and truncate/floor to int32
506 : const auto rmsLo =
507 1548330 : cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
508 : const auto rmsHi =
509 1548330 : cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
510 :
511 : // Merge back low and high registers with each RMS value
512 : // as a 16 bit value.
513 516111 : auto rms = packs_epi32(rmsLo, rmsHi);
514 :
515 : // Round to upper value if it minimizes the
516 : // error |rms^2 - sumSquares/4|
517 : // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
518 : // rms += 1;
519 : // which is equivalent to:
520 : // if( rms * (rms + 1) < (sumSquares+1) / 4 )
521 : // rms += 1;
522 : // And both left and right parts fit on 16 (unsigned) bits
523 : const auto sumSquaresPlusOneDiv4 =
524 516111 : packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
525 : // cmpgt_epi16 operates on signed int16, but here
526 : // we have unsigned values, so shift them by -32768 before
527 2580560 : auto mask = cmpgt_epi16(
528 : add_epi16(sumSquaresPlusOneDiv4, minus32768),
529 : add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
530 : // The value of the mask will be -1 when the correction needs to be
531 : // applied
// Subtracting -1 adds 1 to the lanes selected by the mask.
532 516111 : rms = sub_epi16(rms, mask);
533 :
534 : // Pack each 16 bit RMS value to 8 bits
535 516111 : rms = packus_epi16(rms, rms /* could be anything */);
536 516111 : store_lo(&pDstScanline[iDstPixel], rms);
537 516111 : pSrcScanlineShifted += 2 * DEST_ELTS;
538 : }
539 : zeroupper();
540 :
// Hand the advanced source position back to the caller.
541 5385 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
542 5385 : return iDstPixel;
543 : }
544 :
545 : /************************************************************************/
546 : /* AverageByteSSE2OrAVX2() */
547 : /************************************************************************/
548 :
549 : template <class T>
550 : static int
551 111021 : AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
552 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
553 : T *CPL_RESTRICT pDstScanline)
554 : {
555 : // Optimized implementation for average on Byte by
556 : // processing by group of 8 output pixels.
557 :
558 111021 : const auto zero = setzero();
559 111021 : const auto two16 = set1_epi16(2);
560 111021 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
561 :
562 111021 : int iDstPixel = 0;
563 4771860 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
564 : {
565 : // Load 2 * DEST_ELTS bytes from each line
566 4660830 : const auto firstLine = loadu_int(pSrcScanlineShifted);
567 9321670 : const auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
568 : // Extend those Bytes as UInt16s
569 4660830 : const auto firstLineLo = unpacklo_epi8(firstLine, zero);
570 4660830 : const auto firstLineHi = unpackhi_epi8(firstLine, zero);
571 4660830 : const auto secondLineLo = unpacklo_epi8(secondLine, zero);
572 4660830 : const auto secondLineHi = unpackhi_epi8(secondLine, zero);
573 :
574 : // Vertical addition
575 4660830 : const auto sumLo = add_epi16(firstLineLo, secondLineLo);
576 4660830 : const auto sumHi = add_epi16(firstLineHi, secondLineHi);
577 :
578 : // Horizontal addition of adjacent pairs, and recombine low and high
579 : // parts
580 4660830 : const auto sum = hadd_epi16(sumLo, sumHi);
581 :
582 : // average = (sum + 2) / 4
583 9321670 : auto average = srli_epi16(add_epi16(sum, two16), 2);
584 :
585 : // Pack each 16 bit average value to 8 bits
586 4660830 : average = packus_epi16(average, average /* could be anything */);
587 4660830 : store_lo(&pDstScanline[iDstPixel], average);
588 4660830 : pSrcScanlineShifted += 2 * DEST_ELTS;
589 : }
590 : zeroupper();
591 :
592 111021 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
593 111021 : return iDstPixel;
594 : }
595 :
596 : /************************************************************************/
597 : /* QuadraticMeanUInt16SSE2() */
598 : /************************************************************************/
599 :
600 : #ifdef __SSE3__
601 : #define sse2_hadd_pd _mm_hadd_pd
602 : #else
603 8 : inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
604 : {
605 : auto aLo_bLo =
606 32 : _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
607 : auto aHi_bHi =
608 32 : _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
609 8 : return _mm_add_pd(aLo_bLo, aHi_bHi); // (aLo + aHi, bLo + bHi)
610 : }
611 : #endif
612 :
613 40 : inline __m128d SQUARE(__m128d x)
614 : {
615 40 : return _mm_mul_pd(x, x);
616 : }
617 :
618 : #ifdef __AVX2__
619 :
620 : inline __m256d SQUARE(__m256d x)
621 : {
622 : return _mm256_mul_pd(x, x);
623 : }
624 :
625 : inline __m256d FIXUP_LANES(__m256d x)
626 : {
627 : return _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
628 : }
629 :
630 : inline __m256 FIXUP_LANES(__m256 x)
631 : {
632 : return _mm256_castpd_ps(FIXUP_LANES(_mm256_castps_pd(x)));
633 : }
634 :
635 : #endif
636 :
// Vectorized quadratic mean (RMS) of 2x2 blocks of 16-bit samples.
//
// Each destination pixel is computed from two horizontally adjacent samples
// of the line at pSrcScanlineShiftedInOut and the two samples at the same
// columns nChunkXSize elements further. 4 destination pixels are produced
// per iteration, from 8 source samples of each line.
//
// Two code paths exist per iteration: an integer/float one used when all 16
// loaded samples are below 2^14, and a double precision one otherwise.
//
// nDstXWidth: number of destination pixels wanted.
// nChunkXSize: distance, in elements, between the two source lines.
// pSrcScanlineShiftedInOut: first source sample to read; on return it has
//     been advanced by 8 samples per iteration done.
// pDstScanline: destination line.
// Returns the number of destination pixels written, a multiple of 4 that
// may be smaller than nDstXWidth.
// NOTE(review): the loads read 8 x 16 bits, so T is presumably a 2-byte
// unsigned type; leftover pixels are presumably handled by the caller -
// confirm.
637 : template <class T>
638 : static int
639 10 : QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
640 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
641 : T *CPL_RESTRICT pDstScanline)
642 : {
643 : // Optimized implementation for RMS on UInt16 by
644 : // processing by group of 4 output pixels.
645 10 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
646 :
647 10 : int iDstPixel = 0;
648 10 : const auto zero = _mm_setzero_si128();
649 :
650 : #ifdef __AVX2__
651 : const auto zeroDot25 = _mm256_set1_pd(0.25);
652 : const auto zeroDot5 = _mm256_set1_pd(0.5);
653 :
654 : // The first four 0's could be anything, as we only take the bottom
655 : // 128 bits.
656 : const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
657 : #else
658 10 : const auto zeroDot25 = _mm_set1_pd(0.25);
659 10 : const auto zeroDot5 = _mm_set1_pd(0.5);
660 : #endif
661 :
// Stops as soon as fewer than 4 destination pixels remain.
662 40 : for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
663 : {
664 : // Load 8 UInt16 from each line
665 30 : const auto firstLine = _mm_loadu_si128(
666 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
667 : const auto secondLine =
668 30 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
669 30 : pSrcScanlineShifted + nChunkXSize));
670 :
671 : // Detect if all of the source values fit in 14 bits.
672 : // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
673 : // and we can do a much faster implementation.
// OR-ing both lines and keeping bits 14-15 of each sample gives a value
// that is zero only if no sample has one of those bits set.
674 : const auto maskTmp =
675 60 : _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
676 : #if defined(__i386__) || defined(_M_IX86)
// 32-bit x86 builds fetch the low 64 bits through a store rather than
// with _mm_cvtsi128_si64() (presumably unavailable there - confirm).
677 : uint64_t nMaskFitsIn14Bits = 0;
678 : _mm_storel_epi64(
679 : reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
680 : _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
681 : #else
682 30 : const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
683 : _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
684 : #endif
685 30 : if (nMaskFitsIn14Bits == 0)
686 : {
687 : // Multiplication of 16 bit values and horizontal
688 : // addition of 32 bit results
689 : const auto firstLineHSumSquare =
690 26 : _mm_madd_epi16(firstLine, firstLine);
691 : const auto secondLineHSumSquare =
692 26 : _mm_madd_epi16(secondLine, secondLine);
693 : // Vertical addition
694 : const auto sumSquares =
695 26 : _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
696 : // In theory we should take sqrt(sumSquares * 0.25f)
697 : // but given the rounding we do, this is equivalent to
698 : // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
699 : // sumSquares <= 4 * 16383^2
700 26 : const auto one32 = _mm_set1_epi32(1);
701 : const auto sumSquaresPlusOneDiv4 =
702 52 : _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
703 : // Take square root and truncate/floor to int32
704 78 : auto rms = _mm_cvttps_epi32(
705 : _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
706 :
707 : // Round to upper value if it minimizes the
708 : // error |rms^2 - sumSquares/4|
709 : // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
710 : // rms += 1;
711 : // which is equivalent to:
712 : // if( rms * rms + rms < (sumSquares+1) / 4 )
713 : // rms += 1;
// rms is below 2^14 here, so the upper 16 bits of each 32-bit lane are
// zero and _mm_madd_epi16(rms, rms) yields rms * rms per 32-bit lane.
714 : auto mask =
715 78 : _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
716 : _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
717 26 : rms = _mm_sub_epi32(rms, mask);
718 : // Pack each 32 bit RMS value to 16 bits
719 26 : rms = _mm_packs_epi32(rms, rms /* could be anything */);
720 : _mm_storel_epi64(
721 26 : reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
722 26 : pSrcScanlineShifted += 8;
723 26 : continue;
724 : }
725 :
// General path: at least one sample has bit 14 or 15 set, so the squares
// are computed and summed in double precision.
726 : // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
727 : // to 32 bit would result in 4 multiplications instead of 8, but
728 : // mullo/mulhi have a worse throughput than mul_pd.
729 :
730 : // Extend those UInt16s as UInt32s
731 4 : const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
732 4 : const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
733 4 : const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
734 4 : const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
735 :
736 : #ifdef __AVX2__
737 : // Multiplication of 32 bit values previously converted to 64 bit double
738 : const auto firstLineLoDbl = SQUARE(_mm256_cvtepi32_pd(firstLineLo));
739 : const auto firstLineHiDbl = SQUARE(_mm256_cvtepi32_pd(firstLineHi));
740 : const auto secondLineLoDbl = SQUARE(_mm256_cvtepi32_pd(secondLineLo));
741 : const auto secondLineHiDbl = SQUARE(_mm256_cvtepi32_pd(secondLineHi));
742 :
743 : // Vertical addition of squares
744 : const auto sumSquaresLo =
745 : _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
746 : const auto sumSquaresHi =
747 : _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
748 :
749 : // Horizontal addition of squares
750 : const auto sumSquares =
751 : FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
752 :
753 : const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
754 :
755 : // Take square root and truncate/floor to int32
756 : auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
757 : const auto rmsDouble = _mm256_cvtepi32_pd(rms);
// Same rounding test as in the SSE2 branch below:
// 0.5 < sumDivWeight - (rms * rms + rms)
758 : const auto right = _mm256_sub_pd(
759 : sumDivWeight, _mm256_add_pd(SQUARE(rmsDouble), rmsDouble));
760 :
761 : auto mask =
762 : _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
763 : // Extract 32-bit from each of the 4 64-bit masks
764 : // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
765 : // _MM_SHUFFLE(2,0,2,0)));
766 : mask = _mm256_permutevar8x32_ps(mask, permutation);
767 : const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
768 :
769 : // Apply the correction
770 : rms = _mm_sub_epi32(rms, maskI);
771 :
772 : // Pack each 32 bit RMS value to 16 bits
773 : rms = _mm_packus_epi32(rms, rms /* could be anything */);
774 : #else
775 : // Multiplication of 32 bit values previously converted to 64 bit double
// _mm_cvtepi32_pd() converts the 2 low 32-bit lanes only, hence the
// 8-byte shifts to reach the 2 upper ones.
776 4 : const auto firstLineLoLo = SQUARE(_mm_cvtepi32_pd(firstLineLo));
777 : const auto firstLineLoHi =
778 8 : SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
779 4 : const auto firstLineHiLo = SQUARE(_mm_cvtepi32_pd(firstLineHi));
780 : const auto firstLineHiHi =
781 8 : SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
782 :
783 4 : const auto secondLineLoLo = SQUARE(_mm_cvtepi32_pd(secondLineLo));
784 : const auto secondLineLoHi =
785 8 : SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
786 4 : const auto secondLineHiLo = SQUARE(_mm_cvtepi32_pd(secondLineHi));
787 : const auto secondLineHiHi =
788 8 : SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
789 :
790 : // Vertical addition of squares
791 4 : const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
792 4 : const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
793 4 : const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
794 4 : const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
795 :
796 : // Horizontal addition of squares
797 4 : const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
798 4 : const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
799 :
800 4 : const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
801 4 : const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
802 : // Take square root and truncate/floor to int32
803 8 : const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
804 8 : const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
805 :
806 : // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
807 : // if( 0.5 < sumDivWeight - (rms * rms + rms) )
808 : // rms += 1;
809 4 : const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
810 4 : const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
811 8 : const auto rightLo = _mm_sub_pd(
812 : sumDivWeightLo, _mm_add_pd(SQUARE(rmsLoDouble), rmsLoDouble));
813 12 : const auto rightHi = _mm_sub_pd(
814 : sumDivWeightHi, _mm_add_pd(SQUARE(rmsHiDouble), rmsHiDouble));
815 :
816 8 : const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
817 4 : const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
818 : // The value of the mask will be -1 when the correction needs to be
819 : // applied
// Keep one 32-bit half of each 64-bit comparison result, giving 4 masks
// of 32 bits.
820 8 : const auto mask = _mm_castps_si128(_mm_shuffle_ps(
821 : maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
822 :
// Gather the 2 + 2 int32 RMS values into a single register.
823 16 : auto rms = _mm_castps_si128(
824 : _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
825 : // Apply the correction
826 4 : rms = _mm_sub_epi32(rms, mask);
827 :
828 : // Pack each 32 bit RMS value to 16 bits
829 4 : rms = sse2_packus_epi32(rms, rms /* could be anything */);
830 : #endif
831 :
832 4 : _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
833 : rms);
834 4 : pSrcScanlineShifted += 8;
835 : }
836 :
837 : zeroupper();
838 :
// Hand the advanced source position back to the caller.
839 10 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
840 10 : return iDstPixel;
841 : }
842 :
843 : /************************************************************************/
844 : /* AverageUInt16SSE2() */
845 : /************************************************************************/
846 :
847 : template <class T>
848 9 : static int AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
849 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
850 : T *CPL_RESTRICT pDstScanline)
851 : {
852 : // Optimized implementation for average on UInt16 by
853 : // processing by group of 8 output pixels.
854 :
855 9 : const auto mask = _mm_set1_epi32(0xFFFF);
856 9 : const auto two = _mm_set1_epi32(2);
857 9 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
858 :
859 9 : int iDstPixel = 0;
860 13 : for (; iDstPixel < nDstXWidth - 7; iDstPixel += 8)
861 : {
862 : __m128i averageLow;
863 : // Load 8 UInt16 from each line
864 : {
865 4 : const auto firstLine = _mm_loadu_si128(
866 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
867 : const auto secondLine =
868 4 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
869 4 : pSrcScanlineShifted + nChunkXSize));
870 :
871 : // Horizontal addition and extension to 32 bit
872 12 : const auto horizAddFirstLine = _mm_add_epi32(
873 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
874 : const auto horizAddSecondLine =
875 12 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
876 : _mm_srli_epi32(secondLine, 16));
877 :
878 : // Vertical addition and average computation
879 : // average = (sum + 2) >> 2
880 8 : const auto sum = _mm_add_epi32(
881 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
882 4 : averageLow = _mm_srli_epi32(sum, 2);
883 : }
884 : // Load 8 UInt16 from each line
885 : __m128i averageHigh;
886 : {
887 4 : const auto firstLine = _mm_loadu_si128(
888 4 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted + 8));
889 : const auto secondLine =
890 4 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
891 4 : pSrcScanlineShifted + 8 + nChunkXSize));
892 :
893 : // Horizontal addition and extension to 32 bit
894 12 : const auto horizAddFirstLine = _mm_add_epi32(
895 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
896 : const auto horizAddSecondLine =
897 12 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
898 : _mm_srli_epi32(secondLine, 16));
899 :
900 : // Vertical addition and average computation
901 : // average = (sum + 2) >> 2
902 8 : const auto sum = _mm_add_epi32(
903 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
904 4 : averageHigh = _mm_srli_epi32(sum, 2);
905 : }
906 :
907 : // Pack each 32 bit average value to 16 bits
908 4 : auto average = sse2_packus_epi32(averageLow, averageHigh);
909 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
910 : average);
911 4 : pSrcScanlineShifted += 16;
912 : }
913 :
914 9 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
915 9 : return iDstPixel;
916 : }
917 :
918 : /************************************************************************/
919 : /* QuadraticMeanFloatSSE2() */
920 : /************************************************************************/
921 :
922 : #ifdef __AVX2__
923 : #define RMS_FLOAT_ELTS 8
924 : #define set1_ps _mm256_set1_ps
925 : #define loadu_ps _mm256_loadu_ps
926 : #define andnot_ps _mm256_andnot_ps
927 : #define and_ps _mm256_and_ps
928 : #define max_ps _mm256_max_ps
929 : #define shuffle_ps _mm256_shuffle_ps
930 : #define div_ps _mm256_div_ps
931 : #define cmpeq_ps(x, y) _mm256_cmp_ps(x, y, _CMP_EQ_OQ)
932 : #define mul_ps _mm256_mul_ps
933 : #define add_ps _mm256_add_ps
934 : #define hadd_ps _mm256_hadd_ps
935 : #define sqrt_ps _mm256_sqrt_ps
936 : #define or_ps _mm256_or_ps
937 : #define unpacklo_ps _mm256_unpacklo_ps
938 : #define unpackhi_ps _mm256_unpackhi_ps
939 : #define storeu_ps _mm256_storeu_ps
940 :
941 : inline __m256 SQUARE(__m256 x)
942 : {
943 : return _mm256_mul_ps(x, x);
944 : }
945 :
946 : #else
947 :
948 : #ifdef __SSE3__
949 : #define sse2_hadd_ps _mm_hadd_ps
950 : #else
951 : inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
952 : {
953 : auto aEven_bEven = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
954 : auto aOdd_bOdd = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
955 : return _mm_add_ps(aEven_bEven, aOdd_bOdd); // (aEven + aOdd, bEven + bOdd)
956 : }
957 : #endif
958 :
959 : #define RMS_FLOAT_ELTS 4
960 : #define set1_ps _mm_set1_ps
961 : #define loadu_ps _mm_loadu_ps
962 : #define andnot_ps _mm_andnot_ps
963 : #define and_ps _mm_and_ps
964 : #define max_ps _mm_max_ps
965 : #define shuffle_ps _mm_shuffle_ps
966 : #define div_ps _mm_div_ps
967 : #define cmpeq_ps _mm_cmpeq_ps
968 : #define mul_ps _mm_mul_ps
969 : #define add_ps _mm_add_ps
970 : #define hadd_ps sse2_hadd_ps
971 : #define sqrt_ps _mm_sqrt_ps
972 : #define or_ps _mm_or_ps
973 : #define unpacklo_ps _mm_unpacklo_ps
974 : #define unpackhi_ps _mm_unpackhi_ps
975 : #define storeu_ps _mm_storeu_ps
976 :
977 272 : inline __m128 SQUARE(__m128 x)
978 : {
979 272 : return _mm_mul_ps(x, x);
980 : }
981 :
982 68 : inline __m128 FIXUP_LANES(__m128 x)
983 : {
984 68 : return x;
985 : }
986 :
987 : #endif
988 :
989 : template <class T>
990 : static int NOINLINE
991 34 : QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
992 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
993 : T *CPL_RESTRICT pDstScanline)
994 : {
995 : // Optimized implementation for RMS on Float32 by
996 : // processing by group of RMS_FLOAT_ELTS output pixels.
997 34 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
998 :
999 34 : int iDstPixel = 0;
1000 34 : const auto minus_zero = set1_ps(-0.0f);
1001 34 : const auto zeroDot25 = set1_ps(0.25f);
1002 34 : const auto one = set1_ps(1.0f);
1003 68 : const auto infv = set1_ps(std::numeric_limits<float>::infinity());
1004 :
1005 102 : for (; iDstPixel < nDstXWidth - (RMS_FLOAT_ELTS - 1);
1006 : iDstPixel += RMS_FLOAT_ELTS)
1007 : {
1008 : // Load 2*RMS_FLOAT_ELTS Float32 from each line
1009 : auto firstLineLo =
1010 68 : loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
1011 68 : auto firstLineHi = loadu_ps(reinterpret_cast<float const *>(
1012 68 : pSrcScanlineShifted + RMS_FLOAT_ELTS));
1013 68 : auto secondLineLo = loadu_ps(
1014 68 : reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
1015 68 : auto secondLineHi = loadu_ps(reinterpret_cast<float const *>(
1016 68 : pSrcScanlineShifted + RMS_FLOAT_ELTS + nChunkXSize));
1017 :
1018 : // Take the absolute value
1019 68 : firstLineLo = andnot_ps(minus_zero, firstLineLo);
1020 68 : firstLineHi = andnot_ps(minus_zero, firstLineHi);
1021 68 : secondLineLo = andnot_ps(minus_zero, secondLineLo);
1022 68 : secondLineHi = andnot_ps(minus_zero, secondLineHi);
1023 :
1024 : auto firstLineEven =
1025 68 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1026 : auto firstLineOdd =
1027 68 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1028 : auto secondLineEven =
1029 68 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1030 : auto secondLineOdd =
1031 68 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1032 :
1033 : // Compute the maximum of each RMS_FLOAT_ELTS value to RMS-average
1034 204 : const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
1035 : max_ps(secondLineEven, secondLineEven));
1036 :
1037 : // Normalize each value by the maximum of the RMS_FLOAT_ELTS ones.
1038 : // This step is important to avoid that the square evaluates to infinity
1039 : // for sufficiently big input.
1040 68 : auto invMax = div_ps(one, maxV);
1041 : // Deal with 0 being the maximum to correct division by zero
1042 : // note: comparing to -0 leads to identical results as to comparing with
1043 : // 0
1044 136 : invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
1045 :
1046 68 : firstLineEven = mul_ps(firstLineEven, invMax);
1047 68 : firstLineOdd = mul_ps(firstLineOdd, invMax);
1048 68 : secondLineEven = mul_ps(secondLineEven, invMax);
1049 68 : secondLineOdd = mul_ps(secondLineOdd, invMax);
1050 :
1051 : // Compute squares
1052 68 : firstLineEven = SQUARE(firstLineEven);
1053 68 : firstLineOdd = SQUARE(firstLineOdd);
1054 68 : secondLineEven = SQUARE(secondLineEven);
1055 68 : secondLineOdd = SQUARE(secondLineOdd);
1056 :
1057 204 : const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
1058 : add_ps(secondLineEven, secondLineOdd));
1059 :
1060 204 : auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
1061 :
1062 : // Deal with infinity being the maximum
1063 68 : const auto maskIsInf = cmpeq_ps(maxV, infv);
1064 136 : rms = or_ps(andnot_ps(maskIsInf, rms), and_ps(maskIsInf, infv));
1065 :
1066 68 : rms = FIXUP_LANES(rms);
1067 :
1068 : // coverity[incompatible_cast]
1069 68 : storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]), rms);
1070 68 : pSrcScanlineShifted += RMS_FLOAT_ELTS * 2;
1071 : }
1072 :
1073 : zeroupper();
1074 :
1075 34 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1076 34 : return iDstPixel;
1077 : }
1078 :
1079 : /************************************************************************/
1080 : /* AverageFloatSSE2() */
1081 : /************************************************************************/
1082 :
1083 : template <class T>
1084 14 : static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
1085 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1086 : T *CPL_RESTRICT pDstScanline)
1087 : {
1088 : // Optimized implementation for average on Float32 by
1089 : // processing by group of 4 output pixels.
1090 14 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1091 :
1092 14 : int iDstPixel = 0;
1093 14 : const auto zeroDot25 = _mm_set1_ps(0.25f);
1094 :
1095 32 : for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
1096 : {
1097 : // Load 8 Float32 from each line
1098 : const auto firstLineLo =
1099 18 : _mm_loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
1100 18 : const auto firstLineHi = _mm_loadu_ps(
1101 18 : reinterpret_cast<float const *>(pSrcScanlineShifted + 4));
1102 18 : const auto secondLineLo = _mm_loadu_ps(
1103 18 : reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
1104 18 : const auto secondLineHi = _mm_loadu_ps(reinterpret_cast<float const *>(
1105 18 : pSrcScanlineShifted + 4 + nChunkXSize));
1106 :
1107 : // Vertical addition
1108 18 : const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
1109 18 : const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
1110 :
1111 : // Horizontal addition
1112 : const auto A =
1113 18 : _mm_shuffle_ps(sumLo, sumHi, 0 | (2 << 2) | (0 << 4) | (2 << 6));
1114 : const auto B =
1115 18 : _mm_shuffle_ps(sumLo, sumHi, 1 | (3 << 2) | (1 << 4) | (3 << 6));
1116 18 : const auto sum = _mm_add_ps(A, B);
1117 :
1118 18 : const auto average = _mm_mul_ps(sum, zeroDot25);
1119 :
1120 : // coverity[incompatible_cast]
1121 18 : _mm_storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]),
1122 : average);
1123 18 : pSrcScanlineShifted += 8;
1124 : }
1125 :
1126 14 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1127 14 : return iDstPixel;
1128 : }
1129 :
1130 : #endif
1131 :
1132 : /************************************************************************/
1133 : /* GDALResampleChunk_AverageOrRMS() */
1134 : /************************************************************************/
1135 :
1136 : template <class T, class Tsum, GDALDataType eWrkDataType>
1137 : static CPLErr
1138 10399 : GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
1139 : const T *pChunk, void **ppDstBuffer)
1140 : {
1141 10399 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1142 10399 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1143 10399 : const double dfSrcXDelta = args.dfSrcXDelta;
1144 10399 : const double dfSrcYDelta = args.dfSrcYDelta;
1145 10399 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1146 10399 : const int nChunkXOff = args.nChunkXOff;
1147 10399 : const int nChunkYOff = args.nChunkYOff;
1148 10399 : const int nChunkXSize = args.nChunkXSize;
1149 10399 : const int nChunkYSize = args.nChunkYSize;
1150 10399 : const int nDstXOff = args.nDstXOff;
1151 10399 : const int nDstXOff2 = args.nDstXOff2;
1152 10399 : const int nDstYOff = args.nDstYOff;
1153 10399 : const int nDstYOff2 = args.nDstYOff2;
1154 10399 : const char *pszResampling = args.pszResampling;
1155 10399 : bool bHasNoData = args.bHasNoData;
1156 10399 : const double dfNoDataValue = args.dfNoDataValue;
1157 10399 : const GDALColorTable *poColorTable = args.poColorTable;
1158 10399 : const bool bPropagateNoData = args.bPropagateNoData;
1159 :
1160 : // AVERAGE_BIT2GRAYSCALE
1161 : const bool bBit2Grayscale =
1162 10399 : CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"));
1163 10399 : const bool bQuadraticMean = CPL_TO_BOOL(EQUAL(pszResampling, "RMS"));
1164 10399 : if (bBit2Grayscale)
1165 9 : poColorTable = nullptr;
1166 :
1167 : T tNoDataValue;
1168 10399 : if (!bHasNoData)
1169 10348 : tNoDataValue = 0;
1170 : else
1171 51 : tNoDataValue = static_cast<T>(dfNoDataValue);
1172 10399 : const T tReplacementVal =
1173 107 : bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
1174 51 : args.eOvrDataType, dfNoDataValue))
1175 : : 0;
1176 :
1177 10399 : int nChunkRightXOff = nChunkXOff + nChunkXSize;
1178 10399 : int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1179 10399 : int nDstXWidth = nDstXOff2 - nDstXOff;
1180 :
1181 : /* -------------------------------------------------------------------- */
1182 : /* Allocate buffers. */
1183 : /* -------------------------------------------------------------------- */
1184 10399 : *ppDstBuffer = static_cast<T *>(
1185 10399 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
1186 : GDALGetDataTypeSizeBytes(eWrkDataType)));
1187 10399 : if (*ppDstBuffer == nullptr)
1188 : {
1189 0 : return CE_Failure;
1190 : }
1191 10399 : T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
1192 :
1193 : struct PrecomputedXValue
1194 : {
1195 : int nLeftXOffShifted;
1196 : int nRightXOffShifted;
1197 : double dfLeftWeight;
1198 : double dfRightWeight;
1199 : double dfTotalWeightFullLine;
1200 : };
1201 :
1202 : PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
1203 10399 : VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(PrecomputedXValue)));
1204 :
1205 10399 : if (pasSrcX == nullptr)
1206 : {
1207 0 : VSIFree(pasSrcX);
1208 0 : return CE_Failure;
1209 : }
1210 :
1211 10399 : int nTransparentIdx = -1;
1212 10399 : std::vector<GDALColorEntry> colorEntries;
1213 10399 : if (poColorTable)
1214 5 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1215 :
1216 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1217 : // it as nodata value
1218 10426 : if (bHasNoData && dfNoDataValue >= 0.0f &&
1219 27 : tNoDataValue < colorEntries.size())
1220 1 : colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1221 :
1222 : // Or if we have no explicit nodata, but a color table entry that is
1223 : // transparent, consider it as the nodata value
1224 10398 : else if (!bHasNoData && nTransparentIdx >= 0)
1225 : {
1226 0 : bHasNoData = true;
1227 0 : tNoDataValue = static_cast<T>(nTransparentIdx);
1228 : }
1229 :
1230 : /* ==================================================================== */
1231 : /* Precompute inner loop constants. */
1232 : /* ==================================================================== */
1233 10399 : bool bSrcXSpacingIsTwo = true;
1234 10399 : int nLastSrcXOff2 = -1;
1235 867096 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1236 : {
1237 856697 : double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
1238 : // Apply some epsilon to avoid numerical precision issues
1239 856697 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
1240 856697 : double dfSrcXOff2 = dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
1241 856697 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
1242 :
1243 856697 : if (nSrcXOff < nChunkXOff)
1244 0 : nSrcXOff = nChunkXOff;
1245 856697 : if (nSrcXOff2 == nSrcXOff)
1246 0 : nSrcXOff2++;
1247 856697 : if (nSrcXOff2 > nChunkRightXOff)
1248 1 : nSrcXOff2 = nChunkRightXOff;
1249 :
1250 856697 : pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
1251 856697 : pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
1252 856697 : nSrcXOff2 - nChunkXOff;
1253 21 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
1254 856697 : (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
1255 856697 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
1256 856697 : 1 - (nSrcXOff2 - dfSrcXOff2);
1257 856697 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
1258 856697 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
1259 856697 : if (nSrcXOff + 1 < nSrcXOff2)
1260 : {
1261 856676 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1262 856676 : nSrcXOff2 - nSrcXOff - 2;
1263 856676 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1264 856676 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
1265 : }
1266 :
1267 856697 : if (nSrcXOff2 - nSrcXOff != 2 ||
1268 727208 : (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
1269 : {
1270 120592 : bSrcXSpacingIsTwo = false;
1271 : }
1272 856697 : nLastSrcXOff2 = nSrcXOff2;
1273 : }
1274 :
1275 : /* ==================================================================== */
1276 : /* Loop over destination scanlines. */
1277 : /* ==================================================================== */
1278 752862 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1279 : {
1280 742463 : double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
1281 742463 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
1282 742463 : if (nSrcYOff < nChunkYOff)
1283 0 : nSrcYOff = nChunkYOff;
1284 :
1285 742463 : double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
1286 742463 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
1287 742463 : if (nSrcYOff2 == nSrcYOff)
1288 0 : ++nSrcYOff2;
1289 742463 : if (nSrcYOff2 > nChunkBottomYOff)
1290 3 : nSrcYOff2 = nChunkBottomYOff;
1291 :
1292 742463 : T *const pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
1293 :
1294 : /* --------------------------------------------------------------------
1295 : */
1296 : /* Loop over destination pixels */
1297 : /* --------------------------------------------------------------------
1298 : */
1299 742463 : if (poColorTable == nullptr)
1300 : {
1301 742348 : if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
1302 : pabyChunkNodataMask == nullptr)
1303 : {
1304 : if (eWrkDataType == GDT_Byte || eWrkDataType == GDT_UInt16)
1305 : {
1306 : // Optimized case : no nodata, overview by a factor of 2 and
1307 : // regular x and y src spacing.
1308 116425 : const T *pSrcScanlineShifted =
1309 116425 : pChunk + pasSrcX[0].nLeftXOffShifted +
1310 116425 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
1311 116425 : nChunkXSize;
1312 116425 : int iDstPixel = 0;
1313 : #ifdef USE_SSE2
1314 116406 : if (bQuadraticMean && eWrkDataType == GDT_Byte)
1315 : {
1316 5385 : iDstPixel = QuadraticMeanByteSSE2OrAVX2(
1317 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1318 : pDstScanline);
1319 : }
1320 111040 : else if (bQuadraticMean /* && eWrkDataType == GDT_UInt16 */)
1321 : {
1322 10 : iDstPixel = QuadraticMeanUInt16SSE2(
1323 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1324 : pDstScanline);
1325 : }
1326 : else if (/* !bQuadraticMean && */ eWrkDataType == GDT_Byte)
1327 : {
1328 111021 : iDstPixel = AverageByteSSE2OrAVX2(
1329 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1330 : pDstScanline);
1331 : }
1332 : else /* if( !bQuadraticMean && eWrkDataType == GDT_UInt16 )
1333 : */
1334 : {
1335 9 : iDstPixel = AverageUInt16SSE2(nDstXWidth, nChunkXSize,
1336 : pSrcScanlineShifted,
1337 : pDstScanline);
1338 : }
1339 : #endif
1340 278781 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1341 : {
1342 162356 : Tsum nTotal = 0;
1343 : T nVal;
1344 162356 : if (bQuadraticMean)
1345 44 : nTotal =
1346 44 : SQUARE<Tsum>(pSrcScanlineShifted[0]) +
1347 44 : SQUARE<Tsum>(pSrcScanlineShifted[1]) +
1348 44 : SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
1349 44 : SQUARE<Tsum>(
1350 44 : pSrcScanlineShifted[1 + nChunkXSize]);
1351 : else
1352 162312 : nTotal = pSrcScanlineShifted[0] +
1353 162312 : pSrcScanlineShifted[1] +
1354 162312 : pSrcScanlineShifted[nChunkXSize] +
1355 162312 : pSrcScanlineShifted[1 + nChunkXSize];
1356 :
1357 162356 : constexpr int nTotalWeight = 4;
1358 162356 : if (bQuadraticMean)
1359 44 : nVal = ComputeIntegerRMS_4values<T>(nTotal);
1360 : else
1361 162312 : nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
1362 : nTotalWeight);
1363 :
1364 : // No need to compare nVal against tNoDataValue as we
1365 : // are in a case where pabyChunkNodataMask == nullptr
1366 : // implies the absence of nodata value.
1367 162356 : pDstScanline[iDstPixel] = nVal;
1368 162356 : pSrcScanlineShifted += 2;
1369 : }
1370 : }
1371 : else
1372 : {
1373 : CPLAssert(eWrkDataType == GDT_Float32 ||
1374 : eWrkDataType == GDT_Float64);
1375 70 : const T *pSrcScanlineShifted =
1376 70 : pChunk + pasSrcX[0].nLeftXOffShifted +
1377 70 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
1378 70 : nChunkXSize;
1379 70 : int iDstPixel = 0;
1380 : #ifdef USE_SSE2
1381 : if (eWrkDataType == GDT_Float32)
1382 : {
1383 48 : if (bQuadraticMean)
1384 : {
1385 34 : iDstPixel = QuadraticMeanFloatSSE2(
1386 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1387 : pDstScanline);
1388 : }
1389 : else
1390 : {
1391 14 : iDstPixel = AverageFloatSSE2(
1392 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1393 : pDstScanline);
1394 : }
1395 : }
1396 : #endif
1397 :
1398 268 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1399 : {
1400 : T nVal;
1401 198 : if (bQuadraticMean)
1402 : {
1403 : // Cast to double to avoid overflows
1404 : // (using std::hypot() is much slower)
1405 100 : nVal = static_cast<T>(std::sqrt(
1406 : 0.25 *
1407 100 : (SQUARE<double>(pSrcScanlineShifted[0]) +
1408 100 : SQUARE<double>(pSrcScanlineShifted[1]) +
1409 100 : SQUARE<double>(
1410 200 : pSrcScanlineShifted[nChunkXSize]) +
1411 100 : SQUARE<double>(
1412 100 : pSrcScanlineShifted[1 + nChunkXSize]))));
1413 : }
1414 : else
1415 : {
1416 98 : nVal = static_cast<T>(
1417 98 : 0.25f * (pSrcScanlineShifted[0] +
1418 98 : pSrcScanlineShifted[1] +
1419 98 : pSrcScanlineShifted[nChunkXSize] +
1420 98 : pSrcScanlineShifted[1 + nChunkXSize]));
1421 : }
1422 :
1423 : // No need to compare nVal against tNoDataValue as we
1424 : // are in a case where pabyChunkNodataMask == nullptr
1425 : // implies the absence of nodata value.
1426 198 : pDstScanline[iDstPixel] = nVal;
1427 198 : pSrcScanlineShifted += 2;
1428 : }
1429 116495 : }
1430 : }
1431 : else
1432 : {
1433 17 : const double dfBottomWeight =
1434 625853 : (nSrcYOff + 1 == nSrcYOff2) ? 1.0
1435 625836 : : 1.0 - (dfSrcYOff - nSrcYOff);
1436 625853 : const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
1437 625853 : nSrcYOff -= nChunkYOff;
1438 625853 : nSrcYOff2 -= nChunkYOff;
1439 :
1440 625853 : double dfTotalWeightFullColumn = dfBottomWeight;
1441 625853 : if (nSrcYOff + 1 < nSrcYOff2)
1442 : {
1443 625835 : dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
1444 625835 : dfTotalWeightFullColumn += dfTopWeight;
1445 : }
1446 :
1447 18585856 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1448 : {
1449 17959981 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1450 17959981 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1451 :
1452 17959981 : double dfTotal = 0;
1453 17959981 : double dfTotalWeight = 0;
1454 17959981 : if (pabyChunkNodataMask == nullptr)
1455 : {
1456 1746435 : auto pChunkShifted =
1457 115 : pChunk +
1458 1746435 : static_cast<GPtrDiff_t>(nSrcYOff) * nChunkXSize;
1459 1746435 : int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1460 1746435 : double dfWeightY = dfBottomWeight;
1461 3493427 : while (true)
1462 : {
1463 : double dfTotalLine;
1464 5239852 : if (bQuadraticMean)
1465 : {
1466 : // Left pixel
1467 : {
1468 104 : const T val = pChunkShifted[nSrcXOff];
1469 104 : dfTotalLine =
1470 104 : SQUARE<double>(val) *
1471 104 : pasSrcX[iDstPixel].dfLeftWeight;
1472 : }
1473 :
1474 104 : if (nSrcXOff + 1 < nSrcXOff2)
1475 : {
1476 : // Middle pixels
1477 104 : for (int iX = nSrcXOff + 1;
1478 424 : iX + 1 < nSrcXOff2; ++iX)
1479 : {
1480 320 : const T val = pChunkShifted[iX];
1481 320 : dfTotalLine += SQUARE<double>(val);
1482 : }
1483 :
1484 : // Right pixel
1485 : {
1486 104 : const T val =
1487 104 : pChunkShifted[nSrcXOff2 - 1];
1488 104 : dfTotalLine +=
1489 104 : SQUARE<double>(val) *
1490 104 : pasSrcX[iDstPixel].dfRightWeight;
1491 : }
1492 : }
1493 : }
1494 : else
1495 : {
1496 : // Left pixel
1497 : {
1498 5239756 : const T val = pChunkShifted[nSrcXOff];
1499 5239756 : dfTotalLine =
1500 5239756 : val * pasSrcX[iDstPixel].dfLeftWeight;
1501 : }
1502 :
1503 5239756 : if (nSrcXOff + 1 < nSrcXOff2)
1504 : {
1505 : // Middle pixels
1506 4239330 : for (int iX = nSrcXOff + 1;
1507 64183126 : iX + 1 < nSrcXOff2; ++iX)
1508 : {
1509 59943836 : const T val = pChunkShifted[iX];
1510 59943836 : dfTotalLine += val;
1511 : }
1512 :
1513 : // Right pixel
1514 : {
1515 4239330 : const T val =
1516 4239330 : pChunkShifted[nSrcXOff2 - 1];
1517 4239330 : dfTotalLine +=
1518 4239330 : val *
1519 4239330 : pasSrcX[iDstPixel].dfRightWeight;
1520 : }
1521 : }
1522 : }
1523 :
1524 5239852 : dfTotal += dfTotalLine * dfWeightY;
1525 5239852 : --nCounterY;
1526 5239852 : if (nCounterY < 0)
1527 1746435 : break;
1528 3493427 : pChunkShifted += nChunkXSize;
1529 3493427 : dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
1530 : }
1531 :
1532 1746435 : dfTotalWeight =
1533 1746435 : pasSrcX[iDstPixel].dfTotalWeightFullLine *
1534 : dfTotalWeightFullColumn;
1535 : }
1536 : else
1537 : {
1538 16213566 : GPtrDiff_t nCount = 0;
1539 71190898 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1540 : {
1541 54977432 : const auto pChunkShifted =
1542 132 : pChunk +
1543 54977432 : static_cast<GPtrDiff_t>(iY) * nChunkXSize;
1544 :
1545 54977432 : double dfTotalLine = 0;
1546 54977432 : double dfTotalWeightLine = 0;
1547 : // Left pixel
1548 : {
1549 54977432 : const int iX = nSrcXOff;
1550 54977432 : const T val = pChunkShifted[iX];
1551 54977432 : if (pabyChunkNodataMask[iX + iY * nChunkXSize])
1552 : {
1553 23420081 : nCount++;
1554 23420081 : const double dfWeightX =
1555 23420081 : pasSrcX[iDstPixel].dfLeftWeight;
1556 23420081 : dfTotalWeightLine = dfWeightX;
1557 23420081 : if (bQuadraticMean)
1558 60 : dfTotalLine =
1559 60 : SQUARE<double>(val) * dfWeightX;
1560 : else
1561 23419981 : dfTotalLine = val * dfWeightX;
1562 : }
1563 : }
1564 :
1565 54977432 : if (nSrcXOff + 1 < nSrcXOff2)
1566 : {
1567 : // Middle pixels
1568 145172132 : for (int iX = nSrcXOff + 1; iX + 1 < nSrcXOff2;
1569 : ++iX)
1570 : {
1571 90195000 : const T val = pChunkShifted[iX];
1572 90195000 : if (pabyChunkNodataMask[iX +
1573 90195000 : iY * nChunkXSize])
1574 : {
1575 39728200 : nCount++;
1576 39728200 : dfTotalWeightLine += 1;
1577 39728200 : if (bQuadraticMean)
1578 0 : dfTotalLine += SQUARE<double>(val);
1579 : else
1580 39728200 : dfTotalLine += val;
1581 : }
1582 : }
1583 :
1584 : // Right pixel
1585 : {
1586 54977332 : const int iX = nSrcXOff2 - 1;
1587 54977332 : const T val = pChunkShifted[iX];
1588 54977332 : if (pabyChunkNodataMask[iX +
1589 54977332 : iY * nChunkXSize])
1590 : {
1591 23419247 : nCount++;
1592 23419247 : const double dfWeightX =
1593 23419247 : pasSrcX[iDstPixel].dfRightWeight;
1594 23419247 : dfTotalWeightLine += dfWeightX;
1595 23419247 : if (bQuadraticMean)
1596 70 : dfTotalLine +=
1597 61 : SQUARE<double>(val) * dfWeightX;
1598 : else
1599 23419246 : dfTotalLine += val * dfWeightX;
1600 : }
1601 : }
1602 : }
1603 :
1604 93741198 : const double dfWeightY =
1605 : (iY == nSrcYOff) ? dfBottomWeight
1606 38763866 : : (iY + 1 == nSrcYOff2) ? dfTopWeight
1607 : : 1.0;
1608 54977432 : dfTotal += dfTotalLine * dfWeightY;
1609 54977432 : dfTotalWeight += dfTotalWeightLine * dfWeightY;
1610 : }
1611 :
1612 16213566 : if (nCount == 0 ||
1613 8 : (bPropagateNoData &&
1614 : nCount <
1615 8 : static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
1616 8 : (nSrcXOff2 - nSrcXOff)))
1617 : {
1618 9461832 : pDstScanline[iDstPixel] = tNoDataValue;
1619 9461832 : continue;
1620 : }
1621 : }
1622 : if (eWrkDataType == GDT_Byte)
1623 : {
1624 : T nVal;
1625 8497990 : if (bQuadraticMean)
1626 38 : nVal = ComputeIntegerRMS<T, int>(dfTotal,
1627 : dfTotalWeight);
1628 : else
1629 8497950 : nVal =
1630 8497950 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1631 8497990 : if (bHasNoData && nVal == tNoDataValue)
1632 0 : nVal = tReplacementVal;
1633 8497990 : pDstScanline[iDstPixel] = nVal;
1634 : }
1635 : else if (eWrkDataType == GDT_UInt16)
1636 : {
1637 : T nVal;
1638 8 : if (bQuadraticMean)
1639 4 : nVal = ComputeIntegerRMS<T, uint64_t>(
1640 : dfTotal, dfTotalWeight);
1641 : else
1642 4 : nVal =
1643 4 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1644 8 : if (bHasNoData && nVal == tNoDataValue)
1645 0 : nVal = tReplacementVal;
1646 8 : pDstScanline[iDstPixel] = nVal;
1647 : }
1648 : else
1649 : {
1650 : T nVal;
1651 151 : if (bQuadraticMean)
1652 20 : nVal =
1653 25 : static_cast<T>(sqrt(dfTotal / dfTotalWeight));
1654 : else
1655 126 : nVal = static_cast<T>(dfTotal / dfTotalWeight);
1656 151 : if (bHasNoData && nVal == tNoDataValue)
1657 2 : nVal = tReplacementVal;
1658 151 : pDstScanline[iDstPixel] = nVal;
1659 : }
1660 : }
1661 : }
1662 : }
1663 : else
1664 : {
1665 115 : nSrcYOff -= nChunkYOff;
1666 115 : nSrcYOff2 -= nChunkYOff;
1667 :
1668 6590 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1669 : {
1670 6475 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1671 6475 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1672 :
1673 6475 : GPtrDiff_t nTotalR = 0;
1674 6475 : GPtrDiff_t nTotalG = 0;
1675 6475 : GPtrDiff_t nTotalB = 0;
1676 6475 : GPtrDiff_t nCount = 0;
1677 :
1678 19425 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1679 : {
1680 38850 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
1681 : {
1682 25900 : const T val = pChunk[iX + static_cast<GPtrDiff_t>(iY) *
1683 25900 : nChunkXSize];
1684 : // cppcheck-suppress unsignedLessThanZero
1685 25900 : if (val < 0 || val >= colorEntries.size())
1686 0 : continue;
1687 25900 : size_t idx = static_cast<size_t>(val);
1688 25900 : const auto &entry = colorEntries[idx];
1689 25900 : if (entry.c4)
1690 : {
1691 14128 : if (bQuadraticMean)
1692 : {
1693 800 : nTotalR += SQUARE<int>(entry.c1);
1694 800 : nTotalG += SQUARE<int>(entry.c2);
1695 800 : nTotalB += SQUARE<int>(entry.c3);
1696 800 : ++nCount;
1697 : }
1698 : else
1699 : {
1700 13328 : nTotalR += entry.c1;
1701 13328 : nTotalG += entry.c2;
1702 13328 : nTotalB += entry.c3;
1703 13328 : ++nCount;
1704 : }
1705 : }
1706 : }
1707 : }
1708 :
1709 6475 : if (nCount == 0 ||
1710 0 : (bPropagateNoData &&
1711 0 : nCount < static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
1712 0 : (nSrcXOff2 - nSrcXOff)))
1713 : {
1714 2838 : pDstScanline[iDstPixel] = tNoDataValue;
1715 : }
1716 : else
1717 : {
1718 : GDALColorEntry color;
1719 3637 : if (bQuadraticMean)
1720 : {
1721 200 : color.c1 =
1722 200 : static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
1723 200 : color.c2 =
1724 200 : static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
1725 200 : color.c3 =
1726 200 : static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
1727 : }
1728 : else
1729 : {
1730 3437 : color.c1 =
1731 3437 : static_cast<short>((nTotalR + nCount / 2) / nCount);
1732 3437 : color.c2 =
1733 3437 : static_cast<short>((nTotalG + nCount / 2) / nCount);
1734 3437 : color.c3 =
1735 3437 : static_cast<short>((nTotalB + nCount / 2) / nCount);
1736 : }
1737 3637 : pDstScanline[iDstPixel] =
1738 3637 : static_cast<T>(BestColorEntry(colorEntries, color));
1739 : }
1740 : }
1741 : }
1742 : }
1743 :
1744 10399 : CPLFree(pasSrcX);
1745 :
1746 10398 : return CE_None;
1747 : }
1748 :
1749 : static CPLErr
1750 10399 : GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
1751 : const void *pChunk, void **ppDstBuffer,
1752 : GDALDataType *peDstBufferDataType)
1753 : {
1754 10399 : *peDstBufferDataType = args.eWrkDataType;
1755 10399 : switch (args.eWrkDataType)
1756 : {
1757 10334 : case GDT_Byte:
1758 : {
1759 10334 : return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_Byte>(
1760 10333 : args, static_cast<const GByte *>(pChunk), ppDstBuffer);
1761 : }
1762 :
1763 9 : case GDT_UInt16:
1764 : {
1765 9 : if (EQUAL(args.pszResampling, "RMS"))
1766 : {
1767 : // Use double as accumulation type, because UInt32 could overflow
1768 : return GDALResampleChunk_AverageOrRMS_T<GUInt16, double,
1769 5 : GDT_UInt16>(
1770 5 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1771 : }
1772 : else
1773 : {
1774 : return GDALResampleChunk_AverageOrRMS_T<GUInt16, GUInt32,
1775 4 : GDT_UInt16>(
1776 4 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1777 : }
1778 : }
1779 :
1780 39 : case GDT_Float32:
1781 : {
1782 39 : return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32>(
1783 39 : args, static_cast<const float *>(pChunk), ppDstBuffer);
1784 : }
1785 :
1786 17 : case GDT_Float64:
1787 : {
1788 : return GDALResampleChunk_AverageOrRMS_T<double, double,
1789 17 : GDT_Float64>(
1790 17 : args, static_cast<const double *>(pChunk), ppDstBuffer);
1791 : }
1792 :
1793 0 : default:
1794 0 : break;
1795 : }
1796 :
1797 0 : CPLAssert(false);
1798 : return CE_Failure;
1799 : }
1800 :
1801 : /************************************************************************/
1802 : /* GDALResampleChunk_Gauss() */
1803 : /************************************************************************/
1804 :
1805 86 : static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
1806 : const void *pChunk, void **ppDstBuffer,
1807 : GDALDataType *peDstBufferDataType)
1808 :
1809 : {
1810 86 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1811 86 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1812 86 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1813 86 : const int nChunkXOff = args.nChunkXOff;
1814 86 : const int nChunkXSize = args.nChunkXSize;
1815 86 : const int nChunkYOff = args.nChunkYOff;
1816 86 : const int nChunkYSize = args.nChunkYSize;
1817 86 : const int nDstXOff = args.nDstXOff;
1818 86 : const int nDstXOff2 = args.nDstXOff2;
1819 86 : const int nDstYOff = args.nDstYOff;
1820 86 : const int nDstYOff2 = args.nDstYOff2;
1821 86 : const bool bHasNoData = args.bHasNoData;
1822 86 : double dfNoDataValue = args.dfNoDataValue;
1823 86 : const GDALColorTable *poColorTable = args.poColorTable;
1824 :
1825 86 : const double *const padfChunk = static_cast<const double *>(pChunk);
1826 :
1827 86 : *ppDstBuffer =
1828 86 : VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
1829 : GDALGetDataTypeSizeBytes(GDT_Float64));
1830 86 : if (*ppDstBuffer == nullptr)
1831 : {
1832 0 : return CE_Failure;
1833 : }
1834 86 : *peDstBufferDataType = GDT_Float64;
1835 86 : double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
1836 :
1837 : /* -------------------------------------------------------------------- */
1838 : /* Create the filter kernel and allocate scanline buffer. */
1839 : /* -------------------------------------------------------------------- */
1840 86 : int nGaussMatrixDim = 3;
1841 : const int *panGaussMatrix;
1842 86 : constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
1843 86 : constexpr int anGaussMatrix5x5[] = {1, 4, 6, 4, 1, 4, 16, 24, 16,
1844 : 4, 6, 24, 36, 24, 6, 4, 16, 24,
1845 : 16, 4, 1, 4, 6, 4, 1};
1846 86 : constexpr int anGaussMatrix7x7[] = {
1847 : 1, 6, 15, 20, 15, 6, 1, 6, 36, 90, 120, 90, 36,
1848 : 6, 15, 90, 225, 300, 225, 90, 15, 20, 120, 300, 400, 300,
1849 : 120, 20, 15, 90, 225, 300, 225, 90, 15, 6, 36, 90, 120,
1850 : 90, 36, 6, 1, 6, 15, 20, 15, 6, 1};
1851 :
1852 86 : const int nOXSize = args.nOvrXSize;
1853 86 : const int nOYSize = args.nOvrYSize;
1854 86 : const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
1855 :
1856 : // matrix for gauss filter
1857 86 : if (nResYFactor <= 2)
1858 : {
1859 85 : panGaussMatrix = anGaussMatrix3x3;
1860 85 : nGaussMatrixDim = 3;
1861 : }
1862 1 : else if (nResYFactor <= 4)
1863 : {
1864 0 : panGaussMatrix = anGaussMatrix5x5;
1865 0 : nGaussMatrixDim = 5;
1866 : }
1867 : else
1868 : {
1869 1 : panGaussMatrix = anGaussMatrix7x7;
1870 1 : nGaussMatrixDim = 7;
1871 : }
1872 :
1873 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
1874 : int *panGaussMatrixDup = static_cast<int *>(
1875 : CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
1876 : memcpy(panGaussMatrixDup, panGaussMatrix,
1877 : sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
1878 : panGaussMatrix = panGaussMatrixDup;
1879 : #endif
1880 :
1881 86 : if (!bHasNoData)
1882 79 : dfNoDataValue = 0.0;
1883 :
1884 86 : std::vector<GDALColorEntry> colorEntries;
1885 86 : int nTransparentIdx = -1;
1886 86 : if (poColorTable)
1887 2 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1888 :
1889 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1890 : // it as nodata value.
1891 92 : if (bHasNoData && dfNoDataValue >= 0.0f &&
1892 6 : dfNoDataValue < colorEntries.size())
1893 0 : colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
1894 :
1895 : // Or if we have no explicit nodata, but a color table entry that is
1896 : // transparent, consider it as the nodata value.
1897 86 : else if (!bHasNoData && nTransparentIdx >= 0)
1898 : {
1899 0 : dfNoDataValue = nTransparentIdx;
1900 : }
1901 :
1902 86 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
1903 86 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1904 86 : const int nDstXWidth = nDstXOff2 - nDstXOff;
1905 :
1906 : /* ==================================================================== */
1907 : /* Loop over destination scanlines. */
1908 : /* ==================================================================== */
1909 16488 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1910 : {
1911 16402 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
1912 16402 : int nSrcYOff2 =
1913 16402 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
1914 :
1915 16402 : if (nSrcYOff < nChunkYOff)
1916 : {
1917 0 : nSrcYOff = nChunkYOff;
1918 0 : nSrcYOff2++;
1919 : }
1920 :
1921 16402 : const int iSizeY = nSrcYOff2 - nSrcYOff;
1922 16402 : nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
1923 16402 : nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
1924 :
1925 16402 : if (nSrcYOff2 > nChunkBottomYOff ||
1926 16359 : (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
1927 : {
1928 44 : nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
1929 : }
1930 :
1931 16402 : int nYShiftGaussMatrix = 0;
1932 16402 : if (nSrcYOff < nChunkYOff)
1933 : {
1934 0 : nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
1935 0 : nSrcYOff = nChunkYOff;
1936 : }
1937 :
1938 16402 : const double *const padfSrcScanline =
1939 16402 : padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
1940 16402 : const GByte *pabySrcScanlineNodataMask = nullptr;
1941 16402 : if (pabyChunkNodataMask != nullptr)
1942 152 : pabySrcScanlineNodataMask =
1943 152 : pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
1944 :
1945 : /* --------------------------------------------------------------------
1946 : */
1947 : /* Loop over destination pixels */
1948 : /* --------------------------------------------------------------------
1949 : */
1950 16402 : double *const padfDstScanline =
1951 16402 : padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
1952 4149980 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1953 : {
1954 4133580 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
1955 4133580 : int nSrcXOff2 =
1956 4133580 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
1957 :
1958 4133580 : if (nSrcXOff < nChunkXOff)
1959 : {
1960 0 : nSrcXOff = nChunkXOff;
1961 0 : nSrcXOff2++;
1962 : }
1963 :
1964 4133580 : const int iSizeX = nSrcXOff2 - nSrcXOff;
1965 4133580 : nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
1966 4133580 : nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
1967 :
1968 4133580 : if (nSrcXOff2 > nChunkRightXOff ||
1969 4127930 : (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
1970 : {
1971 5650 : nSrcXOff2 =
1972 5650 : std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
1973 : }
1974 :
1975 4133580 : int nXShiftGaussMatrix = 0;
1976 4133580 : if (nSrcXOff < nChunkXOff)
1977 : {
1978 0 : nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
1979 0 : nSrcXOff = nChunkXOff;
1980 : }
1981 :
1982 4133580 : if (poColorTable == nullptr)
1983 : {
1984 4133380 : double dfTotal = 0.0;
1985 4133380 : GInt64 nCount = 0;
1986 4133380 : const int *panLineWeight =
1987 4133380 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
1988 : nXShiftGaussMatrix;
1989 :
1990 16527900 : for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
1991 12394500 : ++iY, ++j, panLineWeight += nGaussMatrixDim)
1992 : {
1993 49561300 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
1994 : {
1995 37166800 : const double val =
1996 37166800 : padfSrcScanline[iX - nChunkXOff +
1997 37166800 : static_cast<GPtrDiff_t>(iY -
1998 37166800 : nSrcYOff) *
1999 37166800 : nChunkXSize];
2000 37166800 : if (pabySrcScanlineNodataMask == nullptr ||
2001 32872 : pabySrcScanlineNodataMask[iX - nChunkXOff +
2002 32872 : static_cast<GPtrDiff_t>(
2003 32872 : iY - nSrcYOff) *
2004 32872 : nChunkXSize])
2005 : {
2006 37146100 : const int nWeight = panLineWeight[i];
2007 37146100 : dfTotal += val * nWeight;
2008 37146100 : nCount += nWeight;
2009 : }
2010 : }
2011 : }
2012 :
2013 4133380 : if (nCount == 0)
2014 : {
2015 2217 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2016 : }
2017 : else
2018 : {
2019 4131160 : padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
2020 : }
2021 : }
2022 : else
2023 : {
2024 200 : GInt64 nTotalR = 0;
2025 200 : GInt64 nTotalG = 0;
2026 200 : GInt64 nTotalB = 0;
2027 200 : GInt64 nTotalWeight = 0;
2028 200 : const int *panLineWeight =
2029 200 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
2030 : nXShiftGaussMatrix;
2031 :
2032 780 : for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
2033 580 : ++iY, ++j, panLineWeight += nGaussMatrixDim)
2034 : {
2035 2262 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2036 : {
2037 1682 : const double val =
2038 1682 : padfSrcScanline[iX - nChunkXOff +
2039 1682 : static_cast<GPtrDiff_t>(iY -
2040 1682 : nSrcYOff) *
2041 1682 : nChunkXSize];
2042 1682 : if (val < 0 || val >= colorEntries.size())
2043 0 : continue;
2044 :
2045 1682 : size_t idx = static_cast<size_t>(val);
2046 1682 : if (colorEntries[idx].c4)
2047 : {
2048 1682 : const int nWeight = panLineWeight[i];
2049 1682 : nTotalR +=
2050 1682 : static_cast<GInt64>(colorEntries[idx].c1) *
2051 1682 : nWeight;
2052 1682 : nTotalG +=
2053 1682 : static_cast<GInt64>(colorEntries[idx].c2) *
2054 1682 : nWeight;
2055 1682 : nTotalB +=
2056 1682 : static_cast<GInt64>(colorEntries[idx].c3) *
2057 1682 : nWeight;
2058 1682 : nTotalWeight += nWeight;
2059 : }
2060 : }
2061 : }
2062 :
2063 200 : if (nTotalWeight == 0)
2064 : {
2065 0 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2066 : }
2067 : else
2068 : {
2069 : GDALColorEntry color;
2070 :
2071 200 : color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
2072 : nTotalWeight);
2073 200 : color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
2074 : nTotalWeight);
2075 200 : color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
2076 : nTotalWeight);
2077 200 : padfDstScanline[iDstPixel - nDstXOff] =
2078 200 : BestColorEntry(colorEntries, color);
2079 : }
2080 : }
2081 : }
2082 : }
2083 :
2084 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
2085 : CPLFree(panGaussMatrixDup);
2086 : #endif
2087 :
2088 86 : return CE_None;
2089 : }
2090 :
2091 : /************************************************************************/
2092 : /* GDALResampleChunk_Mode() */
2093 : /************************************************************************/
2094 :
/** Return whether a and b are the same value (generic case: plain ==). */
template <class T> static inline bool IsSame(T a, T b)
{
    return a == b;
}

/** float flavor: two NaN values are also considered the same. */
template <> bool IsSame<float>(float a, float b)
{
    if (a == b)
        return true;
    return std::isnan(a) && std::isnan(b);
}

/** double flavor: two NaN values are also considered the same. */
template <> bool IsSame<double>(double a, double b)
{
    if (a == b)
        return true;
    return std::isnan(a) && std::isnan(b);
}

/**
 * complex<float> flavor: values whose real and imaginary parts are all NaN
 * are also considered the same.
 */
template <>
bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
{
    if (a == b)
        return true;
    const bool bLeftAllNaN = std::isnan(a.real()) && std::isnan(a.imag());
    const bool bRightAllNaN = std::isnan(b.real()) && std::isnan(b.imag());
    return bLeftAllNaN && bRightAllNaN;
}

/**
 * complex<double> flavor: values whose real and imaginary parts are all NaN
 * are also considered the same.
 */
template <>
bool IsSame<std::complex<double>>(std::complex<double> a,
                                  std::complex<double> b)
{
    if (a == b)
        return true;
    const bool bLeftAllNaN = std::isnan(a.real()) && std::isnan(a.imag());
    const bool bRightAllNaN = std::isnan(b.real()) && std::isnan(b.imag());
    return bLeftAllNaN && bRightAllNaN;
}
2124 :
2125 : template <class T>
2126 136 : static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
2127 : const T *pChunk, T *const pDstBuffer)
2128 :
2129 : {
2130 136 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2131 136 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2132 136 : const double dfSrcXDelta = args.dfSrcXDelta;
2133 136 : const double dfSrcYDelta = args.dfSrcYDelta;
2134 136 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2135 136 : const int nChunkXOff = args.nChunkXOff;
2136 136 : const int nChunkXSize = args.nChunkXSize;
2137 136 : const int nChunkYOff = args.nChunkYOff;
2138 136 : const int nChunkYSize = args.nChunkYSize;
2139 136 : const int nDstXOff = args.nDstXOff;
2140 136 : const int nDstXOff2 = args.nDstXOff2;
2141 136 : const int nDstYOff = args.nDstYOff;
2142 136 : const int nDstYOff2 = args.nDstYOff2;
2143 136 : const bool bHasNoData = args.bHasNoData;
2144 136 : const GDALColorTable *poColorTable = args.poColorTable;
2145 136 : const int nDstXSize = nDstXOff2 - nDstXOff;
2146 :
2147 8 : T tNoDataValue;
2148 : if constexpr (std::is_same<T, std::complex<float>>::value ||
2149 : std::is_same<T, std::complex<double>>::value)
2150 : {
2151 : using BaseT = typename T::value_type;
2152 8 : tNoDataValue =
2153 : std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
2154 : std::numeric_limits<BaseT>::quiet_NaN());
2155 : }
2156 128 : else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
2157 127 : tNoDataValue = 0;
2158 : else
2159 1 : tNoDataValue = static_cast<T>(args.dfNoDataValue);
2160 :
2161 136 : size_t nMaxNumPx = 0;
2162 136 : T *paVals = nullptr;
2163 136 : int *panSums = nullptr;
2164 :
2165 136 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2166 136 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2167 272 : std::vector<int> anVals(256, 0);
2168 :
2169 : /* ==================================================================== */
2170 : /* Loop over destination scanlines. */
2171 : /* ==================================================================== */
2172 7531 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2173 : {
2174 7395 : double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
2175 7395 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
2176 : #ifdef only_pixels_with_more_than_10_pct_participation
2177 : // When oversampling, don't take into account pixels that have a tiny
2178 : // participation in the resulting pixel
2179 : if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2180 : nSrcYOff < nChunkBottomYOff)
2181 : nSrcYOff++;
2182 : #endif
2183 7395 : if (nSrcYOff < nChunkYOff)
2184 0 : nSrcYOff = nChunkYOff;
2185 :
2186 7395 : double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
2187 7395 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
2188 : #ifdef only_pixels_with_more_than_10_pct_participation
2189 : // When oversampling, don't take into account pixels that have a tiny
2190 : // participation in the resulting pixel
2191 : if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2192 : nSrcYOff2 > nChunkYOff)
2193 : nSrcYOff2--;
2194 : #endif
2195 7395 : if (nSrcYOff2 == nSrcYOff)
2196 0 : ++nSrcYOff2;
2197 7395 : if (nSrcYOff2 > nChunkBottomYOff)
2198 0 : nSrcYOff2 = nChunkBottomYOff;
2199 :
2200 7395 : const T *const paSrcScanline =
2201 149 : pChunk +
2202 7395 : (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
2203 7395 : const GByte *pabySrcScanlineNodataMask = nullptr;
2204 7395 : if (pabyChunkNodataMask != nullptr)
2205 1810 : pabySrcScanlineNodataMask =
2206 : pabyChunkNodataMask +
2207 1810 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
2208 :
2209 7395 : T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
2210 : /* --------------------------------------------------------------------
2211 : */
2212 : /* Loop over destination pixels */
2213 : /* --------------------------------------------------------------------
2214 : */
2215 4259580 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2216 : {
2217 4252187 : double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
2218 : // Apply some epsilon to avoid numerical precision issues
2219 4252187 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
2220 : #ifdef only_pixels_with_more_than_10_pct_participation
2221 : // When oversampling, don't take into account pixels that have a
2222 : // tiny participation in the resulting pixel
2223 : if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2224 : nSrcXOff < nChunkRightXOff)
2225 : nSrcXOff++;
2226 : #endif
2227 4252187 : if (nSrcXOff < nChunkXOff)
2228 0 : nSrcXOff = nChunkXOff;
2229 :
2230 4252187 : double dfSrcXOff2 =
2231 4252187 : dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
2232 4252187 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
2233 : #ifdef only_pixels_with_more_than_10_pct_participation
2234 : // When oversampling, don't take into account pixels that have a
2235 : // tiny participation in the resulting pixel
2236 : if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2237 : nSrcXOff2 > nChunkXOff)
2238 : nSrcXOff2--;
2239 : #endif
2240 4252187 : if (nSrcXOff2 == nSrcXOff)
2241 0 : nSrcXOff2++;
2242 4252187 : if (nSrcXOff2 > nChunkRightXOff)
2243 0 : nSrcXOff2 = nChunkRightXOff;
2244 :
2245 4252187 : bool bRegularProcessing = false;
2246 : if constexpr (!std::is_same<T, GByte>::value)
2247 827 : bRegularProcessing = true;
2248 4251360 : else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
2249 0 : bRegularProcessing = true;
2250 :
2251 4252187 : if (bRegularProcessing)
2252 : {
2253 : // Not sure how much sense it makes to run a majority
2254 : // filter on floating point data, but here it is for the sake
2255 : // of compatibility. It won't look right on RGB images by the
2256 : // nature of the filter.
2257 :
2258 827 : if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
2259 2481 : nSrcYOff2 - nSrcYOff > INT_MAX / (nSrcXOff2 - nSrcXOff) ||
2260 827 : static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
2261 827 : static_cast<size_t>(nSrcXOff2 - nSrcXOff) >
2262 827 : std::numeric_limits<size_t>::max() / sizeof(float))
2263 : {
2264 0 : CPLError(CE_Failure, CPLE_NotSupported,
2265 : "Too big downsampling factor");
2266 0 : CPLFree(paVals);
2267 0 : CPLFree(panSums);
2268 0 : return CE_Failure;
2269 : }
2270 827 : const size_t nNumPx =
2271 827 : static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
2272 827 : static_cast<size_t>(nSrcXOff2 - nSrcXOff);
2273 827 : size_t iMaxInd = 0;
2274 827 : size_t iMaxVal = 0;
2275 827 : bool biMaxValdValid = false;
2276 :
2277 827 : if (paVals == nullptr || nNumPx > nMaxNumPx)
2278 : {
2279 : T *paValsNew = static_cast<T *>(
2280 71 : VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
2281 : int *panSumsNew = static_cast<int *>(
2282 71 : VSI_REALLOC_VERBOSE(panSums, nNumPx * sizeof(int)));
2283 71 : if (paValsNew != nullptr)
2284 71 : paVals = paValsNew;
2285 71 : if (panSumsNew != nullptr)
2286 71 : panSums = panSumsNew;
2287 71 : if (paValsNew == nullptr || panSumsNew == nullptr)
2288 : {
2289 0 : CPLFree(paVals);
2290 0 : CPLFree(panSums);
2291 0 : return CE_Failure;
2292 : }
2293 71 : nMaxNumPx = nNumPx;
2294 : }
2295 :
2296 2585 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2297 : {
2298 1758 : const GPtrDiff_t iTotYOff =
2299 1758 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2300 1758 : nChunkXOff;
2301 5690 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2302 : {
2303 3932 : if (pabySrcScanlineNodataMask == nullptr ||
2304 16 : pabySrcScanlineNodataMask[iX + iTotYOff])
2305 : {
2306 3917 : const T val = paSrcScanline[iX + iTotYOff];
2307 3917 : size_t i = 0; // Used after for.
2308 :
2309 : // Check array for existing entry.
2310 14387 : for (; i < iMaxInd; ++i)
2311 17626 : if (IsSame(paVals[i], val) &&
2312 6910 : ++panSums[i] > panSums[iMaxVal])
2313 : {
2314 246 : iMaxVal = i;
2315 246 : biMaxValdValid = true;
2316 246 : break;
2317 : }
2318 :
2319 : // Add to arr if entry not already there.
2320 3917 : if (i == iMaxInd)
2321 : {
2322 3671 : paVals[iMaxInd] = val;
2323 3671 : panSums[iMaxInd] = 1;
2324 :
2325 3671 : if (!biMaxValdValid)
2326 : {
2327 824 : iMaxVal = iMaxInd;
2328 824 : biMaxValdValid = true;
2329 : }
2330 :
2331 3671 : ++iMaxInd;
2332 : }
2333 : }
2334 : }
2335 : }
2336 :
2337 827 : if (!biMaxValdValid)
2338 3 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2339 : else
2340 824 : paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
2341 : }
2342 : else if constexpr (std::is_same<T, GByte>::value)
2343 : // ( eSrcDataType == GDT_Byte && nEntryCount < 256 )
2344 : {
2345 : // So we go here for a paletted or non-paletted byte band.
2346 : // The input values are then between 0 and 255.
2347 4251360 : int nMaxVal = 0;
2348 4251360 : int iMaxInd = -1;
2349 :
2350 : // The cost of this zeroing might be high. Perhaps we should
2351 : // just use the above generic case, and go to this one if the
2352 : // number of source pixels is large enough
2353 4251360 : std::fill(anVals.begin(), anVals.end(), 0);
2354 :
2355 12777700 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2356 : {
2357 8526370 : const GPtrDiff_t iTotYOff =
2358 8526370 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2359 8526370 : nChunkXOff;
2360 25649400 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2361 : {
2362 17123000 : const T val = paSrcScanline[iX + iTotYOff];
2363 17123000 : if (!bHasNoData || val != tNoDataValue)
2364 : {
2365 17123000 : int nVal = static_cast<int>(val);
2366 17123000 : if (++anVals[nVal] > nMaxVal)
2367 : {
2368 : // Sum the density.
2369 : // Is it the most common value so far?
2370 17006300 : iMaxInd = nVal;
2371 17006300 : nMaxVal = anVals[nVal];
2372 : }
2373 : }
2374 : }
2375 : }
2376 :
2377 4251360 : if (iMaxInd == -1)
2378 0 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2379 : else
2380 4251360 : paDstScanline[iDstPixel - nDstXOff] =
2381 : static_cast<T>(iMaxInd);
2382 : }
2383 : }
2384 : }
2385 :
2386 136 : CPLFree(paVals);
2387 136 : CPLFree(panSums);
2388 :
2389 136 : return CE_None;
2390 : }
2391 :
2392 136 : static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
2393 : const void *pChunk, void **ppDstBuffer,
2394 : GDALDataType *peDstBufferDataType)
2395 : {
2396 136 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
2397 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
2398 : GDALGetDataTypeSizeBytes(args.eWrkDataType));
2399 136 : if (*ppDstBuffer == nullptr)
2400 : {
2401 0 : return CE_Failure;
2402 : }
2403 :
2404 136 : CPLAssert(args.eSrcDataType == args.eWrkDataType);
2405 :
2406 136 : *peDstBufferDataType = args.eWrkDataType;
2407 136 : switch (args.eWrkDataType)
2408 : {
2409 : // For mode resampling, as no computation is done, only the
2410 : // size of the data type matters... except for Byte where we have
2411 : // special processing. And for floating point values
2412 65 : case GDT_Byte:
2413 : {
2414 65 : return GDALResampleChunk_ModeT(args,
2415 : static_cast<const GByte *>(pChunk),
2416 65 : static_cast<GByte *>(*ppDstBuffer));
2417 : }
2418 :
2419 4 : case GDT_Int8:
2420 : {
2421 4 : return GDALResampleChunk_ModeT(args,
2422 : static_cast<const int8_t *>(pChunk),
2423 4 : static_cast<int8_t *>(*ppDstBuffer));
2424 : }
2425 :
2426 9 : case GDT_Int16:
2427 : case GDT_UInt16:
2428 : case GDT_Float16:
2429 : {
2430 9 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
2431 9 : return GDALResampleChunk_ModeT(
2432 : args, static_cast<const uint16_t *>(pChunk),
2433 9 : static_cast<uint16_t *>(*ppDstBuffer));
2434 : }
2435 :
2436 15 : case GDT_CInt16:
2437 : case GDT_CFloat16:
2438 : case GDT_Int32:
2439 : case GDT_UInt32:
2440 : {
2441 15 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
2442 15 : return GDALResampleChunk_ModeT(
2443 : args, static_cast<const uint32_t *>(pChunk),
2444 15 : static_cast<uint32_t *>(*ppDstBuffer));
2445 : }
2446 :
2447 17 : case GDT_Float32:
2448 : {
2449 17 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
2450 17 : return GDALResampleChunk_ModeT(args,
2451 : static_cast<const float *>(pChunk),
2452 17 : static_cast<float *>(*ppDstBuffer));
2453 : }
2454 :
2455 12 : case GDT_CInt32:
2456 : case GDT_Int64:
2457 : case GDT_UInt64:
2458 : {
2459 12 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
2460 12 : return GDALResampleChunk_ModeT(
2461 : args, static_cast<const uint64_t *>(pChunk),
2462 12 : static_cast<uint64_t *>(*ppDstBuffer));
2463 : }
2464 :
2465 6 : case GDT_Float64:
2466 : {
2467 6 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
2468 6 : return GDALResampleChunk_ModeT(args,
2469 : static_cast<const double *>(pChunk),
2470 6 : static_cast<double *>(*ppDstBuffer));
2471 : }
2472 :
2473 4 : case GDT_CFloat32:
2474 : {
2475 4 : return GDALResampleChunk_ModeT(
2476 : args, static_cast<const std::complex<float> *>(pChunk),
2477 4 : static_cast<std::complex<float> *>(*ppDstBuffer));
2478 : }
2479 :
2480 4 : case GDT_CFloat64:
2481 : {
2482 4 : return GDALResampleChunk_ModeT(
2483 : args, static_cast<const std::complex<double> *>(pChunk),
2484 4 : static_cast<std::complex<double> *>(*ppDstBuffer));
2485 : }
2486 :
2487 0 : case GDT_Unknown:
2488 : case GDT_TypeCount:
2489 0 : break;
2490 : }
2491 :
2492 0 : CPLAssert(false);
2493 : return CE_Failure;
2494 : }
2495 :
2496 : /************************************************************************/
2497 : /* GDALResampleConvolutionHorizontal() */
2498 : /************************************************************************/
2499 :
/**
 * Weighted sum of nSrcPixelCount consecutive pixels:
 * sum(pChunk[k] * padfWeights[k]) for k in [0, nSrcPixelCount).
 *
 * Pixels are processed four at a time into two independent accumulators,
 * then the remaining ones are added to the first accumulator.
 *
 * @param pChunk          First source pixel.
 * @param padfWeights     One weight per source pixel.
 * @param nSrcPixelCount  Number of pixels / weights.
 * @return the weighted sum (0 when nSrcPixelCount <= 0).
 */
template <class T>
static inline double
GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                  int nSrcPixelCount)
{
    double dfAccFirstPair = 0.0;
    double dfAccSecondPair = 0.0;
    int iPixel = 0;  // Shared by the unrolled loop and the tail loop.
    // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    // manually (untypical) unrolled loop in -O2 and -O3:
    // https://github.com/OSGeo/gdal/issues/9508
#if !defined(__INTEL_CLANG_COMPILER)
    while (iPixel + 3 < nSrcPixelCount)
    {
        const T *const pSrc = pChunk + iPixel;
        const double *const pdfW = padfWeights + iPixel;
        dfAccFirstPair += pSrc[0] * pdfW[0];
        dfAccFirstPair += pSrc[1] * pdfW[1];
        dfAccSecondPair += pSrc[2] * pdfW[2];
        dfAccSecondPair += pSrc[3] * pdfW[3];
        iPixel += 4;
    }
#endif
    while (iPixel < nSrcPixelCount)
    {
        dfAccFirstPair += pChunk[iPixel] * padfWeights[iPixel];
        ++iPixel;
    }
    return dfAccFirstPair + dfAccSecondPair;
}
2526 :
2527 : template <class T>
2528 48 : static inline void GDALResampleConvolutionHorizontalWithMask(
2529 : const T *pChunk, const GByte *pabyMask, const double *padfWeights,
2530 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2531 : {
2532 48 : dfVal = 0;
2533 48 : dfWeightSum = 0;
2534 48 : int i = 0;
2535 48 : for (; i + 3 < nSrcPixelCount; i += 4)
2536 : {
2537 0 : const double dfWeight0 = padfWeights[i] * pabyMask[i];
2538 0 : const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
2539 0 : const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
2540 0 : const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
2541 0 : dfVal += pChunk[i] * dfWeight0;
2542 0 : dfVal += pChunk[i + 1] * dfWeight1;
2543 0 : dfVal += pChunk[i + 2] * dfWeight2;
2544 0 : dfVal += pChunk[i + 3] * dfWeight3;
2545 0 : dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
2546 : }
2547 178 : for (; i < nSrcPixelCount; ++i)
2548 : {
2549 130 : const double dfWeight = padfWeights[i] * pabyMask[i];
2550 130 : dfVal += pChunk[i] * dfWeight;
2551 130 : dfWeightSum += dfWeight;
2552 : }
2553 48 : }
2554 :
/**
 * Weighted sum of nSrcPixelCount consecutive pixels, computed for three
 * rows at once with the same weights.
 *
 * For each row, pixels are processed four at a time into two independent
 * accumulators, then the remaining ones are added to the first accumulator
 * of that row.
 *
 * @param pChunkRow1      First pixel of the first row.
 * @param pChunkRow2      First pixel of the second row.
 * @param pChunkRow3      First pixel of the third row.
 * @param padfWeights     One weight per pixel, shared by the three rows.
 * @param nSrcPixelCount  Number of pixels / weights.
 * @param[out] dfRes1     Weighted sum for the first row.
 * @param[out] dfRes2     Weighted sum for the second row.
 * @param[out] dfRes3     Weighted sum for the third row.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontal_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    double dfRow1A = 0.0;
    double dfRow1B = 0.0;
    double dfRow2A = 0.0;
    double dfRow2B = 0.0;
    double dfRow3A = 0.0;
    double dfRow3B = 0.0;

    int iPixel = 0;  // Shared by the unrolled loop and the tail loop.
    while (iPixel + 3 < nSrcPixelCount)
    {
        const double dfW0 = padfWeights[iPixel];
        const double dfW1 = padfWeights[iPixel + 1];
        const double dfW2 = padfWeights[iPixel + 2];
        const double dfW3 = padfWeights[iPixel + 3];

        dfRow1A += pChunkRow1[iPixel] * dfW0;
        dfRow1A += pChunkRow1[iPixel + 1] * dfW1;
        dfRow1B += pChunkRow1[iPixel + 2] * dfW2;
        dfRow1B += pChunkRow1[iPixel + 3] * dfW3;

        dfRow2A += pChunkRow2[iPixel] * dfW0;
        dfRow2A += pChunkRow2[iPixel + 1] * dfW1;
        dfRow2B += pChunkRow2[iPixel + 2] * dfW2;
        dfRow2B += pChunkRow2[iPixel + 3] * dfW3;

        dfRow3A += pChunkRow3[iPixel] * dfW0;
        dfRow3A += pChunkRow3[iPixel + 1] * dfW1;
        dfRow3B += pChunkRow3[iPixel + 2] * dfW2;
        dfRow3B += pChunkRow3[iPixel + 3] * dfW3;

        iPixel += 4;
    }
    while (iPixel < nSrcPixelCount)
    {
        const double dfW = padfWeights[iPixel];
        dfRow1A += pChunkRow1[iPixel] * dfW;
        dfRow2A += pChunkRow2[iPixel] * dfW;
        dfRow3A += pChunkRow3[iPixel] * dfW;
        ++iPixel;
    }

    dfRes1 = dfRow1A + dfRow1B;
    dfRes2 = dfRow2A + dfRow2B;
    dfRes3 = dfRow3A + dfRow3B;
}
2593 :
2594 : template <class T>
2595 18188 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
2596 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2597 : const double *padfWeights, int nSrcPixelCount, double &dfRes1,
2598 : double &dfRes2, double &dfRes3)
2599 : {
2600 18188 : GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
2601 : padfWeights, nSrcPixelCount, dfRes1,
2602 : dfRes2, dfRes3);
2603 18188 : }
2604 :
2605 : template <class T>
2606 1247346 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
2607 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2608 : const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
2609 : {
2610 1247346 : GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
2611 : padfWeights, 4, dfRes1, dfRes2,
2612 : dfRes3);
2613 1247346 : }
2614 :
2615 : /************************************************************************/
2616 : /* GDALResampleConvolutionVertical() */
2617 : /************************************************************************/
2618 :
/** Weighted sum of one column of samples spaced nStride elements apart.
 *
 * Computes sum(pChunk[k * nStride] * padfWeights[k]) for k in
 * [0, nSrcLineCount). The main loop is unrolled by four and uses two
 * independent accumulators; leftover lines are added to the first one.
 *
 * @param pChunk pointer to the sample of the first line.
 * @param nStride distance, in elements of T, between two consecutive lines.
 * @param padfWeights one weight per line (nSrcLineCount values are read).
 * @param nSrcLineCount number of lines to combine. Values <= 0 yield 0.0.
 * @return the weighted sum as a double.
 */
template <class T>
static inline double
GDALResampleConvolutionVertical(const T *pChunk, int nStride,
                                const double *padfWeights, int nSrcLineCount)
{
    // The running element offset is kept in ptrdiff_t: as an int it could
    // overflow (undefined behavior) once nSrcLineCount * nStride exceeds
    // INT_MAX.
    const std::ptrdiff_t nRowStride = nStride;
    double dfVal1 = 0.0;
    double dfVal2 = 0.0;
    int i = 0;
    std::ptrdiff_t j = 0;
    for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nRowStride)
    {
        dfVal1 += pChunk[j] * padfWeights[i];
        dfVal1 += pChunk[j + nRowStride] * padfWeights[i + 1];
        dfVal2 += pChunk[j + 2 * nRowStride] * padfWeights[i + 2];
        dfVal2 += pChunk[j + 3 * nRowStride] * padfWeights[i + 3];
    }
    // Remaining 0 to 3 lines.
    for (; i < nSrcLineCount; ++i, j += nRowStride)
    {
        dfVal1 += pChunk[j] * padfWeights[i];
    }
    return dfVal1 + dfVal2;
}
2641 :
/** Weighted sums of two adjacent columns of samples.
 *
 * For each line k in [0, nSrcLineCount), the samples pChunk[k * nStride]
 * and pChunk[k * nStride + 1] are multiplied by padfWeights[k] and
 * accumulated separately. The main loop is unrolled by four with two
 * accumulators per column; leftover lines go to the first accumulator of
 * each column.
 *
 * @param pChunk pointer to the first column's sample of the first line.
 * @param nStride distance, in elements of T, between two consecutive lines.
 * @param padfWeights one weight per line (nSrcLineCount values are read).
 * @param nSrcLineCount number of lines to combine.
 * @param dfRes1 receives the weighted sum of the first column.
 * @param dfRes2 receives the weighted sum of the second column.
 */
template <class T>
static inline void GDALResampleConvolutionVertical_2cols(
    const T *pChunk, int nStride, const double *padfWeights, int nSrcLineCount,
    double &dfRes1, double &dfRes2)
{
    // The running element offset is kept in ptrdiff_t: as an int it could
    // overflow (undefined behavior) once nSrcLineCount * nStride exceeds
    // INT_MAX.
    const std::ptrdiff_t nRowStride = nStride;
    double dfVal1 = 0.0;
    double dfVal2 = 0.0;
    double dfVal3 = 0.0;
    double dfVal4 = 0.0;
    int i = 0;
    std::ptrdiff_t j = 0;
    for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nRowStride)
    {
        dfVal1 += pChunk[j] * padfWeights[i];
        dfVal3 += pChunk[j + 1] * padfWeights[i];
        dfVal1 += pChunk[j + nRowStride] * padfWeights[i + 1];
        dfVal3 += pChunk[j + 1 + nRowStride] * padfWeights[i + 1];
        dfVal2 += pChunk[j + 2 * nRowStride] * padfWeights[i + 2];
        dfVal4 += pChunk[j + 1 + 2 * nRowStride] * padfWeights[i + 2];
        dfVal2 += pChunk[j + 3 * nRowStride] * padfWeights[i + 3];
        dfVal4 += pChunk[j + 1 + 3 * nRowStride] * padfWeights[i + 3];
    }
    // Remaining 0 to 3 lines.
    for (; i < nSrcLineCount; ++i, j += nRowStride)
    {
        dfVal1 += pChunk[j] * padfWeights[i];
        dfVal3 += pChunk[j + 1] * padfWeights[i];
    }
    dfRes1 = dfVal1 + dfVal2;
    dfRes2 = dfVal3 + dfVal4;
}
2672 :
2673 : #ifdef USE_SSE2
2674 :
2675 : #ifdef __AVX__
2676 : /************************************************************************/
2677 : /* GDALResampleConvolutionVertical_16cols<T> */
2678 : /************************************************************************/
2679 :
2680 : template <class T>
2681 : static inline void
2682 : GDALResampleConvolutionVertical_16cols(const T *pChunk, int nStride,
2683 : const double *padfWeights,
2684 : int nSrcLineCount, float *afDest)
2685 : {
2686 : int i = 0;
2687 : int j = 0;
2688 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2689 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2690 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2691 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2692 : for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
2693 : {
2694 : XMMReg4Double w0 =
2695 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2696 : XMMReg4Double w1 =
2697 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2698 : XMMReg4Double w2 =
2699 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2700 : XMMReg4Double w3 =
2701 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2702 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2703 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2704 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
2705 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
2706 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2707 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2708 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
2709 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
2710 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2711 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2712 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
2713 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
2714 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2715 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2716 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
2717 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
2718 : }
2719 : for (; i < nSrcLineCount; ++i, j += nStride)
2720 : {
2721 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2722 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2723 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2724 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
2725 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
2726 : }
2727 : v_acc0.Store4Val(afDest);
2728 : v_acc1.Store4Val(afDest + 4);
2729 : v_acc2.Store4Val(afDest + 8);
2730 : v_acc3.Store4Val(afDest + 12);
2731 : }
2732 :
2733 : template <class T>
2734 : static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
2735 : const double *, int,
2736 : double *)
2737 : {
2738 : // Cannot be reached
2739 : CPLAssert(false);
2740 : }
2741 :
2742 : #else
2743 :
2744 : /************************************************************************/
2745 : /* GDALResampleConvolutionVertical_8cols<T> */
2746 : /************************************************************************/
2747 :
2748 : template <class T>
2749 : static inline void
2750 18870600 : GDALResampleConvolutionVertical_8cols(const T *pChunk, int nStride,
2751 : const double *padfWeights,
2752 : int nSrcLineCount, float *afDest)
2753 : {
2754 18870600 : int i = 0;
2755 18870600 : int j = 0;
2756 18870600 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2757 18859800 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2758 34366800 : for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
2759 : {
2760 15486100 : XMMReg4Double w0 =
2761 15486100 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2762 15472600 : XMMReg4Double w1 =
2763 15472600 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2764 15486800 : XMMReg4Double w2 =
2765 15486800 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2766 15483900 : XMMReg4Double w3 =
2767 15483900 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2768 15480600 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2769 15471800 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2770 15465300 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2771 15463400 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2772 15470900 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2773 15459700 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2774 15457200 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2775 15451700 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2776 : }
2777 30292700 : for (; i < nSrcLineCount; ++i, j += nStride)
2778 : {
2779 11412000 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2780 11412000 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2781 11412000 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2782 : }
2783 18880700 : v_acc0.Store4Val(afDest);
2784 18867900 : v_acc1.Store4Val(afDest + 4);
2785 18902700 : }
2786 :
2787 : template <class T>
2788 : static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
2789 : const double *, int,
2790 : double *)
2791 : {
2792 : // Cannot be reached
2793 : CPLAssert(false);
2794 : }
2795 :
2796 : #endif // __AVX__
2797 :
2798 : /************************************************************************/
2799 : /* GDALResampleConvolutionHorizontalSSE2<T> */
2800 : /************************************************************************/
2801 :
2802 : template <class T>
2803 2752656 : static inline double GDALResampleConvolutionHorizontalSSE2(
2804 : const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2805 : {
2806 2752656 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2807 2752391 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2808 2752299 : int i = 0; // Used after for.
2809 2828593 : for (; i + 7 < nSrcPixelCount; i += 8)
2810 : {
2811 : // Retrieve the pixel & accumulate
2812 76095 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
2813 76095 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
2814 76095 : const XMMReg4Double v_weight1 =
2815 76095 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2816 76095 : const XMMReg4Double v_weight2 =
2817 76095 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
2818 :
2819 76095 : v_acc1 += v_pixels1 * v_weight1;
2820 76095 : v_acc2 += v_pixels2 * v_weight2;
2821 : }
2822 :
2823 2752491 : v_acc1 += v_acc2;
2824 :
2825 2752455 : double dfVal = v_acc1.GetHorizSum();
2826 9577100 : for (; i < nSrcPixelCount; ++i)
2827 : {
2828 6824660 : dfVal += pChunk[i] * padfWeightsAligned[i];
2829 : }
2830 2752442 : return dfVal;
2831 : }
2832 :
2833 : /************************************************************************/
2834 : /* GDALResampleConvolutionHorizontal<GByte> */
2835 : /************************************************************************/
2836 :
2837 : template <>
2838 2203990 : inline double GDALResampleConvolutionHorizontal<GByte>(
2839 : const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2840 : {
2841 2203990 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2842 2204000 : nSrcPixelCount);
2843 : }
2844 :
2845 : template <>
2846 548266 : inline double GDALResampleConvolutionHorizontal<GUInt16>(
2847 : const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2848 : {
2849 548266 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2850 548790 : nSrcPixelCount);
2851 : }
2852 :
2853 : /************************************************************************/
2854 : /* GDALResampleConvolutionHorizontalWithMaskSSE2<T> */
2855 : /************************************************************************/
2856 :
2857 : template <class T>
2858 5806833 : static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
2859 : const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
2860 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2861 : {
2862 5806833 : int i = 0; // Used after for.
2863 5806833 : XMMReg4Double v_acc = XMMReg4Double::Zero();
2864 5806833 : XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
2865 16456921 : for (; i + 3 < nSrcPixelCount; i += 4)
2866 : {
2867 10650058 : const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
2868 10650058 : const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
2869 10650058 : XMMReg4Double v_weight =
2870 10650058 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2871 10650058 : v_weight *= v_mask;
2872 10650058 : v_acc += v_pixels * v_weight;
2873 10650058 : v_acc_weight += v_weight;
2874 : }
2875 :
2876 5806833 : dfVal = v_acc.GetHorizSum();
2877 5806833 : dfWeightSum = v_acc_weight.GetHorizSum();
2878 6005033 : for (; i < nSrcPixelCount; ++i)
2879 : {
2880 198202 : const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
2881 198202 : dfVal += pChunk[i] * dfWeight;
2882 198202 : dfWeightSum += dfWeight;
2883 : }
2884 5806833 : }
2885 :
2886 : /************************************************************************/
2887 : /* GDALResampleConvolutionHorizontalWithMask<GByte> */
2888 : /************************************************************************/
2889 :
2890 : template <>
2891 5806770 : inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
2892 : const GByte *pChunk, const GByte *pabyMask,
2893 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2894 : double &dfWeightSum)
2895 : {
2896 5806770 : GDALResampleConvolutionHorizontalWithMaskSSE2(
2897 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2898 : dfWeightSum);
2899 5806770 : }
2900 :
2901 : template <>
2902 63 : inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
2903 : const GUInt16 *pChunk, const GByte *pabyMask,
2904 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2905 : double &dfWeightSum)
2906 : {
2907 63 : GDALResampleConvolutionHorizontalWithMaskSSE2(
2908 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2909 : dfWeightSum);
2910 63 : }
2911 :
2912 : /************************************************************************/
2913 : /* GDALResampleConvolutionHorizontal_3rows_SSE2<T> */
2914 : /************************************************************************/
2915 :
2916 : template <class T>
2917 10026430 : static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
2918 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2919 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2920 : double &dfRes2, double &dfRes3)
2921 : {
2922 10026430 : XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
2923 10026430 : v_acc2 = XMMReg4Double::Zero(),
2924 10026430 : v_acc3 = XMMReg4Double::Zero();
2925 10026430 : int i = 0;
2926 19995066 : for (; i + 7 < nSrcPixelCount; i += 8)
2927 : {
2928 : // Retrieve the pixel & accumulate.
2929 9968656 : XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
2930 9968656 : XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
2931 9968656 : const XMMReg4Double v_weight1 =
2932 9968656 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2933 9968656 : const XMMReg4Double v_weight2 =
2934 9968656 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
2935 :
2936 9968656 : v_acc1 += v_pixels1 * v_weight1;
2937 9968656 : v_acc1 += v_pixels2 * v_weight2;
2938 :
2939 9968656 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
2940 9968656 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
2941 9968656 : v_acc2 += v_pixels1 * v_weight1;
2942 9968656 : v_acc2 += v_pixels2 * v_weight2;
2943 :
2944 9968656 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
2945 9968656 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
2946 9968656 : v_acc3 += v_pixels1 * v_weight1;
2947 9968656 : v_acc3 += v_pixels2 * v_weight2;
2948 : }
2949 :
2950 10026430 : dfRes1 = v_acc1.GetHorizSum();
2951 10026430 : dfRes2 = v_acc2.GetHorizSum();
2952 10026430 : dfRes3 = v_acc3.GetHorizSum();
2953 21493126 : for (; i < nSrcPixelCount; ++i)
2954 : {
2955 11466796 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
2956 11466796 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
2957 11466796 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
2958 : }
2959 10026430 : }
2960 :
2961 : /************************************************************************/
2962 : /* GDALResampleConvolutionHorizontal_3rows<GByte> */
2963 : /************************************************************************/
2964 :
2965 : template <>
2966 10026400 : inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
2967 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
2968 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2969 : double &dfRes2, double &dfRes3)
2970 : {
2971 10026400 : GDALResampleConvolutionHorizontal_3rows_SSE2(
2972 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
2973 : dfRes1, dfRes2, dfRes3);
2974 10026400 : }
2975 :
2976 : template <>
2977 30 : inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
2978 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
2979 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
2980 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
2981 : {
2982 30 : GDALResampleConvolutionHorizontal_3rows_SSE2(
2983 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
2984 : dfRes1, dfRes2, dfRes3);
2985 30 : }
2986 :
2987 : /************************************************************************/
2988 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T> */
2989 : /************************************************************************/
2990 :
2991 : template <class T>
2992 2175580 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
2993 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2994 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2995 : double &dfRes2, double &dfRes3)
2996 : {
2997 2175580 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2998 2175494 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2999 2175539 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
3000 2175536 : int i = 0; // Use after for.
3001 2178825 : for (; i + 3 < nSrcPixelCount; i += 4)
3002 : {
3003 : // Retrieve the pixel & accumulate.
3004 3284 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3005 3284 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3006 3284 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3007 3284 : const XMMReg4Double v_weight =
3008 3284 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3009 :
3010 3284 : v_acc1 += v_pixels1 * v_weight;
3011 3284 : v_acc2 += v_pixels2 * v_weight;
3012 3284 : v_acc3 += v_pixels3 * v_weight;
3013 : }
3014 :
3015 2175545 : dfRes1 = v_acc1.GetHorizSum();
3016 2175505 : dfRes2 = v_acc2.GetHorizSum();
3017 2175514 : dfRes3 = v_acc3.GetHorizSum();
3018 :
3019 6503582 : for (; i < nSrcPixelCount; ++i)
3020 : {
3021 4328058 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3022 4328058 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3023 4328058 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3024 : }
3025 2175524 : }
3026 :
3027 : /************************************************************************/
3028 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte> */
3029 : /************************************************************************/
3030 :
3031 : template <>
3032 2108530 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
3033 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3034 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3035 : double &dfRes2, double &dfRes3)
3036 : {
3037 2108530 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3038 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3039 : dfRes1, dfRes2, dfRes3);
3040 2108540 : }
3041 :
3042 : template <>
3043 66965 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
3044 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3045 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3046 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3047 : {
3048 66965 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3049 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3050 : dfRes1, dfRes2, dfRes3);
3051 67064 : }
3052 :
3053 : /************************************************************************/
3054 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T> */
3055 : /************************************************************************/
3056 :
3057 : template <class T>
3058 12571400 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3059 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3060 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3061 : double &dfRes3)
3062 : {
3063 12571400 : const XMMReg4Double v_weight =
3064 : XMMReg4Double::Load4ValAligned(padfWeightsAligned);
3065 :
3066 : // Retrieve the pixel & accumulate.
3067 12610440 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
3068 12605460 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
3069 12614430 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
3070 :
3071 12619590 : XMMReg4Double v_acc1 = v_pixels1 * v_weight;
3072 12572550 : XMMReg4Double v_acc2 = v_pixels2 * v_weight;
3073 12591670 : XMMReg4Double v_acc3 = v_pixels3 * v_weight;
3074 :
3075 12583870 : dfRes1 = v_acc1.GetHorizSum();
3076 12566090 : dfRes2 = v_acc2.GetHorizSum();
3077 12618950 : dfRes3 = v_acc3.GetHorizSum();
3078 12605350 : }
3079 :
3080 : /************************************************************************/
3081 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte> */
3082 : /************************************************************************/
3083 :
3084 : template <>
3085 6988180 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
3086 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3087 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3088 : double &dfRes3)
3089 : {
3090 6988180 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3091 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3092 : dfRes3);
3093 6960980 : }
3094 :
3095 : template <>
3096 5602700 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
3097 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3098 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
3099 : double &dfRes2, double &dfRes3)
3100 : {
3101 5602700 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3102 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3103 : dfRes3);
3104 5607660 : }
3105 :
3106 : #endif // USE_SSE2
3107 :
3108 : /************************************************************************/
3109 : /* GDALResampleChunk_Convolution() */
3110 : /************************************************************************/
3111 :
3112 : template <class T, class Twork, GDALDataType eWrkDataType>
3113 3730 : static CPLErr GDALResampleChunk_ConvolutionT(
3114 : const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
3115 : FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
3116 : int nKernelRadius, bool bKernelWithNegativeWeights, float fMaxVal)
3117 :
3118 : {
3119 3730 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
3120 3730 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
3121 3730 : const double dfSrcXDelta = args.dfSrcXDelta;
3122 3730 : const double dfSrcYDelta = args.dfSrcYDelta;
3123 3730 : constexpr int nBands = 1;
3124 3730 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
3125 3730 : const int nChunkXOff = args.nChunkXOff;
3126 3730 : const int nChunkXSize = args.nChunkXSize;
3127 3730 : const int nChunkYOff = args.nChunkYOff;
3128 3730 : const int nChunkYSize = args.nChunkYSize;
3129 3730 : const int nDstXOff = args.nDstXOff;
3130 3730 : const int nDstXOff2 = args.nDstXOff2;
3131 3730 : const int nDstYOff = args.nDstYOff;
3132 3730 : const int nDstYOff2 = args.nDstYOff2;
3133 3730 : const bool bHasNoData = args.bHasNoData;
3134 3730 : double dfNoDataValue = args.dfNoDataValue;
3135 :
3136 3730 : if (!bHasNoData)
3137 3685 : dfNoDataValue = 0.0;
3138 3730 : const auto dstDataType = args.eOvrDataType;
3139 3730 : const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
3140 3733 : const double dfReplacementVal =
3141 46 : bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
3142 : : dfNoDataValue;
3143 : // cppcheck-suppress unreadVariable
3144 3733 : const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
3145 3726 : const bool bNoDataValueInt64Valid =
3146 3728 : isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
3147 3726 : const auto nNodataValueInt64 =
3148 : bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
3149 3726 : constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
3150 :
3151 : // TODO: we should have some generic function to do this.
3152 3726 : Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
3153 3726 : Twork fDstMax = cpl::NumericLimits<Twork>::max();
3154 3726 : if (dstDataType == GDT_Byte)
3155 : {
3156 3010 : fDstMin = std::numeric_limits<GByte>::min();
3157 3007 : fDstMax = std::numeric_limits<GByte>::max();
3158 : }
3159 719 : else if (dstDataType == GDT_Int8)
3160 : {
3161 1 : fDstMin = std::numeric_limits<GInt8>::min();
3162 1 : fDstMax = std::numeric_limits<GInt8>::max();
3163 : }
3164 718 : else if (dstDataType == GDT_UInt16)
3165 : {
3166 395 : fDstMin = std::numeric_limits<GUInt16>::min();
3167 388 : fDstMax = std::numeric_limits<GUInt16>::max();
3168 : }
3169 329 : else if (dstDataType == GDT_Int16)
3170 : {
3171 279 : fDstMin = std::numeric_limits<GInt16>::min();
3172 279 : fDstMax = std::numeric_limits<GInt16>::max();
3173 : }
3174 50 : else if (dstDataType == GDT_UInt32)
3175 : {
3176 1 : fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
3177 1 : fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
3178 : }
3179 49 : else if (dstDataType == GDT_Int32)
3180 : {
3181 : // cppcheck-suppress unreadVariable
3182 2 : fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
3183 : // cppcheck-suppress unreadVariable
3184 2 : fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
3185 : }
3186 47 : else if (dstDataType == GDT_UInt64)
3187 : {
3188 : // cppcheck-suppress unreadVariable
3189 1 : fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
3190 : // cppcheck-suppress unreadVariable
3191 1 : fDstMax = static_cast<Twork>(std::numeric_limits<uint64_t>::max());
3192 : }
3193 46 : else if (dstDataType == GDT_Int64)
3194 : {
3195 : // cppcheck-suppress unreadVariable
3196 1 : fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
3197 : // cppcheck-suppress unreadVariable
3198 1 : fDstMax = static_cast<Twork>(std::numeric_limits<int64_t>::max());
3199 : }
3200 :
3201 30807194 : auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
3202 : bNoDataValueInt64Valid, nNodataValueInt64,
3203 : dfNoDataValue, dfReplacementVal](Twork fVal)
3204 : {
3205 14670600 : if (!bHasNoData)
3206 11444200 : return fVal;
3207 :
3208 : // Clamp value before comparing to nodata: this is only needed for
3209 : // kernels with negative weights (Lanczos)
3210 3226380 : Twork fClamped = fVal;
3211 3226380 : if (fClamped < fDstMin)
3212 12874 : fClamped = fDstMin;
3213 3213500 : else if (fClamped > fDstMax)
3214 12852 : fClamped = fDstMax;
3215 3226380 : if (isIntegerDT)
3216 : {
3217 6452730 : if (bNoDataValueInt64Valid &&
3218 3226370 : nNodataValueInt64 == static_cast<GInt64>(std::round(fClamped)))
3219 : {
3220 : // Do not use the nodata value
3221 13869 : return static_cast<Twork>(dfReplacementVal);
3222 : }
3223 : }
3224 9 : else if (dfNoDataValue == fClamped)
3225 : {
3226 : // Do not use the nodata value
3227 1 : return static_cast<Twork>(dfReplacementVal);
3228 : }
3229 3212510 : return fClamped;
3230 : };
3231 :
3232 : /* -------------------------------------------------------------------- */
3233 : /* Allocate work buffers. */
3234 : /* -------------------------------------------------------------------- */
3235 3728 : const int nDstXSize = nDstXOff2 - nDstXOff;
3236 3728 : Twork *pafWrkScanline = nullptr;
3237 3728 : if (dstDataType != eWrkDataType)
3238 : {
3239 : pafWrkScanline =
3240 3688 : static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
3241 3691 : if (pafWrkScanline == nullptr)
3242 0 : return CE_Failure;
3243 : }
3244 :
3245 3731 : const double dfXScale = 1.0 / dfXRatioDstToSrc;
3246 3731 : const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
3247 3731 : const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
3248 3731 : const double dfYScale = 1.0 / dfYRatioDstToSrc;
3249 3731 : const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
3250 3731 : const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
3251 :
3252 : // Temporary array to store result of horizontal filter.
3253 : double *padfHorizontalFiltered = static_cast<double *>(
3254 3731 : VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
3255 :
3256 : // To store convolution coefficients.
3257 3733 : double *padfWeights = static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
3258 : static_cast<int>(2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) +
3259 : 0.5) *
3260 : sizeof(double)));
3261 :
3262 3732 : GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
3263 3732 : if (pabyChunkNodataMask)
3264 : pabyChunkNodataMaskHorizontalFiltered =
3265 401 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
3266 3732 : if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
3267 401 : (pabyChunkNodataMask != nullptr &&
3268 : pabyChunkNodataMaskHorizontalFiltered == nullptr))
3269 : {
3270 0 : VSIFree(pafWrkScanline);
3271 0 : VSIFree(padfHorizontalFiltered);
3272 0 : VSIFreeAligned(padfWeights);
3273 0 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3274 0 : return CE_Failure;
3275 : }
3276 :
3277 : /* ==================================================================== */
3278 : /* First pass: horizontal filter */
3279 : /* ==================================================================== */
3280 3733 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
3281 : #ifdef USE_SSE2
3282 3733 : bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
3283 : #endif
3284 2740691 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
3285 : {
3286 2736955 : const double dfSrcPixel =
3287 2736955 : (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
3288 2736955 : int nSrcPixelStart =
3289 2736955 : static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
3290 2736955 : if (nSrcPixelStart < nChunkXOff)
3291 55224 : nSrcPixelStart = nChunkXOff;
3292 2736955 : int nSrcPixelStop =
3293 2736955 : static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
3294 2736955 : if (nSrcPixelStop > nChunkRightXOff)
3295 55239 : nSrcPixelStop = nChunkRightXOff;
3296 : #if 0
3297 : if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
3298 : {
3299 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3300 : }
3301 : if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
3302 : {
3303 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3304 : }
3305 : #endif
3306 2736955 : const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
3307 2736955 : double dfWeightSum = 0.0;
3308 :
3309 : // Compute convolution coefficients.
3310 2736955 : int nSrcPixel = nSrcPixelStart;
3311 2736955 : double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
3312 3599690 : for (; nSrcPixel + 3 < nSrcPixelStop; nSrcPixel += 4)
3313 : {
3314 862768 : padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
3315 862768 : dfX += dfXScaleWeight;
3316 862768 : padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
3317 862768 : dfX += dfXScaleWeight;
3318 862768 : padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
3319 862768 : dfX += dfXScaleWeight;
3320 862768 : padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
3321 862768 : dfX += dfXScaleWeight;
3322 862734 : dfWeightSum +=
3323 862768 : pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
3324 : }
3325 6719218 : for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
3326 : {
3327 3981953 : const double dfWeight = pfnFilterFunc(dfX);
3328 3982294 : padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3329 3982294 : dfWeightSum += dfWeight;
3330 : }
3331 :
3332 2737275 : const int nHeight = nChunkYSize * nBands;
3333 2737275 : if (pabyChunkNodataMask == nullptr)
3334 : {
3335 2664622 : if (dfWeightSum != 0)
3336 : {
3337 2665062 : const double dfInvWeightSum = 1.0 / dfWeightSum;
3338 9535413 : for (int i = 0; i < nSrcPixelCount; ++i)
3339 6870345 : padfWeights[i] *= dfInvWeightSum;
3340 : }
3341 2664622 : int iSrcLineOff = 0;
3342 : #ifdef USE_SSE2
3343 2664622 : if (nSrcPixelCount == 4)
3344 : {
3345 14411406 : for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3346 : {
3347 13821916 : const GPtrDiff_t j =
3348 13821916 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3349 13821916 : (nSrcPixelStart - nChunkXOff);
3350 13821916 : double dfVal1 = 0.0;
3351 13821916 : double dfVal2 = 0.0;
3352 13821916 : double dfVal3 = 0.0;
3353 13821916 : GDALResampleConvolutionHorizontalPixelCount4_3rows(
3354 13821916 : pChunk + j, pChunk + j + nChunkXSize,
3355 13821916 : pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
3356 : dfVal2, dfVal3);
3357 13860256 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3358 13860256 : nDstXSize +
3359 13860256 : iDstPixel - nDstXOff] = dfVal1;
3360 13860256 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3361 13860256 : 1) *
3362 13860256 : nDstXSize +
3363 13860256 : iDstPixel - nDstXOff] = dfVal2;
3364 13860256 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3365 13860256 : 2) *
3366 13860256 : nDstXSize +
3367 13860256 : iDstPixel - nDstXOff] = dfVal3;
3368 : }
3369 : }
3370 2113469 : else if (bSrcPixelCountLess8)
3371 : {
3372 4228636 : for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3373 : {
3374 2193628 : const GPtrDiff_t j =
3375 2193628 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3376 2193628 : (nSrcPixelStart - nChunkXOff);
3377 2193628 : double dfVal1 = 0.0;
3378 2193628 : double dfVal2 = 0.0;
3379 2193628 : double dfVal3 = 0.0;
3380 2193628 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
3381 2193628 : pChunk + j, pChunk + j + nChunkXSize,
3382 2193628 : pChunk + j + 2 * nChunkXSize, padfWeights,
3383 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3384 2193789 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3385 2193789 : nDstXSize +
3386 2193789 : iDstPixel - nDstXOff] = dfVal1;
3387 2193789 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3388 2193789 : 1) *
3389 2193789 : nDstXSize +
3390 2193789 : iDstPixel - nDstXOff] = dfVal2;
3391 2193789 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3392 2193789 : 2) *
3393 2193789 : nDstXSize +
3394 2193789 : iDstPixel - nDstXOff] = dfVal3;
3395 : }
3396 : }
3397 : else
3398 : #endif
3399 : {
3400 10169832 : for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3401 : {
3402 10091230 : const GPtrDiff_t j =
3403 10091230 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3404 10091230 : (nSrcPixelStart - nChunkXOff);
3405 10091230 : double dfVal1 = 0.0;
3406 10091230 : double dfVal2 = 0.0;
3407 10091230 : double dfVal3 = 0.0;
3408 10091230 : GDALResampleConvolutionHorizontal_3rows(
3409 10091230 : pChunk + j, pChunk + j + nChunkXSize,
3410 10091230 : pChunk + j + 2 * nChunkXSize, padfWeights,
3411 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3412 10091230 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3413 10091230 : nDstXSize +
3414 10091230 : iDstPixel - nDstXOff] = dfVal1;
3415 10091230 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3416 10091230 : 1) *
3417 10091230 : nDstXSize +
3418 10091230 : iDstPixel - nDstXOff] = dfVal2;
3419 10091230 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3420 10091230 : 2) *
3421 10091230 : nDstXSize +
3422 10091230 : iDstPixel - nDstXOff] = dfVal3;
3423 : }
3424 : }
3425 5500519 : for (; iSrcLineOff < nHeight; ++iSrcLineOff)
3426 : {
3427 2797228 : const GPtrDiff_t j =
3428 2797228 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3429 2797228 : (nSrcPixelStart - nChunkXOff);
3430 5549992 : const double dfVal = GDALResampleConvolutionHorizontal(
3431 2797228 : pChunk + j, padfWeights, nSrcPixelCount);
3432 2797410 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3433 2797410 : nDstXSize +
3434 2797410 : iDstPixel - nDstXOff] = dfVal;
3435 : }
3436 : }
3437 : else
3438 : {
3439 18405072 : for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
3440 : {
3441 18333218 : const GPtrDiff_t j =
3442 18333218 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3443 18333218 : (nSrcPixelStart - nChunkXOff);
3444 :
3445 18333218 : if (bKernelWithNegativeWeights)
3446 : {
3447 17852612 : int nConsecutiveValid = 0;
3448 17852612 : int nMaxConsecutiveValid = 0;
3449 165500458 : for (int k = 0; k < nSrcPixelCount; k++)
3450 : {
3451 147648146 : if (pabyChunkNodataMask[j + k])
3452 40762353 : nConsecutiveValid++;
3453 106885793 : else if (nConsecutiveValid)
3454 : {
3455 105332 : nMaxConsecutiveValid = std::max(
3456 105332 : nMaxConsecutiveValid, nConsecutiveValid);
3457 105332 : nConsecutiveValid = 0;
3458 : }
3459 : }
3460 17852612 : nMaxConsecutiveValid =
3461 17852612 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
3462 17852612 : if (nMaxConsecutiveValid < nSrcPixelCount / 2)
3463 : {
3464 12526307 : const size_t nTempOffset =
3465 12526307 : static_cast<size_t>(iSrcLineOff) * nDstXSize +
3466 12526307 : iDstPixel - nDstXOff;
3467 12526307 : padfHorizontalFiltered[nTempOffset] = 0.0;
3468 12526307 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3469 12526307 : continue;
3470 : }
3471 : }
3472 :
3473 5806881 : double dfVal = 0.0;
3474 5806881 : GDALResampleConvolutionHorizontalWithMask(
3475 5806881 : pChunk + j, pabyChunkNodataMask + j, padfWeights,
3476 : nSrcPixelCount, dfVal, dfWeightSum);
3477 5806428 : const size_t nTempOffset =
3478 5806428 : static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
3479 5806428 : nDstXOff;
3480 5806428 : if (dfWeightSum > 0.0)
3481 : {
3482 5762218 : padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
3483 5762218 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
3484 : }
3485 : else
3486 : {
3487 44263 : padfHorizontalFiltered[nTempOffset] = 0.0;
3488 44263 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3489 : }
3490 : }
3491 : }
3492 : }
3493 :
3494 : /* ==================================================================== */
3495 : /* Second pass: vertical filter */
3496 : /* ==================================================================== */
3497 3736 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3498 :
3499 202268 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3500 : {
3501 198532 : Twork *const pafDstScanline =
3502 198532 : pafWrkScanline ? pafWrkScanline
3503 8421 : : static_cast<Twork *>(pDstBuffer) +
3504 8421 : (iDstLine - nDstYOff) * nDstXSize;
3505 :
3506 198532 : const double dfSrcLine =
3507 198532 : (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
3508 198532 : int nSrcLineStart =
3509 198532 : static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
3510 198532 : int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
3511 198532 : if (nSrcLineStart < nChunkYOff)
3512 2359 : nSrcLineStart = nChunkYOff;
3513 198532 : if (nSrcLineStop > nChunkBottomYOff)
3514 2403 : nSrcLineStop = nChunkBottomYOff;
3515 : #if 0
3516 : if( nSrcLineStart < nChunkYOff &&
3517 : nChunkYOff > 0 )
3518 : {
3519 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3520 : }
3521 : if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3522 : {
3523 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3524 : }
3525 : #endif
3526 198532 : const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
3527 198532 : double dfWeightSum = 0.0;
3528 :
3529 : // Compute convolution coefficients.
3530 198532 : int nSrcLine = nSrcLineStart; // Used after for.
3531 198532 : double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
3532 440224 : for (; nSrcLine + 3 < nSrcLineStop;
3533 241692 : nSrcLine += 4, dfY += 4 * dfYScaleWeight)
3534 : {
3535 241691 : padfWeights[nSrcLine - nSrcLineStart] = dfY;
3536 241691 : padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
3537 241691 : padfWeights[nSrcLine + 2 - nSrcLineStart] =
3538 241691 : dfY + 2 * dfYScaleWeight;
3539 241691 : padfWeights[nSrcLine + 3 - nSrcLineStart] =
3540 241691 : dfY + 3 * dfYScaleWeight;
3541 241692 : dfWeightSum +=
3542 241691 : pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
3543 : }
3544 231887 : for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
3545 : {
3546 33364 : const double dfWeight = pfnFilterFunc(dfY);
3547 33354 : padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
3548 33354 : dfWeightSum += dfWeight;
3549 : }
3550 :
3551 198523 : if (pabyChunkNodataMask == nullptr)
3552 : {
3553 163972 : if (dfWeightSum != 0)
3554 : {
3555 163972 : const double dfInvWeightSum = 1.0 / dfWeightSum;
3556 921433 : for (int i = 0; i < nSrcLineCount; ++i)
3557 757461 : padfWeights[i] *= dfInvWeightSum;
3558 : }
3559 : }
3560 :
3561 198523 : if (pabyChunkNodataMask == nullptr)
3562 : {
3563 163966 : int iFilteredPixelOff = 0; // Used after for.
3564 : // j used after for.
3565 163966 : size_t j =
3566 163966 : (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
3567 : #ifdef USE_SSE2
3568 : if constexpr (eWrkDataType == GDT_Float32)
3569 : {
3570 : #ifdef __AVX__
3571 : for (; iFilteredPixelOff + 15 < nDstXSize;
3572 : iFilteredPixelOff += 16, j += 16)
3573 : {
3574 : GDALResampleConvolutionVertical_16cols(
3575 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3576 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3577 : if (bHasNoData)
3578 : {
3579 : for (int k = 0; k < 16; k++)
3580 : {
3581 : pafDstScanline[iFilteredPixelOff + k] =
3582 : replaceValIfNodata(
3583 : pafDstScanline[iFilteredPixelOff + k]);
3584 : }
3585 : }
3586 : }
3587 : #else
3588 19050286 : for (; iFilteredPixelOff + 7 < nDstXSize;
3589 : iFilteredPixelOff += 8, j += 8)
3590 : {
3591 18917450 : GDALResampleConvolutionVertical_8cols(
3592 18917450 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3593 18917450 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3594 18893510 : if (bHasNoData)
3595 : {
3596 17820 : for (int k = 0; k < 8; k++)
3597 : {
3598 15840 : pafDstScanline[iFilteredPixelOff + k] =
3599 15840 : replaceValIfNodata(
3600 15840 : pafDstScanline[iFilteredPixelOff + k]);
3601 : }
3602 : }
3603 : }
3604 : #endif
3605 :
3606 596404 : for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
3607 : {
3608 463636 : const Twork fVal =
3609 463573 : static_cast<Twork>(GDALResampleConvolutionVertical(
3610 463573 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3611 : nSrcLineCount));
3612 463572 : pafDstScanline[iFilteredPixelOff] =
3613 463636 : replaceValIfNodata(fVal);
3614 : }
3615 : }
3616 : else
3617 : #endif
3618 : {
3619 2887210 : for (; iFilteredPixelOff + 1 < nDstXSize;
3620 : iFilteredPixelOff += 2, j += 2)
3621 : {
3622 2880000 : double dfVal1 = 0.0;
3623 2880000 : double dfVal2 = 0.0;
3624 2880000 : GDALResampleConvolutionVertical_2cols(
3625 2880000 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3626 : nSrcLineCount, dfVal1, dfVal2);
3627 5760010 : pafDstScanline[iFilteredPixelOff] =
3628 2880000 : replaceValIfNodata(static_cast<Twork>(dfVal1));
3629 2880000 : pafDstScanline[iFilteredPixelOff + 1] =
3630 2880000 : replaceValIfNodata(static_cast<Twork>(dfVal2));
3631 : }
3632 7206 : if (iFilteredPixelOff < nDstXSize)
3633 : {
3634 2 : const double dfVal = GDALResampleConvolutionVertical(
3635 2 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3636 : nSrcLineCount);
3637 2 : pafDstScanline[iFilteredPixelOff] =
3638 2 : replaceValIfNodata(static_cast<Twork>(dfVal));
3639 : }
3640 : }
3641 : }
3642 : else
3643 : {
3644 17349045 : for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
3645 : ++iFilteredPixelOff)
3646 : {
3647 17314505 : double dfVal = 0.0;
3648 17314505 : dfWeightSum = 0.0;
3649 17314505 : size_t j = (nSrcLineStart - nChunkYOff) *
3650 17314505 : static_cast<size_t>(nDstXSize) +
3651 17314505 : iFilteredPixelOff;
3652 17314505 : if (bKernelWithNegativeWeights)
3653 : {
3654 17089601 : int nConsecutiveValid = 0;
3655 17089601 : int nMaxConsecutiveValid = 0;
3656 121806321 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3657 : {
3658 104717020 : const double dfWeight =
3659 104717020 : padfWeights[i] *
3660 : pabyChunkNodataMaskHorizontalFiltered[j];
3661 104717020 : if (pabyChunkNodataMaskHorizontalFiltered[j])
3662 : {
3663 42068237 : nConsecutiveValid++;
3664 : }
3665 62648683 : else if (nConsecutiveValid)
3666 : {
3667 203800 : nMaxConsecutiveValid = std::max(
3668 203800 : nMaxConsecutiveValid, nConsecutiveValid);
3669 203800 : nConsecutiveValid = 0;
3670 : }
3671 104717020 : dfVal += padfHorizontalFiltered[j] * dfWeight;
3672 104717020 : dfWeightSum += dfWeight;
3673 : }
3674 17089601 : nMaxConsecutiveValid =
3675 17089601 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
3676 17089601 : if (nMaxConsecutiveValid < nSrcLineCount / 2)
3677 : {
3678 8867341 : pafDstScanline[iFilteredPixelOff] =
3679 8867249 : static_cast<Twork>(dfNoDataValue);
3680 8867341 : continue;
3681 : }
3682 : }
3683 : else
3684 : {
3685 1130262 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3686 : {
3687 905432 : const double dfWeight =
3688 905432 : padfWeights[i] *
3689 : pabyChunkNodataMaskHorizontalFiltered[j];
3690 905432 : dfVal += padfHorizontalFiltered[j] * dfWeight;
3691 905432 : dfWeightSum += dfWeight;
3692 : }
3693 : }
3694 8447134 : if (dfWeightSum > 0.0)
3695 : {
3696 8431093 : pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
3697 8431081 : static_cast<Twork>(dfVal / dfWeightSum));
3698 : }
3699 : else
3700 : {
3701 16045 : pafDstScanline[iFilteredPixelOff] =
3702 16021 : static_cast<Twork>(dfNoDataValue);
3703 : }
3704 : }
3705 : }
3706 :
3707 174594 : if (fMaxVal != 0.0f)
3708 : {
3709 192324 : for (int i = 0; i < nDstXSize; ++i)
3710 : {
3711 192088 : if (pafDstScanline[i] > fMaxVal)
3712 96022 : pafDstScanline[i] = fMaxVal;
3713 : }
3714 : }
3715 :
3716 174594 : if (pafWrkScanline)
3717 : {
3718 190111 : GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
3719 : static_cast<GByte *>(pDstBuffer) +
3720 190111 : static_cast<size_t>(iDstLine - nDstYOff) *
3721 190111 : nDstXSize * nDstDataTypeSize,
3722 : dstDataType, nDstDataTypeSize, nDstXSize);
3723 : }
3724 : }
3725 :
3726 3736 : VSIFree(pafWrkScanline);
3727 3736 : VSIFreeAligned(padfWeights);
3728 3736 : VSIFree(padfHorizontalFiltered);
3729 3736 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3730 :
3731 3736 : return CE_None;
3732 : }
3733 :
3734 : static CPLErr
3735 3736 : GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
3736 : const void *pChunk, void **ppDstBuffer,
3737 : GDALDataType *peDstBufferDataType)
3738 : {
3739 : GDALResampleAlg eResample;
3740 3736 : bool bKernelWithNegativeWeights = false;
3741 3736 : if (EQUAL(args.pszResampling, "BILINEAR"))
3742 2597 : eResample = GRA_Bilinear;
3743 1139 : else if (EQUAL(args.pszResampling, "CUBIC"))
3744 : {
3745 1061 : eResample = GRA_Cubic;
3746 1061 : bKernelWithNegativeWeights = true;
3747 : }
3748 78 : else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
3749 23 : eResample = GRA_CubicSpline;
3750 55 : else if (EQUAL(args.pszResampling, "LANCZOS"))
3751 : {
3752 54 : eResample = GRA_Lanczos;
3753 54 : bKernelWithNegativeWeights = true;
3754 : }
3755 : else
3756 : {
3757 1 : CPLAssert(false);
3758 : return CE_Failure;
3759 : }
3760 3735 : const int nKernelRadius = GWKGetFilterRadius(eResample);
3761 3733 : FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
3762 : const FilterFunc4ValuesType pfnFilterFunc4Values =
3763 3736 : GWKGetFilterFunc4Values(eResample);
3764 :
3765 3732 : float fMaxVal = 0.f;
3766 : // Cubic, etc... can have overshoots, so make sure we clamp values to the
3767 : // maximum value if NBITS is set.
3768 3732 : if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
3769 8 : (args.eOvrDataType == GDT_Byte || args.eOvrDataType == GDT_UInt16 ||
3770 0 : args.eOvrDataType == GDT_UInt32))
3771 : {
3772 8 : int nBits = args.nOvrNBITS;
3773 8 : if (nBits == GDALGetDataTypeSize(args.eOvrDataType))
3774 1 : nBits = 0;
3775 8 : if (nBits > 0 && nBits < 32)
3776 7 : fMaxVal = static_cast<float>((1U << nBits) - 1);
3777 : }
3778 :
3779 3732 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
3780 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
3781 : GDALGetDataTypeSizeBytes(args.eOvrDataType));
3782 3735 : if (*ppDstBuffer == nullptr)
3783 : {
3784 0 : return CE_Failure;
3785 : }
3786 3735 : *peDstBufferDataType = args.eOvrDataType;
3787 :
3788 3735 : switch (args.eWrkDataType)
3789 : {
3790 3009 : case GDT_Byte:
3791 : {
3792 3009 : return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32>(
3793 : args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
3794 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3795 3010 : bKernelWithNegativeWeights, fMaxVal);
3796 : }
3797 :
3798 396 : case GDT_UInt16:
3799 : {
3800 396 : return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32>(
3801 : args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
3802 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3803 396 : bKernelWithNegativeWeights, fMaxVal);
3804 : }
3805 :
3806 301 : case GDT_Float32:
3807 : {
3808 301 : return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32>(
3809 : args, static_cast<const float *>(pChunk), *ppDstBuffer,
3810 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3811 301 : bKernelWithNegativeWeights, fMaxVal);
3812 : }
3813 :
3814 29 : case GDT_Float64:
3815 : {
3816 29 : return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64>(
3817 : args, static_cast<const double *>(pChunk), *ppDstBuffer,
3818 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3819 29 : bKernelWithNegativeWeights, fMaxVal);
3820 : }
3821 :
3822 0 : default:
3823 0 : break;
3824 : }
3825 :
3826 0 : CPLAssert(false);
3827 : return CE_Failure;
3828 : }
3829 :
3830 : /************************************************************************/
3831 : /* GDALResampleChunkC32R() */
3832 : /************************************************************************/
3833 :
3834 2 : static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
3835 : const float *pafChunk, const int nChunkYOff,
3836 : const int nChunkYSize, const int nDstYOff,
3837 : const int nDstYOff2, const int nOvrXSize,
3838 : const int nOvrYSize, void **ppDstBuffer,
3839 : GDALDataType *peDstBufferDataType,
3840 : const char *pszResampling)
3841 :
3842 : {
3843 : enum Method
3844 : {
3845 : NEAR,
3846 : AVERAGE,
3847 : AVERAGE_MAGPHASE,
3848 : RMS,
3849 : };
3850 :
3851 2 : Method eMethod = NEAR;
3852 2 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
3853 : {
3854 0 : eMethod = NEAR;
3855 : }
3856 2 : else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
3857 : {
3858 0 : eMethod = AVERAGE_MAGPHASE;
3859 : }
3860 2 : else if (EQUAL(pszResampling, "RMS"))
3861 : {
3862 2 : eMethod = RMS;
3863 : }
3864 0 : else if (STARTS_WITH_CI(pszResampling, "AVER"))
3865 : {
3866 0 : eMethod = AVERAGE;
3867 : }
3868 : else
3869 : {
3870 0 : CPLError(
3871 : CE_Failure, CPLE_NotSupported,
3872 : "Resampling method %s is not supported for complex data types. "
3873 : "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
3874 : pszResampling);
3875 0 : return CE_Failure;
3876 : }
3877 :
3878 2 : const int nOXSize = nOvrXSize;
3879 2 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
3880 : GDALGetDataTypeSizeBytes(GDT_CFloat32));
3881 2 : if (*ppDstBuffer == nullptr)
3882 : {
3883 0 : return CE_Failure;
3884 : }
3885 2 : float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
3886 2 : *peDstBufferDataType = GDT_CFloat32;
3887 :
3888 2 : const int nOYSize = nOvrYSize;
3889 2 : const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
3890 2 : const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
3891 :
3892 : /* ==================================================================== */
3893 : /* Loop over destination scanlines. */
3894 : /* ==================================================================== */
3895 8 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3896 : {
3897 6 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
3898 6 : if (nSrcYOff < nChunkYOff)
3899 0 : nSrcYOff = nChunkYOff;
3900 :
3901 6 : int nSrcYOff2 =
3902 6 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
3903 6 : if (nSrcYOff2 == nSrcYOff)
3904 0 : nSrcYOff2++;
3905 :
3906 6 : if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
3907 : {
3908 2 : if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
3909 0 : nSrcYOff = nSrcHeight - 1;
3910 2 : nSrcYOff2 = nSrcHeight;
3911 : }
3912 6 : if (nSrcYOff2 > nChunkYOff + nChunkYSize)
3913 0 : nSrcYOff2 = nChunkYOff + nChunkYSize;
3914 :
3915 6 : const float *const pafSrcScanline =
3916 6 : pafChunk + ((nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
3917 6 : float *const pafDstScanline =
3918 6 : pafDstBuffer + (iDstLine - nDstYOff) * 2 * nOXSize;
3919 :
3920 : /* --------------------------------------------------------------------
3921 : */
3922 : /* Loop over destination pixels */
3923 : /* --------------------------------------------------------------------
3924 : */
3925 18 : for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
3926 : {
3927 12 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
3928 12 : int nSrcXOff2 =
3929 12 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
3930 12 : if (nSrcXOff2 == nSrcXOff)
3931 0 : nSrcXOff2++;
3932 12 : if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
3933 : {
3934 6 : if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
3935 0 : nSrcXOff = nSrcWidth - 1;
3936 6 : nSrcXOff2 = nSrcWidth;
3937 : }
3938 :
3939 12 : if (eMethod == NEAR)
3940 : {
3941 0 : pafDstScanline[iDstPixel * 2] = pafSrcScanline[nSrcXOff * 2];
3942 0 : pafDstScanline[iDstPixel * 2 + 1] =
3943 0 : pafSrcScanline[nSrcXOff * 2 + 1];
3944 : }
3945 12 : else if (eMethod == AVERAGE_MAGPHASE)
3946 : {
3947 0 : double dfTotalR = 0.0;
3948 0 : double dfTotalI = 0.0;
3949 0 : double dfTotalM = 0.0;
3950 0 : int nCount = 0;
3951 :
3952 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
3953 : {
3954 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
3955 : {
3956 0 : const double dfR =
3957 0 : pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
3958 0 : iY - nSrcYOff) *
3959 0 : nSrcWidth * 2];
3960 0 : const double dfI =
3961 0 : pafSrcScanline[iX * 2 +
3962 0 : static_cast<GPtrDiff_t>(iY -
3963 0 : nSrcYOff) *
3964 0 : nSrcWidth * 2 +
3965 0 : 1];
3966 0 : dfTotalR += dfR;
3967 0 : dfTotalI += dfI;
3968 0 : dfTotalM += std::hypot(dfR, dfI);
3969 0 : ++nCount;
3970 : }
3971 : }
3972 :
3973 0 : CPLAssert(nCount > 0);
3974 0 : if (nCount == 0)
3975 : {
3976 0 : pafDstScanline[iDstPixel * 2] = 0.0;
3977 0 : pafDstScanline[iDstPixel * 2 + 1] = 0.0;
3978 : }
3979 : else
3980 : {
3981 0 : pafDstScanline[iDstPixel * 2] =
3982 0 : static_cast<float>(dfTotalR / nCount);
3983 0 : pafDstScanline[iDstPixel * 2 + 1] =
3984 0 : static_cast<float>(dfTotalI / nCount);
3985 : const double dfM =
3986 0 : std::hypot(pafDstScanline[iDstPixel * 2],
3987 0 : pafDstScanline[iDstPixel * 2 + 1]);
3988 0 : const double dfDesiredM = dfTotalM / nCount;
3989 0 : double dfRatio = 1.0;
3990 0 : if (dfM != 0.0)
3991 0 : dfRatio = dfDesiredM / dfM;
3992 :
3993 0 : pafDstScanline[iDstPixel * 2] *=
3994 0 : static_cast<float>(dfRatio);
3995 0 : pafDstScanline[iDstPixel * 2 + 1] *=
3996 0 : static_cast<float>(dfRatio);
3997 : }
3998 : }
3999 12 : else if (eMethod == RMS)
4000 : {
4001 12 : double dfTotalR = 0.0;
4002 12 : double dfTotalI = 0.0;
4003 12 : int nCount = 0;
4004 :
4005 36 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4006 : {
4007 72 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4008 : {
4009 48 : const double dfR =
4010 48 : pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
4011 48 : iY - nSrcYOff) *
4012 48 : nSrcWidth * 2];
4013 48 : const double dfI =
4014 48 : pafSrcScanline[iX * 2 +
4015 48 : static_cast<GPtrDiff_t>(iY -
4016 48 : nSrcYOff) *
4017 48 : nSrcWidth * 2 +
4018 48 : 1];
4019 :
4020 48 : dfTotalR += SQUARE(dfR);
4021 48 : dfTotalI += SQUARE(dfI);
4022 :
4023 48 : ++nCount;
4024 : }
4025 : }
4026 :
4027 12 : CPLAssert(nCount > 0);
4028 12 : if (nCount == 0)
4029 : {
4030 0 : pafDstScanline[iDstPixel * 2] = 0.0;
4031 0 : pafDstScanline[iDstPixel * 2 + 1] = 0.0;
4032 : }
4033 : else
4034 : {
4035 : /* compute RMS */
4036 12 : pafDstScanline[iDstPixel * 2] =
4037 12 : static_cast<float>(sqrt(dfTotalR / nCount));
4038 12 : pafDstScanline[iDstPixel * 2 + 1] =
4039 12 : static_cast<float>(sqrt(dfTotalI / nCount));
4040 : }
4041 : }
4042 0 : else if (eMethod == AVERAGE)
4043 : {
4044 0 : double dfTotalR = 0.0;
4045 0 : double dfTotalI = 0.0;
4046 0 : int nCount = 0;
4047 :
4048 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4049 : {
4050 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4051 : {
4052 : // TODO(schwehr): Maybe use std::complex?
4053 0 : dfTotalR +=
4054 0 : pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
4055 0 : iY - nSrcYOff) *
4056 0 : nSrcWidth * 2];
4057 0 : dfTotalI += pafSrcScanline[iX * 2 +
4058 0 : static_cast<GPtrDiff_t>(
4059 0 : iY - nSrcYOff) *
4060 0 : nSrcWidth * 2 +
4061 0 : 1];
4062 0 : ++nCount;
4063 : }
4064 : }
4065 :
4066 0 : CPLAssert(nCount > 0);
4067 0 : if (nCount == 0)
4068 : {
4069 0 : pafDstScanline[iDstPixel * 2] = 0.0;
4070 0 : pafDstScanline[iDstPixel * 2 + 1] = 0.0;
4071 : }
4072 : else
4073 : {
4074 0 : pafDstScanline[iDstPixel * 2] =
4075 0 : static_cast<float>(dfTotalR / nCount);
4076 0 : pafDstScanline[iDstPixel * 2 + 1] =
4077 0 : static_cast<float>(dfTotalI / nCount);
4078 : }
4079 : }
4080 : }
4081 : }
4082 :
4083 2 : return CE_None;
4084 : }
4085 :
4086 : /************************************************************************/
4087 : /* GDALRegenerateCascadingOverviews() */
4088 : /* */
4089 : /* Generate a list of overviews in order from largest to */
4090 : /* smallest, computing each from the next larger. */
4091 : /************************************************************************/
4092 :
4093 42 : static CPLErr GDALRegenerateCascadingOverviews(
4094 : GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
4095 : const char *pszResampling, GDALProgressFunc pfnProgress,
4096 : void *pProgressData, CSLConstList papszOptions)
4097 :
4098 : {
4099 : /* -------------------------------------------------------------------- */
4100 : /* First, we must put the overviews in order from largest to */
4101 : /* smallest. */
4102 : /* -------------------------------------------------------------------- */
4103 120 : for (int i = 0; i < nOverviews - 1; ++i)
4104 : {
4105 270 : for (int j = 0; j < nOverviews - i - 1; ++j)
4106 : {
4107 192 : if (papoOvrBands[j]->GetXSize() *
4108 192 : static_cast<float>(papoOvrBands[j]->GetYSize()) <
4109 192 : papoOvrBands[j + 1]->GetXSize() *
4110 192 : static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
4111 : {
4112 0 : GDALRasterBand *poTempBand = papoOvrBands[j];
4113 0 : papoOvrBands[j] = papoOvrBands[j + 1];
4114 0 : papoOvrBands[j + 1] = poTempBand;
4115 : }
4116 : }
4117 : }
4118 :
4119 : /* -------------------------------------------------------------------- */
4120 : /* Count total pixels so we can prepare appropriate scaled */
4121 : /* progress functions. */
4122 : /* -------------------------------------------------------------------- */
4123 42 : double dfTotalPixels = 0.0;
4124 :
4125 162 : for (int i = 0; i < nOverviews; ++i)
4126 : {
4127 120 : dfTotalPixels += papoOvrBands[i]->GetXSize() *
4128 120 : static_cast<double>(papoOvrBands[i]->GetYSize());
4129 : }
4130 :
4131 : /* -------------------------------------------------------------------- */
4132 : /* Generate all the bands. */
4133 : /* -------------------------------------------------------------------- */
4134 42 : double dfPixelsProcessed = 0.0;
4135 :
4136 162 : for (int i = 0; i < nOverviews; ++i)
4137 : {
4138 120 : GDALRasterBand *poBaseBand = poSrcBand;
4139 120 : if (i != 0)
4140 78 : poBaseBand = papoOvrBands[i - 1];
4141 :
4142 120 : double dfPixels = papoOvrBands[i]->GetXSize() *
4143 120 : static_cast<double>(papoOvrBands[i]->GetYSize());
4144 :
4145 240 : void *pScaledProgressData = GDALCreateScaledProgress(
4146 : dfPixelsProcessed / dfTotalPixels,
4147 120 : (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
4148 : pProgressData);
4149 :
4150 240 : const CPLErr eErr = GDALRegenerateOverviewsEx(
4151 : poBaseBand, 1,
4152 120 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
4153 : pszResampling, GDALScaledProgress, pScaledProgressData,
4154 : papszOptions);
4155 120 : GDALDestroyScaledProgress(pScaledProgressData);
4156 :
4157 120 : if (eErr != CE_None)
4158 0 : return eErr;
4159 :
4160 120 : dfPixelsProcessed += dfPixels;
4161 :
4162 : // Only do the bit2grayscale promotion on the base band.
4163 120 : if (STARTS_WITH_CI(pszResampling,
4164 : "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
4165 8 : pszResampling = "AVERAGE";
4166 : }
4167 :
4168 42 : return CE_None;
4169 : }
4170 :
4171 : /************************************************************************/
4172 : /* GDALGetResampleFunction() */
4173 : /************************************************************************/
4174 :
4175 3906 : GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
4176 : int *pnRadius)
4177 : {
4178 3906 : if (pnRadius)
4179 3905 : *pnRadius = 0;
4180 3906 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
4181 448 : return GDALResampleChunk_Near;
4182 3458 : else if (STARTS_WITH_CI(pszResampling, "AVER") ||
4183 2942 : EQUAL(pszResampling, "RMS"))
4184 543 : return GDALResampleChunk_AverageOrRMS;
4185 2915 : else if (EQUAL(pszResampling, "GAUSS"))
4186 : {
4187 26 : if (pnRadius)
4188 26 : *pnRadius = 1;
4189 26 : return GDALResampleChunk_Gauss;
4190 : }
4191 2889 : else if (EQUAL(pszResampling, "MODE"))
4192 96 : return GDALResampleChunk_Mode;
4193 2793 : else if (EQUAL(pszResampling, "CUBIC"))
4194 : {
4195 416 : if (pnRadius)
4196 415 : *pnRadius = GWKGetFilterRadius(GRA_Cubic);
4197 416 : return GDALResampleChunk_Convolution;
4198 : }
4199 2377 : else if (EQUAL(pszResampling, "CUBICSPLINE"))
4200 : {
4201 3 : if (pnRadius)
4202 3 : *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
4203 3 : return GDALResampleChunk_Convolution;
4204 : }
4205 2374 : else if (EQUAL(pszResampling, "LANCZOS"))
4206 : {
4207 8 : if (pnRadius)
4208 8 : *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
4209 8 : return GDALResampleChunk_Convolution;
4210 : }
4211 2366 : else if (EQUAL(pszResampling, "BILINEAR"))
4212 : {
4213 2367 : if (pnRadius)
4214 2367 : *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
4215 2367 : return GDALResampleChunk_Convolution;
4216 : }
4217 : else
4218 : {
4219 0 : CPLError(
4220 : CE_Failure, CPLE_AppDefined,
4221 : "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
4222 : pszResampling);
4223 0 : return nullptr;
4224 : }
4225 : }
4226 :
4227 : /************************************************************************/
4228 : /* GDALGetOvrWorkDataType() */
4229 : /************************************************************************/
4230 :
4231 3801 : GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
4232 : GDALDataType eSrcDataType)
4233 : {
4234 3801 : if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
4235 : {
4236 532 : return eSrcDataType;
4237 : }
4238 3269 : else if (eSrcDataType == GDT_Byte &&
4239 2949 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4240 2489 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4241 2257 : EQUAL(pszResampling, "CUBICSPLINE") ||
4242 2254 : EQUAL(pszResampling, "LANCZOS") ||
4243 2249 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4244 : {
4245 2943 : return GDT_Byte;
4246 : }
4247 326 : else if (eSrcDataType == GDT_UInt16 &&
4248 122 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4249 118 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4250 3 : EQUAL(pszResampling, "CUBICSPLINE") ||
4251 3 : EQUAL(pszResampling, "LANCZOS") ||
4252 2 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4253 : {
4254 115 : return GDT_UInt16;
4255 : }
4256 211 : else if (EQUAL(pszResampling, "GAUSS"))
4257 20 : return GDT_Float64;
4258 :
4259 191 : if (eSrcDataType == GDT_Byte || eSrcDataType == GDT_Int8 ||
4260 184 : eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
4261 : eSrcDataType == GDT_Float32)
4262 : {
4263 155 : return GDT_Float32;
4264 : }
4265 36 : return GDT_Float64;
4266 : }
4267 :
4268 : namespace
4269 : {
4270 : // Structure to hold a pointer to free with CPLFree()
4271 : struct PointerHolder
4272 : {
4273 : void *ptr = nullptr;
4274 :
4275 34744 : explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
4276 : {
4277 34744 : }
4278 :
4279 34747 : ~PointerHolder()
4280 34747 : {
4281 34747 : CPLFree(ptr);
4282 34747 : }
4283 :
4284 : PointerHolder(const PointerHolder &) = delete;
4285 : PointerHolder &operator=(const PointerHolder &) = delete;
4286 : };
4287 : } // namespace
4288 :
4289 : /************************************************************************/
4290 : /* GDALRegenerateOverviews() */
4291 : /************************************************************************/
4292 :
4293 : /**
4294 : * \brief Generate downsampled overviews.
4295 : *
4296 : * This function will generate one or more overview images from a base image
4297 : * using the requested downsampling algorithm. Its primary use is for
4298 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4299 : * used to generate downsampled images in one file from another outside the
4300 : * overview architecture.
4301 : *
4302 : * The output bands need to exist in advance.
4303 : *
4304 : * The full set of resampling algorithms is documented in
4305 : * GDALDataset::BuildOverviews().
4306 : *
4307 : * This function will honour properly NODATA_VALUES tuples (special dataset
4308 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4309 : * considered as the nodata value and not each value of the triplet
4310 : * independently per band.
4311 : *
4312 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4313 : * to "ALL_CPUS" or a integer value to specify the number of threads to use for
4314 : * overview computation.
4315 : *
4316 : * @param hSrcBand the source (base level) band.
4317 : * @param nOverviewCount the number of downsampled bands being generated.
4318 : * @param pahOvrBands the list of downsampled bands to be generated.
4319 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4320 : * @param pfnProgress progress report function.
4321 : * @param pProgressData progress function callback data.
4322 : * @return CE_None on success or CE_Failure on failure.
4323 : */
4324 250 : CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
4325 : GDALRasterBandH *pahOvrBands,
4326 : const char *pszResampling,
4327 : GDALProgressFunc pfnProgress,
4328 : void *pProgressData)
4329 :
4330 : {
4331 250 : return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
4332 : pszResampling, pfnProgress, pProgressData,
4333 250 : nullptr);
4334 : }
4335 :
4336 : /************************************************************************/
4337 : /* GDALRegenerateOverviewsEx() */
4338 : /************************************************************************/
4339 :
4340 : /**
4341 : * \brief Generate downsampled overviews.
4342 : *
4343 : * This function will generate one or more overview images from a base image
4344 : * using the requested downsampling algorithm. Its primary use is for
4345 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4346 : * used to generate downsampled images in one file from another outside the
4347 : * overview architecture.
4348 : *
4349 : * The output bands need to exist in advance.
4350 : *
4351 : * The full set of resampling algorithms is documented in
4352 : * GDALDataset::BuildOverviews().
4353 : *
4354 : * This function will honour properly NODATA_VALUES tuples (special dataset
4355 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4356 : * considered as the nodata value and not each value of the triplet
4357 : * independently per band.
4358 : *
4359 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4360 : * to "ALL_CPUS" or a integer value to specify the number of threads to use for
4361 : * overview computation.
4362 : *
4363 : * @param hSrcBand the source (base level) band.
4364 : * @param nOverviewCount the number of downsampled bands being generated.
4365 : * @param pahOvrBands the list of downsampled bands to be generated.
4366 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4367 : * @param pfnProgress progress report function.
4368 : * @param pProgressData progress function callback data.
4369 : * @param papszOptions NULL terminated list of options as key=value pairs, or
4370 : * NULL
4371 : * @return CE_None on success or CE_Failure on failure.
4372 : * @since GDAL 3.6
4373 : */
4374 819 : CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
4375 : GDALRasterBandH *pahOvrBands,
4376 : const char *pszResampling,
4377 : GDALProgressFunc pfnProgress,
4378 : void *pProgressData, CSLConstList papszOptions)
4379 :
4380 : {
4381 819 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
4382 819 : GDALRasterBand **papoOvrBands =
4383 : reinterpret_cast<GDALRasterBand **>(pahOvrBands);
4384 :
4385 819 : if (pfnProgress == nullptr)
4386 250 : pfnProgress = GDALDummyProgress;
4387 :
4388 819 : if (EQUAL(pszResampling, "NONE"))
4389 61 : return CE_None;
4390 :
4391 758 : int nKernelRadius = 0;
4392 : GDALResampleFunction pfnResampleFn =
4393 758 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
4394 :
4395 758 : if (pfnResampleFn == nullptr)
4396 0 : return CE_Failure;
4397 :
4398 : /* -------------------------------------------------------------------- */
4399 : /* Check color tables... */
4400 : /* -------------------------------------------------------------------- */
4401 758 : GDALColorTable *poColorTable = nullptr;
4402 :
4403 397 : if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
4404 1590 : EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
4405 446 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4406 : {
4407 9 : poColorTable = poSrcBand->GetColorTable();
4408 9 : if (poColorTable != nullptr)
4409 : {
4410 9 : if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
4411 : {
4412 0 : CPLError(CE_Warning, CPLE_AppDefined,
4413 : "Computing overviews on palette index raster bands "
4414 : "with a palette whose color interpretation is not RGB "
4415 : "will probably lead to unexpected results.");
4416 0 : poColorTable = nullptr;
4417 : }
4418 9 : else if (poColorTable->IsIdentity())
4419 : {
4420 0 : poColorTable = nullptr;
4421 : }
4422 : }
4423 : else
4424 : {
4425 0 : CPLError(CE_Warning, CPLE_AppDefined,
4426 : "Computing overviews on palette index raster bands "
4427 : "without a palette will probably lead to unexpected "
4428 : "results.");
4429 : }
4430 : }
4431 : // Not ready yet
4432 2193 : else if ((EQUAL(pszResampling, "CUBIC") ||
4433 695 : EQUAL(pszResampling, "CUBICSPLINE") ||
4434 695 : EQUAL(pszResampling, "LANCZOS") ||
4435 1501 : EQUAL(pszResampling, "BILINEAR")) &&
4436 57 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4437 : {
4438 0 : CPLError(CE_Warning, CPLE_AppDefined,
4439 : "Computing %s overviews on palette index raster bands "
4440 : "will probably lead to unexpected results.",
4441 : pszResampling);
4442 : }
4443 :
4444 : // If we have a nodata mask and we are doing something more complicated
4445 : // than nearest neighbouring, we have to fetch to nodata mask.
4446 :
4447 758 : GDALRasterBand *poMaskBand = nullptr;
4448 758 : bool bUseNoDataMask = false;
4449 758 : bool bCanUseCascaded = true;
4450 :
4451 758 : if (!STARTS_WITH_CI(pszResampling, "NEAR"))
4452 : {
4453 : // Special case if we are an alpha/mask band. We want it to be
4454 : // considered as the mask band to avoid alpha=0 to be taken into account
4455 : // in average computation.
4456 503 : if (poSrcBand->IsMaskBand())
4457 : {
4458 90 : poMaskBand = poSrcBand;
4459 90 : bUseNoDataMask = true;
4460 : }
4461 : else
4462 : {
4463 413 : poMaskBand = poSrcBand->GetMaskBand();
4464 413 : const int nMaskFlags = poSrcBand->GetMaskFlags();
4465 413 : bCanUseCascaded =
4466 413 : (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
4467 413 : bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
4468 : }
4469 : }
4470 :
4471 : /* -------------------------------------------------------------------- */
4472 : /* If we are operating on multiple overviews, and using */
4473 : /* averaging, lets do them in cascading order to reduce the */
4474 : /* amount of computation. */
4475 : /* -------------------------------------------------------------------- */
4476 :
4477 : // In case the mask made be computed from another band of the dataset,
4478 : // we can't use cascaded generation, as the computation of the overviews
4479 : // of the band used for the mask band may not have yet occurred (#3033).
4480 758 : if ((STARTS_WITH_CI(pszResampling, "AVER") ||
4481 397 : EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
4482 366 : EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
4483 312 : EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
4484 758 : EQUAL(pszResampling, "MODE")) &&
4485 42 : nOverviewCount > 1 && bCanUseCascaded)
4486 42 : return GDALRegenerateCascadingOverviews(
4487 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
4488 42 : pProgressData, papszOptions);
4489 :
4490 : /* -------------------------------------------------------------------- */
4491 : /* Setup one horizontal swath to read from the raw buffer. */
4492 : /* -------------------------------------------------------------------- */
4493 716 : int nFRXBlockSize = 0;
4494 716 : int nFRYBlockSize = 0;
4495 716 : poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
4496 :
4497 716 : const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
4498 1177 : const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
4499 1131 : EQUAL(pszResampling, "MODE") ||
4500 415 : !GDALDataTypeIsComplex(eSrcDataType);
4501 : const GDALDataType eWrkDataType =
4502 : bUseGenericResampleFn
4503 716 : ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
4504 716 : : GDT_CFloat32;
4505 :
4506 716 : const int nWidth = poSrcBand->GetXSize();
4507 716 : const int nHeight = poSrcBand->GetYSize();
4508 :
4509 716 : int nMaxOvrFactor = 1;
4510 1531 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
4511 : {
4512 815 : const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
4513 815 : const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
4514 815 : nMaxOvrFactor = std::max(
4515 : nMaxOvrFactor,
4516 815 : static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
4517 815 : nMaxOvrFactor = std::max(
4518 : nMaxOvrFactor,
4519 815 : static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
4520 : }
4521 :
4522 716 : int nFullResYChunk = nFRYBlockSize;
4523 716 : int nMaxChunkYSizeQueried = 0;
4524 :
4525 : const auto UpdateChunkHeightAndGetChunkSize =
4526 9286 : [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
4527 27858 : eWrkDataType, nWidth]()
4528 : {
4529 : // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
4530 : // + nFullResYChunk) / nMaxOvrFactor)
4531 9286 : nFullResYChunk = std::max(nFullResYChunk, 2 * nMaxOvrFactor);
4532 9286 : nMaxChunkYSizeQueried =
4533 9286 : nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
4534 9286 : return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
4535 9286 : nMaxChunkYSizeQueried * nWidth;
4536 716 : };
4537 :
4538 : // Only configurable for debug / testing
4539 : const char *pszChunkYSize =
4540 716 : CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
4541 716 : if (pszChunkYSize)
4542 : {
4543 : // coverity[tainted_data]
4544 0 : nFullResYChunk = atoi(pszChunkYSize);
4545 : }
4546 :
4547 : // Only configurable for debug / testing
4548 : const int nChunkMaxSize =
4549 716 : atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
4550 :
4551 716 : auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
4552 716 : if (nChunkSize > nChunkMaxSize)
4553 : {
4554 3 : if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
4555 9 : !GDALDataTypeIsComplex(eSrcDataType) &&
4556 3 : (!STARTS_WITH_CI(pszResampling, "AVER") ||
4557 0 : EQUAL(pszResampling, "AVERAGE")))
4558 : {
4559 : // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
4560 : // which use a block based strategy, which is much less memory
4561 : // hungry.
4562 3 : return GDALRegenerateOverviewsMultiBand(
4563 : 1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
4564 3 : pfnProgress, pProgressData, papszOptions);
4565 : }
4566 0 : else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
4567 : {
4568 0 : return GDALRegenerateCascadingOverviews(
4569 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
4570 0 : pfnProgress, pProgressData, papszOptions);
4571 : }
4572 : }
4573 713 : else if (pszChunkYSize == nullptr)
4574 : {
4575 : // Try to get as close as possible to nChunkMaxSize
4576 9283 : while (nChunkSize * 2 < nChunkMaxSize)
4577 : {
4578 8570 : nFullResYChunk *= 2;
4579 8570 : nChunkSize = UpdateChunkHeightAndGetChunkSize();
4580 : }
4581 : }
4582 :
4583 713 : int nHasNoData = 0;
4584 713 : const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
4585 713 : const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
4586 : const bool bPropagateNoData =
4587 713 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
4588 :
4589 : // Structure describing a resampling job
4590 : struct OvrJob
4591 : {
4592 : // Buffers to free when job is finished
4593 : std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
4594 : std::shared_ptr<PointerHolder> oSrcBufferHolder{};
4595 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
4596 :
4597 : GDALRasterBand *poDstBand = nullptr;
4598 :
4599 : // Input parameters of pfnResampleFn
4600 : GDALResampleFunction pfnResampleFn = nullptr;
4601 : int nSrcWidth = 0;
4602 : int nSrcHeight = 0;
4603 : int nDstWidth = 0;
4604 : GDALOverviewResampleArgs args{};
4605 : const void *pChunk = nullptr;
4606 : bool bUseGenericResampleFn = false;
4607 :
4608 : // Output values of resampling function
4609 : CPLErr eErr = CE_Failure;
4610 : void *pDstBuffer = nullptr;
4611 : GDALDataType eDstBufferDataType = GDT_Unknown;
4612 :
4613 : // Synchronization
4614 : bool bFinished = false;
4615 : std::mutex mutex{};
4616 : std::condition_variable cv{};
4617 :
4618 0 : void SetSrcMaskBufferHolder(
4619 : const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
4620 : {
4621 0 : oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
4622 0 : }
4623 :
4624 0 : void SetSrcBufferHolder(
4625 : const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
4626 : {
4627 0 : oSrcBufferHolder = oSrcBufferHolderIn;
4628 0 : }
4629 : };
4630 :
4631 : // Thread function to resample
4632 813 : const auto JobResampleFunc = [](void *pData)
4633 : {
4634 813 : OvrJob *poJob = static_cast<OvrJob *>(pData);
4635 :
4636 813 : if (poJob->bUseGenericResampleFn)
4637 : {
4638 811 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
4639 : &(poJob->pDstBuffer),
4640 : &(poJob->eDstBufferDataType));
4641 : }
4642 : else
4643 : {
4644 2 : poJob->eErr = GDALResampleChunkC32R(
4645 : poJob->nSrcWidth, poJob->nSrcHeight,
4646 2 : static_cast<const float *>(poJob->pChunk),
4647 : poJob->args.nChunkYOff, poJob->args.nChunkYSize,
4648 : poJob->args.nDstYOff, poJob->args.nDstYOff2,
4649 : poJob->args.nOvrXSize, poJob->args.nOvrYSize,
4650 : &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
4651 : poJob->args.pszResampling);
4652 : }
4653 :
4654 : poJob->oDstBufferHolder =
4655 813 : std::make_unique<PointerHolder>(poJob->pDstBuffer);
4656 :
4657 : {
4658 1626 : std::lock_guard<std::mutex> guard(poJob->mutex);
4659 813 : poJob->bFinished = true;
4660 813 : poJob->cv.notify_one();
4661 : }
4662 813 : };
4663 :
4664 : // Function to write resample data to target band
4665 813 : const auto WriteJobData = [](const OvrJob *poJob)
4666 : {
4667 1626 : return poJob->poDstBand->RasterIO(
4668 813 : GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
4669 813 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
4670 813 : poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
4671 813 : poJob->eDstBufferDataType, 0, 0, nullptr);
4672 : };
4673 :
4674 : // Wait for completion of oldest job and serialize it
4675 : const auto WaitAndFinalizeOldestJob =
4676 0 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
4677 : {
4678 0 : auto poOldestJob = jobList.front().get();
4679 : {
4680 0 : std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
4681 : // coverity[missing_lock:FALSE]
4682 0 : while (!poOldestJob->bFinished)
4683 : {
4684 0 : poOldestJob->cv.wait(oGuard);
4685 : }
4686 : }
4687 0 : CPLErr l_eErr = poOldestJob->eErr;
4688 0 : if (l_eErr == CE_None)
4689 : {
4690 0 : l_eErr = WriteJobData(poOldestJob);
4691 : }
4692 :
4693 0 : jobList.pop_front();
4694 0 : return l_eErr;
4695 : };
4696 :
4697 : // Queue of jobs
4698 1426 : std::list<std::unique_ptr<OvrJob>> jobList;
4699 :
4700 713 : GByte *pabyChunkNodataMask = nullptr;
4701 713 : void *pChunk = nullptr;
4702 :
4703 713 : const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
4704 2852 : const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
4705 713 : ? CPLGetNumCPUs()
4706 713 : : atoi(pszThreads)));
4707 : auto poThreadPool =
4708 713 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
4709 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
4710 1426 : : std::unique_ptr<CPLJobQueue>(nullptr);
4711 :
4712 : /* -------------------------------------------------------------------- */
4713 : /* Loop over image operating on chunks. */
4714 : /* -------------------------------------------------------------------- */
4715 713 : int nChunkYOff = 0;
4716 713 : CPLErr eErr = CE_None;
4717 :
4718 1431 : for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
4719 718 : nChunkYOff += nFullResYChunk)
4720 : {
4721 718 : if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
4722 : pProgressData))
4723 : {
4724 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
4725 0 : eErr = CE_Failure;
4726 : }
4727 :
4728 718 : if (nFullResYChunk + nChunkYOff > nHeight)
4729 711 : nFullResYChunk = nHeight - nChunkYOff;
4730 :
4731 718 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
4732 718 : int nChunkYSizeQueried =
4733 718 : nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
4734 718 : if (nChunkYOffQueried < 0)
4735 : {
4736 62 : nChunkYSizeQueried += nChunkYOffQueried;
4737 62 : nChunkYOffQueried = 0;
4738 : }
4739 718 : if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
4740 62 : nChunkYSizeQueried = nHeight - nChunkYOffQueried;
4741 :
4742 : // Avoid accumulating too many tasks and exhaust RAM
4743 : // Try to complete already finished jobs
4744 718 : while (eErr == CE_None && !jobList.empty())
4745 : {
4746 0 : auto poOldestJob = jobList.front().get();
4747 : {
4748 0 : std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
4749 0 : if (!poOldestJob->bFinished)
4750 : {
4751 0 : break;
4752 : }
4753 : }
4754 0 : eErr = poOldestJob->eErr;
4755 0 : if (eErr == CE_None)
4756 : {
4757 0 : eErr = WriteJobData(poOldestJob);
4758 : }
4759 :
4760 0 : jobList.pop_front();
4761 : }
4762 :
4763 : // And in case we have saturated the number of threads,
4764 : // wait for completion of tasks to go below the threshold.
4765 1436 : while (eErr == CE_None &&
4766 718 : jobList.size() >= static_cast<size_t>(nThreads))
4767 : {
4768 0 : eErr = WaitAndFinalizeOldestJob(jobList);
4769 : }
4770 :
4771 : // (Re)allocate buffers if needed
4772 718 : if (pChunk == nullptr)
4773 : {
4774 713 : pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
4775 : nMaxChunkYSizeQueried, nWidth);
4776 : }
4777 718 : if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
4778 : {
4779 : pabyChunkNodataMask = static_cast<GByte *>(
4780 274 : VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
4781 : }
4782 :
4783 718 : if (pChunk == nullptr ||
4784 274 : (bUseNoDataMask && pabyChunkNodataMask == nullptr))
4785 : {
4786 0 : CPLFree(pChunk);
4787 0 : CPLFree(pabyChunkNodataMask);
4788 0 : return CE_Failure;
4789 : }
4790 :
4791 : // Read chunk.
4792 718 : if (eErr == CE_None)
4793 718 : eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
4794 : nChunkYSizeQueried, pChunk, nWidth,
4795 : nChunkYSizeQueried, eWrkDataType, 0, 0,
4796 : nullptr);
4797 718 : if (eErr == CE_None && bUseNoDataMask)
4798 274 : eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
4799 : nChunkYSizeQueried, pabyChunkNodataMask,
4800 : nWidth, nChunkYSizeQueried, GDT_Byte, 0,
4801 : 0, nullptr);
4802 :
4803 : // Special case to promote 1bit data to 8bit 0/255 values.
4804 718 : if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
4805 : {
4806 9 : if (eWrkDataType == GDT_Float32)
4807 : {
4808 0 : float *pafChunk = static_cast<float *>(pChunk);
4809 0 : for (GPtrDiff_t i = 0;
4810 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4811 : i++)
4812 : {
4813 0 : if (pafChunk[i] == 1.0)
4814 0 : pafChunk[i] = 255.0;
4815 : }
4816 : }
4817 9 : else if (eWrkDataType == GDT_Byte)
4818 : {
4819 9 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
4820 168417 : for (GPtrDiff_t i = 0;
4821 168417 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4822 : i++)
4823 : {
4824 168408 : if (pabyChunk[i] == 1)
4825 127437 : pabyChunk[i] = 255;
4826 : }
4827 : }
4828 0 : else if (eWrkDataType == GDT_UInt16)
4829 : {
4830 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
4831 0 : for (GPtrDiff_t i = 0;
4832 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4833 : i++)
4834 : {
4835 0 : if (pasChunk[i] == 1)
4836 0 : pasChunk[i] = 255;
4837 : }
4838 : }
4839 0 : else if (eWrkDataType == GDT_Float64)
4840 : {
4841 0 : double *padfChunk = static_cast<double *>(pChunk);
4842 0 : for (GPtrDiff_t i = 0;
4843 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4844 : i++)
4845 : {
4846 0 : if (padfChunk[i] == 1.0)
4847 0 : padfChunk[i] = 255.0;
4848 : }
4849 : }
4850 : else
4851 : {
4852 0 : CPLAssert(false);
4853 : }
4854 : }
4855 709 : else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
4856 : {
4857 0 : if (eWrkDataType == GDT_Float32)
4858 : {
4859 0 : float *pafChunk = static_cast<float *>(pChunk);
4860 0 : for (GPtrDiff_t i = 0;
4861 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4862 : i++)
4863 : {
4864 0 : if (pafChunk[i] == 1.0)
4865 0 : pafChunk[i] = 0.0;
4866 0 : else if (pafChunk[i] == 0.0)
4867 0 : pafChunk[i] = 255.0;
4868 : }
4869 : }
4870 0 : else if (eWrkDataType == GDT_Byte)
4871 : {
4872 0 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
4873 0 : for (GPtrDiff_t i = 0;
4874 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4875 : i++)
4876 : {
4877 0 : if (pabyChunk[i] == 1)
4878 0 : pabyChunk[i] = 0;
4879 0 : else if (pabyChunk[i] == 0)
4880 0 : pabyChunk[i] = 255;
4881 : }
4882 : }
4883 0 : else if (eWrkDataType == GDT_UInt16)
4884 : {
4885 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
4886 0 : for (GPtrDiff_t i = 0;
4887 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4888 : i++)
4889 : {
4890 0 : if (pasChunk[i] == 1)
4891 0 : pasChunk[i] = 0;
4892 0 : else if (pasChunk[i] == 0)
4893 0 : pasChunk[i] = 255;
4894 : }
4895 : }
4896 0 : else if (eWrkDataType == GDT_Float64)
4897 : {
4898 0 : double *padfChunk = static_cast<double *>(pChunk);
4899 0 : for (GPtrDiff_t i = 0;
4900 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4901 : i++)
4902 : {
4903 0 : if (padfChunk[i] == 1.0)
4904 0 : padfChunk[i] = 0.0;
4905 0 : else if (padfChunk[i] == 0.0)
4906 0 : padfChunk[i] = 255.0;
4907 : }
4908 : }
4909 : else
4910 : {
4911 0 : CPLAssert(false);
4912 : }
4913 : }
4914 :
4915 : auto oSrcBufferHolder =
4916 1436 : std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
4917 : auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
4918 1436 : poJobQueue ? pabyChunkNodataMask : nullptr);
4919 :
4920 1531 : for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
4921 : ++iOverview)
4922 : {
4923 813 : GDALRasterBand *poDstBand = papoOvrBands[iOverview];
4924 813 : const int nDstWidth = poDstBand->GetXSize();
4925 813 : const int nDstHeight = poDstBand->GetYSize();
4926 :
4927 813 : const double dfXRatioDstToSrc =
4928 813 : static_cast<double>(nWidth) / nDstWidth;
4929 813 : const double dfYRatioDstToSrc =
4930 813 : static_cast<double>(nHeight) / nDstHeight;
4931 :
4932 : /* --------------------------------------------------------------------
4933 : */
4934 : /* Figure out the line to start writing to, and the first line
4935 : */
4936 : /* to not write to. In theory this approach should ensure that
4937 : */
4938 : /* every output line will be written if all input chunks are */
4939 : /* processed. */
4940 : /* --------------------------------------------------------------------
4941 : */
4942 813 : int nDstYOff =
4943 813 : static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
4944 813 : if (nDstYOff == nDstHeight)
4945 0 : continue;
4946 813 : int nDstYOff2 = static_cast<int>(
4947 813 : 0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
4948 :
4949 813 : if (nChunkYOff + nFullResYChunk == nHeight)
4950 806 : nDstYOff2 = nDstHeight;
4951 : #if DEBUG_VERBOSE
4952 : CPLDebug("GDAL",
4953 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
4954 : nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
4955 : nDstWidth, nDstYOff2 - nDstYOff);
4956 : #endif
4957 :
4958 1626 : auto poJob = std::make_unique<OvrJob>();
4959 813 : poJob->pfnResampleFn = pfnResampleFn;
4960 813 : poJob->bUseGenericResampleFn = bUseGenericResampleFn;
4961 813 : poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
4962 813 : poJob->args.nOvrXSize = poDstBand->GetXSize();
4963 813 : poJob->args.nOvrYSize = poDstBand->GetYSize();
4964 : const char *pszNBITS =
4965 813 : poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
4966 813 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
4967 813 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
4968 813 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
4969 813 : poJob->args.eWrkDataType = eWrkDataType;
4970 813 : poJob->pChunk = pChunk;
4971 813 : poJob->args.pabyChunkNodataMask = pabyChunkNodataMask;
4972 813 : poJob->nSrcWidth = nWidth;
4973 813 : poJob->nSrcHeight = nHeight;
4974 813 : poJob->args.nChunkXOff = 0;
4975 813 : poJob->args.nChunkXSize = nWidth;
4976 813 : poJob->args.nChunkYOff = nChunkYOffQueried;
4977 813 : poJob->args.nChunkYSize = nChunkYSizeQueried;
4978 813 : poJob->nDstWidth = nDstWidth;
4979 813 : poJob->args.nDstXOff = 0;
4980 813 : poJob->args.nDstXOff2 = nDstWidth;
4981 813 : poJob->args.nDstYOff = nDstYOff;
4982 813 : poJob->args.nDstYOff2 = nDstYOff2;
4983 813 : poJob->poDstBand = poDstBand;
4984 813 : poJob->args.pszResampling = pszResampling;
4985 813 : poJob->args.bHasNoData = bHasNoData;
4986 813 : poJob->args.dfNoDataValue = dfNoDataValue;
4987 813 : poJob->args.poColorTable = poColorTable;
4988 813 : poJob->args.eSrcDataType = eSrcDataType;
4989 813 : poJob->args.bPropagateNoData = bPropagateNoData;
4990 :
4991 813 : if (poJobQueue)
4992 : {
4993 0 : poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
4994 0 : poJob->SetSrcBufferHolder(oSrcBufferHolder);
4995 0 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
4996 0 : jobList.emplace_back(std::move(poJob));
4997 : }
4998 : else
4999 : {
5000 813 : JobResampleFunc(poJob.get());
5001 813 : eErr = poJob->eErr;
5002 813 : if (eErr == CE_None)
5003 : {
5004 813 : eErr = WriteJobData(poJob.get());
5005 : }
5006 : }
5007 : }
5008 :
5009 718 : if (poJobQueue)
5010 : {
5011 0 : pChunk = nullptr;
5012 0 : pabyChunkNodataMask = nullptr;
5013 : }
5014 : }
5015 :
5016 713 : VSIFree(pChunk);
5017 713 : VSIFree(pabyChunkNodataMask);
5018 :
5019 : // Wait for all pending jobs to complete
5020 713 : while (!jobList.empty())
5021 : {
5022 0 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5023 0 : if (l_eErr != CE_None && eErr == CE_None)
5024 0 : eErr = l_eErr;
5025 : }
5026 :
5027 : /* -------------------------------------------------------------------- */
5028 : /* Renormalized overview mean / stddev if needed. */
5029 : /* -------------------------------------------------------------------- */
5030 713 : if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
5031 : {
5032 0 : GDALOverviewMagnitudeCorrection(
5033 : poSrcBand, nOverviewCount,
5034 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
5035 : GDALDummyProgress, nullptr);
5036 : }
5037 :
5038 : /* -------------------------------------------------------------------- */
5039 : /* It can be important to flush out data to overviews. */
5040 : /* -------------------------------------------------------------------- */
5041 1519 : for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
5042 : ++iOverview)
5043 : {
5044 806 : eErr = papoOvrBands[iOverview]->FlushCache(false);
5045 : }
5046 :
5047 713 : if (eErr == CE_None)
5048 713 : pfnProgress(1.0, nullptr, pProgressData);
5049 :
5050 713 : return eErr;
5051 : }
5052 :
5053 : /************************************************************************/
5054 : /* GDALRegenerateOverviewsMultiBand() */
5055 : /************************************************************************/
5056 :
5057 : /**
5058 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5059 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5060 : *
5061 : * This function will generate one or more overview images from a base
5062 : * image using the requested downsampling algorithm. Its primary use
5063 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
5064 : * can also be used to generate downsampled images in one file from another
5065 : * outside the overview architecture.
5066 : *
5067 : * The output bands need to exist in advance and share the same characteristics
5068 : * (type, dimensions)
5069 : *
5070 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5071 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE"
5072 : *
5073 : * It does not support color tables or complex data types.
5074 : *
5075 : * The pseudo-algorithm used by the function is :
5076 : * for each overview
5077 : * iterate on lines of the source by a step of deltay
5078 : * iterate on columns of the source by a step of deltax
5079 : * read the source data of size deltax * deltay for all the bands
5080 : * generate the corresponding overview block for all the bands
5081 : *
5082 : * This function will honour properly NODATA_VALUES tuples (special dataset
5083 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5084 : * considered as the nodata value and not each value of the triplet
5085 : * independently per band.
5086 : *
5087 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
5088 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
5089 : * overview computation.
5090 : *
5091 : * @param nBands the number of bands, size of papoSrcBands and size of
5092 : * first dimension of papapoOverviewBands
5093 : * @param papoSrcBands the list of source bands to downsample
5094 : * @param nOverviews the number of downsampled overview levels being generated.
5095 : * @param papapoOverviewBands two-dimensional array of bands. First dimension is
5096 : * indexed by nBands. Second dimension is indexed by
5097 : * nOverviews.
5098 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5099 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
5100 : * @param pfnProgress progress report function.
5101 : * @param pProgressData progress function callback data.
5102 : * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
5103 : * key=value pairs, or NULL
5104 : * Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
5105 : * options can be specified to express that overviews should
5106 : * be regenerated only in the specified subset of the source
5107 : * dataset.
5108 : * @return CE_None on success or CE_Failure on failure.
5109 : */
5110 :
5111 372 : CPLErr GDALRegenerateOverviewsMultiBand(
5112 : int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
5113 : GDALRasterBand *const *const *papapoOverviewBands,
5114 : const char *pszResampling, GDALProgressFunc pfnProgress,
5115 : void *pProgressData, CSLConstList papszOptions)
5116 : {
5117 372 : CPL_IGNORE_RET_VAL(papszOptions); // NOTE(review): papszOptions is read below (XOFF, YOFF, XSIZE, YSIZE, DST_CHUNK_*_SIZE); this looks vestigial.
5118 : // Fall back to a no-op progress callback when the caller passes none.
5119 372 : if (pfnProgress == nullptr)
5120 6 : pfnProgress = GDALDummyProgress;
5121 : // "NONE" means there is nothing to compute: report success without touching the overviews.
5122 372 : if (EQUAL(pszResampling, "NONE"))
5123 2 : return CE_None;
5124 :
5125 : // Sanity checks: only the resampling names below are accepted (any name starting with "NEAR" passes the first test).
5126 370 : if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
5127 177 : !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
5128 76 : !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
5129 18 : !EQUAL(pszResampling, "CUBICSPLINE") &&
5130 17 : !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
5131 5 : !EQUAL(pszResampling, "MODE"))
5132 : {
5133 0 : CPLError(CE_Failure, CPLE_NotSupported,
5134 : "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
5135 : "not supported",
5136 : pszResampling);
5137 0 : return CE_Failure;
5138 : }
5139 : // Look up the resampling callback; nKernelRadius is passed by pointer and later used to pad the source window.
5140 370 : int nKernelRadius = 0;
5141 : GDALResampleFunction pfnResampleFn =
5142 370 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
5143 370 : if (pfnResampleFn == nullptr)
5144 0 : return CE_Failure;
5145 : // The first source band is the reference for dimensions and data type. NOTE(review): papoSrcBands[0] is dereferenced without checking nBands > 0.
5146 370 : const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
5147 370 : const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
5148 370 : if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
5149 0 : return CE_None; // An empty source raster is treated as success.
5150 370 : GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
5151 682 : for (int iBand = 1; iBand < nBands; ++iBand)
5152 : {
5153 624 : if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
5154 312 : papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
5155 : {
5156 0 : CPLError(
5157 : CE_Failure, CPLE_NotSupported,
5158 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5159 : "have the same dimensions");
5160 0 : return CE_Failure;
5161 : }
5162 312 : if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
5163 : {
5164 0 : CPLError(
5165 : CE_Failure, CPLE_NotSupported,
5166 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5167 : "have the same data type");
5168 0 : return CE_Failure;
5169 : }
5170 : }
5171 : // Within each overview level, every band must share the first band's dimensions and the source data type.
5172 978 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5173 : {
5174 608 : const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
5175 608 : const int nDstWidth = poOvrFirstBand->GetXSize();
5176 608 : const int nDstHeight = poOvrFirstBand->GetYSize();
5177 1186 : for (int iBand = 1; iBand < nBands; ++iBand)
5178 : {
5179 578 : const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
5180 1156 : if (poOvrBand->GetXSize() != nDstWidth ||
5181 578 : poOvrBand->GetYSize() != nDstHeight)
5182 : {
5183 0 : CPLError(
5184 : CE_Failure, CPLE_NotSupported,
5185 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5186 : "of the same level must have the same dimensions");
5187 0 : return CE_Failure;
5188 : }
5189 578 : if (poOvrBand->GetRasterDataType() != eDataType)
5190 : {
5191 0 : CPLError(
5192 : CE_Failure, CPLE_NotSupported,
5193 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5194 : "must have the same data type as the source bands");
5195 0 : return CE_Failure;
5196 : }
5197 : }
5198 : }
5199 :
5200 : // First pass to compute the total number of pixels to write (denominator of the progress ratio). XOFF/YOFF/XSIZE/YSIZE default to the full source extent.
5201 370 : double dfTotalPixelCount = 0;
5202 370 : const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
5203 370 : const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
5204 370 : const int nSrcXSize = atoi(CSLFetchNameValueDef(
5205 : papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
5206 370 : const int nSrcYSize = atoi(CSLFetchNameValueDef(
5207 : papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
5208 978 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5209 : {
5210 608 : dfTotalPixelCount +=
5211 1216 : static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
5212 608 : papapoOverviewBands[0][iOverview]->GetXSize() *
5213 1216 : static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
5214 608 : papapoOverviewBands[0][iOverview]->GetYSize();
5215 : }
5216 : // Data type in which source pixels are read into the chunk buffers before resampling.
5217 : const GDALDataType eWrkDataType =
5218 370 : GDALGetOvrWorkDataType(pszResampling, eDataType);
5219 370 : const int nWrkDataTypeSize = GDALGetDataTypeSizeBytes(eWrkDataType);
5220 :
5221 370 : const bool bIsMask = papoSrcBands[0]->IsMaskBand();
5222 :
5223 : // If we have a nodata mask and we are doing something more complicated
5224 : // than nearest neighbour resampling, we have to fetch the nodata mask.
5225 : const bool bUseNoDataMask =
5226 539 : !STARTS_WITH_CI(pszResampling, "NEAR") &&
5227 169 : (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
5228 : // Per-band nodata flag and value arrays; both are released with CPLFree() at the end of the function.
5229 : bool *const pabHasNoData =
5230 370 : static_cast<bool *>(VSI_MALLOC_VERBOSE(nBands * sizeof(bool)));
5231 : double *const padfNoDataValue =
5232 370 : static_cast<double *>(VSI_MALLOC_VERBOSE(nBands * sizeof(double)));
5233 370 : if (pabHasNoData == nullptr || padfNoDataValue == nullptr)
5234 : {
5235 0 : CPLFree(pabHasNoData);
5236 0 : CPLFree(padfNoDataValue);
5237 0 : return CE_Failure;
5238 : }
5239 :
5240 1052 : for (int iBand = 0; iBand < nBands; ++iBand)
5241 : {
5242 682 : int nHasNoData = 0;
5243 1364 : padfNoDataValue[iBand] =
5244 682 : papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
5245 682 : pabHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
5246 : }
5247 : const bool bPropagateNoData =
5248 370 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5249 : // Thread count comes from GDAL_NUM_THREADS ("ALL_CPUS" or an integer) clamped to [1, 128]; a job queue is only requested when it exceeds 1.
5250 370 : const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
5251 1480 : const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
5252 370 : ? CPLGetNumCPUs()
5253 370 : : atoi(pszThreads)));
5254 : auto poThreadPool =
5255 370 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5256 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5257 370 : : std::unique_ptr<CPLJobQueue>(nullptr);
5258 :
5259 : // Only configurable for debug / testing. Defaults to 10485760 bytes and is never lower than 100.
5260 : const int nChunkMaxSize = std::max(
5261 370 : 100, atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760")));
5262 :
5263 : // Second pass to do the real job, one overview level at a time, stopping at the first error.
5264 370 : double dfCurPixelCount = 0;
5265 370 : CPLErr eErr = CE_None;
5266 977 : for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
5267 : ++iOverview)
5268 : {
5269 607 : int iSrcOverview = -1; // -1 means the source bands.
5270 :
5271 : const int nDstTotalWidth =
5272 607 : papapoOverviewBands[0][iOverview]->GetXSize();
5273 : const int nDstTotalHeight =
5274 607 : papapoOverviewBands[0][iOverview]->GetYSize();
5275 :
5276 : // Compute the coordinates of the target region to refresh: the XOFF/YOFF/XSIZE/YSIZE window scaled to this overview level. EPS is added before truncation and subtracted before ceil(), presumably to absorb floating-point rounding.
5277 607 : constexpr double EPS = 1e-8;
5278 607 : const int nDstXOffStart = static_cast<int>(
5279 607 : static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
5280 : EPS);
5281 : const int nDstXOffEnd =
5282 1214 : std::min(static_cast<int>(
5283 607 : std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
5284 607 : nToplevelSrcWidth * nDstTotalWidth -
5285 : EPS)),
5286 607 : nDstTotalWidth);
5287 607 : const int nDstWidth = nDstXOffEnd - nDstXOffStart;
5288 607 : const int nDstYOffStart =
5289 607 : static_cast<int>(static_cast<double>(nSrcYOff) /
5290 607 : nToplevelSrcHeight * nDstTotalHeight +
5291 : EPS);
5292 : const int nDstYOffEnd =
5293 1214 : std::min(static_cast<int>(
5294 607 : std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
5295 607 : nToplevelSrcHeight * nDstTotalHeight -
5296 : EPS)),
5297 607 : nDstTotalHeight);
5298 :
5299 : // Try to use previous level of overview as the source to compute
5300 : // the next level. This is only done when the previous level is strictly wider than the current one.
5301 607 : int nSrcWidth = nToplevelSrcWidth;
5302 607 : int nSrcHeight = nToplevelSrcHeight;
5303 844 : if (iOverview > 0 &&
5304 237 : papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
5305 : {
5306 229 : nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
5307 229 : nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
5308 229 : iSrcOverview = iOverview - 1;
5309 : }
5310 : // Ratios are expressed in source pixels per destination pixel.
5311 607 : const double dfXRatioDstToSrc =
5312 607 : static_cast<double>(nSrcWidth) / nDstTotalWidth;
5313 607 : const double dfYRatioDstToSrc =
5314 607 : static_cast<double>(nSrcHeight) / nDstTotalHeight;
5315 : // Integer overview factor: the larger of the two rounded ratios, at least 1.
5316 1214 : int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
5317 607 : static_cast<int>(0.5 + dfYRatioDstToSrc));
5318 607 : if (nOvrFactor == 0)
5319 0 : nOvrFactor = 1;
5320 : // The destination chunk size starts as the block size of the first overview band of this level.
5321 607 : int nDstChunkXSize = 0;
5322 607 : int nDstChunkYSize = 0;
5323 607 : papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
5324 : &nDstChunkYSize);
5325 : // The DST_CHUNK_X_SIZE / DST_CHUNK_Y_SIZE options override the chunk size, but only when both are provided.
5326 : const char *pszDST_CHUNK_X_SIZE =
5327 607 : CSLFetchNameValue(papszOptions, "DST_CHUNK_X_SIZE");
5328 : const char *pszDST_CHUNK_Y_SIZE =
5329 607 : CSLFetchNameValue(papszOptions, "DST_CHUNK_Y_SIZE");
5330 607 : if (pszDST_CHUNK_X_SIZE && pszDST_CHUNK_Y_SIZE)
5331 : {
5332 12 : nDstChunkXSize = std::max(1, atoi(pszDST_CHUNK_X_SIZE));
5333 12 : nDstChunkYSize = std::max(1, atoi(pszDST_CHUNK_Y_SIZE));
5334 12 : CPLDebug("GDAL", "Using dst chunk size %d x %d", nDstChunkXSize,
5335 : nDstChunkYSize);
5336 : }
5337 :
5338 : // Try to extend the chunk size so that the memory needed to acquire
5339 : // source pixels goes up to nChunkMaxSize (10 MB by default).
5340 : // This can help for drivers that support multi-threaded reading
5341 607 : const int nFullResYChunk =
5342 607 : 2 + static_cast<int>(nDstChunkYSize * dfYRatioDstToSrc);
5343 607 : const int nFullResYChunkQueried =
5344 607 : nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
5345 843 : while (nDstChunkXSize < nDstWidth)
5346 : {
5347 253 : const int nFullResXChunk =
5348 253 : 2 + static_cast<int>(2 * nDstChunkXSize * dfXRatioDstToSrc);
5349 :
5350 253 : const int nFullResXChunkQueried =
5351 253 : nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
5352 : // Stop doubling as soon as the doubled width would exceed the memory budget.
5353 253 : if (static_cast<GIntBig>(nFullResXChunkQueried) *
5354 253 : nFullResYChunkQueried * nBands * nWrkDataTypeSize >
5355 253 : nChunkMaxSize)
5356 : {
5357 17 : break;
5358 : }
5359 :
5360 236 : nDstChunkXSize *= 2;
5361 : }
5362 607 : nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
5363 : // Source-side chunk dimensions for the final chunk width, including the kernel padding on both sides.
5364 607 : const int nFullResXChunk =
5365 607 : 2 + static_cast<int>(nDstChunkXSize * dfXRatioDstToSrc);
5366 607 : const int nFullResXChunkQueried =
5367 607 : nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
5368 :
5369 : // Make sure that the RAM requirements to acquire the source data does
5370 : // not exceed nChunkMaxSize
5371 : // If so, reduce the destination chunk size, generate overviews in a
5372 : // temporary dataset, and copy that temporary dataset over the target
5373 : // overview bands (to avoid issues with lossy compression)
5374 607 : const auto nMemRequirement =
5375 607 : static_cast<GIntBig>(nFullResXChunkQueried) *
5376 607 : nFullResYChunkQueried * nBands * nWrkDataTypeSize;
5377 607 : if (nMemRequirement > nChunkMaxSize &&
5378 10 : !(pszDST_CHUNK_X_SIZE && pszDST_CHUNK_Y_SIZE))
5379 : {
5380 : // Compute a smaller destination chunk size: both dimensions are divided by max(4, ceil(sqrt(overshoot))).
5381 12 : const auto nOverShootFactor = nMemRequirement / nChunkMaxSize;
5382 : const auto nSqrtOverShootFactor = std::max<GIntBig>(
5383 24 : 4, static_cast<GIntBig>(std::ceil(
5384 12 : std::sqrt(static_cast<double>(nOverShootFactor)))));
5385 : const int nReducedDstChunkXSize = std::max(
5386 12 : 1, static_cast<int>(nDstChunkXSize / nSqrtOverShootFactor));
5387 : const int nReducedDstChunkYSize = std::max(
5388 12 : 1, static_cast<int>(nDstChunkYSize / nSqrtOverShootFactor));
5389 12 : if (nReducedDstChunkXSize < nDstChunkXSize ||
5390 0 : nReducedDstChunkYSize < nDstChunkYSize)
5391 : {
5392 12 : CPLStringList aosOptions(papszOptions);
5393 : aosOptions.SetNameValue(
5394 : "DST_CHUNK_X_SIZE",
5395 12 : CPLSPrintf("%d", nReducedDstChunkXSize));
5396 : aosOptions.SetNameValue(
5397 : "DST_CHUNK_Y_SIZE",
5398 12 : CPLSPrintf("%d", nReducedDstChunkYSize));
5399 : // Size in bytes of a full copy of this overview level; used below to choose between an in-memory (MEM) and an on-disk (GTiff) temporary dataset.
5400 : const auto nTmpDSMemRequirement =
5401 12 : static_cast<GIntBig>(nDstTotalWidth) * nDstTotalHeight *
5402 12 : nBands * GDALGetDataTypeSizeBytes(eDataType);
5403 0 : std::unique_ptr<GDALDataset> poTmpDS;
5404 : // Config option mostly/only for autotest purposes
5405 : const char *pszGDAL_OVR_TEMP_DRIVER =
5406 12 : CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
5407 12 : if ((nTmpDSMemRequirement <= nChunkMaxSize &&
5408 2 : !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
5409 10 : EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
5410 : {
5411 : auto poTmpDrv =
5412 11 : GetGDALDriverManager()->GetDriverByName("MEM");
5413 11 : if (!poTmpDrv)
5414 : {
5415 0 : eErr = CE_Failure;
5416 0 : break;
5417 : }
5418 11 : poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
5419 : nDstTotalHeight, nBands,
5420 11 : eDataType, nullptr));
5421 : }
5422 : else
5423 : {
5424 : auto poTmpDrv =
5425 1 : GetGDALDriverManager()->GetDriverByName("GTiff");
5426 1 : if (!poTmpDrv)
5427 : {
5428 0 : eErr = CE_Failure;
5429 0 : break;
5430 : }
5431 2 : std::string osTmpFilename; // Derived from the destination dataset's description when that names an existing file, otherwise a generated temporary name.
5432 1 : auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
5433 1 : if (poDstDS)
5434 : {
5435 1 : osTmpFilename = poDstDS->GetDescription();
5436 : VSIStatBufL sStatBuf;
5437 1 : if (!osTmpFilename.empty() &&
5438 0 : VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
5439 0 : osTmpFilename += "_tmp_ovr.tif";
5440 : }
5441 1 : if (osTmpFilename.empty())
5442 : {
5443 1 : osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
5444 1 : osTmpFilename += ".tif";
5445 : }
5446 1 : CPLDebug("GDAL",
5447 : "Creating temporary file %s of %d x %d x %d",
5448 : osTmpFilename.c_str(), nDstTotalWidth,
5449 : nDstTotalHeight, nBands);
5450 2 : CPLStringList aosCO;
5451 1 : poTmpDS.reset(poTmpDrv->Create(
5452 : osTmpFilename.c_str(), nDstTotalWidth, nDstTotalHeight,
5453 1 : nBands, eDataType, aosCO.List()));
5454 1 : if (poTmpDS)
5455 : {
5456 1 : poTmpDS->MarkSuppressOnClose();
5457 1 : VSIUnlink(osTmpFilename.c_str());
5458 : }
5459 : }
5460 12 : if (!poTmpDS)
5461 : {
5462 0 : eErr = CE_Failure;
5463 0 : break;
5464 : }
5465 : // Build a [band][1] array whose single "overview" per band is the matching band of the temporary dataset.
5466 12 : std::vector<GDALRasterBand **> apapoOverviewBands(nBands);
5467 27 : for (int i = 0; i < nBands; ++i)
5468 : {
5469 30 : apapoOverviewBands[i] = static_cast<GDALRasterBand **>(
5470 15 : CPLMalloc(sizeof(GDALRasterBand *)));
5471 15 : apapoOverviewBands[i][0] = poTmpDS->GetRasterBand(i + 1);
5472 : }
5473 : // Pixel count credited to this overview level in the overall progress (same formula as the first pass).
5474 : const double dfExtraPixels =
5475 24 : static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
5476 12 : papapoOverviewBands[0][iOverview]->GetXSize() *
5477 24 : static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
5478 12 : papapoOverviewBands[0][iOverview]->GetYSize();
5479 :
5480 24 : void *pScaledProgressData = GDALCreateScaledProgress(
5481 : dfCurPixelCount / dfTotalPixelCount,
5482 12 : (dfCurPixelCount + dfExtraPixels) / dfTotalPixelCount,
5483 : pfnProgress, pProgressData);
5484 :
5485 : // Generate overviews in temporary dataset by recursing from the full-resolution source bands; aosOptions now carries both DST_CHUNK_*_SIZE keys, so the recursive call does not enter this branch again.
5486 12 : eErr = GDALRegenerateOverviewsMultiBand(
5487 12 : nBands, papoSrcBands, 1, apapoOverviewBands.data(),
5488 : pszResampling, GDALScaledProgress, pScaledProgressData,
5489 12 : aosOptions.List());
5490 :
5491 12 : GDALDestroyScaledProgress(pScaledProgressData);
5492 :
5493 12 : dfCurPixelCount += dfExtraPixels;
5494 :
5495 27 : for (int i = 0; i < nBands; ++i)
5496 : {
5497 15 : CPLFree(apapoOverviewBands[i]);
5498 : }
5499 :
5500 : // Copy temporary dataset to destination overview bands
5501 : // NOTE(review): the whole temporary raster is copied even when XOFF/YOFF/XSIZE/YSIZE restrict the regenerated window - confirm pixels outside that window are not overwritten.
5502 12 : if (eErr == CE_None)
5503 : {
5504 : // Check if all papapoOverviewBands[][iOverview] bands point
5505 : // to the same dataset. If so, we can use
5506 : // GDALDatasetCopyWholeRaster()
5507 : GDALDataset *poDstOvrBandDS =
5508 12 : papapoOverviewBands[0][iOverview]->GetDataset();
5509 12 : if (poDstOvrBandDS)
5510 : {
5511 15 : if (poDstOvrBandDS->GetRasterCount() != nBands ||
5512 3 : poDstOvrBandDS->GetRasterBand(1) !=
5513 3 : papapoOverviewBands[0][iOverview])
5514 : {
5515 9 : poDstOvrBandDS = nullptr;
5516 : }
5517 : else
5518 : {
5519 6 : for (int i = 1; poDstOvrBandDS && i < nBands; ++i)
5520 : {
5521 : GDALDataset *poThisDstOvrBandDS =
5522 3 : papapoOverviewBands[i][iOverview]
5523 3 : ->GetDataset();
5524 3 : if (poThisDstOvrBandDS == nullptr ||
5525 6 : poThisDstOvrBandDS != poDstOvrBandDS ||
5526 3 : poThisDstOvrBandDS->GetRasterBand(i + 1) !=
5527 3 : papapoOverviewBands[i][iOverview])
5528 : {
5529 0 : poDstOvrBandDS = nullptr;
5530 : }
5531 : }
5532 : }
5533 : }
5534 12 : if (poDstOvrBandDS)
5535 : {
5536 3 : eErr = GDALDatasetCopyWholeRaster(
5537 : GDALDataset::ToHandle(poTmpDS.get()),
5538 : GDALDataset::ToHandle(poDstOvrBandDS), nullptr,
5539 : nullptr, nullptr);
5540 : }
5541 : else
5542 : {
5543 18 : for (int i = 0; eErr == CE_None && i < nBands; ++i)
5544 : {
5545 9 : eErr = GDALRasterBandCopyWholeRaster(
5546 : GDALRasterBand::ToHandle(
5547 : poTmpDS->GetRasterBand(i + 1)),
5548 : GDALRasterBand::ToHandle(
5549 9 : papapoOverviewBands[i][iOverview]),
5550 : nullptr, nullptr, nullptr);
5551 : }
5552 : }
5553 : }
5554 :
5555 12 : if (eErr != CE_None)
5556 0 : break;
5557 :
5558 12 : continue; // This overview level is complete; skip the direct path below.
5559 : }
5560 : }
5561 :
5562 : // Structure describing a resampling job
5563 : struct OvrJob
5564 : {
5565 : // Buffers to free when job is finished
5566 : std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
5567 : std::unique_ptr<PointerHolder> oSrcBufferHolder{};
5568 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
5569 :
5570 : GDALRasterBand *poDstBand = nullptr;
5571 :
5572 : // Input parameters of pfnResampleFn
5573 : GDALResampleFunction pfnResampleFn = nullptr;
5574 : GDALOverviewResampleArgs args{};
5575 : const void *pChunk = nullptr;
5576 :
5577 : // Output values of resampling function
5578 : CPLErr eErr = CE_Failure;
5579 : void *pDstBuffer = nullptr;
5580 : GDALDataType eDstBufferDataType = GDT_Unknown;
5581 :
5582 : // Synchronization: bFinished is set under mutex by the job and waited on through cv.
5583 : bool bFinished = false;
5584 : std::mutex mutex{};
5585 : std::condition_variable cv{};
5586 : };
5587 :
5588 : // Thread function to resample: runs the resampling callback, takes ownership of the output buffer, then signals completion under the job mutex. Called inline or submitted to poJobQueue.
5589 16274 : const auto JobResampleFunc = [](void *pData)
5590 : {
5591 16274 : OvrJob *poJob = static_cast<OvrJob *>(pData);
5592 :
5593 16274 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
5594 : &(poJob->pDstBuffer),
5595 : &(poJob->eDstBufferDataType));
5596 :
5597 16274 : poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
5598 :
5599 : {
5600 32547 : std::lock_guard<std::mutex> guard(poJob->mutex);
5601 16273 : poJob->bFinished = true;
5602 16273 : poJob->cv.notify_one();
5603 : }
5604 16273 : };
5605 :
5606 : // Function to write the resampled data of a job to its target band, over the window [nDstXOff, nDstXOff2) x [nDstYOff, nDstYOff2).
5607 16274 : const auto WriteJobData = [](const OvrJob *poJob)
5608 : {
5609 32548 : return poJob->poDstBand->RasterIO(
5610 16274 : GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
5611 16274 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
5612 16274 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
5613 16274 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
5614 16274 : poJob->args.nDstYOff2 - poJob->args.nDstYOff,
5615 16274 : poJob->eDstBufferDataType, 0, 0, nullptr);
5616 : };
5617 :
5618 : // Wait for completion of oldest job and serialize it (write it when it succeeded), then remove it from the list. Returns the job's or the write's error code.
5619 : const auto WaitAndFinalizeOldestJob =
5620 58 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
5621 : {
5622 58 : auto poOldestJob = jobList.front().get();
5623 : {
5624 116 : std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
5625 : // coverity[missing_lock:FALSE]
5626 84 : while (!poOldestJob->bFinished)
5627 : {
5628 26 : poOldestJob->cv.wait(oGuard);
5629 : }
5630 : }
5631 58 : CPLErr l_eErr = poOldestJob->eErr;
5632 58 : if (l_eErr == CE_None)
5633 : {
5634 58 : l_eErr = WriteJobData(poOldestJob);
5635 : }
5636 :
5637 58 : jobList.pop_front();
5638 58 : return l_eErr;
5639 : };
5640 :
5641 : // Queue of jobs, in submission order; results are always written starting from the front (oldest) job.
5642 1190 : std::list<std::unique_ptr<OvrJob>> jobList;
5643 : // Per-band source and nodata-mask chunk buffers. They stay allocated across chunks unless handed over to a queued job (then reset to null).
5644 1190 : std::vector<void *> apaChunk(nBands);
5645 1190 : std::vector<GByte *> apabyChunkNoDataMask(nBands);
5646 :
5647 : // Iterate on destination overview, block by block (outer loop: chunk rows).
5648 595 : for (int nDstYOff = nDstYOffStart;
5649 2256 : nDstYOff < nDstYOffEnd && eErr == CE_None;
5650 1661 : nDstYOff += nDstChunkYSize)
5651 : {
5652 : int nDstYCount;
5653 1661 : if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
5654 1251 : nDstYCount = nDstChunkYSize;
5655 : else
5656 410 : nDstYCount = nDstYOffEnd - nDstYOff;
5657 : // Source rows covered by this chunk row; the range is extended to the last source row when the chunk reaches the bottom of the overview.
5658 1661 : int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
5659 1661 : int nChunkYOff2 = static_cast<int>(
5660 1661 : ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
5661 1661 : if (nChunkYOff2 > nSrcHeight ||
5662 1661 : nDstYOff + nDstYCount == nDstTotalHeight)
5663 592 : nChunkYOff2 = nSrcHeight;
5664 1661 : int nYCount = nChunkYOff2 - nChunkYOff;
5665 1661 : CPLAssert(nYCount <= nFullResYChunk);
5666 : // Pad the source window by nKernelRadius * nOvrFactor rows on each side, then clip it to [0, nSrcHeight).
5667 1661 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
5668 1661 : int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
5669 1661 : if (nChunkYOffQueried < 0)
5670 : {
5671 140 : nChunkYSizeQueried += nChunkYOffQueried;
5672 140 : nChunkYOffQueried = 0;
5673 : }
5674 1661 : if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
5675 139 : nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
5676 1661 : CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
5677 : // Progress is reported once per chunk row; a false return from the callback aborts with CPLE_UserInterrupt.
5678 1661 : if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount, nullptr,
5679 : pProgressData))
5680 : {
5681 1 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5682 1 : eErr = CE_Failure;
5683 : }
5684 :
5685 : // Iterate on destination overview, block by block (inner loop: chunk columns of the current row).
5686 1661 : for (int nDstXOff = nDstXOffStart;
5687 10101 : nDstXOff < nDstXOffEnd && eErr == CE_None;
5688 8440 : nDstXOff += nDstChunkXSize)
5689 : {
5690 8440 : int nDstXCount = 0;
5691 8440 : if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
5692 8243 : nDstXCount = nDstChunkXSize;
5693 : else
5694 197 : nDstXCount = nDstXOffEnd - nDstXOff;
5695 :
5696 8440 : dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
5697 : // Same source-window computation as for rows, applied to columns.
5698 8440 : int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
5699 8440 : int nChunkXOff2 = static_cast<int>(
5700 8440 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
5701 8440 : if (nChunkXOff2 > nSrcWidth ||
5702 8440 : nDstXOff + nDstXCount == nDstTotalWidth)
5703 1659 : nChunkXOff2 = nSrcWidth;
5704 8440 : const int nXCount = nChunkXOff2 - nChunkXOff;
5705 8440 : CPLAssert(nXCount <= nFullResXChunk);
5706 :
5707 8440 : int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
5708 8440 : int nChunkXSizeQueried =
5709 8440 : nXCount + 2 * nKernelRadius * nOvrFactor;
5710 8440 : if (nChunkXOffQueried < 0)
5711 : {
5712 200 : nChunkXSizeQueried += nChunkXOffQueried;
5713 200 : nChunkXOffQueried = 0;
5714 : }
5715 8440 : if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
5716 203 : nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
5717 8440 : CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
5718 : #if DEBUG_VERBOSE
5719 : CPLDebug("GDAL",
5720 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
5721 : nChunkXOffQueried, nChunkYOffQueried,
5722 : nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
5723 : nDstYOff, nDstXCount, nDstYCount);
5724 : #endif
5725 :
5726 : // Avoid accumulating too many tasks and exhaust RAM
5727 :
5728 : // Try to complete already finished jobs (non-blocking: stops at the first job that has not finished yet)
5729 16494 : while (eErr == CE_None && !jobList.empty())
5730 : {
5731 8536 : auto poOldestJob = jobList.front().get();
5732 : {
5733 8536 : std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
5734 8536 : if (!poOldestJob->bFinished)
5735 : {
5736 482 : break;
5737 : }
5738 : }
5739 8054 : eErr = poOldestJob->eErr;
5740 8054 : if (eErr == CE_None)
5741 : {
5742 8054 : eErr = WriteJobData(poOldestJob);
5743 : }
5744 :
5745 8054 : jobList.pop_front();
5746 : }
5747 :
5748 : // And in case we have saturated the number of threads,
5749 : // wait for completion of tasks to go below the threshold.
5750 16964 : while (eErr == CE_None &&
5751 8482 : jobList.size() >= static_cast<size_t>(nThreads))
5752 : {
5753 42 : eErr = WaitAndFinalizeOldestJob(jobList);
5754 : }
5755 :
5756 : // (Re)allocate buffers if needed, i.e. on first use or after a buffer was handed over to a queued job. Buffers are sized for the largest possible padded chunk.
5757 24715 : for (int iBand = 0; iBand < nBands; ++iBand)
5758 : {
5759 16275 : if (apaChunk[iBand] == nullptr)
5760 : {
5761 9268 : apaChunk[iBand] = VSI_MALLOC3_VERBOSE(
5762 : nFullResXChunkQueried, nFullResYChunkQueried,
5763 : nWrkDataTypeSize);
5764 9268 : if (apaChunk[iBand] == nullptr)
5765 : {
5766 0 : eErr = CE_Failure;
5767 : }
5768 : }
5769 24712 : if (bUseNoDataMask &&
5770 8437 : apabyChunkNoDataMask[iBand] == nullptr)
5771 : {
5772 16756 : apabyChunkNoDataMask[iBand] =
5773 8378 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
5774 : nFullResXChunkQueried, nFullResYChunkQueried));
5775 8378 : if (apabyChunkNoDataMask[iBand] == nullptr)
5776 : {
5777 0 : eErr = CE_Failure;
5778 : }
5779 : }
5780 : }
5781 :
5782 : // Read the source buffers for all the bands, from the original bands or from the previous overview level depending on iSrcOverview.
5783 24715 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
5784 : {
5785 16275 : GDALRasterBand *poSrcBand = nullptr;
5786 16275 : if (iSrcOverview == -1)
5787 15387 : poSrcBand = papoSrcBands[iBand];
5788 : else
5789 888 : poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
5790 16275 : eErr = poSrcBand->RasterIO(
5791 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
5792 16275 : nChunkXSizeQueried, nChunkYSizeQueried, apaChunk[iBand],
5793 : nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType, 0,
5794 : 0, nullptr);
5795 : // Also read the matching mask window as bytes; a band that is itself a mask band is used as its own mask.
5796 16275 : if (bUseNoDataMask && eErr == CE_None)
5797 : {
5798 8437 : auto poMaskBand = poSrcBand->IsMaskBand()
5799 8437 : ? poSrcBand
5800 6334 : : poSrcBand->GetMaskBand();
5801 8437 : eErr = poMaskBand->RasterIO(
5802 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
5803 : nChunkXSizeQueried, nChunkYSizeQueried,
5804 8437 : apabyChunkNoDataMask[iBand], nChunkXSizeQueried,
5805 : nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
5806 : }
5807 : }
5808 :
5809 : // Compute the resulting overview block: one job per band, describing the source chunk, the destination window and the nodata settings.
5810 24714 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
5811 : {
5812 32548 : auto poJob = std::make_unique<OvrJob>();
5813 16274 : poJob->pfnResampleFn = pfnResampleFn;
5814 16274 : poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
5815 32548 : poJob->args.eOvrDataType =
5816 16274 : poJob->poDstBand->GetRasterDataType();
5817 16274 : poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
5818 16274 : poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
5819 16274 : const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
5820 16274 : "NBITS", "IMAGE_STRUCTURE");
5821 16274 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
5822 16274 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
5823 16274 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
5824 16274 : poJob->args.eWrkDataType = eWrkDataType;
5825 16274 : poJob->pChunk = apaChunk[iBand];
5826 16274 : poJob->args.pabyChunkNodataMask =
5827 16274 : apabyChunkNoDataMask[iBand];
5828 16274 : poJob->args.nChunkXOff = nChunkXOffQueried;
5829 16274 : poJob->args.nChunkXSize = nChunkXSizeQueried;
5830 16274 : poJob->args.nChunkYOff = nChunkYOffQueried;
5831 16274 : poJob->args.nChunkYSize = nChunkYSizeQueried;
5832 16274 : poJob->args.nDstXOff = nDstXOff;
5833 16274 : poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
5834 16274 : poJob->args.nDstYOff = nDstYOff;
5835 16274 : poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
5836 16274 : poJob->args.pszResampling = pszResampling;
5837 16274 : poJob->args.bHasNoData = pabHasNoData[iBand];
5838 16274 : poJob->args.dfNoDataValue = padfNoDataValue[iBand];
5839 16274 : poJob->args.eSrcDataType = eDataType;
5840 16274 : poJob->args.bPropagateNoData = bPropagateNoData;
5841 : // With a job queue: hand the source and mask buffers over to the job and submit it. Without one: resample and write right away, keeping the buffers for reuse.
5842 16274 : if (poJobQueue)
5843 : {
5844 16224 : poJob->oSrcMaskBufferHolder.reset(
5845 8112 : new PointerHolder(apabyChunkNoDataMask[iBand]));
5846 8112 : apabyChunkNoDataMask[iBand] = nullptr;
5847 :
5848 16224 : poJob->oSrcBufferHolder.reset(
5849 8112 : new PointerHolder(apaChunk[iBand]));
5850 8112 : apaChunk[iBand] = nullptr;
5851 :
5852 8112 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
5853 8112 : jobList.emplace_back(std::move(poJob));
5854 : }
5855 : else
5856 : {
5857 8162 : JobResampleFunc(poJob.get());
5858 8162 : eErr = poJob->eErr;
5859 8162 : if (eErr == CE_None)
5860 : {
5861 8162 : eErr = WriteJobData(poJob.get());
5862 : }
5863 : }
5864 : }
5865 : }
5866 : }
5867 :
5868 : // Wait for all pending jobs to complete, even after an error, keeping the first error encountered.
5869 611 : while (!jobList.empty())
5870 : {
5871 16 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5872 16 : if (l_eErr != CE_None && eErr == CE_None)
5873 0 : eErr = l_eErr;
5874 : }
5875 :
5876 : // Flush the data to overviews and release the chunk buffers still owned by this function.
5877 1765 : for (int iBand = 0; iBand < nBands; ++iBand)
5878 : {
5879 1170 : CPLFree(apaChunk[iBand]);
5880 1170 : papapoOverviewBands[iBand][iOverview]->FlushCache(false);
5881 :
5882 1170 : CPLFree(apabyChunkNoDataMask[iBand]);
5883 : }
5884 : }
5885 :
5886 370 : CPLFree(pabHasNoData);
5887 370 : CPLFree(padfNoDataValue);
5888 : // The final 100% progress notification is only sent on success.
5889 370 : if (eErr == CE_None)
5890 368 : pfnProgress(1.0, nullptr, pProgressData);
5891 :
5892 370 : return eErr;
5893 : }
5894 :
5895 : /************************************************************************/
5896 : /* GDALRegenerateOverviewsMultiBand() */
5897 : /************************************************************************/
5898 :
5899 : /**
5900 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5901 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5902 : *
5903 : * This function will generate one or more overview images from a base
5904 : * image using the requested downsampling algorithm. Its primary use
5905 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
5906 : * can also be used to generate downsampled images in one file from another
5907 : * outside the overview architecture.
5908 : *
5909 : * The output bands need to exist in advance and share the same characteristics
5910 : * (type, dimensions)
5911 : *
5912 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5913 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
5914 : *
5915 : * It does not support color tables or complex data types.
5916 : *
5917 : * The pseudo-algorithm used by the function is :
5918 : * for each overview
5919 : * iterate on lines of the source by a step of deltay
5920 : * iterate on columns of the source by a step of deltax
5921 : * read the source data of size deltax * deltay for all the bands
5922 : * generate the corresponding overview block for all the bands
5923 : *
5924 : * This function will honour properly NODATA_VALUES tuples (special dataset
5925 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5926 : * considered as the nodata value and not each value of the triplet
5927 : * independently per band.
5928 : *
5929 : * The GDAL_NUM_THREADS configuration option can be set
5930 : * to "ALL_CPUS" or a integer value to specify the number of threads to use for
5931 : * overview computation.
5932 : *
5933 : * @param apoSrcBands the list of source bands to downsample
5934 : * @param aapoOverviewBands bidimension array of bands. First dimension is
5935 : * indexed by bands. Second dimension is indexed by
5936 : * overview levels. All aapoOverviewBands[i] arrays
5937 : * must have the same size (i.e. same number of
5938 : * overviews)
5939 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5940 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
5941 : * @param pfnProgress progress report function.
5942 : * @param pProgressData progress function callback data.
5943 : * @param papszOptions NULL terminated list of options as
5944 : * key=value pairs, or NULL
5945 : * The XOFF, YOFF, XSIZE and YSIZE
5946 : * options can be specified to express that overviews should
5947 : * be regenerated only in the specified subset of the source
5948 : * dataset.
5949 : * @return CE_None on success or CE_Failure on failure.
5950 : * @since 3.10
5951 : */
5952 :
5953 5 : CPLErr GDALRegenerateOverviewsMultiBand(
5954 : const std::vector<GDALRasterBand *> &apoSrcBands,
5955 : const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
5956 : const char *pszResampling, GDALProgressFunc pfnProgress,
5957 : void *pProgressData, CSLConstList papszOptions)
5958 : {
5959 5 : CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
5960 15 : for (size_t i = 1; i < aapoOverviewBands.size(); ++i)
5961 : {
5962 10 : CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());
5963 : }
5964 :
5965 5 : if (aapoOverviewBands.empty())
5966 0 : return CE_None;
5967 :
5968 5 : std::vector<GDALRasterBand **> apapoOverviewBands;
5969 20 : for (auto &apoOverviewBands : aapoOverviewBands)
5970 : {
5971 : auto papoOverviewBands = static_cast<GDALRasterBand **>(
5972 15 : CPLMalloc(apoOverviewBands.size() * sizeof(GDALRasterBand *)));
5973 30 : for (size_t i = 0; i < apoOverviewBands.size(); ++i)
5974 : {
5975 15 : papoOverviewBands[i] = apoOverviewBands[i];
5976 : }
5977 15 : apapoOverviewBands.push_back(papoOverviewBands);
5978 : }
5979 10 : const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
5980 5 : static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
5981 5 : static_cast<int>(aapoOverviewBands[0].size()),
5982 5 : apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
5983 : papszOptions);
5984 20 : for (GDALRasterBand **papoOverviewBands : apapoOverviewBands)
5985 15 : CPLFree(papoOverviewBands);
5986 5 : return eErr;
5987 : }
5988 :
5989 : /************************************************************************/
5990 : /* GDALComputeBandStats() */
5991 : /************************************************************************/
5992 :
5993 : /** Undocumented
5994 : * @param hSrcBand undocumented.
5995 : * @param nSampleStep Step between scanlines used to compute statistics.
5996 : * When nSampleStep is equal to 1, all scanlines will
5997 : * be processed.
5998 : * @param pdfMean undocumented.
5999 : * @param pdfStdDev undocumented.
6000 : * @param pfnProgress undocumented.
6001 : * @param pProgressData undocumented.
6002 : * @return undocumented
6003 : */
6004 16 : CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
6005 : int nSampleStep, double *pdfMean,
6006 : double *pdfStdDev,
6007 : GDALProgressFunc pfnProgress,
6008 : void *pProgressData)
6009 :
6010 : {
6011 16 : VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
6012 :
6013 16 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
6014 :
6015 16 : if (pfnProgress == nullptr)
6016 16 : pfnProgress = GDALDummyProgress;
6017 :
6018 16 : const int nWidth = poSrcBand->GetXSize();
6019 16 : const int nHeight = poSrcBand->GetYSize();
6020 :
6021 16 : if (nSampleStep >= nHeight || nSampleStep < 1)
6022 3 : nSampleStep = 1;
6023 :
6024 16 : GDALDataType eWrkType = GDT_Unknown;
6025 16 : float *pafData = nullptr;
6026 16 : GDALDataType eType = poSrcBand->GetRasterDataType();
6027 16 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6028 16 : if (bComplex)
6029 : {
6030 : pafData = static_cast<float *>(
6031 0 : VSI_MALLOC_VERBOSE(nWidth * 2 * sizeof(float)));
6032 0 : eWrkType = GDT_CFloat32;
6033 : }
6034 : else
6035 : {
6036 : pafData =
6037 16 : static_cast<float *>(VSI_MALLOC_VERBOSE(nWidth * sizeof(float)));
6038 16 : eWrkType = GDT_Float32;
6039 : }
6040 :
6041 16 : if (nWidth == 0 || pafData == nullptr)
6042 : {
6043 0 : VSIFree(pafData);
6044 0 : return CE_Failure;
6045 : }
6046 :
6047 : /* -------------------------------------------------------------------- */
6048 : /* Loop over all sample lines. */
6049 : /* -------------------------------------------------------------------- */
6050 16 : double dfSum = 0.0;
6051 16 : double dfSum2 = 0.0;
6052 16 : int iLine = 0;
6053 16 : GIntBig nSamples = 0;
6054 :
6055 2143 : do
6056 : {
6057 2159 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6058 : pProgressData))
6059 : {
6060 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6061 0 : CPLFree(pafData);
6062 0 : return CE_Failure;
6063 : }
6064 :
6065 : const CPLErr eErr =
6066 2159 : poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
6067 : 1, eWrkType, 0, 0, nullptr);
6068 2159 : if (eErr != CE_None)
6069 : {
6070 1 : CPLFree(pafData);
6071 1 : return eErr;
6072 : }
6073 :
6074 725204 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6075 : {
6076 723046 : float fValue = 0.0f;
6077 :
6078 723046 : if (bComplex)
6079 : {
6080 : // Compute the magnitude of the complex value.
6081 : fValue =
6082 0 : std::hypot(pafData[iPixel * 2], pafData[iPixel * 2 + 1]);
6083 : }
6084 : else
6085 : {
6086 723046 : fValue = pafData[iPixel];
6087 : }
6088 :
6089 723046 : dfSum += fValue;
6090 723046 : dfSum2 += static_cast<double>(fValue) * fValue;
6091 : }
6092 :
6093 2158 : nSamples += nWidth;
6094 2158 : iLine += nSampleStep;
6095 2158 : } while (iLine < nHeight);
6096 :
6097 15 : if (!pfnProgress(1.0, nullptr, pProgressData))
6098 : {
6099 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6100 0 : CPLFree(pafData);
6101 0 : return CE_Failure;
6102 : }
6103 :
6104 : /* -------------------------------------------------------------------- */
6105 : /* Produce the result values. */
6106 : /* -------------------------------------------------------------------- */
6107 15 : if (pdfMean != nullptr)
6108 15 : *pdfMean = dfSum / nSamples;
6109 :
6110 15 : if (pdfStdDev != nullptr)
6111 : {
6112 15 : const double dfMean = dfSum / nSamples;
6113 :
6114 15 : *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
6115 : }
6116 :
6117 15 : CPLFree(pafData);
6118 :
6119 15 : return CE_None;
6120 : }
6121 :
6122 : /************************************************************************/
6123 : /* GDALOverviewMagnitudeCorrection() */
6124 : /* */
6125 : /* Correct the mean and standard deviation of the overviews of */
6126 : /* the given band to match the base layer approximately. */
6127 : /************************************************************************/
6128 :
6129 : /** Undocumented
6130 : * @param hBaseBand undocumented.
6131 : * @param nOverviewCount undocumented.
6132 : * @param pahOverviews undocumented.
6133 : * @param pfnProgress undocumented.
6134 : * @param pProgressData undocumented.
6135 : * @return undocumented
6136 : */
6137 0 : CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
6138 : int nOverviewCount,
6139 : GDALRasterBandH *pahOverviews,
6140 : GDALProgressFunc pfnProgress,
6141 : void *pProgressData)
6142 :
6143 : {
6144 0 : VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
6145 :
6146 : /* -------------------------------------------------------------------- */
6147 : /* Compute mean/stddev for source raster. */
6148 : /* -------------------------------------------------------------------- */
6149 0 : double dfOrigMean = 0.0;
6150 0 : double dfOrigStdDev = 0.0;
6151 : {
6152 : const CPLErr eErr =
6153 0 : GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
6154 : pfnProgress, pProgressData);
6155 :
6156 0 : if (eErr != CE_None)
6157 0 : return eErr;
6158 : }
6159 :
6160 : /* -------------------------------------------------------------------- */
6161 : /* Loop on overview bands. */
6162 : /* -------------------------------------------------------------------- */
6163 0 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
6164 : {
6165 : GDALRasterBand *poOverview =
6166 0 : GDALRasterBand::FromHandle(pahOverviews[iOverview]);
6167 : double dfOverviewMean, dfOverviewStdDev;
6168 :
6169 : const CPLErr eErr =
6170 0 : GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
6171 : &dfOverviewStdDev, pfnProgress, pProgressData);
6172 :
6173 0 : if (eErr != CE_None)
6174 0 : return eErr;
6175 :
6176 0 : double dfGain = 1.0;
6177 0 : if (dfOrigStdDev >= 0.0001)
6178 0 : dfGain = dfOrigStdDev / dfOverviewStdDev;
6179 :
6180 : /* --------------------------------------------------------------------
6181 : */
6182 : /* Apply gain and offset. */
6183 : /* --------------------------------------------------------------------
6184 : */
6185 0 : const int nWidth = poOverview->GetXSize();
6186 0 : const int nHeight = poOverview->GetYSize();
6187 :
6188 0 : GDALDataType eWrkType = GDT_Unknown;
6189 0 : float *pafData = nullptr;
6190 0 : const GDALDataType eType = poOverview->GetRasterDataType();
6191 0 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6192 0 : if (bComplex)
6193 : {
6194 : pafData = static_cast<float *>(
6195 0 : VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6196 0 : eWrkType = GDT_CFloat32;
6197 : }
6198 : else
6199 : {
6200 : pafData = static_cast<float *>(
6201 0 : VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6202 0 : eWrkType = GDT_Float32;
6203 : }
6204 :
6205 0 : if (pafData == nullptr)
6206 : {
6207 0 : return CE_Failure;
6208 : }
6209 :
6210 0 : for (int iLine = 0; iLine < nHeight; ++iLine)
6211 : {
6212 0 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6213 : pProgressData))
6214 : {
6215 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6216 0 : CPLFree(pafData);
6217 0 : return CE_Failure;
6218 : }
6219 :
6220 0 : if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
6221 : nWidth, 1, eWrkType, 0, 0,
6222 0 : nullptr) != CE_None)
6223 : {
6224 0 : CPLFree(pafData);
6225 0 : return CE_Failure;
6226 : }
6227 :
6228 0 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6229 : {
6230 0 : if (bComplex)
6231 : {
6232 0 : pafData[iPixel * 2] *= static_cast<float>(dfGain);
6233 0 : pafData[iPixel * 2 + 1] *= static_cast<float>(dfGain);
6234 : }
6235 : else
6236 : {
6237 0 : pafData[iPixel] = static_cast<float>(
6238 0 : (pafData[iPixel] - dfOverviewMean) * dfGain +
6239 : dfOrigMean);
6240 : }
6241 : }
6242 :
6243 0 : if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
6244 : nWidth, 1, eWrkType, 0, 0,
6245 0 : nullptr) != CE_None)
6246 : {
6247 0 : CPLFree(pafData);
6248 0 : return CE_Failure;
6249 : }
6250 : }
6251 :
6252 0 : if (!pfnProgress(1.0, nullptr, pProgressData))
6253 : {
6254 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6255 0 : CPLFree(pafData);
6256 0 : return CE_Failure;
6257 : }
6258 :
6259 0 : CPLFree(pafData);
6260 : }
6261 :
6262 0 : return CE_None;
6263 : }
|