Line data Source code
1 :
2 : /******************************************************************************
3 : *
4 : * Project: GDAL Core
5 : * Purpose: Helper code to implement overview support in different drivers.
6 : * Author: Frank Warmerdam, warmerdam@pobox.com
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 2000, Frank Warmerdam
10 : * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * SPDX-License-Identifier: MIT
13 : ****************************************************************************/
14 :
15 : #include "cpl_port.h"
16 : #include "gdal_priv.h"
17 :
18 : #include <cmath>
19 : #include <cstddef>
20 : #include <cstdlib>
21 :
22 : #include <algorithm>
23 : #include <complex>
24 : #include <condition_variable>
25 : #include <limits>
26 : #include <list>
27 : #include <memory>
28 : #include <mutex>
29 : #include <vector>
30 :
31 : #include "cpl_conv.h"
32 : #include "cpl_error.h"
33 : #include "cpl_float.h"
34 : #include "cpl_progress.h"
35 : #include "cpl_vsi.h"
36 : #include "gdal.h"
37 : #include "gdal_thread_pool.h"
38 : #include "gdalwarper.h"
39 :
40 : #ifdef USE_NEON_OPTIMIZATIONS
41 : #include "include_sse2neon.h"
42 : #define USE_SSE2
43 :
44 : #include "gdalsse_priv.h"
45 :
46 : // Restrict to 64bit processors because they are guaranteed to have SSE2,
47 : // or if __AVX2__ is defined.
48 : #elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
49 : #define USE_SSE2
50 :
51 : #include "gdalsse_priv.h"
52 :
53 : #ifdef __SSE3__
54 : #include <pmmintrin.h>
55 : #endif
56 : #ifdef __SSSE3__
57 : #include <tmmintrin.h>
58 : #endif
59 : #ifdef __SSE4_1__
60 : #include <smmintrin.h>
61 : #endif
62 : #ifdef __AVX2__
63 : #include <immintrin.h>
64 : #endif
65 :
66 : #endif
67 :
68 : // To be included after above USE_SSE2 and include gdalsse_priv.h
69 : // to avoid build issue on Windows x86
70 : #include "gdal_priv_templates.hpp"
71 :
72 : /************************************************************************/
73 : /* GDALResampleChunk_Near() */
74 : /************************************************************************/
75 :
76 : template <class T>
77 6095 : static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
78 : const T *pChunk, T **ppDstBuffer)
79 :
80 : {
81 6095 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
82 6095 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
83 6095 : const GDALDataType eWrkDataType = args.eWrkDataType;
84 6095 : const int nChunkXOff = args.nChunkXOff;
85 6095 : const int nChunkXSize = args.nChunkXSize;
86 6095 : const int nChunkYOff = args.nChunkYOff;
87 6095 : const int nDstXOff = args.nDstXOff;
88 6095 : const int nDstXOff2 = args.nDstXOff2;
89 6095 : const int nDstYOff = args.nDstYOff;
90 6095 : const int nDstYOff2 = args.nDstYOff2;
91 6095 : const int nDstXWidth = nDstXOff2 - nDstXOff;
92 :
93 : /* -------------------------------------------------------------------- */
94 : /* Allocate buffers. */
95 : /* -------------------------------------------------------------------- */
96 6095 : *ppDstBuffer = static_cast<T *>(
97 6095 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
98 : GDALGetDataTypeSizeBytes(eWrkDataType)));
99 6095 : if (*ppDstBuffer == nullptr)
100 : {
101 0 : return CE_Failure;
102 : }
103 6095 : T *const pDstBuffer = *ppDstBuffer;
104 :
105 : int *panSrcXOff =
106 6095 : static_cast<int *>(VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(int)));
107 :
108 6095 : if (panSrcXOff == nullptr)
109 : {
110 0 : VSIFree(panSrcXOff);
111 0 : return CE_Failure;
112 : }
113 :
114 : /* ==================================================================== */
115 : /* Precompute inner loop constants. */
116 : /* ==================================================================== */
117 592860 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
118 : {
119 586765 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
120 586765 : if (nSrcXOff < nChunkXOff)
121 0 : nSrcXOff = nChunkXOff;
122 :
123 586765 : panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
124 : }
125 :
126 : /* ==================================================================== */
127 : /* Loop over destination scanlines. */
128 : /* ==================================================================== */
129 216591 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
130 : {
131 210496 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
132 210496 : if (nSrcYOff < nChunkYOff)
133 0 : nSrcYOff = nChunkYOff;
134 :
135 210496 : const T *const pSrcScanline =
136 : pChunk +
137 210496 : (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
138 208026 : nChunkXOff;
139 :
140 : /* --------------------------------------------------------------------
141 : */
142 : /* Loop over destination pixels */
143 : /* --------------------------------------------------------------------
144 : */
145 210496 : T *pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
146 119221034 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
147 : {
148 119010564 : pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
149 : }
150 : }
151 :
152 6095 : CPLFree(panSrcXOff);
153 :
154 6095 : return CE_None;
155 : }
156 :
157 6095 : static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
158 : const void *pChunk, void **ppDstBuffer,
159 : GDALDataType *peDstBufferDataType)
160 : {
161 6095 : *peDstBufferDataType = args.eWrkDataType;
162 6095 : switch (args.eWrkDataType)
163 : {
164 : // For nearest resampling, as no computation is done, only the
165 : // size of the data type matters.
166 5967 : case GDT_Byte:
167 : case GDT_Int8:
168 : {
169 5967 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 1);
170 5967 : return GDALResampleChunk_NearT(
171 : args, static_cast<const uint8_t *>(pChunk),
172 5967 : reinterpret_cast<uint8_t **>(ppDstBuffer));
173 : }
174 :
175 26 : case GDT_Int16:
176 : case GDT_UInt16:
177 : case GDT_Float16:
178 : {
179 26 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
180 26 : return GDALResampleChunk_NearT(
181 : args, static_cast<const uint16_t *>(pChunk),
182 26 : reinterpret_cast<uint16_t **>(ppDstBuffer));
183 : }
184 :
185 55 : case GDT_CInt16:
186 : case GDT_CFloat16:
187 : case GDT_Int32:
188 : case GDT_UInt32:
189 : case GDT_Float32:
190 : {
191 55 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
192 55 : return GDALResampleChunk_NearT(
193 : args, static_cast<const uint32_t *>(pChunk),
194 55 : reinterpret_cast<uint32_t **>(ppDstBuffer));
195 : }
196 :
197 43 : case GDT_CInt32:
198 : case GDT_CFloat32:
199 : case GDT_Int64:
200 : case GDT_UInt64:
201 : case GDT_Float64:
202 : {
203 43 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
204 43 : return GDALResampleChunk_NearT(
205 : args, static_cast<const uint64_t *>(pChunk),
206 43 : reinterpret_cast<uint64_t **>(ppDstBuffer));
207 : }
208 :
209 4 : case GDT_CFloat64:
210 : {
211 4 : return GDALResampleChunk_NearT(
212 : args, static_cast<const std::complex<double> *>(pChunk),
213 4 : reinterpret_cast<std::complex<double> **>(ppDstBuffer));
214 : }
215 :
216 0 : case GDT_Unknown:
217 : case GDT_TypeCount:
218 0 : break;
219 : }
220 0 : CPLAssert(false);
221 : return CE_Failure;
222 : }
223 :
224 : namespace
225 : {
226 :
227 : // Find in the color table the entry whose RGB value is the closest
228 : // (using quadratic distance) to the test color, ignoring transparent entries.
229 3837 : int BestColorEntry(const std::vector<GDALColorEntry> &entries,
230 : const GDALColorEntry &test)
231 : {
232 3837 : int nMinDist = std::numeric_limits<int>::max();
233 3837 : size_t bestEntry = 0;
234 986109 : for (size_t i = 0; i < entries.size(); ++i)
235 : {
236 982272 : const GDALColorEntry &entry = entries[i];
237 : // Ignore transparent entries
238 982272 : if (entry.c4 == 0)
239 3237 : continue;
240 :
241 979035 : int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
242 979035 : ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
243 979035 : ((test.c3 - entry.c3) * (test.c3 - entry.c3));
244 979035 : if (nDist < nMinDist)
245 : {
246 15847 : nMinDist = nDist;
247 15847 : bestEntry = i;
248 : }
249 : }
250 3837 : return static_cast<int>(bestEntry);
251 : }
252 :
253 7 : std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
254 : int &transparentIdx)
255 : {
256 7 : std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
257 :
258 7 : transparentIdx = -1;
259 7 : int i = 0;
260 1799 : for (auto &entry : entries)
261 : {
262 1792 : table.GetColorEntryAsRGB(i, &entry);
263 1792 : if (transparentIdx < 0 && entry.c4 == 0)
264 1 : transparentIdx = i;
265 1792 : ++i;
266 : }
267 7 : return entries;
268 : }
269 :
270 : } // unnamed namespace
271 :
272 : /************************************************************************/
273 : /* SQUARE() */
274 : /************************************************************************/
275 :
// Return val * val, with the first operand converted to Tsquare beforehand
// so that the product can be evaluated in a wider type than T.
template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
{
    const Tsquare widened = static_cast<Tsquare>(val);
    return widened * val;
}
280 :
281 : /************************************************************************/
282 : /* ComputeIntegerRMS() */
283 : /************************************************************************/
// Return the integer rms that minimizes abs(rms**2 - sumSquares / weight),
// i.e. sqrt(sumSquares / weight) rounded so that its square is the closest
// to the mean of squares. T is the result type, Twork the integer type in
// which the rounding test is evaluated.
template <class T, class Twork>
inline T ComputeIntegerRMS(double sumSquares, double weight)
{
    const double meanOfSquares = sumSquares / weight;
    const T flooredRoot = static_cast<T>(std::sqrt(meanOfSquares));

    // (r+1)**2 is closer to the mean than r**2 exactly when
    //     weight * (r+1)**2 - sumSquares < sumSquares - weight * r**2
    // which, divided by weight, rewrites as
    //     2 * r * (r + 1) + 1 < 2 * meanOfSquares
    const auto twiceProductPlusOne =
        static_cast<Twork>(2) * flooredRoot * (flooredRoot + 1) + 1;
    const bool bRoundUp =
        static_cast<double>(twiceProductPlusOne) < 2 * meanOfSquares;

    return bRoundUp ? static_cast<T>(flooredRoot + 1) : flooredRoot;
}
300 :
301 0 : template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
302 : {
303 0 : CPLAssert(false);
304 : return 0;
305 : }
306 :
307 24 : template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
308 : {
309 : // It has been verified that given the correction on rms below, using
310 : // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
311 : // is equivalent, so use the former as it is used twice.
312 24 : const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
313 24 : const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
314 24 : GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
315 :
316 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
317 : // Naive version:
318 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
319 : // Optimized version for integer case and weight == 4
320 24 : if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
321 5 : rms += 1;
322 24 : return rms;
323 : }
324 :
325 : template <>
326 20 : inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
327 : {
328 20 : const double sumDivWeight = sumSquares * 0.25;
329 20 : GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
330 :
331 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
332 : // Naive version:
333 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
334 : // Optimized version for integer case and weight == 4
335 20 : if (static_cast<GUInt32>(rms) * (rms + 1) <
336 20 : static_cast<GUInt32>(sumDivWeight + 0.25))
337 4 : rms += 1;
338 20 : return rms;
339 : }
340 :
341 : #ifdef USE_SSE2
342 :
343 : /************************************************************************/
344 : /* QuadraticMeanByteSSE2OrAVX2() */
345 : /************************************************************************/
346 :
347 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
348 : #define sse2_packus_epi32 _mm_packus_epi32
349 : #else
350 516119 : inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
351 : {
352 516119 : const auto minus32768_32 = _mm_set1_epi32(-32768);
353 516119 : const auto minus32768_16 = _mm_set1_epi16(-32768);
354 516119 : a = _mm_add_epi32(a, minus32768_32);
355 516119 : b = _mm_add_epi32(b, minus32768_32);
356 516119 : a = _mm_packs_epi32(a, b);
357 516119 : a = _mm_sub_epi16(a, minus32768_16);
358 516119 : return a;
359 : }
360 : #endif
361 :
362 : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
363 : #define sse2_hadd_epi16 _mm_hadd_epi16
364 : #else
365 4660840 : inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
366 : {
367 : // Horizontal addition of adjacent pairs
368 4660840 : const auto mask = _mm_set1_epi32(0xFFFF);
369 : const auto horizLo =
370 13982500 : _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
371 : const auto horizHi =
372 13982500 : _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
373 :
374 : // Recombine low and high parts
375 4660840 : return _mm_packs_epi32(horizLo, horizHi);
376 : }
377 : #endif
378 :
// Vector-width neutral aliases used by the Byte kernels that follow.
// When __AVX2__ is defined each alias maps to the 256-bit _mm256_* intrinsic
// and DEST_ELTS is 16; otherwise it maps to the 128-bit _mm_* intrinsic (or
// to one of the sse2_* replacements) and DEST_ELTS is 8.
// store_lo(x, y) stores DEST_ELTS bytes taken from y at address x, and
// zeroupper() only emits an instruction in the AVX2 case.
379 : #ifdef __AVX2__
380 :
381 : #define DEST_ELTS 16
382 : #define set1_epi16 _mm256_set1_epi16
383 : #define set1_epi32 _mm256_set1_epi32
384 : #define setzero _mm256_setzero_si256
385 : #define set1_ps _mm256_set1_ps
386 : #define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
387 : #define unpacklo_epi8 _mm256_unpacklo_epi8
388 : #define unpackhi_epi8 _mm256_unpackhi_epi8
389 : #define madd_epi16 _mm256_madd_epi16
390 : #define add_epi32 _mm256_add_epi32
391 : #define mul_ps _mm256_mul_ps
392 : #define cvtepi32_ps _mm256_cvtepi32_ps
393 : #define sqrt_ps _mm256_sqrt_ps
394 : #define cvttps_epi32 _mm256_cvttps_epi32
395 : #define packs_epi32 _mm256_packs_epi32
396 : #define packus_epi32 _mm256_packus_epi32
397 : #define srli_epi32 _mm256_srli_epi32
398 : #define mullo_epi16 _mm256_mullo_epi16
399 : #define srli_epi16 _mm256_srli_epi16
400 : #define cmpgt_epi16 _mm256_cmpgt_epi16
401 : #define add_epi16 _mm256_add_epi16
402 : #define sub_epi16 _mm256_sub_epi16
403 : #define packus_epi16 _mm256_packus_epi16
404 : /* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
405 : /* to get the lower 128-bit bits of what would be a true 256-bit vector register
406 : */
407 : #define store_lo(x, y) \
408 : _mm_storeu_si128(reinterpret_cast<__m128i *>(x), \
409 : _mm256_extracti128_si256( \
410 : _mm256_permute4x64_epi64((y), 0 | (2 << 2)), 0))
411 : #define hadd_epi16 _mm256_hadd_epi16
412 : #define zeroupper() _mm256_zeroupper()
413 : #else
414 : #define DEST_ELTS 8
415 : #define set1_epi16 _mm_set1_epi16
416 : #define set1_epi32 _mm_set1_epi32
417 : #define setzero _mm_setzero_si128
418 : #define set1_ps _mm_set1_ps
419 : #define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
420 : #define unpacklo_epi8 _mm_unpacklo_epi8
421 : #define unpackhi_epi8 _mm_unpackhi_epi8
422 : #define madd_epi16 _mm_madd_epi16
423 : #define add_epi32 _mm_add_epi32
424 : #define mul_ps _mm_mul_ps
425 : #define cvtepi32_ps _mm_cvtepi32_ps
426 : #define sqrt_ps _mm_sqrt_ps
427 : #define cvttps_epi32 _mm_cvttps_epi32
428 : #define packs_epi32 _mm_packs_epi32
429 : #define packus_epi32 sse2_packus_epi32
430 : #define srli_epi32 _mm_srli_epi32
431 : #define mullo_epi16 _mm_mullo_epi16
432 : #define srli_epi16 _mm_srli_epi16
433 : #define cmpgt_epi16 _mm_cmpgt_epi16
434 : #define add_epi16 _mm_add_epi16
435 : #define sub_epi16 _mm_sub_epi16
436 : #define packus_epi16 _mm_packus_epi16
437 : #define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
438 : #define hadd_epi16 sse2_hadd_epi16
439 : #define zeroupper() (void)0
440 : #endif
441 :
442 : #if defined(__GNUC__) && defined(__AVX2__)
443 : // Disabling inlining works around a bug with gcc 9.3 (Ubuntu 20.04) in
444 : // -O2 -mavx2 mode in QuadraticMeanFloatSSE2(),
445 : // where the register that contains minus_zero is correctly
446 : // loaded the first time the function is called (looking at the disassembly,
447 : // one sees it is loaded much earlier than the function), but gets corrupted
448 : // (zeroed) in following iterations.
449 : // It appears the bug is due to the explicit zeroupper() call at the end of
450 : // the function.
451 : // The bug is at least solved in gcc 10.2.
452 : // Inlining doesn't bring much here to performance.
453 : // This is also needed with gcc 9.3 on QuadraticMeanByteSSE2OrAVX2() in
454 : // -O3 -mavx2 mode
455 : #define NOINLINE __attribute__((noinline))
456 : #else
457 : #define NOINLINE
458 : #endif
459 :
460 : template <class T>
461 : static int NOINLINE
462 5385 : QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
463 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
464 : T *CPL_RESTRICT pDstScanline)
465 : {
466 : // Optimized implementation for RMS on Byte by
467 : // processing by group of 8 output pixels, so as to use
468 : // a single _mm_sqrt_ps() call for 4 output pixels
469 5385 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
470 :
471 5385 : int iDstPixel = 0;
472 5385 : const auto one16 = set1_epi16(1);
473 5385 : const auto one32 = set1_epi32(1);
474 5385 : const auto zero = setzero();
475 5385 : const auto minus32768 = set1_epi16(-32768);
476 :
477 521496 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
478 : {
479 : // Load 2 * DEST_ELTS bytes from each line
480 516111 : auto firstLine = loadu_int(pSrcScanlineShifted);
481 1032220 : auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
482 : // Extend those Bytes as UInt16s
483 516111 : auto firstLineLo = unpacklo_epi8(firstLine, zero);
484 516111 : auto firstLineHi = unpackhi_epi8(firstLine, zero);
485 516111 : auto secondLineLo = unpacklo_epi8(secondLine, zero);
486 516111 : auto secondLineHi = unpackhi_epi8(secondLine, zero);
487 :
488 : // Multiplication of 16 bit values and horizontal
489 : // addition of 32 bit results
490 : // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
491 516111 : firstLineLo = madd_epi16(firstLineLo, firstLineLo);
492 516111 : firstLineHi = madd_epi16(firstLineHi, firstLineHi);
493 516111 : secondLineLo = madd_epi16(secondLineLo, secondLineLo);
494 516111 : secondLineHi = madd_epi16(secondLineHi, secondLineHi);
495 :
496 : // Vertical addition
497 516111 : const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
498 516111 : const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
499 :
500 : const auto sumSquaresPlusOneDiv4Lo =
501 1032220 : srli_epi32(add_epi32(sumSquaresLo, one32), 2);
502 : const auto sumSquaresPlusOneDiv4Hi =
503 1032220 : srli_epi32(add_epi32(sumSquaresHi, one32), 2);
504 :
505 : // Take square root and truncate/floor to int32
506 : const auto rmsLo =
507 1548330 : cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
508 : const auto rmsHi =
509 1548330 : cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
510 :
511 : // Merge back low and high registers with each RMS value
512 : // as a 16 bit value.
513 516111 : auto rms = packs_epi32(rmsLo, rmsHi);
514 :
515 : // Round to upper value if it minimizes the
516 : // error |rms^2 - sumSquares/4|
517 : // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
518 : // rms += 1;
519 : // which is equivalent to:
520 : // if( rms * (rms + 1) < (sumSquares+1) / 4 )
521 : // rms += 1;
522 : // And both left and right parts fit on 16 (unsigned) bits
523 : const auto sumSquaresPlusOneDiv4 =
524 516111 : packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
525 : // cmpgt_epi16 operates on signed int16, but here
526 : // we have unsigned values, so shift them by -32768 before
527 2580560 : auto mask = cmpgt_epi16(
528 : add_epi16(sumSquaresPlusOneDiv4, minus32768),
529 : add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
530 : // The value of the mask will be -1 when the correction needs to be
531 : // applied
532 516111 : rms = sub_epi16(rms, mask);
533 :
534 : // Pack each 16 bit RMS value to 8 bits
535 516111 : rms = packus_epi16(rms, rms /* could be anything */);
536 516111 : store_lo(&pDstScanline[iDstPixel], rms);
537 516111 : pSrcScanlineShifted += 2 * DEST_ELTS;
538 : }
539 : zeroupper();
540 :
541 5385 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
542 5385 : return iDstPixel;
543 : }
544 :
545 : /************************************************************************/
546 : /* AverageByteSSE2OrAVX2() */
547 : /************************************************************************/
548 :
549 : template <class T>
550 : static int
551 111036 : AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
552 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
553 : T *CPL_RESTRICT pDstScanline)
554 : {
555 : // Optimized implementation for average on Byte by
556 : // processing by group of 8 output pixels.
557 :
558 111036 : const auto zero = setzero();
559 111036 : const auto two16 = set1_epi16(2);
560 111036 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
561 :
562 111036 : int iDstPixel = 0;
563 4771880 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
564 : {
565 : // Load 2 * DEST_ELTS bytes from each line
566 4660840 : const auto firstLine = loadu_int(pSrcScanlineShifted);
567 9321690 : const auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
568 : // Extend those Bytes as UInt16s
569 4660840 : const auto firstLineLo = unpacklo_epi8(firstLine, zero);
570 4660840 : const auto firstLineHi = unpackhi_epi8(firstLine, zero);
571 4660840 : const auto secondLineLo = unpacklo_epi8(secondLine, zero);
572 4660840 : const auto secondLineHi = unpackhi_epi8(secondLine, zero);
573 :
574 : // Vertical addition
575 4660840 : const auto sumLo = add_epi16(firstLineLo, secondLineLo);
576 4660840 : const auto sumHi = add_epi16(firstLineHi, secondLineHi);
577 :
578 : // Horizontal addition of adjacent pairs, and recombine low and high
579 : // parts
580 4660840 : const auto sum = hadd_epi16(sumLo, sumHi);
581 :
582 : // average = (sum + 2) / 4
583 9321690 : auto average = srli_epi16(add_epi16(sum, two16), 2);
584 :
585 : // Pack each 16 bit average value to 8 bits
586 4660840 : average = packus_epi16(average, average /* could be anything */);
587 4660840 : store_lo(&pDstScanline[iDstPixel], average);
588 4660840 : pSrcScanlineShifted += 2 * DEST_ELTS;
589 : }
590 : zeroupper();
591 :
592 111036 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
593 111036 : return iDstPixel;
594 : }
595 :
596 : /************************************************************************/
597 : /* QuadraticMeanUInt16SSE2() */
598 : /************************************************************************/
599 :
600 : #ifdef __SSE3__
601 : #define sse2_hadd_pd _mm_hadd_pd
602 : #else
603 8 : inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
604 : {
605 : auto aLo_bLo =
606 32 : _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
607 : auto aHi_bHi =
608 32 : _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
609 8 : return _mm_add_pd(aLo_bLo, aHi_bHi); // (aLo + aHi, bLo + bHi)
610 : }
611 : #endif
612 :
613 40 : inline __m128d SQUARE(__m128d x)
614 : {
615 40 : return _mm_mul_pd(x, x);
616 : }
617 :
618 : #ifdef __AVX2__
619 :
620 : inline __m256d SQUARE(__m256d x)
621 : {
622 : return _mm256_mul_pd(x, x);
623 : }
624 :
625 : inline __m256d FIXUP_LANES(__m256d x)
626 : {
627 : return _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
628 : }
629 :
630 : inline __m256 FIXUP_LANES(__m256 x)
631 : {
632 : return _mm256_castpd_ps(FIXUP_LANES(_mm256_castps_pd(x)));
633 : }
634 :
635 : #endif
636 :
// Vectorized quadratic mean (RMS) of 2x2 blocks of UInt16 samples.
//
// Each iteration loads 128 bits from the current source line and 128 bits
// from the line located nChunkXSize elements further, and stores 64 bits of
// results at pDstScanline[iDstPixel], i.e. 8 + 8 source samples and 4
// destination samples (assumes T is a 16-bit type -- TODO confirm against
// the callers).
//
// nDstXWidth: number of destination pixels to produce.
// nChunkXSize: distance, in elements, between the two source lines.
// pSrcScanlineShiftedInOut: in, first source sample; out, first source
//     sample that has not been consumed.
// pDstScanline: destination line.
//
// Returns the number of destination pixels written, a multiple of 4. Pixels
// from that index onwards are not written by this function.
637 : template <class T>
638 : static int
639 10 : QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
640 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
641 : T *CPL_RESTRICT pDstScanline)
642 : {
643 : // Optimized implementation for RMS on UInt16 by
644 : // processing by group of 4 output pixels.
645 10 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
646 :
647 10 : int iDstPixel = 0;
648 10 : const auto zero = _mm_setzero_si128();
649 :
650 : #ifdef __AVX2__
651 : const auto zeroDot25 = _mm256_set1_pd(0.25);
652 : const auto zeroDot5 = _mm256_set1_pd(0.5);
653 :
654 : // The first four 0's could be anything, as we only take the bottom
655 : // 128 bits.
656 : const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
657 : #else
658 10 : const auto zeroDot25 = _mm_set1_pd(0.25);
659 10 : const auto zeroDot5 = _mm_set1_pd(0.5);
660 : #endif
661 :
// Only iterate while a full group of 4 output pixels remains.
662 40 : for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
663 : {
664 : // Load 8 UInt16 from each line
665 30 : const auto firstLine = _mm_loadu_si128(
666 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
667 : const auto secondLine =
668 30 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
669 30 : pSrcScanlineShifted + nChunkXSize));
670 :
671 : // Detect if all of the source values fit in 14 bits.
672 : // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
673 : // and we can do a much faster implementation.
674 : const auto maskTmp =
675 60 : _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
// The 64-bit extraction intrinsic is bypassed on 32-bit x86, where the low
// 64 bits are stored to memory instead.
676 : #if defined(__i386__) || defined(_M_IX86)
677 : uint64_t nMaskFitsIn14Bits = 0;
678 : _mm_storel_epi64(
679 : reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
680 : _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
681 : #else
682 30 : const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
683 : _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
684 : #endif
// Fast path: all 16 samples are below 2^14, integer arithmetic is enough.
685 30 : if (nMaskFitsIn14Bits == 0)
686 : {
687 : // Multiplication of 16 bit values and horizontal
688 : // addition of 32 bit results
689 : const auto firstLineHSumSquare =
690 26 : _mm_madd_epi16(firstLine, firstLine);
691 : const auto secondLineHSumSquare =
692 26 : _mm_madd_epi16(secondLine, secondLine);
693 : // Vertical addition
694 : const auto sumSquares =
695 26 : _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
696 : // In theory we should take sqrt(sumSquares * 0.25f)
697 : // but given the rounding we do, this is equivalent to
698 : // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
699 : // sumSquares <= 4 * 16383^2
700 26 : const auto one32 = _mm_set1_epi32(1);
701 : const auto sumSquaresPlusOneDiv4 =
702 52 : _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
703 : // Take square root and truncate/floor to int32
704 78 : auto rms = _mm_cvttps_epi32(
705 : _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
706 :
707 : // Round to upper value if it minimizes the
708 : // error |rms^2 - sumSquares/4|
709 : // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
710 : // rms += 1;
711 : // which is equivalent to:
712 : // if( rms * rms + rms < (sumSquares+1) / 4 )
713 : // rms += 1;
// Each 32 bit lane of rms holds a value below 2^14, so its upper 16 bit half
// is zero and _mm_madd_epi16(rms, rms) yields rms * rms in each 32 bit lane.
714 : auto mask =
715 78 : _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
716 : _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
// The mask lanes are -1 where the correction applies: subtracting adds 1.
717 26 : rms = _mm_sub_epi32(rms, mask);
718 : // Pack each 32 bit RMS value to 16 bits
719 26 : rms = _mm_packs_epi32(rms, rms /* could be anything */);
720 : _mm_storel_epi64(
721 26 : reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
722 26 : pSrcScanlineShifted += 8;
723 26 : continue;
724 : }
725 :
// General path: squares are computed and summed as doubles.
726 : // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
727 : // to 32 bit would result in 4 multiplications instead of 8, but
728 : // mullo/mulhi have a worse throughput than mul_pd.
729 :
730 : // Extend those UInt16s as UInt32s
731 4 : const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
732 4 : const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
733 4 : const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
734 4 : const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
735 :
736 : #ifdef __AVX2__
737 : // Multiplication of 32 bit values previously converted to 64 bit double
738 : const auto firstLineLoDbl = SQUARE(_mm256_cvtepi32_pd(firstLineLo));
739 : const auto firstLineHiDbl = SQUARE(_mm256_cvtepi32_pd(firstLineHi));
740 : const auto secondLineLoDbl = SQUARE(_mm256_cvtepi32_pd(secondLineLo));
741 : const auto secondLineHiDbl = SQUARE(_mm256_cvtepi32_pd(secondLineHi));
742 :
743 : // Vertical addition of squares
744 : const auto sumSquaresLo =
745 : _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
746 : const auto sumSquaresHi =
747 : _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
748 :
749 : // Horizontal addition of squares
750 : const auto sumSquares =
751 : FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
752 :
753 : const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
754 :
755 : // Take square root and truncate/floor to int32
756 : auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
757 : const auto rmsDouble = _mm256_cvtepi32_pd(rms);
758 : const auto right = _mm256_sub_pd(
759 : sumDivWeight, _mm256_add_pd(SQUARE(rmsDouble), rmsDouble));
760 :
761 : auto mask =
762 : _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
763 : // Extract 32-bit from each of the 4 64-bit masks
764 : // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
765 : // _MM_SHUFFLE(2,0,2,0)));
766 : mask = _mm256_permutevar8x32_ps(mask, permutation);
767 : const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
768 :
769 : // Apply the correction
770 : rms = _mm_sub_epi32(rms, maskI);
771 :
772 : // Pack each 32 bit RMS value to 16 bits
773 : rms = _mm_packus_epi32(rms, rms /* could be anything */);
774 : #else
775 : // Multiplication of 32 bit values previously converted to 64 bit double
776 4 : const auto firstLineLoLo = SQUARE(_mm_cvtepi32_pd(firstLineLo));
777 : const auto firstLineLoHi =
778 8 : SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
779 4 : const auto firstLineHiLo = SQUARE(_mm_cvtepi32_pd(firstLineHi));
780 : const auto firstLineHiHi =
781 8 : SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
782 :
783 4 : const auto secondLineLoLo = SQUARE(_mm_cvtepi32_pd(secondLineLo));
784 : const auto secondLineLoHi =
785 8 : SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
786 4 : const auto secondLineHiLo = SQUARE(_mm_cvtepi32_pd(secondLineHi));
787 : const auto secondLineHiHi =
788 8 : SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
789 :
790 : // Vertical addition of squares
791 4 : const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
792 4 : const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
793 4 : const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
794 4 : const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
795 :
796 : // Horizontal addition of squares
797 4 : const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
798 4 : const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
799 :
800 4 : const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
801 4 : const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
802 : // Take square root and truncate/floor to int32
803 8 : const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
804 8 : const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
805 :
806 : // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
807 : // if( 0.5 < sumDivWeight - (rms * rms + rms) )
808 : // rms += 1;
809 4 : const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
810 4 : const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
811 8 : const auto rightLo = _mm_sub_pd(
812 : sumDivWeightLo, _mm_add_pd(SQUARE(rmsLoDouble), rmsLoDouble));
813 12 : const auto rightHi = _mm_sub_pd(
814 : sumDivWeightHi, _mm_add_pd(SQUARE(rmsHiDouble), rmsHiDouble));
815 :
816 8 : const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
817 4 : const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
818 : // The value of the mask will be -1 when the correction needs to be
819 : // applied
// The shuffle keeps one 32 bit half of each of the four 64 bit comparison
// results, giving one 32 bit mask lane per output pixel.
820 8 : const auto mask = _mm_castps_si128(_mm_shuffle_ps(
821 : maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
822 :
// Gather the two low 32 bit lanes of rmsLo and of rmsHi in a single register.
823 16 : auto rms = _mm_castps_si128(
824 : _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
825 : // Apply the correction
826 4 : rms = _mm_sub_epi32(rms, mask);
827 :
828 : // Pack each 32 bit RMS value to 16 bits
829 4 : rms = sse2_packus_epi32(rms, rms /* could be anything */);
830 : #endif
831 :
832 4 : _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
833 : rms);
834 4 : pSrcScanlineShifted += 8;
835 : }
836 :
837 : zeroupper();
838 :
// Report how far the source was consumed, and how many pixels were written.
839 10 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
840 10 : return iDstPixel;
841 : }
842 :
843 : /************************************************************************/
844 : /* AverageUInt16SSE2() */
845 : /************************************************************************/
846 :
847 : template <class T>
848 9 : static int AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
849 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
850 : T *CPL_RESTRICT pDstScanline)
851 : {
852 : // Optimized implementation for average on UInt16 by
853 : // processing by group of 8 output pixels.
854 :
855 9 : const auto mask = _mm_set1_epi32(0xFFFF);
856 9 : const auto two = _mm_set1_epi32(2);
857 9 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
858 :
859 9 : int iDstPixel = 0;
860 13 : for (; iDstPixel < nDstXWidth - 7; iDstPixel += 8)
861 : {
862 : __m128i averageLow;
863 : // Load 8 UInt16 from each line
864 : {
865 4 : const auto firstLine = _mm_loadu_si128(
866 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
867 : const auto secondLine =
868 4 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
869 4 : pSrcScanlineShifted + nChunkXSize));
870 :
871 : // Horizontal addition and extension to 32 bit
872 12 : const auto horizAddFirstLine = _mm_add_epi32(
873 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
874 : const auto horizAddSecondLine =
875 12 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
876 : _mm_srli_epi32(secondLine, 16));
877 :
878 : // Vertical addition and average computation
879 : // average = (sum + 2) >> 2
880 8 : const auto sum = _mm_add_epi32(
881 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
882 4 : averageLow = _mm_srli_epi32(sum, 2);
883 : }
884 : // Load 8 UInt16 from each line
885 : __m128i averageHigh;
886 : {
887 4 : const auto firstLine = _mm_loadu_si128(
888 4 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted + 8));
889 : const auto secondLine =
890 4 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
891 4 : pSrcScanlineShifted + 8 + nChunkXSize));
892 :
893 : // Horizontal addition and extension to 32 bit
894 12 : const auto horizAddFirstLine = _mm_add_epi32(
895 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
896 : const auto horizAddSecondLine =
897 12 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
898 : _mm_srli_epi32(secondLine, 16));
899 :
900 : // Vertical addition and average computation
901 : // average = (sum + 2) >> 2
902 8 : const auto sum = _mm_add_epi32(
903 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
904 4 : averageHigh = _mm_srli_epi32(sum, 2);
905 : }
906 :
907 : // Pack each 32 bit average value to 16 bits
908 4 : auto average = sse2_packus_epi32(averageLow, averageHigh);
909 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
910 : average);
911 4 : pSrcScanlineShifted += 16;
912 : }
913 :
914 9 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
915 9 : return iDstPixel;
916 : }
917 :
918 : /************************************************************************/
919 : /* QuadraticMeanFloatSSE2() */
920 : /************************************************************************/
921 :
922 : #ifdef __AVX2__
923 : #define RMS_FLOAT_ELTS 8
924 : #define set1_ps _mm256_set1_ps
925 : #define loadu_ps _mm256_loadu_ps
926 : #define andnot_ps _mm256_andnot_ps
927 : #define and_ps _mm256_and_ps
928 : #define max_ps _mm256_max_ps
929 : #define shuffle_ps _mm256_shuffle_ps
930 : #define div_ps _mm256_div_ps
931 : #define cmpeq_ps(x, y) _mm256_cmp_ps(x, y, _CMP_EQ_OQ)
932 : #define mul_ps _mm256_mul_ps
933 : #define add_ps _mm256_add_ps
934 : #define hadd_ps _mm256_hadd_ps
935 : #define sqrt_ps _mm256_sqrt_ps
936 : #define or_ps _mm256_or_ps
937 : #define unpacklo_ps _mm256_unpacklo_ps
938 : #define unpackhi_ps _mm256_unpackhi_ps
939 : #define storeu_ps _mm256_storeu_ps
940 :
941 : inline __m256 SQUARE(__m256 x)
942 : {
943 : return _mm256_mul_ps(x, x);
944 : }
945 :
946 : #else
947 :
948 : #ifdef __SSE3__
949 : #define sse2_hadd_ps _mm_hadd_ps
950 : #else
951 : inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
952 : {
953 : auto aEven_bEven = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
954 : auto aOdd_bOdd = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
955 : return _mm_add_ps(aEven_bEven, aOdd_bOdd); // (aEven + aOdd, bEven + bOdd)
956 : }
957 : #endif
958 :
959 : #define RMS_FLOAT_ELTS 4
960 : #define set1_ps _mm_set1_ps
961 : #define loadu_ps _mm_loadu_ps
962 : #define andnot_ps _mm_andnot_ps
963 : #define and_ps _mm_and_ps
964 : #define max_ps _mm_max_ps
965 : #define shuffle_ps _mm_shuffle_ps
966 : #define div_ps _mm_div_ps
967 : #define cmpeq_ps _mm_cmpeq_ps
968 : #define mul_ps _mm_mul_ps
969 : #define add_ps _mm_add_ps
970 : #define hadd_ps sse2_hadd_ps
971 : #define sqrt_ps _mm_sqrt_ps
972 : #define or_ps _mm_or_ps
973 : #define unpacklo_ps _mm_unpacklo_ps
974 : #define unpackhi_ps _mm_unpackhi_ps
975 : #define storeu_ps _mm_storeu_ps
976 :
977 272 : inline __m128 SQUARE(__m128 x)
978 : {
979 272 : return _mm_mul_ps(x, x);
980 : }
981 :
982 68 : inline __m128 FIXUP_LANES(__m128 x)
983 : {
984 68 : return x;
985 : }
986 :
987 : #endif
988 :
989 : template <class T>
990 : static int NOINLINE
991 34 : QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
992 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
993 : T *CPL_RESTRICT pDstScanline)
994 : {
995 : // Optimized implementation for RMS on Float32 by
996 : // processing by group of RMS_FLOAT_ELTS output pixels.
997 34 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
998 :
999 34 : int iDstPixel = 0;
1000 34 : const auto minus_zero = set1_ps(-0.0f);
1001 34 : const auto zeroDot25 = set1_ps(0.25f);
1002 34 : const auto one = set1_ps(1.0f);
1003 68 : const auto infv = set1_ps(std::numeric_limits<float>::infinity());
1004 :
1005 102 : for (; iDstPixel < nDstXWidth - (RMS_FLOAT_ELTS - 1);
1006 : iDstPixel += RMS_FLOAT_ELTS)
1007 : {
1008 : // Load 2*RMS_FLOAT_ELTS Float32 from each line
1009 : auto firstLineLo =
1010 68 : loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
1011 68 : auto firstLineHi = loadu_ps(reinterpret_cast<float const *>(
1012 68 : pSrcScanlineShifted + RMS_FLOAT_ELTS));
1013 68 : auto secondLineLo = loadu_ps(
1014 68 : reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
1015 68 : auto secondLineHi = loadu_ps(reinterpret_cast<float const *>(
1016 68 : pSrcScanlineShifted + RMS_FLOAT_ELTS + nChunkXSize));
1017 :
1018 : // Take the absolute value
1019 68 : firstLineLo = andnot_ps(minus_zero, firstLineLo);
1020 68 : firstLineHi = andnot_ps(minus_zero, firstLineHi);
1021 68 : secondLineLo = andnot_ps(minus_zero, secondLineLo);
1022 68 : secondLineHi = andnot_ps(minus_zero, secondLineHi);
1023 :
1024 : auto firstLineEven =
1025 68 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1026 : auto firstLineOdd =
1027 68 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1028 : auto secondLineEven =
1029 68 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1030 : auto secondLineOdd =
1031 68 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1032 :
1033 : // Compute the maximum of each RMS_FLOAT_ELTS value to RMS-average
1034 204 : const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
1035 : max_ps(secondLineEven, secondLineEven));
1036 :
1037 : // Normalize each value by the maximum of the RMS_FLOAT_ELTS ones.
1038 : // This step is important to avoid that the square evaluates to infinity
1039 : // for sufficiently big input.
1040 68 : auto invMax = div_ps(one, maxV);
1041 : // Deal with 0 being the maximum to correct division by zero
1042 : // note: comparing to -0 leads to identical results as to comparing with
1043 : // 0
1044 136 : invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
1045 :
1046 68 : firstLineEven = mul_ps(firstLineEven, invMax);
1047 68 : firstLineOdd = mul_ps(firstLineOdd, invMax);
1048 68 : secondLineEven = mul_ps(secondLineEven, invMax);
1049 68 : secondLineOdd = mul_ps(secondLineOdd, invMax);
1050 :
1051 : // Compute squares
1052 68 : firstLineEven = SQUARE(firstLineEven);
1053 68 : firstLineOdd = SQUARE(firstLineOdd);
1054 68 : secondLineEven = SQUARE(secondLineEven);
1055 68 : secondLineOdd = SQUARE(secondLineOdd);
1056 :
1057 204 : const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
1058 : add_ps(secondLineEven, secondLineOdd));
1059 :
1060 204 : auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
1061 :
1062 : // Deal with infinity being the maximum
1063 68 : const auto maskIsInf = cmpeq_ps(maxV, infv);
1064 136 : rms = or_ps(andnot_ps(maskIsInf, rms), and_ps(maskIsInf, infv));
1065 :
1066 68 : rms = FIXUP_LANES(rms);
1067 :
1068 : // coverity[incompatible_cast]
1069 68 : storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]), rms);
1070 68 : pSrcScanlineShifted += RMS_FLOAT_ELTS * 2;
1071 : }
1072 :
1073 : zeroupper();
1074 :
1075 34 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1076 34 : return iDstPixel;
1077 : }
1078 :
1079 : /************************************************************************/
1080 : /* AverageFloatSSE2() */
1081 : /************************************************************************/
1082 :
1083 : template <class T>
1084 14 : static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
1085 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1086 : T *CPL_RESTRICT pDstScanline)
1087 : {
1088 : // Optimized implementation for average on Float32 by
1089 : // processing by group of 4 output pixels.
1090 14 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1091 :
1092 14 : int iDstPixel = 0;
1093 14 : const auto zeroDot25 = _mm_set1_ps(0.25f);
1094 :
1095 32 : for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
1096 : {
1097 : // Load 8 Float32 from each line
1098 : const auto firstLineLo =
1099 18 : _mm_loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
1100 18 : const auto firstLineHi = _mm_loadu_ps(
1101 18 : reinterpret_cast<float const *>(pSrcScanlineShifted + 4));
1102 18 : const auto secondLineLo = _mm_loadu_ps(
1103 18 : reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
1104 18 : const auto secondLineHi = _mm_loadu_ps(reinterpret_cast<float const *>(
1105 18 : pSrcScanlineShifted + 4 + nChunkXSize));
1106 :
1107 : // Vertical addition
1108 18 : const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
1109 18 : const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
1110 :
1111 : // Horizontal addition
1112 : const auto A =
1113 18 : _mm_shuffle_ps(sumLo, sumHi, 0 | (2 << 2) | (0 << 4) | (2 << 6));
1114 : const auto B =
1115 18 : _mm_shuffle_ps(sumLo, sumHi, 1 | (3 << 2) | (1 << 4) | (3 << 6));
1116 18 : const auto sum = _mm_add_ps(A, B);
1117 :
1118 18 : const auto average = _mm_mul_ps(sum, zeroDot25);
1119 :
1120 : // coverity[incompatible_cast]
1121 18 : _mm_storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]),
1122 : average);
1123 18 : pSrcScanlineShifted += 8;
1124 : }
1125 :
1126 14 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1127 14 : return iDstPixel;
1128 : }
1129 :
1130 : #endif
1131 :
1132 : /************************************************************************/
1133 : /* GDALResampleChunk_AverageOrRMS() */
1134 : /************************************************************************/
1135 :
1136 : template <class T, class Tsum, GDALDataType eWrkDataType>
1137 : static CPLErr
1138 10400 : GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
1139 : const T *pChunk, void **ppDstBuffer)
1140 : {
1141 10400 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1142 10400 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1143 10400 : const double dfSrcXDelta = args.dfSrcXDelta;
1144 10400 : const double dfSrcYDelta = args.dfSrcYDelta;
1145 10400 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1146 10400 : const int nChunkXOff = args.nChunkXOff;
1147 10400 : const int nChunkYOff = args.nChunkYOff;
1148 10400 : const int nChunkXSize = args.nChunkXSize;
1149 10400 : const int nChunkYSize = args.nChunkYSize;
1150 10400 : const int nDstXOff = args.nDstXOff;
1151 10400 : const int nDstXOff2 = args.nDstXOff2;
1152 10400 : const int nDstYOff = args.nDstYOff;
1153 10400 : const int nDstYOff2 = args.nDstYOff2;
1154 10400 : const char *pszResampling = args.pszResampling;
1155 10400 : bool bHasNoData = args.bHasNoData;
1156 10400 : const double dfNoDataValue = args.dfNoDataValue;
1157 10400 : const GDALColorTable *poColorTable = args.poColorTable;
1158 10400 : const bool bPropagateNoData = args.bPropagateNoData;
1159 :
1160 : // AVERAGE_BIT2GRAYSCALE
1161 : const bool bBit2Grayscale =
1162 10400 : CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"));
1163 10401 : const bool bQuadraticMean = CPL_TO_BOOL(EQUAL(pszResampling, "RMS"));
1164 10401 : if (bBit2Grayscale)
1165 9 : poColorTable = nullptr;
1166 :
1167 : T tNoDataValue;
1168 10401 : if (!bHasNoData)
1169 10350 : tNoDataValue = 0;
1170 : else
1171 51 : tNoDataValue = static_cast<T>(dfNoDataValue);
1172 10401 : const T tReplacementVal =
1173 107 : bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
1174 51 : args.eOvrDataType, dfNoDataValue))
1175 : : 0;
1176 :
1177 10401 : int nChunkRightXOff = nChunkXOff + nChunkXSize;
1178 10401 : int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1179 10401 : int nDstXWidth = nDstXOff2 - nDstXOff;
1180 :
1181 : /* -------------------------------------------------------------------- */
1182 : /* Allocate buffers. */
1183 : /* -------------------------------------------------------------------- */
1184 10401 : *ppDstBuffer = static_cast<T *>(
1185 10401 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
1186 : GDALGetDataTypeSizeBytes(eWrkDataType)));
1187 10401 : if (*ppDstBuffer == nullptr)
1188 : {
1189 0 : return CE_Failure;
1190 : }
1191 10401 : T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
1192 :
1193 : struct PrecomputedXValue
1194 : {
1195 : int nLeftXOffShifted;
1196 : int nRightXOffShifted;
1197 : double dfLeftWeight;
1198 : double dfRightWeight;
1199 : double dfTotalWeightFullLine;
1200 : };
1201 :
1202 : PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
1203 10401 : VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(PrecomputedXValue)));
1204 :
1205 10401 : if (pasSrcX == nullptr)
1206 : {
1207 0 : VSIFree(pasSrcX);
1208 0 : return CE_Failure;
1209 : }
1210 :
1211 10401 : int nTransparentIdx = -1;
1212 10401 : std::vector<GDALColorEntry> colorEntries;
1213 10401 : if (poColorTable)
1214 5 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1215 :
1216 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1217 : // it as nodata value
1218 10428 : if (bHasNoData && dfNoDataValue >= 0.0f &&
1219 27 : tNoDataValue < colorEntries.size())
1220 1 : colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1221 :
1222 : // Or if we have no explicit nodata, but a color table entry that is
1223 : // transparent, consider it as the nodata value
1224 10400 : else if (!bHasNoData && nTransparentIdx >= 0)
1225 : {
1226 0 : bHasNoData = true;
1227 0 : tNoDataValue = static_cast<T>(nTransparentIdx);
1228 : }
1229 :
1230 : /* ==================================================================== */
1231 : /* Precompute inner loop constants. */
1232 : /* ==================================================================== */
1233 10401 : bool bSrcXSpacingIsTwo = true;
1234 10401 : int nLastSrcXOff2 = -1;
1235 867114 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1236 : {
1237 856713 : double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
1238 : // Apply some epsilon to avoid numerical precision issues
1239 856713 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
1240 856713 : double dfSrcXOff2 = dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
1241 856713 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
1242 :
1243 856713 : if (nSrcXOff < nChunkXOff)
1244 0 : nSrcXOff = nChunkXOff;
1245 856713 : if (nSrcXOff2 == nSrcXOff)
1246 0 : nSrcXOff2++;
1247 856713 : if (nSrcXOff2 > nChunkRightXOff)
1248 1 : nSrcXOff2 = nChunkRightXOff;
1249 :
1250 856713 : pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
1251 856713 : pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
1252 856713 : nSrcXOff2 - nChunkXOff;
1253 21 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
1254 856713 : (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
1255 856713 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
1256 856713 : 1 - (nSrcXOff2 - dfSrcXOff2);
1257 856713 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
1258 856713 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
1259 856713 : if (nSrcXOff + 1 < nSrcXOff2)
1260 : {
1261 856692 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1262 856692 : nSrcXOff2 - nSrcXOff - 2;
1263 856692 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1264 856692 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
1265 : }
1266 :
1267 856713 : if (nSrcXOff2 - nSrcXOff != 2 ||
1268 727221 : (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
1269 : {
1270 120592 : bSrcXSpacingIsTwo = false;
1271 : }
1272 856713 : nLastSrcXOff2 = nSrcXOff2;
1273 : }
1274 :
1275 : /* ==================================================================== */
1276 : /* Loop over destination scanlines. */
1277 : /* ==================================================================== */
1278 752881 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1279 : {
1280 742480 : double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
1281 742480 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
1282 742480 : if (nSrcYOff < nChunkYOff)
1283 0 : nSrcYOff = nChunkYOff;
1284 :
1285 742480 : double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
1286 742480 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
1287 742480 : if (nSrcYOff2 == nSrcYOff)
1288 0 : ++nSrcYOff2;
1289 742480 : if (nSrcYOff2 > nChunkBottomYOff)
1290 3 : nSrcYOff2 = nChunkBottomYOff;
1291 :
1292 742480 : T *const pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
1293 :
1294 : /* --------------------------------------------------------------------
1295 : */
1296 : /* Loop over destination pixels */
1297 : /* --------------------------------------------------------------------
1298 : */
1299 742480 : if (poColorTable == nullptr)
1300 : {
1301 742365 : if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
1302 : pabyChunkNodataMask == nullptr)
1303 : {
1304 : if (eWrkDataType == GDT_Byte || eWrkDataType == GDT_UInt16)
1305 : {
1306 : // Optimized case : no nodata, overview by a factor of 2 and
1307 : // regular x and y src spacing.
1308 116440 : const T *pSrcScanlineShifted =
1309 116440 : pChunk + pasSrcX[0].nLeftXOffShifted +
1310 116440 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
1311 116440 : nChunkXSize;
1312 116440 : int iDstPixel = 0;
1313 : #ifdef USE_SSE2
1314 116421 : if (bQuadraticMean && eWrkDataType == GDT_Byte)
1315 : {
1316 5385 : iDstPixel = QuadraticMeanByteSSE2OrAVX2(
1317 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1318 : pDstScanline);
1319 : }
1320 111055 : else if (bQuadraticMean /* && eWrkDataType == GDT_UInt16 */)
1321 : {
1322 10 : iDstPixel = QuadraticMeanUInt16SSE2(
1323 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1324 : pDstScanline);
1325 : }
1326 : else if (/* !bQuadraticMean && */ eWrkDataType == GDT_Byte)
1327 : {
1328 111036 : iDstPixel = AverageByteSSE2OrAVX2(
1329 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1330 : pDstScanline);
1331 : }
1332 : else /* if( !bQuadraticMean && eWrkDataType == GDT_UInt16 )
1333 : */
1334 : {
1335 9 : iDstPixel = AverageUInt16SSE2(nDstXWidth, nChunkXSize,
1336 : pSrcScanlineShifted,
1337 : pDstScanline);
1338 : }
1339 : #endif
1340 278841 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1341 : {
1342 162401 : Tsum nTotal = 0;
1343 : T nVal;
1344 162401 : if (bQuadraticMean)
1345 44 : nTotal =
1346 44 : SQUARE<Tsum>(pSrcScanlineShifted[0]) +
1347 44 : SQUARE<Tsum>(pSrcScanlineShifted[1]) +
1348 44 : SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
1349 44 : SQUARE<Tsum>(
1350 44 : pSrcScanlineShifted[1 + nChunkXSize]);
1351 : else
1352 162357 : nTotal = pSrcScanlineShifted[0] +
1353 162357 : pSrcScanlineShifted[1] +
1354 162357 : pSrcScanlineShifted[nChunkXSize] +
1355 162357 : pSrcScanlineShifted[1 + nChunkXSize];
1356 :
1357 162401 : constexpr int nTotalWeight = 4;
1358 162401 : if (bQuadraticMean)
1359 44 : nVal = ComputeIntegerRMS_4values<T>(nTotal);
1360 : else
1361 162357 : nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
1362 : nTotalWeight);
1363 :
1364 : // No need to compare nVal against tNoDataValue as we
1365 : // are in a case where pabyChunkNodataMask == nullptr
1366 : // implies the absence of nodata value.
1367 162401 : pDstScanline[iDstPixel] = nVal;
1368 162401 : pSrcScanlineShifted += 2;
1369 : }
1370 : }
1371 : else
1372 : {
1373 : CPLAssert(eWrkDataType == GDT_Float32 ||
1374 : eWrkDataType == GDT_Float64);
1375 70 : const T *pSrcScanlineShifted =
1376 70 : pChunk + pasSrcX[0].nLeftXOffShifted +
1377 70 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
1378 70 : nChunkXSize;
1379 70 : int iDstPixel = 0;
1380 : #ifdef USE_SSE2
1381 : if (eWrkDataType == GDT_Float32)
1382 : {
1383 48 : if (bQuadraticMean)
1384 : {
1385 34 : iDstPixel = QuadraticMeanFloatSSE2(
1386 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1387 : pDstScanline);
1388 : }
1389 : else
1390 : {
1391 14 : iDstPixel = AverageFloatSSE2(
1392 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1393 : pDstScanline);
1394 : }
1395 : }
1396 : #endif
1397 :
1398 268 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1399 : {
1400 : T nVal;
1401 198 : if (bQuadraticMean)
1402 : {
1403 : // Cast to double to avoid overflows
1404 : // (using std::hypot() is much slower)
1405 100 : nVal = static_cast<T>(std::sqrt(
1406 : 0.25 *
1407 100 : (SQUARE<double>(pSrcScanlineShifted[0]) +
1408 100 : SQUARE<double>(pSrcScanlineShifted[1]) +
1409 100 : SQUARE<double>(
1410 200 : pSrcScanlineShifted[nChunkXSize]) +
1411 100 : SQUARE<double>(
1412 100 : pSrcScanlineShifted[1 + nChunkXSize]))));
1413 : }
1414 : else
1415 : {
1416 98 : nVal = static_cast<T>(
1417 98 : 0.25f * (pSrcScanlineShifted[0] +
1418 98 : pSrcScanlineShifted[1] +
1419 98 : pSrcScanlineShifted[nChunkXSize] +
1420 98 : pSrcScanlineShifted[1 + nChunkXSize]));
1421 : }
1422 :
1423 : // No need to compare nVal against tNoDataValue as we
1424 : // are in a case where pabyChunkNodataMask == nullptr
1425 : // implies the absence of nodata value.
1426 198 : pDstScanline[iDstPixel] = nVal;
1427 198 : pSrcScanlineShifted += 2;
1428 : }
1429 116510 : }
1430 : }
1431 : else
1432 : {
1433 19 : const double dfBottomWeight =
1434 625855 : (nSrcYOff + 1 == nSrcYOff2) ? 1.0
1435 625836 : : 1.0 - (dfSrcYOff - nSrcYOff);
1436 625855 : const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
1437 625855 : nSrcYOff -= nChunkYOff;
1438 625855 : nSrcYOff2 -= nChunkYOff;
1439 :
1440 625855 : double dfTotalWeightFullColumn = dfBottomWeight;
1441 625855 : if (nSrcYOff + 1 < nSrcYOff2)
1442 : {
1443 625836 : dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
1444 625836 : dfTotalWeightFullColumn += dfTopWeight;
1445 : }
1446 :
1447 18585856 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1448 : {
1449 17959981 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1450 17959981 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1451 :
1452 17959981 : double dfTotal = 0;
1453 17959981 : double dfTotalWeight = 0;
1454 17959981 : if (pabyChunkNodataMask == nullptr)
1455 : {
1456 1746435 : auto pChunkShifted =
1457 115 : pChunk +
1458 1746435 : static_cast<GPtrDiff_t>(nSrcYOff) * nChunkXSize;
1459 1746435 : int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1460 1746435 : double dfWeightY = dfBottomWeight;
1461 3493427 : while (true)
1462 : {
1463 : double dfTotalLine;
1464 5239852 : if (bQuadraticMean)
1465 : {
1466 : // Left pixel
1467 : {
1468 104 : const T val = pChunkShifted[nSrcXOff];
1469 104 : dfTotalLine =
1470 104 : SQUARE<double>(val) *
1471 104 : pasSrcX[iDstPixel].dfLeftWeight;
1472 : }
1473 :
1474 104 : if (nSrcXOff + 1 < nSrcXOff2)
1475 : {
1476 : // Middle pixels
1477 104 : for (int iX = nSrcXOff + 1;
1478 424 : iX + 1 < nSrcXOff2; ++iX)
1479 : {
1480 320 : const T val = pChunkShifted[iX];
1481 320 : dfTotalLine += SQUARE<double>(val);
1482 : }
1483 :
1484 : // Right pixel
1485 : {
1486 104 : const T val =
1487 104 : pChunkShifted[nSrcXOff2 - 1];
1488 104 : dfTotalLine +=
1489 104 : SQUARE<double>(val) *
1490 104 : pasSrcX[iDstPixel].dfRightWeight;
1491 : }
1492 : }
1493 : }
1494 : else
1495 : {
1496 : // Left pixel
1497 : {
1498 5239756 : const T val = pChunkShifted[nSrcXOff];
1499 5239756 : dfTotalLine =
1500 5239756 : val * pasSrcX[iDstPixel].dfLeftWeight;
1501 : }
1502 :
1503 5239756 : if (nSrcXOff + 1 < nSrcXOff2)
1504 : {
1505 : // Middle pixels
1506 4239330 : for (int iX = nSrcXOff + 1;
1507 64183126 : iX + 1 < nSrcXOff2; ++iX)
1508 : {
1509 59943836 : const T val = pChunkShifted[iX];
1510 59943836 : dfTotalLine += val;
1511 : }
1512 :
1513 : // Right pixel
1514 : {
1515 4239330 : const T val =
1516 4239330 : pChunkShifted[nSrcXOff2 - 1];
1517 4239330 : dfTotalLine +=
1518 4239330 : val *
1519 4239330 : pasSrcX[iDstPixel].dfRightWeight;
1520 : }
1521 : }
1522 : }
1523 :
1524 5239852 : dfTotal += dfTotalLine * dfWeightY;
1525 5239852 : --nCounterY;
1526 5239852 : if (nCounterY < 0)
1527 1746435 : break;
1528 3493427 : pChunkShifted += nChunkXSize;
1529 3493427 : dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
1530 : }
1531 :
1532 1746435 : dfTotalWeight =
1533 1746435 : pasSrcX[iDstPixel].dfTotalWeightFullLine *
1534 : dfTotalWeightFullColumn;
1535 : }
1536 : else
1537 : {
1538 16213566 : GPtrDiff_t nCount = 0;
1539 71190898 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1540 : {
1541 54977432 : const auto pChunkShifted =
1542 132 : pChunk +
1543 54977432 : static_cast<GPtrDiff_t>(iY) * nChunkXSize;
1544 :
1545 54977432 : double dfTotalLine = 0;
1546 54977432 : double dfTotalWeightLine = 0;
1547 : // Left pixel
1548 : {
1549 54977432 : const int iX = nSrcXOff;
1550 54977432 : const T val = pChunkShifted[iX];
1551 54977432 : if (pabyChunkNodataMask[iX + iY * nChunkXSize])
1552 : {
1553 23420081 : nCount++;
1554 23420081 : const double dfWeightX =
1555 23420081 : pasSrcX[iDstPixel].dfLeftWeight;
1556 23420081 : dfTotalWeightLine = dfWeightX;
1557 23420081 : if (bQuadraticMean)
1558 60 : dfTotalLine =
1559 60 : SQUARE<double>(val) * dfWeightX;
1560 : else
1561 23419981 : dfTotalLine = val * dfWeightX;
1562 : }
1563 : }
1564 :
1565 54977432 : if (nSrcXOff + 1 < nSrcXOff2)
1566 : {
1567 : // Middle pixels
1568 145172132 : for (int iX = nSrcXOff + 1; iX + 1 < nSrcXOff2;
1569 : ++iX)
1570 : {
1571 90195000 : const T val = pChunkShifted[iX];
1572 90195000 : if (pabyChunkNodataMask[iX +
1573 90195000 : iY * nChunkXSize])
1574 : {
1575 39728200 : nCount++;
1576 39728200 : dfTotalWeightLine += 1;
1577 39728200 : if (bQuadraticMean)
1578 0 : dfTotalLine += SQUARE<double>(val);
1579 : else
1580 39728200 : dfTotalLine += val;
1581 : }
1582 : }
1583 :
1584 : // Right pixel
1585 : {
1586 54977432 : const int iX = nSrcXOff2 - 1;
1587 54977432 : const T val = pChunkShifted[iX];
1588 54977432 : if (pabyChunkNodataMask[iX +
1589 54977432 : iY * nChunkXSize])
1590 : {
1591 23419247 : nCount++;
1592 23419247 : const double dfWeightX =
1593 23419247 : pasSrcX[iDstPixel].dfRightWeight;
1594 23419247 : dfTotalWeightLine += dfWeightX;
1595 23419247 : if (bQuadraticMean)
1596 65 : dfTotalLine +=
1597 61 : SQUARE<double>(val) * dfWeightX;
1598 : else
1599 23419246 : dfTotalLine += val * dfWeightX;
1600 : }
1601 : }
1602 : }
1603 :
1604 93741198 : const double dfWeightY =
1605 : (iY == nSrcYOff) ? dfBottomWeight
1606 38763866 : : (iY + 1 == nSrcYOff2) ? dfTopWeight
1607 : : 1.0;
1608 54977432 : dfTotal += dfTotalLine * dfWeightY;
1609 54977432 : dfTotalWeight += dfTotalWeightLine * dfWeightY;
1610 : }
1611 :
1612 16213566 : if (nCount == 0 ||
1613 8 : (bPropagateNoData &&
1614 : nCount <
1615 8 : static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
1616 8 : (nSrcXOff2 - nSrcXOff)))
1617 : {
1618 9461842 : pDstScanline[iDstPixel] = tNoDataValue;
1619 9461842 : continue;
1620 : }
1621 : }
1622 : if (eWrkDataType == GDT_Byte)
1623 : {
1624 : T nVal;
1625 8497990 : if (bQuadraticMean)
1626 38 : nVal = ComputeIntegerRMS<T, int>(dfTotal,
1627 : dfTotalWeight);
1628 : else
1629 8497950 : nVal =
1630 8497950 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1631 8497990 : if (bHasNoData && nVal == tNoDataValue)
1632 0 : nVal = tReplacementVal;
1633 8497990 : pDstScanline[iDstPixel] = nVal;
1634 : }
1635 : else if (eWrkDataType == GDT_UInt16)
1636 : {
1637 : T nVal;
1638 8 : if (bQuadraticMean)
1639 4 : nVal = ComputeIntegerRMS<T, uint64_t>(
1640 : dfTotal, dfTotalWeight);
1641 : else
1642 4 : nVal =
1643 4 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1644 8 : if (bHasNoData && nVal == tNoDataValue)
1645 0 : nVal = tReplacementVal;
1646 8 : pDstScanline[iDstPixel] = nVal;
1647 : }
1648 : else
1649 : {
1650 : T nVal;
1651 151 : if (bQuadraticMean)
1652 20 : nVal =
1653 25 : static_cast<T>(sqrt(dfTotal / dfTotalWeight));
1654 : else
1655 126 : nVal = static_cast<T>(dfTotal / dfTotalWeight);
1656 151 : if (bHasNoData && nVal == tNoDataValue)
1657 2 : nVal = tReplacementVal;
1658 151 : pDstScanline[iDstPixel] = nVal;
1659 : }
1660 : }
1661 : }
1662 : }
1663 : else
1664 : {
1665 115 : nSrcYOff -= nChunkYOff;
1666 115 : nSrcYOff2 -= nChunkYOff;
1667 :
1668 6589 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1669 : {
1670 6475 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1671 6475 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1672 :
1673 6475 : GPtrDiff_t nTotalR = 0;
1674 6475 : GPtrDiff_t nTotalG = 0;
1675 6475 : GPtrDiff_t nTotalB = 0;
1676 6475 : GPtrDiff_t nCount = 0;
1677 :
1678 19425 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1679 : {
1680 38850 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
1681 : {
1682 25900 : const T val = pChunk[iX + static_cast<GPtrDiff_t>(iY) *
1683 25900 : nChunkXSize];
1684 : // cppcheck-suppress unsignedLessThanZero
1685 25900 : if (val < 0 || val >= colorEntries.size())
1686 0 : continue;
1687 25900 : size_t idx = static_cast<size_t>(val);
1688 25900 : const auto &entry = colorEntries[idx];
1689 25900 : if (entry.c4)
1690 : {
1691 14128 : if (bQuadraticMean)
1692 : {
1693 800 : nTotalR += SQUARE<int>(entry.c1);
1694 800 : nTotalG += SQUARE<int>(entry.c2);
1695 800 : nTotalB += SQUARE<int>(entry.c3);
1696 800 : ++nCount;
1697 : }
1698 : else
1699 : {
1700 13328 : nTotalR += entry.c1;
1701 13328 : nTotalG += entry.c2;
1702 13328 : nTotalB += entry.c3;
1703 13328 : ++nCount;
1704 : }
1705 : }
1706 : }
1707 : }
1708 :
1709 6475 : if (nCount == 0 ||
1710 0 : (bPropagateNoData &&
1711 0 : nCount < static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
1712 0 : (nSrcXOff2 - nSrcXOff)))
1713 : {
1714 2838 : pDstScanline[iDstPixel] = tNoDataValue;
1715 : }
1716 : else
1717 : {
1718 : GDALColorEntry color;
1719 3637 : if (bQuadraticMean)
1720 : {
1721 200 : color.c1 =
1722 200 : static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
1723 200 : color.c2 =
1724 200 : static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
1725 200 : color.c3 =
1726 200 : static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
1727 : }
1728 : else
1729 : {
1730 3437 : color.c1 =
1731 3437 : static_cast<short>((nTotalR + nCount / 2) / nCount);
1732 3437 : color.c2 =
1733 3437 : static_cast<short>((nTotalG + nCount / 2) / nCount);
1734 3437 : color.c3 =
1735 3437 : static_cast<short>((nTotalB + nCount / 2) / nCount);
1736 : }
1737 3636 : pDstScanline[iDstPixel] =
1738 3637 : static_cast<T>(BestColorEntry(colorEntries, color));
1739 : }
1740 : }
1741 : }
1742 : }
1743 :
1744 10401 : CPLFree(pasSrcX);
1745 :
1746 10401 : return CE_None;
1747 : }
1748 :
1749 : static CPLErr
1750 10401 : GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
1751 : const void *pChunk, void **ppDstBuffer,
1752 : GDALDataType *peDstBufferDataType)
1753 : {
1754 10401 : *peDstBufferDataType = args.eWrkDataType;
1755 10401 : switch (args.eWrkDataType)
1756 : {
1757 10336 : case GDT_Byte:
1758 : {
1759 10336 : return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_Byte>(
1760 10335 : args, static_cast<const GByte *>(pChunk), ppDstBuffer);
1761 : }
1762 :
1763 9 : case GDT_UInt16:
1764 : {
1765 9 : if (EQUAL(args.pszResampling, "RMS"))
1766 : {
1767 : // Use double as accumulation type, because UInt32 could overflow
1768 : return GDALResampleChunk_AverageOrRMS_T<GUInt16, double,
1769 5 : GDT_UInt16>(
1770 5 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1771 : }
1772 : else
1773 : {
1774 : return GDALResampleChunk_AverageOrRMS_T<GUInt16, GUInt32,
1775 4 : GDT_UInt16>(
1776 4 : args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1777 : }
1778 : }
1779 :
1780 39 : case GDT_Float32:
1781 : {
1782 39 : return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32>(
1783 39 : args, static_cast<const float *>(pChunk), ppDstBuffer);
1784 : }
1785 :
1786 17 : case GDT_Float64:
1787 : {
1788 : return GDALResampleChunk_AverageOrRMS_T<double, double,
1789 17 : GDT_Float64>(
1790 17 : args, static_cast<const double *>(pChunk), ppDstBuffer);
1791 : }
1792 :
1793 0 : default:
1794 0 : break;
1795 : }
1796 :
1797 0 : CPLAssert(false);
1798 : return CE_Failure;
1799 : }
1800 :
1801 : /************************************************************************/
1802 : /* GDALResampleChunk_Gauss() */
1803 : /************************************************************************/
1804 :
1805 86 : static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
1806 : const void *pChunk, void **ppDstBuffer,
1807 : GDALDataType *peDstBufferDataType)
1808 :
1809 : {
1810 86 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1811 86 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1812 86 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1813 86 : const int nChunkXOff = args.nChunkXOff;
1814 86 : const int nChunkXSize = args.nChunkXSize;
1815 86 : const int nChunkYOff = args.nChunkYOff;
1816 86 : const int nChunkYSize = args.nChunkYSize;
1817 86 : const int nDstXOff = args.nDstXOff;
1818 86 : const int nDstXOff2 = args.nDstXOff2;
1819 86 : const int nDstYOff = args.nDstYOff;
1820 86 : const int nDstYOff2 = args.nDstYOff2;
1821 86 : const bool bHasNoData = args.bHasNoData;
1822 86 : double dfNoDataValue = args.dfNoDataValue;
1823 86 : const GDALColorTable *poColorTable = args.poColorTable;
1824 :
1825 86 : const double *const padfChunk = static_cast<const double *>(pChunk);
1826 :
1827 86 : *ppDstBuffer =
1828 86 : VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
1829 : GDALGetDataTypeSizeBytes(GDT_Float64));
1830 86 : if (*ppDstBuffer == nullptr)
1831 : {
1832 0 : return CE_Failure;
1833 : }
1834 86 : *peDstBufferDataType = GDT_Float64;
1835 86 : double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
1836 :
1837 : /* -------------------------------------------------------------------- */
1838 : /* Create the filter kernel and allocate scanline buffer. */
1839 : /* -------------------------------------------------------------------- */
1840 86 : int nGaussMatrixDim = 3;
1841 : const int *panGaussMatrix;
1842 86 : constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
1843 86 : constexpr int anGaussMatrix5x5[] = {1, 4, 6, 4, 1, 4, 16, 24, 16,
1844 : 4, 6, 24, 36, 24, 6, 4, 16, 24,
1845 : 16, 4, 1, 4, 6, 4, 1};
1846 86 : constexpr int anGaussMatrix7x7[] = {
1847 : 1, 6, 15, 20, 15, 6, 1, 6, 36, 90, 120, 90, 36,
1848 : 6, 15, 90, 225, 300, 225, 90, 15, 20, 120, 300, 400, 300,
1849 : 120, 20, 15, 90, 225, 300, 225, 90, 15, 6, 36, 90, 120,
1850 : 90, 36, 6, 1, 6, 15, 20, 15, 6, 1};
1851 :
1852 86 : const int nOXSize = args.nOvrXSize;
1853 86 : const int nOYSize = args.nOvrYSize;
1854 86 : const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
1855 :
1856 : // matrix for gauss filter
1857 86 : if (nResYFactor <= 2)
1858 : {
1859 85 : panGaussMatrix = anGaussMatrix3x3;
1860 85 : nGaussMatrixDim = 3;
1861 : }
1862 1 : else if (nResYFactor <= 4)
1863 : {
1864 0 : panGaussMatrix = anGaussMatrix5x5;
1865 0 : nGaussMatrixDim = 5;
1866 : }
1867 : else
1868 : {
1869 1 : panGaussMatrix = anGaussMatrix7x7;
1870 1 : nGaussMatrixDim = 7;
1871 : }
1872 :
1873 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
1874 : int *panGaussMatrixDup = static_cast<int *>(
1875 : CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
1876 : memcpy(panGaussMatrixDup, panGaussMatrix,
1877 : sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
1878 : panGaussMatrix = panGaussMatrixDup;
1879 : #endif
1880 :
1881 86 : if (!bHasNoData)
1882 79 : dfNoDataValue = 0.0;
1883 :
1884 86 : std::vector<GDALColorEntry> colorEntries;
1885 86 : int nTransparentIdx = -1;
1886 86 : if (poColorTable)
1887 2 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1888 :
1889 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1890 : // it as nodata value.
1891 92 : if (bHasNoData && dfNoDataValue >= 0.0f &&
1892 6 : dfNoDataValue < colorEntries.size())
1893 0 : colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
1894 :
1895 : // Or if we have no explicit nodata, but a color table entry that is
1896 : // transparent, consider it as the nodata value.
1897 86 : else if (!bHasNoData && nTransparentIdx >= 0)
1898 : {
1899 0 : dfNoDataValue = nTransparentIdx;
1900 : }
1901 :
1902 86 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
1903 86 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1904 86 : const int nDstXWidth = nDstXOff2 - nDstXOff;
1905 :
1906 : /* ==================================================================== */
1907 : /* Loop over destination scanlines. */
1908 : /* ==================================================================== */
1909 16488 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1910 : {
1911 16402 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
1912 16402 : int nSrcYOff2 =
1913 16402 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
1914 :
1915 16402 : if (nSrcYOff < nChunkYOff)
1916 : {
1917 0 : nSrcYOff = nChunkYOff;
1918 0 : nSrcYOff2++;
1919 : }
1920 :
1921 16402 : const int iSizeY = nSrcYOff2 - nSrcYOff;
1922 16402 : nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
1923 16402 : nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
1924 :
1925 16402 : if (nSrcYOff2 > nChunkBottomYOff ||
1926 16359 : (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
1927 : {
1928 44 : nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
1929 : }
1930 :
1931 16402 : int nYShiftGaussMatrix = 0;
1932 16402 : if (nSrcYOff < nChunkYOff)
1933 : {
1934 0 : nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
1935 0 : nSrcYOff = nChunkYOff;
1936 : }
1937 :
1938 16402 : const double *const padfSrcScanline =
1939 16402 : padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
1940 16402 : const GByte *pabySrcScanlineNodataMask = nullptr;
1941 16402 : if (pabyChunkNodataMask != nullptr)
1942 152 : pabySrcScanlineNodataMask =
1943 152 : pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
1944 :
1945 : /* --------------------------------------------------------------------
1946 : */
1947 : /* Loop over destination pixels */
1948 : /* --------------------------------------------------------------------
1949 : */
1950 16402 : double *const padfDstScanline =
1951 16402 : padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
1952 4149980 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1953 : {
1954 4133580 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
1955 4133580 : int nSrcXOff2 =
1956 4133580 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
1957 :
1958 4133580 : if (nSrcXOff < nChunkXOff)
1959 : {
1960 0 : nSrcXOff = nChunkXOff;
1961 0 : nSrcXOff2++;
1962 : }
1963 :
1964 4133580 : const int iSizeX = nSrcXOff2 - nSrcXOff;
1965 4133580 : nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
1966 4133580 : nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
1967 :
1968 4133580 : if (nSrcXOff2 > nChunkRightXOff ||
1969 4127930 : (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
1970 : {
1971 5650 : nSrcXOff2 =
1972 5650 : std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
1973 : }
1974 :
1975 4133580 : int nXShiftGaussMatrix = 0;
1976 4133580 : if (nSrcXOff < nChunkXOff)
1977 : {
1978 0 : nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
1979 0 : nSrcXOff = nChunkXOff;
1980 : }
1981 :
1982 4133580 : if (poColorTable == nullptr)
1983 : {
1984 4133380 : double dfTotal = 0.0;
1985 4133380 : GInt64 nCount = 0;
1986 4133380 : const int *panLineWeight =
1987 4133380 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
1988 : nXShiftGaussMatrix;
1989 :
1990 16527900 : for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
1991 12394500 : ++iY, ++j, panLineWeight += nGaussMatrixDim)
1992 : {
1993 49561300 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
1994 : {
1995 37166800 : const double val =
1996 37166800 : padfSrcScanline[iX - nChunkXOff +
1997 37166800 : static_cast<GPtrDiff_t>(iY -
1998 37166800 : nSrcYOff) *
1999 37166800 : nChunkXSize];
2000 37166800 : if (pabySrcScanlineNodataMask == nullptr ||
2001 32872 : pabySrcScanlineNodataMask[iX - nChunkXOff +
2002 32872 : static_cast<GPtrDiff_t>(
2003 32872 : iY - nSrcYOff) *
2004 32872 : nChunkXSize])
2005 : {
2006 37146100 : const int nWeight = panLineWeight[i];
2007 37146100 : dfTotal += val * nWeight;
2008 37146100 : nCount += nWeight;
2009 : }
2010 : }
2011 : }
2012 :
2013 4133380 : if (nCount == 0)
2014 : {
2015 2217 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2016 : }
2017 : else
2018 : {
2019 4131160 : padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
2020 : }
2021 : }
2022 : else
2023 : {
2024 200 : GInt64 nTotalR = 0;
2025 200 : GInt64 nTotalG = 0;
2026 200 : GInt64 nTotalB = 0;
2027 200 : GInt64 nTotalWeight = 0;
2028 200 : const int *panLineWeight =
2029 200 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
2030 : nXShiftGaussMatrix;
2031 :
2032 780 : for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
2033 580 : ++iY, ++j, panLineWeight += nGaussMatrixDim)
2034 : {
2035 2262 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2036 : {
2037 1682 : const double val =
2038 1682 : padfSrcScanline[iX - nChunkXOff +
2039 1682 : static_cast<GPtrDiff_t>(iY -
2040 1682 : nSrcYOff) *
2041 1682 : nChunkXSize];
2042 1682 : if (val < 0 || val >= colorEntries.size())
2043 0 : continue;
2044 :
2045 1682 : size_t idx = static_cast<size_t>(val);
2046 1682 : if (colorEntries[idx].c4)
2047 : {
2048 1682 : const int nWeight = panLineWeight[i];
2049 1682 : nTotalR +=
2050 1682 : static_cast<GInt64>(colorEntries[idx].c1) *
2051 1682 : nWeight;
2052 1682 : nTotalG +=
2053 1682 : static_cast<GInt64>(colorEntries[idx].c2) *
2054 1682 : nWeight;
2055 1682 : nTotalB +=
2056 1682 : static_cast<GInt64>(colorEntries[idx].c3) *
2057 1682 : nWeight;
2058 1682 : nTotalWeight += nWeight;
2059 : }
2060 : }
2061 : }
2062 :
2063 200 : if (nTotalWeight == 0)
2064 : {
2065 0 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2066 : }
2067 : else
2068 : {
2069 : GDALColorEntry color;
2070 :
2071 200 : color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
2072 : nTotalWeight);
2073 200 : color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
2074 : nTotalWeight);
2075 200 : color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
2076 : nTotalWeight);
2077 200 : padfDstScanline[iDstPixel - nDstXOff] =
2078 200 : BestColorEntry(colorEntries, color);
2079 : }
2080 : }
2081 : }
2082 : }
2083 :
2084 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
2085 : CPLFree(panGaussMatrixDup);
2086 : #endif
2087 :
2088 86 : return CE_None;
2089 : }
2090 :
2091 : /************************************************************************/
2092 : /* GDALResampleChunk_Mode() */
2093 : /************************************************************************/
2094 :
/** Equality predicate used to group identical values.
 *
 * The generic version is plain operator==. The floating-point and complex
 * specializations below additionally treat NaN as equal to NaN, so that NaN
 * values compare as the same value.
 */
template <class T> static inline bool IsSame(T a, T b)
{
    const bool bEqual = (a == b);
    return bEqual;
}

/** float: equal values, or both NaN. */
template <> bool IsSame<float>(float a, float b)
{
    if (a == b)
        return true;
    return std::isnan(a) && std::isnan(b);
}

/** double: equal values, or both NaN. */
template <> bool IsSame<double>(double a, double b)
{
    if (a == b)
        return true;
    return std::isnan(a) && std::isnan(b);
}

/** complex<float>: equal values, or all four components NaN. */
template <>
bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
{
    if (a == b)
        return true;
    const bool bLeftAllNaN = std::isnan(a.real()) && std::isnan(a.imag());
    const bool bRightAllNaN = std::isnan(b.real()) && std::isnan(b.imag());
    return bLeftAllNaN && bRightAllNaN;
}

/** complex<double>: equal values, or all four components NaN. */
template <>
bool IsSame<std::complex<double>>(std::complex<double> a,
                                  std::complex<double> b)
{
    if (a == b)
        return true;
    const bool bLeftAllNaN = std::isnan(a.real()) && std::isnan(a.imag());
    const bool bRightAllNaN = std::isnan(b.real()) && std::isnan(b.imag());
    return bLeftAllNaN && bRightAllNaN;
}
2124 :
2125 : template <class T>
2126 136 : static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
2127 : const T *pChunk, T *const pDstBuffer)
2128 :
2129 : {
2130 136 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2131 136 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2132 136 : const double dfSrcXDelta = args.dfSrcXDelta;
2133 136 : const double dfSrcYDelta = args.dfSrcYDelta;
2134 136 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2135 136 : const int nChunkXOff = args.nChunkXOff;
2136 136 : const int nChunkXSize = args.nChunkXSize;
2137 136 : const int nChunkYOff = args.nChunkYOff;
2138 136 : const int nChunkYSize = args.nChunkYSize;
2139 136 : const int nDstXOff = args.nDstXOff;
2140 136 : const int nDstXOff2 = args.nDstXOff2;
2141 136 : const int nDstYOff = args.nDstYOff;
2142 136 : const int nDstYOff2 = args.nDstYOff2;
2143 136 : const bool bHasNoData = args.bHasNoData;
2144 136 : const GDALColorTable *poColorTable = args.poColorTable;
2145 136 : const int nDstXSize = nDstXOff2 - nDstXOff;
2146 :
2147 8 : T tNoDataValue;
2148 : if constexpr (std::is_same<T, std::complex<float>>::value ||
2149 : std::is_same<T, std::complex<double>>::value)
2150 : {
2151 : using BaseT = typename T::value_type;
2152 8 : tNoDataValue =
2153 : std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
2154 : std::numeric_limits<BaseT>::quiet_NaN());
2155 : }
2156 128 : else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
2157 127 : tNoDataValue = 0;
2158 : else
2159 1 : tNoDataValue = static_cast<T>(args.dfNoDataValue);
2160 :
2161 136 : size_t nMaxNumPx = 0;
2162 136 : T *paVals = nullptr;
2163 136 : int *panSums = nullptr;
2164 :
2165 136 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2166 136 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2167 272 : std::vector<int> anVals(256, 0);
2168 :
2169 : /* ==================================================================== */
2170 : /* Loop over destination scanlines. */
2171 : /* ==================================================================== */
2172 7531 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2173 : {
2174 7395 : double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
2175 7395 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
2176 : #ifdef only_pixels_with_more_than_10_pct_participation
2177 : // When oversampling, don't take into account pixels that have a tiny
2178 : // participation in the resulting pixel
2179 : if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2180 : nSrcYOff < nChunkBottomYOff)
2181 : nSrcYOff++;
2182 : #endif
2183 7395 : if (nSrcYOff < nChunkYOff)
2184 0 : nSrcYOff = nChunkYOff;
2185 :
2186 7395 : double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
2187 7395 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
2188 : #ifdef only_pixels_with_more_than_10_pct_participation
2189 : // When oversampling, don't take into account pixels that have a tiny
2190 : // participation in the resulting pixel
2191 : if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2192 : nSrcYOff2 > nChunkYOff)
2193 : nSrcYOff2--;
2194 : #endif
2195 7395 : if (nSrcYOff2 == nSrcYOff)
2196 0 : ++nSrcYOff2;
2197 7395 : if (nSrcYOff2 > nChunkBottomYOff)
2198 0 : nSrcYOff2 = nChunkBottomYOff;
2199 :
2200 7395 : const T *const paSrcScanline =
2201 149 : pChunk +
2202 7395 : (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
2203 7395 : const GByte *pabySrcScanlineNodataMask = nullptr;
2204 7395 : if (pabyChunkNodataMask != nullptr)
2205 1810 : pabySrcScanlineNodataMask =
2206 : pabyChunkNodataMask +
2207 1810 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
2208 :
2209 7395 : T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
2210 : /* --------------------------------------------------------------------
2211 : */
2212 : /* Loop over destination pixels */
2213 : /* --------------------------------------------------------------------
2214 : */
2215 4259580 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2216 : {
2217 4252187 : double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
2218 : // Apply some epsilon to avoid numerical precision issues
2219 4252187 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
2220 : #ifdef only_pixels_with_more_than_10_pct_participation
2221 : // When oversampling, don't take into account pixels that have a
2222 : // tiny participation in the resulting pixel
2223 : if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2224 : nSrcXOff < nChunkRightXOff)
2225 : nSrcXOff++;
2226 : #endif
2227 4252187 : if (nSrcXOff < nChunkXOff)
2228 0 : nSrcXOff = nChunkXOff;
2229 :
2230 4252187 : double dfSrcXOff2 =
2231 4252187 : dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
2232 4252187 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
2233 : #ifdef only_pixels_with_more_than_10_pct_participation
2234 : // When oversampling, don't take into account pixels that have a
2235 : // tiny participation in the resulting pixel
2236 : if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2237 : nSrcXOff2 > nChunkXOff)
2238 : nSrcXOff2--;
2239 : #endif
2240 4252187 : if (nSrcXOff2 == nSrcXOff)
2241 0 : nSrcXOff2++;
2242 4252187 : if (nSrcXOff2 > nChunkRightXOff)
2243 0 : nSrcXOff2 = nChunkRightXOff;
2244 :
2245 4252187 : bool bRegularProcessing = false;
2246 : if constexpr (!std::is_same<T, GByte>::value)
2247 827 : bRegularProcessing = true;
2248 4251360 : else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
2249 0 : bRegularProcessing = true;
2250 :
2251 4252187 : if (bRegularProcessing)
2252 : {
2253 : // Not sure how much sense it makes to run a majority
2254 : // filter on floating point data, but here it is for the sake
2255 : // of compatibility. It won't look right on RGB images by the
2256 : // nature of the filter.
2257 :
2258 827 : if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
2259 2481 : nSrcYOff2 - nSrcYOff > INT_MAX / (nSrcXOff2 - nSrcXOff) ||
2260 827 : static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
2261 827 : static_cast<size_t>(nSrcXOff2 - nSrcXOff) >
2262 827 : std::numeric_limits<size_t>::max() / sizeof(float))
2263 : {
2264 0 : CPLError(CE_Failure, CPLE_NotSupported,
2265 : "Too big downsampling factor");
2266 0 : CPLFree(paVals);
2267 0 : CPLFree(panSums);
2268 0 : return CE_Failure;
2269 : }
2270 827 : const size_t nNumPx =
2271 827 : static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
2272 827 : static_cast<size_t>(nSrcXOff2 - nSrcXOff);
2273 827 : size_t iMaxInd = 0;
2274 827 : size_t iMaxVal = 0;
2275 827 : bool biMaxValdValid = false;
2276 :
2277 827 : if (paVals == nullptr || nNumPx > nMaxNumPx)
2278 : {
2279 : T *paValsNew = static_cast<T *>(
2280 71 : VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
2281 : int *panSumsNew = static_cast<int *>(
2282 71 : VSI_REALLOC_VERBOSE(panSums, nNumPx * sizeof(int)));
2283 71 : if (paValsNew != nullptr)
2284 71 : paVals = paValsNew;
2285 71 : if (panSumsNew != nullptr)
2286 71 : panSums = panSumsNew;
2287 71 : if (paValsNew == nullptr || panSumsNew == nullptr)
2288 : {
2289 0 : CPLFree(paVals);
2290 0 : CPLFree(panSums);
2291 0 : return CE_Failure;
2292 : }
2293 71 : nMaxNumPx = nNumPx;
2294 : }
2295 :
2296 2585 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2297 : {
2298 1758 : const GPtrDiff_t iTotYOff =
2299 1758 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2300 1758 : nChunkXOff;
2301 5690 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2302 : {
2303 3932 : if (pabySrcScanlineNodataMask == nullptr ||
2304 16 : pabySrcScanlineNodataMask[iX + iTotYOff])
2305 : {
2306 3917 : const T val = paSrcScanline[iX + iTotYOff];
2307 3917 : size_t i = 0; // Used after for.
2308 :
2309 : // Check array for existing entry.
2310 14387 : for (; i < iMaxInd; ++i)
2311 17626 : if (IsSame(paVals[i], val) &&
2312 6910 : ++panSums[i] > panSums[iMaxVal])
2313 : {
2314 246 : iMaxVal = i;
2315 246 : biMaxValdValid = true;
2316 246 : break;
2317 : }
2318 :
2319 : // Add to arr if entry not already there.
2320 3917 : if (i == iMaxInd)
2321 : {
2322 3671 : paVals[iMaxInd] = val;
2323 3671 : panSums[iMaxInd] = 1;
2324 :
2325 3671 : if (!biMaxValdValid)
2326 : {
2327 824 : iMaxVal = iMaxInd;
2328 824 : biMaxValdValid = true;
2329 : }
2330 :
2331 3671 : ++iMaxInd;
2332 : }
2333 : }
2334 : }
2335 : }
2336 :
2337 827 : if (!biMaxValdValid)
2338 3 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2339 : else
2340 824 : paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
2341 : }
2342 : else if constexpr (std::is_same<T, GByte>::value)
2343 : // ( eSrcDataType == GDT_Byte && nEntryCount < 256 )
2344 : {
2345 : // So we go here for a paletted or non-paletted byte band.
2346 : // The input values are then between 0 and 255.
2347 4251360 : int nMaxVal = 0;
2348 4251360 : int iMaxInd = -1;
2349 :
2350 : // The cost of this zeroing might be high. Perhaps we should
2351 : // just use the above generic case, and go to this one if the
2352 : // number of source pixels is large enough
2353 4251360 : std::fill(anVals.begin(), anVals.end(), 0);
2354 :
2355 12777700 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2356 : {
2357 8526370 : const GPtrDiff_t iTotYOff =
2358 8526370 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2359 8526370 : nChunkXOff;
2360 25649400 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2361 : {
2362 17123000 : const T val = paSrcScanline[iX + iTotYOff];
2363 17123000 : if (!bHasNoData || val != tNoDataValue)
2364 : {
2365 17123000 : int nVal = static_cast<int>(val);
2366 17123000 : if (++anVals[nVal] > nMaxVal)
2367 : {
2368 : // Sum the density.
2369 : // Is it the most common value so far?
2370 17006300 : iMaxInd = nVal;
2371 17006300 : nMaxVal = anVals[nVal];
2372 : }
2373 : }
2374 : }
2375 : }
2376 :
2377 4251360 : if (iMaxInd == -1)
2378 0 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2379 : else
2380 4251360 : paDstScanline[iDstPixel - nDstXOff] =
2381 : static_cast<T>(iMaxInd);
2382 : }
2383 : }
2384 : }
2385 :
2386 136 : CPLFree(paVals);
2387 136 : CPLFree(panSums);
2388 :
2389 136 : return CE_None;
2390 : }
2391 :
2392 136 : static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
2393 : const void *pChunk, void **ppDstBuffer,
2394 : GDALDataType *peDstBufferDataType)
2395 : {
2396 136 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
2397 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
2398 : GDALGetDataTypeSizeBytes(args.eWrkDataType));
2399 136 : if (*ppDstBuffer == nullptr)
2400 : {
2401 0 : return CE_Failure;
2402 : }
2403 :
2404 136 : CPLAssert(args.eSrcDataType == args.eWrkDataType);
2405 :
2406 136 : *peDstBufferDataType = args.eWrkDataType;
2407 136 : switch (args.eWrkDataType)
2408 : {
2409 : // For mode resampling, as no computation is done, only the
2410 : // size of the data type matters... except for Byte where we have
2411 : // special processing. And for floating point values
2412 65 : case GDT_Byte:
2413 : {
2414 65 : return GDALResampleChunk_ModeT(args,
2415 : static_cast<const GByte *>(pChunk),
2416 65 : static_cast<GByte *>(*ppDstBuffer));
2417 : }
2418 :
2419 4 : case GDT_Int8:
2420 : {
2421 4 : return GDALResampleChunk_ModeT(args,
2422 : static_cast<const int8_t *>(pChunk),
2423 4 : static_cast<int8_t *>(*ppDstBuffer));
2424 : }
2425 :
2426 9 : case GDT_Int16:
2427 : case GDT_UInt16:
2428 : case GDT_Float16:
2429 : {
2430 9 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
2431 9 : return GDALResampleChunk_ModeT(
2432 : args, static_cast<const uint16_t *>(pChunk),
2433 9 : static_cast<uint16_t *>(*ppDstBuffer));
2434 : }
2435 :
2436 15 : case GDT_CInt16:
2437 : case GDT_CFloat16:
2438 : case GDT_Int32:
2439 : case GDT_UInt32:
2440 : {
2441 15 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
2442 15 : return GDALResampleChunk_ModeT(
2443 : args, static_cast<const uint32_t *>(pChunk),
2444 15 : static_cast<uint32_t *>(*ppDstBuffer));
2445 : }
2446 :
2447 17 : case GDT_Float32:
2448 : {
2449 17 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
2450 17 : return GDALResampleChunk_ModeT(args,
2451 : static_cast<const float *>(pChunk),
2452 17 : static_cast<float *>(*ppDstBuffer));
2453 : }
2454 :
2455 12 : case GDT_CInt32:
2456 : case GDT_Int64:
2457 : case GDT_UInt64:
2458 : {
2459 12 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
2460 12 : return GDALResampleChunk_ModeT(
2461 : args, static_cast<const uint64_t *>(pChunk),
2462 12 : static_cast<uint64_t *>(*ppDstBuffer));
2463 : }
2464 :
2465 6 : case GDT_Float64:
2466 : {
2467 6 : CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
2468 6 : return GDALResampleChunk_ModeT(args,
2469 : static_cast<const double *>(pChunk),
2470 6 : static_cast<double *>(*ppDstBuffer));
2471 : }
2472 :
2473 4 : case GDT_CFloat32:
2474 : {
2475 4 : return GDALResampleChunk_ModeT(
2476 : args, static_cast<const std::complex<float> *>(pChunk),
2477 4 : static_cast<std::complex<float> *>(*ppDstBuffer));
2478 : }
2479 :
2480 4 : case GDT_CFloat64:
2481 : {
2482 4 : return GDALResampleChunk_ModeT(
2483 : args, static_cast<const std::complex<double> *>(pChunk),
2484 4 : static_cast<std::complex<double> *>(*ppDstBuffer));
2485 : }
2486 :
2487 0 : case GDT_Unknown:
2488 : case GDT_TypeCount:
2489 0 : break;
2490 : }
2491 :
2492 0 : CPLAssert(false);
2493 : return CE_Failure;
2494 : }
2495 :
2496 : /************************************************************************/
2497 : /* GDALResampleConvolutionHorizontal() */
2498 : /************************************************************************/
2499 :
/** Weighted sum of nSrcPixelCount consecutive values:
 * sum(pChunk[k] * padfWeights[k]) for k in [0, nSrcPixelCount).
 *
 * Groups of 4 values are accumulated in two independent partial sums (the
 * first two products in one, the last two in the other); leftover values go
 * to the first partial sum. The result is the sum of both.
 *
 * @param pChunk         source values.
 * @param padfWeights    one weight per source value.
 * @param nSrcPixelCount number of values/weights to combine.
 * @return the weighted sum (0.0 when nSrcPixelCount <= 0).
 */
template <class T>
static inline double
GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                  int nSrcPixelCount)
{
    double dfAccFirstPair = 0.0;
    double dfAccSecondPair = 0.0;
    int iPx = 0;  // Shared by the unrolled and the tail loops.
    // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    // manually (untypical) unrolled loop in -O2 and -O3:
    // https://github.com/OSGeo/gdal/issues/9508
#if !defined(__INTEL_CLANG_COMPILER)
    while (iPx + 3 < nSrcPixelCount)
    {
        const T *const pSrc = pChunk + iPx;
        const double *const pWeight = padfWeights + iPx;
        dfAccFirstPair += pSrc[0] * pWeight[0];
        dfAccFirstPair += pSrc[1] * pWeight[1];
        dfAccSecondPair += pSrc[2] * pWeight[2];
        dfAccSecondPair += pSrc[3] * pWeight[3];
        iPx += 4;
    }
#endif
    // Remaining values (all of them when the unrolled loop is disabled).
    while (iPx < nSrcPixelCount)
    {
        dfAccFirstPair += pChunk[iPx] * padfWeights[iPx];
        ++iPx;
    }
    return dfAccFirstPair + dfAccSecondPair;
}
2526 :
2527 : template <class T>
2528 48 : static inline void GDALResampleConvolutionHorizontalWithMask(
2529 : const T *pChunk, const GByte *pabyMask, const double *padfWeights,
2530 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2531 : {
2532 48 : dfVal = 0;
2533 48 : dfWeightSum = 0;
2534 48 : int i = 0;
2535 48 : for (; i + 3 < nSrcPixelCount; i += 4)
2536 : {
2537 0 : const double dfWeight0 = padfWeights[i] * pabyMask[i];
2538 0 : const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
2539 0 : const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
2540 0 : const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
2541 0 : dfVal += pChunk[i] * dfWeight0;
2542 0 : dfVal += pChunk[i + 1] * dfWeight1;
2543 0 : dfVal += pChunk[i + 2] * dfWeight2;
2544 0 : dfVal += pChunk[i + 3] * dfWeight3;
2545 0 : dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
2546 : }
2547 178 : for (; i < nSrcPixelCount; ++i)
2548 : {
2549 130 : const double dfWeight = padfWeights[i] * pabyMask[i];
2550 130 : dfVal += pChunk[i] * dfWeight;
2551 130 : dfWeightSum += dfWeight;
2552 : }
2553 48 : }
2554 :
/** Weighted sum of nSrcPixelCount consecutive values, computed for three
 * rows at once with the same weights.
 *
 * For each row, groups of 4 values are accumulated in two independent
 * partial sums (first two products / last two products); leftover values go
 * to the first partial sum of the row.
 *
 * @param pChunkRow1     values of the first row.
 * @param pChunkRow2     values of the second row.
 * @param pChunkRow3     values of the third row.
 * @param padfWeights    one weight per column, shared by the three rows.
 * @param nSrcPixelCount number of columns to combine.
 * @param[out] dfRes1    weighted sum for the first row.
 * @param[out] dfRes2    weighted sum for the second row.
 * @param[out] dfRes3    weighted sum for the third row.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontal_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    // Two partial sums per row: "A" for columns 0-1 of each group of 4 (and
    // for the tail), "B" for columns 2-3.
    double dfRow1A = 0.0;
    double dfRow1B = 0.0;
    double dfRow2A = 0.0;
    double dfRow2B = 0.0;
    double dfRow3A = 0.0;
    double dfRow3B = 0.0;
    int iCol = 0;  // Shared by the unrolled and the tail loops.
    while (iCol + 3 < nSrcPixelCount)
    {
        const double dfW0 = padfWeights[iCol];
        const double dfW1 = padfWeights[iCol + 1];
        const double dfW2 = padfWeights[iCol + 2];
        const double dfW3 = padfWeights[iCol + 3];

        dfRow1A += pChunkRow1[iCol] * dfW0;
        dfRow1A += pChunkRow1[iCol + 1] * dfW1;
        dfRow1B += pChunkRow1[iCol + 2] * dfW2;
        dfRow1B += pChunkRow1[iCol + 3] * dfW3;

        dfRow2A += pChunkRow2[iCol] * dfW0;
        dfRow2A += pChunkRow2[iCol + 1] * dfW1;
        dfRow2B += pChunkRow2[iCol + 2] * dfW2;
        dfRow2B += pChunkRow2[iCol + 3] * dfW3;

        dfRow3A += pChunkRow3[iCol] * dfW0;
        dfRow3A += pChunkRow3[iCol + 1] * dfW1;
        dfRow3B += pChunkRow3[iCol + 2] * dfW2;
        dfRow3B += pChunkRow3[iCol + 3] * dfW3;

        iCol += 4;
    }
    // Remaining columns.
    while (iCol < nSrcPixelCount)
    {
        const double dfW = padfWeights[iCol];
        dfRow1A += pChunkRow1[iCol] * dfW;
        dfRow2A += pChunkRow2[iCol] * dfW;
        dfRow3A += pChunkRow3[iCol] * dfW;
        ++iCol;
    }
    dfRes1 = dfRow1A + dfRow1B;
    dfRes2 = dfRow2A + dfRow2B;
    dfRes3 = dfRow3A + dfRow3B;
}
2593 :
2594 : template <class T>
2595 18188 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
2596 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2597 : const double *padfWeights, int nSrcPixelCount, double &dfRes1,
2598 : double &dfRes2, double &dfRes3)
2599 : {
2600 18188 : GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
2601 : padfWeights, nSrcPixelCount, dfRes1,
2602 : dfRes2, dfRes3);
2603 18188 : }
2604 :
2605 : template <class T>
2606 1247346 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
2607 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2608 : const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
2609 : {
2610 1247346 : GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
2611 : padfWeights, 4, dfRes1, dfRes2,
2612 : dfRes3);
2613 1247346 : }
2614 :
2615 : /************************************************************************/
2616 : /* GDALResampleConvolutionVertical() */
2617 : /************************************************************************/
2618 :
// Weighted sum of one column of values.
//
// pChunk        : first value of the column.
// nStride       : distance, in elements of T, between consecutive rows.
// padfWeights   : one weight per row (nSrcLineCount entries are read).
// nSrcLineCount : number of rows to accumulate.
//
// Returns sum(pChunk[k * nStride] * padfWeights[k]) for k in
// [0, nSrcLineCount). Rows are processed four at a time using two partial
// sums that are added together at the end; leftover rows go into the first
// partial sum.
template <class T>
static inline double
GDALResampleConvolutionVertical(const T *pChunk, int nStride,
                                const double *padfWeights, int nSrcLineCount)
{
    // Element offsets are computed in a pointer-sized signed type: with an
    // 'int' offset, '4 * nStride' and the running offset could overflow
    // (undefined behaviour) for very wide strides.
    const std::ptrdiff_t nRowStride = nStride;
    double dfVal1 = 0.0;
    double dfVal2 = 0.0;
    int i = 0;
    std::ptrdiff_t j = 0;
    for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nRowStride)
    {
        dfVal1 += pChunk[j] * padfWeights[i];
        dfVal1 += pChunk[j + nRowStride] * padfWeights[i + 1];
        dfVal2 += pChunk[j + 2 * nRowStride] * padfWeights[i + 2];
        dfVal2 += pChunk[j + 3 * nRowStride] * padfWeights[i + 3];
    }
    // Remaining (fewer than four) rows.
    for (; i < nSrcLineCount; ++i, j += nRowStride)
    {
        dfVal1 += pChunk[j] * padfWeights[i];
    }
    return dfVal1 + dfVal2;
}
2641 :
// Weighted sums of two adjacent columns, computed in a single pass.
//
// pChunk        : first value of the first column; the second column is the
//                 element immediately following it in each row.
// nStride       : distance, in elements of T, between consecutive rows.
// padfWeights   : one weight per row (nSrcLineCount entries are read).
// nSrcLineCount : number of rows to accumulate.
// dfRes1        : receives the weighted sum of the first column.
// dfRes2        : receives the weighted sum of the second column.
//
// Each column uses two partial sums over blocks of four rows, added together
// at the end; leftover rows go into the first partial sum of each column.
template <class T>
static inline void GDALResampleConvolutionVertical_2cols(
    const T *pChunk, int nStride, const double *padfWeights, int nSrcLineCount,
    double &dfRes1, double &dfRes2)
{
    // Element offsets are computed in a pointer-sized signed type: with an
    // 'int' offset, '4 * nStride' and the running offset could overflow
    // (undefined behaviour) for very wide strides.
    const std::ptrdiff_t nRowStride = nStride;
    double dfVal1 = 0.0;
    double dfVal2 = 0.0;
    double dfVal3 = 0.0;
    double dfVal4 = 0.0;
    int i = 0;
    std::ptrdiff_t j = 0;
    for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nRowStride)
    {
        dfVal1 += pChunk[j] * padfWeights[i];
        dfVal3 += pChunk[j + 1] * padfWeights[i];
        dfVal1 += pChunk[j + nRowStride] * padfWeights[i + 1];
        dfVal3 += pChunk[j + 1 + nRowStride] * padfWeights[i + 1];
        dfVal2 += pChunk[j + 2 * nRowStride] * padfWeights[i + 2];
        dfVal4 += pChunk[j + 1 + 2 * nRowStride] * padfWeights[i + 2];
        dfVal2 += pChunk[j + 3 * nRowStride] * padfWeights[i + 3];
        dfVal4 += pChunk[j + 1 + 3 * nRowStride] * padfWeights[i + 3];
    }
    // Remaining (fewer than four) rows.
    for (; i < nSrcLineCount; ++i, j += nRowStride)
    {
        dfVal1 += pChunk[j] * padfWeights[i];
        dfVal3 += pChunk[j + 1] * padfWeights[i];
    }
    dfRes1 = dfVal1 + dfVal2;
    dfRes2 = dfVal3 + dfVal4;
}
2672 :
2673 : #ifdef USE_SSE2
2674 :
2675 : #ifdef __AVX__
2676 : /************************************************************************/
2677 : /* GDALResampleConvolutionVertical_16cols<T> */
2678 : /************************************************************************/
2679 :
2680 : template <class T>
2681 : static inline void
2682 : GDALResampleConvolutionVertical_16cols(const T *pChunk, int nStride,
2683 : const double *padfWeights,
2684 : int nSrcLineCount, float *afDest)
2685 : {
2686 : int i = 0;
2687 : int j = 0;
2688 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2689 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2690 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2691 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2692 : for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
2693 : {
2694 : XMMReg4Double w0 =
2695 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2696 : XMMReg4Double w1 =
2697 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2698 : XMMReg4Double w2 =
2699 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2700 : XMMReg4Double w3 =
2701 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2702 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2703 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2704 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
2705 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
2706 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2707 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2708 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
2709 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
2710 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2711 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2712 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
2713 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
2714 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2715 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2716 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
2717 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
2718 : }
2719 : for (; i < nSrcLineCount; ++i, j += nStride)
2720 : {
2721 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2722 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2723 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2724 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
2725 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
2726 : }
2727 : v_acc0.Store4Val(afDest);
2728 : v_acc1.Store4Val(afDest + 4);
2729 : v_acc2.Store4Val(afDest + 8);
2730 : v_acc3.Store4Val(afDest + 12);
2731 : }
2732 :
2733 : template <class T>
2734 : static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
2735 : const double *, int,
2736 : double *)
2737 : {
2738 : // Cannot be reached
2739 : CPLAssert(false);
2740 : }
2741 :
2742 : #else
2743 :
2744 : /************************************************************************/
2745 : /* GDALResampleConvolutionVertical_8cols<T> */
2746 : /************************************************************************/
2747 :
2748 : template <class T>
2749 : static inline void
2750 18635500 : GDALResampleConvolutionVertical_8cols(const T *pChunk, int nStride,
2751 : const double *padfWeights,
2752 : int nSrcLineCount, float *afDest)
2753 : {
2754 18635500 : int i = 0;
2755 18635500 : int j = 0;
2756 18635500 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2757 18555800 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2758 33743500 : for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
2759 : {
2760 15186500 : XMMReg4Double w0 =
2761 15186500 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2762 15161600 : XMMReg4Double w1 =
2763 15161600 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2764 15190900 : XMMReg4Double w2 =
2765 15190900 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2766 15196200 : XMMReg4Double w3 =
2767 15196200 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2768 15189700 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2769 15119900 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2770 15129500 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2771 15115400 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2772 15117000 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2773 15119300 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2774 15119400 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2775 15126900 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2776 : }
2777 29964300 : for (; i < nSrcLineCount; ++i, j += nStride)
2778 : {
2779 11407300 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2780 11407300 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2781 11407300 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2782 : }
2783 18557000 : v_acc0.Store4Val(afDest);
2784 18594800 : v_acc1.Store4Val(afDest + 4);
2785 18611400 : }
2786 :
2787 : template <class T>
2788 : static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
2789 : const double *, int,
2790 : double *)
2791 : {
2792 : // Cannot be reached
2793 : CPLAssert(false);
2794 : }
2795 :
2796 : #endif // __AVX__
2797 :
2798 : /************************************************************************/
2799 : /* GDALResampleConvolutionHorizontalSSE2<T> */
2800 : /************************************************************************/
2801 :
2802 : template <class T>
2803 2738105 : static inline double GDALResampleConvolutionHorizontalSSE2(
2804 : const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2805 : {
2806 2738105 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2807 2737728 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2808 2737892 : int i = 0; // Used after for.
2809 2814061 : for (; i + 7 < nSrcPixelCount; i += 8)
2810 : {
2811 : // Retrieve the pixel & accumulate
2812 76083 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
2813 76083 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
2814 76083 : const XMMReg4Double v_weight1 =
2815 76083 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2816 76083 : const XMMReg4Double v_weight2 =
2817 76083 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
2818 :
2819 76083 : v_acc1 += v_pixels1 * v_weight1;
2820 76083 : v_acc2 += v_pixels2 * v_weight2;
2821 : }
2822 :
2823 2737969 : v_acc1 += v_acc2;
2824 :
2825 2737823 : double dfVal = v_acc1.GetHorizSum();
2826 9505350 : for (; i < nSrcPixelCount; ++i)
2827 : {
2828 6767790 : dfVal += pChunk[i] * padfWeightsAligned[i];
2829 : }
2830 2737566 : return dfVal;
2831 : }
2832 :
2833 : /************************************************************************/
2834 : /* GDALResampleConvolutionHorizontal<GByte> */
2835 : /************************************************************************/
2836 :
2837 : template <>
2838 2189920 : inline double GDALResampleConvolutionHorizontal<GByte>(
2839 : const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2840 : {
2841 2189920 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2842 2189930 : nSrcPixelCount);
2843 : }
2844 :
2845 : template <>
2846 548287 : inline double GDALResampleConvolutionHorizontal<GUInt16>(
2847 : const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2848 : {
2849 548287 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2850 548501 : nSrcPixelCount);
2851 : }
2852 :
2853 : /************************************************************************/
2854 : /* GDALResampleConvolutionHorizontalWithMaskSSE2<T> */
2855 : /************************************************************************/
2856 :
2857 : template <class T>
2858 5806833 : static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
2859 : const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
2860 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2861 : {
2862 5806833 : int i = 0; // Used after for.
2863 5806833 : XMMReg4Double v_acc = XMMReg4Double::Zero();
2864 5806833 : XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
2865 16456921 : for (; i + 3 < nSrcPixelCount; i += 4)
2866 : {
2867 10650058 : const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
2868 10650058 : const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
2869 10650058 : XMMReg4Double v_weight =
2870 10650058 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2871 10650058 : v_weight *= v_mask;
2872 10650058 : v_acc += v_pixels * v_weight;
2873 10650058 : v_acc_weight += v_weight;
2874 : }
2875 :
2876 5806833 : dfVal = v_acc.GetHorizSum();
2877 5806833 : dfWeightSum = v_acc_weight.GetHorizSum();
2878 6005033 : for (; i < nSrcPixelCount; ++i)
2879 : {
2880 198202 : const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
2881 198202 : dfVal += pChunk[i] * dfWeight;
2882 198202 : dfWeightSum += dfWeight;
2883 : }
2884 5806833 : }
2885 :
2886 : /************************************************************************/
2887 : /* GDALResampleConvolutionHorizontalWithMask<GByte> */
2888 : /************************************************************************/
2889 :
2890 : template <>
2891 5806770 : inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
2892 : const GByte *pChunk, const GByte *pabyMask,
2893 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2894 : double &dfWeightSum)
2895 : {
2896 5806770 : GDALResampleConvolutionHorizontalWithMaskSSE2(
2897 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2898 : dfWeightSum);
2899 5806770 : }
2900 :
2901 : template <>
2902 63 : inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
2903 : const GUInt16 *pChunk, const GByte *pabyMask,
2904 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2905 : double &dfWeightSum)
2906 : {
2907 63 : GDALResampleConvolutionHorizontalWithMaskSSE2(
2908 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2909 : dfWeightSum);
2910 63 : }
2911 :
2912 : /************************************************************************/
2913 : /* GDALResampleConvolutionHorizontal_3rows_SSE2<T> */
2914 : /************************************************************************/
2915 :
2916 : template <class T>
2917 10026330 : static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
2918 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2919 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2920 : double &dfRes2, double &dfRes3)
2921 : {
2922 10026330 : XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
2923 10026330 : v_acc2 = XMMReg4Double::Zero(),
2924 10026330 : v_acc3 = XMMReg4Double::Zero();
2925 10026330 : int i = 0;
2926 19994966 : for (; i + 7 < nSrcPixelCount; i += 8)
2927 : {
2928 : // Retrieve the pixel & accumulate.
2929 9968616 : XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
2930 9968616 : XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
2931 9968616 : const XMMReg4Double v_weight1 =
2932 9968616 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2933 9968616 : const XMMReg4Double v_weight2 =
2934 9968616 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
2935 :
2936 9968616 : v_acc1 += v_pixels1 * v_weight1;
2937 9968616 : v_acc1 += v_pixels2 * v_weight2;
2938 :
2939 9968616 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
2940 9968616 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
2941 9968616 : v_acc2 += v_pixels1 * v_weight1;
2942 9968616 : v_acc2 += v_pixels2 * v_weight2;
2943 :
2944 9968616 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
2945 9968616 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
2946 9968616 : v_acc3 += v_pixels1 * v_weight1;
2947 9968616 : v_acc3 += v_pixels2 * v_weight2;
2948 : }
2949 :
2950 10026330 : dfRes1 = v_acc1.GetHorizSum();
2951 10026330 : dfRes2 = v_acc2.GetHorizSum();
2952 10026330 : dfRes3 = v_acc3.GetHorizSum();
2953 21492926 : for (; i < nSrcPixelCount; ++i)
2954 : {
2955 11466596 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
2956 11466596 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
2957 11466596 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
2958 : }
2959 10026330 : }
2960 :
2961 : /************************************************************************/
2962 : /* GDALResampleConvolutionHorizontal_3rows<GByte> */
2963 : /************************************************************************/
2964 :
2965 : template <>
2966 10026300 : inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
2967 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
2968 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2969 : double &dfRes2, double &dfRes3)
2970 : {
2971 10026300 : GDALResampleConvolutionHorizontal_3rows_SSE2(
2972 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
2973 : dfRes1, dfRes2, dfRes3);
2974 10026300 : }
2975 :
2976 : template <>
2977 30 : inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
2978 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
2979 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
2980 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
2981 : {
2982 30 : GDALResampleConvolutionHorizontal_3rows_SSE2(
2983 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
2984 : dfRes1, dfRes2, dfRes3);
2985 30 : }
2986 :
2987 : /************************************************************************/
2988 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T> */
2989 : /************************************************************************/
2990 :
2991 : template <class T>
2992 2173246 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
2993 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2994 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2995 : double &dfRes2, double &dfRes3)
2996 : {
2997 2173246 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2998 2173020 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2999 2173127 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
3000 2173118 : int i = 0; // Use after for.
3001 2176437 : for (; i + 3 < nSrcPixelCount; i += 4)
3002 : {
3003 : // Retrieve the pixel & accumulate.
3004 3284 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3005 3284 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3006 3284 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3007 3284 : const XMMReg4Double v_weight =
3008 3284 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3009 :
3010 3284 : v_acc1 += v_pixels1 * v_weight;
3011 3284 : v_acc2 += v_pixels2 * v_weight;
3012 3284 : v_acc3 += v_pixels3 * v_weight;
3013 : }
3014 :
3015 2173157 : dfRes1 = v_acc1.GetHorizSum();
3016 2173041 : dfRes2 = v_acc2.GetHorizSum();
3017 2173053 : dfRes3 = v_acc3.GetHorizSum();
3018 :
3019 6494380 : for (; i < nSrcPixelCount; ++i)
3020 : {
3021 4321322 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3022 4321322 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3023 4321322 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3024 : }
3025 2173058 : }
3026 :
3027 : /************************************************************************/
3028 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte> */
3029 : /************************************************************************/
3030 :
3031 : template <>
3032 2106390 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
3033 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3034 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3035 : double &dfRes2, double &dfRes3)
3036 : {
3037 2106390 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3038 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3039 : dfRes1, dfRes2, dfRes3);
3040 2106400 : }
3041 :
3042 : template <>
3043 66750 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
3044 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3045 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3046 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3047 : {
3048 66750 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3049 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3050 : dfRes1, dfRes2, dfRes3);
3051 66903 : }
3052 :
3053 : /************************************************************************/
3054 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T> */
3055 : /************************************************************************/
3056 :
3057 : template <class T>
3058 12200610 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3059 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3060 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3061 : double &dfRes3)
3062 : {
3063 12200610 : const XMMReg4Double v_weight =
3064 : XMMReg4Double::Load4ValAligned(padfWeightsAligned);
3065 :
3066 : // Retrieve the pixel & accumulate.
3067 12153940 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
3068 12224670 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
3069 12190940 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
3070 :
3071 12237040 : XMMReg4Double v_acc1 = v_pixels1 * v_weight;
3072 12182230 : XMMReg4Double v_acc2 = v_pixels2 * v_weight;
3073 12186190 : XMMReg4Double v_acc3 = v_pixels3 * v_weight;
3074 :
3075 12178950 : dfRes1 = v_acc1.GetHorizSum();
3076 12161000 : dfRes2 = v_acc2.GetHorizSum();
3077 12178770 : dfRes3 = v_acc3.GetHorizSum();
3078 12204550 : }
3079 :
3080 : /************************************************************************/
3081 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte> */
3082 : /************************************************************************/
3083 :
3084 : template <>
3085 6625740 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
3086 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3087 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3088 : double &dfRes3)
3089 : {
3090 6625740 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3091 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3092 : dfRes3);
3093 6613610 : }
3094 :
3095 : template <>
3096 5600910 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
3097 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3098 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
3099 : double &dfRes2, double &dfRes3)
3100 : {
3101 5600910 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3102 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3103 : dfRes3);
3104 5573700 : }
3105 :
3106 : #endif // USE_SSE2
3107 :
3108 : /************************************************************************/
3109 : /* GDALResampleChunk_Convolution() */
3110 : /************************************************************************/
3111 :
3112 : template <class T, class Twork, GDALDataType eWrkDataType>
3113 3700 : static CPLErr GDALResampleChunk_ConvolutionT(
3114 : const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
3115 : FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
3116 : int nKernelRadius, bool bKernelWithNegativeWeights, float fMaxVal)
3117 :
3118 : {
3119 3700 : const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
3120 3700 : const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
3121 3700 : const double dfSrcXDelta = args.dfSrcXDelta;
3122 3700 : const double dfSrcYDelta = args.dfSrcYDelta;
3123 3700 : constexpr int nBands = 1;
3124 3700 : const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
3125 3700 : const int nChunkXOff = args.nChunkXOff;
3126 3700 : const int nChunkXSize = args.nChunkXSize;
3127 3700 : const int nChunkYOff = args.nChunkYOff;
3128 3700 : const int nChunkYSize = args.nChunkYSize;
3129 3700 : const int nDstXOff = args.nDstXOff;
3130 3700 : const int nDstXOff2 = args.nDstXOff2;
3131 3700 : const int nDstYOff = args.nDstYOff;
3132 3700 : const int nDstYOff2 = args.nDstYOff2;
3133 3700 : const bool bHasNoData = args.bHasNoData;
3134 3700 : double dfNoDataValue = args.dfNoDataValue;
3135 :
3136 3700 : if (!bHasNoData)
3137 3649 : dfNoDataValue = 0.0;
3138 3700 : const auto dstDataType = args.eOvrDataType;
3139 3700 : const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
3140 3696 : const double dfReplacementVal =
3141 46 : bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
3142 : : dfNoDataValue;
3143 : // cppcheck-suppress unreadVariable
3144 3696 : const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
3145 3690 : const auto nNodataValueInt64 = static_cast<GInt64>(dfNoDataValue);
3146 3690 : constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
3147 :
3148 : // TODO: we should have some generic function to do this.
3149 3690 : Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
3150 3690 : Twork fDstMax = cpl::NumericLimits<Twork>::max();
3151 3690 : if (dstDataType == GDT_Byte)
3152 : {
3153 2977 : fDstMin = std::numeric_limits<GByte>::min();
3154 2975 : fDstMax = std::numeric_limits<GByte>::max();
3155 : }
3156 715 : else if (dstDataType == GDT_Int8)
3157 : {
3158 1 : fDstMin = std::numeric_limits<GInt8>::min();
3159 1 : fDstMax = std::numeric_limits<GInt8>::max();
3160 : }
3161 714 : else if (dstDataType == GDT_UInt16)
3162 : {
3163 386 : fDstMin = std::numeric_limits<GUInt16>::min();
3164 388 : fDstMax = std::numeric_limits<GUInt16>::max();
3165 : }
3166 329 : else if (dstDataType == GDT_Int16)
3167 : {
3168 279 : fDstMin = std::numeric_limits<GInt16>::min();
3169 279 : fDstMax = std::numeric_limits<GInt16>::max();
3170 : }
3171 50 : else if (dstDataType == GDT_UInt32)
3172 : {
3173 1 : fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
3174 1 : fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
3175 : }
3176 49 : else if (dstDataType == GDT_Int32)
3177 : {
3178 : // cppcheck-suppress unreadVariable
3179 2 : fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
3180 : // cppcheck-suppress unreadVariable
3181 2 : fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
3182 : }
3183 47 : else if (dstDataType == GDT_UInt64)
3184 : {
3185 : // cppcheck-suppress unreadVariable
3186 1 : fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
3187 : // cppcheck-suppress unreadVariable
3188 1 : fDstMax = static_cast<Twork>(std::numeric_limits<uint64_t>::max());
3189 : }
3190 46 : else if (dstDataType == GDT_Int64)
3191 : {
3192 : // cppcheck-suppress unreadVariable
3193 1 : fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
3194 : // cppcheck-suppress unreadVariable
3195 1 : fDstMax = static_cast<Twork>(std::numeric_limits<int64_t>::max());
3196 : }
3197 :
3198 27580835 : auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
3199 : nNodataValueInt64, dfNoDataValue,
3200 : dfReplacementVal](Twork fVal)
3201 : {
3202 14670600 : if (!bHasNoData)
3203 11444200 : return fVal;
3204 :
3205 : // Clamp value before comparing to nodata: this is only needed for
3206 : // kernels with negative weights (Lanczos)
3207 3226390 : Twork fClamped = fVal;
3208 3226390 : if (fClamped < fDstMin)
3209 12874 : fClamped = fDstMin;
3210 3213520 : else if (fClamped > fDstMax)
3211 12852 : fClamped = fDstMax;
3212 3226390 : if (isIntegerDT)
3213 : {
3214 3226370 : if (nNodataValueInt64 == static_cast<GInt64>(std::round(fClamped)))
3215 : {
3216 : // Do not use the nodata value
3217 13869 : return static_cast<Twork>(dfReplacementVal);
3218 : }
3219 : }
3220 24 : else if (dfNoDataValue == fClamped)
3221 : {
3222 : // Do not use the nodata value
3223 1 : return static_cast<Twork>(dfReplacementVal);
3224 : }
3225 3212520 : return fClamped;
3226 : };
3227 :
3228 : /* -------------------------------------------------------------------- */
3229 : /* Allocate work buffers. */
3230 : /* -------------------------------------------------------------------- */
3231 3686 : const int nDstXSize = nDstXOff2 - nDstXOff;
3232 3686 : Twork *pafWrkScanline = nullptr;
3233 3686 : if (dstDataType != eWrkDataType)
3234 : {
3235 : pafWrkScanline =
3236 3646 : static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
3237 3656 : if (pafWrkScanline == nullptr)
3238 0 : return CE_Failure;
3239 : }
3240 :
3241 3696 : const double dfXScale = 1.0 / dfXRatioDstToSrc;
3242 3696 : const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
3243 3696 : const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
3244 3696 : const double dfYScale = 1.0 / dfYRatioDstToSrc;
3245 3696 : const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
3246 3696 : const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
3247 :
3248 : // Temporary array to store result of horizontal filter.
3249 : double *padfHorizontalFiltered = static_cast<double *>(
3250 3696 : VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
3251 :
3252 : // To store convolution coefficients.
3253 3697 : double *padfWeights = static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
3254 : static_cast<int>(2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) +
3255 : 0.5) *
3256 : sizeof(double)));
3257 :
3258 3697 : GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
3259 3697 : if (pabyChunkNodataMask)
3260 : pabyChunkNodataMaskHorizontalFiltered =
3261 401 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
3262 3697 : if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
3263 401 : (pabyChunkNodataMask != nullptr &&
3264 : pabyChunkNodataMaskHorizontalFiltered == nullptr))
3265 : {
3266 2 : VSIFree(pafWrkScanline);
3267 0 : VSIFree(padfHorizontalFiltered);
3268 0 : VSIFreeAligned(padfWeights);
3269 0 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3270 0 : return CE_Failure;
3271 : }
3272 :
3273 : /* ==================================================================== */
3274 : /* First pass: horizontal filter */
3275 : /* ==================================================================== */
3276 3695 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
3277 : #ifdef USE_SSE2
3278 3695 : bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
3279 : #endif
3280 2724976 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
3281 : {
3282 2721270 : const double dfSrcPixel =
3283 2721270 : (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
3284 2721270 : int nSrcPixelStart =
3285 2721270 : static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
3286 2721270 : if (nSrcPixelStart < nChunkXOff)
3287 55170 : nSrcPixelStart = nChunkXOff;
3288 2721270 : int nSrcPixelStop =
3289 2721270 : static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
3290 2721270 : if (nSrcPixelStop > nChunkRightXOff)
3291 55188 : nSrcPixelStop = nChunkRightXOff;
3292 : #if 0
3293 : if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
3294 : {
3295 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3296 : }
3297 : if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
3298 : {
3299 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3300 : }
3301 : #endif
3302 2721270 : const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
3303 2721270 : double dfWeightSum = 0.0;
3304 :
3305 : // Compute convolution coefficients.
3306 2721270 : int nSrcPixel = nSrcPixelStart;
3307 2721270 : double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
3308 3568066 : for (; nSrcPixel + 3 < nSrcPixelStop; nSrcPixel += 4)
3309 : {
3310 846729 : padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
3311 846729 : dfX += dfXScaleWeight;
3312 846729 : padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
3313 846729 : dfX += dfXScaleWeight;
3314 846729 : padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
3315 846729 : dfX += dfXScaleWeight;
3316 846729 : padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
3317 846729 : dfX += dfXScaleWeight;
3318 846792 : dfWeightSum +=
3319 846729 : pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
3320 : }
3321 6702788 : for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
3322 : {
3323 3981677 : const double dfWeight = pfnFilterFunc(dfX);
3324 3981453 : padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3325 3981453 : dfWeightSum += dfWeight;
3326 : }
3327 :
3328 2721111 : const int nHeight = nChunkYSize * nBands;
3329 2721111 : if (pabyChunkNodataMask == nullptr)
3330 : {
3331 2648768 : if (dfWeightSum != 0)
3332 : {
3333 2648771 : const double dfInvWeightSum = 1.0 / dfWeightSum;
3334 9456113 : for (int i = 0; i < nSrcPixelCount; ++i)
3335 6807345 : padfWeights[i] *= dfInvWeightSum;
3336 : }
3337 2648768 : int iSrcLineOff = 0;
3338 : #ifdef USE_SSE2
3339 2648768 : if (nSrcPixelCount == 4)
3340 : {
3341 13987066 : for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3342 : {
3343 13447536 : const GPtrDiff_t j =
3344 13447536 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3345 13447536 : (nSrcPixelStart - nChunkXOff);
3346 13447536 : double dfVal1 = 0.0;
3347 13447536 : double dfVal2 = 0.0;
3348 13447536 : double dfVal3 = 0.0;
3349 13447536 : GDALResampleConvolutionHorizontalPixelCount4_3rows(
3350 13447536 : pChunk + j, pChunk + j + nChunkXSize,
3351 13447536 : pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
3352 : dfVal2, dfVal3);
3353 13451656 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3354 13451656 : nDstXSize +
3355 13451656 : iDstPixel - nDstXOff] = dfVal1;
3356 13451656 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3357 13451656 : 1) *
3358 13451656 : nDstXSize +
3359 13451656 : iDstPixel - nDstXOff] = dfVal2;
3360 13451656 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3361 13451656 : 2) *
3362 13451656 : nDstXSize +
3363 13451656 : iDstPixel - nDstXOff] = dfVal3;
3364 : }
3365 : }
3366 2113350 : else if (bSrcPixelCountLess8)
3367 : {
3368 4226190 : for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3369 : {
3370 2191224 : const GPtrDiff_t j =
3371 2191224 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3372 2191224 : (nSrcPixelStart - nChunkXOff);
3373 2191224 : double dfVal1 = 0.0;
3374 2191224 : double dfVal2 = 0.0;
3375 2191224 : double dfVal3 = 0.0;
3376 2191224 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
3377 2191224 : pChunk + j, pChunk + j + nChunkXSize,
3378 2191224 : pChunk + j + 2 * nChunkXSize, padfWeights,
3379 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3380 2191453 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3381 2191453 : nDstXSize +
3382 2191453 : iDstPixel - nDstXOff] = dfVal1;
3383 2191453 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3384 2191453 : 1) *
3385 2191453 : nDstXSize +
3386 2191453 : iDstPixel - nDstXOff] = dfVal2;
3387 2191453 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3388 2191453 : 2) *
3389 2191453 : nDstXSize +
3390 2191453 : iDstPixel - nDstXOff] = dfVal3;
3391 : }
3392 : }
3393 : else
3394 : #endif
3395 : {
3396 10169733 : for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3397 : {
3398 10091130 : const GPtrDiff_t j =
3399 10091130 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3400 10091130 : (nSrcPixelStart - nChunkXOff);
3401 10091130 : double dfVal1 = 0.0;
3402 10091130 : double dfVal2 = 0.0;
3403 10091130 : double dfVal3 = 0.0;
3404 10091130 : GDALResampleConvolutionHorizontal_3rows(
3405 10091130 : pChunk + j, pChunk + j + nChunkXSize,
3406 10091130 : pChunk + j + 2 * nChunkXSize, padfWeights,
3407 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3408 10091130 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3409 10091130 : nDstXSize +
3410 10091130 : iDstPixel - nDstXOff] = dfVal1;
3411 10091130 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3412 10091130 : 1) *
3413 10091130 : nDstXSize +
3414 10091130 : iDstPixel - nDstXOff] = dfVal2;
3415 10091130 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3416 10091130 : 2) *
3417 10091130 : nDstXSize +
3418 10091130 : iDstPixel - nDstXOff] = dfVal3;
3419 : }
3420 : }
3421 5436193 : for (; iSrcLineOff < nHeight; ++iSrcLineOff)
3422 : {
3423 2782818 : const GPtrDiff_t j =
3424 2782818 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3425 2782818 : (nSrcPixelStart - nChunkXOff);
3426 5521262 : const double dfVal = GDALResampleConvolutionHorizontal(
3427 2782818 : pChunk + j, padfWeights, nSrcPixelCount);
3428 2783089 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3429 2783089 : nDstXSize +
3430 2783089 : iDstPixel - nDstXOff] = dfVal;
3431 : }
3432 : }
3433 : else
3434 : {
3435 18407872 : for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
3436 : {
3437 18333218 : const GPtrDiff_t j =
3438 18333218 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3439 18333218 : (nSrcPixelStart - nChunkXOff);
3440 :
3441 18333218 : if (bKernelWithNegativeWeights)
3442 : {
3443 17852612 : int nConsecutiveValid = 0;
3444 17852612 : int nMaxConsecutiveValid = 0;
3445 165500458 : for (int k = 0; k < nSrcPixelCount; k++)
3446 : {
3447 147648146 : if (pabyChunkNodataMask[j + k])
3448 40762353 : nConsecutiveValid++;
3449 106885793 : else if (nConsecutiveValid)
3450 : {
3451 105332 : nMaxConsecutiveValid = std::max(
3452 105332 : nMaxConsecutiveValid, nConsecutiveValid);
3453 105332 : nConsecutiveValid = 0;
3454 : }
3455 : }
3456 17852612 : nMaxConsecutiveValid =
3457 17852612 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
3458 17852612 : if (nMaxConsecutiveValid < nSrcPixelCount / 2)
3459 : {
3460 12526307 : const size_t nTempOffset =
3461 12526307 : static_cast<size_t>(iSrcLineOff) * nDstXSize +
3462 12526307 : iDstPixel - nDstXOff;
3463 12526307 : padfHorizontalFiltered[nTempOffset] = 0.0;
3464 12526307 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3465 12526307 : continue;
3466 : }
3467 : }
3468 :
3469 5806881 : double dfVal = 0.0;
3470 5806881 : GDALResampleConvolutionHorizontalWithMask(
3471 5806881 : pChunk + j, pabyChunkNodataMask + j, padfWeights,
3472 : nSrcPixelCount, dfVal, dfWeightSum);
3473 5809278 : const size_t nTempOffset =
3474 5809278 : static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
3475 5809278 : nDstXOff;
3476 5809278 : if (dfWeightSum > 0.0)
3477 : {
3478 5762218 : padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
3479 5762218 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
3480 : }
3481 : else
3482 : {
3483 47115 : padfHorizontalFiltered[nTempOffset] = 0.0;
3484 47115 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3485 : }
3486 : }
3487 : }
3488 : }
3489 :
3490 : /* ==================================================================== */
3491 : /* Second pass: vertical filter */
3492 : /* ==================================================================== */
3493 3703 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3494 :
3495 198221 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3496 : {
3497 194518 : Twork *const pafDstScanline =
3498 194518 : pafWrkScanline ? pafWrkScanline
3499 8421 : : static_cast<Twork *>(pDstBuffer) +
3500 8421 : (iDstLine - nDstYOff) * nDstXSize;
3501 :
3502 194518 : const double dfSrcLine =
3503 194518 : (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
3504 194518 : int nSrcLineStart =
3505 194518 : static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
3506 194518 : int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
3507 194518 : if (nSrcLineStart < nChunkYOff)
3508 2345 : nSrcLineStart = nChunkYOff;
3509 194518 : if (nSrcLineStop > nChunkBottomYOff)
3510 2381 : nSrcLineStop = nChunkBottomYOff;
3511 : #if 0
3512 : if( nSrcLineStart < nChunkYOff &&
3513 : nChunkYOff > 0 )
3514 : {
3515 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3516 : }
3517 : if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3518 : {
3519 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3520 : }
3521 : #endif
3522 194518 : const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
3523 194518 : double dfWeightSum = 0.0;
3524 :
3525 : // Compute convolution coefficients.
3526 194518 : int nSrcLine = nSrcLineStart; // Used after for.
3527 194518 : double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
3528 432222 : for (; nSrcLine + 3 < nSrcLineStop;
3529 237704 : nSrcLine += 4, dfY += 4 * dfYScaleWeight)
3530 : {
3531 237713 : padfWeights[nSrcLine - nSrcLineStart] = dfY;
3532 237713 : padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
3533 237713 : padfWeights[nSrcLine + 2 - nSrcLineStart] =
3534 237713 : dfY + 2 * dfYScaleWeight;
3535 237713 : padfWeights[nSrcLine + 3 - nSrcLineStart] =
3536 237713 : dfY + 3 * dfYScaleWeight;
3537 237704 : dfWeightSum +=
3538 237713 : pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
3539 : }
3540 227791 : for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
3541 : {
3542 33280 : const double dfWeight = pfnFilterFunc(dfY);
3543 33282 : padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
3544 33282 : dfWeightSum += dfWeight;
3545 : }
3546 :
3547 194511 : if (pabyChunkNodataMask == nullptr)
3548 : {
3549 159959 : if (dfWeightSum != 0)
3550 : {
3551 159960 : const double dfInvWeightSum = 1.0 / dfWeightSum;
3552 901423 : for (int i = 0; i < nSrcLineCount; ++i)
3553 741463 : padfWeights[i] *= dfInvWeightSum;
3554 : }
3555 : }
3556 :
3557 194511 : if (pabyChunkNodataMask == nullptr)
3558 : {
3559 159960 : int iFilteredPixelOff = 0; // Used after for.
3560 : // j used after for.
3561 159960 : size_t j =
3562 159960 : (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
3563 : #ifdef USE_SSE2
3564 : if constexpr (eWrkDataType == GDT_Float32)
3565 : {
3566 : #ifdef __AVX__
3567 : for (; iFilteredPixelOff + 15 < nDstXSize;
3568 : iFilteredPixelOff += 16, j += 16)
3569 : {
3570 : GDALResampleConvolutionVertical_16cols(
3571 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3572 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3573 : if (bHasNoData)
3574 : {
3575 : for (int k = 0; k < 16; k++)
3576 : {
3577 : pafDstScanline[iFilteredPixelOff + k] =
3578 : replaceValIfNodata(
3579 : pafDstScanline[iFilteredPixelOff + k]);
3580 : }
3581 : }
3582 : }
3583 : #else
3584 18784486 : for (; iFilteredPixelOff + 7 < nDstXSize;
3585 : iFilteredPixelOff += 8, j += 8)
3586 : {
3587 18640780 : GDALResampleConvolutionVertical_8cols(
3588 18640780 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3589 18640780 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3590 18631800 : if (bHasNoData)
3591 : {
3592 17820 : for (int k = 0; k < 8; k++)
3593 : {
3594 15840 : pafDstScanline[iFilteredPixelOff + k] =
3595 15840 : replaceValIfNodata(
3596 15840 : pafDstScanline[iFilteredPixelOff + k]);
3597 : }
3598 : }
3599 : }
3600 : #endif
3601 :
3602 607250 : for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
3603 : {
3604 463611 : const Twork fVal =
3605 463520 : static_cast<Twork>(GDALResampleConvolutionVertical(
3606 463520 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3607 : nSrcLineCount));
3608 463546 : pafDstScanline[iFilteredPixelOff] =
3609 463611 : replaceValIfNodata(fVal);
3610 : }
3611 : }
3612 : else
3613 : #endif
3614 : {
3615 2887210 : for (; iFilteredPixelOff + 1 < nDstXSize;
3616 : iFilteredPixelOff += 2, j += 2)
3617 : {
3618 2880000 : double dfVal1 = 0.0;
3619 2880000 : double dfVal2 = 0.0;
3620 2880000 : GDALResampleConvolutionVertical_2cols(
3621 2880000 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3622 : nSrcLineCount, dfVal1, dfVal2);
3623 5760010 : pafDstScanline[iFilteredPixelOff] =
3624 2880000 : replaceValIfNodata(static_cast<Twork>(dfVal1));
3625 2880000 : pafDstScanline[iFilteredPixelOff + 1] =
3626 2880000 : replaceValIfNodata(static_cast<Twork>(dfVal2));
3627 : }
3628 7206 : if (iFilteredPixelOff < nDstXSize)
3629 : {
3630 2 : const double dfVal = GDALResampleConvolutionVertical(
3631 2 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3632 : nSrcLineCount);
3633 2 : pafDstScanline[iFilteredPixelOff] =
3634 2 : replaceValIfNodata(static_cast<Twork>(dfVal));
3635 : }
3636 : }
3637 : }
3638 : else
3639 : {
3640 17349040 : for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
3641 : ++iFilteredPixelOff)
3642 : {
3643 17314505 : double dfVal = 0.0;
3644 17314505 : dfWeightSum = 0.0;
3645 17314505 : size_t j = (nSrcLineStart - nChunkYOff) *
3646 17314505 : static_cast<size_t>(nDstXSize) +
3647 17314505 : iFilteredPixelOff;
3648 17314505 : if (bKernelWithNegativeWeights)
3649 : {
3650 17089601 : int nConsecutiveValid = 0;
3651 17089601 : int nMaxConsecutiveValid = 0;
3652 121806321 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3653 : {
3654 104717020 : const double dfWeight =
3655 104717020 : padfWeights[i] *
3656 : pabyChunkNodataMaskHorizontalFiltered[j];
3657 104717020 : if (pabyChunkNodataMaskHorizontalFiltered[j])
3658 : {
3659 42068237 : nConsecutiveValid++;
3660 : }
3661 62648683 : else if (nConsecutiveValid)
3662 : {
3663 203800 : nMaxConsecutiveValid = std::max(
3664 203800 : nMaxConsecutiveValid, nConsecutiveValid);
3665 203800 : nConsecutiveValid = 0;
3666 : }
3667 104717020 : dfVal += padfHorizontalFiltered[j] * dfWeight;
3668 104717020 : dfWeightSum += dfWeight;
3669 : }
3670 17089601 : nMaxConsecutiveValid =
3671 17089601 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
3672 17089601 : if (nMaxConsecutiveValid < nSrcLineCount / 2)
3673 : {
3674 8867341 : pafDstScanline[iFilteredPixelOff] =
3675 8867249 : static_cast<Twork>(dfNoDataValue);
3676 8867341 : continue;
3677 : }
3678 : }
3679 : else
3680 : {
3681 1130262 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3682 : {
3683 905432 : const double dfWeight =
3684 905432 : padfWeights[i] *
3685 : pabyChunkNodataMaskHorizontalFiltered[j];
3686 905432 : dfVal += padfHorizontalFiltered[j] * dfWeight;
3687 905432 : dfWeightSum += dfWeight;
3688 : }
3689 : }
3690 8447134 : if (dfWeightSum > 0.0)
3691 : {
3692 8431093 : pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
3693 8431081 : static_cast<Twork>(dfVal / dfWeightSum));
3694 : }
3695 : else
3696 : {
3697 16045 : pafDstScanline[iFilteredPixelOff] =
3698 16021 : static_cast<Twork>(dfNoDataValue);
3699 : }
3700 : }
3701 : }
3702 :
3703 185487 : if (fMaxVal != 0.0f)
3704 : {
3705 192324 : for (int i = 0; i < nDstXSize; ++i)
3706 : {
3707 192088 : if (pafDstScanline[i] > fMaxVal)
3708 96022 : pafDstScanline[i] = fMaxVal;
3709 : }
3710 : }
3711 :
3712 185487 : if (pafWrkScanline)
3713 : {
3714 186098 : GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
3715 : static_cast<GByte *>(pDstBuffer) +
3716 186098 : static_cast<size_t>(iDstLine - nDstYOff) *
3717 186098 : nDstXSize * nDstDataTypeSize,
3718 : dstDataType, nDstDataTypeSize, nDstXSize);
3719 : }
3720 : }
3721 :
3722 3703 : VSIFree(pafWrkScanline);
3723 3703 : VSIFreeAligned(padfWeights);
3724 3703 : VSIFree(padfHorizontalFiltered);
3725 3703 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3726 :
3727 3703 : return CE_None;
3728 : }
3729 :
3730 : static CPLErr
3731 3702 : GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
3732 : const void *pChunk, void **ppDstBuffer,
3733 : GDALDataType *peDstBufferDataType)
3734 : {
3735 : GDALResampleAlg eResample;
3736 3702 : bool bKernelWithNegativeWeights = false;
3737 3702 : if (EQUAL(args.pszResampling, "BILINEAR"))
3738 2597 : eResample = GRA_Bilinear;
3739 1105 : else if (EQUAL(args.pszResampling, "CUBIC"))
3740 : {
3741 1027 : eResample = GRA_Cubic;
3742 1027 : bKernelWithNegativeWeights = true;
3743 : }
3744 78 : else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
3745 23 : eResample = GRA_CubicSpline;
3746 55 : else if (EQUAL(args.pszResampling, "LANCZOS"))
3747 : {
3748 54 : eResample = GRA_Lanczos;
3749 54 : bKernelWithNegativeWeights = true;
3750 : }
3751 : else
3752 : {
3753 1 : CPLAssert(false);
3754 : return CE_Failure;
3755 : }
3756 3701 : const int nKernelRadius = GWKGetFilterRadius(eResample);
3757 3700 : FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
3758 : const FilterFunc4ValuesType pfnFilterFunc4Values =
3759 3698 : GWKGetFilterFunc4Values(eResample);
3760 :
3761 3696 : float fMaxVal = 0.f;
3762 : // Cubic, etc... can have overshoots, so make sure we clamp values to the
3763 : // maximum value if NBITS is set.
3764 3696 : if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
3765 8 : (args.eOvrDataType == GDT_Byte || args.eOvrDataType == GDT_UInt16 ||
3766 0 : args.eOvrDataType == GDT_UInt32))
3767 : {
3768 8 : int nBits = args.nOvrNBITS;
3769 8 : if (nBits == GDALGetDataTypeSize(args.eOvrDataType))
3770 1 : nBits = 0;
3771 8 : if (nBits > 0 && nBits < 32)
3772 7 : fMaxVal = static_cast<float>((1U << nBits) - 1);
3773 : }
3774 :
3775 3696 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(
3776 : args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
3777 : GDALGetDataTypeSizeBytes(args.eOvrDataType));
3778 3702 : if (*ppDstBuffer == nullptr)
3779 : {
3780 0 : return CE_Failure;
3781 : }
3782 3702 : *peDstBufferDataType = args.eOvrDataType;
3783 :
3784 3702 : switch (args.eWrkDataType)
3785 : {
3786 2977 : case GDT_Byte:
3787 : {
3788 2977 : return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32>(
3789 : args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
3790 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3791 2977 : bKernelWithNegativeWeights, fMaxVal);
3792 : }
3793 :
3794 395 : case GDT_UInt16:
3795 : {
3796 395 : return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32>(
3797 : args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
3798 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3799 396 : bKernelWithNegativeWeights, fMaxVal);
3800 : }
3801 :
3802 301 : case GDT_Float32:
3803 : {
3804 301 : return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32>(
3805 : args, static_cast<const float *>(pChunk), *ppDstBuffer,
3806 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3807 301 : bKernelWithNegativeWeights, fMaxVal);
3808 : }
3809 :
3810 29 : case GDT_Float64:
3811 : {
3812 29 : return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64>(
3813 : args, static_cast<const double *>(pChunk), *ppDstBuffer,
3814 : pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3815 29 : bKernelWithNegativeWeights, fMaxVal);
3816 : }
3817 :
3818 0 : default:
3819 0 : break;
3820 : }
3821 :
3822 0 : CPLAssert(false);
3823 : return CE_Failure;
3824 : }
3825 :
3826 : /************************************************************************/
3827 : /* GDALResampleChunkC32R() */
3828 : /************************************************************************/
3829 :
3830 2 : static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
3831 : const float *pafChunk, const int nChunkYOff,
3832 : const int nChunkYSize, const int nDstYOff,
3833 : const int nDstYOff2, const int nOvrXSize,
3834 : const int nOvrYSize, void **ppDstBuffer,
3835 : GDALDataType *peDstBufferDataType,
3836 : const char *pszResampling)
3837 :
3838 : {
3839 : enum Method
3840 : {
3841 : NEAR,
3842 : AVERAGE,
3843 : AVERAGE_MAGPHASE,
3844 : RMS,
3845 : };
3846 :
3847 2 : Method eMethod = NEAR;
3848 2 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
3849 : {
3850 0 : eMethod = NEAR;
3851 : }
3852 2 : else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
3853 : {
3854 0 : eMethod = AVERAGE_MAGPHASE;
3855 : }
3856 2 : else if (EQUAL(pszResampling, "RMS"))
3857 : {
3858 2 : eMethod = RMS;
3859 : }
3860 0 : else if (STARTS_WITH_CI(pszResampling, "AVER"))
3861 : {
3862 0 : eMethod = AVERAGE;
3863 : }
3864 : else
3865 : {
3866 0 : CPLError(
3867 : CE_Failure, CPLE_NotSupported,
3868 : "Resampling method %s is not supported for complex data types. "
3869 : "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
3870 : pszResampling);
3871 0 : return CE_Failure;
3872 : }
3873 :
3874 2 : const int nOXSize = nOvrXSize;
3875 2 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
3876 : GDALGetDataTypeSizeBytes(GDT_CFloat32));
3877 2 : if (*ppDstBuffer == nullptr)
3878 : {
3879 0 : return CE_Failure;
3880 : }
3881 2 : float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
3882 2 : *peDstBufferDataType = GDT_CFloat32;
3883 :
3884 2 : const int nOYSize = nOvrYSize;
3885 2 : const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
3886 2 : const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
3887 :
3888 : /* ==================================================================== */
3889 : /* Loop over destination scanlines. */
3890 : /* ==================================================================== */
3891 8 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3892 : {
3893 6 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
3894 6 : if (nSrcYOff < nChunkYOff)
3895 0 : nSrcYOff = nChunkYOff;
3896 :
3897 6 : int nSrcYOff2 =
3898 6 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
3899 6 : if (nSrcYOff2 == nSrcYOff)
3900 0 : nSrcYOff2++;
3901 :
3902 6 : if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
3903 : {
3904 2 : if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
3905 0 : nSrcYOff = nSrcHeight - 1;
3906 2 : nSrcYOff2 = nSrcHeight;
3907 : }
3908 6 : if (nSrcYOff2 > nChunkYOff + nChunkYSize)
3909 0 : nSrcYOff2 = nChunkYOff + nChunkYSize;
3910 :
3911 6 : const float *const pafSrcScanline =
3912 6 : pafChunk + ((nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
3913 6 : float *const pafDstScanline =
3914 6 : pafDstBuffer + (iDstLine - nDstYOff) * 2 * nOXSize;
3915 :
3916 : /* --------------------------------------------------------------------
3917 : */
3918 : /* Loop over destination pixels */
3919 : /* --------------------------------------------------------------------
3920 : */
3921 18 : for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
3922 : {
3923 12 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
3924 12 : int nSrcXOff2 =
3925 12 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
3926 12 : if (nSrcXOff2 == nSrcXOff)
3927 0 : nSrcXOff2++;
3928 12 : if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
3929 : {
3930 6 : if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
3931 0 : nSrcXOff = nSrcWidth - 1;
3932 6 : nSrcXOff2 = nSrcWidth;
3933 : }
3934 :
3935 12 : if (eMethod == NEAR)
3936 : {
3937 0 : pafDstScanline[iDstPixel * 2] = pafSrcScanline[nSrcXOff * 2];
3938 0 : pafDstScanline[iDstPixel * 2 + 1] =
3939 0 : pafSrcScanline[nSrcXOff * 2 + 1];
3940 : }
3941 12 : else if (eMethod == AVERAGE_MAGPHASE)
3942 : {
3943 0 : double dfTotalR = 0.0;
3944 0 : double dfTotalI = 0.0;
3945 0 : double dfTotalM = 0.0;
3946 0 : int nCount = 0;
3947 :
3948 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
3949 : {
3950 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
3951 : {
3952 0 : const double dfR =
3953 0 : pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
3954 0 : iY - nSrcYOff) *
3955 0 : nSrcWidth * 2];
3956 0 : const double dfI =
3957 0 : pafSrcScanline[iX * 2 +
3958 0 : static_cast<GPtrDiff_t>(iY -
3959 0 : nSrcYOff) *
3960 0 : nSrcWidth * 2 +
3961 0 : 1];
3962 0 : dfTotalR += dfR;
3963 0 : dfTotalI += dfI;
3964 0 : dfTotalM += std::hypot(dfR, dfI);
3965 0 : ++nCount;
3966 : }
3967 : }
3968 :
3969 0 : CPLAssert(nCount > 0);
3970 0 : if (nCount == 0)
3971 : {
3972 0 : pafDstScanline[iDstPixel * 2] = 0.0;
3973 0 : pafDstScanline[iDstPixel * 2 + 1] = 0.0;
3974 : }
3975 : else
3976 : {
3977 0 : pafDstScanline[iDstPixel * 2] =
3978 0 : static_cast<float>(dfTotalR / nCount);
3979 0 : pafDstScanline[iDstPixel * 2 + 1] =
3980 0 : static_cast<float>(dfTotalI / nCount);
3981 : const double dfM =
3982 0 : std::hypot(pafDstScanline[iDstPixel * 2],
3983 0 : pafDstScanline[iDstPixel * 2 + 1]);
3984 0 : const double dfDesiredM = dfTotalM / nCount;
3985 0 : double dfRatio = 1.0;
3986 0 : if (dfM != 0.0)
3987 0 : dfRatio = dfDesiredM / dfM;
3988 :
3989 0 : pafDstScanline[iDstPixel * 2] *=
3990 0 : static_cast<float>(dfRatio);
3991 0 : pafDstScanline[iDstPixel * 2 + 1] *=
3992 0 : static_cast<float>(dfRatio);
3993 : }
3994 : }
3995 12 : else if (eMethod == RMS)
3996 : {
3997 12 : double dfTotalR = 0.0;
3998 12 : double dfTotalI = 0.0;
3999 12 : int nCount = 0;
4000 :
4001 36 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4002 : {
4003 72 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4004 : {
4005 48 : const double dfR =
4006 48 : pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
4007 48 : iY - nSrcYOff) *
4008 48 : nSrcWidth * 2];
4009 48 : const double dfI =
4010 48 : pafSrcScanline[iX * 2 +
4011 48 : static_cast<GPtrDiff_t>(iY -
4012 48 : nSrcYOff) *
4013 48 : nSrcWidth * 2 +
4014 48 : 1];
4015 :
4016 48 : dfTotalR += SQUARE(dfR);
4017 48 : dfTotalI += SQUARE(dfI);
4018 :
4019 48 : ++nCount;
4020 : }
4021 : }
4022 :
4023 12 : CPLAssert(nCount > 0);
4024 12 : if (nCount == 0)
4025 : {
4026 0 : pafDstScanline[iDstPixel * 2] = 0.0;
4027 0 : pafDstScanline[iDstPixel * 2 + 1] = 0.0;
4028 : }
4029 : else
4030 : {
4031 : /* compute RMS */
4032 12 : pafDstScanline[iDstPixel * 2] =
4033 12 : static_cast<float>(sqrt(dfTotalR / nCount));
4034 12 : pafDstScanline[iDstPixel * 2 + 1] =
4035 12 : static_cast<float>(sqrt(dfTotalI / nCount));
4036 : }
4037 : }
4038 0 : else if (eMethod == AVERAGE)
4039 : {
4040 0 : double dfTotalR = 0.0;
4041 0 : double dfTotalI = 0.0;
4042 0 : int nCount = 0;
4043 :
4044 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4045 : {
4046 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4047 : {
4048 : // TODO(schwehr): Maybe use std::complex?
4049 0 : dfTotalR +=
4050 0 : pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
4051 0 : iY - nSrcYOff) *
4052 0 : nSrcWidth * 2];
4053 0 : dfTotalI += pafSrcScanline[iX * 2 +
4054 0 : static_cast<GPtrDiff_t>(
4055 0 : iY - nSrcYOff) *
4056 0 : nSrcWidth * 2 +
4057 0 : 1];
4058 0 : ++nCount;
4059 : }
4060 : }
4061 :
4062 0 : CPLAssert(nCount > 0);
4063 0 : if (nCount == 0)
4064 : {
4065 0 : pafDstScanline[iDstPixel * 2] = 0.0;
4066 0 : pafDstScanline[iDstPixel * 2 + 1] = 0.0;
4067 : }
4068 : else
4069 : {
4070 0 : pafDstScanline[iDstPixel * 2] =
4071 0 : static_cast<float>(dfTotalR / nCount);
4072 0 : pafDstScanline[iDstPixel * 2 + 1] =
4073 0 : static_cast<float>(dfTotalI / nCount);
4074 : }
4075 : }
4076 : }
4077 : }
4078 :
4079 2 : return CE_None;
4080 : }
4081 :
4082 : /************************************************************************/
4083 : /* GDALRegenerateCascadingOverviews() */
4084 : /* */
4085 : /* Generate a list of overviews in order from largest to */
4086 : /* smallest, computing each from the next larger. */
4087 : /************************************************************************/
4088 :
4089 42 : static CPLErr GDALRegenerateCascadingOverviews(
4090 : GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
4091 : const char *pszResampling, GDALProgressFunc pfnProgress,
4092 : void *pProgressData, CSLConstList papszOptions)
4093 :
4094 : {
4095 : /* -------------------------------------------------------------------- */
4096 : /* First, we must put the overviews in order from largest to */
4097 : /* smallest. */
4098 : /* -------------------------------------------------------------------- */
4099 120 : for (int i = 0; i < nOverviews - 1; ++i)
4100 : {
4101 270 : for (int j = 0; j < nOverviews - i - 1; ++j)
4102 : {
4103 192 : if (papoOvrBands[j]->GetXSize() *
4104 192 : static_cast<float>(papoOvrBands[j]->GetYSize()) <
4105 192 : papoOvrBands[j + 1]->GetXSize() *
4106 192 : static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
4107 : {
4108 0 : GDALRasterBand *poTempBand = papoOvrBands[j];
4109 0 : papoOvrBands[j] = papoOvrBands[j + 1];
4110 0 : papoOvrBands[j + 1] = poTempBand;
4111 : }
4112 : }
4113 : }
4114 :
4115 : /* -------------------------------------------------------------------- */
4116 : /* Count total pixels so we can prepare appropriate scaled */
4117 : /* progress functions. */
4118 : /* -------------------------------------------------------------------- */
4119 42 : double dfTotalPixels = 0.0;
4120 :
4121 162 : for (int i = 0; i < nOverviews; ++i)
4122 : {
4123 120 : dfTotalPixels += papoOvrBands[i]->GetXSize() *
4124 120 : static_cast<double>(papoOvrBands[i]->GetYSize());
4125 : }
4126 :
4127 : /* -------------------------------------------------------------------- */
4128 : /* Generate all the bands. */
4129 : /* -------------------------------------------------------------------- */
4130 42 : double dfPixelsProcessed = 0.0;
4131 :
4132 162 : for (int i = 0; i < nOverviews; ++i)
4133 : {
4134 120 : GDALRasterBand *poBaseBand = poSrcBand;
4135 120 : if (i != 0)
4136 78 : poBaseBand = papoOvrBands[i - 1];
4137 :
4138 120 : double dfPixels = papoOvrBands[i]->GetXSize() *
4139 120 : static_cast<double>(papoOvrBands[i]->GetYSize());
4140 :
4141 240 : void *pScaledProgressData = GDALCreateScaledProgress(
4142 : dfPixelsProcessed / dfTotalPixels,
4143 120 : (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
4144 : pProgressData);
4145 :
4146 240 : const CPLErr eErr = GDALRegenerateOverviewsEx(
4147 : poBaseBand, 1,
4148 120 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
4149 : pszResampling, GDALScaledProgress, pScaledProgressData,
4150 : papszOptions);
4151 120 : GDALDestroyScaledProgress(pScaledProgressData);
4152 :
4153 120 : if (eErr != CE_None)
4154 0 : return eErr;
4155 :
4156 120 : dfPixelsProcessed += dfPixels;
4157 :
4158 : // Only do the bit2grayscale promotion on the base band.
4159 120 : if (STARTS_WITH_CI(pszResampling,
4160 : "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
4161 8 : pszResampling = "AVERAGE";
4162 : }
4163 :
4164 42 : return CE_None;
4165 : }
4166 :
4167 : /************************************************************************/
4168 : /* GDALGetResampleFunction() */
4169 : /************************************************************************/
4170 :
4171 3884 : GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
4172 : int *pnRadius)
4173 : {
4174 3884 : if (pnRadius)
4175 3885 : *pnRadius = 0;
4176 3884 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
4177 444 : return GDALResampleChunk_Near;
4178 3440 : else if (STARTS_WITH_CI(pszResampling, "AVER") ||
4179 2915 : EQUAL(pszResampling, "RMS"))
4180 552 : return GDALResampleChunk_AverageOrRMS;
4181 2888 : else if (EQUAL(pszResampling, "GAUSS"))
4182 : {
4183 26 : if (pnRadius)
4184 26 : *pnRadius = 1;
4185 26 : return GDALResampleChunk_Gauss;
4186 : }
4187 2862 : else if (EQUAL(pszResampling, "MODE"))
4188 96 : return GDALResampleChunk_Mode;
4189 2766 : else if (EQUAL(pszResampling, "CUBIC"))
4190 : {
4191 396 : if (pnRadius)
4192 396 : *pnRadius = GWKGetFilterRadius(GRA_Cubic);
4193 389 : return GDALResampleChunk_Convolution;
4194 : }
4195 2370 : else if (EQUAL(pszResampling, "CUBICSPLINE"))
4196 : {
4197 3 : if (pnRadius)
4198 3 : *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
4199 3 : return GDALResampleChunk_Convolution;
4200 : }
4201 2367 : else if (EQUAL(pszResampling, "LANCZOS"))
4202 : {
4203 8 : if (pnRadius)
4204 8 : *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
4205 8 : return GDALResampleChunk_Convolution;
4206 : }
4207 2359 : else if (EQUAL(pszResampling, "BILINEAR"))
4208 : {
4209 2367 : if (pnRadius)
4210 2367 : *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
4211 2367 : return GDALResampleChunk_Convolution;
4212 : }
4213 : else
4214 : {
4215 0 : CPLError(
4216 : CE_Failure, CPLE_AppDefined,
4217 : "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
4218 : pszResampling);
4219 0 : return nullptr;
4220 : }
4221 : }
4222 :
4223 : /************************************************************************/
4224 : /* GDALGetOvrWorkDataType() */
4225 : /************************************************************************/
4226 :
4227 3774 : GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
4228 : GDALDataType eSrcDataType)
4229 : {
4230 3774 : if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
4231 : {
4232 533 : return eSrcDataType;
4233 : }
4234 3241 : else if (eSrcDataType == GDT_Byte &&
4235 2931 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4236 2469 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4237 2257 : EQUAL(pszResampling, "CUBICSPLINE") ||
4238 2254 : EQUAL(pszResampling, "LANCZOS") ||
4239 2249 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4240 : {
4241 2928 : return GDT_Byte;
4242 : }
4243 313 : else if (eSrcDataType == GDT_UInt16 &&
4244 118 : (STARTS_WITH_CI(pszResampling, "AVER") ||
4245 107 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4246 3 : EQUAL(pszResampling, "CUBICSPLINE") ||
4247 3 : EQUAL(pszResampling, "LANCZOS") ||
4248 2 : EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4249 : {
4250 110 : return GDT_UInt16;
4251 : }
4252 203 : else if (EQUAL(pszResampling, "GAUSS"))
4253 20 : return GDT_Float64;
4254 :
4255 183 : if (eSrcDataType == GDT_Byte || eSrcDataType == GDT_Int8 ||
4256 184 : eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
4257 : eSrcDataType == GDT_Float32)
4258 : {
4259 147 : return GDT_Float32;
4260 : }
4261 36 : return GDT_Float64;
4262 : }
4263 :
4264 : namespace
4265 : {
4266 : // Structure to hold a pointer to free with CPLFree()
4267 : struct PointerHolder
4268 : {
4269 : void *ptr = nullptr;
4270 :
4271 34757 : explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
4272 : {
4273 34757 : }
4274 :
4275 34759 : ~PointerHolder()
4276 34759 : {
4277 34759 : CPLFree(ptr);
4278 34759 : }
4279 :
4280 : PointerHolder(const PointerHolder &) = delete;
4281 : PointerHolder &operator=(const PointerHolder &) = delete;
4282 : };
4283 : } // namespace
4284 :
4285 : /************************************************************************/
4286 : /* GDALRegenerateOverviews() */
4287 : /************************************************************************/
4288 :
4289 : /**
4290 : * \brief Generate downsampled overviews.
4291 : *
4292 : * This function will generate one or more overview images from a base image
4293 : * using the requested downsampling algorithm. Its primary use is for
4294 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4295 : * used to generate downsampled images in one file from another outside the
4296 : * overview architecture.
4297 : *
4298 : * The output bands need to exist in advance.
4299 : *
4300 : * The full set of resampling algorithms is documented in
4301 : * GDALDataset::BuildOverviews().
4302 : *
4303 : * This function will honour properly NODATA_VALUES tuples (special dataset
4304 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4305 : * considered as the nodata value and not each value of the triplet
4306 : * independently per band.
4307 : *
4308 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4309 : * to "ALL_CPUS" or a integer value to specify the number of threads to use for
4310 : * overview computation.
4311 : *
4312 : * @param hSrcBand the source (base level) band.
4313 : * @param nOverviewCount the number of downsampled bands being generated.
4314 : * @param pahOvrBands the list of downsampled bands to be generated.
4315 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4316 : * @param pfnProgress progress report function.
4317 : * @param pProgressData progress function callback data.
4318 : * @return CE_None on success or CE_Failure on failure.
4319 : */
4320 252 : CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
4321 : GDALRasterBandH *pahOvrBands,
4322 : const char *pszResampling,
4323 : GDALProgressFunc pfnProgress,
4324 : void *pProgressData)
4325 :
4326 : {
4327 252 : return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
4328 : pszResampling, pfnProgress, pProgressData,
4329 252 : nullptr);
4330 : }
4331 :
4332 : /************************************************************************/
4333 : /* GDALRegenerateOverviewsEx() */
4334 : /************************************************************************/
4335 :
4336 : /**
4337 : * \brief Generate downsampled overviews.
4338 : *
4339 : * This function will generate one or more overview images from a base image
4340 : * using the requested downsampling algorithm. Its primary use is for
4341 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4342 : * used to generate downsampled images in one file from another outside the
4343 : * overview architecture.
4344 : *
4345 : * The output bands need to exist in advance.
4346 : *
4347 : * The full set of resampling algorithms is documented in
4348 : * GDALDataset::BuildOverviews().
4349 : *
4350 : * This function will honour properly NODATA_VALUES tuples (special dataset
4351 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4352 : * considered as the nodata value and not each value of the triplet
4353 : * independently per band.
4354 : *
4355 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4356 : * to "ALL_CPUS" or a integer value to specify the number of threads to use for
4357 : * overview computation.
4358 : *
4359 : * @param hSrcBand the source (base level) band.
4360 : * @param nOverviewCount the number of downsampled bands being generated.
4361 : * @param pahOvrBands the list of downsampled bands to be generated.
4362 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4363 : * @param pfnProgress progress report function.
4364 : * @param pProgressData progress function callback data.
4365 : * @param papszOptions NULL terminated list of options as key=value pairs, or
4366 : * NULL
4367 : * @return CE_None on success or CE_Failure on failure.
4368 : * @since GDAL 3.6
4369 : */
4370 815 : CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
4371 : GDALRasterBandH *pahOvrBands,
4372 : const char *pszResampling,
4373 : GDALProgressFunc pfnProgress,
4374 : void *pProgressData, CSLConstList papszOptions)
4375 :
4376 : {
4377 815 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
4378 815 : GDALRasterBand **papoOvrBands =
4379 : reinterpret_cast<GDALRasterBand **>(pahOvrBands);
4380 :
4381 815 : if (pfnProgress == nullptr)
4382 252 : pfnProgress = GDALDummyProgress;
4383 :
4384 815 : if (EQUAL(pszResampling, "NONE"))
4385 61 : return CE_None;
4386 :
4387 754 : int nKernelRadius = 0;
4388 : GDALResampleFunction pfnResampleFn =
4389 754 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
4390 :
4391 754 : if (pfnResampleFn == nullptr)
4392 0 : return CE_Failure;
4393 :
4394 : /* -------------------------------------------------------------------- */
4395 : /* Check color tables... */
4396 : /* -------------------------------------------------------------------- */
4397 754 : GDALColorTable *poColorTable = nullptr;
4398 :
4399 391 : if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
4400 1582 : EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
4401 448 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4402 : {
4403 9 : poColorTable = poSrcBand->GetColorTable();
4404 9 : if (poColorTable != nullptr)
4405 : {
4406 9 : if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
4407 : {
4408 0 : CPLError(CE_Warning, CPLE_AppDefined,
4409 : "Computing overviews on palette index raster bands "
4410 : "with a palette whose color interpretation is not RGB "
4411 : "will probably lead to unexpected results.");
4412 0 : poColorTable = nullptr;
4413 : }
4414 9 : else if (poColorTable->IsIdentity())
4415 : {
4416 0 : poColorTable = nullptr;
4417 : }
4418 : }
4419 : else
4420 : {
4421 0 : CPLError(CE_Warning, CPLE_AppDefined,
4422 : "Computing overviews on palette index raster bands "
4423 : "without a palette will probably lead to unexpected "
4424 : "results.");
4425 : }
4426 : }
4427 : // Not ready yet
4428 2181 : else if ((EQUAL(pszResampling, "CUBIC") ||
4429 691 : EQUAL(pszResampling, "CUBICSPLINE") ||
4430 691 : EQUAL(pszResampling, "LANCZOS") ||
4431 1493 : EQUAL(pszResampling, "BILINEAR")) &&
4432 57 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4433 : {
4434 0 : CPLError(CE_Warning, CPLE_AppDefined,
4435 : "Computing %s overviews on palette index raster bands "
4436 : "will probably lead to unexpected results.",
4437 : pszResampling);
4438 : }
4439 :
4440 : // If we have a nodata mask and we are doing something more complicated
4441 : // than nearest neighbouring, we have to fetch to nodata mask.
4442 :
4443 754 : GDALRasterBand *poMaskBand = nullptr;
4444 754 : bool bUseNoDataMask = false;
4445 754 : bool bCanUseCascaded = true;
4446 :
4447 754 : if (!STARTS_WITH_CI(pszResampling, "NEAR"))
4448 : {
4449 : // Special case if we are an alpha/mask band. We want it to be
4450 : // considered as the mask band to avoid alpha=0 to be taken into account
4451 : // in average computation.
4452 505 : if (poSrcBand->IsMaskBand())
4453 : {
4454 90 : poMaskBand = poSrcBand;
4455 90 : bUseNoDataMask = true;
4456 : }
4457 : else
4458 : {
4459 415 : poMaskBand = poSrcBand->GetMaskBand();
4460 415 : const int nMaskFlags = poSrcBand->GetMaskFlags();
4461 415 : bCanUseCascaded =
4462 415 : (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
4463 415 : bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
4464 : }
4465 : }
4466 :
4467 : /* -------------------------------------------------------------------- */
4468 : /* If we are operating on multiple overviews, and using */
4469 : /* averaging, lets do them in cascading order to reduce the */
4470 : /* amount of computation. */
4471 : /* -------------------------------------------------------------------- */
4472 :
4473 : // In case the mask made be computed from another band of the dataset,
4474 : // we can't use cascaded generation, as the computation of the overviews
4475 : // of the band used for the mask band may not have yet occurred (#3033).
4476 754 : if ((STARTS_WITH_CI(pszResampling, "AVER") ||
4477 391 : EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
4478 360 : EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
4479 306 : EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
4480 754 : EQUAL(pszResampling, "MODE")) &&
4481 42 : nOverviewCount > 1 && bCanUseCascaded)
4482 42 : return GDALRegenerateCascadingOverviews(
4483 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
4484 42 : pProgressData, papszOptions);
4485 :
4486 : /* -------------------------------------------------------------------- */
4487 : /* Setup one horizontal swath to read from the raw buffer. */
4488 : /* -------------------------------------------------------------------- */
4489 712 : int nFRXBlockSize = 0;
4490 712 : int nFRYBlockSize = 0;
4491 712 : poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
4492 :
4493 712 : const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
4494 1175 : const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
4495 1129 : EQUAL(pszResampling, "MODE") ||
4496 417 : !GDALDataTypeIsComplex(eSrcDataType);
4497 : const GDALDataType eWrkDataType =
4498 : bUseGenericResampleFn
4499 712 : ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
4500 712 : : GDT_CFloat32;
4501 :
4502 712 : const int nWidth = poSrcBand->GetXSize();
4503 712 : const int nHeight = poSrcBand->GetYSize();
4504 :
4505 712 : int nMaxOvrFactor = 1;
4506 1505 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
4507 : {
4508 793 : const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
4509 793 : const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
4510 793 : nMaxOvrFactor = std::max(
4511 : nMaxOvrFactor,
4512 793 : static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
4513 793 : nMaxOvrFactor = std::max(
4514 : nMaxOvrFactor,
4515 793 : static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
4516 : }
4517 :
4518 712 : int nFullResYChunk = nFRYBlockSize;
4519 712 : int nMaxChunkYSizeQueried = 0;
4520 :
4521 : const auto UpdateChunkHeightAndGetChunkSize =
4522 9267 : [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
4523 27801 : eWrkDataType, nWidth]()
4524 : {
4525 : // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
4526 : // + nFullResYChunk) / nMaxOvrFactor)
4527 9267 : nFullResYChunk = std::max(nFullResYChunk, 2 * nMaxOvrFactor);
4528 9267 : nMaxChunkYSizeQueried =
4529 9267 : nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
4530 9267 : return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
4531 9267 : nMaxChunkYSizeQueried * nWidth;
4532 712 : };
4533 :
4534 : // Only configurable for debug / testing
4535 : const char *pszChunkYSize =
4536 712 : CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
4537 712 : if (pszChunkYSize)
4538 : {
4539 : // coverity[tainted_data]
4540 0 : nFullResYChunk = atoi(pszChunkYSize);
4541 : }
4542 :
4543 : // Only configurable for debug / testing
4544 : const int nChunkMaxSize =
4545 712 : atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
4546 :
4547 712 : auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
4548 712 : if (nChunkSize > nChunkMaxSize)
4549 : {
4550 3 : if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
4551 9 : !GDALDataTypeIsComplex(eSrcDataType) &&
4552 3 : (!STARTS_WITH_CI(pszResampling, "AVER") ||
4553 0 : EQUAL(pszResampling, "AVERAGE")))
4554 : {
4555 : // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
4556 : // which use a block based strategy, which is much less memory
4557 : // hungry.
4558 3 : return GDALRegenerateOverviewsMultiBand(
4559 : 1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
4560 3 : pfnProgress, pProgressData, papszOptions);
4561 : }
4562 0 : else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
4563 : {
4564 0 : return GDALRegenerateCascadingOverviews(
4565 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
4566 0 : pfnProgress, pProgressData, papszOptions);
4567 : }
4568 : }
4569 709 : else if (pszChunkYSize == nullptr)
4570 : {
4571 : // Try to get as close as possible to nChunkMaxSize
4572 9264 : while (nChunkSize * 2 < nChunkMaxSize)
4573 : {
4574 8555 : nFullResYChunk *= 2;
4575 8555 : nChunkSize = UpdateChunkHeightAndGetChunkSize();
4576 : }
4577 : }
4578 :
4579 709 : int nHasNoData = 0;
4580 709 : const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
4581 709 : const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
4582 : const bool bPropagateNoData =
4583 709 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
4584 :
4585 : // Structure describing a resampling job
4586 : struct OvrJob
4587 : {
4588 : // Buffers to free when job is finished
4589 : std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
4590 : std::shared_ptr<PointerHolder> oSrcBufferHolder{};
4591 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
4592 :
4593 : GDALRasterBand *poDstBand = nullptr;
4594 :
4595 : // Input parameters of pfnResampleFn
4596 : GDALResampleFunction pfnResampleFn = nullptr;
4597 : int nSrcWidth = 0;
4598 : int nSrcHeight = 0;
4599 : int nDstWidth = 0;
4600 : GDALOverviewResampleArgs args{};
4601 : const void *pChunk = nullptr;
4602 : bool bUseGenericResampleFn = false;
4603 :
4604 : // Output values of resampling function
4605 : CPLErr eErr = CE_Failure;
4606 : void *pDstBuffer = nullptr;
4607 : GDALDataType eDstBufferDataType = GDT_Unknown;
4608 :
4609 : // Synchronization
4610 : bool bFinished = false;
4611 : std::mutex mutex{};
4612 : std::condition_variable cv{};
4613 :
4614 0 : void SetSrcMaskBufferHolder(
4615 : const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
4616 : {
4617 0 : oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
4618 0 : }
4619 :
4620 0 : void SetSrcBufferHolder(
4621 : const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
4622 : {
4623 0 : oSrcBufferHolder = oSrcBufferHolderIn;
4624 0 : }
4625 : };
4626 :
4627 : // Thread function to resample
4628 791 : const auto JobResampleFunc = [](void *pData)
4629 : {
4630 791 : OvrJob *poJob = static_cast<OvrJob *>(pData);
4631 :
4632 791 : if (poJob->bUseGenericResampleFn)
4633 : {
4634 789 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
4635 : &(poJob->pDstBuffer),
4636 : &(poJob->eDstBufferDataType));
4637 : }
4638 : else
4639 : {
4640 2 : poJob->eErr = GDALResampleChunkC32R(
4641 : poJob->nSrcWidth, poJob->nSrcHeight,
4642 2 : static_cast<const float *>(poJob->pChunk),
4643 : poJob->args.nChunkYOff, poJob->args.nChunkYSize,
4644 : poJob->args.nDstYOff, poJob->args.nDstYOff2,
4645 : poJob->args.nOvrXSize, poJob->args.nOvrYSize,
4646 : &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
4647 : poJob->args.pszResampling);
4648 : }
4649 :
4650 : poJob->oDstBufferHolder =
4651 791 : std::make_unique<PointerHolder>(poJob->pDstBuffer);
4652 :
4653 : {
4654 1582 : std::lock_guard<std::mutex> guard(poJob->mutex);
4655 791 : poJob->bFinished = true;
4656 791 : poJob->cv.notify_one();
4657 : }
4658 791 : };
4659 :
4660 : // Function to write resample data to target band
4661 791 : const auto WriteJobData = [](const OvrJob *poJob)
4662 : {
4663 1582 : return poJob->poDstBand->RasterIO(
4664 791 : GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
4665 791 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
4666 791 : poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
4667 791 : poJob->eDstBufferDataType, 0, 0, nullptr);
4668 : };
4669 :
4670 : // Wait for completion of oldest job and serialize it
4671 : const auto WaitAndFinalizeOldestJob =
4672 0 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
4673 : {
4674 0 : auto poOldestJob = jobList.front().get();
4675 : {
4676 0 : std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
4677 : // coverity[missing_lock:FALSE]
4678 0 : while (!poOldestJob->bFinished)
4679 : {
4680 0 : poOldestJob->cv.wait(oGuard);
4681 : }
4682 : }
4683 0 : CPLErr l_eErr = poOldestJob->eErr;
4684 0 : if (l_eErr == CE_None)
4685 : {
4686 0 : l_eErr = WriteJobData(poOldestJob);
4687 : }
4688 :
4689 0 : jobList.pop_front();
4690 0 : return l_eErr;
4691 : };
4692 :
4693 : // Queue of jobs
4694 1418 : std::list<std::unique_ptr<OvrJob>> jobList;
4695 :
4696 709 : GByte *pabyChunkNodataMask = nullptr;
4697 709 : void *pChunk = nullptr;
4698 :
4699 709 : const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
4700 2836 : const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
4701 709 : ? CPLGetNumCPUs()
4702 709 : : atoi(pszThreads)));
4703 : auto poThreadPool =
4704 709 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
4705 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
4706 1418 : : std::unique_ptr<CPLJobQueue>(nullptr);
4707 :
4708 : /* -------------------------------------------------------------------- */
4709 : /* Loop over image operating on chunks. */
4710 : /* -------------------------------------------------------------------- */
4711 709 : int nChunkYOff = 0;
4712 709 : CPLErr eErr = CE_None;
4713 :
4714 1423 : for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
4715 714 : nChunkYOff += nFullResYChunk)
4716 : {
4717 714 : if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
4718 : pProgressData))
4719 : {
4720 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
4721 0 : eErr = CE_Failure;
4722 : }
4723 :
4724 714 : if (nFullResYChunk + nChunkYOff > nHeight)
4725 707 : nFullResYChunk = nHeight - nChunkYOff;
4726 :
4727 714 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
4728 714 : int nChunkYSizeQueried =
4729 714 : nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
4730 714 : if (nChunkYOffQueried < 0)
4731 : {
4732 62 : nChunkYSizeQueried += nChunkYOffQueried;
4733 62 : nChunkYOffQueried = 0;
4734 : }
4735 714 : if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
4736 62 : nChunkYSizeQueried = nHeight - nChunkYOffQueried;
4737 :
4738 : // Avoid accumulating too many tasks and exhaust RAM
4739 : // Try to complete already finished jobs
4740 714 : while (eErr == CE_None && !jobList.empty())
4741 : {
4742 0 : auto poOldestJob = jobList.front().get();
4743 : {
4744 0 : std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
4745 0 : if (!poOldestJob->bFinished)
4746 : {
4747 0 : break;
4748 : }
4749 : }
4750 0 : eErr = poOldestJob->eErr;
4751 0 : if (eErr == CE_None)
4752 : {
4753 0 : eErr = WriteJobData(poOldestJob);
4754 : }
4755 :
4756 0 : jobList.pop_front();
4757 : }
4758 :
4759 : // And in case we have saturated the number of threads,
4760 : // wait for completion of tasks to go below the threshold.
4761 1428 : while (eErr == CE_None &&
4762 714 : jobList.size() >= static_cast<size_t>(nThreads))
4763 : {
4764 0 : eErr = WaitAndFinalizeOldestJob(jobList);
4765 : }
4766 :
4767 : // (Re)allocate buffers if needed
4768 714 : if (pChunk == nullptr)
4769 : {
4770 709 : pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
4771 : nMaxChunkYSizeQueried, nWidth);
4772 : }
4773 714 : if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
4774 : {
4775 : pabyChunkNodataMask = static_cast<GByte *>(
4776 274 : VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
4777 : }
4778 :
4779 714 : if (pChunk == nullptr ||
4780 274 : (bUseNoDataMask && pabyChunkNodataMask == nullptr))
4781 : {
4782 0 : CPLFree(pChunk);
4783 0 : CPLFree(pabyChunkNodataMask);
4784 0 : return CE_Failure;
4785 : }
4786 :
4787 : // Read chunk.
4788 714 : if (eErr == CE_None)
4789 714 : eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
4790 : nChunkYSizeQueried, pChunk, nWidth,
4791 : nChunkYSizeQueried, eWrkDataType, 0, 0,
4792 : nullptr);
4793 714 : if (eErr == CE_None && bUseNoDataMask)
4794 274 : eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
4795 : nChunkYSizeQueried, pabyChunkNodataMask,
4796 : nWidth, nChunkYSizeQueried, GDT_Byte, 0,
4797 : 0, nullptr);
4798 :
4799 : // Special case to promote 1bit data to 8bit 0/255 values.
4800 714 : if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
4801 : {
4802 9 : if (eWrkDataType == GDT_Float32)
4803 : {
4804 0 : float *pafChunk = static_cast<float *>(pChunk);
4805 0 : for (GPtrDiff_t i = 0;
4806 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4807 : i++)
4808 : {
4809 0 : if (pafChunk[i] == 1.0)
4810 0 : pafChunk[i] = 255.0;
4811 : }
4812 : }
4813 9 : else if (eWrkDataType == GDT_Byte)
4814 : {
4815 9 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
4816 168417 : for (GPtrDiff_t i = 0;
4817 168417 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4818 : i++)
4819 : {
4820 168408 : if (pabyChunk[i] == 1)
4821 127437 : pabyChunk[i] = 255;
4822 : }
4823 : }
4824 0 : else if (eWrkDataType == GDT_UInt16)
4825 : {
4826 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
4827 0 : for (GPtrDiff_t i = 0;
4828 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4829 : i++)
4830 : {
4831 0 : if (pasChunk[i] == 1)
4832 0 : pasChunk[i] = 255;
4833 : }
4834 : }
4835 0 : else if (eWrkDataType == GDT_Float64)
4836 : {
4837 0 : double *padfChunk = static_cast<double *>(pChunk);
4838 0 : for (GPtrDiff_t i = 0;
4839 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4840 : i++)
4841 : {
4842 0 : if (padfChunk[i] == 1.0)
4843 0 : padfChunk[i] = 255.0;
4844 : }
4845 : }
4846 : else
4847 : {
4848 0 : CPLAssert(false);
4849 : }
4850 : }
4851 705 : else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
4852 : {
4853 0 : if (eWrkDataType == GDT_Float32)
4854 : {
4855 0 : float *pafChunk = static_cast<float *>(pChunk);
4856 0 : for (GPtrDiff_t i = 0;
4857 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4858 : i++)
4859 : {
4860 0 : if (pafChunk[i] == 1.0)
4861 0 : pafChunk[i] = 0.0;
4862 0 : else if (pafChunk[i] == 0.0)
4863 0 : pafChunk[i] = 255.0;
4864 : }
4865 : }
4866 0 : else if (eWrkDataType == GDT_Byte)
4867 : {
4868 0 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
4869 0 : for (GPtrDiff_t i = 0;
4870 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4871 : i++)
4872 : {
4873 0 : if (pabyChunk[i] == 1)
4874 0 : pabyChunk[i] = 0;
4875 0 : else if (pabyChunk[i] == 0)
4876 0 : pabyChunk[i] = 255;
4877 : }
4878 : }
4879 0 : else if (eWrkDataType == GDT_UInt16)
4880 : {
4881 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
4882 0 : for (GPtrDiff_t i = 0;
4883 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4884 : i++)
4885 : {
4886 0 : if (pasChunk[i] == 1)
4887 0 : pasChunk[i] = 0;
4888 0 : else if (pasChunk[i] == 0)
4889 0 : pasChunk[i] = 255;
4890 : }
4891 : }
4892 0 : else if (eWrkDataType == GDT_Float64)
4893 : {
4894 0 : double *padfChunk = static_cast<double *>(pChunk);
4895 0 : for (GPtrDiff_t i = 0;
4896 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4897 : i++)
4898 : {
4899 0 : if (padfChunk[i] == 1.0)
4900 0 : padfChunk[i] = 0.0;
4901 0 : else if (padfChunk[i] == 0.0)
4902 0 : padfChunk[i] = 255.0;
4903 : }
4904 : }
4905 : else
4906 : {
4907 0 : CPLAssert(false);
4908 : }
4909 : }
4910 :
4911 : auto oSrcBufferHolder =
4912 1428 : std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
4913 : auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
4914 1428 : poJobQueue ? pabyChunkNodataMask : nullptr);
4915 :
4916 1505 : for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
4917 : ++iOverview)
4918 : {
4919 791 : GDALRasterBand *poDstBand = papoOvrBands[iOverview];
4920 791 : const int nDstWidth = poDstBand->GetXSize();
4921 791 : const int nDstHeight = poDstBand->GetYSize();
4922 :
4923 791 : const double dfXRatioDstToSrc =
4924 791 : static_cast<double>(nWidth) / nDstWidth;
4925 791 : const double dfYRatioDstToSrc =
4926 791 : static_cast<double>(nHeight) / nDstHeight;
4927 :
4928 : /* --------------------------------------------------------------------
4929 : */
4930 : /* Figure out the line to start writing to, and the first line
4931 : */
4932 : /* to not write to. In theory this approach should ensure that
4933 : */
4934 : /* every output line will be written if all input chunks are */
4935 : /* processed. */
4936 : /* --------------------------------------------------------------------
4937 : */
4938 791 : int nDstYOff =
4939 791 : static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
4940 791 : if (nDstYOff == nDstHeight)
4941 0 : continue;
4942 791 : int nDstYOff2 = static_cast<int>(
4943 791 : 0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
4944 :
4945 791 : if (nChunkYOff + nFullResYChunk == nHeight)
4946 784 : nDstYOff2 = nDstHeight;
4947 : #if DEBUG_VERBOSE
4948 : CPLDebug("GDAL",
4949 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
4950 : nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
4951 : nDstWidth, nDstYOff2 - nDstYOff);
4952 : #endif
4953 :
4954 1582 : auto poJob = std::make_unique<OvrJob>();
4955 791 : poJob->pfnResampleFn = pfnResampleFn;
4956 791 : poJob->bUseGenericResampleFn = bUseGenericResampleFn;
4957 791 : poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
4958 791 : poJob->args.nOvrXSize = poDstBand->GetXSize();
4959 791 : poJob->args.nOvrYSize = poDstBand->GetYSize();
4960 : const char *pszNBITS =
4961 791 : poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
4962 791 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
4963 791 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
4964 791 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
4965 791 : poJob->args.eWrkDataType = eWrkDataType;
4966 791 : poJob->pChunk = pChunk;
4967 791 : poJob->args.pabyChunkNodataMask = pabyChunkNodataMask;
4968 791 : poJob->nSrcWidth = nWidth;
4969 791 : poJob->nSrcHeight = nHeight;
4970 791 : poJob->args.nChunkXOff = 0;
4971 791 : poJob->args.nChunkXSize = nWidth;
4972 791 : poJob->args.nChunkYOff = nChunkYOffQueried;
4973 791 : poJob->args.nChunkYSize = nChunkYSizeQueried;
4974 791 : poJob->nDstWidth = nDstWidth;
4975 791 : poJob->args.nDstXOff = 0;
4976 791 : poJob->args.nDstXOff2 = nDstWidth;
4977 791 : poJob->args.nDstYOff = nDstYOff;
4978 791 : poJob->args.nDstYOff2 = nDstYOff2;
4979 791 : poJob->poDstBand = poDstBand;
4980 791 : poJob->args.pszResampling = pszResampling;
4981 791 : poJob->args.bHasNoData = bHasNoData;
4982 791 : poJob->args.dfNoDataValue = dfNoDataValue;
4983 791 : poJob->args.poColorTable = poColorTable;
4984 791 : poJob->args.eSrcDataType = eSrcDataType;
4985 791 : poJob->args.bPropagateNoData = bPropagateNoData;
4986 :
4987 791 : if (poJobQueue)
4988 : {
4989 0 : poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
4990 0 : poJob->SetSrcBufferHolder(oSrcBufferHolder);
4991 0 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
4992 0 : jobList.emplace_back(std::move(poJob));
4993 : }
4994 : else
4995 : {
4996 791 : JobResampleFunc(poJob.get());
4997 791 : eErr = poJob->eErr;
4998 791 : if (eErr == CE_None)
4999 : {
5000 791 : eErr = WriteJobData(poJob.get());
5001 : }
5002 : }
5003 : }
5004 :
5005 714 : if (poJobQueue)
5006 : {
5007 0 : pChunk = nullptr;
5008 0 : pabyChunkNodataMask = nullptr;
5009 : }
5010 : }
5011 :
5012 709 : VSIFree(pChunk);
5013 709 : VSIFree(pabyChunkNodataMask);
5014 :
5015 : // Wait for all pending jobs to complete
5016 709 : while (!jobList.empty())
5017 : {
5018 0 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5019 0 : if (l_eErr != CE_None && eErr == CE_None)
5020 0 : eErr = l_eErr;
5021 : }
5022 :
5023 : /* -------------------------------------------------------------------- */
5024 : /* Renormalized overview mean / stddev if needed. */
5025 : /* -------------------------------------------------------------------- */
5026 709 : if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
5027 : {
5028 0 : GDALOverviewMagnitudeCorrection(
5029 : poSrcBand, nOverviewCount,
5030 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
5031 : GDALDummyProgress, nullptr);
5032 : }
5033 :
5034 : /* -------------------------------------------------------------------- */
5035 : /* It can be important to flush out data to overviews. */
5036 : /* -------------------------------------------------------------------- */
5037 1493 : for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
5038 : ++iOverview)
5039 : {
5040 784 : eErr = papoOvrBands[iOverview]->FlushCache(false);
5041 : }
5042 :
5043 709 : if (eErr == CE_None)
5044 709 : pfnProgress(1.0, nullptr, pProgressData);
5045 :
5046 709 : return eErr;
5047 : }
5048 :
5049 : /************************************************************************/
5050 : /* GDALRegenerateOverviewsMultiBand() */
5051 : /************************************************************************/
5052 :
5053 : /**
5054 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5055 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5056 : *
5057 : * This function will generate one or more overview images from a base
5058 : * image using the requested downsampling algorithm. Its primary use
5059 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
5060 : * can also be used to generate downsampled images in one file from another
5061 : * outside the overview architecture.
5062 : *
5063 : * The output bands need to exist in advance and share the same characteristics
5064 : * (type, dimensions)
5065 : *
5066 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5067 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
5068 : *
5069 : * It does not support color tables or complex data types.
5070 : *
5071 : * The pseudo-algorithm used by the function is :
5072 : * for each overview
5073 : * iterate on lines of the source by a step of deltay
5074 : * iterate on columns of the source by a step of deltax
5075 : * read the source data of size deltax * deltay for all the bands
5076 : * generate the corresponding overview block for all the bands
5077 : *
5078 : * This function will honour properly NODATA_VALUES tuples (special dataset
5079 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5080 : * considered as the nodata value and not each value of the triplet
5081 : * independently per band.
5082 : *
5083 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
5084 : * to "ALL_CPUS" or a integer value to specify the number of threads to use for
5085 : * overview computation.
5086 : *
5087 : * @param nBands the number of bands, size of papoSrcBands and size of
5088 : * first dimension of papapoOverviewBands
5089 : * @param papoSrcBands the list of source bands to downsample
5090 : * @param nOverviews the number of downsampled overview levels being generated.
5091 : * @param papapoOverviewBands bidimension array of bands. First dimension is
5092 : * indexed by nBands. Second dimension is indexed by
5093 : * nOverviews.
5094 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5095 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
5096 : * @param pfnProgress progress report function.
5097 : * @param pProgressData progress function callback data.
5098 : * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
5099 : * key=value pairs, or NULL
5100 : * Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
5101 : * options can be specified to express that overviews should
5102 : * be regenerated only in the specified subset of the source
5103 : * dataset.
5104 : * @return CE_None on success or CE_Failure on failure.
5105 : */
5106 :
5107 374 : CPLErr GDALRegenerateOverviewsMultiBand(
5108 : int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
5109 : GDALRasterBand *const *const *papapoOverviewBands,
5110 : const char *pszResampling, GDALProgressFunc pfnProgress,
5111 : void *pProgressData, CSLConstList papszOptions)
5112 : {
5113 374 : CPL_IGNORE_RET_VAL(papszOptions);
5114 :
5115 374 : if (pfnProgress == nullptr)
5116 6 : pfnProgress = GDALDummyProgress;
5117 :
5118 374 : if (EQUAL(pszResampling, "NONE"))
5119 2 : return CE_None;
5120 :
5121 : // Sanity checks.
5122 372 : if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
5123 177 : !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
5124 76 : !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
5125 18 : !EQUAL(pszResampling, "CUBICSPLINE") &&
5126 17 : !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
5127 5 : !EQUAL(pszResampling, "MODE"))
5128 : {
5129 0 : CPLError(CE_Failure, CPLE_NotSupported,
5130 : "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
5131 : "not supported",
5132 : pszResampling);
5133 0 : return CE_Failure;
5134 : }
5135 :
5136 372 : int nKernelRadius = 0;
5137 : GDALResampleFunction pfnResampleFn =
5138 372 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
5139 372 : if (pfnResampleFn == nullptr)
5140 0 : return CE_Failure;
5141 :
5142 372 : const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
5143 372 : const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
5144 372 : if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
5145 0 : return CE_None;
5146 372 : GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
5147 688 : for (int iBand = 1; iBand < nBands; ++iBand)
5148 : {
5149 632 : if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
5150 316 : papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
5151 : {
5152 0 : CPLError(
5153 : CE_Failure, CPLE_NotSupported,
5154 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5155 : "have the same dimensions");
5156 0 : return CE_Failure;
5157 : }
5158 316 : if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
5159 : {
5160 0 : CPLError(
5161 : CE_Failure, CPLE_NotSupported,
5162 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5163 : "have the same data type");
5164 0 : return CE_Failure;
5165 : }
5166 : }
5167 :
5168 988 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5169 : {
5170 616 : const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
5171 616 : const int nDstWidth = poOvrFirstBand->GetXSize();
5172 616 : const int nDstHeight = poOvrFirstBand->GetYSize();
5173 1210 : for (int iBand = 1; iBand < nBands; ++iBand)
5174 : {
5175 594 : const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
5176 1188 : if (poOvrBand->GetXSize() != nDstWidth ||
5177 594 : poOvrBand->GetYSize() != nDstHeight)
5178 : {
5179 0 : CPLError(
5180 : CE_Failure, CPLE_NotSupported,
5181 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5182 : "of the same level must have the same dimensions");
5183 0 : return CE_Failure;
5184 : }
5185 594 : if (poOvrBand->GetRasterDataType() != eDataType)
5186 : {
5187 0 : CPLError(
5188 : CE_Failure, CPLE_NotSupported,
5189 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5190 : "must have the same data type as the source bands");
5191 0 : return CE_Failure;
5192 : }
5193 : }
5194 : }
5195 :
5196 : // First pass to compute the total number of pixels to write.
5197 372 : double dfTotalPixelCount = 0;
5198 372 : const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
5199 372 : const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
5200 372 : const int nSrcXSize = atoi(CSLFetchNameValueDef(
5201 : papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
5202 372 : const int nSrcYSize = atoi(CSLFetchNameValueDef(
5203 : papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
5204 988 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5205 : {
5206 616 : dfTotalPixelCount +=
5207 1232 : static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
5208 616 : papapoOverviewBands[0][iOverview]->GetXSize() *
5209 1232 : static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
5210 616 : papapoOverviewBands[0][iOverview]->GetYSize();
5211 : }
5212 :
5213 : const GDALDataType eWrkDataType =
5214 372 : GDALGetOvrWorkDataType(pszResampling, eDataType);
5215 372 : const int nWrkDataTypeSize = GDALGetDataTypeSizeBytes(eWrkDataType);
5216 :
5217 372 : const bool bIsMask = papoSrcBands[0]->IsMaskBand();
5218 :
5219 : // If we have a nodata mask and we are doing something more complicated
5220 : // than nearest neighbouring, we have to fetch to nodata mask.
5221 : const bool bUseNoDataMask =
5222 541 : !STARTS_WITH_CI(pszResampling, "NEAR") &&
5223 169 : (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
5224 :
5225 : bool *const pabHasNoData =
5226 372 : static_cast<bool *>(VSI_MALLOC_VERBOSE(nBands * sizeof(bool)));
5227 : double *const padfNoDataValue =
5228 372 : static_cast<double *>(VSI_MALLOC_VERBOSE(nBands * sizeof(double)));
5229 372 : if (pabHasNoData == nullptr || padfNoDataValue == nullptr)
5230 : {
5231 0 : CPLFree(pabHasNoData);
5232 0 : CPLFree(padfNoDataValue);
5233 0 : return CE_Failure;
5234 : }
5235 :
5236 1060 : for (int iBand = 0; iBand < nBands; ++iBand)
5237 : {
5238 688 : int nHasNoData = 0;
5239 1376 : padfNoDataValue[iBand] =
5240 688 : papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
5241 688 : pabHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
5242 : }
5243 : const bool bPropagateNoData =
5244 372 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5245 :
5246 372 : const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
5247 1488 : const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
5248 372 : ? CPLGetNumCPUs()
5249 372 : : atoi(pszThreads)));
5250 : auto poThreadPool =
5251 372 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5252 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5253 372 : : std::unique_ptr<CPLJobQueue>(nullptr);
5254 :
5255 : // Only configurable for debug / testing
5256 : const int nChunkMaxSize = std::max(
5257 372 : 100, atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760")));
5258 :
5259 : // Second pass to do the real job.
5260 372 : double dfCurPixelCount = 0;
5261 372 : CPLErr eErr = CE_None;
5262 987 : for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
5263 : ++iOverview)
5264 : {
5265 615 : int iSrcOverview = -1; // -1 means the source bands.
5266 :
5267 : const int nDstTotalWidth =
5268 615 : papapoOverviewBands[0][iOverview]->GetXSize();
5269 : const int nDstTotalHeight =
5270 615 : papapoOverviewBands[0][iOverview]->GetYSize();
5271 :
5272 : // Compute the coordinates of the target region to refresh
5273 615 : constexpr double EPS = 1e-8;
5274 615 : const int nDstXOffStart = static_cast<int>(
5275 615 : static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
5276 : EPS);
5277 : const int nDstXOffEnd =
5278 1230 : std::min(static_cast<int>(
5279 615 : std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
5280 615 : nToplevelSrcWidth * nDstTotalWidth -
5281 : EPS)),
5282 615 : nDstTotalWidth);
5283 615 : const int nDstWidth = nDstXOffEnd - nDstXOffStart;
5284 615 : const int nDstYOffStart =
5285 615 : static_cast<int>(static_cast<double>(nSrcYOff) /
5286 615 : nToplevelSrcHeight * nDstTotalHeight +
5287 : EPS);
5288 : const int nDstYOffEnd =
5289 1230 : std::min(static_cast<int>(
5290 615 : std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
5291 615 : nToplevelSrcHeight * nDstTotalHeight -
5292 : EPS)),
5293 615 : nDstTotalHeight);
5294 :
5295 : // Try to use previous level of overview as the source to compute
5296 : // the next level.
5297 615 : int nSrcWidth = nToplevelSrcWidth;
5298 615 : int nSrcHeight = nToplevelSrcHeight;
5299 858 : if (iOverview > 0 &&
5300 243 : papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
5301 : {
5302 235 : nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
5303 235 : nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
5304 235 : iSrcOverview = iOverview - 1;
5305 : }
5306 :
5307 615 : const double dfXRatioDstToSrc =
5308 615 : static_cast<double>(nSrcWidth) / nDstTotalWidth;
5309 615 : const double dfYRatioDstToSrc =
5310 615 : static_cast<double>(nSrcHeight) / nDstTotalHeight;
5311 :
5312 1230 : int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
5313 615 : static_cast<int>(0.5 + dfYRatioDstToSrc));
5314 615 : if (nOvrFactor == 0)
5315 0 : nOvrFactor = 1;
5316 :
5317 615 : int nDstChunkXSize = 0;
5318 615 : int nDstChunkYSize = 0;
5319 615 : papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
5320 : &nDstChunkYSize);
5321 :
5322 : const char *pszDST_CHUNK_X_SIZE =
5323 615 : CSLFetchNameValue(papszOptions, "DST_CHUNK_X_SIZE");
5324 : const char *pszDST_CHUNK_Y_SIZE =
5325 615 : CSLFetchNameValue(papszOptions, "DST_CHUNK_Y_SIZE");
5326 615 : if (pszDST_CHUNK_X_SIZE && pszDST_CHUNK_Y_SIZE)
5327 : {
5328 12 : nDstChunkXSize = std::max(1, atoi(pszDST_CHUNK_X_SIZE));
5329 12 : nDstChunkYSize = std::max(1, atoi(pszDST_CHUNK_Y_SIZE));
5330 12 : CPLDebug("GDAL", "Using dst chunk size %d x %d", nDstChunkXSize,
5331 : nDstChunkYSize);
5332 : }
5333 :
5334 : // Try to extend the chunk size so that the memory needed to acquire
5335 : // source pixels goes up to 10 MB.
5336 : // This can help for drivers that support multi-threaded reading
5337 615 : const int nFullResYChunk =
5338 615 : 2 + static_cast<int>(nDstChunkYSize * dfYRatioDstToSrc);
5339 615 : const int nFullResYChunkQueried =
5340 615 : nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
5341 857 : while (nDstChunkXSize < nDstWidth)
5342 : {
5343 259 : const int nFullResXChunk =
5344 259 : 2 + static_cast<int>(2 * nDstChunkXSize * dfXRatioDstToSrc);
5345 :
5346 259 : const int nFullResXChunkQueried =
5347 259 : nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
5348 :
5349 259 : if (static_cast<GIntBig>(nFullResXChunkQueried) *
5350 259 : nFullResYChunkQueried * nBands * nWrkDataTypeSize >
5351 259 : nChunkMaxSize)
5352 : {
5353 17 : break;
5354 : }
5355 :
5356 242 : nDstChunkXSize *= 2;
5357 : }
5358 615 : nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
5359 :
5360 615 : const int nFullResXChunk =
5361 615 : 2 + static_cast<int>(nDstChunkXSize * dfXRatioDstToSrc);
5362 615 : const int nFullResXChunkQueried =
5363 615 : nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
5364 :
5365 : // Make sure that the RAM requirements to acquire the source data does
5366 : // not exceed nChunkMaxSize
5367 : // If so, reduce the destination chunk size, generate overviews in a
5368 : // temporary dataset, and copy that temporary dataset over the target
5369 : // overview bands (to avoid issues with lossy compression)
5370 615 : const auto nMemRequirement =
5371 615 : static_cast<GIntBig>(nFullResXChunkQueried) *
5372 615 : nFullResYChunkQueried * nBands * nWrkDataTypeSize;
5373 615 : if (nMemRequirement > nChunkMaxSize &&
5374 10 : !(pszDST_CHUNK_X_SIZE && pszDST_CHUNK_Y_SIZE))
5375 : {
5376 : // Compute a smaller destination chunk size
5377 12 : const auto nOverShootFactor = nMemRequirement / nChunkMaxSize;
5378 : const auto nSqrtOverShootFactor = std::max<GIntBig>(
5379 24 : 4, static_cast<GIntBig>(std::ceil(
5380 12 : std::sqrt(static_cast<double>(nOverShootFactor)))));
5381 : const int nReducedDstChunkXSize = std::max(
5382 12 : 1, static_cast<int>(nDstChunkXSize / nSqrtOverShootFactor));
5383 : const int nReducedDstChunkYSize = std::max(
5384 12 : 1, static_cast<int>(nDstChunkYSize / nSqrtOverShootFactor));
5385 12 : if (nReducedDstChunkXSize < nDstChunkXSize ||
5386 0 : nReducedDstChunkYSize < nDstChunkYSize)
5387 : {
5388 12 : CPLStringList aosOptions(papszOptions);
5389 : aosOptions.SetNameValue(
5390 : "DST_CHUNK_X_SIZE",
5391 12 : CPLSPrintf("%d", nReducedDstChunkXSize));
5392 : aosOptions.SetNameValue(
5393 : "DST_CHUNK_Y_SIZE",
5394 12 : CPLSPrintf("%d", nReducedDstChunkYSize));
5395 :
5396 : const auto nTmpDSMemRequirement =
5397 12 : static_cast<GIntBig>(nDstTotalWidth) * nDstTotalHeight *
5398 12 : nBands * GDALGetDataTypeSizeBytes(eDataType);
5399 0 : std::unique_ptr<GDALDataset> poTmpDS;
5400 : // Config option mostly/only for autotest purposes
5401 : const char *pszGDAL_OVR_TEMP_DRIVER =
5402 12 : CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
5403 12 : if ((nTmpDSMemRequirement <= nChunkMaxSize &&
5404 2 : !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
5405 10 : EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
5406 : {
5407 : auto poTmpDrv =
5408 11 : GetGDALDriverManager()->GetDriverByName("MEM");
5409 11 : if (!poTmpDrv)
5410 : {
5411 0 : eErr = CE_Failure;
5412 0 : break;
5413 : }
5414 11 : poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
5415 : nDstTotalHeight, nBands,
5416 11 : eDataType, nullptr));
5417 : }
5418 : else
5419 : {
5420 : auto poTmpDrv =
5421 1 : GetGDALDriverManager()->GetDriverByName("GTiff");
5422 1 : if (!poTmpDrv)
5423 : {
5424 0 : eErr = CE_Failure;
5425 0 : break;
5426 : }
5427 2 : std::string osTmpFilename;
5428 1 : auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
5429 1 : if (poDstDS)
5430 : {
5431 1 : osTmpFilename = poDstDS->GetDescription();
5432 : VSIStatBufL sStatBuf;
5433 1 : if (!osTmpFilename.empty() &&
5434 0 : VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
5435 0 : osTmpFilename += "_tmp_ovr.tif";
5436 : }
5437 1 : if (osTmpFilename.empty())
5438 : {
5439 1 : osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
5440 1 : osTmpFilename += ".tif";
5441 : }
5442 1 : CPLDebug("GDAL",
5443 : "Creating temporary file %s of %d x %d x %d",
5444 : osTmpFilename.c_str(), nDstTotalWidth,
5445 : nDstTotalHeight, nBands);
5446 2 : CPLStringList aosCO;
5447 1 : poTmpDS.reset(poTmpDrv->Create(
5448 : osTmpFilename.c_str(), nDstTotalWidth, nDstTotalHeight,
5449 1 : nBands, eDataType, aosCO.List()));
5450 1 : if (poTmpDS)
5451 : {
5452 1 : poTmpDS->MarkSuppressOnClose();
5453 1 : VSIUnlink(osTmpFilename.c_str());
5454 : }
5455 : }
5456 12 : if (!poTmpDS)
5457 : {
5458 0 : eErr = CE_Failure;
5459 0 : break;
5460 : }
5461 :
5462 12 : std::vector<GDALRasterBand **> apapoOverviewBands(nBands);
5463 27 : for (int i = 0; i < nBands; ++i)
5464 : {
5465 30 : apapoOverviewBands[i] = static_cast<GDALRasterBand **>(
5466 15 : CPLMalloc(sizeof(GDALRasterBand *)));
5467 15 : apapoOverviewBands[i][0] = poTmpDS->GetRasterBand(i + 1);
5468 : }
5469 :
5470 : const double dfExtraPixels =
5471 24 : static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
5472 12 : papapoOverviewBands[0][iOverview]->GetXSize() *
5473 24 : static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
5474 12 : papapoOverviewBands[0][iOverview]->GetYSize();
5475 :
5476 24 : void *pScaledProgressData = GDALCreateScaledProgress(
5477 : dfCurPixelCount / dfTotalPixelCount,
5478 12 : (dfCurPixelCount + dfExtraPixels) / dfTotalPixelCount,
5479 : pfnProgress, pProgressData);
5480 :
5481 : // Generate overviews in temporary dataset
5482 12 : eErr = GDALRegenerateOverviewsMultiBand(
5483 12 : nBands, papoSrcBands, 1, apapoOverviewBands.data(),
5484 : pszResampling, GDALScaledProgress, pScaledProgressData,
5485 12 : aosOptions.List());
5486 :
5487 12 : GDALDestroyScaledProgress(pScaledProgressData);
5488 :
5489 12 : dfCurPixelCount += dfExtraPixels;
5490 :
5491 27 : for (int i = 0; i < nBands; ++i)
5492 : {
5493 15 : CPLFree(apapoOverviewBands[i]);
5494 : }
5495 :
5496 : // Copy temporary dataset to destination overview bands
5497 :
5498 12 : if (eErr == CE_None)
5499 : {
5500 : // Check if all papapoOverviewBands[][iOverview] bands point
5501 : // to the same dataset. If so, we can use
5502 : // GDALDatasetCopyWholeRaster()
5503 : GDALDataset *poDstOvrBandDS =
5504 12 : papapoOverviewBands[0][iOverview]->GetDataset();
5505 12 : if (poDstOvrBandDS)
5506 : {
5507 15 : if (poDstOvrBandDS->GetRasterCount() != nBands ||
5508 3 : poDstOvrBandDS->GetRasterBand(1) !=
5509 3 : papapoOverviewBands[0][iOverview])
5510 : {
5511 9 : poDstOvrBandDS = nullptr;
5512 : }
5513 : else
5514 : {
5515 6 : for (int i = 1; poDstOvrBandDS && i < nBands; ++i)
5516 : {
5517 : GDALDataset *poThisDstOvrBandDS =
5518 3 : papapoOverviewBands[i][iOverview]
5519 3 : ->GetDataset();
5520 3 : if (poThisDstOvrBandDS == nullptr ||
5521 6 : poThisDstOvrBandDS != poDstOvrBandDS ||
5522 3 : poThisDstOvrBandDS->GetRasterBand(i + 1) !=
5523 3 : papapoOverviewBands[i][iOverview])
5524 : {
5525 0 : poDstOvrBandDS = nullptr;
5526 : }
5527 : }
5528 : }
5529 : }
5530 12 : if (poDstOvrBandDS)
5531 : {
5532 3 : eErr = GDALDatasetCopyWholeRaster(
5533 : GDALDataset::ToHandle(poTmpDS.get()),
5534 : GDALDataset::ToHandle(poDstOvrBandDS), nullptr,
5535 : nullptr, nullptr);
5536 : }
5537 : else
5538 : {
5539 18 : for (int i = 0; eErr == CE_None && i < nBands; ++i)
5540 : {
5541 9 : eErr = GDALRasterBandCopyWholeRaster(
5542 : GDALRasterBand::ToHandle(
5543 : poTmpDS->GetRasterBand(i + 1)),
5544 : GDALRasterBand::ToHandle(
5545 9 : papapoOverviewBands[i][iOverview]),
5546 : nullptr, nullptr, nullptr);
5547 : }
5548 : }
5549 : }
5550 :
5551 12 : if (eErr != CE_None)
5552 0 : break;
5553 :
5554 12 : continue;
5555 : }
5556 : }
5557 :
5558 : // Structure describing a resampling job
5559 : struct OvrJob
5560 : {
5561 : // Buffers to free when job is finished
5562 : std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
5563 : std::unique_ptr<PointerHolder> oSrcBufferHolder{};
5564 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
5565 :
5566 : GDALRasterBand *poDstBand = nullptr;
5567 :
5568 : // Input parameters of pfnResampleFn
5569 : GDALResampleFunction pfnResampleFn = nullptr;
5570 : GDALOverviewResampleArgs args{};
5571 : const void *pChunk = nullptr;
5572 :
5573 : // Output values of resampling function
5574 : CPLErr eErr = CE_Failure;
5575 : void *pDstBuffer = nullptr;
5576 : GDALDataType eDstBufferDataType = GDT_Unknown;
5577 :
5578 : // Synchronization
5579 : bool bFinished = false;
5580 : std::mutex mutex{};
5581 : std::condition_variable cv{};
5582 : };
5583 :
5584 : // Thread function to resample
5585 16316 : const auto JobResampleFunc = [](void *pData)
5586 : {
5587 16316 : OvrJob *poJob = static_cast<OvrJob *>(pData);
5588 :
5589 16316 : poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
5590 : &(poJob->pDstBuffer),
5591 : &(poJob->eDstBufferDataType));
5592 :
5593 16316 : poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
5594 :
5595 : {
5596 32632 : std::lock_guard<std::mutex> guard(poJob->mutex);
5597 16316 : poJob->bFinished = true;
5598 16316 : poJob->cv.notify_one();
5599 : }
5600 16316 : };
5601 :
5602 : // Function to write resample data to target band
5603 16316 : const auto WriteJobData = [](const OvrJob *poJob)
5604 : {
5605 32632 : return poJob->poDstBand->RasterIO(
5606 16316 : GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
5607 16316 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
5608 16316 : poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
5609 16316 : poJob->args.nDstXOff2 - poJob->args.nDstXOff,
5610 16316 : poJob->args.nDstYOff2 - poJob->args.nDstYOff,
5611 16316 : poJob->eDstBufferDataType, 0, 0, nullptr);
5612 : };
5613 :
5614 : // Wait for completion of oldest job and serialize it
5615 : const auto WaitAndFinalizeOldestJob =
5616 38 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
5617 : {
5618 38 : auto poOldestJob = jobList.front().get();
5619 : {
5620 76 : std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
5621 : // coverity[missing_lock:FALSE]
5622 52 : while (!poOldestJob->bFinished)
5623 : {
5624 14 : poOldestJob->cv.wait(oGuard);
5625 : }
5626 : }
5627 38 : CPLErr l_eErr = poOldestJob->eErr;
5628 38 : if (l_eErr == CE_None)
5629 : {
5630 38 : l_eErr = WriteJobData(poOldestJob);
5631 : }
5632 :
5633 38 : jobList.pop_front();
5634 38 : return l_eErr;
5635 : };
5636 :
5637 : // Queue of jobs
5638 1206 : std::list<std::unique_ptr<OvrJob>> jobList;
5639 :
5640 1206 : std::vector<void *> apaChunk(nBands);
5641 1206 : std::vector<GByte *> apabyChunkNoDataMask(nBands);
5642 :
5643 : // Iterate on destination overview, block by block.
5644 603 : for (int nDstYOff = nDstYOffStart;
5645 2278 : nDstYOff < nDstYOffEnd && eErr == CE_None;
5646 1675 : nDstYOff += nDstChunkYSize)
5647 : {
5648 : int nDstYCount;
5649 1675 : if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
5650 1257 : nDstYCount = nDstChunkYSize;
5651 : else
5652 418 : nDstYCount = nDstYOffEnd - nDstYOff;
5653 :
5654 1675 : int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
5655 1675 : int nChunkYOff2 = static_cast<int>(
5656 1675 : ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
5657 1675 : if (nChunkYOff2 > nSrcHeight ||
5658 1675 : nDstYOff + nDstYCount == nDstTotalHeight)
5659 600 : nChunkYOff2 = nSrcHeight;
5660 1675 : int nYCount = nChunkYOff2 - nChunkYOff;
5661 1675 : CPLAssert(nYCount <= nFullResYChunk);
5662 :
5663 1675 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
5664 1675 : int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
5665 1675 : if (nChunkYOffQueried < 0)
5666 : {
5667 140 : nChunkYSizeQueried += nChunkYOffQueried;
5668 140 : nChunkYOffQueried = 0;
5669 : }
5670 1675 : if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
5671 139 : nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
5672 1675 : CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
5673 :
5674 1675 : if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount, nullptr,
5675 : pProgressData))
5676 : {
5677 1 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5678 1 : eErr = CE_Failure;
5679 : }
5680 :
5681 : // Iterate on destination overview, block by block.
5682 1675 : for (int nDstXOff = nDstXOffStart;
5683 10129 : nDstXOff < nDstXOffEnd && eErr == CE_None;
5684 8454 : nDstXOff += nDstChunkXSize)
5685 : {
5686 8454 : int nDstXCount = 0;
5687 8454 : if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
5688 8257 : nDstXCount = nDstChunkXSize;
5689 : else
5690 197 : nDstXCount = nDstXOffEnd - nDstXOff;
5691 :
5692 8454 : dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
5693 :
5694 8454 : int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
5695 8454 : int nChunkXOff2 = static_cast<int>(
5696 8454 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
5697 8454 : if (nChunkXOff2 > nSrcWidth ||
5698 8454 : nDstXOff + nDstXCount == nDstTotalWidth)
5699 1673 : nChunkXOff2 = nSrcWidth;
5700 8454 : const int nXCount = nChunkXOff2 - nChunkXOff;
5701 8454 : CPLAssert(nXCount <= nFullResXChunk);
5702 :
5703 8454 : int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
5704 8454 : int nChunkXSizeQueried =
5705 8454 : nXCount + 2 * nKernelRadius * nOvrFactor;
5706 8454 : if (nChunkXOffQueried < 0)
5707 : {
5708 200 : nChunkXSizeQueried += nChunkXOffQueried;
5709 200 : nChunkXOffQueried = 0;
5710 : }
5711 8454 : if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
5712 203 : nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
5713 8454 : CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
5714 : #if DEBUG_VERBOSE
5715 : CPLDebug("GDAL",
5716 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
5717 : nChunkXOffQueried, nChunkYOffQueried,
5718 : nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
5719 : nDstYOff, nDstXCount, nDstYCount);
5720 : #endif
5721 :
5722 : // Avoid accumulating too many tasks and exhaust RAM
5723 :
5724 : // Try to complete already finished jobs
5725 16528 : while (eErr == CE_None && !jobList.empty())
5726 : {
5727 8133 : auto poOldestJob = jobList.front().get();
5728 : {
5729 8133 : std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
5730 8133 : if (!poOldestJob->bFinished)
5731 : {
5732 59 : break;
5733 : }
5734 : }
5735 8074 : eErr = poOldestJob->eErr;
5736 8074 : if (eErr == CE_None)
5737 : {
5738 8074 : eErr = WriteJobData(poOldestJob);
5739 : }
5740 :
5741 8074 : jobList.pop_front();
5742 : }
5743 :
5744 : // And in case we have saturated the number of threads,
5745 : // wait for completion of tasks to go below the threshold.
5746 16952 : while (eErr == CE_None &&
5747 8476 : jobList.size() >= static_cast<size_t>(nThreads))
5748 : {
5749 22 : eErr = WaitAndFinalizeOldestJob(jobList);
5750 : }
5751 :
5752 : // (Re)allocate buffers if needed
5753 24771 : for (int iBand = 0; iBand < nBands; ++iBand)
5754 : {
5755 16317 : if (apaChunk[iBand] == nullptr)
5756 : {
5757 9292 : apaChunk[iBand] = VSI_MALLOC3_VERBOSE(
5758 : nFullResXChunkQueried, nFullResYChunkQueried,
5759 : nWrkDataTypeSize);
5760 9292 : if (apaChunk[iBand] == nullptr)
5761 : {
5762 0 : eErr = CE_Failure;
5763 : }
5764 : }
5765 24754 : if (bUseNoDataMask &&
5766 8437 : apabyChunkNoDataMask[iBand] == nullptr)
5767 : {
5768 16756 : apabyChunkNoDataMask[iBand] =
5769 8378 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
5770 : nFullResXChunkQueried, nFullResYChunkQueried));
5771 8378 : if (apabyChunkNoDataMask[iBand] == nullptr)
5772 : {
5773 0 : eErr = CE_Failure;
5774 : }
5775 : }
5776 : }
5777 :
5778 : // Read the source buffers for all the bands.
5779 24771 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
5780 : {
5781 16317 : GDALRasterBand *poSrcBand = nullptr;
5782 16317 : if (iSrcOverview == -1)
5783 15405 : poSrcBand = papoSrcBands[iBand];
5784 : else
5785 912 : poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
5786 16317 : eErr = poSrcBand->RasterIO(
5787 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
5788 16317 : nChunkXSizeQueried, nChunkYSizeQueried, apaChunk[iBand],
5789 : nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType, 0,
5790 : 0, nullptr);
5791 :
5792 16317 : if (bUseNoDataMask && eErr == CE_None)
5793 : {
5794 8437 : auto poMaskBand = poSrcBand->IsMaskBand()
5795 8437 : ? poSrcBand
5796 6334 : : poSrcBand->GetMaskBand();
5797 8437 : eErr = poMaskBand->RasterIO(
5798 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
5799 : nChunkXSizeQueried, nChunkYSizeQueried,
5800 8437 : apabyChunkNoDataMask[iBand], nChunkXSizeQueried,
5801 : nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
5802 : }
5803 : }
5804 :
5805 : // Compute the resulting overview block.
5806 24770 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
5807 : {
5808 32632 : auto poJob = std::make_unique<OvrJob>();
5809 16316 : poJob->pfnResampleFn = pfnResampleFn;
5810 16316 : poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
5811 32632 : poJob->args.eOvrDataType =
5812 16316 : poJob->poDstBand->GetRasterDataType();
5813 16316 : poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
5814 16316 : poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
5815 16316 : const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
5816 16316 : "NBITS", "IMAGE_STRUCTURE");
5817 16316 : poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
5818 16316 : poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
5819 16316 : poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
5820 16316 : poJob->args.eWrkDataType = eWrkDataType;
5821 16316 : poJob->pChunk = apaChunk[iBand];
5822 16316 : poJob->args.pabyChunkNodataMask =
5823 16316 : apabyChunkNoDataMask[iBand];
5824 16316 : poJob->args.nChunkXOff = nChunkXOffQueried;
5825 16316 : poJob->args.nChunkXSize = nChunkXSizeQueried;
5826 16316 : poJob->args.nChunkYOff = nChunkYOffQueried;
5827 16316 : poJob->args.nChunkYSize = nChunkYSizeQueried;
5828 16316 : poJob->args.nDstXOff = nDstXOff;
5829 16316 : poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
5830 16316 : poJob->args.nDstYOff = nDstYOff;
5831 16316 : poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
5832 16316 : poJob->args.pszResampling = pszResampling;
5833 16316 : poJob->args.bHasNoData = pabHasNoData[iBand];
5834 16316 : poJob->args.dfNoDataValue = padfNoDataValue[iBand];
5835 16316 : poJob->args.eSrcDataType = eDataType;
5836 16316 : poJob->args.bPropagateNoData = bPropagateNoData;
5837 :
5838 16316 : if (poJobQueue)
5839 : {
5840 16224 : poJob->oSrcMaskBufferHolder.reset(
5841 8112 : new PointerHolder(apabyChunkNoDataMask[iBand]));
5842 8112 : apabyChunkNoDataMask[iBand] = nullptr;
5843 :
5844 16224 : poJob->oSrcBufferHolder.reset(
5845 8112 : new PointerHolder(apaChunk[iBand]));
5846 8112 : apaChunk[iBand] = nullptr;
5847 :
5848 8112 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
5849 8112 : jobList.emplace_back(std::move(poJob));
5850 : }
5851 : else
5852 : {
5853 8204 : JobResampleFunc(poJob.get());
5854 8204 : eErr = poJob->eErr;
5855 8204 : if (eErr == CE_None)
5856 : {
5857 8204 : eErr = WriteJobData(poJob.get());
5858 : }
5859 : }
5860 : }
5861 : }
5862 : }
5863 :
5864 : // Wait for all pending jobs to complete
5865 619 : while (!jobList.empty())
5866 : {
5867 16 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5868 16 : if (l_eErr != CE_None && eErr == CE_None)
5869 0 : eErr = l_eErr;
5870 : }
5871 :
5872 : // Flush the data to overviews.
5873 1797 : for (int iBand = 0; iBand < nBands; ++iBand)
5874 : {
5875 1194 : CPLFree(apaChunk[iBand]);
5876 1194 : papapoOverviewBands[iBand][iOverview]->FlushCache(false);
5877 :
5878 1194 : CPLFree(apabyChunkNoDataMask[iBand]);
5879 : }
5880 : }
5881 :
5882 372 : CPLFree(pabHasNoData);
5883 372 : CPLFree(padfNoDataValue);
5884 :
5885 372 : if (eErr == CE_None)
5886 370 : pfnProgress(1.0, nullptr, pProgressData);
5887 :
5888 372 : return eErr;
5889 : }
5890 :
5891 : /************************************************************************/
5892 : /* GDALRegenerateOverviewsMultiBand() */
5893 : /************************************************************************/
5894 :
5895 : /**
5896 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5897 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5898 : *
5899 : * This function will generate one or more overview images from a base
5900 : * image using the requested downsampling algorithm. Its primary use
5901 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
5902 : * can also be used to generate downsampled images in one file from another
5903 : * outside the overview architecture.
5904 : *
5905 : * The output bands need to exist in advance and share the same characteristics
5906 : * (type, dimensions)
5907 : *
5908 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5909 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
5910 : *
5911 : * It does not support color tables or complex data types.
5912 : *
5913 : * The pseudo-algorithm used by the function is :
5914 : * for each overview
5915 : * iterate on lines of the source by a step of deltay
5916 : * iterate on columns of the source by a step of deltax
5917 : * read the source data of size deltax * deltay for all the bands
5918 : * generate the corresponding overview block for all the bands
5919 : *
5920 : * This function will honour properly NODATA_VALUES tuples (special dataset
5921 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5922 : * considered as the nodata value and not each value of the triplet
5923 : * independently per band.
5924 : *
5925 : * The GDAL_NUM_THREADS configuration option can be set
5926 : * to "ALL_CPUS" or a integer value to specify the number of threads to use for
5927 : * overview computation.
5928 : *
5929 : * @param apoSrcBands the list of source bands to downsample
5930 : * @param aapoOverviewBands bidimension array of bands. First dimension is
5931 : * indexed by bands. Second dimension is indexed by
5932 : * overview levels. All aapoOverviewBands[i] arrays
5933 : * must have the same size (i.e. same number of
5934 : * overviews)
5935 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5936 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
5937 : * @param pfnProgress progress report function.
5938 : * @param pProgressData progress function callback data.
5939 : * @param papszOptions NULL terminated list of options as
5940 : * key=value pairs, or NULL
5941 : * The XOFF, YOFF, XSIZE and YSIZE
5942 : * options can be specified to express that overviews should
5943 : * be regenerated only in the specified subset of the source
5944 : * dataset.
5945 : * @return CE_None on success or CE_Failure on failure.
5946 : * @since 3.10
5947 : */
5948 :
5949 5 : CPLErr GDALRegenerateOverviewsMultiBand(
5950 : const std::vector<GDALRasterBand *> &apoSrcBands,
5951 : const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
5952 : const char *pszResampling, GDALProgressFunc pfnProgress,
5953 : void *pProgressData, CSLConstList papszOptions)
5954 : {
5955 5 : CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
5956 15 : for (size_t i = 1; i < aapoOverviewBands.size(); ++i)
5957 : {
5958 10 : CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());
5959 : }
5960 :
5961 5 : if (aapoOverviewBands.empty())
5962 0 : return CE_None;
5963 :
5964 5 : std::vector<GDALRasterBand **> apapoOverviewBands;
5965 20 : for (auto &apoOverviewBands : aapoOverviewBands)
5966 : {
5967 : auto papoOverviewBands = static_cast<GDALRasterBand **>(
5968 15 : CPLMalloc(apoOverviewBands.size() * sizeof(GDALRasterBand *)));
5969 30 : for (size_t i = 0; i < apoOverviewBands.size(); ++i)
5970 : {
5971 15 : papoOverviewBands[i] = apoOverviewBands[i];
5972 : }
5973 15 : apapoOverviewBands.push_back(papoOverviewBands);
5974 : }
5975 10 : const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
5976 5 : static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
5977 5 : static_cast<int>(aapoOverviewBands[0].size()),
5978 5 : apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
5979 : papszOptions);
5980 20 : for (GDALRasterBand **papoOverviewBands : apapoOverviewBands)
5981 15 : CPLFree(papoOverviewBands);
5982 5 : return eErr;
5983 : }
5984 :
5985 : /************************************************************************/
5986 : /* GDALComputeBandStats() */
5987 : /************************************************************************/
5988 :
5989 : /** Undocumented
5990 : * @param hSrcBand undocumented.
5991 : * @param nSampleStep Step between scanlines used to compute statistics.
5992 : * When nSampleStep is equal to 1, all scanlines will
5993 : * be processed.
5994 : * @param pdfMean undocumented.
5995 : * @param pdfStdDev undocumented.
5996 : * @param pfnProgress undocumented.
5997 : * @param pProgressData undocumented.
5998 : * @return undocumented
5999 : */
6000 16 : CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
6001 : int nSampleStep, double *pdfMean,
6002 : double *pdfStdDev,
6003 : GDALProgressFunc pfnProgress,
6004 : void *pProgressData)
6005 :
6006 : {
6007 16 : VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
6008 :
6009 16 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
6010 :
6011 16 : if (pfnProgress == nullptr)
6012 16 : pfnProgress = GDALDummyProgress;
6013 :
6014 16 : const int nWidth = poSrcBand->GetXSize();
6015 16 : const int nHeight = poSrcBand->GetYSize();
6016 :
6017 16 : if (nSampleStep >= nHeight || nSampleStep < 1)
6018 3 : nSampleStep = 1;
6019 :
6020 16 : GDALDataType eWrkType = GDT_Unknown;
6021 16 : float *pafData = nullptr;
6022 16 : GDALDataType eType = poSrcBand->GetRasterDataType();
6023 16 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6024 16 : if (bComplex)
6025 : {
6026 : pafData = static_cast<float *>(
6027 0 : VSI_MALLOC_VERBOSE(nWidth * 2 * sizeof(float)));
6028 0 : eWrkType = GDT_CFloat32;
6029 : }
6030 : else
6031 : {
6032 : pafData =
6033 16 : static_cast<float *>(VSI_MALLOC_VERBOSE(nWidth * sizeof(float)));
6034 16 : eWrkType = GDT_Float32;
6035 : }
6036 :
6037 16 : if (nWidth == 0 || pafData == nullptr)
6038 : {
6039 0 : VSIFree(pafData);
6040 0 : return CE_Failure;
6041 : }
6042 :
6043 : /* -------------------------------------------------------------------- */
6044 : /* Loop over all sample lines. */
6045 : /* -------------------------------------------------------------------- */
6046 16 : double dfSum = 0.0;
6047 16 : double dfSum2 = 0.0;
6048 16 : int iLine = 0;
6049 16 : GIntBig nSamples = 0;
6050 :
6051 2143 : do
6052 : {
6053 2159 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6054 : pProgressData))
6055 : {
6056 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6057 0 : CPLFree(pafData);
6058 0 : return CE_Failure;
6059 : }
6060 :
6061 : const CPLErr eErr =
6062 2159 : poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
6063 : 1, eWrkType, 0, 0, nullptr);
6064 2159 : if (eErr != CE_None)
6065 : {
6066 1 : CPLFree(pafData);
6067 1 : return eErr;
6068 : }
6069 :
6070 725204 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6071 : {
6072 723046 : float fValue = 0.0f;
6073 :
6074 723046 : if (bComplex)
6075 : {
6076 : // Compute the magnitude of the complex value.
6077 : fValue =
6078 0 : std::hypot(pafData[iPixel * 2], pafData[iPixel * 2 + 1]);
6079 : }
6080 : else
6081 : {
6082 723046 : fValue = pafData[iPixel];
6083 : }
6084 :
6085 723046 : dfSum += fValue;
6086 723046 : dfSum2 += static_cast<double>(fValue) * fValue;
6087 : }
6088 :
6089 2158 : nSamples += nWidth;
6090 2158 : iLine += nSampleStep;
6091 2158 : } while (iLine < nHeight);
6092 :
6093 15 : if (!pfnProgress(1.0, nullptr, pProgressData))
6094 : {
6095 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6096 0 : CPLFree(pafData);
6097 0 : return CE_Failure;
6098 : }
6099 :
6100 : /* -------------------------------------------------------------------- */
6101 : /* Produce the result values. */
6102 : /* -------------------------------------------------------------------- */
6103 15 : if (pdfMean != nullptr)
6104 15 : *pdfMean = dfSum / nSamples;
6105 :
6106 15 : if (pdfStdDev != nullptr)
6107 : {
6108 15 : const double dfMean = dfSum / nSamples;
6109 :
6110 15 : *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
6111 : }
6112 :
6113 15 : CPLFree(pafData);
6114 :
6115 15 : return CE_None;
6116 : }
6117 :
6118 : /************************************************************************/
6119 : /* GDALOverviewMagnitudeCorrection() */
6120 : /* */
6121 : /* Correct the mean and standard deviation of the overviews of */
6122 : /* the given band to match the base layer approximately. */
6123 : /************************************************************************/
6124 :
6125 : /** Undocumented
6126 : * @param hBaseBand undocumented.
6127 : * @param nOverviewCount undocumented.
6128 : * @param pahOverviews undocumented.
6129 : * @param pfnProgress undocumented.
6130 : * @param pProgressData undocumented.
6131 : * @return undocumented
6132 : */
6133 0 : CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
6134 : int nOverviewCount,
6135 : GDALRasterBandH *pahOverviews,
6136 : GDALProgressFunc pfnProgress,
6137 : void *pProgressData)
6138 :
6139 : {
6140 0 : VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
6141 :
6142 : /* -------------------------------------------------------------------- */
6143 : /* Compute mean/stddev for source raster. */
6144 : /* -------------------------------------------------------------------- */
6145 0 : double dfOrigMean = 0.0;
6146 0 : double dfOrigStdDev = 0.0;
6147 : {
6148 : const CPLErr eErr =
6149 0 : GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
6150 : pfnProgress, pProgressData);
6151 :
6152 0 : if (eErr != CE_None)
6153 0 : return eErr;
6154 : }
6155 :
6156 : /* -------------------------------------------------------------------- */
6157 : /* Loop on overview bands. */
6158 : /* -------------------------------------------------------------------- */
6159 0 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
6160 : {
6161 : GDALRasterBand *poOverview =
6162 0 : GDALRasterBand::FromHandle(pahOverviews[iOverview]);
6163 : double dfOverviewMean, dfOverviewStdDev;
6164 :
6165 : const CPLErr eErr =
6166 0 : GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
6167 : &dfOverviewStdDev, pfnProgress, pProgressData);
6168 :
6169 0 : if (eErr != CE_None)
6170 0 : return eErr;
6171 :
6172 0 : double dfGain = 1.0;
6173 0 : if (dfOrigStdDev >= 0.0001)
6174 0 : dfGain = dfOrigStdDev / dfOverviewStdDev;
6175 :
6176 : /* --------------------------------------------------------------------
6177 : */
6178 : /* Apply gain and offset. */
6179 : /* --------------------------------------------------------------------
6180 : */
6181 0 : const int nWidth = poOverview->GetXSize();
6182 0 : const int nHeight = poOverview->GetYSize();
6183 :
6184 0 : GDALDataType eWrkType = GDT_Unknown;
6185 0 : float *pafData = nullptr;
6186 0 : const GDALDataType eType = poOverview->GetRasterDataType();
6187 0 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6188 0 : if (bComplex)
6189 : {
6190 : pafData = static_cast<float *>(
6191 0 : VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6192 0 : eWrkType = GDT_CFloat32;
6193 : }
6194 : else
6195 : {
6196 : pafData = static_cast<float *>(
6197 0 : VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6198 0 : eWrkType = GDT_Float32;
6199 : }
6200 :
6201 0 : if (pafData == nullptr)
6202 : {
6203 0 : return CE_Failure;
6204 : }
6205 :
6206 0 : for (int iLine = 0; iLine < nHeight; ++iLine)
6207 : {
6208 0 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6209 : pProgressData))
6210 : {
6211 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6212 0 : CPLFree(pafData);
6213 0 : return CE_Failure;
6214 : }
6215 :
6216 0 : if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
6217 : nWidth, 1, eWrkType, 0, 0,
6218 0 : nullptr) != CE_None)
6219 : {
6220 0 : CPLFree(pafData);
6221 0 : return CE_Failure;
6222 : }
6223 :
6224 0 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6225 : {
6226 0 : if (bComplex)
6227 : {
6228 0 : pafData[iPixel * 2] *= static_cast<float>(dfGain);
6229 0 : pafData[iPixel * 2 + 1] *= static_cast<float>(dfGain);
6230 : }
6231 : else
6232 : {
6233 0 : pafData[iPixel] = static_cast<float>(
6234 0 : (pafData[iPixel] - dfOverviewMean) * dfGain +
6235 : dfOrigMean);
6236 : }
6237 : }
6238 :
6239 0 : if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
6240 : nWidth, 1, eWrkType, 0, 0,
6241 0 : nullptr) != CE_None)
6242 : {
6243 0 : CPLFree(pafData);
6244 0 : return CE_Failure;
6245 : }
6246 : }
6247 :
6248 0 : if (!pfnProgress(1.0, nullptr, pProgressData))
6249 : {
6250 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6251 0 : CPLFree(pafData);
6252 0 : return CE_Failure;
6253 : }
6254 :
6255 0 : CPLFree(pafData);
6256 : }
6257 :
6258 0 : return CE_None;
6259 : }
|