Line data Source code
1 :
2 : /******************************************************************************
3 : *
4 : * Project: GDAL Core
5 : * Purpose: Helper code to implement overview support in different drivers.
6 : * Author: Frank Warmerdam, warmerdam@pobox.com
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 2000, Frank Warmerdam
10 : * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * Permission is hereby granted, free of charge, to any person obtaining a
13 : * copy of this software and associated documentation files (the "Software"),
14 : * to deal in the Software without restriction, including without limitation
15 : * the rights to use, copy, modify, merge, publish, distribute, sublicense,
16 : * and/or sell copies of the Software, and to permit persons to whom the
17 : * Software is furnished to do so, subject to the following conditions:
18 : *
19 : * The above copyright notice and this permission notice shall be included
20 : * in all copies or substantial portions of the Software.
21 : *
22 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
23 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 : * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
25 : * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 : * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
27 : * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
28 : * DEALINGS IN THE SOFTWARE.
29 : ****************************************************************************/
30 :
31 : #include "cpl_port.h"
32 : #include "gdal_priv.h"
33 :
34 : #include <cmath>
35 : #include <cstddef>
36 : #include <cstdlib>
37 :
38 : #include <algorithm>
39 : #include <complex>
40 : #include <condition_variable>
41 : #include <limits>
42 : #include <list>
43 : #include <memory>
44 : #include <mutex>
45 : #include <vector>
46 :
47 : #include "cpl_conv.h"
48 : #include "cpl_error.h"
49 : #include "cpl_progress.h"
50 : #include "cpl_vsi.h"
51 : #include "gdal.h"
52 : #include "gdal_thread_pool.h"
53 : #include "gdalwarper.h"
54 :
55 : // Restrict to 64bit processors because they are guaranteed to have SSE2,
56 : // or if __AVX2__ is defined.
57 : #if defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
58 : #define USE_SSE2
59 :
60 : #include "gdalsse_priv.h"
61 :
62 : #ifdef __SSE3__
63 : #include <pmmintrin.h>
64 : #endif
65 : #ifdef __SSSE3__
66 : #include <tmmintrin.h>
67 : #endif
68 : #ifdef __SSE4_1__
69 : #include <smmintrin.h>
70 : #endif
71 : #ifdef __AVX2__
72 : #include <immintrin.h>
73 : #endif
74 :
75 : #endif
76 :
77 : // To be included after above USE_SSE2 and include gdalsse_priv.h
78 : // to avoid build issue on Windows x86
79 : #include "gdal_priv_templates.hpp"
80 :
81 : /************************************************************************/
82 : /* GDALResampleChunk_Near() */
83 : /************************************************************************/
84 :
85 : template <class T>
86 1062 : static CPLErr GDALResampleChunk_NearT(
87 : double dfXRatioDstToSrc, double dfYRatioDstToSrc, GDALDataType eWrkDataType,
88 : const T *pChunk, int nChunkXOff, int nChunkXSize, int nChunkYOff,
89 : int nDstXOff, int nDstXOff2, int nDstYOff, int nDstYOff2, T **ppDstBuffer)
90 :
91 : {
92 1062 : const int nDstXWidth = nDstXOff2 - nDstXOff;
93 :
94 : /* -------------------------------------------------------------------- */
95 : /* Allocate buffers. */
96 : /* -------------------------------------------------------------------- */
97 1062 : *ppDstBuffer = static_cast<T *>(
98 1062 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
99 : GDALGetDataTypeSizeBytes(eWrkDataType)));
100 1062 : if (*ppDstBuffer == nullptr)
101 : {
102 0 : return CE_Failure;
103 : }
104 1062 : T *const pDstBuffer = *ppDstBuffer;
105 :
106 : int *panSrcXOff =
107 1062 : static_cast<int *>(VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(int)));
108 :
109 1062 : if (panSrcXOff == nullptr)
110 : {
111 0 : VSIFree(panSrcXOff);
112 0 : return CE_Failure;
113 : }
114 :
115 : /* ==================================================================== */
116 : /* Precompute inner loop constants. */
117 : /* ==================================================================== */
118 500226 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
119 : {
120 499164 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
121 499164 : if (nSrcXOff < nChunkXOff)
122 0 : nSrcXOff = nChunkXOff;
123 :
124 499164 : panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
125 : }
126 :
127 : /* ==================================================================== */
128 : /* Loop over destination scanlines. */
129 : /* ==================================================================== */
130 136491 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
131 : {
132 135429 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
133 135429 : if (nSrcYOff < nChunkYOff)
134 0 : nSrcYOff = nChunkYOff;
135 :
136 135429 : const T *const pSrcScanline =
137 : pChunk +
138 135429 : (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
139 133063 : nChunkXOff;
140 :
141 : /* --------------------------------------------------------------------
142 : */
143 : /* Loop over destination pixels */
144 : /* --------------------------------------------------------------------
145 : */
146 135429 : T *pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
147 116177106 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
148 : {
149 116041740 : pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
150 : }
151 : }
152 :
153 1062 : CPLFree(panSrcXOff);
154 :
155 1062 : return CE_None;
156 : }
157 :
158 1062 : static CPLErr GDALResampleChunk_Near(
159 : double dfXRatioDstToSrc, double dfYRatioDstToSrc, double /* dfSrcXDelta */,
160 : double /* dfSrcYDelta */, GDALDataType eWrkDataType, const void *pChunk,
161 : const GByte * /* pabyChunkNodataMask_unused */, int nChunkXOff,
162 : int nChunkXSize, int nChunkYOff, int /* nChunkYSize */, int nDstXOff,
163 : int nDstXOff2, int nDstYOff, int nDstYOff2, GDALRasterBand * /*poOverview*/,
164 : void **ppDstBuffer, GDALDataType *peDstBufferDataType,
165 : const char * /* pszResampling_unused */, bool /* bHasNoData_unused */,
166 : double /* fNoDataValue_unused */,
167 : GDALColorTable * /* poColorTable_unused */, GDALDataType /* eSrcDataType */,
168 : bool /* bPropagateNoData */)
169 : {
170 1062 : *peDstBufferDataType = eWrkDataType;
171 1062 : if (eWrkDataType == GDT_Byte)
172 966 : return GDALResampleChunk_NearT(
173 : dfXRatioDstToSrc, dfYRatioDstToSrc, eWrkDataType,
174 : static_cast<const GByte *>(pChunk), nChunkXOff, nChunkXSize,
175 : nChunkYOff, nDstXOff, nDstXOff2, nDstYOff, nDstYOff2,
176 966 : reinterpret_cast<GByte **>(ppDstBuffer));
177 96 : else if (eWrkDataType == GDT_UInt16)
178 5 : return GDALResampleChunk_NearT(
179 : dfXRatioDstToSrc, dfYRatioDstToSrc, eWrkDataType,
180 : static_cast<const GInt16 *>(pChunk), nChunkXOff, nChunkXSize,
181 : nChunkYOff, nDstXOff, nDstXOff2, nDstYOff, nDstYOff2,
182 5 : reinterpret_cast<GInt16 **>(ppDstBuffer));
183 91 : else if (eWrkDataType == GDT_Float32)
184 64 : return GDALResampleChunk_NearT(
185 : dfXRatioDstToSrc, dfYRatioDstToSrc, eWrkDataType,
186 : static_cast<const float *>(pChunk), nChunkXOff, nChunkXSize,
187 : nChunkYOff, nDstXOff, nDstXOff2, nDstYOff, nDstYOff2,
188 64 : reinterpret_cast<float **>(ppDstBuffer));
189 27 : else if (eWrkDataType == GDT_Float64)
190 27 : return GDALResampleChunk_NearT(
191 : dfXRatioDstToSrc, dfYRatioDstToSrc, eWrkDataType,
192 : static_cast<const double *>(pChunk), nChunkXOff, nChunkXSize,
193 : nChunkYOff, nDstXOff, nDstXOff2, nDstYOff, nDstYOff2,
194 27 : reinterpret_cast<double **>(ppDstBuffer));
195 :
196 0 : CPLAssert(false);
197 : return CE_Failure;
198 : }
199 :
200 : namespace
201 : {
202 :
203 : // Find in the color table the entry whose RGB value is the closest
204 : // (using quadratic distance) to the test color, ignoring transparent entries.
205 3837 : int BestColorEntry(const std::vector<GDALColorEntry> &entries,
206 : const GDALColorEntry &test)
207 : {
208 3837 : int nMinDist = std::numeric_limits<int>::max();
209 3837 : size_t bestEntry = 0;
210 986109 : for (size_t i = 0; i < entries.size(); ++i)
211 : {
212 982272 : const GDALColorEntry &entry = entries[i];
213 : // Ignore transparent entries
214 982272 : if (entry.c4 == 0)
215 3237 : continue;
216 :
217 979035 : int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
218 979035 : ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
219 979035 : ((test.c3 - entry.c3) * (test.c3 - entry.c3));
220 979035 : if (nDist < nMinDist)
221 : {
222 15847 : nMinDist = nDist;
223 15847 : bestEntry = i;
224 : }
225 : }
226 3837 : return static_cast<int>(bestEntry);
227 : }
228 :
229 7 : std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
230 : int &transparentIdx)
231 : {
232 7 : std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
233 :
234 7 : transparentIdx = -1;
235 7 : int i = 0;
236 1799 : for (auto &entry : entries)
237 : {
238 1792 : table.GetColorEntryAsRGB(i, &entry);
239 1792 : if (transparentIdx < 0 && entry.c4 == 0)
240 1 : transparentIdx = i;
241 1792 : ++i;
242 : }
243 7 : return entries;
244 : }
245 :
246 : } // unnamed namespace
247 :
248 : /************************************************************************/
249 : /* SQUARE() */
250 : /************************************************************************/
251 :
// Return val squared. The first factor is converted to Tsquare before the
// multiplication, so a wider Tsquare can be chosen to hold the product.
template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
{
    const Tsquare converted = static_cast<Tsquare>(val);
    return converted * val;
}
256 :
257 : /************************************************************************/
258 : /* ComputeIntegerRMS() */
259 : /************************************************************************/
260 : // Compute rms = sqrt(sumSquares / weight) in such a way that it is the
261 : // integer that minimizes abs(rms**2 - sumSquares / weight)
// T is the integer result type; Twork is the type in which
// 2 * rms * (rms + 1) + 1 is evaluated before being compared as a double.
template <class T, class Twork>
inline T ComputeIntegerRMS(double sumSquares, double weight)
{
    const double meanOfSquares = sumSquares / weight;

    // Truncated square root: either the answer or one below it.
    T root = static_cast<T>(std::sqrt(meanOfSquares));

    // root + 1 is closer when (root+1)^2 - m < m - root^2, which rearranges
    // to 2 * root * (root + 1) + 1 < 2 * m.
    const double lhs =
        static_cast<double>(static_cast<Twork>(2) * root * (root + 1) + 1);
    const bool bRoundUp = lhs < 2 * meanOfSquares;
    if (bRoundUp)
        root += 1;
    return root;
}
276 :
277 0 : template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
278 : {
279 0 : CPLAssert(false);
280 : return 0;
281 : }
282 :
283 24 : template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
284 : {
285 : // It has been verified that given the correction on rms below, using
286 : // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
287 : // is equivalent, so use the former as it is used twice.
288 24 : const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
289 24 : const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
290 24 : GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
291 :
292 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
293 : // Naive version:
294 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
295 : // Optimized version for integer case and weight == 4
296 24 : if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
297 5 : rms += 1;
298 24 : return rms;
299 : }
300 :
301 : template <>
302 20 : inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
303 : {
304 20 : const double sumDivWeight = sumSquares * 0.25;
305 20 : GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
306 :
307 : // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
308 : // Naive version:
309 : // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
310 : // Optimized version for integer case and weight == 4
311 20 : if (static_cast<GUInt32>(rms) * (rms + 1) <
312 20 : static_cast<GUInt32>(sumDivWeight + 0.25))
313 4 : rms += 1;
314 20 : return rms;
315 : }
316 :
317 : #ifdef USE_SSE2
318 :
319 : /************************************************************************/
320 : /* QuadraticMeanByteSSE2OrAVX2() */
321 : /************************************************************************/
322 :
323 : #ifdef __SSE4_1__
324 : #define sse2_packus_epi32 _mm_packus_epi32
325 : #else
326 516119 : inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
327 : {
328 516119 : const auto minus32768_32 = _mm_set1_epi32(-32768);
329 516119 : const auto minus32768_16 = _mm_set1_epi16(-32768);
330 516119 : a = _mm_add_epi32(a, minus32768_32);
331 516119 : b = _mm_add_epi32(b, minus32768_32);
332 516119 : a = _mm_packs_epi32(a, b);
333 516119 : a = _mm_sub_epi16(a, minus32768_16);
334 516119 : return a;
335 : }
336 : #endif
337 :
338 : #ifdef __SSSE3__
339 : #define sse2_hadd_epi16 _mm_hadd_epi16
340 : #else
341 4660650 : inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
342 : {
343 : // Horizontal addition of adjacent pairs
344 4660650 : const auto mask = _mm_set1_epi32(0xFFFF);
345 : const auto horizLo =
346 13982000 : _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
347 : const auto horizHi =
348 13982000 : _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
349 :
350 : // Recombine low and high parts
351 4660650 : return _mm_packs_epi32(horizLo, horizHi);
352 : }
353 : #endif
354 :
355 : #ifdef __AVX2__
356 :
357 : #define DEST_ELTS 16
358 : #define set1_epi16 _mm256_set1_epi16
359 : #define set1_epi32 _mm256_set1_epi32
360 : #define setzero _mm256_setzero_si256
361 : #define set1_ps _mm256_set1_ps
362 : #define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
363 : #define unpacklo_epi8 _mm256_unpacklo_epi8
364 : #define unpackhi_epi8 _mm256_unpackhi_epi8
365 : #define madd_epi16 _mm256_madd_epi16
366 : #define add_epi32 _mm256_add_epi32
367 : #define mul_ps _mm256_mul_ps
368 : #define cvtepi32_ps _mm256_cvtepi32_ps
369 : #define sqrt_ps _mm256_sqrt_ps
370 : #define cvttps_epi32 _mm256_cvttps_epi32
371 : #define packs_epi32 _mm256_packs_epi32
372 : #define packus_epi32 _mm256_packus_epi32
373 : #define srli_epi32 _mm256_srli_epi32
374 : #define mullo_epi16 _mm256_mullo_epi16
375 : #define srli_epi16 _mm256_srli_epi16
376 : #define cmpgt_epi16 _mm256_cmpgt_epi16
377 : #define add_epi16 _mm256_add_epi16
378 : #define sub_epi16 _mm256_sub_epi16
379 : #define packus_epi16 _mm256_packus_epi16
380 : /* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
381 : /* to get the lower 128-bit bits of what would be a true 256-bit vector register
382 : */
383 : #define store_lo(x, y) \
384 : _mm_storeu_si128(reinterpret_cast<__m128i *>(x), \
385 : _mm256_extracti128_si256( \
386 : _mm256_permute4x64_epi64((y), 0 | (2 << 2)), 0))
387 : #define hadd_epi16 _mm256_hadd_epi16
388 : #define zeroupper() _mm256_zeroupper()
389 : #else
390 : #define DEST_ELTS 8
391 : #define set1_epi16 _mm_set1_epi16
392 : #define set1_epi32 _mm_set1_epi32
393 : #define setzero _mm_setzero_si128
394 : #define set1_ps _mm_set1_ps
395 : #define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
396 : #define unpacklo_epi8 _mm_unpacklo_epi8
397 : #define unpackhi_epi8 _mm_unpackhi_epi8
398 : #define madd_epi16 _mm_madd_epi16
399 : #define add_epi32 _mm_add_epi32
400 : #define mul_ps _mm_mul_ps
401 : #define cvtepi32_ps _mm_cvtepi32_ps
402 : #define sqrt_ps _mm_sqrt_ps
403 : #define cvttps_epi32 _mm_cvttps_epi32
404 : #define packs_epi32 _mm_packs_epi32
405 : #define packus_epi32 sse2_packus_epi32
406 : #define srli_epi32 _mm_srli_epi32
407 : #define mullo_epi16 _mm_mullo_epi16
408 : #define srli_epi16 _mm_srli_epi16
409 : #define cmpgt_epi16 _mm_cmpgt_epi16
410 : #define add_epi16 _mm_add_epi16
411 : #define sub_epi16 _mm_sub_epi16
412 : #define packus_epi16 _mm_packus_epi16
413 : #define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
414 : #define hadd_epi16 sse2_hadd_epi16
415 : #define zeroupper() (void)0
416 : #endif
417 :
418 : #if defined(__GNUC__) && defined(__AVX2__)
419 : // Disabling inlining works around a bug with gcc 9.3 (Ubuntu 20.04) in
420 : // -O2 -mavx2 mode in QuadraticMeanFloatSSE2(),
421 : // where the register that contains minus_zero is correctly
422 : // loaded the first time the function is called (looking at the disassembly,
423 : // one sees it is loaded much earlier than the function), but gets corrupted
424 : // (zeroed) in following iterations.
425 : // It appears the bug is due to the explicit zeroupper() call at the end of
426 : // the function.
427 : // The bug is at least solved in gcc 10.2.
428 : // Inlining doesn't bring much here to performance.
429 : // This is also needed with gcc 9.3 on QuadraticMeanByteSSE2OrAVX2() in
430 : // -O3 -mavx2 mode
431 : #define NOINLINE __attribute__((noinline))
432 : #else
433 : #define NOINLINE
434 : #endif
435 :
// Vectorized quadratic mean (RMS) over 2x2 source blocks of byte values.
//
// Each loop iteration reads 2 * DEST_ELTS source bytes at pSrcScanlineShifted
// and 2 * DEST_ELTS at pSrcScanlineShifted + nChunkXSize (the next source
// line), and stores DEST_ELTS result bytes at pDstScanline[iDstPixel].
//
//   nDstXWidth                number of destination pixels wanted; only full
//                             groups of DEST_ELTS pixels are produced here.
//   nChunkXSize               distance, in elements, between the two source
//                             lines.
//   pSrcScanlineShiftedInOut  in: first source element to read.
//                             out: advanced by 2 * DEST_ELTS per group done.
//   pDstScanline              destination line.
//
// Returns the number of destination pixels written (a multiple of DEST_ELTS).
// The remaining pixels are presumably handled by the caller - TODO confirm.
// NOTE(review): data is loaded and stored as 8-bit lanes, so T is assumed to
// be a byte-sized type - confirm against callers.
436 : template <class T>
437 : static int NOINLINE
438 5385 : QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
439 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
440 : T *CPL_RESTRICT pDstScanline)
441 : {
442 : // Optimized implementation for RMS on Byte by
443 : // processing by group of DEST_ELTS output pixels (8 with SSE2, 16 with
444 : // AVX2), so that a single sqrt_ps() call serves several output pixels
445 5385 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
446 :
447 5385 : int iDstPixel = 0;
448 5385 : const auto one16 = set1_epi16(1);
449 5385 : const auto one32 = set1_epi32(1);
450 5385 : const auto zero = setzero();
451 5385 : const auto minus32768 = set1_epi16(-32768);
452 :
    // Stop as soon as fewer than DEST_ELTS destination pixels remain.
453 521496 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
454 : {
455 : // Load 2 * DEST_ELTS bytes from each line
456 516111 : auto firstLine = loadu_int(pSrcScanlineShifted);
457 1032220 : auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
458 : // Extend those Bytes as UInt16s
459 516111 : auto firstLineLo = unpacklo_epi8(firstLine, zero);
460 516111 : auto firstLineHi = unpackhi_epi8(firstLine, zero);
461 516111 : auto secondLineLo = unpacklo_epi8(secondLine, zero);
462 516111 : auto secondLineHi = unpackhi_epi8(secondLine, zero);
463 :
464 : // Multiplication of 16 bit values and horizontal
465 : // addition of 32 bit results
466 : // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
467 516111 : firstLineLo = madd_epi16(firstLineLo, firstLineLo);
468 516111 : firstLineHi = madd_epi16(firstLineHi, firstLineHi);
469 516111 : secondLineLo = madd_epi16(secondLineLo, secondLineLo);
470 516111 : secondLineHi = madd_epi16(secondLineHi, secondLineHi);
471 :
472 : // Vertical addition
473 516111 : const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
474 516111 : const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
475 :
        // (sumSquares + 1) / 4: used both as the sqrt() argument and as the
        // threshold of the rounding correction below.
476 : const auto sumSquaresPlusOneDiv4Lo =
477 1032220 : srli_epi32(add_epi32(sumSquaresLo, one32), 2);
478 : const auto sumSquaresPlusOneDiv4Hi =
479 1032220 : srli_epi32(add_epi32(sumSquaresHi, one32), 2);
480 :
481 : // Take square root and truncate/floor to int32
482 : const auto rmsLo =
483 1548330 : cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
484 : const auto rmsHi =
485 1548330 : cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
486 :
487 : // Merge back low and high registers with each RMS value
488 : // as a 16 bit value.
489 516111 : auto rms = packs_epi32(rmsLo, rmsHi);
490 :
491 : // Round to upper value if it minimizes the
492 : // error |rms^2 - sumSquares/4|
493 : // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
494 : // rms += 1;
495 : // which is equivalent to:
496 : // if( rms * (rms + 1) < (sumSquares+1) / 4 )
497 : // rms += 1;
498 : // And both left and right parts fit on 16 (unsigned) bits
499 : const auto sumSquaresPlusOneDiv4 =
500 516111 : packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
501 : // cmpgt_epi16 operates on signed int16, but here
502 : // we have unsigned values, so shift them by -32768 before
503 2580560 : auto mask = cmpgt_epi16(
504 : add_epi16(sumSquaresPlusOneDiv4, minus32768),
505 : add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
506 : // The value of the mask will be -1 when the correction needs to be
507 : // applied
        // Subtracting a -1 lane adds 1 to the corresponding rms value.
508 516111 : rms = sub_epi16(rms, mask);
509 :
510 : // Pack each 16 bit RMS value to 8 bits
511 516111 : rms = packus_epi16(rms, rms /* could be anything */);
512 516111 : store_lo(&pDstScanline[iDstPixel], rms);
513 516111 : pSrcScanlineShifted += 2 * DEST_ELTS;
514 : }
515 : zeroupper();
516 :
    // Report back how far the source pointer advanced.
517 5385 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
518 5385 : return iDstPixel;
519 : }
520 :
521 : /************************************************************************/
522 : /* AverageByteSSE2OrAVX2() */
523 : /************************************************************************/
524 :
525 : template <class T>
526 : static int
527 110946 : AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
528 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
529 : T *CPL_RESTRICT pDstScanline)
530 : {
531 : // Optimized implementation for average on Byte by
532 : // processing by group of 8 output pixels.
533 :
534 110946 : const auto zero = setzero();
535 110946 : const auto two16 = set1_epi16(2);
536 110946 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
537 :
538 110946 : int iDstPixel = 0;
539 4771600 : for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
540 : {
541 : // Load 2 * DEST_ELTS bytes from each line
542 4660650 : const auto firstLine = loadu_int(pSrcScanlineShifted);
543 9321310 : const auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
544 : // Extend those Bytes as UInt16s
545 4660650 : const auto firstLineLo = unpacklo_epi8(firstLine, zero);
546 4660650 : const auto firstLineHi = unpackhi_epi8(firstLine, zero);
547 4660650 : const auto secondLineLo = unpacklo_epi8(secondLine, zero);
548 4660650 : const auto secondLineHi = unpackhi_epi8(secondLine, zero);
549 :
550 : // Vertical addition
551 4660650 : const auto sumLo = add_epi16(firstLineLo, secondLineLo);
552 4660650 : const auto sumHi = add_epi16(firstLineHi, secondLineHi);
553 :
554 : // Horizontal addition of adjacent pairs, and recombine low and high
555 : // parts
556 4660650 : const auto sum = hadd_epi16(sumLo, sumHi);
557 :
558 : // average = (sum + 2) / 4
559 9321310 : auto average = srli_epi16(add_epi16(sum, two16), 2);
560 :
561 : // Pack each 16 bit average value to 8 bits
562 4660650 : average = packus_epi16(average, average /* could be anything */);
563 4660650 : store_lo(&pDstScanline[iDstPixel], average);
564 4660650 : pSrcScanlineShifted += 2 * DEST_ELTS;
565 : }
566 : zeroupper();
567 :
568 110946 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
569 110946 : return iDstPixel;
570 : }
571 :
572 : /************************************************************************/
573 : /* QuadraticMeanUInt16SSE2() */
574 : /************************************************************************/
575 :
576 : #ifdef __SSE3__
577 : #define sse2_hadd_pd _mm_hadd_pd
578 : #else
579 8 : inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
580 : {
581 : auto aLo_bLo =
582 32 : _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
583 : auto aHi_bHi =
584 32 : _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
585 8 : return _mm_add_pd(aLo_bLo, aHi_bHi); // (aLo + aHi, bLo + bHi)
586 : }
587 : #endif
588 :
589 40 : inline __m128d SQUARE(__m128d x)
590 : {
591 40 : return _mm_mul_pd(x, x);
592 : }
593 :
594 : #ifdef __AVX2__
595 :
596 : inline __m256d SQUARE(__m256d x)
597 : {
598 : return _mm256_mul_pd(x, x);
599 : }
600 :
601 : inline __m256d FIXUP_LANES(__m256d x)
602 : {
603 : return _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
604 : }
605 :
606 : inline __m256 FIXUP_LANES(__m256 x)
607 : {
608 : return _mm256_castpd_ps(FIXUP_LANES(_mm256_castps_pd(x)));
609 : }
610 :
611 : #endif
612 :
// Vectorized quadratic mean (RMS) over 2x2 source blocks of 16-bit values,
// producing 4 output pixels per loop iteration.
//
// Each iteration reads 8 source values at pSrcScanlineShifted and 8 at
// pSrcScanlineShifted + nChunkXSize (the next source line), and stores four
// 16-bit results at pDstScanline[iDstPixel].
//
//   nDstXWidth                number of destination pixels wanted; only full
//                             groups of 4 pixels are produced here.
//   nChunkXSize               distance, in elements, between the two source
//                             lines.
//   pSrcScanlineShiftedInOut  in: first source element to read.
//                             out: advanced by 8 per group processed.
//   pDstScanline              destination line.
//
// Returns the number of destination pixels written (a multiple of 4). The
// remaining pixels are presumably handled by the caller - TODO confirm.
// NOTE(review): data is loaded and stored as 16-bit lanes, so T is assumed
// to be a 16-bit unsigned type - confirm against callers.
//
// Two code paths are used per group: an integer/float one when all 16 loaded
// values are below 2^14, and a double precision one otherwise.
613 : template <class T>
614 : static int
615 10 : QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
616 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
617 : T *CPL_RESTRICT pDstScanline)
618 : {
619 : // Optimized implementation for RMS on UInt16 by
620 : // processing by group of 4 output pixels.
621 10 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
622 :
623 10 : int iDstPixel = 0;
624 10 : const auto zero = _mm_setzero_si128();
625 :
626 : #ifdef __AVX2__
627 : const auto zeroDot25 = _mm256_set1_pd(0.25);
628 : const auto zeroDot5 = _mm256_set1_pd(0.5);
629 :
630 : // The first four 0's could be anything, as we only take the bottom
631 : // 128 bits.
632 : const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
633 : #else
634 10 : const auto zeroDot25 = _mm_set1_pd(0.25);
635 10 : const auto zeroDot5 = _mm_set1_pd(0.5);
636 : #endif
637 :
    // Stop as soon as fewer than 4 destination pixels remain.
638 40 : for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
639 : {
640 : // Load 8 UInt16 from each line
641 30 : const auto firstLine = _mm_loadu_si128(
642 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
643 : const auto secondLine =
644 30 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
645 30 : pSrcScanlineShifted + nChunkXSize));
646 :
647 : // Detect if all of the source values fit in 14 bits.
648 : // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
649 : // and we can do a much faster implementation.
        // OR-ing both lines and shifting right by 14 leaves, in each 16-bit
        // lane, only the two upper bits of the values loaded in that lane.
650 : const auto maskTmp =
651 60 : _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
652 : #if defined(__i386__) || defined(_M_IX86)
        // 32-bit x86 build: the low 64 bits of the packed mask are written
        // to memory (presumably because _mm_cvtsi128_si64 is not available
        // there - TODO confirm).
653 : uint64_t nMaskFitsIn14Bits = 0;
654 : _mm_storel_epi64(
655 : reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
656 : _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
657 : #else
658 30 : const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
659 : _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
660 : #endif
        // Zero means no loaded value has any of its two upper bits set.
661 30 : if (nMaskFitsIn14Bits == 0)
662 : {
663 : // Multiplication of 16 bit values and horizontal
664 : // addition of 32 bit results
665 : const auto firstLineHSumSquare =
666 26 : _mm_madd_epi16(firstLine, firstLine);
667 : const auto secondLineHSumSquare =
668 26 : _mm_madd_epi16(secondLine, secondLine);
669 : // Vertical addition
670 : const auto sumSquares =
671 26 : _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
672 : // In theory we should take sqrt(sumSquares * 0.25f)
673 : // but given the rounding we do, this is equivalent to
674 : // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
675 : // sumSquares <= 4 * 16383^2
676 26 : const auto one32 = _mm_set1_epi32(1);
677 : const auto sumSquaresPlusOneDiv4 =
678 52 : _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
679 : // Take square root and truncate/floor to int32
680 78 : auto rms = _mm_cvttps_epi32(
681 : _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
682 :
683 : // Round to upper value if it minimizes the
684 : // error |rms^2 - sumSquares/4|
685 : // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
686 : // rms += 1;
687 : // which is equivalent to:
688 : // if( rms * rms + rms < (sumSquares+1) / 4 )
689 : // rms += 1;
            // _mm_madd_epi16(rms, rms) is used to get rms * rms per 32-bit
            // lane: the upper 16-bit half of each lane is expected to be
            // zero on this path, so only the low half contributes.
690 : auto mask =
691 78 : _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
692 : _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
            // mask lanes are -1 where the correction applies: subtracting
            // them adds 1.
693 26 : rms = _mm_sub_epi32(rms, mask);
694 : // Pack each 32 bit RMS value to 16 bits
695 26 : rms = _mm_packs_epi32(rms, rms /* could be anything */);
696 : _mm_storel_epi64(
697 26 : reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
698 26 : pSrcScanlineShifted += 8;
699 26 : continue;
700 : }
701 :
        // General path: at least one value needs more than 14 bits, so the
        // squares are accumulated in double precision.
702 : // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
703 : // to 32 bit would result in 4 multiplications instead of 8, but
704 : // mullo/mulhi have a worse throughput than mul_pd.
705 :
706 : // Extend those UInt16s as UInt32s
707 4 : const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
708 4 : const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
709 4 : const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
710 4 : const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
711 :
712 : #ifdef __AVX2__
713 : // Multiplication of 32 bit values previously converted to 64 bit double
714 : const auto firstLineLoDbl = SQUARE(_mm256_cvtepi32_pd(firstLineLo));
715 : const auto firstLineHiDbl = SQUARE(_mm256_cvtepi32_pd(firstLineHi));
716 : const auto secondLineLoDbl = SQUARE(_mm256_cvtepi32_pd(secondLineLo));
717 : const auto secondLineHiDbl = SQUARE(_mm256_cvtepi32_pd(secondLineHi));
718 :
719 : // Vertical addition of squares
720 : const auto sumSquaresLo =
721 : _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
722 : const auto sumSquaresHi =
723 : _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
724 :
725 : // Horizontal addition of squares
726 : const auto sumSquares =
727 : FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
728 :
729 : const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
730 :
731 : // Take square root and truncate/floor to int32
732 : auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
        // Same rounding rule as in the non-AVX2 branch below:
        // if( 0.5 < sumDivWeight - (rms * rms + rms) ) rms += 1;
733 : const auto rmsDouble = _mm256_cvtepi32_pd(rms);
734 : const auto right = _mm256_sub_pd(
735 : sumDivWeight, _mm256_add_pd(SQUARE(rmsDouble), rmsDouble));
736 :
737 : auto mask =
738 : _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
739 : // Extract 32-bit from each of the 4 64-bit masks
740 : // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
741 : // _MM_SHUFFLE(2,0,2,0)));
742 : mask = _mm256_permutevar8x32_ps(mask, permutation);
743 : const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
744 :
745 : // Apply the correction
746 : rms = _mm_sub_epi32(rms, maskI);
747 :
748 : // Pack each 32 bit RMS value to 16 bits
749 : rms = _mm_packus_epi32(rms, rms /* could be anything */);
750 : #else
751 : // Multiplication of 32 bit values previously converted to 64 bit double
        // _mm_cvtepi32_pd converts two 32-bit values at a time, hence the
        // byte shift by 8 to reach the upper two values of each register.
752 4 : const auto firstLineLoLo = SQUARE(_mm_cvtepi32_pd(firstLineLo));
753 : const auto firstLineLoHi =
754 8 : SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
755 4 : const auto firstLineHiLo = SQUARE(_mm_cvtepi32_pd(firstLineHi));
756 : const auto firstLineHiHi =
757 8 : SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
758 :
759 4 : const auto secondLineLoLo = SQUARE(_mm_cvtepi32_pd(secondLineLo));
760 : const auto secondLineLoHi =
761 8 : SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
762 4 : const auto secondLineHiLo = SQUARE(_mm_cvtepi32_pd(secondLineHi));
763 : const auto secondLineHiHi =
764 8 : SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
765 :
766 : // Vertical addition of squares
767 4 : const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
768 4 : const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
769 4 : const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
770 4 : const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
771 :
772 : // Horizontal addition of squares
773 4 : const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
774 4 : const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
775 :
776 4 : const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
777 4 : const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
778 : // Take square root and truncate/floor to int32
779 8 : const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
780 8 : const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
781 :
782 : // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
783 : // if( 0.5 < sumDivWeight - (rms * rms + rms) )
784 : // rms += 1;
785 4 : const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
786 4 : const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
787 8 : const auto rightLo = _mm_sub_pd(
788 : sumDivWeightLo, _mm_add_pd(SQUARE(rmsLoDouble), rmsLoDouble));
789 12 : const auto rightHi = _mm_sub_pd(
790 : sumDivWeightHi, _mm_add_pd(SQUARE(rmsHiDouble), rmsHiDouble));
791 :
792 8 : const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
793 4 : const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
794 : // The value of the mask will be -1 when the correction needs to be
795 : // applied
        // Keep one 32-bit half of each 64-bit comparison result, so that
        // the mask lines up with the four 32-bit rms values.
796 8 : const auto mask = _mm_castps_si128(_mm_shuffle_ps(
797 : maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
798 :
        // Gather the two rms values of rmsLo and the two of rmsHi into a
        // single register.
799 16 : auto rms = _mm_castps_si128(
800 : _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
801 : // Apply the correction
802 4 : rms = _mm_sub_epi32(rms, mask);
803 :
804 : // Pack each 32 bit RMS value to 16 bits
805 4 : rms = sse2_packus_epi32(rms, rms /* could be anything */);
806 : #endif
807 :
808 4 : _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
809 : rms);
810 4 : pSrcScanlineShifted += 8;
811 : }
812 :
813 : zeroupper();
814 :
    // Report back how far the source pointer advanced.
815 10 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
816 10 : return iDstPixel;
817 : }
818 :
819 : /************************************************************************/
820 : /* AverageUInt16SSE2() */
821 : /************************************************************************/
822 :
823 : template <class T>
824 9 : static int AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
825 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
826 : T *CPL_RESTRICT pDstScanline)
827 : {
828 : // Optimized implementation for average on UInt16 by
829 : // processing by group of 8 output pixels.
830 :
831 9 : const auto mask = _mm_set1_epi32(0xFFFF);
832 9 : const auto two = _mm_set1_epi32(2);
833 9 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
834 :
835 9 : int iDstPixel = 0;
836 13 : for (; iDstPixel < nDstXWidth - 7; iDstPixel += 8)
837 : {
838 : __m128i averageLow;
839 : // Load 8 UInt16 from each line
840 : {
841 4 : const auto firstLine = _mm_loadu_si128(
842 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
843 : const auto secondLine =
844 4 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
845 4 : pSrcScanlineShifted + nChunkXSize));
846 :
847 : // Horizontal addition and extension to 32 bit
848 12 : const auto horizAddFirstLine = _mm_add_epi32(
849 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
850 : const auto horizAddSecondLine =
851 12 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
852 : _mm_srli_epi32(secondLine, 16));
853 :
854 : // Vertical addition and average computation
855 : // average = (sum + 2) >> 2
856 8 : const auto sum = _mm_add_epi32(
857 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
858 4 : averageLow = _mm_srli_epi32(sum, 2);
859 : }
860 : // Load 8 UInt16 from each line
861 : __m128i averageHigh;
862 : {
863 4 : const auto firstLine = _mm_loadu_si128(
864 4 : reinterpret_cast<__m128i const *>(pSrcScanlineShifted + 8));
865 : const auto secondLine =
866 4 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(
867 4 : pSrcScanlineShifted + 8 + nChunkXSize));
868 :
869 : // Horizontal addition and extension to 32 bit
870 12 : const auto horizAddFirstLine = _mm_add_epi32(
871 : _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
872 : const auto horizAddSecondLine =
873 12 : _mm_add_epi32(_mm_and_si128(secondLine, mask),
874 : _mm_srli_epi32(secondLine, 16));
875 :
876 : // Vertical addition and average computation
877 : // average = (sum + 2) >> 2
878 8 : const auto sum = _mm_add_epi32(
879 : _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
880 4 : averageHigh = _mm_srli_epi32(sum, 2);
881 : }
882 :
883 : // Pack each 32 bit average value to 16 bits
884 4 : auto average = sse2_packus_epi32(averageLow, averageHigh);
885 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
886 : average);
887 4 : pSrcScanlineShifted += 16;
888 : }
889 :
890 9 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
891 9 : return iDstPixel;
892 : }
893 :
894 : /************************************************************************/
895 : /* QuadraticMeanFloatSSE2() */
896 : /************************************************************************/
897 :
// Register-width independent aliases for single-precision intrinsics.
// They map to the 256-bit forms when __AVX2__ is defined and to the 128-bit
// forms otherwise. RMS_FLOAT_ELTS is the number of floats per register.
#ifdef __AVX2__

#define RMS_FLOAT_ELTS 8

// Constants, loads and stores
#define set1_ps _mm256_set1_ps
#define loadu_ps _mm256_loadu_ps
#define storeu_ps _mm256_storeu_ps

// Arithmetic
#define add_ps _mm256_add_ps
#define hadd_ps _mm256_hadd_ps
#define mul_ps _mm256_mul_ps
#define div_ps _mm256_div_ps
#define sqrt_ps _mm256_sqrt_ps
#define max_ps _mm256_max_ps

// Bitwise logic and comparison
#define and_ps _mm256_and_ps
#define andnot_ps _mm256_andnot_ps
#define or_ps _mm256_or_ps
#define cmpeq_ps(x, y) _mm256_cmp_ps(x, y, _CMP_EQ_OQ)

// Element rearrangement
#define shuffle_ps _mm256_shuffle_ps
#define unpacklo_ps _mm256_unpacklo_ps
#define unpackhi_ps _mm256_unpackhi_ps

/** Returns the element-wise square of a 256-bit float vector. */
inline __m256 SQUARE(__m256 x)
{
    return _mm256_mul_ps(x, x);
}

#else

#ifdef __SSE3__
#define sse2_hadd_ps _mm_hadd_ps
#else
/** Horizontal pairwise addition for targets without the SSE3 instruction.
 *
 * Returns (a0 + a1, a2 + a3, b0 + b1, b2 + b3).
 */
inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
{
    // (a0, a2, b0, b2) + (a1, a3, b1, b3)
    return _mm_add_ps(_mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)),
                      _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)));
}
#endif

#define RMS_FLOAT_ELTS 4

// Constants, loads and stores
#define set1_ps _mm_set1_ps
#define loadu_ps _mm_loadu_ps
#define storeu_ps _mm_storeu_ps

// Arithmetic
#define add_ps _mm_add_ps
#define hadd_ps sse2_hadd_ps
#define mul_ps _mm_mul_ps
#define div_ps _mm_div_ps
#define sqrt_ps _mm_sqrt_ps
#define max_ps _mm_max_ps

// Bitwise logic and comparison
#define and_ps _mm_and_ps
#define andnot_ps _mm_andnot_ps
#define or_ps _mm_or_ps
#define cmpeq_ps _mm_cmpeq_ps

// Element rearrangement
#define shuffle_ps _mm_shuffle_ps
#define unpacklo_ps _mm_unpacklo_ps
#define unpackhi_ps _mm_unpackhi_ps

/** Returns the element-wise square of a 128-bit float vector. */
inline __m128 SQUARE(__m128 x)
{
    return _mm_mul_ps(x, x);
}

/** Identity for 128-bit vectors: no element reordering is applied.
 *
 * NOTE(review): the 256-bit counterpart is not visible in this part of the
 * file; presumably it restores element order after per-lane shuffles.
 */
inline __m128 FIXUP_LANES(__m128 x)
{
    return x;
}

#endif
964 :
965 : template <class T>
966 : static int NOINLINE
967 34 : QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
968 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
969 : T *CPL_RESTRICT pDstScanline)
970 : {
971 : // Optimized implementation for RMS on Float32 by
972 : // processing by group of RMS_FLOAT_ELTS output pixels.
973 34 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
974 :
975 34 : int iDstPixel = 0;
976 34 : const auto minus_zero = set1_ps(-0.0f);
977 34 : const auto zeroDot25 = set1_ps(0.25f);
978 34 : const auto one = set1_ps(1.0f);
979 68 : const auto infv = set1_ps(std::numeric_limits<float>::infinity());
980 :
981 102 : for (; iDstPixel < nDstXWidth - (RMS_FLOAT_ELTS - 1);
982 : iDstPixel += RMS_FLOAT_ELTS)
983 : {
984 : // Load 2*RMS_FLOAT_ELTS Float32 from each line
985 : auto firstLineLo =
986 68 : loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
987 68 : auto firstLineHi = loadu_ps(reinterpret_cast<float const *>(
988 68 : pSrcScanlineShifted + RMS_FLOAT_ELTS));
989 68 : auto secondLineLo = loadu_ps(
990 68 : reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
991 68 : auto secondLineHi = loadu_ps(reinterpret_cast<float const *>(
992 68 : pSrcScanlineShifted + RMS_FLOAT_ELTS + nChunkXSize));
993 :
994 : // Take the absolute value
995 68 : firstLineLo = andnot_ps(minus_zero, firstLineLo);
996 68 : firstLineHi = andnot_ps(minus_zero, firstLineHi);
997 68 : secondLineLo = andnot_ps(minus_zero, secondLineLo);
998 68 : secondLineHi = andnot_ps(minus_zero, secondLineHi);
999 :
1000 : auto firstLineEven =
1001 68 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1002 : auto firstLineOdd =
1003 68 : shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1004 : auto secondLineEven =
1005 68 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1006 : auto secondLineOdd =
1007 68 : shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1008 :
1009 : // Compute the maximum of each RMS_FLOAT_ELTS value to RMS-average
1010 204 : const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
1011 : max_ps(secondLineEven, secondLineEven));
1012 :
1013 : // Normalize each value by the maximum of the RMS_FLOAT_ELTS ones.
1014 : // This step is important to avoid that the square evaluates to infinity
1015 : // for sufficiently big input.
1016 68 : auto invMax = div_ps(one, maxV);
1017 : // Deal with 0 being the maximum to correct division by zero
1018 : // note: comparing to -0 leads to identical results as to comparing with
1019 : // 0
1020 136 : invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
1021 :
1022 68 : firstLineEven = mul_ps(firstLineEven, invMax);
1023 68 : firstLineOdd = mul_ps(firstLineOdd, invMax);
1024 68 : secondLineEven = mul_ps(secondLineEven, invMax);
1025 68 : secondLineOdd = mul_ps(secondLineOdd, invMax);
1026 :
1027 : // Compute squares
1028 68 : firstLineEven = SQUARE(firstLineEven);
1029 68 : firstLineOdd = SQUARE(firstLineOdd);
1030 68 : secondLineEven = SQUARE(secondLineEven);
1031 68 : secondLineOdd = SQUARE(secondLineOdd);
1032 :
1033 204 : const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
1034 : add_ps(secondLineEven, secondLineOdd));
1035 :
1036 204 : auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
1037 :
1038 : // Deal with infinity being the maximum
1039 68 : const auto maskIsInf = cmpeq_ps(maxV, infv);
1040 136 : rms = or_ps(andnot_ps(maskIsInf, rms), and_ps(maskIsInf, infv));
1041 :
1042 68 : rms = FIXUP_LANES(rms);
1043 :
1044 : // coverity[incompatible_cast]
1045 68 : storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]), rms);
1046 68 : pSrcScanlineShifted += RMS_FLOAT_ELTS * 2;
1047 : }
1048 :
1049 : zeroupper();
1050 :
1051 34 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1052 34 : return iDstPixel;
1053 : }
1054 :
1055 : /************************************************************************/
1056 : /* AverageFloatSSE2() */
1057 : /************************************************************************/
1058 :
1059 : template <class T>
1060 27 : static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
1061 : const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1062 : T *CPL_RESTRICT pDstScanline)
1063 : {
1064 : // Optimized implementation for average on Float32 by
1065 : // processing by group of 4 output pixels.
1066 27 : const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1067 :
1068 27 : int iDstPixel = 0;
1069 27 : const auto zeroDot25 = _mm_set1_ps(0.25f);
1070 :
1071 55 : for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
1072 : {
1073 : // Load 8 Float32 from each line
1074 : const auto firstLineLo =
1075 28 : _mm_loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
1076 28 : const auto firstLineHi = _mm_loadu_ps(
1077 28 : reinterpret_cast<float const *>(pSrcScanlineShifted + 4));
1078 28 : const auto secondLineLo = _mm_loadu_ps(
1079 28 : reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
1080 28 : const auto secondLineHi = _mm_loadu_ps(reinterpret_cast<float const *>(
1081 28 : pSrcScanlineShifted + 4 + nChunkXSize));
1082 :
1083 : // Vertical addition
1084 28 : const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
1085 28 : const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
1086 :
1087 : // Horizontal addition
1088 : const auto A =
1089 28 : _mm_shuffle_ps(sumLo, sumHi, 0 | (2 << 2) | (0 << 4) | (2 << 6));
1090 : const auto B =
1091 28 : _mm_shuffle_ps(sumLo, sumHi, 1 | (3 << 2) | (1 << 4) | (3 << 6));
1092 28 : const auto sum = _mm_add_ps(A, B);
1093 :
1094 28 : const auto average = _mm_mul_ps(sum, zeroDot25);
1095 :
1096 : // coverity[incompatible_cast]
1097 28 : _mm_storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]),
1098 : average);
1099 28 : pSrcScanlineShifted += 8;
1100 : }
1101 :
1102 27 : pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1103 27 : return iDstPixel;
1104 : }
1105 :
1106 : #endif
1107 :
1108 : /************************************************************************/
1109 : /* GDALResampleChunk_AverageOrRMS() */
1110 : /************************************************************************/
1111 :
1112 : template <class T, class Tsum, GDALDataType eWrkDataType>
1113 2287 : static CPLErr GDALResampleChunk_AverageOrRMS_T(
1114 : double dfXRatioDstToSrc, double dfYRatioDstToSrc, double dfSrcXDelta,
1115 : double dfSrcYDelta, const T *pChunk, const GByte *pabyChunkNodataMask,
1116 : int nChunkXOff, int nChunkXSize, int nChunkYOff, int nChunkYSize,
1117 : int nDstXOff, int nDstXOff2, int nDstYOff, int nDstYOff2,
1118 : GDALRasterBand *poOverview, void **ppDstBuffer, const char *pszResampling,
1119 : bool bHasNoData, double dfNoDataValue, GDALColorTable *poColorTable,
1120 : bool bPropagateNoData)
1121 : {
1122 : // AVERAGE_BIT2GRAYSCALE
1123 : const bool bBit2Grayscale =
1124 2287 : CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"));
1125 2287 : const bool bQuadraticMean = CPL_TO_BOOL(EQUAL(pszResampling, "RMS"));
1126 2287 : if (bBit2Grayscale)
1127 9 : poColorTable = nullptr;
1128 :
1129 : T tNoDataValue;
1130 2287 : if (!bHasNoData)
1131 2240 : tNoDataValue = 0;
1132 : else
1133 47 : tNoDataValue = static_cast<T>(dfNoDataValue);
1134 2287 : const T tReplacementVal =
1135 77 : bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
1136 : poOverview->GetRasterDataType(), dfNoDataValue))
1137 : : 0;
1138 :
1139 2287 : int nChunkRightXOff = nChunkXOff + nChunkXSize;
1140 2287 : int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1141 2287 : int nDstXWidth = nDstXOff2 - nDstXOff;
1142 :
1143 : /* -------------------------------------------------------------------- */
1144 : /* Allocate buffers. */
1145 : /* -------------------------------------------------------------------- */
1146 2287 : *ppDstBuffer = static_cast<T *>(
1147 2287 : VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
1148 : GDALGetDataTypeSizeBytes(eWrkDataType)));
1149 2287 : if (*ppDstBuffer == nullptr)
1150 : {
1151 0 : return CE_Failure;
1152 : }
1153 2287 : T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
1154 :
1155 : struct PrecomputedXValue
1156 : {
1157 : int nLeftXOffShifted;
1158 : int nRightXOffShifted;
1159 : double dfLeftWeight;
1160 : double dfRightWeight;
1161 : double dfTotalWeightFullLine;
1162 : };
1163 :
1164 : PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
1165 2287 : VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(PrecomputedXValue)));
1166 :
1167 2287 : if (pasSrcX == nullptr)
1168 : {
1169 0 : VSIFree(pasSrcX);
1170 0 : return CE_Failure;
1171 : }
1172 :
1173 2287 : int nTransparentIdx = -1;
1174 2287 : std::vector<GDALColorEntry> colorEntries;
1175 2287 : if (poColorTable)
1176 5 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1177 :
1178 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1179 : // it as nodata value
1180 2310 : if (bHasNoData && dfNoDataValue >= 0.0f &&
1181 23 : tNoDataValue < colorEntries.size())
1182 1 : colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1183 :
1184 : // Or if we have no explicit nodata, but a color table entry that is
1185 : // transparent, consider it as the nodata value
1186 2286 : else if (!bHasNoData && nTransparentIdx >= 0)
1187 : {
1188 0 : bHasNoData = TRUE;
1189 0 : tNoDataValue = static_cast<T>(nTransparentIdx);
1190 : }
1191 :
1192 : /* ==================================================================== */
1193 : /* Precompute inner loop constants. */
1194 : /* ==================================================================== */
1195 2287 : bool bSrcXSpacingIsTwo = true;
1196 2287 : int nLastSrcXOff2 = -1;
1197 848986 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1198 : {
1199 846699 : double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
1200 : // Apply some epsilon to avoid numerical precision issues
1201 846699 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
1202 846699 : double dfSrcXOff2 = dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
1203 846699 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
1204 :
1205 846699 : if (nSrcXOff < nChunkXOff)
1206 0 : nSrcXOff = nChunkXOff;
1207 846699 : if (nSrcXOff2 == nSrcXOff)
1208 0 : nSrcXOff2++;
1209 846699 : if (nSrcXOff2 > nChunkRightXOff)
1210 1 : nSrcXOff2 = nChunkRightXOff;
1211 :
1212 846699 : pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
1213 846699 : pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
1214 846699 : nSrcXOff2 - nChunkXOff;
1215 21 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
1216 846699 : (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
1217 846699 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
1218 846699 : 1 - (nSrcXOff2 - dfSrcXOff2);
1219 846699 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
1220 846699 : pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
1221 846699 : if (nSrcXOff + 1 < nSrcXOff2)
1222 : {
1223 846678 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1224 846678 : nSrcXOff2 - nSrcXOff - 2;
1225 846678 : pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1226 846678 : pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
1227 : }
1228 :
1229 846699 : if (nSrcXOff2 - nSrcXOff != 2 ||
1230 726341 : (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
1231 : {
1232 119568 : bSrcXSpacingIsTwo = false;
1233 : }
1234 846699 : nLastSrcXOff2 = nSrcXOff2;
1235 : }
1236 :
1237 : /* ==================================================================== */
1238 : /* Loop over destination scanlines. */
1239 : /* ==================================================================== */
1240 718529 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1241 : {
1242 716242 : double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
1243 716242 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
1244 716242 : if (nSrcYOff < nChunkYOff)
1245 0 : nSrcYOff = nChunkYOff;
1246 :
1247 716242 : double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
1248 716242 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
1249 716242 : if (nSrcYOff2 == nSrcYOff)
1250 0 : ++nSrcYOff2;
1251 716242 : if (nSrcYOff2 > nChunkBottomYOff)
1252 3 : nSrcYOff2 = nChunkBottomYOff;
1253 :
1254 716242 : T *const pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
1255 :
1256 : /* --------------------------------------------------------------------
1257 : */
1258 : /* Loop over destination pixels */
1259 : /* --------------------------------------------------------------------
1260 : */
1261 716242 : if (poColorTable == nullptr)
1262 : {
1263 716127 : if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
1264 : pabyChunkNodataMask == nullptr)
1265 : {
1266 : if (eWrkDataType == GDT_Byte || eWrkDataType == GDT_UInt16)
1267 : {
1268 : // Optimized case : no nodata, overview by a factor of 2 and
1269 : // regular x and y src spacing.
1270 116350 : const T *pSrcScanlineShifted =
1271 116350 : pChunk + pasSrcX[0].nLeftXOffShifted +
1272 116350 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
1273 116350 : nChunkXSize;
1274 116350 : int iDstPixel = 0;
1275 : #ifdef USE_SSE2
1276 116331 : if (bQuadraticMean && eWrkDataType == GDT_Byte)
1277 : {
1278 5385 : iDstPixel = QuadraticMeanByteSSE2OrAVX2(
1279 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1280 : pDstScanline);
1281 : }
1282 110965 : else if (bQuadraticMean /* && eWrkDataType == GDT_UInt16 */)
1283 : {
1284 10 : iDstPixel = QuadraticMeanUInt16SSE2(
1285 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1286 : pDstScanline);
1287 : }
1288 : else if (/* !bQuadraticMean && */ eWrkDataType == GDT_Byte)
1289 : {
1290 110946 : iDstPixel = AverageByteSSE2OrAVX2(
1291 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1292 : pDstScanline);
1293 : }
1294 : else /* if( !bQuadraticMean && eWrkDataType == GDT_UInt16 )
1295 : */
1296 : {
1297 9 : iDstPixel = AverageUInt16SSE2(nDstXWidth, nChunkXSize,
1298 : pSrcScanlineShifted,
1299 : pDstScanline);
1300 : }
1301 : #endif
1302 278621 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1303 : {
1304 162271 : Tsum nTotal = 0;
1305 : T nVal;
1306 162271 : if (bQuadraticMean)
1307 44 : nTotal =
1308 44 : SQUARE<Tsum>(pSrcScanlineShifted[0]) +
1309 44 : SQUARE<Tsum>(pSrcScanlineShifted[1]) +
1310 44 : SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
1311 44 : SQUARE<Tsum>(
1312 44 : pSrcScanlineShifted[1 + nChunkXSize]);
1313 : else
1314 162227 : nTotal = pSrcScanlineShifted[0] +
1315 162227 : pSrcScanlineShifted[1] +
1316 162227 : pSrcScanlineShifted[nChunkXSize] +
1317 162227 : pSrcScanlineShifted[1 + nChunkXSize];
1318 :
1319 162271 : constexpr int nTotalWeight = 4;
1320 162271 : if (bQuadraticMean)
1321 44 : nVal = ComputeIntegerRMS_4values<T>(nTotal);
1322 : else
1323 162227 : nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
1324 : nTotalWeight);
1325 :
1326 : // No need to compare nVal against tNoDataValue as we
1327 : // are in a case where pabyChunkNodataMask == nullptr
1328 : // implies the absence of nodata value.
1329 162271 : pDstScanline[iDstPixel] = nVal;
1330 162271 : pSrcScanlineShifted += 2;
1331 : }
1332 : }
1333 : else
1334 : {
1335 : CPLAssert(eWrkDataType == GDT_Float32 ||
1336 : eWrkDataType == GDT_Float64);
1337 70 : const T *pSrcScanlineShifted =
1338 70 : pChunk + pasSrcX[0].nLeftXOffShifted +
1339 70 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
1340 70 : nChunkXSize;
1341 70 : int iDstPixel = 0;
1342 : #ifdef USE_SSE2
1343 : if (eWrkDataType == GDT_Float32)
1344 : {
1345 61 : if (bQuadraticMean)
1346 : {
1347 34 : iDstPixel = QuadraticMeanFloatSSE2(
1348 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1349 : pDstScanline);
1350 : }
1351 : else
1352 : {
1353 27 : iDstPixel = AverageFloatSSE2(
1354 : nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1355 : pDstScanline);
1356 : }
1357 : }
1358 : #endif
1359 :
1360 228 : for (; iDstPixel < nDstXWidth; ++iDstPixel)
1361 : {
1362 : T nVal;
1363 158 : if (bQuadraticMean)
1364 : {
1365 : // Cast to double to avoid overflows
1366 : // (using std::hypot() is much slower)
1367 100 : nVal = static_cast<T>(std::sqrt(
1368 : 0.25 *
1369 100 : (SQUARE<double>(pSrcScanlineShifted[0]) +
1370 100 : SQUARE<double>(pSrcScanlineShifted[1]) +
1371 100 : SQUARE<double>(
1372 200 : pSrcScanlineShifted[nChunkXSize]) +
1373 100 : SQUARE<double>(
1374 100 : pSrcScanlineShifted[1 + nChunkXSize]))));
1375 : }
1376 : else
1377 : {
1378 58 : nVal = static_cast<T>(
1379 58 : 0.25f * (pSrcScanlineShifted[0] +
1380 58 : pSrcScanlineShifted[1] +
1381 58 : pSrcScanlineShifted[nChunkXSize] +
1382 58 : pSrcScanlineShifted[1 + nChunkXSize]));
1383 : }
1384 :
1385 : // No need to compare nVal against tNoDataValue as we
1386 : // are in a case where pabyChunkNodataMask == nullptr
1387 : // implies the absence of nodata value.
1388 158 : pDstScanline[iDstPixel] = nVal;
1389 158 : pSrcScanlineShifted += 2;
1390 : }
1391 116420 : }
1392 : }
1393 : else
1394 : {
1395 17 : const double dfBottomWeight =
1396 599707 : (nSrcYOff + 1 == nSrcYOff2) ? 1.0
1397 599690 : : 1.0 - (dfSrcYOff - nSrcYOff);
1398 599707 : const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
1399 599707 : nSrcYOff -= nChunkYOff;
1400 599707 : nSrcYOff2 -= nChunkYOff;
1401 :
1402 599707 : double dfTotalWeightFullColumn = dfBottomWeight;
1403 599707 : if (nSrcYOff + 1 < nSrcYOff2)
1404 : {
1405 599690 : dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
1406 599690 : dfTotalWeightFullColumn += dfTopWeight;
1407 : }
1408 :
1409 18032356 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1410 : {
1411 17431981 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1412 17431981 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1413 :
1414 17431981 : double dfTotal = 0;
1415 17431981 : double dfTotalWeight = 0;
1416 17431981 : if (pabyChunkNodataMask == nullptr)
1417 : {
1418 1746435 : auto pChunkShifted =
1419 115 : pChunk +
1420 1746435 : static_cast<GPtrDiff_t>(nSrcYOff) * nChunkXSize;
1421 1746435 : int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1422 1746435 : double dfWeightY = dfBottomWeight;
1423 3493427 : while (true)
1424 : {
1425 : double dfTotalLine;
1426 5239852 : if (bQuadraticMean)
1427 : {
1428 : // Left pixel
1429 : {
1430 104 : const T val = pChunkShifted[nSrcXOff];
1431 104 : dfTotalLine =
1432 104 : SQUARE<double>(val) *
1433 104 : pasSrcX[iDstPixel].dfLeftWeight;
1434 : }
1435 :
1436 104 : if (nSrcXOff + 1 < nSrcXOff2)
1437 : {
1438 : // Middle pixels
1439 104 : for (int iX = nSrcXOff + 1;
1440 424 : iX + 1 < nSrcXOff2; ++iX)
1441 : {
1442 320 : const T val = pChunkShifted[iX];
1443 320 : dfTotalLine += SQUARE<double>(val);
1444 : }
1445 :
1446 : // Right pixel
1447 : {
1448 104 : const T val =
1449 104 : pChunkShifted[nSrcXOff2 - 1];
1450 104 : dfTotalLine +=
1451 104 : SQUARE<double>(val) *
1452 104 : pasSrcX[iDstPixel].dfRightWeight;
1453 : }
1454 : }
1455 : }
1456 : else
1457 : {
1458 : // Left pixel
1459 : {
1460 5239756 : const T val = pChunkShifted[nSrcXOff];
1461 5239756 : dfTotalLine =
1462 5239756 : val * pasSrcX[iDstPixel].dfLeftWeight;
1463 : }
1464 :
1465 5239756 : if (nSrcXOff + 1 < nSrcXOff2)
1466 : {
1467 : // Middle pixels
1468 4239330 : for (int iX = nSrcXOff + 1;
1469 64183126 : iX + 1 < nSrcXOff2; ++iX)
1470 : {
1471 59943836 : const T val = pChunkShifted[iX];
1472 59943836 : dfTotalLine += val;
1473 : }
1474 :
1475 : // Right pixel
1476 : {
1477 4239330 : const T val =
1478 4239330 : pChunkShifted[nSrcXOff2 - 1];
1479 4239330 : dfTotalLine +=
1480 4239330 : val *
1481 4239330 : pasSrcX[iDstPixel].dfRightWeight;
1482 : }
1483 : }
1484 : }
1485 :
1486 5239852 : dfTotal += dfTotalLine * dfWeightY;
1487 5239852 : --nCounterY;
1488 5239852 : if (nCounterY < 0)
1489 1746435 : break;
1490 3493427 : pChunkShifted += nChunkXSize;
1491 3493427 : dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
1492 : }
1493 :
1494 1746435 : dfTotalWeight =
1495 1746435 : pasSrcX[iDstPixel].dfTotalWeightFullLine *
1496 : dfTotalWeightFullColumn;
1497 : }
1498 : else
1499 : {
1500 15685566 : GPtrDiff_t nCount = 0;
1501 69080198 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1502 : {
1503 53394432 : const auto pChunkShifted =
1504 132 : pChunk +
1505 53394432 : static_cast<GPtrDiff_t>(iY) * nChunkXSize;
1506 :
1507 53394432 : double dfTotalLine = 0;
1508 53394432 : double dfTotalWeightLine = 0;
1509 : // Left pixel
1510 : {
1511 53394432 : const int iX = nSrcXOff;
1512 53394432 : const T val = pChunkShifted[iX];
1513 53394432 : if (pabyChunkNodataMask[iX + iY * nChunkXSize])
1514 : {
1515 23412781 : nCount++;
1516 23412781 : const double dfWeightX =
1517 23412781 : pasSrcX[iDstPixel].dfLeftWeight;
1518 23412781 : dfTotalWeightLine = dfWeightX;
1519 23412781 : if (bQuadraticMean)
1520 60 : dfTotalLine =
1521 60 : SQUARE<double>(val) * dfWeightX;
1522 : else
1523 23412781 : dfTotalLine = val * dfWeightX;
1524 : }
1525 : }
1526 :
1527 53394432 : if (nSrcXOff + 1 < nSrcXOff2)
1528 : {
1529 : // Middle pixels
1530 141491132 : for (int iX = nSrcXOff + 1; iX + 1 < nSrcXOff2;
1531 : ++iX)
1532 : {
1533 88095700 : const T val = pChunkShifted[iX];
1534 88095700 : if (pabyChunkNodataMask[iX +
1535 88095700 : iY * nChunkXSize])
1536 : {
1537 39727500 : nCount++;
1538 39727500 : dfTotalWeightLine += 1;
1539 39727500 : if (bQuadraticMean)
1540 0 : dfTotalLine += SQUARE<double>(val);
1541 : else
1542 39727500 : dfTotalLine += val;
1543 : }
1544 : }
1545 :
1546 : // Right pixel
1547 : {
1548 53395332 : const int iX = nSrcXOff2 - 1;
1549 53395332 : const T val = pChunkShifted[iX];
1550 53395332 : if (pabyChunkNodataMask[iX +
1551 53395332 : iY * nChunkXSize])
1552 : {
1553 23412747 : nCount++;
1554 23412747 : const double dfWeightX =
1555 23412747 : pasSrcX[iDstPixel].dfRightWeight;
1556 23412747 : dfTotalWeightLine += dfWeightX;
1557 23412747 : if (bQuadraticMean)
1558 1 : dfTotalLine +=
1559 61 : SQUARE<double>(val) * dfWeightX;
1560 : else
1561 23412646 : dfTotalLine += val * dfWeightX;
1562 : }
1563 : }
1564 : }
1565 :
1566 91105998 : const double dfWeightY =
1567 : (iY == nSrcYOff) ? dfBottomWeight
1568 37711466 : : (iY + 1 == nSrcYOff2) ? dfTopWeight
1569 : : 1.0;
1570 53394532 : dfTotal += dfTotalLine * dfWeightY;
1571 53394532 : dfTotalWeight += dfTotalWeightLine * dfWeightY;
1572 : }
1573 :
1574 15685766 : if (nCount == 0 ||
1575 8 : (bPropagateNoData &&
1576 : nCount <
1577 8 : static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
1578 8 : (nSrcXOff2 - nSrcXOff)))
1579 : {
1580 8937432 : pDstScanline[iDstPixel] = tNoDataValue;
1581 8937432 : continue;
1582 : }
1583 : }
1584 : if (eWrkDataType == GDT_Byte)
1585 : {
1586 : T nVal;
1587 8494610 : if (bQuadraticMean)
1588 38 : nVal = ComputeIntegerRMS<T, int>(dfTotal,
1589 : dfTotalWeight);
1590 : else
1591 8494570 : nVal =
1592 8494570 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1593 8495070 : if (bHasNoData && nVal == tNoDataValue)
1594 0 : nVal = tReplacementVal;
1595 8495070 : pDstScanline[iDstPixel] = nVal;
1596 : }
1597 : else if (eWrkDataType == GDT_UInt16)
1598 : {
1599 : T nVal;
1600 8 : if (bQuadraticMean)
1601 4 : nVal = ComputeIntegerRMS<T, uint64_t>(
1602 : dfTotal, dfTotalWeight);
1603 : else
1604 4 : nVal =
1605 4 : static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1606 8 : if (bHasNoData && nVal == tNoDataValue)
1607 0 : nVal = tReplacementVal;
1608 8 : pDstScanline[iDstPixel] = nVal;
1609 : }
1610 : else
1611 : {
1612 : T nVal;
1613 151 : if (bQuadraticMean)
1614 20 : nVal =
1615 25 : static_cast<T>(sqrt(dfTotal / dfTotalWeight));
1616 : else
1617 126 : nVal = static_cast<T>(dfTotal / dfTotalWeight);
1618 151 : if (bHasNoData && nVal == tNoDataValue)
1619 2 : nVal = tReplacementVal;
1620 151 : pDstScanline[iDstPixel] = nVal;
1621 : }
1622 : }
1623 : }
1624 : }
1625 : else
1626 : {
1627 115 : nSrcYOff -= nChunkYOff;
1628 115 : nSrcYOff2 -= nChunkYOff;
1629 :
1630 5948 : for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1631 : {
1632 6475 : const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1633 6475 : const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1634 :
1635 6475 : GPtrDiff_t nTotalR = 0;
1636 6475 : GPtrDiff_t nTotalG = 0;
1637 6475 : GPtrDiff_t nTotalB = 0;
1638 6475 : GPtrDiff_t nCount = 0;
1639 :
1640 19425 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1641 : {
1642 38850 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
1643 : {
1644 25900 : const T val = pChunk[iX + static_cast<GPtrDiff_t>(iY) *
1645 25900 : nChunkXSize];
1646 : // cppcheck-suppress unsignedLessThanZero
1647 25900 : if (val < 0 || val >= colorEntries.size())
1648 0 : continue;
1649 25900 : size_t idx = static_cast<size_t>(val);
1650 25900 : const auto &entry = colorEntries[idx];
1651 25900 : if (entry.c4)
1652 : {
1653 14128 : if (bQuadraticMean)
1654 : {
1655 800 : nTotalR += SQUARE<int>(entry.c1);
1656 800 : nTotalG += SQUARE<int>(entry.c2);
1657 800 : nTotalB += SQUARE<int>(entry.c3);
1658 800 : ++nCount;
1659 : }
1660 : else
1661 : {
1662 13328 : nTotalR += entry.c1;
1663 13328 : nTotalG += entry.c2;
1664 13328 : nTotalB += entry.c3;
1665 13328 : ++nCount;
1666 : }
1667 : }
1668 : }
1669 : }
1670 :
1671 6475 : if (nCount == 0 ||
1672 0 : (bPropagateNoData &&
1673 0 : nCount < static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
1674 0 : (nSrcXOff2 - nSrcXOff)))
1675 : {
1676 2838 : pDstScanline[iDstPixel] = tNoDataValue;
1677 : }
1678 : else
1679 : {
1680 : GDALColorEntry color;
1681 3637 : if (bQuadraticMean)
1682 : {
1683 200 : color.c1 =
1684 200 : static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
1685 200 : color.c2 =
1686 200 : static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
1687 200 : color.c3 =
1688 200 : static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
1689 : }
1690 : else
1691 : {
1692 3437 : color.c1 =
1693 3437 : static_cast<short>((nTotalR + nCount / 2) / nCount);
1694 3437 : color.c2 =
1695 3437 : static_cast<short>((nTotalG + nCount / 2) / nCount);
1696 3437 : color.c3 =
1697 3437 : static_cast<short>((nTotalB + nCount / 2) / nCount);
1698 : }
1699 2995 : pDstScanline[iDstPixel] =
1700 3637 : static_cast<T>(BestColorEntry(colorEntries, color));
1701 : }
1702 : }
1703 : }
1704 : }
1705 :
1706 2287 : CPLFree(pasSrcX);
1707 :
1708 2287 : return CE_None;
1709 : }
1710 :
1711 2287 : static CPLErr GDALResampleChunk_AverageOrRMS(
1712 : double dfXRatioDstToSrc, double dfYRatioDstToSrc, double dfSrcXDelta,
1713 : double dfSrcYDelta, GDALDataType eWrkDataType, const void *pChunk,
1714 : const GByte *pabyChunkNodataMask, int nChunkXOff, int nChunkXSize,
1715 : int nChunkYOff, int nChunkYSize, int nDstXOff, int nDstXOff2, int nDstYOff,
1716 : int nDstYOff2, GDALRasterBand *poOverview, void **ppDstBuffer,
1717 : GDALDataType *peDstBufferDataType, const char *pszResampling,
1718 : bool bHasNoData, double dfNoDataValue, GDALColorTable *poColorTable,
1719 : GDALDataType /* eSrcDataType */, bool bPropagateNoData)
1720 : {
1721 2287 : if (eWrkDataType == GDT_Byte)
1722 : {
1723 2222 : *peDstBufferDataType = eWrkDataType;
1724 2222 : return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_Byte>(
1725 : dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
1726 : static_cast<const GByte *>(pChunk), pabyChunkNodataMask, nChunkXOff,
1727 : nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff, nDstXOff2, nDstYOff,
1728 : nDstYOff2, poOverview, ppDstBuffer, pszResampling, bHasNoData,
1729 2222 : dfNoDataValue, poColorTable, bPropagateNoData);
1730 : }
1731 65 : else if (eWrkDataType == GDT_UInt16)
1732 : {
1733 9 : *peDstBufferDataType = eWrkDataType;
1734 9 : if (EQUAL(pszResampling, "RMS"))
1735 : {
1736 : // Use double as accumulation type, because UInt32 could overflow
1737 : return GDALResampleChunk_AverageOrRMS_T<GUInt16, double,
1738 5 : GDT_UInt16>(
1739 : dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
1740 : static_cast<const GUInt16 *>(pChunk), pabyChunkNodataMask,
1741 : nChunkXOff, nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff,
1742 : nDstXOff2, nDstYOff, nDstYOff2, poOverview, ppDstBuffer,
1743 : pszResampling, bHasNoData, dfNoDataValue, poColorTable,
1744 5 : bPropagateNoData);
1745 : }
1746 : else
1747 : {
1748 : return GDALResampleChunk_AverageOrRMS_T<GUInt16, GUInt32,
1749 4 : GDT_UInt16>(
1750 : dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
1751 : static_cast<const GUInt16 *>(pChunk), pabyChunkNodataMask,
1752 : nChunkXOff, nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff,
1753 : nDstXOff2, nDstYOff, nDstYOff2, poOverview, ppDstBuffer,
1754 : pszResampling, bHasNoData, dfNoDataValue, poColorTable,
1755 4 : bPropagateNoData);
1756 : }
1757 : }
1758 56 : else if (eWrkDataType == GDT_Float32)
1759 : {
1760 49 : *peDstBufferDataType = eWrkDataType;
1761 49 : return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32>(
1762 : dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
1763 : static_cast<const float *>(pChunk), pabyChunkNodataMask, nChunkXOff,
1764 : nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff, nDstXOff2, nDstYOff,
1765 : nDstYOff2, poOverview, ppDstBuffer, pszResampling, bHasNoData,
1766 49 : dfNoDataValue, poColorTable, bPropagateNoData);
1767 : }
1768 7 : else if (eWrkDataType == GDT_Float64)
1769 : {
1770 7 : *peDstBufferDataType = eWrkDataType;
1771 7 : return GDALResampleChunk_AverageOrRMS_T<double, double, GDT_Float64>(
1772 : dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
1773 : static_cast<const double *>(pChunk), pabyChunkNodataMask,
1774 : nChunkXOff, nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff,
1775 : nDstXOff2, nDstYOff, nDstYOff2, poOverview, ppDstBuffer,
1776 : pszResampling, bHasNoData, dfNoDataValue, poColorTable,
1777 7 : bPropagateNoData);
1778 : }
1779 :
1780 0 : CPLAssert(false);
1781 : return CE_Failure;
1782 : }
1783 :
1784 : /************************************************************************/
1785 : /* GDALResampleChunk_Gauss() */
1786 : /************************************************************************/
1787 :
1788 86 : static CPLErr GDALResampleChunk_Gauss(
1789 : double dfXRatioDstToSrc, double dfYRatioDstToSrc, double /* dfSrcXDelta */,
1790 : double /* dfSrcYDelta */, GDALDataType /* eWrkDataType */,
1791 : const void *pChunk, const GByte *pabyChunkNodataMask, int nChunkXOff,
1792 : int nChunkXSize, int nChunkYOff, int nChunkYSize, int nDstXOff,
1793 : int nDstXOff2, int nDstYOff, int nDstYOff2, GDALRasterBand *poOverview,
1794 : void **ppDstBuffer, GDALDataType *peDstBufferDataType,
1795 : const char * /* pszResampling */, bool bHasNoData, double dfNoDataValue,
1796 : GDALColorTable *poColorTable, GDALDataType /* eSrcDataType */,
1797 : bool /* bPropagateNoData */)
1798 :
1799 : {
1800 86 : const double *const padfChunk = static_cast<const double *>(pChunk);
1801 :
1802 86 : *ppDstBuffer =
1803 86 : VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
1804 : GDALGetDataTypeSizeBytes(GDT_Float64));
1805 86 : if (*ppDstBuffer == nullptr)
1806 : {
1807 0 : return CE_Failure;
1808 : }
1809 86 : *peDstBufferDataType = GDT_Float64;
1810 86 : double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
1811 :
1812 : /* -------------------------------------------------------------------- */
1813 : /* Create the filter kernel and allocate scanline buffer. */
1814 : /* -------------------------------------------------------------------- */
1815 86 : int nGaussMatrixDim = 3;
1816 : const int *panGaussMatrix;
1817 86 : constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
1818 86 : constexpr int anGaussMatrix5x5[] = {1, 4, 6, 4, 1, 4, 16, 24, 16,
1819 : 4, 6, 24, 36, 24, 6, 4, 16, 24,
1820 : 16, 4, 1, 4, 6, 4, 1};
1821 86 : constexpr int anGaussMatrix7x7[] = {
1822 : 1, 6, 15, 20, 15, 6, 1, 6, 36, 90, 120, 90, 36,
1823 : 6, 15, 90, 225, 300, 225, 90, 15, 20, 120, 300, 400, 300,
1824 : 120, 20, 15, 90, 225, 300, 225, 90, 15, 6, 36, 90, 120,
1825 : 90, 36, 6, 1, 6, 15, 20, 15, 6, 1};
1826 :
1827 86 : const int nOXSize = poOverview->GetXSize();
1828 86 : const int nOYSize = poOverview->GetYSize();
1829 86 : const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
1830 :
1831 : // matrix for gauss filter
1832 86 : if (nResYFactor <= 2)
1833 : {
1834 85 : panGaussMatrix = anGaussMatrix3x3;
1835 85 : nGaussMatrixDim = 3;
1836 : }
1837 1 : else if (nResYFactor <= 4)
1838 : {
1839 0 : panGaussMatrix = anGaussMatrix5x5;
1840 0 : nGaussMatrixDim = 5;
1841 : }
1842 : else
1843 : {
1844 1 : panGaussMatrix = anGaussMatrix7x7;
1845 1 : nGaussMatrixDim = 7;
1846 : }
1847 :
1848 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
1849 : int *panGaussMatrixDup = static_cast<int *>(
1850 : CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
1851 : memcpy(panGaussMatrixDup, panGaussMatrix,
1852 : sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
1853 : panGaussMatrix = panGaussMatrixDup;
1854 : #endif
1855 :
1856 86 : if (!bHasNoData)
1857 79 : dfNoDataValue = 0.0;
1858 :
1859 86 : std::vector<GDALColorEntry> colorEntries;
1860 86 : int nTransparentIdx = -1;
1861 86 : if (poColorTable)
1862 2 : colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1863 :
1864 : // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1865 : // it as nodata value.
1866 92 : if (bHasNoData && dfNoDataValue >= 0.0f &&
1867 6 : dfNoDataValue < colorEntries.size())
1868 0 : colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
1869 :
1870 : // Or if we have no explicit nodata, but a color table entry that is
1871 : // transparent, consider it as the nodata value.
1872 86 : else if (!bHasNoData && nTransparentIdx >= 0)
1873 : {
1874 0 : dfNoDataValue = nTransparentIdx;
1875 : }
1876 :
1877 86 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
1878 86 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1879 86 : const int nDstXWidth = nDstXOff2 - nDstXOff;
1880 :
1881 : /* ==================================================================== */
1882 : /* Loop over destination scanlines. */
1883 : /* ==================================================================== */
1884 16488 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1885 : {
1886 16402 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
1887 16402 : int nSrcYOff2 =
1888 16402 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
1889 :
1890 16402 : if (nSrcYOff < nChunkYOff)
1891 : {
1892 0 : nSrcYOff = nChunkYOff;
1893 0 : nSrcYOff2++;
1894 : }
1895 :
1896 16402 : const int iSizeY = nSrcYOff2 - nSrcYOff;
1897 16402 : nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
1898 16402 : nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
1899 :
1900 16402 : if (nSrcYOff2 > nChunkBottomYOff ||
1901 16359 : (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
1902 : {
1903 44 : nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
1904 : }
1905 :
1906 16402 : int nYShiftGaussMatrix = 0;
1907 16402 : if (nSrcYOff < nChunkYOff)
1908 : {
1909 0 : nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
1910 0 : nSrcYOff = nChunkYOff;
1911 : }
1912 :
1913 16402 : const double *const padfSrcScanline =
1914 16402 : padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
1915 16402 : const GByte *pabySrcScanlineNodataMask = nullptr;
1916 16402 : if (pabyChunkNodataMask != nullptr)
1917 152 : pabySrcScanlineNodataMask =
1918 152 : pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
1919 :
1920 : /* --------------------------------------------------------------------
1921 : */
1922 : /* Loop over destination pixels */
1923 : /* --------------------------------------------------------------------
1924 : */
1925 16402 : double *const padfDstScanline =
1926 16402 : padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
1927 4149980 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1928 : {
1929 4133580 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
1930 4133580 : int nSrcXOff2 =
1931 4133580 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
1932 :
1933 4133580 : if (nSrcXOff < nChunkXOff)
1934 : {
1935 0 : nSrcXOff = nChunkXOff;
1936 0 : nSrcXOff2++;
1937 : }
1938 :
1939 4133580 : const int iSizeX = nSrcXOff2 - nSrcXOff;
1940 4133580 : nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
1941 4133580 : nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
1942 :
1943 4133580 : if (nSrcXOff2 > nChunkRightXOff ||
1944 4127930 : (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
1945 : {
1946 5650 : nSrcXOff2 =
1947 5650 : std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
1948 : }
1949 :
1950 4133580 : int nXShiftGaussMatrix = 0;
1951 4133580 : if (nSrcXOff < nChunkXOff)
1952 : {
1953 0 : nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
1954 0 : nSrcXOff = nChunkXOff;
1955 : }
1956 :
1957 4133580 : if (poColorTable == nullptr)
1958 : {
1959 4133380 : double dfTotal = 0.0;
1960 4133380 : GInt64 nCount = 0;
1961 4133380 : const int *panLineWeight =
1962 4133380 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
1963 : nXShiftGaussMatrix;
1964 :
1965 16527900 : for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
1966 12394500 : ++iY, ++j, panLineWeight += nGaussMatrixDim)
1967 : {
1968 49561300 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
1969 : {
1970 37166800 : const double val =
1971 37166800 : padfSrcScanline[iX - nChunkXOff +
1972 37166800 : static_cast<GPtrDiff_t>(iY -
1973 37166800 : nSrcYOff) *
1974 37166800 : nChunkXSize];
1975 37166800 : if (pabySrcScanlineNodataMask == nullptr ||
1976 32872 : pabySrcScanlineNodataMask[iX - nChunkXOff +
1977 32872 : static_cast<GPtrDiff_t>(
1978 32872 : iY - nSrcYOff) *
1979 32872 : nChunkXSize])
1980 : {
1981 37146100 : const int nWeight = panLineWeight[i];
1982 37146100 : dfTotal += val * nWeight;
1983 37146100 : nCount += nWeight;
1984 : }
1985 : }
1986 : }
1987 :
1988 4133380 : if (nCount == 0)
1989 : {
1990 2217 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
1991 : }
1992 : else
1993 : {
1994 4131160 : padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
1995 : }
1996 : }
1997 : else
1998 : {
1999 200 : GInt64 nTotalR = 0;
2000 200 : GInt64 nTotalG = 0;
2001 200 : GInt64 nTotalB = 0;
2002 200 : GInt64 nTotalWeight = 0;
2003 200 : const int *panLineWeight =
2004 200 : panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
2005 : nXShiftGaussMatrix;
2006 :
2007 780 : for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
2008 580 : ++iY, ++j, panLineWeight += nGaussMatrixDim)
2009 : {
2010 2262 : for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
2011 : {
2012 1682 : const double val =
2013 1682 : padfSrcScanline[iX - nChunkXOff +
2014 1682 : static_cast<GPtrDiff_t>(iY -
2015 1682 : nSrcYOff) *
2016 1682 : nChunkXSize];
2017 1682 : if (val < 0 || val >= colorEntries.size())
2018 0 : continue;
2019 :
2020 1682 : size_t idx = static_cast<size_t>(val);
2021 1682 : if (colorEntries[idx].c4)
2022 : {
2023 1682 : const int nWeight = panLineWeight[i];
2024 1682 : nTotalR +=
2025 1682 : static_cast<GInt64>(colorEntries[idx].c1) *
2026 1682 : nWeight;
2027 1682 : nTotalG +=
2028 1682 : static_cast<GInt64>(colorEntries[idx].c2) *
2029 1682 : nWeight;
2030 1682 : nTotalB +=
2031 1682 : static_cast<GInt64>(colorEntries[idx].c3) *
2032 1682 : nWeight;
2033 1682 : nTotalWeight += nWeight;
2034 : }
2035 : }
2036 : }
2037 :
2038 200 : if (nTotalWeight == 0)
2039 : {
2040 0 : padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
2041 : }
2042 : else
2043 : {
2044 : GDALColorEntry color;
2045 :
2046 200 : color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
2047 : nTotalWeight);
2048 200 : color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
2049 : nTotalWeight);
2050 200 : color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
2051 : nTotalWeight);
2052 200 : padfDstScanline[iDstPixel - nDstXOff] =
2053 200 : BestColorEntry(colorEntries, color);
2054 : }
2055 : }
2056 : }
2057 : }
2058 :
2059 : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
2060 : CPLFree(panGaussMatrixDup);
2061 : #endif
2062 :
2063 86 : return CE_None;
2064 : }
2065 :
2066 : /************************************************************************/
2067 : /* GDALResampleChunk_Mode() */
2068 : /************************************************************************/
2069 :
2070 : template <class T>
2071 80 : static CPLErr GDALResampleChunk_Mode_T(
2072 : double dfXRatioDstToSrc, double dfYRatioDstToSrc, double dfSrcXDelta,
2073 : double dfSrcYDelta, const T *pChunk, const GByte *pabyChunkNodataMask,
2074 : int nChunkXOff, int nChunkXSize, int nChunkYOff, int nChunkYSize,
2075 : int nDstXOff, int nDstXOff2, int nDstYOff, int nDstYOff2,
2076 : T *const pDstBuffer, bool bHasNoData, double dfNoDataValue,
2077 : GDALColorTable *poColorTable, GDALDataType eSrcDataType)
2078 :
2079 : {
2080 80 : const int nDstXSize = nDstXOff2 - nDstXOff;
2081 :
2082 : T tNoDataValue;
2083 80 : if (!bHasNoData || !GDALIsValueInRange<T>(dfNoDataValue))
2084 79 : tNoDataValue = 0;
2085 : else
2086 1 : tNoDataValue = static_cast<T>(dfNoDataValue);
2087 :
2088 80 : size_t nMaxNumPx = 0;
2089 80 : T *padfVals = nullptr;
2090 80 : int *panSums = nullptr;
2091 :
2092 80 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2093 80 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2094 160 : std::vector<int> anVals(256, 0);
2095 :
2096 : /* ==================================================================== */
2097 : /* Loop over destination scanlines. */
2098 : /* ==================================================================== */
2099 7419 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2100 : {
2101 7339 : double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
2102 7339 : int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
2103 : #ifdef only_pixels_with_more_than_10_pct_participation
2104 : // When oversampling, don't take into account pixels that have a tiny
2105 : // participation in the resulting pixel
2106 : if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2107 : nSrcYOff < nChunkBottomYOff)
2108 : nSrcYOff++;
2109 : #endif
2110 7339 : if (nSrcYOff < nChunkYOff)
2111 0 : nSrcYOff = nChunkYOff;
2112 :
2113 7339 : double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
2114 7339 : int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
2115 : #ifdef only_pixels_with_more_than_10_pct_participation
2116 : // When oversampling, don't take into account pixels that have a tiny
2117 : // participation in the resulting pixel
2118 : if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2119 : nSrcYOff2 > nChunkYOff)
2120 : nSrcYOff2--;
2121 : #endif
2122 7339 : if (nSrcYOff2 == nSrcYOff)
2123 0 : ++nSrcYOff2;
2124 7339 : if (nSrcYOff2 > nChunkBottomYOff)
2125 0 : nSrcYOff2 = nChunkBottomYOff;
2126 :
2127 7339 : const T *const paSrcScanline =
2128 101 : pChunk +
2129 7339 : (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
2130 7339 : const GByte *pabySrcScanlineNodataMask = nullptr;
2131 7339 : if (pabyChunkNodataMask != nullptr)
2132 1810 : pabySrcScanlineNodataMask =
2133 : pabyChunkNodataMask +
2134 1810 : static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
2135 :
2136 7339 : T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
2137 : /* --------------------------------------------------------------------
2138 : */
2139 : /* Loop over destination pixels */
2140 : /* --------------------------------------------------------------------
2141 : */
2142 4259466 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2143 : {
2144 4252125 : double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
2145 : // Apply some epsilon to avoid numerical precision issues
2146 4252125 : int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
2147 : #ifdef only_pixels_with_more_than_10_pct_participation
2148 : // When oversampling, don't take into account pixels that have a
2149 : // tiny participation in the resulting pixel
2150 : if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2151 : nSrcXOff < nChunkRightXOff)
2152 : nSrcXOff++;
2153 : #endif
2154 4252125 : if (nSrcXOff < nChunkXOff)
2155 0 : nSrcXOff = nChunkXOff;
2156 :
2157 4252125 : double dfSrcXOff2 =
2158 4252125 : dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
2159 4252125 : int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
2160 : #ifdef only_pixels_with_more_than_10_pct_participation
2161 : // When oversampling, don't take into account pixels that have a
2162 : // tiny participation in the resulting pixel
2163 : if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2164 : nSrcXOff2 > nChunkXOff)
2165 : nSrcXOff2--;
2166 : #endif
2167 4252125 : if (nSrcXOff2 == nSrcXOff)
2168 0 : nSrcXOff2++;
2169 4252125 : if (nSrcXOff2 > nChunkRightXOff)
2170 0 : nSrcXOff2 = nChunkRightXOff;
2171 :
2172 4252325 : if (eSrcDataType != GDT_Byte ||
2173 200 : (poColorTable && poColorTable->GetColorEntryCount() > 256))
2174 : {
2175 : // Not sure how much sense it makes to run a majority
2176 : // filter on floating point data, but here it is for the sake
2177 : // of compatibility. It won't look right on RGB images by the
2178 : // nature of the filter.
2179 :
2180 775 : if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
2181 2325 : nSrcYOff2 - nSrcYOff > INT_MAX / (nSrcXOff2 - nSrcXOff) ||
2182 775 : static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
2183 775 : static_cast<size_t>(nSrcXOff2 - nSrcXOff) >
2184 775 : std::numeric_limits<size_t>::max() / sizeof(float))
2185 : {
2186 0 : CPLError(CE_Failure, CPLE_NotSupported,
2187 : "Too big downsampling factor");
2188 0 : CPLFree(padfVals);
2189 0 : CPLFree(panSums);
2190 0 : return CE_Failure;
2191 : }
2192 775 : const size_t nNumPx =
2193 775 : static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
2194 775 : static_cast<size_t>(nSrcXOff2 - nSrcXOff);
2195 775 : size_t iMaxInd = 0;
2196 775 : size_t iMaxVal = 0;
2197 775 : bool biMaxValdValid = false;
2198 :
2199 775 : if (padfVals == nullptr || nNumPx > nMaxNumPx)
2200 : {
2201 : T *padfValsNew = static_cast<T *>(
2202 19 : VSI_REALLOC_VERBOSE(padfVals, nNumPx * sizeof(T)));
2203 : int *panSumsNew = static_cast<int *>(
2204 19 : VSI_REALLOC_VERBOSE(panSums, nNumPx * sizeof(int)));
2205 19 : if (padfValsNew != nullptr)
2206 19 : padfVals = padfValsNew;
2207 19 : if (panSumsNew != nullptr)
2208 19 : panSums = panSumsNew;
2209 19 : if (padfValsNew == nullptr || panSumsNew == nullptr)
2210 : {
2211 0 : CPLFree(padfVals);
2212 0 : CPLFree(panSums);
2213 0 : return CE_Failure;
2214 : }
2215 19 : nMaxNumPx = nNumPx;
2216 : }
2217 :
2218 2325 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2219 : {
2220 1550 : const GPtrDiff_t iTotYOff =
2221 1550 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2222 1550 : nChunkXOff;
2223 4650 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2224 : {
2225 3100 : if (pabySrcScanlineNodataMask == nullptr ||
2226 16 : pabySrcScanlineNodataMask[iX + iTotYOff])
2227 : {
2228 3085 : const T dfVal = paSrcScanline[iX + iTotYOff];
2229 3085 : size_t i = 0; // Used after for.
2230 :
2231 : // Check array for existing entry.
2232 7315 : for (; i < iMaxInd; ++i)
2233 5146 : if (padfVals[i] == dfVal &&
2234 670 : ++panSums[i] > panSums[iMaxVal])
2235 : {
2236 246 : iMaxVal = i;
2237 246 : biMaxValdValid = true;
2238 246 : break;
2239 : }
2240 :
2241 : // Add to arr if entry not already there.
2242 3085 : if (i == iMaxInd)
2243 : {
2244 2839 : padfVals[iMaxInd] = dfVal;
2245 2839 : panSums[iMaxInd] = 1;
2246 :
2247 2839 : if (!biMaxValdValid)
2248 : {
2249 772 : iMaxVal = iMaxInd;
2250 772 : biMaxValdValid = true;
2251 : }
2252 :
2253 2839 : ++iMaxInd;
2254 : }
2255 : }
2256 : }
2257 : }
2258 :
2259 775 : if (!biMaxValdValid)
2260 3 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2261 : else
2262 772 : paDstScanline[iDstPixel - nDstXOff] = padfVals[iMaxVal];
2263 : }
2264 : else // if( eSrcDataType == GDT_Byte && nEntryCount < 256 )
2265 : {
2266 : // So we go here for a paletted or non-paletted byte band.
2267 : // The input values are then between 0 and 255.
2268 4251350 : int nMaxVal = 0;
2269 4251350 : int iMaxInd = -1;
2270 :
2271 : // The cost of this zeroing might be high. Perhaps we should
2272 : // just use the above generic case, and go to this one if the
2273 : // number of source pixels is large enough
2274 4251350 : std::fill(anVals.begin(), anVals.end(), 0);
2275 :
2276 12777700 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2277 : {
2278 8526360 : const GPtrDiff_t iTotYOff =
2279 8526360 : static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2280 8526360 : nChunkXOff;
2281 25649300 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2282 : {
2283 17122900 : const T val = paSrcScanline[iX + iTotYOff];
2284 17122900 : if (!bHasNoData || val != tNoDataValue)
2285 : {
2286 17122900 : int nVal = static_cast<int>(val);
2287 17122900 : if (++anVals[nVal] > nMaxVal)
2288 : {
2289 : // Sum the density.
2290 : // Is it the most common value so far?
2291 17006200 : iMaxInd = nVal;
2292 17006200 : nMaxVal = anVals[nVal];
2293 : }
2294 : }
2295 : }
2296 : }
2297 :
2298 4251350 : if (iMaxInd == -1)
2299 0 : paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2300 : else
2301 4251350 : paDstScanline[iDstPixel - nDstXOff] =
2302 : static_cast<T>(iMaxInd);
2303 : }
2304 : }
2305 : }
2306 :
2307 80 : CPLFree(padfVals);
2308 80 : CPLFree(panSums);
2309 :
2310 80 : return CE_None;
2311 : }
2312 :
2313 80 : static CPLErr GDALResampleChunk_Mode(
2314 : double dfXRatioDstToSrc, double dfYRatioDstToSrc, double dfSrcXDelta,
2315 : double dfSrcYDelta, GDALDataType eWrkDataType, const void *pChunk,
2316 : const GByte *pabyChunkNodataMask, int nChunkXOff, int nChunkXSize,
2317 : int nChunkYOff, int nChunkYSize, int nDstXOff, int nDstXOff2, int nDstYOff,
2318 : int nDstYOff2, GDALRasterBand * /*poOverview*/, void **ppDstBuffer,
2319 : GDALDataType *peDstBufferDataType, const char * /* pszResampling */,
2320 : bool bHasNoData, double dfNoDataValue, GDALColorTable *poColorTable,
2321 : GDALDataType eSrcDataType, bool /*bPropagateNoData*/)
2322 : {
2323 80 : *ppDstBuffer =
2324 80 : VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
2325 : GDALGetDataTypeSizeBytes(eWrkDataType));
2326 80 : if (*ppDstBuffer == nullptr)
2327 : {
2328 0 : return CE_Failure;
2329 : }
2330 :
2331 80 : *peDstBufferDataType = eWrkDataType;
2332 80 : if (eWrkDataType == GDT_Byte)
2333 : {
2334 61 : return GDALResampleChunk_Mode_T<GByte>(
2335 : dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
2336 : static_cast<const GByte *>(pChunk), pabyChunkNodataMask, nChunkXOff,
2337 : nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff, nDstXOff2, nDstYOff,
2338 : nDstYOff2, static_cast<GByte *>(*ppDstBuffer), bHasNoData,
2339 61 : dfNoDataValue, poColorTable, eSrcDataType);
2340 : }
2341 19 : else if (eWrkDataType == GDT_UInt16)
2342 : {
2343 1 : return GDALResampleChunk_Mode_T<GUInt16>(
2344 : dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
2345 : static_cast<const GUInt16 *>(pChunk), pabyChunkNodataMask,
2346 : nChunkXOff, nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff,
2347 : nDstXOff2, nDstYOff, nDstYOff2,
2348 : static_cast<GUInt16 *>(*ppDstBuffer), bHasNoData, dfNoDataValue,
2349 1 : poColorTable, eSrcDataType);
2350 : }
2351 18 : else if (eWrkDataType == GDT_Float32)
2352 : {
2353 16 : return GDALResampleChunk_Mode_T<float>(
2354 : dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
2355 : static_cast<const float *>(pChunk), pabyChunkNodataMask, nChunkXOff,
2356 : nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff, nDstXOff2, nDstYOff,
2357 : nDstYOff2, static_cast<float *>(*ppDstBuffer), bHasNoData,
2358 16 : dfNoDataValue, poColorTable, eSrcDataType);
2359 : }
2360 2 : else if (eWrkDataType == GDT_Float64)
2361 : {
2362 2 : return GDALResampleChunk_Mode_T<double>(
2363 : dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
2364 : static_cast<const double *>(pChunk), pabyChunkNodataMask,
2365 : nChunkXOff, nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff,
2366 : nDstXOff2, nDstYOff, nDstYOff2, static_cast<double *>(*ppDstBuffer),
2367 2 : bHasNoData, dfNoDataValue, poColorTable, eSrcDataType);
2368 : }
2369 :
2370 0 : CPLAssert(false);
2371 : return CE_Failure;
2372 : }
2373 :
2374 : /************************************************************************/
2375 : /* GDALResampleConvolutionHorizontal() */
2376 : /************************************************************************/
2377 :
/**
 * Weighted sum of nSrcPixelCount consecutive samples:
 * sum(pChunk[k] * padfWeights[k]) for k in [0, nSrcPixelCount).
 *
 * Samples are consumed four at a time with two independent accumulators
 * (the first for positions 0 and 1 of each group of four, the second for
 * positions 2 and 3); the remaining samples go to the first accumulator.
 */
template <class T>
static inline double
GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                  int nSrcPixelCount)
{
    double dfAccLow = 0.0;
    double dfAccHigh = 0.0;
    int iPixel = 0;  // Also drives the tail loop below.
    // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    // manually (untypical) unrolled loop in -O2 and -O3:
    // https://github.com/OSGeo/gdal/issues/9508
#if !defined(__INTEL_CLANG_COMPILER)
    while (iPixel + 3 < nSrcPixelCount)
    {
        dfAccLow += pChunk[iPixel] * padfWeights[iPixel];
        dfAccLow += pChunk[iPixel + 1] * padfWeights[iPixel + 1];
        dfAccHigh += pChunk[iPixel + 2] * padfWeights[iPixel + 2];
        dfAccHigh += pChunk[iPixel + 3] * padfWeights[iPixel + 3];
        iPixel += 4;
    }
#endif
    // Leftover samples (all of them when the unrolled loop is disabled).
    while (iPixel < nSrcPixelCount)
    {
        dfAccLow += pChunk[iPixel] * padfWeights[iPixel];
        ++iPixel;
    }
    return dfAccLow + dfAccHigh;
}
2404 :
2405 : template <class T>
2406 13 : static inline void GDALResampleConvolutionHorizontalWithMask(
2407 : const T *pChunk, const GByte *pabyMask, const double *padfWeights,
2408 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2409 : {
2410 13 : dfVal = 0;
2411 13 : dfWeightSum = 0;
2412 13 : int i = 0;
2413 13 : for (; i + 3 < nSrcPixelCount; i += 4)
2414 : {
2415 0 : const double dfWeight0 = padfWeights[i] * pabyMask[i];
2416 0 : const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
2417 0 : const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
2418 0 : const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
2419 0 : dfVal += pChunk[i] * dfWeight0;
2420 0 : dfVal += pChunk[i + 1] * dfWeight1;
2421 0 : dfVal += pChunk[i + 2] * dfWeight2;
2422 0 : dfVal += pChunk[i + 3] * dfWeight3;
2423 0 : dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
2424 : }
2425 45 : for (; i < nSrcPixelCount; ++i)
2426 : {
2427 32 : const double dfWeight = padfWeights[i] * pabyMask[i];
2428 32 : dfVal += pChunk[i] * dfWeight;
2429 32 : dfWeightSum += dfWeight;
2430 : }
2431 13 : }
2432 :
/**
 * Weighted sum of nSrcPixelCount consecutive samples, computed for three
 * rows at once with the same weights.
 *
 * dfRes1, dfRes2 and dfRes3 receive sum(row[k] * padfWeights[k]) for
 * pChunkRow1, pChunkRow2 and pChunkRow3 respectively.  Each row uses two
 * accumulators: one for positions 0 and 1 of each group of four samples
 * (and for the leftover samples), one for positions 2 and 3.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontal_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    double dfRow1Low = 0.0;
    double dfRow1High = 0.0;
    double dfRow2Low = 0.0;
    double dfRow2High = 0.0;
    double dfRow3Low = 0.0;
    double dfRow3High = 0.0;
    int iPixel = 0;  // Also drives the tail loop below.
    while (iPixel + 3 < nSrcPixelCount)
    {
        dfRow1Low += pChunkRow1[iPixel] * padfWeights[iPixel];
        dfRow1Low += pChunkRow1[iPixel + 1] * padfWeights[iPixel + 1];
        dfRow1High += pChunkRow1[iPixel + 2] * padfWeights[iPixel + 2];
        dfRow1High += pChunkRow1[iPixel + 3] * padfWeights[iPixel + 3];

        dfRow2Low += pChunkRow2[iPixel] * padfWeights[iPixel];
        dfRow2Low += pChunkRow2[iPixel + 1] * padfWeights[iPixel + 1];
        dfRow2High += pChunkRow2[iPixel + 2] * padfWeights[iPixel + 2];
        dfRow2High += pChunkRow2[iPixel + 3] * padfWeights[iPixel + 3];

        dfRow3Low += pChunkRow3[iPixel] * padfWeights[iPixel];
        dfRow3Low += pChunkRow3[iPixel + 1] * padfWeights[iPixel + 1];
        dfRow3High += pChunkRow3[iPixel + 2] * padfWeights[iPixel + 2];
        dfRow3High += pChunkRow3[iPixel + 3] * padfWeights[iPixel + 3];

        iPixel += 4;
    }
    // Leftover samples.
    while (iPixel < nSrcPixelCount)
    {
        const double dfWeight = padfWeights[iPixel];
        dfRow1Low += pChunkRow1[iPixel] * dfWeight;
        dfRow2Low += pChunkRow2[iPixel] * dfWeight;
        dfRow3Low += pChunkRow3[iPixel] * dfWeight;
        ++iPixel;
    }
    dfRes1 = dfRow1Low + dfRow1High;
    dfRes2 = dfRow2Low + dfRow2High;
    dfRes3 = dfRow3Low + dfRow3High;
}
2471 :
/**
 * Three-row horizontal convolution; in this implementation it simply
 * forwards to GDALResampleConvolutionHorizontal_3rows() with the same
 * arguments.  (The name suggests callers use it when nSrcPixelCount is
 * below 8 - confirm against the call sites.)
 */
template <class T>
static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    // No specialised code path here: delegate to the generic version.
    GDALResampleConvolutionHorizontal_3rows(
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeights, nSrcPixelCount,
        dfRes1, dfRes2, dfRes3);
}
2482 :
/**
 * Three-row horizontal convolution over exactly 4 samples per row.
 *
 * Forwards to GDALResampleConvolutionHorizontal_3rows() with a pixel count
 * of 4; dfRes1, dfRes2 and dfRes3 receive the weighted sums of the first
 * four samples of pChunkRow1, pChunkRow2 and pChunkRow3.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
{
    constexpr int nFixedPixelCount = 4;
    GDALResampleConvolutionHorizontal_3rows(
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeights, nFixedPixelCount,
        dfRes1, dfRes2, dfRes3);
}
2492 :
2493 : /************************************************************************/
2494 : /* GDALResampleConvolutionVertical() */
2495 : /************************************************************************/
2496 :
/**
 * Weighted sum of nSrcLineCount samples taken along a column:
 * sum(pChunk[k * nStride] * padfWeights[k]) for k in [0, nSrcLineCount).
 *
 * Lines are consumed four at a time with two independent accumulators
 * (the first for lines 0 and 1 of each group of four, the second for lines
 * 2 and 3); the remaining lines go to the first accumulator.
 */
template <class T>
static inline double
GDALResampleConvolutionVertical(const T *pChunk, int nStride,
                                const double *padfWeights, int nSrcLineCount)
{
    double dfAccLow = 0.0;
    double dfAccHigh = 0.0;
    int iLine = 0;    // Index into padfWeights.
    int nOffset = 0;  // Index into pChunk, always iLine * nStride.
    while (iLine + 3 < nSrcLineCount)
    {
        dfAccLow += pChunk[nOffset] * padfWeights[iLine];
        dfAccLow += pChunk[nOffset + nStride] * padfWeights[iLine + 1];
        dfAccHigh += pChunk[nOffset + 2 * nStride] * padfWeights[iLine + 2];
        dfAccHigh += pChunk[nOffset + 3 * nStride] * padfWeights[iLine + 3];
        iLine += 4;
        nOffset += 4 * nStride;
    }
    // Leftover lines.
    while (iLine < nSrcLineCount)
    {
        dfAccLow += pChunk[nOffset] * padfWeights[iLine];
        ++iLine;
        nOffset += nStride;
    }
    return dfAccLow + dfAccHigh;
}
2519 :
// Weighted sum of two adjacent columns of samples in a single pass.
//
// For each of the nSrcLineCount lines (nStride elements apart), the sample
// at the line start contributes to dfRes1 and the sample right after it
// contributes to dfRes2, both multiplied by the weight of that line. Each
// column uses two partial sums (lines 0-1 and trailing lines / lines 2-3 of
// every group of four) that are combined when writing the outputs.
template <class T>
static inline void GDALResampleConvolutionVertical_2cols(
    const T *pChunk, int nStride, const double *padfWeights, int nSrcLineCount,
    double &dfRes1, double &dfRes2)
{
    double dfCol0SumA = 0.0;
    double dfCol0SumB = 0.0;
    double dfCol1SumA = 0.0;
    double dfCol1SumB = 0.0;
    int iLine = 0;
    int nOffset = 0;  // Index in pChunk of the first column of line iLine.

    // Main loop: consume four lines per iteration.
    while (iLine + 3 < nSrcLineCount)
    {
        const double dfW0 = padfWeights[iLine];
        const double dfW1 = padfWeights[iLine + 1];
        const double dfW2 = padfWeights[iLine + 2];
        const double dfW3 = padfWeights[iLine + 3];

        dfCol0SumA += pChunk[nOffset] * dfW0;
        dfCol1SumA += pChunk[nOffset + 1] * dfW0;
        dfCol0SumA += pChunk[nOffset + nStride] * dfW1;
        dfCol1SumA += pChunk[nOffset + 1 + nStride] * dfW1;
        dfCol0SumB += pChunk[nOffset + 2 * nStride] * dfW2;
        dfCol1SumB += pChunk[nOffset + 1 + 2 * nStride] * dfW2;
        dfCol0SumB += pChunk[nOffset + 3 * nStride] * dfW3;
        dfCol1SumB += pChunk[nOffset + 1 + 3 * nStride] * dfW3;

        iLine += 4;
        nOffset += 4 * nStride;
    }

    // Remaining 0 to 3 lines.
    while (iLine < nSrcLineCount)
    {
        const double dfW = padfWeights[iLine];
        dfCol0SumA += pChunk[nOffset] * dfW;
        dfCol1SumA += pChunk[nOffset + 1] * dfW;
        ++iLine;
        nOffset += nStride;
    }

    dfRes1 = dfCol0SumA + dfCol0SumB;
    dfRes2 = dfCol1SumA + dfCol1SumB;
}
2550 :
2551 : #ifdef USE_SSE2
2552 :
2553 : #ifdef __AVX__
2554 : /************************************************************************/
2555 : /* GDALResampleConvolutionVertical_16cols<T> */
2556 : /************************************************************************/
2557 :
2558 : template <class T>
2559 : static inline void
2560 : GDALResampleConvolutionVertical_16cols(const T *pChunk, int nStride,
2561 : const double *padfWeights,
2562 : int nSrcLineCount, float *afDest)
2563 : {
2564 : int i = 0;
2565 : int j = 0;
2566 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2567 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2568 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2569 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2570 : for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
2571 : {
2572 : XMMReg4Double w0 =
2573 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2574 : XMMReg4Double w1 =
2575 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2576 : XMMReg4Double w2 =
2577 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2578 : XMMReg4Double w3 =
2579 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2580 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2581 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2582 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
2583 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
2584 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2585 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2586 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
2587 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
2588 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2589 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2590 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
2591 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
2592 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2593 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2594 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
2595 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
2596 : }
2597 : for (; i < nSrcLineCount; ++i, j += nStride)
2598 : {
2599 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2600 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2601 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2602 : v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
2603 : v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
2604 : }
2605 : v_acc0.Store4Val(afDest);
2606 : v_acc1.Store4Val(afDest + 4);
2607 : v_acc2.Store4Val(afDest + 8);
2608 : v_acc3.Store4Val(afDest + 12);
2609 : }
2610 :
2611 : template <class T>
2612 : static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
2613 : const double *, int,
2614 : double *)
2615 : {
2616 : // Cannot be reached
2617 : CPLAssert(false);
2618 : }
2619 :
2620 : #else
2621 :
2622 : /************************************************************************/
2623 : /* GDALResampleConvolutionVertical_8cols<T> */
2624 : /************************************************************************/
2625 :
2626 : template <class T>
2627 : static inline void
2628 18601600 : GDALResampleConvolutionVertical_8cols(const T *pChunk, int nStride,
2629 : const double *padfWeights,
2630 : int nSrcLineCount, float *afDest)
2631 : {
2632 18601600 : int i = 0;
2633 18601600 : int j = 0;
2634 18601600 : XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2635 18467700 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2636 33692600 : for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
2637 : {
2638 15168300 : XMMReg4Double w0 =
2639 15168300 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2640 15114600 : XMMReg4Double w1 =
2641 15114600 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2642 15106700 : XMMReg4Double w2 =
2643 15106700 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2644 15116900 : XMMReg4Double w3 =
2645 15116900 : XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2646 15142900 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2647 15084600 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2648 15084400 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2649 15115300 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2650 15113000 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2651 15113800 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2652 15123000 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2653 15116500 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2654 : }
2655 29914900 : for (; i < nSrcLineCount; ++i, j += nStride)
2656 : {
2657 11390600 : XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2658 11390600 : v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2659 11390600 : v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2660 : }
2661 18524300 : v_acc0.Store4Val(afDest);
2662 18545100 : v_acc1.Store4Val(afDest + 4);
2663 18579000 : }
2664 :
2665 : template <class T>
2666 : static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
2667 : const double *, int,
2668 : double *)
2669 : {
2670 : // Cannot be reached
2671 : CPLAssert(false);
2672 : }
2673 :
2674 : #endif // __AVX__
2675 :
2676 : /************************************************************************/
2677 : /* GDALResampleConvolutionHorizontalSSE2<T> */
2678 : /************************************************************************/
2679 :
2680 : template <class T>
2681 2736394 : static inline double GDALResampleConvolutionHorizontalSSE2(
2682 : const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2683 : {
2684 2736394 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2685 2735919 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2686 2736316 : int i = 0; // Used after for.
2687 2811917 : for (; i + 7 < nSrcPixelCount; i += 8)
2688 : {
2689 : // Retrieve the pixel & accumulate
2690 75571 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
2691 75571 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
2692 75571 : const XMMReg4Double v_weight1 =
2693 75571 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2694 75571 : const XMMReg4Double v_weight2 =
2695 75571 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
2696 :
2697 75571 : v_acc1 += v_pixels1 * v_weight1;
2698 75571 : v_acc2 += v_pixels2 * v_weight2;
2699 : }
2700 :
2701 2736345 : v_acc1 += v_acc2;
2702 :
2703 2736020 : double dfVal = v_acc1.GetHorizSum();
2704 9501510 : for (; i < nSrcPixelCount; ++i)
2705 : {
2706 6765520 : dfVal += pChunk[i] * padfWeightsAligned[i];
2707 : }
2708 2735983 : return dfVal;
2709 : }
2710 :
2711 : /************************************************************************/
2712 : /* GDALResampleConvolutionHorizontal<GByte> */
2713 : /************************************************************************/
2714 :
2715 : template <>
2716 2188440 : inline double GDALResampleConvolutionHorizontal<GByte>(
2717 : const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2718 : {
2719 2188440 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2720 2188480 : nSrcPixelCount);
2721 : }
2722 :
2723 : template <>
2724 548283 : inline double GDALResampleConvolutionHorizontal<GUInt16>(
2725 : const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2726 : {
2727 548283 : return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2728 548492 : nSrcPixelCount);
2729 : }
2730 :
2731 : /************************************************************************/
2732 : /* GDALResampleConvolutionHorizontalWithMaskSSE2<T> */
2733 : /************************************************************************/
2734 :
2735 : template <class T>
2736 4582263 : static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
2737 : const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
2738 : int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2739 : {
2740 4582263 : int i = 0; // Used after for.
2741 4582263 : XMMReg4Double v_acc = XMMReg4Double::Zero();
2742 4582263 : XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
2743 11403121 : for (; i + 3 < nSrcPixelCount; i += 4)
2744 : {
2745 6820848 : const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
2746 6820848 : const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
2747 6820848 : XMMReg4Double v_weight =
2748 6820848 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2749 6820848 : v_weight *= v_mask;
2750 6820848 : v_acc += v_pixels * v_weight;
2751 6820848 : v_acc_weight += v_weight;
2752 : }
2753 :
2754 4582263 : dfVal = v_acc.GetHorizSum();
2755 4582263 : dfWeightSum = v_acc_weight.GetHorizSum();
2756 4780123 : for (; i < nSrcPixelCount; ++i)
2757 : {
2758 197860 : const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
2759 197860 : dfVal += pChunk[i] * dfWeight;
2760 197860 : dfWeightSum += dfWeight;
2761 : }
2762 4582263 : }
2763 :
2764 : /************************************************************************/
2765 : /* GDALResampleConvolutionHorizontalWithMask<GByte> */
2766 : /************************************************************************/
2767 :
2768 : template <>
2769 4582200 : inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
2770 : const GByte *pChunk, const GByte *pabyMask,
2771 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2772 : double &dfWeightSum)
2773 : {
2774 4582200 : GDALResampleConvolutionHorizontalWithMaskSSE2(
2775 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2776 : dfWeightSum);
2777 4582200 : }
2778 :
2779 : template <>
2780 63 : inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
2781 : const GUInt16 *pChunk, const GByte *pabyMask,
2782 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2783 : double &dfWeightSum)
2784 : {
2785 63 : GDALResampleConvolutionHorizontalWithMaskSSE2(
2786 : pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2787 : dfWeightSum);
2788 63 : }
2789 :
2790 : /************************************************************************/
2791 : /* GDALResampleConvolutionHorizontal_3rows_SSE2<T> */
2792 : /************************************************************************/
2793 :
2794 : template <class T>
2795 10023630 : static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
2796 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2797 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2798 : double &dfRes2, double &dfRes3)
2799 : {
2800 10023630 : XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
2801 10023630 : v_acc2 = XMMReg4Double::Zero(),
2802 10023630 : v_acc3 = XMMReg4Double::Zero();
2803 10023630 : int i = 0;
2804 19989466 : for (; i + 7 < nSrcPixelCount; i += 8)
2805 : {
2806 : // Retrieve the pixel & accumulate.
2807 9965826 : XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
2808 9965826 : XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
2809 9965826 : const XMMReg4Double v_weight1 =
2810 9965826 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2811 9965826 : const XMMReg4Double v_weight2 =
2812 9965826 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
2813 :
2814 9965826 : v_acc1 += v_pixels1 * v_weight1;
2815 9965826 : v_acc1 += v_pixels2 * v_weight2;
2816 :
2817 9965826 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
2818 9965826 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
2819 9965826 : v_acc2 += v_pixels1 * v_weight1;
2820 9965826 : v_acc2 += v_pixels2 * v_weight2;
2821 :
2822 9965826 : v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
2823 9965826 : v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
2824 9965826 : v_acc3 += v_pixels1 * v_weight1;
2825 9965826 : v_acc3 += v_pixels2 * v_weight2;
2826 : }
2827 :
2828 10023630 : dfRes1 = v_acc1.GetHorizSum();
2829 10023630 : dfRes2 = v_acc2.GetHorizSum();
2830 10023630 : dfRes3 = v_acc3.GetHorizSum();
2831 21487226 : for (; i < nSrcPixelCount; ++i)
2832 : {
2833 11463596 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
2834 11463596 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
2835 11463596 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
2836 : }
2837 10023630 : }
2838 :
2839 : /************************************************************************/
2840 : /* GDALResampleConvolutionHorizontal_3rows<GByte> */
2841 : /************************************************************************/
2842 :
2843 : template <>
2844 10023600 : inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
2845 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
2846 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2847 : double &dfRes2, double &dfRes3)
2848 : {
2849 10023600 : GDALResampleConvolutionHorizontal_3rows_SSE2(
2850 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
2851 : dfRes1, dfRes2, dfRes3);
2852 10023600 : }
2853 :
2854 : template <>
2855 30 : inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
2856 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
2857 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
2858 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
2859 : {
2860 30 : GDALResampleConvolutionHorizontal_3rows_SSE2(
2861 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
2862 : dfRes1, dfRes2, dfRes3);
2863 30 : }
2864 :
2865 : /************************************************************************/
2866 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T> */
2867 : /************************************************************************/
2868 :
2869 : template <class T>
2870 2173103 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
2871 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2872 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2873 : double &dfRes2, double &dfRes3)
2874 : {
2875 2173103 : XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2876 2172788 : XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2877 2172943 : XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2878 2172989 : int i = 0; // Use after for.
2879 2176255 : for (; i + 3 < nSrcPixelCount; i += 4)
2880 : {
2881 : // Retrieve the pixel & accumulate.
2882 3236 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
2883 3236 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
2884 3236 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
2885 3236 : const XMMReg4Double v_weight =
2886 3236 : XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2887 :
2888 3236 : v_acc1 += v_pixels1 * v_weight;
2889 3236 : v_acc2 += v_pixels2 * v_weight;
2890 3236 : v_acc3 += v_pixels3 * v_weight;
2891 : }
2892 :
2893 2173025 : dfRes1 = v_acc1.GetHorizSum();
2894 2172783 : dfRes2 = v_acc2.GetHorizSum();
2895 2172831 : dfRes3 = v_acc3.GetHorizSum();
2896 :
2897 6493540 : for (; i < nSrcPixelCount; ++i)
2898 : {
2899 4320664 : dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
2900 4320664 : dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
2901 4320664 : dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
2902 : }
2903 2172886 : }
2904 :
2905 : /************************************************************************/
2906 : /* GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte> */
2907 : /************************************************************************/
2908 :
2909 : template <>
2910 2106280 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
2911 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
2912 : const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2913 : double &dfRes2, double &dfRes3)
2914 : {
2915 2106280 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
2916 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
2917 : dfRes1, dfRes2, dfRes3);
2918 2106310 : }
2919 :
2920 : template <>
2921 66660 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
2922 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
2923 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
2924 : int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
2925 : {
2926 66660 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
2927 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
2928 : dfRes1, dfRes2, dfRes3);
2929 66849 : }
2930 :
2931 : /************************************************************************/
2932 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T> */
2933 : /************************************************************************/
2934 :
2935 : template <class T>
2936 12211540 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
2937 : const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2938 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
2939 : double &dfRes3)
2940 : {
2941 12211540 : const XMMReg4Double v_weight =
2942 : XMMReg4Double::Load4ValAligned(padfWeightsAligned);
2943 :
2944 : // Retrieve the pixel & accumulate.
2945 12129440 : const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
2946 12138510 : const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
2947 12190660 : const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
2948 :
2949 12223880 : XMMReg4Double v_acc1 = v_pixels1 * v_weight;
2950 12098420 : XMMReg4Double v_acc2 = v_pixels2 * v_weight;
2951 12139450 : XMMReg4Double v_acc3 = v_pixels3 * v_weight;
2952 :
2953 12147690 : dfRes1 = v_acc1.GetHorizSum();
2954 12158770 : dfRes2 = v_acc2.GetHorizSum();
2955 12184480 : dfRes3 = v_acc3.GetHorizSum();
2956 12177780 : }
2957 :
2958 : /************************************************************************/
2959 : /* GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte> */
2960 : /************************************************************************/
2961 :
2962 : template <>
2963 6620250 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
2964 : const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
2965 : const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
2966 : double &dfRes3)
2967 : {
2968 6620250 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
2969 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
2970 : dfRes3);
2971 6614630 : }
2972 :
2973 : template <>
2974 5585650 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
2975 : const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
2976 : const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
2977 : double &dfRes2, double &dfRes3)
2978 : {
2979 5585650 : GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
2980 : pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
2981 : dfRes3);
2982 5574190 : }
2983 :
2984 : #endif // USE_SSE2
2985 :
2986 : /************************************************************************/
2987 : /* GDALResampleChunk_Convolution() */
2988 : /************************************************************************/
2989 :
2990 : template <class T, class Twork, GDALDataType eWrkDataType>
2991 3593 : static CPLErr GDALResampleChunk_ConvolutionT(
2992 : double dfXRatioDstToSrc, double dfYRatioDstToSrc, double dfSrcXDelta,
2993 : double dfSrcYDelta, const T *pChunk, int nBands,
2994 : const GByte *pabyChunkNodataMask, int nChunkXOff, int nChunkXSize,
2995 : int nChunkYOff, int nChunkYSize, int nDstXOff, int nDstXOff2, int nDstYOff,
2996 : int nDstYOff2, GDALRasterBand *poDstBand, void *pDstBuffer, bool bHasNoData,
2997 : double dfNoDataValue, FilterFuncType pfnFilterFunc,
2998 : FilterFunc4ValuesType pfnFilterFunc4Values, int nKernelRadius,
2999 : bool bKernelWithNegativeWeights, float fMaxVal)
3000 :
3001 : {
3002 3593 : if (!bHasNoData)
3003 3559 : dfNoDataValue = 0.0;
3004 3593 : const auto dstDataType = poDstBand->GetRasterDataType();
3005 3591 : const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
3006 3596 : const double dfReplacementVal =
3007 39 : bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
3008 : : dfNoDataValue;
3009 : // cppcheck-suppress unreadVariable
3010 3596 : const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
3011 3595 : const auto nNodataValueInt64 = static_cast<GInt64>(dfNoDataValue);
3012 :
3013 : // TODO: we should have some generic function to do this.
3014 3595 : Twork fDstMin = -std::numeric_limits<Twork>::max();
3015 3595 : Twork fDstMax = std::numeric_limits<Twork>::max();
3016 3595 : if (dstDataType == GDT_Byte)
3017 : {
3018 2886 : fDstMin = std::numeric_limits<GByte>::min();
3019 2884 : fDstMax = std::numeric_limits<GByte>::max();
3020 : }
3021 709 : else if (dstDataType == GDT_Int8)
3022 : {
3023 0 : fDstMin = std::numeric_limits<GInt8>::min();
3024 0 : fDstMax = std::numeric_limits<GInt8>::max();
3025 : }
3026 709 : else if (dstDataType == GDT_UInt16)
3027 : {
3028 386 : fDstMin = std::numeric_limits<GUInt16>::min();
3029 385 : fDstMax = std::numeric_limits<GUInt16>::max();
3030 : }
3031 323 : else if (dstDataType == GDT_Int16)
3032 : {
3033 278 : fDstMin = std::numeric_limits<GInt16>::min();
3034 278 : fDstMax = std::numeric_limits<GInt16>::max();
3035 : }
3036 45 : else if (dstDataType == GDT_UInt32)
3037 : {
3038 1 : fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
3039 1 : fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
3040 : }
3041 44 : else if (dstDataType == GDT_Int32)
3042 : {
3043 : // cppcheck-suppress unreadVariable
3044 1 : fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
3045 : // cppcheck-suppress unreadVariable
3046 1 : fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
3047 : }
3048 :
3049 26993710 : auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
3050 : nNodataValueInt64, dfNoDataValue,
3051 : dfReplacementVal](Twork fVal)
3052 : {
3053 14083600 : if (!bHasNoData)
3054 10857300 : return fVal;
3055 :
3056 : // Clamp value before comparing to nodata: this is only needed for
3057 : // kernels with negative weights (Lanczos)
3058 3226360 : Twork fClamped = fVal;
3059 3226360 : if (fClamped < fDstMin)
3060 12874 : fClamped = fDstMin;
3061 3213490 : else if (fClamped > fDstMax)
3062 12852 : fClamped = fDstMax;
3063 3226360 : if (isIntegerDT)
3064 : {
3065 3226360 : if (nNodataValueInt64 == static_cast<GInt64>(std::round(fClamped)))
3066 : {
3067 : // Do not use the nodata value
3068 13869 : return static_cast<Twork>(dfReplacementVal);
3069 : }
3070 : }
3071 4 : else if (dfNoDataValue == fClamped)
3072 : {
3073 : // Do not use the nodata value
3074 1 : return static_cast<Twork>(dfReplacementVal);
3075 : }
3076 3212490 : return fClamped;
3077 : };
3078 :
3079 : /* -------------------------------------------------------------------- */
3080 : /* Allocate work buffers. */
3081 : /* -------------------------------------------------------------------- */
3082 3595 : const int nDstXSize = nDstXOff2 - nDstXOff;
3083 3595 : Twork *pafWrkScanline = nullptr;
3084 3595 : if (dstDataType != eWrkDataType)
3085 : {
3086 : pafWrkScanline =
3087 3558 : static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
3088 3563 : if (pafWrkScanline == nullptr)
3089 0 : return CE_Failure;
3090 : }
3091 :
3092 3600 : const double dfXScale = 1.0 / dfXRatioDstToSrc;
3093 3600 : const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
3094 3600 : const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
3095 3600 : const double dfYScale = 1.0 / dfYRatioDstToSrc;
3096 3600 : const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
3097 3600 : const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
3098 :
3099 : // Temporary array to store result of horizontal filter.
3100 : double *padfHorizontalFiltered = static_cast<double *>(
3101 3600 : VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
3102 :
3103 : // To store convolution coefficients.
3104 3601 : double *padfWeights = static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
3105 : static_cast<int>(2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) +
3106 : 0.5) *
3107 : sizeof(double)));
3108 :
3109 3594 : GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
3110 3594 : if (pabyChunkNodataMask)
3111 : pabyChunkNodataMaskHorizontalFiltered =
3112 342 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
3113 3594 : if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
3114 342 : (pabyChunkNodataMask != nullptr &&
3115 : pabyChunkNodataMaskHorizontalFiltered == nullptr))
3116 : {
3117 3 : VSIFree(pafWrkScanline);
3118 0 : VSIFree(padfHorizontalFiltered);
3119 0 : VSIFreeAligned(padfWeights);
3120 0 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3121 0 : return CE_Failure;
3122 : }
3123 :
3124 : /* ==================================================================== */
3125 : /* First pass: horizontal filter */
3126 : /* ==================================================================== */
3127 3591 : const int nChunkRightXOff = nChunkXOff + nChunkXSize;
3128 : #ifdef USE_SSE2
3129 3591 : bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
3130 : #endif
3131 2711634 : for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
3132 : {
3133 2708036 : const double dfSrcPixel =
3134 2708036 : (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
3135 2708036 : int nSrcPixelStart =
3136 2708036 : static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
3137 2708036 : if (nSrcPixelStart < nChunkXOff)
3138 54479 : nSrcPixelStart = nChunkXOff;
3139 2708036 : int nSrcPixelStop =
3140 2708036 : static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
3141 2708036 : if (nSrcPixelStop > nChunkRightXOff)
3142 54492 : nSrcPixelStop = nChunkRightXOff;
3143 : #if 0
3144 : if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
3145 : {
3146 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3147 : }
3148 : if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
3149 : {
3150 : printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3151 : }
3152 : #endif
3153 2708036 : const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
3154 2708036 : double dfWeightSum = 0.0;
3155 :
3156 : // Compute convolution coefficients.
3157 2708036 : int nSrcPixel = nSrcPixelStart;
3158 2708036 : double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
3159 3519436 : for (; nSrcPixel + 3 < nSrcPixelStop; nSrcPixel += 4)
3160 : {
3161 811506 : padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
3162 811506 : dfX += dfXScaleWeight;
3163 811506 : padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
3164 811506 : dfX += dfXScaleWeight;
3165 811506 : padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
3166 811506 : dfX += dfXScaleWeight;
3167 811506 : padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
3168 811506 : dfX += dfXScaleWeight;
3169 811408 : dfWeightSum +=
3170 811506 : pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
3171 : }
3172 6687365 : for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
3173 : {
3174 3979751 : const double dfWeight = pfnFilterFunc(dfX);
3175 3979434 : padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3176 3979434 : dfWeightSum += dfWeight;
3177 : }
3178 :
3179 2707614 : const int nHeight = nChunkYSize * nBands;
3180 2707614 : if (pabyChunkNodataMask == nullptr)
3181 : {
3182 2647397 : if (dfWeightSum != 0)
3183 : {
3184 2647304 : const double dfInvWeightSum = 1.0 / dfWeightSum;
3185 9449510 : for (int i = 0; i < nSrcPixelCount; ++i)
3186 6802203 : padfWeights[i] *= dfInvWeightSum;
3187 : }
3188 2647397 : int iSrcLineOff = 0;
3189 : #ifdef USE_SSE2
3190 2647397 : if (nSrcPixelCount == 4)
3191 : {
3192 13963046 : for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3193 : {
3194 13399166 : const GPtrDiff_t j =
3195 13399166 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3196 13399166 : (nSrcPixelStart - nChunkXOff);
3197 13399166 : double dfVal1 = 0.0;
3198 13399166 : double dfVal2 = 0.0;
3199 13399166 : double dfVal3 = 0.0;
3200 13399166 : GDALResampleConvolutionHorizontalPixelCount4_3rows(
3201 13399166 : pChunk + j, pChunk + j + nChunkXSize,
3202 13399166 : pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
3203 : dfVal2, dfVal3);
3204 13427716 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3205 13427716 : nDstXSize +
3206 13427716 : iDstPixel - nDstXOff] = dfVal1;
3207 13427716 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3208 13427716 : 1) *
3209 13427716 : nDstXSize +
3210 13427716 : iDstPixel - nDstXOff] = dfVal2;
3211 13427716 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3212 13427716 : 2) *
3213 13427716 : nDstXSize +
3214 13427716 : iDstPixel - nDstXOff] = dfVal3;
3215 : }
3216 : }
3217 2112058 : else if (bSrcPixelCountLess8)
3218 : {
3219 4225011 : for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3220 : {
3221 2190968 : const GPtrDiff_t j =
3222 2190968 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3223 2190968 : (nSrcPixelStart - nChunkXOff);
3224 2190968 : double dfVal1 = 0.0;
3225 2190968 : double dfVal2 = 0.0;
3226 2190968 : double dfVal3 = 0.0;
3227 2190968 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
3228 2190968 : pChunk + j, pChunk + j + nChunkXSize,
3229 2190968 : pChunk + j + 2 * nChunkXSize, padfWeights,
3230 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3231 2191327 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3232 2191327 : nDstXSize +
3233 2191327 : iDstPixel - nDstXOff] = dfVal1;
3234 2191327 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3235 2191327 : 1) *
3236 2191327 : nDstXSize +
3237 2191327 : iDstPixel - nDstXOff] = dfVal2;
3238 2191327 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3239 2191327 : 2) *
3240 2191327 : nDstXSize +
3241 2191327 : iDstPixel - nDstXOff] = dfVal3;
3242 : }
3243 : }
3244 : else
3245 : #endif
3246 : {
3247 10166834 : for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3248 : {
3249 10088430 : const GPtrDiff_t j =
3250 10088430 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3251 10088430 : (nSrcPixelStart - nChunkXOff);
3252 10088430 : double dfVal1 = 0.0;
3253 10088430 : double dfVal2 = 0.0;
3254 10088430 : double dfVal3 = 0.0;
3255 10088430 : GDALResampleConvolutionHorizontal_3rows(
3256 10088430 : pChunk + j, pChunk + j + nChunkXSize,
3257 10088430 : pChunk + j + 2 * nChunkXSize, padfWeights,
3258 : nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3259 10088430 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3260 10088430 : nDstXSize +
3261 10088430 : iDstPixel - nDstXOff] = dfVal1;
3262 10088430 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3263 10088430 : 1) *
3264 10088430 : nDstXSize +
3265 10088430 : iDstPixel - nDstXOff] = dfVal2;
3266 10088430 : padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3267 10088430 : 2) *
3268 10088430 : nDstXSize +
3269 10088430 : iDstPixel - nDstXOff] = dfVal3;
3270 : }
3271 : }
3272 5457904 : for (; iSrcLineOff < nHeight; ++iSrcLineOff)
3273 : {
3274 2781344 : const GPtrDiff_t j =
3275 2781344 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3276 2781344 : (nSrcPixelStart - nChunkXOff);
3277 5518302 : const double dfVal = GDALResampleConvolutionHorizontal(
3278 2781344 : pChunk + j, padfWeights, nSrcPixelCount);
3279 2781596 : padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3280 2781596 : nDstXSize +
3281 2781596 : iDstPixel - nDstXOff] = dfVal;
3282 : }
3283 : }
3284 : else
3285 : {
3286 15629548 : for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
3287 : {
3288 15571750 : const GPtrDiff_t j =
3289 15571750 : static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3290 15571750 : (nSrcPixelStart - nChunkXOff);
3291 :
3292 15571750 : if (bKernelWithNegativeWeights)
3293 : {
3294 15091144 : int nConsecutiveValid = 0;
3295 15091144 : int nMaxConsecutiveValid = 0;
3296 130248786 : for (int k = 0; k < nSrcPixelCount; k++)
3297 : {
3298 115157642 : if (pabyChunkNodataMask[j + k])
3299 25502104 : nConsecutiveValid++;
3300 89655138 : else if (nConsecutiveValid)
3301 : {
3302 39842 : nMaxConsecutiveValid = std::max(
3303 39842 : nMaxConsecutiveValid, nConsecutiveValid);
3304 39842 : nConsecutiveValid = 0;
3305 : }
3306 : }
3307 15091144 : nMaxConsecutiveValid =
3308 15091144 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
3309 15091144 : if (nMaxConsecutiveValid < nSrcPixelCount / 2)
3310 : {
3311 10989474 : const size_t nTempOffset =
3312 10989474 : static_cast<size_t>(iSrcLineOff) * nDstXSize +
3313 10989474 : iDstPixel - nDstXOff;
3314 10989474 : padfHorizontalFiltered[nTempOffset] = 0.0;
3315 10989474 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3316 10989474 : continue;
3317 : }
3318 : }
3319 :
3320 4582276 : double dfVal = 0.0;
3321 4582276 : GDALResampleConvolutionHorizontalWithMask(
3322 4582276 : pChunk + j, pabyChunkNodataMask + j, padfWeights,
3323 : nSrcPixelCount, dfVal, dfWeightSum);
3324 4579813 : const size_t nTempOffset =
3325 4579813 : static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
3326 4579813 : nDstXOff;
3327 4579813 : if (dfWeightSum > 0.0)
3328 : {
3329 4538924 : padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
3330 4538924 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
3331 : }
3332 : else
3333 : {
3334 40950 : padfHorizontalFiltered[nTempOffset] = 0.0;
3335 40950 : pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3336 : }
3337 : }
3338 : }
3339 : }
3340 :
3341 : /* ==================================================================== */
3342 : /* Second pass: vertical filter */
3343 : /* ==================================================================== */
3344 3606 : const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3345 :
3346 192485 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3347 : {
3348 188879 : Twork *const pafDstScanline =
3349 188879 : pafWrkScanline ? pafWrkScanline
3350 8414 : : static_cast<Twork *>(pDstBuffer) +
3351 8414 : (iDstLine - nDstYOff) * nDstXSize;
3352 :
3353 188879 : const double dfSrcLine =
3354 188879 : (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
3355 188879 : int nSrcLineStart =
3356 188879 : static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
3357 188879 : int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
3358 188879 : if (nSrcLineStart < nChunkYOff)
3359 1719 : nSrcLineStart = nChunkYOff;
3360 188879 : if (nSrcLineStop > nChunkBottomYOff)
3361 1744 : nSrcLineStop = nChunkBottomYOff;
3362 : #if 0
3363 : if( nSrcLineStart < nChunkYOff &&
3364 : nChunkYOff > 0 )
3365 : {
3366 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3367 : }
3368 : if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3369 : {
3370 : printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3371 : }
3372 : #endif
3373 188879 : const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
3374 188879 : double dfWeightSum = 0.0;
3375 :
3376 : // Compute convolution coefficients.
3377 188879 : int nSrcLine = nSrcLineStart; // Used after for.
3378 188879 : double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
3379 414441 : for (; nSrcLine + 3 < nSrcLineStop;
3380 225562 : nSrcLine += 4, dfY += 4 * dfYScaleWeight)
3381 : {
3382 225566 : padfWeights[nSrcLine - nSrcLineStart] = dfY;
3383 225566 : padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
3384 225566 : padfWeights[nSrcLine + 2 - nSrcLineStart] =
3385 225566 : dfY + 2 * dfYScaleWeight;
3386 225566 : padfWeights[nSrcLine + 3 - nSrcLineStart] =
3387 225566 : dfY + 3 * dfYScaleWeight;
3388 225562 : dfWeightSum +=
3389 225566 : pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
3390 : }
3391 220347 : for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
3392 : {
3393 31475 : const double dfWeight = pfnFilterFunc(dfY);
3394 31472 : padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
3395 31472 : dfWeightSum += dfWeight;
3396 : }
3397 :
3398 188872 : if (pabyChunkNodataMask == nullptr)
3399 : {
3400 158693 : if (dfWeightSum != 0)
3401 : {
3402 158691 : const double dfInvWeightSum = 1.0 / dfWeightSum;
3403 897624 : for (int i = 0; i < nSrcLineCount; ++i)
3404 738933 : padfWeights[i] *= dfInvWeightSum;
3405 : }
3406 : }
3407 :
3408 188872 : if (pabyChunkNodataMask == nullptr)
3409 : {
3410 158698 : int iFilteredPixelOff = 0; // Used after for.
3411 : // j used after for.
3412 158698 : size_t j =
3413 158698 : (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
3414 : #ifdef USE_SSE2
3415 : if constexpr (eWrkDataType == GDT_Float32)
3416 : {
3417 : #ifdef __AVX__
3418 : for (; iFilteredPixelOff + 15 < nDstXSize;
3419 : iFilteredPixelOff += 16, j += 16)
3420 : {
3421 : GDALResampleConvolutionVertical_16cols(
3422 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3423 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3424 : if (bHasNoData)
3425 : {
3426 : for (int k = 0; k < 16; k++)
3427 : {
3428 : pafDstScanline[iFilteredPixelOff + k] =
3429 : replaceValIfNodata(
3430 : pafDstScanline[iFilteredPixelOff + k]);
3431 : }
3432 : }
3433 : }
3434 : #else
3435 18719887 : for (; iFilteredPixelOff + 7 < nDstXSize;
3436 : iFilteredPixelOff += 8, j += 8)
3437 : {
3438 18615900 : GDALResampleConvolutionVertical_8cols(
3439 18615900 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3440 18615900 : nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3441 18568400 : if (bHasNoData)
3442 : {
3443 17820 : for (int k = 0; k < 8; k++)
3444 : {
3445 15840 : pafDstScanline[iFilteredPixelOff + k] =
3446 15840 : replaceValIfNodata(
3447 15840 : pafDstScanline[iFilteredPixelOff + k]);
3448 : }
3449 : }
3450 : }
3451 : #endif
3452 :
3453 566658 : for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
3454 : {
3455 462717 : const Twork fVal =
3456 462639 : static_cast<Twork>(GDALResampleConvolutionVertical(
3457 462639 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3458 : nSrcLineCount));
3459 462655 : pafDstScanline[iFilteredPixelOff] =
3460 462717 : replaceValIfNodata(fVal);
3461 : }
3462 : }
3463 : else
3464 : #endif
3465 : {
3466 2887210 : for (; iFilteredPixelOff + 1 < nDstXSize;
3467 : iFilteredPixelOff += 2, j += 2)
3468 : {
3469 2880000 : double dfVal1 = 0.0;
3470 2880000 : double dfVal2 = 0.0;
3471 2880000 : GDALResampleConvolutionVertical_2cols(
3472 2880000 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3473 : nSrcLineCount, dfVal1, dfVal2);
3474 5760010 : pafDstScanline[iFilteredPixelOff] =
3475 2880000 : replaceValIfNodata(static_cast<Twork>(dfVal1));
3476 2880000 : pafDstScanline[iFilteredPixelOff + 1] =
3477 2880000 : replaceValIfNodata(static_cast<Twork>(dfVal2));
3478 : }
3479 7204 : if (iFilteredPixelOff < nDstXSize)
3480 : {
3481 0 : const double dfVal = GDALResampleConvolutionVertical(
3482 0 : padfHorizontalFiltered + j, nDstXSize, padfWeights,
3483 : nSrcLineCount);
3484 0 : pafDstScanline[iFilteredPixelOff] =
3485 0 : replaceValIfNodata(static_cast<Twork>(dfVal));
3486 : }
3487 : }
3488 : }
3489 : else
3490 : {
3491 16007350 : for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
3492 : ++iFilteredPixelOff)
3493 : {
3494 15977137 : double dfVal = 0.0;
3495 15977137 : dfWeightSum = 0.0;
3496 15977137 : size_t j = (nSrcLineStart - nChunkYOff) *
3497 15977137 : static_cast<size_t>(nDstXSize) +
3498 15977137 : iFilteredPixelOff;
3499 15977137 : if (bKernelWithNegativeWeights)
3500 : {
3501 15752333 : int nConsecutiveValid = 0;
3502 15752333 : int nMaxConsecutiveValid = 0;
3503 104744705 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3504 : {
3505 88992272 : const double dfWeight =
3506 88992272 : padfWeights[i] *
3507 : pabyChunkNodataMaskHorizontalFiltered[j];
3508 88992272 : if (pabyChunkNodataMaskHorizontalFiltered[j])
3509 : {
3510 34763295 : nConsecutiveValid++;
3511 : }
3512 54228977 : else if (nConsecutiveValid)
3513 : {
3514 172294 : nMaxConsecutiveValid = std::max(
3515 172294 : nMaxConsecutiveValid, nConsecutiveValid);
3516 172294 : nConsecutiveValid = 0;
3517 : }
3518 88992272 : dfVal += padfHorizontalFiltered[j] * dfWeight;
3519 88992272 : dfWeightSum += dfWeight;
3520 : }
3521 15752333 : nMaxConsecutiveValid =
3522 15752333 : std::max(nMaxConsecutiveValid, nConsecutiveValid);
3523 15752333 : if (nMaxConsecutiveValid < nSrcLineCount / 2)
3524 : {
3525 8116035 : pafDstScanline[iFilteredPixelOff] =
3526 8116033 : static_cast<Twork>(dfNoDataValue);
3527 8116035 : continue;
3528 : }
3529 : }
3530 : else
3531 : {
3532 1130262 : for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3533 : {
3534 905432 : const double dfWeight =
3535 905432 : padfWeights[i] *
3536 : pabyChunkNodataMaskHorizontalFiltered[j];
3537 905432 : dfVal += padfHorizontalFiltered[j] * dfWeight;
3538 905432 : dfWeightSum += dfWeight;
3539 : }
3540 : }
3541 7861102 : if (dfWeightSum > 0.0)
3542 : {
3543 7845089 : pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
3544 7845087 : static_cast<Twork>(dfVal / dfWeightSum));
3545 : }
3546 : else
3547 : {
3548 16011 : pafDstScanline[iFilteredPixelOff] =
3549 16007 : static_cast<Twork>(dfNoDataValue);
3550 : }
3551 : }
3552 : }
3553 :
3554 141397 : if (fMaxVal != 0.0f)
3555 : {
3556 192324 : for (int i = 0; i < nDstXSize; ++i)
3557 : {
3558 192088 : if (pafDstScanline[i] > fMaxVal)
3559 96022 : pafDstScanline[i] = fMaxVal;
3560 : }
3561 : }
3562 :
3563 141397 : if (pafWrkScanline)
3564 : {
3565 180461 : GDALCopyWords(pafWrkScanline, eWrkDataType, 4,
3566 : static_cast<GByte *>(pDstBuffer) +
3567 180461 : static_cast<size_t>(iDstLine - nDstYOff) *
3568 180461 : nDstXSize * nDstDataTypeSize,
3569 : dstDataType, nDstDataTypeSize, nDstXSize);
3570 : }
3571 : }
3572 :
3573 3606 : VSIFree(pafWrkScanline);
3574 3606 : VSIFreeAligned(padfWeights);
3575 3606 : VSIFree(padfHorizontalFiltered);
3576 3606 : VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3577 :
3578 3606 : return CE_None;
3579 : }
3580 :
3581 3605 : static CPLErr GDALResampleChunk_Convolution(
3582 : double dfXRatioDstToSrc, double dfYRatioDstToSrc, double dfSrcXDelta,
3583 : double dfSrcYDelta, GDALDataType eWrkDataType, const void *pChunk,
3584 : const GByte *pabyChunkNodataMask, int nChunkXOff, int nChunkXSize,
3585 : int nChunkYOff, int nChunkYSize, int nDstXOff, int nDstXOff2, int nDstYOff,
3586 : int nDstYOff2, GDALRasterBand *poOverview, void **ppDstBuffer,
3587 : GDALDataType *peDstBufferDataType, const char *pszResampling,
3588 : bool bHasNoData, double dfNoDataValue,
3589 : GDALColorTable * /* poColorTable_unused */, GDALDataType /* eSrcDataType */,
3590 : bool /* bPropagateNoData */)
3591 : {
3592 : GDALResampleAlg eResample;
3593 3605 : bool bKernelWithNegativeWeights = false;
3594 3605 : if (EQUAL(pszResampling, "BILINEAR"))
3595 2569 : eResample = GRA_Bilinear;
3596 1036 : else if (EQUAL(pszResampling, "CUBIC"))
3597 : {
3598 981 : eResample = GRA_Cubic;
3599 981 : bKernelWithNegativeWeights = true;
3600 : }
3601 55 : else if (EQUAL(pszResampling, "CUBICSPLINE"))
3602 23 : eResample = GRA_CubicSpline;
3603 32 : else if (EQUAL(pszResampling, "LANCZOS"))
3604 : {
3605 26 : eResample = GRA_Lanczos;
3606 26 : bKernelWithNegativeWeights = true;
3607 : }
3608 : else
3609 : {
3610 6 : CPLAssert(false);
3611 : return CE_Failure;
3612 : }
3613 3599 : const int nKernelRadius = GWKGetFilterRadius(eResample);
3614 3599 : FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
3615 : const FilterFunc4ValuesType pfnFilterFunc4Values =
3616 3602 : GWKGetFilterFunc4Values(eResample);
3617 :
3618 3599 : float fMaxVal = 0.f;
3619 : // Cubic, etc... can have overshoots, so make sure we clamp values to the
3620 : // maximum value if NBITS is set.
3621 : const char *pszNBITS =
3622 3599 : poOverview->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
3623 3600 : GDALDataType eBandDT = poOverview->GetRasterDataType();
3624 3602 : if (eResample != GRA_Bilinear && pszNBITS != nullptr &&
3625 2 : (eBandDT == GDT_Byte || eBandDT == GDT_UInt16 || eBandDT == GDT_UInt32))
3626 : {
3627 8 : int nBits = atoi(pszNBITS);
3628 8 : if (nBits == GDALGetDataTypeSize(eBandDT))
3629 1 : nBits = 0;
3630 8 : if (nBits > 0 && nBits < 32)
3631 7 : fMaxVal = static_cast<float>((1U << nBits) - 1);
3632 : }
3633 :
3634 3603 : *ppDstBuffer =
3635 3602 : VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
3636 : GDALGetDataTypeSizeBytes(eBandDT));
3637 3603 : if (*ppDstBuffer == nullptr)
3638 : {
3639 0 : return CE_Failure;
3640 : }
3641 3603 : *peDstBufferDataType = eBandDT;
3642 :
3643 3603 : if (eWrkDataType == GDT_Byte)
3644 2888 : return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32>(
3645 : dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
3646 : static_cast<const GByte *>(pChunk), 1, pabyChunkNodataMask,
3647 : nChunkXOff, nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff,
3648 : nDstXOff2, nDstYOff, nDstYOff2, poOverview, *ppDstBuffer,
3649 : bHasNoData, dfNoDataValue, pfnFilterFunc, pfnFilterFunc4Values,
3650 2888 : nKernelRadius, bKernelWithNegativeWeights, fMaxVal);
3651 715 : else if (eWrkDataType == GDT_UInt16)
3652 394 : return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32>(
3653 : dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
3654 : static_cast<const GUInt16 *>(pChunk), 1, pabyChunkNodataMask,
3655 : nChunkXOff, nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff,
3656 : nDstXOff2, nDstYOff, nDstYOff2, poOverview, *ppDstBuffer,
3657 : bHasNoData, dfNoDataValue, pfnFilterFunc, pfnFilterFunc4Values,
3658 396 : nKernelRadius, bKernelWithNegativeWeights, fMaxVal);
3659 321 : else if (eWrkDataType == GDT_Float32)
3660 300 : return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32>(
3661 : dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
3662 : static_cast<const float *>(pChunk), 1, pabyChunkNodataMask,
3663 : nChunkXOff, nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff,
3664 : nDstXOff2, nDstYOff, nDstYOff2, poOverview, *ppDstBuffer,
3665 : bHasNoData, dfNoDataValue, pfnFilterFunc, pfnFilterFunc4Values,
3666 300 : nKernelRadius, bKernelWithNegativeWeights, fMaxVal);
3667 21 : else if (eWrkDataType == GDT_Float64)
3668 22 : return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64>(
3669 : dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
3670 : static_cast<const double *>(pChunk), 1, pabyChunkNodataMask,
3671 : nChunkXOff, nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff,
3672 : nDstXOff2, nDstYOff, nDstYOff2, poOverview, *ppDstBuffer,
3673 : bHasNoData, dfNoDataValue, pfnFilterFunc, pfnFilterFunc4Values,
3674 22 : nKernelRadius, bKernelWithNegativeWeights, fMaxVal);
3675 :
3676 0 : CPLAssert(false);
3677 : return CE_Failure;
3678 : }
3679 :
3680 : /************************************************************************/
3681 : /* GDALResampleChunkC32R() */
3682 : /************************************************************************/
3683 :
3684 10 : static CPLErr GDALResampleChunkC32R(int nSrcWidth, int nSrcHeight,
3685 : const float *pafChunk, int nChunkYOff,
3686 : int nChunkYSize, int nDstYOff,
3687 : int nDstYOff2, GDALRasterBand *poOverview,
3688 : void **ppDstBuffer,
3689 : GDALDataType *peDstBufferDataType,
3690 : const char *pszResampling)
3691 :
3692 : {
3693 : enum Method
3694 : {
3695 : NEAR,
3696 : AVERAGE,
3697 : AVERAGE_MAGPHASE,
3698 : RMS,
3699 : };
3700 :
3701 10 : Method eMethod = NEAR;
3702 10 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
3703 : {
3704 8 : eMethod = NEAR;
3705 : }
3706 2 : else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
3707 : {
3708 0 : eMethod = AVERAGE_MAGPHASE;
3709 : }
3710 2 : else if (EQUAL(pszResampling, "RMS"))
3711 : {
3712 2 : eMethod = RMS;
3713 : }
3714 0 : else if (STARTS_WITH_CI(pszResampling, "AVER"))
3715 : {
3716 0 : eMethod = AVERAGE;
3717 : }
3718 : else
3719 : {
3720 0 : CPLError(
3721 : CE_Failure, CPLE_NotSupported,
3722 : "Resampling method %s is not supported for complex data types. "
3723 : "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
3724 : pszResampling);
3725 0 : return CE_Failure;
3726 : }
3727 :
3728 10 : const int nOXSize = poOverview->GetXSize();
3729 10 : *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
3730 : GDALGetDataTypeSizeBytes(GDT_CFloat32));
3731 10 : if (*ppDstBuffer == nullptr)
3732 : {
3733 0 : return CE_Failure;
3734 : }
3735 10 : float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
3736 10 : *peDstBufferDataType = GDT_CFloat32;
3737 :
3738 10 : const int nOYSize = poOverview->GetYSize();
3739 10 : const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
3740 10 : const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
3741 :
3742 : /* ==================================================================== */
3743 : /* Loop over destination scanlines. */
3744 : /* ==================================================================== */
3745 96 : for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3746 : {
3747 86 : int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
3748 86 : if (nSrcYOff < nChunkYOff)
3749 0 : nSrcYOff = nChunkYOff;
3750 :
3751 86 : int nSrcYOff2 =
3752 86 : static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
3753 86 : if (nSrcYOff2 == nSrcYOff)
3754 0 : nSrcYOff2++;
3755 :
3756 86 : if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
3757 : {
3758 10 : if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
3759 0 : nSrcYOff = nSrcHeight - 1;
3760 10 : nSrcYOff2 = nSrcHeight;
3761 : }
3762 86 : if (nSrcYOff2 > nChunkYOff + nChunkYSize)
3763 0 : nSrcYOff2 = nChunkYOff + nChunkYSize;
3764 :
3765 86 : const float *const pafSrcScanline =
3766 86 : pafChunk + ((nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
3767 86 : float *const pafDstScanline =
3768 86 : pafDstBuffer + (iDstLine - nDstYOff) * 2 * nOXSize;
3769 :
3770 : /* --------------------------------------------------------------------
3771 : */
3772 : /* Loop over destination pixels */
3773 : /* --------------------------------------------------------------------
3774 : */
3775 898 : for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
3776 : {
3777 812 : int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
3778 812 : int nSrcXOff2 =
3779 812 : static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
3780 812 : if (nSrcXOff2 == nSrcXOff)
3781 0 : nSrcXOff2++;
3782 812 : if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
3783 : {
3784 86 : if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
3785 0 : nSrcXOff = nSrcWidth - 1;
3786 86 : nSrcXOff2 = nSrcWidth;
3787 : }
3788 :
3789 812 : if (eMethod == NEAR)
3790 : {
3791 800 : pafDstScanline[iDstPixel * 2] = pafSrcScanline[nSrcXOff * 2];
3792 800 : pafDstScanline[iDstPixel * 2 + 1] =
3793 800 : pafSrcScanline[nSrcXOff * 2 + 1];
3794 : }
3795 12 : else if (eMethod == AVERAGE_MAGPHASE)
3796 : {
3797 0 : double dfTotalR = 0.0;
3798 0 : double dfTotalI = 0.0;
3799 0 : double dfTotalM = 0.0;
3800 0 : int nCount = 0;
3801 :
3802 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
3803 : {
3804 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
3805 : {
3806 0 : const double dfR =
3807 0 : pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
3808 0 : iY - nSrcYOff) *
3809 0 : nSrcWidth * 2];
3810 0 : const double dfI =
3811 0 : pafSrcScanline[iX * 2 +
3812 0 : static_cast<GPtrDiff_t>(iY -
3813 0 : nSrcYOff) *
3814 0 : nSrcWidth * 2 +
3815 0 : 1];
3816 0 : dfTotalR += dfR;
3817 0 : dfTotalI += dfI;
3818 0 : dfTotalM += std::hypot(dfR, dfI);
3819 0 : ++nCount;
3820 : }
3821 : }
3822 :
3823 0 : CPLAssert(nCount > 0);
3824 0 : if (nCount == 0)
3825 : {
3826 0 : pafDstScanline[iDstPixel * 2] = 0.0;
3827 0 : pafDstScanline[iDstPixel * 2 + 1] = 0.0;
3828 : }
3829 : else
3830 : {
3831 0 : pafDstScanline[iDstPixel * 2] =
3832 0 : static_cast<float>(dfTotalR / nCount);
3833 0 : pafDstScanline[iDstPixel * 2 + 1] =
3834 0 : static_cast<float>(dfTotalI / nCount);
3835 : const double dfM =
3836 0 : std::hypot(pafDstScanline[iDstPixel * 2],
3837 0 : pafDstScanline[iDstPixel * 2 + 1]);
3838 0 : const double dfDesiredM = dfTotalM / nCount;
3839 0 : double dfRatio = 1.0;
3840 0 : if (dfM != 0.0)
3841 0 : dfRatio = dfDesiredM / dfM;
3842 :
3843 0 : pafDstScanline[iDstPixel * 2] *=
3844 0 : static_cast<float>(dfRatio);
3845 0 : pafDstScanline[iDstPixel * 2 + 1] *=
3846 0 : static_cast<float>(dfRatio);
3847 : }
3848 : }
3849 12 : else if (eMethod == RMS)
3850 : {
3851 12 : double dfTotalR = 0.0;
3852 12 : double dfTotalI = 0.0;
3853 12 : int nCount = 0;
3854 :
3855 36 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
3856 : {
3857 72 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
3858 : {
3859 48 : const double dfR =
3860 48 : pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
3861 48 : iY - nSrcYOff) *
3862 48 : nSrcWidth * 2];
3863 48 : const double dfI =
3864 48 : pafSrcScanline[iX * 2 +
3865 48 : static_cast<GPtrDiff_t>(iY -
3866 48 : nSrcYOff) *
3867 48 : nSrcWidth * 2 +
3868 48 : 1];
3869 :
3870 48 : dfTotalR += SQUARE(dfR);
3871 48 : dfTotalI += SQUARE(dfI);
3872 :
3873 48 : ++nCount;
3874 : }
3875 : }
3876 :
3877 12 : CPLAssert(nCount > 0);
3878 12 : if (nCount == 0)
3879 : {
3880 0 : pafDstScanline[iDstPixel * 2] = 0.0;
3881 0 : pafDstScanline[iDstPixel * 2 + 1] = 0.0;
3882 : }
3883 : else
3884 : {
3885 : /* compute RMS */
3886 12 : pafDstScanline[iDstPixel * 2] =
3887 12 : static_cast<float>(sqrt(dfTotalR / nCount));
3888 12 : pafDstScanline[iDstPixel * 2 + 1] =
3889 12 : static_cast<float>(sqrt(dfTotalI / nCount));
3890 : }
3891 : }
3892 0 : else if (eMethod == AVERAGE)
3893 : {
3894 0 : double dfTotalR = 0.0;
3895 0 : double dfTotalI = 0.0;
3896 0 : int nCount = 0;
3897 :
3898 0 : for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
3899 : {
3900 0 : for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
3901 : {
3902 : // TODO(schwehr): Maybe use std::complex?
3903 0 : dfTotalR +=
3904 0 : pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
3905 0 : iY - nSrcYOff) *
3906 0 : nSrcWidth * 2];
3907 0 : dfTotalI += pafSrcScanline[iX * 2 +
3908 0 : static_cast<GPtrDiff_t>(
3909 0 : iY - nSrcYOff) *
3910 0 : nSrcWidth * 2 +
3911 0 : 1];
3912 0 : ++nCount;
3913 : }
3914 : }
3915 :
3916 0 : CPLAssert(nCount > 0);
3917 0 : if (nCount == 0)
3918 : {
3919 0 : pafDstScanline[iDstPixel * 2] = 0.0;
3920 0 : pafDstScanline[iDstPixel * 2 + 1] = 0.0;
3921 : }
3922 : else
3923 : {
3924 0 : pafDstScanline[iDstPixel * 2] =
3925 0 : static_cast<float>(dfTotalR / nCount);
3926 0 : pafDstScanline[iDstPixel * 2 + 1] =
3927 0 : static_cast<float>(dfTotalI / nCount);
3928 : }
3929 : }
3930 : }
3931 : }
3932 :
3933 10 : return CE_None;
3934 : }
3935 :
3936 : /************************************************************************/
3937 : /* GDALRegenerateCascadingOverviews() */
3938 : /* */
3939 : /* Generate a list of overviews in order from largest to */
3940 : /* smallest, computing each from the next larger. */
3941 : /************************************************************************/
3942 :
3943 42 : static CPLErr GDALRegenerateCascadingOverviews(
3944 : GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
3945 : const char *pszResampling, GDALProgressFunc pfnProgress,
3946 : void *pProgressData, CSLConstList papszOptions)
3947 :
3948 : {
3949 : /* -------------------------------------------------------------------- */
3950 : /* First, we must put the overviews in order from largest to */
3951 : /* smallest. */
3952 : /* -------------------------------------------------------------------- */
3953 120 : for (int i = 0; i < nOverviews - 1; ++i)
3954 : {
3955 270 : for (int j = 0; j < nOverviews - i - 1; ++j)
3956 : {
3957 192 : if (papoOvrBands[j]->GetXSize() *
3958 192 : static_cast<float>(papoOvrBands[j]->GetYSize()) <
3959 192 : papoOvrBands[j + 1]->GetXSize() *
3960 192 : static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
3961 : {
3962 0 : GDALRasterBand *poTempBand = papoOvrBands[j];
3963 0 : papoOvrBands[j] = papoOvrBands[j + 1];
3964 0 : papoOvrBands[j + 1] = poTempBand;
3965 : }
3966 : }
3967 : }
3968 :
3969 : /* -------------------------------------------------------------------- */
3970 : /* Count total pixels so we can prepare appropriate scaled */
3971 : /* progress functions. */
3972 : /* -------------------------------------------------------------------- */
3973 42 : double dfTotalPixels = 0.0;
3974 :
3975 162 : for (int i = 0; i < nOverviews; ++i)
3976 : {
3977 120 : dfTotalPixels += papoOvrBands[i]->GetXSize() *
3978 120 : static_cast<double>(papoOvrBands[i]->GetYSize());
3979 : }
3980 :
3981 : /* -------------------------------------------------------------------- */
3982 : /* Generate all the bands. */
3983 : /* -------------------------------------------------------------------- */
3984 42 : double dfPixelsProcessed = 0.0;
3985 :
3986 162 : for (int i = 0; i < nOverviews; ++i)
3987 : {
3988 120 : GDALRasterBand *poBaseBand = poSrcBand;
3989 120 : if (i != 0)
3990 78 : poBaseBand = papoOvrBands[i - 1];
3991 :
3992 120 : double dfPixels = papoOvrBands[i]->GetXSize() *
3993 120 : static_cast<double>(papoOvrBands[i]->GetYSize());
3994 :
3995 240 : void *pScaledProgressData = GDALCreateScaledProgress(
3996 : dfPixelsProcessed / dfTotalPixels,
3997 120 : (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
3998 : pProgressData);
3999 :
4000 240 : const CPLErr eErr = GDALRegenerateOverviewsEx(
4001 : poBaseBand, 1,
4002 120 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
4003 : pszResampling, GDALScaledProgress, pScaledProgressData,
4004 : papszOptions);
4005 120 : GDALDestroyScaledProgress(pScaledProgressData);
4006 :
4007 120 : if (eErr != CE_None)
4008 0 : return eErr;
4009 :
4010 120 : dfPixelsProcessed += dfPixels;
4011 :
4012 : // Only do the bit2grayscale promotion on the base band.
4013 120 : if (STARTS_WITH_CI(pszResampling,
4014 : "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
4015 8 : pszResampling = "AVERAGE";
4016 : }
4017 :
4018 42 : return CE_None;
4019 : }
4020 :
4021 : /************************************************************************/
4022 : /* GDALGetResampleFunction() */
4023 : /************************************************************************/
4024 :
4025 3688 : GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
4026 : int *pnRadius)
4027 : {
4028 3688 : if (pnRadius)
4029 3689 : *pnRadius = 0;
4030 3688 : if (STARTS_WITH_CI(pszResampling, "NEAR"))
4031 365 : return GDALResampleChunk_Near;
4032 3323 : else if (STARTS_WITH_CI(pszResampling, "AVER") ||
4033 2812 : EQUAL(pszResampling, "RMS"))
4034 535 : return GDALResampleChunk_AverageOrRMS;
4035 2788 : else if (EQUAL(pszResampling, "GAUSS"))
4036 : {
4037 26 : if (pnRadius)
4038 26 : *pnRadius = 1;
4039 26 : return GDALResampleChunk_Gauss;
4040 : }
4041 2762 : else if (EQUAL(pszResampling, "MODE"))
4042 40 : return GDALResampleChunk_Mode;
4043 2722 : else if (EQUAL(pszResampling, "CUBIC"))
4044 : {
4045 363 : if (pnRadius)
4046 363 : *pnRadius = GWKGetFilterRadius(GRA_Cubic);
4047 367 : return GDALResampleChunk_Convolution;
4048 : }
4049 2359 : else if (EQUAL(pszResampling, "CUBICSPLINE"))
4050 : {
4051 3 : if (pnRadius)
4052 3 : *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
4053 3 : return GDALResampleChunk_Convolution;
4054 : }
4055 2356 : else if (EQUAL(pszResampling, "LANCZOS"))
4056 : {
4057 6 : if (pnRadius)
4058 6 : *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
4059 6 : return GDALResampleChunk_Convolution;
4060 : }
4061 2350 : else if (EQUAL(pszResampling, "BILINEAR"))
4062 : {
4063 2347 : if (pnRadius)
4064 2347 : *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
4065 2347 : return GDALResampleChunk_Convolution;
4066 : }
4067 : else
4068 : {
4069 3 : CPLError(
4070 : CE_Failure, CPLE_AppDefined,
4071 : "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
4072 : pszResampling);
4073 0 : return nullptr;
4074 : }
4075 : }
4076 :
4077 : /************************************************************************/
4078 : /* GDALGetOvrWorkDataType() */
4079 : /************************************************************************/
4080 :
4081 3567 : GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
4082 : GDALDataType eSrcDataType)
4083 : {
4084 3567 : if ((STARTS_WITH_CI(pszResampling, "NEAR") ||
4085 3216 : STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
4086 2709 : EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
4087 2405 : EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
4088 3567 : EQUAL(pszResampling, "MODE")) &&
4089 : eSrcDataType == GDT_Byte)
4090 : {
4091 3191 : return GDT_Byte;
4092 : }
4093 376 : else if ((STARTS_WITH_CI(pszResampling, "NEAR") ||
4094 330 : STARTS_WITH_CI(pszResampling, "AVER") ||
4095 285 : EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4096 166 : EQUAL(pszResampling, "CUBICSPLINE") ||
4097 166 : EQUAL(pszResampling, "LANCZOS") ||
4098 163 : EQUAL(pszResampling, "BILINEAR") ||
4099 376 : EQUAL(pszResampling, "MODE")) &&
4100 : eSrcDataType == GDT_UInt16)
4101 : {
4102 104 : return GDT_UInt16;
4103 : }
4104 272 : else if (EQUAL(pszResampling, "GAUSS"))
4105 20 : return GDT_Float64;
4106 :
4107 252 : if (eSrcDataType == GDT_Float64)
4108 34 : return GDT_Float64;
4109 :
4110 218 : return GDT_Float32;
4111 : }
4112 :
4113 : namespace
4114 : {
4115 : // Structure to hold a pointer to free with CPLFree()
4116 : struct PointerHolder
4117 : {
4118 : void *ptr = nullptr;
4119 :
4120 5170 : explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
4121 : {
4122 5170 : }
4123 :
4124 5170 : ~PointerHolder()
4125 5170 : {
4126 5170 : CPLFree(ptr);
4127 5170 : }
4128 :
4129 : PointerHolder(const PointerHolder &) = delete;
4130 : PointerHolder &operator=(const PointerHolder &) = delete;
4131 : };
4132 : } // namespace
4133 :
4134 : /************************************************************************/
4135 : /* GDALRegenerateOverviews() */
4136 : /************************************************************************/
4137 :
4138 : /**
4139 : * \brief Generate downsampled overviews.
4140 : *
4141 : * This function will generate one or more overview images from a base image
4142 : * using the requested downsampling algorithm. Its primary use is for
4143 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4144 : * used to generate downsampled images in one file from another outside the
4145 : * overview architecture.
4146 : *
4147 : * The output bands need to exist in advance.
4148 : *
4149 : * The full set of resampling algorithms is documented in
4150 : * GDALDataset::BuildOverviews().
4151 : *
4152 : * This function will honour properly NODATA_VALUES tuples (special dataset
4153 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4154 : * considered as the nodata value and not each value of the triplet
4155 : * independently per band.
4156 : *
4157 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4158 : * to "ALL_CPUS" or a integer value to specify the number of threads to use for
4159 : * overview computation.
4160 : *
4161 : * @param hSrcBand the source (base level) band.
4162 : * @param nOverviewCount the number of downsampled bands being generated.
4163 : * @param pahOvrBands the list of downsampled bands to be generated.
4164 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4165 : * @param pfnProgress progress report function.
4166 : * @param pProgressData progress function callback data.
4167 : * @return CE_None on success or CE_Failure on failure.
4168 : */
4169 244 : CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
4170 : GDALRasterBandH *pahOvrBands,
4171 : const char *pszResampling,
4172 : GDALProgressFunc pfnProgress,
4173 : void *pProgressData)
4174 :
4175 : {
4176 244 : return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
4177 : pszResampling, pfnProgress, pProgressData,
4178 244 : nullptr);
4179 : }
4180 :
4181 : /************************************************************************/
4182 : /* GDALRegenerateOverviewsEx() */
4183 : /************************************************************************/
4184 :
4185 : /**
4186 : * \brief Generate downsampled overviews.
4187 : *
4188 : * This function will generate one or more overview images from a base image
4189 : * using the requested downsampling algorithm. Its primary use is for
4190 : * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4191 : * used to generate downsampled images in one file from another outside the
4192 : * overview architecture.
4193 : *
4194 : * The output bands need to exist in advance.
4195 : *
4196 : * The full set of resampling algorithms is documented in
4197 : * GDALDataset::BuildOverviews().
 *
 * If pszResampling is "NONE", the function returns CE_None immediately
 * without computing anything. If pfnProgress is NULL, GDALDummyProgress is
 * used instead.
4198 : *
4199 : * This function will honour properly NODATA_VALUES tuples (special dataset
4200 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4201 : * considered as the nodata value and not each value of the triplet
4202 : * independently per band.
4203 : *
4204 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4205 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4206 : * overview computation.
 *
 * Other configuration options read by this function:
 * - GDAL_OVR_PROPAGATE_NODATA (default "NO"): forwarded to the resampling
 *   function as its bPropagateNoData argument.
 * - GDAL_OVR_CHUNKYSIZE and GDAL_OVR_CHUNK_MAX_SIZE (default 10485760):
 *   height, in source lines, and maximum size, in bytes, of the chunk of
 *   source data processed at once. Only meant for debugging / testing.
4207 : *
4208 : * @param hSrcBand the source (base level) band.
4209 : * @param nOverviewCount the number of downsampled bands being generated.
4210 : * @param pahOvrBands the list of downsampled bands to be generated.
4211 : * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4212 : * @param pfnProgress progress report function.
4213 : * @param pProgressData progress function callback data.
4214 : * @param papszOptions NULL terminated list of options as key=value pairs, or
4215 : * NULL. Within this function the list is only forwarded to
 * GDALRegenerateCascadingOverviews() and
 * GDALRegenerateOverviewsMultiBand() when the work is delegated to them.
4216 : * @return CE_None on success or CE_Failure on failure.
4217 : * @since GDAL 3.6
4218 : */
4219 729 : CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
4220 : GDALRasterBandH *pahOvrBands,
4221 : const char *pszResampling,
4222 : GDALProgressFunc pfnProgress,
4223 : void *pProgressData, CSLConstList papszOptions)
4224 :
4225 : {
4226 729 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
4227 729 : GDALRasterBand **papoOvrBands =
4228 : reinterpret_cast<GDALRasterBand **>(pahOvrBands);
4229 :
4230 729 : if (pfnProgress == nullptr)
4231 244 : pfnProgress = GDALDummyProgress;
4232 :
// "NONE" requests no overview computation: succeed without touching the bands.
4233 729 : if (EQUAL(pszResampling, "NONE"))
4234 61 : return CE_None;
4235 :
4236 668 : int nKernelRadius = 0;
4237 : GDALResampleFunction pfnResampleFn =
4238 668 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
4239 :
// No resampling function could be obtained for this method name: fail.
4240 668 : if (pfnResampleFn == nullptr)
4241 0 : return CE_Failure;
4242 :
4243 : /* -------------------------------------------------------------------- */
4244 : /* Check color tables... */
4245 : /* -------------------------------------------------------------------- */
// poColorTable stays null unless the source band is a palette index band with
// a RGB, non-identity color table and the method is AVER*, RMS, MODE or GAUSS.
4246 668 : GDALColorTable *poColorTable = nullptr;
4247 :
4248 318 : if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
4249 1382 : EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
4250 407 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4251 : {
4252 9 : poColorTable = poSrcBand->GetColorTable();
4253 9 : if (poColorTable != nullptr)
4254 : {
4255 9 : if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
4256 : {
4257 0 : CPLError(CE_Warning, CPLE_AppDefined,
4258 : "Computing overviews on palette index raster bands "
4259 : "with a palette whose color interpretation is not RGB "
4260 : "will probably lead to unexpected results.");
4261 0 : poColorTable = nullptr;
4262 : }
4263 9 : else if (poColorTable->IsIdentity())
4264 : {
4265 0 : poColorTable = nullptr;
4266 : }
4267 : }
4268 : else
4269 : {
4270 0 : CPLError(CE_Warning, CPLE_AppDefined,
4271 : "Computing overviews on palette index raster bands "
4272 : "without a palette will probably lead to unexpected "
4273 : "results.");
4274 : }
4275 : }
4276 : // Not ready yet: these methods do not use the color table, only warn.
4277 1923 : else if ((EQUAL(pszResampling, "CUBIC") ||
4278 605 : EQUAL(pszResampling, "CUBICSPLINE") ||
4279 605 : EQUAL(pszResampling, "LANCZOS") ||
4280 1321 : EQUAL(pszResampling, "BILINEAR")) &&
4281 57 : poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4282 : {
4283 0 : CPLError(CE_Warning, CPLE_AppDefined,
4284 : "Computing %s overviews on palette index raster bands "
4285 : "will probably lead to unexpected results.",
4286 : pszResampling);
4287 : }
4288 :
4289 : // If we have a nodata mask and we are doing something more complicated
4290 : // than nearest neighbouring, we have to fetch the nodata mask.
4291 :
4292 668 : GDALRasterBand *poMaskBand = nullptr;
4293 668 : bool bUseNoDataMask = false;
4294 668 : bool bCanUseCascaded = true;
4295 :
4296 668 : if (!STARTS_WITH_CI(pszResampling, "NEAR"))
4297 : {
4298 : // Special case if we are an alpha/mask band. We want it to be
4299 : // considered as the mask band to avoid alpha=0 to be taken into account
4300 : // in average computation.
4301 464 : if (poSrcBand->IsMaskBand())
4302 : {
4303 88 : poMaskBand = poSrcBand;
4304 88 : bUseNoDataMask = true;
4305 : }
4306 : else
4307 : {
// Cascaded generation is only allowed when the mask flags are exactly
// GMF_NODATA or GMF_ALL_VALID. The mask is read whenever the GMF_ALL_VALID
// bit is not set.
4308 376 : poMaskBand = poSrcBand->GetMaskBand();
4309 376 : const int nMaskFlags = poSrcBand->GetMaskFlags();
4310 376 : bCanUseCascaded =
4311 376 : (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
4312 376 : bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
4313 : }
4314 : }
4315 :
4316 : /* -------------------------------------------------------------------- */
4317 : /* If we are operating on multiple overviews, and using */
4318 : /* averaging, lets do them in cascading order to reduce the */
4319 : /* amount of computation. */
4320 : /* -------------------------------------------------------------------- */
4321 :
4322 : // In case the mask may be computed from another band of the dataset,
4323 : // we can't use cascaded generation, as the computation of the overviews
4324 : // of the band used for the mask band may not have yet occurred (#3033).
4325 668 : if ((STARTS_WITH_CI(pszResampling, "AVER") ||
4326 318 : EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
4327 287 : EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
4328 233 : EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
4329 668 : EQUAL(pszResampling, "MODE")) &&
4330 42 : nOverviewCount > 1 && bCanUseCascaded)
4331 42 : return GDALRegenerateCascadingOverviews(
4332 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
4333 42 : pProgressData, papszOptions);
4334 :
4335 : /* -------------------------------------------------------------------- */
4336 : /* Setup one horizontal swath to read from the raw buffer. */
4337 : /* -------------------------------------------------------------------- */
4338 626 : int nFRXBlockSize = 0;
4339 626 : int nFRYBlockSize = 0;
4340 626 : poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
4341 :
// Complex sources are always processed as GDT_CFloat32; other types use the
// working type returned by GDALGetOvrWorkDataType().
4342 626 : const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
4343 : const GDALDataType eWrkDataType =
4344 626 : GDALDataTypeIsComplex(eSrcDataType)
4345 626 : ? GDT_CFloat32
4346 616 : : GDALGetOvrWorkDataType(pszResampling, eSrcDataType);
4347 :
4348 626 : const int nWidth = poSrcBand->GetXSize();
4349 626 : const int nHeight = poSrcBand->GetYSize();
4350 :
// Largest rounded source/overview size ratio, over all overviews and both
// dimensions (never less than 1).
4351 626 : int nMaxOvrFactor = 1;
4352 1330 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
4353 : {
4354 704 : const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
4355 704 : const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
4356 704 : nMaxOvrFactor = std::max(
4357 : nMaxOvrFactor,
4358 704 : static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
4359 704 : nMaxOvrFactor = std::max(
4360 : nMaxOvrFactor,
4361 704 : static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
4362 : }
4363 :
4364 626 : int nFullResYChunk = nFRYBlockSize;
4365 626 : int nMaxChunkYSizeQueried = 0;
4366 :
// Helper that raises nFullResYChunk to at least 2 * nMaxOvrFactor, updates
// nMaxChunkYSizeQueried (chunk height plus the kernel margin on both sides)
// and returns the size in bytes of a working buffer of that height.
4367 : const auto UpdateChunkHeightAndGetChunkSize =
4368 7903 : [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
4369 23709 : eWrkDataType, nWidth]()
4370 : {
4371 : // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
4372 : // + nFullResYChunk) / nMaxOvrFactor)
4373 7903 : nFullResYChunk = std::max(nFullResYChunk, 2 * nMaxOvrFactor);
4374 7903 : nMaxChunkYSizeQueried =
4375 7903 : nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
4376 7903 : return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
4377 7903 : nMaxChunkYSizeQueried * nWidth;
4378 626 : };
4379 :
4380 : // Only configurable for debug / testing
4381 : const char *pszChunkYSize =
4382 626 : CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
4383 626 : if (pszChunkYSize)
4384 : {
4385 : // coverity[tainted_data]
4386 0 : nFullResYChunk = atoi(pszChunkYSize);
4387 : }
4388 :
4389 : // Only configurable for debug / testing
4390 : const int nChunkMaxSize =
4391 626 : atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
4392 :
// If the working buffer would exceed nChunkMaxSize, delegate to the
// multi-band or cascading implementation when their conditions are met;
// otherwise execution continues below with the current chunk height.
4393 626 : auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
4394 626 : if (nChunkSize > nChunkMaxSize)
4395 : {
4396 3 : if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
4397 9 : !GDALDataTypeIsComplex(eSrcDataType) &&
4398 3 : (!STARTS_WITH_CI(pszResampling, "AVER") ||
4399 0 : EQUAL(pszResampling, "AVERAGE")))
4400 : {
4401 : // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
4402 : // which uses a block based strategy, which is much less memory
4403 : // hungry.
4404 3 : return GDALRegenerateOverviewsMultiBand(
4405 : 1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
4406 3 : pfnProgress, pProgressData, papszOptions);
4407 : }
4408 0 : else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
4409 : {
4410 0 : return GDALRegenerateCascadingOverviews(
4411 : poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
4412 0 : pfnProgress, pProgressData, papszOptions);
4413 : }
4414 : }
4415 623 : else if (pszChunkYSize == nullptr)
4416 : {
4417 : // Try to get as close as possible to nChunkMaxSize: the chunk height
// (not forced by GDAL_OVR_CHUNKYSIZE here) is doubled while twice the
// current buffer size is still below the limit.
4418 7900 : while (nChunkSize * 2 < nChunkMaxSize)
4419 : {
4420 7277 : nFullResYChunk *= 2;
4421 7277 : nChunkSize = UpdateChunkHeightAndGetChunkSize();
4422 : }
4423 : }
4424 :
4425 623 : int nHasNoData = 0;
4426 623 : const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
4427 623 : const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
4428 : const bool bPropagateNoData =
4429 623 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
4430 :
4431 : // Structure describing a resampling job
4432 : struct OvrJob
4433 : {
4434 : // Buffers to free when job is finished
4435 : std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
4436 : std::shared_ptr<PointerHolder> oSrcBufferHolder{};
4437 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
4438 :
4439 : // Input parameters of pfnResampleFn
4440 : GDALResampleFunction pfnResampleFn = nullptr;
4441 : double dfXRatioDstToSrc{};
4442 : double dfYRatioDstToSrc{};
4443 : GDALDataType eWrkDataType = GDT_Unknown;
4444 : const void *pChunk = nullptr;
4445 : const GByte *pabyChunkNodataMask = nullptr;
4446 : int nWidth = 0;
4447 : int nHeight = 0;
4448 : int nChunkYOff = 0;
4449 : int nChunkYSize = 0;
4450 : int nDstWidth = 0;
4451 : int nDstYOff = 0;
4452 : int nDstYOff2 = 0;
4453 : GDALRasterBand *poDstBand = nullptr;
4454 : const char *pszResampling = nullptr;
4455 : bool bHasNoData = false;
4456 : double dfNoDataValue = 0.0;
4457 : GDALColorTable *poColorTable = nullptr;
4458 : GDALDataType eSrcDataType = GDT_Unknown;
4459 : bool bPropagateNoData = false;
4460 :
4461 : // Output values of resampling function
4462 : CPLErr eErr = CE_Failure;
4463 : void *pDstBuffer = nullptr;
4464 : GDALDataType eDstBufferDataType = GDT_Unknown;
4465 :
4466 : // Synchronization: bFinished is set under mutex by the job function,
// which then notifies cv.
4467 : bool bFinished = false;
4468 : std::mutex mutex{};
4469 : std::condition_variable cv{};
4470 :
4471 0 : void SetSrcMaskBufferHolder(
4472 : const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
4473 : {
4474 0 : oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
4475 0 : }
4476 :
4477 0 : void SetSrcBufferHolder(
4478 : const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
4479 : {
4480 0 : oSrcBufferHolder = oSrcBufferHolderIn;
4481 0 : }
4482 : };
4483 :
4484 : // Thread function to resample. pData is the OvrJob to process. It is also
// called directly, on the calling thread, when no job queue is used.
4485 702 : const auto JobResampleFunc = [](void *pData)
4486 : {
4487 702 : OvrJob *poJob = static_cast<OvrJob *>(pData);
4488 :
// GDT_CFloat32 working data goes through GDALResampleChunkC32R(); every
// other working type goes through the job's pfnResampleFn.
4489 702 : if (poJob->eWrkDataType != GDT_CFloat32)
4490 : {
4491 692 : poJob->eErr = poJob->pfnResampleFn(
4492 : poJob->dfXRatioDstToSrc, poJob->dfYRatioDstToSrc, 0.0, 0.0,
4493 : poJob->eWrkDataType, poJob->pChunk, poJob->pabyChunkNodataMask,
4494 : 0, poJob->nWidth, poJob->nChunkYOff, poJob->nChunkYSize, 0,
4495 : poJob->nDstWidth, poJob->nDstYOff, poJob->nDstYOff2,
4496 : poJob->poDstBand, &(poJob->pDstBuffer),
4497 : &(poJob->eDstBufferDataType), poJob->pszResampling,
4498 692 : poJob->bHasNoData, poJob->dfNoDataValue, poJob->poColorTable,
4499 692 : poJob->eSrcDataType, poJob->bPropagateNoData);
4500 : }
4501 : else
4502 : {
4503 10 : poJob->eErr = GDALResampleChunkC32R(
4504 : poJob->nWidth, poJob->nHeight,
4505 10 : static_cast<const float *>(poJob->pChunk), poJob->nChunkYOff,
4506 : poJob->nChunkYSize, poJob->nDstYOff, poJob->nDstYOff2,
4507 : poJob->poDstBand, &(poJob->pDstBuffer),
4508 : &(poJob->eDstBufferDataType), poJob->pszResampling);
4509 : }
4510 :
// Hand the destination buffer to a holder owned by the job, so that it is
// released with CPLFree() when the job is destroyed.
4511 : poJob->oDstBufferHolder =
4512 702 : std::make_unique<PointerHolder>(poJob->pDstBuffer);
4513 :
4514 : {
4515 1404 : std::lock_guard<std::mutex> guard(poJob->mutex);
4516 702 : poJob->bFinished = true;
4517 702 : poJob->cv.notify_one();
4518 : }
4519 702 : };
4520 :
4521 : // Function to write resample data to target band: lines
// [nDstYOff, nDstYOff2) of the destination band, over its full width.
4522 702 : const auto WriteJobData = [](const OvrJob *poJob)
4523 : {
4524 1404 : return poJob->poDstBand->RasterIO(
4525 702 : GF_Write, 0, poJob->nDstYOff, poJob->nDstWidth,
4526 702 : poJob->nDstYOff2 - poJob->nDstYOff, poJob->pDstBuffer,
4527 702 : poJob->nDstWidth, poJob->nDstYOff2 - poJob->nDstYOff,
4528 702 : poJob->eDstBufferDataType, 0, 0, nullptr);
4529 : };
4530 :
4531 : // Wait for completion of oldest job and serialize it. The job is removed
// from the list whether it succeeded or not; its error code (or the one of
// the write) is returned.
4532 : const auto WaitAndFinalizeOldestJob =
4533 0 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
4534 : {
4535 0 : auto poOldestJob = jobList.front().get();
4536 : {
4537 0 : std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
4538 : // coverity[missing_lock:FALSE]
4539 0 : while (!poOldestJob->bFinished)
4540 : {
4541 0 : poOldestJob->cv.wait(oGuard);
4542 : }
4543 : }
4544 0 : CPLErr l_eErr = poOldestJob->eErr;
4545 0 : if (l_eErr == CE_None)
4546 : {
4547 0 : l_eErr = WriteJobData(poOldestJob);
4548 : }
4549 :
4550 0 : jobList.pop_front();
4551 0 : return l_eErr;
4552 : };
4553 :
4554 : // Queue of jobs (only filled when a job queue is used), oldest first
4555 1246 : std::list<std::unique_ptr<OvrJob>> jobList;
4556 :
4557 623 : GByte *pabyChunkNodataMask = nullptr;
4558 623 : void *pChunk = nullptr;
4559 :
// The thread count is clamped to [1, 128]. The global thread pool, and hence
// a job queue, is only requested when more than one thread is asked for.
4560 623 : const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
4561 2492 : const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
4562 623 : ? CPLGetNumCPUs()
4563 623 : : atoi(pszThreads)));
4564 : auto poThreadPool =
4565 623 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
4566 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
4567 1246 : : std::unique_ptr<CPLJobQueue>(nullptr);
4568 :
4569 : /* -------------------------------------------------------------------- */
4570 : /* Loop over image operating on chunks. */
4571 : /* -------------------------------------------------------------------- */
4572 623 : int nChunkYOff = 0;
4573 623 : CPLErr eErr = CE_None;
4574 :
4575 1251 : for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
4576 628 : nChunkYOff += nFullResYChunk)
4577 : {
4578 628 : if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
4579 : pProgressData))
4580 : {
4581 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
4582 0 : eErr = CE_Failure;
4583 : }
4584 :
// Clip the last chunk to the raster height.
4585 628 : if (nFullResYChunk + nChunkYOff > nHeight)
4586 621 : nFullResYChunk = nHeight - nChunkYOff;
4587 :
// The window actually read is the chunk extended by nKernelRadius *
// nMaxOvrFactor lines above and below, clipped to [0, nHeight).
4588 628 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
4589 628 : int nChunkYSizeQueried =
4590 628 : nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
4591 628 : if (nChunkYOffQueried < 0)
4592 : {
4593 62 : nChunkYSizeQueried += nChunkYOffQueried;
4594 62 : nChunkYOffQueried = 0;
4595 : }
4596 628 : if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
4597 62 : nChunkYSizeQueried = nHeight - nChunkYOffQueried;
4598 :
4599 : // Avoid accumulating too many tasks and exhausting RAM
4600 : // Try to complete already finished jobs (without blocking: stop at the
// first job, in submission order, that is not finished yet)
4601 628 : while (eErr == CE_None && !jobList.empty())
4602 : {
4603 0 : auto poOldestJob = jobList.front().get();
4604 : {
4605 0 : std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
4606 0 : if (!poOldestJob->bFinished)
4607 : {
4608 0 : break;
4609 : }
4610 : }
4611 0 : eErr = poOldestJob->eErr;
4612 0 : if (eErr == CE_None)
4613 : {
4614 0 : eErr = WriteJobData(poOldestJob);
4615 : }
4616 :
4617 0 : jobList.pop_front();
4618 : }
4619 :
4620 : // And in case we have saturated the number of threads,
4621 : // wait for completion of tasks to go below the threshold.
4622 1256 : while (eErr == CE_None &&
4623 628 : jobList.size() >= static_cast<size_t>(nThreads))
4624 : {
4625 0 : eErr = WaitAndFinalizeOldestJob(jobList);
4626 : }
4627 :
4628 : // (Re)allocate buffers if needed. They are sized for the largest window
// (nMaxChunkYSizeQueried lines) and are null here on the first iteration
// and after each chunk handed over to the job queue.
4629 628 : if (pChunk == nullptr)
4630 : {
4631 623 : pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
4632 : nMaxChunkYSizeQueried, nWidth);
4633 : }
4634 628 : if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
4635 : {
4636 : pabyChunkNodataMask = static_cast<GByte *>(
4637 265 : VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
4638 : }
4639 :
4640 628 : if (pChunk == nullptr ||
4641 265 : (bUseNoDataMask && pabyChunkNodataMask == nullptr))
4642 : {
4643 0 : CPLFree(pChunk);
4644 0 : CPLFree(pabyChunkNodataMask);
4645 0 : return CE_Failure;
4646 : }
4647 :
4648 : // Read chunk (converted to the working data type), then its mask.
4649 628 : if (eErr == CE_None)
4650 628 : eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
4651 : nChunkYSizeQueried, pChunk, nWidth,
4652 : nChunkYSizeQueried, eWrkDataType, 0, 0,
4653 : nullptr);
4654 628 : if (eErr == CE_None && bUseNoDataMask)
4655 265 : eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
4656 : nChunkYSizeQueried, pabyChunkNodataMask,
4657 : nWidth, nChunkYSizeQueried, GDT_Byte, 0,
4658 : 0, nullptr);
4659 :
4660 : // Special case to promote 1bit data to 8bit 0/255 values: every sample
// equal to 1 becomes 255, other values are left unchanged.
4661 628 : if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
4662 : {
4663 9 : if (eWrkDataType == GDT_Float32)
4664 : {
4665 0 : float *pafChunk = static_cast<float *>(pChunk);
4666 0 : for (GPtrDiff_t i = 0;
4667 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4668 : i++)
4669 : {
4670 0 : if (pafChunk[i] == 1.0)
4671 0 : pafChunk[i] = 255.0;
4672 : }
4673 : }
4674 9 : else if (eWrkDataType == GDT_Byte)
4675 : {
4676 9 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
4677 168417 : for (GPtrDiff_t i = 0;
4678 168417 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4679 : i++)
4680 : {
4681 168408 : if (pabyChunk[i] == 1)
4682 127437 : pabyChunk[i] = 255;
4683 : }
4684 : }
4685 0 : else if (eWrkDataType == GDT_UInt16)
4686 : {
4687 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
4688 0 : for (GPtrDiff_t i = 0;
4689 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4690 : i++)
4691 : {
4692 0 : if (pasChunk[i] == 1)
4693 0 : pasChunk[i] = 255;
4694 : }
4695 : }
4696 0 : else if (eWrkDataType == GDT_Float64)
4697 : {
4698 0 : double *padfChunk = static_cast<double *>(pChunk);
4699 0 : for (GPtrDiff_t i = 0;
4700 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4701 : i++)
4702 : {
4703 0 : if (padfChunk[i] == 1.0)
4704 0 : padfChunk[i] = 255.0;
4705 : }
4706 : }
4707 : else
4708 : {
4709 0 : CPLAssert(false);
4710 : }
4711 : }
// Same promotion with inverted polarity: 1 becomes 0 and 0 becomes 255.
4712 619 : else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
4713 : {
4714 0 : if (eWrkDataType == GDT_Float32)
4715 : {
4716 0 : float *pafChunk = static_cast<float *>(pChunk);
4717 0 : for (GPtrDiff_t i = 0;
4718 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4719 : i++)
4720 : {
4721 0 : if (pafChunk[i] == 1.0)
4722 0 : pafChunk[i] = 0.0;
4723 0 : else if (pafChunk[i] == 0.0)
4724 0 : pafChunk[i] = 255.0;
4725 : }
4726 : }
4727 0 : else if (eWrkDataType == GDT_Byte)
4728 : {
4729 0 : GByte *pabyChunk = static_cast<GByte *>(pChunk);
4730 0 : for (GPtrDiff_t i = 0;
4731 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4732 : i++)
4733 : {
4734 0 : if (pabyChunk[i] == 1)
4735 0 : pabyChunk[i] = 0;
4736 0 : else if (pabyChunk[i] == 0)
4737 0 : pabyChunk[i] = 255;
4738 : }
4739 : }
4740 0 : else if (eWrkDataType == GDT_UInt16)
4741 : {
4742 0 : GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
4743 0 : for (GPtrDiff_t i = 0;
4744 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4745 : i++)
4746 : {
4747 0 : if (pasChunk[i] == 1)
4748 0 : pasChunk[i] = 0;
4749 0 : else if (pasChunk[i] == 0)
4750 0 : pasChunk[i] = 255;
4751 : }
4752 : }
4753 0 : else if (eWrkDataType == GDT_Float64)
4754 : {
4755 0 : double *padfChunk = static_cast<double *>(pChunk);
4756 0 : for (GPtrDiff_t i = 0;
4757 0 : i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4758 : i++)
4759 : {
4760 0 : if (padfChunk[i] == 1.0)
4761 0 : padfChunk[i] = 0.0;
4762 0 : else if (padfChunk[i] == 0.0)
4763 0 : padfChunk[i] = 255.0;
4764 : }
4765 : }
4766 : else
4767 : {
4768 0 : CPLAssert(false);
4769 : }
4770 : }
4771 :
// With a job queue, these holders take ownership of the source buffers and
// are shared by all the jobs of this chunk. Without a job queue they hold
// nullptr and the buffers are reused for the next chunk.
4772 : auto oSrcBufferHolder =
4773 1256 : std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
4774 : auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
4775 1256 : poJobQueue ? pabyChunkNodataMask : nullptr);
4776 :
4777 1330 : for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
4778 : ++iOverview)
4779 : {
4780 702 : GDALRasterBand *poDstBand = papoOvrBands[iOverview];
4781 702 : const int nDstWidth = poDstBand->GetXSize();
4782 702 : const int nDstHeight = poDstBand->GetYSize();
4783 :
4784 702 : const double dfXRatioDstToSrc =
4785 702 : static_cast<double>(nWidth) / nDstWidth;
4786 702 : const double dfYRatioDstToSrc =
4787 702 : static_cast<double>(nHeight) / nDstHeight;
4788 :
4789 : /* --------------------------------------------------------------------
4790 : */
4791 : /* Figure out the line to start writing to, and the first line
4792 : */
4793 : /* to not write to. In theory this approach should ensure that
4794 : */
4795 : /* every output line will be written if all input chunks are */
4796 : /* processed. */
4797 : /* --------------------------------------------------------------------
4798 : */
4799 702 : int nDstYOff =
4800 702 : static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
// The rounded start line is already the overview height: skip this overview
// for this chunk.
4801 702 : if (nDstYOff == nDstHeight)
4802 0 : continue;
4803 702 : int nDstYOff2 = static_cast<int>(
4804 702 : 0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
4805 :
// Last chunk: write down to the last line of the overview.
4806 702 : if (nChunkYOff + nFullResYChunk == nHeight)
4807 695 : nDstYOff2 = nDstHeight;
4808 : #if DEBUG_VERBOSE
4809 : CPLDebug("GDAL",
4810 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
4811 : nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
4812 : nDstWidth, nDstYOff2 - nDstYOff);
4813 : #endif
4814 :
// Note that the job receives the offset and height of the window actually
// read (kernel margins included), not those of the nominal chunk.
4815 1404 : auto poJob = std::unique_ptr<OvrJob>(new OvrJob());
4816 702 : poJob->pfnResampleFn = pfnResampleFn;
4817 702 : poJob->dfXRatioDstToSrc = dfXRatioDstToSrc;
4818 702 : poJob->dfYRatioDstToSrc = dfYRatioDstToSrc;
4819 702 : poJob->eWrkDataType = eWrkDataType;
4820 702 : poJob->pChunk = pChunk;
4821 702 : poJob->pabyChunkNodataMask = pabyChunkNodataMask;
4822 702 : poJob->nWidth = nWidth;
4823 702 : poJob->nHeight = nHeight;
4824 702 : poJob->nChunkYOff = nChunkYOffQueried;
4825 702 : poJob->nChunkYSize = nChunkYSizeQueried;
4826 702 : poJob->nDstWidth = nDstWidth;
4827 702 : poJob->nDstYOff = nDstYOff;
4828 702 : poJob->nDstYOff2 = nDstYOff2;
4829 702 : poJob->poDstBand = poDstBand;
4830 702 : poJob->pszResampling = pszResampling;
4831 702 : poJob->bHasNoData = bHasNoData;
4832 702 : poJob->dfNoDataValue = dfNoDataValue;
4833 702 : poJob->poColorTable = poColorTable;
4834 702 : poJob->eSrcDataType = eSrcDataType;
4835 702 : poJob->bPropagateNoData = bPropagateNoData;
4836 :
4837 702 : if (poJobQueue)
4838 : {
// Threaded mode: the job keeps the source buffers alive through the shared
// holders and is written out later, when it is removed from jobList.
4839 0 : poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
4840 0 : poJob->SetSrcBufferHolder(oSrcBufferHolder);
4841 0 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
4842 0 : jobList.emplace_back(std::move(poJob));
4843 : }
4844 : else
4845 : {
// Synchronous mode: resample and write immediately.
4846 702 : JobResampleFunc(poJob.get());
4847 702 : eErr = poJob->eErr;
4848 702 : if (eErr == CE_None)
4849 : {
4850 702 : eErr = WriteJobData(poJob.get());
4851 : }
4852 : }
4853 : }
4854 :
// The buffers now belong to the holders created for this chunk: forget them
// so that new ones are allocated for the next chunk.
4855 628 : if (poJobQueue)
4856 : {
4857 0 : pChunk = nullptr;
4858 0 : pabyChunkNodataMask = nullptr;
4859 : }
4860 : }
4861 :
4862 623 : VSIFree(pChunk);
4863 623 : VSIFree(pabyChunkNodataMask);
4864 :
4865 : // Wait for all pending jobs to complete. All of them are finalized even
// after a failure; only the first error is kept.
4866 623 : while (!jobList.empty())
4867 : {
4868 0 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
4869 0 : if (l_eErr != CE_None && eErr == CE_None)
4870 0 : eErr = l_eErr;
4871 : }
4872 :
4873 : /* -------------------------------------------------------------------- */
4874 : /* Renormalized overview mean / stddev if needed. */
4875 : /* -------------------------------------------------------------------- */
4876 623 : if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
4877 : {
4878 0 : GDALOverviewMagnitudeCorrection(
4879 : poSrcBand, nOverviewCount,
4880 : reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
4881 : GDALDummyProgress, nullptr);
4882 : }
4883 :
4884 : /* -------------------------------------------------------------------- */
4885 : /* It can be important to flush out data to overviews. */
4886 : /* -------------------------------------------------------------------- */
4887 1318 : for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
4888 : ++iOverview)
4889 : {
4890 695 : eErr = papoOvrBands[iOverview]->FlushCache(false);
4891 : }
4892 :
// Final progress notification, only emitted on success.
4893 623 : if (eErr == CE_None)
4894 623 : pfnProgress(1.0, nullptr, pProgressData);
4895 :
4896 623 : return eErr;
4897 : }
4898 :
4899 : /************************************************************************/
4900 : /* GDALRegenerateOverviewsMultiBand() */
4901 : /************************************************************************/
4902 :
4903 : /**
4904 : * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
4905 : * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
4906 : *
4907 : * This function will generate one or more overview images from a base
4908 : * image using the requested downsampling algorithm. Its primary use
4909 : * is for generating overviews via GDALDataset::BuildOverviews(), but it
4910 : * can also be used to generate downsampled images in one file from another
4911 : * outside the overview architecture.
4912 : *
4913 : * The output bands need to exist in advance and share the same characteristics
4914 : * (type, dimensions)
4915 : *
4916 : * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
4917 : * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
4918 : *
4919 : * It does not support color tables or complex data types.
4920 : *
4921 : * The pseudo-algorithm used by the function is :
4922 : * for each overview
4923 : * iterate on lines of the source by a step of deltay
4924 : * iterate on columns of the source by a step of deltax
4925 : * read the source data of size deltax * deltay for all the bands
4926 : * generate the corresponding overview block for all the bands
4927 : *
4928 : * This function will honour properly NODATA_VALUES tuples (special dataset
4929 : * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4930 : * considered as the nodata value and not each value of the triplet
4931 : * independently per band.
4932 : *
4933 : * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4934 : * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4935 : * overview computation.
4936 : *
4937 : * @param nBands the number of bands, size of papoSrcBands and size of
4938 : * first dimension of papapoOverviewBands
4939 : * @param papoSrcBands the list of source bands to downsample
4940 : * @param nOverviews the number of downsampled overview levels being generated.
4941 : * @param papapoOverviewBands two-dimensional array of bands. First dimension
4942 : * is indexed by nBands. Second dimension is indexed by
4943 : * nOverviews.
4944 : * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
4945 : * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
4946 : * @param pfnProgress progress report function.
4947 : * @param pProgressData progress function callback data.
4948 : * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
4949 : * key=value pairs, or NULL
4950 : * Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
4951 : * options can be specified to express that overviews should
4952 : * be regenerated only in the specified subset of the source
4953 : * dataset.
4954 : * @return CE_None on success or CE_Failure on failure.
4955 : */
4956 :
4957 328 : CPLErr GDALRegenerateOverviewsMultiBand(
4958 : int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
4959 : GDALRasterBand *const *const *papapoOverviewBands,
4960 : const char *pszResampling, GDALProgressFunc pfnProgress,
4961 : void *pProgressData, CSLConstList papszOptions)
4962 : {
4963 328 : CPL_IGNORE_RET_VAL(papszOptions);
4964 :
4965 328 : if (pfnProgress == nullptr)
4966 6 : pfnProgress = GDALDummyProgress;
4967 :
4968 328 : if (EQUAL(pszResampling, "NONE"))
4969 2 : return CE_None;
4970 :
4971 : // Sanity checks.
4972 326 : if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
4973 165 : !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
4974 68 : !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
4975 16 : !EQUAL(pszResampling, "CUBICSPLINE") &&
4976 15 : !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
4977 5 : !EQUAL(pszResampling, "MODE"))
4978 : {
4979 0 : CPLError(CE_Failure, CPLE_NotSupported,
4980 : "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
4981 : "not supported",
4982 : pszResampling);
4983 0 : return CE_Failure;
4984 : }
4985 :
4986 326 : int nKernelRadius = 0;
4987 : GDALResampleFunction pfnResampleFn =
4988 326 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
4989 326 : if (pfnResampleFn == nullptr)
4990 0 : return CE_Failure;
4991 :
4992 326 : const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
4993 326 : const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
4994 326 : if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
4995 0 : return CE_None;
4996 326 : GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
4997 603 : for (int iBand = 1; iBand < nBands; ++iBand)
4998 : {
4999 554 : if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
5000 277 : papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
5001 : {
5002 0 : CPLError(
5003 : CE_Failure, CPLE_NotSupported,
5004 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5005 : "have the same dimensions");
5006 0 : return CE_Failure;
5007 : }
5008 277 : if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
5009 : {
5010 0 : CPLError(
5011 : CE_Failure, CPLE_NotSupported,
5012 : "GDALRegenerateOverviewsMultiBand: all the source bands must "
5013 : "have the same data type");
5014 0 : return CE_Failure;
5015 : }
5016 : }
5017 :
5018 884 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5019 : {
5020 558 : const int nDstWidth = papapoOverviewBands[0][iOverview]->GetXSize();
5021 558 : const int nDstHeight = papapoOverviewBands[0][iOverview]->GetYSize();
5022 1101 : for (int iBand = 1; iBand < nBands; ++iBand)
5023 : {
5024 543 : if (papapoOverviewBands[iBand][iOverview]->GetXSize() !=
5025 1086 : nDstWidth ||
5026 543 : papapoOverviewBands[iBand][iOverview]->GetYSize() != nDstHeight)
5027 : {
5028 0 : CPLError(
5029 : CE_Failure, CPLE_NotSupported,
5030 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5031 : "of the same level must have the same dimensions");
5032 0 : return CE_Failure;
5033 : }
5034 543 : if (papapoOverviewBands[iBand][iOverview]->GetRasterDataType() !=
5035 : eDataType)
5036 : {
5037 0 : CPLError(
5038 : CE_Failure, CPLE_NotSupported,
5039 : "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5040 : "must have the same data type as the source bands");
5041 0 : return CE_Failure;
5042 : }
5043 : }
5044 : }
5045 :
5046 : // First pass to compute the total number of pixels to write.
5047 326 : double dfTotalPixelCount = 0;
5048 326 : const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
5049 326 : const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
5050 326 : const int nSrcXSize = atoi(CSLFetchNameValueDef(
5051 : papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
5052 326 : const int nSrcYSize = atoi(CSLFetchNameValueDef(
5053 : papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
5054 884 : for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5055 : {
5056 558 : dfTotalPixelCount +=
5057 1116 : static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
5058 558 : papapoOverviewBands[0][iOverview]->GetXSize() *
5059 1116 : static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
5060 558 : papapoOverviewBands[0][iOverview]->GetYSize();
5061 : }
5062 :
5063 : const GDALDataType eWrkDataType =
5064 326 : GDALGetOvrWorkDataType(pszResampling, eDataType);
5065 :
5066 326 : const bool bIsMask = papoSrcBands[0]->IsMaskBand();
5067 :
5068 : // If we have a nodata mask and we are doing something more complicated
5069 : // than nearest neighbouring, we have to fetch to nodata mask.
5070 : const bool bUseNoDataMask =
5071 486 : !STARTS_WITH_CI(pszResampling, "NEAR") &&
5072 160 : (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
5073 :
5074 : bool *const pabHasNoData =
5075 326 : static_cast<bool *>(VSI_MALLOC_VERBOSE(nBands * sizeof(bool)));
5076 : double *const padfNoDataValue =
5077 326 : static_cast<double *>(VSI_MALLOC_VERBOSE(nBands * sizeof(double)));
5078 326 : if (pabHasNoData == nullptr || padfNoDataValue == nullptr)
5079 : {
5080 0 : CPLFree(pabHasNoData);
5081 0 : CPLFree(padfNoDataValue);
5082 0 : return CE_Failure;
5083 : }
5084 :
5085 929 : for (int iBand = 0; iBand < nBands; ++iBand)
5086 : {
5087 603 : int nHasNoData = 0;
5088 1206 : padfNoDataValue[iBand] =
5089 603 : papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
5090 603 : pabHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
5091 : }
5092 : const bool bPropagateNoData =
5093 326 : CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5094 :
5095 326 : const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
5096 1304 : const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
5097 326 : ? CPLGetNumCPUs()
5098 326 : : atoi(pszThreads)));
5099 : auto poThreadPool =
5100 326 : nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5101 : auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5102 326 : : std::unique_ptr<CPLJobQueue>(nullptr);
5103 :
5104 : // Only configurable for debug / testing
5105 : const int nChunkMaxSize =
5106 326 : atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
5107 :
5108 : // Second pass to do the real job.
5109 326 : double dfCurPixelCount = 0;
5110 326 : CPLErr eErr = CE_None;
5111 883 : for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
5112 : ++iOverview)
5113 : {
5114 557 : int iSrcOverview = -1; // -1 means the source bands.
5115 :
5116 557 : int nDstChunkXSize = 0;
5117 557 : int nDstChunkYSize = 0;
5118 557 : papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
5119 : &nDstChunkYSize);
5120 :
5121 : const int nDstTotalWidth =
5122 557 : papapoOverviewBands[0][iOverview]->GetXSize();
5123 : const int nDstTotalHeight =
5124 557 : papapoOverviewBands[0][iOverview]->GetYSize();
5125 :
5126 : // Compute the coordinates of the target region to refresh
5127 557 : constexpr double EPS = 1e-8;
5128 557 : const int nDstXOffStart = static_cast<int>(
5129 557 : static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
5130 : EPS);
5131 : const int nDstXOffEnd =
5132 1114 : std::min(static_cast<int>(
5133 557 : std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
5134 557 : nToplevelSrcWidth * nDstTotalWidth -
5135 : EPS)),
5136 557 : nDstTotalWidth);
5137 557 : const int nDstWidth = nDstXOffEnd - nDstXOffStart;
5138 557 : const int nDstYOffStart =
5139 557 : static_cast<int>(static_cast<double>(nSrcYOff) /
5140 557 : nToplevelSrcHeight * nDstTotalHeight +
5141 : EPS);
5142 : const int nDstYOffEnd =
5143 1114 : std::min(static_cast<int>(
5144 557 : std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
5145 557 : nToplevelSrcHeight * nDstTotalHeight -
5146 : EPS)),
5147 557 : nDstTotalHeight);
5148 :
5149 : // Try to use previous level of overview as the source to compute
5150 : // the next level.
5151 557 : int nSrcWidth = nToplevelSrcWidth;
5152 557 : int nSrcHeight = nToplevelSrcHeight;
5153 788 : if (iOverview > 0 &&
5154 231 : papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
5155 : {
5156 223 : nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
5157 223 : nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
5158 223 : iSrcOverview = iOverview - 1;
5159 : }
5160 :
5161 557 : const double dfXRatioDstToSrc =
5162 557 : static_cast<double>(nSrcWidth) / nDstTotalWidth;
5163 557 : const double dfYRatioDstToSrc =
5164 557 : static_cast<double>(nSrcHeight) / nDstTotalHeight;
5165 :
5166 1114 : int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
5167 557 : static_cast<int>(0.5 + dfYRatioDstToSrc));
5168 557 : if (nOvrFactor == 0)
5169 0 : nOvrFactor = 1;
5170 :
5171 : // Try to extend the chunk size so that the memory needed to acquire
5172 : // source pixels goes up to 10 MB.
5173 : // This can help for drivers that support multi-threaded reading
5174 557 : const int nFullResYChunk =
5175 557 : 2 + static_cast<int>(nDstChunkYSize * dfYRatioDstToSrc);
5176 557 : const int nFullResYChunkQueried =
5177 557 : nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
5178 782 : while (nDstChunkXSize < nDstWidth)
5179 : {
5180 232 : const int nFullResXChunk =
5181 232 : 2 + static_cast<int>(2 * nDstChunkXSize * dfXRatioDstToSrc);
5182 :
5183 232 : const int nFullResXChunkQueried =
5184 232 : nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
5185 :
5186 464 : if (static_cast<GIntBig>(nFullResXChunkQueried) *
5187 464 : nFullResYChunkQueried * nBands *
5188 232 : GDALGetDataTypeSizeBytes(eWrkDataType) >
5189 232 : nChunkMaxSize)
5190 : {
5191 7 : break;
5192 : }
5193 :
5194 225 : nDstChunkXSize *= 2;
5195 : }
5196 557 : nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
5197 :
5198 557 : const int nFullResXChunk =
5199 557 : 2 + static_cast<int>(nDstChunkXSize * dfXRatioDstToSrc);
5200 557 : const int nFullResXChunkQueried =
5201 557 : nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
5202 :
5203 : // Structure describing a resampling job
5204 : struct OvrJob
5205 : {
5206 : // Buffers to free when job is finished
5207 : std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
5208 : std::unique_ptr<PointerHolder> oSrcBufferHolder{};
5209 : std::unique_ptr<PointerHolder> oDstBufferHolder{};
5210 :
5211 : // Input parameters of pfnResampleFn
5212 : GDALResampleFunction pfnResampleFn = nullptr;
5213 : double dfXRatioDstToSrc{};
5214 : double dfYRatioDstToSrc{};
5215 : GDALDataType eWrkDataType = GDT_Unknown;
5216 : const void *pChunk = nullptr;
5217 : const GByte *pabyChunkNodataMask = nullptr;
5218 : int nChunkXOff = 0;
5219 : int nChunkXSize = 0;
5220 : int nChunkYOff = 0;
5221 : int nChunkYSize = 0;
5222 : int nDstXOff = 0;
5223 : int nDstXOff2 = 0;
5224 : int nDstYOff = 0;
5225 : int nDstYOff2 = 0;
5226 : GDALRasterBand *poOverview = nullptr;
5227 : const char *pszResampling = nullptr;
5228 : bool bHasNoData = false;
5229 : double dfNoDataValue = 0.0;
5230 : GDALDataType eSrcDataType = GDT_Unknown;
5231 : bool bPropagateNoData = false;
5232 :
5233 : // Output values of resampling function
5234 : CPLErr eErr = CE_Failure;
5235 : void *pDstBuffer = nullptr;
5236 : GDALDataType eDstBufferDataType = GDT_Unknown;
5237 :
5238 : // Synchronization
5239 : bool bFinished = false;
5240 : std::mutex mutex{};
5241 : std::condition_variable cv{};
5242 : };
5243 :
5244 : // Thread function to resample
5245 3180 : const auto JobResampleFunc = [](void *pData)
5246 : {
5247 3180 : OvrJob *poJob = static_cast<OvrJob *>(pData);
5248 :
5249 6360 : poJob->eErr = poJob->pfnResampleFn(
5250 : poJob->dfXRatioDstToSrc, poJob->dfYRatioDstToSrc, 0.0, 0.0,
5251 : poJob->eWrkDataType, poJob->pChunk, poJob->pabyChunkNodataMask,
5252 : poJob->nChunkXOff, poJob->nChunkXSize, poJob->nChunkYOff,
5253 : poJob->nChunkYSize, poJob->nDstXOff, poJob->nDstXOff2,
5254 : poJob->nDstYOff, poJob->nDstYOff2, poJob->poOverview,
5255 : &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
5256 3180 : poJob->pszResampling, poJob->bHasNoData, poJob->dfNoDataValue,
5257 3180 : nullptr, poJob->eSrcDataType, poJob->bPropagateNoData);
5258 :
5259 3180 : poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
5260 :
5261 : {
5262 6360 : std::lock_guard<std::mutex> guard(poJob->mutex);
5263 3180 : poJob->bFinished = true;
5264 3180 : poJob->cv.notify_one();
5265 : }
5266 3180 : };
5267 :
5268 : // Function to write resample data to target band
5269 3180 : const auto WriteJobData = [](const OvrJob *poJob)
5270 : {
5271 6360 : return poJob->poOverview->RasterIO(
5272 3180 : GF_Write, poJob->nDstXOff, poJob->nDstYOff,
5273 3180 : poJob->nDstXOff2 - poJob->nDstXOff,
5274 3180 : poJob->nDstYOff2 - poJob->nDstYOff, poJob->pDstBuffer,
5275 3180 : poJob->nDstXOff2 - poJob->nDstXOff,
5276 3180 : poJob->nDstYOff2 - poJob->nDstYOff, poJob->eDstBufferDataType,
5277 3180 : 0, 0, nullptr);
5278 : };
5279 :
5280 : // Wait for completion of oldest job and serialize it
5281 : const auto WaitAndFinalizeOldestJob =
5282 16 : [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
5283 : {
5284 16 : auto poOldestJob = jobList.front().get();
5285 : {
5286 32 : std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
5287 : // coverity[missing_lock:FALSE]
5288 18 : while (!poOldestJob->bFinished)
5289 : {
5290 2 : poOldestJob->cv.wait(oGuard);
5291 : }
5292 : }
5293 16 : CPLErr l_eErr = poOldestJob->eErr;
5294 16 : if (l_eErr == CE_None)
5295 : {
5296 16 : l_eErr = WriteJobData(poOldestJob);
5297 : }
5298 :
5299 16 : jobList.pop_front();
5300 16 : return l_eErr;
5301 : };
5302 :
5303 : // Queue of jobs
5304 1114 : std::list<std::unique_ptr<OvrJob>> jobList;
5305 :
5306 1114 : std::vector<void *> apaChunk(nBands);
5307 1114 : std::vector<GByte *> apabyChunkNoDataMask(nBands);
5308 :
5309 : // Iterate on destination overview, block by block.
5310 557 : for (int nDstYOff = nDstYOffStart;
5311 1976 : nDstYOff < nDstYOffEnd && eErr == CE_None;
5312 1419 : nDstYOff += nDstChunkYSize)
5313 : {
5314 : int nDstYCount;
5315 1419 : if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
5316 1052 : nDstYCount = nDstChunkYSize;
5317 : else
5318 367 : nDstYCount = nDstYOffEnd - nDstYOff;
5319 :
5320 1419 : int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
5321 1419 : int nChunkYOff2 = static_cast<int>(
5322 1419 : ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
5323 1419 : if (nChunkYOff2 > nSrcHeight ||
5324 1419 : nDstYOff + nDstYCount == nDstTotalHeight)
5325 554 : nChunkYOff2 = nSrcHeight;
5326 1419 : int nYCount = nChunkYOff2 - nChunkYOff;
5327 1419 : CPLAssert(nYCount <= nFullResYChunk);
5328 :
5329 1419 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
5330 1419 : int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
5331 1419 : if (nChunkYOffQueried < 0)
5332 : {
5333 120 : nChunkYSizeQueried += nChunkYOffQueried;
5334 120 : nChunkYOffQueried = 0;
5335 : }
5336 1419 : if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
5337 119 : nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
5338 1419 : CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
5339 :
5340 1419 : if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount, nullptr,
5341 : pProgressData))
5342 : {
5343 1 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5344 1 : eErr = CE_Failure;
5345 : }
5346 :
5347 : // Iterate on destination overview, block by block.
5348 1419 : for (int nDstXOff = nDstXOffStart;
5349 2883 : nDstXOff < nDstXOffEnd && eErr == CE_None;
5350 1464 : nDstXOff += nDstChunkXSize)
5351 : {
5352 1464 : int nDstXCount = 0;
5353 1464 : if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
5354 1447 : nDstXCount = nDstChunkXSize;
5355 : else
5356 17 : nDstXCount = nDstXOffEnd - nDstXOff;
5357 :
5358 1464 : dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
5359 :
5360 1464 : int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
5361 1464 : int nChunkXOff2 = static_cast<int>(
5362 1464 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
5363 1464 : if (nChunkXOff2 > nSrcWidth ||
5364 1464 : nDstXOff + nDstXCount == nDstTotalWidth)
5365 1417 : nChunkXOff2 = nSrcWidth;
5366 1464 : const int nXCount = nChunkXOff2 - nChunkXOff;
5367 1464 : CPLAssert(nXCount <= nFullResXChunk);
5368 :
5369 1464 : int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
5370 1464 : int nChunkXSizeQueried =
5371 1464 : nXCount + 2 * nKernelRadius * nOvrFactor;
5372 1464 : if (nChunkXOffQueried < 0)
5373 : {
5374 172 : nChunkXSizeQueried += nChunkXOffQueried;
5375 172 : nChunkXOffQueried = 0;
5376 : }
5377 1464 : if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
5378 175 : nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
5379 1464 : CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
5380 : #if DEBUG_VERBOSE
5381 : CPLDebug("GDAL",
5382 : "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
5383 : nChunkXOffQueried, nChunkYOffQueried,
5384 : nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
5385 : nDstYOff, nDstXCount, nDstYCount);
5386 : #endif
5387 :
5388 : // Avoid accumulating too many tasks and exhaust RAM
5389 :
5390 : // Try to complete already finished jobs
5391 1464 : while (eErr == CE_None && !jobList.empty())
5392 : {
5393 2 : auto poOldestJob = jobList.front().get();
5394 : {
5395 2 : std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
5396 2 : if (!poOldestJob->bFinished)
5397 : {
5398 2 : break;
5399 : }
5400 : }
5401 0 : eErr = poOldestJob->eErr;
5402 0 : if (eErr == CE_None)
5403 : {
5404 0 : eErr = WriteJobData(poOldestJob);
5405 : }
5406 :
5407 0 : jobList.pop_front();
5408 : }
5409 :
5410 : // And in case we have saturated the number of threads,
5411 : // wait for completion of tasks to go below the threshold.
5412 2928 : while (eErr == CE_None &&
5413 1464 : jobList.size() >= static_cast<size_t>(nThreads))
5414 : {
5415 0 : eErr = WaitAndFinalizeOldestJob(jobList);
5416 : }
5417 :
5418 : // (Re)allocate buffers if needed
5419 4645 : for (int iBand = 0; iBand < nBands; ++iBand)
5420 : {
5421 3181 : if (apaChunk[iBand] == nullptr)
5422 : {
5423 1102 : apaChunk[iBand] = VSI_MALLOC3_VERBOSE(
5424 : nFullResXChunkQueried, nFullResYChunkQueried,
5425 : GDALGetDataTypeSizeBytes(eWrkDataType));
5426 1102 : if (apaChunk[iBand] == nullptr)
5427 : {
5428 0 : eErr = CE_Failure;
5429 : }
5430 : }
5431 3467 : if (bUseNoDataMask &&
5432 286 : apabyChunkNoDataMask[iBand] == nullptr)
5433 : {
5434 486 : apabyChunkNoDataMask[iBand] =
5435 243 : static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
5436 : nFullResXChunkQueried, nFullResYChunkQueried));
5437 243 : if (apabyChunkNoDataMask[iBand] == nullptr)
5438 : {
5439 0 : eErr = CE_Failure;
5440 : }
5441 : }
5442 : }
5443 :
5444 : // Read the source buffers for all the bands.
5445 4645 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
5446 : {
5447 3181 : GDALRasterBand *poSrcBand = nullptr;
5448 3181 : if (iSrcOverview == -1)
5449 2291 : poSrcBand = papoSrcBands[iBand];
5450 : else
5451 890 : poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
5452 3181 : eErr = poSrcBand->RasterIO(
5453 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
5454 3181 : nChunkXSizeQueried, nChunkYSizeQueried, apaChunk[iBand],
5455 : nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType, 0,
5456 : 0, nullptr);
5457 :
5458 3181 : if (bUseNoDataMask && eErr == CE_None)
5459 : {
5460 286 : auto poMaskBand = poSrcBand->IsMaskBand()
5461 286 : ? poSrcBand
5462 221 : : poSrcBand->GetMaskBand();
5463 286 : eErr = poMaskBand->RasterIO(
5464 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
5465 : nChunkXSizeQueried, nChunkYSizeQueried,
5466 286 : apabyChunkNoDataMask[iBand], nChunkXSizeQueried,
5467 : nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
5468 : }
5469 : }
5470 :
5471 : // Compute the resulting overview block.
5472 4644 : for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
5473 : {
5474 6360 : auto poJob = std::unique_ptr<OvrJob>(new OvrJob());
5475 3180 : poJob->pfnResampleFn = pfnResampleFn;
5476 3180 : poJob->dfXRatioDstToSrc = dfXRatioDstToSrc;
5477 3180 : poJob->dfYRatioDstToSrc = dfYRatioDstToSrc;
5478 3180 : poJob->eWrkDataType = eWrkDataType;
5479 3180 : poJob->pChunk = apaChunk[iBand];
5480 3180 : poJob->pabyChunkNodataMask = apabyChunkNoDataMask[iBand];
5481 3180 : poJob->nChunkXOff = nChunkXOffQueried;
5482 3180 : poJob->nChunkXSize = nChunkXSizeQueried;
5483 3180 : poJob->nChunkYOff = nChunkYOffQueried;
5484 3180 : poJob->nChunkYSize = nChunkYSizeQueried;
5485 3180 : poJob->nDstXOff = nDstXOff;
5486 3180 : poJob->nDstXOff2 = nDstXOff + nDstXCount;
5487 3180 : poJob->nDstYOff = nDstYOff;
5488 3180 : poJob->nDstYOff2 = nDstYOff + nDstYCount;
5489 3180 : poJob->poOverview = papapoOverviewBands[iBand][iOverview];
5490 3180 : poJob->pszResampling = pszResampling;
5491 3180 : poJob->bHasNoData = pabHasNoData[iBand];
5492 3180 : poJob->dfNoDataValue = padfNoDataValue[iBand];
5493 3180 : poJob->eSrcDataType = eDataType;
5494 3180 : poJob->bPropagateNoData = bPropagateNoData;
5495 :
5496 3180 : if (poJobQueue)
5497 : {
5498 32 : poJob->oSrcMaskBufferHolder.reset(
5499 16 : new PointerHolder(apabyChunkNoDataMask[iBand]));
5500 16 : apabyChunkNoDataMask[iBand] = nullptr;
5501 :
5502 32 : poJob->oSrcBufferHolder.reset(
5503 16 : new PointerHolder(apaChunk[iBand]));
5504 16 : apaChunk[iBand] = nullptr;
5505 :
5506 16 : poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
5507 16 : jobList.emplace_back(std::move(poJob));
5508 : }
5509 : else
5510 : {
5511 3164 : JobResampleFunc(poJob.get());
5512 3164 : eErr = poJob->eErr;
5513 3164 : if (eErr == CE_None)
5514 : {
5515 3164 : eErr = WriteJobData(poJob.get());
5516 : }
5517 : }
5518 : }
5519 : }
5520 : }
5521 :
5522 : // Wait for all pending jobs to complete
5523 573 : while (!jobList.empty())
5524 : {
5525 16 : const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5526 16 : if (l_eErr != CE_None && eErr == CE_None)
5527 0 : eErr = l_eErr;
5528 : }
5529 :
5530 : // Flush the data to overviews.
5531 1657 : for (int iBand = 0; iBand < nBands; ++iBand)
5532 : {
5533 1100 : CPLFree(apaChunk[iBand]);
5534 1100 : papapoOverviewBands[iBand][iOverview]->FlushCache(false);
5535 :
5536 1100 : CPLFree(apabyChunkNoDataMask[iBand]);
5537 : }
5538 : }
5539 :
5540 326 : CPLFree(pabHasNoData);
5541 326 : CPLFree(padfNoDataValue);
5542 :
5543 326 : if (eErr == CE_None)
5544 324 : pfnProgress(1.0, nullptr, pProgressData);
5545 :
5546 326 : return eErr;
5547 : }
5548 :
5549 : /************************************************************************/
5550 : /* GDALComputeBandStats() */
5551 : /************************************************************************/
5552 :
5553 : /** Undocumented
5554 : * @param hSrcBand undocumented.
5555 : * @param nSampleStep Step between scanlines used to compute statistics.
5556 : * When nSampleStep is equal to 1, all scanlines will
5557 : * be processed.
5558 : * @param pdfMean undocumented.
5559 : * @param pdfStdDev undocumented.
5560 : * @param pfnProgress undocumented.
5561 : * @param pProgressData undocumented.
5562 : * @return undocumented
5563 : */
5564 16 : CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
5565 : int nSampleStep, double *pdfMean,
5566 : double *pdfStdDev,
5567 : GDALProgressFunc pfnProgress,
5568 : void *pProgressData)
5569 :
5570 : {
5571 16 : VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
5572 :
5573 16 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
5574 :
5575 16 : if (pfnProgress == nullptr)
5576 16 : pfnProgress = GDALDummyProgress;
5577 :
5578 16 : const int nWidth = poSrcBand->GetXSize();
5579 16 : const int nHeight = poSrcBand->GetYSize();
5580 :
5581 16 : if (nSampleStep >= nHeight || nSampleStep < 1)
5582 3 : nSampleStep = 1;
5583 :
5584 16 : GDALDataType eWrkType = GDT_Unknown;
5585 16 : float *pafData = nullptr;
5586 16 : GDALDataType eType = poSrcBand->GetRasterDataType();
5587 16 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
5588 16 : if (bComplex)
5589 : {
5590 : pafData = static_cast<float *>(
5591 0 : VSI_MALLOC_VERBOSE(nWidth * 2 * sizeof(float)));
5592 0 : eWrkType = GDT_CFloat32;
5593 : }
5594 : else
5595 : {
5596 : pafData =
5597 16 : static_cast<float *>(VSI_MALLOC_VERBOSE(nWidth * sizeof(float)));
5598 16 : eWrkType = GDT_Float32;
5599 : }
5600 :
5601 16 : if (nWidth == 0 || pafData == nullptr)
5602 : {
5603 0 : VSIFree(pafData);
5604 0 : return CE_Failure;
5605 : }
5606 :
5607 : /* -------------------------------------------------------------------- */
5608 : /* Loop over all sample lines. */
5609 : /* -------------------------------------------------------------------- */
5610 16 : double dfSum = 0.0;
5611 16 : double dfSum2 = 0.0;
5612 16 : int iLine = 0;
5613 16 : GIntBig nSamples = 0;
5614 :
5615 2143 : do
5616 : {
5617 2159 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
5618 : pProgressData))
5619 : {
5620 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5621 0 : CPLFree(pafData);
5622 0 : return CE_Failure;
5623 : }
5624 :
5625 : const CPLErr eErr =
5626 2159 : poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
5627 : 1, eWrkType, 0, 0, nullptr);
5628 2159 : if (eErr != CE_None)
5629 : {
5630 1 : CPLFree(pafData);
5631 1 : return eErr;
5632 : }
5633 :
5634 725204 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
5635 : {
5636 723046 : float fValue = 0.0f;
5637 :
5638 723046 : if (bComplex)
5639 : {
5640 : // Compute the magnitude of the complex value.
5641 : fValue =
5642 0 : std::hypot(pafData[iPixel * 2], pafData[iPixel * 2 + 1]);
5643 : }
5644 : else
5645 : {
5646 723046 : fValue = pafData[iPixel];
5647 : }
5648 :
5649 723046 : dfSum += fValue;
5650 723046 : dfSum2 += static_cast<double>(fValue) * fValue;
5651 : }
5652 :
5653 2158 : nSamples += nWidth;
5654 2158 : iLine += nSampleStep;
5655 2158 : } while (iLine < nHeight);
5656 :
5657 15 : if (!pfnProgress(1.0, nullptr, pProgressData))
5658 : {
5659 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5660 0 : CPLFree(pafData);
5661 0 : return CE_Failure;
5662 : }
5663 :
5664 : /* -------------------------------------------------------------------- */
5665 : /* Produce the result values. */
5666 : /* -------------------------------------------------------------------- */
5667 15 : if (pdfMean != nullptr)
5668 15 : *pdfMean = dfSum / nSamples;
5669 :
5670 15 : if (pdfStdDev != nullptr)
5671 : {
5672 15 : const double dfMean = dfSum / nSamples;
5673 :
5674 15 : *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
5675 : }
5676 :
5677 15 : CPLFree(pafData);
5678 :
5679 15 : return CE_None;
5680 : }
5681 :
5682 : /************************************************************************/
5683 : /* GDALOverviewMagnitudeCorrection() */
5684 : /* */
5685 : /* Correct the mean and standard deviation of the overviews of */
5686 : /* the given band to match the base layer approximately. */
5687 : /************************************************************************/
5688 :
5689 : /** Undocumented
5690 : * @param hBaseBand undocumented.
5691 : * @param nOverviewCount undocumented.
5692 : * @param pahOverviews undocumented.
5693 : * @param pfnProgress undocumented.
5694 : * @param pProgressData undocumented.
5695 : * @return undocumented
5696 : */
5697 0 : CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
5698 : int nOverviewCount,
5699 : GDALRasterBandH *pahOverviews,
5700 : GDALProgressFunc pfnProgress,
5701 : void *pProgressData)
5702 :
5703 : {
5704 0 : VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
5705 :
5706 : /* -------------------------------------------------------------------- */
5707 : /* Compute mean/stddev for source raster. */
5708 : /* -------------------------------------------------------------------- */
5709 0 : double dfOrigMean = 0.0;
5710 0 : double dfOrigStdDev = 0.0;
5711 : {
5712 : const CPLErr eErr =
5713 0 : GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
5714 : pfnProgress, pProgressData);
5715 :
5716 0 : if (eErr != CE_None)
5717 0 : return eErr;
5718 : }
5719 :
5720 : /* -------------------------------------------------------------------- */
5721 : /* Loop on overview bands. */
5722 : /* -------------------------------------------------------------------- */
5723 0 : for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
5724 : {
5725 : GDALRasterBand *poOverview =
5726 0 : GDALRasterBand::FromHandle(pahOverviews[iOverview]);
5727 : double dfOverviewMean, dfOverviewStdDev;
5728 :
5729 : const CPLErr eErr =
5730 0 : GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
5731 : &dfOverviewStdDev, pfnProgress, pProgressData);
5732 :
5733 0 : if (eErr != CE_None)
5734 0 : return eErr;
5735 :
5736 0 : double dfGain = 1.0;
5737 0 : if (dfOrigStdDev >= 0.0001)
5738 0 : dfGain = dfOrigStdDev / dfOverviewStdDev;
5739 :
5740 : /* --------------------------------------------------------------------
5741 : */
5742 : /* Apply gain and offset. */
5743 : /* --------------------------------------------------------------------
5744 : */
5745 0 : const int nWidth = poOverview->GetXSize();
5746 0 : const int nHeight = poOverview->GetYSize();
5747 :
5748 0 : GDALDataType eWrkType = GDT_Unknown;
5749 0 : float *pafData = nullptr;
5750 0 : const GDALDataType eType = poOverview->GetRasterDataType();
5751 0 : const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
5752 0 : if (bComplex)
5753 : {
5754 : pafData = static_cast<float *>(
5755 0 : VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
5756 0 : eWrkType = GDT_CFloat32;
5757 : }
5758 : else
5759 : {
5760 : pafData = static_cast<float *>(
5761 0 : VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
5762 0 : eWrkType = GDT_Float32;
5763 : }
5764 :
5765 0 : if (pafData == nullptr)
5766 : {
5767 0 : return CE_Failure;
5768 : }
5769 :
5770 0 : for (int iLine = 0; iLine < nHeight; ++iLine)
5771 : {
5772 0 : if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
5773 : pProgressData))
5774 : {
5775 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5776 0 : CPLFree(pafData);
5777 0 : return CE_Failure;
5778 : }
5779 :
5780 0 : if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
5781 : nWidth, 1, eWrkType, 0, 0,
5782 0 : nullptr) != CE_None)
5783 : {
5784 0 : CPLFree(pafData);
5785 0 : return CE_Failure;
5786 : }
5787 :
5788 0 : for (int iPixel = 0; iPixel < nWidth; ++iPixel)
5789 : {
5790 0 : if (bComplex)
5791 : {
5792 0 : pafData[iPixel * 2] *= static_cast<float>(dfGain);
5793 0 : pafData[iPixel * 2 + 1] *= static_cast<float>(dfGain);
5794 : }
5795 : else
5796 : {
5797 0 : pafData[iPixel] = static_cast<float>(
5798 0 : (pafData[iPixel] - dfOverviewMean) * dfGain +
5799 : dfOrigMean);
5800 : }
5801 : }
5802 :
5803 0 : if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
5804 : nWidth, 1, eWrkType, 0, 0,
5805 0 : nullptr) != CE_None)
5806 : {
5807 0 : CPLFree(pafData);
5808 0 : return CE_Failure;
5809 : }
5810 : }
5811 :
5812 0 : if (!pfnProgress(1.0, nullptr, pProgressData))
5813 : {
5814 0 : CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5815 0 : CPLFree(pafData);
5816 0 : return CE_Failure;
5817 : }
5818 :
5819 0 : CPLFree(pafData);
5820 : }
5821 :
5822 0 : return CE_None;
5823 : }
|