Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: GDAL Core
4 : * Purpose: Contains default implementation of GDALRasterBand::IRasterIO()
5 : * and supporting functions of broader utility.
6 : * Author: Frank Warmerdam, warmerdam@pobox.com
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 1998, Frank Warmerdam
10 : * Copyright (c) 2007-2014, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * SPDX-License-Identifier: MIT
13 : ****************************************************************************/
14 :
15 : #include "cpl_port.h"
16 : #include "gdal.h"
17 : #include "gdal_priv.h"
18 :
19 : #include <cassert>
20 : #include <climits>
21 : #include <cmath>
22 : #include <cstddef>
23 : #include <cstdio>
24 : #include <cstdlib>
25 : #include <cstring>
26 :
27 : #include <algorithm>
28 : #include <limits>
29 : #include <stdexcept>
30 : #include <type_traits>
31 :
32 : #include "cpl_conv.h"
33 : #include "cpl_cpu_features.h"
34 : #include "cpl_error.h"
35 : #include "cpl_float.h"
36 : #include "cpl_progress.h"
37 : #include "cpl_string.h"
38 : #include "cpl_vsi.h"
39 : #include "gdal_priv_templates.hpp"
40 : #include "gdal_vrt.h"
41 : #include "gdalwarper.h"
42 : #include "memdataset.h"
43 : #include "vrtdataset.h"
44 :
45 : #if defined(__x86_64) || defined(_M_X64)
46 : #include <emmintrin.h>
47 : #include <immintrin.h>
48 : #define HAVE_SSE2
49 : // AVX2 dispatch: compile AVX2 code with target attribute, detect at runtime
50 : #if (defined(__GNUC__) || defined(__clang__)) && \
51 : defined(HAVE_AVX2_AT_COMPILE_TIME)
52 : #define HAVE_AVX2_DISPATCH
53 : #elif defined(_MSC_VER)
54 : #include <intrin.h>
55 : #define HAVE_AVX2_DISPATCH
56 : #endif
57 : #elif defined(USE_NEON_OPTIMIZATIONS)
58 : #include "include_sse2neon.h"
59 : #define HAVE_SSE2
60 : #endif
61 :
62 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
63 : #include "rasterio_ssse3.h"
64 : #ifdef __SSSE3__
65 : #include <tmmintrin.h>
66 : #endif
67 : #endif
68 :
69 : #ifdef __SSE4_1__
70 : #include <smmintrin.h>
71 : #endif
72 :
// CPL_NOINLINE: expands to the GCC-style noinline function attribute when
// __GNUC__ is defined, and to nothing for any other compiler, so it can be
// placed on a function declaration unconditionally.
73 : #ifdef __GNUC__
74 : #define CPL_NOINLINE __attribute__((noinline))
75 : #else
76 : #define CPL_NOINLINE
77 : #endif
78 :
// Forward declaration of the file-local strided byte copy helper. It is
// declared here because DownsamplingIntegerXFactor() below calls it before
// its definition appears. Arguments: source pointer and source stride,
// destination pointer and destination stride, then the number of words.
// NOTE(review): strides are presumably expressed in bytes - confirm against
// the definition, which is not visible in this part of the file.
79 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
80 : int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
81 : int nDstPixelStride, GPtrDiff_t nWordCount);
82 :
83 : /************************************************************************/
84 : /* DownsamplingIntegerXFactor() */
85 : /************************************************************************/
86 :
// Fills one output row of nBufXSize pixels by picking every nSrcXInc-th
// source pixel, starting at column iSrcX, from the cached blocks of poBand
// on block row nLBlockY.
//
// Template parameters:
//   bSameDataType  - when true, pixels are copied verbatim (memcpy, or
//                    GDALFastCopyByte() when DATA_TYPE_SIZE == 1) and the
//                    pixel size is DATA_TYPE_SIZE; when false, the pixel size
//                    is GDALGetDataTypeSizeBytes(eDataType) and pixels are
//                    converted with GDALCopyWords64().
//   DATA_TYPE_SIZE - size in bytes of one pixel; only used when
//                    bSameDataType is true.
//
// Parameters:
//   poBand        - band used to lock blocks and to query the raster width.
//   iSrcX         - first source column to read (local copy, advanced here).
//   nSrcXInc      - step between consecutive source columns.
//   iSrcOffsetCst - pixel offset added to the in-block column before scaling
//                   by the pixel size (the caller passes the offset of the
//                   source row inside the block).
//   pabyDstData   - where the first output pixel is written; advanced by
//                   nPixelSpace bytes per output pixel.
//   nPixelSpace   - byte distance between two output pixels.
//   nBufXSize     - number of output pixels to produce.
//   eDataType     - data type of the block data (conversion path only).
//   eBufType      - data type of the output buffer (conversion path only).
//   nStartBlockX  - in/out: first column of the block currently in poBlock;
//                   rewritten each time another block is loaded.
//   nBlockXSize   - block width in pixels.
//   poBlock       - in/out: currently locked block. It is unlocked and
//                   replaced whenever iSrcX leaves it. The lock on the block
//                   left in poBlock at return is not dropped here.
//   nLBlockY      - block row index passed to GetLockedBlockRef().
//
// Returns true on success, or false as soon as GetLockedBlockRef() returns
// nullptr (poBlock is then nullptr).
87 : template <bool bSameDataType, int DATA_TYPE_SIZE>
88 695850 : static bool DownsamplingIntegerXFactor(
89 : GDALRasterBand *poBand, int iSrcX, int nSrcXInc, GPtrDiff_t iSrcOffsetCst,
90 : GByte *CPL_RESTRICT pabyDstData, int nPixelSpace, int nBufXSize,
91 : GDALDataType eDataType, GDALDataType eBufType, int &nStartBlockX,
92 : int nBlockXSize, GDALRasterBlock *&poBlock, int nLBlockY)
93 : {
94 695850 : const int nBandDataSize =
95 : bSameDataType ? DATA_TYPE_SIZE : GDALGetDataTypeSizeBytes(eDataType);
// Number of output pixels still to be produced after the current one; the
// very last pixel is handled separately after the loop.
96 695850 : int nOuterLoopIters = nBufXSize - 1;
// Byte distance between two consecutive sampled source pixels.
97 695850 : const int nIncSrcOffset = nSrcXInc * nBandDataSize;
98 : const GByte *CPL_RESTRICT pabySrcData;
// One past the last column covered by the block currently in poBlock.
99 695850 : int nEndBlockX = nBlockXSize + nStartBlockX;
100 :
// The first pixel is processed by jumping straight into the loop body,
// skipping the pointer/column increments done at the top of each iteration.
// If iSrcX already lies in the current block, that block is reused.
101 695850 : if (iSrcX < nEndBlockX)
102 : {
103 295062 : CPLAssert(poBlock);
104 295062 : goto no_reload_block;
105 : }
106 400788 : goto reload_block;
107 :
108 : // Don't do the last iteration in the loop, as iSrcX might go beyond
109 : // nRasterXSize - 1
110 1265113 : while (--nOuterLoopIters >= 1)
111 : {
112 201834 : iSrcX += nSrcXInc;
113 201834 : pabySrcData += nIncSrcOffset;
114 201834 : pabyDstData += nPixelSpace;
115 :
116 : /* --------------------------------------------------------------------
117 : */
118 : /* Ensure we have the appropriate block loaded. */
119 : /* --------------------------------------------------------------------
120 : */
121 201834 : if (iSrcX >= nEndBlockX)
122 : {
123 201834 : reload_block:
124 : {
125 615212 : const int nLBlockX = iSrcX / nBlockXSize;
126 615212 : nStartBlockX = nLBlockX * nBlockXSize;
127 615212 : nEndBlockX = nStartBlockX + nBlockXSize;
128 :
// Release the previous block before locking the one containing iSrcX.
129 615212 : if (poBlock != nullptr)
130 341376 : poBlock->DropLock();
131 :
132 615212 : poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
133 615212 : if (poBlock == nullptr)
134 : {
135 1 : return false;
136 : }
137 : }
138 :
// Recompute the source pointer from the block base: column inside the block
// plus the caller-supplied pixel offset, scaled by the pixel size.
139 615211 : no_reload_block:
140 : const GByte *pabySrcBlock =
141 1265113 : static_cast<const GByte *>(poBlock->GetDataRef());
142 1265113 : GPtrDiff_t iSrcOffset =
143 1265113 : (iSrcX - nStartBlockX + iSrcOffsetCst) * nBandDataSize;
144 1265113 : pabySrcData = pabySrcBlock + iSrcOffset;
145 : }
146 :
147 : /* --------------------------------------------------------------------
148 : */
149 : /* Copy the maximum run of pixels. */
150 : /* --------------------------------------------------------------------
151 : */
152 :
// nIters = number of sampled pixels that can be taken from the current block
// (ceiling of the remaining block width divided by the step), capped by the
// number of pixels the loop is still allowed to produce. It may be <= 0, in
// which case exactly one pixel is still copied below.
153 1265113 : const int nIters = std::min(
154 1265113 : (nEndBlockX - iSrcX + (nSrcXInc - 1)) / nSrcXInc, nOuterLoopIters);
155 : if (bSameDataType)
156 : {
// Always copy the current pixel; the remaining nIters - 1 pixels of the run
// are copied only when the run is longer than one pixel.
157 1264670 : memcpy(pabyDstData, pabySrcData, nBandDataSize);
158 1264670 : if (nIters > 1)
159 : {
160 : if (DATA_TYPE_SIZE == 1)
161 : {
// Byte case: step to the second pixel, copy the remaining nIters - 1 pixels
// in a single strided call, then advance by nIters - 2 more so that both
// pointers end on the last pixel copied, as in the generic branch below.
162 326320 : pabySrcData += nIncSrcOffset;
163 326320 : pabyDstData += nPixelSpace;
164 326320 : GDALFastCopyByte(pabySrcData, nIncSrcOffset, pabyDstData,
165 326320 : nPixelSpace, nIters - 1);
166 326320 : pabySrcData +=
167 326320 : static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 2);
168 326320 : pabyDstData +=
169 326320 : static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 2);
170 : }
171 : else
172 : {
173 4395716 : for (int i = 0; i < nIters - 1; i++)
174 : {
175 4197550 : pabySrcData += nIncSrcOffset;
176 4197550 : pabyDstData += nPixelSpace;
177 4197550 : memcpy(pabyDstData, pabySrcData, nBandDataSize);
178 : }
179 : }
// Account for the extra pixels consumed by this run.
180 524490 : iSrcX += nSrcXInc * (nIters - 1);
181 524490 : nOuterLoopIters -= nIters - 1;
182 : }
183 : }
184 : else
185 : {
186 : // Type to type conversion ...
// std::max(1, nIters) guarantees the current pixel is converted even when
// nIters is 0 (last-pixel pass).
187 443 : GDALCopyWords64(pabySrcData, eDataType, nIncSrcOffset, pabyDstData,
188 443 : eBufType, nPixelSpace, std::max(1, nIters));
189 443 : if (nIters > 1)
190 : {
191 216 : pabySrcData +=
192 216 : static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 1);
193 216 : pabyDstData +=
194 216 : static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 1);
195 216 : iSrcX += nSrcXInc * (nIters - 1);
196 216 : nOuterLoopIters -= nIters - 1;
197 : }
198 : }
199 : }
200 :
// The loop exits with nOuterLoopIters == 0 exactly once, when one output
// pixel remains: its source column is clamped to the last raster column and
// the loop body is re-entered via goto to copy it. After that pass the
// counter is negative and the function returns.
201 : // Deal with last iteration to avoid iSrcX to go beyond nRasterXSize - 1
202 1063279 : if (nOuterLoopIters == 0)
203 : {
204 367430 : const int nRasterXSize = poBand->GetXSize();
// The addition is done in 64 bits so that iSrcX + nSrcXInc cannot overflow
// int before being clamped.
205 367430 : iSrcX =
206 734860 : static_cast<int>(std::min(static_cast<GInt64>(iSrcX) + nSrcXInc,
207 367430 : static_cast<GInt64>(nRasterXSize - 1)));
208 367430 : pabyDstData += nPixelSpace;
209 367430 : if (iSrcX < nEndBlockX)
210 : {
211 354840 : goto no_reload_block;
212 : }
213 12590 : goto reload_block;
214 : }
215 695849 : return true;
216 : }
217 :
// Returns a * b, with the result type deduced from the operand types.
// The function carries the CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW annotation
// (macro defined outside this file section).
// NOTE(review): presumably this exempts the multiplication from the
// unsigned-integer-overflow sanitizer - confirm against the macro definition.
218 : template <class A, class B>
219 2832400 : CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW inline auto CPLUnsanitizedMul(A a, B b)
220 : {
221 2832400 : return a * b;
222 : }
223 :
224 : /************************************************************************/
225 : /* IRasterIO() */
226 : /* */
227 : /* Default internal implementation of RasterIO() ... utilizes */
228 : /* the Block access methods to satisfy the request. This would */
229 : /* normally only be overridden by formats with overviews. */
230 : /************************************************************************/
231 :
232 6193910 : CPLErr GDALRasterBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
233 : int nXSize, int nYSize, void *pData,
234 : int nBufXSize, int nBufYSize,
235 : GDALDataType eBufType, GSpacing nPixelSpace,
236 : GSpacing nLineSpace,
237 : GDALRasterIOExtraArg *psExtraArg)
238 :
239 : {
240 6193910 : if (eRWFlag == GF_Write && eFlushBlockErr != CE_None)
241 : {
242 0 : CPLError(eFlushBlockErr, CPLE_AppDefined,
243 : "An error occurred while writing a dirty block "
244 : "from GDALRasterBand::IRasterIO");
245 0 : CPLErr eErr = eFlushBlockErr;
246 0 : eFlushBlockErr = CE_None;
247 0 : return eErr;
248 : }
249 6193910 : if (nBlockXSize <= 0 || nBlockYSize <= 0)
250 : {
251 0 : CPLError(CE_Failure, CPLE_AppDefined, "Invalid block size");
252 0 : return CE_Failure;
253 : }
254 :
255 6193910 : const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
256 6193910 : const int nBufDataSize = GDALGetDataTypeSizeBytes(eBufType);
257 6193910 : GByte dummyBlock[2] = {0, 0};
258 6193910 : GByte *pabySrcBlock =
259 : dummyBlock; /* to avoid Coverity warning about nullptr dereference */
260 6193910 : GDALRasterBlock *poBlock = nullptr;
261 6193910 : const bool bUseIntegerRequestCoords =
262 6560480 : (!psExtraArg->bFloatingPointWindowValidity ||
263 366564 : (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
264 341632 : nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
265 :
266 : /* ==================================================================== */
267 : /* A common case is the data requested with the destination */
268 : /* is packed, and the block width is the raster width. */
269 : /* ==================================================================== */
270 6100550 : if (nPixelSpace == nBufDataSize && nLineSpace == nPixelSpace * nXSize &&
271 3239600 : nBlockXSize == GetXSize() && nBufXSize == nXSize &&
272 12294500 : nBufYSize == nYSize && bUseIntegerRequestCoords)
273 : {
274 3098630 : CPLErr eErr = CE_None;
275 3098630 : int nLBlockY = -1;
276 :
277 9804610 : for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
278 : {
279 6707060 : const int iSrcY = iBufYOff + nYOff;
280 :
281 6707060 : if (iSrcY < nLBlockY * nBlockYSize ||
282 6707060 : iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
283 : {
284 3368100 : nLBlockY = iSrcY / nBlockYSize;
285 3368100 : bool bJustInitialize =
286 298126 : eRWFlag == GF_Write && nXOff == 0 &&
287 3724240 : nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
288 58015 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize;
289 :
290 : // Is this a partial tile at right and/or bottom edges of
291 : // the raster, and that is going to be completely written?
292 : // If so, do not load it from storage, but zero it so that
293 : // the content outsize of the validity area is initialized.
294 3368100 : bool bMemZeroBuffer = false;
295 298126 : if (eRWFlag == GF_Write && !bJustInitialize && nXOff == 0 &&
296 25682 : nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
297 3666320 : nYOff + nYSize == GetYSize() &&
298 90 : nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)
299 : {
300 90 : bJustInitialize = true;
301 90 : bMemZeroBuffer = true;
302 : }
303 :
304 3368100 : if (poBlock)
305 269470 : poBlock->DropLock();
306 :
307 3368100 : const GUInt32 nErrorCounter = CPLGetErrorCounter();
308 3368100 : poBlock = GetLockedBlockRef(0, nLBlockY, bJustInitialize);
309 3368100 : if (poBlock == nullptr)
310 : {
311 1079 : if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
312 : nullptr)
313 : {
314 0 : CPLError(CE_Failure, CPLE_AppDefined,
315 : "GetBlockRef failed at X block offset %d, "
316 : "Y block offset %d%s",
317 : 0, nLBlockY,
318 0 : (nErrorCounter != CPLGetErrorCounter())
319 0 : ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
320 : : "");
321 : }
322 1079 : eErr = CE_Failure;
323 1079 : break;
324 : }
325 :
326 3367020 : if (eRWFlag == GF_Write)
327 298126 : poBlock->MarkDirty();
328 :
329 3367020 : pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
330 3367020 : if (bMemZeroBuffer)
331 : {
332 90 : memset(pabySrcBlock, 0,
333 90 : static_cast<GPtrDiff_t>(nBandDataSize) *
334 90 : nBlockXSize * nBlockYSize);
335 : }
336 : }
337 :
338 6705980 : const auto nSrcByteOffset =
339 6705980 : (static_cast<GPtrDiff_t>(iSrcY - nLBlockY * nBlockYSize) *
340 6705980 : nBlockXSize +
341 6705980 : nXOff) *
342 6705980 : nBandDataSize;
343 :
344 6705980 : if (eDataType == eBufType)
345 : {
346 3041130 : if (eRWFlag == GF_Read)
347 2565910 : memcpy(static_cast<GByte *>(pData) +
348 2565910 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
349 2565910 : pabySrcBlock + nSrcByteOffset,
350 : static_cast<size_t>(nLineSpace));
351 : else
352 475223 : memcpy(pabySrcBlock + nSrcByteOffset,
353 475223 : static_cast<GByte *>(pData) +
354 475223 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
355 : static_cast<size_t>(nLineSpace));
356 : }
357 : else
358 : {
359 : // Type to type conversion.
360 3664850 : if (eRWFlag == GF_Read)
361 3642550 : GDALCopyWords64(
362 3642550 : pabySrcBlock + nSrcByteOffset, eDataType, nBandDataSize,
363 : static_cast<GByte *>(pData) +
364 3642550 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
365 : eBufType, static_cast<int>(nPixelSpace), nBufXSize);
366 : else
367 22299 : GDALCopyWords64(static_cast<GByte *>(pData) +
368 22299 : static_cast<GPtrDiff_t>(iBufYOff) *
369 : nLineSpace,
370 : eBufType, static_cast<int>(nPixelSpace),
371 22299 : pabySrcBlock + nSrcByteOffset, eDataType,
372 : nBandDataSize, nBufXSize);
373 : }
374 :
375 6794130 : if (psExtraArg->pfnProgress != nullptr &&
376 88144 : !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
377 : psExtraArg->pProgressData))
378 : {
379 5 : eErr = CE_Failure;
380 5 : break;
381 : }
382 : }
383 :
384 3098630 : if (poBlock)
385 3097550 : poBlock->DropLock();
386 :
387 3098630 : return eErr;
388 : }
389 :
390 : /* ==================================================================== */
391 : /* Do we have overviews that would be appropriate to satisfy */
392 : /* this request? */
393 : /* ==================================================================== */
394 3095280 : if ((nBufXSize < nXSize || nBufYSize < nYSize) && GetOverviewCount() > 0 &&
395 : eRWFlag == GF_Read)
396 : {
397 : GDALRasterIOExtraArg sExtraArg;
398 2967 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
399 :
400 : const int nOverview =
401 2967 : GDALBandGetBestOverviewLevel2(this, nXOff, nYOff, nXSize, nYSize,
402 : nBufXSize, nBufYSize, &sExtraArg);
403 2967 : if (nOverview >= 0)
404 : {
405 2892 : GDALRasterBand *poOverviewBand = GetOverview(nOverview);
406 2892 : if (poOverviewBand == nullptr)
407 2892 : return CE_Failure;
408 :
409 2892 : return poOverviewBand->RasterIO(
410 : eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
411 2892 : nBufYSize, eBufType, nPixelSpace, nLineSpace, &sExtraArg);
412 : }
413 : }
414 :
415 902864 : if (eRWFlag == GF_Read && nBufXSize < nXSize / 100 &&
416 6 : nBufYSize < nYSize / 100 && nPixelSpace == nBufDataSize &&
417 3995260 : nLineSpace == nPixelSpace * nBufXSize &&
418 6 : CPLTestBool(CPLGetConfigOption("GDAL_NO_COSTLY_OVERVIEW", "NO")))
419 : {
420 0 : memset(pData, 0, static_cast<size_t>(nLineSpace * nBufYSize));
421 0 : return CE_None;
422 : }
423 :
424 : /* ==================================================================== */
425 : /* The second case when we don't need subsample data but likely */
426 : /* need data type conversion. */
427 : /* ==================================================================== */
428 3092390 : if ( // nPixelSpace == nBufDataSize &&
429 3092390 : nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
430 : {
431 : #if DEBUG_VERBOSE
432 : printf("IRasterIO(%d,%d,%d,%d) rw=%d case 2\n", /*ok*/
433 : nXOff, nYOff, nXSize, nYSize, static_cast<int>(eRWFlag));
434 : #endif
435 :
436 : /* --------------------------------------------------------------------
437 : */
438 : /* Loop over buffer computing source locations. */
439 : /* --------------------------------------------------------------------
440 : */
441 : // Calculate starting values out of loop
442 2512940 : const int nLBlockXStart = nXOff / nBlockXSize;
443 2512940 : const int nXSpanEnd = nBufXSize + nXOff;
444 :
445 2512940 : int iBufYOff = 0;
446 2512940 : int iSrcY = nYOff;
447 : while (true)
448 : {
449 2553990 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
450 : static_cast<GPtrDiff_t>(nLineSpace);
451 2553990 : int nLBlockY = iSrcY / nBlockYSize;
452 2553990 : int nLBlockX = nLBlockXStart;
453 2553990 : int iSrcX = nXOff;
454 5386320 : while (iSrcX < nXSpanEnd)
455 : {
456 2832400 : int nXSpan = nLBlockX * nBlockXSize;
457 2832400 : if (nXSpan < INT_MAX - nBlockXSize)
458 2832400 : nXSpan += nBlockXSize;
459 : else
460 0 : nXSpan = INT_MAX;
461 2832400 : const int nXRight = nXSpan;
462 2832400 : nXSpan = (nXSpan < nXSpanEnd ? nXSpan : nXSpanEnd) - iSrcX;
463 :
464 : const size_t nXSpanSize =
465 2832400 : CPLUnsanitizedMul(nXSpan, static_cast<size_t>(nPixelSpace));
466 :
467 2832400 : bool bJustInitialize =
468 2043060 : eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
469 38126 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
470 4901870 : nXOff <= nLBlockX * nBlockXSize &&
471 26406 : nXOff + nXSize >= nXRight;
472 :
473 : // Is this a partial tile at right and/or bottom edges of
474 : // the raster, and that is going to be completely written?
475 : // If so, do not load it from storage, but zero it so that
476 : // the content outsize of the validity area is initialized.
477 2832400 : bool bMemZeroBuffer = false;
478 2043060 : if (eRWFlag == GF_Write && !bJustInitialize &&
479 2017910 : nXOff <= nLBlockX * nBlockXSize &&
480 2016250 : nYOff <= nLBlockY * nBlockYSize &&
481 12206 : (nXOff + nXSize >= nXRight ||
482 : // cppcheck-suppress knownConditionTrueFalse
483 4878230 : (nXOff + nXSize == GetXSize() && nXRight > GetXSize())) &&
484 12026 : (nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize ||
485 10792 : (nYOff + nYSize == GetYSize() &&
486 2000 : nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)))
487 : {
488 3234 : bJustInitialize = true;
489 3234 : bMemZeroBuffer = true;
490 : }
491 :
492 : /* --------------------------------------------------------------------
493 : */
494 : /* Ensure we have the appropriate block loaded. */
495 : /* --------------------------------------------------------------------
496 : */
497 2832400 : const GUInt32 nErrorCounter = CPLGetErrorCounter();
498 2832400 : poBlock =
499 2832400 : GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
500 2832400 : if (!poBlock)
501 : {
502 73 : if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
503 : nullptr)
504 : {
505 0 : CPLError(CE_Failure, CPLE_AppDefined,
506 : "GetBlockRef failed at X block offset %d, "
507 : "Y block offset %d%s",
508 : nLBlockX, nLBlockY,
509 0 : (nErrorCounter != CPLGetErrorCounter())
510 0 : ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
511 : : "");
512 : }
513 73 : return (CE_Failure);
514 : }
515 :
516 2832330 : if (eRWFlag == GF_Write)
517 2043060 : poBlock->MarkDirty();
518 :
519 2832330 : pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
520 2832330 : if (bMemZeroBuffer)
521 : {
522 3234 : memset(pabySrcBlock, 0,
523 3234 : static_cast<GPtrDiff_t>(nBandDataSize) *
524 3234 : nBlockXSize * nBlockYSize);
525 : }
526 : /* --------------------------------------------------------------------
527 : */
528 : /* Copy over this chunk of data. */
529 : /* --------------------------------------------------------------------
530 : */
531 2832330 : GPtrDiff_t iSrcOffset =
532 2832330 : (static_cast<GPtrDiff_t>(iSrcX) -
533 2832330 : static_cast<GPtrDiff_t>(nLBlockX * nBlockXSize) +
534 2832330 : (static_cast<GPtrDiff_t>(iSrcY) -
535 2832330 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
536 2832330 : nBlockXSize) *
537 2832330 : nBandDataSize;
538 : // Fill up as many rows as possible for the loaded block.
539 5664660 : const int kmax = std::min(nBlockYSize - (iSrcY % nBlockYSize),
540 2832330 : nBufYSize - iBufYOff);
541 61152600 : for (int k = 0; k < kmax; k++)
542 : {
543 58320300 : if (eDataType == eBufType && nPixelSpace == nBufDataSize)
544 : {
545 53916000 : if (eRWFlag == GF_Read)
546 49466200 : memcpy(static_cast<GByte *>(pData) + iBufOffset +
547 49466200 : static_cast<GPtrDiff_t>(k) * nLineSpace,
548 49466200 : pabySrcBlock + iSrcOffset, nXSpanSize);
549 : else
550 4449780 : memcpy(pabySrcBlock + iSrcOffset,
551 4449780 : static_cast<GByte *>(pData) + iBufOffset +
552 4449780 : static_cast<GPtrDiff_t>(k) * nLineSpace,
553 : nXSpanSize);
554 : }
555 : else
556 : {
557 : /* type to type conversion */
558 4404310 : if (eRWFlag == GF_Read)
559 4254100 : GDALCopyWords64(
560 4254100 : pabySrcBlock + iSrcOffset, eDataType,
561 : nBandDataSize,
562 4254100 : static_cast<GByte *>(pData) + iBufOffset +
563 4254100 : static_cast<GPtrDiff_t>(k) * nLineSpace,
564 : eBufType, static_cast<int>(nPixelSpace),
565 : nXSpan);
566 : else
567 150209 : GDALCopyWords64(
568 150209 : static_cast<GByte *>(pData) + iBufOffset +
569 150209 : static_cast<GPtrDiff_t>(k) * nLineSpace,
570 : eBufType, static_cast<int>(nPixelSpace),
571 150209 : pabySrcBlock + iSrcOffset, eDataType,
572 : nBandDataSize, nXSpan);
573 : }
574 :
575 58320300 : iSrcOffset +=
576 58320300 : static_cast<GPtrDiff_t>(nBlockXSize) * nBandDataSize;
577 : }
578 :
579 : iBufOffset =
580 2832330 : CPLUnsanitizedAdd<GPtrDiff_t>(iBufOffset, nXSpanSize);
581 2832330 : nLBlockX++;
582 2832330 : iSrcX += nXSpan;
583 :
584 2832330 : poBlock->DropLock();
585 2832330 : poBlock = nullptr;
586 : }
587 :
588 : /* Compute the increment to go on a block boundary */
589 2553920 : const int nYInc = nBlockYSize - (iSrcY % nBlockYSize);
590 :
591 2555800 : if (psExtraArg->pfnProgress != nullptr &&
592 1889 : !psExtraArg->pfnProgress(
593 2555800 : 1.0 * std::min(nBufYSize, iBufYOff + nYInc) / nBufYSize, "",
594 : psExtraArg->pProgressData))
595 : {
596 0 : return CE_Failure;
597 : }
598 :
599 2553920 : iBufYOff += nYInc;
600 2553920 : if (iBufYOff >= nBufYSize)
601 2512860 : break;
602 : // Only increment iSrcY after above loop end check, to avoid
603 : // potential int overflow.
604 41052 : iSrcY += nYInc;
605 41052 : }
606 :
607 2512860 : return CE_None;
608 : }
609 :
610 : /* ==================================================================== */
611 : /* Loop reading required source blocks to satisfy output */
612 : /* request. This is the most general implementation. */
613 : /* ==================================================================== */
614 :
615 579452 : double dfXOff = nXOff;
616 579452 : double dfYOff = nYOff;
617 579452 : double dfXSize = nXSize;
618 579452 : double dfYSize = nYSize;
619 579452 : if (psExtraArg->bFloatingPointWindowValidity)
620 : {
621 244495 : dfXOff = psExtraArg->dfXOff;
622 244495 : dfYOff = psExtraArg->dfYOff;
623 244495 : dfXSize = psExtraArg->dfXSize;
624 244495 : dfYSize = psExtraArg->dfYSize;
625 : }
626 :
627 : /* -------------------------------------------------------------------- */
628 : /* Compute stepping increment. */
629 : /* -------------------------------------------------------------------- */
630 579452 : const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
631 579452 : const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
632 579452 : CPLErr eErr = CE_None;
633 :
634 579452 : if (eRWFlag == GF_Write)
635 : {
636 : /* --------------------------------------------------------------------
637 : */
638 : /* Write case */
639 : /* Loop over raster window computing source locations in the buffer.
640 : */
641 : /* --------------------------------------------------------------------
642 : */
643 166655 : GByte *pabyDstBlock = nullptr;
644 166655 : int nLBlockX = -1;
645 166655 : int nLBlockY = -1;
646 :
647 1260010 : for (int iDstY = nYOff; iDstY < nYOff + nYSize; iDstY++)
648 : {
649 1093360 : const int iBufYOff = static_cast<int>((iDstY - nYOff) / dfSrcYInc);
650 :
651 12384200 : for (int iDstX = nXOff; iDstX < nXOff + nXSize; iDstX++)
652 : {
653 11290800 : const int iBufXOff =
654 11290800 : static_cast<int>((iDstX - nXOff) / dfSrcXInc);
655 11290800 : GPtrDiff_t iBufOffset =
656 11290800 : static_cast<GPtrDiff_t>(iBufYOff) *
657 : static_cast<GPtrDiff_t>(nLineSpace) +
658 11290800 : iBufXOff * static_cast<GPtrDiff_t>(nPixelSpace);
659 :
660 : // FIXME: this code likely doesn't work if the dirty block gets
661 : // flushed to disk before being completely written.
662 : // In the meantime, bJustInitialize should probably be set to
663 : // FALSE even if it is not ideal performance wise, and for
664 : // lossy compression.
665 :
666 : /* --------------------------------------------------------------------
667 : */
668 : /* Ensure we have the appropriate block loaded. */
669 : /* --------------------------------------------------------------------
670 : */
671 11290800 : if (iDstX < nLBlockX * nBlockXSize ||
672 11041500 : iDstX - nBlockXSize >= nLBlockX * nBlockXSize ||
673 10584800 : iDstY < nLBlockY * nBlockYSize ||
674 10584800 : iDstY - nBlockYSize >= nLBlockY * nBlockYSize)
675 : {
676 738702 : nLBlockX = iDstX / nBlockXSize;
677 738702 : nLBlockY = iDstY / nBlockYSize;
678 :
679 738702 : const bool bJustInitialize =
680 1065990 : nYOff <= nLBlockY * nBlockYSize &&
681 327291 : nYOff + nYSize - nBlockYSize >=
682 327291 : nLBlockY * nBlockYSize &&
683 1116320 : nXOff <= nLBlockX * nBlockXSize &&
684 50325 : nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
685 : /*bool bMemZeroBuffer = FALSE;
686 : if( !bJustInitialize &&
687 : nXOff <= nLBlockX * nBlockXSize &&
688 : nYOff <= nLBlockY * nBlockYSize &&
689 : (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
690 : (nXOff + nXSize == GetXSize() &&
691 : (nLBlockX+1) * nBlockXSize > GetXSize())) &&
692 : (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
693 : (nYOff + nYSize == GetYSize() &&
694 : (nLBlockY+1) * nBlockYSize > GetYSize())) )
695 : {
696 : bJustInitialize = TRUE;
697 : bMemZeroBuffer = TRUE;
698 : }*/
699 738702 : if (poBlock != nullptr)
700 572047 : poBlock->DropLock();
701 :
702 738702 : poBlock =
703 738702 : GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
704 738702 : if (poBlock == nullptr)
705 : {
706 0 : return (CE_Failure);
707 : }
708 :
709 738702 : poBlock->MarkDirty();
710 :
711 738702 : pabyDstBlock = static_cast<GByte *>(poBlock->GetDataRef());
712 : /*if( bMemZeroBuffer )
713 : {
714 : memset(pabyDstBlock, 0,
715 : static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
716 : * nBlockYSize);
717 : }*/
718 : }
719 :
720 : // To make Coverity happy. Should not happen by design.
721 11290800 : if (pabyDstBlock == nullptr)
722 : {
723 0 : CPLAssert(false);
724 : eErr = CE_Failure;
725 : break;
726 : }
727 :
728 : /* --------------------------------------------------------------------
729 : */
730 : /* Copy over this pixel of data. */
731 : /* --------------------------------------------------------------------
732 : */
733 11290800 : GPtrDiff_t iDstOffset =
734 11290800 : (static_cast<GPtrDiff_t>(iDstX) -
735 11290800 : static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
736 11290800 : (static_cast<GPtrDiff_t>(iDstY) -
737 11290800 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
738 11290800 : nBlockXSize) *
739 11290800 : nBandDataSize;
740 :
741 11290800 : if (eDataType == eBufType)
742 : {
743 11287700 : memcpy(pabyDstBlock + iDstOffset,
744 11287700 : static_cast<GByte *>(pData) + iBufOffset,
745 : nBandDataSize);
746 : }
747 : else
748 : {
749 : /* type to type conversion ... ouch, this is expensive way
750 : of handling single words */
751 3096 : GDALCopyWords64(static_cast<GByte *>(pData) + iBufOffset,
752 3096 : eBufType, 0, pabyDstBlock + iDstOffset,
753 : eDataType, 0, 1);
754 : }
755 : }
756 :
757 1093360 : if (psExtraArg->pfnProgress != nullptr &&
758 0 : !psExtraArg->pfnProgress(1.0 * (iDstY - nYOff + 1) / nYSize, "",
759 : psExtraArg->pProgressData))
760 : {
761 0 : eErr = CE_Failure;
762 0 : break;
763 : }
764 : }
765 : }
766 : else
767 : {
768 412797 : if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
769 : {
770 46692 : if ((psExtraArg->eResampleAlg == GRIORA_Cubic ||
771 15098 : psExtraArg->eResampleAlg == GRIORA_CubicSpline ||
772 15045 : psExtraArg->eResampleAlg == GRIORA_Bilinear ||
773 31641 : psExtraArg->eResampleAlg == GRIORA_Lanczos) &&
774 4763 : GetColorTable() != nullptr)
775 : {
776 0 : CPLError(CE_Warning, CPLE_NotSupported,
777 : "Resampling method not supported on paletted band. "
778 : "Falling back to nearest neighbour");
779 : }
780 15800 : else if (psExtraArg->eResampleAlg == GRIORA_Gauss &&
781 3 : GDALDataTypeIsComplex(eDataType))
782 : {
783 0 : CPLError(CE_Warning, CPLE_NotSupported,
784 : "Resampling method not supported on complex data type "
785 : "band. Falling back to nearest neighbour");
786 : }
787 : else
788 : {
789 15797 : return RasterIOResampled(eRWFlag, nXOff, nYOff, nXSize, nYSize,
790 : pData, nBufXSize, nBufYSize, eBufType,
791 15797 : nPixelSpace, nLineSpace, psExtraArg);
792 : }
793 : }
794 :
795 397000 : int nLimitBlockY = 0;
796 397000 : const bool bByteCopy = eDataType == eBufType && nBandDataSize == 1;
797 397000 : int nStartBlockX = -nBlockXSize;
798 397000 : constexpr double EPS = 1e-10;
799 397000 : int nLBlockY = -1;
800 397000 : const double dfSrcXStart = 0.5 * dfSrcXInc + dfXOff + EPS;
801 397000 : const bool bIntegerXFactor =
802 372767 : bUseIntegerRequestCoords &&
803 670836 : static_cast<int>(dfSrcXInc) == dfSrcXInc &&
804 273836 : static_cast<int>(dfSrcXInc) < INT_MAX / nBandDataSize;
805 :
806 : /* --------------------------------------------------------------------
807 : */
808 : /* Read case */
809 : /* Loop over buffer computing source locations. */
810 : /* --------------------------------------------------------------------
811 : */
812 2367100 : for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
813 : {
814 : // Add small epsilon to avoid some numeric precision issues.
815 1970110 : const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
816 1970110 : const int iSrcY = static_cast<int>(std::min(
817 1970110 : std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
818 :
819 1970110 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
820 : static_cast<GPtrDiff_t>(nLineSpace);
821 :
822 1970110 : if (iSrcY >= nLimitBlockY)
823 : {
824 438018 : nLBlockY = iSrcY / nBlockYSize;
825 438018 : nLimitBlockY = nLBlockY * nBlockYSize;
826 438018 : if (nLimitBlockY < INT_MAX - nBlockYSize)
827 438018 : nLimitBlockY += nBlockYSize;
828 : else
829 0 : nLimitBlockY = INT_MAX;
830 : // Make sure a new block is loaded.
831 438018 : nStartBlockX = -nBlockXSize;
832 : }
833 1532090 : else if (static_cast<int>(dfSrcXStart) < nStartBlockX)
834 : {
835 : // Make sure a new block is loaded.
836 437363 : nStartBlockX = -nBlockXSize;
837 : }
838 :
839 1970110 : GPtrDiff_t iSrcOffsetCst = (iSrcY - nLBlockY * nBlockYSize) *
840 1970110 : static_cast<GPtrDiff_t>(nBlockXSize);
841 :
842 1970110 : if (bIntegerXFactor)
843 : {
844 695850 : int iSrcX = static_cast<int>(dfSrcXStart);
845 695850 : const int nSrcXInc = static_cast<int>(dfSrcXInc);
846 695850 : GByte *pabyDstData = static_cast<GByte *>(pData) + iBufOffset;
847 695850 : bool bRet = false;
848 695850 : if (bByteCopy)
849 : {
850 585842 : bRet = DownsamplingIntegerXFactor<true, 1>(
851 : this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
852 : static_cast<int>(nPixelSpace), nBufXSize, GDT_UInt8,
853 : GDT_UInt8, nStartBlockX, nBlockXSize, poBlock,
854 : nLBlockY);
855 : }
856 110008 : else if (eDataType == eBufType)
857 : {
858 109783 : switch (nBandDataSize)
859 : {
860 109630 : case 2:
861 109630 : bRet = DownsamplingIntegerXFactor<true, 2>(
862 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
863 : pabyDstData, static_cast<int>(nPixelSpace),
864 : nBufXSize, eDataType, eDataType, nStartBlockX,
865 : nBlockXSize, poBlock, nLBlockY);
866 109630 : break;
867 55 : case 4:
868 55 : bRet = DownsamplingIntegerXFactor<true, 4>(
869 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
870 : pabyDstData, static_cast<int>(nPixelSpace),
871 : nBufXSize, eDataType, eDataType, nStartBlockX,
872 : nBlockXSize, poBlock, nLBlockY);
873 55 : break;
874 96 : case 8:
875 96 : bRet = DownsamplingIntegerXFactor<true, 8>(
876 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
877 : pabyDstData, static_cast<int>(nPixelSpace),
878 : nBufXSize, eDataType, eDataType, nStartBlockX,
879 : nBlockXSize, poBlock, nLBlockY);
880 96 : break;
881 2 : case 16:
882 2 : bRet = DownsamplingIntegerXFactor<true, 16>(
883 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
884 : pabyDstData, static_cast<int>(nPixelSpace),
885 : nBufXSize, eDataType, eDataType, nStartBlockX,
886 : nBlockXSize, poBlock, nLBlockY);
887 2 : break;
888 0 : default:
889 0 : CPLAssert(false);
890 : break;
891 : }
892 : }
893 : else
894 : {
895 225 : bRet = DownsamplingIntegerXFactor<false, 0>(
896 : this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
897 : static_cast<int>(nPixelSpace), nBufXSize, eDataType,
898 : eBufType, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
899 : }
900 695850 : if (!bRet)
901 1 : eErr = CE_Failure;
902 : }
903 : else
904 : {
905 1274260 : double dfSrcX = dfSrcXStart;
906 503811000 : for (int iBufXOff = 0; iBufXOff < nBufXSize;
907 502537000 : iBufXOff++, dfSrcX += dfSrcXInc)
908 : {
909 : // TODO?: try to avoid the clamping for most iterations
910 : const int iSrcX = static_cast<int>(
911 1005070000 : std::min(std::max(0.0, dfSrcX),
912 502537000 : static_cast<double>(nRasterXSize - 1)));
913 :
914 : /* --------------------------------------------------------------------
915 : */
916 : /* Ensure we have the appropriate block loaded. */
917 : /* --------------------------------------------------------------------
918 : */
919 502537000 : if (iSrcX >= nBlockXSize + nStartBlockX)
920 : {
921 1697820 : const int nLBlockX = iSrcX / nBlockXSize;
922 1697820 : nStartBlockX = nLBlockX * nBlockXSize;
923 :
924 1697820 : if (poBlock != nullptr)
925 1574650 : poBlock->DropLock();
926 :
927 1697820 : poBlock = GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
928 1697820 : if (poBlock == nullptr)
929 : {
930 9 : eErr = CE_Failure;
931 9 : break;
932 : }
933 :
934 : pabySrcBlock =
935 1697810 : static_cast<GByte *>(poBlock->GetDataRef());
936 : }
937 502537000 : const GPtrDiff_t nDiffX =
938 502537000 : static_cast<GPtrDiff_t>(iSrcX - nStartBlockX);
939 :
940 : /* --------------------------------------------------------------------
941 : */
942 : /* Copy over this pixel of data. */
943 : /* --------------------------------------------------------------------
944 : */
945 :
946 502537000 : if (bByteCopy)
947 : {
948 442592000 : GPtrDiff_t iSrcOffset = nDiffX + iSrcOffsetCst;
949 442592000 : static_cast<GByte *>(pData)[iBufOffset] =
950 442592000 : pabySrcBlock[iSrcOffset];
951 : }
952 59944700 : else if (eDataType == eBufType)
953 : {
954 50322800 : GPtrDiff_t iSrcOffset =
955 50322800 : (nDiffX + iSrcOffsetCst) * nBandDataSize;
956 50322800 : memcpy(static_cast<GByte *>(pData) + iBufOffset,
957 50322800 : pabySrcBlock + iSrcOffset, nBandDataSize);
958 : }
959 : else
960 : {
961 : // Type to type conversion ...
962 9621890 : GPtrDiff_t iSrcOffset =
963 9621890 : (nDiffX + iSrcOffsetCst) * nBandDataSize;
964 9621890 : GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
965 : static_cast<GByte *>(pData) +
966 9621890 : iBufOffset,
967 : eBufType, 0, 1);
968 : }
969 :
970 502537000 : iBufOffset += static_cast<int>(nPixelSpace);
971 : }
972 : }
973 1970110 : if (eErr == CE_Failure)
974 11 : break;
975 :
976 2191530 : if (psExtraArg->pfnProgress != nullptr &&
977 221434 : !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
978 : psExtraArg->pProgressData))
979 : {
980 1 : eErr = CE_Failure;
981 1 : break;
982 : }
983 : }
984 : }
985 :
986 563655 : if (poBlock != nullptr)
987 563645 : poBlock->DropLock();
988 :
989 563655 : return eErr;
990 : }
991 :
992 : /************************************************************************/
993 : /* GDALRasterIOTransformer() */
994 : /************************************************************************/
995 :
/** Parameters of the affine mapping used by GDALRasterIOTransformer().
 *
 * The forward ("destination to source") mapping is
 *   src = dst * ratio + offset
 * applied independently on the X and Y axes.
 *
 * Members default to the identity mapping so that a default-constructed
 * instance never carries indeterminate values (and never a zero ratio, which
 * the inverse mapping divides by).
 */
struct GDALRasterIOTransformerStruct
{
    //! Offset added to X after scaling (destination to source direction).
    double dfXOff = 0.0;
    //! Offset added to Y after scaling (destination to source direction).
    double dfYOff = 0.0;
    //! Multiplicative factor applied to destination X to get source X.
    double dfXRatioDstToSrc = 1.0;
    //! Multiplicative factor applied to destination Y to get source Y.
    double dfYRatioDstToSrc = 1.0;
};
1003 :
1004 6897 : static int GDALRasterIOTransformer(void *pTransformerArg, int bDstToSrc,
1005 : int nPointCount, double *x, double *y,
1006 : double * /* z */, int *panSuccess)
1007 : {
1008 6897 : GDALRasterIOTransformerStruct *psParams =
1009 : static_cast<GDALRasterIOTransformerStruct *>(pTransformerArg);
1010 6897 : if (bDstToSrc)
1011 : {
1012 311993 : for (int i = 0; i < nPointCount; i++)
1013 : {
1014 305684 : x[i] = x[i] * psParams->dfXRatioDstToSrc + psParams->dfXOff;
1015 305684 : y[i] = y[i] * psParams->dfYRatioDstToSrc + psParams->dfYOff;
1016 305684 : panSuccess[i] = TRUE;
1017 : }
1018 : }
1019 : else
1020 : {
1021 1176 : for (int i = 0; i < nPointCount; i++)
1022 : {
1023 588 : x[i] = (x[i] - psParams->dfXOff) / psParams->dfXRatioDstToSrc;
1024 588 : y[i] = (y[i] - psParams->dfYOff) / psParams->dfYRatioDstToSrc;
1025 588 : panSuccess[i] = TRUE;
1026 : }
1027 : }
1028 6897 : return TRUE;
1029 : }
1030 :
1031 : /************************************************************************/
1032 : /* RasterIOResampled() */
1033 : /************************************************************************/
1034 :
1035 : //! @cond Doxygen_Suppress
1036 15797 : CPLErr GDALRasterBand::RasterIOResampled(
1037 : GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
1038 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
1039 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
1040 : {
1041 : // Determine if we use warping resampling or overview resampling
1042 : const bool bUseWarp =
1043 15797 : (GDALDataTypeIsComplex(eDataType) &&
1044 15956 : psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
1045 159 : psExtraArg->eResampleAlg != GRIORA_Mode);
1046 :
1047 15797 : double dfXOff = nXOff;
1048 15797 : double dfYOff = nYOff;
1049 15797 : double dfXSize = nXSize;
1050 15797 : double dfYSize = nYSize;
1051 15797 : if (psExtraArg->bFloatingPointWindowValidity)
1052 : {
1053 15051 : dfXOff = psExtraArg->dfXOff;
1054 15051 : dfYOff = psExtraArg->dfYOff;
1055 15051 : dfXSize = psExtraArg->dfXSize;
1056 15051 : dfYSize = psExtraArg->dfYSize;
1057 : }
1058 :
1059 15797 : const double dfXRatioDstToSrc = dfXSize / nBufXSize;
1060 15797 : const double dfYRatioDstToSrc = dfYSize / nBufYSize;
1061 :
1062 : // Determine the coordinates in the "virtual" output raster to see
1063 : // if there are not integers, in which case we will use them as a shift
1064 : // so that subwindow extracts give the exact same results as entire raster
1065 : // scaling.
1066 15797 : double dfDestXOff = dfXOff / dfXRatioDstToSrc;
1067 15797 : bool bHasXOffVirtual = false;
1068 15797 : int nDestXOffVirtual = 0;
1069 15797 : if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
1070 : {
1071 15469 : bHasXOffVirtual = true;
1072 15469 : dfXOff = nXOff;
1073 15469 : nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
1074 : }
1075 :
1076 15797 : double dfDestYOff = dfYOff / dfYRatioDstToSrc;
1077 15797 : bool bHasYOffVirtual = false;
1078 15797 : int nDestYOffVirtual = 0;
1079 15797 : if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
1080 : {
1081 15465 : bHasYOffVirtual = true;
1082 15465 : dfYOff = nYOff;
1083 15465 : nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
1084 : }
1085 :
1086 : // Create a MEM dataset that wraps the output buffer.
1087 : GDALDataset *poMEMDS;
1088 15797 : void *pTempBuffer = nullptr;
1089 15797 : GSpacing nPSMem = nPixelSpace;
1090 15797 : GSpacing nLSMem = nLineSpace;
1091 15797 : void *pDataMem = pData;
1092 15797 : GDALDataType eDTMem = eBufType;
1093 15797 : if (eBufType != eDataType && !GDAL_GET_OPERATE_IN_BUF_TYPE(*psExtraArg))
1094 : {
1095 4 : nPSMem = GDALGetDataTypeSizeBytes(eDataType);
1096 4 : nLSMem = nPSMem * nBufXSize;
1097 : pTempBuffer =
1098 4 : VSI_MALLOC2_VERBOSE(nBufYSize, static_cast<size_t>(nLSMem));
1099 4 : if (pTempBuffer == nullptr)
1100 0 : return CE_Failure;
1101 4 : pDataMem = pTempBuffer;
1102 4 : eDTMem = eDataType;
1103 : }
1104 :
1105 : poMEMDS =
1106 15797 : MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
1107 : nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr);
1108 15797 : GByte *pabyData = static_cast<GByte *>(pDataMem) -
1109 15797 : nPSMem * nDestXOffVirtual - nLSMem * nDestYOffVirtual;
1110 15797 : GDALRasterBandH hMEMBand = MEMCreateRasterBandEx(
1111 : poMEMDS, 1, pabyData, eDTMem, nPSMem, nLSMem, false);
1112 15797 : poMEMDS->SetBand(1, GDALRasterBand::FromHandle(hMEMBand));
1113 :
1114 15797 : const char *pszNBITS = GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
1115 15797 : const int nNBITS = pszNBITS ? atoi(pszNBITS) : 0;
1116 15797 : if (pszNBITS)
1117 6 : GDALRasterBand::FromHandle(hMEMBand)->SetMetadataItem(
1118 6 : "NBITS", pszNBITS, "IMAGE_STRUCTURE");
1119 :
1120 15797 : CPLErr eErr = CE_None;
1121 :
1122 : // Do the resampling.
1123 15797 : if (bUseWarp)
1124 : {
1125 149 : int bHasNoData = FALSE;
1126 149 : double dfNoDataValue = GetNoDataValue(&bHasNoData);
1127 :
1128 149 : VRTDatasetH hVRTDS = nullptr;
1129 149 : GDALRasterBandH hVRTBand = nullptr;
1130 149 : if (GetDataset() == nullptr)
1131 : {
1132 : /* Create VRT dataset that wraps the whole dataset */
1133 0 : hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
1134 0 : VRTAddBand(hVRTDS, eDataType, nullptr);
1135 0 : hVRTBand = GDALGetRasterBand(hVRTDS, 1);
1136 0 : VRTAddSimpleSource(hVRTBand, this, 0, 0, nRasterXSize, nRasterYSize,
1137 : 0, 0, nRasterXSize, nRasterYSize, nullptr,
1138 : VRT_NODATA_UNSET);
1139 :
1140 : /* Add a mask band if needed */
1141 0 : if (GetMaskFlags() != GMF_ALL_VALID)
1142 : {
1143 0 : GDALDataset::FromHandle(hVRTDS)->CreateMaskBand(0);
1144 : VRTSourcedRasterBand *poVRTMaskBand =
1145 : reinterpret_cast<VRTSourcedRasterBand *>(
1146 : reinterpret_cast<GDALRasterBand *>(hVRTBand)
1147 0 : ->GetMaskBand());
1148 0 : poVRTMaskBand->AddMaskBandSource(this, 0, 0, nRasterXSize,
1149 0 : nRasterYSize, 0, 0,
1150 0 : nRasterXSize, nRasterYSize);
1151 : }
1152 : }
1153 :
1154 149 : GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
1155 149 : switch (psExtraArg->eResampleAlg)
1156 : {
1157 0 : case GRIORA_NearestNeighbour:
1158 0 : psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
1159 0 : break;
1160 147 : case GRIORA_Bilinear:
1161 147 : psWarpOptions->eResampleAlg = GRA_Bilinear;
1162 147 : break;
1163 0 : case GRIORA_Cubic:
1164 0 : psWarpOptions->eResampleAlg = GRA_Cubic;
1165 0 : break;
1166 0 : case GRIORA_CubicSpline:
1167 0 : psWarpOptions->eResampleAlg = GRA_CubicSpline;
1168 0 : break;
1169 0 : case GRIORA_Lanczos:
1170 0 : psWarpOptions->eResampleAlg = GRA_Lanczos;
1171 0 : break;
1172 0 : case GRIORA_Average:
1173 0 : psWarpOptions->eResampleAlg = GRA_Average;
1174 0 : break;
1175 2 : case GRIORA_RMS:
1176 2 : psWarpOptions->eResampleAlg = GRA_RMS;
1177 2 : break;
1178 0 : case GRIORA_Mode:
1179 0 : psWarpOptions->eResampleAlg = GRA_Mode;
1180 0 : break;
1181 0 : default:
1182 0 : CPLAssert(false);
1183 : psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
1184 : break;
1185 : }
1186 149 : psWarpOptions->hSrcDS = hVRTDS ? hVRTDS : GetDataset();
1187 149 : psWarpOptions->hDstDS = poMEMDS;
1188 149 : psWarpOptions->nBandCount = 1;
1189 149 : int nSrcBandNumber = hVRTDS ? 1 : nBand;
1190 149 : int nDstBandNumber = 1;
1191 149 : psWarpOptions->panSrcBands = &nSrcBandNumber;
1192 149 : psWarpOptions->panDstBands = &nDstBandNumber;
1193 298 : psWarpOptions->pfnProgress = psExtraArg->pfnProgress
1194 149 : ? psExtraArg->pfnProgress
1195 : : GDALDummyProgress;
1196 149 : psWarpOptions->pProgressArg = psExtraArg->pProgressData;
1197 149 : psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
1198 149 : if (bHasNoData)
1199 : {
1200 0 : psWarpOptions->papszWarpOptions = CSLSetNameValue(
1201 : psWarpOptions->papszWarpOptions, "INIT_DEST", "NO_DATA");
1202 0 : if (psWarpOptions->padfSrcNoDataReal == nullptr)
1203 : {
1204 0 : psWarpOptions->padfSrcNoDataReal =
1205 0 : static_cast<double *>(CPLMalloc(sizeof(double)));
1206 0 : psWarpOptions->padfSrcNoDataReal[0] = dfNoDataValue;
1207 : }
1208 :
1209 0 : if (psWarpOptions->padfDstNoDataReal == nullptr)
1210 : {
1211 0 : psWarpOptions->padfDstNoDataReal =
1212 0 : static_cast<double *>(CPLMalloc(sizeof(double)));
1213 0 : psWarpOptions->padfDstNoDataReal[0] = dfNoDataValue;
1214 : }
1215 : }
1216 :
1217 : GDALRasterIOTransformerStruct sTransformer;
1218 149 : sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
1219 149 : sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
1220 149 : sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
1221 149 : sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
1222 149 : psWarpOptions->pTransformerArg = &sTransformer;
1223 :
1224 : GDALWarpOperationH hWarpOperation =
1225 149 : GDALCreateWarpOperation(psWarpOptions);
1226 149 : eErr = GDALChunkAndWarpImage(hWarpOperation, nDestXOffVirtual,
1227 : nDestYOffVirtual, nBufXSize, nBufYSize);
1228 149 : GDALDestroyWarpOperation(hWarpOperation);
1229 :
1230 149 : psWarpOptions->panSrcBands = nullptr;
1231 149 : psWarpOptions->panDstBands = nullptr;
1232 149 : GDALDestroyWarpOptions(psWarpOptions);
1233 :
1234 149 : if (hVRTDS)
1235 0 : GDALClose(hVRTDS);
1236 : }
1237 : else
1238 : {
1239 : const char *pszResampling =
1240 15648 : GDALRasterIOGetResampleAlg(psExtraArg->eResampleAlg);
1241 15648 : int nKernelRadius = 0;
1242 : GDALResampleFunction pfnResampleFunc =
1243 15648 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
1244 15648 : CPLAssert(pfnResampleFunc);
1245 : GDALDataType eWrkDataType =
1246 15648 : GDALGetOvrWorkDataType(pszResampling, eDataType);
1247 15648 : int nHasNoData = 0;
1248 15648 : double dfNoDataValue = GetNoDataValue(&nHasNoData);
1249 15648 : const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
1250 15648 : if (!bHasNoData)
1251 15516 : dfNoDataValue = 0.0;
1252 :
1253 15648 : int nDstBlockXSize = nBufXSize;
1254 15648 : int nDstBlockYSize = nBufYSize;
1255 15648 : int nFullResXChunk = 0;
1256 15648 : int nFullResYChunk = 0;
1257 : while (true)
1258 : {
1259 15659 : nFullResXChunk = static_cast<int>(std::min<double>(
1260 15659 : 3 + nDstBlockXSize * dfXRatioDstToSrc, nRasterXSize));
1261 15659 : nFullResYChunk = static_cast<int>(std::min<double>(
1262 15659 : 3 + nDstBlockYSize * dfYRatioDstToSrc, nRasterYSize));
1263 15659 : if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
1264 15601 : (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
1265 : 1024 * 1024))
1266 : break;
1267 : // When operating on the full width of a raster whose block width is
1268 : // the raster width, prefer doing chunks in height.
1269 11 : if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
1270 : nDstBlockYSize > 1)
1271 0 : nDstBlockYSize /= 2;
1272 : /* Otherwise cut the maximal dimension */
1273 11 : else if (nDstBlockXSize > 1 &&
1274 0 : (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
1275 11 : nDstBlockXSize /= 2;
1276 : else
1277 0 : nDstBlockYSize /= 2;
1278 : }
1279 :
1280 : const int nOvrXFactor =
1281 15648 : std::max(1, static_cast<int>(0.5 + dfXRatioDstToSrc));
1282 : const int nOvrYFactor =
1283 15648 : std::max(1, static_cast<int>(0.5 + dfYRatioDstToSrc));
1284 : const int nFullResXSizeQueried = static_cast<int>(
1285 31296 : std::min<int64_t>(nFullResXChunk + static_cast<int64_t>(2) *
1286 15648 : nKernelRadius * nOvrXFactor,
1287 15648 : nRasterXSize));
1288 : const int nFullResYSizeQueried = static_cast<int>(
1289 31296 : std::min<int64_t>(nFullResYChunk + static_cast<int64_t>(2) *
1290 15648 : nKernelRadius * nOvrYFactor,
1291 15648 : nRasterYSize));
1292 :
1293 : void *pChunk =
1294 15648 : VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
1295 : nFullResXSizeQueried, nFullResYSizeQueried);
1296 15648 : GByte *pabyChunkNoDataMask = nullptr;
1297 :
1298 15648 : GDALRasterBand *poMaskBand = GetMaskBand();
1299 15648 : int l_nMaskFlags = GetMaskFlags();
1300 :
1301 15648 : bool bUseNoDataMask = ((l_nMaskFlags & GMF_ALL_VALID) == 0);
1302 15648 : if (bUseNoDataMask)
1303 : {
1304 7525 : pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
1305 : nFullResXSizeQueried, nFullResYSizeQueried));
1306 : }
1307 15648 : if (pChunk == nullptr ||
1308 7525 : (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
1309 : {
1310 0 : GDALClose(poMEMDS);
1311 0 : CPLFree(pChunk);
1312 0 : CPLFree(pabyChunkNoDataMask);
1313 0 : VSIFree(pTempBuffer);
1314 0 : return CE_Failure;
1315 : }
1316 :
1317 : const int64_t nTotalBlocks =
1318 15648 : static_cast<int64_t>(cpl::div_round_up(nBufXSize, nDstBlockXSize)) *
1319 15648 : cpl::div_round_up(nBufYSize, nDstBlockYSize);
1320 15648 : int64_t nBlocksDone = 0;
1321 :
1322 31296 : for (int nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
1323 15648 : nDstYOff += nDstBlockYSize)
1324 : {
1325 : int nDstYCount;
1326 15648 : if (nDstYOff + nDstBlockYSize <= nBufYSize)
1327 15648 : nDstYCount = nDstBlockYSize;
1328 : else
1329 0 : nDstYCount = nBufYSize - nDstYOff;
1330 :
1331 15648 : int nChunkYOff =
1332 15648 : nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
1333 15648 : int nChunkYOff2 = nYOff + 1 +
1334 15648 : static_cast<int>(ceil((nDstYOff + nDstYCount) *
1335 : dfYRatioDstToSrc));
1336 15648 : if (nChunkYOff2 > nRasterYSize)
1337 789 : nChunkYOff2 = nRasterYSize;
1338 15648 : int nYCount = nChunkYOff2 - nChunkYOff;
1339 15648 : CPLAssert(nYCount <= nFullResYChunk);
1340 :
1341 15648 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrYFactor;
1342 15648 : int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrYFactor;
1343 15648 : if (nChunkYOffQueried < 0)
1344 : {
1345 498 : nChunkYSizeQueried += nChunkYOffQueried;
1346 498 : nChunkYOffQueried = 0;
1347 : }
1348 15648 : if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
1349 607 : nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
1350 15648 : CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
1351 :
1352 15648 : int nDstXOff = 0;
1353 31296 : for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
1354 15648 : nDstXOff += nDstBlockXSize)
1355 : {
1356 15648 : int nDstXCount = 0;
1357 15648 : if (nDstXOff + nDstBlockXSize <= nBufXSize)
1358 15648 : nDstXCount = nDstBlockXSize;
1359 : else
1360 0 : nDstXCount = nBufXSize - nDstXOff;
1361 :
1362 15648 : int nChunkXOff =
1363 15648 : nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
1364 15648 : int nChunkXOff2 =
1365 15648 : nXOff + 1 +
1366 15648 : static_cast<int>(
1367 15648 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
1368 15648 : if (nChunkXOff2 > nRasterXSize)
1369 9827 : nChunkXOff2 = nRasterXSize;
1370 15648 : int nXCount = nChunkXOff2 - nChunkXOff;
1371 15648 : CPLAssert(nXCount <= nFullResXChunk);
1372 :
1373 15648 : int nChunkXOffQueried =
1374 15648 : nChunkXOff - nKernelRadius * nOvrXFactor;
1375 15648 : int nChunkXSizeQueried =
1376 15648 : nXCount + 2 * nKernelRadius * nOvrXFactor;
1377 15648 : if (nChunkXOffQueried < 0)
1378 : {
1379 3310 : nChunkXSizeQueried += nChunkXOffQueried;
1380 3310 : nChunkXOffQueried = 0;
1381 : }
1382 15648 : if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
1383 3806 : nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
1384 15648 : CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
1385 :
1386 : // Read the source buffers.
1387 15648 : eErr = RasterIO(GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1388 : nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
1389 : nChunkXSizeQueried, nChunkYSizeQueried,
1390 : eWrkDataType, 0, 0, nullptr);
1391 :
1392 15648 : bool bSkipResample = false;
1393 15648 : bool bNoDataMaskFullyOpaque = false;
1394 15648 : if (eErr == CE_None && bUseNoDataMask)
1395 : {
1396 7525 : eErr = poMaskBand->RasterIO(
1397 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1398 : nChunkXSizeQueried, nChunkYSizeQueried,
1399 : pabyChunkNoDataMask, nChunkXSizeQueried,
1400 : nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
1401 :
1402 : /* Optimizations if mask if fully opaque or transparent */
1403 7525 : int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
1404 7525 : GByte bVal = pabyChunkNoDataMask[0];
1405 7525 : int i = 1;
1406 15237000 : for (; i < nPixels; i++)
1407 : {
1408 15230700 : if (pabyChunkNoDataMask[i] != bVal)
1409 1168 : break;
1410 : }
1411 7525 : if (i == nPixels)
1412 : {
1413 6357 : if (bVal == 0)
1414 : {
1415 12094 : for (int j = 0; j < nDstYCount; j++)
1416 : {
1417 6377 : GDALCopyWords64(&dfNoDataValue, GDT_Float64, 0,
1418 : static_cast<GByte *>(pDataMem) +
1419 6377 : nLSMem * (j + nDstYOff) +
1420 6377 : nDstXOff * nPSMem,
1421 : eDTMem,
1422 : static_cast<int>(nPSMem),
1423 : nDstXCount);
1424 : }
1425 5717 : bSkipResample = true;
1426 : }
1427 : else
1428 : {
1429 640 : bNoDataMaskFullyOpaque = true;
1430 : }
1431 : }
1432 : }
1433 :
1434 15648 : if (!bSkipResample && eErr == CE_None)
1435 : {
1436 9928 : const bool bPropagateNoData = false;
1437 9928 : void *pDstBuffer = nullptr;
1438 9928 : GDALDataType eDstBufferDataType = GDT_Unknown;
1439 : GDALRasterBand *poMEMBand =
1440 9928 : GDALRasterBand::FromHandle(hMEMBand);
1441 9928 : GDALOverviewResampleArgs args;
1442 9928 : args.eSrcDataType = eDataType;
1443 9928 : args.eOvrDataType = poMEMBand->GetRasterDataType();
1444 9928 : args.nOvrXSize = poMEMBand->GetXSize();
1445 9928 : args.nOvrYSize = poMEMBand->GetYSize();
1446 9928 : args.nOvrNBITS = nNBITS;
1447 9928 : args.dfXRatioDstToSrc = dfXRatioDstToSrc;
1448 9928 : args.dfYRatioDstToSrc = dfYRatioDstToSrc;
1449 9928 : args.dfSrcXDelta =
1450 9928 : dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
1451 9928 : args.dfSrcYDelta =
1452 9928 : dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
1453 9928 : args.eWrkDataType = eWrkDataType;
1454 9928 : args.pabyChunkNodataMask =
1455 9928 : bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask;
1456 9928 : args.nChunkXOff =
1457 9928 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
1458 9928 : args.nChunkXSize = nChunkXSizeQueried;
1459 9928 : args.nChunkYOff =
1460 9928 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
1461 9928 : args.nChunkYSize = nChunkYSizeQueried;
1462 9928 : args.nDstXOff = nDstXOff + nDestXOffVirtual;
1463 9928 : args.nDstXOff2 = nDstXOff + nDestXOffVirtual + nDstXCount;
1464 9928 : args.nDstYOff = nDstYOff + nDestYOffVirtual;
1465 9928 : args.nDstYOff2 = nDstYOff + nDestYOffVirtual + nDstYCount;
1466 9928 : args.pszResampling = pszResampling;
1467 9928 : args.bHasNoData = bHasNoData;
1468 9928 : args.dfNoDataValue = dfNoDataValue;
1469 9928 : args.poColorTable = GetColorTable();
1470 9928 : args.bPropagateNoData = bPropagateNoData;
1471 9928 : eErr = pfnResampleFunc(args, pChunk, &pDstBuffer,
1472 : &eDstBufferDataType);
1473 9928 : if (eErr == CE_None)
1474 : {
1475 9928 : eErr = poMEMBand->RasterIO(
1476 : GF_Write, nDstXOff + nDestXOffVirtual,
1477 : nDstYOff + nDestYOffVirtual, nDstXCount, nDstYCount,
1478 : pDstBuffer, nDstXCount, nDstYCount,
1479 : eDstBufferDataType, 0, 0, nullptr);
1480 : }
1481 9928 : CPLFree(pDstBuffer);
1482 : }
1483 :
1484 15648 : nBlocksDone++;
1485 28106 : if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
1486 12458 : !psExtraArg->pfnProgress(
1487 12458 : static_cast<double>(nBlocksDone) /
1488 12458 : static_cast<double>(nTotalBlocks),
1489 : "", psExtraArg->pProgressData))
1490 : {
1491 1 : eErr = CE_Failure;
1492 : }
1493 : }
1494 : }
1495 :
1496 15648 : CPLFree(pChunk);
1497 15648 : CPLFree(pabyChunkNoDataMask);
1498 : }
1499 :
1500 15797 : if (pTempBuffer)
1501 : {
1502 4 : CPL_IGNORE_RET_VAL(poMEMDS->GetRasterBand(1)->RasterIO(
1503 : GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
1504 : pData, nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
1505 : nullptr));
1506 : }
1507 15797 : GDALClose(poMEMDS);
1508 15797 : VSIFree(pTempBuffer);
1509 :
1510 15797 : return eErr;
1511 : }
1512 :
1513 : /************************************************************************/
1514 : /* RasterIOResampled() */
1515 : /************************************************************************/
1516 :
1517 2431 : CPLErr GDALDataset::RasterIOResampled(
1518 : GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
1519 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
1520 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
1521 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
1522 :
1523 : {
1524 : #if 0
1525 : // Determine if we use warping resampling or overview resampling
1526 : bool bUseWarp = false;
1527 : if( GDALDataTypeIsComplex( eDataType ) )
1528 : bUseWarp = true;
1529 : #endif
1530 :
1531 2431 : double dfXOff = nXOff;
1532 2431 : double dfYOff = nYOff;
1533 2431 : double dfXSize = nXSize;
1534 2431 : double dfYSize = nYSize;
1535 2431 : if (psExtraArg->bFloatingPointWindowValidity)
1536 : {
1537 2304 : dfXOff = psExtraArg->dfXOff;
1538 2304 : dfYOff = psExtraArg->dfYOff;
1539 2304 : dfXSize = psExtraArg->dfXSize;
1540 2304 : dfYSize = psExtraArg->dfYSize;
1541 : }
1542 :
1543 2431 : const double dfXRatioDstToSrc = dfXSize / nBufXSize;
1544 2431 : const double dfYRatioDstToSrc = dfYSize / nBufYSize;
1545 :
1546 : // Determine the coordinates in the "virtual" output raster to see
1547 : // if there are not integers, in which case we will use them as a shift
1548 : // so that subwindow extracts give the exact same results as entire raster
1549 : // scaling.
1550 2431 : double dfDestXOff = dfXOff / dfXRatioDstToSrc;
1551 2431 : bool bHasXOffVirtual = false;
1552 2431 : int nDestXOffVirtual = 0;
1553 2431 : if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
1554 : {
1555 2306 : bHasXOffVirtual = true;
1556 2306 : dfXOff = nXOff;
1557 2306 : nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
1558 : }
1559 :
1560 2431 : double dfDestYOff = dfYOff / dfYRatioDstToSrc;
1561 2431 : bool bHasYOffVirtual = false;
1562 2431 : int nDestYOffVirtual = 0;
1563 2431 : if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
1564 : {
1565 2266 : bHasYOffVirtual = true;
1566 2266 : dfYOff = nYOff;
1567 2266 : nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
1568 : }
1569 :
1570 : // Create a MEM dataset that wraps the output buffer.
1571 2431 : std::unique_ptr<void, VSIFreeReleaser> pTempBuffer;
1572 2431 : GSpacing nPSMem = nPixelSpace;
1573 2431 : GSpacing nLSMem = nLineSpace;
1574 2431 : GSpacing nBandSpaceMEM = nBandSpace;
1575 2431 : void *pDataMem = pData;
1576 2431 : GDALDataType eDTMem = eBufType;
1577 2431 : GDALRasterBand *poFirstSrcBand = GetRasterBand(panBandMap[0]);
1578 2431 : const GDALDataType eDataType = poFirstSrcBand->GetRasterDataType();
1579 2431 : if (eBufType != eDataType && !GDAL_GET_OPERATE_IN_BUF_TYPE(*psExtraArg))
1580 : {
1581 2 : nPSMem = GDALGetDataTypeSizeBytes(eDataType);
1582 2 : nLSMem = nPSMem * nBufXSize;
1583 2 : nBandSpaceMEM = nLSMem * nBandCount;
1584 2 : pTempBuffer.reset(VSI_MALLOC3_VERBOSE(nBandCount, nBufYSize,
1585 : static_cast<size_t>(nLSMem)));
1586 2 : if (pTempBuffer == nullptr)
1587 0 : return CE_Failure;
1588 2 : pDataMem = pTempBuffer.get();
1589 2 : eDTMem = eDataType;
1590 : }
1591 :
1592 : auto poMEMDS = std::unique_ptr<GDALDataset>(
1593 2431 : MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
1594 4862 : nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr));
1595 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1596 : std::vector<GDALRasterBand *> apoDstBands(nBandCount);
1597 : #endif
1598 2431 : int nNBITS = 0;
1599 9052 : for (int i = 0; i < nBandCount; i++)
1600 : {
1601 6621 : GByte *const pBandData = static_cast<GByte *>(pDataMem) -
1602 6621 : nPSMem * nDestXOffVirtual -
1603 6621 : nLSMem * nDestYOffVirtual + nBandSpaceMEM * i;
1604 6621 : auto poMEMBand = GDALRasterBand::FromHandle(MEMCreateRasterBandEx(
1605 : poMEMDS.get(), i + 1, pBandData, eDTMem, nPSMem, nLSMem, false));
1606 6621 : poMEMDS->SetBand(i + 1, poMEMBand);
1607 :
1608 6621 : GDALRasterBand *poSrcBand = GetRasterBand(panBandMap[i]);
1609 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1610 : apoDstBands[i] = poMEMBand;
1611 : #endif
1612 : const char *pszNBITS =
1613 6621 : poSrcBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
1614 6621 : if (pszNBITS)
1615 : {
1616 0 : nNBITS = atoi(pszNBITS);
1617 0 : poMEMDS->GetRasterBand(i + 1)->SetMetadataItem("NBITS", pszNBITS,
1618 0 : "IMAGE_STRUCTURE");
1619 : }
1620 : }
1621 :
1622 2431 : CPLErr eErr = CE_None;
1623 :
1624 : // TODO(schwehr): Why disabled? Why not just delete?
1625 : // Looks like this code was initially added as disable by copying
1626 : // from RasterIO here:
1627 : // https://trac.osgeo.org/gdal/changeset/29572
1628 : #if 0
1629 : // Do the resampling.
1630 : if( bUseWarp )
1631 : {
1632 : VRTDatasetH hVRTDS = nullptr;
1633 : GDALRasterBandH hVRTBand = nullptr;
1634 : if( GetDataset() == nullptr )
1635 : {
1636 : /* Create VRT dataset that wraps the whole dataset */
1637 : hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
1638 : VRTAddBand( hVRTDS, eDataType, nullptr );
1639 : hVRTBand = GDALGetRasterBand(hVRTDS, 1);
1640 : VRTAddSimpleSource( (VRTSourcedRasterBandH)hVRTBand,
1641 : (GDALRasterBandH)this,
1642 : 0, 0,
1643 : nRasterXSize, nRasterYSize,
1644 : 0, 0,
1645 : nRasterXSize, nRasterYSize,
1646 : nullptr, VRT_NODATA_UNSET );
1647 :
1648 : /* Add a mask band if needed */
1649 : if( GetMaskFlags() != GMF_ALL_VALID )
1650 : {
1651 : ((GDALDataset*)hVRTDS)->CreateMaskBand(0);
1652 : VRTSourcedRasterBand* poVRTMaskBand =
1653 : (VRTSourcedRasterBand*)(((GDALRasterBand*)hVRTBand)->GetMaskBand());
1654 : poVRTMaskBand->
1655 : AddMaskBandSource( this,
1656 : 0, 0,
1657 : nRasterXSize, nRasterYSize,
1658 : 0, 0,
1659 : nRasterXSize, nRasterYSize);
1660 : }
1661 : }
1662 :
1663 : GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
1664 : psWarpOptions->eResampleAlg = (GDALResampleAlg)psExtraArg->eResampleAlg;
1665 : psWarpOptions->hSrcDS = (GDALDatasetH) (hVRTDS ? hVRTDS : GetDataset());
1666 : psWarpOptions->hDstDS = (GDALDatasetH) poMEMDS;
1667 : psWarpOptions->nBandCount = 1;
1668 : int nSrcBandNumber = (hVRTDS ? 1 : nBand);
1669 : int nDstBandNumber = 1;
1670 : psWarpOptions->panSrcBands = &nSrcBandNumber;
1671 : psWarpOptions->panDstBands = &nDstBandNumber;
1672 : psWarpOptions->pfnProgress = psExtraArg->pfnProgress ?
1673 : psExtraArg->pfnProgress : GDALDummyProgress;
1674 : psWarpOptions->pProgressArg = psExtraArg->pProgressData;
1675 : psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
1676 : GDALRasterIOTransformerStruct sTransformer;
1677 : sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
1678 : sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
1679 : sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
1680 : sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
1681 : psWarpOptions->pTransformerArg = &sTransformer;
1682 :
1683 : GDALWarpOperationH hWarpOperation = GDALCreateWarpOperation(psWarpOptions);
1684 : eErr = GDALChunkAndWarpImage( hWarpOperation,
1685 : nDestXOffVirtual, nDestYOffVirtual,
1686 : nBufXSize, nBufYSize );
1687 : GDALDestroyWarpOperation( hWarpOperation );
1688 :
1689 : psWarpOptions->panSrcBands = nullptr;
1690 : psWarpOptions->panDstBands = nullptr;
1691 : GDALDestroyWarpOptions( psWarpOptions );
1692 :
1693 : if( hVRTDS )
1694 : GDALClose(hVRTDS);
1695 : }
1696 : else
1697 : #endif
1698 : {
1699 : const char *pszResampling =
1700 2431 : GDALRasterIOGetResampleAlg(psExtraArg->eResampleAlg);
1701 :
1702 : int nBlockXSize, nBlockYSize;
1703 2431 : poFirstSrcBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
1704 :
1705 : int nKernelRadius;
1706 : GDALResampleFunction pfnResampleFunc =
1707 2431 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
1708 2431 : CPLAssert(pfnResampleFunc);
1709 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1710 : GDALResampleFunctionMultiBands pfnResampleFuncMultiBands =
1711 : GDALGetResampleFunctionMultiBands(pszResampling, &nKernelRadius);
1712 : #endif
1713 : GDALDataType eWrkDataType =
1714 2431 : GDALGetOvrWorkDataType(pszResampling, eDataType);
1715 :
1716 2431 : int nDstBlockXSize = nBufXSize;
1717 2431 : int nDstBlockYSize = nBufYSize;
1718 : int nFullResXChunk, nFullResYChunk;
1719 : while (true)
1720 : {
1721 2431 : nFullResXChunk = static_cast<int>(std::min<double>(
1722 2431 : 3 + nDstBlockXSize * dfXRatioDstToSrc, nRasterXSize));
1723 2431 : nFullResYChunk = static_cast<int>(std::min<double>(
1724 2431 : 3 + nDstBlockYSize * dfYRatioDstToSrc, nRasterYSize));
1725 2431 : if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
1726 2429 : (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
1727 : 1024 * 1024))
1728 : break;
1729 : // When operating on the full width of a raster whose block width is
1730 : // the raster width, prefer doing chunks in height.
1731 0 : if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
1732 : nDstBlockYSize > 1)
1733 0 : nDstBlockYSize /= 2;
1734 : /* Otherwise cut the maximal dimension */
1735 0 : else if (nDstBlockXSize > 1 &&
1736 0 : (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
1737 0 : nDstBlockXSize /= 2;
1738 : else
1739 0 : nDstBlockYSize /= 2;
1740 : }
1741 :
1742 : const int nOvrFactor =
1743 7293 : std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
1744 2431 : static_cast<int>(0.5 + dfYRatioDstToSrc)));
1745 : const int nFullResXSizeQueried = static_cast<int>(
1746 4862 : std::min<int64_t>(nFullResXChunk + static_cast<int64_t>(2) *
1747 2431 : nKernelRadius * nOvrFactor,
1748 2431 : nRasterXSize));
1749 : const int nFullResYSizeQueried = static_cast<int>(
1750 4862 : std::min<int64_t>(nFullResYChunk + static_cast<int64_t>(2) *
1751 2431 : nKernelRadius * nOvrFactor,
1752 2431 : nRasterYSize));
1753 :
1754 2431 : void *pChunk = VSI_MALLOC3_VERBOSE(
1755 : cpl::fits_on<int>(GDALGetDataTypeSizeBytes(eWrkDataType) *
1756 : nBandCount),
1757 : nFullResXSizeQueried, nFullResYSizeQueried);
1758 2431 : GByte *pabyChunkNoDataMask = nullptr;
1759 :
1760 2431 : GDALRasterBand *poMaskBand = poFirstSrcBand->GetMaskBand();
1761 2431 : int nMaskFlags = poFirstSrcBand->GetMaskFlags();
1762 :
1763 2431 : bool bUseNoDataMask = ((nMaskFlags & GMF_ALL_VALID) == 0);
1764 2431 : if (bUseNoDataMask)
1765 : {
1766 2156 : pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
1767 : nFullResXSizeQueried, nFullResYSizeQueried));
1768 : }
1769 2431 : if (pChunk == nullptr ||
1770 2156 : (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
1771 : {
1772 0 : CPLFree(pChunk);
1773 0 : CPLFree(pabyChunkNoDataMask);
1774 0 : return CE_Failure;
1775 : }
1776 :
1777 : const int64_t nTotalBlocks =
1778 2431 : static_cast<int64_t>(cpl::div_round_up(nBufXSize, nDstBlockXSize)) *
1779 2431 : cpl::div_round_up(nBufYSize, nDstBlockYSize);
1780 2431 : int64_t nBlocksDone = 0;
1781 :
1782 4862 : for (int nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
1783 2431 : nDstYOff += nDstBlockYSize)
1784 : {
1785 : int nDstYCount;
1786 2431 : if (nDstYOff + nDstBlockYSize <= nBufYSize)
1787 2431 : nDstYCount = nDstBlockYSize;
1788 : else
1789 0 : nDstYCount = nBufYSize - nDstYOff;
1790 :
1791 2431 : int nChunkYOff =
1792 2431 : nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
1793 2431 : int nChunkYOff2 = nYOff + 1 +
1794 2431 : static_cast<int>(ceil((nDstYOff + nDstYCount) *
1795 : dfYRatioDstToSrc));
1796 2431 : if (nChunkYOff2 > nRasterYSize)
1797 146 : nChunkYOff2 = nRasterYSize;
1798 2431 : int nYCount = nChunkYOff2 - nChunkYOff;
1799 2431 : CPLAssert(nYCount <= nFullResYChunk);
1800 :
1801 2431 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
1802 2431 : int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
1803 2431 : if (nChunkYOffQueried < 0)
1804 : {
1805 149 : nChunkYSizeQueried += nChunkYOffQueried;
1806 149 : nChunkYOffQueried = 0;
1807 : }
1808 2431 : if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
1809 170 : nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
1810 2431 : CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
1811 :
1812 : int nDstXOff;
1813 4862 : for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
1814 2431 : nDstXOff += nDstBlockXSize)
1815 : {
1816 : int nDstXCount;
1817 2431 : if (nDstXOff + nDstBlockXSize <= nBufXSize)
1818 2431 : nDstXCount = nDstBlockXSize;
1819 : else
1820 0 : nDstXCount = nBufXSize - nDstXOff;
1821 :
1822 2431 : int nChunkXOff =
1823 2431 : nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
1824 2431 : int nChunkXOff2 =
1825 2431 : nXOff + 1 +
1826 2431 : static_cast<int>(
1827 2431 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
1828 2431 : if (nChunkXOff2 > nRasterXSize)
1829 1672 : nChunkXOff2 = nRasterXSize;
1830 2431 : int nXCount = nChunkXOff2 - nChunkXOff;
1831 2431 : CPLAssert(nXCount <= nFullResXChunk);
1832 :
1833 2431 : int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
1834 2431 : int nChunkXSizeQueried =
1835 2431 : nXCount + 2 * nKernelRadius * nOvrFactor;
1836 2431 : if (nChunkXOffQueried < 0)
1837 : {
1838 1162 : nChunkXSizeQueried += nChunkXOffQueried;
1839 1162 : nChunkXOffQueried = 0;
1840 : }
1841 2431 : if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
1842 1680 : nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
1843 2431 : CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
1844 :
1845 2431 : bool bSkipResample = false;
1846 2431 : bool bNoDataMaskFullyOpaque = false;
1847 2431 : if (eErr == CE_None && bUseNoDataMask)
1848 : {
1849 2156 : eErr = poMaskBand->RasterIO(
1850 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1851 : nChunkXSizeQueried, nChunkYSizeQueried,
1852 : pabyChunkNoDataMask, nChunkXSizeQueried,
1853 : nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
1854 :
1855 : /* Optimizations if mask is fully opaque or transparent */
1856 2156 : const int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
1857 2156 : const GByte bVal = pabyChunkNoDataMask[0];
1858 2156 : int i = 1; // Used after for.
1859 49799600 : for (; i < nPixels; i++)
1860 : {
1861 49798500 : if (pabyChunkNoDataMask[i] != bVal)
1862 1031 : break;
1863 : }
1864 2156 : if (i == nPixels)
1865 : {
1866 1125 : if (bVal == 0)
1867 : {
1868 953 : GByte abyZero[16] = {0};
1869 3100 : for (int iBand = 0; iBand < nBandCount; iBand++)
1870 : {
1871 6979 : for (int j = 0; j < nDstYCount; j++)
1872 : {
1873 4832 : GDALCopyWords64(
1874 : abyZero, GDT_UInt8, 0,
1875 : static_cast<GByte *>(pDataMem) +
1876 4832 : iBand * nBandSpaceMEM +
1877 4832 : nLSMem * (j + nDstYOff) +
1878 4832 : nDstXOff * nPSMem,
1879 : eBufType, static_cast<int>(nPSMem),
1880 : nDstXCount);
1881 : }
1882 : }
1883 953 : bSkipResample = true;
1884 : }
1885 : else
1886 : {
1887 172 : bNoDataMaskFullyOpaque = true;
1888 : }
1889 : }
1890 : }
1891 :
1892 2431 : if (!bSkipResample && eErr == CE_None)
1893 : {
1894 : /* Read the source buffers */
1895 1475 : eErr = RasterIO(
1896 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1897 : nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
1898 : nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType,
1899 : nBandCount, panBandMap, 0, 0, 0, nullptr);
1900 : }
1901 :
1902 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1903 : if (pfnResampleFuncMultiBands && !bSkipResample &&
1904 : eErr == CE_None)
1905 : {
1906 : eErr = pfnResampleFuncMultiBands(
1907 : dfXRatioDstToSrc, dfYRatioDstToSrc,
1908 : dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
1909 : dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
1910 : eWrkDataType, (GByte *)pChunk, nBandCount,
1911 : bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
1912 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
1913 : nChunkXSizeQueried,
1914 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
1915 : nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
1916 : nDstXOff + nDestXOffVirtual + nDstXCount,
1917 : nDstYOff + nDestYOffVirtual,
1918 : nDstYOff + nDestYOffVirtual + nDstYCount,
1919 : apoDstBands.data(), pszResampling, FALSE /*bHasNoData*/,
1920 : 0.0 /* dfNoDataValue */, nullptr /* color table*/,
1921 : eDataType);
1922 : }
1923 : else
1924 : #endif
1925 : {
1926 : size_t nChunkBandOffset =
1927 2431 : static_cast<size_t>(nChunkXSizeQueried) *
1928 2431 : nChunkYSizeQueried *
1929 2431 : GDALGetDataTypeSizeBytes(eWrkDataType);
1930 6896 : for (int i = 0;
1931 6896 : i < nBandCount && !bSkipResample && eErr == CE_None;
1932 : i++)
1933 : {
1934 4465 : const bool bPropagateNoData = false;
1935 4465 : void *pDstBuffer = nullptr;
1936 4465 : GDALDataType eDstBufferDataType = GDT_Unknown;
1937 : GDALRasterBand *poMEMBand =
1938 4465 : poMEMDS->GetRasterBand(i + 1);
1939 4465 : GDALOverviewResampleArgs args;
1940 4465 : args.eSrcDataType = eDataType;
1941 4465 : args.eOvrDataType = poMEMBand->GetRasterDataType();
1942 4465 : args.nOvrXSize = poMEMBand->GetXSize();
1943 4465 : args.nOvrYSize = poMEMBand->GetYSize();
1944 4465 : args.nOvrNBITS = nNBITS;
1945 4465 : args.dfXRatioDstToSrc = dfXRatioDstToSrc;
1946 4465 : args.dfYRatioDstToSrc = dfYRatioDstToSrc;
1947 4465 : args.dfSrcXDelta =
1948 4465 : dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
1949 4465 : args.dfSrcYDelta =
1950 4465 : dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
1951 4465 : args.eWrkDataType = eWrkDataType;
1952 4465 : args.pabyChunkNodataMask = bNoDataMaskFullyOpaque
1953 4465 : ? nullptr
1954 : : pabyChunkNoDataMask;
1955 4465 : args.nChunkXOff =
1956 4465 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
1957 4465 : args.nChunkXSize = nChunkXSizeQueried;
1958 4465 : args.nChunkYOff =
1959 4465 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
1960 4465 : args.nChunkYSize = nChunkYSizeQueried;
1961 4465 : args.nDstXOff = nDstXOff + nDestXOffVirtual;
1962 4465 : args.nDstXOff2 =
1963 4465 : nDstXOff + nDestXOffVirtual + nDstXCount;
1964 4465 : args.nDstYOff = nDstYOff + nDestYOffVirtual;
1965 4465 : args.nDstYOff2 =
1966 4465 : nDstYOff + nDestYOffVirtual + nDstYCount;
1967 4465 : args.pszResampling = pszResampling;
1968 4465 : args.bHasNoData = false;
1969 4465 : args.dfNoDataValue = 0.0;
1970 4465 : args.poColorTable = nullptr;
1971 4465 : args.bPropagateNoData = bPropagateNoData;
1972 :
1973 : eErr =
1974 8930 : pfnResampleFunc(args,
1975 4465 : reinterpret_cast<GByte *>(pChunk) +
1976 4465 : i * nChunkBandOffset,
1977 : &pDstBuffer, &eDstBufferDataType);
1978 4465 : if (eErr == CE_None)
1979 : {
1980 4465 : eErr = poMEMBand->RasterIO(
1981 : GF_Write, nDstXOff + nDestXOffVirtual,
1982 : nDstYOff + nDestYOffVirtual, nDstXCount,
1983 : nDstYCount, pDstBuffer, nDstXCount, nDstYCount,
1984 : eDstBufferDataType, 0, 0, nullptr);
1985 : }
1986 4465 : CPLFree(pDstBuffer);
1987 : }
1988 : }
1989 :
1990 2431 : nBlocksDone++;
1991 4356 : if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
1992 1925 : !psExtraArg->pfnProgress(
1993 1925 : static_cast<double>(nBlocksDone) /
1994 1925 : static_cast<double>(nTotalBlocks),
1995 : "", psExtraArg->pProgressData))
1996 : {
1997 0 : eErr = CE_Failure;
1998 : }
1999 : }
2000 : }
2001 :
2002 2431 : CPLFree(pChunk);
2003 2431 : CPLFree(pabyChunkNoDataMask);
2004 : }
2005 :
2006 2431 : if (pTempBuffer)
2007 : {
2008 2 : CPL_IGNORE_RET_VAL(poMEMDS->RasterIO(
2009 : GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
2010 : pData, nBufXSize, nBufYSize, eBufType, nBandCount, nullptr,
2011 : nPixelSpace, nLineSpace, nBandSpace, nullptr));
2012 : }
2013 :
2014 2431 : return eErr;
2015 : }
2016 :
2017 : //! @endcond
2018 :
2019 : /************************************************************************/
2020 : /* GDALSwapWords() */
2021 : /************************************************************************/
2022 :
2023 : /**
2024 : * Byte swap words in-place.
2025 : *
2026 : * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
2027 : * a memory array. No assumption is made that the words being swapped are
2028 : * word aligned in memory. Use the CPL_LSB and CPL_MSB macros from cpl_port.h
2029 : * to determine if the current platform is big endian or little endian. Use
2030 : * The macros like CPL_SWAP32() to byte swap single values without the overhead
2031 : * of a function call.
2032 : *
2033 : * @param pData pointer to start of data buffer.
2034 : * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
2035 : * @param nWordCount the number of words to be swapped in this call.
2036 : * @param nWordSkip the byte offset from the start of one word to the start of
2037 : * the next. For packed buffers this is the same as nWordSize.
2038 : */
2039 :
2040 497405 : void CPL_STDCALL GDALSwapWords(void *pData, int nWordSize, int nWordCount,
2041 : int nWordSkip)
2042 :
2043 : {
2044 497405 : if (nWordCount > 0)
2045 497405 : VALIDATE_POINTER0(pData, "GDALSwapWords");
2046 :
2047 497405 : GByte *pabyData = static_cast<GByte *>(pData);
2048 :
2049 497405 : switch (nWordSize)
2050 : {
2051 7234 : case 1:
2052 7234 : break;
2053 :
2054 477161 : case 2:
2055 477161 : CPLAssert(nWordSkip >= 2 || nWordCount == 1);
2056 228194000 : for (int i = 0; i < nWordCount; i++)
2057 : {
2058 227716000 : CPL_SWAP16PTR(pabyData);
2059 227716000 : pabyData += nWordSkip;
2060 : }
2061 477161 : break;
2062 :
2063 10584 : case 4:
2064 10584 : CPLAssert(nWordSkip >= 4 || nWordCount == 1);
2065 10584 : if (CPL_IS_ALIGNED(pabyData, 4) && (nWordSkip % 4) == 0)
2066 : {
2067 29140600 : for (int i = 0; i < nWordCount; i++)
2068 : {
2069 29130000 : *reinterpret_cast<GUInt32 *>(pabyData) = CPL_SWAP32(
2070 : *reinterpret_cast<const GUInt32 *>(pabyData));
2071 29130000 : pabyData += nWordSkip;
2072 10581 : }
2073 : }
2074 : else
2075 : {
2076 9 : for (int i = 0; i < nWordCount; i++)
2077 : {
2078 6 : CPL_SWAP32PTR(pabyData);
2079 6 : pabyData += nWordSkip;
2080 : }
2081 : }
2082 10584 : break;
2083 :
2084 2426 : case 8:
2085 2426 : CPLAssert(nWordSkip >= 8 || nWordCount == 1);
2086 2426 : if (CPL_IS_ALIGNED(pabyData, 8) && (nWordSkip % 8) == 0)
2087 : {
2088 3356900 : for (int i = 0; i < nWordCount; i++)
2089 : {
2090 3354480 : *reinterpret_cast<GUInt64 *>(pabyData) = CPL_SWAP64(
2091 : *reinterpret_cast<const GUInt64 *>(pabyData));
2092 3354480 : pabyData += nWordSkip;
2093 2425 : }
2094 : }
2095 : else
2096 : {
2097 3 : for (int i = 0; i < nWordCount; i++)
2098 : {
2099 2 : CPL_SWAP64PTR(pabyData);
2100 2 : pabyData += nWordSkip;
2101 : }
2102 : }
2103 2426 : break;
2104 :
2105 0 : default:
2106 0 : CPLAssert(false);
2107 : }
2108 : }
2109 :
2110 : /************************************************************************/
2111 : /* GDALSwapWordsEx() */
2112 : /************************************************************************/
2113 :
2114 : /**
2115 : * Byte swap words in-place.
2116 : *
2117 : * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
2118 : * a memory array. No assumption is made that the words being swapped are
2119 : * word aligned in memory. Use the CPL_LSB and CPL_MSB macros from cpl_port.h
2120 : * to determine if the current platform is big endian or little endian. Use
2121 : * The macros like CPL_SWAP32() to byte swap single values without the overhead
2122 : * of a function call.
2123 : *
2124 : * @param pData pointer to start of data buffer.
2125 : * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
2126 : * @param nWordCount the number of words to be swapped in this call.
2127 : * @param nWordSkip the byte offset from the start of one word to the start of
2128 : * the next. For packed buffers this is the same as nWordSize.
2129 : */
2130 6130 : void CPL_STDCALL GDALSwapWordsEx(void *pData, int nWordSize, size_t nWordCount,
2131 : int nWordSkip)
2132 : {
2133 6130 : GByte *pabyData = static_cast<GByte *>(pData);
2134 12260 : while (nWordCount)
2135 : {
2136 : // Pick-up a multiple of 8 as max chunk size.
2137 6130 : const int nWordCountSmall =
2138 6130 : (nWordCount > (1 << 30)) ? (1 << 30) : static_cast<int>(nWordCount);
2139 6130 : GDALSwapWords(pabyData, nWordSize, nWordCountSmall, nWordSkip);
2140 6130 : pabyData += static_cast<size_t>(nWordSkip) * nWordCountSmall;
2141 6130 : nWordCount -= nWordCountSmall;
2142 : }
2143 6130 : }
2144 :
2145 : // Place the new GDALCopyWords helpers in an anonymous namespace
2146 : namespace
2147 : {
2148 :
2149 : /************************************************************************/
2150 : /* GDALCopyWordsT() */
2151 : /************************************************************************/
2152 : /**
2153 : * Template function, used to copy data from pSrcData into buffer
2154 : * pDstData, with stride nSrcPixelStride in the source data and
2155 : * stride nDstPixelStride in the destination data. This template can
2156 : * deal with the case where the input data type is real or complex and
2157 : * the output is real.
2158 : *
2159 : * @param pSrcData the source data buffer
2160 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
2161 : * of interest.
2162 : * @param pDstData the destination buffer.
2163 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
2164 : * interest.
2165 : * @param nWordCount the total number of pixel words to copy
2166 : *
2167 : * @code
2168 : * // Assume an input buffer of type GUInt16 named pBufferIn
2169 : * GByte *pBufferOut = new GByte[numBytesOut];
2170 : * GDALCopyWordsT<GUInt16, GByte>(pSrcData, 2, pDstData, 1, numBytesOut);
2171 : * @endcode
2172 : * @note
2173 : * This is a private function, and should not be exposed outside of
2174 : * rasterio.cpp. External users should call the GDALCopyWords driver function.
2175 : */
2176 :
2177 : template <class Tin, class Tout>
2178 48995024 : static void inline GDALCopyWordsGenericT(const Tin *const CPL_RESTRICT pSrcData,
2179 : int nSrcPixelStride,
2180 : Tout *const CPL_RESTRICT pDstData,
2181 : int nDstPixelStride,
2182 : GPtrDiff_t nWordCount)
2183 : {
2184 48995024 : decltype(nWordCount) nDstOffset = 0;
2185 :
2186 48995024 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
2187 48995024 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
2188 353953277 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
2189 : {
2190 304958217 : const Tin tValue =
2191 304958217 : *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
2192 304958217 : Tout *const pOutPixel =
2193 304958217 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2194 :
2195 304958217 : GDALCopyWord(tValue, *pOutPixel);
2196 :
2197 304958217 : nDstOffset += nDstPixelStride;
2198 : }
2199 48995024 : }
2200 :
2201 : template <class Tin, class Tout>
2202 29756673 : static void CPL_NOINLINE GDALCopyWordsT(const Tin *const CPL_RESTRICT pSrcData,
2203 : int nSrcPixelStride,
2204 : Tout *const CPL_RESTRICT pDstData,
2205 : int nDstPixelStride,
2206 : GPtrDiff_t nWordCount)
2207 : {
2208 29756673 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData, nDstPixelStride,
2209 : nWordCount);
2210 29756673 : }
2211 :
2212 : template <class Tin, class Tout>
2213 5101389 : static void inline GDALCopyWordsT_8atatime(
2214 : const Tin *const CPL_RESTRICT pSrcData, int nSrcPixelStride,
2215 : Tout *const CPL_RESTRICT pDstData, int nDstPixelStride,
2216 : GPtrDiff_t nWordCount)
2217 : {
2218 5101389 : decltype(nWordCount) nDstOffset = 0;
2219 :
2220 5101389 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
2221 5101389 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
2222 5101389 : decltype(nWordCount) n = 0;
2223 5101389 : if (nSrcPixelStride == static_cast<int>(sizeof(Tin)) &&
2224 : nDstPixelStride == static_cast<int>(sizeof(Tout)))
2225 : {
2226 53187719 : for (; n < nWordCount - 7; n += 8)
2227 : {
2228 52636144 : const Tin *pInValues = reinterpret_cast<const Tin *>(
2229 52636144 : pSrcDataPtr + (n * nSrcPixelStride));
2230 52636144 : Tout *const pOutPixels =
2231 52636144 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2232 :
2233 52636144 : GDALCopy8Words(pInValues, pOutPixels);
2234 :
2235 52636144 : nDstOffset += 8 * nDstPixelStride;
2236 : }
2237 : }
2238 10499093 : for (; n < nWordCount; n++)
2239 : {
2240 5397704 : const Tin tValue =
2241 5397704 : *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
2242 5397704 : Tout *const pOutPixel =
2243 5397704 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2244 :
2245 5397704 : GDALCopyWord(tValue, *pOutPixel);
2246 :
2247 5397704 : nDstOffset += nDstPixelStride;
2248 : }
2249 5101389 : }
2250 :
2251 : #ifdef HAVE_SSE2
2252 :
2253 : template <class Tout>
2254 1042126 : void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
2255 : int nSrcPixelStride,
2256 : Tout *const CPL_RESTRICT pDstData,
2257 : int nDstPixelStride, GPtrDiff_t nWordCount)
2258 : {
2259 : static_assert(std::is_integral<Tout>::value &&
2260 : sizeof(Tout) == sizeof(uint16_t),
2261 : "Bad Tout");
2262 1042126 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2263 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2264 : {
2265 35752 : decltype(nWordCount) n = 0;
2266 35752 : const __m128i xmm_zero = _mm_setzero_si128();
2267 35752 : GByte *CPL_RESTRICT pabyDstDataPtr =
2268 : reinterpret_cast<GByte *>(pDstData);
2269 1478148 : for (; n < nWordCount - 15; n += 16)
2270 : {
2271 1442396 : __m128i xmm = _mm_loadu_si128(
2272 1442396 : reinterpret_cast<const __m128i *>(pSrcData + n));
2273 1442396 : __m128i xmm0 = _mm_unpacklo_epi8(xmm, xmm_zero);
2274 1442396 : __m128i xmm1 = _mm_unpackhi_epi8(xmm, xmm_zero);
2275 : _mm_storeu_si128(
2276 1442396 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2), xmm0);
2277 : _mm_storeu_si128(
2278 1442396 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2 + 16), xmm1);
2279 : }
2280 : #if defined(__clang__)
2281 : #pragma clang loop vectorize(disable)
2282 : #endif
2283 111662 : for (; n < nWordCount; n++)
2284 : {
2285 75910 : pDstData[n] = pSrcData[n];
2286 35752 : }
2287 : }
2288 : else
2289 : {
2290 1006371 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2291 : nDstPixelStride, nWordCount);
2292 : }
2293 1042126 : }
2294 :
2295 : template <>
2296 1029400 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2297 : int nSrcPixelStride,
2298 : GUInt16 *const CPL_RESTRICT pDstData,
2299 : int nDstPixelStride, GPtrDiff_t nWordCount)
2300 : {
2301 1029400 : GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
2302 : nDstPixelStride, nWordCount);
2303 1029400 : }
2304 :
2305 : template <>
2306 12726 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2307 : int nSrcPixelStride,
2308 : GInt16 *const CPL_RESTRICT pDstData,
2309 : int nDstPixelStride, GPtrDiff_t nWordCount)
2310 : {
2311 12726 : GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
2312 : nDstPixelStride, nWordCount);
2313 12726 : }
2314 :
2315 : template <class Tout>
2316 16237476 : void GDALCopyWordsByteTo32Bit(const GByte *const CPL_RESTRICT pSrcData,
2317 : int nSrcPixelStride,
2318 : Tout *const CPL_RESTRICT pDstData,
2319 : int nDstPixelStride, GPtrDiff_t nWordCount)
2320 : {
2321 : static_assert(std::is_integral<Tout>::value &&
2322 : sizeof(Tout) == sizeof(uint32_t),
2323 : "Bad Tout");
2324 16237476 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2325 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2326 : {
2327 6532986 : decltype(nWordCount) n = 0;
2328 6532986 : const __m128i xmm_zero = _mm_setzero_si128();
2329 6532986 : GByte *CPL_RESTRICT pabyDstDataPtr =
2330 : reinterpret_cast<GByte *>(pDstData);
2331 74248727 : for (; n < nWordCount - 15; n += 16)
2332 : {
2333 67715761 : __m128i xmm = _mm_loadu_si128(
2334 67715761 : reinterpret_cast<const __m128i *>(pSrcData + n));
2335 67715761 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2336 67715761 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2337 67715761 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2338 67715761 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2339 67715761 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2340 67715761 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2341 : _mm_storeu_si128(
2342 67715761 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4), xmm0);
2343 : _mm_storeu_si128(
2344 67715761 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 16), xmm1);
2345 : _mm_storeu_si128(
2346 67715761 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 32), xmm2);
2347 : _mm_storeu_si128(
2348 67715761 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 48), xmm3);
2349 : }
2350 : #if defined(__clang__)
2351 : #pragma clang loop vectorize(disable)
2352 : #endif
2353 14828616 : for (; n < nWordCount; n++)
2354 : {
2355 8295670 : pDstData[n] = pSrcData[n];
2356 6532986 : }
2357 : }
2358 : else
2359 : {
2360 9704510 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2361 : nDstPixelStride, nWordCount);
2362 : }
2363 16237476 : }
2364 :
2365 : template <>
2366 476 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2367 : int nSrcPixelStride,
2368 : GUInt32 *const CPL_RESTRICT pDstData,
2369 : int nDstPixelStride, GPtrDiff_t nWordCount)
2370 : {
2371 476 : GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
2372 : nDstPixelStride, nWordCount);
2373 476 : }
2374 :
2375 : template <>
2376 16237000 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2377 : int nSrcPixelStride,
2378 : GInt32 *const CPL_RESTRICT pDstData,
2379 : int nDstPixelStride, GPtrDiff_t nWordCount)
2380 : {
2381 16237000 : GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
2382 : nDstPixelStride, nWordCount);
2383 16237000 : }
2384 :
2385 : template <>
2386 2851030 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2387 : int nSrcPixelStride,
2388 : float *const CPL_RESTRICT pDstData,
2389 : int nDstPixelStride, GPtrDiff_t nWordCount)
2390 : {
2391 2851030 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2392 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2393 : {
2394 228149 : decltype(nWordCount) n = 0;
2395 228149 : const __m128i xmm_zero = _mm_setzero_si128();
2396 228149 : GByte *CPL_RESTRICT pabyDstDataPtr =
2397 : reinterpret_cast<GByte *>(pDstData);
2398 2267080 : for (; n < nWordCount - 15; n += 16)
2399 : {
2400 2038930 : __m128i xmm = _mm_loadu_si128(
2401 2038930 : reinterpret_cast<const __m128i *>(pSrcData + n));
2402 2038930 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2403 2038930 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2404 2038930 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2405 2038930 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2406 2038930 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2407 2038930 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2408 2038930 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2409 2038930 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2410 2038930 : __m128 xmm2_f = _mm_cvtepi32_ps(xmm2);
2411 2038930 : __m128 xmm3_f = _mm_cvtepi32_ps(xmm3);
2412 2038930 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2413 : xmm0_f);
2414 : _mm_storeu_ps(
2415 2038930 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2416 : _mm_storeu_ps(
2417 2038930 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 32), xmm2_f);
2418 : _mm_storeu_ps(
2419 2038930 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 48), xmm3_f);
2420 : }
2421 : #if defined(__clang__)
2422 : #pragma clang loop vectorize(disable)
2423 : #endif
2424 951237 : for (; n < nWordCount; n++)
2425 : {
2426 723088 : pDstData[n] = pSrcData[n];
2427 228149 : }
2428 : }
2429 : else
2430 : {
2431 2622880 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2432 : nDstPixelStride, nWordCount);
2433 : }
2434 2851030 : }
2435 :
2436 : template <>
2437 170958 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2438 : int nSrcPixelStride,
2439 : double *const CPL_RESTRICT pDstData,
2440 : int nDstPixelStride, GPtrDiff_t nWordCount)
2441 : {
2442 170958 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2443 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2444 : {
2445 147160 : decltype(nWordCount) n = 0;
2446 147160 : const __m128i xmm_zero = _mm_setzero_si128();
2447 147160 : GByte *CPL_RESTRICT pabyDstDataPtr =
2448 : reinterpret_cast<GByte *>(pDstData);
2449 3127450 : for (; n < nWordCount - 15; n += 16)
2450 : {
2451 2980290 : __m128i xmm = _mm_loadu_si128(
2452 2980290 : reinterpret_cast<const __m128i *>(pSrcData + n));
2453 2980290 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2454 2980290 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2455 2980290 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2456 2980290 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2457 2980290 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2458 2980290 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2459 :
2460 : #if defined(__AVX2__) && defined(slightly_slower_than_SSE2)
2461 : _mm256_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2462 : _mm256_cvtepi32_pd(xmm0));
2463 : _mm256_storeu_pd(
2464 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2465 : _mm256_cvtepi32_pd(xmm1));
2466 : _mm256_storeu_pd(
2467 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
2468 : _mm256_cvtepi32_pd(xmm2));
2469 : _mm256_storeu_pd(
2470 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
2471 : _mm256_cvtepi32_pd(xmm3));
2472 : #else
2473 2980290 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2474 2980290 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2475 2980290 : __m128d xmm2_low_d = _mm_cvtepi32_pd(xmm2);
2476 2980290 : __m128d xmm3_low_d = _mm_cvtepi32_pd(xmm3);
2477 2980290 : xmm0 = _mm_srli_si128(xmm0, 8);
2478 2980290 : xmm1 = _mm_srli_si128(xmm1, 8);
2479 2980290 : xmm2 = _mm_srli_si128(xmm2, 8);
2480 2980290 : xmm3 = _mm_srli_si128(xmm3, 8);
2481 2980290 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2482 2980290 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2483 2980290 : __m128d xmm2_high_d = _mm_cvtepi32_pd(xmm2);
2484 2980290 : __m128d xmm3_high_d = _mm_cvtepi32_pd(xmm3);
2485 :
2486 2980290 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2487 : xmm0_low_d);
2488 : _mm_storeu_pd(
2489 2980290 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2490 : xmm0_high_d);
2491 : _mm_storeu_pd(
2492 2980290 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2493 : xmm1_low_d);
2494 : _mm_storeu_pd(
2495 2980290 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2496 : xmm1_high_d);
2497 : _mm_storeu_pd(
2498 2980290 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
2499 : xmm2_low_d);
2500 : _mm_storeu_pd(
2501 2980290 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 80),
2502 : xmm2_high_d);
2503 : _mm_storeu_pd(
2504 2980290 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
2505 : xmm3_low_d);
2506 : _mm_storeu_pd(
2507 2980290 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 112),
2508 : xmm3_high_d);
2509 : #endif
2510 : }
2511 : #if defined(__clang__)
2512 : #pragma clang loop vectorize(disable)
2513 : #endif
2514 280923 : for (; n < nWordCount; n++)
2515 : {
2516 133763 : pDstData[n] = pSrcData[n];
2517 147160 : }
2518 : }
2519 : else
2520 : {
2521 23798 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2522 : nDstPixelStride, nWordCount);
2523 : }
2524 170958 : }
2525 :
2526 : template <>
2527 148 : CPL_NOINLINE void GDALCopyWordsT(const uint8_t *const CPL_RESTRICT pSrcData,
2528 : int nSrcPixelStride,
2529 : int8_t *const CPL_RESTRICT pDstData,
2530 : int nDstPixelStride, GPtrDiff_t nWordCount)
2531 : {
2532 148 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2533 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2534 : {
2535 142 : decltype(nWordCount) n = 0;
2536 142 : const __m128i xmm_127 = _mm_set1_epi8(127);
2537 146 : for (; n < nWordCount - 31; n += 32)
2538 : {
2539 8 : __m128i xmm0 = _mm_loadu_si128(
2540 4 : reinterpret_cast<const __m128i *>(pSrcData + n));
2541 4 : __m128i xmm1 = _mm_loadu_si128(
2542 4 : reinterpret_cast<const __m128i *>(pSrcData + n + 16));
2543 4 : xmm0 = _mm_min_epu8(xmm0, xmm_127);
2544 4 : xmm1 = _mm_min_epu8(xmm1, xmm_127);
2545 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2546 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
2547 : xmm1);
2548 : }
2549 : #if defined(__clang__)
2550 : #pragma clang loop vectorize(disable)
2551 : #endif
2552 2424 : for (; n < nWordCount; n++)
2553 : {
2554 2282 : pDstData[n] = static_cast<int8_t>(std::min<int>(pSrcData[n], 127));
2555 142 : }
2556 : }
2557 : else
2558 : {
2559 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2560 : nDstPixelStride, nWordCount);
2561 : }
2562 148 : }
2563 :
2564 : template <>
2565 62 : CPL_NOINLINE void GDALCopyWordsT(const int8_t *const CPL_RESTRICT pSrcData,
2566 : int nSrcPixelStride,
2567 : uint8_t *const CPL_RESTRICT pDstData,
2568 : int nDstPixelStride, GPtrDiff_t nWordCount)
2569 : {
2570 62 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2571 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2572 : {
2573 56 : decltype(nWordCount) n = 0;
2574 : #if !(defined(__SSE4_1__) || defined(__AVX__) || \
2575 : defined(USE_NEON_OPTIMIZATIONS))
2576 56 : const __m128i xmm_INT8_to_UINT8 = _mm_set1_epi8(-128);
2577 : #endif
2578 117 : for (; n < nWordCount - 31; n += 32)
2579 : {
2580 122 : __m128i xmm0 = _mm_loadu_si128(
2581 61 : reinterpret_cast<const __m128i *>(pSrcData + n));
2582 61 : __m128i xmm1 = _mm_loadu_si128(
2583 61 : reinterpret_cast<const __m128i *>(pSrcData + n + 16));
2584 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
2585 : xmm0 = _mm_max_epi8(xmm0, _mm_setzero_si128());
2586 : xmm1 = _mm_max_epi8(xmm1, _mm_setzero_si128());
2587 : #else
2588 61 : xmm0 = _mm_add_epi8(xmm0, xmm_INT8_to_UINT8);
2589 61 : xmm1 = _mm_add_epi8(xmm1, xmm_INT8_to_UINT8);
2590 61 : xmm0 = _mm_max_epu8(xmm0, xmm_INT8_to_UINT8);
2591 61 : xmm1 = _mm_max_epu8(xmm1, xmm_INT8_to_UINT8);
2592 61 : xmm0 = _mm_sub_epi8(xmm0, xmm_INT8_to_UINT8);
2593 61 : xmm1 = _mm_sub_epi8(xmm1, xmm_INT8_to_UINT8);
2594 : #endif
2595 61 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2596 61 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
2597 : xmm1);
2598 : }
2599 : #if defined(__clang__)
2600 : #pragma clang loop vectorize(disable)
2601 : #endif
2602 352 : for (; n < nWordCount; n++)
2603 : {
2604 296 : pDstData[n] = static_cast<uint8_t>(std::max<int>(pSrcData[n], 0));
2605 56 : }
2606 : }
2607 : else
2608 : {
2609 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2610 : nDstPixelStride, nWordCount);
2611 : }
2612 62 : }
2613 :
2614 : template <>
2615 6037 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2616 : int nSrcPixelStride,
2617 : uint8_t *const CPL_RESTRICT pDstData,
2618 : int nDstPixelStride, GPtrDiff_t nWordCount)
2619 : {
2620 6037 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2621 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2622 : {
2623 5062 : decltype(nWordCount) n = 0;
2624 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
2625 : const auto xmm_MAX_INT16 = _mm_set1_epi16(32767);
2626 : #else
2627 : // In SSE2, min_epu16 does not exist, so shift from
2628 : // UInt16 to SInt16 to be able to use min_epi16
2629 5062 : const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
2630 5062 : const __m128i xmm_m255_shifted = _mm_set1_epi16(255 - 32768);
2631 : #endif
2632 71888 : for (; n < nWordCount - 15; n += 16)
2633 : {
2634 133652 : __m128i xmm0 = _mm_loadu_si128(
2635 66826 : reinterpret_cast<const __m128i *>(pSrcData + n));
2636 66826 : __m128i xmm1 = _mm_loadu_si128(
2637 66826 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2638 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
2639 : xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
2640 : xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
2641 : #else
2642 66826 : xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
2643 66826 : xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
2644 66826 : xmm0 = _mm_min_epi16(xmm0, xmm_m255_shifted);
2645 66826 : xmm1 = _mm_min_epi16(xmm1, xmm_m255_shifted);
2646 66826 : xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
2647 66826 : xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
2648 : #endif
2649 66826 : xmm0 = _mm_packus_epi16(xmm0, xmm1);
2650 66826 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2651 : }
2652 : #if defined(__clang__)
2653 : #pragma clang loop vectorize(disable)
2654 : #endif
2655 16403 : for (; n < nWordCount; n++)
2656 : {
2657 11341 : pDstData[n] = static_cast<uint8_t>(std::min<int>(pSrcData[n], 255));
2658 5062 : }
2659 : }
2660 : else
2661 : {
2662 975 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2663 : nDstPixelStride, nWordCount);
2664 : }
2665 6037 : }
2666 :
2667 : template <>
2668 46 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2669 : int nSrcPixelStride,
2670 : int16_t *const CPL_RESTRICT pDstData,
2671 : int nDstPixelStride, GPtrDiff_t nWordCount)
2672 : {
2673 46 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2674 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2675 : {
2676 40 : decltype(nWordCount) n = 0;
2677 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
2678 : const __m128i xmm_MAX_INT16 = _mm_set1_epi16(32767);
2679 : #else
2680 : // In SSE2, min_epu16 does not exist, so shift from
2681 : // UInt16 to SInt16 to be able to use min_epi16
2682 40 : const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
2683 40 : const __m128i xmm_32767_shifted = _mm_set1_epi16(32767 - 32768);
2684 : #endif
2685 169 : for (; n < nWordCount - 15; n += 16)
2686 : {
2687 258 : __m128i xmm0 = _mm_loadu_si128(
2688 129 : reinterpret_cast<const __m128i *>(pSrcData + n));
2689 129 : __m128i xmm1 = _mm_loadu_si128(
2690 129 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2691 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
2692 : xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
2693 : xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
2694 : #else
2695 129 : xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
2696 129 : xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
2697 129 : xmm0 = _mm_min_epi16(xmm0, xmm_32767_shifted);
2698 129 : xmm1 = _mm_min_epi16(xmm1, xmm_32767_shifted);
2699 129 : xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
2700 129 : xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
2701 : #endif
2702 129 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2703 129 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
2704 : xmm1);
2705 : }
2706 : #if defined(__clang__)
2707 : #pragma clang loop vectorize(disable)
2708 : #endif
2709 191 : for (; n < nWordCount; n++)
2710 : {
2711 151 : pDstData[n] =
2712 151 : static_cast<int16_t>(std::min<int>(pSrcData[n], 32767));
2713 40 : }
2714 : }
2715 : else
2716 : {
2717 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2718 : nDstPixelStride, nWordCount);
2719 : }
2720 46 : }
2721 :
2722 : template <>
2723 136 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2724 : int nSrcPixelStride,
2725 : uint16_t *const CPL_RESTRICT pDstData,
2726 : int nDstPixelStride, GPtrDiff_t nWordCount)
2727 : {
2728 136 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2729 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2730 : {
2731 93 : decltype(nWordCount) n = 0;
2732 93 : const __m128i xmm_zero = _mm_setzero_si128();
2733 278 : for (; n < nWordCount - 15; n += 16)
2734 : {
2735 370 : __m128i xmm0 = _mm_loadu_si128(
2736 185 : reinterpret_cast<const __m128i *>(pSrcData + n));
2737 185 : __m128i xmm1 = _mm_loadu_si128(
2738 185 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2739 185 : xmm0 = _mm_max_epi16(xmm0, xmm_zero);
2740 185 : xmm1 = _mm_max_epi16(xmm1, xmm_zero);
2741 185 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2742 185 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
2743 : xmm1);
2744 : }
2745 : #if defined(__clang__)
2746 : #pragma clang loop vectorize(disable)
2747 : #endif
2748 471 : for (; n < nWordCount; n++)
2749 : {
2750 378 : pDstData[n] = static_cast<uint16_t>(std::max<int>(pSrcData[n], 0));
2751 93 : }
2752 : }
2753 : else
2754 : {
2755 43 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2756 : nDstPixelStride, nWordCount);
2757 : }
2758 136 : }
2759 :
2760 : template <>
2761 3130 : CPL_NOINLINE void GDALCopyWordsT(const uint32_t *const CPL_RESTRICT pSrcData,
2762 : int nSrcPixelStride,
2763 : int32_t *const CPL_RESTRICT pDstData,
2764 : int nDstPixelStride, GPtrDiff_t nWordCount)
2765 : {
2766 3130 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2767 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2768 : {
2769 2176 : decltype(nWordCount) n = 0;
2770 2176 : const __m128i xmm_MAX_INT = _mm_set1_epi32(INT_MAX);
2771 2176 : [[maybe_unused]] const __m128i bias = _mm_set1_epi32(INT_MIN);
2772 : [[maybe_unused]] const __m128i xmm_MAX_INT_biased =
2773 2176 : _mm_xor_si128(xmm_MAX_INT, bias);
2774 45537 : for (; n < nWordCount - 7; n += 8)
2775 : {
2776 86722 : __m128i xmm0 = _mm_loadu_si128(
2777 43361 : reinterpret_cast<const __m128i *>(pSrcData + n));
2778 43361 : __m128i xmm1 = _mm_loadu_si128(
2779 43361 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
2780 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
2781 : xmm0 = _mm_min_epu32(xmm0, xmm_MAX_INT);
2782 : xmm1 = _mm_min_epu32(xmm1, xmm_MAX_INT);
2783 : #else
2784 43361 : const __m128i xmm0_biased = _mm_xor_si128(xmm0, bias);
2785 : const __m128i mask0 =
2786 43361 : _mm_cmplt_epi32(xmm0_biased, xmm_MAX_INT_biased);
2787 43361 : xmm0 = GDALIfThenElse(mask0, xmm0, xmm_MAX_INT);
2788 :
2789 43361 : const __m128i xmm1_biased = _mm_xor_si128(xmm1, bias);
2790 : const __m128i mask1 =
2791 43361 : _mm_cmplt_epi32(xmm1_biased, xmm_MAX_INT_biased);
2792 43361 : xmm1 = GDALIfThenElse(mask1, xmm1, xmm_MAX_INT);
2793 : #endif
2794 43361 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2795 43361 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
2796 : xmm1);
2797 : }
2798 : #if defined(__clang__)
2799 : #pragma clang loop vectorize(disable)
2800 : #endif
2801 9290 : for (; n < nWordCount; n++)
2802 : {
2803 7114 : pDstData[n] =
2804 7114 : static_cast<int32_t>(std::min<uint32_t>(pSrcData[n], INT_MAX));
2805 2176 : }
2806 : }
2807 : else
2808 : {
2809 954 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2810 : nDstPixelStride, nWordCount);
2811 : }
2812 3130 : }
2813 :
2814 : template <>
2815 93 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
2816 : int nSrcPixelStride,
2817 : uint32_t *const CPL_RESTRICT pDstData,
2818 : int nDstPixelStride, GPtrDiff_t nWordCount)
2819 : {
2820 93 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2821 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2822 : {
2823 38 : decltype(nWordCount) n = 0;
2824 38 : const __m128i xmm_zero = _mm_setzero_si128();
2825 333 : for (; n < nWordCount - 7; n += 8)
2826 : {
2827 590 : __m128i xmm0 = _mm_loadu_si128(
2828 295 : reinterpret_cast<const __m128i *>(pSrcData + n));
2829 295 : __m128i xmm1 = _mm_loadu_si128(
2830 295 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
2831 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
2832 : xmm0 = _mm_max_epi32(xmm0, xmm_zero);
2833 : xmm1 = _mm_max_epi32(xmm1, xmm_zero);
2834 : #else
2835 295 : const __m128i mask0 = _mm_cmpgt_epi32(xmm0, xmm_zero);
2836 295 : const __m128i mask1 = _mm_cmpgt_epi32(xmm1, xmm_zero);
2837 295 : xmm0 = _mm_and_si128(xmm0, mask0);
2838 295 : xmm1 = _mm_and_si128(xmm1, mask1);
2839 : #endif
2840 295 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2841 295 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
2842 : xmm1);
2843 : }
2844 : #if defined(__clang__)
2845 : #pragma clang loop vectorize(disable)
2846 : #endif
2847 192 : for (; n < nWordCount; n++)
2848 : {
2849 154 : pDstData[n] = static_cast<uint32_t>(std::max(pSrcData[n], 0));
2850 38 : }
2851 : }
2852 : else
2853 : {
2854 55 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2855 : nDstPixelStride, nWordCount);
2856 : }
2857 93 : }
2858 :
2859 : template <>
2860 403 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2861 : int nSrcPixelStride,
2862 : float *const CPL_RESTRICT pDstData,
2863 : int nDstPixelStride, GPtrDiff_t nWordCount)
2864 : {
2865 403 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2866 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2867 : {
2868 397 : decltype(nWordCount) n = 0;
2869 397 : const __m128i xmm_zero = _mm_setzero_si128();
2870 397 : GByte *CPL_RESTRICT pabyDstDataPtr =
2871 : reinterpret_cast<GByte *>(pDstData);
2872 1688 : for (; n < nWordCount - 7; n += 8)
2873 : {
2874 1291 : __m128i xmm = _mm_loadu_si128(
2875 1291 : reinterpret_cast<const __m128i *>(pSrcData + n));
2876 1291 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
2877 1291 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
2878 1291 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2879 1291 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2880 1291 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2881 : xmm0_f);
2882 : _mm_storeu_ps(
2883 1291 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2884 : }
2885 : #if defined(__clang__)
2886 : #pragma clang loop vectorize(disable)
2887 : #endif
2888 1415 : for (; n < nWordCount; n++)
2889 : {
2890 1018 : pDstData[n] = pSrcData[n];
2891 397 : }
2892 : }
2893 : else
2894 : {
2895 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2896 : nDstPixelStride, nWordCount);
2897 : }
2898 403 : }
2899 :
2900 : template <>
2901 1076640 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2902 : int nSrcPixelStride,
2903 : float *const CPL_RESTRICT pDstData,
2904 : int nDstPixelStride, GPtrDiff_t nWordCount)
2905 : {
2906 1076640 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2907 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2908 : {
2909 86742 : decltype(nWordCount) n = 0;
2910 86742 : GByte *CPL_RESTRICT pabyDstDataPtr =
2911 : reinterpret_cast<GByte *>(pDstData);
2912 586119 : for (; n < nWordCount - 7; n += 8)
2913 : {
2914 499377 : __m128i xmm = _mm_loadu_si128(
2915 499377 : reinterpret_cast<const __m128i *>(pSrcData + n));
2916 499377 : const auto sign = _mm_srai_epi16(xmm, 15);
2917 499377 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
2918 499377 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
2919 499377 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2920 499377 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2921 499377 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2922 : xmm0_f);
2923 : _mm_storeu_ps(
2924 499377 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2925 : }
2926 : #if defined(__clang__)
2927 : #pragma clang loop vectorize(disable)
2928 : #endif
2929 253882 : for (; n < nWordCount; n++)
2930 : {
2931 167140 : pDstData[n] = pSrcData[n];
2932 86742 : }
2933 : }
2934 : else
2935 : {
2936 989901 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2937 : nDstPixelStride, nWordCount);
2938 : }
2939 1076640 : }
2940 :
2941 : template <>
2942 449 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2943 : int nSrcPixelStride,
2944 : double *const CPL_RESTRICT pDstData,
2945 : int nDstPixelStride, GPtrDiff_t nWordCount)
2946 : {
2947 449 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2948 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2949 : {
2950 313 : decltype(nWordCount) n = 0;
2951 313 : const __m128i xmm_zero = _mm_setzero_si128();
2952 313 : GByte *CPL_RESTRICT pabyDstDataPtr =
2953 : reinterpret_cast<GByte *>(pDstData);
2954 829 : for (; n < nWordCount - 7; n += 8)
2955 : {
2956 516 : __m128i xmm = _mm_loadu_si128(
2957 516 : reinterpret_cast<const __m128i *>(pSrcData + n));
2958 516 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
2959 516 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
2960 :
2961 516 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2962 516 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2963 516 : xmm0 = _mm_srli_si128(xmm0, 8);
2964 516 : xmm1 = _mm_srli_si128(xmm1, 8);
2965 516 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2966 516 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2967 :
2968 516 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2969 : xmm0_low_d);
2970 : _mm_storeu_pd(
2971 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2972 : xmm0_high_d);
2973 : _mm_storeu_pd(
2974 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2975 : xmm1_low_d);
2976 : _mm_storeu_pd(
2977 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2978 : xmm1_high_d);
2979 : }
2980 : #if defined(__clang__)
2981 : #pragma clang loop vectorize(disable)
2982 : #endif
2983 1082 : for (; n < nWordCount; n++)
2984 : {
2985 769 : pDstData[n] = pSrcData[n];
2986 313 : }
2987 : }
2988 : else
2989 : {
2990 136 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2991 : nDstPixelStride, nWordCount);
2992 : }
2993 449 : }
2994 :
2995 : template <>
2996 4923280 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2997 : int nSrcPixelStride,
2998 : double *const CPL_RESTRICT pDstData,
2999 : int nDstPixelStride, GPtrDiff_t nWordCount)
3000 : {
3001 4923280 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
3002 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
3003 : {
3004 34874 : decltype(nWordCount) n = 0;
3005 34874 : GByte *CPL_RESTRICT pabyDstDataPtr =
3006 : reinterpret_cast<GByte *>(pDstData);
3007 403828 : for (; n < nWordCount - 7; n += 8)
3008 : {
3009 368954 : __m128i xmm = _mm_loadu_si128(
3010 368954 : reinterpret_cast<const __m128i *>(pSrcData + n));
3011 368954 : const auto sign = _mm_srai_epi16(xmm, 15);
3012 368954 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
3013 368954 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
3014 :
3015 368954 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
3016 368954 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
3017 368954 : xmm0 = _mm_srli_si128(xmm0, 8);
3018 368954 : xmm1 = _mm_srli_si128(xmm1, 8);
3019 368954 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
3020 368954 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
3021 :
3022 368954 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
3023 : xmm0_low_d);
3024 : _mm_storeu_pd(
3025 368954 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
3026 : xmm0_high_d);
3027 : _mm_storeu_pd(
3028 368954 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
3029 : xmm1_low_d);
3030 : _mm_storeu_pd(
3031 368954 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
3032 : xmm1_high_d);
3033 : }
3034 : #if defined(__clang__)
3035 : #pragma clang loop vectorize(disable)
3036 : #endif
3037 255934 : for (; n < nWordCount; n++)
3038 : {
3039 221060 : pDstData[n] = pSrcData[n];
3040 34874 : }
3041 : }
3042 : else
3043 : {
3044 4888400 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
3045 : nDstPixelStride, nWordCount);
3046 : }
3047 4923280 : }
3048 :
3049 : // ---- AVX2 helpers for int32 narrowing (runtime dispatch) ----
3050 :
3051 : #if defined(HAVE_AVX2_DISPATCH)
3052 : #if !defined(_MSC_VER)
3053 : __attribute__((target("avx2")))
3054 : #endif
3055 12723 : static void GDALCopyWordsInt32ToUInt8_AVX2(const int32_t *CPL_RESTRICT pSrc,
3056 : uint8_t *CPL_RESTRICT pDst,
3057 : GPtrDiff_t nWordCount)
3058 : {
3059 12723 : const __m256i permuteIdx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
3060 12723 : GPtrDiff_t n = 0;
3061 958119 : for (; n < nWordCount - 31; n += 32)
3062 : {
3063 : __m256i v0 =
3064 945396 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n));
3065 : __m256i v1 =
3066 945396 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n + 8));
3067 945396 : __m256i v2 = _mm256_loadu_si256(
3068 945396 : reinterpret_cast<const __m256i *>(pSrc + n + 16));
3069 945396 : __m256i v3 = _mm256_loadu_si256(
3070 945396 : reinterpret_cast<const __m256i *>(pSrc + n + 24));
3071 : // Clamp to [0, 255]
3072 : // Pack int32 -> int16 -> uint8, then fix cross-lane ordering
3073 945396 : __m256i ab16 = _mm256_packs_epi32(v0, v1);
3074 945396 : __m256i cd16 = _mm256_packs_epi32(v2, v3);
3075 945396 : __m256i bytes = _mm256_packus_epi16(ab16, cd16);
3076 945396 : bytes = _mm256_permutevar8x32_epi32(bytes, permuteIdx);
3077 945396 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(pDst + n), bytes);
3078 : }
3079 : #if defined(__clang__)
3080 : #pragma clang loop vectorize(disable)
3081 : #endif
3082 68589 : for (; n < nWordCount; n++)
3083 : {
3084 55866 : pDst[n] = static_cast<uint8_t>(std::clamp(pSrc[n], 0, 255));
3085 : }
3086 12723 : }
3087 :
3088 : #if !defined(_MSC_VER)
3089 : __attribute__((target("avx2")))
3090 : #endif
3091 10277 : static void GDALCopyWordsInt32ToUInt16_AVX2(const int32_t *CPL_RESTRICT pSrc,
3092 : uint16_t *CPL_RESTRICT pDst,
3093 : GPtrDiff_t nWordCount)
3094 : {
3095 : // _mm256_packus_epi32(v0, v1) produces per-lane interleaved result:
3096 : // [v0_lo4, v1_lo4, v0_hi4, v1_hi4] (in uint16 pairs per 32-bit lane)
3097 : // Permute to deinterleave: all v0 values first, then all v1 values
3098 10277 : const __m256i permuteIdx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
3099 10277 : GPtrDiff_t n = 0;
3100 670572 : for (; n < nWordCount - 15; n += 16)
3101 : {
3102 : __m256i v0 =
3103 660295 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n));
3104 : __m256i v1 =
3105 1320590 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n + 8));
3106 : // Clamp to [0, 65535]: _mm256_packus_epi32 saturates uint
3107 660295 : __m256i packed = _mm256_packus_epi32(v0, v1);
3108 : // Fix cross-lane interleave from packus
3109 660295 : packed = _mm256_permutevar8x32_epi32(packed, permuteIdx);
3110 660295 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(pDst + n), packed);
3111 : }
3112 : #if defined(__clang__)
3113 : #pragma clang loop vectorize(disable)
3114 : #endif
3115 163928 : for (; n < nWordCount; n++)
3116 : {
3117 153651 : pDst[n] = static_cast<uint16_t>(std::clamp(pSrc[n], 0, 65535));
3118 : }
3119 10277 : }
3120 : #endif // HAVE_AVX2_DISPATCH
3121 :
3122 : // ---- int32 -> uint8 with clamping to [0, 255] ----
3123 : template <>
3124 12837 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
3125 : int nSrcPixelStride,
3126 : uint8_t *const CPL_RESTRICT pDstData,
3127 : int nDstPixelStride, GPtrDiff_t nWordCount)
3128 : {
3129 12837 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
3130 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
3131 : {
3132 : #if defined(HAVE_AVX2_DISPATCH)
3133 12723 : if (CPLHaveRuntimeAVX2())
3134 : {
3135 12723 : GDALCopyWordsInt32ToUInt8_AVX2(pSrcData, pDstData, nWordCount);
3136 12723 : return;
3137 : }
3138 : #endif
3139 :
3140 : // SSE2 path: 16 pixels per iteration
3141 0 : decltype(nWordCount) n = 0;
3142 0 : for (; n < nWordCount - 15; n += 16)
3143 : {
3144 0 : __m128i v0 = _mm_loadu_si128(
3145 0 : reinterpret_cast<const __m128i *>(pSrcData + n));
3146 0 : __m128i v1 = _mm_loadu_si128(
3147 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
3148 0 : __m128i v2 = _mm_loadu_si128(
3149 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
3150 0 : __m128i v3 = _mm_loadu_si128(
3151 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 12));
3152 : // Pack int32->int16 with signed saturation to [-32768,32767] range
3153 0 : __m128i lo16 = _mm_packs_epi32(v0, v1);
3154 0 : __m128i hi16 = _mm_packs_epi32(v2, v3);
3155 : // Pack int16->uint8 with unsigned saturation to [0,255] range
3156 0 : __m128i bytes = _mm_packus_epi16(lo16, hi16);
3157 0 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), bytes);
3158 : }
3159 : #if defined(__clang__)
3160 : #pragma clang loop vectorize(disable)
3161 : #endif
3162 0 : for (; n < nWordCount; n++)
3163 : {
3164 0 : pDstData[n] = static_cast<uint8_t>(std::clamp(pSrcData[n], 0, 255));
3165 0 : }
3166 : }
3167 : else
3168 : {
3169 114 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
3170 : nDstPixelStride, nWordCount);
3171 : }
3172 : }
3173 :
3174 : // ---- int32 -> uint16 with clamping to [0, 65535] ----
3175 : template <>
3176 10322 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
3177 : int nSrcPixelStride,
3178 : uint16_t *const CPL_RESTRICT pDstData,
3179 : int nDstPixelStride, GPtrDiff_t nWordCount)
3180 : {
3181 10322 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
3182 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
3183 : {
3184 : #if defined(HAVE_AVX2_DISPATCH)
3185 10277 : if (CPLHaveRuntimeAVX2())
3186 : {
3187 10277 : GDALCopyWordsInt32ToUInt16_AVX2(pSrcData, pDstData, nWordCount);
3188 10277 : return;
3189 : }
3190 : #endif
3191 0 : decltype(nWordCount) n = 0;
3192 0 : for (; n < nWordCount - 15; n += 16)
3193 : {
3194 0 : __m128i v0 = _mm_loadu_si128(
3195 0 : reinterpret_cast<const __m128i *>(pSrcData + n));
3196 0 : __m128i v1 = _mm_loadu_si128(
3197 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
3198 0 : __m128i v2 = _mm_loadu_si128(
3199 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
3200 0 : __m128i v3 = _mm_loadu_si128(
3201 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 12));
3202 0 : const auto packed_lo = GDAL_mm_packus_epi32(v0, v1);
3203 0 : const auto packed_hi = GDAL_mm_packus_epi32(v2, v3);
3204 0 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n),
3205 : packed_lo);
3206 0 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
3207 : packed_hi);
3208 : }
3209 : #if defined(__clang__)
3210 : #pragma clang loop vectorize(disable)
3211 : #endif
3212 0 : for (; n < nWordCount; n++)
3213 : {
3214 0 : pDstData[n] =
3215 0 : static_cast<uint16_t>(std::clamp(pSrcData[n], 0, 65535));
3216 0 : }
3217 : }
3218 : else
3219 : {
3220 45 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
3221 : nDstPixelStride, nWordCount);
3222 : }
3223 : }
3224 :
3225 : // ---- int32 -> int16 with clamping to [-32768, 32767] ----
3226 : template <>
3227 98 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
3228 : int nSrcPixelStride,
3229 : int16_t *const CPL_RESTRICT pDstData,
3230 : int nDstPixelStride, GPtrDiff_t nWordCount)
3231 : {
3232 98 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
3233 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
3234 : {
3235 : // SSE2 path: 16 pixels per iteration
3236 43 : decltype(nWordCount) n = 0;
3237 268 : for (; n < nWordCount - 15; n += 16)
3238 : {
3239 450 : __m128i v0 = _mm_loadu_si128(
3240 225 : reinterpret_cast<const __m128i *>(pSrcData + n));
3241 450 : __m128i v1 = _mm_loadu_si128(
3242 225 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
3243 450 : __m128i v2 = _mm_loadu_si128(
3244 225 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
3245 225 : __m128i v3 = _mm_loadu_si128(
3246 225 : reinterpret_cast<const __m128i *>(pSrcData + n + 12));
3247 : // Pack int32->int16 with signed saturation to [-32768,32767] range
3248 225 : __m128i packed_lo = _mm_packs_epi32(v0, v1);
3249 225 : __m128i packed_hi = _mm_packs_epi32(v2, v3);
3250 225 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n),
3251 : packed_lo);
3252 225 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
3253 : packed_hi);
3254 : }
3255 : #if defined(__clang__)
3256 : #pragma clang loop vectorize(disable)
3257 : #endif
3258 191 : for (; n < nWordCount; n++)
3259 : {
3260 148 : pDstData[n] =
3261 148 : static_cast<int16_t>(std::clamp(pSrcData[n], -32768, 32767));
3262 43 : }
3263 : }
3264 : else
3265 : {
3266 55 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
3267 : nDstPixelStride, nWordCount);
3268 : }
3269 98 : }
3270 :
3271 : // ---- int16 -> uint8 with clamping to [0, 255] ----
3272 : template <>
3273 17428 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
3274 : int nSrcPixelStride,
3275 : uint8_t *const CPL_RESTRICT pDstData,
3276 : int nDstPixelStride, GPtrDiff_t nWordCount)
3277 : {
3278 17428 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
3279 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
3280 : {
3281 : // SSE2 path: 32 pixels per iteration
3282 17338 : decltype(nWordCount) n = 0;
3283 85649 : for (; n < nWordCount - 31; n += 32)
3284 : {
3285 136622 : __m128i v0 = _mm_loadu_si128(
3286 68311 : reinterpret_cast<const __m128i *>(pSrcData + n));
3287 136622 : __m128i v1 = _mm_loadu_si128(
3288 68311 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
3289 136622 : __m128i v2 = _mm_loadu_si128(
3290 68311 : reinterpret_cast<const __m128i *>(pSrcData + n + 16));
3291 68311 : __m128i v3 = _mm_loadu_si128(
3292 68311 : reinterpret_cast<const __m128i *>(pSrcData + n + 24));
3293 : // Pack int16->uint8 with unsigned saturation to [0, 255] range
3294 68311 : __m128i packed_lo = _mm_packus_epi16(v0, v1);
3295 68311 : __m128i packed_hi = _mm_packus_epi16(v2, v3);
3296 68311 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n),
3297 : packed_lo);
3298 68311 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
3299 : packed_hi);
3300 : }
3301 : #if defined(__clang__)
3302 : #pragma clang loop vectorize(disable)
3303 : #endif
3304 214741 : for (; n < nWordCount; n++)
3305 : {
3306 197403 : pDstData[n] =
3307 197403 : static_cast<uint8_t>(std::clamp<int>(pSrcData[n], 0, 255));
3308 17338 : }
3309 : }
3310 : else
3311 : {
3312 90 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
3313 : nDstPixelStride, nWordCount);
3314 : }
3315 17428 : }
3316 :
3317 : #endif // HAVE_SSE2
3318 :
// double -> GByte specialization of GDALCopyWordsT().
// Forwards all five arguments unchanged to GDALCopyWordsT_8atatime(), which
// is defined outside this section and performs the actual conversion.
3319 : template <>
3320 4437370 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3321 : int nSrcPixelStride,
3322 : GByte *const CPL_RESTRICT pDstData,
3323 : int nDstPixelStride, GPtrDiff_t nWordCount)
3324 : {
3325 4437370 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3326 : nDstPixelStride, nWordCount);
3327 4437370 : }
3328 :
// double -> GUInt16 specialization of GDALCopyWordsT().
// Forwards all five arguments unchanged to GDALCopyWordsT_8atatime(), which
// is defined outside this section and performs the actual conversion.
3329 : template <>
3330 38394 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3331 : int nSrcPixelStride,
3332 : GUInt16 *const CPL_RESTRICT pDstData,
3333 : int nDstPixelStride, GPtrDiff_t nWordCount)
3334 : {
3335 38394 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3336 : nDstPixelStride, nWordCount);
3337 38394 : }
3338 :
// float -> double specialization of GDALCopyWordsT().
// Forwards all five arguments unchanged to GDALCopyWordsT_8atatime(), which
// is defined outside this section and performs the actual conversion.
3339 : template <>
3340 55891 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3341 : int nSrcPixelStride,
3342 : double *const CPL_RESTRICT pDstData,
3343 : int nDstPixelStride, GPtrDiff_t nWordCount)
3344 : {
3345 55891 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3346 : nDstPixelStride, nWordCount);
3347 55891 : }
3348 :
// double -> float specialization of GDALCopyWordsT().
// Forwards all five arguments unchanged to GDALCopyWordsT_8atatime(), which
// is defined outside this section and performs the actual conversion.
3349 : template <>
3350 122866 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3351 : int nSrcPixelStride,
3352 : float *const CPL_RESTRICT pDstData,
3353 : int nDstPixelStride, GPtrDiff_t nWordCount)
3354 : {
3355 122866 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3356 : nDstPixelStride, nWordCount);
3357 122866 : }
3358 :
3359 : template <>
3360 412 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
3361 : int nSrcPixelStride,
3362 : float *const CPL_RESTRICT pDstData,
3363 : int nDstPixelStride, GPtrDiff_t nWordCount)
3364 : {
3365 412 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3366 : nDstPixelStride, nWordCount);
3367 412 : }
3368 :
3369 : template <>
3370 544 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
3371 : int nSrcPixelStride,
3372 : double *const CPL_RESTRICT pDstData,
3373 : int nDstPixelStride, GPtrDiff_t nWordCount)
3374 : {
3375 544 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3376 : nDstPixelStride, nWordCount);
3377 544 : }
3378 :
3379 : template <>
3380 324215 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3381 : int nSrcPixelStride,
3382 : GByte *const CPL_RESTRICT pDstData,
3383 : int nDstPixelStride, GPtrDiff_t nWordCount)
3384 : {
3385 324215 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3386 : nDstPixelStride, nWordCount);
3387 324215 : }
3388 :
3389 : template <>
3390 61 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3391 : int nSrcPixelStride,
3392 : GInt8 *const CPL_RESTRICT pDstData,
3393 : int nDstPixelStride, GPtrDiff_t nWordCount)
3394 : {
3395 61 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3396 : nDstPixelStride, nWordCount);
3397 61 : }
3398 :
3399 : template <>
3400 15791 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3401 : int nSrcPixelStride,
3402 : GInt16 *const CPL_RESTRICT pDstData,
3403 : int nDstPixelStride, GPtrDiff_t nWordCount)
3404 : {
3405 15791 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3406 : nDstPixelStride, nWordCount);
3407 15791 : }
3408 :
3409 : template <>
3410 61719 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3411 : int nSrcPixelStride,
3412 : GUInt16 *const CPL_RESTRICT pDstData,
3413 : int nDstPixelStride, GPtrDiff_t nWordCount)
3414 : {
3415 61719 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3416 : nDstPixelStride, nWordCount);
3417 61719 : }
3418 :
3419 : template <>
3420 43991 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3421 : int nSrcPixelStride,
3422 : GInt32 *const CPL_RESTRICT pDstData,
3423 : int nDstPixelStride, GPtrDiff_t nWordCount)
3424 : {
3425 43991 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3426 : nDstPixelStride, nWordCount);
3427 43991 : }
3428 :
3429 : template <>
3430 72 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3431 : int nSrcPixelStride,
3432 : GFloat16 *const CPL_RESTRICT pDstData,
3433 : int nDstPixelStride, GPtrDiff_t nWordCount)
3434 : {
3435 72 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3436 : nDstPixelStride, nWordCount);
3437 72 : }
3438 :
3439 : template <>
3440 63 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3441 : int nSrcPixelStride,
3442 : GFloat16 *const CPL_RESTRICT pDstData,
3443 : int nDstPixelStride, GPtrDiff_t nWordCount)
3444 : {
3445 63 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3446 : nDstPixelStride, nWordCount);
3447 63 : }
3448 :
3449 : /************************************************************************/
3450 : /* GDALCopyWordsComplexT() */
3451 : /************************************************************************/
3452 : /**
3453 : * Template function, used to copy data from pSrcData into buffer
3454 : * pDstData, with stride nSrcPixelStride in the source data and
3455 : * stride nDstPixelStride in the destination data. Deals with the
3456 : * complex case, where input is complex and output is complex.
3457 : *
3458 : * @param pSrcData the source data buffer
3459 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
3460 : * of interest.
3461 : * @param pDstData the destination buffer.
3462 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
3463 : * interest.
3464 : * @param nWordCount the total number of pixel words to copy
3465 : *
3466 : */
3467 : template <class Tin, class Tout>
3468 98788 : inline void GDALCopyWordsComplexT(const Tin *const CPL_RESTRICT pSrcData,
3469 : int nSrcPixelStride,
3470 : Tout *const CPL_RESTRICT pDstData,
3471 : int nDstPixelStride, GPtrDiff_t nWordCount)
3472 : {
3473 98788 : decltype(nWordCount) nDstOffset = 0;
3474 98788 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
3475 98788 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
3476 :
3477 5631239 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
3478 : {
3479 5532446 : const Tin *const pPixelIn =
3480 5532446 : reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
3481 5532446 : Tout *const pPixelOut =
3482 5532446 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
3483 :
3484 5532446 : GDALCopyWord(pPixelIn[0], pPixelOut[0]);
3485 5532446 : GDALCopyWord(pPixelIn[1], pPixelOut[1]);
3486 :
3487 5532446 : nDstOffset += nDstPixelStride;
3488 : }
3489 98788 : }
3490 :
3491 : /************************************************************************/
3492 : /* GDALCopyWordsComplexOutT() */
3493 : /************************************************************************/
3494 : /**
3495 : * Template function, used to copy data from pSrcData into buffer
3496 : * pDstData, with stride nSrcPixelStride in the source data and
3497 : * stride nDstPixelStride in the destination data. Deals with the
3498 : * case where the value is real coming in, but complex going out.
3499 : *
3500 : * @param pSrcData the source data buffer
3501 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
3502 : * of interest, in bytes.
3503 : * @param pDstData the destination buffer.
3504 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
3505 : * interest, in bytes.
3506 : * @param nWordCount the total number of pixel words to copy
3507 : *
3508 : */
3509 : template <class Tin, class Tout>
3510 4778 : inline void GDALCopyWordsComplexOutT(const Tin *const CPL_RESTRICT pSrcData,
3511 : int nSrcPixelStride,
3512 : Tout *const CPL_RESTRICT pDstData,
3513 : int nDstPixelStride, GPtrDiff_t nWordCount)
3514 : {
3515 4778 : decltype(nWordCount) nDstOffset = 0;
3516 :
3517 4778 : const Tout tOutZero = static_cast<Tout>(0);
3518 :
3519 4778 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
3520 4778 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
3521 :
3522 1190456 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
3523 : {
3524 1185678 : const Tin tValue =
3525 1185678 : *reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
3526 1185678 : Tout *const pPixelOut =
3527 1185678 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
3528 1185678 : GDALCopyWord(tValue, *pPixelOut);
3529 :
3530 1185678 : pPixelOut[1] = tOutZero;
3531 :
3532 1185678 : nDstOffset += nDstPixelStride;
3533 : }
3534 4778 : }
3535 :
3536 : /************************************************************************/
3537 : /* GDALCopyWordsFromT() */
3538 : /************************************************************************/
3539 : /**
3540 : * Template driver function. Given the input type T, call the appropriate
3541 : * GDALCopyWordsT function template for the desired output type. You should
3542 : * never call this function directly (call GDALCopyWords instead).
3543 : *
3544 : * @param pSrcData source data buffer
3545 : * @param nSrcPixelStride pixel stride in input buffer, in pixel words
3546 : * @param bInComplex input is complex
3547 : * @param pDstData destination data buffer
3548 : * @param eDstType destination data type
3549 : * @param nDstPixelStride pixel stride in output buffer, in pixel words
3550 : * @param nWordCount number of pixel words to be copied
3551 : */
3552 : template <class T>
3553 61314467 : inline void GDALCopyWordsFromT(const T *const CPL_RESTRICT pSrcData,
3554 : int nSrcPixelStride, bool bInComplex,
3555 : void *CPL_RESTRICT pDstData,
3556 : GDALDataType eDstType, int nDstPixelStride,
3557 : GPtrDiff_t nWordCount)
3558 : {
3559 61314467 : switch (eDstType)
3560 : {
3561 4805731 : case GDT_UInt8:
3562 4805731 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3563 : static_cast<unsigned char *>(pDstData),
3564 : nDstPixelStride, nWordCount);
3565 4805731 : break;
3566 1903 : case GDT_Int8:
3567 1903 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3568 : static_cast<signed char *>(pDstData),
3569 : nDstPixelStride, nWordCount);
3570 1903 : break;
3571 1143791 : case GDT_UInt16:
3572 1143791 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3573 : static_cast<unsigned short *>(pDstData),
3574 : nDstPixelStride, nWordCount);
3575 1143791 : break;
3576 4162744 : case GDT_Int16:
3577 4162744 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3578 : static_cast<short *>(pDstData), nDstPixelStride,
3579 : nWordCount);
3580 4162744 : break;
3581 23348 : case GDT_UInt32:
3582 23348 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3583 : static_cast<unsigned int *>(pDstData),
3584 : nDstPixelStride, nWordCount);
3585 23348 : break;
3586 29460579 : case GDT_Int32:
3587 29460579 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3588 : static_cast<int *>(pDstData), nDstPixelStride,
3589 : nWordCount);
3590 29460579 : break;
3591 1262 : case GDT_UInt64:
3592 1262 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3593 : static_cast<std::uint64_t *>(pDstData),
3594 : nDstPixelStride, nWordCount);
3595 1262 : break;
3596 6169 : case GDT_Int64:
3597 6169 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3598 : static_cast<std::int64_t *>(pDstData),
3599 : nDstPixelStride, nWordCount);
3600 6169 : break;
3601 999 : case GDT_Float16:
3602 999 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3603 : static_cast<GFloat16 *>(pDstData), nDstPixelStride,
3604 : nWordCount);
3605 999 : break;
3606 4216031 : case GDT_Float32:
3607 4216031 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3608 : static_cast<float *>(pDstData), nDstPixelStride,
3609 : nWordCount);
3610 4216031 : break;
3611 17388204 : case GDT_Float64:
3612 17388204 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3613 : static_cast<double *>(pDstData), nDstPixelStride,
3614 : nWordCount);
3615 17388204 : break;
3616 94432 : case GDT_CInt16:
3617 94432 : if (bInComplex)
3618 : {
3619 93170 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3620 : static_cast<short *>(pDstData),
3621 : nDstPixelStride, nWordCount);
3622 : }
3623 : else // input is not complex, so we need to promote to a complex
3624 : // buffer
3625 : {
3626 1262 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3627 : static_cast<short *>(pDstData),
3628 : nDstPixelStride, nWordCount);
3629 : }
3630 94432 : break;
3631 1357 : case GDT_CInt32:
3632 1357 : if (bInComplex)
3633 : {
3634 717 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3635 : static_cast<int *>(pDstData),
3636 : nDstPixelStride, nWordCount);
3637 : }
3638 : else // input is not complex, so we need to promote to a complex
3639 : // buffer
3640 : {
3641 640 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3642 : static_cast<int *>(pDstData),
3643 : nDstPixelStride, nWordCount);
3644 : }
3645 1357 : break;
3646 313 : case GDT_CFloat16:
3647 313 : if (bInComplex)
3648 : {
3649 48 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3650 : static_cast<GFloat16 *>(pDstData),
3651 : nDstPixelStride, nWordCount);
3652 : }
3653 : else // input is not complex, so we need to promote to a complex
3654 : // buffer
3655 : {
3656 265 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3657 : static_cast<GFloat16 *>(pDstData),
3658 : nDstPixelStride, nWordCount);
3659 : }
3660 313 : break;
3661 3924 : case GDT_CFloat32:
3662 3924 : if (bInComplex)
3663 : {
3664 3115 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3665 : static_cast<float *>(pDstData),
3666 : nDstPixelStride, nWordCount);
3667 : }
3668 : else // input is not complex, so we need to promote to a complex
3669 : // buffer
3670 : {
3671 809 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3672 : static_cast<float *>(pDstData),
3673 : nDstPixelStride, nWordCount);
3674 : }
3675 3924 : break;
3676 3540 : case GDT_CFloat64:
3677 3540 : if (bInComplex)
3678 : {
3679 1738 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3680 : static_cast<double *>(pDstData),
3681 : nDstPixelStride, nWordCount);
3682 : }
3683 : else // input is not complex, so we need to promote to a complex
3684 : // buffer
3685 : {
3686 1802 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3687 : static_cast<double *>(pDstData),
3688 : nDstPixelStride, nWordCount);
3689 : }
3690 3540 : break;
3691 0 : case GDT_Unknown:
3692 : case GDT_TypeCount:
3693 0 : CPLAssert(false);
3694 : }
3695 61314467 : }
3696 :
3697 : } // end anonymous namespace
3698 :
3699 : /************************************************************************/
3700 : /* GDALReplicateWord() */
3701 : /************************************************************************/
3702 :
3703 : template <class T>
3704 600457 : inline void GDALReplicateWordT(void *pDstData, int nDstPixelStride,
3705 : GPtrDiff_t nWordCount)
3706 : {
3707 600457 : const T valSet = *static_cast<const T *>(pDstData);
3708 600457 : if (nDstPixelStride == static_cast<int>(sizeof(T)))
3709 : {
3710 570643 : T *pDstPtr = static_cast<T *>(pDstData) + 1;
3711 32018999 : while (nWordCount >= 4)
3712 : {
3713 31448340 : nWordCount -= 4;
3714 31448340 : pDstPtr[0] = valSet;
3715 31448340 : pDstPtr[1] = valSet;
3716 31448340 : pDstPtr[2] = valSet;
3717 31448340 : pDstPtr[3] = valSet;
3718 31448340 : pDstPtr += 4;
3719 : }
3720 1476831 : while (nWordCount > 0)
3721 : {
3722 906188 : --nWordCount;
3723 906188 : *pDstPtr = valSet;
3724 906188 : pDstPtr++;
3725 : }
3726 : }
3727 : else
3728 : {
3729 29814 : GByte *pabyDstPtr = static_cast<GByte *>(pDstData) + nDstPixelStride;
3730 1106520 : while (nWordCount > 0)
3731 : {
3732 1076706 : --nWordCount;
3733 1076706 : *reinterpret_cast<T *>(pabyDstPtr) = valSet;
3734 1076706 : pabyDstPtr += nDstPixelStride;
3735 : }
3736 : }
3737 600457 : }
3738 :
3739 1080420 : static void GDALReplicateWord(const void *CPL_RESTRICT pSrcData,
3740 : GDALDataType eSrcType,
3741 : void *CPL_RESTRICT pDstData,
3742 : GDALDataType eDstType, int nDstPixelStride,
3743 : GPtrDiff_t nWordCount)
3744 : {
3745 : /* -----------------------------------------------------------------------
3746 : */
3747 : /* Special case when the source data is always the same value */
3748 : /* (for VRTSourcedRasterBand::IRasterIO and
3749 : * VRTDerivedRasterBand::IRasterIO*/
3750 : /* for example) */
3751 : /* -----------------------------------------------------------------------
3752 : */
3753 : // Let the general translation case do the necessary conversions
3754 : // on the first destination element.
3755 1080420 : GDALCopyWords64(pSrcData, eSrcType, 0, pDstData, eDstType, 0, 1);
3756 :
3757 : // Now copy the first element to the nWordCount - 1 following destination
3758 : // elements.
3759 1080420 : nWordCount--;
3760 1080420 : GByte *pabyDstWord = reinterpret_cast<GByte *>(pDstData) + nDstPixelStride;
3761 :
3762 1080420 : switch (eDstType)
3763 : {
3764 479871 : case GDT_UInt8:
3765 : case GDT_Int8:
3766 : {
3767 479871 : if (nDstPixelStride == 1)
3768 : {
3769 369938 : if (nWordCount > 0)
3770 369938 : memset(pabyDstWord,
3771 369938 : *reinterpret_cast<const GByte *>(pDstData),
3772 : nWordCount);
3773 : }
3774 : else
3775 : {
3776 109933 : GByte valSet = *reinterpret_cast<const GByte *>(pDstData);
3777 72866900 : while (nWordCount > 0)
3778 : {
3779 72757000 : --nWordCount;
3780 72757000 : *pabyDstWord = valSet;
3781 72757000 : pabyDstWord += nDstPixelStride;
3782 : }
3783 : }
3784 479871 : break;
3785 : }
3786 :
3787 : #define CASE_DUPLICATE_SIMPLE(enum_type, c_type) \
3788 : case enum_type: \
3789 : { \
3790 : GDALReplicateWordT<c_type>(pDstData, nDstPixelStride, nWordCount); \
3791 : break; \
3792 : }
3793 :
3794 34514 : CASE_DUPLICATE_SIMPLE(GDT_UInt16, GUInt16)
3795 202455 : CASE_DUPLICATE_SIMPLE(GDT_Int16, GInt16)
3796 74 : CASE_DUPLICATE_SIMPLE(GDT_UInt32, GUInt32)
3797 301585 : CASE_DUPLICATE_SIMPLE(GDT_Int32, GInt32)
3798 41 : CASE_DUPLICATE_SIMPLE(GDT_UInt64, std::uint64_t)
3799 1072 : CASE_DUPLICATE_SIMPLE(GDT_Int64, std::int64_t)
3800 2 : CASE_DUPLICATE_SIMPLE(GDT_Float16, GFloat16)
3801 52858 : CASE_DUPLICATE_SIMPLE(GDT_Float32, float)
3802 7856 : CASE_DUPLICATE_SIMPLE(GDT_Float64, double)
3803 :
3804 : #define CASE_DUPLICATE_COMPLEX(enum_type, c_type) \
3805 : case enum_type: \
3806 : { \
3807 : c_type valSet1 = reinterpret_cast<const c_type *>(pDstData)[0]; \
3808 : c_type valSet2 = reinterpret_cast<const c_type *>(pDstData)[1]; \
3809 : while (nWordCount > 0) \
3810 : { \
3811 : --nWordCount; \
3812 : reinterpret_cast<c_type *>(pabyDstWord)[0] = valSet1; \
3813 : reinterpret_cast<c_type *>(pabyDstWord)[1] = valSet2; \
3814 : pabyDstWord += nDstPixelStride; \
3815 : } \
3816 : break; \
3817 : }
3818 :
3819 784 : CASE_DUPLICATE_COMPLEX(GDT_CInt16, GInt16)
3820 784 : CASE_DUPLICATE_COMPLEX(GDT_CInt32, GInt32)
3821 6 : CASE_DUPLICATE_COMPLEX(GDT_CFloat16, GFloat16)
3822 790 : CASE_DUPLICATE_COMPLEX(GDT_CFloat32, float)
3823 790 : CASE_DUPLICATE_COMPLEX(GDT_CFloat64, double)
3824 :
3825 0 : case GDT_Unknown:
3826 : case GDT_TypeCount:
3827 0 : CPLAssert(false);
3828 : }
3829 1080420 : }
3830 :
3831 : /************************************************************************/
3832 : /* GDALUnrolledCopy() */
3833 : /************************************************************************/
3834 :
3835 : template <class T, int srcStride, int dstStride>
3836 : #if defined(__GNUC__) && defined(__AVX2__)
3837 : __attribute__((optimize("tree-vectorize")))
3838 : #endif
3839 3057256 : static inline void GDALUnrolledCopyGeneric(T *CPL_RESTRICT pDest,
3840 : const T *CPL_RESTRICT pSrc,
3841 : GPtrDiff_t nIters)
3842 : {
3843 : #if !(defined(__GNUC__) && defined(__AVX2__))
3844 3057256 : if (nIters >= 16)
3845 : {
3846 133743688 : for (GPtrDiff_t i = nIters / 16; i != 0; i--)
3847 : {
3848 130807113 : pDest[0 * dstStride] = pSrc[0 * srcStride];
3849 130807113 : pDest[1 * dstStride] = pSrc[1 * srcStride];
3850 130807113 : pDest[2 * dstStride] = pSrc[2 * srcStride];
3851 130807113 : pDest[3 * dstStride] = pSrc[3 * srcStride];
3852 130807113 : pDest[4 * dstStride] = pSrc[4 * srcStride];
3853 130807113 : pDest[5 * dstStride] = pSrc[5 * srcStride];
3854 130807113 : pDest[6 * dstStride] = pSrc[6 * srcStride];
3855 130807113 : pDest[7 * dstStride] = pSrc[7 * srcStride];
3856 130807113 : pDest[8 * dstStride] = pSrc[8 * srcStride];
3857 130807113 : pDest[9 * dstStride] = pSrc[9 * srcStride];
3858 130807113 : pDest[10 * dstStride] = pSrc[10 * srcStride];
3859 130807113 : pDest[11 * dstStride] = pSrc[11 * srcStride];
3860 130807113 : pDest[12 * dstStride] = pSrc[12 * srcStride];
3861 130807113 : pDest[13 * dstStride] = pSrc[13 * srcStride];
3862 130807113 : pDest[14 * dstStride] = pSrc[14 * srcStride];
3863 130807113 : pDest[15 * dstStride] = pSrc[15 * srcStride];
3864 130807113 : pDest += 16 * dstStride;
3865 130807113 : pSrc += 16 * srcStride;
3866 : }
3867 2936688 : nIters = nIters % 16;
3868 : }
3869 : #else
3870 : #pragma GCC unroll 4
3871 : #endif
3872 5217496 : for (GPtrDiff_t i = 0; i < nIters; i++)
3873 : {
3874 2160243 : pDest[i * dstStride] = *pSrc;
3875 2160243 : pSrc += srcStride;
3876 : }
3877 3057256 : }
3878 :
3879 : template <class T, int srcStride, int dstStride>
3880 3057256 : static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
3881 : const T *CPL_RESTRICT pSrc,
3882 : GPtrDiff_t nIters)
3883 : {
3884 3057256 : GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
3885 3057256 : }
3886 :
3887 : #if defined(__AVX2__) && defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
3888 : (defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS))
3889 :
3890 : template <>
3891 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
3892 : const GByte *CPL_RESTRICT pSrc,
3893 : GPtrDiff_t nIters)
3894 : {
3895 : if (nIters > 16)
3896 : {
3897 : // The SSSE3 variant is slightly faster than what the gcc autovectorizer
3898 : // generates
3899 : GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
3900 : }
3901 : else
3902 : {
3903 : for (GPtrDiff_t i = 0; i < nIters; i++)
3904 : {
3905 : pDest[i] = *pSrc;
3906 : pSrc += 3;
3907 : }
3908 : }
3909 : }
3910 :
3911 : #elif defined(HAVE_SSE2) && !(defined(__GNUC__) && defined(__AVX2__))
3912 :
3913 : template <>
3914 355218 : void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
3915 : const GByte *CPL_RESTRICT pSrc,
3916 : GPtrDiff_t nIters)
3917 : {
3918 355218 : decltype(nIters) i = 0;
3919 355218 : if (nIters > 16)
3920 : {
3921 195691 : const __m128i xmm_mask = _mm_set1_epi16(0xff);
3922 : // If we were sure that there would always be 1 trailing byte, we could
3923 : // check against nIters - 15
3924 3004490 : for (; i < nIters - 16; i += 16)
3925 : {
3926 : __m128i xmm0 =
3927 2808800 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
3928 : __m128i xmm1 =
3929 5617610 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
3930 : // Set higher 8bit of each int16 packed word to 0
3931 2808800 : xmm0 = _mm_and_si128(xmm0, xmm_mask);
3932 2808800 : xmm1 = _mm_and_si128(xmm1, xmm_mask);
3933 : // Pack int16 to uint8 and merge back both vector
3934 2808800 : xmm0 = _mm_packus_epi16(xmm0, xmm1);
3935 :
3936 : // Store result
3937 2808800 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
3938 :
3939 2808800 : pSrc += 2 * 16;
3940 : }
3941 : }
3942 4651210 : for (; i < nIters; i++)
3943 : {
3944 4295990 : pDest[i] = *pSrc;
3945 4295990 : pSrc += 2;
3946 : }
3947 355218 : }
3948 :
3949 1 : static void GDALUnrolledCopy_GByte_3_1_SSE2(GByte *CPL_RESTRICT pDest,
3950 : const GByte *CPL_RESTRICT pSrc,
3951 : GPtrDiff_t nIters)
3952 : {
3953 1 : decltype(nIters) i = 0;
3954 1 : const __m128i xmm_mask_ori = _mm_set_epi32(0, 0, 0, 255);
3955 : // If we were sure that there would always be 2 trailing bytes, we could
3956 : // check against nIters - 15
3957 2 : for (; i < nIters - 16; i += 16)
3958 : {
3959 : __m128i xmm0 =
3960 1 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
3961 : __m128i xmm1 =
3962 1 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
3963 : __m128i xmm2 =
3964 1 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
3965 :
3966 1 : auto xmm_mask0 = xmm_mask_ori;
3967 1 : auto xmm_mask1 = _mm_slli_si128(xmm_mask_ori, 6);
3968 1 : auto xmm_mask2 = _mm_slli_si128(xmm_mask_ori, 11);
3969 :
3970 1 : auto xmm = _mm_and_si128(xmm0, xmm_mask0);
3971 1 : auto xmm_res1 = _mm_and_si128(_mm_slli_si128(xmm1, 4), xmm_mask1);
3972 :
3973 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3974 1 : xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
3975 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3976 1 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3977 2 : xmm_res1 = _mm_or_si128(
3978 : xmm_res1, _mm_and_si128(_mm_slli_si128(xmm1, 2), xmm_mask1));
3979 :
3980 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3981 1 : xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
3982 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3983 2 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3984 1 : xmm_res1 = _mm_or_si128(xmm_res1, _mm_and_si128(xmm1, xmm_mask1));
3985 :
3986 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3987 1 : xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
3988 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3989 1 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3990 2 : xmm_res1 = _mm_or_si128(
3991 : xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 2), xmm_mask1));
3992 :
3993 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3994 1 : xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
3995 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3996 1 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3997 3 : xmm_res1 = _mm_or_si128(
3998 : xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 4), xmm_mask1));
3999 1 : xmm = _mm_or_si128(xmm, xmm_res1);
4000 :
4001 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
4002 1 : xmm0 = _mm_srli_si128(xmm0, 2);
4003 1 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
4004 :
4005 2 : xmm = _mm_or_si128(xmm,
4006 : _mm_and_si128(_mm_slli_si128(xmm2, 10), xmm_mask2));
4007 :
4008 1 : xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
4009 2 : xmm = _mm_or_si128(xmm,
4010 : _mm_and_si128(_mm_slli_si128(xmm2, 8), xmm_mask2));
4011 :
4012 1 : xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
4013 2 : xmm = _mm_or_si128(xmm,
4014 : _mm_and_si128(_mm_slli_si128(xmm2, 6), xmm_mask2));
4015 :
4016 1 : xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
4017 2 : xmm = _mm_or_si128(xmm,
4018 : _mm_and_si128(_mm_slli_si128(xmm2, 4), xmm_mask2));
4019 :
4020 1 : xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
4021 2 : xmm = _mm_or_si128(xmm,
4022 : _mm_and_si128(_mm_slli_si128(xmm2, 2), xmm_mask2));
4023 :
4024 1 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm);
4025 :
4026 1 : pSrc += 3 * 16;
4027 : }
4028 2 : for (; i < nIters; i++)
4029 : {
4030 1 : pDest[i] = *pSrc;
4031 1 : pSrc += 3;
4032 : }
4033 1 : }
4034 :
4035 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
4036 :
4037 : template <>
4038 193425 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
4039 : const GByte *CPL_RESTRICT pSrc,
4040 : GPtrDiff_t nIters)
4041 : {
4042 193425 : if (nIters > 16)
4043 : {
4044 187302 : if (CPLHaveRuntimeSSSE3())
4045 : {
4046 187301 : GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
4047 : }
4048 : else
4049 : {
4050 1 : GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
4051 : }
4052 : }
4053 : else
4054 : {
4055 20384 : for (GPtrDiff_t i = 0; i < nIters; i++)
4056 : {
4057 14261 : pDest[i] = *pSrc;
4058 14261 : pSrc += 3;
4059 : }
4060 : }
4061 193425 : }
4062 :
4063 : #else
4064 :
4065 : template <>
4066 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
4067 : const GByte *CPL_RESTRICT pSrc,
4068 : GPtrDiff_t nIters)
4069 : {
4070 : GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
4071 : }
4072 : #endif
4073 :
4074 : template <>
4075 332696 : void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
4076 : const GByte *CPL_RESTRICT pSrc,
4077 : GPtrDiff_t nIters)
4078 : {
4079 332696 : decltype(nIters) i = 0;
4080 332696 : if (nIters > 16)
4081 : {
4082 327399 : const __m128i xmm_mask = _mm_set1_epi32(0xff);
4083 : // If we were sure that there would always be 3 trailing bytes, we could
4084 : // check against nIters - 15
4085 28186800 : for (; i < nIters - 16; i += 16)
4086 : {
4087 : __m128i xmm0 =
4088 27859400 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
4089 : __m128i xmm1 =
4090 27859400 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
4091 : __m128i xmm2 =
4092 27859400 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
4093 : __m128i xmm3 =
4094 55718900 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 48));
4095 : // Set higher 24bit of each int32 packed word to 0
4096 27859400 : xmm0 = _mm_and_si128(xmm0, xmm_mask);
4097 27859400 : xmm1 = _mm_and_si128(xmm1, xmm_mask);
4098 27859400 : xmm2 = _mm_and_si128(xmm2, xmm_mask);
4099 27859400 : xmm3 = _mm_and_si128(xmm3, xmm_mask);
4100 : // Pack int32 to int16
4101 27859400 : xmm0 = _mm_packs_epi32(xmm0, xmm1);
4102 27859400 : xmm2 = _mm_packs_epi32(xmm2, xmm3);
4103 : // Pack int16 to uint8
4104 27859400 : xmm0 = _mm_packus_epi16(xmm0, xmm2);
4105 :
4106 : // Store result
4107 27859400 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
4108 :
4109 27859400 : pSrc += 4 * 16;
4110 : }
4111 : }
4112 5049340 : for (; i < nIters; i++)
4113 : {
4114 4716650 : pDest[i] = *pSrc;
4115 4716650 : pSrc += 4;
4116 : }
4117 332696 : }
4118 : #endif // HAVE_SSE2
4119 :
4120 : /************************************************************************/
4121 : /* GDALFastCopy() */
4122 : /************************************************************************/
4123 :
4124 : template <class T>
4125 40276000 : static inline void GDALFastCopy(T *CPL_RESTRICT pDest, int nDestStride,
4126 : const T *CPL_RESTRICT pSrc, int nSrcStride,
4127 : GPtrDiff_t nIters)
4128 : {
4129 40276000 : constexpr int sizeofT = static_cast<int>(sizeof(T));
4130 40276000 : if (nIters == 1)
4131 : {
4132 22545680 : *pDest = *pSrc;
4133 : }
4134 17730355 : else if (nDestStride == sizeofT)
4135 : {
4136 14599900 : if (nSrcStride == sizeofT)
4137 : {
4138 13508884 : memcpy(pDest, pSrc, nIters * sizeof(T));
4139 : }
4140 1091073 : else if (nSrcStride == 2 * sizeofT)
4141 : {
4142 358434 : GDALUnrolledCopy<T, 2, 1>(pDest, pSrc, nIters);
4143 : }
4144 732639 : else if (nSrcStride == 3 * sizeofT)
4145 : {
4146 290405 : GDALUnrolledCopy<T, 3, 1>(pDest, pSrc, nIters);
4147 : }
4148 442234 : else if (nSrcStride == 4 * sizeofT)
4149 : {
4150 336678 : GDALUnrolledCopy<T, 4, 1>(pDest, pSrc, nIters);
4151 : }
4152 : else
4153 : {
4154 17229290 : while (nIters-- > 0)
4155 : {
4156 17123750 : *pDest = *pSrc;
4157 17123750 : pSrc += nSrcStride / sizeofT;
4158 17123750 : pDest++;
4159 : }
4160 : }
4161 : }
4162 3130385 : else if (nSrcStride == sizeofT)
4163 : {
4164 3117389 : if (nDestStride == 2 * sizeofT)
4165 : {
4166 152788 : GDALUnrolledCopy<T, 1, 2>(pDest, pSrc, nIters);
4167 : }
4168 2964605 : else if (nDestStride == 3 * sizeofT)
4169 : {
4170 2136181 : GDALUnrolledCopy<T, 1, 3>(pDest, pSrc, nIters);
4171 : }
4172 828421 : else if (nDestStride == 4 * sizeofT)
4173 : {
4174 664109 : GDALUnrolledCopy<T, 1, 4>(pDest, pSrc, nIters);
4175 : }
4176 : else
4177 : {
4178 17169660 : while (nIters-- > 0)
4179 : {
4180 17005410 : *pDest = *pSrc;
4181 17005410 : pSrc++;
4182 17005410 : pDest += nDestStride / sizeofT;
4183 : }
4184 : }
4185 : }
4186 : else
4187 : {
4188 1220108 : while (nIters-- > 0)
4189 : {
4190 1207102 : *pDest = *pSrc;
4191 1207102 : pSrc += nSrcStride / sizeofT;
4192 1207102 : pDest += nDestStride / sizeofT;
4193 : }
4194 : }
4195 40276000 : }
4196 :
4197 : /************************************************************************/
4198 : /* GDALFastCopyByte() */
4199 : /************************************************************************/
4200 :
4201 326320 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
4202 : int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
4203 : int nDstPixelStride, GPtrDiff_t nWordCount)
4204 : {
4205 326320 : GDALFastCopy(pDstData, nDstPixelStride, pSrcData, nSrcPixelStride,
4206 : nWordCount);
4207 326320 : }
4208 :
4209 : /************************************************************************/
4210 : /* GDALCopyWords() */
4211 : /************************************************************************/
4212 :
4213 : /**
4214 : * Copy pixel words from buffer to buffer.
4215 : *
4216 : * @see GDALCopyWords64()
4217 : */
4218 80595400 : void CPL_STDCALL GDALCopyWords(const void *CPL_RESTRICT pSrcData,
4219 : GDALDataType eSrcType, int nSrcPixelStride,
4220 : void *CPL_RESTRICT pDstData,
4221 : GDALDataType eDstType, int nDstPixelStride,
4222 : int nWordCount)
4223 : {
4224 80595400 : GDALCopyWords64(pSrcData, eSrcType, nSrcPixelStride, pDstData, eDstType,
4225 : nDstPixelStride, nWordCount);
4226 80595400 : }
4227 :
4228 : /************************************************************************/
4229 : /* GDALCopyWords64() */
4230 : /************************************************************************/
4231 :
4232 : /**
4233 : * Copy pixel words from buffer to buffer.
4234 : *
4235 : * This function is used to copy pixel word values from one memory buffer
4236 : * to another, with support for conversion between data types, and differing
4237 : * step factors. The data type conversion is done using the following
4238 : * rules:
4239 : * <ul>
4240 : * <li>Values assigned to a lower range integer type are clipped. For
4241 : * instance assigning GDT_Int16 values to a GDT_UInt8 buffer will cause values
4242 : * less the 0 to be set to 0, and values larger than 255 to be set to 255.
4243 : * </li>
4244 : * <li>
4245 : * Assignment from floating point to integer rounds to closest integer.
4246 : * +Infinity is mapped to the largest integer. -Infinity is mapped to the
4247 : * smallest integer. NaN is mapped to 0.
4248 : * </li>
4249 : * <li>
4250 : * Assignment from non-complex to complex will result in the imaginary part
4251 : * being set to zero on output.
4252 : * </li>
4253 : * <li> Assignment from complex to
4254 : * non-complex will result in the complex portion being lost and the real
4255 : * component being preserved (<i>not magnitude!</i>).
4256 : * </li>
4257 : * </ul>
4258 : *
4259 : * No assumptions are made about the source or destination words occurring
4260 : * on word boundaries. It is assumed that all values are in native machine
4261 : * byte order.
4262 : *
4263 : * @param pSrcData Pointer to source data to be converted.
4264 : * @param eSrcType the source data type (see GDALDataType enum)
4265 : * @param nSrcPixelStride Source pixel stride (i.e. distance between 2 words),
4266 : * in bytes
4267 : * @param pDstData Pointer to buffer where destination data should go
4268 : * @param eDstType the destination data type (see GDALDataType enum)
4269 : * @param nDstPixelStride Destination pixel stride (i.e. distance between 2
4270 : * words), in bytes
4271 : * @param nWordCount number of words to be copied
4272 : *
4273 : * @note
4274 : * When adding a new data type to GDAL, you must do the following to
4275 : * support it properly within the GDALCopyWords function:
4276 : * 1. Add the data type to the switch on eSrcType in GDALCopyWords.
4277 : * This should invoke the appropriate GDALCopyWordsFromT wrapper.
4278 : * 2. Add the data type to the switch on eDstType in GDALCopyWordsFromT.
4279 : * This should call the appropriate GDALCopyWordsT template.
4280 : * 3. If appropriate, overload the appropriate CopyWord template in the
4281 : * above namespace. This will ensure that any conversion issues are
4282 : * handled (cases like the float -> int32 case, where the min/max)
4283 : * values are subject to roundoff error.
4284 : */
4285 :
4286 116985000 : void CPL_STDCALL GDALCopyWords64(const void *CPL_RESTRICT pSrcData,
4287 : GDALDataType eSrcType, int nSrcPixelStride,
4288 : void *CPL_RESTRICT pDstData,
4289 : GDALDataType eDstType, int nDstPixelStride,
4290 : GPtrDiff_t nWordCount)
4291 :
4292 : {
4293 : // On platforms where alignment matters, be careful
4294 116985000 : const int nSrcDataTypeSize = GDALGetDataTypeSizeBytes(eSrcType);
4295 116985000 : const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(eDstType);
4296 116985000 : if (CPL_UNLIKELY(nSrcDataTypeSize == 0 || nDstDataTypeSize == 0))
4297 : {
4298 2 : CPLError(CE_Failure, CPLE_NotSupported,
4299 : "GDALCopyWords64(): unsupported GDT_Unknown/GDT_TypeCount "
4300 : "argument");
4301 2 : return;
4302 : }
4303 116985000 : if (!(eSrcType == eDstType && nSrcPixelStride == nDstPixelStride) &&
4304 66415600 : ((reinterpret_cast<uintptr_t>(pSrcData) % nSrcDataTypeSize) != 0 ||
4305 66415600 : (reinterpret_cast<uintptr_t>(pDstData) % nDstDataTypeSize) != 0 ||
4306 66415200 : (nSrcPixelStride % nSrcDataTypeSize) != 0 ||
4307 66415100 : (nDstPixelStride % nDstDataTypeSize) != 0))
4308 : {
4309 905 : if (eSrcType == eDstType)
4310 : {
4311 34800 : for (decltype(nWordCount) i = 0; i < nWordCount; i++)
4312 : {
4313 34000 : memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
4314 : static_cast<const GByte *>(pSrcData) +
4315 34000 : nSrcPixelStride * i,
4316 : nDstDataTypeSize);
4317 : }
4318 : }
4319 : else
4320 : {
4321 210 : const auto getAlignedPtr = [](GByte *ptr, int align)
4322 : {
4323 : return ptr +
4324 210 : ((align - (reinterpret_cast<uintptr_t>(ptr) % align)) %
4325 210 : align);
4326 : };
4327 :
4328 : // The largest we need is for CFloat64 (16 bytes), so 32 bytes to
4329 : // be sure to get correctly aligned pointer.
4330 105 : constexpr size_t SIZEOF_CFLOAT64 = 2 * sizeof(double);
4331 : GByte abySrcBuffer[2 * SIZEOF_CFLOAT64];
4332 : GByte abyDstBuffer[2 * SIZEOF_CFLOAT64];
4333 : GByte *pabySrcBuffer =
4334 105 : getAlignedPtr(abySrcBuffer, nSrcDataTypeSize);
4335 : GByte *pabyDstBuffer =
4336 105 : getAlignedPtr(abyDstBuffer, nDstDataTypeSize);
4337 3360 : for (decltype(nWordCount) i = 0; i < nWordCount; i++)
4338 : {
4339 3255 : memcpy(pabySrcBuffer,
4340 : static_cast<const GByte *>(pSrcData) +
4341 3255 : nSrcPixelStride * i,
4342 : nSrcDataTypeSize);
4343 3255 : GDALCopyWords64(pabySrcBuffer, eSrcType, 0, pabyDstBuffer,
4344 : eDstType, 0, 1);
4345 3255 : memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
4346 : pabyDstBuffer, nDstDataTypeSize);
4347 : }
4348 : }
4349 905 : return;
4350 : }
4351 :
4352 : // Deal with the case where we're replicating a single word into the
4353 : // provided buffer
4354 116984000 : if (nSrcPixelStride == 0 && nWordCount > 1)
4355 : {
4356 1080420 : GDALReplicateWord(pSrcData, eSrcType, pDstData, eDstType,
4357 : nDstPixelStride, nWordCount);
4358 1080420 : return;
4359 : }
4360 :
4361 115904000 : if (eSrcType == eDstType)
4362 : {
4363 54851400 : if (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8)
4364 : {
4365 18153500 : GDALFastCopy(static_cast<GByte *>(pDstData), nDstPixelStride,
4366 : static_cast<const GByte *>(pSrcData), nSrcPixelStride,
4367 : nWordCount);
4368 18153500 : return;
4369 : }
4370 :
4371 36697800 : if (nSrcDataTypeSize == 2 && (nSrcPixelStride % 2) == 0 &&
4372 21796200 : (nDstPixelStride % 2) == 0)
4373 : {
4374 21796200 : GDALFastCopy(static_cast<short *>(pDstData), nDstPixelStride,
4375 : static_cast<const short *>(pSrcData), nSrcPixelStride,
4376 : nWordCount);
4377 21796200 : return;
4378 : }
4379 :
4380 14901700 : if (nWordCount == 1)
4381 : {
4382 : #if defined(CSA_BUILD) || defined(__COVERITY__)
4383 : // Avoid false positives...
4384 : memcpy(pDstData, pSrcData, nSrcDataTypeSize);
4385 : #else
4386 14411900 : if (nSrcDataTypeSize == 2)
4387 0 : memcpy(pDstData, pSrcData, 2);
4388 14411900 : else if (nSrcDataTypeSize == 4)
4389 13807600 : memcpy(pDstData, pSrcData, 4);
4390 604342 : else if (nSrcDataTypeSize == 8)
4391 587737 : memcpy(pDstData, pSrcData, 8);
4392 : else /* if( eSrcType == GDT_CFloat64 ) */
4393 16605 : memcpy(pDstData, pSrcData, 16);
4394 : #endif
4395 14411900 : return;
4396 : }
4397 :
4398 : // Let memcpy() handle the case where we're copying a packed buffer
4399 : // of pixels.
4400 489743 : if (nSrcPixelStride == nDstPixelStride)
4401 : {
4402 227899 : if (nSrcPixelStride == nSrcDataTypeSize)
4403 : {
4404 227819 : memcpy(pDstData, pSrcData, nWordCount * nSrcDataTypeSize);
4405 227819 : return;
4406 : }
4407 : }
4408 : }
4409 :
4410 : // Handle the more general case -- deals with conversion of data types
4411 : // directly.
4412 61314400 : switch (eSrcType)
4413 : {
4414 20306800 : case GDT_UInt8:
4415 20306800 : GDALCopyWordsFromT<unsigned char>(
4416 : static_cast<const unsigned char *>(pSrcData), nSrcPixelStride,
4417 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4418 20306800 : break;
4419 1786 : case GDT_Int8:
4420 1786 : GDALCopyWordsFromT<signed char>(
4421 : static_cast<const signed char *>(pSrcData), nSrcPixelStride,
4422 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4423 1786 : break;
4424 55545 : case GDT_UInt16:
4425 55545 : GDALCopyWordsFromT<unsigned short>(
4426 : static_cast<const unsigned short *>(pSrcData), nSrcPixelStride,
4427 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4428 55545 : break;
4429 6519830 : case GDT_Int16:
4430 6519830 : GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
4431 : nSrcPixelStride, false, pDstData,
4432 : eDstType, nDstPixelStride, nWordCount);
4433 6519830 : break;
4434 8262 : case GDT_UInt32:
4435 8262 : GDALCopyWordsFromT<unsigned int>(
4436 : static_cast<const unsigned int *>(pSrcData), nSrcPixelStride,
4437 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4438 8262 : break;
4439 12254800 : case GDT_Int32:
4440 12254800 : GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
4441 : nSrcPixelStride, false, pDstData, eDstType,
4442 : nDstPixelStride, nWordCount);
4443 12254800 : break;
4444 2205 : case GDT_UInt64:
4445 2205 : GDALCopyWordsFromT<std::uint64_t>(
4446 : static_cast<const std::uint64_t *>(pSrcData), nSrcPixelStride,
4447 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4448 2205 : break;
4449 11729 : case GDT_Int64:
4450 11729 : GDALCopyWordsFromT<std::int64_t>(
4451 : static_cast<const std::int64_t *>(pSrcData), nSrcPixelStride,
4452 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4453 11729 : break;
4454 1387 : case GDT_Float16:
4455 1387 : GDALCopyWordsFromT<GFloat16>(
4456 : static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, false,
4457 : pDstData, eDstType, nDstPixelStride, nWordCount);
4458 1387 : break;
4459 664998 : case GDT_Float32:
4460 664998 : GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
4461 : nSrcPixelStride, false, pDstData,
4462 : eDstType, nDstPixelStride, nWordCount);
4463 664998 : break;
4464 20726100 : case GDT_Float64:
4465 20726100 : GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
4466 : nSrcPixelStride, false, pDstData,
4467 : eDstType, nDstPixelStride, nWordCount);
4468 20726100 : break;
4469 478486 : case GDT_CInt16:
4470 478486 : GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
4471 : nSrcPixelStride, true, pDstData, eDstType,
4472 : nDstPixelStride, nWordCount);
4473 478486 : break;
4474 868 : case GDT_CInt32:
4475 868 : GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
4476 : nSrcPixelStride, true, pDstData, eDstType,
4477 : nDstPixelStride, nWordCount);
4478 868 : break;
4479 508 : case GDT_CFloat16:
4480 508 : GDALCopyWordsFromT<GFloat16>(
4481 : static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, true,
4482 : pDstData, eDstType, nDstPixelStride, nWordCount);
4483 508 : break;
4484 2437 : case GDT_CFloat32:
4485 2437 : GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
4486 : nSrcPixelStride, true, pDstData, eDstType,
4487 : nDstPixelStride, nWordCount);
4488 2437 : break;
4489 278618 : case GDT_CFloat64:
4490 278618 : GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
4491 : nSrcPixelStride, true, pDstData,
4492 : eDstType, nDstPixelStride, nWordCount);
4493 278618 : break;
4494 0 : case GDT_Unknown:
4495 : case GDT_TypeCount:
4496 0 : CPLAssert(false);
4497 : }
4498 : }
4499 :
4500 : /************************************************************************/
4501 : /* GDALCopyBits() */
4502 : /************************************************************************/
4503 :
4504 : /**
4505 : * Bitwise word copying.
4506 : *
4507 : * A function for moving sets of partial bytes around. Loosely
4508 : * speaking this is a bitwise analog to GDALCopyWords().
4509 : *
4510 : * It copies nStepCount "words" where each word is nBitCount bits long.
4511 : * The nSrcStep and nDstStep are the number of bits from the start of one
4512 : * word to the next (same as nBitCount if they are packed). The nSrcOffset
4513 : * and nDstOffset are the offset into the source and destination buffers
4514 : * to start at, also measured in bits.
4515 : *
4516 : * All bit offsets are assumed to start from the high order bit in a byte
4517 : * (i.e. most significant bit first). Currently this function is not very
4518 : * optimized, but it may be improved for some common cases in the future
4519 : * as needed.
4520 : *
4521 : * @param pabySrcData the source data buffer.
4522 : * @param nSrcOffset the offset (in bits) in pabySrcData to the start of the
4523 : * first word to copy.
4524 : * @param nSrcStep the offset in bits from the start one source word to the
4525 : * start of the next.
4526 : * @param pabyDstData the destination data buffer.
4527 : * @param nDstOffset the offset (in bits) in pabyDstData to the start of the
4528 : * first word to copy over.
4529 : * @param nDstStep the offset in bits from the start one word to the
4530 : * start of the next.
4531 : * @param nBitCount the number of bits in a word to be copied.
4532 : * @param nStepCount the number of words to copy.
4533 : */
4534 :
4535 0 : void GDALCopyBits(const GByte *pabySrcData, int nSrcOffset, int nSrcStep,
4536 : GByte *pabyDstData, int nDstOffset, int nDstStep,
4537 : int nBitCount, int nStepCount)
4538 :
4539 : {
4540 0 : VALIDATE_POINTER0(pabySrcData, "GDALCopyBits");
4541 :
4542 0 : for (int iStep = 0; iStep < nStepCount; iStep++)
4543 : {
4544 0 : for (int iBit = 0; iBit < nBitCount; iBit++)
4545 : {
4546 0 : if (pabySrcData[nSrcOffset >> 3] & (0x80 >> (nSrcOffset & 7)))
4547 0 : pabyDstData[nDstOffset >> 3] |= (0x80 >> (nDstOffset & 7));
4548 : else
4549 0 : pabyDstData[nDstOffset >> 3] &= ~(0x80 >> (nDstOffset & 7));
4550 :
4551 0 : nSrcOffset++;
4552 0 : nDstOffset++;
4553 : }
4554 :
4555 0 : nSrcOffset += (nSrcStep - nBitCount);
4556 0 : nDstOffset += (nDstStep - nBitCount);
4557 : }
4558 : }
4559 :
4560 : /************************************************************************/
4561 : /* GDALBandGetBestOverviewLevel() */
4562 : /************************************************************************/
4563 :
4564 525463 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand,
4565 : double dfTargetDownsamplingRatio,
4566 : double dfOversamplingThreshold)
4567 : {
4568 525463 : int iBestOvr = -1;
4569 525463 : double dfBestRatio = 0;
4570 525463 : const int nOvCount = poBand->GetOverviewCount();
4571 525463 : constexpr double EPSILON = 1e-1;
4572 1053620 : for (int iOvr = -1; iOvr < nOvCount; iOvr++)
4573 : {
4574 531090 : double dfOvrRatio = 1.0;
4575 531090 : GDALRasterBand *poOvrBand = nullptr;
4576 531090 : if (iOvr >= 0)
4577 : {
4578 5627 : poOvrBand = poBand->GetOverview(iOvr);
4579 11254 : if (poOvrBand == nullptr ||
4580 11254 : poOvrBand->GetXSize() > poBand->GetXSize() ||
4581 5627 : poOvrBand->GetYSize() > poBand->GetYSize())
4582 : {
4583 0 : continue;
4584 : }
4585 22508 : dfOvrRatio = std::min(static_cast<double>(poBand->GetXSize()) /
4586 5627 : poOvrBand->GetXSize(),
4587 11254 : static_cast<double>(poBand->GetYSize()) /
4588 11254 : poOvrBand->GetYSize());
4589 : }
4590 :
4591 : // Is it nearly the requested factor and better (lower) than
4592 : // the current best factor?
4593 : // Use an epsilon because of numerical instability.
4594 531197 : if (dfOvrRatio >=
4595 531090 : dfTargetDownsamplingRatio * dfOversamplingThreshold + EPSILON ||
4596 : dfOvrRatio <= dfBestRatio)
4597 : {
4598 107 : continue;
4599 : }
4600 :
4601 530983 : if (poOvrBand)
4602 : {
4603 : // Ignore AVERAGE_BIT2GRAYSCALE overviews.
4604 : const char *pszResampling =
4605 5520 : poOvrBand->GetMetadataItem("RESAMPLING");
4606 5520 : if (pszResampling != nullptr &&
4607 71 : STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2"))
4608 : {
4609 16 : continue;
4610 : }
4611 : }
4612 :
4613 530967 : iBestOvr = iOvr;
4614 530967 : dfBestRatio = dfOvrRatio;
4615 530967 : if (std::abs(dfTargetDownsamplingRatio - dfOvrRatio) < EPSILON)
4616 : {
4617 2938 : break;
4618 : }
4619 : }
4620 525463 : return iBestOvr;
4621 : }
4622 :
4623 : /************************************************************************/
4624 : /* GDALGetBestOverviewLevel() */
4625 : /* GDALBandGetBestOverviewLevel2() */
4626 : /* Returns the best overview level to satisfy the query or -1 if none */
4627 : /* Also updates nXOff, nYOff, nXSize, nYSize and psExtraArg when */
4628 : /* returning a valid overview level */
4629 : /************************************************************************/
4630 :
4631 0 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand, int &nXOff, int &nYOff,
4632 : int &nXSize, int &nYSize, int nBufXSize,
4633 : int nBufYSize)
4634 : {
4635 0 : return GDALBandGetBestOverviewLevel2(poBand, nXOff, nYOff, nXSize, nYSize,
4636 0 : nBufXSize, nBufYSize, nullptr);
4637 : }
4638 :
4639 525556 : int GDALBandGetBestOverviewLevel2(GDALRasterBand *poBand, int &nXOff,
4640 : int &nYOff, int &nXSize, int &nYSize,
4641 : int nBufXSize, int nBufYSize,
4642 : GDALRasterIOExtraArg *psExtraArg)
4643 : {
4644 525556 : if (psExtraArg != nullptr && psExtraArg->nVersion > 1 &&
4645 525556 : psExtraArg->bUseOnlyThisScale)
4646 109 : return -1;
4647 : /* -------------------------------------------------------------------- */
4648 : /* Compute the desired downsampling factor. It is */
4649 : /* based on the least reduced axis, and represents the number */
4650 : /* of source pixels to one destination pixel. */
4651 : /* -------------------------------------------------------------------- */
4652 525447 : const double dfDesiredDownsamplingFactor =
4653 525447 : ((nXSize / static_cast<double>(nBufXSize)) <
4654 363107 : (nYSize / static_cast<double>(nBufYSize)) ||
4655 : nBufYSize == 1)
4656 755372 : ? nXSize / static_cast<double>(nBufXSize)
4657 133182 : : nYSize / static_cast<double>(nBufYSize);
4658 :
4659 : /* -------------------------------------------------------------------- */
4660 : /* Find the overview level that largest downsampling factor (most */
4661 : /* downsampled) that is still less than (or only a little more) */
4662 : /* downsampled than the request. */
4663 : /* -------------------------------------------------------------------- */
4664 :
4665 : const char *pszOversampligThreshold =
4666 525447 : CPLGetConfigOption("GDAL_OVERVIEW_OVERSAMPLING_THRESHOLD", nullptr);
4667 :
4668 : // Cf https://github.com/OSGeo/gdal/pull/9040#issuecomment-1898524693
4669 : const double dfOversamplingThreshold =
4670 1050880 : pszOversampligThreshold ? CPLAtof(pszOversampligThreshold)
4671 525438 : : psExtraArg && psExtraArg->eResampleAlg != GRIORA_NearestNeighbour
4672 1050880 : ? 1.0
4673 525447 : : 1.2;
4674 525447 : const int iBestOvrLevel = GDALBandGetBestOverviewLevel(
4675 : poBand, dfDesiredDownsamplingFactor, dfOversamplingThreshold);
4676 :
4677 : /* -------------------------------------------------------------------- */
4678 : /* If we didn't find an overview that helps us, just return */
4679 : /* indicating failure and the full resolution image will be used. */
4680 : /* -------------------------------------------------------------------- */
4681 525447 : if (iBestOvrLevel < 0)
4682 522454 : return -1;
4683 2993 : const GDALRasterBand *poBestOverview = poBand->GetOverview(iBestOvrLevel);
4684 :
4685 : /* -------------------------------------------------------------------- */
4686 : /* Recompute the source window in terms of the selected */
4687 : /* overview. */
4688 : /* -------------------------------------------------------------------- */
4689 : const double dfXFactor =
4690 2993 : poBand->GetXSize() / static_cast<double>(poBestOverview->GetXSize());
4691 : const double dfYFactor =
4692 2993 : poBand->GetYSize() / static_cast<double>(poBestOverview->GetYSize());
4693 2993 : CPLDebug("GDAL", "Selecting overview %d x %d", poBestOverview->GetXSize(),
4694 : poBestOverview->GetYSize());
4695 :
4696 8979 : const int nOXOff = std::min(poBestOverview->GetXSize() - 1,
4697 2993 : static_cast<int>(nXOff / dfXFactor + 0.5));
4698 8979 : const int nOYOff = std::min(poBestOverview->GetYSize() - 1,
4699 2993 : static_cast<int>(nYOff / dfYFactor + 0.5));
4700 2993 : int nOXSize = std::max(1, static_cast<int>(nXSize / dfXFactor + 0.5));
4701 2993 : int nOYSize = std::max(1, static_cast<int>(nYSize / dfYFactor + 0.5));
4702 2993 : if (nOXOff + nOXSize > poBestOverview->GetXSize())
4703 0 : nOXSize = poBestOverview->GetXSize() - nOXOff;
4704 2993 : if (nOYOff + nOYSize > poBestOverview->GetYSize())
4705 2 : nOYSize = poBestOverview->GetYSize() - nOYOff;
4706 :
4707 2993 : if (psExtraArg)
4708 : {
4709 2993 : if (psExtraArg->bFloatingPointWindowValidity)
4710 : {
4711 117 : psExtraArg->dfXOff /= dfXFactor;
4712 117 : psExtraArg->dfXSize /= dfXFactor;
4713 117 : psExtraArg->dfYOff /= dfYFactor;
4714 117 : psExtraArg->dfYSize /= dfYFactor;
4715 : }
4716 2876 : else if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
4717 : {
4718 16 : psExtraArg->bFloatingPointWindowValidity = true;
4719 16 : psExtraArg->dfXOff = nXOff / dfXFactor;
4720 16 : psExtraArg->dfXSize = nXSize / dfXFactor;
4721 16 : psExtraArg->dfYOff = nYOff / dfYFactor;
4722 16 : psExtraArg->dfYSize = nYSize / dfYFactor;
4723 : }
4724 : }
4725 :
4726 2993 : nXOff = nOXOff;
4727 2993 : nYOff = nOYOff;
4728 2993 : nXSize = nOXSize;
4729 2993 : nYSize = nOYSize;
4730 :
4731 2993 : return iBestOvrLevel;
4732 : }
4733 :
4734 : /************************************************************************/
4735 : /* OverviewRasterIO() */
4736 : /* */
4737 : /* Special work function to utilize available overviews to */
4738 : /* more efficiently satisfy downsampled requests. It will */
4739 : /* return CE_Failure if there are no appropriate overviews */
4740 : /* available but it doesn't emit any error messages. */
4741 : /************************************************************************/
4742 :
4743 : //! @cond Doxygen_Suppress
4744 1 : CPLErr GDALRasterBand::OverviewRasterIO(
4745 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4746 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4747 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
4748 :
4749 : {
4750 : GDALRasterIOExtraArg sExtraArg;
4751 1 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4752 :
4753 1 : const int nOverview = GDALBandGetBestOverviewLevel2(
4754 : this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, &sExtraArg);
4755 1 : if (nOverview < 0)
4756 1 : return CE_Failure;
4757 :
4758 : /* -------------------------------------------------------------------- */
4759 : /* Recast the call in terms of the new raster layer. */
4760 : /* -------------------------------------------------------------------- */
4761 0 : GDALRasterBand *poOverviewBand = GetOverview(nOverview);
4762 0 : if (poOverviewBand == nullptr)
4763 0 : return CE_Failure;
4764 :
4765 0 : return poOverviewBand->RasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
4766 : pData, nBufXSize, nBufYSize, eBufType,
4767 0 : nPixelSpace, nLineSpace, &sExtraArg);
4768 : }
4769 :
4770 : /************************************************************************/
4771 : /* TryOverviewRasterIO() */
4772 : /************************************************************************/
4773 :
4774 362428 : CPLErr GDALRasterBand::TryOverviewRasterIO(
4775 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4776 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4777 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg,
4778 : int *pbTried)
4779 : {
4780 362428 : int nXOffMod = nXOff;
4781 362428 : int nYOffMod = nYOff;
4782 362428 : int nXSizeMod = nXSize;
4783 362428 : int nYSizeMod = nYSize;
4784 : GDALRasterIOExtraArg sExtraArg;
4785 :
4786 362428 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4787 :
4788 362428 : int iOvrLevel = GDALBandGetBestOverviewLevel2(
4789 : this, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize, nBufYSize,
4790 : &sExtraArg);
4791 :
4792 362428 : if (iOvrLevel >= 0)
4793 : {
4794 53 : GDALRasterBand *poOverviewBand = GetOverview(iOvrLevel);
4795 53 : if (poOverviewBand)
4796 : {
4797 53 : *pbTried = TRUE;
4798 53 : return poOverviewBand->RasterIO(
4799 : eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData,
4800 : nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
4801 53 : &sExtraArg);
4802 : }
4803 : }
4804 :
4805 362375 : *pbTried = FALSE;
4806 362375 : return CE_None;
4807 : }
4808 :
4809 : /************************************************************************/
4810 : /* TryOverviewRasterIO() */
4811 : /************************************************************************/
4812 :
4813 160153 : CPLErr GDALDataset::TryOverviewRasterIO(
4814 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4815 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4816 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
4817 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg,
4818 : int *pbTried)
4819 : {
4820 160153 : int nXOffMod = nXOff;
4821 160153 : int nYOffMod = nYOff;
4822 160153 : int nXSizeMod = nXSize;
4823 160153 : int nYSizeMod = nYSize;
4824 : GDALRasterIOExtraArg sExtraArg;
4825 160153 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4826 :
4827 320306 : int iOvrLevel = GDALBandGetBestOverviewLevel2(
4828 160153 : papoBands[0], nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize,
4829 : nBufYSize, &sExtraArg);
4830 :
4831 160196 : if (iOvrLevel >= 0 && papoBands[0]->GetOverview(iOvrLevel) != nullptr &&
4832 43 : papoBands[0]->GetOverview(iOvrLevel)->GetDataset() != nullptr)
4833 : {
4834 43 : *pbTried = TRUE;
4835 43 : return papoBands[0]->GetOverview(iOvrLevel)->GetDataset()->RasterIO(
4836 : eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData, nBufXSize,
4837 : nBufYSize, eBufType, nBandCount, panBandMap, nPixelSpace,
4838 43 : nLineSpace, nBandSpace, &sExtraArg);
4839 : }
4840 : else
4841 : {
4842 160110 : *pbTried = FALSE;
4843 160110 : return CE_None;
4844 : }
4845 : }
4846 :
4847 : /************************************************************************/
4848 : /* GetBestOverviewLevel() */
4849 : /* */
4850 : /* Returns the best overview level to satisfy the query or -1 if none */
4851 : /* Also updates nXOff, nYOff, nXSize, nYSize when returning a valid */
4852 : /* overview level */
4853 : /************************************************************************/
4854 :
4855 4 : static int GDALDatasetGetBestOverviewLevel(GDALDataset *poDS, int &nXOff,
4856 : int &nYOff, int &nXSize, int &nYSize,
4857 : int nBufXSize, int nBufYSize,
4858 : int nBandCount,
4859 : const int *panBandMap,
4860 : GDALRasterIOExtraArg *psExtraArg)
4861 : {
4862 4 : int nOverviewCount = 0;
4863 4 : GDALRasterBand *poFirstBand = nullptr;
4864 :
4865 : /* -------------------------------------------------------------------- */
4866 : /* Check that all bands have the same number of overviews and */
4867 : /* that they have all the same size and block dimensions */
4868 : /* -------------------------------------------------------------------- */
4869 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
4870 : {
4871 8 : GDALRasterBand *poBand = poDS->GetRasterBand(panBandMap[iBand]);
4872 8 : if (poBand == nullptr)
4873 0 : return -1;
4874 8 : if (iBand == 0)
4875 : {
4876 4 : poFirstBand = poBand;
4877 4 : nOverviewCount = poBand->GetOverviewCount();
4878 : }
4879 4 : else if (nOverviewCount != poBand->GetOverviewCount())
4880 : {
4881 0 : CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
4882 : "mismatched overview count, use std method.");
4883 0 : return -1;
4884 : }
4885 : else
4886 : {
4887 4 : for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
4888 : {
4889 0 : GDALRasterBand *poOvrBand = poBand->GetOverview(iOverview);
4890 : GDALRasterBand *poOvrFirstBand =
4891 0 : poFirstBand->GetOverview(iOverview);
4892 0 : if (poOvrBand == nullptr || poOvrFirstBand == nullptr)
4893 0 : continue;
4894 :
4895 0 : if (poOvrFirstBand->GetXSize() != poOvrBand->GetXSize() ||
4896 0 : poOvrFirstBand->GetYSize() != poOvrBand->GetYSize())
4897 : {
4898 0 : CPLDebug("GDAL",
4899 : "GDALDataset::GetBestOverviewLevel() ... "
4900 : "mismatched overview sizes, use std method.");
4901 0 : return -1;
4902 : }
4903 0 : int nBlockXSizeFirst = 0;
4904 0 : int nBlockYSizeFirst = 0;
4905 0 : poOvrFirstBand->GetBlockSize(&nBlockXSizeFirst,
4906 : &nBlockYSizeFirst);
4907 :
4908 0 : int nBlockXSizeCurrent = 0;
4909 0 : int nBlockYSizeCurrent = 0;
4910 0 : poOvrBand->GetBlockSize(&nBlockXSizeCurrent,
4911 : &nBlockYSizeCurrent);
4912 :
4913 0 : if (nBlockXSizeFirst != nBlockXSizeCurrent ||
4914 0 : nBlockYSizeFirst != nBlockYSizeCurrent)
4915 : {
4916 0 : CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
4917 : "mismatched block sizes, use std method.");
4918 0 : return -1;
4919 : }
4920 : }
4921 : }
4922 : }
4923 4 : if (poFirstBand == nullptr)
4924 0 : return -1;
4925 :
4926 4 : return GDALBandGetBestOverviewLevel2(poFirstBand, nXOff, nYOff, nXSize,
4927 : nYSize, nBufXSize, nBufYSize,
4928 4 : psExtraArg);
4929 : }
4930 :
4931 : /************************************************************************/
4932 : /* BlockBasedRasterIO() */
4933 : /* */
4934 : /* This convenience function implements a dataset level */
4935 : /* RasterIO() interface based on calling down to fetch blocks, */
4936 : /* much like the GDALRasterBand::IRasterIO(), but it handles */
4937 : /* all bands at once, so that a format driver that handles a */
4938 : /* request for different bands of the same block efficiently */
4939 : /* (i.e. without re-reading interleaved data) will do so efficiently. */
4940 : /* */
4941 : /* This method is intended to be called by an overridden */
4942 : /* IRasterIO() method in the driver specific GDALDataset */
4943 : /* derived class. */
4944 : /* */
4945 : /* Default internal implementation of RasterIO() ... utilizes */
4946 : /* the Block access methods to satisfy the request. This would */
4947 : /* normally only be overridden by formats with overviews. */
4948 : /* */
4949 : /* To keep things relatively simple, this method does not */
4950 : /* currently take advantage of some special cases addressed in */
4951 : /* GDALRasterBand::IRasterIO(), so it is likely best to only */
4952 : /* call it when you know it will help. That is in cases where */
4953 : /* data is at 1:1 to the buffer, and you know the driver is */
4954 : /* implementing interleaved IO efficiently on a block by block */
4955 : /* basis. Overviews will be used when possible. */
4956 : /************************************************************************/
4957 :
4958 65948 : CPLErr GDALDataset::BlockBasedRasterIO(
4959 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4960 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4961 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
4962 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
4963 :
4964 : {
4965 65948 : CPLAssert(nullptr != pData);
4966 :
4967 65948 : GByte **papabySrcBlock = nullptr;
4968 65948 : GDALRasterBlock *poBlock = nullptr;
4969 65948 : GDALRasterBlock **papoBlocks = nullptr;
4970 65948 : int nLBlockX = -1;
4971 65948 : int nLBlockY = -1;
4972 : int iBufYOff;
4973 : int iBufXOff;
4974 65948 : int nBlockXSize = 1;
4975 65948 : int nBlockYSize = 1;
4976 65948 : CPLErr eErr = CE_None;
4977 65948 : GDALDataType eDataType = GDT_UInt8;
4978 :
4979 65948 : const bool bUseIntegerRequestCoords =
4980 65991 : (!psExtraArg->bFloatingPointWindowValidity ||
4981 43 : (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
4982 41 : nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
4983 :
4984 : /* -------------------------------------------------------------------- */
4985 : /* Ensure that all bands share a common block size and data type. */
4986 : /* -------------------------------------------------------------------- */
4987 312052 : for (int iBand = 0; iBand < nBandCount; iBand++)
4988 : {
4989 246104 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
4990 :
4991 246104 : if (iBand == 0)
4992 : {
4993 65948 : poBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
4994 65948 : eDataType = poBand->GetRasterDataType();
4995 : }
4996 : else
4997 : {
4998 180156 : int nThisBlockXSize = 0;
4999 180156 : int nThisBlockYSize = 0;
5000 180156 : poBand->GetBlockSize(&nThisBlockXSize, &nThisBlockYSize);
5001 180156 : if (nThisBlockXSize != nBlockXSize ||
5002 180156 : nThisBlockYSize != nBlockYSize)
5003 : {
5004 0 : CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
5005 : "mismatched block sizes, use std method.");
5006 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
5007 : pData, nBufXSize, nBufYSize, eBufType,
5008 : nBandCount, panBandMap, nPixelSpace,
5009 0 : nLineSpace, nBandSpace, psExtraArg);
5010 : }
5011 :
5012 180156 : if (eDataType != poBand->GetRasterDataType() &&
5013 0 : (nXSize != nBufXSize || nYSize != nBufYSize))
5014 : {
5015 0 : CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
5016 : "mismatched band data types, use std method.");
5017 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
5018 : pData, nBufXSize, nBufYSize, eBufType,
5019 : nBandCount, panBandMap, nPixelSpace,
5020 0 : nLineSpace, nBandSpace, psExtraArg);
5021 : }
5022 : }
5023 : }
5024 :
5025 : /* ==================================================================== */
5026 : /* In this special case at full resolution we step through in */
5027 : /* blocks, turning the request over to the per-band */
5028 : /* IRasterIO(), but ensuring that all bands of one block are */
5029 : /* called before proceeding to the next. */
5030 : /* ==================================================================== */
5031 :
5032 65948 : if (nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
5033 : {
5034 : GDALRasterIOExtraArg sDummyExtraArg;
5035 65944 : INIT_RASTERIO_EXTRA_ARG(sDummyExtraArg);
5036 :
5037 65944 : int nChunkYSize = 0;
5038 65944 : int nChunkXSize = 0;
5039 :
5040 215391 : for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff += nChunkYSize)
5041 : {
5042 150463 : const int nChunkYOff = iBufYOff + nYOff;
5043 150463 : nChunkYSize = nBlockYSize - (nChunkYOff % nBlockYSize);
5044 150463 : if (nChunkYOff + nChunkYSize > nYOff + nYSize)
5045 60939 : nChunkYSize = (nYOff + nYSize) - nChunkYOff;
5046 :
5047 825901 : for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff += nChunkXSize)
5048 : {
5049 676453 : const int nChunkXOff = iBufXOff + nXOff;
5050 676453 : nChunkXSize = nBlockXSize - (nChunkXOff % nBlockXSize);
5051 676453 : if (nChunkXOff + nChunkXSize > nXOff + nXSize)
5052 71011 : nChunkXSize = (nXOff + nXSize) - nChunkXOff;
5053 :
5054 676453 : GByte *pabyChunkData =
5055 676453 : static_cast<GByte *>(pData) + iBufXOff * nPixelSpace +
5056 676453 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace;
5057 :
5058 3291120 : for (int iBand = 0; iBand < nBandCount; iBand++)
5059 : {
5060 2615680 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
5061 :
5062 5231370 : eErr = poBand->IRasterIO(
5063 : eRWFlag, nChunkXOff, nChunkYOff, nChunkXSize,
5064 : nChunkYSize,
5065 2615680 : pabyChunkData +
5066 2615680 : static_cast<GPtrDiff_t>(iBand) * nBandSpace,
5067 : nChunkXSize, nChunkYSize, eBufType, nPixelSpace,
5068 2615680 : nLineSpace, &sDummyExtraArg);
5069 2615680 : if (eErr != CE_None)
5070 1015 : return eErr;
5071 : }
5072 : }
5073 :
5074 168362 : if (psExtraArg->pfnProgress != nullptr &&
5075 18914 : !psExtraArg->pfnProgress(
5076 168362 : 1.0 * std::min(nBufYSize, iBufYOff + nChunkYSize) /
5077 : nBufYSize,
5078 : "", psExtraArg->pProgressData))
5079 : {
5080 1 : return CE_Failure;
5081 : }
5082 : }
5083 :
5084 64928 : return CE_None;
5085 : }
5086 :
5087 : /* Below code is not compatible with that case. It would need a complete */
5088 : /* separate code like done in GDALRasterBand::IRasterIO. */
5089 4 : if (eRWFlag == GF_Write && (nBufXSize < nXSize || nBufYSize < nYSize))
5090 : {
5091 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
5092 : nBufXSize, nBufYSize, eBufType, nBandCount,
5093 : panBandMap, nPixelSpace, nLineSpace,
5094 0 : nBandSpace, psExtraArg);
5095 : }
5096 :
5097 : /* We could have a smarter implementation, but that will do for now */
5098 4 : if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
5099 0 : (nBufXSize != nXSize || nBufYSize != nYSize))
5100 : {
5101 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
5102 : nBufXSize, nBufYSize, eBufType, nBandCount,
5103 : panBandMap, nPixelSpace, nLineSpace,
5104 0 : nBandSpace, psExtraArg);
5105 : }
5106 :
5107 : /* ==================================================================== */
5108 : /* Loop reading required source blocks to satisfy output */
5109 : /* request. This is the most general implementation. */
5110 : /* ==================================================================== */
5111 :
5112 4 : const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
5113 :
5114 : papabySrcBlock =
5115 4 : static_cast<GByte **>(CPLCalloc(sizeof(GByte *), nBandCount));
5116 : papoBlocks =
5117 4 : static_cast<GDALRasterBlock **>(CPLCalloc(sizeof(void *), nBandCount));
5118 :
5119 : /* -------------------------------------------------------------------- */
5120 : /* Select an overview level if appropriate. */
5121 : /* -------------------------------------------------------------------- */
5122 :
5123 : GDALRasterIOExtraArg sExtraArg;
5124 4 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
5125 4 : const int nOverviewLevel = GDALDatasetGetBestOverviewLevel(
5126 : this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nBandCount,
5127 : panBandMap, &sExtraArg);
5128 4 : if (nOverviewLevel >= 0)
5129 : {
5130 2 : GetRasterBand(panBandMap[0])
5131 2 : ->GetOverview(nOverviewLevel)
5132 2 : ->GetBlockSize(&nBlockXSize, &nBlockYSize);
5133 : }
5134 :
5135 4 : double dfXOff = nXOff;
5136 4 : double dfYOff = nYOff;
5137 4 : double dfXSize = nXSize;
5138 4 : double dfYSize = nYSize;
5139 4 : if (sExtraArg.bFloatingPointWindowValidity)
5140 : {
5141 2 : dfXOff = sExtraArg.dfXOff;
5142 2 : dfYOff = sExtraArg.dfYOff;
5143 2 : dfXSize = sExtraArg.dfXSize;
5144 2 : dfYSize = sExtraArg.dfYSize;
5145 : }
5146 :
5147 : /* -------------------------------------------------------------------- */
5148 : /* Compute stepping increment. */
5149 : /* -------------------------------------------------------------------- */
5150 4 : const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
5151 4 : const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
5152 :
5153 4 : constexpr double EPS = 1e-10;
5154 : /* -------------------------------------------------------------------- */
5155 : /* Loop over buffer computing source locations. */
5156 : /* -------------------------------------------------------------------- */
5157 36 : for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
5158 : {
5159 : GPtrDiff_t iSrcOffset;
5160 :
5161 : // Add small epsilon to avoid some numeric precision issues.
5162 32 : const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
5163 32 : const int iSrcY = static_cast<int>(std::min(
5164 32 : std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
5165 :
5166 32 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
5167 : static_cast<GPtrDiff_t>(nLineSpace);
5168 :
5169 302 : for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff++)
5170 : {
5171 270 : const double dfSrcX = (iBufXOff + 0.5) * dfSrcXInc + dfXOff + EPS;
5172 270 : const int iSrcX = static_cast<int>(std::min(
5173 270 : std::max(0.0, dfSrcX), static_cast<double>(nRasterXSize - 1)));
5174 :
5175 : // FIXME: this code likely doesn't work if the dirty block gets
5176 : // flushed to disk before being completely written. In the meantime,
5177 : // bJustInitialize should probably be set to FALSE even if it is not
5178 : // ideal performance wise, and for lossy compression
5179 :
5180 : /* --------------------------------------------------------------------
5181 : */
5182 : /* Ensure we have the appropriate block loaded. */
5183 : /* --------------------------------------------------------------------
5184 : */
5185 270 : if (iSrcX < nLBlockX * nBlockXSize ||
5186 270 : iSrcX - nBlockXSize >= nLBlockX * nBlockXSize ||
5187 266 : iSrcY < nLBlockY * nBlockYSize ||
5188 266 : iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
5189 : {
5190 4 : nLBlockX = iSrcX / nBlockXSize;
5191 4 : nLBlockY = iSrcY / nBlockYSize;
5192 :
5193 4 : const bool bJustInitialize =
5194 0 : eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
5195 0 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
5196 4 : nXOff <= nLBlockX * nBlockXSize &&
5197 0 : nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
5198 : /*bool bMemZeroBuffer = FALSE;
5199 : if( eRWFlag == GF_Write && !bJustInitialize &&
5200 : nXOff <= nLBlockX * nBlockXSize &&
5201 : nYOff <= nLBlockY * nBlockYSize &&
5202 : (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
5203 : (nXOff + nXSize == GetRasterXSize() &&
5204 : (nLBlockX+1) * nBlockXSize > GetRasterXSize())) &&
5205 : (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
5206 : (nYOff + nYSize == GetRasterYSize() &&
5207 : (nLBlockY+1) * nBlockYSize > GetRasterYSize())) )
5208 : {
5209 : bJustInitialize = TRUE;
5210 : bMemZeroBuffer = TRUE;
5211 : }*/
5212 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
5213 : {
5214 8 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
5215 8 : if (nOverviewLevel >= 0)
5216 2 : poBand = poBand->GetOverview(nOverviewLevel);
5217 16 : poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY,
5218 8 : bJustInitialize);
5219 8 : if (poBlock == nullptr)
5220 : {
5221 0 : eErr = CE_Failure;
5222 0 : goto CleanupAndReturn;
5223 : }
5224 :
5225 8 : if (eRWFlag == GF_Write)
5226 0 : poBlock->MarkDirty();
5227 :
5228 8 : if (papoBlocks[iBand] != nullptr)
5229 0 : papoBlocks[iBand]->DropLock();
5230 :
5231 8 : papoBlocks[iBand] = poBlock;
5232 :
5233 8 : papabySrcBlock[iBand] =
5234 8 : static_cast<GByte *>(poBlock->GetDataRef());
5235 : /*if( bMemZeroBuffer )
5236 : {
5237 : memset(papabySrcBlock[iBand], 0,
5238 : static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
5239 : * nBlockYSize);
5240 : }*/
5241 : }
5242 : }
5243 :
5244 : /* --------------------------------------------------------------------
5245 : */
5246 : /* Copy over this pixel of data. */
5247 : /* --------------------------------------------------------------------
5248 : */
5249 270 : iSrcOffset = (static_cast<GPtrDiff_t>(iSrcX) -
5250 270 : static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
5251 270 : (static_cast<GPtrDiff_t>(iSrcY) -
5252 270 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
5253 270 : nBlockXSize) *
5254 270 : nBandDataSize;
5255 :
5256 980 : for (int iBand = 0; iBand < nBandCount; iBand++)
5257 : {
5258 710 : GByte *pabySrcBlock = papabySrcBlock[iBand];
5259 710 : GPtrDiff_t iBandBufOffset =
5260 710 : iBufOffset + static_cast<GPtrDiff_t>(iBand) *
5261 : static_cast<GPtrDiff_t>(nBandSpace);
5262 :
5263 710 : if (eDataType == eBufType)
5264 : {
5265 710 : if (eRWFlag == GF_Read)
5266 710 : memcpy(static_cast<GByte *>(pData) + iBandBufOffset,
5267 710 : pabySrcBlock + iSrcOffset, nBandDataSize);
5268 : else
5269 0 : memcpy(pabySrcBlock + iSrcOffset,
5270 : static_cast<const GByte *>(pData) +
5271 0 : iBandBufOffset,
5272 : nBandDataSize);
5273 : }
5274 : else
5275 : {
5276 : /* type to type conversion ... ouch, this is expensive way
5277 : of handling single words */
5278 :
5279 0 : if (eRWFlag == GF_Read)
5280 0 : GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
5281 : static_cast<GByte *>(pData) +
5282 0 : iBandBufOffset,
5283 : eBufType, 0, 1);
5284 : else
5285 0 : GDALCopyWords64(static_cast<const GByte *>(pData) +
5286 0 : iBandBufOffset,
5287 0 : eBufType, 0, pabySrcBlock + iSrcOffset,
5288 : eDataType, 0, 1);
5289 : }
5290 : }
5291 :
5292 270 : iBufOffset += static_cast<int>(nPixelSpace);
5293 : }
5294 : }
5295 :
5296 : /* -------------------------------------------------------------------- */
5297 : /* CleanupAndReturn. */
5298 : /* -------------------------------------------------------------------- */
5299 4 : CleanupAndReturn:
5300 4 : CPLFree(papabySrcBlock);
5301 4 : if (papoBlocks != nullptr)
5302 : {
5303 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
5304 : {
5305 8 : if (papoBlocks[iBand] != nullptr)
5306 8 : papoBlocks[iBand]->DropLock();
5307 : }
5308 4 : CPLFree(papoBlocks);
5309 : }
5310 :
5311 4 : return eErr;
5312 : }
5313 :
5314 : //! @endcond
5315 :
5316 : /************************************************************************/
5317 : /* GDALCopyWholeRasterGetSwathSize() */
5318 : /************************************************************************/
5319 :
5320 3405 : static void GDALCopyWholeRasterGetSwathSize(GDALRasterBand *poSrcPrototypeBand,
5321 : GDALRasterBand *poDstPrototypeBand,
5322 : int nBandCount,
5323 : int bDstIsCompressed,
5324 : int bInterleave, int *pnSwathCols,
5325 : int *pnSwathLines)
5326 : {
5327 3405 : GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
5328 3405 : int nSrcBlockXSize = 0;
5329 3405 : int nSrcBlockYSize = 0;
5330 3405 : int nBlockXSize = 0;
5331 3405 : int nBlockYSize = 0;
5332 :
5333 3405 : int nXSize = poSrcPrototypeBand->GetXSize();
5334 3405 : int nYSize = poSrcPrototypeBand->GetYSize();
5335 :
5336 3405 : poSrcPrototypeBand->GetBlockSize(&nSrcBlockXSize, &nSrcBlockYSize);
5337 3405 : poDstPrototypeBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
5338 :
5339 3405 : const int nMaxBlockXSize = std::max(nBlockXSize, nSrcBlockXSize);
5340 3405 : const int nMaxBlockYSize = std::max(nBlockYSize, nSrcBlockYSize);
5341 :
5342 3405 : int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5343 3405 : if (bInterleave)
5344 585 : nPixelSize *= nBandCount;
5345 :
5346 : // aim for one row of blocks. Do not settle for less.
5347 3405 : int nSwathCols = nXSize;
5348 3405 : int nSwathLines = nMaxBlockYSize;
5349 :
5350 : const char *pszSrcCompression =
5351 3405 : poSrcPrototypeBand->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
5352 3405 : if (pszSrcCompression == nullptr)
5353 : {
5354 3385 : auto poSrcDS = poSrcPrototypeBand->GetDataset();
5355 3385 : if (poSrcDS)
5356 : pszSrcCompression =
5357 3379 : poSrcDS->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
5358 : }
5359 :
5360 : /* -------------------------------------------------------------------- */
5361 : /* What will our swath size be? */
5362 : /* -------------------------------------------------------------------- */
5363 : // When writing interleaved data in a compressed format, we want to be sure
5364 : // that each block will only be written once, so the swath size must not be
5365 : // greater than the block cache.
5366 3405 : const char *pszSwathSize = CPLGetConfigOption("GDAL_SWATH_SIZE", nullptr);
5367 : int nTargetSwathSize;
5368 3405 : if (pszSwathSize != nullptr)
5369 0 : nTargetSwathSize = static_cast<int>(
5370 0 : std::min(GIntBig(INT_MAX), CPLAtoGIntBig(pszSwathSize)));
5371 : else
5372 : {
5373 : // As a default, take one 1/4 of the cache size.
5374 3405 : nTargetSwathSize = static_cast<int>(
5375 3405 : std::min(GIntBig(INT_MAX), GDALGetCacheMax64() / 4));
5376 :
5377 : // but if the minimum idal swath buf size is less, then go for it to
5378 : // avoid unnecessarily abusing RAM usage.
5379 : // but try to use 10 MB at least.
5380 3405 : GIntBig nIdealSwathBufSize =
5381 3405 : static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize;
5382 3405 : int nMinTargetSwathSize = 10 * 1000 * 1000;
5383 :
5384 3405 : if ((poSrcPrototypeBand->GetSuggestedBlockAccessPattern() &
5385 3405 : GSBAP_LARGEST_CHUNK_POSSIBLE) != 0)
5386 : {
5387 1 : nMinTargetSwathSize = nTargetSwathSize;
5388 : }
5389 :
5390 3405 : if (nIdealSwathBufSize < nTargetSwathSize &&
5391 3395 : nIdealSwathBufSize < nMinTargetSwathSize)
5392 : {
5393 3392 : nIdealSwathBufSize = nMinTargetSwathSize;
5394 : }
5395 :
5396 3405 : if (pszSrcCompression != nullptr &&
5397 185 : EQUAL(pszSrcCompression, "JPEG2000") &&
5398 0 : (!bDstIsCompressed || ((nSrcBlockXSize % nBlockXSize) == 0 &&
5399 0 : (nSrcBlockYSize % nBlockYSize) == 0)))
5400 : {
5401 2 : nIdealSwathBufSize =
5402 4 : std::max(nIdealSwathBufSize, static_cast<GIntBig>(nSwathCols) *
5403 2 : nSrcBlockYSize * nPixelSize);
5404 : }
5405 3405 : if (nTargetSwathSize > nIdealSwathBufSize)
5406 3392 : nTargetSwathSize = static_cast<int>(
5407 3392 : std::min(GIntBig(INT_MAX), nIdealSwathBufSize));
5408 : }
5409 :
5410 3405 : if (nTargetSwathSize < 1000000)
5411 8 : nTargetSwathSize = 1000000;
5412 :
5413 : /* But let's check that */
5414 3626 : if (bDstIsCompressed && bInterleave &&
5415 221 : nTargetSwathSize > GDALGetCacheMax64())
5416 : {
5417 0 : CPLError(CE_Warning, CPLE_AppDefined,
5418 : "When translating into a compressed interleave format, "
5419 : "the block cache size (" CPL_FRMT_GIB ") "
5420 : "should be at least the size of the swath (%d) "
5421 : "(GDAL_SWATH_SIZE config. option)",
5422 : GDALGetCacheMax64(), nTargetSwathSize);
5423 : }
5424 :
5425 : #define IS_DIVIDER_OF(x, y) ((y) % (x) == 0)
5426 : #define ROUND_TO(x, y) (((x) / (y)) * (y))
5427 :
5428 : // if both input and output datasets are tiled, that the tile dimensions
5429 : // are "compatible", try to stick to a swath dimension that is a multiple
5430 : // of input and output block dimensions.
5431 3405 : if (nBlockXSize != nXSize && nSrcBlockXSize != nXSize &&
5432 47 : IS_DIVIDER_OF(nBlockXSize, nMaxBlockXSize) &&
5433 47 : IS_DIVIDER_OF(nSrcBlockXSize, nMaxBlockXSize) &&
5434 47 : IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
5435 47 : IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
5436 : {
5437 47 : if (static_cast<GIntBig>(nMaxBlockXSize) * nMaxBlockYSize *
5438 47 : nPixelSize <=
5439 47 : static_cast<GIntBig>(nTargetSwathSize))
5440 : {
5441 47 : nSwathCols = nTargetSwathSize / (nMaxBlockYSize * nPixelSize);
5442 47 : nSwathCols = ROUND_TO(nSwathCols, nMaxBlockXSize);
5443 47 : if (nSwathCols == 0)
5444 0 : nSwathCols = nMaxBlockXSize;
5445 47 : if (nSwathCols > nXSize)
5446 45 : nSwathCols = nXSize;
5447 47 : nSwathLines = nMaxBlockYSize;
5448 :
5449 47 : if (static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize >
5450 47 : static_cast<GIntBig>(nTargetSwathSize))
5451 : {
5452 0 : nSwathCols = nXSize;
5453 0 : nSwathLines = nBlockYSize;
5454 : }
5455 : }
5456 : }
5457 :
5458 3405 : const GIntBig nMemoryPerCol = static_cast<GIntBig>(nSwathCols) * nPixelSize;
5459 3405 : const GIntBig nSwathBufSize = nMemoryPerCol * nSwathLines;
5460 3405 : if (nSwathBufSize > static_cast<GIntBig>(nTargetSwathSize))
5461 : {
5462 1 : nSwathLines = static_cast<int>(nTargetSwathSize / nMemoryPerCol);
5463 1 : if (nSwathLines == 0)
5464 1 : nSwathLines = 1;
5465 :
5466 1 : CPLDebug(
5467 : "GDAL",
5468 : "GDALCopyWholeRasterGetSwathSize(): adjusting to %d line swath "
5469 : "since requirement (" CPL_FRMT_GIB " bytes) exceed target swath "
5470 : "size (%d bytes) (GDAL_SWATH_SIZE config. option)",
5471 1 : nSwathLines, nBlockYSize * nMemoryPerCol, nTargetSwathSize);
5472 : }
5473 : // If we are processing single scans, try to handle several at once.
5474 : // If we are handling swaths already, only grow the swath if a row
5475 : // of blocks is substantially less than our target buffer size.
5476 3404 : else if (nSwathLines == 1 ||
5477 2850 : nMemoryPerCol * nSwathLines <
5478 2850 : static_cast<GIntBig>(nTargetSwathSize) / 10)
5479 : {
5480 3376 : nSwathLines = std::min(
5481 : nYSize,
5482 3376 : std::max(1, static_cast<int>(nTargetSwathSize / nMemoryPerCol)));
5483 :
5484 : /* If possible try to align to source and target block height */
5485 3376 : if ((nSwathLines % nMaxBlockYSize) != 0 &&
5486 273 : nSwathLines > nMaxBlockYSize &&
5487 273 : IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
5488 244 : IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
5489 217 : nSwathLines = ROUND_TO(nSwathLines, nMaxBlockYSize);
5490 : }
5491 :
5492 3405 : if (pszSrcCompression != nullptr && EQUAL(pszSrcCompression, "JPEG2000") &&
5493 0 : (!bDstIsCompressed || (IS_DIVIDER_OF(nBlockXSize, nSrcBlockXSize) &&
5494 0 : IS_DIVIDER_OF(nBlockYSize, nSrcBlockYSize))))
5495 : {
5496 : // Typical use case: converting from Pleaiades that is 2048x2048 tiled.
5497 2 : if (nSwathLines < nSrcBlockYSize)
5498 : {
5499 0 : nSwathLines = nSrcBlockYSize;
5500 :
5501 : // Number of pixels that can be read/write simultaneously.
5502 0 : nSwathCols = nTargetSwathSize / (nSrcBlockXSize * nPixelSize);
5503 0 : nSwathCols = ROUND_TO(nSwathCols, nSrcBlockXSize);
5504 0 : if (nSwathCols == 0)
5505 0 : nSwathCols = nSrcBlockXSize;
5506 0 : if (nSwathCols > nXSize)
5507 0 : nSwathCols = nXSize;
5508 :
5509 0 : CPLDebug(
5510 : "GDAL",
5511 : "GDALCopyWholeRasterGetSwathSize(): because of compression and "
5512 : "too high block, "
5513 : "use partial width at one time");
5514 : }
5515 2 : else if ((nSwathLines % nSrcBlockYSize) != 0)
5516 : {
5517 : /* Round on a multiple of nSrcBlockYSize */
5518 0 : nSwathLines = ROUND_TO(nSwathLines, nSrcBlockYSize);
5519 0 : CPLDebug(
5520 : "GDAL",
5521 : "GDALCopyWholeRasterGetSwathSize(): because of compression, "
5522 : "round nSwathLines to block height : %d",
5523 : nSwathLines);
5524 : }
5525 : }
5526 3403 : else if (bDstIsCompressed)
5527 : {
5528 426 : if (nSwathLines < nBlockYSize)
5529 : {
5530 153 : nSwathLines = nBlockYSize;
5531 :
5532 : // Number of pixels that can be read/write simultaneously.
5533 153 : nSwathCols = nTargetSwathSize / (nSwathLines * nPixelSize);
5534 153 : nSwathCols = ROUND_TO(nSwathCols, nBlockXSize);
5535 153 : if (nSwathCols == 0)
5536 0 : nSwathCols = nBlockXSize;
5537 153 : if (nSwathCols > nXSize)
5538 153 : nSwathCols = nXSize;
5539 :
5540 153 : CPLDebug(
5541 : "GDAL",
5542 : "GDALCopyWholeRasterGetSwathSize(): because of compression and "
5543 : "too high block, "
5544 : "use partial width at one time");
5545 : }
5546 273 : else if ((nSwathLines % nBlockYSize) != 0)
5547 : {
5548 : // Round on a multiple of nBlockYSize.
5549 9 : nSwathLines = ROUND_TO(nSwathLines, nBlockYSize);
5550 9 : CPLDebug(
5551 : "GDAL",
5552 : "GDALCopyWholeRasterGetSwathSize(): because of compression, "
5553 : "round nSwathLines to block height : %d",
5554 : nSwathLines);
5555 : }
5556 : }
5557 :
5558 3405 : *pnSwathCols = nSwathCols;
5559 3405 : *pnSwathLines = nSwathLines;
5560 3405 : }
5561 :
5562 : /************************************************************************/
5563 : /* GDALDatasetCopyWholeRaster() */
5564 : /************************************************************************/
5565 :
5566 : /**
5567 : * \brief Copy all dataset raster data.
5568 : *
5569 : * This function copies the complete raster contents of one dataset to
5570 : * another similarly configured dataset. The source and destination
5571 : * dataset must have the same number of bands, and the same width
5572 : * and height. The bands do not have to have the same data type.
5573 : *
5574 : * This function is primarily intended to support implementation of
5575 : * driver specific CreateCopy() functions. It implements efficient copying,
5576 : * in particular "chunking" the copy in substantial blocks and, if appropriate,
5577 : * performing the transfer in a pixel interleaved fashion.
5578 : *
5579 : * Currently the only papszOptions values supported are:
5580 : * <ul>
5581 : * <li>"INTERLEAVE=PIXEL/BAND" to force pixel (resp. band) interleaved read and
5582 : * write access pattern (this does not modify the layout of the destination
5583 : * data)</li>
5584 : * <li>"COMPRESSED=YES" to force alignment on target dataset block
5585 : * sizes to achieve best compression.</li>
5586 : * <li>"SKIP_HOLES=YES" to skip chunks
5587 : * for which GDALGetDataCoverageStatus() returns GDAL_DATA_COVERAGE_STATUS_EMPTY
5588 : * (GDAL >= 2.2)</li>
5589 : * </ul>
5590 : * More options may be supported in the future.
5591 : *
5592 : * @param hSrcDS the source dataset
5593 : * @param hDstDS the destination dataset
5594 : * @param papszOptions transfer hints in "StringList" Name=Value format.
5595 : * @param pfnProgress progress reporting function.
5596 : * @param pProgressData callback data for progress function.
5597 : *
5598 : * @return CE_None on success, or CE_Failure on failure.
5599 : */
5600 :
5601 3377 : CPLErr CPL_STDCALL GDALDatasetCopyWholeRaster(GDALDatasetH hSrcDS,
5602 : GDALDatasetH hDstDS,
5603 : CSLConstList papszOptions,
5604 : GDALProgressFunc pfnProgress,
5605 : void *pProgressData)
5606 :
5607 : {
5608 3377 : VALIDATE_POINTER1(hSrcDS, "GDALDatasetCopyWholeRaster", CE_Failure);
5609 3377 : VALIDATE_POINTER1(hDstDS, "GDALDatasetCopyWholeRaster", CE_Failure);
5610 :
5611 3377 : GDALDataset *poSrcDS = GDALDataset::FromHandle(hSrcDS);
5612 3377 : GDALDataset *poDstDS = GDALDataset::FromHandle(hDstDS);
5613 :
5614 3377 : if (pfnProgress == nullptr)
5615 0 : pfnProgress = GDALDummyProgress;
5616 :
5617 : /* -------------------------------------------------------------------- */
5618 : /* Confirm the datasets match in size and band counts. */
5619 : /* -------------------------------------------------------------------- */
5620 3377 : const int nXSize = poDstDS->GetRasterXSize();
5621 3377 : const int nYSize = poDstDS->GetRasterYSize();
5622 3377 : const int nBandCount = poDstDS->GetRasterCount();
5623 :
5624 3377 : if (poSrcDS->GetRasterXSize() != nXSize ||
5625 6754 : poSrcDS->GetRasterYSize() != nYSize ||
5626 3377 : poSrcDS->GetRasterCount() != nBandCount)
5627 : {
5628 0 : CPLError(CE_Failure, CPLE_AppDefined,
5629 : "Input and output dataset sizes or band counts do not\n"
5630 : "match in GDALDatasetCopyWholeRaster()");
5631 0 : return CE_Failure;
5632 : }
5633 :
5634 : /* -------------------------------------------------------------------- */
5635 : /* Report preliminary (0) progress. */
5636 : /* -------------------------------------------------------------------- */
5637 3377 : if (!pfnProgress(0.0, nullptr, pProgressData))
5638 : {
5639 1 : CPLError(CE_Failure, CPLE_UserInterrupt,
5640 : "User terminated CreateCopy()");
5641 1 : return CE_Failure;
5642 : }
5643 :
5644 : /* -------------------------------------------------------------------- */
5645 : /* Get our prototype band, and assume the others are similarly */
5646 : /* configured. */
5647 : /* -------------------------------------------------------------------- */
5648 3376 : if (nBandCount == 0)
5649 0 : return CE_None;
5650 :
5651 3376 : GDALRasterBand *poSrcPrototypeBand = poSrcDS->GetRasterBand(1);
5652 3376 : GDALRasterBand *poDstPrototypeBand = poDstDS->GetRasterBand(1);
5653 3376 : GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
5654 :
5655 : /* -------------------------------------------------------------------- */
5656 : /* Do we want to try and do the operation in a pixel */
5657 : /* interleaved fashion? */
5658 : /* -------------------------------------------------------------------- */
5659 3376 : bool bInterleave = false;
5660 : const char *pszInterleave =
5661 3376 : poSrcDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
5662 3376 : if (pszInterleave != nullptr &&
5663 2965 : (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
5664 209 : bInterleave = true;
5665 :
5666 3376 : pszInterleave = poDstDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
5667 3376 : if (pszInterleave != nullptr &&
5668 2910 : (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
5669 530 : bInterleave = true;
5670 :
5671 3376 : pszInterleave = CSLFetchNameValue(papszOptions, "INTERLEAVE");
5672 3376 : if (pszInterleave != nullptr && EQUAL(pszInterleave, "PIXEL"))
5673 5 : bInterleave = true;
5674 3371 : else if (pszInterleave != nullptr && EQUAL(pszInterleave, "BAND"))
5675 13 : bInterleave = false;
5676 : // attributes is specific to the TileDB driver
5677 3358 : else if (pszInterleave != nullptr && EQUAL(pszInterleave, "ATTRIBUTES"))
5678 4 : bInterleave = true;
5679 3354 : else if (pszInterleave != nullptr)
5680 : {
5681 0 : CPLError(CE_Warning, CPLE_NotSupported,
5682 : "Unsupported value for option INTERLEAVE");
5683 : }
5684 :
5685 : // If the destination is compressed, we must try to write blocks just once,
5686 : // to save disk space (GTiff case for example), and to avoid data loss
5687 : // (JPEG compression for example).
5688 3376 : bool bDstIsCompressed = false;
5689 : const char *pszDstCompressed =
5690 3376 : CSLFetchNameValue(papszOptions, "COMPRESSED");
5691 3376 : if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
5692 400 : bDstIsCompressed = true;
5693 :
5694 : /* -------------------------------------------------------------------- */
5695 : /* What will our swath size be? */
5696 : /* -------------------------------------------------------------------- */
5697 :
5698 3376 : int nSwathCols = 0;
5699 3376 : int nSwathLines = 0;
5700 3376 : GDALCopyWholeRasterGetSwathSize(poSrcPrototypeBand, poDstPrototypeBand,
5701 : nBandCount, bDstIsCompressed, bInterleave,
5702 : &nSwathCols, &nSwathLines);
5703 :
5704 3376 : int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5705 3376 : if (bInterleave)
5706 585 : nPixelSize *= nBandCount;
5707 :
5708 3376 : void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
5709 3376 : if (pSwathBuf == nullptr)
5710 : {
5711 0 : return CE_Failure;
5712 : }
5713 :
5714 3376 : CPLDebug("GDAL",
5715 : "GDALDatasetCopyWholeRaster(): %d*%d swaths, bInterleave=%d",
5716 : nSwathCols, nSwathLines, static_cast<int>(bInterleave));
5717 :
5718 : // Advise the source raster that we are going to read it completely
5719 : // Note: this might already have been done by GDALCreateCopy() in the
5720 : // likely case this function is indirectly called by it
5721 3376 : poSrcDS->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nBandCount,
5722 3376 : nullptr, nullptr);
5723 :
5724 : /* ==================================================================== */
5725 : /* Band oriented (uninterleaved) case. */
5726 : /* ==================================================================== */
5727 3376 : CPLErr eErr = CE_None;
5728 : const bool bCheckHoles =
5729 3376 : CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
5730 :
5731 3376 : if (!bInterleave)
5732 : {
5733 : GDALRasterIOExtraArg sExtraArg;
5734 2791 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5735 2791 : CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress); // to make cppcheck happy
5736 :
5737 8373 : const GIntBig nTotalBlocks = static_cast<GIntBig>(nBandCount) *
5738 2791 : DIV_ROUND_UP(nYSize, nSwathLines) *
5739 2791 : DIV_ROUND_UP(nXSize, nSwathCols);
5740 2791 : GIntBig nBlocksDone = 0;
5741 :
5742 8025 : for (int iBand = 0; iBand < nBandCount && eErr == CE_None; iBand++)
5743 : {
5744 5234 : int nBand = iBand + 1;
5745 :
5746 10731 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5747 : {
5748 5497 : int nThisLines = nSwathLines;
5749 :
5750 5497 : if (iY + nThisLines > nYSize)
5751 375 : nThisLines = nYSize - iY;
5752 :
5753 10994 : for (int iX = 0; iX < nXSize && eErr == CE_None;
5754 5497 : iX += nSwathCols)
5755 : {
5756 5497 : int nThisCols = nSwathCols;
5757 :
5758 5497 : if (iX + nThisCols > nXSize)
5759 0 : nThisCols = nXSize - iX;
5760 :
5761 5497 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5762 5497 : if (bCheckHoles)
5763 : {
5764 : nStatus = poSrcDS->GetRasterBand(nBand)
5765 3779 : ->GetDataCoverageStatus(
5766 : iX, iY, nThisCols, nThisLines,
5767 : GDAL_DATA_COVERAGE_STATUS_DATA);
5768 : }
5769 5497 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5770 : {
5771 5493 : sExtraArg.pfnProgress = GDALScaledProgress;
5772 10986 : sExtraArg.pProgressData = GDALCreateScaledProgress(
5773 5493 : nBlocksDone / static_cast<double>(nTotalBlocks),
5774 5493 : (nBlocksDone + 0.5) /
5775 5493 : static_cast<double>(nTotalBlocks),
5776 : pfnProgress, pProgressData);
5777 5493 : if (sExtraArg.pProgressData == nullptr)
5778 1688 : sExtraArg.pfnProgress = nullptr;
5779 :
5780 5493 : eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
5781 : nThisLines, pSwathBuf,
5782 : nThisCols, nThisLines, eDT, 1,
5783 : &nBand, 0, 0, 0, &sExtraArg);
5784 :
5785 5493 : GDALDestroyScaledProgress(sExtraArg.pProgressData);
5786 :
5787 5493 : if (eErr == CE_None)
5788 5485 : eErr = poDstDS->RasterIO(
5789 : GF_Write, iX, iY, nThisCols, nThisLines,
5790 : pSwathBuf, nThisCols, nThisLines, eDT, 1,
5791 : &nBand, 0, 0, 0, nullptr);
5792 : }
5793 :
5794 5497 : nBlocksDone++;
5795 10951 : if (eErr == CE_None &&
5796 5454 : !pfnProgress(nBlocksDone /
5797 5454 : static_cast<double>(nTotalBlocks),
5798 : nullptr, pProgressData))
5799 : {
5800 2 : eErr = CE_Failure;
5801 2 : CPLError(CE_Failure, CPLE_UserInterrupt,
5802 : "User terminated CreateCopy()");
5803 : }
5804 : }
5805 : }
5806 : }
5807 : }
5808 :
5809 : /* ==================================================================== */
5810 : /* Pixel interleaved case. */
5811 : /* ==================================================================== */
5812 : else /* if( bInterleave ) */
5813 : {
5814 : GDALRasterIOExtraArg sExtraArg;
5815 585 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5816 585 : CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress); // to make cppcheck happy
5817 :
5818 585 : const GIntBig nTotalBlocks =
5819 585 : static_cast<GIntBig>(DIV_ROUND_UP(nYSize, nSwathLines)) *
5820 585 : DIV_ROUND_UP(nXSize, nSwathCols);
5821 585 : GIntBig nBlocksDone = 0;
5822 :
5823 1392 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5824 : {
5825 807 : int nThisLines = nSwathLines;
5826 :
5827 807 : if (iY + nThisLines > nYSize)
5828 198 : nThisLines = nYSize - iY;
5829 :
5830 1619 : for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
5831 : {
5832 812 : int nThisCols = nSwathCols;
5833 :
5834 812 : if (iX + nThisCols > nXSize)
5835 3 : nThisCols = nXSize - iX;
5836 :
5837 812 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5838 812 : if (bCheckHoles)
5839 : {
5840 553 : nStatus = 0;
5841 606 : for (int iBand = 0; iBand < nBandCount; iBand++)
5842 : {
5843 587 : nStatus |= poSrcDS->GetRasterBand(iBand + 1)
5844 587 : ->GetDataCoverageStatus(
5845 : iX, iY, nThisCols, nThisLines,
5846 : GDAL_DATA_COVERAGE_STATUS_DATA);
5847 587 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5848 534 : break;
5849 : }
5850 : }
5851 812 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5852 : {
5853 793 : sExtraArg.pfnProgress = GDALScaledProgress;
5854 1586 : sExtraArg.pProgressData = GDALCreateScaledProgress(
5855 793 : nBlocksDone / static_cast<double>(nTotalBlocks),
5856 793 : (nBlocksDone + 0.5) / static_cast<double>(nTotalBlocks),
5857 : pfnProgress, pProgressData);
5858 793 : if (sExtraArg.pProgressData == nullptr)
5859 377 : sExtraArg.pfnProgress = nullptr;
5860 :
5861 793 : eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
5862 : nThisLines, pSwathBuf, nThisCols,
5863 : nThisLines, eDT, nBandCount,
5864 : nullptr, 0, 0, 0, &sExtraArg);
5865 :
5866 793 : GDALDestroyScaledProgress(sExtraArg.pProgressData);
5867 :
5868 793 : if (eErr == CE_None)
5869 792 : eErr = poDstDS->RasterIO(
5870 : GF_Write, iX, iY, nThisCols, nThisLines, pSwathBuf,
5871 : nThisCols, nThisLines, eDT, nBandCount, nullptr, 0,
5872 : 0, 0, nullptr);
5873 : }
5874 :
5875 812 : nBlocksDone++;
5876 1619 : if (eErr == CE_None &&
5877 807 : !pfnProgress(nBlocksDone /
5878 807 : static_cast<double>(nTotalBlocks),
5879 : nullptr, pProgressData))
5880 : {
5881 1 : eErr = CE_Failure;
5882 1 : CPLError(CE_Failure, CPLE_UserInterrupt,
5883 : "User terminated CreateCopy()");
5884 : }
5885 : }
5886 : }
5887 : }
5888 :
5889 : /* -------------------------------------------------------------------- */
5890 : /* Cleanup */
5891 : /* -------------------------------------------------------------------- */
5892 3376 : CPLFree(pSwathBuf);
5893 :
5894 3376 : return eErr;
5895 : }
5896 :
5897 : /************************************************************************/
5898 : /* GDALRasterBandCopyWholeRaster() */
5899 : /************************************************************************/
5900 :
5901 : /**
5902 : * \brief Copy a whole raster band
5903 : *
5904 : * This function copies the complete raster contents of one band to
5905 : * another similarly configured band. The source and destination
5906 : * bands must have the same width and height. The bands do not have
5907 : * to have the same data type.
5908 : *
5909 : * It implements efficient copying, in particular "chunking" the copy in
5910 : * substantial blocks.
5911 : *
5912 : * Currently the only papszOptions values supported are:
5913 : * <ul>
5914 : * <li>"COMPRESSED=YES" to force alignment on target dataset block sizes to
5915 : * achieve best compression.</li>
5916 : * <li>"SKIP_HOLES=YES" to skip chunks for which GDALGetDataCoverageStatus()
5917 : * returns GDAL_DATA_COVERAGE_STATUS_EMPTY (GDAL >= 2.2)</li>
5918 : * </ul>
5919 : *
5920 : * @param hSrcBand the source band
5921 : * @param hDstBand the destination band
5922 : * @param papszOptions transfer hints in "StringList" Name=Value format.
5923 : * @param pfnProgress progress reporting function.
5924 : * @param pProgressData callback data for progress function.
5925 : *
5926 : * @return CE_None on success, or CE_Failure on failure.
5927 : */
5928 :
5929 29 : CPLErr CPL_STDCALL GDALRasterBandCopyWholeRaster(
5930 : GDALRasterBandH hSrcBand, GDALRasterBandH hDstBand,
5931 : const char *const *const papszOptions, GDALProgressFunc pfnProgress,
5932 : void *pProgressData)
5933 :
5934 : {
// Overview: validate the handles, check that both bands have the same
// dimensions, allocate one swath-sized buffer typed like the destination
// band, then read each swath from the source and write it to the
// destination. The loops stop at the first error or progress-callback abort.
5935 29 : VALIDATE_POINTER1(hSrcBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
5936 29 : VALIDATE_POINTER1(hDstBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
5937 :
5938 29 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
5939 29 : GDALRasterBand *poDstBand = GDALRasterBand::FromHandle(hDstBand);
5940 29 : CPLErr eErr = CE_None;
5941 :
// A null callback is replaced by GDALDummyProgress so pfnProgress can be
// called unconditionally below.
5942 29 : if (pfnProgress == nullptr)
5943 2 : pfnProgress = GDALDummyProgress;
5944 :
5945 : /* -------------------------------------------------------------------- */
5946 : /* Confirm the bands match in size. */
5947 : /* -------------------------------------------------------------------- */
5948 29 : int nXSize = poSrcBand->GetXSize();
5949 29 : int nYSize = poSrcBand->GetYSize();
5950 :
5951 29 : if (poDstBand->GetXSize() != nXSize || poDstBand->GetYSize() != nYSize)
5952 : {
5953 0 : CPLError(CE_Failure, CPLE_AppDefined,
5954 : "Input and output band sizes do not\n"
5955 : "match in GDALRasterBandCopyWholeRaster()");
5956 0 : return CE_Failure;
5957 : }
5958 :
5959 : /* -------------------------------------------------------------------- */
5960 : /* Report preliminary (0) progress. */
5961 : /* -------------------------------------------------------------------- */
5962 29 : if (!pfnProgress(0.0, nullptr, pProgressData))
5963 : {
5964 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
5965 : "User terminated CreateCopy()");
5966 0 : return CE_Failure;
5967 : }
5968 :
// The swath buffer uses the destination band's data type: it is the type
// passed to both the read and the write RasterIO() calls below.
5969 29 : GDALDataType eDT = poDstBand->GetRasterDataType();
5970 :
5971 : // If the destination is compressed, we must try to write blocks just once,
5972 : // to save disk space (GTiff case for example), and to avoid data loss
5973 : // (JPEG compression for example).
5974 29 : bool bDstIsCompressed = false;
5975 : const char *pszDstCompressed =
5976 29 : CSLFetchNameValue(const_cast<char **>(papszOptions), "COMPRESSED");
5977 29 : if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
5978 26 : bDstIsCompressed = true;
5979 :
5980 : /* -------------------------------------------------------------------- */
5981 : /* What will our swath size be? */
5982 : /* -------------------------------------------------------------------- */
5983 :
5984 29 : int nSwathCols = 0;
5985 29 : int nSwathLines = 0;
// NOTE(review): the literal arguments 1 and FALSE are presumably the band
// count and the "interleaved" flag of the helper -- confirm against its
// declaration, which is outside this excerpt.
5986 29 : GDALCopyWholeRasterGetSwathSize(poSrcBand, poDstBand, 1, bDstIsCompressed,
5987 : FALSE, &nSwathCols, &nSwathLines);
5988 :
5989 29 : const int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5990 :
// One buffer sized for a full swath (cols * lines * bytes per pixel) is
// reused for every chunk and released with CPLFree() at the end.
5991 29 : void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
5992 29 : if (pSwathBuf == nullptr)
5993 : {
5994 0 : return CE_Failure;
5995 : }
5996 :
5997 29 : CPLDebug("GDAL", "GDALRasterBandCopyWholeRaster(): %d*%d swaths",
5998 : nSwathCols, nSwathLines);
5999 :
// SKIP_HOLES defaults to "NO" when the option is absent.
6000 : const bool bCheckHoles =
6001 29 : CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
6002 :
6003 : // Advise the source raster that we are going to read it completely
6004 29 : poSrcBand->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nullptr);
6005 :
6006 : /* ==================================================================== */
6007 : /* Copy the band swath by swath. */
6008 : /* ==================================================================== */
6009 :
6010 72 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
6011 : {
// Clamp the last row of swaths to the raster height.
6012 43 : int nThisLines = nSwathLines;
6013 :
6014 43 : if (iY + nThisLines > nYSize)
6015 8 : nThisLines = nYSize - iY;
6016 :
6017 86 : for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
6018 : {
// Clamp the last column of swaths to the raster width.
6019 43 : int nThisCols = nSwathCols;
6020 :
6021 43 : if (iX + nThisCols > nXSize)
6022 0 : nThisCols = nXSize - iX;
6023 :
// Without SKIP_HOLES every chunk is treated as containing data. With it,
// a chunk whose coverage status lacks the DATA flag is neither read nor
// written.
6024 43 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
6025 43 : if (bCheckHoles)
6026 : {
6027 0 : nStatus = poSrcBand->GetDataCoverageStatus(
6028 : iX, iY, nThisCols, nThisLines,
6029 : GDAL_DATA_COVERAGE_STATUS_DATA);
6030 : }
6031 43 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
6032 : {
6033 43 : eErr = poSrcBand->RasterIO(GF_Read, iX, iY, nThisCols,
6034 : nThisLines, pSwathBuf, nThisCols,
6035 : nThisLines, eDT, 0, 0, nullptr);
6036 :
6037 43 : if (eErr == CE_None)
6038 43 : eErr = poDstBand->RasterIO(GF_Write, iX, iY, nThisCols,
6039 : nThisLines, pSwathBuf, nThisCols,
6040 : nThisLines, eDT, 0, 0, nullptr);
6041 : }
6042 :
// Progress is the fraction of lines covered so far; it depends only on iY,
// so every swath of the same row reports the same value.
6043 86 : if (eErr == CE_None && !pfnProgress(double(iY + nThisLines) /
6044 43 : static_cast<double>(nYSize),
6045 : nullptr, pProgressData))
6046 : {
6047 0 : eErr = CE_Failure;
6048 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
6049 : "User terminated CreateCopy()");
6050 : }
6051 : }
6052 : }
6053 :
6054 : /* -------------------------------------------------------------------- */
6055 : /* Cleanup */
6056 : /* -------------------------------------------------------------------- */
6057 29 : CPLFree(pSwathBuf);
6058 :
6059 29 : return eErr;
6060 : }
6061 :
6062 : /************************************************************************/
6063 : /* GDALCopyRasterIOExtraArg () */
6064 : /************************************************************************/
6065 :
// Resets *psDestArg with INIT_RASTERIO_EXTRA_ARG and then, when psSrcArg is
// non-null, copies its settings into it. With a null psSrcArg the
// destination is left at its freshly initialized values.
//
// psDestArg: structure to fill; it is dereferenced unconditionally.
// psSrcArg:  structure to copy from; may be null.
6066 535023 : void GDALCopyRasterIOExtraArg(GDALRasterIOExtraArg *psDestArg,
6067 : const GDALRasterIOExtraArg *psSrcArg)
6068 : {
6069 535023 : INIT_RASTERIO_EXTRA_ARG(*psDestArg);
6070 535023 : if (psSrcArg)
6071 : {
6072 535023 : psDestArg->eResampleAlg = psSrcArg->eResampleAlg;
6073 535023 : psDestArg->pfnProgress = psSrcArg->pfnProgress;
6074 535023 : psDestArg->pProgressData = psSrcArg->pProgressData;
6075 535023 : psDestArg->bFloatingPointWindowValidity =
6076 535023 : psSrcArg->bFloatingPointWindowValidity;
// The floating-point window fields are copied only when the source flags
// them as valid.
6077 535023 : if (psSrcArg->bFloatingPointWindowValidity)
6078 : {
6079 212051 : psDestArg->dfXOff = psSrcArg->dfXOff;
6080 212051 : psDestArg->dfYOff = psSrcArg->dfYOff;
6081 212051 : psDestArg->dfXSize = psSrcArg->dfXSize;
6082 212051 : psDestArg->dfYSize = psSrcArg->dfYSize;
6083 : }
// Fields guarded by nVersion are read only when the source structure
// declares a version that is high enough.
6084 535023 : if (psSrcArg->nVersion >= 2)
6085 : {
6086 535023 : psDestArg->bUseOnlyThisScale = psSrcArg->bUseOnlyThisScale;
6087 : }
6088 535023 : if (psSrcArg->nVersion >= 3)
6089 : {
6090 535023 : psDestArg->bOperateInBufType = psSrcArg->bOperateInBufType;
6091 : }
6092 : }
6093 535023 : }
6094 :
6095 : /************************************************************************/
6096 : /* HasOnlyNoData() */
6097 : /************************************************************************/
6098 :
// Returns true when value matches noDataValue. The primary template uses
// plain operator==.
6099 51285976 : template <class T> static inline bool IsEqualToNoData(T value, T noDataValue)
6100 : {
6101 51285976 : return value == noDataValue;
6102 : }
6103 :
// Floating-point specializations (GFloat16, float, double): when the nodata
// value is NaN, any NaN value counts as a match, because NaN == NaN is
// false. Otherwise operator== is used.
6104 5509 : template <> bool IsEqualToNoData<GFloat16>(GFloat16 value, GFloat16 noDataValue)
6105 : {
// Unqualified isnan with a using-declaration, unlike the float/double
// specializations below which call std::isnan directly.
6106 : using std::isnan;
6107 5509 : return isnan(noDataValue) ? isnan(value) : value == noDataValue;
6108 : }
6109 :
6110 251221 : template <> bool IsEqualToNoData<float>(float value, float noDataValue)
6111 : {
6112 251221 : return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
6113 : }
6114 :
6115 264257 : template <> bool IsEqualToNoData<double>(double value, double noDataValue)
6116 : {
6117 264257 : return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
6118 : }
6119 :
// Returns true when every sample of a pixel-interleaved buffer matches
// noDataValue according to IsEqualToNoData().
//
// pBuffer:     first sample of the window.
// noDataValue: value to compare against, already converted to T.
// nWidth:      window width in pixels.
// nHeight:     window height in lines.
// nLineStride: distance between the starts of two consecutive lines, in
//              pixels (it is multiplied by nComponents when indexing).
// nComponents: number of interleaved samples per pixel.
//
// NOTE(review): nWidth - 1 and nHeight - 1 are computed on size_t, so a zero
// width or height would wrap around in the corner test -- presumably callers
// never pass an empty window; confirm.
6120 : template <class T>
6121 12024 : static bool HasOnlyNoDataT(const T *pBuffer, T noDataValue, size_t nWidth,
6122 : size_t nHeight, size_t nLineStride,
6123 : size_t nComponents)
6124 : {
6125 : // Fast test: check the 4 corners and the middle pixel.
// Order of the five probes for each component: top-left, top-right, middle,
// bottom-left, bottom-right. The && chain stops at the first mismatch.
6126 23297 : for (size_t iBand = 0; iBand < nComponents; iBand++)
6127 : {
6128 24095 : if (!(IsEqualToNoData(pBuffer[iBand], noDataValue) &&
6129 11880 : IsEqualToNoData(pBuffer[(nWidth - 1) * nComponents + iBand],
6130 11750 : noDataValue) &&
6131 11750 : IsEqualToNoData(
6132 11750 : pBuffer[((nHeight - 1) / 2 * nLineStride + (nWidth - 1) / 2) *
6133 11750 : nComponents +
6134 : iBand],
6135 11276 : noDataValue) &&
6136 11276 : IsEqualToNoData(
6137 11276 : pBuffer[(nHeight - 1) * nLineStride * nComponents + iBand],
6138 : noDataValue) &&
6139 11276 : IsEqualToNoData(
6140 11276 : pBuffer[((nHeight - 1) * nLineStride + nWidth - 1) *
6141 11276 : nComponents +
6142 : iBand],
6143 : noDataValue)))
6144 : {
6145 942 : return false;
6146 : }
6147 : }
6148 :
6149 : // Test all pixels.
// Only the first nWidth * nComponents samples of each line are inspected;
// samples between nWidth and nLineStride are skipped.
6150 52954 : for (size_t iY = 0; iY < nHeight; iY++)
6151 : {
6152 41993 : const T *pBufferLine = pBuffer + iY * nLineStride * nComponents;
6153 51790448 : for (size_t iX = 0; iX < nWidth * nComponents; iX++)
6154 : {
6155 51748615 : if (!IsEqualToNoData(pBufferLine[iX], noDataValue))
6156 : {
6157 121 : return false;
6158 : }
6159 : }
6160 : }
6161 10961 : return true;
6162 : }
6163 :
6164 : /************************************************************************/
6165 : /* GDALBufferHasOnlyNoData() */
6166 : /************************************************************************/
6167 :
/**
 * Tells whether a buffer contains only samples equal to the nodata value.
 *
 * Fast paths handle a nodata value of 0 on buffers whose line stride equals
 * their width: a byte-wise/SIMD scan for integer sample formats and, in
 * SSE2 builds, sign-bit-insensitive scans for 32- and 64-bit floating-point
 * samples. Every other case is dispatched to HasOnlyNoDataT() on the sample
 * type selected by nBitsPerSample and nSampleFormat.
 *
 * @param pBuffer buffer to inspect.
 * @param dfNoDataValue nodata value, expressed as a double.
 * @param nWidth window width in pixels.
 * @param nHeight window height in lines.
 * @param nLineStride line stride, in pixels.
 * @param nComponents number of interleaved samples per pixel.
 * @param nBitsPerSample number of bits per sample.
 * @param nSampleFormat sample format (unsigned integer, signed integer or
 *                      floating point).
 *
 * @return true if all inspected samples match the nodata value. false
 * otherwise, including when the bits-per-sample / sample-format combination
 * is not one of those handled below.
 */
6168 44011 : bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
6169 : size_t nWidth, size_t nHeight, size_t nLineStride,
6170 : size_t nComponents, int nBitsPerSample,
6171 : GDALBufferSampleFormat nSampleFormat)
6172 : {
6173 : // In the case where the nodata is 0, we can compare several bytes at
6174 : // once. Select the largest natural integer type for the architecture.
// nWidth == nLineStride means there is no gap between lines, so the window
// can be scanned as one flat run of bytes.
6175 44011 : if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
6176 : // Do not use this optimized code path for floating point numbers,
6177 : // as it can't detect negative zero.
6178 : nSampleFormat != GSF_FLOATING_POINT)
6179 : {
6180 27265 : const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
// Total size in bytes, rounded up so that a trailing partial byte (bit
// depths that are not a multiple of 8) is included. The product is formed
// in 64 bits before narrowing to size_t.
6181 27265 : const size_t nSize =
6182 27265 : static_cast<size_t>((static_cast<uint64_t>(nWidth) * nHeight *
6183 27265 : nComponents * nBitsPerSample +
6184 : 7) /
6185 : 8);
6186 : #ifdef HAVE_SSE2
6187 27265 : size_t n = nSize;
6188 : // Align to 16 bytes
6189 27328 : while ((reinterpret_cast<uintptr_t>(pabyBuffer) & 15) != 0 && n > 0)
6190 : {
6191 73 : --n;
6192 73 : if (*pabyBuffer)
6193 10 : return false;
6194 63 : pabyBuffer++;
6195 : }
6196 :
// Main loop: four aligned 16-byte loads (64 bytes) per iteration, OR-ed
// together so a single test detects any non-zero byte.
6197 27255 : const auto zero = _mm_setzero_si128();
6198 27255 : constexpr int UNROLLING = 4;
6199 2223230 : while (n >= UNROLLING * sizeof(zero))
6200 : {
6201 2207980 : const auto v0 = _mm_load_si128(reinterpret_cast<const __m128i *>(
6202 : pabyBuffer + 0 * sizeof(zero)));
6203 2207980 : const auto v1 = _mm_load_si128(reinterpret_cast<const __m128i *>(
6204 2207980 : pabyBuffer + 1 * sizeof(zero)));
6205 2207980 : const auto v2 = _mm_load_si128(reinterpret_cast<const __m128i *>(
6206 2207980 : pabyBuffer + 2 * sizeof(zero)));
6207 2207980 : const auto v3 = _mm_load_si128(reinterpret_cast<const __m128i *>(
6208 2207980 : pabyBuffer + 3 * sizeof(zero)));
6209 : const auto v =
6210 6623930 : _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
6211 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
6212 : if (!_mm_test_all_zeros(v, v))
6213 : #else
6214 4415960 : if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
6215 : #endif
6216 : {
6217 12001 : return false;
6218 : }
6219 2195980 : pabyBuffer += UNROLLING * sizeof(zero);
6220 2195980 : n -= UNROLLING * sizeof(zero);
6221 : }
6222 :
// Remaining bytes (fewer than 64) are checked one at a time.
6223 233639 : while (n > 0)
6224 : {
6225 218489 : --n;
6226 218489 : if (*pabyBuffer)
6227 104 : return false;
6228 218385 : pabyBuffer++;
6229 : }
6230 : #else
6231 : #if SIZEOF_VOIDP >= 8 || defined(__x86_64__)
6232 : // We test __x86_64__ for x32 arch where SIZEOF_VOIDP == 4
6233 : typedef std::uint64_t WordType;
6234 : #else
6235 : typedef std::uint32_t WordType;
6236 : #endif
6237 :
// Non-SSE2 build: check leading bytes one by one, then whole words, then
// trailing bytes. When the buffer is already word-aligned the leading
// byte-wise loop still runs for up to sizeof(WordType) bytes.
6238 : const size_t nInitialIters =
6239 : std::min(sizeof(WordType) -
6240 : static_cast<size_t>(
6241 : reinterpret_cast<std::uintptr_t>(pabyBuffer) %
6242 : sizeof(WordType)),
6243 : nSize);
6244 : size_t i = 0;
6245 : for (; i < nInitialIters; i++)
6246 : {
6247 : if (pabyBuffer[i])
6248 : return false;
6249 : }
6250 : for (; i + sizeof(WordType) - 1 < nSize; i += sizeof(WordType))
6251 : {
6252 : if (*(reinterpret_cast<const WordType *>(pabyBuffer + i)))
6253 : return false;
6254 : }
6255 : for (; i < nSize; i++)
6256 : {
6257 : if (pabyBuffer[i])
6258 : return false;
6259 : }
6260 : #endif
6261 15150 : return true;
6262 : }
6263 :
6264 : #ifdef HAVE_SSE2
// 32-bit floating-point samples with a nodata value of 0: compare bit
// patterns with the sign bit masked off so that both +0.0 and -0.0 match.
6265 16746 : else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
6266 710 : nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
6267 : {
6268 710 : const auto signMask = _mm_set1_epi32(0x7FFFFFFF);
6269 710 : const auto zero = _mm_setzero_si128();
6270 710 : const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
// n counts samples here, not bytes.
6271 710 : const size_t n = nWidth * nHeight * nComponents;
6272 :
6273 710 : size_t i = 0;
6274 710 : constexpr int UNROLLING = 4;
6275 710 : constexpr size_t VALUES_PER_ITER =
6276 : UNROLLING * sizeof(zero) / sizeof(float);
// Unaligned loads are used: no alignment prologue is run on this path.
6277 24985 : for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
6278 : {
6279 24936 : const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6280 : pabyBuffer + 0 * sizeof(zero)));
6281 24936 : const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6282 24936 : pabyBuffer + 1 * sizeof(zero)));
6283 24936 : const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6284 24936 : pabyBuffer + 2 * sizeof(zero)));
6285 24936 : const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6286 24936 : pabyBuffer + 3 * sizeof(zero)));
6287 74808 : auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
6288 : // Clear the sign bit (makes -0.0 become +0.0)
6289 24936 : v = _mm_and_si128(v, signMask);
6290 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
6291 : if (!_mm_test_all_zeros(v, v))
6292 : #else
6293 49872 : if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
6294 : #endif
6295 : {
6296 661 : return false;
6297 : }
6298 24275 : pabyBuffer += UNROLLING * sizeof(zero);
6299 : }
6300 :
// Scalar tail: memcpy reads the bit pattern of each remaining sample.
6301 304 : for (; i < n; i++)
6302 : {
6303 : uint32_t bits;
6304 272 : memcpy(&bits, pabyBuffer, sizeof(bits));
6305 272 : pabyBuffer += sizeof(bits);
6306 272 : if ((bits & 0x7FFFFFFF) != 0)
6307 17 : return false;
6308 : }
6309 :
6310 32 : return true;
6311 : }
6312 :
// Same approach for 64-bit floating-point samples.
6313 16036 : else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
6314 4005 : nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
6315 : {
6316 4005 : const auto signMask = _mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL);
6317 4005 : const auto zero = _mm_setzero_si128();
6318 4005 : const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
6319 4005 : const size_t n = nWidth * nHeight * nComponents;
6320 :
6321 4005 : size_t i = 0;
6322 4005 : constexpr int UNROLLING = 4;
6323 4005 : constexpr size_t VALUES_PER_ITER =
6324 : UNROLLING * sizeof(zero) / sizeof(double);
6325 1664960 : for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
6326 : {
6327 1661340 : const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6328 : pabyBuffer + 0 * sizeof(zero)));
6329 1661340 : const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6330 1661340 : pabyBuffer + 1 * sizeof(zero)));
6331 1661340 : const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6332 1661340 : pabyBuffer + 2 * sizeof(zero)));
6333 1661340 : const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6334 1661340 : pabyBuffer + 3 * sizeof(zero)));
6335 4984020 : auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
6336 : // Clear the sign bit (makes -0.0 become +0.0)
6337 1661340 : v = _mm_and_si128(v, signMask);
6338 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
6339 : if (!_mm_test_all_zeros(v, v))
6340 : #else
6341 3322680 : if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
6342 : #endif
6343 : {
6344 389 : return false;
6345 : }
6346 1660950 : pabyBuffer += UNROLLING * sizeof(zero);
6347 : }
6348 :
6349 3643 : for (; i < n; i++)
6350 : {
6351 : uint64_t bits;
6352 34 : memcpy(&bits, pabyBuffer, sizeof(bits));
6353 34 : pabyBuffer += sizeof(bits);
6354 34 : if ((bits & 0x7FFFFFFFFFFFFFFFULL) != 0)
6355 7 : return false;
6356 : }
6357 :
6358 3609 : return true;
6359 : }
6360 : #endif
6361 :
// Generic path: pick the sample type from nBitsPerSample / nSampleFormat.
// For every type except double, the result is false when
// GDALIsValueInRange<T>() rejects the nodata value (presumably meaning it is
// not representable in T -- helper defined outside this excerpt). The
// floating-point 16/32-bit cases accept a NaN nodata value without calling it.
6362 12031 : if (nBitsPerSample == 8 && nSampleFormat == GSF_UNSIGNED_INT)
6363 : {
6364 22424 : return GDALIsValueInRange<uint8_t>(dfNoDataValue) &&
6365 11212 : HasOnlyNoDataT(static_cast<const uint8_t *>(pBuffer),
6366 11212 : static_cast<uint8_t>(dfNoDataValue), nWidth,
6367 11212 : nHeight, nLineStride, nComponents);
6368 : }
6369 819 : if (nBitsPerSample == 8 && nSampleFormat == GSF_SIGNED_INT)
6370 : {
6371 : // Use unsigned implementation by converting the nodatavalue to
6372 : // unsigned
6373 119 : return GDALIsValueInRange<int8_t>(dfNoDataValue) &&
6374 59 : HasOnlyNoDataT(
6375 : static_cast<const uint8_t *>(pBuffer),
6376 59 : static_cast<uint8_t>(static_cast<int8_t>(dfNoDataValue)),
6377 60 : nWidth, nHeight, nLineStride, nComponents);
6378 : }
6379 759 : if (nBitsPerSample == 16 && nSampleFormat == GSF_UNSIGNED_INT)
6380 : {
6381 23 : return GDALIsValueInRange<uint16_t>(dfNoDataValue) &&
6382 11 : HasOnlyNoDataT(static_cast<const uint16_t *>(pBuffer),
6383 11 : static_cast<uint16_t>(dfNoDataValue), nWidth,
6384 12 : nHeight, nLineStride, nComponents);
6385 : }
6386 747 : if (nBitsPerSample == 16 && nSampleFormat == GSF_SIGNED_INT)
6387 : {
6388 : // Use unsigned implementation by converting the nodatavalue to
6389 : // unsigned
6390 111 : return GDALIsValueInRange<int16_t>(dfNoDataValue) &&
6391 55 : HasOnlyNoDataT(
6392 : static_cast<const uint16_t *>(pBuffer),
6393 55 : static_cast<uint16_t>(static_cast<int16_t>(dfNoDataValue)),
6394 56 : nWidth, nHeight, nLineStride, nComponents);
6395 : }
6396 691 : if (nBitsPerSample == 32 && nSampleFormat == GSF_UNSIGNED_INT)
6397 : {
6398 129 : return GDALIsValueInRange<uint32_t>(dfNoDataValue) &&
6399 64 : HasOnlyNoDataT(static_cast<const uint32_t *>(pBuffer),
6400 : static_cast<uint32_t>(dfNoDataValue), nWidth,
6401 65 : nHeight, nLineStride, nComponents);
6402 : }
6403 626 : if (nBitsPerSample == 32 && nSampleFormat == GSF_SIGNED_INT)
6404 : {
6405 : // Use unsigned implementation by converting the nodatavalue to
6406 : // unsigned
6407 23 : return GDALIsValueInRange<int32_t>(dfNoDataValue) &&
6408 11 : HasOnlyNoDataT(
6409 : static_cast<const uint32_t *>(pBuffer),
6410 11 : static_cast<uint32_t>(static_cast<int32_t>(dfNoDataValue)),
6411 12 : nWidth, nHeight, nLineStride, nComponents);
6412 : }
6413 614 : if (nBitsPerSample == 64 && nSampleFormat == GSF_UNSIGNED_INT)
6414 : {
6415 112 : return GDALIsValueInRange<uint64_t>(dfNoDataValue) &&
6416 56 : HasOnlyNoDataT(static_cast<const uint64_t *>(pBuffer),
6417 : static_cast<uint64_t>(dfNoDataValue), nWidth,
6418 56 : nHeight, nLineStride, nComponents);
6419 : }
6420 558 : if (nBitsPerSample == 64 && nSampleFormat == GSF_SIGNED_INT)
6421 : {
6422 : // Use unsigned implementation by converting the nodatavalue to
6423 : // unsigned
6424 0 : return GDALIsValueInRange<int64_t>(dfNoDataValue) &&
6425 0 : HasOnlyNoDataT(
6426 : static_cast<const uint64_t *>(pBuffer),
6427 0 : static_cast<uint64_t>(static_cast<int64_t>(dfNoDataValue)),
6428 0 : nWidth, nHeight, nLineStride, nComponents);
6429 : }
6430 558 : if (nBitsPerSample == 16 && nSampleFormat == GSF_FLOATING_POINT)
6431 : {
6432 106 : return (std::isnan(dfNoDataValue) ||
6433 211 : GDALIsValueInRange<GFloat16>(dfNoDataValue)) &&
6434 105 : HasOnlyNoDataT(static_cast<const GFloat16 *>(pBuffer),
6435 : static_cast<GFloat16>(dfNoDataValue), nWidth,
6436 106 : nHeight, nLineStride, nComponents);
6437 : }
6438 452 : if (nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
6439 : {
6440 268 : return (std::isnan(dfNoDataValue) ||
6441 535 : GDALIsValueInRange<float>(dfNoDataValue)) &&
6442 267 : HasOnlyNoDataT(static_cast<const float *>(pBuffer),
6443 : static_cast<float>(dfNoDataValue), nWidth,
6444 268 : nHeight, nLineStride, nComponents);
6445 : }
6446 184 : if (nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
6447 : {
6448 184 : return HasOnlyNoDataT(static_cast<const double *>(pBuffer),
6449 : dfNoDataValue, nWidth, nHeight, nLineStride,
6450 184 : nComponents);
6451 : }
// No supported bits-per-sample / sample-format combination matched.
6452 0 : return false;
6453 : }
6454 :
6455 : #ifdef HAVE_SSE2
6456 :
6457 : /************************************************************************/
6458 : /* GDALDeinterleave3Byte() */
6459 : /************************************************************************/
6460 :
// Splits nIters pixels of 3 interleaved bytes read from pabySrc into three
// separate byte arrays: byte 0 of each pixel goes to pabyDest0, byte 1 to
// pabyDest1 and byte 2 to pabyDest2.
//
// This is the variant compiled when HAVE_SSE2 is defined. With
// USE_NEON_OPTIMIZATIONS it forwards to GDALDeinterleave3Byte_SSSE3().
// Otherwise it uses that routine when SSSE3 support was compiled in and is
// reported at runtime, and falls back to the word-based / scalar code below.
// On GCC the tree vectorizer is disabled for this function.
6461 : #if defined(__GNUC__) && !defined(__clang__)
6462 : __attribute__((optimize("no-tree-vectorize")))
6463 : #endif
6464 382770 : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
6465 : GByte *CPL_RESTRICT pabyDest0,
6466 : GByte *CPL_RESTRICT pabyDest1,
6467 : GByte *CPL_RESTRICT pabyDest2, size_t nIters)
6468 : #ifdef USE_NEON_OPTIMIZATIONS
6469 : {
6470 : return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6471 : nIters);
6472 : }
6473 : #else
6474 : {
6475 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
6476 382770 : if (CPLHaveRuntimeSSSE3())
6477 : {
6478 382768 : return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
6479 382768 : pabyDest2, nIters);
6480 : }
6481 : #endif
6482 :
6483 2 : size_t i = 0;
// Word-based path, taken only when all four pointers are aligned on
// sizeof(unsigned int): OR-ing the addresses lets one modulo test them all.
6484 2 : if (((reinterpret_cast<uintptr_t>(pabySrc) |
6485 2 : reinterpret_cast<uintptr_t>(pabyDest0) |
6486 2 : reinterpret_cast<uintptr_t>(pabyDest1) |
6487 2 : reinterpret_cast<uintptr_t>(pabyDest2)) %
6488 : sizeof(unsigned int)) == 0)
6489 : {
6490 : // Slightly better than GCC autovectorizer
// Each iteration consumes 4 pixels: three 32-bit words (12 bytes) are read
// and one word is written to each of pabyDest0 and pabyDest1, while
// pabyDest2 is written byte by byte. The shifts and masks take the lowest
// byte of a word as the first byte in memory (little-endian layout).
6491 17 : for (size_t j = 0; i + 3 < nIters; i += 4, ++j)
6492 : {
6493 15 : unsigned int word0 =
6494 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i);
6495 15 : unsigned int word1 =
6496 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 4);
6497 15 : unsigned int word2 =
6498 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 8);
6499 15 : reinterpret_cast<unsigned int *>(pabyDest0)[j] =
6500 15 : (word0 & 0xff) | ((word0 >> 24) << 8) | (word1 & 0x00ff0000) |
6501 15 : ((word2 >> 8) << 24);
6502 15 : reinterpret_cast<unsigned int *>(pabyDest1)[j] =
6503 15 : ((word0 >> 8) & 0xff) | ((word1 & 0xff) << 8) |
6504 15 : (((word1 >> 24)) << 16) | ((word2 >> 16) << 24);
6505 15 : pabyDest2[j * 4] = static_cast<GByte>(word0 >> 16);
6506 15 : pabyDest2[j * 4 + 1] = static_cast<GByte>(word1 >> 8);
6507 15 : pabyDest2[j * 4 + 2] = static_cast<GByte>(word2);
6508 15 : pabyDest2[j * 4 + 3] = static_cast<GByte>(word2 >> 24);
6509 : }
6510 : }
// Scalar loop for the pixels left over by the word-based path, or for all
// pixels when the pointers are not suitably aligned.
6511 : #if defined(__clang__)
6512 : #pragma clang loop vectorize(disable)
6513 : #endif
6514 3 : for (; i < nIters; ++i)
6515 : {
6516 1 : pabyDest0[i] = pabySrc[3 * i + 0];
6517 1 : pabyDest1[i] = pabySrc[3 * i + 1];
6518 1 : pabyDest2[i] = pabySrc[3 * i + 2];
6519 : }
6520 : }
6521 : #endif
6522 :
6523 : /************************************************************************/
6524 : /* GDALDeinterleave4Byte() */
6525 : /************************************************************************/
6526 :
6527 : #if !defined(__GNUC__) || defined(__clang__)
6528 :
6529 : /************************************************************************/
6530 : /* deinterleave() */
6531 : /************************************************************************/
6532 :
// Packs one byte taken from each of the sixteen 32-bit lanes held in the
// four input registers into a single register of sixteen bytes.
//
// SHIFT: when true, every 32-bit lane of the four inputs is first shifted
//        right by 8 bits. The inputs are passed by reference and are
//        modified, so successive calls step through the bytes of each lane.
// MASK:  when true, only the low byte of each lane is kept before packing.
//        The caller visible in this file passes MASK=false only on the call
//        that follows three cumulative 8-bit shifts, where the upper 24 bits
//        are already zero.
6533 : template <bool SHIFT, bool MASK>
6534 : inline __m128i deinterleave(__m128i &xmm0_ori, __m128i &xmm1_ori,
6535 : __m128i &xmm2_ori, __m128i &xmm3_ori)
6536 : {
6537 : // Shift each int32 packed word right by 8 bits (updates the inputs)
6538 : if (SHIFT)
6539 : {
6540 : xmm0_ori = _mm_srli_epi32(xmm0_ori, 8);
6541 : xmm1_ori = _mm_srli_epi32(xmm1_ori, 8);
6542 : xmm2_ori = _mm_srli_epi32(xmm2_ori, 8);
6543 : xmm3_ori = _mm_srli_epi32(xmm3_ori, 8);
6544 : }
6545 : __m128i xmm0;
6546 : __m128i xmm1;
6547 : __m128i xmm2;
6548 : __m128i xmm3;
// Set higher 24bit of each int32 packed word to 0 (on local copies only)
6549 : if (MASK)
6550 : {
6551 : const __m128i xmm_mask = _mm_set1_epi32(0xff);
6552 : xmm0 = _mm_and_si128(xmm0_ori, xmm_mask);
6553 : xmm1 = _mm_and_si128(xmm1_ori, xmm_mask);
6554 : xmm2 = _mm_and_si128(xmm2_ori, xmm_mask);
6555 : xmm3 = _mm_and_si128(xmm3_ori, xmm_mask);
6556 : }
6557 : else
6558 : {
6559 : xmm0 = xmm0_ori;
6560 : xmm1 = xmm1_ori;
6561 : xmm2 = xmm2_ori;
6562 : xmm3 = xmm3_ori;
6563 : }
6564 : // Pack int32 to int16
6565 : xmm0 = _mm_packs_epi32(xmm0, xmm1);
6566 : xmm2 = _mm_packs_epi32(xmm2, xmm3);
6567 : // Pack int16 to uint8
6568 : xmm0 = _mm_packus_epi16(xmm0, xmm2);
6569 : return xmm0;
6570 : }
6571 :
// Splits nIters pixels of 4 interleaved bytes read from pabySrc into four
// separate byte arrays (byte k of each pixel goes to pabyDestk).
//
// This is the variant compiled when HAVE_SSE2 is defined and the compiler is
// not GCC (or is clang). With USE_NEON_OPTIMIZATIONS it forwards to
// GDALDeinterleave4Byte_SSSE3(). Otherwise it uses that routine when SSSE3
// support was compiled in and is reported at runtime, and falls back to the
// SSE2 loop below.
6572 : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
6573 : GByte *CPL_RESTRICT pabyDest0,
6574 : GByte *CPL_RESTRICT pabyDest1,
6575 : GByte *CPL_RESTRICT pabyDest2,
6576 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
6577 : #ifdef USE_NEON_OPTIMIZATIONS
6578 : {
6579 : return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6580 : pabyDest3, nIters);
6581 : }
6582 : #else
6583 : {
6584 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
6585 : if (CPLHaveRuntimeSSSE3())
6586 : {
6587 : return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
6588 : pabyDest2, pabyDest3, nIters);
6589 : }
6590 : #endif
6591 :
6592 : // Not the optimal SSE2-only code, as gcc auto-vectorizer manages to
6593 : // do something slightly better.
// 16 pixels (64 source bytes) per iteration, using unaligned loads and
// stores.
6594 : size_t i = 0;
6595 : for (; i + 15 < nIters; i += 16)
6596 : {
6597 : __m128i xmm0_ori = _mm_loadu_si128(
6598 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
6599 : __m128i xmm1_ori = _mm_loadu_si128(
6600 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
6601 : __m128i xmm2_ori = _mm_loadu_si128(
6602 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
6603 : __m128i xmm3_ori = _mm_loadu_si128(
6604 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
6605 :
// The four calls must stay in this order: each SHIFT=true call shifts the
// *_ori registers in place by a further 8 bits, exposing the next byte of
// every pixel. The last call needs no mask because after a total shift of
// 24 bits only the top byte remains.
6606 : _mm_storeu_si128(
6607 : reinterpret_cast<__m128i *>(pabyDest0 + i),
6608 : deinterleave<false, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
6609 : _mm_storeu_si128(
6610 : reinterpret_cast<__m128i *>(pabyDest1 + i),
6611 : deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
6612 : _mm_storeu_si128(
6613 : reinterpret_cast<__m128i *>(pabyDest2 + i),
6614 : deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
6615 : _mm_storeu_si128(
6616 : reinterpret_cast<__m128i *>(pabyDest3 + i),
6617 : deinterleave<true, false>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
6618 : }
6619 :
// Scalar loop for the remaining (fewer than 16) pixels.
6620 : #if defined(__clang__)
6621 : #pragma clang loop vectorize(disable)
6622 : #endif
6623 : for (; i < nIters; ++i)
6624 : {
6625 : pabyDest0[i] = pabySrc[4 * i + 0];
6626 : pabyDest1[i] = pabySrc[4 * i + 1];
6627 : pabyDest2[i] = pabySrc[4 * i + 2];
6628 : pabyDest3[i] = pabySrc[4 * i + 3];
6629 : }
6630 : }
6631 : #endif
6632 : #else
6633 : // GCC autovectorizer does an excellent job
// Variant compiled with GCC (not clang) when HAVE_SSE2 is defined: a plain
// scalar loop with tree vectorization explicitly enabled for this function.
// Splits nIters pixels of 4 interleaved bytes read from pabySrc into four
// separate byte arrays (byte k of each pixel goes to pabyDestk).
6634 97786 : __attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
6635 : const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
6636 : GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2,
6637 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
6638 : {
6639 545177000 : for (size_t i = 0; i < nIters; ++i)
6640 : {
6641 545079000 : pabyDest0[i] = pabySrc[4 * i + 0];
6642 545079000 : pabyDest1[i] = pabySrc[4 * i + 1];
6643 545079000 : pabyDest2[i] = pabySrc[4 * i + 2];
6644 545079000 : pabyDest3[i] = pabySrc[4 * i + 3];
6645 : }
6646 97786 : }
6647 : #endif
6648 :
6649 : #else
6650 :
6651 : /************************************************************************/
6652 : /* GDALDeinterleave3Byte() */
6653 : /************************************************************************/
6654 :
6655 : // TODO: Enabling below could help on non-Intel architectures where GCC knows
6656 : // how to auto-vectorize
6657 : // #if defined(__GNUC__)
6658 : //__attribute__((optimize("tree-vectorize")))
6659 : // #endif
6660 : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
6661 : GByte *CPL_RESTRICT pabyDest0,
6662 : GByte *CPL_RESTRICT pabyDest1,
6663 : GByte *CPL_RESTRICT pabyDest2, size_t nIters)
6664 : {
6665 : for (size_t i = 0; i < nIters; ++i)
6666 : {
6667 : pabyDest0[i] = pabySrc[3 * i + 0];
6668 : pabyDest1[i] = pabySrc[3 * i + 1];
6669 : pabyDest2[i] = pabySrc[3 * i + 2];
6670 : }
6671 : }
6672 :
6673 : /************************************************************************/
6674 : /* GDALDeinterleave4Byte() */
6675 : /************************************************************************/
6676 :
6677 : // TODO: Enabling below could help on non-Intel architectures where gcc knows
6678 : // how to auto-vectorize
6679 : // #if defined(__GNUC__)
6680 : //__attribute__((optimize("tree-vectorize")))
6681 : // #endif
6682 : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
6683 : GByte *CPL_RESTRICT pabyDest0,
6684 : GByte *CPL_RESTRICT pabyDest1,
6685 : GByte *CPL_RESTRICT pabyDest2,
6686 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
6687 : {
6688 : for (size_t i = 0; i < nIters; ++i)
6689 : {
6690 : pabyDest0[i] = pabySrc[4 * i + 0];
6691 : pabyDest1[i] = pabySrc[4 * i + 1];
6692 : pabyDest2[i] = pabySrc[4 * i + 2];
6693 : pabyDest3[i] = pabySrc[4 * i + 3];
6694 : }
6695 : }
6696 :
6697 : #endif
6698 :
6699 : /************************************************************************/
6700 : /* GDALDeinterleave() */
6701 : /************************************************************************/
6702 :
6703 : /*! Copy values from a pixel-interleave buffer to multiple per-component
6704 : buffers.
6705 :
6706 : In pseudo-code
6707 : \verbatim
6708 : for(size_t i = 0; i < nIters; ++i)
6709 : for(int iComp = 0; iComp < nComponents; iComp++ )
6710 : ppDestBuffer[iComp][i] = pSourceBuffer[nComponents * i + iComp]
6711 : \endverbatim
6712 :
6713 : The implementation is optimized for a few cases, like de-interleaving
6714 : of 3 or 4-components Byte buffers.
6715 :
6716 : \since GDAL 3.6
6717 : */
6718 480906 : void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
6719 : int nComponents, void **ppDestBuffer,
6720 : GDALDataType eDestDT, size_t nIters)
6721 : {
6722 480906 : if (eSourceDT == eDestDT)
6723 : {
6724 480884 : if (eSourceDT == GDT_UInt8 || eSourceDT == GDT_Int8)
6725 : {
6726 480563 : if (nComponents == 3)
6727 : {
6728 382770 : const GByte *CPL_RESTRICT pabySrc =
6729 : static_cast<const GByte *>(pSourceBuffer);
6730 382770 : GByte *CPL_RESTRICT pabyDest0 =
6731 : static_cast<GByte *>(ppDestBuffer[0]);
6732 382770 : GByte *CPL_RESTRICT pabyDest1 =
6733 : static_cast<GByte *>(ppDestBuffer[1]);
6734 382770 : GByte *CPL_RESTRICT pabyDest2 =
6735 : static_cast<GByte *>(ppDestBuffer[2]);
6736 382770 : GDALDeinterleave3Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6737 : nIters);
6738 382770 : return;
6739 : }
6740 97793 : else if (nComponents == 4)
6741 : {
6742 97786 : const GByte *CPL_RESTRICT pabySrc =
6743 : static_cast<const GByte *>(pSourceBuffer);
6744 97786 : GByte *CPL_RESTRICT pabyDest0 =
6745 : static_cast<GByte *>(ppDestBuffer[0]);
6746 97786 : GByte *CPL_RESTRICT pabyDest1 =
6747 : static_cast<GByte *>(ppDestBuffer[1]);
6748 97786 : GByte *CPL_RESTRICT pabyDest2 =
6749 : static_cast<GByte *>(ppDestBuffer[2]);
6750 97786 : GByte *CPL_RESTRICT pabyDest3 =
6751 : static_cast<GByte *>(ppDestBuffer[3]);
6752 97786 : GDALDeinterleave4Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6753 : pabyDest3, nIters);
6754 97786 : return;
6755 7 : }
6756 : }
6757 : #if ((defined(__GNUC__) && !defined(__clang__)) || \
6758 : defined(__INTEL_CLANG_COMPILER)) && \
6759 : defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME)
6760 642 : else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&
6761 321 : CPLHaveRuntimeSSSE3())
6762 : {
6763 321 : if (nComponents == 3)
6764 : {
6765 126 : const GUInt16 *CPL_RESTRICT panSrc =
6766 : static_cast<const GUInt16 *>(pSourceBuffer);
6767 126 : GUInt16 *CPL_RESTRICT panDest0 =
6768 : static_cast<GUInt16 *>(ppDestBuffer[0]);
6769 126 : GUInt16 *CPL_RESTRICT panDest1 =
6770 : static_cast<GUInt16 *>(ppDestBuffer[1]);
6771 126 : GUInt16 *CPL_RESTRICT panDest2 =
6772 : static_cast<GUInt16 *>(ppDestBuffer[2]);
6773 126 : GDALDeinterleave3UInt16_SSSE3(panSrc, panDest0, panDest1,
6774 : panDest2, nIters);
6775 126 : return;
6776 : }
6777 : #if !defined(__INTEL_CLANG_COMPILER)
6778 : // ICC autovectorizer doesn't do a good job, at least with icx
6779 : // 2022.1.0.20220316
6780 195 : else if (nComponents == 4)
6781 : {
6782 195 : const GUInt16 *CPL_RESTRICT panSrc =
6783 : static_cast<const GUInt16 *>(pSourceBuffer);
6784 195 : GUInt16 *CPL_RESTRICT panDest0 =
6785 : static_cast<GUInt16 *>(ppDestBuffer[0]);
6786 195 : GUInt16 *CPL_RESTRICT panDest1 =
6787 : static_cast<GUInt16 *>(ppDestBuffer[1]);
6788 195 : GUInt16 *CPL_RESTRICT panDest2 =
6789 : static_cast<GUInt16 *>(ppDestBuffer[2]);
6790 195 : GUInt16 *CPL_RESTRICT panDest3 =
6791 : static_cast<GUInt16 *>(ppDestBuffer[3]);
6792 195 : GDALDeinterleave4UInt16_SSSE3(panSrc, panDest0, panDest1,
6793 : panDest2, panDest3, nIters);
6794 195 : return;
6795 : }
6796 : #endif
6797 : }
6798 : #endif
6799 : }
6800 :
6801 29 : const int nSourceDTSize = GDALGetDataTypeSizeBytes(eSourceDT);
6802 29 : const int nDestDTSize = GDALGetDataTypeSizeBytes(eDestDT);
6803 108 : for (int iComp = 0; iComp < nComponents; iComp++)
6804 : {
6805 79 : GDALCopyWords64(static_cast<const GByte *>(pSourceBuffer) +
6806 79 : iComp * nSourceDTSize,
6807 : eSourceDT, nComponents * nSourceDTSize,
6808 79 : ppDestBuffer[iComp], eDestDT, nDestDTSize, nIters);
6809 : }
6810 : }
6811 :
6812 : /************************************************************************/
6813 : /* GDALTranspose2DSingleToSingle() */
6814 : /************************************************************************/
6815 : /**
6816 : * Transpose a 2D array of non-complex values, in a efficient (cache-oblivious) way.
6817 : *
6818 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6819 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6820 : * @param nSrcWidth Width of pSrc array.
6821 : * @param nSrcHeight Height of pSrc array.
6822 : */
6823 :
6824 : template <class DST, class SRC>
6825 160 : void GDALTranspose2DSingleToSingle(const SRC *CPL_RESTRICT pSrc,
6826 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6827 : size_t nSrcHeight)
6828 : {
6829 160 : constexpr size_t blocksize = 32;
6830 345 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6831 : {
6832 185 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6833 5016 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6834 : {
6835 : // transpose the block beginning at [i,j]
6836 4831 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6837 26185 : for (size_t k = i; k < max_k; ++k)
6838 : {
6839 669282 : for (size_t l = j; l < max_l; ++l)
6840 : {
6841 647928 : GDALCopyWord(pSrc[l + k * nSrcWidth],
6842 647928 : pDst[k + l * nSrcHeight]);
6843 : }
6844 : }
6845 : }
6846 : }
6847 160 : }
6848 :
6849 : /************************************************************************/
6850 : /* GDALTranspose2DComplexToComplex() */
6851 : /************************************************************************/
6852 : /**
6853 : * Transpose a 2D array of complex values into an array of complex values,
6854 : * in a efficient (cache-oblivious) way.
6855 : *
6856 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6857 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6858 : * @param nSrcWidth Width of pSrc array.
6859 : * @param nSrcHeight Height of pSrc array.
6860 : */
6861 : template <class DST, class SRC>
6862 25 : void GDALTranspose2DComplexToComplex(const SRC *CPL_RESTRICT pSrc,
6863 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6864 : size_t nSrcHeight)
6865 : {
6866 25 : constexpr size_t blocksize = 32;
6867 50 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6868 : {
6869 25 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6870 50 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6871 : {
6872 : // transpose the block beginning at [i,j]
6873 25 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6874 75 : for (size_t k = i; k < max_k; ++k)
6875 : {
6876 200 : for (size_t l = j; l < max_l; ++l)
6877 : {
6878 150 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
6879 150 : pDst[2 * (k + l * nSrcHeight) + 0]);
6880 150 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 1],
6881 150 : pDst[2 * (k + l * nSrcHeight) + 1]);
6882 : }
6883 : }
6884 : }
6885 : }
6886 25 : }
6887 :
6888 : /************************************************************************/
6889 : /* GDALTranspose2DComplexToSingle() */
6890 : /************************************************************************/
6891 : /**
6892 : * Transpose a 2D array of complex values into an array of non-complex values,
6893 : * in a efficient (cache-oblivious) way.
6894 : *
6895 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6896 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6897 : * @param nSrcWidth Width of pSrc array.
6898 : * @param nSrcHeight Height of pSrc array.
6899 : */
6900 : template <class DST, class SRC>
6901 55 : void GDALTranspose2DComplexToSingle(const SRC *CPL_RESTRICT pSrc,
6902 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6903 : size_t nSrcHeight)
6904 : {
6905 55 : constexpr size_t blocksize = 32;
6906 110 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6907 : {
6908 55 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6909 110 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6910 : {
6911 : // transpose the block beginning at [i,j]
6912 55 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6913 165 : for (size_t k = i; k < max_k; ++k)
6914 : {
6915 440 : for (size_t l = j; l < max_l; ++l)
6916 : {
6917 330 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
6918 330 : pDst[k + l * nSrcHeight]);
6919 : }
6920 : }
6921 : }
6922 : }
6923 55 : }
6924 :
6925 : /************************************************************************/
6926 : /* GDALTranspose2DSingleToComplex() */
6927 : /************************************************************************/
6928 : /**
6929 : * Transpose a 2D array of non-complex values into an array of complex values,
6930 : * in a efficient (cache-oblivious) way.
6931 : *
6932 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6933 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6934 : * @param nSrcWidth Width of pSrc array.
6935 : * @param nSrcHeight Height of pSrc array.
6936 : */
6937 : template <class DST, class SRC>
6938 55 : void GDALTranspose2DSingleToComplex(const SRC *CPL_RESTRICT pSrc,
6939 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6940 : size_t nSrcHeight)
6941 : {
6942 55 : constexpr size_t blocksize = 32;
6943 110 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6944 : {
6945 55 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6946 110 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6947 : {
6948 : // transpose the block beginning at [i,j]
6949 55 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6950 165 : for (size_t k = i; k < max_k; ++k)
6951 : {
6952 440 : for (size_t l = j; l < max_l; ++l)
6953 : {
6954 330 : GDALCopyWord(pSrc[l + k * nSrcWidth],
6955 330 : pDst[2 * (k + l * nSrcHeight) + 0]);
6956 330 : pDst[2 * (k + l * nSrcHeight) + 1] = 0;
6957 : }
6958 : }
6959 : }
6960 : }
6961 55 : }
6962 :
6963 : /************************************************************************/
6964 : /* GDALTranspose2D() */
6965 : /************************************************************************/
6966 :
6967 : template <class DST, bool DST_IS_COMPLEX>
6968 295 : static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
6969 : size_t nSrcWidth, size_t nSrcHeight)
6970 : {
6971 : #define CALL_GDALTranspose2D_internal(SRC_TYPE) \
6972 : do \
6973 : { \
6974 : if constexpr (DST_IS_COMPLEX) \
6975 : { \
6976 : GDALTranspose2DSingleToComplex( \
6977 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6978 : nSrcHeight); \
6979 : } \
6980 : else \
6981 : { \
6982 : GDALTranspose2DSingleToSingle(static_cast<const SRC_TYPE *>(pSrc), \
6983 : pDst, nSrcWidth, nSrcHeight); \
6984 : } \
6985 : } while (0)
6986 :
6987 : #define CALL_GDALTranspose2DComplex_internal(SRC_TYPE) \
6988 : do \
6989 : { \
6990 : if constexpr (DST_IS_COMPLEX) \
6991 : { \
6992 : GDALTranspose2DComplexToComplex( \
6993 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6994 : nSrcHeight); \
6995 : } \
6996 : else \
6997 : { \
6998 : GDALTranspose2DComplexToSingle( \
6999 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
7000 : nSrcHeight); \
7001 : } \
7002 : } while (0)
7003 :
7004 : // clang-format off
7005 295 : switch (eSrcType)
7006 : {
7007 16 : case GDT_UInt8: CALL_GDALTranspose2D_internal(uint8_t); break;
7008 15 : case GDT_Int8: CALL_GDALTranspose2D_internal(int8_t); break;
7009 33 : case GDT_UInt16: CALL_GDALTranspose2D_internal(uint16_t); break;
7010 20 : case GDT_Int16: CALL_GDALTranspose2D_internal(int16_t); break;
7011 24 : case GDT_UInt32: CALL_GDALTranspose2D_internal(uint32_t); break;
7012 16 : case GDT_Int32: CALL_GDALTranspose2D_internal(int32_t); break;
7013 16 : case GDT_UInt64: CALL_GDALTranspose2D_internal(uint64_t); break;
7014 16 : case GDT_Int64: CALL_GDALTranspose2D_internal(int64_t); break;
7015 16 : case GDT_Float16: CALL_GDALTranspose2D_internal(GFloat16); break;
7016 19 : case GDT_Float32: CALL_GDALTranspose2D_internal(float); break;
7017 24 : case GDT_Float64: CALL_GDALTranspose2D_internal(double); break;
7018 16 : case GDT_CInt16: CALL_GDALTranspose2DComplex_internal(int16_t); break;
7019 16 : case GDT_CInt32: CALL_GDALTranspose2DComplex_internal(int32_t); break;
7020 16 : case GDT_CFloat16: CALL_GDALTranspose2DComplex_internal(GFloat16); break;
7021 16 : case GDT_CFloat32: CALL_GDALTranspose2DComplex_internal(float); break;
7022 16 : case GDT_CFloat64: CALL_GDALTranspose2DComplex_internal(double); break;
7023 0 : case GDT_Unknown:
7024 : case GDT_TypeCount:
7025 0 : break;
7026 : }
7027 : // clang-format on
7028 :
7029 : #undef CALL_GDALTranspose2D_internal
7030 : #undef CALL_GDALTranspose2DComplex_internal
7031 295 : }
7032 :
7033 : /************************************************************************/
7034 : /* GDALInterleave2Byte() */
7035 : /************************************************************************/
7036 :
7037 : #if defined(HAVE_SSE2) && \
7038 : (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
7039 :
7040 : // ICC autovectorizer doesn't do a good job at generating good SSE code,
7041 : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
7042 : #if defined(__GNUC__)
7043 : __attribute__((noinline))
7044 : #endif
7045 : static void GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
7046 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
7047 : {
7048 : size_t i = 0;
7049 : constexpr size_t VALS_PER_ITER = 16;
7050 : for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
7051 : {
7052 : __m128i xmm0 =
7053 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + i));
7054 : __m128i xmm1 = _mm_loadu_si128(
7055 : reinterpret_cast<__m128i const *>(pSrc + i + nIters));
7056 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + 2 * i),
7057 : _mm_unpacklo_epi8(xmm0, xmm1));
7058 : _mm_storeu_si128(
7059 : reinterpret_cast<__m128i *>(pDst + 2 * i + VALS_PER_ITER),
7060 : _mm_unpackhi_epi8(xmm0, xmm1));
7061 : }
7062 : #if defined(__clang__)
7063 : #pragma clang loop vectorize(disable)
7064 : #endif
7065 : for (; i < nIters; ++i)
7066 : {
7067 : pDst[2 * i + 0] = pSrc[i + 0 * nIters];
7068 : pDst[2 * i + 1] = pSrc[i + 1 * nIters];
7069 : }
7070 : }
7071 :
7072 : #else
7073 :
7074 : #if defined(__GNUC__) && !defined(__clang__)
7075 : __attribute__((optimize("tree-vectorize")))
7076 : #endif
7077 : #if defined(__GNUC__)
7078 : __attribute__((noinline))
7079 : #endif
7080 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7081 : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
7082 : #pragma clang diagnostic push
7083 : #pragma clang diagnostic ignored "-Wpass-failed"
7084 : #endif
7085 9 : static void GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
7086 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
7087 : {
7088 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7089 : #pragma clang loop vectorize(enable)
7090 : #endif
7091 355429 : for (size_t i = 0; i < nIters; ++i)
7092 : {
7093 355420 : pDst[2 * i + 0] = pSrc[i + 0 * nIters];
7094 355420 : pDst[2 * i + 1] = pSrc[i + 1 * nIters];
7095 : }
7096 9 : }
7097 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7098 : #pragma clang diagnostic pop
7099 : #endif
7100 :
7101 : #endif
7102 :
7103 : /************************************************************************/
7104 : /* GDALInterleave4Byte() */
7105 : /************************************************************************/
7106 :
7107 : #if defined(HAVE_SSE2) && \
7108 : (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
7109 :
7110 : // ICC autovectorizer doesn't do a good job at generating good SSE code,
7111 : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
7112 : #if defined(__GNUC__)
7113 : __attribute__((noinline))
7114 : #endif
7115 : static void GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
7116 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
7117 : {
7118 : size_t i = 0;
7119 : constexpr size_t VALS_PER_ITER = 16;
7120 : for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
7121 : {
7122 : __m128i xmm0 = _mm_loadu_si128(
7123 : reinterpret_cast<__m128i const *>(pSrc + i + 0 * nIters));
7124 : __m128i xmm1 = _mm_loadu_si128(
7125 : reinterpret_cast<__m128i const *>(pSrc + i + 1 * nIters));
7126 : __m128i xmm2 = _mm_loadu_si128(
7127 : reinterpret_cast<__m128i const *>(pSrc + i + 2 * nIters));
7128 : __m128i xmm3 = _mm_loadu_si128(
7129 : reinterpret_cast<__m128i const *>(pSrc + i + 3 * nIters));
7130 : auto tmp0 = _mm_unpacklo_epi8(
7131 : xmm0,
7132 : xmm1); // (xmm0_0, xmm1_0, xmm0_1, xmm1_1, xmm0_2, xmm1_2, ...)
7133 : auto tmp1 = _mm_unpackhi_epi8(
7134 : xmm0,
7135 : xmm1); // (xmm0_8, xmm1_8, xmm0_9, xmm1_9, xmm0_10, xmm1_10, ...)
7136 : auto tmp2 = _mm_unpacklo_epi8(
7137 : xmm2,
7138 : xmm3); // (xmm2_0, xmm3_0, xmm2_1, xmm3_1, xmm2_2, xmm3_2, ...)
7139 : auto tmp3 = _mm_unpackhi_epi8(
7140 : xmm2,
7141 : xmm3); // (xmm2_8, xmm3_8, xmm2_9, xmm3_9, xmm2_10, xmm3_10, ...)
7142 : auto tmp2_0 = _mm_unpacklo_epi16(
7143 : tmp0,
7144 : tmp2); // (xmm0_0, xmm1_0, xmm2_0, xmm3_0, xmm0_1, xmm1_1, xmm2_1, xmm3_1, ...)
7145 : auto tmp2_1 = _mm_unpackhi_epi16(tmp0, tmp2);
7146 : auto tmp2_2 = _mm_unpacklo_epi16(tmp1, tmp3);
7147 : auto tmp2_3 = _mm_unpackhi_epi16(tmp1, tmp3);
7148 : _mm_storeu_si128(
7149 : reinterpret_cast<__m128i *>(pDst + 4 * i + 0 * VALS_PER_ITER),
7150 : tmp2_0);
7151 : _mm_storeu_si128(
7152 : reinterpret_cast<__m128i *>(pDst + 4 * i + 1 * VALS_PER_ITER),
7153 : tmp2_1);
7154 : _mm_storeu_si128(
7155 : reinterpret_cast<__m128i *>(pDst + 4 * i + 2 * VALS_PER_ITER),
7156 : tmp2_2);
7157 : _mm_storeu_si128(
7158 : reinterpret_cast<__m128i *>(pDst + 4 * i + 3 * VALS_PER_ITER),
7159 : tmp2_3);
7160 : }
7161 : #if defined(__clang__)
7162 : #pragma clang loop vectorize(disable)
7163 : #endif
7164 : for (; i < nIters; ++i)
7165 : {
7166 : pDst[4 * i + 0] = pSrc[i + 0 * nIters];
7167 : pDst[4 * i + 1] = pSrc[i + 1 * nIters];
7168 : pDst[4 * i + 2] = pSrc[i + 2 * nIters];
7169 : pDst[4 * i + 3] = pSrc[i + 3 * nIters];
7170 : }
7171 : }
7172 :
7173 : #else
7174 :
7175 : #if defined(__GNUC__) && !defined(__clang__)
7176 : __attribute__((optimize("tree-vectorize")))
7177 : #endif
7178 : #if defined(__GNUC__)
7179 : __attribute__((noinline))
7180 : #endif
7181 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7182 : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
7183 : #pragma clang diagnostic push
7184 : #pragma clang diagnostic ignored "-Wpass-failed"
7185 : #endif
7186 30 : static void GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
7187 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
7188 : {
7189 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7190 : #pragma clang loop vectorize(enable)
7191 : #endif
7192 49620700 : for (size_t i = 0; i < nIters; ++i)
7193 : {
7194 49620600 : pDst[4 * i + 0] = pSrc[i + 0 * nIters];
7195 49620600 : pDst[4 * i + 1] = pSrc[i + 1 * nIters];
7196 49620600 : pDst[4 * i + 2] = pSrc[i + 2 * nIters];
7197 49620600 : pDst[4 * i + 3] = pSrc[i + 3 * nIters];
7198 : }
7199 30 : }
7200 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7201 : #pragma clang diagnostic pop
7202 : #endif
7203 :
7204 : #endif
7205 :
7206 : /************************************************************************/
7207 : /* GDALTranspose2D() */
7208 : /************************************************************************/
7209 :
7210 : /**
7211 : * Transpose a 2D array in a efficient (cache-oblivious) way.
7212 : *
7213 : * @param pSrc Source array of width = nSrcWidth and height = nSrcHeight.
7214 : * @param eSrcType Data type of pSrc.
7215 : * @param pDst Destination transposed array of width = nSrcHeight and height = nSrcWidth.
7216 : * @param eDstType Data type of pDst.
7217 : * @param nSrcWidth Width of pSrc array.
7218 : * @param nSrcHeight Height of pSrc array.
7219 : * @since GDAL 3.11
7220 : */
7221 :
7222 365 : void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
7223 : GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
7224 : {
7225 365 : if (eSrcType == eDstType && (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8))
7226 : {
7227 70 : if (nSrcHeight == 2)
7228 : {
7229 9 : GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
7230 : static_cast<uint8_t *>(pDst), nSrcWidth);
7231 9 : return;
7232 : }
7233 61 : if (nSrcHeight == 4)
7234 : {
7235 30 : GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
7236 : static_cast<uint8_t *>(pDst), nSrcWidth);
7237 30 : return;
7238 : }
7239 : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
7240 : (defined(__x86_64) || defined(_M_X64)))
7241 31 : if (CPLHaveRuntimeSSSE3())
7242 : {
7243 31 : GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
7244 : static_cast<uint8_t *>(pDst), nSrcWidth,
7245 : nSrcHeight);
7246 31 : return;
7247 : }
7248 : #elif defined(USE_NEON_OPTIMIZATIONS)
7249 : {
7250 : GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
7251 : static_cast<uint8_t *>(pDst), nSrcWidth,
7252 : nSrcHeight);
7253 : return;
7254 : }
7255 : #endif
7256 : }
7257 :
7258 : #define CALL_GDALTranspose2D_internal(DST_TYPE, DST_IS_COMPLEX) \
7259 : GDALTranspose2D<DST_TYPE, DST_IS_COMPLEX>( \
7260 : pSrc, eSrcType, static_cast<DST_TYPE *>(pDst), nSrcWidth, nSrcHeight)
7261 :
7262 : // clang-format off
7263 295 : switch (eDstType)
7264 : {
7265 15 : case GDT_UInt8: CALL_GDALTranspose2D_internal(uint8_t, false); break;
7266 15 : case GDT_Int8: CALL_GDALTranspose2D_internal(int8_t, false); break;
7267 33 : case GDT_UInt16: CALL_GDALTranspose2D_internal(uint16_t, false); break;
7268 20 : case GDT_Int16: CALL_GDALTranspose2D_internal(int16_t, false); break;
7269 24 : case GDT_UInt32: CALL_GDALTranspose2D_internal(uint32_t, false); break;
7270 16 : case GDT_Int32: CALL_GDALTranspose2D_internal(int32_t, false); break;
7271 16 : case GDT_UInt64: CALL_GDALTranspose2D_internal(uint64_t, false); break;
7272 16 : case GDT_Int64: CALL_GDALTranspose2D_internal(int64_t, false); break;
7273 16 : case GDT_Float16: CALL_GDALTranspose2D_internal(GFloat16, false); break;
7274 19 : case GDT_Float32: CALL_GDALTranspose2D_internal(float, false); break;
7275 25 : case GDT_Float64: CALL_GDALTranspose2D_internal(double, false); break;
7276 16 : case GDT_CInt16: CALL_GDALTranspose2D_internal(int16_t, true); break;
7277 16 : case GDT_CInt32: CALL_GDALTranspose2D_internal(int32_t, true); break;
7278 16 : case GDT_CFloat16: CALL_GDALTranspose2D_internal(GFloat16, true); break;
7279 16 : case GDT_CFloat32: CALL_GDALTranspose2D_internal(float, true); break;
7280 16 : case GDT_CFloat64: CALL_GDALTranspose2D_internal(double, true); break;
7281 0 : case GDT_Unknown:
7282 : case GDT_TypeCount:
7283 0 : break;
7284 : }
7285 : // clang-format on
7286 :
7287 : #undef CALL_GDALTranspose2D_internal
7288 : }
7289 :
7290 : /************************************************************************/
7291 : /* ExtractBitAndConvertTo255() */
7292 : /************************************************************************/
7293 :
7294 : #if defined(__GNUC__) || defined(_MSC_VER)
7295 : // Signedness of char implementation dependent, so be explicit.
7296 : // Assumes 2-complement integer types and sign extension of right shifting
7297 : // GCC guarantees such:
7298 : // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
7299 143590 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
7300 : {
7301 143590 : return static_cast<GByte>(static_cast<signed char>(byVal << (7 - nBit)) >>
7302 143590 : 7);
7303 : }
7304 : #else
7305 : // Portable way
7306 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
7307 : {
7308 : return (byVal & (1 << nBit)) ? 255 : 0;
7309 : }
7310 : #endif
7311 :
7312 : /************************************************************************/
7313 : /* ExpandEightPackedBitsToByteAt255() */
7314 : /************************************************************************/
7315 :
7316 17813 : static inline void ExpandEightPackedBitsToByteAt255(GByte byVal,
7317 : GByte abyOutput[8])
7318 : {
7319 17813 : abyOutput[0] = ExtractBitAndConvertTo255(byVal, 7);
7320 17813 : abyOutput[1] = ExtractBitAndConvertTo255(byVal, 6);
7321 17813 : abyOutput[2] = ExtractBitAndConvertTo255(byVal, 5);
7322 17813 : abyOutput[3] = ExtractBitAndConvertTo255(byVal, 4);
7323 17813 : abyOutput[4] = ExtractBitAndConvertTo255(byVal, 3);
7324 17813 : abyOutput[5] = ExtractBitAndConvertTo255(byVal, 2);
7325 17813 : abyOutput[6] = ExtractBitAndConvertTo255(byVal, 1);
7326 17813 : abyOutput[7] = ExtractBitAndConvertTo255(byVal, 0);
7327 17813 : }
7328 :
7329 : /************************************************************************/
7330 : /* GDALExpandPackedBitsToByteAt0Or255() */
7331 : /************************************************************************/
7332 :
7333 : /** Expand packed-bits (ordered from most-significant bit to least one)
7334 : into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
7335 : at 1 to a byte at 255.
7336 :
7337 : The function does (in a possibly more optimized way) the following:
7338 : \code{.cpp}
7339 : for (size_t i = 0; i < nInputBits; ++i )
7340 : {
7341 : pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 255 : 0;
7342 : }
7343 : \endcode
7344 :
7345 : @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
7346 : @param pabyOutput Output array of nInputBits bytes.
7347 : @param nInputBits Number of valid bits in pabyInput.
7348 :
7349 : @since 3.11
7350 : */
7351 :
7352 45357 : void GDALExpandPackedBitsToByteAt0Or255(const GByte *CPL_RESTRICT pabyInput,
7353 : GByte *CPL_RESTRICT pabyOutput,
7354 : size_t nInputBits)
7355 : {
7356 45357 : const size_t nInputWholeBytes = nInputBits / 8;
7357 45357 : size_t iByte = 0;
7358 :
7359 : #ifdef HAVE_SSE2
7360 : // Mask to isolate each bit
7361 45357 : const __m128i bit_mask = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4,
7362 : 8, 16, 32, 64, -128);
7363 45357 : const __m128i zero = _mm_setzero_si128();
7364 45357 : const __m128i all_ones = _mm_set1_epi8(-1);
7365 : #ifdef __SSSE3__
7366 : const __m128i dispatch_two_bytes =
7367 : _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
7368 : #endif
7369 45357 : constexpr size_t SSE_REG_SIZE = sizeof(bit_mask);
7370 135866 : for (; iByte + SSE_REG_SIZE <= nInputWholeBytes; iByte += SSE_REG_SIZE)
7371 : {
7372 90509 : __m128i reg_ori = _mm_loadu_si128(
7373 90509 : reinterpret_cast<const __m128i *>(pabyInput + iByte));
7374 :
7375 90509 : constexpr int NUM_PROCESSED_BYTES_PER_REG = 2;
7376 814581 : for (size_t k = 0; k < SSE_REG_SIZE / NUM_PROCESSED_BYTES_PER_REG; ++k)
7377 : {
7378 : // Given reg_ori = (A, B, ... 14 other bytes ...),
7379 : // expand to (A, A, A, A, A, A, A, A, B, B, B, B, B, B, B, B)
7380 : #ifdef __SSSE3__
7381 : __m128i reg = _mm_shuffle_epi8(reg_ori, dispatch_two_bytes);
7382 : #else
7383 724072 : __m128i reg = _mm_unpacklo_epi8(reg_ori, reg_ori);
7384 724072 : reg = _mm_unpacklo_epi16(reg, reg);
7385 724072 : reg = _mm_unpacklo_epi32(reg, reg);
7386 : #endif
7387 :
7388 : // Test if bits of interest are set
7389 724072 : reg = _mm_and_si128(reg, bit_mask);
7390 :
7391 : // Now test if those bits are set, by comparing to zero. So the
7392 : // result will be that bytes where bits are set will be at 0, and
7393 : // ones where they are cleared will be at 0xFF. So the inverse of
7394 : // the end result we want!
7395 724072 : reg = _mm_cmpeq_epi8(reg, zero);
7396 :
7397 : // Invert the result
7398 724072 : reg = _mm_andnot_si128(reg, all_ones);
7399 :
7400 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyOutput), reg);
7401 :
7402 724072 : pabyOutput += SSE_REG_SIZE;
7403 :
7404 : // Right-shift of 2 bytes
7405 724072 : reg_ori = _mm_bsrli_si128(reg_ori, NUM_PROCESSED_BYTES_PER_REG);
7406 : }
7407 : }
7408 :
7409 : #endif // HAVE_SSE2
7410 :
7411 63170 : for (; iByte < nInputWholeBytes; ++iByte)
7412 : {
7413 17813 : ExpandEightPackedBitsToByteAt255(pabyInput[iByte], pabyOutput);
7414 17813 : pabyOutput += 8;
7415 : }
7416 46443 : for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
7417 : {
7418 1086 : *pabyOutput = ExtractBitAndConvertTo255(pabyInput[iByte], 7 - iBit);
7419 1086 : ++pabyOutput;
7420 : }
7421 45357 : }
7422 :
7423 : /************************************************************************/
7424 : /* ExpandEightPackedBitsToByteAt1() */
7425 : /************************************************************************/
7426 :
7427 136113 : static inline void ExpandEightPackedBitsToByteAt1(GByte byVal,
7428 : GByte abyOutput[8])
7429 : {
7430 136113 : abyOutput[0] = (byVal >> 7) & 0x1;
7431 136113 : abyOutput[1] = (byVal >> 6) & 0x1;
7432 136113 : abyOutput[2] = (byVal >> 5) & 0x1;
7433 136113 : abyOutput[3] = (byVal >> 4) & 0x1;
7434 136113 : abyOutput[4] = (byVal >> 3) & 0x1;
7435 136113 : abyOutput[5] = (byVal >> 2) & 0x1;
7436 136113 : abyOutput[6] = (byVal >> 1) & 0x1;
7437 136113 : abyOutput[7] = (byVal >> 0) & 0x1;
7438 136113 : }
7439 :
7440 : /************************************************************************/
7441 : /* GDALExpandPackedBitsToByteAt0Or1() */
7442 : /************************************************************************/
7443 :
7444 : /** Expand packed-bits (ordered from most-significant bit to least one)
7445 : into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
7446 : at 1 to a byte at 1.
7447 :
7448 : The function does (in a possibly more optimized way) the following:
7449 : \code{.cpp}
7450 : for (size_t i = 0; i < nInputBits; ++i )
7451 : {
7452 : pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
7453 : }
7454 : \endcode
7455 :
7456 : @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
7457 : @param pabyOutput Output array of nInputBits bytes.
7458 : @param nInputBits Number of valid bits in pabyInput.
7459 :
7460 : @since 3.11
7461 : */
7462 :
7463 7033 : void GDALExpandPackedBitsToByteAt0Or1(const GByte *CPL_RESTRICT pabyInput,
7464 : GByte *CPL_RESTRICT pabyOutput,
7465 : size_t nInputBits)
7466 : {
7467 7033 : const size_t nInputWholeBytes = nInputBits / 8;
7468 7033 : size_t iByte = 0;
7469 143146 : for (; iByte < nInputWholeBytes; ++iByte)
7470 : {
7471 136113 : ExpandEightPackedBitsToByteAt1(pabyInput[iByte], pabyOutput);
7472 136113 : pabyOutput += 8;
7473 : }
7474 18886 : for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
7475 : {
7476 11853 : *pabyOutput = (pabyInput[iByte] >> (7 - iBit)) & 0x1;
7477 11853 : ++pabyOutput;
7478 : }
7479 7033 : }
|