Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: GDAL Core
4 : * Purpose: Contains default implementation of GDALRasterBand::IRasterIO()
5 : * and supporting functions of broader utility.
6 : * Author: Frank Warmerdam, warmerdam@pobox.com
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 1998, Frank Warmerdam
10 : * Copyright (c) 2007-2014, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * SPDX-License-Identifier: MIT
13 : ****************************************************************************/
14 :
15 : #include "cpl_port.h"
16 : #include "gdal.h"
17 : #include "gdal_priv.h"
18 :
19 : #include <cassert>
20 : #include <climits>
21 : #include <cmath>
22 : #include <cstddef>
23 : #include <cstdio>
24 : #include <cstdlib>
25 : #include <cstring>
26 :
27 : #include <algorithm>
28 : #include <limits>
29 : #include <stdexcept>
30 : #include <type_traits>
31 :
32 : #include "cpl_conv.h"
33 : #include "cpl_cpu_features.h"
34 : #include "cpl_error.h"
35 : #include "cpl_float.h"
36 : #include "cpl_progress.h"
37 : #include "cpl_string.h"
38 : #include "cpl_vsi.h"
39 : #include "gdal_priv_templates.hpp"
40 : #include "gdal_vrt.h"
41 : #include "gdalwarper.h"
42 : #include "memdataset.h"
43 : #include "vrtdataset.h"
44 :
45 : #if defined(__x86_64) || defined(_M_X64)
46 : #include <emmintrin.h>
47 : #include <immintrin.h>
48 : #define HAVE_SSE2
49 : // AVX2 dispatch: compile AVX2 code with target attribute, detect at runtime
50 : #if (defined(__GNUC__) || defined(__clang__)) && \
51 : defined(HAVE_AVX2_AT_COMPILE_TIME)
52 : #define HAVE_AVX2_DISPATCH
53 : #elif defined(_MSC_VER)
54 : #include <intrin.h>
55 : #define HAVE_AVX2_DISPATCH
56 : #endif
57 : #elif defined(USE_NEON_OPTIMIZATIONS)
58 : #include "include_sse2neon.h"
59 : #define HAVE_SSE2
60 : #endif
61 :
62 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
63 : #include "rasterio_ssse3.h"
64 : #ifdef __SSSE3__
65 : #include <tmmintrin.h>
66 : #endif
67 : #endif
68 :
69 : #ifdef __SSE4_1__
70 : #include <smmintrin.h>
71 : #endif
72 :
73 : #ifdef __GNUC__
74 : #define CPL_NOINLINE __attribute__((noinline))
75 : #else
76 : #define CPL_NOINLINE
77 : #endif
78 :
79 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
80 : int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
81 : int nDstPixelStride, GPtrDiff_t nWordCount);
82 :
83 : /************************************************************************/
84 : /* DownsamplingIntegerXFactor() */
85 : /************************************************************************/
86 :
87 : template <bool bSameDataType, int DATA_TYPE_SIZE>
88 695850 : static bool DownsamplingIntegerXFactor(
89 : GDALRasterBand *poBand, int iSrcX, int nSrcXInc, GPtrDiff_t iSrcOffsetCst,
90 : GByte *CPL_RESTRICT pabyDstData, int nPixelSpace, int nBufXSize,
91 : GDALDataType eDataType, GDALDataType eBufType, int &nStartBlockX,
92 : int nBlockXSize, GDALRasterBlock *&poBlock, int nLBlockY)
93 : {
94 695850 : const int nBandDataSize =
95 : bSameDataType ? DATA_TYPE_SIZE : GDALGetDataTypeSizeBytes(eDataType);
96 695850 : int nOuterLoopIters = nBufXSize - 1;
97 695850 : const int nIncSrcOffset = nSrcXInc * nBandDataSize;
98 : const GByte *CPL_RESTRICT pabySrcData;
99 695850 : int nEndBlockX = nBlockXSize + nStartBlockX;
100 :
101 695850 : if (iSrcX < nEndBlockX)
102 : {
103 295062 : CPLAssert(poBlock);
104 295062 : goto no_reload_block;
105 : }
106 400788 : goto reload_block;
107 :
108 : // Don't do the last iteration in the loop, as iSrcX might go beyond
109 : // nRasterXSize - 1
110 1265113 : while (--nOuterLoopIters >= 1)
111 : {
112 201834 : iSrcX += nSrcXInc;
113 201834 : pabySrcData += nIncSrcOffset;
114 201834 : pabyDstData += nPixelSpace;
115 :
116 : /* --------------------------------------------------------------------
117 : */
118 : /* Ensure we have the appropriate block loaded. */
119 : /* --------------------------------------------------------------------
120 : */
121 201834 : if (iSrcX >= nEndBlockX)
122 : {
123 201834 : reload_block:
124 : {
125 615212 : const int nLBlockX = iSrcX / nBlockXSize;
126 615212 : nStartBlockX = nLBlockX * nBlockXSize;
127 615212 : nEndBlockX = nStartBlockX + nBlockXSize;
128 :
129 615212 : if (poBlock != nullptr)
130 341376 : poBlock->DropLock();
131 :
132 615212 : poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
133 615212 : if (poBlock == nullptr)
134 : {
135 1 : return false;
136 : }
137 : }
138 :
139 615211 : no_reload_block:
140 : const GByte *pabySrcBlock =
141 1265113 : static_cast<const GByte *>(poBlock->GetDataRef());
142 1265113 : GPtrDiff_t iSrcOffset =
143 1265113 : (iSrcX - nStartBlockX + iSrcOffsetCst) * nBandDataSize;
144 1265113 : pabySrcData = pabySrcBlock + iSrcOffset;
145 : }
146 :
147 : /* --------------------------------------------------------------------
148 : */
149 : /* Copy the maximum run of pixels. */
150 : /* --------------------------------------------------------------------
151 : */
152 :
153 1265113 : const int nIters = std::min(
154 1265113 : (nEndBlockX - iSrcX + (nSrcXInc - 1)) / nSrcXInc, nOuterLoopIters);
155 : if (bSameDataType)
156 : {
157 1264670 : memcpy(pabyDstData, pabySrcData, nBandDataSize);
158 1264670 : if (nIters > 1)
159 : {
160 : if (DATA_TYPE_SIZE == 1)
161 : {
162 326320 : pabySrcData += nIncSrcOffset;
163 326320 : pabyDstData += nPixelSpace;
164 326320 : GDALFastCopyByte(pabySrcData, nIncSrcOffset, pabyDstData,
165 326320 : nPixelSpace, nIters - 1);
166 326320 : pabySrcData +=
167 326320 : static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 2);
168 326320 : pabyDstData +=
169 326320 : static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 2);
170 : }
171 : else
172 : {
173 4395716 : for (int i = 0; i < nIters - 1; i++)
174 : {
175 4197550 : pabySrcData += nIncSrcOffset;
176 4197550 : pabyDstData += nPixelSpace;
177 4197550 : memcpy(pabyDstData, pabySrcData, nBandDataSize);
178 : }
179 : }
180 524490 : iSrcX += nSrcXInc * (nIters - 1);
181 524490 : nOuterLoopIters -= nIters - 1;
182 : }
183 : }
184 : else
185 : {
186 : // Type to type conversion ...
187 443 : GDALCopyWords64(pabySrcData, eDataType, nIncSrcOffset, pabyDstData,
188 443 : eBufType, nPixelSpace, std::max(1, nIters));
189 443 : if (nIters > 1)
190 : {
191 216 : pabySrcData +=
192 216 : static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 1);
193 216 : pabyDstData +=
194 216 : static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 1);
195 216 : iSrcX += nSrcXInc * (nIters - 1);
196 216 : nOuterLoopIters -= nIters - 1;
197 : }
198 : }
199 : }
200 :
201 : // Deal with last iteration to avoid iSrcX to go beyond nRasterXSize - 1
202 1063279 : if (nOuterLoopIters == 0)
203 : {
204 367430 : const int nRasterXSize = poBand->GetXSize();
205 367430 : iSrcX =
206 734860 : static_cast<int>(std::min(static_cast<GInt64>(iSrcX) + nSrcXInc,
207 367430 : static_cast<GInt64>(nRasterXSize - 1)));
208 367430 : pabyDstData += nPixelSpace;
209 367430 : if (iSrcX < nEndBlockX)
210 : {
211 354840 : goto no_reload_block;
212 : }
213 12590 : goto reload_block;
214 : }
215 695849 : return true;
216 : }
217 :
/**
 * Return the product a * b, with the usual arithmetic conversions applied
 * to the two operand types.
 *
 * The function carries the CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW attribute
 * macro; presumably this keeps a wrap-around of unsigned operands from being
 * reported by the sanitizer (confirm against cpl_port.h).
 */
template <class A, class B>
CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW inline auto CPLUnsanitizedMul(A a, B b)
{
    const auto product = a * b;
    return product;
}
223 :
224 : /************************************************************************/
225 : /* IRasterIO() */
226 : /* */
227 : /* Default internal implementation of RasterIO() ... utilizes */
228 : /* the Block access methods to satisfy the request. This would */
229 : /* normally only be overridden by formats with overviews. */
230 : /************************************************************************/
231 :
232 6195440 : CPLErr GDALRasterBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
233 : int nXSize, int nYSize, void *pData,
234 : int nBufXSize, int nBufYSize,
235 : GDALDataType eBufType, GSpacing nPixelSpace,
236 : GSpacing nLineSpace,
237 : GDALRasterIOExtraArg *psExtraArg)
238 :
239 : {
240 6195440 : if (eRWFlag == GF_Write && eFlushBlockErr != CE_None)
241 : {
242 0 : CPLError(eFlushBlockErr, CPLE_AppDefined,
243 : "An error occurred while writing a dirty block "
244 : "from GDALRasterBand::IRasterIO");
245 0 : CPLErr eErr = eFlushBlockErr;
246 0 : eFlushBlockErr = CE_None;
247 0 : return eErr;
248 : }
249 6195440 : if (nBlockXSize <= 0 || nBlockYSize <= 0)
250 : {
251 0 : CPLError(CE_Failure, CPLE_AppDefined, "Invalid block size");
252 0 : return CE_Failure;
253 : }
254 :
255 6195440 : const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
256 6195440 : const int nBufDataSize = GDALGetDataTypeSizeBytes(eBufType);
257 6195440 : GByte dummyBlock[2] = {0, 0};
258 6195440 : GByte *pabySrcBlock =
259 : dummyBlock; /* to avoid Coverity warning about nullptr dereference */
260 6195440 : GDALRasterBlock *poBlock = nullptr;
261 6195440 : const bool bUseIntegerRequestCoords =
262 6562020 : (!psExtraArg->bFloatingPointWindowValidity ||
263 366579 : (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
264 341647 : nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
265 :
266 : /* ==================================================================== */
267 : /* A common case is the data requested with the destination */
268 : /* is packed, and the block width is the raster width. */
269 : /* ==================================================================== */
270 6102080 : if (nPixelSpace == nBufDataSize && nLineSpace == nPixelSpace * nXSize &&
271 3241120 : nBlockXSize == GetXSize() && nBufXSize == nXSize &&
272 12297500 : nBufYSize == nYSize && bUseIntegerRequestCoords)
273 : {
274 3100100 : CPLErr eErr = CE_None;
275 3100100 : int nLBlockY = -1;
276 :
277 9806530 : for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
278 : {
279 6707510 : const int iSrcY = iBufYOff + nYOff;
280 :
281 6707510 : if (iSrcY < nLBlockY * nBlockYSize ||
282 6707510 : iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
283 : {
284 3369540 : nLBlockY = iSrcY / nBlockYSize;
285 3369540 : bool bJustInitialize =
286 298129 : eRWFlag == GF_Write && nXOff == 0 &&
287 3725690 : nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
288 58015 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize;
289 :
290 : // Is this a partial tile at right and/or bottom edges of
291 : // the raster, and that is going to be completely written?
292 : // If so, do not load it from storage, but zero it so that
293 : // the content outside of the validity area is initialized.
294 3369540 : bool bMemZeroBuffer = false;
295 298129 : if (eRWFlag == GF_Write && !bJustInitialize && nXOff == 0 &&
296 25683 : nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
297 3667760 : nYOff + nYSize == GetYSize() &&
298 90 : nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)
299 : {
300 90 : bJustInitialize = true;
301 90 : bMemZeroBuffer = true;
302 : }
303 :
304 3369540 : if (poBlock)
305 269438 : poBlock->DropLock();
306 :
307 3369540 : const GUInt32 nErrorCounter = CPLGetErrorCounter();
308 3369540 : poBlock = GetLockedBlockRef(0, nLBlockY, bJustInitialize);
309 3369540 : if (poBlock == nullptr)
310 : {
311 1079 : if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
312 : nullptr)
313 : {
314 0 : CPLError(CE_Failure, CPLE_AppDefined,
315 : "GetBlockRef failed at X block offset %d, "
316 : "Y block offset %d%s",
317 : 0, nLBlockY,
318 0 : (nErrorCounter != CPLGetErrorCounter())
319 0 : ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
320 : : "");
321 : }
322 1079 : eErr = CE_Failure;
323 1079 : break;
324 : }
325 :
326 3368460 : if (eRWFlag == GF_Write)
327 298129 : poBlock->MarkDirty();
328 :
329 3368460 : pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
330 3368460 : if (bMemZeroBuffer)
331 : {
332 90 : memset(pabySrcBlock, 0,
333 90 : static_cast<GPtrDiff_t>(nBandDataSize) *
334 90 : nBlockXSize * nBlockYSize);
335 : }
336 : }
337 :
338 6706430 : const auto nSrcByteOffset =
339 6706430 : (static_cast<GPtrDiff_t>(iSrcY - nLBlockY * nBlockYSize) *
340 6706430 : nBlockXSize +
341 6706430 : nXOff) *
342 6706430 : nBandDataSize;
343 :
344 6706430 : if (eDataType == eBufType)
345 : {
346 3041110 : if (eRWFlag == GF_Read)
347 2565860 : memcpy(static_cast<GByte *>(pData) +
348 2565860 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
349 2565860 : pabySrcBlock + nSrcByteOffset,
350 : static_cast<size_t>(nLineSpace));
351 : else
352 475251 : memcpy(pabySrcBlock + nSrcByteOffset,
353 475251 : static_cast<GByte *>(pData) +
354 475251 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
355 : static_cast<size_t>(nLineSpace));
356 : }
357 : else
358 : {
359 : // Type to type conversion.
360 3665320 : if (eRWFlag == GF_Read)
361 3643020 : GDALCopyWords64(
362 3643020 : pabySrcBlock + nSrcByteOffset, eDataType, nBandDataSize,
363 : static_cast<GByte *>(pData) +
364 3643020 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
365 : eBufType, static_cast<int>(nPixelSpace), nBufXSize);
366 : else
367 22299 : GDALCopyWords64(static_cast<GByte *>(pData) +
368 22299 : static_cast<GPtrDiff_t>(iBufYOff) *
369 : nLineSpace,
370 : eBufType, static_cast<int>(nPixelSpace),
371 22299 : pabySrcBlock + nSrcByteOffset, eDataType,
372 : nBandDataSize, nBufXSize);
373 : }
374 :
375 6794570 : if (psExtraArg->pfnProgress != nullptr &&
376 88144 : !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
377 : psExtraArg->pProgressData))
378 : {
379 5 : eErr = CE_Failure;
380 5 : break;
381 : }
382 : }
383 :
384 3100100 : if (poBlock)
385 3099020 : poBlock->DropLock();
386 :
387 3100100 : return eErr;
388 : }
389 :
390 : /* ==================================================================== */
391 : /* Do we have overviews that would be appropriate to satisfy */
392 : /* this request? */
393 : /* ==================================================================== */
394 3095330 : if ((nBufXSize < nXSize || nBufYSize < nYSize) && GetOverviewCount() > 0 &&
395 : eRWFlag == GF_Read)
396 : {
397 : GDALRasterIOExtraArg sExtraArg;
398 2967 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
399 :
400 : const int nOverview =
401 2967 : GDALBandGetBestOverviewLevel2(this, nXOff, nYOff, nXSize, nYSize,
402 : nBufXSize, nBufYSize, &sExtraArg);
403 2967 : if (nOverview >= 0)
404 : {
405 2892 : GDALRasterBand *poOverviewBand = GetOverview(nOverview);
406 2892 : if (poOverviewBand == nullptr)
407 2892 : return CE_Failure;
408 :
409 2892 : return poOverviewBand->RasterIO(
410 : eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
411 2892 : nBufYSize, eBufType, nPixelSpace, nLineSpace, &sExtraArg);
412 : }
413 : }
414 :
415 902908 : if (eRWFlag == GF_Read && nBufXSize < nXSize / 100 &&
416 6 : nBufYSize < nYSize / 100 && nPixelSpace == nBufDataSize &&
417 3995360 : nLineSpace == nPixelSpace * nBufXSize &&
418 6 : CPLTestBool(CPLGetConfigOption("GDAL_NO_COSTLY_OVERVIEW", "NO")))
419 : {
420 0 : memset(pData, 0, static_cast<size_t>(nLineSpace * nBufYSize));
421 0 : return CE_None;
422 : }
423 :
424 : /* ==================================================================== */
425 : /* The second case when we don't need subsample data but likely */
426 : /* need data type conversion. */
427 : /* ==================================================================== */
428 3092440 : if ( // nPixelSpace == nBufDataSize &&
429 3092440 : nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
430 : {
431 : #if DEBUG_VERBOSE
432 : printf("IRasterIO(%d,%d,%d,%d) rw=%d case 2\n", /*ok*/
433 : nXOff, nYOff, nXSize, nYSize, static_cast<int>(eRWFlag));
434 : #endif
435 :
436 : /* --------------------------------------------------------------------
437 : */
438 : /* Loop over buffer computing source locations. */
439 : /* --------------------------------------------------------------------
440 : */
441 : // Calculate starting values out of loop
442 2512990 : const int nLBlockXStart = nXOff / nBlockXSize;
443 2512990 : const int nXSpanEnd = nBufXSize + nXOff;
444 :
445 2512990 : int iBufYOff = 0;
446 2512990 : int iSrcY = nYOff;
447 : while (true)
448 : {
449 2554070 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
450 : static_cast<GPtrDiff_t>(nLineSpace);
451 2554070 : int nLBlockY = iSrcY / nBlockYSize;
452 2554070 : int nLBlockX = nLBlockXStart;
453 2554070 : int iSrcX = nXOff;
454 5386480 : while (iSrcX < nXSpanEnd)
455 : {
456 2832480 : int nXSpan = nLBlockX * nBlockXSize;
457 2832480 : if (nXSpan < INT_MAX - nBlockXSize)
458 2832480 : nXSpan += nBlockXSize;
459 : else
460 0 : nXSpan = INT_MAX;
461 2832480 : const int nXRight = nXSpan;
462 2832480 : nXSpan = (nXSpan < nXSpanEnd ? nXSpan : nXSpanEnd) - iSrcX;
463 :
464 : const size_t nXSpanSize =
465 2832480 : CPLUnsanitizedMul(nXSpan, static_cast<size_t>(nPixelSpace));
466 :
467 2832480 : bool bJustInitialize =
468 2043070 : eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
469 38135 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
470 4901960 : nXOff <= nLBlockX * nBlockXSize &&
471 26406 : nXOff + nXSize >= nXRight;
472 :
473 : // Is this a partial tile at right and/or bottom edges of
474 : // the raster, and that is going to be completely written?
475 : // If so, do not load it from storage, but zero it so that
476 : // the content outside of the validity area is initialized.
477 2832480 : bool bMemZeroBuffer = false;
478 2043070 : if (eRWFlag == GF_Write && !bJustInitialize &&
479 2017920 : nXOff <= nLBlockX * nBlockXSize &&
480 2016260 : nYOff <= nLBlockY * nBlockYSize &&
481 12215 : (nXOff + nXSize >= nXRight ||
482 : // cppcheck-suppress knownConditionTrueFalse
483 4878330 : (nXOff + nXSize == GetXSize() && nXRight > GetXSize())) &&
484 12035 : (nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize ||
485 10801 : (nYOff + nYSize == GetYSize() &&
486 2009 : nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)))
487 : {
488 3243 : bJustInitialize = true;
489 3243 : bMemZeroBuffer = true;
490 : }
491 :
492 : /* --------------------------------------------------------------------
493 : */
494 : /* Ensure we have the appropriate block loaded. */
495 : /* --------------------------------------------------------------------
496 : */
497 2832480 : const GUInt32 nErrorCounter = CPLGetErrorCounter();
498 2832480 : poBlock =
499 2832480 : GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
500 2832480 : if (!poBlock)
501 : {
502 73 : if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
503 : nullptr)
504 : {
505 0 : CPLError(CE_Failure, CPLE_AppDefined,
506 : "GetBlockRef failed at X block offset %d, "
507 : "Y block offset %d%s",
508 : nLBlockX, nLBlockY,
509 0 : (nErrorCounter != CPLGetErrorCounter())
510 0 : ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
511 : : "");
512 : }
513 73 : return (CE_Failure);
514 : }
515 :
516 2832410 : if (eRWFlag == GF_Write)
517 2043070 : poBlock->MarkDirty();
518 :
519 2832410 : pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
520 2832410 : if (bMemZeroBuffer)
521 : {
522 3243 : memset(pabySrcBlock, 0,
523 3243 : static_cast<GPtrDiff_t>(nBandDataSize) *
524 3243 : nBlockXSize * nBlockYSize);
525 : }
526 : /* --------------------------------------------------------------------
527 : */
528 : /* Copy over this chunk of data. */
529 : /* --------------------------------------------------------------------
530 : */
531 2832410 : GPtrDiff_t iSrcOffset =
532 2832410 : (static_cast<GPtrDiff_t>(iSrcX) -
533 2832410 : static_cast<GPtrDiff_t>(nLBlockX * nBlockXSize) +
534 2832410 : (static_cast<GPtrDiff_t>(iSrcY) -
535 2832410 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
536 2832410 : nBlockXSize) *
537 2832410 : nBandDataSize;
538 : // Fill up as many rows as possible for the loaded block.
539 5664820 : const int kmax = std::min(nBlockYSize - (iSrcY % nBlockYSize),
540 2832410 : nBufYSize - iBufYOff);
541 61154100 : for (int k = 0; k < kmax; k++)
542 : {
543 58321700 : if (eDataType == eBufType && nPixelSpace == nBufDataSize)
544 : {
545 53917300 : if (eRWFlag == GF_Read)
546 49467500 : memcpy(static_cast<GByte *>(pData) + iBufOffset +
547 49467500 : static_cast<GPtrDiff_t>(k) * nLineSpace,
548 49467500 : pabySrcBlock + iSrcOffset, nXSpanSize);
549 : else
550 4449830 : memcpy(pabySrcBlock + iSrcOffset,
551 4449830 : static_cast<GByte *>(pData) + iBufOffset +
552 4449830 : static_cast<GPtrDiff_t>(k) * nLineSpace,
553 : nXSpanSize);
554 : }
555 : else
556 : {
557 : /* type to type conversion */
558 4404350 : if (eRWFlag == GF_Read)
559 4254140 : GDALCopyWords64(
560 4254140 : pabySrcBlock + iSrcOffset, eDataType,
561 : nBandDataSize,
562 4254140 : static_cast<GByte *>(pData) + iBufOffset +
563 4254140 : static_cast<GPtrDiff_t>(k) * nLineSpace,
564 : eBufType, static_cast<int>(nPixelSpace),
565 : nXSpan);
566 : else
567 150209 : GDALCopyWords64(
568 150209 : static_cast<GByte *>(pData) + iBufOffset +
569 150209 : static_cast<GPtrDiff_t>(k) * nLineSpace,
570 : eBufType, static_cast<int>(nPixelSpace),
571 150209 : pabySrcBlock + iSrcOffset, eDataType,
572 : nBandDataSize, nXSpan);
573 : }
574 :
575 58321700 : iSrcOffset +=
576 58321700 : static_cast<GPtrDiff_t>(nBlockXSize) * nBandDataSize;
577 : }
578 :
579 : iBufOffset =
580 2832410 : CPLUnsanitizedAdd<GPtrDiff_t>(iBufOffset, nXSpanSize);
581 2832410 : nLBlockX++;
582 2832410 : iSrcX += nXSpan;
583 :
584 2832410 : poBlock->DropLock();
585 2832410 : poBlock = nullptr;
586 : }
587 :
588 : /* Compute the increment to go on a block boundary */
589 2554000 : const int nYInc = nBlockYSize - (iSrcY % nBlockYSize);
590 :
591 2555880 : if (psExtraArg->pfnProgress != nullptr &&
592 1889 : !psExtraArg->pfnProgress(
593 2555880 : 1.0 * std::min(nBufYSize, iBufYOff + nYInc) / nBufYSize, "",
594 : psExtraArg->pProgressData))
595 : {
596 0 : return CE_Failure;
597 : }
598 :
599 2554000 : iBufYOff += nYInc;
600 2554000 : if (iBufYOff >= nBufYSize)
601 2512920 : break;
602 : // Only increment iSrcY after above loop end check, to avoid
603 : // potential int overflow.
604 41079 : iSrcY += nYInc;
605 41079 : }
606 :
607 2512920 : return CE_None;
608 : }
609 :
610 : /* ==================================================================== */
611 : /* Loop reading required source blocks to satisfy output */
612 : /* request. This is the most general implementation. */
613 : /* ==================================================================== */
614 :
615 579452 : double dfXOff = nXOff;
616 579452 : double dfYOff = nYOff;
617 579452 : double dfXSize = nXSize;
618 579452 : double dfYSize = nYSize;
619 579452 : if (psExtraArg->bFloatingPointWindowValidity)
620 : {
621 244495 : dfXOff = psExtraArg->dfXOff;
622 244495 : dfYOff = psExtraArg->dfYOff;
623 244495 : dfXSize = psExtraArg->dfXSize;
624 244495 : dfYSize = psExtraArg->dfYSize;
625 : }
626 :
627 : /* -------------------------------------------------------------------- */
628 : /* Compute stepping increment. */
629 : /* -------------------------------------------------------------------- */
630 579452 : const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
631 579452 : const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
632 579452 : CPLErr eErr = CE_None;
633 :
634 579452 : if (eRWFlag == GF_Write)
635 : {
636 : /* --------------------------------------------------------------------
637 : */
638 : /* Write case */
639 : /* Loop over raster window computing source locations in the buffer.
640 : */
641 : /* --------------------------------------------------------------------
642 : */
643 166655 : GByte *pabyDstBlock = nullptr;
644 166655 : int nLBlockX = -1;
645 166655 : int nLBlockY = -1;
646 :
647 1260010 : for (int iDstY = nYOff; iDstY < nYOff + nYSize; iDstY++)
648 : {
649 1093360 : const int iBufYOff = static_cast<int>((iDstY - nYOff) / dfSrcYInc);
650 :
651 12384200 : for (int iDstX = nXOff; iDstX < nXOff + nXSize; iDstX++)
652 : {
653 11290800 : const int iBufXOff =
654 11290800 : static_cast<int>((iDstX - nXOff) / dfSrcXInc);
655 11290800 : GPtrDiff_t iBufOffset =
656 11290800 : static_cast<GPtrDiff_t>(iBufYOff) *
657 : static_cast<GPtrDiff_t>(nLineSpace) +
658 11290800 : iBufXOff * static_cast<GPtrDiff_t>(nPixelSpace);
659 :
660 : // FIXME: this code likely doesn't work if the dirty block gets
661 : // flushed to disk before being completely written.
662 : // In the meantime, bJustInitialize should probably be set to
663 : // FALSE even if it is not ideal performance wise, and for
664 : // lossy compression.
665 :
666 : /* --------------------------------------------------------------------
667 : */
668 : /* Ensure we have the appropriate block loaded. */
669 : /* --------------------------------------------------------------------
670 : */
671 11290800 : if (iDstX < nLBlockX * nBlockXSize ||
672 11041500 : iDstX - nBlockXSize >= nLBlockX * nBlockXSize ||
673 10584800 : iDstY < nLBlockY * nBlockYSize ||
674 10584800 : iDstY - nBlockYSize >= nLBlockY * nBlockYSize)
675 : {
676 738702 : nLBlockX = iDstX / nBlockXSize;
677 738702 : nLBlockY = iDstY / nBlockYSize;
678 :
679 738702 : const bool bJustInitialize =
680 1065990 : nYOff <= nLBlockY * nBlockYSize &&
681 327291 : nYOff + nYSize - nBlockYSize >=
682 327291 : nLBlockY * nBlockYSize &&
683 1116320 : nXOff <= nLBlockX * nBlockXSize &&
684 50325 : nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
685 : /*bool bMemZeroBuffer = FALSE;
686 : if( !bJustInitialize &&
687 : nXOff <= nLBlockX * nBlockXSize &&
688 : nYOff <= nLBlockY * nBlockYSize &&
689 : (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
690 : (nXOff + nXSize == GetXSize() &&
691 : (nLBlockX+1) * nBlockXSize > GetXSize())) &&
692 : (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
693 : (nYOff + nYSize == GetYSize() &&
694 : (nLBlockY+1) * nBlockYSize > GetYSize())) )
695 : {
696 : bJustInitialize = TRUE;
697 : bMemZeroBuffer = TRUE;
698 : }*/
699 738702 : if (poBlock != nullptr)
700 572047 : poBlock->DropLock();
701 :
702 738702 : poBlock =
703 738702 : GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
704 738702 : if (poBlock == nullptr)
705 : {
706 0 : return (CE_Failure);
707 : }
708 :
709 738702 : poBlock->MarkDirty();
710 :
711 738702 : pabyDstBlock = static_cast<GByte *>(poBlock->GetDataRef());
712 : /*if( bMemZeroBuffer )
713 : {
714 : memset(pabyDstBlock, 0,
715 : static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
716 : * nBlockYSize);
717 : }*/
718 : }
719 :
720 : // To make Coverity happy. Should not happen by design.
721 11290800 : if (pabyDstBlock == nullptr)
722 : {
723 0 : CPLAssert(false);
724 : eErr = CE_Failure;
725 : break;
726 : }
727 :
728 : /* --------------------------------------------------------------------
729 : */
730 : /* Copy over this pixel of data. */
731 : /* --------------------------------------------------------------------
732 : */
733 11290800 : GPtrDiff_t iDstOffset =
734 11290800 : (static_cast<GPtrDiff_t>(iDstX) -
735 11290800 : static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
736 11290800 : (static_cast<GPtrDiff_t>(iDstY) -
737 11290800 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
738 11290800 : nBlockXSize) *
739 11290800 : nBandDataSize;
740 :
741 11290800 : if (eDataType == eBufType)
742 : {
743 11287700 : memcpy(pabyDstBlock + iDstOffset,
744 11287700 : static_cast<GByte *>(pData) + iBufOffset,
745 : nBandDataSize);
746 : }
747 : else
748 : {
749 : /* type to type conversion ... ouch, this is expensive way
750 : of handling single words */
751 3096 : GDALCopyWords64(static_cast<GByte *>(pData) + iBufOffset,
752 3096 : eBufType, 0, pabyDstBlock + iDstOffset,
753 : eDataType, 0, 1);
754 : }
755 : }
756 :
757 1093360 : if (psExtraArg->pfnProgress != nullptr &&
758 0 : !psExtraArg->pfnProgress(1.0 * (iDstY - nYOff + 1) / nYSize, "",
759 : psExtraArg->pProgressData))
760 : {
761 0 : eErr = CE_Failure;
762 0 : break;
763 : }
764 : }
765 : }
766 : else
767 : {
768 412797 : if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
769 : {
770 46692 : if ((psExtraArg->eResampleAlg == GRIORA_Cubic ||
771 15098 : psExtraArg->eResampleAlg == GRIORA_CubicSpline ||
772 15045 : psExtraArg->eResampleAlg == GRIORA_Bilinear ||
773 31641 : psExtraArg->eResampleAlg == GRIORA_Lanczos) &&
774 4763 : GetColorTable() != nullptr)
775 : {
776 0 : CPLError(CE_Warning, CPLE_NotSupported,
777 : "Resampling method not supported on paletted band. "
778 : "Falling back to nearest neighbour");
779 : }
780 15800 : else if (psExtraArg->eResampleAlg == GRIORA_Gauss &&
781 3 : GDALDataTypeIsComplex(eDataType))
782 : {
783 0 : CPLError(CE_Warning, CPLE_NotSupported,
784 : "Resampling method not supported on complex data type "
785 : "band. Falling back to nearest neighbour");
786 : }
787 : else
788 : {
789 15797 : return RasterIOResampled(eRWFlag, nXOff, nYOff, nXSize, nYSize,
790 : pData, nBufXSize, nBufYSize, eBufType,
791 15797 : nPixelSpace, nLineSpace, psExtraArg);
792 : }
793 : }
794 :
795 397000 : int nLimitBlockY = 0;
796 397000 : const bool bByteCopy = eDataType == eBufType && nBandDataSize == 1;
797 397000 : int nStartBlockX = -nBlockXSize;
798 397000 : constexpr double EPS = 1e-10;
799 397000 : int nLBlockY = -1;
800 397000 : const double dfSrcXStart = 0.5 * dfSrcXInc + dfXOff + EPS;
801 397000 : const bool bIntegerXFactor =
802 372767 : bUseIntegerRequestCoords &&
803 670836 : static_cast<int>(dfSrcXInc) == dfSrcXInc &&
804 273836 : static_cast<int>(dfSrcXInc) < INT_MAX / nBandDataSize;
805 :
806 : /* --------------------------------------------------------------------
807 : */
808 : /* Read case */
809 : /* Loop over buffer computing source locations. */
810 : /* --------------------------------------------------------------------
811 : */
812 2367100 : for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
813 : {
814 : // Add small epsilon to avoid some numeric precision issues.
815 1970110 : const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
816 1970110 : const int iSrcY = static_cast<int>(std::min(
817 1970110 : std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
818 :
819 1970110 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
820 : static_cast<GPtrDiff_t>(nLineSpace);
821 :
822 1970110 : if (iSrcY >= nLimitBlockY)
823 : {
824 438018 : nLBlockY = iSrcY / nBlockYSize;
825 438018 : nLimitBlockY = nLBlockY * nBlockYSize;
826 438018 : if (nLimitBlockY < INT_MAX - nBlockYSize)
827 438018 : nLimitBlockY += nBlockYSize;
828 : else
829 0 : nLimitBlockY = INT_MAX;
830 : // Make sure a new block is loaded.
831 438018 : nStartBlockX = -nBlockXSize;
832 : }
833 1532090 : else if (static_cast<int>(dfSrcXStart) < nStartBlockX)
834 : {
835 : // Make sure a new block is loaded.
836 437363 : nStartBlockX = -nBlockXSize;
837 : }
838 :
839 1970110 : GPtrDiff_t iSrcOffsetCst = (iSrcY - nLBlockY * nBlockYSize) *
840 1970110 : static_cast<GPtrDiff_t>(nBlockXSize);
841 :
842 1970110 : if (bIntegerXFactor)
843 : {
844 695850 : int iSrcX = static_cast<int>(dfSrcXStart);
845 695850 : const int nSrcXInc = static_cast<int>(dfSrcXInc);
846 695850 : GByte *pabyDstData = static_cast<GByte *>(pData) + iBufOffset;
847 695850 : bool bRet = false;
848 695850 : if (bByteCopy)
849 : {
850 585842 : bRet = DownsamplingIntegerXFactor<true, 1>(
851 : this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
852 : static_cast<int>(nPixelSpace), nBufXSize, GDT_UInt8,
853 : GDT_UInt8, nStartBlockX, nBlockXSize, poBlock,
854 : nLBlockY);
855 : }
856 110008 : else if (eDataType == eBufType)
857 : {
858 109783 : switch (nBandDataSize)
859 : {
860 109630 : case 2:
861 109630 : bRet = DownsamplingIntegerXFactor<true, 2>(
862 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
863 : pabyDstData, static_cast<int>(nPixelSpace),
864 : nBufXSize, eDataType, eDataType, nStartBlockX,
865 : nBlockXSize, poBlock, nLBlockY);
866 109630 : break;
867 55 : case 4:
868 55 : bRet = DownsamplingIntegerXFactor<true, 4>(
869 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
870 : pabyDstData, static_cast<int>(nPixelSpace),
871 : nBufXSize, eDataType, eDataType, nStartBlockX,
872 : nBlockXSize, poBlock, nLBlockY);
873 55 : break;
874 96 : case 8:
875 96 : bRet = DownsamplingIntegerXFactor<true, 8>(
876 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
877 : pabyDstData, static_cast<int>(nPixelSpace),
878 : nBufXSize, eDataType, eDataType, nStartBlockX,
879 : nBlockXSize, poBlock, nLBlockY);
880 96 : break;
881 2 : case 16:
882 2 : bRet = DownsamplingIntegerXFactor<true, 16>(
883 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
884 : pabyDstData, static_cast<int>(nPixelSpace),
885 : nBufXSize, eDataType, eDataType, nStartBlockX,
886 : nBlockXSize, poBlock, nLBlockY);
887 2 : break;
888 0 : default:
889 0 : CPLAssert(false);
890 : break;
891 : }
892 : }
893 : else
894 : {
895 225 : bRet = DownsamplingIntegerXFactor<false, 0>(
896 : this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
897 : static_cast<int>(nPixelSpace), nBufXSize, eDataType,
898 : eBufType, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
899 : }
900 695850 : if (!bRet)
901 1 : eErr = CE_Failure;
902 : }
903 : else
904 : {
905 1274260 : double dfSrcX = dfSrcXStart;
906 503811000 : for (int iBufXOff = 0; iBufXOff < nBufXSize;
907 502537000 : iBufXOff++, dfSrcX += dfSrcXInc)
908 : {
909 : // TODO?: try to avoid the clamping for most iterations
910 : const int iSrcX = static_cast<int>(
911 1005070000 : std::min(std::max(0.0, dfSrcX),
912 502537000 : static_cast<double>(nRasterXSize - 1)));
913 :
914 : /* --------------------------------------------------------------------
915 : */
916 : /* Ensure we have the appropriate block loaded. */
917 : /* --------------------------------------------------------------------
918 : */
919 502537000 : if (iSrcX >= nBlockXSize + nStartBlockX)
920 : {
921 1697820 : const int nLBlockX = iSrcX / nBlockXSize;
922 1697820 : nStartBlockX = nLBlockX * nBlockXSize;
923 :
924 1697820 : if (poBlock != nullptr)
925 1574650 : poBlock->DropLock();
926 :
927 1697820 : poBlock = GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
928 1697820 : if (poBlock == nullptr)
929 : {
930 9 : eErr = CE_Failure;
931 9 : break;
932 : }
933 :
934 : pabySrcBlock =
935 1697810 : static_cast<GByte *>(poBlock->GetDataRef());
936 : }
937 502537000 : const GPtrDiff_t nDiffX =
938 502537000 : static_cast<GPtrDiff_t>(iSrcX - nStartBlockX);
939 :
940 : /* --------------------------------------------------------------------
941 : */
942 : /* Copy over this pixel of data. */
943 : /* --------------------------------------------------------------------
944 : */
945 :
946 502537000 : if (bByteCopy)
947 : {
948 442592000 : GPtrDiff_t iSrcOffset = nDiffX + iSrcOffsetCst;
949 442592000 : static_cast<GByte *>(pData)[iBufOffset] =
950 442592000 : pabySrcBlock[iSrcOffset];
951 : }
952 59944700 : else if (eDataType == eBufType)
953 : {
954 50322800 : GPtrDiff_t iSrcOffset =
955 50322800 : (nDiffX + iSrcOffsetCst) * nBandDataSize;
956 50322800 : memcpy(static_cast<GByte *>(pData) + iBufOffset,
957 50322800 : pabySrcBlock + iSrcOffset, nBandDataSize);
958 : }
959 : else
960 : {
961 : // Type to type conversion ...
962 9621890 : GPtrDiff_t iSrcOffset =
963 9621890 : (nDiffX + iSrcOffsetCst) * nBandDataSize;
964 9621890 : GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
965 : static_cast<GByte *>(pData) +
966 9621890 : iBufOffset,
967 : eBufType, 0, 1);
968 : }
969 :
970 502537000 : iBufOffset += static_cast<int>(nPixelSpace);
971 : }
972 : }
973 1970110 : if (eErr == CE_Failure)
974 11 : break;
975 :
976 2191530 : if (psExtraArg->pfnProgress != nullptr &&
977 221434 : !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
978 : psExtraArg->pProgressData))
979 : {
980 1 : eErr = CE_Failure;
981 1 : break;
982 : }
983 : }
984 : }
985 :
986 563655 : if (poBlock != nullptr)
987 563645 : poBlock->DropLock();
988 :
989 563655 : return eErr;
990 : }
991 :
992 : /************************************************************************/
993 : /* GDALRasterIOTransformer() */
994 : /************************************************************************/
995 :
/** Parameters of the axis-aligned affine mapping used by
 * GDALRasterIOTransformer().
 *
 * Forward (destination -> source) direction:
 *   src = dst * df?RatioDstToSrc + df?Off
 *
 * The in-class defaults describe the identity mapping, so that a
 * default-constructed instance is well defined and the inverse direction
 * never divides by an indeterminate ratio.
 */
struct GDALRasterIOTransformerStruct
{
    //! Offset added along X after scaling (forward direction).
    double dfXOff = 0.0;
    //! Offset added along Y after scaling (forward direction).
    double dfYOff = 0.0;
    //! Scale factor applied along X (forward direction). Must be non-zero
    //! for the inverse direction to be usable.
    double dfXRatioDstToSrc = 1.0;
    //! Scale factor applied along Y (forward direction). Must be non-zero
    //! for the inverse direction to be usable.
    double dfYRatioDstToSrc = 1.0;
};
1003 :
1004 6897 : static int GDALRasterIOTransformer(void *pTransformerArg, int bDstToSrc,
1005 : int nPointCount, double *x, double *y,
1006 : double * /* z */, int *panSuccess)
1007 : {
1008 6897 : GDALRasterIOTransformerStruct *psParams =
1009 : static_cast<GDALRasterIOTransformerStruct *>(pTransformerArg);
1010 6897 : if (bDstToSrc)
1011 : {
1012 311993 : for (int i = 0; i < nPointCount; i++)
1013 : {
1014 305684 : x[i] = x[i] * psParams->dfXRatioDstToSrc + psParams->dfXOff;
1015 305684 : y[i] = y[i] * psParams->dfYRatioDstToSrc + psParams->dfYOff;
1016 305684 : panSuccess[i] = TRUE;
1017 : }
1018 : }
1019 : else
1020 : {
1021 1176 : for (int i = 0; i < nPointCount; i++)
1022 : {
1023 588 : x[i] = (x[i] - psParams->dfXOff) / psParams->dfXRatioDstToSrc;
1024 588 : y[i] = (y[i] - psParams->dfYOff) / psParams->dfYRatioDstToSrc;
1025 588 : panSuccess[i] = TRUE;
1026 : }
1027 : }
1028 6897 : return TRUE;
1029 : }
1030 :
1031 : /************************************************************************/
1032 : /* RasterIOResampled() */
1033 : /************************************************************************/
1034 :
1035 : //! @cond Doxygen_Suppress
1036 15797 : CPLErr GDALRasterBand::RasterIOResampled(
1037 : GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
1038 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
1039 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
1040 : {
1041 : // Determine if we use warping resampling or overview resampling
1042 : const bool bUseWarp =
1043 15797 : (GDALDataTypeIsComplex(eDataType) &&
1044 15956 : psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
1045 159 : psExtraArg->eResampleAlg != GRIORA_Mode);
1046 :
1047 15797 : double dfXOff = nXOff;
1048 15797 : double dfYOff = nYOff;
1049 15797 : double dfXSize = nXSize;
1050 15797 : double dfYSize = nYSize;
1051 15797 : if (psExtraArg->bFloatingPointWindowValidity)
1052 : {
1053 15051 : dfXOff = psExtraArg->dfXOff;
1054 15051 : dfYOff = psExtraArg->dfYOff;
1055 15051 : dfXSize = psExtraArg->dfXSize;
1056 15051 : dfYSize = psExtraArg->dfYSize;
1057 : }
1058 :
1059 15797 : const double dfXRatioDstToSrc = dfXSize / nBufXSize;
1060 15797 : const double dfYRatioDstToSrc = dfYSize / nBufYSize;
1061 :
1062 : // Determine the coordinates in the "virtual" output raster to see
1063 : // if there are not integers, in which case we will use them as a shift
1064 : // so that subwindow extracts give the exact same results as entire raster
1065 : // scaling.
1066 15797 : double dfDestXOff = dfXOff / dfXRatioDstToSrc;
1067 15797 : bool bHasXOffVirtual = false;
1068 15797 : int nDestXOffVirtual = 0;
1069 15797 : if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
1070 : {
1071 15469 : bHasXOffVirtual = true;
1072 15469 : dfXOff = nXOff;
1073 15469 : nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
1074 : }
1075 :
1076 15797 : double dfDestYOff = dfYOff / dfYRatioDstToSrc;
1077 15797 : bool bHasYOffVirtual = false;
1078 15797 : int nDestYOffVirtual = 0;
1079 15797 : if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
1080 : {
1081 15465 : bHasYOffVirtual = true;
1082 15465 : dfYOff = nYOff;
1083 15465 : nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
1084 : }
1085 :
1086 : // Create a MEM dataset that wraps the output buffer.
1087 : GDALDataset *poMEMDS;
1088 15797 : void *pTempBuffer = nullptr;
1089 15797 : GSpacing nPSMem = nPixelSpace;
1090 15797 : GSpacing nLSMem = nLineSpace;
1091 15797 : void *pDataMem = pData;
1092 15797 : GDALDataType eDTMem = eBufType;
1093 15797 : if (eBufType != eDataType && !GDAL_GET_OPERATE_IN_BUF_TYPE(*psExtraArg))
1094 : {
1095 4 : nPSMem = GDALGetDataTypeSizeBytes(eDataType);
1096 4 : nLSMem = nPSMem * nBufXSize;
1097 : pTempBuffer =
1098 4 : VSI_MALLOC2_VERBOSE(nBufYSize, static_cast<size_t>(nLSMem));
1099 4 : if (pTempBuffer == nullptr)
1100 0 : return CE_Failure;
1101 4 : pDataMem = pTempBuffer;
1102 4 : eDTMem = eDataType;
1103 : }
1104 :
1105 : poMEMDS =
1106 15797 : MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
1107 : nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr);
1108 15797 : GByte *pabyData = static_cast<GByte *>(pDataMem) -
1109 15797 : nPSMem * nDestXOffVirtual - nLSMem * nDestYOffVirtual;
1110 15797 : GDALRasterBandH hMEMBand = MEMCreateRasterBandEx(
1111 : poMEMDS, 1, pabyData, eDTMem, nPSMem, nLSMem, false);
1112 15797 : poMEMDS->SetBand(1, GDALRasterBand::FromHandle(hMEMBand));
1113 :
1114 : const char *pszNBITS =
1115 15797 : GetMetadataItem(GDALMD_NBITS, GDAL_MDD_IMAGE_STRUCTURE);
1116 15797 : const int nNBITS = pszNBITS ? atoi(pszNBITS) : 0;
1117 15797 : if (pszNBITS)
1118 6 : GDALRasterBand::FromHandle(hMEMBand)->SetMetadataItem(
1119 6 : GDALMD_NBITS, pszNBITS, GDAL_MDD_IMAGE_STRUCTURE);
1120 :
1121 15797 : CPLErr eErr = CE_None;
1122 :
1123 : // Do the resampling.
1124 15797 : if (bUseWarp)
1125 : {
1126 149 : int bHasNoData = FALSE;
1127 149 : double dfNoDataValue = GetNoDataValue(&bHasNoData);
1128 :
1129 149 : VRTDatasetH hVRTDS = nullptr;
1130 149 : GDALRasterBandH hVRTBand = nullptr;
1131 149 : if (GetDataset() == nullptr)
1132 : {
1133 : /* Create VRT dataset that wraps the whole dataset */
1134 0 : hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
1135 0 : VRTAddBand(hVRTDS, eDataType, nullptr);
1136 0 : hVRTBand = GDALGetRasterBand(hVRTDS, 1);
1137 0 : VRTAddSimpleSource(hVRTBand, this, 0, 0, nRasterXSize, nRasterYSize,
1138 : 0, 0, nRasterXSize, nRasterYSize, nullptr,
1139 : VRT_NODATA_UNSET);
1140 :
1141 : /* Add a mask band if needed */
1142 0 : if (GetMaskFlags() != GMF_ALL_VALID)
1143 : {
1144 0 : GDALDataset::FromHandle(hVRTDS)->CreateMaskBand(0);
1145 : VRTSourcedRasterBand *poVRTMaskBand =
1146 : reinterpret_cast<VRTSourcedRasterBand *>(
1147 : reinterpret_cast<GDALRasterBand *>(hVRTBand)
1148 0 : ->GetMaskBand());
1149 0 : poVRTMaskBand->AddMaskBandSource(this, 0, 0, nRasterXSize,
1150 0 : nRasterYSize, 0, 0,
1151 0 : nRasterXSize, nRasterYSize);
1152 : }
1153 : }
1154 :
1155 149 : GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
1156 149 : switch (psExtraArg->eResampleAlg)
1157 : {
1158 0 : case GRIORA_NearestNeighbour:
1159 0 : psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
1160 0 : break;
1161 147 : case GRIORA_Bilinear:
1162 147 : psWarpOptions->eResampleAlg = GRA_Bilinear;
1163 147 : break;
1164 0 : case GRIORA_Cubic:
1165 0 : psWarpOptions->eResampleAlg = GRA_Cubic;
1166 0 : break;
1167 0 : case GRIORA_CubicSpline:
1168 0 : psWarpOptions->eResampleAlg = GRA_CubicSpline;
1169 0 : break;
1170 0 : case GRIORA_Lanczos:
1171 0 : psWarpOptions->eResampleAlg = GRA_Lanczos;
1172 0 : break;
1173 0 : case GRIORA_Average:
1174 0 : psWarpOptions->eResampleAlg = GRA_Average;
1175 0 : break;
1176 2 : case GRIORA_RMS:
1177 2 : psWarpOptions->eResampleAlg = GRA_RMS;
1178 2 : break;
1179 0 : case GRIORA_Mode:
1180 0 : psWarpOptions->eResampleAlg = GRA_Mode;
1181 0 : break;
1182 0 : default:
1183 0 : CPLAssert(false);
1184 : psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
1185 : break;
1186 : }
1187 149 : psWarpOptions->hSrcDS = hVRTDS ? hVRTDS : GetDataset();
1188 149 : psWarpOptions->hDstDS = poMEMDS;
1189 149 : psWarpOptions->nBandCount = 1;
1190 149 : int nSrcBandNumber = hVRTDS ? 1 : nBand;
1191 149 : int nDstBandNumber = 1;
1192 149 : psWarpOptions->panSrcBands = &nSrcBandNumber;
1193 149 : psWarpOptions->panDstBands = &nDstBandNumber;
1194 298 : psWarpOptions->pfnProgress = psExtraArg->pfnProgress
1195 149 : ? psExtraArg->pfnProgress
1196 : : GDALDummyProgress;
1197 149 : psWarpOptions->pProgressArg = psExtraArg->pProgressData;
1198 149 : psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
1199 149 : if (bHasNoData)
1200 : {
1201 0 : psWarpOptions->papszWarpOptions = CSLSetNameValue(
1202 : psWarpOptions->papszWarpOptions, "INIT_DEST", "NO_DATA");
1203 0 : if (psWarpOptions->padfSrcNoDataReal == nullptr)
1204 : {
1205 0 : psWarpOptions->padfSrcNoDataReal =
1206 0 : static_cast<double *>(CPLMalloc(sizeof(double)));
1207 0 : psWarpOptions->padfSrcNoDataReal[0] = dfNoDataValue;
1208 : }
1209 :
1210 0 : if (psWarpOptions->padfDstNoDataReal == nullptr)
1211 : {
1212 0 : psWarpOptions->padfDstNoDataReal =
1213 0 : static_cast<double *>(CPLMalloc(sizeof(double)));
1214 0 : psWarpOptions->padfDstNoDataReal[0] = dfNoDataValue;
1215 : }
1216 : }
1217 :
1218 : GDALRasterIOTransformerStruct sTransformer;
1219 149 : sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
1220 149 : sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
1221 149 : sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
1222 149 : sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
1223 149 : psWarpOptions->pTransformerArg = &sTransformer;
1224 :
1225 : GDALWarpOperationH hWarpOperation =
1226 149 : GDALCreateWarpOperation(psWarpOptions);
1227 149 : eErr = GDALChunkAndWarpImage(hWarpOperation, nDestXOffVirtual,
1228 : nDestYOffVirtual, nBufXSize, nBufYSize);
1229 149 : GDALDestroyWarpOperation(hWarpOperation);
1230 :
1231 149 : psWarpOptions->panSrcBands = nullptr;
1232 149 : psWarpOptions->panDstBands = nullptr;
1233 149 : GDALDestroyWarpOptions(psWarpOptions);
1234 :
1235 149 : if (hVRTDS)
1236 0 : GDALClose(hVRTDS);
1237 : }
1238 : else
1239 : {
1240 : const char *pszResampling =
1241 15648 : GDALRasterIOGetResampleAlg(psExtraArg->eResampleAlg);
1242 15648 : int nKernelRadius = 0;
1243 : GDALResampleFunction pfnResampleFunc =
1244 15648 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
1245 15648 : CPLAssert(pfnResampleFunc);
1246 : GDALDataType eWrkDataType =
1247 15648 : GDALGetOvrWorkDataType(pszResampling, eDataType);
1248 15648 : int nHasNoData = 0;
1249 15648 : double dfNoDataValue = GetNoDataValue(&nHasNoData);
1250 15648 : const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
1251 15648 : if (!bHasNoData)
1252 15516 : dfNoDataValue = 0.0;
1253 :
1254 15648 : int nDstBlockXSize = nBufXSize;
1255 15648 : int nDstBlockYSize = nBufYSize;
1256 15648 : int nFullResXChunk = 0;
1257 15648 : int nFullResYChunk = 0;
1258 : while (true)
1259 : {
1260 15659 : nFullResXChunk = static_cast<int>(std::min<double>(
1261 15659 : 3 + nDstBlockXSize * dfXRatioDstToSrc, nRasterXSize));
1262 15659 : nFullResYChunk = static_cast<int>(std::min<double>(
1263 15659 : 3 + nDstBlockYSize * dfYRatioDstToSrc, nRasterYSize));
1264 15659 : if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
1265 15601 : (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
1266 : 1024 * 1024))
1267 : break;
1268 : // When operating on the full width of a raster whose block width is
1269 : // the raster width, prefer doing chunks in height.
1270 11 : if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
1271 : nDstBlockYSize > 1)
1272 0 : nDstBlockYSize /= 2;
1273 : /* Otherwise cut the maximal dimension */
1274 11 : else if (nDstBlockXSize > 1 &&
1275 0 : (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
1276 11 : nDstBlockXSize /= 2;
1277 : else
1278 0 : nDstBlockYSize /= 2;
1279 : }
1280 :
1281 : const int nOvrXFactor =
1282 15648 : std::max(1, static_cast<int>(0.5 + dfXRatioDstToSrc));
1283 : const int nOvrYFactor =
1284 15648 : std::max(1, static_cast<int>(0.5 + dfYRatioDstToSrc));
1285 : const int nFullResXSizeQueried = static_cast<int>(
1286 31296 : std::min<int64_t>(nFullResXChunk + static_cast<int64_t>(2) *
1287 15648 : nKernelRadius * nOvrXFactor,
1288 15648 : nRasterXSize));
1289 : const int nFullResYSizeQueried = static_cast<int>(
1290 31296 : std::min<int64_t>(nFullResYChunk + static_cast<int64_t>(2) *
1291 15648 : nKernelRadius * nOvrYFactor,
1292 15648 : nRasterYSize));
1293 :
1294 : void *pChunk =
1295 15648 : VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
1296 : nFullResXSizeQueried, nFullResYSizeQueried);
1297 15648 : GByte *pabyChunkNoDataMask = nullptr;
1298 :
1299 15648 : GDALRasterBand *poMaskBand = GetMaskBand();
1300 15648 : int l_nMaskFlags = GetMaskFlags();
1301 :
1302 15648 : bool bUseNoDataMask = ((l_nMaskFlags & GMF_ALL_VALID) == 0);
1303 15648 : if (bUseNoDataMask)
1304 : {
1305 7525 : pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
1306 : nFullResXSizeQueried, nFullResYSizeQueried));
1307 : }
1308 15648 : if (pChunk == nullptr ||
1309 7525 : (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
1310 : {
1311 0 : GDALClose(poMEMDS);
1312 0 : CPLFree(pChunk);
1313 0 : CPLFree(pabyChunkNoDataMask);
1314 0 : VSIFree(pTempBuffer);
1315 0 : return CE_Failure;
1316 : }
1317 :
1318 : const int64_t nTotalBlocks =
1319 15648 : static_cast<int64_t>(cpl::div_round_up(nBufXSize, nDstBlockXSize)) *
1320 15648 : cpl::div_round_up(nBufYSize, nDstBlockYSize);
1321 15648 : int64_t nBlocksDone = 0;
1322 :
1323 31296 : for (int nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
1324 15648 : nDstYOff += nDstBlockYSize)
1325 : {
1326 : int nDstYCount;
1327 15648 : if (nDstYOff + nDstBlockYSize <= nBufYSize)
1328 15648 : nDstYCount = nDstBlockYSize;
1329 : else
1330 0 : nDstYCount = nBufYSize - nDstYOff;
1331 :
1332 15648 : int nChunkYOff =
1333 15648 : nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
1334 15648 : int nChunkYOff2 = nYOff + 1 +
1335 15648 : static_cast<int>(ceil((nDstYOff + nDstYCount) *
1336 : dfYRatioDstToSrc));
1337 15648 : if (nChunkYOff2 > nRasterYSize)
1338 789 : nChunkYOff2 = nRasterYSize;
1339 15648 : int nYCount = nChunkYOff2 - nChunkYOff;
1340 15648 : CPLAssert(nYCount <= nFullResYChunk);
1341 :
1342 15648 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrYFactor;
1343 15648 : int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrYFactor;
1344 15648 : if (nChunkYOffQueried < 0)
1345 : {
1346 498 : nChunkYSizeQueried += nChunkYOffQueried;
1347 498 : nChunkYOffQueried = 0;
1348 : }
1349 15648 : if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
1350 607 : nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
1351 15648 : CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
1352 :
1353 15648 : int nDstXOff = 0;
1354 31296 : for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
1355 15648 : nDstXOff += nDstBlockXSize)
1356 : {
1357 15648 : int nDstXCount = 0;
1358 15648 : if (nDstXOff + nDstBlockXSize <= nBufXSize)
1359 15648 : nDstXCount = nDstBlockXSize;
1360 : else
1361 0 : nDstXCount = nBufXSize - nDstXOff;
1362 :
1363 15648 : int nChunkXOff =
1364 15648 : nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
1365 15648 : int nChunkXOff2 =
1366 15648 : nXOff + 1 +
1367 15648 : static_cast<int>(
1368 15648 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
1369 15648 : if (nChunkXOff2 > nRasterXSize)
1370 9827 : nChunkXOff2 = nRasterXSize;
1371 15648 : int nXCount = nChunkXOff2 - nChunkXOff;
1372 15648 : CPLAssert(nXCount <= nFullResXChunk);
1373 :
1374 15648 : int nChunkXOffQueried =
1375 15648 : nChunkXOff - nKernelRadius * nOvrXFactor;
1376 15648 : int nChunkXSizeQueried =
1377 15648 : nXCount + 2 * nKernelRadius * nOvrXFactor;
1378 15648 : if (nChunkXOffQueried < 0)
1379 : {
1380 3310 : nChunkXSizeQueried += nChunkXOffQueried;
1381 3310 : nChunkXOffQueried = 0;
1382 : }
1383 15648 : if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
1384 3806 : nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
1385 15648 : CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
1386 :
1387 : // Read the source buffers.
1388 15648 : eErr = RasterIO(GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1389 : nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
1390 : nChunkXSizeQueried, nChunkYSizeQueried,
1391 : eWrkDataType, 0, 0, nullptr);
1392 :
1393 15648 : bool bSkipResample = false;
1394 15648 : bool bNoDataMaskFullyOpaque = false;
1395 15648 : if (eErr == CE_None && bUseNoDataMask)
1396 : {
1397 7525 : eErr = poMaskBand->RasterIO(
1398 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1399 : nChunkXSizeQueried, nChunkYSizeQueried,
1400 : pabyChunkNoDataMask, nChunkXSizeQueried,
1401 : nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
1402 :
1403 : /* Optimizations if mask if fully opaque or transparent */
1404 7525 : int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
1405 7525 : GByte bVal = pabyChunkNoDataMask[0];
1406 7525 : int i = 1;
1407 15237000 : for (; i < nPixels; i++)
1408 : {
1409 15230700 : if (pabyChunkNoDataMask[i] != bVal)
1410 1168 : break;
1411 : }
1412 7525 : if (i == nPixels)
1413 : {
1414 6357 : if (bVal == 0)
1415 : {
1416 12094 : for (int j = 0; j < nDstYCount; j++)
1417 : {
1418 6377 : GDALCopyWords64(&dfNoDataValue, GDT_Float64, 0,
1419 : static_cast<GByte *>(pDataMem) +
1420 6377 : nLSMem * (j + nDstYOff) +
1421 6377 : nDstXOff * nPSMem,
1422 : eDTMem,
1423 : static_cast<int>(nPSMem),
1424 : nDstXCount);
1425 : }
1426 5717 : bSkipResample = true;
1427 : }
1428 : else
1429 : {
1430 640 : bNoDataMaskFullyOpaque = true;
1431 : }
1432 : }
1433 : }
1434 :
1435 15648 : if (!bSkipResample && eErr == CE_None)
1436 : {
1437 9928 : const bool bPropagateNoData = false;
1438 9928 : void *pDstBuffer = nullptr;
1439 9928 : GDALDataType eDstBufferDataType = GDT_Unknown;
1440 : GDALRasterBand *poMEMBand =
1441 9928 : GDALRasterBand::FromHandle(hMEMBand);
1442 9928 : GDALOverviewResampleArgs args;
1443 9928 : args.eSrcDataType = eDataType;
1444 9928 : args.eOvrDataType = poMEMBand->GetRasterDataType();
1445 9928 : args.nOvrXSize = poMEMBand->GetXSize();
1446 9928 : args.nOvrYSize = poMEMBand->GetYSize();
1447 9928 : args.nOvrNBITS = nNBITS;
1448 9928 : args.dfXRatioDstToSrc = dfXRatioDstToSrc;
1449 9928 : args.dfYRatioDstToSrc = dfYRatioDstToSrc;
1450 9928 : args.dfSrcXDelta =
1451 9928 : dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
1452 9928 : args.dfSrcYDelta =
1453 9928 : dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
1454 9928 : args.eWrkDataType = eWrkDataType;
1455 9928 : args.pabyChunkNodataMask =
1456 9928 : bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask;
1457 9928 : args.nChunkXOff =
1458 9928 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
1459 9928 : args.nChunkXSize = nChunkXSizeQueried;
1460 9928 : args.nChunkYOff =
1461 9928 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
1462 9928 : args.nChunkYSize = nChunkYSizeQueried;
1463 9928 : args.nDstXOff = nDstXOff + nDestXOffVirtual;
1464 9928 : args.nDstXOff2 = nDstXOff + nDestXOffVirtual + nDstXCount;
1465 9928 : args.nDstYOff = nDstYOff + nDestYOffVirtual;
1466 9928 : args.nDstYOff2 = nDstYOff + nDestYOffVirtual + nDstYCount;
1467 9928 : args.pszResampling = pszResampling;
1468 9928 : args.bHasNoData = bHasNoData;
1469 9928 : args.dfNoDataValue = dfNoDataValue;
1470 9928 : args.poColorTable = GetColorTable();
1471 9928 : args.bPropagateNoData = bPropagateNoData;
1472 9928 : eErr = pfnResampleFunc(args, pChunk, &pDstBuffer,
1473 : &eDstBufferDataType);
1474 9928 : if (eErr == CE_None)
1475 : {
1476 9928 : eErr = poMEMBand->RasterIO(
1477 : GF_Write, nDstXOff + nDestXOffVirtual,
1478 : nDstYOff + nDestYOffVirtual, nDstXCount, nDstYCount,
1479 : pDstBuffer, nDstXCount, nDstYCount,
1480 : eDstBufferDataType, 0, 0, nullptr);
1481 : }
1482 9928 : CPLFree(pDstBuffer);
1483 : }
1484 :
1485 15648 : nBlocksDone++;
1486 28106 : if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
1487 12458 : !psExtraArg->pfnProgress(
1488 12458 : static_cast<double>(nBlocksDone) /
1489 12458 : static_cast<double>(nTotalBlocks),
1490 : "", psExtraArg->pProgressData))
1491 : {
1492 1 : eErr = CE_Failure;
1493 : }
1494 : }
1495 : }
1496 :
1497 15648 : CPLFree(pChunk);
1498 15648 : CPLFree(pabyChunkNoDataMask);
1499 : }
1500 :
1501 15797 : if (pTempBuffer)
1502 : {
1503 4 : CPL_IGNORE_RET_VAL(poMEMDS->GetRasterBand(1)->RasterIO(
1504 : GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
1505 : pData, nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
1506 : nullptr));
1507 : }
1508 15797 : GDALClose(poMEMDS);
1509 15797 : VSIFree(pTempBuffer);
1510 :
1511 15797 : return eErr;
1512 : }
1513 :
1514 : /************************************************************************/
1515 : /* RasterIOResampled() */
1516 : /************************************************************************/
1517 :
1518 2431 : CPLErr GDALDataset::RasterIOResampled(
1519 : GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
1520 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
1521 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
1522 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
1523 :
1524 : {
1525 : #if 0
1526 : // Determine if we use warping resampling or overview resampling
1527 : bool bUseWarp = false;
1528 : if( GDALDataTypeIsComplex( eDataType ) )
1529 : bUseWarp = true;
1530 : #endif
1531 :
1532 2431 : double dfXOff = nXOff;
1533 2431 : double dfYOff = nYOff;
1534 2431 : double dfXSize = nXSize;
1535 2431 : double dfYSize = nYSize;
1536 2431 : if (psExtraArg->bFloatingPointWindowValidity)
1537 : {
1538 2304 : dfXOff = psExtraArg->dfXOff;
1539 2304 : dfYOff = psExtraArg->dfYOff;
1540 2304 : dfXSize = psExtraArg->dfXSize;
1541 2304 : dfYSize = psExtraArg->dfYSize;
1542 : }
1543 :
1544 2431 : const double dfXRatioDstToSrc = dfXSize / nBufXSize;
1545 2431 : const double dfYRatioDstToSrc = dfYSize / nBufYSize;
1546 :
1547 : // Determine the coordinates in the "virtual" output raster to see
1548 : // if there are not integers, in which case we will use them as a shift
1549 : // so that subwindow extracts give the exact same results as entire raster
1550 : // scaling.
1551 2431 : double dfDestXOff = dfXOff / dfXRatioDstToSrc;
1552 2431 : bool bHasXOffVirtual = false;
1553 2431 : int nDestXOffVirtual = 0;
1554 2431 : if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
1555 : {
1556 2306 : bHasXOffVirtual = true;
1557 2306 : dfXOff = nXOff;
1558 2306 : nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
1559 : }
1560 :
1561 2431 : double dfDestYOff = dfYOff / dfYRatioDstToSrc;
1562 2431 : bool bHasYOffVirtual = false;
1563 2431 : int nDestYOffVirtual = 0;
1564 2431 : if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
1565 : {
1566 2266 : bHasYOffVirtual = true;
1567 2266 : dfYOff = nYOff;
1568 2266 : nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
1569 : }
1570 :
1571 : // Create a MEM dataset that wraps the output buffer.
1572 2431 : std::unique_ptr<void, VSIFreeReleaser> pTempBuffer;
1573 2431 : GSpacing nPSMem = nPixelSpace;
1574 2431 : GSpacing nLSMem = nLineSpace;
1575 2431 : GSpacing nBandSpaceMEM = nBandSpace;
1576 2431 : void *pDataMem = pData;
1577 2431 : GDALDataType eDTMem = eBufType;
1578 2431 : GDALRasterBand *poFirstSrcBand = GetRasterBand(panBandMap[0]);
1579 2431 : const GDALDataType eDataType = poFirstSrcBand->GetRasterDataType();
1580 2431 : if (eBufType != eDataType && !GDAL_GET_OPERATE_IN_BUF_TYPE(*psExtraArg))
1581 : {
1582 2 : nPSMem = GDALGetDataTypeSizeBytes(eDataType);
1583 2 : nLSMem = nPSMem * nBufXSize;
1584 2 : nBandSpaceMEM = nLSMem * nBandCount;
1585 2 : pTempBuffer.reset(VSI_MALLOC3_VERBOSE(nBandCount, nBufYSize,
1586 : static_cast<size_t>(nLSMem)));
1587 2 : if (pTempBuffer == nullptr)
1588 0 : return CE_Failure;
1589 2 : pDataMem = pTempBuffer.get();
1590 2 : eDTMem = eDataType;
1591 : }
1592 :
1593 : auto poMEMDS = std::unique_ptr<GDALDataset>(
1594 2431 : MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
1595 4862 : nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr));
1596 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1597 : std::vector<GDALRasterBand *> apoDstBands(nBandCount);
1598 : #endif
1599 2431 : int nNBITS = 0;
1600 9052 : for (int i = 0; i < nBandCount; i++)
1601 : {
1602 6621 : GByte *const pBandData = static_cast<GByte *>(pDataMem) -
1603 6621 : nPSMem * nDestXOffVirtual -
1604 6621 : nLSMem * nDestYOffVirtual + nBandSpaceMEM * i;
1605 6621 : auto poMEMBand = GDALRasterBand::FromHandle(MEMCreateRasterBandEx(
1606 : poMEMDS.get(), i + 1, pBandData, eDTMem, nPSMem, nLSMem, false));
1607 6621 : poMEMDS->SetBand(i + 1, poMEMBand);
1608 :
1609 6621 : GDALRasterBand *poSrcBand = GetRasterBand(panBandMap[i]);
1610 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1611 : apoDstBands[i] = poMEMBand;
1612 : #endif
1613 : const char *pszNBITS =
1614 6621 : poSrcBand->GetMetadataItem(GDALMD_NBITS, GDAL_MDD_IMAGE_STRUCTURE);
1615 6621 : if (pszNBITS)
1616 : {
1617 0 : nNBITS = atoi(pszNBITS);
1618 0 : poMEMDS->GetRasterBand(i + 1)->SetMetadataItem(
1619 0 : GDALMD_NBITS, pszNBITS, GDAL_MDD_IMAGE_STRUCTURE);
1620 : }
1621 : }
1622 :
1623 2431 : CPLErr eErr = CE_None;
1624 :
1625 : // TODO(schwehr): Why disabled? Why not just delete?
1626 : // Looks like this code was initially added as disable by copying
1627 : // from RasterIO here:
1628 : // https://trac.osgeo.org/gdal/changeset/29572
1629 : #if 0
1630 : // Do the resampling.
1631 : if( bUseWarp )
1632 : {
1633 : VRTDatasetH hVRTDS = nullptr;
1634 : GDALRasterBandH hVRTBand = nullptr;
1635 : if( GetDataset() == nullptr )
1636 : {
1637 : /* Create VRT dataset that wraps the whole dataset */
1638 : hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
1639 : VRTAddBand( hVRTDS, eDataType, nullptr );
1640 : hVRTBand = GDALGetRasterBand(hVRTDS, 1);
1641 : VRTAddSimpleSource( (VRTSourcedRasterBandH)hVRTBand,
1642 : (GDALRasterBandH)this,
1643 : 0, 0,
1644 : nRasterXSize, nRasterYSize,
1645 : 0, 0,
1646 : nRasterXSize, nRasterYSize,
1647 : nullptr, VRT_NODATA_UNSET );
1648 :
1649 : /* Add a mask band if needed */
1650 : if( GetMaskFlags() != GMF_ALL_VALID )
1651 : {
1652 : ((GDALDataset*)hVRTDS)->CreateMaskBand(0);
1653 : VRTSourcedRasterBand* poVRTMaskBand =
1654 : (VRTSourcedRasterBand*)(((GDALRasterBand*)hVRTBand)->GetMaskBand());
1655 : poVRTMaskBand->
1656 : AddMaskBandSource( this,
1657 : 0, 0,
1658 : nRasterXSize, nRasterYSize,
1659 : 0, 0,
1660 : nRasterXSize, nRasterYSize);
1661 : }
1662 : }
1663 :
1664 : GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
1665 : psWarpOptions->eResampleAlg = (GDALResampleAlg)psExtraArg->eResampleAlg;
1666 : psWarpOptions->hSrcDS = (GDALDatasetH) (hVRTDS ? hVRTDS : GetDataset());
1667 : psWarpOptions->hDstDS = (GDALDatasetH) poMEMDS;
1668 : psWarpOptions->nBandCount = 1;
1669 : int nSrcBandNumber = (hVRTDS ? 1 : nBand);
1670 : int nDstBandNumber = 1;
1671 : psWarpOptions->panSrcBands = &nSrcBandNumber;
1672 : psWarpOptions->panDstBands = &nDstBandNumber;
1673 : psWarpOptions->pfnProgress = psExtraArg->pfnProgress ?
1674 : psExtraArg->pfnProgress : GDALDummyProgress;
1675 : psWarpOptions->pProgressArg = psExtraArg->pProgressData;
1676 : psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
1677 : GDALRasterIOTransformerStruct sTransformer;
1678 : sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
1679 : sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
1680 : sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
1681 : sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
1682 : psWarpOptions->pTransformerArg = &sTransformer;
1683 :
1684 : GDALWarpOperationH hWarpOperation = GDALCreateWarpOperation(psWarpOptions);
1685 : eErr = GDALChunkAndWarpImage( hWarpOperation,
1686 : nDestXOffVirtual, nDestYOffVirtual,
1687 : nBufXSize, nBufYSize );
1688 : GDALDestroyWarpOperation( hWarpOperation );
1689 :
1690 : psWarpOptions->panSrcBands = nullptr;
1691 : psWarpOptions->panDstBands = nullptr;
1692 : GDALDestroyWarpOptions( psWarpOptions );
1693 :
1694 : if( hVRTDS )
1695 : GDALClose(hVRTDS);
1696 : }
1697 : else
1698 : #endif
1699 : {
1700 : const char *pszResampling =
1701 2431 : GDALRasterIOGetResampleAlg(psExtraArg->eResampleAlg);
1702 :
1703 : int nBlockXSize, nBlockYSize;
1704 2431 : poFirstSrcBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
1705 :
1706 : int nKernelRadius;
1707 : GDALResampleFunction pfnResampleFunc =
1708 2431 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
1709 2431 : CPLAssert(pfnResampleFunc);
1710 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1711 : GDALResampleFunctionMultiBands pfnResampleFuncMultiBands =
1712 : GDALGetResampleFunctionMultiBands(pszResampling, &nKernelRadius);
1713 : #endif
1714 : GDALDataType eWrkDataType =
1715 2431 : GDALGetOvrWorkDataType(pszResampling, eDataType);
1716 :
1717 2431 : int nDstBlockXSize = nBufXSize;
1718 2431 : int nDstBlockYSize = nBufYSize;
1719 : int nFullResXChunk, nFullResYChunk;
1720 : while (true)
1721 : {
1722 2431 : nFullResXChunk = static_cast<int>(std::min<double>(
1723 2431 : 3 + nDstBlockXSize * dfXRatioDstToSrc, nRasterXSize));
1724 2431 : nFullResYChunk = static_cast<int>(std::min<double>(
1725 2431 : 3 + nDstBlockYSize * dfYRatioDstToSrc, nRasterYSize));
1726 2431 : if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
1727 2429 : (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
1728 : 1024 * 1024))
1729 : break;
1730 : // When operating on the full width of a raster whose block width is
1731 : // the raster width, prefer doing chunks in height.
1732 0 : if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
1733 : nDstBlockYSize > 1)
1734 0 : nDstBlockYSize /= 2;
1735 : /* Otherwise cut the maximal dimension */
1736 0 : else if (nDstBlockXSize > 1 &&
1737 0 : (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
1738 0 : nDstBlockXSize /= 2;
1739 : else
1740 0 : nDstBlockYSize /= 2;
1741 : }
1742 :
1743 : const int nOvrFactor =
1744 7293 : std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
1745 2431 : static_cast<int>(0.5 + dfYRatioDstToSrc)));
1746 : const int nFullResXSizeQueried = static_cast<int>(
1747 4862 : std::min<int64_t>(nFullResXChunk + static_cast<int64_t>(2) *
1748 2431 : nKernelRadius * nOvrFactor,
1749 2431 : nRasterXSize));
1750 : const int nFullResYSizeQueried = static_cast<int>(
1751 4862 : std::min<int64_t>(nFullResYChunk + static_cast<int64_t>(2) *
1752 2431 : nKernelRadius * nOvrFactor,
1753 2431 : nRasterYSize));
1754 :
1755 2431 : void *pChunk = VSI_MALLOC3_VERBOSE(
1756 : cpl::fits_on<int>(GDALGetDataTypeSizeBytes(eWrkDataType) *
1757 : nBandCount),
1758 : nFullResXSizeQueried, nFullResYSizeQueried);
1759 2431 : GByte *pabyChunkNoDataMask = nullptr;
1760 :
1761 2431 : GDALRasterBand *poMaskBand = poFirstSrcBand->GetMaskBand();
1762 2431 : int nMaskFlags = poFirstSrcBand->GetMaskFlags();
1763 :
1764 2431 : bool bUseNoDataMask = ((nMaskFlags & GMF_ALL_VALID) == 0);
1765 2431 : if (bUseNoDataMask)
1766 : {
1767 2156 : pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
1768 : nFullResXSizeQueried, nFullResYSizeQueried));
1769 : }
1770 2431 : if (pChunk == nullptr ||
1771 2156 : (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
1772 : {
1773 0 : CPLFree(pChunk);
1774 0 : CPLFree(pabyChunkNoDataMask);
1775 0 : return CE_Failure;
1776 : }
1777 :
1778 : const int64_t nTotalBlocks =
1779 2431 : static_cast<int64_t>(cpl::div_round_up(nBufXSize, nDstBlockXSize)) *
1780 2431 : cpl::div_round_up(nBufYSize, nDstBlockYSize);
1781 2431 : int64_t nBlocksDone = 0;
1782 :
1783 4862 : for (int nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
1784 2431 : nDstYOff += nDstBlockYSize)
1785 : {
1786 : int nDstYCount;
1787 2431 : if (nDstYOff + nDstBlockYSize <= nBufYSize)
1788 2431 : nDstYCount = nDstBlockYSize;
1789 : else
1790 0 : nDstYCount = nBufYSize - nDstYOff;
1791 :
1792 2431 : int nChunkYOff =
1793 2431 : nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
1794 2431 : int nChunkYOff2 = nYOff + 1 +
1795 2431 : static_cast<int>(ceil((nDstYOff + nDstYCount) *
1796 : dfYRatioDstToSrc));
1797 2431 : if (nChunkYOff2 > nRasterYSize)
1798 146 : nChunkYOff2 = nRasterYSize;
1799 2431 : int nYCount = nChunkYOff2 - nChunkYOff;
1800 2431 : CPLAssert(nYCount <= nFullResYChunk);
1801 :
1802 2431 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
1803 2431 : int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
1804 2431 : if (nChunkYOffQueried < 0)
1805 : {
1806 149 : nChunkYSizeQueried += nChunkYOffQueried;
1807 149 : nChunkYOffQueried = 0;
1808 : }
1809 2431 : if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
1810 170 : nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
1811 2431 : CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
1812 :
1813 : int nDstXOff;
1814 4862 : for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
1815 2431 : nDstXOff += nDstBlockXSize)
1816 : {
1817 : int nDstXCount;
1818 2431 : if (nDstXOff + nDstBlockXSize <= nBufXSize)
1819 2431 : nDstXCount = nDstBlockXSize;
1820 : else
1821 0 : nDstXCount = nBufXSize - nDstXOff;
1822 :
1823 2431 : int nChunkXOff =
1824 2431 : nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
1825 2431 : int nChunkXOff2 =
1826 2431 : nXOff + 1 +
1827 2431 : static_cast<int>(
1828 2431 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
1829 2431 : if (nChunkXOff2 > nRasterXSize)
1830 1672 : nChunkXOff2 = nRasterXSize;
1831 2431 : int nXCount = nChunkXOff2 - nChunkXOff;
1832 2431 : CPLAssert(nXCount <= nFullResXChunk);
1833 :
1834 2431 : int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
1835 2431 : int nChunkXSizeQueried =
1836 2431 : nXCount + 2 * nKernelRadius * nOvrFactor;
1837 2431 : if (nChunkXOffQueried < 0)
1838 : {
1839 1162 : nChunkXSizeQueried += nChunkXOffQueried;
1840 1162 : nChunkXOffQueried = 0;
1841 : }
1842 2431 : if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
1843 1680 : nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
1844 2431 : CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
1845 :
1846 2431 : bool bSkipResample = false;
1847 2431 : bool bNoDataMaskFullyOpaque = false;
1848 2431 : if (eErr == CE_None && bUseNoDataMask)
1849 : {
1850 2156 : eErr = poMaskBand->RasterIO(
1851 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1852 : nChunkXSizeQueried, nChunkYSizeQueried,
1853 : pabyChunkNoDataMask, nChunkXSizeQueried,
1854 : nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
1855 :
1856 : /* Optimizations if mask if fully opaque or transparent */
1857 2156 : const int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
1858 2156 : const GByte bVal = pabyChunkNoDataMask[0];
1859 2156 : int i = 1; // Used after for.
1860 49799600 : for (; i < nPixels; i++)
1861 : {
1862 49798500 : if (pabyChunkNoDataMask[i] != bVal)
1863 1031 : break;
1864 : }
1865 2156 : if (i == nPixels)
1866 : {
1867 1125 : if (bVal == 0)
1868 : {
1869 953 : GByte abyZero[16] = {0};
1870 3100 : for (int iBand = 0; iBand < nBandCount; iBand++)
1871 : {
1872 6979 : for (int j = 0; j < nDstYCount; j++)
1873 : {
1874 4832 : GDALCopyWords64(
1875 : abyZero, GDT_UInt8, 0,
1876 : static_cast<GByte *>(pDataMem) +
1877 4832 : iBand * nBandSpaceMEM +
1878 4832 : nLSMem * (j + nDstYOff) +
1879 4832 : nDstXOff * nPSMem,
1880 : eBufType, static_cast<int>(nPSMem),
1881 : nDstXCount);
1882 : }
1883 : }
1884 953 : bSkipResample = true;
1885 : }
1886 : else
1887 : {
1888 172 : bNoDataMaskFullyOpaque = true;
1889 : }
1890 : }
1891 : }
1892 :
1893 2431 : if (!bSkipResample && eErr == CE_None)
1894 : {
1895 : /* Read the source buffers */
1896 1475 : eErr = RasterIO(
1897 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1898 : nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
1899 : nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType,
1900 : nBandCount, panBandMap, 0, 0, 0, nullptr);
1901 : }
1902 :
1903 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1904 : if (pfnResampleFuncMultiBands && !bSkipResample &&
1905 : eErr == CE_None)
1906 : {
1907 : eErr = pfnResampleFuncMultiBands(
1908 : dfXRatioDstToSrc, dfYRatioDstToSrc,
1909 : dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
1910 : dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
1911 : eWrkDataType, (GByte *)pChunk, nBandCount,
1912 : bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
1913 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
1914 : nChunkXSizeQueried,
1915 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
1916 : nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
1917 : nDstXOff + nDestXOffVirtual + nDstXCount,
1918 : nDstYOff + nDestYOffVirtual,
1919 : nDstYOff + nDestYOffVirtual + nDstYCount,
1920 : apoDstBands.data(), pszResampling, FALSE /*bHasNoData*/,
1921 : 0.0 /* dfNoDataValue */, nullptr /* color table*/,
1922 : eDataType);
1923 : }
1924 : else
1925 : #endif
1926 : {
1927 : size_t nChunkBandOffset =
1928 2431 : static_cast<size_t>(nChunkXSizeQueried) *
1929 2431 : nChunkYSizeQueried *
1930 2431 : GDALGetDataTypeSizeBytes(eWrkDataType);
1931 6896 : for (int i = 0;
1932 6896 : i < nBandCount && !bSkipResample && eErr == CE_None;
1933 : i++)
1934 : {
1935 4465 : const bool bPropagateNoData = false;
1936 4465 : void *pDstBuffer = nullptr;
1937 4465 : GDALDataType eDstBufferDataType = GDT_Unknown;
1938 : GDALRasterBand *poMEMBand =
1939 4465 : poMEMDS->GetRasterBand(i + 1);
1940 4465 : GDALOverviewResampleArgs args;
1941 4465 : args.eSrcDataType = eDataType;
1942 4465 : args.eOvrDataType = poMEMBand->GetRasterDataType();
1943 4465 : args.nOvrXSize = poMEMBand->GetXSize();
1944 4465 : args.nOvrYSize = poMEMBand->GetYSize();
1945 4465 : args.nOvrNBITS = nNBITS;
1946 4465 : args.dfXRatioDstToSrc = dfXRatioDstToSrc;
1947 4465 : args.dfYRatioDstToSrc = dfYRatioDstToSrc;
1948 4465 : args.dfSrcXDelta =
1949 4465 : dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
1950 4465 : args.dfSrcYDelta =
1951 4465 : dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
1952 4465 : args.eWrkDataType = eWrkDataType;
1953 4465 : args.pabyChunkNodataMask = bNoDataMaskFullyOpaque
1954 4465 : ? nullptr
1955 : : pabyChunkNoDataMask;
1956 4465 : args.nChunkXOff =
1957 4465 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
1958 4465 : args.nChunkXSize = nChunkXSizeQueried;
1959 4465 : args.nChunkYOff =
1960 4465 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
1961 4465 : args.nChunkYSize = nChunkYSizeQueried;
1962 4465 : args.nDstXOff = nDstXOff + nDestXOffVirtual;
1963 4465 : args.nDstXOff2 =
1964 4465 : nDstXOff + nDestXOffVirtual + nDstXCount;
1965 4465 : args.nDstYOff = nDstYOff + nDestYOffVirtual;
1966 4465 : args.nDstYOff2 =
1967 4465 : nDstYOff + nDestYOffVirtual + nDstYCount;
1968 4465 : args.pszResampling = pszResampling;
1969 4465 : args.bHasNoData = false;
1970 4465 : args.dfNoDataValue = 0.0;
1971 4465 : args.poColorTable = nullptr;
1972 4465 : args.bPropagateNoData = bPropagateNoData;
1973 :
1974 : eErr =
1975 8930 : pfnResampleFunc(args,
1976 4465 : reinterpret_cast<GByte *>(pChunk) +
1977 4465 : i * nChunkBandOffset,
1978 : &pDstBuffer, &eDstBufferDataType);
1979 4465 : if (eErr == CE_None)
1980 : {
1981 4465 : eErr = poMEMBand->RasterIO(
1982 : GF_Write, nDstXOff + nDestXOffVirtual,
1983 : nDstYOff + nDestYOffVirtual, nDstXCount,
1984 : nDstYCount, pDstBuffer, nDstXCount, nDstYCount,
1985 : eDstBufferDataType, 0, 0, nullptr);
1986 : }
1987 4465 : CPLFree(pDstBuffer);
1988 : }
1989 : }
1990 :
1991 2431 : nBlocksDone++;
1992 4356 : if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
1993 1925 : !psExtraArg->pfnProgress(
1994 1925 : static_cast<double>(nBlocksDone) /
1995 1925 : static_cast<double>(nTotalBlocks),
1996 : "", psExtraArg->pProgressData))
1997 : {
1998 0 : eErr = CE_Failure;
1999 : }
2000 : }
2001 : }
2002 :
2003 2431 : CPLFree(pChunk);
2004 2431 : CPLFree(pabyChunkNoDataMask);
2005 : }
2006 :
2007 2431 : if (pTempBuffer)
2008 : {
2009 2 : CPL_IGNORE_RET_VAL(poMEMDS->RasterIO(
2010 : GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
2011 : pData, nBufXSize, nBufYSize, eBufType, nBandCount, nullptr,
2012 : nPixelSpace, nLineSpace, nBandSpace, nullptr));
2013 : }
2014 :
2015 2431 : return eErr;
2016 : }
2017 :
2018 : //! @endcond
2019 :
2020 : /************************************************************************/
2021 : /* GDALSwapWords() */
2022 : /************************************************************************/
2023 :
2024 : /**
2025 : * Byte swap words in-place.
2026 : *
2027 : * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
2028 : * a memory array. No assumption is made that the words being swapped are
2029 : * word aligned in memory. Use the CPL_LSB and CPL_MSB macros from cpl_port.h
2030 : * to determine if the current platform is big endian or little endian. Use
2031 : * The macros like CPL_SWAP32() to byte swap single values without the overhead
2032 : * of a function call.
2033 : *
2034 : * @param pData pointer to start of data buffer.
2035 : * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
2036 : * @param nWordCount the number of words to be swapped in this call.
2037 : * @param nWordSkip the byte offset from the start of one word to the start of
2038 : * the next. For packed buffers this is the same as nWordSize.
2039 : */
2040 :
2041 497405 : void CPL_STDCALL GDALSwapWords(void *pData, int nWordSize, int nWordCount,
2042 : int nWordSkip)
2043 :
2044 : {
2045 497405 : if (nWordCount > 0)
2046 497405 : VALIDATE_POINTER0(pData, "GDALSwapWords");
2047 :
2048 497405 : GByte *pabyData = static_cast<GByte *>(pData);
2049 :
2050 497405 : switch (nWordSize)
2051 : {
2052 7234 : case 1:
2053 7234 : break;
2054 :
2055 477161 : case 2:
2056 477161 : CPLAssert(nWordSkip >= 2 || nWordCount == 1);
2057 228194000 : for (int i = 0; i < nWordCount; i++)
2058 : {
2059 227716000 : CPL_SWAP16PTR(pabyData);
2060 227716000 : pabyData += nWordSkip;
2061 : }
2062 477161 : break;
2063 :
2064 10584 : case 4:
2065 10584 : CPLAssert(nWordSkip >= 4 || nWordCount == 1);
2066 10584 : if (CPL_IS_ALIGNED(pabyData, 4) && (nWordSkip % 4) == 0)
2067 : {
2068 29140600 : for (int i = 0; i < nWordCount; i++)
2069 : {
2070 29130000 : *reinterpret_cast<GUInt32 *>(pabyData) = CPL_SWAP32(
2071 : *reinterpret_cast<const GUInt32 *>(pabyData));
2072 29130000 : pabyData += nWordSkip;
2073 10581 : }
2074 : }
2075 : else
2076 : {
2077 9 : for (int i = 0; i < nWordCount; i++)
2078 : {
2079 6 : CPL_SWAP32PTR(pabyData);
2080 6 : pabyData += nWordSkip;
2081 : }
2082 : }
2083 10584 : break;
2084 :
2085 2426 : case 8:
2086 2426 : CPLAssert(nWordSkip >= 8 || nWordCount == 1);
2087 2426 : if (CPL_IS_ALIGNED(pabyData, 8) && (nWordSkip % 8) == 0)
2088 : {
2089 3356900 : for (int i = 0; i < nWordCount; i++)
2090 : {
2091 3354480 : *reinterpret_cast<GUInt64 *>(pabyData) = CPL_SWAP64(
2092 : *reinterpret_cast<const GUInt64 *>(pabyData));
2093 3354480 : pabyData += nWordSkip;
2094 2425 : }
2095 : }
2096 : else
2097 : {
2098 3 : for (int i = 0; i < nWordCount; i++)
2099 : {
2100 2 : CPL_SWAP64PTR(pabyData);
2101 2 : pabyData += nWordSkip;
2102 : }
2103 : }
2104 2426 : break;
2105 :
2106 0 : default:
2107 0 : CPLAssert(false);
2108 : }
2109 : }
2110 :
2111 : /************************************************************************/
2112 : /* GDALSwapWordsEx() */
2113 : /************************************************************************/
2114 :
2115 : /**
2116 : * Byte swap words in-place.
2117 : *
2118 : * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
2119 : * a memory array. No assumption is made that the words being swapped are
2120 : * word aligned in memory. Use the CPL_LSB and CPL_MSB macros from cpl_port.h
2121 : * to determine if the current platform is big endian or little endian. Use
2122 : * The macros like CPL_SWAP32() to byte swap single values without the overhead
2123 : * of a function call.
2124 : *
2125 : * @param pData pointer to start of data buffer.
2126 : * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
2127 : * @param nWordCount the number of words to be swapped in this call.
2128 : * @param nWordSkip the byte offset from the start of one word to the start of
2129 : * the next. For packed buffers this is the same as nWordSize.
2130 : */
2131 6130 : void CPL_STDCALL GDALSwapWordsEx(void *pData, int nWordSize, size_t nWordCount,
2132 : int nWordSkip)
2133 : {
2134 6130 : GByte *pabyData = static_cast<GByte *>(pData);
2135 12260 : while (nWordCount)
2136 : {
2137 : // Pick-up a multiple of 8 as max chunk size.
2138 6130 : const int nWordCountSmall =
2139 6130 : (nWordCount > (1 << 30)) ? (1 << 30) : static_cast<int>(nWordCount);
2140 6130 : GDALSwapWords(pabyData, nWordSize, nWordCountSmall, nWordSkip);
2141 6130 : pabyData += static_cast<size_t>(nWordSkip) * nWordCountSmall;
2142 6130 : nWordCount -= nWordCountSmall;
2143 : }
2144 6130 : }
2145 :
2146 : // Place the new GDALCopyWords helpers in an anonymous namespace
2147 : namespace
2148 : {
2149 :
2150 : /************************************************************************/
2151 : /* GDALCopyWordsT() */
2152 : /************************************************************************/
2153 : /**
2154 : * Template function, used to copy data from pSrcData into buffer
2155 : * pDstData, with stride nSrcPixelStride in the source data and
2156 : * stride nDstPixelStride in the destination data. This template can
2157 : * deal with the case where the input data type is real or complex and
2158 : * the output is real.
2159 : *
2160 : * @param pSrcData the source data buffer
2161 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
2162 : * of interest.
2163 : * @param pDstData the destination buffer.
2164 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
2165 : * interest.
2166 : * @param nWordCount the total number of pixel words to copy
2167 : *
2168 : * @code
2169 : * // Assume an input buffer of type GUInt16 named pBufferIn
2170 : * GByte *pBufferOut = new GByte[numBytesOut];
2171 : * GDALCopyWordsT<GUInt16, GByte>(pSrcData, 2, pDstData, 1, numBytesOut);
2172 : * @endcode
2173 : * @note
2174 : * This is a private function, and should not be exposed outside of
2175 : * rasterio.cpp. External users should call the GDALCopyWords driver function.
2176 : */
2177 :
2178 : template <class Tin, class Tout>
2179 48995127 : static void inline GDALCopyWordsGenericT(const Tin *const CPL_RESTRICT pSrcData,
2180 : int nSrcPixelStride,
2181 : Tout *const CPL_RESTRICT pDstData,
2182 : int nDstPixelStride,
2183 : GPtrDiff_t nWordCount)
2184 : {
2185 48995127 : decltype(nWordCount) nDstOffset = 0;
2186 :
2187 48995127 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
2188 48995127 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
2189 353955367 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
2190 : {
2191 304960227 : const Tin tValue =
2192 304960227 : *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
2193 304960227 : Tout *const pOutPixel =
2194 304960227 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2195 :
2196 304960227 : GDALCopyWord(tValue, *pOutPixel);
2197 :
2198 304960227 : nDstOffset += nDstPixelStride;
2199 : }
2200 48995127 : }
2201 :
2202 : template <class Tin, class Tout>
2203 29756776 : static void CPL_NOINLINE GDALCopyWordsT(const Tin *const CPL_RESTRICT pSrcData,
2204 : int nSrcPixelStride,
2205 : Tout *const CPL_RESTRICT pDstData,
2206 : int nDstPixelStride,
2207 : GPtrDiff_t nWordCount)
2208 : {
2209 29756776 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData, nDstPixelStride,
2210 : nWordCount);
2211 29756776 : }
2212 :
2213 : template <class Tin, class Tout>
2214 5101446 : static void inline GDALCopyWordsT_8atatime(
2215 : const Tin *const CPL_RESTRICT pSrcData, int nSrcPixelStride,
2216 : Tout *const CPL_RESTRICT pDstData, int nDstPixelStride,
2217 : GPtrDiff_t nWordCount)
2218 : {
2219 5101446 : decltype(nWordCount) nDstOffset = 0;
2220 :
2221 5101446 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
2222 5101446 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
2223 5101446 : decltype(nWordCount) n = 0;
2224 5101446 : if (nSrcPixelStride == static_cast<int>(sizeof(Tin)) &&
2225 : nDstPixelStride == static_cast<int>(sizeof(Tout)))
2226 : {
2227 53187859 : for (; n < nWordCount - 7; n += 8)
2228 : {
2229 52636234 : const Tin *pInValues = reinterpret_cast<const Tin *>(
2230 52636234 : pSrcDataPtr + (n * nSrcPixelStride));
2231 52636234 : Tout *const pOutPixels =
2232 52636234 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2233 :
2234 52636234 : GDALCopy8Words(pInValues, pOutPixels);
2235 :
2236 52636234 : nDstOffset += 8 * nDstPixelStride;
2237 : }
2238 : }
2239 10499457 : for (; n < nWordCount; n++)
2240 : {
2241 5398011 : const Tin tValue =
2242 5398011 : *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
2243 5398011 : Tout *const pOutPixel =
2244 5398011 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2245 :
2246 5398011 : GDALCopyWord(tValue, *pOutPixel);
2247 :
2248 5398011 : nDstOffset += nDstPixelStride;
2249 : }
2250 5101446 : }
2251 :
2252 : #ifdef HAVE_SSE2
2253 :
2254 : template <class Tout>
2255 1042126 : void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
2256 : int nSrcPixelStride,
2257 : Tout *const CPL_RESTRICT pDstData,
2258 : int nDstPixelStride, GPtrDiff_t nWordCount)
2259 : {
2260 : static_assert(std::is_integral<Tout>::value &&
2261 : sizeof(Tout) == sizeof(uint16_t),
2262 : "Bad Tout");
2263 1042126 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2264 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2265 : {
2266 35752 : decltype(nWordCount) n = 0;
2267 35752 : const __m128i xmm_zero = _mm_setzero_si128();
2268 35752 : GByte *CPL_RESTRICT pabyDstDataPtr =
2269 : reinterpret_cast<GByte *>(pDstData);
2270 1478148 : for (; n < nWordCount - 15; n += 16)
2271 : {
2272 1442396 : __m128i xmm = _mm_loadu_si128(
2273 1442396 : reinterpret_cast<const __m128i *>(pSrcData + n));
2274 1442396 : __m128i xmm0 = _mm_unpacklo_epi8(xmm, xmm_zero);
2275 1442396 : __m128i xmm1 = _mm_unpackhi_epi8(xmm, xmm_zero);
2276 : _mm_storeu_si128(
2277 1442396 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2), xmm0);
2278 : _mm_storeu_si128(
2279 1442396 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2 + 16), xmm1);
2280 : }
2281 : #if defined(__clang__)
2282 : #pragma clang loop vectorize(disable)
2283 : #endif
2284 111662 : for (; n < nWordCount; n++)
2285 : {
2286 75910 : pDstData[n] = pSrcData[n];
2287 35752 : }
2288 : }
2289 : else
2290 : {
2291 1006371 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2292 : nDstPixelStride, nWordCount);
2293 : }
2294 1042126 : }
2295 :
2296 : template <>
2297 1029400 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2298 : int nSrcPixelStride,
2299 : GUInt16 *const CPL_RESTRICT pDstData,
2300 : int nDstPixelStride, GPtrDiff_t nWordCount)
2301 : {
2302 1029400 : GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
2303 : nDstPixelStride, nWordCount);
2304 1029400 : }
2305 :
2306 : template <>
2307 12726 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2308 : int nSrcPixelStride,
2309 : GInt16 *const CPL_RESTRICT pDstData,
2310 : int nDstPixelStride, GPtrDiff_t nWordCount)
2311 : {
2312 12726 : GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
2313 : nDstPixelStride, nWordCount);
2314 12726 : }
2315 :
2316 : template <class Tout>
2317 16237776 : void GDALCopyWordsByteTo32Bit(const GByte *const CPL_RESTRICT pSrcData,
2318 : int nSrcPixelStride,
2319 : Tout *const CPL_RESTRICT pDstData,
2320 : int nDstPixelStride, GPtrDiff_t nWordCount)
2321 : {
2322 : static_assert(std::is_integral<Tout>::value &&
2323 : sizeof(Tout) == sizeof(uint32_t),
2324 : "Bad Tout");
2325 16237776 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2326 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2327 : {
2328 6533266 : decltype(nWordCount) n = 0;
2329 6533266 : const __m128i xmm_zero = _mm_setzero_si128();
2330 6533266 : GByte *CPL_RESTRICT pabyDstDataPtr =
2331 : reinterpret_cast<GByte *>(pDstData);
2332 74249627 : for (; n < nWordCount - 15; n += 16)
2333 : {
2334 67716361 : __m128i xmm = _mm_loadu_si128(
2335 67716361 : reinterpret_cast<const __m128i *>(pSrcData + n));
2336 67716361 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2337 67716361 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2338 67716361 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2339 67716361 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2340 67716361 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2341 67716361 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2342 : _mm_storeu_si128(
2343 67716361 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4), xmm0);
2344 : _mm_storeu_si128(
2345 67716361 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 16), xmm1);
2346 : _mm_storeu_si128(
2347 67716361 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 32), xmm2);
2348 : _mm_storeu_si128(
2349 67716361 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 48), xmm3);
2350 : }
2351 : #if defined(__clang__)
2352 : #pragma clang loop vectorize(disable)
2353 : #endif
2354 14830216 : for (; n < nWordCount; n++)
2355 : {
2356 8296910 : pDstData[n] = pSrcData[n];
2357 6533266 : }
2358 : }
2359 : else
2360 : {
2361 9704510 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2362 : nDstPixelStride, nWordCount);
2363 : }
2364 16237776 : }
2365 :
2366 : template <>
2367 476 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2368 : int nSrcPixelStride,
2369 : GUInt32 *const CPL_RESTRICT pDstData,
2370 : int nDstPixelStride, GPtrDiff_t nWordCount)
2371 : {
2372 476 : GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
2373 : nDstPixelStride, nWordCount);
2374 476 : }
2375 :
2376 : template <>
2377 16237300 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2378 : int nSrcPixelStride,
2379 : GInt32 *const CPL_RESTRICT pDstData,
2380 : int nDstPixelStride, GPtrDiff_t nWordCount)
2381 : {
2382 16237300 : GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
2383 : nDstPixelStride, nWordCount);
2384 16237300 : }
2385 :
2386 : template <>
2387 2851090 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2388 : int nSrcPixelStride,
2389 : float *const CPL_RESTRICT pDstData,
2390 : int nDstPixelStride, GPtrDiff_t nWordCount)
2391 : {
2392 2851090 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2393 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2394 : {
2395 228209 : decltype(nWordCount) n = 0;
2396 228209 : const __m128i xmm_zero = _mm_setzero_si128();
2397 228209 : GByte *CPL_RESTRICT pabyDstDataPtr =
2398 : reinterpret_cast<GByte *>(pDstData);
2399 2267200 : for (; n < nWordCount - 15; n += 16)
2400 : {
2401 2038990 : __m128i xmm = _mm_loadu_si128(
2402 2038990 : reinterpret_cast<const __m128i *>(pSrcData + n));
2403 2038990 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2404 2038990 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2405 2038990 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2406 2038990 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2407 2038990 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2408 2038990 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2409 2038990 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2410 2038990 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2411 2038990 : __m128 xmm2_f = _mm_cvtepi32_ps(xmm2);
2412 2038990 : __m128 xmm3_f = _mm_cvtepi32_ps(xmm3);
2413 2038990 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2414 : xmm0_f);
2415 : _mm_storeu_ps(
2416 2038990 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2417 : _mm_storeu_ps(
2418 2038990 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 32), xmm2_f);
2419 : _mm_storeu_ps(
2420 2038990 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 48), xmm3_f);
2421 : }
2422 : #if defined(__clang__)
2423 : #pragma clang loop vectorize(disable)
2424 : #endif
2425 951537 : for (; n < nWordCount; n++)
2426 : {
2427 723328 : pDstData[n] = pSrcData[n];
2428 228209 : }
2429 : }
2430 : else
2431 : {
2432 2622880 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2433 : nDstPixelStride, nWordCount);
2434 : }
2435 2851090 : }
2436 :
2437 : template <>
2438 170958 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2439 : int nSrcPixelStride,
2440 : double *const CPL_RESTRICT pDstData,
2441 : int nDstPixelStride, GPtrDiff_t nWordCount)
2442 : {
2443 170958 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2444 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2445 : {
2446 147160 : decltype(nWordCount) n = 0;
2447 147160 : const __m128i xmm_zero = _mm_setzero_si128();
2448 147160 : GByte *CPL_RESTRICT pabyDstDataPtr =
2449 : reinterpret_cast<GByte *>(pDstData);
2450 3127450 : for (; n < nWordCount - 15; n += 16)
2451 : {
2452 2980290 : __m128i xmm = _mm_loadu_si128(
2453 2980290 : reinterpret_cast<const __m128i *>(pSrcData + n));
2454 2980290 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2455 2980290 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2456 2980290 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2457 2980290 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2458 2980290 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2459 2980290 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2460 :
2461 : #if defined(__AVX2__) && defined(slightly_slower_than_SSE2)
2462 : _mm256_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2463 : _mm256_cvtepi32_pd(xmm0));
2464 : _mm256_storeu_pd(
2465 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2466 : _mm256_cvtepi32_pd(xmm1));
2467 : _mm256_storeu_pd(
2468 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
2469 : _mm256_cvtepi32_pd(xmm2));
2470 : _mm256_storeu_pd(
2471 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
2472 : _mm256_cvtepi32_pd(xmm3));
2473 : #else
2474 2980290 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2475 2980290 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2476 2980290 : __m128d xmm2_low_d = _mm_cvtepi32_pd(xmm2);
2477 2980290 : __m128d xmm3_low_d = _mm_cvtepi32_pd(xmm3);
2478 2980290 : xmm0 = _mm_srli_si128(xmm0, 8);
2479 2980290 : xmm1 = _mm_srli_si128(xmm1, 8);
2480 2980290 : xmm2 = _mm_srli_si128(xmm2, 8);
2481 2980290 : xmm3 = _mm_srli_si128(xmm3, 8);
2482 2980290 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2483 2980290 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2484 2980290 : __m128d xmm2_high_d = _mm_cvtepi32_pd(xmm2);
2485 2980290 : __m128d xmm3_high_d = _mm_cvtepi32_pd(xmm3);
2486 :
2487 2980290 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2488 : xmm0_low_d);
2489 : _mm_storeu_pd(
2490 2980290 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2491 : xmm0_high_d);
2492 : _mm_storeu_pd(
2493 2980290 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2494 : xmm1_low_d);
2495 : _mm_storeu_pd(
2496 2980290 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2497 : xmm1_high_d);
2498 : _mm_storeu_pd(
2499 2980290 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
2500 : xmm2_low_d);
2501 : _mm_storeu_pd(
2502 2980290 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 80),
2503 : xmm2_high_d);
2504 : _mm_storeu_pd(
2505 2980290 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
2506 : xmm3_low_d);
2507 : _mm_storeu_pd(
2508 2980290 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 112),
2509 : xmm3_high_d);
2510 : #endif
2511 : }
2512 : #if defined(__clang__)
2513 : #pragma clang loop vectorize(disable)
2514 : #endif
2515 280923 : for (; n < nWordCount; n++)
2516 : {
2517 133763 : pDstData[n] = pSrcData[n];
2518 147160 : }
2519 : }
2520 : else
2521 : {
2522 23798 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2523 : nDstPixelStride, nWordCount);
2524 : }
2525 170958 : }
2526 :
2527 : template <>
2528 148 : CPL_NOINLINE void GDALCopyWordsT(const uint8_t *const CPL_RESTRICT pSrcData,
2529 : int nSrcPixelStride,
2530 : int8_t *const CPL_RESTRICT pDstData,
2531 : int nDstPixelStride, GPtrDiff_t nWordCount)
2532 : {
2533 148 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2534 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2535 : {
2536 142 : decltype(nWordCount) n = 0;
2537 142 : const __m128i xmm_127 = _mm_set1_epi8(127);
2538 146 : for (; n < nWordCount - 31; n += 32)
2539 : {
2540 8 : __m128i xmm0 = _mm_loadu_si128(
2541 4 : reinterpret_cast<const __m128i *>(pSrcData + n));
2542 4 : __m128i xmm1 = _mm_loadu_si128(
2543 4 : reinterpret_cast<const __m128i *>(pSrcData + n + 16));
2544 4 : xmm0 = _mm_min_epu8(xmm0, xmm_127);
2545 4 : xmm1 = _mm_min_epu8(xmm1, xmm_127);
2546 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2547 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
2548 : xmm1);
2549 : }
2550 : #if defined(__clang__)
2551 : #pragma clang loop vectorize(disable)
2552 : #endif
2553 2424 : for (; n < nWordCount; n++)
2554 : {
2555 2282 : pDstData[n] = static_cast<int8_t>(std::min<int>(pSrcData[n], 127));
2556 142 : }
2557 : }
2558 : else
2559 : {
2560 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2561 : nDstPixelStride, nWordCount);
2562 : }
2563 148 : }
2564 :
2565 : template <>
2566 62 : CPL_NOINLINE void GDALCopyWordsT(const int8_t *const CPL_RESTRICT pSrcData,
2567 : int nSrcPixelStride,
2568 : uint8_t *const CPL_RESTRICT pDstData,
2569 : int nDstPixelStride, GPtrDiff_t nWordCount)
2570 : {
2571 62 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2572 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2573 : {
2574 56 : decltype(nWordCount) n = 0;
2575 : #if !(defined(__SSE4_1__) || defined(__AVX__) || \
2576 : defined(USE_NEON_OPTIMIZATIONS))
2577 56 : const __m128i xmm_INT8_to_UINT8 = _mm_set1_epi8(-128);
2578 : #endif
2579 117 : for (; n < nWordCount - 31; n += 32)
2580 : {
2581 122 : __m128i xmm0 = _mm_loadu_si128(
2582 61 : reinterpret_cast<const __m128i *>(pSrcData + n));
2583 61 : __m128i xmm1 = _mm_loadu_si128(
2584 61 : reinterpret_cast<const __m128i *>(pSrcData + n + 16));
2585 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
2586 : xmm0 = _mm_max_epi8(xmm0, _mm_setzero_si128());
2587 : xmm1 = _mm_max_epi8(xmm1, _mm_setzero_si128());
2588 : #else
2589 61 : xmm0 = _mm_add_epi8(xmm0, xmm_INT8_to_UINT8);
2590 61 : xmm1 = _mm_add_epi8(xmm1, xmm_INT8_to_UINT8);
2591 61 : xmm0 = _mm_max_epu8(xmm0, xmm_INT8_to_UINT8);
2592 61 : xmm1 = _mm_max_epu8(xmm1, xmm_INT8_to_UINT8);
2593 61 : xmm0 = _mm_sub_epi8(xmm0, xmm_INT8_to_UINT8);
2594 61 : xmm1 = _mm_sub_epi8(xmm1, xmm_INT8_to_UINT8);
2595 : #endif
2596 61 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2597 61 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
2598 : xmm1);
2599 : }
2600 : #if defined(__clang__)
2601 : #pragma clang loop vectorize(disable)
2602 : #endif
2603 352 : for (; n < nWordCount; n++)
2604 : {
2605 296 : pDstData[n] = static_cast<uint8_t>(std::max<int>(pSrcData[n], 0));
2606 56 : }
2607 : }
2608 : else
2609 : {
2610 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2611 : nDstPixelStride, nWordCount);
2612 : }
2613 62 : }
2614 :
2615 : template <>
2616 6037 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2617 : int nSrcPixelStride,
2618 : uint8_t *const CPL_RESTRICT pDstData,
2619 : int nDstPixelStride, GPtrDiff_t nWordCount)
2620 : {
2621 6037 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2622 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2623 : {
2624 5062 : decltype(nWordCount) n = 0;
2625 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
2626 : const auto xmm_MAX_INT16 = _mm_set1_epi16(32767);
2627 : #else
2628 : // In SSE2, min_epu16 does not exist, so shift from
2629 : // UInt16 to SInt16 to be able to use min_epi16
2630 5062 : const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
2631 5062 : const __m128i xmm_m255_shifted = _mm_set1_epi16(255 - 32768);
2632 : #endif
2633 71888 : for (; n < nWordCount - 15; n += 16)
2634 : {
2635 133652 : __m128i xmm0 = _mm_loadu_si128(
2636 66826 : reinterpret_cast<const __m128i *>(pSrcData + n));
2637 66826 : __m128i xmm1 = _mm_loadu_si128(
2638 66826 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2639 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
2640 : xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
2641 : xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
2642 : #else
2643 66826 : xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
2644 66826 : xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
2645 66826 : xmm0 = _mm_min_epi16(xmm0, xmm_m255_shifted);
2646 66826 : xmm1 = _mm_min_epi16(xmm1, xmm_m255_shifted);
2647 66826 : xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
2648 66826 : xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
2649 : #endif
2650 66826 : xmm0 = _mm_packus_epi16(xmm0, xmm1);
2651 66826 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2652 : }
2653 : #if defined(__clang__)
2654 : #pragma clang loop vectorize(disable)
2655 : #endif
2656 16403 : for (; n < nWordCount; n++)
2657 : {
2658 11341 : pDstData[n] = static_cast<uint8_t>(std::min<int>(pSrcData[n], 255));
2659 5062 : }
2660 : }
2661 : else
2662 : {
2663 975 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2664 : nDstPixelStride, nWordCount);
2665 : }
2666 6037 : }
2667 :
2668 : template <>
2669 46 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2670 : int nSrcPixelStride,
2671 : int16_t *const CPL_RESTRICT pDstData,
2672 : int nDstPixelStride, GPtrDiff_t nWordCount)
2673 : {
2674 46 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2675 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2676 : {
2677 40 : decltype(nWordCount) n = 0;
2678 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
2679 : const __m128i xmm_MAX_INT16 = _mm_set1_epi16(32767);
2680 : #else
2681 : // In SSE2, min_epu16 does not exist, so shift from
2682 : // UInt16 to SInt16 to be able to use min_epi16
2683 40 : const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
2684 40 : const __m128i xmm_32767_shifted = _mm_set1_epi16(32767 - 32768);
2685 : #endif
2686 169 : for (; n < nWordCount - 15; n += 16)
2687 : {
2688 258 : __m128i xmm0 = _mm_loadu_si128(
2689 129 : reinterpret_cast<const __m128i *>(pSrcData + n));
2690 129 : __m128i xmm1 = _mm_loadu_si128(
2691 129 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2692 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
2693 : xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
2694 : xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
2695 : #else
2696 129 : xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
2697 129 : xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
2698 129 : xmm0 = _mm_min_epi16(xmm0, xmm_32767_shifted);
2699 129 : xmm1 = _mm_min_epi16(xmm1, xmm_32767_shifted);
2700 129 : xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
2701 129 : xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
2702 : #endif
2703 129 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2704 129 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
2705 : xmm1);
2706 : }
2707 : #if defined(__clang__)
2708 : #pragma clang loop vectorize(disable)
2709 : #endif
2710 191 : for (; n < nWordCount; n++)
2711 : {
2712 151 : pDstData[n] =
2713 151 : static_cast<int16_t>(std::min<int>(pSrcData[n], 32767));
2714 40 : }
2715 : }
2716 : else
2717 : {
2718 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2719 : nDstPixelStride, nWordCount);
2720 : }
2721 46 : }
2722 :
2723 : template <>
2724 136 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2725 : int nSrcPixelStride,
2726 : uint16_t *const CPL_RESTRICT pDstData,
2727 : int nDstPixelStride, GPtrDiff_t nWordCount)
2728 : {
2729 136 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2730 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2731 : {
2732 93 : decltype(nWordCount) n = 0;
2733 93 : const __m128i xmm_zero = _mm_setzero_si128();
2734 278 : for (; n < nWordCount - 15; n += 16)
2735 : {
2736 370 : __m128i xmm0 = _mm_loadu_si128(
2737 185 : reinterpret_cast<const __m128i *>(pSrcData + n));
2738 185 : __m128i xmm1 = _mm_loadu_si128(
2739 185 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2740 185 : xmm0 = _mm_max_epi16(xmm0, xmm_zero);
2741 185 : xmm1 = _mm_max_epi16(xmm1, xmm_zero);
2742 185 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2743 185 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
2744 : xmm1);
2745 : }
2746 : #if defined(__clang__)
2747 : #pragma clang loop vectorize(disable)
2748 : #endif
2749 471 : for (; n < nWordCount; n++)
2750 : {
2751 378 : pDstData[n] = static_cast<uint16_t>(std::max<int>(pSrcData[n], 0));
2752 93 : }
2753 : }
2754 : else
2755 : {
2756 43 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2757 : nDstPixelStride, nWordCount);
2758 : }
2759 136 : }
2760 :
2761 : template <>
2762 3150 : CPL_NOINLINE void GDALCopyWordsT(const uint32_t *const CPL_RESTRICT pSrcData,
2763 : int nSrcPixelStride,
2764 : int32_t *const CPL_RESTRICT pDstData,
2765 : int nDstPixelStride, GPtrDiff_t nWordCount)
2766 : {
2767 3150 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2768 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2769 : {
2770 2196 : decltype(nWordCount) n = 0;
2771 2196 : const __m128i xmm_MAX_INT = _mm_set1_epi32(INT_MAX);
2772 2196 : [[maybe_unused]] const __m128i bias = _mm_set1_epi32(INT_MIN);
2773 : [[maybe_unused]] const __m128i xmm_MAX_INT_biased =
2774 2196 : _mm_xor_si128(xmm_MAX_INT, bias);
2775 45597 : for (; n < nWordCount - 7; n += 8)
2776 : {
2777 86802 : __m128i xmm0 = _mm_loadu_si128(
2778 43401 : reinterpret_cast<const __m128i *>(pSrcData + n));
2779 43401 : __m128i xmm1 = _mm_loadu_si128(
2780 43401 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
2781 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
2782 : xmm0 = _mm_min_epu32(xmm0, xmm_MAX_INT);
2783 : xmm1 = _mm_min_epu32(xmm1, xmm_MAX_INT);
2784 : #else
2785 43401 : const __m128i xmm0_biased = _mm_xor_si128(xmm0, bias);
2786 : const __m128i mask0 =
2787 43401 : _mm_cmplt_epi32(xmm0_biased, xmm_MAX_INT_biased);
2788 43401 : xmm0 = GDALIfThenElse(mask0, xmm0, xmm_MAX_INT);
2789 :
2790 43401 : const __m128i xmm1_biased = _mm_xor_si128(xmm1, bias);
2791 : const __m128i mask1 =
2792 43401 : _mm_cmplt_epi32(xmm1_biased, xmm_MAX_INT_biased);
2793 43401 : xmm1 = GDALIfThenElse(mask1, xmm1, xmm_MAX_INT);
2794 : #endif
2795 43401 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2796 43401 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
2797 : xmm1);
2798 : }
2799 : #if defined(__clang__)
2800 : #pragma clang loop vectorize(disable)
2801 : #endif
2802 9390 : for (; n < nWordCount; n++)
2803 : {
2804 7194 : pDstData[n] =
2805 7194 : static_cast<int32_t>(std::min<uint32_t>(pSrcData[n], INT_MAX));
2806 2196 : }
2807 : }
2808 : else
2809 : {
2810 954 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2811 : nDstPixelStride, nWordCount);
2812 : }
2813 3150 : }
2814 :
2815 : template <>
2816 93 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
2817 : int nSrcPixelStride,
2818 : uint32_t *const CPL_RESTRICT pDstData,
2819 : int nDstPixelStride, GPtrDiff_t nWordCount)
2820 : {
2821 93 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2822 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2823 : {
2824 38 : decltype(nWordCount) n = 0;
2825 38 : const __m128i xmm_zero = _mm_setzero_si128();
2826 333 : for (; n < nWordCount - 7; n += 8)
2827 : {
2828 590 : __m128i xmm0 = _mm_loadu_si128(
2829 295 : reinterpret_cast<const __m128i *>(pSrcData + n));
2830 295 : __m128i xmm1 = _mm_loadu_si128(
2831 295 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
2832 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
2833 : xmm0 = _mm_max_epi32(xmm0, xmm_zero);
2834 : xmm1 = _mm_max_epi32(xmm1, xmm_zero);
2835 : #else
2836 295 : const __m128i mask0 = _mm_cmpgt_epi32(xmm0, xmm_zero);
2837 295 : const __m128i mask1 = _mm_cmpgt_epi32(xmm1, xmm_zero);
2838 295 : xmm0 = _mm_and_si128(xmm0, mask0);
2839 295 : xmm1 = _mm_and_si128(xmm1, mask1);
2840 : #endif
2841 295 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2842 295 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
2843 : xmm1);
2844 : }
2845 : #if defined(__clang__)
2846 : #pragma clang loop vectorize(disable)
2847 : #endif
2848 192 : for (; n < nWordCount; n++)
2849 : {
2850 154 : pDstData[n] = static_cast<uint32_t>(std::max(pSrcData[n], 0));
2851 38 : }
2852 : }
2853 : else
2854 : {
2855 55 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2856 : nDstPixelStride, nWordCount);
2857 : }
2858 93 : }
2859 :
2860 : template <>
2861 403 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2862 : int nSrcPixelStride,
2863 : float *const CPL_RESTRICT pDstData,
2864 : int nDstPixelStride, GPtrDiff_t nWordCount)
2865 : {
2866 403 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2867 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2868 : {
2869 397 : decltype(nWordCount) n = 0;
2870 397 : const __m128i xmm_zero = _mm_setzero_si128();
2871 397 : GByte *CPL_RESTRICT pabyDstDataPtr =
2872 : reinterpret_cast<GByte *>(pDstData);
2873 1688 : for (; n < nWordCount - 7; n += 8)
2874 : {
2875 1291 : __m128i xmm = _mm_loadu_si128(
2876 1291 : reinterpret_cast<const __m128i *>(pSrcData + n));
2877 1291 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
2878 1291 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
2879 1291 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2880 1291 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2881 1291 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2882 : xmm0_f);
2883 : _mm_storeu_ps(
2884 1291 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2885 : }
2886 : #if defined(__clang__)
2887 : #pragma clang loop vectorize(disable)
2888 : #endif
2889 1415 : for (; n < nWordCount; n++)
2890 : {
2891 1018 : pDstData[n] = pSrcData[n];
2892 397 : }
2893 : }
2894 : else
2895 : {
2896 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2897 : nDstPixelStride, nWordCount);
2898 : }
2899 403 : }
2900 :
2901 : template <>
2902 1076640 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2903 : int nSrcPixelStride,
2904 : float *const CPL_RESTRICT pDstData,
2905 : int nDstPixelStride, GPtrDiff_t nWordCount)
2906 : {
2907 1076640 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2908 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2909 : {
2910 86742 : decltype(nWordCount) n = 0;
2911 86742 : GByte *CPL_RESTRICT pabyDstDataPtr =
2912 : reinterpret_cast<GByte *>(pDstData);
2913 586119 : for (; n < nWordCount - 7; n += 8)
2914 : {
2915 499377 : __m128i xmm = _mm_loadu_si128(
2916 499377 : reinterpret_cast<const __m128i *>(pSrcData + n));
2917 499377 : const auto sign = _mm_srai_epi16(xmm, 15);
2918 499377 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
2919 499377 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
2920 499377 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2921 499377 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2922 499377 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2923 : xmm0_f);
2924 : _mm_storeu_ps(
2925 499377 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2926 : }
2927 : #if defined(__clang__)
2928 : #pragma clang loop vectorize(disable)
2929 : #endif
2930 253882 : for (; n < nWordCount; n++)
2931 : {
2932 167140 : pDstData[n] = pSrcData[n];
2933 86742 : }
2934 : }
2935 : else
2936 : {
2937 989901 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2938 : nDstPixelStride, nWordCount);
2939 : }
2940 1076640 : }
2941 :
2942 : template <>
2943 449 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2944 : int nSrcPixelStride,
2945 : double *const CPL_RESTRICT pDstData,
2946 : int nDstPixelStride, GPtrDiff_t nWordCount)
2947 : {
2948 449 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2949 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2950 : {
2951 313 : decltype(nWordCount) n = 0;
2952 313 : const __m128i xmm_zero = _mm_setzero_si128();
2953 313 : GByte *CPL_RESTRICT pabyDstDataPtr =
2954 : reinterpret_cast<GByte *>(pDstData);
2955 829 : for (; n < nWordCount - 7; n += 8)
2956 : {
2957 516 : __m128i xmm = _mm_loadu_si128(
2958 516 : reinterpret_cast<const __m128i *>(pSrcData + n));
2959 516 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
2960 516 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
2961 :
2962 516 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2963 516 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2964 516 : xmm0 = _mm_srli_si128(xmm0, 8);
2965 516 : xmm1 = _mm_srli_si128(xmm1, 8);
2966 516 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2967 516 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2968 :
2969 516 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2970 : xmm0_low_d);
2971 : _mm_storeu_pd(
2972 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2973 : xmm0_high_d);
2974 : _mm_storeu_pd(
2975 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2976 : xmm1_low_d);
2977 : _mm_storeu_pd(
2978 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2979 : xmm1_high_d);
2980 : }
2981 : #if defined(__clang__)
2982 : #pragma clang loop vectorize(disable)
2983 : #endif
2984 1082 : for (; n < nWordCount; n++)
2985 : {
2986 769 : pDstData[n] = pSrcData[n];
2987 313 : }
2988 : }
2989 : else
2990 : {
2991 136 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2992 : nDstPixelStride, nWordCount);
2993 : }
2994 449 : }
2995 :
2996 : template <>
2997 4923280 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2998 : int nSrcPixelStride,
2999 : double *const CPL_RESTRICT pDstData,
3000 : int nDstPixelStride, GPtrDiff_t nWordCount)
3001 : {
3002 4923280 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
3003 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
3004 : {
3005 34874 : decltype(nWordCount) n = 0;
3006 34874 : GByte *CPL_RESTRICT pabyDstDataPtr =
3007 : reinterpret_cast<GByte *>(pDstData);
3008 403828 : for (; n < nWordCount - 7; n += 8)
3009 : {
3010 368954 : __m128i xmm = _mm_loadu_si128(
3011 368954 : reinterpret_cast<const __m128i *>(pSrcData + n));
3012 368954 : const auto sign = _mm_srai_epi16(xmm, 15);
3013 368954 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
3014 368954 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
3015 :
3016 368954 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
3017 368954 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
3018 368954 : xmm0 = _mm_srli_si128(xmm0, 8);
3019 368954 : xmm1 = _mm_srli_si128(xmm1, 8);
3020 368954 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
3021 368954 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
3022 :
3023 368954 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
3024 : xmm0_low_d);
3025 : _mm_storeu_pd(
3026 368954 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
3027 : xmm0_high_d);
3028 : _mm_storeu_pd(
3029 368954 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
3030 : xmm1_low_d);
3031 : _mm_storeu_pd(
3032 368954 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
3033 : xmm1_high_d);
3034 : }
3035 : #if defined(__clang__)
3036 : #pragma clang loop vectorize(disable)
3037 : #endif
3038 255934 : for (; n < nWordCount; n++)
3039 : {
3040 221060 : pDstData[n] = pSrcData[n];
3041 34874 : }
3042 : }
3043 : else
3044 : {
3045 4888400 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
3046 : nDstPixelStride, nWordCount);
3047 : }
3048 4923280 : }
3049 :
3050 : // ---- AVX2 helpers for int32 narrowing (runtime dispatch) ----
3051 :
3052 : #if defined(HAVE_AVX2_DISPATCH)
3053 : #if !defined(_MSC_VER)
3054 : __attribute__((target("avx2")))
3055 : #endif
3056 12723 : static void GDALCopyWordsInt32ToUInt8_AVX2(const int32_t *CPL_RESTRICT pSrc,
3057 : uint8_t *CPL_RESTRICT pDst,
3058 : GPtrDiff_t nWordCount)
3059 : {
3060 12723 : const __m256i permuteIdx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
3061 12723 : GPtrDiff_t n = 0;
3062 958119 : for (; n < nWordCount - 31; n += 32)
3063 : {
3064 : __m256i v0 =
3065 945396 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n));
3066 : __m256i v1 =
3067 945396 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n + 8));
3068 945396 : __m256i v2 = _mm256_loadu_si256(
3069 945396 : reinterpret_cast<const __m256i *>(pSrc + n + 16));
3070 945396 : __m256i v3 = _mm256_loadu_si256(
3071 945396 : reinterpret_cast<const __m256i *>(pSrc + n + 24));
3072 : // Clamp to [0, 255]
3073 : // Pack int32 -> int16 -> uint8, then fix cross-lane ordering
3074 945396 : __m256i ab16 = _mm256_packs_epi32(v0, v1);
3075 945396 : __m256i cd16 = _mm256_packs_epi32(v2, v3);
3076 945396 : __m256i bytes = _mm256_packus_epi16(ab16, cd16);
3077 945396 : bytes = _mm256_permutevar8x32_epi32(bytes, permuteIdx);
3078 945396 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(pDst + n), bytes);
3079 : }
3080 : #if defined(__clang__)
3081 : #pragma clang loop vectorize(disable)
3082 : #endif
3083 68589 : for (; n < nWordCount; n++)
3084 : {
3085 55866 : pDst[n] = static_cast<uint8_t>(std::clamp(pSrc[n], 0, 255));
3086 : }
3087 12723 : }
3088 :
3089 : #if !defined(_MSC_VER)
3090 : __attribute__((target("avx2")))
3091 : #endif
3092 10277 : static void GDALCopyWordsInt32ToUInt16_AVX2(const int32_t *CPL_RESTRICT pSrc,
3093 : uint16_t *CPL_RESTRICT pDst,
3094 : GPtrDiff_t nWordCount)
3095 : {
3096 : // _mm256_packus_epi32(v0, v1) produces per-lane interleaved result:
3097 : // [v0_lo4, v1_lo4, v0_hi4, v1_hi4] (in uint16 pairs per 32-bit lane)
3098 : // Permute to deinterleave: all v0 values first, then all v1 values
3099 10277 : const __m256i permuteIdx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
3100 10277 : GPtrDiff_t n = 0;
3101 670572 : for (; n < nWordCount - 15; n += 16)
3102 : {
3103 : __m256i v0 =
3104 660295 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n));
3105 : __m256i v1 =
3106 1320590 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n + 8));
3107 : // Clamp to [0, 65535]: _mm256_packus_epi32 saturates uint
3108 660295 : __m256i packed = _mm256_packus_epi32(v0, v1);
3109 : // Fix cross-lane interleave from packus
3110 660295 : packed = _mm256_permutevar8x32_epi32(packed, permuteIdx);
3111 660295 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(pDst + n), packed);
3112 : }
3113 : #if defined(__clang__)
3114 : #pragma clang loop vectorize(disable)
3115 : #endif
3116 163928 : for (; n < nWordCount; n++)
3117 : {
3118 153651 : pDst[n] = static_cast<uint16_t>(std::clamp(pSrc[n], 0, 65535));
3119 : }
3120 10277 : }
3121 : #endif // HAVE_AVX2_DISPATCH
3122 :
3123 : // ---- int32 -> uint8 with clamping to [0, 255] ----
3124 : template <>
3125 12837 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
3126 : int nSrcPixelStride,
3127 : uint8_t *const CPL_RESTRICT pDstData,
3128 : int nDstPixelStride, GPtrDiff_t nWordCount)
3129 : {
3130 12837 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
3131 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
3132 : {
3133 : #if defined(HAVE_AVX2_DISPATCH)
3134 12723 : if (CPLHaveRuntimeAVX2())
3135 : {
3136 12723 : GDALCopyWordsInt32ToUInt8_AVX2(pSrcData, pDstData, nWordCount);
3137 12723 : return;
3138 : }
3139 : #endif
3140 :
3141 : // SSE2 path: 16 pixels per iteration
3142 0 : decltype(nWordCount) n = 0;
3143 0 : for (; n < nWordCount - 15; n += 16)
3144 : {
3145 0 : __m128i v0 = _mm_loadu_si128(
3146 0 : reinterpret_cast<const __m128i *>(pSrcData + n));
3147 0 : __m128i v1 = _mm_loadu_si128(
3148 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
3149 0 : __m128i v2 = _mm_loadu_si128(
3150 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
3151 0 : __m128i v3 = _mm_loadu_si128(
3152 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 12));
3153 : // Pack int32->int16 with signed saturation to [-32768,32767] range
3154 0 : __m128i lo16 = _mm_packs_epi32(v0, v1);
3155 0 : __m128i hi16 = _mm_packs_epi32(v2, v3);
3156 : // Pack int16->uint8 with unsigned saturation to [0,255] range
3157 0 : __m128i bytes = _mm_packus_epi16(lo16, hi16);
3158 0 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), bytes);
3159 : }
3160 : #if defined(__clang__)
3161 : #pragma clang loop vectorize(disable)
3162 : #endif
3163 0 : for (; n < nWordCount; n++)
3164 : {
3165 0 : pDstData[n] = static_cast<uint8_t>(std::clamp(pSrcData[n], 0, 255));
3166 0 : }
3167 : }
3168 : else
3169 : {
3170 114 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
3171 : nDstPixelStride, nWordCount);
3172 : }
3173 : }
3174 :
3175 : // ---- int32 -> uint16 with clamping to [0, 65535] ----
3176 : template <>
3177 10322 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
3178 : int nSrcPixelStride,
3179 : uint16_t *const CPL_RESTRICT pDstData,
3180 : int nDstPixelStride, GPtrDiff_t nWordCount)
3181 : {
3182 10322 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
3183 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
3184 : {
3185 : #if defined(HAVE_AVX2_DISPATCH)
3186 10277 : if (CPLHaveRuntimeAVX2())
3187 : {
3188 10277 : GDALCopyWordsInt32ToUInt16_AVX2(pSrcData, pDstData, nWordCount);
3189 10277 : return;
3190 : }
3191 : #endif
3192 0 : decltype(nWordCount) n = 0;
3193 0 : for (; n < nWordCount - 15; n += 16)
3194 : {
3195 0 : __m128i v0 = _mm_loadu_si128(
3196 0 : reinterpret_cast<const __m128i *>(pSrcData + n));
3197 0 : __m128i v1 = _mm_loadu_si128(
3198 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
3199 0 : __m128i v2 = _mm_loadu_si128(
3200 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
3201 0 : __m128i v3 = _mm_loadu_si128(
3202 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 12));
3203 0 : const auto packed_lo = GDAL_mm_packus_epi32(v0, v1);
3204 0 : const auto packed_hi = GDAL_mm_packus_epi32(v2, v3);
3205 0 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n),
3206 : packed_lo);
3207 0 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
3208 : packed_hi);
3209 : }
3210 : #if defined(__clang__)
3211 : #pragma clang loop vectorize(disable)
3212 : #endif
3213 0 : for (; n < nWordCount; n++)
3214 : {
3215 0 : pDstData[n] =
3216 0 : static_cast<uint16_t>(std::clamp(pSrcData[n], 0, 65535));
3217 0 : }
3218 : }
3219 : else
3220 : {
3221 45 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
3222 : nDstPixelStride, nWordCount);
3223 : }
3224 : }
3225 :
3226 : // ---- int32 -> int16 with clamping to [-32768, 32767] ----
3227 : template <>
3228 98 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
3229 : int nSrcPixelStride,
3230 : int16_t *const CPL_RESTRICT pDstData,
3231 : int nDstPixelStride, GPtrDiff_t nWordCount)
3232 : {
3233 98 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
3234 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
3235 : {
3236 : // SSE2 path: 16 pixels per iteration
3237 43 : decltype(nWordCount) n = 0;
3238 268 : for (; n < nWordCount - 15; n += 16)
3239 : {
3240 450 : __m128i v0 = _mm_loadu_si128(
3241 225 : reinterpret_cast<const __m128i *>(pSrcData + n));
3242 450 : __m128i v1 = _mm_loadu_si128(
3243 225 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
3244 450 : __m128i v2 = _mm_loadu_si128(
3245 225 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
3246 225 : __m128i v3 = _mm_loadu_si128(
3247 225 : reinterpret_cast<const __m128i *>(pSrcData + n + 12));
3248 : // Pack int32->int16 with signed saturation to [-32768,32767] range
3249 225 : __m128i packed_lo = _mm_packs_epi32(v0, v1);
3250 225 : __m128i packed_hi = _mm_packs_epi32(v2, v3);
3251 225 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n),
3252 : packed_lo);
3253 225 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
3254 : packed_hi);
3255 : }
3256 : #if defined(__clang__)
3257 : #pragma clang loop vectorize(disable)
3258 : #endif
3259 191 : for (; n < nWordCount; n++)
3260 : {
3261 148 : pDstData[n] =
3262 148 : static_cast<int16_t>(std::clamp(pSrcData[n], -32768, 32767));
3263 43 : }
3264 : }
3265 : else
3266 : {
3267 55 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
3268 : nDstPixelStride, nWordCount);
3269 : }
3270 98 : }
3271 :
3272 : // ---- int16 -> uint8 with clamping to [0, 255] ----
3273 : template <>
3274 17428 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
3275 : int nSrcPixelStride,
3276 : uint8_t *const CPL_RESTRICT pDstData,
3277 : int nDstPixelStride, GPtrDiff_t nWordCount)
3278 : {
3279 17428 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
3280 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
3281 : {
3282 : // SSE2 path: 32 pixels per iteration
3283 17338 : decltype(nWordCount) n = 0;
3284 85649 : for (; n < nWordCount - 31; n += 32)
3285 : {
3286 136622 : __m128i v0 = _mm_loadu_si128(
3287 68311 : reinterpret_cast<const __m128i *>(pSrcData + n));
3288 136622 : __m128i v1 = _mm_loadu_si128(
3289 68311 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
3290 136622 : __m128i v2 = _mm_loadu_si128(
3291 68311 : reinterpret_cast<const __m128i *>(pSrcData + n + 16));
3292 68311 : __m128i v3 = _mm_loadu_si128(
3293 68311 : reinterpret_cast<const __m128i *>(pSrcData + n + 24));
3294 : // Pack int16->uint8 with unsigned saturation to [0, 255] range
3295 68311 : __m128i packed_lo = _mm_packus_epi16(v0, v1);
3296 68311 : __m128i packed_hi = _mm_packus_epi16(v2, v3);
3297 68311 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n),
3298 : packed_lo);
3299 68311 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
3300 : packed_hi);
3301 : }
3302 : #if defined(__clang__)
3303 : #pragma clang loop vectorize(disable)
3304 : #endif
3305 214741 : for (; n < nWordCount; n++)
3306 : {
3307 197403 : pDstData[n] =
3308 197403 : static_cast<uint8_t>(std::clamp<int>(pSrcData[n], 0, 255));
3309 17338 : }
3310 : }
3311 : else
3312 : {
3313 90 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
3314 : nDstPixelStride, nWordCount);
3315 : }
3316 17428 : }
3317 :
3318 : #endif // HAVE_SSE2
3319 :
3320 : template <>
3321 4437370 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3322 : int nSrcPixelStride,
3323 : GByte *const CPL_RESTRICT pDstData,
3324 : int nDstPixelStride, GPtrDiff_t nWordCount)
3325 : {
3326 4437370 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3327 : nDstPixelStride, nWordCount);
3328 4437370 : }
3329 :
3330 : template <>
3331 38394 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3332 : int nSrcPixelStride,
3333 : GUInt16 *const CPL_RESTRICT pDstData,
3334 : int nDstPixelStride, GPtrDiff_t nWordCount)
3335 : {
3336 38394 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3337 : nDstPixelStride, nWordCount);
3338 38394 : }
3339 :
3340 : template <>
3341 55939 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3342 : int nSrcPixelStride,
3343 : double *const CPL_RESTRICT pDstData,
3344 : int nDstPixelStride, GPtrDiff_t nWordCount)
3345 : {
3346 55939 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3347 : nDstPixelStride, nWordCount);
3348 55939 : }
3349 :
// Explicit specialization of GDALCopyWordsT for double -> float.
// Forwards every argument unchanged to GDALCopyWordsT_8atatime(), a helper
// defined outside this excerpt.
3350 : template <>
3351 122875 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3352 : int nSrcPixelStride,
3353 : float *const CPL_RESTRICT pDstData,
3354 : int nDstPixelStride, GPtrDiff_t nWordCount)
3355 : {
3356 122875 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3357 : nDstPixelStride, nWordCount);
3358 122875 : }
3359 :
// Explicit specialization of GDALCopyWordsT for GFloat16 -> float.
// Forwards every argument unchanged to GDALCopyWordsT_8atatime(), a helper
// defined outside this excerpt.
3360 : template <>
3361 412 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
3362 : int nSrcPixelStride,
3363 : float *const CPL_RESTRICT pDstData,
3364 : int nDstPixelStride, GPtrDiff_t nWordCount)
3365 : {
3366 412 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3367 : nDstPixelStride, nWordCount);
3368 412 : }
3369 :
// Explicit specialization of GDALCopyWordsT for GFloat16 -> double.
// Forwards every argument unchanged to GDALCopyWordsT_8atatime(), a helper
// defined outside this excerpt.
3370 : template <>
3371 544 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
3372 : int nSrcPixelStride,
3373 : double *const CPL_RESTRICT pDstData,
3374 : int nDstPixelStride, GPtrDiff_t nWordCount)
3375 : {
3376 544 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3377 : nDstPixelStride, nWordCount);
3378 544 : }
3379 :
// Explicit specialization of GDALCopyWordsT for float -> GByte.
// Forwards every argument unchanged to GDALCopyWordsT_8atatime(), a helper
// defined outside this excerpt.
3380 : template <>
3381 324215 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3382 : int nSrcPixelStride,
3383 : GByte *const CPL_RESTRICT pDstData,
3384 : int nDstPixelStride, GPtrDiff_t nWordCount)
3385 : {
3386 324215 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3387 : nDstPixelStride, nWordCount);
3388 324215 : }
3389 :
// Explicit specialization of GDALCopyWordsT for float -> GInt8.
// Forwards every argument unchanged to GDALCopyWordsT_8atatime(), a helper
// defined outside this excerpt.
3390 : template <>
3391 61 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3392 : int nSrcPixelStride,
3393 : GInt8 *const CPL_RESTRICT pDstData,
3394 : int nDstPixelStride, GPtrDiff_t nWordCount)
3395 : {
3396 61 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3397 : nDstPixelStride, nWordCount);
3398 61 : }
3399 :
// Explicit specialization of GDALCopyWordsT for float -> GInt16.
// Forwards every argument unchanged to GDALCopyWordsT_8atatime(), a helper
// defined outside this excerpt.
3400 : template <>
3401 15791 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3402 : int nSrcPixelStride,
3403 : GInt16 *const CPL_RESTRICT pDstData,
3404 : int nDstPixelStride, GPtrDiff_t nWordCount)
3405 : {
3406 15791 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3407 : nDstPixelStride, nWordCount);
3408 15791 : }
3409 :
// Explicit specialization of GDALCopyWordsT for float -> GUInt16.
// Forwards every argument unchanged to GDALCopyWordsT_8atatime(), a helper
// defined outside this excerpt.
3410 : template <>
3411 61719 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3412 : int nSrcPixelStride,
3413 : GUInt16 *const CPL_RESTRICT pDstData,
3414 : int nDstPixelStride, GPtrDiff_t nWordCount)
3415 : {
3416 61719 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3417 : nDstPixelStride, nWordCount);
3418 61719 : }
3419 :
// Explicit specialization of GDALCopyWordsT for float -> GInt32.
// Forwards every argument unchanged to GDALCopyWordsT_8atatime(), a helper
// defined outside this excerpt.
3420 : template <>
3421 43991 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3422 : int nSrcPixelStride,
3423 : GInt32 *const CPL_RESTRICT pDstData,
3424 : int nDstPixelStride, GPtrDiff_t nWordCount)
3425 : {
3426 43991 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3427 : nDstPixelStride, nWordCount);
3428 43991 : }
3429 :
// Explicit specialization of GDALCopyWordsT for float -> GFloat16.
// Forwards every argument unchanged to GDALCopyWordsT_8atatime(), a helper
// defined outside this excerpt.
3430 : template <>
3431 72 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3432 : int nSrcPixelStride,
3433 : GFloat16 *const CPL_RESTRICT pDstData,
3434 : int nDstPixelStride, GPtrDiff_t nWordCount)
3435 : {
3436 72 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3437 : nDstPixelStride, nWordCount);
3438 72 : }
3439 :
// Explicit specialization of GDALCopyWordsT for double -> GFloat16.
// Forwards every argument unchanged to GDALCopyWordsT_8atatime(), a helper
// defined outside this excerpt.
3440 : template <>
3441 63 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3442 : int nSrcPixelStride,
3443 : GFloat16 *const CPL_RESTRICT pDstData,
3444 : int nDstPixelStride, GPtrDiff_t nWordCount)
3445 : {
3446 63 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3447 : nDstPixelStride, nWordCount);
3448 63 : }
3449 :
3450 : /************************************************************************/
3451 : /* GDALCopyWordsComplexT() */
3452 : /************************************************************************/
3453 : /**
3454 : * Template function, used to copy data from pSrcData into buffer
3455 : * pDstData, with stride nSrcPixelStride in the source data and
3456 : * stride nDstPixelStride in the destination data. Deals with the
3457 : * complex case, where input is complex and output is complex.
3458 : *
3459 : * @param pSrcData the source data buffer
3460 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
3461 : * of interest.
3462 : * @param pDstData the destination buffer.
3463 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
3464 : * interest.
3465 : * @param nWordCount the total number of pixel words to copy
3466 : *
3467 : */
3468 : template <class Tin, class Tout>
3469 98788 : inline void GDALCopyWordsComplexT(const Tin *const CPL_RESTRICT pSrcData,
3470 : int nSrcPixelStride,
3471 : Tout *const CPL_RESTRICT pDstData,
3472 : int nDstPixelStride, GPtrDiff_t nWordCount)
3473 : {
3474 98788 : decltype(nWordCount) nDstOffset = 0;
3475 98788 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
3476 98788 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
3477 :
3478 5631239 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
3479 : {
3480 5532446 : const Tin *const pPixelIn =
3481 5532446 : reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
3482 5532446 : Tout *const pPixelOut =
3483 5532446 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
3484 :
3485 5532446 : GDALCopyWord(pPixelIn[0], pPixelOut[0]);
3486 5532446 : GDALCopyWord(pPixelIn[1], pPixelOut[1]);
3487 :
3488 5532446 : nDstOffset += nDstPixelStride;
3489 : }
3490 98788 : }
3491 :
3492 : /************************************************************************/
3493 : /* GDALCopyWordsComplexOutT() */
3494 : /************************************************************************/
3495 : /**
3496 : * Template function, used to copy data from pSrcData into buffer
3497 : * pDstData, with stride nSrcPixelStride in the source data and
3498 : * stride nDstPixelStride in the destination data. Deals with the
3499 : * case where the value is real coming in, but complex going out.
3500 : *
3501 : * @param pSrcData the source data buffer
3502 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
3503 : * of interest, in bytes.
3504 : * @param pDstData the destination buffer.
3505 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
3506 : * interest, in bytes.
3507 : * @param nWordCount the total number of pixel words to copy
3508 : *
3509 : */
3510 : template <class Tin, class Tout>
3511 4778 : inline void GDALCopyWordsComplexOutT(const Tin *const CPL_RESTRICT pSrcData,
3512 : int nSrcPixelStride,
3513 : Tout *const CPL_RESTRICT pDstData,
3514 : int nDstPixelStride, GPtrDiff_t nWordCount)
3515 : {
3516 4778 : decltype(nWordCount) nDstOffset = 0;
3517 :
3518 4778 : const Tout tOutZero = static_cast<Tout>(0);
3519 :
3520 4778 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
3521 4778 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
3522 :
3523 1190456 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
3524 : {
3525 1185678 : const Tin tValue =
3526 1185678 : *reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
3527 1185678 : Tout *const pPixelOut =
3528 1185678 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
3529 1185678 : GDALCopyWord(tValue, *pPixelOut);
3530 :
3531 1185678 : pPixelOut[1] = tOutZero;
3532 :
3533 1185678 : nDstOffset += nDstPixelStride;
3534 : }
3535 4778 : }
3536 :
3537 : /************************************************************************/
3538 : /* GDALCopyWordsFromT() */
3539 : /************************************************************************/
3540 : /**
3541 : * Template driver function. Given the input type T, call the appropriate
3542 : * GDALCopyWordsT function template for the desired output type. You should
3543 : * never call this function directly (call GDALCopyWords instead).
3544 : *
3545 : * @param pSrcData source data buffer
3546 : * @param nSrcPixelStride pixel stride in input buffer, in pixel words
3547 : * @param bInComplex input is complex
3548 : * @param pDstData destination data buffer
3549 : * @param eDstType destination data type
3550 : * @param nDstPixelStride pixel stride in output buffer, in pixel words
3551 : * @param nWordCount number of pixel words to be copied
3552 : */
3553 : template <class T>
3554 61314995 : inline void GDALCopyWordsFromT(const T *const CPL_RESTRICT pSrcData,
3555 : int nSrcPixelStride, bool bInComplex,
3556 : void *CPL_RESTRICT pDstData,
3557 : GDALDataType eDstType, int nDstPixelStride,
3558 : GPtrDiff_t nWordCount)
3559 : {
3560 61314995 : switch (eDstType)
3561 : {
3562 4805731 : case GDT_UInt8:
3563 4805731 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3564 : static_cast<unsigned char *>(pDstData),
3565 : nDstPixelStride, nWordCount);
3566 4805731 : break;
3567 1903 : case GDT_Int8:
3568 1903 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3569 : static_cast<signed char *>(pDstData),
3570 : nDstPixelStride, nWordCount);
3571 1903 : break;
3572 1143791 : case GDT_UInt16:
3573 1143791 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3574 : static_cast<unsigned short *>(pDstData),
3575 : nDstPixelStride, nWordCount);
3576 1143791 : break;
3577 4162744 : case GDT_Int16:
3578 4162744 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3579 : static_cast<short *>(pDstData), nDstPixelStride,
3580 : nWordCount);
3581 4162744 : break;
3582 23348 : case GDT_UInt32:
3583 23348 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3584 : static_cast<unsigned int *>(pDstData),
3585 : nDstPixelStride, nWordCount);
3586 23348 : break;
3587 29460959 : case GDT_Int32:
3588 29460959 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3589 : static_cast<int *>(pDstData), nDstPixelStride,
3590 : nWordCount);
3591 29460959 : break;
3592 1262 : case GDT_UInt64:
3593 1262 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3594 : static_cast<std::uint64_t *>(pDstData),
3595 : nDstPixelStride, nWordCount);
3596 1262 : break;
3597 6209 : case GDT_Int64:
3598 6209 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3599 : static_cast<std::int64_t *>(pDstData),
3600 : nDstPixelStride, nWordCount);
3601 6209 : break;
3602 999 : case GDT_Float16:
3603 999 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3604 : static_cast<GFloat16 *>(pDstData), nDstPixelStride,
3605 : nWordCount);
3606 999 : break;
3607 4216100 : case GDT_Float32:
3608 4216100 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3609 : static_cast<float *>(pDstData), nDstPixelStride,
3610 : nWordCount);
3611 4216100 : break;
3612 17388255 : case GDT_Float64:
3613 17388255 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3614 : static_cast<double *>(pDstData), nDstPixelStride,
3615 : nWordCount);
3616 17388255 : break;
3617 94432 : case GDT_CInt16:
3618 94432 : if (bInComplex)
3619 : {
3620 93170 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3621 : static_cast<short *>(pDstData),
3622 : nDstPixelStride, nWordCount);
3623 : }
3624 : else // input is not complex, so we need to promote to a complex
3625 : // buffer
3626 : {
3627 1262 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3628 : static_cast<short *>(pDstData),
3629 : nDstPixelStride, nWordCount);
3630 : }
3631 94432 : break;
3632 1357 : case GDT_CInt32:
3633 1357 : if (bInComplex)
3634 : {
3635 717 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3636 : static_cast<int *>(pDstData),
3637 : nDstPixelStride, nWordCount);
3638 : }
3639 : else // input is not complex, so we need to promote to a complex
3640 : // buffer
3641 : {
3642 640 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3643 : static_cast<int *>(pDstData),
3644 : nDstPixelStride, nWordCount);
3645 : }
3646 1357 : break;
3647 313 : case GDT_CFloat16:
3648 313 : if (bInComplex)
3649 : {
3650 48 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3651 : static_cast<GFloat16 *>(pDstData),
3652 : nDstPixelStride, nWordCount);
3653 : }
3654 : else // input is not complex, so we need to promote to a complex
3655 : // buffer
3656 : {
3657 265 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3658 : static_cast<GFloat16 *>(pDstData),
3659 : nDstPixelStride, nWordCount);
3660 : }
3661 313 : break;
3662 3924 : case GDT_CFloat32:
3663 3924 : if (bInComplex)
3664 : {
3665 3115 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3666 : static_cast<float *>(pDstData),
3667 : nDstPixelStride, nWordCount);
3668 : }
3669 : else // input is not complex, so we need to promote to a complex
3670 : // buffer
3671 : {
3672 809 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3673 : static_cast<float *>(pDstData),
3674 : nDstPixelStride, nWordCount);
3675 : }
3676 3924 : break;
3677 3540 : case GDT_CFloat64:
3678 3540 : if (bInComplex)
3679 : {
3680 1738 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3681 : static_cast<double *>(pDstData),
3682 : nDstPixelStride, nWordCount);
3683 : }
3684 : else // input is not complex, so we need to promote to a complex
3685 : // buffer
3686 : {
3687 1802 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3688 : static_cast<double *>(pDstData),
3689 : nDstPixelStride, nWordCount);
3690 : }
3691 3540 : break;
3692 0 : case GDT_Unknown:
3693 : case GDT_TypeCount:
3694 0 : CPLAssert(false);
3695 : }
3696 61314995 : }
3697 :
3698 : } // end anonymous namespace
3699 :
3700 : /************************************************************************/
3701 : /* GDALReplicateWord() */
3702 : /************************************************************************/
3703 :
3704 : template <class T>
3705 600463 : inline void GDALReplicateWordT(void *pDstData, int nDstPixelStride,
3706 : GPtrDiff_t nWordCount)
3707 : {
3708 600463 : const T valSet = *static_cast<const T *>(pDstData);
3709 600463 : if (nDstPixelStride == static_cast<int>(sizeof(T)))
3710 : {
3711 570649 : T *pDstPtr = static_cast<T *>(pDstData) + 1;
3712 32018999 : while (nWordCount >= 4)
3713 : {
3714 31448340 : nWordCount -= 4;
3715 31448340 : pDstPtr[0] = valSet;
3716 31448340 : pDstPtr[1] = valSet;
3717 31448340 : pDstPtr[2] = valSet;
3718 31448340 : pDstPtr[3] = valSet;
3719 31448340 : pDstPtr += 4;
3720 : }
3721 1476843 : while (nWordCount > 0)
3722 : {
3723 906194 : --nWordCount;
3724 906194 : *pDstPtr = valSet;
3725 906194 : pDstPtr++;
3726 : }
3727 : }
3728 : else
3729 : {
3730 29814 : GByte *pabyDstPtr = static_cast<GByte *>(pDstData) + nDstPixelStride;
3731 1106520 : while (nWordCount > 0)
3732 : {
3733 1076706 : --nWordCount;
3734 1076706 : *reinterpret_cast<T *>(pabyDstPtr) = valSet;
3735 1076706 : pabyDstPtr += nDstPixelStride;
3736 : }
3737 : }
3738 600463 : }
3739 :
3740 1080460 : static void GDALReplicateWord(const void *CPL_RESTRICT pSrcData,
3741 : GDALDataType eSrcType,
3742 : void *CPL_RESTRICT pDstData,
3743 : GDALDataType eDstType, int nDstPixelStride,
3744 : GPtrDiff_t nWordCount)
3745 : {
3746 : /* -----------------------------------------------------------------------
3747 : */
3748 : /* Special case when the source data is always the same value */
3749 : /* (for VRTSourcedRasterBand::IRasterIO and
3750 : * VRTDerivedRasterBand::IRasterIO*/
3751 : /* for example) */
3752 : /* -----------------------------------------------------------------------
3753 : */
3754 : // Let the general translation case do the necessary conversions
3755 : // on the first destination element.
3756 1080460 : GDALCopyWords64(pSrcData, eSrcType, 0, pDstData, eDstType, 0, 1);
3757 :
3758 : // Now copy the first element to the nWordCount - 1 following destination
3759 : // elements.
3760 1080460 : nWordCount--;
3761 1080460 : GByte *pabyDstWord = reinterpret_cast<GByte *>(pDstData) + nDstPixelStride;
3762 :
3763 1080460 : switch (eDstType)
3764 : {
3765 479910 : case GDT_UInt8:
3766 : case GDT_Int8:
3767 : {
3768 479910 : if (nDstPixelStride == 1)
3769 : {
3770 369977 : if (nWordCount > 0)
3771 369977 : memset(pabyDstWord,
3772 369977 : *reinterpret_cast<const GByte *>(pDstData),
3773 : nWordCount);
3774 : }
3775 : else
3776 : {
3777 109933 : GByte valSet = *reinterpret_cast<const GByte *>(pDstData);
3778 72866900 : while (nWordCount > 0)
3779 : {
3780 72757000 : --nWordCount;
3781 72757000 : *pabyDstWord = valSet;
3782 72757000 : pabyDstWord += nDstPixelStride;
3783 : }
3784 : }
3785 479910 : break;
3786 : }
3787 :
3788 : #define CASE_DUPLICATE_SIMPLE(enum_type, c_type) \
3789 : case enum_type: \
3790 : { \
3791 : GDALReplicateWordT<c_type>(pDstData, nDstPixelStride, nWordCount); \
3792 : break; \
3793 : }
3794 :
3795 34514 : CASE_DUPLICATE_SIMPLE(GDT_UInt16, GUInt16)
3796 202455 : CASE_DUPLICATE_SIMPLE(GDT_Int16, GInt16)
3797 74 : CASE_DUPLICATE_SIMPLE(GDT_UInt32, GUInt32)
3798 301585 : CASE_DUPLICATE_SIMPLE(GDT_Int32, GInt32)
3799 41 : CASE_DUPLICATE_SIMPLE(GDT_UInt64, std::uint64_t)
3800 1072 : CASE_DUPLICATE_SIMPLE(GDT_Int64, std::int64_t)
3801 2 : CASE_DUPLICATE_SIMPLE(GDT_Float16, GFloat16)
3802 52861 : CASE_DUPLICATE_SIMPLE(GDT_Float32, float)
3803 7859 : CASE_DUPLICATE_SIMPLE(GDT_Float64, double)
3804 :
3805 : #define CASE_DUPLICATE_COMPLEX(enum_type, c_type) \
3806 : case enum_type: \
3807 : { \
3808 : c_type valSet1 = reinterpret_cast<const c_type *>(pDstData)[0]; \
3809 : c_type valSet2 = reinterpret_cast<const c_type *>(pDstData)[1]; \
3810 : while (nWordCount > 0) \
3811 : { \
3812 : --nWordCount; \
3813 : reinterpret_cast<c_type *>(pabyDstWord)[0] = valSet1; \
3814 : reinterpret_cast<c_type *>(pabyDstWord)[1] = valSet2; \
3815 : pabyDstWord += nDstPixelStride; \
3816 : } \
3817 : break; \
3818 : }
3819 :
3820 784 : CASE_DUPLICATE_COMPLEX(GDT_CInt16, GInt16)
3821 784 : CASE_DUPLICATE_COMPLEX(GDT_CInt32, GInt32)
3822 6 : CASE_DUPLICATE_COMPLEX(GDT_CFloat16, GFloat16)
3823 790 : CASE_DUPLICATE_COMPLEX(GDT_CFloat32, float)
3824 790 : CASE_DUPLICATE_COMPLEX(GDT_CFloat64, double)
3825 :
3826 0 : case GDT_Unknown:
3827 : case GDT_TypeCount:
3828 0 : CPLAssert(false);
3829 : }
3830 1080460 : }
3831 :
3832 : /************************************************************************/
3833 : /* GDALUnrolledCopy() */
3834 : /************************************************************************/
3835 :
// Generic strided copy of nIters elements of type T.
// srcStride and dstStride are compile-time strides expressed in elements
// of T (they multiply indices into pSrc / pDest).
// Two code shapes are selected by the preprocessor:
//  - when __GNUC__ and __AVX2__ are both defined, a single loop annotated
//    with "#pragma GCC unroll 4", in a function carrying the
//    optimize("tree-vectorize") attribute;
//  - otherwise, a loop manually unrolled 16 times followed by a scalar
//    loop for the remaining (nIters % 16) elements.
3836 : template <class T, int srcStride, int dstStride>
3837 : #if defined(__GNUC__) && defined(__AVX2__)
3838 : __attribute__((optimize("tree-vectorize")))
3839 : #endif
3840 3057256 : static inline void GDALUnrolledCopyGeneric(T *CPL_RESTRICT pDest,
3841 : const T *CPL_RESTRICT pSrc,
3842 : GPtrDiff_t nIters)
3843 : {
3844 : #if !(defined(__GNUC__) && defined(__AVX2__))
3845 3057256 : if (nIters >= 16)
3846 : {
// Each pass copies 16 elements, then advances both pointers by 16 strides.
3847 133760088 : for (GPtrDiff_t i = nIters / 16; i != 0; i--)
3848 : {
3849 130823413 : pDest[0 * dstStride] = pSrc[0 * srcStride];
3850 130823413 : pDest[1 * dstStride] = pSrc[1 * srcStride];
3851 130823413 : pDest[2 * dstStride] = pSrc[2 * srcStride];
3852 130823413 : pDest[3 * dstStride] = pSrc[3 * srcStride];
3853 130823413 : pDest[4 * dstStride] = pSrc[4 * srcStride];
3854 130823413 : pDest[5 * dstStride] = pSrc[5 * srcStride];
3855 130823413 : pDest[6 * dstStride] = pSrc[6 * srcStride];
3856 130823413 : pDest[7 * dstStride] = pSrc[7 * srcStride];
3857 130823413 : pDest[8 * dstStride] = pSrc[8 * srcStride];
3858 130823413 : pDest[9 * dstStride] = pSrc[9 * srcStride];
3859 130823413 : pDest[10 * dstStride] = pSrc[10 * srcStride];
3860 130823413 : pDest[11 * dstStride] = pSrc[11 * srcStride];
3861 130823413 : pDest[12 * dstStride] = pSrc[12 * srcStride];
3862 130823413 : pDest[13 * dstStride] = pSrc[13 * srcStride];
3863 130823413 : pDest[14 * dstStride] = pSrc[14 * srcStride];
3864 130823413 : pDest[15 * dstStride] = pSrc[15 * srcStride];
3865 130823413 : pDest += 16 * dstStride;
3866 130823413 : pSrc += 16 * srcStride;
3867 : }
// Only the leftover elements remain for the loop below.
3868 2936688 : nIters = nIters % 16;
3869 : }
3870 : #else
3871 : #pragma GCC unroll 4
3872 : #endif
// Scalar loop: the destination is indexed, the source pointer is advanced.
3873 5217496 : for (GPtrDiff_t i = 0; i < nIters; i++)
3874 : {
3875 2160243 : pDest[i * dstStride] = *pSrc;
3876 2160243 : pSrc += srcStride;
3877 : }
3878 3057256 : }
3879 :
// Primary template of GDALUnrolledCopy: forwards to
// GDALUnrolledCopyGeneric with the same template and function arguments.
// Explicit specializations for particular <T, srcStride, dstStride>
// combinations follow in this file.
3880 : template <class T, int srcStride, int dstStride>
3881 3057256 : static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
3882 : const T *CPL_RESTRICT pSrc,
3883 : GPtrDiff_t nIters)
3884 : {
3885 3057256 : GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
3886 3057256 : }
3887 :
3888 : #if defined(__AVX2__) && defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
3889 : (defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS))
3890 :
3891 : template <>
3892 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
3893 : const GByte *CPL_RESTRICT pSrc,
3894 : GPtrDiff_t nIters)
3895 : {
3896 : if (nIters > 16)
3897 : {
3898 : // The SSSE3 variant is slightly faster than what the gcc autovectorizer
3899 : // generates
3900 : GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
3901 : }
3902 : else
3903 : {
3904 : for (GPtrDiff_t i = 0; i < nIters; i++)
3905 : {
3906 : pDest[i] = *pSrc;
3907 : pSrc += 3;
3908 : }
3909 : }
3910 : }
3911 :
3912 : #elif defined(HAVE_SSE2) && !(defined(__GNUC__) && defined(__AVX2__))
3913 :
3914 : template <>
3915 355218 : void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
3916 : const GByte *CPL_RESTRICT pSrc,
3917 : GPtrDiff_t nIters)
3918 : {
3919 355218 : decltype(nIters) i = 0;
3920 355218 : if (nIters > 16)
3921 : {
3922 195691 : const __m128i xmm_mask = _mm_set1_epi16(0xff);
3923 : // If we were sure that there would always be 1 trailing byte, we could
3924 : // check against nIters - 15
3925 3004490 : for (; i < nIters - 16; i += 16)
3926 : {
3927 : __m128i xmm0 =
3928 2808800 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
3929 : __m128i xmm1 =
3930 5617610 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
3931 : // Set higher 8bit of each int16 packed word to 0
3932 2808800 : xmm0 = _mm_and_si128(xmm0, xmm_mask);
3933 2808800 : xmm1 = _mm_and_si128(xmm1, xmm_mask);
3934 : // Pack int16 to uint8 and merge back both vector
3935 2808800 : xmm0 = _mm_packus_epi16(xmm0, xmm1);
3936 :
3937 : // Store result
3938 2808800 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
3939 :
3940 2808800 : pSrc += 2 * 16;
3941 : }
3942 : }
3943 4651210 : for (; i < nIters; i++)
3944 : {
3945 4295990 : pDest[i] = *pSrc;
3946 4295990 : pSrc += 2;
3947 : }
3948 355218 : }
3949 :
// SSE2-only extraction of every third byte of pSrc into the packed buffer
// pDest (nIters output bytes).
// Each pass of the vector loop loads 48 source bytes into xmm0/xmm1/xmm2
// and assembles 16 output bytes with byte shifts and single-byte masks:
// output bytes 0..5 come from xmm0, 6..10 from xmm1 and 11..15 from xmm2.
// The scalar loop at the end handles the remaining elements.
// NOTE(review): there is no explicit nIters > 16 test here; the vector loop
// is skipped for small nIters only if GPtrDiff_t is signed - confirm.
3950 1 : static void GDALUnrolledCopy_GByte_3_1_SSE2(GByte *CPL_RESTRICT pDest,
3951 : const GByte *CPL_RESTRICT pSrc,
3952 : GPtrDiff_t nIters)
3953 : {
3954 1 : decltype(nIters) i = 0;
// Mask selecting byte 0 only; shifted copies of it select other bytes.
3955 1 : const __m128i xmm_mask_ori = _mm_set_epi32(0, 0, 0, 255);
3956 : // If we were sure that there would always be 2 trailing bytes, we could
3957 : // check against nIters - 15
3958 2 : for (; i < nIters - 16; i += 16)
3959 : {
3960 : __m128i xmm0 =
3961 1 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
3962 : __m128i xmm1 =
3963 1 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
3964 : __m128i xmm2 =
3965 1 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
3966 :
// mask0 starts at output byte 0, mask1 at byte 6, mask2 at byte 11.
3967 1 : auto xmm_mask0 = xmm_mask_ori;
3968 1 : auto xmm_mask1 = _mm_slli_si128(xmm_mask_ori, 6);
3969 1 : auto xmm_mask2 = _mm_slli_si128(xmm_mask_ori, 11);
3970 :
// Output byte 0 <- xmm0 byte 0; output byte 6 <- xmm1 byte 2.
3971 1 : auto xmm = _mm_and_si128(xmm0, xmm_mask0);
3972 1 : auto xmm_res1 = _mm_and_si128(_mm_slli_si128(xmm1, 4), xmm_mask1);
3973 :
// Each following step moves both masks one byte up; xmm0 is shifted right
// by 2 more bytes each time, xmm1 is re-shifted by the amount that aligns
// its next wanted byte with mask1.
3974 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3975 1 : xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
3976 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3977 1 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3978 2 : xmm_res1 = _mm_or_si128(
3979 : xmm_res1, _mm_and_si128(_mm_slli_si128(xmm1, 2), xmm_mask1));
3980 :
3981 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3982 1 : xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
3983 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3984 2 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3985 1 : xmm_res1 = _mm_or_si128(xmm_res1, _mm_and_si128(xmm1, xmm_mask1));
3986 :
3987 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3988 1 : xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
3989 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3990 1 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3991 2 : xmm_res1 = _mm_or_si128(
3992 : xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 2), xmm_mask1));
3993 :
3994 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3995 1 : xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
3996 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3997 1 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3998 3 : xmm_res1 = _mm_or_si128(
3999 : xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 4), xmm_mask1));
// The five bytes gathered from xmm1 (output bytes 6..10) are merged in.
4000 1 : xmm = _mm_or_si128(xmm, xmm_res1);
4001 :
// Last byte taken from xmm0 (output byte 5).
4002 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
4003 1 : xmm0 = _mm_srli_si128(xmm0, 2);
4004 1 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
4005 :
// Output bytes 11..15 <- xmm2 bytes 1, 4, 7, 10, 13.
4006 2 : xmm = _mm_or_si128(xmm,
4007 : _mm_and_si128(_mm_slli_si128(xmm2, 10), xmm_mask2));
4008 :
4009 1 : xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
4010 2 : xmm = _mm_or_si128(xmm,
4011 : _mm_and_si128(_mm_slli_si128(xmm2, 8), xmm_mask2));
4012 :
4013 1 : xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
4014 2 : xmm = _mm_or_si128(xmm,
4015 : _mm_and_si128(_mm_slli_si128(xmm2, 6), xmm_mask2));
4016 :
4017 1 : xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
4018 2 : xmm = _mm_or_si128(xmm,
4019 : _mm_and_si128(_mm_slli_si128(xmm2, 4), xmm_mask2));
4020 :
4021 1 : xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
4022 2 : xmm = _mm_or_si128(xmm,
4023 : _mm_and_si128(_mm_slli_si128(xmm2, 2), xmm_mask2));
4024 :
4025 1 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm);
4026 :
// 16 output bytes consumed 48 source bytes.
4027 1 : pSrc += 3 * 16;
4028 : }
// Scalar copy of the elements not handled by the vector loop.
4029 2 : for (; i < nIters; i++)
4030 : {
4031 1 : pDest[i] = *pSrc;
4032 1 : pSrc += 3;
4033 : }
4034 1 : }
4035 :
4036 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
4037 :
4038 : template <>
4039 193575 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
4040 : const GByte *CPL_RESTRICT pSrc,
4041 : GPtrDiff_t nIters)
4042 : {
4043 193575 : if (nIters > 16)
4044 : {
4045 187452 : if (CPLHaveRuntimeSSSE3())
4046 : {
4047 187451 : GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
4048 : }
4049 : else
4050 : {
4051 1 : GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
4052 : }
4053 : }
4054 : else
4055 : {
4056 20384 : for (GPtrDiff_t i = 0; i < nIters; i++)
4057 : {
4058 14261 : pDest[i] = *pSrc;
4059 14261 : pSrc += 3;
4060 : }
4061 : }
4062 193575 : }
4063 :
4064 : #else
4065 :
// Variant used when HAVE_SSSE3_AT_COMPILE_TIME is not defined: always
// forwards to the SSE2 implementation, whatever the value of nIters.
4066 : template <>
4067 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
4068 : const GByte *CPL_RESTRICT pSrc,
4069 : GPtrDiff_t nIters)
4070 : {
4071 : GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
4072 : }
4073 : #endif
4074 :
4075 : template <>
4076 332696 : void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
4077 : const GByte *CPL_RESTRICT pSrc,
4078 : GPtrDiff_t nIters)
4079 : {
4080 332696 : decltype(nIters) i = 0;
4081 332696 : if (nIters > 16)
4082 : {
4083 327399 : const __m128i xmm_mask = _mm_set1_epi32(0xff);
4084 : // If we were sure that there would always be 3 trailing bytes, we could
4085 : // check against nIters - 15
4086 28186800 : for (; i < nIters - 16; i += 16)
4087 : {
4088 : __m128i xmm0 =
4089 27859400 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
4090 : __m128i xmm1 =
4091 27859400 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
4092 : __m128i xmm2 =
4093 27859400 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
4094 : __m128i xmm3 =
4095 55718900 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 48));
4096 : // Set higher 24bit of each int32 packed word to 0
4097 27859400 : xmm0 = _mm_and_si128(xmm0, xmm_mask);
4098 27859400 : xmm1 = _mm_and_si128(xmm1, xmm_mask);
4099 27859400 : xmm2 = _mm_and_si128(xmm2, xmm_mask);
4100 27859400 : xmm3 = _mm_and_si128(xmm3, xmm_mask);
4101 : // Pack int32 to int16
4102 27859400 : xmm0 = _mm_packs_epi32(xmm0, xmm1);
4103 27859400 : xmm2 = _mm_packs_epi32(xmm2, xmm3);
4104 : // Pack int16 to uint8
4105 27859400 : xmm0 = _mm_packus_epi16(xmm0, xmm2);
4106 :
4107 : // Store result
4108 27859400 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
4109 :
4110 27859400 : pSrc += 4 * 16;
4111 : }
4112 : }
4113 5049340 : for (; i < nIters; i++)
4114 : {
4115 4716650 : pDest[i] = *pSrc;
4116 4716650 : pSrc += 4;
4117 : }
4118 332696 : }
4119 : #endif // HAVE_SSE2
4120 :
4121 : /************************************************************************/
4122 : /* GDALFastCopy() */
4123 : /************************************************************************/
4124 :
4125 : template <class T>
4126 40277300 : static inline void GDALFastCopy(T *CPL_RESTRICT pDest, int nDestStride,
4127 : const T *CPL_RESTRICT pSrc, int nSrcStride,
4128 : GPtrDiff_t nIters)
4129 : {
4130 40277300 : constexpr int sizeofT = static_cast<int>(sizeof(T));
4131 40277300 : if (nIters == 1)
4132 : {
4133 22545720 : *pDest = *pSrc;
4134 : }
4135 17731495 : else if (nDestStride == sizeofT)
4136 : {
4137 14601140 : if (nSrcStride == sizeofT)
4138 : {
4139 13509924 : memcpy(pDest, pSrc, nIters * sizeof(T));
4140 : }
4141 1091223 : else if (nSrcStride == 2 * sizeofT)
4142 : {
4143 358434 : GDALUnrolledCopy<T, 2, 1>(pDest, pSrc, nIters);
4144 : }
4145 732789 : else if (nSrcStride == 3 * sizeofT)
4146 : {
4147 290555 : GDALUnrolledCopy<T, 3, 1>(pDest, pSrc, nIters);
4148 : }
4149 442234 : else if (nSrcStride == 4 * sizeofT)
4150 : {
4151 336678 : GDALUnrolledCopy<T, 4, 1>(pDest, pSrc, nIters);
4152 : }
4153 : else
4154 : {
4155 17229290 : while (nIters-- > 0)
4156 : {
4157 17123750 : *pDest = *pSrc;
4158 17123750 : pSrc += nSrcStride / sizeofT;
4159 17123750 : pDest++;
4160 : }
4161 : }
4162 : }
4163 3130395 : else if (nSrcStride == sizeofT)
4164 : {
4165 3117389 : if (nDestStride == 2 * sizeofT)
4166 : {
4167 152788 : GDALUnrolledCopy<T, 1, 2>(pDest, pSrc, nIters);
4168 : }
4169 2964605 : else if (nDestStride == 3 * sizeofT)
4170 : {
4171 2136181 : GDALUnrolledCopy<T, 1, 3>(pDest, pSrc, nIters);
4172 : }
4173 828421 : else if (nDestStride == 4 * sizeofT)
4174 : {
4175 664109 : GDALUnrolledCopy<T, 1, 4>(pDest, pSrc, nIters);
4176 : }
4177 : else
4178 : {
4179 17169660 : while (nIters-- > 0)
4180 : {
4181 17005410 : *pDest = *pSrc;
4182 17005410 : pSrc++;
4183 17005410 : pDest += nDestStride / sizeofT;
4184 : }
4185 : }
4186 : }
4187 : else
4188 : {
4189 1220108 : while (nIters-- > 0)
4190 : {
4191 1207102 : *pDest = *pSrc;
4192 1207102 : pSrc += nSrcStride / sizeofT;
4193 1207102 : pDest += nDestStride / sizeofT;
4194 : }
4195 : }
4196 40277300 : }
4197 :
4198 : /************************************************************************/
4199 : /* GDALFastCopyByte() */
4200 : /************************************************************************/
4201 :
// Byte-typed entry point to GDALFastCopy(): same arguments, reordered to
// GDALFastCopy's (destination, destination stride, source, source stride,
// count) parameter order.
4202 326320 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
4203 : int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
4204 : int nDstPixelStride, GPtrDiff_t nWordCount)
4205 : {
4206 326320 : GDALFastCopy(pDstData, nDstPixelStride, pSrcData, nSrcPixelStride,
4207 : nWordCount);
4208 326320 : }
4209 :
4210 : /************************************************************************/
4211 : /* GDALCopyWords() */
4212 : /************************************************************************/
4213 :
4214 : /**
4215 : * Copy pixel words from buffer to buffer.
4216 : *
4217 : * @see GDALCopyWords64()
4218 : */
4219 80595500 : void CPL_STDCALL GDALCopyWords(const void *CPL_RESTRICT pSrcData,
4220 : GDALDataType eSrcType, int nSrcPixelStride,
4221 : void *CPL_RESTRICT pDstData,
4222 : GDALDataType eDstType, int nDstPixelStride,
4223 : int nWordCount)
4224 : {
// Variant taking an int word count: every argument is forwarded unchanged
// to GDALCopyWords64(), which does the actual work.
4225 80595500 : GDALCopyWords64(pSrcData, eSrcType, nSrcPixelStride, pDstData, eDstType,
4226 : nDstPixelStride, nWordCount);
4227 80595500 : }
4228 :
4229 : /************************************************************************/
4230 : /* GDALCopyWords64() */
4231 : /************************************************************************/
4232 :
4233 : /**
4234 : * Copy pixel words from buffer to buffer.
4235 : *
4236 : * This function is used to copy pixel word values from one memory buffer
4237 : * to another, with support for conversion between data types, and differing
4238 : * step factors. The data type conversion is done using the following
4239 : * rules:
4240 : * <ul>
4241 : * <li>Values assigned to a lower range integer type are clipped. For
4242 : * instance assigning GDT_Int16 values to a GDT_UInt8 buffer will cause values
4243 : * less the 0 to be set to 0, and values larger than 255 to be set to 255.
4244 : * </li>
4245 : * <li>
4246 : * Assignment from floating point to integer rounds to closest integer.
4247 : * +Infinity is mapped to the largest integer. -Infinity is mapped to the
4248 : * smallest integer. NaN is mapped to 0.
4249 : * </li>
4250 : * <li>
4251 : * Assignment from non-complex to complex will result in the imaginary part
4252 : * being set to zero on output.
4253 : * </li>
4254 : * <li> Assignment from complex to
4255 : * non-complex will result in the complex portion being lost and the real
4256 : * component being preserved (<i>not magnitude!</i>).
4257 : * </li>
4258 : * </ul>
4259 : *
4260 : * No assumptions are made about the source or destination words occurring
4261 : * on word boundaries. It is assumed that all values are in native machine
4262 : * byte order.
4263 : *
4264 : * @param pSrcData Pointer to source data to be converted.
4265 : * @param eSrcType the source data type (see GDALDataType enum)
4266 : * @param nSrcPixelStride Source pixel stride (i.e. distance between 2 words),
4267 : * in bytes
4268 : * @param pDstData Pointer to buffer where destination data should go
4269 : * @param eDstType the destination data type (see GDALDataType enum)
4270 : * @param nDstPixelStride Destination pixel stride (i.e. distance between 2
4271 : * words), in bytes
4272 : * @param nWordCount number of words to be copied
4273 : *
4274 : * @note
4275 : * When adding a new data type to GDAL, you must do the following to
4276 : * support it properly within the GDALCopyWords function:
4277 : * 1. Add the data type to the switch on eSrcType in GDALCopyWords.
4278 : * This should invoke the appropriate GDALCopyWordsFromT wrapper.
4279 : * 2. Add the data type to the switch on eDstType in GDALCopyWordsFromT.
4280 : * This should call the appropriate GDALCopyWordsT template.
4281 : * 3. If appropriate, overload the appropriate CopyWord template in the
4282 : * above namespace. This will ensure that any conversion issues are
4283 : * handled (cases like the float -> int32 case, where the min/max)
4284 : * values are subject to roundoff error.
4285 : */
4286 :
4287 116987000 : void CPL_STDCALL GDALCopyWords64(const void *CPL_RESTRICT pSrcData,
4288 : GDALDataType eSrcType, int nSrcPixelStride,
4289 : void *CPL_RESTRICT pDstData,
4290 : GDALDataType eDstType, int nDstPixelStride,
4291 : GPtrDiff_t nWordCount)
4292 :
4293 : {
4294 : // On platforms where alignment matters, be careful
4295 116987000 : const int nSrcDataTypeSize = GDALGetDataTypeSizeBytes(eSrcType);
4296 116987000 : const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(eDstType);
4297 116987000 : if (CPL_UNLIKELY(nSrcDataTypeSize == 0 || nDstDataTypeSize == 0))
4298 : {
4299 2 : CPLError(CE_Failure, CPLE_NotSupported,
4300 : "GDALCopyWords64(): unsupported GDT_Unknown/GDT_TypeCount "
4301 : "argument");
4302 2 : return;
4303 : }
4304 116987000 : if (!(eSrcType == eDstType && nSrcPixelStride == nDstPixelStride) &&
4305 66416300 : ((reinterpret_cast<uintptr_t>(pSrcData) % nSrcDataTypeSize) != 0 ||
4306 66416300 : (reinterpret_cast<uintptr_t>(pDstData) % nDstDataTypeSize) != 0 ||
4307 66415900 : (nSrcPixelStride % nSrcDataTypeSize) != 0 ||
4308 66415800 : (nDstPixelStride % nDstDataTypeSize) != 0))
4309 : {
4310 905 : if (eSrcType == eDstType)
4311 : {
4312 34800 : for (decltype(nWordCount) i = 0; i < nWordCount; i++)
4313 : {
4314 34000 : memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
4315 : static_cast<const GByte *>(pSrcData) +
4316 34000 : nSrcPixelStride * i,
4317 : nDstDataTypeSize);
4318 : }
4319 : }
4320 : else
4321 : {
4322 210 : const auto getAlignedPtr = [](GByte *ptr, int align)
4323 : {
4324 : return ptr +
4325 210 : ((align - (reinterpret_cast<uintptr_t>(ptr) % align)) %
4326 210 : align);
4327 : };
4328 :
4329 : // The largest we need is for CFloat64 (16 bytes), so 32 bytes to
4330 : // be sure to get correctly aligned pointer.
4331 105 : constexpr size_t SIZEOF_CFLOAT64 = 2 * sizeof(double);
4332 : GByte abySrcBuffer[2 * SIZEOF_CFLOAT64];
4333 : GByte abyDstBuffer[2 * SIZEOF_CFLOAT64];
4334 : GByte *pabySrcBuffer =
4335 105 : getAlignedPtr(abySrcBuffer, nSrcDataTypeSize);
4336 : GByte *pabyDstBuffer =
4337 105 : getAlignedPtr(abyDstBuffer, nDstDataTypeSize);
4338 3360 : for (decltype(nWordCount) i = 0; i < nWordCount; i++)
4339 : {
4340 3255 : memcpy(pabySrcBuffer,
4341 : static_cast<const GByte *>(pSrcData) +
4342 3255 : nSrcPixelStride * i,
4343 : nSrcDataTypeSize);
4344 3255 : GDALCopyWords64(pabySrcBuffer, eSrcType, 0, pabyDstBuffer,
4345 : eDstType, 0, 1);
4346 3255 : memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
4347 : pabyDstBuffer, nDstDataTypeSize);
4348 : }
4349 : }
4350 905 : return;
4351 : }
4352 :
4353 : // Deal with the case where we're replicating a single word into the
4354 : // provided buffer
4355 116986000 : if (nSrcPixelStride == 0 && nWordCount > 1)
4356 : {
4357 1080460 : GDALReplicateWord(pSrcData, eSrcType, pDstData, eDstType,
4358 : nDstPixelStride, nWordCount);
4359 1080460 : return;
4360 : }
4361 :
4362 115906000 : if (eSrcType == eDstType)
4363 : {
4364 54852800 : if (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8)
4365 : {
4366 18154800 : GDALFastCopy(static_cast<GByte *>(pDstData), nDstPixelStride,
4367 : static_cast<const GByte *>(pSrcData), nSrcPixelStride,
4368 : nWordCount);
4369 18154800 : return;
4370 : }
4371 :
4372 36698000 : if (nSrcDataTypeSize == 2 && (nSrcPixelStride % 2) == 0 &&
4373 21796200 : (nDstPixelStride % 2) == 0)
4374 : {
4375 21796200 : GDALFastCopy(static_cast<short *>(pDstData), nDstPixelStride,
4376 : static_cast<const short *>(pSrcData), nSrcPixelStride,
4377 : nWordCount);
4378 21796200 : return;
4379 : }
4380 :
4381 14901800 : if (nWordCount == 1)
4382 : {
4383 : #if defined(CSA_BUILD) || defined(__COVERITY__)
4384 : // Avoid false positives...
4385 : memcpy(pDstData, pSrcData, nSrcDataTypeSize);
4386 : #else
4387 14411900 : if (nSrcDataTypeSize == 2)
4388 0 : memcpy(pDstData, pSrcData, 2);
4389 14411900 : else if (nSrcDataTypeSize == 4)
4390 13807600 : memcpy(pDstData, pSrcData, 4);
4391 604345 : else if (nSrcDataTypeSize == 8)
4392 587740 : memcpy(pDstData, pSrcData, 8);
4393 : else /* if( eSrcType == GDT_CFloat64 ) */
4394 16605 : memcpy(pDstData, pSrcData, 16);
4395 : #endif
4396 14411900 : return;
4397 : }
4398 :
4399 : // Let memcpy() handle the case where we're copying a packed buffer
4400 : // of pixels.
4401 489855 : if (nSrcPixelStride == nDstPixelStride)
4402 : {
4403 228011 : if (nSrcPixelStride == nSrcDataTypeSize)
4404 : {
4405 227931 : memcpy(pDstData, pSrcData, nWordCount * nSrcDataTypeSize);
4406 227931 : return;
4407 : }
4408 : }
4409 : }
4410 :
4411 : // Handle the more general case -- deals with conversion of data types
4412 : // directly.
4413 61314900 : switch (eSrcType)
4414 : {
4415 20307200 : case GDT_UInt8:
4416 20307200 : GDALCopyWordsFromT<unsigned char>(
4417 : static_cast<const unsigned char *>(pSrcData), nSrcPixelStride,
4418 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4419 20307200 : break;
4420 1806 : case GDT_Int8:
4421 1806 : GDALCopyWordsFromT<signed char>(
4422 : static_cast<const signed char *>(pSrcData), nSrcPixelStride,
4423 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4424 1806 : break;
4425 55565 : case GDT_UInt16:
4426 55565 : GDALCopyWordsFromT<unsigned short>(
4427 : static_cast<const unsigned short *>(pSrcData), nSrcPixelStride,
4428 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4429 55565 : break;
4430 6519850 : case GDT_Int16:
4431 6519850 : GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
4432 : nSrcPixelStride, false, pDstData,
4433 : eDstType, nDstPixelStride, nWordCount);
4434 6519850 : break;
4435 8282 : case GDT_UInt32:
4436 8282 : GDALCopyWordsFromT<unsigned int>(
4437 : static_cast<const unsigned int *>(pSrcData), nSrcPixelStride,
4438 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4439 8282 : break;
4440 12254800 : case GDT_Int32:
4441 12254800 : GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
4442 : nSrcPixelStride, false, pDstData, eDstType,
4443 : nDstPixelStride, nWordCount);
4444 12254800 : break;
4445 2205 : case GDT_UInt64:
4446 2205 : GDALCopyWordsFromT<std::uint64_t>(
4447 : static_cast<const std::uint64_t *>(pSrcData), nSrcPixelStride,
4448 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4449 2205 : break;
4450 11729 : case GDT_Int64:
4451 11729 : GDALCopyWordsFromT<std::int64_t>(
4452 : static_cast<const std::int64_t *>(pSrcData), nSrcPixelStride,
4453 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4454 11729 : break;
4455 1387 : case GDT_Float16:
4456 1387 : GDALCopyWordsFromT<GFloat16>(
4457 : static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, false,
4458 : pDstData, eDstType, nDstPixelStride, nWordCount);
4459 1387 : break;
4460 665046 : case GDT_Float32:
4461 665046 : GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
4462 : nSrcPixelStride, false, pDstData,
4463 : eDstType, nDstPixelStride, nWordCount);
4464 665046 : break;
4465 20726100 : case GDT_Float64:
4466 20726100 : GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
4467 : nSrcPixelStride, false, pDstData,
4468 : eDstType, nDstPixelStride, nWordCount);
4469 20726100 : break;
4470 478486 : case GDT_CInt16:
4471 478486 : GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
4472 : nSrcPixelStride, true, pDstData, eDstType,
4473 : nDstPixelStride, nWordCount);
4474 478486 : break;
4475 868 : case GDT_CInt32:
4476 868 : GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
4477 : nSrcPixelStride, true, pDstData, eDstType,
4478 : nDstPixelStride, nWordCount);
4479 868 : break;
4480 508 : case GDT_CFloat16:
4481 508 : GDALCopyWordsFromT<GFloat16>(
4482 : static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, true,
4483 : pDstData, eDstType, nDstPixelStride, nWordCount);
4484 508 : break;
4485 2437 : case GDT_CFloat32:
4486 2437 : GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
4487 : nSrcPixelStride, true, pDstData, eDstType,
4488 : nDstPixelStride, nWordCount);
4489 2437 : break;
4490 278624 : case GDT_CFloat64:
4491 278624 : GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
4492 : nSrcPixelStride, true, pDstData,
4493 : eDstType, nDstPixelStride, nWordCount);
4494 278624 : break;
4495 0 : case GDT_Unknown:
4496 : case GDT_TypeCount:
4497 0 : CPLAssert(false);
4498 : }
4499 : }
4500 :
4501 : /************************************************************************/
4502 : /* GDALCopyBits() */
4503 : /************************************************************************/
4504 :
4505 : /**
4506 : * Bitwise word copying.
4507 : *
4508 : * A function for moving sets of partial bytes around. Loosely
4509 : * speaking this is a bitwise analog to GDALCopyWords().
4510 : *
4511 : * It copies nStepCount "words" where each word is nBitCount bits long.
4512 : * The nSrcStep and nDstStep are the number of bits from the start of one
4513 : * word to the next (same as nBitCount if they are packed). The nSrcOffset
4514 : * and nDstOffset are the offset into the source and destination buffers
4515 : * to start at, also measured in bits.
4516 : *
4517 : * All bit offsets are assumed to start from the high order bit in a byte
4518 : * (i.e. most significant bit first). Currently this function is not very
4519 : * optimized, but it may be improved for some common cases in the future
4520 : * as needed.
4521 : *
4522 : * @param pabySrcData the source data buffer.
4523 : * @param nSrcOffset the offset (in bits) in pabySrcData to the start of the
4524 : * first word to copy.
4525 : * @param nSrcStep the offset in bits from the start one source word to the
4526 : * start of the next.
4527 : * @param pabyDstData the destination data buffer.
4528 : * @param nDstOffset the offset (in bits) in pabyDstData to the start of the
4529 : * first word to copy over.
4530 : * @param nDstStep the offset in bits from the start one word to the
4531 : * start of the next.
4532 : * @param nBitCount the number of bits in a word to be copied.
4533 : * @param nStepCount the number of words to copy.
4534 : */
4535 :
4536 0 : void GDALCopyBits(const GByte *pabySrcData, int nSrcOffset, int nSrcStep,
4537 : GByte *pabyDstData, int nDstOffset, int nDstStep,
4538 : int nBitCount, int nStepCount)
4539 :
4540 : {
4541 0 : VALIDATE_POINTER0(pabySrcData, "GDALCopyBits");
4542 :
4543 0 : for (int iStep = 0; iStep < nStepCount; iStep++)
4544 : {
4545 0 : for (int iBit = 0; iBit < nBitCount; iBit++)
4546 : {
4547 0 : if (pabySrcData[nSrcOffset >> 3] & (0x80 >> (nSrcOffset & 7)))
4548 0 : pabyDstData[nDstOffset >> 3] |= (0x80 >> (nDstOffset & 7));
4549 : else
4550 0 : pabyDstData[nDstOffset >> 3] &= ~(0x80 >> (nDstOffset & 7));
4551 :
4552 0 : nSrcOffset++;
4553 0 : nDstOffset++;
4554 : }
4555 :
4556 0 : nSrcOffset += (nSrcStep - nBitCount);
4557 0 : nDstOffset += (nDstStep - nBitCount);
4558 : }
4559 : }
4560 :
4561 : /************************************************************************/
4562 : /* GDALBandGetBestOverviewLevel() */
4563 : /************************************************************************/
4564 :
4565 525463 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand,
4566 : double dfTargetDownsamplingRatio,
4567 : double dfOversamplingThreshold)
4568 : {
4569 525463 : int iBestOvr = -1;
4570 525463 : double dfBestRatio = 0;
4571 525463 : const int nOvCount = poBand->GetOverviewCount();
4572 525463 : constexpr double EPSILON = 1e-1;
4573 1053620 : for (int iOvr = -1; iOvr < nOvCount; iOvr++)
4574 : {
4575 531090 : double dfOvrRatio = 1.0;
4576 531090 : GDALRasterBand *poOvrBand = nullptr;
4577 531090 : if (iOvr >= 0)
4578 : {
4579 5627 : poOvrBand = poBand->GetOverview(iOvr);
4580 11254 : if (poOvrBand == nullptr ||
4581 11254 : poOvrBand->GetXSize() > poBand->GetXSize() ||
4582 5627 : poOvrBand->GetYSize() > poBand->GetYSize())
4583 : {
4584 0 : continue;
4585 : }
4586 22508 : dfOvrRatio = std::min(static_cast<double>(poBand->GetXSize()) /
4587 5627 : poOvrBand->GetXSize(),
4588 11254 : static_cast<double>(poBand->GetYSize()) /
4589 11254 : poOvrBand->GetYSize());
4590 : }
4591 :
4592 : // Is it nearly the requested factor and better (lower) than
4593 : // the current best factor?
4594 : // Use an epsilon because of numerical instability.
4595 531197 : if (dfOvrRatio >=
4596 531090 : dfTargetDownsamplingRatio * dfOversamplingThreshold + EPSILON ||
4597 : dfOvrRatio <= dfBestRatio)
4598 : {
4599 107 : continue;
4600 : }
4601 :
4602 530983 : if (poOvrBand)
4603 : {
4604 : // Ignore AVERAGE_BIT2GRAYSCALE overviews.
4605 : const char *pszResampling =
4606 5520 : poOvrBand->GetMetadataItem("RESAMPLING");
4607 5520 : if (pszResampling != nullptr &&
4608 71 : STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2"))
4609 : {
4610 16 : continue;
4611 : }
4612 : }
4613 :
4614 530967 : iBestOvr = iOvr;
4615 530967 : dfBestRatio = dfOvrRatio;
4616 530967 : if (std::abs(dfTargetDownsamplingRatio - dfOvrRatio) < EPSILON)
4617 : {
4618 2938 : break;
4619 : }
4620 : }
4621 525463 : return iBestOvr;
4622 : }
4623 :
4624 : /************************************************************************/
4625 : /* GDALBandGetBestOverviewLevel() */
4626 : /* */
4627 : /* Returns the best overview level to satisfy the query or -1 if none */
4628 : /* Also updates nXOff, nYOff, nXSize, nYSize and psExtraArg when */
4629 : /* returning a valid overview level */
4630 : /************************************************************************/
4631 :
4632 0 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand, int &nXOff, int &nYOff,
4633 : int &nXSize, int &nYSize, int nBufXSize,
4634 : int nBufYSize)
4635 : {
4636 0 : return GDALBandGetBestOverviewLevel2(poBand, nXOff, nYOff, nXSize, nYSize,
4637 0 : nBufXSize, nBufYSize, nullptr);
4638 : }
4639 :
4640 525556 : int GDALBandGetBestOverviewLevel2(GDALRasterBand *poBand, int &nXOff,
4641 : int &nYOff, int &nXSize, int &nYSize,
4642 : int nBufXSize, int nBufYSize,
4643 : GDALRasterIOExtraArg *psExtraArg)
4644 : {
4645 525556 : if (psExtraArg != nullptr && psExtraArg->nVersion > 1 &&
4646 525556 : psExtraArg->bUseOnlyThisScale)
4647 109 : return -1;
4648 : /* -------------------------------------------------------------------- */
4649 : /* Compute the desired downsampling factor. It is */
4650 : /* based on the least reduced axis, and represents the number */
4651 : /* of source pixels to one destination pixel. */
4652 : /* -------------------------------------------------------------------- */
4653 525447 : const double dfDesiredDownsamplingFactor =
4654 525447 : ((nXSize / static_cast<double>(nBufXSize)) <
4655 363107 : (nYSize / static_cast<double>(nBufYSize)) ||
4656 : nBufYSize == 1)
4657 755372 : ? nXSize / static_cast<double>(nBufXSize)
4658 133182 : : nYSize / static_cast<double>(nBufYSize);
4659 :
4660 : /* -------------------------------------------------------------------- */
4661 : /* Find the overview level that largest downsampling factor (most */
4662 : /* downsampled) that is still less than (or only a little more) */
4663 : /* downsampled than the request. */
4664 : /* -------------------------------------------------------------------- */
4665 :
4666 : const char *pszOversampligThreshold =
4667 525447 : CPLGetConfigOption("GDAL_OVERVIEW_OVERSAMPLING_THRESHOLD", nullptr);
4668 :
4669 : // Cf https://github.com/OSGeo/gdal/pull/9040#issuecomment-1898524693
4670 : const double dfOversamplingThreshold =
4671 1050880 : pszOversampligThreshold ? CPLAtof(pszOversampligThreshold)
4672 525438 : : psExtraArg && psExtraArg->eResampleAlg != GRIORA_NearestNeighbour
4673 1050880 : ? 1.0
4674 525447 : : 1.2;
4675 525447 : const int iBestOvrLevel = GDALBandGetBestOverviewLevel(
4676 : poBand, dfDesiredDownsamplingFactor, dfOversamplingThreshold);
4677 :
4678 : /* -------------------------------------------------------------------- */
4679 : /* If we didn't find an overview that helps us, just return */
4680 : /* indicating failure and the full resolution image will be used. */
4681 : /* -------------------------------------------------------------------- */
4682 525447 : if (iBestOvrLevel < 0)
4683 522454 : return -1;
4684 2993 : const GDALRasterBand *poBestOverview = poBand->GetOverview(iBestOvrLevel);
4685 :
4686 : /* -------------------------------------------------------------------- */
4687 : /* Recompute the source window in terms of the selected */
4688 : /* overview. */
4689 : /* -------------------------------------------------------------------- */
4690 : const double dfXFactor =
4691 2993 : poBand->GetXSize() / static_cast<double>(poBestOverview->GetXSize());
4692 : const double dfYFactor =
4693 2993 : poBand->GetYSize() / static_cast<double>(poBestOverview->GetYSize());
4694 2993 : CPLDebug("GDAL", "Selecting overview %d x %d", poBestOverview->GetXSize(),
4695 : poBestOverview->GetYSize());
4696 :
4697 8979 : const int nOXOff = std::min(poBestOverview->GetXSize() - 1,
4698 2993 : static_cast<int>(nXOff / dfXFactor + 0.5));
4699 8979 : const int nOYOff = std::min(poBestOverview->GetYSize() - 1,
4700 2993 : static_cast<int>(nYOff / dfYFactor + 0.5));
4701 2993 : int nOXSize = std::max(1, static_cast<int>(nXSize / dfXFactor + 0.5));
4702 2993 : int nOYSize = std::max(1, static_cast<int>(nYSize / dfYFactor + 0.5));
4703 2993 : if (nOXOff + nOXSize > poBestOverview->GetXSize())
4704 0 : nOXSize = poBestOverview->GetXSize() - nOXOff;
4705 2993 : if (nOYOff + nOYSize > poBestOverview->GetYSize())
4706 2 : nOYSize = poBestOverview->GetYSize() - nOYOff;
4707 :
4708 2993 : if (psExtraArg)
4709 : {
4710 2993 : if (psExtraArg->bFloatingPointWindowValidity)
4711 : {
4712 117 : psExtraArg->dfXOff /= dfXFactor;
4713 117 : psExtraArg->dfXSize /= dfXFactor;
4714 117 : psExtraArg->dfYOff /= dfYFactor;
4715 117 : psExtraArg->dfYSize /= dfYFactor;
4716 : }
4717 2876 : else if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
4718 : {
4719 16 : psExtraArg->bFloatingPointWindowValidity = true;
4720 16 : psExtraArg->dfXOff = nXOff / dfXFactor;
4721 16 : psExtraArg->dfXSize = nXSize / dfXFactor;
4722 16 : psExtraArg->dfYOff = nYOff / dfYFactor;
4723 16 : psExtraArg->dfYSize = nYSize / dfYFactor;
4724 : }
4725 : }
4726 :
4727 2993 : nXOff = nOXOff;
4728 2993 : nYOff = nOYOff;
4729 2993 : nXSize = nOXSize;
4730 2993 : nYSize = nOYSize;
4731 :
4732 2993 : return iBestOvrLevel;
4733 : }
4734 :
4735 : /************************************************************************/
4736 : /* OverviewRasterIO() */
4737 : /* */
4738 : /* Special work function to utilize available overviews to */
4739 : /* more efficiently satisfy downsampled requests. It will */
4740 : /* return CE_Failure if there are no appropriate overviews */
4741 : /* available but it doesn't emit any error messages. */
4742 : /************************************************************************/
4743 :
4744 : //! @cond Doxygen_Suppress
4745 1 : CPLErr GDALRasterBand::OverviewRasterIO(
4746 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4747 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4748 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
4749 :
4750 : {
4751 : GDALRasterIOExtraArg sExtraArg;
4752 1 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4753 :
4754 1 : const int nOverview = GDALBandGetBestOverviewLevel2(
4755 : this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, &sExtraArg);
4756 1 : if (nOverview < 0)
4757 1 : return CE_Failure;
4758 :
4759 : /* -------------------------------------------------------------------- */
4760 : /* Recast the call in terms of the new raster layer. */
4761 : /* -------------------------------------------------------------------- */
4762 0 : GDALRasterBand *poOverviewBand = GetOverview(nOverview);
4763 0 : if (poOverviewBand == nullptr)
4764 0 : return CE_Failure;
4765 :
4766 0 : return poOverviewBand->RasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
4767 : pData, nBufXSize, nBufYSize, eBufType,
4768 0 : nPixelSpace, nLineSpace, &sExtraArg);
4769 : }
4770 :
4771 : /************************************************************************/
4772 : /* TryOverviewRasterIO() */
4773 : /************************************************************************/
4774 :
4775 362428 : CPLErr GDALRasterBand::TryOverviewRasterIO(
4776 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4777 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4778 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg,
4779 : int *pbTried)
4780 : {
4781 362428 : int nXOffMod = nXOff;
4782 362428 : int nYOffMod = nYOff;
4783 362428 : int nXSizeMod = nXSize;
4784 362428 : int nYSizeMod = nYSize;
4785 : GDALRasterIOExtraArg sExtraArg;
4786 :
4787 362428 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4788 :
4789 362428 : int iOvrLevel = GDALBandGetBestOverviewLevel2(
4790 : this, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize, nBufYSize,
4791 : &sExtraArg);
4792 :
4793 362428 : if (iOvrLevel >= 0)
4794 : {
4795 53 : GDALRasterBand *poOverviewBand = GetOverview(iOvrLevel);
4796 53 : if (poOverviewBand)
4797 : {
4798 53 : *pbTried = TRUE;
4799 53 : return poOverviewBand->RasterIO(
4800 : eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData,
4801 : nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
4802 53 : &sExtraArg);
4803 : }
4804 : }
4805 :
4806 362375 : *pbTried = FALSE;
4807 362375 : return CE_None;
4808 : }
4809 :
4810 : /************************************************************************/
4811 : /* TryOverviewRasterIO() */
4812 : /************************************************************************/
4813 :
4814 160153 : CPLErr GDALDataset::TryOverviewRasterIO(
4815 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4816 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4817 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
4818 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg,
4819 : int *pbTried)
4820 : {
4821 160153 : int nXOffMod = nXOff;
4822 160153 : int nYOffMod = nYOff;
4823 160153 : int nXSizeMod = nXSize;
4824 160153 : int nYSizeMod = nYSize;
4825 : GDALRasterIOExtraArg sExtraArg;
4826 160153 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4827 :
4828 320306 : int iOvrLevel = GDALBandGetBestOverviewLevel2(
4829 160153 : papoBands[0], nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize,
4830 : nBufYSize, &sExtraArg);
4831 :
4832 160196 : if (iOvrLevel >= 0 && papoBands[0]->GetOverview(iOvrLevel) != nullptr &&
4833 43 : papoBands[0]->GetOverview(iOvrLevel)->GetDataset() != nullptr)
4834 : {
4835 43 : *pbTried = TRUE;
4836 43 : return papoBands[0]->GetOverview(iOvrLevel)->GetDataset()->RasterIO(
4837 : eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData, nBufXSize,
4838 : nBufYSize, eBufType, nBandCount, panBandMap, nPixelSpace,
4839 43 : nLineSpace, nBandSpace, &sExtraArg);
4840 : }
4841 : else
4842 : {
4843 160110 : *pbTried = FALSE;
4844 160110 : return CE_None;
4845 : }
4846 : }
4847 :
4848 : /************************************************************************/
4849 : /* GDALDatasetGetBestOverviewLevel() */
4850 : /* */
4851 : /* Returns the best overview level to satisfy the query or -1 if none */
4852 : /* Also updates nXOff, nYOff, nXSize, nYSize when returning a valid */
4853 : /* overview level */
4854 : /************************************************************************/
4855 :
4856 4 : static int GDALDatasetGetBestOverviewLevel(GDALDataset *poDS, int &nXOff,
4857 : int &nYOff, int &nXSize, int &nYSize,
4858 : int nBufXSize, int nBufYSize,
4859 : int nBandCount,
4860 : const int *panBandMap,
4861 : GDALRasterIOExtraArg *psExtraArg)
4862 : {
4863 4 : int nOverviewCount = 0;
4864 4 : GDALRasterBand *poFirstBand = nullptr;
4865 :
4866 : /* -------------------------------------------------------------------- */
4867 : /* Check that all bands have the same number of overviews and */
4868 : /* that they have all the same size and block dimensions */
4869 : /* -------------------------------------------------------------------- */
4870 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
4871 : {
4872 8 : GDALRasterBand *poBand = poDS->GetRasterBand(panBandMap[iBand]);
4873 8 : if (poBand == nullptr)
4874 0 : return -1;
4875 8 : if (iBand == 0)
4876 : {
4877 4 : poFirstBand = poBand;
4878 4 : nOverviewCount = poBand->GetOverviewCount();
4879 : }
4880 4 : else if (nOverviewCount != poBand->GetOverviewCount())
4881 : {
4882 0 : CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
4883 : "mismatched overview count, use std method.");
4884 0 : return -1;
4885 : }
4886 : else
4887 : {
4888 4 : for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
4889 : {
4890 0 : GDALRasterBand *poOvrBand = poBand->GetOverview(iOverview);
4891 : GDALRasterBand *poOvrFirstBand =
4892 0 : poFirstBand->GetOverview(iOverview);
4893 0 : if (poOvrBand == nullptr || poOvrFirstBand == nullptr)
4894 0 : continue;
4895 :
4896 0 : if (poOvrFirstBand->GetXSize() != poOvrBand->GetXSize() ||
4897 0 : poOvrFirstBand->GetYSize() != poOvrBand->GetYSize())
4898 : {
4899 0 : CPLDebug("GDAL",
4900 : "GDALDataset::GetBestOverviewLevel() ... "
4901 : "mismatched overview sizes, use std method.");
4902 0 : return -1;
4903 : }
4904 0 : int nBlockXSizeFirst = 0;
4905 0 : int nBlockYSizeFirst = 0;
4906 0 : poOvrFirstBand->GetBlockSize(&nBlockXSizeFirst,
4907 : &nBlockYSizeFirst);
4908 :
4909 0 : int nBlockXSizeCurrent = 0;
4910 0 : int nBlockYSizeCurrent = 0;
4911 0 : poOvrBand->GetBlockSize(&nBlockXSizeCurrent,
4912 : &nBlockYSizeCurrent);
4913 :
4914 0 : if (nBlockXSizeFirst != nBlockXSizeCurrent ||
4915 0 : nBlockYSizeFirst != nBlockYSizeCurrent)
4916 : {
4917 0 : CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
4918 : "mismatched block sizes, use std method.");
4919 0 : return -1;
4920 : }
4921 : }
4922 : }
4923 : }
4924 4 : if (poFirstBand == nullptr)
4925 0 : return -1;
4926 :
4927 4 : return GDALBandGetBestOverviewLevel2(poFirstBand, nXOff, nYOff, nXSize,
4928 : nYSize, nBufXSize, nBufYSize,
4929 4 : psExtraArg);
4930 : }
4931 :
4932 : /************************************************************************/
4933 : /* BlockBasedRasterIO() */
4934 : /* */
4935 : /* This convenience function implements a dataset level */
4936 : /* RasterIO() interface based on calling down to fetch blocks, */
4937 : /* much like the GDALRasterBand::IRasterIO(), but it handles */
4938 : /* all bands at once, so that a format driver that handles a */
4939 : /* request for different bands of the same block efficiently */
4940 : /* (i.e. without re-reading interleaved data) will benefit. */
4941 : /* */
4942 : /* This method is intended to be called by an overridden */
4943 : /* IRasterIO() method in the driver specific GDALDataset */
4944 : /* derived class. */
4945 : /* */
4946 : /* Default internal implementation of RasterIO() ... utilizes */
4947 : /* the Block access methods to satisfy the request. This would */
4948 : /* normally only be overridden by formats with overviews. */
4949 : /* */
4950 : /* To keep things relatively simple, this method does not */
4951 : /* currently take advantage of some special cases addressed in */
4952 : /* GDALRasterBand::IRasterIO(), so it is likely best to only */
4953 : /* call it when you know it will help. That is in cases where */
4954 : /* data is at 1:1 to the buffer, and you know the driver is */
4955 : /* implementing interleaved IO efficiently on a block by block */
4956 : /* basis. Overviews will be used when possible. */
4957 : /************************************************************************/
4958 :
4959 65948 : CPLErr GDALDataset::BlockBasedRasterIO(
4960 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4961 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4962 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
4963 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
4964 :
4965 : {
4966 65948 : CPLAssert(nullptr != pData);
4967 :
4968 65948 : GByte **papabySrcBlock = nullptr;
4969 65948 : GDALRasterBlock *poBlock = nullptr;
4970 65948 : GDALRasterBlock **papoBlocks = nullptr;
4971 65948 : int nLBlockX = -1;
4972 65948 : int nLBlockY = -1;
4973 : int iBufYOff;
4974 : int iBufXOff;
4975 65948 : int nBlockXSize = 1;
4976 65948 : int nBlockYSize = 1;
4977 65948 : CPLErr eErr = CE_None;
4978 65948 : GDALDataType eDataType = GDT_UInt8;
4979 :
4980 65948 : const bool bUseIntegerRequestCoords =
4981 65991 : (!psExtraArg->bFloatingPointWindowValidity ||
4982 43 : (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
4983 41 : nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
4984 :
4985 : /* -------------------------------------------------------------------- */
4986 : /* Ensure that all bands share a common block size and data type. */
4987 : /* -------------------------------------------------------------------- */
4988 312052 : for (int iBand = 0; iBand < nBandCount; iBand++)
4989 : {
4990 246104 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
4991 :
4992 246104 : if (iBand == 0)
4993 : {
4994 65948 : poBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
4995 65948 : eDataType = poBand->GetRasterDataType();
4996 : }
4997 : else
4998 : {
4999 180156 : int nThisBlockXSize = 0;
5000 180156 : int nThisBlockYSize = 0;
5001 180156 : poBand->GetBlockSize(&nThisBlockXSize, &nThisBlockYSize);
5002 180156 : if (nThisBlockXSize != nBlockXSize ||
5003 180156 : nThisBlockYSize != nBlockYSize)
5004 : {
5005 0 : CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
5006 : "mismatched block sizes, use std method.");
5007 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
5008 : pData, nBufXSize, nBufYSize, eBufType,
5009 : nBandCount, panBandMap, nPixelSpace,
5010 0 : nLineSpace, nBandSpace, psExtraArg);
5011 : }
5012 :
5013 180156 : if (eDataType != poBand->GetRasterDataType() &&
5014 0 : (nXSize != nBufXSize || nYSize != nBufYSize))
5015 : {
5016 0 : CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
5017 : "mismatched band data types, use std method.");
5018 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
5019 : pData, nBufXSize, nBufYSize, eBufType,
5020 : nBandCount, panBandMap, nPixelSpace,
5021 0 : nLineSpace, nBandSpace, psExtraArg);
5022 : }
5023 : }
5024 : }
5025 :
5026 : /* ==================================================================== */
5027 : /* In this special case at full resolution we step through in */
5028 : /* blocks, turning the request over to the per-band */
5029 : /* IRasterIO(), but ensuring that all bands of one block are */
5030 : /* called before proceeding to the next. */
5031 : /* ==================================================================== */
5032 :
5033 65948 : if (nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
5034 : {
5035 : GDALRasterIOExtraArg sDummyExtraArg;
5036 65944 : INIT_RASTERIO_EXTRA_ARG(sDummyExtraArg);
5037 :
5038 65944 : int nChunkYSize = 0;
5039 65944 : int nChunkXSize = 0;
5040 :
5041 215391 : for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff += nChunkYSize)
5042 : {
5043 150463 : const int nChunkYOff = iBufYOff + nYOff;
5044 150463 : nChunkYSize = nBlockYSize - (nChunkYOff % nBlockYSize);
5045 150463 : if (nChunkYOff + nChunkYSize > nYOff + nYSize)
5046 60939 : nChunkYSize = (nYOff + nYSize) - nChunkYOff;
5047 :
5048 825901 : for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff += nChunkXSize)
5049 : {
5050 676453 : const int nChunkXOff = iBufXOff + nXOff;
5051 676453 : nChunkXSize = nBlockXSize - (nChunkXOff % nBlockXSize);
5052 676453 : if (nChunkXOff + nChunkXSize > nXOff + nXSize)
5053 71011 : nChunkXSize = (nXOff + nXSize) - nChunkXOff;
5054 :
5055 676453 : GByte *pabyChunkData =
5056 676453 : static_cast<GByte *>(pData) + iBufXOff * nPixelSpace +
5057 676453 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace;
5058 :
5059 3291120 : for (int iBand = 0; iBand < nBandCount; iBand++)
5060 : {
5061 2615680 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
5062 :
5063 5231370 : eErr = poBand->IRasterIO(
5064 : eRWFlag, nChunkXOff, nChunkYOff, nChunkXSize,
5065 : nChunkYSize,
5066 2615680 : pabyChunkData +
5067 2615680 : static_cast<GPtrDiff_t>(iBand) * nBandSpace,
5068 : nChunkXSize, nChunkYSize, eBufType, nPixelSpace,
5069 2615680 : nLineSpace, &sDummyExtraArg);
5070 2615680 : if (eErr != CE_None)
5071 1015 : return eErr;
5072 : }
5073 : }
5074 :
5075 168362 : if (psExtraArg->pfnProgress != nullptr &&
5076 18914 : !psExtraArg->pfnProgress(
5077 168362 : 1.0 * std::min(nBufYSize, iBufYOff + nChunkYSize) /
5078 : nBufYSize,
5079 : "", psExtraArg->pProgressData))
5080 : {
5081 1 : return CE_Failure;
5082 : }
5083 : }
5084 :
5085 64928 : return CE_None;
5086 : }
5087 :
5088 : /* Below code is not compatible with that case. It would need a complete */
5089 : /* separate code like done in GDALRasterBand::IRasterIO. */
5090 4 : if (eRWFlag == GF_Write && (nBufXSize < nXSize || nBufYSize < nYSize))
5091 : {
5092 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
5093 : nBufXSize, nBufYSize, eBufType, nBandCount,
5094 : panBandMap, nPixelSpace, nLineSpace,
5095 0 : nBandSpace, psExtraArg);
5096 : }
5097 :
5098 : /* We could have a smarter implementation, but that will do for now */
5099 4 : if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
5100 0 : (nBufXSize != nXSize || nBufYSize != nYSize))
5101 : {
5102 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
5103 : nBufXSize, nBufYSize, eBufType, nBandCount,
5104 : panBandMap, nPixelSpace, nLineSpace,
5105 0 : nBandSpace, psExtraArg);
5106 : }
5107 :
5108 : /* ==================================================================== */
5109 : /* Loop reading required source blocks to satisfy output */
5110 : /* request. This is the most general implementation. */
5111 : /* ==================================================================== */
5112 :
5113 4 : const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
5114 :
5115 : papabySrcBlock =
5116 4 : static_cast<GByte **>(CPLCalloc(sizeof(GByte *), nBandCount));
5117 : papoBlocks =
5118 4 : static_cast<GDALRasterBlock **>(CPLCalloc(sizeof(void *), nBandCount));
5119 :
5120 : /* -------------------------------------------------------------------- */
5121 : /* Select an overview level if appropriate. */
5122 : /* -------------------------------------------------------------------- */
5123 :
5124 : GDALRasterIOExtraArg sExtraArg;
5125 4 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
5126 4 : const int nOverviewLevel = GDALDatasetGetBestOverviewLevel(
5127 : this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nBandCount,
5128 : panBandMap, &sExtraArg);
5129 4 : if (nOverviewLevel >= 0)
5130 : {
5131 2 : GetRasterBand(panBandMap[0])
5132 2 : ->GetOverview(nOverviewLevel)
5133 2 : ->GetBlockSize(&nBlockXSize, &nBlockYSize);
5134 : }
5135 :
5136 4 : double dfXOff = nXOff;
5137 4 : double dfYOff = nYOff;
5138 4 : double dfXSize = nXSize;
5139 4 : double dfYSize = nYSize;
5140 4 : if (sExtraArg.bFloatingPointWindowValidity)
5141 : {
5142 2 : dfXOff = sExtraArg.dfXOff;
5143 2 : dfYOff = sExtraArg.dfYOff;
5144 2 : dfXSize = sExtraArg.dfXSize;
5145 2 : dfYSize = sExtraArg.dfYSize;
5146 : }
5147 :
5148 : /* -------------------------------------------------------------------- */
5149 : /* Compute stepping increment. */
5150 : /* -------------------------------------------------------------------- */
5151 4 : const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
5152 4 : const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
5153 :
5154 4 : constexpr double EPS = 1e-10;
5155 : /* -------------------------------------------------------------------- */
5156 : /* Loop over buffer computing source locations. */
5157 : /* -------------------------------------------------------------------- */
5158 36 : for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
5159 : {
5160 : GPtrDiff_t iSrcOffset;
5161 :
5162 : // Add small epsilon to avoid some numeric precision issues.
5163 32 : const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
5164 32 : const int iSrcY = static_cast<int>(std::min(
5165 32 : std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
5166 :
5167 32 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
5168 : static_cast<GPtrDiff_t>(nLineSpace);
5169 :
5170 302 : for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff++)
5171 : {
5172 270 : const double dfSrcX = (iBufXOff + 0.5) * dfSrcXInc + dfXOff + EPS;
5173 270 : const int iSrcX = static_cast<int>(std::min(
5174 270 : std::max(0.0, dfSrcX), static_cast<double>(nRasterXSize - 1)));
5175 :
5176 : // FIXME: this code likely doesn't work if the dirty block gets
5177 : // flushed to disk before being completely written. In the meantime,
5178 : // bJustInitialize should probably be set to FALSE even if it is not
5179 : // ideal performance wise, and for lossy compression
5180 :
5181 : /* --------------------------------------------------------------------
5182 : */
5183 : /* Ensure we have the appropriate block loaded. */
5184 : /* --------------------------------------------------------------------
5185 : */
5186 270 : if (iSrcX < nLBlockX * nBlockXSize ||
5187 270 : iSrcX - nBlockXSize >= nLBlockX * nBlockXSize ||
5188 266 : iSrcY < nLBlockY * nBlockYSize ||
5189 266 : iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
5190 : {
5191 4 : nLBlockX = iSrcX / nBlockXSize;
5192 4 : nLBlockY = iSrcY / nBlockYSize;
5193 :
5194 4 : const bool bJustInitialize =
5195 0 : eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
5196 0 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
5197 4 : nXOff <= nLBlockX * nBlockXSize &&
5198 0 : nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
5199 : /*bool bMemZeroBuffer = FALSE;
5200 : if( eRWFlag == GF_Write && !bJustInitialize &&
5201 : nXOff <= nLBlockX * nBlockXSize &&
5202 : nYOff <= nLBlockY * nBlockYSize &&
5203 : (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
5204 : (nXOff + nXSize == GetRasterXSize() &&
5205 : (nLBlockX+1) * nBlockXSize > GetRasterXSize())) &&
5206 : (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
5207 : (nYOff + nYSize == GetRasterYSize() &&
5208 : (nLBlockY+1) * nBlockYSize > GetRasterYSize())) )
5209 : {
5210 : bJustInitialize = TRUE;
5211 : bMemZeroBuffer = TRUE;
5212 : }*/
5213 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
5214 : {
5215 8 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
5216 8 : if (nOverviewLevel >= 0)
5217 2 : poBand = poBand->GetOverview(nOverviewLevel);
5218 16 : poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY,
5219 8 : bJustInitialize);
5220 8 : if (poBlock == nullptr)
5221 : {
5222 0 : eErr = CE_Failure;
5223 0 : goto CleanupAndReturn;
5224 : }
5225 :
5226 8 : if (eRWFlag == GF_Write)
5227 0 : poBlock->MarkDirty();
5228 :
5229 8 : if (papoBlocks[iBand] != nullptr)
5230 0 : papoBlocks[iBand]->DropLock();
5231 :
5232 8 : papoBlocks[iBand] = poBlock;
5233 :
5234 8 : papabySrcBlock[iBand] =
5235 8 : static_cast<GByte *>(poBlock->GetDataRef());
5236 : /*if( bMemZeroBuffer )
5237 : {
5238 : memset(papabySrcBlock[iBand], 0,
5239 : static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
5240 : * nBlockYSize);
5241 : }*/
5242 : }
5243 : }
5244 :
5245 : /* --------------------------------------------------------------------
5246 : */
5247 : /* Copy over this pixel of data. */
5248 : /* --------------------------------------------------------------------
5249 : */
5250 270 : iSrcOffset = (static_cast<GPtrDiff_t>(iSrcX) -
5251 270 : static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
5252 270 : (static_cast<GPtrDiff_t>(iSrcY) -
5253 270 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
5254 270 : nBlockXSize) *
5255 270 : nBandDataSize;
5256 :
5257 980 : for (int iBand = 0; iBand < nBandCount; iBand++)
5258 : {
5259 710 : GByte *pabySrcBlock = papabySrcBlock[iBand];
5260 710 : GPtrDiff_t iBandBufOffset =
5261 710 : iBufOffset + static_cast<GPtrDiff_t>(iBand) *
5262 : static_cast<GPtrDiff_t>(nBandSpace);
5263 :
5264 710 : if (eDataType == eBufType)
5265 : {
5266 710 : if (eRWFlag == GF_Read)
5267 710 : memcpy(static_cast<GByte *>(pData) + iBandBufOffset,
5268 710 : pabySrcBlock + iSrcOffset, nBandDataSize);
5269 : else
5270 0 : memcpy(pabySrcBlock + iSrcOffset,
5271 : static_cast<const GByte *>(pData) +
5272 0 : iBandBufOffset,
5273 : nBandDataSize);
5274 : }
5275 : else
5276 : {
5277 : /* type to type conversion ... ouch, this is expensive way
5278 : of handling single words */
5279 :
5280 0 : if (eRWFlag == GF_Read)
5281 0 : GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
5282 : static_cast<GByte *>(pData) +
5283 0 : iBandBufOffset,
5284 : eBufType, 0, 1);
5285 : else
5286 0 : GDALCopyWords64(static_cast<const GByte *>(pData) +
5287 0 : iBandBufOffset,
5288 0 : eBufType, 0, pabySrcBlock + iSrcOffset,
5289 : eDataType, 0, 1);
5290 : }
5291 : }
5292 :
5293 270 : iBufOffset += static_cast<int>(nPixelSpace);
5294 : }
5295 : }
5296 :
5297 : /* -------------------------------------------------------------------- */
5298 : /* CleanupAndReturn. */
5299 : /* -------------------------------------------------------------------- */
5300 4 : CleanupAndReturn:
5301 4 : CPLFree(papabySrcBlock);
5302 4 : if (papoBlocks != nullptr)
5303 : {
5304 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
5305 : {
5306 8 : if (papoBlocks[iBand] != nullptr)
5307 8 : papoBlocks[iBand]->DropLock();
5308 : }
5309 4 : CPLFree(papoBlocks);
5310 : }
5311 :
5312 4 : return eErr;
5313 : }
5314 :
5315 : //! @endcond
5316 :
5317 : /************************************************************************/
5318 : /* GDALCopyWholeRasterGetSwathSize() */
5319 : /************************************************************************/
5320 :
5321 3405 : static void GDALCopyWholeRasterGetSwathSize(GDALRasterBand *poSrcPrototypeBand,
5322 : GDALRasterBand *poDstPrototypeBand,
5323 : int nBandCount,
5324 : int bDstIsCompressed,
5325 : int bInterleave, int *pnSwathCols,
5326 : int *pnSwathLines)
5327 : {
5328 3405 : GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
5329 3405 : int nSrcBlockXSize = 0;
5330 3405 : int nSrcBlockYSize = 0;
5331 3405 : int nBlockXSize = 0;
5332 3405 : int nBlockYSize = 0;
5333 :
5334 3405 : int nXSize = poSrcPrototypeBand->GetXSize();
5335 3405 : int nYSize = poSrcPrototypeBand->GetYSize();
5336 :
5337 3405 : poSrcPrototypeBand->GetBlockSize(&nSrcBlockXSize, &nSrcBlockYSize);
5338 3405 : poDstPrototypeBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
5339 :
5340 3405 : const int nMaxBlockXSize = std::max(nBlockXSize, nSrcBlockXSize);
5341 3405 : const int nMaxBlockYSize = std::max(nBlockYSize, nSrcBlockYSize);
5342 :
5343 3405 : int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5344 3405 : if (bInterleave)
5345 585 : nPixelSize *= nBandCount;
5346 :
5347 : // aim for one row of blocks. Do not settle for less.
5348 3405 : int nSwathCols = nXSize;
5349 3405 : int nSwathLines = nMaxBlockYSize;
5350 :
5351 6810 : const char *pszSrcCompression = poSrcPrototypeBand->GetMetadataItem(
5352 3405 : GDALMD_COMPRESSION, GDAL_MDD_IMAGE_STRUCTURE);
5353 3405 : if (pszSrcCompression == nullptr)
5354 : {
5355 3385 : auto poSrcDS = poSrcPrototypeBand->GetDataset();
5356 3385 : if (poSrcDS)
5357 3379 : pszSrcCompression = poSrcDS->GetMetadataItem(
5358 3379 : GDALMD_COMPRESSION, GDAL_MDD_IMAGE_STRUCTURE);
5359 : }
5360 :
5361 : /* -------------------------------------------------------------------- */
5362 : /* What will our swath size be? */
5363 : /* -------------------------------------------------------------------- */
5364 : // When writing interleaved data in a compressed format, we want to be sure
5365 : // that each block will only be written once, so the swath size must not be
5366 : // greater than the block cache.
5367 3405 : const char *pszSwathSize = CPLGetConfigOption("GDAL_SWATH_SIZE", nullptr);
5368 : int nTargetSwathSize;
5369 3405 : if (pszSwathSize != nullptr)
5370 0 : nTargetSwathSize = static_cast<int>(
5371 0 : std::min(GIntBig(INT_MAX), CPLAtoGIntBig(pszSwathSize)));
5372 : else
5373 : {
5374 : // As a default, take one 1/4 of the cache size.
5375 3405 : nTargetSwathSize = static_cast<int>(
5376 3405 : std::min(GIntBig(INT_MAX), GDALGetCacheMax64() / 4));
5377 :
5378 : // but if the minimum idal swath buf size is less, then go for it to
5379 : // avoid unnecessarily abusing RAM usage.
5380 : // but try to use 10 MB at least.
5381 3405 : GIntBig nIdealSwathBufSize =
5382 3405 : static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize;
5383 3405 : int nMinTargetSwathSize = 10 * 1000 * 1000;
5384 :
5385 3405 : if ((poSrcPrototypeBand->GetSuggestedBlockAccessPattern() &
5386 3405 : GSBAP_LARGEST_CHUNK_POSSIBLE) != 0)
5387 : {
5388 1 : nMinTargetSwathSize = nTargetSwathSize;
5389 : }
5390 :
5391 3405 : if (nIdealSwathBufSize < nTargetSwathSize &&
5392 3395 : nIdealSwathBufSize < nMinTargetSwathSize)
5393 : {
5394 3392 : nIdealSwathBufSize = nMinTargetSwathSize;
5395 : }
5396 :
5397 3405 : if (pszSrcCompression != nullptr &&
5398 185 : EQUAL(pszSrcCompression, "JPEG2000") &&
5399 0 : (!bDstIsCompressed || ((nSrcBlockXSize % nBlockXSize) == 0 &&
5400 0 : (nSrcBlockYSize % nBlockYSize) == 0)))
5401 : {
5402 2 : nIdealSwathBufSize =
5403 4 : std::max(nIdealSwathBufSize, static_cast<GIntBig>(nSwathCols) *
5404 2 : nSrcBlockYSize * nPixelSize);
5405 : }
5406 3405 : if (nTargetSwathSize > nIdealSwathBufSize)
5407 3392 : nTargetSwathSize = static_cast<int>(
5408 3392 : std::min(GIntBig(INT_MAX), nIdealSwathBufSize));
5409 : }
5410 :
5411 3405 : if (nTargetSwathSize < 1000000)
5412 8 : nTargetSwathSize = 1000000;
5413 :
5414 : /* But let's check that */
5415 3626 : if (bDstIsCompressed && bInterleave &&
5416 221 : nTargetSwathSize > GDALGetCacheMax64())
5417 : {
5418 0 : CPLError(CE_Warning, CPLE_AppDefined,
5419 : "When translating into a compressed interleave format, "
5420 : "the block cache size (" CPL_FRMT_GIB ") "
5421 : "should be at least the size of the swath (%d) "
5422 : "(GDAL_SWATH_SIZE config. option)",
5423 : GDALGetCacheMax64(), nTargetSwathSize);
5424 : }
5425 :
5426 : #define IS_DIVIDER_OF(x, y) ((y) % (x) == 0)
5427 : #define ROUND_TO(x, y) (((x) / (y)) * (y))
5428 :
5429 : // if both input and output datasets are tiled, that the tile dimensions
5430 : // are "compatible", try to stick to a swath dimension that is a multiple
5431 : // of input and output block dimensions.
5432 3405 : if (nBlockXSize != nXSize && nSrcBlockXSize != nXSize &&
5433 47 : IS_DIVIDER_OF(nBlockXSize, nMaxBlockXSize) &&
5434 47 : IS_DIVIDER_OF(nSrcBlockXSize, nMaxBlockXSize) &&
5435 47 : IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
5436 47 : IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
5437 : {
5438 47 : if (static_cast<GIntBig>(nMaxBlockXSize) * nMaxBlockYSize *
5439 47 : nPixelSize <=
5440 47 : static_cast<GIntBig>(nTargetSwathSize))
5441 : {
5442 47 : nSwathCols = nTargetSwathSize / (nMaxBlockYSize * nPixelSize);
5443 47 : nSwathCols = ROUND_TO(nSwathCols, nMaxBlockXSize);
5444 47 : if (nSwathCols == 0)
5445 0 : nSwathCols = nMaxBlockXSize;
5446 47 : if (nSwathCols > nXSize)
5447 45 : nSwathCols = nXSize;
5448 47 : nSwathLines = nMaxBlockYSize;
5449 :
5450 47 : if (static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize >
5451 47 : static_cast<GIntBig>(nTargetSwathSize))
5452 : {
5453 0 : nSwathCols = nXSize;
5454 0 : nSwathLines = nBlockYSize;
5455 : }
5456 : }
5457 : }
5458 :
5459 3405 : const GIntBig nMemoryPerCol = static_cast<GIntBig>(nSwathCols) * nPixelSize;
5460 3405 : const GIntBig nSwathBufSize = nMemoryPerCol * nSwathLines;
5461 3405 : if (nSwathBufSize > static_cast<GIntBig>(nTargetSwathSize))
5462 : {
5463 1 : nSwathLines = static_cast<int>(nTargetSwathSize / nMemoryPerCol);
5464 1 : if (nSwathLines == 0)
5465 1 : nSwathLines = 1;
5466 :
5467 1 : CPLDebug(
5468 : "GDAL",
5469 : "GDALCopyWholeRasterGetSwathSize(): adjusting to %d line swath "
5470 : "since requirement (" CPL_FRMT_GIB " bytes) exceed target swath "
5471 : "size (%d bytes) (GDAL_SWATH_SIZE config. option)",
5472 1 : nSwathLines, nBlockYSize * nMemoryPerCol, nTargetSwathSize);
5473 : }
5474 : // If we are processing single scans, try to handle several at once.
5475 : // If we are handling swaths already, only grow the swath if a row
5476 : // of blocks is substantially less than our target buffer size.
5477 3404 : else if (nSwathLines == 1 ||
5478 2850 : nMemoryPerCol * nSwathLines <
5479 2850 : static_cast<GIntBig>(nTargetSwathSize) / 10)
5480 : {
5481 3376 : nSwathLines = std::min(
5482 : nYSize,
5483 3376 : std::max(1, static_cast<int>(nTargetSwathSize / nMemoryPerCol)));
5484 :
5485 : /* If possible try to align to source and target block height */
5486 3376 : if ((nSwathLines % nMaxBlockYSize) != 0 &&
5487 273 : nSwathLines > nMaxBlockYSize &&
5488 273 : IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
5489 244 : IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
5490 217 : nSwathLines = ROUND_TO(nSwathLines, nMaxBlockYSize);
5491 : }
5492 :
5493 3405 : if (pszSrcCompression != nullptr && EQUAL(pszSrcCompression, "JPEG2000") &&
5494 0 : (!bDstIsCompressed || (IS_DIVIDER_OF(nBlockXSize, nSrcBlockXSize) &&
5495 0 : IS_DIVIDER_OF(nBlockYSize, nSrcBlockYSize))))
5496 : {
5497 : // Typical use case: converting from Pleaiades that is 2048x2048 tiled.
5498 2 : if (nSwathLines < nSrcBlockYSize)
5499 : {
5500 0 : nSwathLines = nSrcBlockYSize;
5501 :
5502 : // Number of pixels that can be read/write simultaneously.
5503 0 : nSwathCols = nTargetSwathSize / (nSrcBlockXSize * nPixelSize);
5504 0 : nSwathCols = ROUND_TO(nSwathCols, nSrcBlockXSize);
5505 0 : if (nSwathCols == 0)
5506 0 : nSwathCols = nSrcBlockXSize;
5507 0 : if (nSwathCols > nXSize)
5508 0 : nSwathCols = nXSize;
5509 :
5510 0 : CPLDebug(
5511 : "GDAL",
5512 : "GDALCopyWholeRasterGetSwathSize(): because of compression and "
5513 : "too high block, "
5514 : "use partial width at one time");
5515 : }
5516 2 : else if ((nSwathLines % nSrcBlockYSize) != 0)
5517 : {
5518 : /* Round on a multiple of nSrcBlockYSize */
5519 0 : nSwathLines = ROUND_TO(nSwathLines, nSrcBlockYSize);
5520 0 : CPLDebug(
5521 : "GDAL",
5522 : "GDALCopyWholeRasterGetSwathSize(): because of compression, "
5523 : "round nSwathLines to block height : %d",
5524 : nSwathLines);
5525 : }
5526 : }
5527 3403 : else if (bDstIsCompressed)
5528 : {
5529 426 : if (nSwathLines < nBlockYSize)
5530 : {
5531 153 : nSwathLines = nBlockYSize;
5532 :
5533 : // Number of pixels that can be read/write simultaneously.
5534 153 : nSwathCols = nTargetSwathSize / (nSwathLines * nPixelSize);
5535 153 : nSwathCols = ROUND_TO(nSwathCols, nBlockXSize);
5536 153 : if (nSwathCols == 0)
5537 0 : nSwathCols = nBlockXSize;
5538 153 : if (nSwathCols > nXSize)
5539 153 : nSwathCols = nXSize;
5540 :
5541 153 : CPLDebug(
5542 : "GDAL",
5543 : "GDALCopyWholeRasterGetSwathSize(): because of compression and "
5544 : "too high block, "
5545 : "use partial width at one time");
5546 : }
5547 273 : else if ((nSwathLines % nBlockYSize) != 0)
5548 : {
5549 : // Round on a multiple of nBlockYSize.
5550 9 : nSwathLines = ROUND_TO(nSwathLines, nBlockYSize);
5551 9 : CPLDebug(
5552 : "GDAL",
5553 : "GDALCopyWholeRasterGetSwathSize(): because of compression, "
5554 : "round nSwathLines to block height : %d",
5555 : nSwathLines);
5556 : }
5557 : }
5558 :
5559 3405 : *pnSwathCols = nSwathCols;
5560 3405 : *pnSwathLines = nSwathLines;
5561 3405 : }
5562 :
5563 : /************************************************************************/
5564 : /* GDALDatasetCopyWholeRaster() */
5565 : /************************************************************************/
5566 :
5567 : /**
5568 : * \brief Copy all dataset raster data.
5569 : *
5570 : * This function copies the complete raster contents of one dataset to
5571 : * another similarly configured dataset. The source and destination
5572 : * dataset must have the same number of bands, and the same width
5573 : * and height. The bands do not have to have the same data type.
5574 : *
5575 : * This function is primarily intended to support implementation of
5576 : * driver specific CreateCopy() functions. It implements efficient copying,
5577 : * in particular "chunking" the copy in substantial blocks and, if appropriate,
5578 : * performing the transfer in a pixel interleaved fashion.
5579 : *
5580 : * Currently the only papszOptions values supported are:
5581 : * <ul>
5582 : * <li>"INTERLEAVE=PIXEL/BAND" to force pixel (resp. band) interleaved read and
5583 : * write access pattern (this does not modify the layout of the destination
5584 : * data)</li>
5585 : * <li>"COMPRESSED=YES" to force alignment on target dataset block
5586 : * sizes to achieve best compression.</li>
5587 : * <li>"SKIP_HOLES=YES" to skip chunks
5588 : * for which GDALGetDataCoverageStatus() returns GDAL_DATA_COVERAGE_STATUS_EMPTY
5589 : * (GDAL >= 2.2)</li>
5590 : * </ul>
5591 : * More options may be supported in the future.
5592 : *
5593 : * @param hSrcDS the source dataset
5594 : * @param hDstDS the destination dataset
5595 : * @param papszOptions transfer hints in "StringList" Name=Value format.
5596 : * @param pfnProgress progress reporting function.
5597 : * @param pProgressData callback data for progress function.
5598 : *
5599 : * @return CE_None on success, or CE_Failure on failure.
5600 : */
5601 :
5602 3377 : CPLErr CPL_STDCALL GDALDatasetCopyWholeRaster(GDALDatasetH hSrcDS,
5603 : GDALDatasetH hDstDS,
5604 : CSLConstList papszOptions,
5605 : GDALProgressFunc pfnProgress,
5606 : void *pProgressData)
5607 :
5608 : {
5609 3377 : VALIDATE_POINTER1(hSrcDS, "GDALDatasetCopyWholeRaster", CE_Failure);
5610 3377 : VALIDATE_POINTER1(hDstDS, "GDALDatasetCopyWholeRaster", CE_Failure);
5611 :
5612 3377 : GDALDataset *poSrcDS = GDALDataset::FromHandle(hSrcDS);
5613 3377 : GDALDataset *poDstDS = GDALDataset::FromHandle(hDstDS);
5614 :
5615 3377 : if (pfnProgress == nullptr)
5616 0 : pfnProgress = GDALDummyProgress;
5617 :
5618 : /* -------------------------------------------------------------------- */
5619 : /* Confirm the datasets match in size and band counts. */
5620 : /* -------------------------------------------------------------------- */
5621 3377 : const int nXSize = poDstDS->GetRasterXSize();
5622 3377 : const int nYSize = poDstDS->GetRasterYSize();
5623 3377 : const int nBandCount = poDstDS->GetRasterCount();
5624 :
5625 3377 : if (poSrcDS->GetRasterXSize() != nXSize ||
5626 6754 : poSrcDS->GetRasterYSize() != nYSize ||
5627 3377 : poSrcDS->GetRasterCount() != nBandCount)
5628 : {
5629 0 : CPLError(CE_Failure, CPLE_AppDefined,
5630 : "Input and output dataset sizes or band counts do not\n"
5631 : "match in GDALDatasetCopyWholeRaster()");
5632 0 : return CE_Failure;
5633 : }
5634 :
5635 : /* -------------------------------------------------------------------- */
5636 : /* Report preliminary (0) progress. */
5637 : /* -------------------------------------------------------------------- */
5638 3377 : if (!pfnProgress(0.0, nullptr, pProgressData))
5639 : {
5640 1 : CPLError(CE_Failure, CPLE_UserInterrupt,
5641 : "User terminated CreateCopy()");
5642 1 : return CE_Failure;
5643 : }
5644 :
5645 : /* -------------------------------------------------------------------- */
5646 : /* Get our prototype band, and assume the others are similarly */
5647 : /* configured. */
5648 : /* -------------------------------------------------------------------- */
5649 3376 : if (nBandCount == 0)
5650 0 : return CE_None;
5651 :
5652 3376 : GDALRasterBand *poSrcPrototypeBand = poSrcDS->GetRasterBand(1);
5653 3376 : GDALRasterBand *poDstPrototypeBand = poDstDS->GetRasterBand(1);
5654 3376 : GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
5655 :
5656 : /* -------------------------------------------------------------------- */
5657 : /* Do we want to try and do the operation in a pixel */
5658 : /* interleaved fashion? */
5659 : /* -------------------------------------------------------------------- */
5660 3376 : bool bInterleave = false;
5661 : const char *pszInterleave =
5662 3376 : poSrcDS->GetMetadataItem(GDALMD_INTERLEAVE, GDAL_MDD_IMAGE_STRUCTURE);
5663 3376 : if (pszInterleave != nullptr &&
5664 2965 : (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
5665 209 : bInterleave = true;
5666 :
5667 : pszInterleave =
5668 3376 : poDstDS->GetMetadataItem(GDALMD_INTERLEAVE, GDAL_MDD_IMAGE_STRUCTURE);
5669 3376 : if (pszInterleave != nullptr &&
5670 2910 : (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
5671 530 : bInterleave = true;
5672 :
5673 3376 : pszInterleave = CSLFetchNameValue(papszOptions, GDALMD_INTERLEAVE);
5674 3376 : if (pszInterleave != nullptr && EQUAL(pszInterleave, "PIXEL"))
5675 5 : bInterleave = true;
5676 3371 : else if (pszInterleave != nullptr && EQUAL(pszInterleave, "BAND"))
5677 13 : bInterleave = false;
5678 : // attributes is specific to the TileDB driver
5679 3358 : else if (pszInterleave != nullptr && EQUAL(pszInterleave, "ATTRIBUTES"))
5680 4 : bInterleave = true;
5681 3354 : else if (pszInterleave != nullptr)
5682 : {
5683 0 : CPLError(CE_Warning, CPLE_NotSupported,
5684 : "Unsupported value for option INTERLEAVE");
5685 : }
5686 :
5687 : // If the destination is compressed, we must try to write blocks just once,
5688 : // to save disk space (GTiff case for example), and to avoid data loss
5689 : // (JPEG compression for example).
5690 3376 : bool bDstIsCompressed = false;
5691 : const char *pszDstCompressed =
5692 3376 : CSLFetchNameValue(papszOptions, "COMPRESSED");
5693 3376 : if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
5694 400 : bDstIsCompressed = true;
5695 :
5696 : /* -------------------------------------------------------------------- */
5697 : /* What will our swath size be? */
5698 : /* -------------------------------------------------------------------- */
5699 :
5700 3376 : int nSwathCols = 0;
5701 3376 : int nSwathLines = 0;
5702 3376 : GDALCopyWholeRasterGetSwathSize(poSrcPrototypeBand, poDstPrototypeBand,
5703 : nBandCount, bDstIsCompressed, bInterleave,
5704 : &nSwathCols, &nSwathLines);
5705 :
5706 3376 : int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5707 3376 : if (bInterleave)
5708 585 : nPixelSize *= nBandCount;
5709 :
5710 3376 : void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
5711 3376 : if (pSwathBuf == nullptr)
5712 : {
5713 0 : return CE_Failure;
5714 : }
5715 :
5716 3376 : CPLDebug("GDAL",
5717 : "GDALDatasetCopyWholeRaster(): %d*%d swaths, bInterleave=%d",
5718 : nSwathCols, nSwathLines, static_cast<int>(bInterleave));
5719 :
5720 : // Advise the source raster that we are going to read it completely
5721 : // Note: this might already have been done by GDALCreateCopy() in the
5722 : // likely case this function is indirectly called by it
5723 3376 : poSrcDS->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nBandCount,
5724 3376 : nullptr, nullptr);
5725 :
5726 : /* ==================================================================== */
5727 : /* Band oriented (uninterleaved) case. */
5728 : /* ==================================================================== */
5729 3376 : CPLErr eErr = CE_None;
5730 : const bool bCheckHoles =
5731 3376 : CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
5732 :
5733 3376 : if (!bInterleave)
5734 : {
5735 : GDALRasterIOExtraArg sExtraArg;
5736 2791 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5737 2791 : CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress); // to make cppcheck happy
5738 :
5739 8373 : const GIntBig nTotalBlocks = static_cast<GIntBig>(nBandCount) *
5740 2791 : DIV_ROUND_UP(nYSize, nSwathLines) *
5741 2791 : DIV_ROUND_UP(nXSize, nSwathCols);
5742 2791 : GIntBig nBlocksDone = 0;
5743 :
5744 8025 : for (int iBand = 0; iBand < nBandCount && eErr == CE_None; iBand++)
5745 : {
5746 5234 : int nBand = iBand + 1;
5747 :
5748 10731 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5749 : {
5750 5497 : int nThisLines = nSwathLines;
5751 :
5752 5497 : if (iY + nThisLines > nYSize)
5753 375 : nThisLines = nYSize - iY;
5754 :
5755 10994 : for (int iX = 0; iX < nXSize && eErr == CE_None;
5756 5497 : iX += nSwathCols)
5757 : {
5758 5497 : int nThisCols = nSwathCols;
5759 :
5760 5497 : if (iX + nThisCols > nXSize)
5761 0 : nThisCols = nXSize - iX;
5762 :
5763 5497 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5764 5497 : if (bCheckHoles)
5765 : {
5766 : nStatus = poSrcDS->GetRasterBand(nBand)
5767 3779 : ->GetDataCoverageStatus(
5768 : iX, iY, nThisCols, nThisLines,
5769 : GDAL_DATA_COVERAGE_STATUS_DATA);
5770 : }
5771 5497 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5772 : {
5773 5493 : sExtraArg.pfnProgress = GDALScaledProgress;
5774 10986 : sExtraArg.pProgressData = GDALCreateScaledProgress(
5775 5493 : nBlocksDone / static_cast<double>(nTotalBlocks),
5776 5493 : (nBlocksDone + 0.5) /
5777 5493 : static_cast<double>(nTotalBlocks),
5778 : pfnProgress, pProgressData);
5779 5493 : if (sExtraArg.pProgressData == nullptr)
5780 1688 : sExtraArg.pfnProgress = nullptr;
5781 :
5782 5493 : eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
5783 : nThisLines, pSwathBuf,
5784 : nThisCols, nThisLines, eDT, 1,
5785 : &nBand, 0, 0, 0, &sExtraArg);
5786 :
5787 5493 : GDALDestroyScaledProgress(sExtraArg.pProgressData);
5788 :
5789 5493 : if (eErr == CE_None)
5790 5485 : eErr = poDstDS->RasterIO(
5791 : GF_Write, iX, iY, nThisCols, nThisLines,
5792 : pSwathBuf, nThisCols, nThisLines, eDT, 1,
5793 : &nBand, 0, 0, 0, nullptr);
5794 : }
5795 :
5796 5497 : nBlocksDone++;
5797 10951 : if (eErr == CE_None &&
5798 5454 : !pfnProgress(nBlocksDone /
5799 5454 : static_cast<double>(nTotalBlocks),
5800 : nullptr, pProgressData))
5801 : {
5802 2 : eErr = CE_Failure;
5803 2 : CPLError(CE_Failure, CPLE_UserInterrupt,
5804 : "User terminated CreateCopy()");
5805 : }
5806 : }
5807 : }
5808 : }
5809 : }
5810 :
5811 : /* ==================================================================== */
5812 : /* Pixel interleaved case. */
5813 : /* ==================================================================== */
5814 : else /* if( bInterleave ) */
5815 : {
5816 : GDALRasterIOExtraArg sExtraArg;
5817 585 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5818 585 : CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress); // to make cppcheck happy
5819 :
5820 585 : const GIntBig nTotalBlocks =
5821 585 : static_cast<GIntBig>(DIV_ROUND_UP(nYSize, nSwathLines)) *
5822 585 : DIV_ROUND_UP(nXSize, nSwathCols);
5823 585 : GIntBig nBlocksDone = 0;
5824 :
5825 1392 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5826 : {
5827 807 : int nThisLines = nSwathLines;
5828 :
5829 807 : if (iY + nThisLines > nYSize)
5830 198 : nThisLines = nYSize - iY;
5831 :
5832 1619 : for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
5833 : {
5834 812 : int nThisCols = nSwathCols;
5835 :
5836 812 : if (iX + nThisCols > nXSize)
5837 3 : nThisCols = nXSize - iX;
5838 :
5839 812 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5840 812 : if (bCheckHoles)
5841 : {
5842 553 : nStatus = 0;
5843 606 : for (int iBand = 0; iBand < nBandCount; iBand++)
5844 : {
5845 587 : nStatus |= poSrcDS->GetRasterBand(iBand + 1)
5846 587 : ->GetDataCoverageStatus(
5847 : iX, iY, nThisCols, nThisLines,
5848 : GDAL_DATA_COVERAGE_STATUS_DATA);
5849 587 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5850 534 : break;
5851 : }
5852 : }
5853 812 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5854 : {
5855 793 : sExtraArg.pfnProgress = GDALScaledProgress;
5856 1586 : sExtraArg.pProgressData = GDALCreateScaledProgress(
5857 793 : nBlocksDone / static_cast<double>(nTotalBlocks),
5858 793 : (nBlocksDone + 0.5) / static_cast<double>(nTotalBlocks),
5859 : pfnProgress, pProgressData);
5860 793 : if (sExtraArg.pProgressData == nullptr)
5861 377 : sExtraArg.pfnProgress = nullptr;
5862 :
5863 793 : eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
5864 : nThisLines, pSwathBuf, nThisCols,
5865 : nThisLines, eDT, nBandCount,
5866 : nullptr, 0, 0, 0, &sExtraArg);
5867 :
5868 793 : GDALDestroyScaledProgress(sExtraArg.pProgressData);
5869 :
5870 793 : if (eErr == CE_None)
5871 792 : eErr = poDstDS->RasterIO(
5872 : GF_Write, iX, iY, nThisCols, nThisLines, pSwathBuf,
5873 : nThisCols, nThisLines, eDT, nBandCount, nullptr, 0,
5874 : 0, 0, nullptr);
5875 : }
5876 :
5877 812 : nBlocksDone++;
5878 1619 : if (eErr == CE_None &&
5879 807 : !pfnProgress(nBlocksDone /
5880 807 : static_cast<double>(nTotalBlocks),
5881 : nullptr, pProgressData))
5882 : {
5883 1 : eErr = CE_Failure;
5884 1 : CPLError(CE_Failure, CPLE_UserInterrupt,
5885 : "User terminated CreateCopy()");
5886 : }
5887 : }
5888 : }
5889 : }
5890 :
5891 : /* -------------------------------------------------------------------- */
5892 : /* Cleanup */
5893 : /* -------------------------------------------------------------------- */
5894 3376 : CPLFree(pSwathBuf);
5895 :
5896 3376 : return eErr;
5897 : }
5898 :
5899 : /************************************************************************/
5900 : /* GDALRasterBandCopyWholeRaster() */
5901 : /************************************************************************/
5902 :
5903 : /**
5904 : * \brief Copy a whole raster band
5905 : *
5906 : * This function copies the complete raster contents of one band to
5907 : * another similarly configured band. The source and destination
5908 : * bands must have the same width and height. The bands do not have
5909 : * to have the same data type.
5910 : *
5911 : * It implements efficient copying, in particular "chunking" the copy in
5912 : * substantial blocks.
5913 : *
5914 : * Currently the only papszOptions values supported are:
5915 : * <ul>
5916 : * <li>"COMPRESSED=YES" to force alignment on target dataset block sizes to
5917 : * achieve best compression.</li>
5918 : * <li>"SKIP_HOLES=YES" to skip chunks for which GDALGetDataCoverageStatus()
5919 : * returns GDAL_DATA_COVERAGE_STATUS_EMPTY (GDAL >= 2.2)</li>
5920 : * </ul>
5921 : *
5922 : * @param hSrcBand the source band
5923 : * @param hDstBand the destination band
5924 : * @param papszOptions transfer hints in "StringList" Name=Value format.
5925 : * @param pfnProgress progress reporting function.
5926 : * @param pProgressData callback data for progress function.
5927 : *
5928 : * @return CE_None on success, or CE_Failure on failure.
5929 : */
5930 :
// Copies the whole content of hSrcBand into hDstBand, one swath at a time,
// through a temporary buffer typed like the destination band.
// Returns CE_Failure when the band sizes differ, when the swath buffer cannot
// be allocated or when the progress callback returns false; otherwise returns
// the status of the last RasterIO() call (CE_None when everything succeeded).
5931 29 : CPLErr CPL_STDCALL GDALRasterBandCopyWholeRaster(
5932 : GDALRasterBandH hSrcBand, GDALRasterBandH hDstBand,
5933 : const char *const *const papszOptions, GDALProgressFunc pfnProgress,
5934 : void *pProgressData)
5935 :
5936 : {
// NOTE(review): VALIDATE_POINTER1 presumably reports an error and returns
// CE_Failure when the handle is null - confirm against the macro definition.
5937 29 : VALIDATE_POINTER1(hSrcBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
5938 29 : VALIDATE_POINTER1(hDstBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
5939 :
5940 29 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
5941 29 : GDALRasterBand *poDstBand = GDALRasterBand::FromHandle(hDstBand);
5942 29 : CPLErr eErr = CE_None;
5943 :
// Substitute a dummy callback so that pfnProgress can be called
// unconditionally below.
5944 29 : if (pfnProgress == nullptr)
5945 2 : pfnProgress = GDALDummyProgress;
5946 :
5947 : /* -------------------------------------------------------------------- */
5948 : /* Confirm the source and destination bands match in size. */
5949 : /* -------------------------------------------------------------------- */
5950 29 : int nXSize = poSrcBand->GetXSize();
5951 29 : int nYSize = poSrcBand->GetYSize();
5952 :
5953 29 : if (poDstBand->GetXSize() != nXSize || poDstBand->GetYSize() != nYSize)
5954 : {
5955 0 : CPLError(CE_Failure, CPLE_AppDefined,
5956 : "Input and output band sizes do not\n"
5957 : "match in GDALRasterBandCopyWholeRaster()");
5958 0 : return CE_Failure;
5959 : }
5960 :
5961 : /* -------------------------------------------------------------------- */
5962 : /* Report preliminary (0) progress. */
5963 : /* -------------------------------------------------------------------- */
// A false return from the callback is treated as a cancellation request.
5964 29 : if (!pfnProgress(0.0, nullptr, pProgressData))
5965 : {
5966 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
5967 : "User terminated CreateCopy()");
5968 0 : return CE_Failure;
5969 : }
5970 :
// The swath buffer uses the destination band's data type: both the read from
// the source and the write to the destination below are requested in eDT.
5971 29 : GDALDataType eDT = poDstBand->GetRasterDataType();
5972 :
5973 : // If the destination is compressed, we must try to write blocks just once,
5974 : // to save disk space (GTiff case for example), and to avoid data loss
5975 : // (JPEG compression for example).
5976 29 : bool bDstIsCompressed = false;
5977 : const char *pszDstCompressed =
5978 29 : CSLFetchNameValue(const_cast<char **>(papszOptions), "COMPRESSED");
5979 29 : if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
5980 26 : bDstIsCompressed = true;
5981 :
5982 : /* -------------------------------------------------------------------- */
5983 : /* What will our swath size be? */
5984 : /* -------------------------------------------------------------------- */
5985 :
5986 29 : int nSwathCols = 0;
5987 29 : int nSwathLines = 0;
// NOTE(review): the literal arguments 1 and FALSE presumably mean "one band"
// and "not pixel interleaved" - confirm against the helper's signature.
5988 29 : GDALCopyWholeRasterGetSwathSize(poSrcBand, poDstBand, 1, bDstIsCompressed,
5989 : FALSE, &nSwathCols, &nSwathLines);
5990 :
5991 29 : const int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5992 :
// One buffer of nSwathCols * nSwathLines pixels is reused for every swath.
// Returning here leaks nothing: no other resource has been acquired yet.
5993 29 : void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
5994 29 : if (pSwathBuf == nullptr)
5995 : {
5996 0 : return CE_Failure;
5997 : }
5998 :
5999 29 : CPLDebug("GDAL", "GDALRasterBandCopyWholeRaster(): %d*%d swaths",
6000 : nSwathCols, nSwathLines);
6001 :
6002 : const bool bCheckHoles =
6003 29 : CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
6004 :
6005 : // Advise the source raster that we are going to read it completely
6006 29 : poSrcBand->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nullptr);
6007 :
6008 : /* ==================================================================== */
6009 : /* Copy the band swath by swath. */
6010 : /* ==================================================================== */
6011 :
// Both loops also stop as soon as eErr is set, so every failure below falls
// through to the cleanup code that frees pSwathBuf.
6012 72 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
6013 : {
6014 43 : int nThisLines = nSwathLines;
6015 :
// Clip the last swath of the column to the raster height.
6016 43 : if (iY + nThisLines > nYSize)
6017 8 : nThisLines = nYSize - iY;
6018 :
6019 86 : for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
6020 : {
6021 43 : int nThisCols = nSwathCols;
6022 :
// Clip the last swath of the row to the raster width.
6023 43 : if (iX + nThisCols > nXSize)
6024 0 : nThisCols = nXSize - iX;
6025 :
// Without SKIP_HOLES every swath is treated as containing data. With it,
// the swath is copied only when the source reports the DATA flag for it.
6026 43 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
6027 43 : if (bCheckHoles)
6028 : {
6029 0 : nStatus = poSrcBand->GetDataCoverageStatus(
6030 : iX, iY, nThisCols, nThisLines,
6031 : GDAL_DATA_COVERAGE_STATUS_DATA);
6032 : }
6033 43 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
6034 : {
6035 43 : eErr = poSrcBand->RasterIO(GF_Read, iX, iY, nThisCols,
6036 : nThisLines, pSwathBuf, nThisCols,
6037 : nThisLines, eDT, 0, 0, nullptr);
6038 :
6039 43 : if (eErr == CE_None)
6040 43 : eErr = poDstBand->RasterIO(GF_Write, iX, iY, nThisCols,
6041 : nThisLines, pSwathBuf, nThisCols,
6042 : nThisLines, eDT, 0, 0, nullptr);
6043 : }
6044 :
// The reported fraction depends on the row position only, so every swath of
// the same row of swaths reports the same value.
6045 86 : if (eErr == CE_None && !pfnProgress(double(iY + nThisLines) /
6046 43 : static_cast<double>(nYSize),
6047 : nullptr, pProgressData))
6048 : {
6049 0 : eErr = CE_Failure;
6050 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
6051 : "User terminated CreateCopy()");
6052 : }
6053 : }
6054 : }
6055 :
6056 : /* -------------------------------------------------------------------- */
6057 : /* Cleanup */
6058 : /* -------------------------------------------------------------------- */
6059 29 : CPLFree(pSwathBuf);
6060 :
6061 29 : return eErr;
6062 : }
6063 :
6064 : /************************************************************************/
6065 : /* GDALCopyRasterIOExtraArg () */
6066 : /************************************************************************/
6067 :
// Resets *psDestArg with INIT_RASTERIO_EXTRA_ARG and then, when psSrcArg is
// non-null, copies the source settings into it. With a null psSrcArg the
// destination keeps the values set by INIT_RASTERIO_EXTRA_ARG.
//
// psDestArg: structure to fill; dereferenced unconditionally, so it must not
//            be null.
// psSrcArg:  structure to copy from, or nullptr.
6068 535029 : void GDALCopyRasterIOExtraArg(GDALRasterIOExtraArg *psDestArg,
6069 : const GDALRasterIOExtraArg *psSrcArg)
6070 : {
6071 535029 : INIT_RASTERIO_EXTRA_ARG(*psDestArg);
6072 535029 : if (psSrcArg)
6073 : {
6074 535029 : psDestArg->eResampleAlg = psSrcArg->eResampleAlg;
6075 535029 : psDestArg->pfnProgress = psSrcArg->pfnProgress;
6076 535029 : psDestArg->pProgressData = psSrcArg->pProgressData;
6077 535029 : psDestArg->bFloatingPointWindowValidity =
6078 535029 : psSrcArg->bFloatingPointWindowValidity;
// The floating-point window is copied only when the source flags it as valid.
6079 535029 : if (psSrcArg->bFloatingPointWindowValidity)
6080 : {
6081 212054 : psDestArg->dfXOff = psSrcArg->dfXOff;
6082 212054 : psDestArg->dfYOff = psSrcArg->dfYOff;
6083 212054 : psDestArg->dfXSize = psSrcArg->dfXSize;
6084 212054 : psDestArg->dfYSize = psSrcArg->dfYSize;
6085 : }
// The two members below are read only when the source structure declares a
// high enough nVersion.
// NOTE(review): presumably older versions of the structure lack these
// members - confirm against the GDALRasterIOExtraArg declaration.
6086 535029 : if (psSrcArg->nVersion >= 2)
6087 : {
6088 535029 : psDestArg->bUseOnlyThisScale = psSrcArg->bUseOnlyThisScale;
6089 : }
6090 535029 : if (psSrcArg->nVersion >= 3)
6091 : {
6092 535029 : psDestArg->bOperateInBufType = psSrcArg->bOperateInBufType;
6093 : }
6094 : }
6095 535029 : }
6096 :
6097 : /************************************************************************/
6098 : /* HasOnlyNoData() */
6099 : /************************************************************************/
6100 :
// Returns whether value matches noDataValue.
// Generic version: plain operator== comparison.
6101 51285976 : template <class T> static inline bool IsEqualToNoData(T value, T noDataValue)
6102 : {
6103 51285976 : return value == noDataValue;
6104 : }
6105 :
// GFloat16 version: a NaN nodata value matches any NaN sample (operator==
// alone never reports two NaNs as equal); otherwise operator== is used.
// isnan is called unqualified after a using-declaration, so overload
// resolution considers std::isnan as well as any overload found through
// argument-dependent lookup for GFloat16.
6106 5509 : template <> bool IsEqualToNoData<GFloat16>(GFloat16 value, GFloat16 noDataValue)
6107 : {
6108 : using std::isnan;
6109 5509 : return isnan(noDataValue) ? isnan(value) : value == noDataValue;
6110 : }
6111 :
// float version: a NaN nodata value matches any NaN sample; otherwise
// operator== is used.
6112 251221 : template <> bool IsEqualToNoData<float>(float value, float noDataValue)
6113 : {
6114 251221 : return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
6115 : }
6116 :
// double version: a NaN nodata value matches any NaN sample; otherwise
// operator== is used.
6117 264257 : template <> bool IsEqualToNoData<double>(double value, double noDataValue)
6118 : {
6119 264257 : return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
6120 : }
6121 :
// Returns true when every sample of the window matches noDataValue according
// to IsEqualToNoData().
//
// pBuffer:     first sample of the window.
// noDataValue: value every sample is compared against.
// nWidth:      number of pixels per line to test.
// nHeight:     number of lines to test.
// nLineStride: distance between the starts of two consecutive lines, in
//              pixels (it is multiplied by nComponents when indexing).
// nComponents: number of interleaved samples per pixel.
//
// NOTE(review): nWidth - 1 and nHeight - 1 are computed on size_t and would
// wrap around for a zero-sized window; callers presumably guarantee
// nWidth >= 1 and nHeight >= 1 - confirm.
6122 : template <class T>
6123 12024 : static bool HasOnlyNoDataT(const T *pBuffer, T noDataValue, size_t nWidth,
6124 : size_t nHeight, size_t nLineStride,
6125 : size_t nComponents)
6126 : {
6127 : // Fast test: check the 4 corners and the middle pixel.
// For each component, in order: first pixel of the first line, last pixel of
// the first line, middle pixel, first pixel of the last line, last pixel of
// the last line. Any mismatch gives a negative answer without a full scan.
6128 23297 : for (size_t iBand = 0; iBand < nComponents; iBand++)
6129 : {
6130 24095 : if (!(IsEqualToNoData(pBuffer[iBand], noDataValue) &&
6131 11880 : IsEqualToNoData(pBuffer[(nWidth - 1) * nComponents + iBand],
6132 11750 : noDataValue) &&
6133 11750 : IsEqualToNoData(
6134 11750 : pBuffer[((nHeight - 1) / 2 * nLineStride + (nWidth - 1) / 2) *
6135 11750 : nComponents +
6136 : iBand],
6137 11276 : noDataValue) &&
6138 11276 : IsEqualToNoData(
6139 11276 : pBuffer[(nHeight - 1) * nLineStride * nComponents + iBand],
6140 : noDataValue) &&
6141 11276 : IsEqualToNoData(
6142 11276 : pBuffer[((nHeight - 1) * nLineStride + nWidth - 1) *
6143 11276 : nComponents +
6144 : iBand],
6145 : noDataValue)))
6146 : {
6147 942 : return false;
6148 : }
6149 : }
6150 :
6151 : // Test all pixels.
// Only the first nWidth * nComponents samples of each line are visited;
// samples between nWidth and nLineStride are ignored.
6152 52954 : for (size_t iY = 0; iY < nHeight; iY++)
6153 : {
6154 41993 : const T *pBufferLine = pBuffer + iY * nLineStride * nComponents;
6155 51790448 : for (size_t iX = 0; iX < nWidth * nComponents; iX++)
6156 : {
6157 51748615 : if (!IsEqualToNoData(pBufferLine[iX], noDataValue))
6158 : {
6159 121 : return false;
6160 : }
6161 : }
6162 : }
6163 10961 : return true;
6164 : }
6165 :
6166 : /************************************************************************/
6167 : /* GDALBufferHasOnlyNoData() */
6168 : /************************************************************************/
6169 :
// Returns true when every sample of the buffer window equals dfNoDataValue.
//
// pBuffer:        first sample of the window.
// dfNoDataValue:  nodata value, expressed as a double.
// nWidth:         number of pixels per line to test.
// nHeight:        number of lines to test.
// nLineStride:    distance between the starts of consecutive lines, in pixels.
// nComponents:    number of interleaved samples per pixel.
// nBitsPerSample: size of one sample, in bits.
// nSampleFormat:  GSF_UNSIGNED_INT, GSF_SIGNED_INT or GSF_FLOATING_POINT.
//
// Strategy:
//  1. nodata == 0, contiguous lines, integer samples: the window is scanned
//     as raw bytes that must all be zero.
//  2. (HAVE_SSE2 builds only) nodata == 0, contiguous lines, 32 or 64 bit
//     floating point: bit patterns are tested with the sign bit masked out,
//     so that both +0.0 and -0.0 match.
//  3. Everything else: dispatch to HasOnlyNoDataT<T> on the sample type.
//     Returns false when the nodata value is out of range for the sample type
//     (except for 64 bit floating point, which has no range check) or when
//     the bit depth / format combination is not handled.
6170 44012 : bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
6171 : size_t nWidth, size_t nHeight, size_t nLineStride,
6172 : size_t nComponents, int nBitsPerSample,
6173 : GDALBufferSampleFormat nSampleFormat)
6174 : {
6175 : // In the case where the nodata is 0, we can compare several bytes at
6176 : // once. Select the largest natural integer type for the architecture.
6177 44012 : if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
6178 : // Do not use this optimized code path for floating point numbers,
6179 : // as it can't detect negative zero.
6180 : nSampleFormat != GSF_FLOATING_POINT)
6181 : {
6182 27266 : const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
// Size of the window in bytes: the bit count is computed in 64 bit arithmetic
// and rounded up to a whole byte.
6183 27266 : const size_t nSize =
6184 27266 : static_cast<size_t>((static_cast<uint64_t>(nWidth) * nHeight *
6185 27266 : nComponents * nBitsPerSample +
6186 : 7) /
6187 : 8);
6188 : #ifdef HAVE_SSE2
6189 27266 : size_t n = nSize;
6190 : // Align to 16 bytes
// Bytes are tested one by one until the pointer is 16-byte aligned, because
// the vector loop below uses the aligned load _mm_load_si128.
6191 27329 : while ((reinterpret_cast<uintptr_t>(pabyBuffer) & 15) != 0 && n > 0)
6192 : {
6193 73 : --n;
6194 73 : if (*pabyBuffer)
6195 10 : return false;
6196 63 : pabyBuffer++;
6197 : }
6198 :
6199 27256 : const auto zero = _mm_setzero_si128();
6200 27256 : constexpr int UNROLLING = 4;
// Main loop: 4 vectors of 16 bytes (64 bytes) per iteration, OR-ed together
// so that a single test detects any non-zero byte.
6201 2223230 : while (n >= UNROLLING * sizeof(zero))
6202 : {
6203 2207980 : const auto v0 = _mm_load_si128(reinterpret_cast<const __m128i *>(
6204 : pabyBuffer + 0 * sizeof(zero)));
6205 2207980 : const auto v1 = _mm_load_si128(reinterpret_cast<const __m128i *>(
6206 2207980 : pabyBuffer + 1 * sizeof(zero)));
6207 2207980 : const auto v2 = _mm_load_si128(reinterpret_cast<const __m128i *>(
6208 2207980 : pabyBuffer + 2 * sizeof(zero)));
6209 2207980 : const auto v3 = _mm_load_si128(reinterpret_cast<const __m128i *>(
6210 2207980 : pabyBuffer + 3 * sizeof(zero)));
6211 : const auto v =
6212 6623940 : _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
// Either test v directly with _mm_test_all_zeros, or compare each byte with
// zero and require all 16 bits of the resulting mask to be set.
6213 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
6214 : if (!_mm_test_all_zeros(v, v))
6215 : #else
6216 4415960 : if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
6217 : #endif
6218 : {
6219 12002 : return false;
6220 : }
6221 2195980 : pabyBuffer += UNROLLING * sizeof(zero);
6222 2195980 : n -= UNROLLING * sizeof(zero);
6223 : }
6224 :
// Remaining bytes (fewer than 64) are tested one by one.
6225 233639 : while (n > 0)
6226 : {
6227 218489 : --n;
6228 218489 : if (*pabyBuffer)
6229 104 : return false;
6230 218385 : pabyBuffer++;
6231 : }
6232 : #else
6233 : #if SIZEOF_VOIDP >= 8 || defined(__x86_64__)
6234 : // We test __x86_64__ for x32 arch where SIZEOF_VOIDP == 4
6235 : typedef std::uint64_t WordType;
6236 : #else
6237 : typedef std::uint32_t WordType;
6238 : #endif
6239 :
// Number of leading bytes tested individually: up to the next WordType
// boundary (a full word's worth when the pointer is already aligned), capped
// at nSize. After them, pabyBuffer + i is WordType-aligned.
6240 : const size_t nInitialIters =
6241 : std::min(sizeof(WordType) -
6242 : static_cast<size_t>(
6243 : reinterpret_cast<std::uintptr_t>(pabyBuffer) %
6244 : sizeof(WordType)),
6245 : nSize);
6246 : size_t i = 0;
6247 : for (; i < nInitialIters; i++)
6248 : {
6249 : if (pabyBuffer[i])
6250 : return false;
6251 : }
// Whole words, then the trailing bytes that do not fill a word.
6252 : for (; i + sizeof(WordType) - 1 < nSize; i += sizeof(WordType))
6253 : {
6254 : if (*(reinterpret_cast<const WordType *>(pabyBuffer + i)))
6255 : return false;
6256 : }
6257 : for (; i < nSize; i++)
6258 : {
6259 : if (pabyBuffer[i])
6260 : return false;
6261 : }
6262 : #endif
6263 15150 : return true;
6264 : }
6265 :
// The two "else if" branches below continue the "if" above and exist only in
// HAVE_SSE2 builds; in other builds zero-nodata floating point buffers are
// handled by the generic dispatch at the end of the function.
6266 : #ifdef HAVE_SSE2
6267 16746 : else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
6268 710 : nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
6269 : {
// 32 bit floats: a value is +0.0 or -0.0 exactly when its bit pattern with
// the sign bit cleared is zero.
6270 710 : const auto signMask = _mm_set1_epi32(0x7FFFFFFF);
6271 710 : const auto zero = _mm_setzero_si128();
6272 710 : const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
// n and i count values, while pabyBuffer advances in bytes.
6273 710 : const size_t n = nWidth * nHeight * nComponents;
6274 :
6275 710 : size_t i = 0;
6276 710 : constexpr int UNROLLING = 4;
6277 710 : constexpr size_t VALUES_PER_ITER =
6278 : UNROLLING * sizeof(zero) / sizeof(float);
// Unaligned loads (_mm_loadu_si128) are used, so no alignment prologue is
// needed here.
6279 24985 : for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
6280 : {
6281 24936 : const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6282 : pabyBuffer + 0 * sizeof(zero)));
6283 24936 : const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6284 24936 : pabyBuffer + 1 * sizeof(zero)));
6285 24936 : const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6286 24936 : pabyBuffer + 2 * sizeof(zero)));
6287 24936 : const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6288 24936 : pabyBuffer + 3 * sizeof(zero)));
6289 74808 : auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
6290 : // Clear the sign bit (makes -0.0 become +0.0)
6291 24936 : v = _mm_and_si128(v, signMask);
6292 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
6293 : if (!_mm_test_all_zeros(v, v))
6294 : #else
6295 49872 : if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
6296 : #endif
6297 : {
6298 661 : return false;
6299 : }
6300 24275 : pabyBuffer += UNROLLING * sizeof(zero);
6301 : }
6302 :
// Scalar tail: memcpy is used to read the bit pattern of each remaining value.
6303 304 : for (; i < n; i++)
6304 : {
6305 : uint32_t bits;
6306 272 : memcpy(&bits, pabyBuffer, sizeof(bits));
6307 272 : pabyBuffer += sizeof(bits);
6308 272 : if ((bits & 0x7FFFFFFF) != 0)
6309 17 : return false;
6310 : }
6311 :
6312 32 : return true;
6313 : }
6314 :
6315 16036 : else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
6316 4005 : nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
6317 : {
// 64 bit floats: same approach as the 32 bit branch, with a 64 bit sign mask.
6318 4005 : const auto signMask = _mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL);
6319 4005 : const auto zero = _mm_setzero_si128();
6320 4005 : const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
// n and i count values, while pabyBuffer advances in bytes.
6321 4005 : const size_t n = nWidth * nHeight * nComponents;
6322 :
6323 4005 : size_t i = 0;
6324 4005 : constexpr int UNROLLING = 4;
6325 4005 : constexpr size_t VALUES_PER_ITER =
6326 : UNROLLING * sizeof(zero) / sizeof(double);
6327 1664960 : for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
6328 : {
6329 1661340 : const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6330 : pabyBuffer + 0 * sizeof(zero)));
6331 1661340 : const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6332 1661340 : pabyBuffer + 1 * sizeof(zero)));
6333 1661340 : const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6334 1661340 : pabyBuffer + 2 * sizeof(zero)));
6335 1661340 : const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6336 1661340 : pabyBuffer + 3 * sizeof(zero)));
6337 4984020 : auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
6338 : // Clear the sign bit (makes -0.0 become +0.0)
6339 1661340 : v = _mm_and_si128(v, signMask);
6340 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
6341 : if (!_mm_test_all_zeros(v, v))
6342 : #else
6343 3322680 : if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
6344 : #endif
6345 : {
6346 389 : return false;
6347 : }
6348 1660950 : pabyBuffer += UNROLLING * sizeof(zero);
6349 : }
6350 :
6351 3643 : for (; i < n; i++)
6352 : {
6353 : uint64_t bits;
6354 34 : memcpy(&bits, pabyBuffer, sizeof(bits));
6355 34 : pabyBuffer += sizeof(bits);
6356 34 : if ((bits & 0x7FFFFFFFFFFFFFFFULL) != 0)
6357 7 : return false;
6358 : }
6359 :
6360 3609 : return true;
6361 : }
6362 : #endif
6363 :
// Generic dispatch on (bit depth, sample format). In each integer branch the
// nodata value must first be in range for the sample type; the && operator
// short-circuits, so an out-of-range nodata value yields false without
// reading the buffer.
6364 12031 : if (nBitsPerSample == 8 && nSampleFormat == GSF_UNSIGNED_INT)
6365 : {
6366 22424 : return GDALIsValueInRange<uint8_t>(dfNoDataValue) &&
6367 11212 : HasOnlyNoDataT(static_cast<const uint8_t *>(pBuffer),
6368 11212 : static_cast<uint8_t>(dfNoDataValue), nWidth,
6369 11212 : nHeight, nLineStride, nComponents);
6370 : }
6371 819 : if (nBitsPerSample == 8 && nSampleFormat == GSF_SIGNED_INT)
6372 : {
6373 : // Use unsigned implementation by converting the nodatavalue to
6374 : // unsigned
6375 119 : return GDALIsValueInRange<int8_t>(dfNoDataValue) &&
6376 59 : HasOnlyNoDataT(
6377 : static_cast<const uint8_t *>(pBuffer),
6378 59 : static_cast<uint8_t>(static_cast<int8_t>(dfNoDataValue)),
6379 60 : nWidth, nHeight, nLineStride, nComponents);
6380 : }
6381 759 : if (nBitsPerSample == 16 && nSampleFormat == GSF_UNSIGNED_INT)
6382 : {
6383 23 : return GDALIsValueInRange<uint16_t>(dfNoDataValue) &&
6384 11 : HasOnlyNoDataT(static_cast<const uint16_t *>(pBuffer),
6385 11 : static_cast<uint16_t>(dfNoDataValue), nWidth,
6386 12 : nHeight, nLineStride, nComponents);
6387 : }
6388 747 : if (nBitsPerSample == 16 && nSampleFormat == GSF_SIGNED_INT)
6389 : {
6390 : // Use unsigned implementation by converting the nodatavalue to
6391 : // unsigned
6392 111 : return GDALIsValueInRange<int16_t>(dfNoDataValue) &&
6393 55 : HasOnlyNoDataT(
6394 : static_cast<const uint16_t *>(pBuffer),
6395 55 : static_cast<uint16_t>(static_cast<int16_t>(dfNoDataValue)),
6396 56 : nWidth, nHeight, nLineStride, nComponents);
6397 : }
6398 691 : if (nBitsPerSample == 32 && nSampleFormat == GSF_UNSIGNED_INT)
6399 : {
6400 129 : return GDALIsValueInRange<uint32_t>(dfNoDataValue) &&
6401 64 : HasOnlyNoDataT(static_cast<const uint32_t *>(pBuffer),
6402 : static_cast<uint32_t>(dfNoDataValue), nWidth,
6403 65 : nHeight, nLineStride, nComponents);
6404 : }
6405 626 : if (nBitsPerSample == 32 && nSampleFormat == GSF_SIGNED_INT)
6406 : {
6407 : // Use unsigned implementation by converting the nodatavalue to
6408 : // unsigned
6409 23 : return GDALIsValueInRange<int32_t>(dfNoDataValue) &&
6410 11 : HasOnlyNoDataT(
6411 : static_cast<const uint32_t *>(pBuffer),
6412 11 : static_cast<uint32_t>(static_cast<int32_t>(dfNoDataValue)),
6413 12 : nWidth, nHeight, nLineStride, nComponents);
6414 : }
6415 614 : if (nBitsPerSample == 64 && nSampleFormat == GSF_UNSIGNED_INT)
6416 : {
6417 112 : return GDALIsValueInRange<uint64_t>(dfNoDataValue) &&
6418 56 : HasOnlyNoDataT(static_cast<const uint64_t *>(pBuffer),
6419 : static_cast<uint64_t>(dfNoDataValue), nWidth,
6420 56 : nHeight, nLineStride, nComponents);
6421 : }
6422 558 : if (nBitsPerSample == 64 && nSampleFormat == GSF_SIGNED_INT)
6423 : {
6424 : // Use unsigned implementation by converting the nodatavalue to
6425 : // unsigned
6426 0 : return GDALIsValueInRange<int64_t>(dfNoDataValue) &&
6427 0 : HasOnlyNoDataT(
6428 : static_cast<const uint64_t *>(pBuffer),
6429 0 : static_cast<uint64_t>(static_cast<int64_t>(dfNoDataValue)),
6430 0 : nWidth, nHeight, nLineStride, nComponents);
6431 : }
// Floating point branches: a NaN nodata value bypasses the range check for
// 16 and 32 bit samples; 64 bit samples need no range check because
// dfNoDataValue is already a double.
6432 558 : if (nBitsPerSample == 16 && nSampleFormat == GSF_FLOATING_POINT)
6433 : {
6434 106 : return (std::isnan(dfNoDataValue) ||
6435 211 : GDALIsValueInRange<GFloat16>(dfNoDataValue)) &&
6436 105 : HasOnlyNoDataT(static_cast<const GFloat16 *>(pBuffer),
6437 : static_cast<GFloat16>(dfNoDataValue), nWidth,
6438 106 : nHeight, nLineStride, nComponents);
6439 : }
6440 452 : if (nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
6441 : {
6442 268 : return (std::isnan(dfNoDataValue) ||
6443 535 : GDALIsValueInRange<float>(dfNoDataValue)) &&
6444 267 : HasOnlyNoDataT(static_cast<const float *>(pBuffer),
6445 : static_cast<float>(dfNoDataValue), nWidth,
6446 268 : nHeight, nLineStride, nComponents);
6447 : }
6448 184 : if (nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
6449 : {
6450 184 : return HasOnlyNoDataT(static_cast<const double *>(pBuffer),
6451 : dfNoDataValue, nWidth, nHeight, nLineStride,
6452 184 : nComponents);
6453 : }
// Unsupported bit depth / sample format combination.
6454 0 : return false;
6455 : }
6456 :
6457 : #ifdef HAVE_SSE2
6458 :
6459 : /************************************************************************/
6460 : /* GDALDeinterleave3Byte() */
6461 : /************************************************************************/
6462 :
// HAVE_SSE2 build of GDALDeinterleave3Byte().
// Splits nIters consecutive 3-byte groups read from pabySrc into three
// separate byte arrays: byte k of group i goes to pabyDestk[i].
// With GCC (not clang), auto-vectorization is disabled for this function
// through the optimize("no-tree-vectorize") attribute.
6463 : #if defined(__GNUC__) && !defined(__clang__)
6464 : __attribute__((optimize("no-tree-vectorize")))
6465 : #endif
6466 383026 : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
6467 : GByte *CPL_RESTRICT pabyDest0,
6468 : GByte *CPL_RESTRICT pabyDest1,
6469 : GByte *CPL_RESTRICT pabyDest2, size_t nIters)
// NEON builds forward unconditionally to the _SSSE3 implementation.
6470 : #ifdef USE_NEON_OPTIMIZATIONS
6471 : {
6472 : return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6473 : nIters);
6474 : }
6475 : #else
6476 : {
// Use the SSSE3 implementation when it was compiled in and the CPU supports
// it at runtime.
6477 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
6478 383026 : if (CPLHaveRuntimeSSSE3())
6479 : {
6480 383024 : return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
6481 383024 : pabyDest2, nIters);
6482 : }
6483 : #endif
6484 :
6485 2 : size_t i = 0;
// Word-based path, taken only when all four pointers are aligned on
// sizeof(unsigned int). i counts groups and j counts output words.
6486 2 : if (((reinterpret_cast<uintptr_t>(pabySrc) |
6487 2 : reinterpret_cast<uintptr_t>(pabyDest0) |
6488 2 : reinterpret_cast<uintptr_t>(pabyDest1) |
6489 2 : reinterpret_cast<uintptr_t>(pabyDest2)) %
6490 : sizeof(unsigned int)) == 0)
6491 : {
6492 : // Slightly better than GCC autovectorizer
// Each iteration reads 4 groups (12 bytes) as three 32 bit words. With
// little-endian byte order, and naming the bytes of group k as ak, bk, ck:
//   word0 = a0 b0 c0 a1, word1 = b1 c1 a2 b2, word2 = c2 a3 b3 c3
// (lowest byte first). The shifts and masks below gather a0..a3 and b0..b3
// into one word each; c0..c3 are stored byte by byte.
6493 17 : for (size_t j = 0; i + 3 < nIters; i += 4, ++j)
6494 : {
6495 15 : unsigned int word0 =
6496 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i);
6497 15 : unsigned int word1 =
6498 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 4);
6499 15 : unsigned int word2 =
6500 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 8);
6501 15 : reinterpret_cast<unsigned int *>(pabyDest0)[j] =
6502 15 : (word0 & 0xff) | ((word0 >> 24) << 8) | (word1 & 0x00ff0000) |
6503 15 : ((word2 >> 8) << 24);
6504 15 : reinterpret_cast<unsigned int *>(pabyDest1)[j] =
6505 15 : ((word0 >> 8) & 0xff) | ((word1 & 0xff) << 8) |
6506 15 : (((word1 >> 24)) << 16) | ((word2 >> 16) << 24);
6507 15 : pabyDest2[j * 4] = static_cast<GByte>(word0 >> 16);
6508 15 : pabyDest2[j * 4 + 1] = static_cast<GByte>(word1 >> 8);
6509 15 : pabyDest2[j * 4 + 2] = static_cast<GByte>(word2);
6510 15 : pabyDest2[j * 4 + 3] = static_cast<GByte>(word2 >> 24);
6511 : }
6512 : }
// Scalar loop for the groups not handled above: all of them when the pointers
// are not aligned, otherwise the last nIters % 4 ones.
6513 : #if defined(__clang__)
6514 : #pragma clang loop vectorize(disable)
6515 : #endif
6516 3 : for (; i < nIters; ++i)
6517 : {
6518 1 : pabyDest0[i] = pabySrc[3 * i + 0];
6519 1 : pabyDest1[i] = pabySrc[3 * i + 1];
6520 1 : pabyDest2[i] = pabySrc[3 * i + 2];
6521 : }
6522 : }
6523 : #endif
6524 :
6525 : /************************************************************************/
6526 : /* GDALDeinterleave4Byte() */
6527 : /************************************************************************/
6528 :
6529 : #if !defined(__GNUC__) || defined(__clang__)
6530 :
6531 : /************************************************************************/
6532 : /* deinterleave() */
6533 : /************************************************************************/
6534 :
// Extracts one byte out of each of the 16 32-bit lanes held in the four input
// registers and returns those 16 bytes packed in a single register, in lane
// order (xmm0_ori first, xmm3_ori last).
//
// SHIFT: when true, every lane of the four inputs is first shifted right by
//        8 bits. The inputs are taken by reference and modified in place, so
//        the shift accumulates across successive calls.
// MASK:  when true, only the low 8 bits of each lane are kept before packing.
//        When false, the lanes are packed as they are, so the caller must
//        ensure their upper 24 bits are already zero.
6535 : template <bool SHIFT, bool MASK>
6536 : inline __m128i deinterleave(__m128i &xmm0_ori, __m128i &xmm1_ori,
6537 : __m128i &xmm2_ori, __m128i &xmm3_ori)
6538 : {
6539 : // Bring the next byte of each int32 packed word into the low 8 bits
6540 : if (SHIFT)
6541 : {
6542 : xmm0_ori = _mm_srli_epi32(xmm0_ori, 8);
6543 : xmm1_ori = _mm_srli_epi32(xmm1_ori, 8);
6544 : xmm2_ori = _mm_srli_epi32(xmm2_ori, 8);
6545 : xmm3_ori = _mm_srli_epi32(xmm3_ori, 8);
6546 : }
6547 : __m128i xmm0;
6548 : __m128i xmm1;
6549 : __m128i xmm2;
6550 : __m128i xmm3;
6551 : if (MASK)
6552 : {
// Set higher 24bit of each int32 packed word to 0 (on local copies: the
// inputs keep their remaining bytes for later calls).
6553 : const __m128i xmm_mask = _mm_set1_epi32(0xff);
6554 : xmm0 = _mm_and_si128(xmm0_ori, xmm_mask);
6555 : xmm1 = _mm_and_si128(xmm1_ori, xmm_mask);
6556 : xmm2 = _mm_and_si128(xmm2_ori, xmm_mask);
6557 : xmm3 = _mm_and_si128(xmm3_ori, xmm_mask);
6558 : }
6559 : else
6560 : {
6561 : xmm0 = xmm0_ori;
6562 : xmm1 = xmm1_ori;
6563 : xmm2 = xmm2_ori;
6564 : xmm3 = xmm3_ori;
6565 : }
// Both packs saturate, but lanes limited to 0..255 pass through unchanged.
6566 : // Pack int32 to int16
6567 : xmm0 = _mm_packs_epi32(xmm0, xmm1);
6568 : xmm2 = _mm_packs_epi32(xmm2, xmm3);
6569 : // Pack int16 to uint8
6570 : xmm0 = _mm_packus_epi16(xmm0, xmm2);
6571 : return xmm0;
6572 : }
6573 :
// HAVE_SSE2 build of GDALDeinterleave4Byte() for compilers other than GCC
// (clang included).
// Splits nIters consecutive 4-byte groups read from pabySrc into four
// separate byte arrays: byte k of group i goes to pabyDestk[i].
6574 : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
6575 : GByte *CPL_RESTRICT pabyDest0,
6576 : GByte *CPL_RESTRICT pabyDest1,
6577 : GByte *CPL_RESTRICT pabyDest2,
6578 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
// NEON builds forward unconditionally to the _SSSE3 implementation.
6579 : #ifdef USE_NEON_OPTIMIZATIONS
6580 : {
6581 : return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6582 : pabyDest3, nIters);
6583 : }
6584 : #else
6585 : {
// Use the SSSE3 implementation when it was compiled in and the CPU supports
// it at runtime.
6586 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
6587 : if (CPLHaveRuntimeSSSE3())
6588 : {
6589 : return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
6590 : pabyDest2, pabyDest3, nIters);
6591 : }
6592 : #endif
6593 :
6594 : // Not the optimal SSE2-only code, as gcc auto-vectorizer manages to
6595 : // do something slightly better.
6596 : size_t i = 0;
// 16 groups (64 source bytes) per iteration, using unaligned loads and stores.
6597 : for (; i + 15 < nIters; i += 16)
6598 : {
6599 : __m128i xmm0_ori = _mm_loadu_si128(
6600 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
6601 : __m128i xmm1_ori = _mm_loadu_si128(
6602 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
6603 : __m128i xmm2_ori = _mm_loadu_si128(
6604 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
6605 : __m128i xmm3_ori = _mm_loadu_si128(
6606 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
6607 :
// The order of the four calls matters: deinterleave() shifts the xmm*_ori
// registers in place, so each call with SHIFT=true exposes the next byte of
// every group. The last call needs no mask because, after three 8-bit shifts,
// the upper 24 bits of each lane are already zero.
6608 : _mm_storeu_si128(
6609 : reinterpret_cast<__m128i *>(pabyDest0 + i),
6610 : deinterleave<false, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
6611 : _mm_storeu_si128(
6612 : reinterpret_cast<__m128i *>(pabyDest1 + i),
6613 : deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
6614 : _mm_storeu_si128(
6615 : reinterpret_cast<__m128i *>(pabyDest2 + i),
6616 : deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
6617 : _mm_storeu_si128(
6618 : reinterpret_cast<__m128i *>(pabyDest3 + i),
6619 : deinterleave<true, false>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
6620 : }
6621 :
// Scalar loop for the remaining nIters % 16 groups.
6622 : #if defined(__clang__)
6623 : #pragma clang loop vectorize(disable)
6624 : #endif
6625 : for (; i < nIters; ++i)
6626 : {
6627 : pabyDest0[i] = pabySrc[4 * i + 0];
6628 : pabyDest1[i] = pabySrc[4 * i + 1];
6629 : pabyDest2[i] = pabySrc[4 * i + 2];
6630 : pabyDest3[i] = pabySrc[4 * i + 3];
6631 : }
6632 : }
6633 : #endif
6634 : #else
// HAVE_SSE2 build of GDALDeinterleave4Byte() for GCC.
// Splits nIters consecutive 4-byte groups read from pabySrc into four
// separate byte arrays: byte k of group i goes to pabyDestk[i].
// The plain loop is left to the compiler: the optimize("tree-vectorize")
// attribute requests auto-vectorization for this function.
6635 : // GCC autovectorizer does an excellent job
6636 97793 : __attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
6637 : const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
6638 : GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2,
6639 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
6640 : {
6641 545636000 : for (size_t i = 0; i < nIters; ++i)
6642 : {
6643 545538000 : pabyDest0[i] = pabySrc[4 * i + 0];
6644 545538000 : pabyDest1[i] = pabySrc[4 * i + 1];
6645 545538000 : pabyDest2[i] = pabySrc[4 * i + 2];
6646 545538000 : pabyDest3[i] = pabySrc[4 * i + 3];
6647 : }
6648 97793 : }
6649 : #endif
6650 :
6651 : #else
6652 :
6653 : /************************************************************************/
6654 : /* GDALDeinterleave3Byte() */
6655 : /************************************************************************/
6656 :
6657 : // TODO: Enabling below could help on non-Intel architectures where GCC knows
6658 : // how to auto-vectorize
6659 : // #if defined(__GNUC__)
6660 : //__attribute__((optimize("tree-vectorize")))
6661 : // #endif
6662 : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
6663 : GByte *CPL_RESTRICT pabyDest0,
6664 : GByte *CPL_RESTRICT pabyDest1,
6665 : GByte *CPL_RESTRICT pabyDest2, size_t nIters)
6666 : {
6667 : for (size_t i = 0; i < nIters; ++i)
6668 : {
6669 : pabyDest0[i] = pabySrc[3 * i + 0];
6670 : pabyDest1[i] = pabySrc[3 * i + 1];
6671 : pabyDest2[i] = pabySrc[3 * i + 2];
6672 : }
6673 : }
6674 :
6675 : /************************************************************************/
6676 : /* GDALDeinterleave4Byte() */
6677 : /************************************************************************/
6678 :
6679 : // TODO: Enabling below could help on non-Intel architectures where gcc knows
6680 : // how to auto-vectorize
6681 : // #if defined(__GNUC__)
6682 : //__attribute__((optimize("tree-vectorize")))
6683 : // #endif
6684 : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
6685 : GByte *CPL_RESTRICT pabyDest0,
6686 : GByte *CPL_RESTRICT pabyDest1,
6687 : GByte *CPL_RESTRICT pabyDest2,
6688 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
6689 : {
6690 : for (size_t i = 0; i < nIters; ++i)
6691 : {
6692 : pabyDest0[i] = pabySrc[4 * i + 0];
6693 : pabyDest1[i] = pabySrc[4 * i + 1];
6694 : pabyDest2[i] = pabySrc[4 * i + 2];
6695 : pabyDest3[i] = pabySrc[4 * i + 3];
6696 : }
6697 : }
6698 :
6699 : #endif
6700 :
6701 : /************************************************************************/
6702 : /* GDALDeinterleave() */
6703 : /************************************************************************/
6704 :
6705 : /*! Copy values from a pixel-interleave buffer to multiple per-component
6706 : buffers.
6707 :
6708 : In pseudo-code
6709 : \verbatim
6710 : for(size_t i = 0; i < nIters; ++i)
6711 : for(int iComp = 0; iComp < nComponents; iComp++ )
6712 : ppDestBuffer[iComp][i] = pSourceBuffer[nComponents * i + iComp]
6713 : \endverbatim
6714 :
6715 : The implementation is optimized for a few cases, like de-interleaving
6716 : of 3 or 4-components Byte buffers.
6717 :
6718 : \since GDAL 3.6
6719 : */
6720 481169 : void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
6721 : int nComponents, void **ppDestBuffer,
6722 : GDALDataType eDestDT, size_t nIters)
6723 : {
6724 481169 : if (eSourceDT == eDestDT)
6725 : {
6726 481147 : if (eSourceDT == GDT_UInt8 || eSourceDT == GDT_Int8)
6727 : {
6728 480826 : if (nComponents == 3)
6729 : {
6730 383026 : const GByte *CPL_RESTRICT pabySrc =
6731 : static_cast<const GByte *>(pSourceBuffer);
6732 383026 : GByte *CPL_RESTRICT pabyDest0 =
6733 : static_cast<GByte *>(ppDestBuffer[0]);
6734 383026 : GByte *CPL_RESTRICT pabyDest1 =
6735 : static_cast<GByte *>(ppDestBuffer[1]);
6736 383026 : GByte *CPL_RESTRICT pabyDest2 =
6737 : static_cast<GByte *>(ppDestBuffer[2]);
6738 383026 : GDALDeinterleave3Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6739 : nIters);
6740 383026 : return;
6741 : }
6742 97800 : else if (nComponents == 4)
6743 : {
6744 97793 : const GByte *CPL_RESTRICT pabySrc =
6745 : static_cast<const GByte *>(pSourceBuffer);
6746 97793 : GByte *CPL_RESTRICT pabyDest0 =
6747 : static_cast<GByte *>(ppDestBuffer[0]);
6748 97793 : GByte *CPL_RESTRICT pabyDest1 =
6749 : static_cast<GByte *>(ppDestBuffer[1]);
6750 97793 : GByte *CPL_RESTRICT pabyDest2 =
6751 : static_cast<GByte *>(ppDestBuffer[2]);
6752 97793 : GByte *CPL_RESTRICT pabyDest3 =
6753 : static_cast<GByte *>(ppDestBuffer[3]);
6754 97793 : GDALDeinterleave4Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6755 : pabyDest3, nIters);
6756 97793 : return;
6757 7 : }
6758 : }
6759 : #if ((defined(__GNUC__) && !defined(__clang__)) || \
6760 : defined(__INTEL_CLANG_COMPILER)) && \
6761 : defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME)
6762 642 : else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&
6763 321 : CPLHaveRuntimeSSSE3())
6764 : {
6765 321 : if (nComponents == 3)
6766 : {
6767 126 : const GUInt16 *CPL_RESTRICT panSrc =
6768 : static_cast<const GUInt16 *>(pSourceBuffer);
6769 126 : GUInt16 *CPL_RESTRICT panDest0 =
6770 : static_cast<GUInt16 *>(ppDestBuffer[0]);
6771 126 : GUInt16 *CPL_RESTRICT panDest1 =
6772 : static_cast<GUInt16 *>(ppDestBuffer[1]);
6773 126 : GUInt16 *CPL_RESTRICT panDest2 =
6774 : static_cast<GUInt16 *>(ppDestBuffer[2]);
6775 126 : GDALDeinterleave3UInt16_SSSE3(panSrc, panDest0, panDest1,
6776 : panDest2, nIters);
6777 126 : return;
6778 : }
6779 : #if !defined(__INTEL_CLANG_COMPILER)
6780 : // ICC autovectorizer doesn't do a good job, at least with icx
6781 : // 2022.1.0.20220316
6782 195 : else if (nComponents == 4)
6783 : {
6784 195 : const GUInt16 *CPL_RESTRICT panSrc =
6785 : static_cast<const GUInt16 *>(pSourceBuffer);
6786 195 : GUInt16 *CPL_RESTRICT panDest0 =
6787 : static_cast<GUInt16 *>(ppDestBuffer[0]);
6788 195 : GUInt16 *CPL_RESTRICT panDest1 =
6789 : static_cast<GUInt16 *>(ppDestBuffer[1]);
6790 195 : GUInt16 *CPL_RESTRICT panDest2 =
6791 : static_cast<GUInt16 *>(ppDestBuffer[2]);
6792 195 : GUInt16 *CPL_RESTRICT panDest3 =
6793 : static_cast<GUInt16 *>(ppDestBuffer[3]);
6794 195 : GDALDeinterleave4UInt16_SSSE3(panSrc, panDest0, panDest1,
6795 : panDest2, panDest3, nIters);
6796 195 : return;
6797 : }
6798 : #endif
6799 : }
6800 : #endif
6801 : }
6802 :
6803 29 : const int nSourceDTSize = GDALGetDataTypeSizeBytes(eSourceDT);
6804 29 : const int nDestDTSize = GDALGetDataTypeSizeBytes(eDestDT);
6805 108 : for (int iComp = 0; iComp < nComponents; iComp++)
6806 : {
6807 79 : GDALCopyWords64(static_cast<const GByte *>(pSourceBuffer) +
6808 79 : iComp * nSourceDTSize,
6809 : eSourceDT, nComponents * nSourceDTSize,
6810 79 : ppDestBuffer[iComp], eDestDT, nDestDTSize, nIters);
6811 : }
6812 : }
6813 :
6814 : /************************************************************************/
6815 : /* GDALTranspose2DSingleToSingle() */
6816 : /************************************************************************/
6817 : /**
6818 : * Transpose a 2D array of non-complex values, in a efficient (cache-oblivious) way.
6819 : *
6820 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6821 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6822 : * @param nSrcWidth Width of pSrc array.
6823 : * @param nSrcHeight Height of pSrc array.
6824 : */
6825 :
6826 : template <class DST, class SRC>
6827 160 : void GDALTranspose2DSingleToSingle(const SRC *CPL_RESTRICT pSrc,
6828 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6829 : size_t nSrcHeight)
6830 : {
6831 160 : constexpr size_t blocksize = 32;
6832 345 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6833 : {
6834 185 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6835 5016 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6836 : {
6837 : // transpose the block beginning at [i,j]
6838 4831 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6839 26185 : for (size_t k = i; k < max_k; ++k)
6840 : {
6841 669282 : for (size_t l = j; l < max_l; ++l)
6842 : {
6843 647928 : GDALCopyWord(pSrc[l + k * nSrcWidth],
6844 647928 : pDst[k + l * nSrcHeight]);
6845 : }
6846 : }
6847 : }
6848 : }
6849 160 : }
6850 :
6851 : /************************************************************************/
6852 : /* GDALTranspose2DComplexToComplex() */
6853 : /************************************************************************/
6854 : /**
6855 : * Transpose a 2D array of complex values into an array of complex values,
6856 : * in a efficient (cache-oblivious) way.
6857 : *
6858 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6859 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6860 : * @param nSrcWidth Width of pSrc array.
6861 : * @param nSrcHeight Height of pSrc array.
6862 : */
6863 : template <class DST, class SRC>
6864 25 : void GDALTranspose2DComplexToComplex(const SRC *CPL_RESTRICT pSrc,
6865 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6866 : size_t nSrcHeight)
6867 : {
6868 25 : constexpr size_t blocksize = 32;
6869 50 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6870 : {
6871 25 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6872 50 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6873 : {
6874 : // transpose the block beginning at [i,j]
6875 25 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6876 75 : for (size_t k = i; k < max_k; ++k)
6877 : {
6878 200 : for (size_t l = j; l < max_l; ++l)
6879 : {
6880 150 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
6881 150 : pDst[2 * (k + l * nSrcHeight) + 0]);
6882 150 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 1],
6883 150 : pDst[2 * (k + l * nSrcHeight) + 1]);
6884 : }
6885 : }
6886 : }
6887 : }
6888 25 : }
6889 :
6890 : /************************************************************************/
6891 : /* GDALTranspose2DComplexToSingle() */
6892 : /************************************************************************/
6893 : /**
6894 : * Transpose a 2D array of complex values into an array of non-complex values,
6895 : * in a efficient (cache-oblivious) way.
6896 : *
6897 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6898 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6899 : * @param nSrcWidth Width of pSrc array.
6900 : * @param nSrcHeight Height of pSrc array.
6901 : */
6902 : template <class DST, class SRC>
6903 55 : void GDALTranspose2DComplexToSingle(const SRC *CPL_RESTRICT pSrc,
6904 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6905 : size_t nSrcHeight)
6906 : {
6907 55 : constexpr size_t blocksize = 32;
6908 110 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6909 : {
6910 55 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6911 110 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6912 : {
6913 : // transpose the block beginning at [i,j]
6914 55 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6915 165 : for (size_t k = i; k < max_k; ++k)
6916 : {
6917 440 : for (size_t l = j; l < max_l; ++l)
6918 : {
6919 330 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
6920 330 : pDst[k + l * nSrcHeight]);
6921 : }
6922 : }
6923 : }
6924 : }
6925 55 : }
6926 :
6927 : /************************************************************************/
6928 : /* GDALTranspose2DSingleToComplex() */
6929 : /************************************************************************/
6930 : /**
6931 : * Transpose a 2D array of non-complex values into an array of complex values,
6932 : * in a efficient (cache-oblivious) way.
6933 : *
6934 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6935 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6936 : * @param nSrcWidth Width of pSrc array.
6937 : * @param nSrcHeight Height of pSrc array.
6938 : */
6939 : template <class DST, class SRC>
6940 55 : void GDALTranspose2DSingleToComplex(const SRC *CPL_RESTRICT pSrc,
6941 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6942 : size_t nSrcHeight)
6943 : {
6944 55 : constexpr size_t blocksize = 32;
6945 110 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6946 : {
6947 55 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6948 110 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6949 : {
6950 : // transpose the block beginning at [i,j]
6951 55 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6952 165 : for (size_t k = i; k < max_k; ++k)
6953 : {
6954 440 : for (size_t l = j; l < max_l; ++l)
6955 : {
6956 330 : GDALCopyWord(pSrc[l + k * nSrcWidth],
6957 330 : pDst[2 * (k + l * nSrcHeight) + 0]);
6958 330 : pDst[2 * (k + l * nSrcHeight) + 1] = 0;
6959 : }
6960 : }
6961 : }
6962 : }
6963 55 : }
6964 :
6965 : /************************************************************************/
6966 : /* GDALTranspose2D() */
6967 : /************************************************************************/
6968 :
6969 : template <class DST, bool DST_IS_COMPLEX>
6970 295 : static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
6971 : size_t nSrcWidth, size_t nSrcHeight)
6972 : {
6973 : #define CALL_GDALTranspose2D_internal(SRC_TYPE) \
6974 : do \
6975 : { \
6976 : if constexpr (DST_IS_COMPLEX) \
6977 : { \
6978 : GDALTranspose2DSingleToComplex( \
6979 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6980 : nSrcHeight); \
6981 : } \
6982 : else \
6983 : { \
6984 : GDALTranspose2DSingleToSingle(static_cast<const SRC_TYPE *>(pSrc), \
6985 : pDst, nSrcWidth, nSrcHeight); \
6986 : } \
6987 : } while (0)
6988 :
6989 : #define CALL_GDALTranspose2DComplex_internal(SRC_TYPE) \
6990 : do \
6991 : { \
6992 : if constexpr (DST_IS_COMPLEX) \
6993 : { \
6994 : GDALTranspose2DComplexToComplex( \
6995 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6996 : nSrcHeight); \
6997 : } \
6998 : else \
6999 : { \
7000 : GDALTranspose2DComplexToSingle( \
7001 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
7002 : nSrcHeight); \
7003 : } \
7004 : } while (0)
7005 :
7006 : // clang-format off
7007 295 : switch (eSrcType)
7008 : {
7009 16 : case GDT_UInt8: CALL_GDALTranspose2D_internal(uint8_t); break;
7010 15 : case GDT_Int8: CALL_GDALTranspose2D_internal(int8_t); break;
7011 33 : case GDT_UInt16: CALL_GDALTranspose2D_internal(uint16_t); break;
7012 20 : case GDT_Int16: CALL_GDALTranspose2D_internal(int16_t); break;
7013 24 : case GDT_UInt32: CALL_GDALTranspose2D_internal(uint32_t); break;
7014 16 : case GDT_Int32: CALL_GDALTranspose2D_internal(int32_t); break;
7015 16 : case GDT_UInt64: CALL_GDALTranspose2D_internal(uint64_t); break;
7016 16 : case GDT_Int64: CALL_GDALTranspose2D_internal(int64_t); break;
7017 16 : case GDT_Float16: CALL_GDALTranspose2D_internal(GFloat16); break;
7018 19 : case GDT_Float32: CALL_GDALTranspose2D_internal(float); break;
7019 24 : case GDT_Float64: CALL_GDALTranspose2D_internal(double); break;
7020 16 : case GDT_CInt16: CALL_GDALTranspose2DComplex_internal(int16_t); break;
7021 16 : case GDT_CInt32: CALL_GDALTranspose2DComplex_internal(int32_t); break;
7022 16 : case GDT_CFloat16: CALL_GDALTranspose2DComplex_internal(GFloat16); break;
7023 16 : case GDT_CFloat32: CALL_GDALTranspose2DComplex_internal(float); break;
7024 16 : case GDT_CFloat64: CALL_GDALTranspose2DComplex_internal(double); break;
7025 0 : case GDT_Unknown:
7026 : case GDT_TypeCount:
7027 0 : break;
7028 : }
7029 : // clang-format on
7030 :
7031 : #undef CALL_GDALTranspose2D_internal
7032 : #undef CALL_GDALTranspose2DComplex_internal
7033 295 : }
7034 :
7035 : /************************************************************************/
7036 : /* GDALInterleave2Byte() */
7037 : /************************************************************************/
7038 :
7039 : #if defined(HAVE_SSE2) && \
7040 : (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
7041 :
7042 : // ICC autovectorizer doesn't do a good job at generating good SSE code,
7043 : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
7044 : #if defined(__GNUC__)
7045 : __attribute__((noinline))
7046 : #endif
7047 : static void GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
7048 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
7049 : {
7050 : size_t i = 0;
7051 : constexpr size_t VALS_PER_ITER = 16;
7052 : for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
7053 : {
7054 : __m128i xmm0 =
7055 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + i));
7056 : __m128i xmm1 = _mm_loadu_si128(
7057 : reinterpret_cast<__m128i const *>(pSrc + i + nIters));
7058 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + 2 * i),
7059 : _mm_unpacklo_epi8(xmm0, xmm1));
7060 : _mm_storeu_si128(
7061 : reinterpret_cast<__m128i *>(pDst + 2 * i + VALS_PER_ITER),
7062 : _mm_unpackhi_epi8(xmm0, xmm1));
7063 : }
7064 : #if defined(__clang__)
7065 : #pragma clang loop vectorize(disable)
7066 : #endif
7067 : for (; i < nIters; ++i)
7068 : {
7069 : pDst[2 * i + 0] = pSrc[i + 0 * nIters];
7070 : pDst[2 * i + 1] = pSrc[i + 1 * nIters];
7071 : }
7072 : }
7073 :
7074 : #else
7075 :
7076 : #if defined(__GNUC__) && !defined(__clang__)
7077 : __attribute__((optimize("tree-vectorize")))
7078 : #endif
7079 : #if defined(__GNUC__)
7080 : __attribute__((noinline))
7081 : #endif
7082 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7083 : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
7084 : #pragma clang diagnostic push
7085 : #pragma clang diagnostic ignored "-Wpass-failed"
7086 : #endif
7087 9 : static void GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
7088 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
7089 : {
7090 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7091 : #pragma clang loop vectorize(enable)
7092 : #endif
7093 355429 : for (size_t i = 0; i < nIters; ++i)
7094 : {
7095 355420 : pDst[2 * i + 0] = pSrc[i + 0 * nIters];
7096 355420 : pDst[2 * i + 1] = pSrc[i + 1 * nIters];
7097 : }
7098 9 : }
7099 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7100 : #pragma clang diagnostic pop
7101 : #endif
7102 :
7103 : #endif
7104 :
7105 : /************************************************************************/
7106 : /* GDALInterleave4Byte() */
7107 : /************************************************************************/
7108 :
7109 : #if defined(HAVE_SSE2) && \
7110 : (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
7111 :
7112 : // ICC autovectorizer doesn't do a good job at generating good SSE code,
7113 : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
7114 : #if defined(__GNUC__)
7115 : __attribute__((noinline))
7116 : #endif
7117 : static void GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
7118 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
7119 : {
7120 : size_t i = 0;
7121 : constexpr size_t VALS_PER_ITER = 16;
7122 : for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
7123 : {
7124 : __m128i xmm0 = _mm_loadu_si128(
7125 : reinterpret_cast<__m128i const *>(pSrc + i + 0 * nIters));
7126 : __m128i xmm1 = _mm_loadu_si128(
7127 : reinterpret_cast<__m128i const *>(pSrc + i + 1 * nIters));
7128 : __m128i xmm2 = _mm_loadu_si128(
7129 : reinterpret_cast<__m128i const *>(pSrc + i + 2 * nIters));
7130 : __m128i xmm3 = _mm_loadu_si128(
7131 : reinterpret_cast<__m128i const *>(pSrc + i + 3 * nIters));
7132 : auto tmp0 = _mm_unpacklo_epi8(
7133 : xmm0,
7134 : xmm1); // (xmm0_0, xmm1_0, xmm0_1, xmm1_1, xmm0_2, xmm1_2, ...)
7135 : auto tmp1 = _mm_unpackhi_epi8(
7136 : xmm0,
7137 : xmm1); // (xmm0_8, xmm1_8, xmm0_9, xmm1_9, xmm0_10, xmm1_10, ...)
7138 : auto tmp2 = _mm_unpacklo_epi8(
7139 : xmm2,
7140 : xmm3); // (xmm2_0, xmm3_0, xmm2_1, xmm3_1, xmm2_2, xmm3_2, ...)
7141 : auto tmp3 = _mm_unpackhi_epi8(
7142 : xmm2,
7143 : xmm3); // (xmm2_8, xmm3_8, xmm2_9, xmm3_9, xmm2_10, xmm3_10, ...)
7144 : auto tmp2_0 = _mm_unpacklo_epi16(
7145 : tmp0,
7146 : tmp2); // (xmm0_0, xmm1_0, xmm2_0, xmm3_0, xmm0_1, xmm1_1, xmm2_1, xmm3_1, ...)
7147 : auto tmp2_1 = _mm_unpackhi_epi16(tmp0, tmp2);
7148 : auto tmp2_2 = _mm_unpacklo_epi16(tmp1, tmp3);
7149 : auto tmp2_3 = _mm_unpackhi_epi16(tmp1, tmp3);
7150 : _mm_storeu_si128(
7151 : reinterpret_cast<__m128i *>(pDst + 4 * i + 0 * VALS_PER_ITER),
7152 : tmp2_0);
7153 : _mm_storeu_si128(
7154 : reinterpret_cast<__m128i *>(pDst + 4 * i + 1 * VALS_PER_ITER),
7155 : tmp2_1);
7156 : _mm_storeu_si128(
7157 : reinterpret_cast<__m128i *>(pDst + 4 * i + 2 * VALS_PER_ITER),
7158 : tmp2_2);
7159 : _mm_storeu_si128(
7160 : reinterpret_cast<__m128i *>(pDst + 4 * i + 3 * VALS_PER_ITER),
7161 : tmp2_3);
7162 : }
7163 : #if defined(__clang__)
7164 : #pragma clang loop vectorize(disable)
7165 : #endif
7166 : for (; i < nIters; ++i)
7167 : {
7168 : pDst[4 * i + 0] = pSrc[i + 0 * nIters];
7169 : pDst[4 * i + 1] = pSrc[i + 1 * nIters];
7170 : pDst[4 * i + 2] = pSrc[i + 2 * nIters];
7171 : pDst[4 * i + 3] = pSrc[i + 3 * nIters];
7172 : }
7173 : }
7174 :
7175 : #else
7176 :
7177 : #if defined(__GNUC__) && !defined(__clang__)
7178 : __attribute__((optimize("tree-vectorize")))
7179 : #endif
7180 : #if defined(__GNUC__)
7181 : __attribute__((noinline))
7182 : #endif
7183 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7184 : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
7185 : #pragma clang diagnostic push
7186 : #pragma clang diagnostic ignored "-Wpass-failed"
7187 : #endif
7188 30 : static void GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
7189 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
7190 : {
7191 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7192 : #pragma clang loop vectorize(enable)
7193 : #endif
7194 49620700 : for (size_t i = 0; i < nIters; ++i)
7195 : {
7196 49620600 : pDst[4 * i + 0] = pSrc[i + 0 * nIters];
7197 49620600 : pDst[4 * i + 1] = pSrc[i + 1 * nIters];
7198 49620600 : pDst[4 * i + 2] = pSrc[i + 2 * nIters];
7199 49620600 : pDst[4 * i + 3] = pSrc[i + 3 * nIters];
7200 : }
7201 30 : }
7202 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7203 : #pragma clang diagnostic pop
7204 : #endif
7205 :
7206 : #endif
7207 :
7208 : /************************************************************************/
7209 : /* GDALTranspose2D() */
7210 : /************************************************************************/
7211 :
7212 : /**
7213 : * Transpose a 2D array in a efficient (cache-oblivious) way.
7214 : *
7215 : * @param pSrc Source array of width = nSrcWidth and height = nSrcHeight.
7216 : * @param eSrcType Data type of pSrc.
7217 : * @param pDst Destination transposed array of width = nSrcHeight and height = nSrcWidth.
7218 : * @param eDstType Data type of pDst.
7219 : * @param nSrcWidth Width of pSrc array.
7220 : * @param nSrcHeight Height of pSrc array.
7221 : * @since GDAL 3.11
7222 : */
7223 :
7224 365 : void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
7225 : GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
7226 : {
7227 365 : if (eSrcType == eDstType && (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8))
7228 : {
7229 70 : if (nSrcHeight == 2)
7230 : {
7231 9 : GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
7232 : static_cast<uint8_t *>(pDst), nSrcWidth);
7233 9 : return;
7234 : }
7235 61 : if (nSrcHeight == 4)
7236 : {
7237 30 : GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
7238 : static_cast<uint8_t *>(pDst), nSrcWidth);
7239 30 : return;
7240 : }
7241 : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
7242 : (defined(__x86_64) || defined(_M_X64)))
7243 31 : if (CPLHaveRuntimeSSSE3())
7244 : {
7245 31 : GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
7246 : static_cast<uint8_t *>(pDst), nSrcWidth,
7247 : nSrcHeight);
7248 31 : return;
7249 : }
7250 : #elif defined(USE_NEON_OPTIMIZATIONS)
7251 : {
7252 : GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
7253 : static_cast<uint8_t *>(pDst), nSrcWidth,
7254 : nSrcHeight);
7255 : return;
7256 : }
7257 : #endif
7258 : }
7259 :
7260 : #define CALL_GDALTranspose2D_internal(DST_TYPE, DST_IS_COMPLEX) \
7261 : GDALTranspose2D<DST_TYPE, DST_IS_COMPLEX>( \
7262 : pSrc, eSrcType, static_cast<DST_TYPE *>(pDst), nSrcWidth, nSrcHeight)
7263 :
7264 : // clang-format off
7265 295 : switch (eDstType)
7266 : {
7267 15 : case GDT_UInt8: CALL_GDALTranspose2D_internal(uint8_t, false); break;
7268 15 : case GDT_Int8: CALL_GDALTranspose2D_internal(int8_t, false); break;
7269 33 : case GDT_UInt16: CALL_GDALTranspose2D_internal(uint16_t, false); break;
7270 20 : case GDT_Int16: CALL_GDALTranspose2D_internal(int16_t, false); break;
7271 24 : case GDT_UInt32: CALL_GDALTranspose2D_internal(uint32_t, false); break;
7272 16 : case GDT_Int32: CALL_GDALTranspose2D_internal(int32_t, false); break;
7273 16 : case GDT_UInt64: CALL_GDALTranspose2D_internal(uint64_t, false); break;
7274 16 : case GDT_Int64: CALL_GDALTranspose2D_internal(int64_t, false); break;
7275 16 : case GDT_Float16: CALL_GDALTranspose2D_internal(GFloat16, false); break;
7276 19 : case GDT_Float32: CALL_GDALTranspose2D_internal(float, false); break;
7277 25 : case GDT_Float64: CALL_GDALTranspose2D_internal(double, false); break;
7278 16 : case GDT_CInt16: CALL_GDALTranspose2D_internal(int16_t, true); break;
7279 16 : case GDT_CInt32: CALL_GDALTranspose2D_internal(int32_t, true); break;
7280 16 : case GDT_CFloat16: CALL_GDALTranspose2D_internal(GFloat16, true); break;
7281 16 : case GDT_CFloat32: CALL_GDALTranspose2D_internal(float, true); break;
7282 16 : case GDT_CFloat64: CALL_GDALTranspose2D_internal(double, true); break;
7283 0 : case GDT_Unknown:
7284 : case GDT_TypeCount:
7285 0 : break;
7286 : }
7287 : // clang-format on
7288 :
7289 : #undef CALL_GDALTranspose2D_internal
7290 : }
7291 :
7292 : /************************************************************************/
7293 : /* ExtractBitAndConvertTo255() */
7294 : /************************************************************************/
7295 :
7296 : #if defined(__GNUC__) || defined(_MSC_VER)
7297 : // Signedness of char implementation dependent, so be explicit.
7298 : // Assumes 2-complement integer types and sign extension of right shifting
7299 : // GCC guarantees such:
7300 : // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
7301 143686 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
7302 : {
7303 143686 : return static_cast<GByte>(static_cast<signed char>(byVal << (7 - nBit)) >>
7304 143686 : 7);
7305 : }
7306 : #else
7307 : // Portable way
7308 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
7309 : {
7310 : return (byVal & (1 << nBit)) ? 255 : 0;
7311 : }
7312 : #endif
7313 :
7314 : /************************************************************************/
7315 : /* ExpandEightPackedBitsToByteAt255() */
7316 : /************************************************************************/
7317 :
7318 17825 : static inline void ExpandEightPackedBitsToByteAt255(GByte byVal,
7319 : GByte abyOutput[8])
7320 : {
7321 17825 : abyOutput[0] = ExtractBitAndConvertTo255(byVal, 7);
7322 17825 : abyOutput[1] = ExtractBitAndConvertTo255(byVal, 6);
7323 17825 : abyOutput[2] = ExtractBitAndConvertTo255(byVal, 5);
7324 17825 : abyOutput[3] = ExtractBitAndConvertTo255(byVal, 4);
7325 17825 : abyOutput[4] = ExtractBitAndConvertTo255(byVal, 3);
7326 17825 : abyOutput[5] = ExtractBitAndConvertTo255(byVal, 2);
7327 17825 : abyOutput[6] = ExtractBitAndConvertTo255(byVal, 1);
7328 17825 : abyOutput[7] = ExtractBitAndConvertTo255(byVal, 0);
7329 17825 : }
7330 :
7331 : /************************************************************************/
7332 : /* GDALExpandPackedBitsToByteAt0Or255() */
7333 : /************************************************************************/
7334 :
7335 : /** Expand packed-bits (ordered from most-significant bit to least one)
7336 : into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
7337 : at 1 to a byte at 255.
7338 :
7339 : The function does (in a possibly more optimized way) the following:
7340 : \code{.cpp}
7341 : for (size_t i = 0; i < nInputBits; ++i )
7342 : {
7343 : pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 255 : 0;
7344 : }
7345 : \endcode
7346 :
7347 : @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
7348 : @param pabyOutput Output array of nInputBits bytes.
7349 : @param nInputBits Number of valid bits in pabyInput.
7350 :
7351 : @since 3.11
7352 : */
7353 :
7354 46905 : void GDALExpandPackedBitsToByteAt0Or255(const GByte *CPL_RESTRICT pabyInput,
7355 : GByte *CPL_RESTRICT pabyOutput,
7356 : size_t nInputBits)
7357 : {
7358 46905 : const size_t nInputWholeBytes = nInputBits / 8;
7359 46905 : size_t iByte = 0;
7360 :
7361 : #ifdef HAVE_SSE2
7362 : // Mask to isolate each bit
7363 46905 : const __m128i bit_mask = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4,
7364 : 8, 16, 32, 64, -128);
7365 46905 : const __m128i zero = _mm_setzero_si128();
7366 46905 : const __m128i all_ones = _mm_set1_epi8(-1);
7367 : #ifdef __SSSE3__
7368 : const __m128i dispatch_two_bytes =
7369 : _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
7370 : #endif
7371 46905 : constexpr size_t SSE_REG_SIZE = sizeof(bit_mask);
7372 138950 : for (; iByte + SSE_REG_SIZE <= nInputWholeBytes; iByte += SSE_REG_SIZE)
7373 : {
7374 92045 : __m128i reg_ori = _mm_loadu_si128(
7375 92045 : reinterpret_cast<const __m128i *>(pabyInput + iByte));
7376 :
7377 92045 : constexpr int NUM_PROCESSED_BYTES_PER_REG = 2;
7378 828405 : for (size_t k = 0; k < SSE_REG_SIZE / NUM_PROCESSED_BYTES_PER_REG; ++k)
7379 : {
7380 : // Given reg_ori = (A, B, ... 14 other bytes ...),
7381 : // expand to (A, A, A, A, A, A, A, A, B, B, B, B, B, B, B, B)
7382 : #ifdef __SSSE3__
7383 : __m128i reg = _mm_shuffle_epi8(reg_ori, dispatch_two_bytes);
7384 : #else
7385 736360 : __m128i reg = _mm_unpacklo_epi8(reg_ori, reg_ori);
7386 736360 : reg = _mm_unpacklo_epi16(reg, reg);
7387 736360 : reg = _mm_unpacklo_epi32(reg, reg);
7388 : #endif
7389 :
7390 : // Test if bits of interest are set
7391 736360 : reg = _mm_and_si128(reg, bit_mask);
7392 :
7393 : // Now test if those bits are set, by comparing to zero. So the
7394 : // result will be that bytes where bits are set will be at 0, and
7395 : // ones where they are cleared will be at 0xFF. So the inverse of
7396 : // the end result we want!
7397 736360 : reg = _mm_cmpeq_epi8(reg, zero);
7398 :
7399 : // Invert the result
7400 736360 : reg = _mm_andnot_si128(reg, all_ones);
7401 :
7402 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyOutput), reg);
7403 :
7404 736360 : pabyOutput += SSE_REG_SIZE;
7405 :
7406 : // Right-shift of 2 bytes
7407 736360 : reg_ori = _mm_bsrli_si128(reg_ori, NUM_PROCESSED_BYTES_PER_REG);
7408 : }
7409 : }
7410 :
7411 : #endif // HAVE_SSE2
7412 :
7413 64730 : for (; iByte < nInputWholeBytes; ++iByte)
7414 : {
7415 17825 : ExpandEightPackedBitsToByteAt255(pabyInput[iByte], pabyOutput);
7416 17825 : pabyOutput += 8;
7417 : }
7418 47991 : for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
7419 : {
7420 1086 : *pabyOutput = ExtractBitAndConvertTo255(pabyInput[iByte], 7 - iBit);
7421 1086 : ++pabyOutput;
7422 : }
7423 46905 : }
7424 :
7425 : /************************************************************************/
7426 : /* ExpandEightPackedBitsToByteAt1() */
7427 : /************************************************************************/
7428 :
7429 136113 : static inline void ExpandEightPackedBitsToByteAt1(GByte byVal,
7430 : GByte abyOutput[8])
7431 : {
7432 136113 : abyOutput[0] = (byVal >> 7) & 0x1;
7433 136113 : abyOutput[1] = (byVal >> 6) & 0x1;
7434 136113 : abyOutput[2] = (byVal >> 5) & 0x1;
7435 136113 : abyOutput[3] = (byVal >> 4) & 0x1;
7436 136113 : abyOutput[4] = (byVal >> 3) & 0x1;
7437 136113 : abyOutput[5] = (byVal >> 2) & 0x1;
7438 136113 : abyOutput[6] = (byVal >> 1) & 0x1;
7439 136113 : abyOutput[7] = (byVal >> 0) & 0x1;
7440 136113 : }
7441 :
7442 : /************************************************************************/
7443 : /* GDALExpandPackedBitsToByteAt0Or1() */
7444 : /************************************************************************/
7445 :
7446 : /** Expand packed-bits (ordered from most-significant bit to least one)
7447 : into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
7448 : at 1 to a byte at 1.
7449 :
7450 : The function does (in a possibly more optimized way) the following:
7451 : \code{.cpp}
7452 : for (size_t i = 0; i < nInputBits; ++i )
7453 : {
7454 : pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
7455 : }
7456 : \endcode
7457 :
7458 : @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
7459 : @param pabyOutput Output array of nInputBits bytes.
7460 : @param nInputBits Number of valid bits in pabyInput.
7461 :
7462 : @since 3.11
7463 : */
7464 :
7465 7033 : void GDALExpandPackedBitsToByteAt0Or1(const GByte *CPL_RESTRICT pabyInput,
7466 : GByte *CPL_RESTRICT pabyOutput,
7467 : size_t nInputBits)
7468 : {
7469 7033 : const size_t nInputWholeBytes = nInputBits / 8;
7470 7033 : size_t iByte = 0;
7471 143146 : for (; iByte < nInputWholeBytes; ++iByte)
7472 : {
7473 136113 : ExpandEightPackedBitsToByteAt1(pabyInput[iByte], pabyOutput);
7474 136113 : pabyOutput += 8;
7475 : }
7476 18886 : for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
7477 : {
7478 11853 : *pabyOutput = (pabyInput[iByte] >> (7 - iBit)) & 0x1;
7479 11853 : ++pabyOutput;
7480 : }
7481 7033 : }
|