Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: GDAL Core
4 : * Purpose: Contains default implementation of GDALRasterBand::IRasterIO()
5 : * and supporting functions of broader utility.
6 : * Author: Frank Warmerdam, warmerdam@pobox.com
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 1998, Frank Warmerdam
10 : * Copyright (c) 2007-2014, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * SPDX-License-Identifier: MIT
13 : ****************************************************************************/
14 :
15 : #include "cpl_port.h"
16 : #include "gdal.h"
17 : #include "gdal_priv.h"
18 :
19 : #include <cassert>
20 : #include <climits>
21 : #include <cmath>
22 : #include <cstddef>
23 : #include <cstdio>
24 : #include <cstdlib>
25 : #include <cstring>
26 :
27 : #include <algorithm>
28 : #include <limits>
29 : #include <stdexcept>
30 : #include <type_traits>
31 :
32 : #include "cpl_conv.h"
33 : #include "cpl_cpu_features.h"
34 : #include "cpl_error.h"
35 : #include "cpl_float.h"
36 : #include "cpl_progress.h"
37 : #include "cpl_string.h"
38 : #include "cpl_vsi.h"
39 : #include "gdal_priv_templates.hpp"
40 : #include "gdal_vrt.h"
41 : #include "gdalwarper.h"
42 : #include "memdataset.h"
43 : #include "vrtdataset.h"
44 :
45 : #if defined(__x86_64) || defined(_M_X64)
46 : #include <emmintrin.h>
47 : #include <immintrin.h>
48 : #define HAVE_SSE2
49 : // AVX2 dispatch: compile AVX2 code with target attribute, detect at runtime
50 : #if (defined(__GNUC__) || defined(__clang__)) && \
51 : defined(HAVE_AVX2_AT_COMPILE_TIME)
52 : #define HAVE_AVX2_DISPATCH
53 : #elif defined(_MSC_VER)
54 : #include <intrin.h>
55 : #define HAVE_AVX2_DISPATCH
56 : #endif
57 : #elif defined(USE_NEON_OPTIMIZATIONS)
58 : #include "include_sse2neon.h"
59 : #define HAVE_SSE2
60 : #endif
61 :
62 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
63 : #include "rasterio_ssse3.h"
64 : #ifdef __SSSE3__
65 : #include <tmmintrin.h>
66 : #endif
67 : #endif
68 :
69 : #ifdef __SSE4_1__
70 : #include <smmintrin.h>
71 : #endif
72 :
73 : #ifdef __GNUC__
74 : #define CPL_NOINLINE __attribute__((noinline))
75 : #else
76 : #define CPL_NOINLINE
77 : #endif
78 : // Forward declaration so that DownsamplingIntegerXFactor() below can call GDALFastCopyByte(); the definition is not visible in this part of the file. NOTE(review): presumably copies nWordCount bytes from pSrcData to pDstData using the given source/destination pixel strides - confirm against the definition.
79 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
80 : int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
81 : int nDstPixelStride, GPtrDiff_t nWordCount);
82 :
83 : /************************************************************************/
84 : /* DownsamplingIntegerXFactor() */
85 : /************************************************************************/
86 : // Copies nBufXSize pixels from the blocks of poBand (block row nLBlockY) to pabyDstData, taking one source pixel every nSrcXInc columns starting at iSrcX; returns false if a block cannot be locked, true otherwise.
87 : template <bool bSameDataType, int DATA_TYPE_SIZE>
88 695850 : static bool DownsamplingIntegerXFactor(
89 : GDALRasterBand *poBand, int iSrcX, int nSrcXInc, GPtrDiff_t iSrcOffsetCst, // iSrcX: first source column; iSrcOffsetCst: constant pixel offset added to the in-block column
90 : GByte *CPL_RESTRICT pabyDstData, int nPixelSpace, int nBufXSize, // nPixelSpace: byte step between destination pixels
91 : GDALDataType eDataType, GDALDataType eBufType, int &nStartBlockX, // nStartBlockX (in/out): first column of the block held in poBlock, updated on every reload
92 : int nBlockXSize, GDALRasterBlock *&poBlock, int nLBlockY) // poBlock (in/out): currently locked block; the previous one is unlocked before another is locked, the last one stays locked on return
93 : {
94 695850 : const int nBandDataSize =
95 : bSameDataType ? DATA_TYPE_SIZE : GDALGetDataTypeSizeBytes(eDataType); // template constant when bSameDataType is true, otherwise queried from eDataType
96 695850 : int nOuterLoopIters = nBufXSize - 1; // pixels left for the main loop; the final pixel is handled after it
97 695850 : const int nIncSrcOffset = nSrcXInc * nBandDataSize; // source step in bytes
98 : const GByte *CPL_RESTRICT pabySrcData;
99 695850 : int nEndBlockX = nBlockXSize + nStartBlockX; // one past the last column of the current block
100 : // First pixel: jump straight into the loop body, reusing the caller's block when iSrcX still lies inside it.
101 695850 : if (iSrcX < nEndBlockX)
102 : {
103 295062 : CPLAssert(poBlock);
104 295062 : goto no_reload_block;
105 : }
106 400788 : goto reload_block;
107 :
108 : // Don't do the last iteration in the loop, as iSrcX might go beyond
109 : // nRasterXSize - 1
110 1265113 : while (--nOuterLoopIters >= 1)
111 : {
112 201834 : iSrcX += nSrcXInc;
113 201834 : pabySrcData += nIncSrcOffset;
114 201834 : pabyDstData += nPixelSpace;
115 :
116 : /* --------------------------------------------------------------------
117 : */
118 : /* Ensure we have the appropriate block loaded. */
119 : /* --------------------------------------------------------------------
120 : */
121 201834 : if (iSrcX >= nEndBlockX)
122 : {
123 201834 : reload_block:
124 : {
125 615212 : const int nLBlockX = iSrcX / nBlockXSize;
126 615212 : nStartBlockX = nLBlockX * nBlockXSize;
127 615212 : nEndBlockX = nStartBlockX + nBlockXSize;
128 :
129 615212 : if (poBlock != nullptr)
130 341376 : poBlock->DropLock();
131 :
132 615212 : poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
133 615212 : if (poBlock == nullptr)
134 : {
135 1 : return false;
136 : }
137 : }
138 : // Every path that positions on a block lands here: rebuild the source pointer from the block data, the column offset inside the block and iSrcOffsetCst.
139 615211 : no_reload_block:
140 : const GByte *pabySrcBlock =
141 1265113 : static_cast<const GByte *>(poBlock->GetDataRef());
142 1265113 : GPtrDiff_t iSrcOffset =
143 1265113 : (iSrcX - nStartBlockX + iSrcOffsetCst) * nBandDataSize;
144 1265113 : pabySrcData = pabySrcBlock + iSrcOffset;
145 : }
146 :
147 : /* --------------------------------------------------------------------
148 : */
149 : /* Copy the maximum run of pixels. */
150 : /* --------------------------------------------------------------------
151 : */
152 : // nIters: sampled pixels remaining inside the current block (ceiling division by nSrcXInc), capped at nOuterLoopIters.
153 1265113 : const int nIters = std::min(
154 1265113 : (nEndBlockX - iSrcX + (nSrcXInc - 1)) / nSrcXInc, nOuterLoopIters);
155 : if (bSameDataType)
156 : {
157 1264670 : memcpy(pabyDstData, pabySrcData, nBandDataSize);
158 1264670 : if (nIters > 1)
159 : {
160 : if (DATA_TYPE_SIZE == 1)
161 : {
162 326320 : pabySrcData += nIncSrcOffset;
163 326320 : pabyDstData += nPixelSpace;
164 326320 : GDALFastCopyByte(pabySrcData, nIncSrcOffset, pabyDstData,
165 326320 : nPixelSpace, nIters - 1);
166 326320 : pabySrcData +=
167 326320 : static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 2);
168 326320 : pabyDstData +=
169 326320 : static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 2); // nIters - 1 steps in total from the first pixel, same end position as the generic loop below
170 : }
171 : else
172 : {
173 4395716 : for (int i = 0; i < nIters - 1; i++)
174 : {
175 4197550 : pabySrcData += nIncSrcOffset;
176 4197550 : pabyDstData += nPixelSpace;
177 4197550 : memcpy(pabyDstData, pabySrcData, nBandDataSize);
178 : }
179 : }
180 524490 : iSrcX += nSrcXInc * (nIters - 1);
181 524490 : nOuterLoopIters -= nIters - 1;
182 : }
183 : }
184 : else
185 : {
186 : // Type to type conversion ...
187 443 : GDALCopyWords64(pabySrcData, eDataType, nIncSrcOffset, pabyDstData,
188 443 : eBufType, nPixelSpace, std::max(1, nIters)); // at least one word is converted even when nIters is 0
189 443 : if (nIters > 1)
190 : {
191 216 : pabySrcData +=
192 216 : static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 1);
193 216 : pabyDstData +=
194 216 : static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 1);
195 216 : iSrcX += nSrcXInc * (nIters - 1);
196 216 : nOuterLoopIters -= nIters - 1;
197 : }
198 : }
199 : }
200 : // After the loop nOuterLoopIters is 0 while the final pixel is still pending, and -1 once the jump back into the loop body has copied it.
201 : // Deal with the last iteration separately so that iSrcX does not go beyond nRasterXSize - 1
202 1063279 : if (nOuterLoopIters == 0)
203 : {
204 367430 : const int nRasterXSize = poBand->GetXSize();
205 367430 : iSrcX =
206 734860 : static_cast<int>(std::min(static_cast<GInt64>(iSrcX) + nSrcXInc,
207 367430 : static_cast<GInt64>(nRasterXSize - 1)));
208 367430 : pabyDstData += nPixelSpace;
209 367430 : if (iSrcX < nEndBlockX)
210 : {
211 354840 : goto no_reload_block;
212 : }
213 12590 : goto reload_block;
214 : }
215 695849 : return true;
216 : }
217 : // Returns a * b, with the result type deduced from the operand types. NOTE(review): CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW is defined outside this file; presumably it exempts this multiplication from the unsigned-integer-overflow sanitizer - confirm at the macro definition.
218 : template <class A, class B>
219 2818770 : CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW inline auto CPLUnsanitizedMul(A a, B b)
220 : {
221 2818770 : return a * b;
222 : }
223 :
224 : /************************************************************************/
225 : /* IRasterIO() */
226 : /* */
227 : /* Default internal implementation of RasterIO() ... utilizes */
228 : /* the Block access methods to satisfy the request. This would */
229 : /* normally only be overridden by formats with overviews. */
230 : /************************************************************************/
231 :
232 6180720 : CPLErr GDALRasterBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
233 : int nXSize, int nYSize, void *pData,
234 : int nBufXSize, int nBufYSize,
235 : GDALDataType eBufType, GSpacing nPixelSpace,
236 : GSpacing nLineSpace,
237 : GDALRasterIOExtraArg *psExtraArg)
238 :
239 : {
240 6180720 : if (eRWFlag == GF_Write && eFlushBlockErr != CE_None)
241 : {
242 0 : CPLError(eFlushBlockErr, CPLE_AppDefined,
243 : "An error occurred while writing a dirty block "
244 : "from GDALRasterBand::IRasterIO");
245 0 : CPLErr eErr = eFlushBlockErr;
246 0 : eFlushBlockErr = CE_None;
247 0 : return eErr;
248 : }
249 6180720 : if (nBlockXSize <= 0 || nBlockYSize <= 0)
250 : {
251 0 : CPLError(CE_Failure, CPLE_AppDefined, "Invalid block size");
252 0 : return CE_Failure;
253 : }
254 :
255 6180720 : const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
256 6180720 : const int nBufDataSize = GDALGetDataTypeSizeBytes(eBufType);
257 6180720 : GByte dummyBlock[2] = {0, 0};
258 6180720 : GByte *pabySrcBlock =
259 : dummyBlock; /* to avoid Coverity warning about nullptr dereference */
260 6180720 : GDALRasterBlock *poBlock = nullptr;
261 6180720 : const bool bUseIntegerRequestCoords =
262 6545670 : (!psExtraArg->bFloatingPointWindowValidity ||
263 364948 : (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
264 340016 : nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
265 :
266 : /* ==================================================================== */
267 : /* A common case is the data requested with the destination */
268 : /* is packed, and the block width is the raster width. */
269 : /* ==================================================================== */
270 6088900 : if (nPixelSpace == nBufDataSize && nLineSpace == nPixelSpace * nXSize &&
271 3234430 : nBlockXSize == GetXSize() && nBufXSize == nXSize &&
272 12269600 : nBufYSize == nYSize && bUseIntegerRequestCoords)
273 : {
274 3096630 : CPLErr eErr = CE_None;
275 3096630 : int nLBlockY = -1;
276 :
277 9751410 : for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
278 : {
279 6655860 : const int iSrcY = iBufYOff + nYOff;
280 :
281 6655860 : if (iSrcY < nLBlockY * nBlockYSize ||
282 6655860 : iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
283 : {
284 3365160 : nLBlockY = iSrcY / nBlockYSize;
285 3365160 : bool bJustInitialize =
286 297355 : eRWFlag == GF_Write && nXOff == 0 &&
287 3720440 : nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
288 57921 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize;
289 :
290 : // Is this a partial tile at right and/or bottom edges of
291 : // the raster, and that is going to be completely written?
292 : // If so, do not load it from storage, but zero it so that
293 : // the content outside of the validity area is initialized.
294 3365160 : bool bMemZeroBuffer = false;
295 297355 : if (eRWFlag == GF_Write && !bJustInitialize && nXOff == 0 &&
296 24978 : nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
297 3662610 : nYOff + nYSize == GetYSize() &&
298 90 : nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)
299 : {
300 90 : bJustInitialize = true;
301 90 : bMemZeroBuffer = true;
302 : }
303 :
304 3365160 : if (poBlock)
305 268533 : poBlock->DropLock();
306 :
307 3365160 : const GUInt32 nErrorCounter = CPLGetErrorCounter();
308 3365160 : poBlock = GetLockedBlockRef(0, nLBlockY, bJustInitialize);
309 3365160 : if (poBlock == nullptr)
310 : {
311 1078 : if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
312 : nullptr)
313 : {
314 0 : CPLError(CE_Failure, CPLE_AppDefined,
315 : "GetBlockRef failed at X block offset %d, "
316 : "Y block offset %d%s",
317 : 0, nLBlockY,
318 0 : (nErrorCounter != CPLGetErrorCounter())
319 0 : ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
320 : : "");
321 : }
322 1078 : eErr = CE_Failure;
323 1078 : break;
324 : }
325 :
326 3364090 : if (eRWFlag == GF_Write)
327 297355 : poBlock->MarkDirty();
328 :
329 3364090 : pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
330 3364090 : if (bMemZeroBuffer)
331 : {
332 90 : memset(pabySrcBlock, 0,
333 90 : static_cast<GPtrDiff_t>(nBandDataSize) *
334 90 : nBlockXSize * nBlockYSize);
335 : }
336 : }
337 :
338 6654780 : const auto nSrcByteOffset =
339 6654780 : (static_cast<GPtrDiff_t>(iSrcY - nLBlockY * nBlockYSize) *
340 6654780 : nBlockXSize +
341 6654780 : nXOff) *
342 6654780 : nBandDataSize;
343 :
344 6654780 : if (eDataType == eBufType)
345 : {
346 2991080 : if (eRWFlag == GF_Read)
347 2518500 : memcpy(static_cast<GByte *>(pData) +
348 2518500 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
349 2518500 : pabySrcBlock + nSrcByteOffset,
350 : static_cast<size_t>(nLineSpace));
351 : else
352 472580 : memcpy(pabySrcBlock + nSrcByteOffset,
353 472580 : static_cast<GByte *>(pData) +
354 472580 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
355 : static_cast<size_t>(nLineSpace));
356 : }
357 : else
358 : {
359 : // Type to type conversion.
360 3663710 : if (eRWFlag == GF_Read)
361 3641640 : GDALCopyWords64(
362 3641640 : pabySrcBlock + nSrcByteOffset, eDataType, nBandDataSize,
363 : static_cast<GByte *>(pData) +
364 3641640 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
365 : eBufType, static_cast<int>(nPixelSpace), nBufXSize);
366 : else
367 22065 : GDALCopyWords64(static_cast<GByte *>(pData) +
368 22065 : static_cast<GPtrDiff_t>(iBufYOff) *
369 : nLineSpace,
370 : eBufType, static_cast<int>(nPixelSpace),
371 22065 : pabySrcBlock + nSrcByteOffset, eDataType,
372 : nBandDataSize, nBufXSize);
373 : }
374 :
375 6742690 : if (psExtraArg->pfnProgress != nullptr &&
376 87908 : !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
377 : psExtraArg->pProgressData))
378 : {
379 5 : eErr = CE_Failure;
380 5 : break;
381 : }
382 : }
383 :
384 3096630 : if (poBlock)
385 3095550 : poBlock->DropLock();
386 :
387 3096630 : return eErr;
388 : }
389 :
390 : /* ==================================================================== */
391 : /* Do we have overviews that would be appropriate to satisfy */
392 : /* this request? */
393 : /* ==================================================================== */
394 3084090 : if ((nBufXSize < nXSize || nBufYSize < nYSize) && GetOverviewCount() > 0 &&
395 : eRWFlag == GF_Read)
396 : {
397 : GDALRasterIOExtraArg sExtraArg;
398 2967 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
399 :
400 : const int nOverview =
401 2967 : GDALBandGetBestOverviewLevel2(this, nXOff, nYOff, nXSize, nYSize,
402 : nBufXSize, nBufYSize, &sExtraArg);
403 2967 : if (nOverview >= 0)
404 : {
405 2892 : GDALRasterBand *poOverviewBand = GetOverview(nOverview);
406 2892 : if (poOverviewBand == nullptr)
407 2892 : return CE_Failure;
408 :
409 2892 : return poOverviewBand->RasterIO(
410 : eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
411 2892 : nBufYSize, eBufType, nPixelSpace, nLineSpace, &sExtraArg);
412 : }
413 : }
414 :
415 891712 : if (eRWFlag == GF_Read && nBufXSize < nXSize / 100 &&
416 6 : nBufYSize < nYSize / 100 && nPixelSpace == nBufDataSize &&
417 3972910 : nLineSpace == nPixelSpace * nBufXSize &&
418 6 : CPLTestBool(CPLGetConfigOption("GDAL_NO_COSTLY_OVERVIEW", "NO")))
419 : {
420 0 : memset(pData, 0, static_cast<size_t>(nLineSpace * nBufYSize));
421 0 : return CE_None;
422 : }
423 :
424 : /* ==================================================================== */
425 : /* The second case when we don't need subsample data but likely */
426 : /* need data type conversion. */
427 : /* ==================================================================== */
428 3081190 : if ( // nPixelSpace == nBufDataSize &&
429 3081190 : nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
430 : {
431 : #if DEBUG_VERBOSE
432 : printf("IRasterIO(%d,%d,%d,%d) rw=%d case 2\n", /*ok*/
433 : nXOff, nYOff, nXSize, nYSize, static_cast<int>(eRWFlag));
434 : #endif
435 :
436 : /* --------------------------------------------------------------------
437 : */
438 : /* Loop over buffer computing source locations. */
439 : /* --------------------------------------------------------------------
440 : */
441 : // Calculate starting values out of loop
442 2503280 : const int nLBlockXStart = nXOff / nBlockXSize;
443 2503280 : const int nXSpanEnd = nBufXSize + nXOff;
444 :
445 2503280 : int iBufYOff = 0;
446 2503280 : int iSrcY = nYOff;
447 : while (true)
448 : {
449 2544130 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
450 : static_cast<GPtrDiff_t>(nLineSpace);
451 2544130 : int nLBlockY = iSrcY / nBlockYSize;
452 2544130 : int nLBlockX = nLBlockXStart;
453 2544130 : int iSrcX = nXOff;
454 5362820 : while (iSrcX < nXSpanEnd)
455 : {
456 2818770 : int nXSpan = nLBlockX * nBlockXSize;
457 2818770 : if (nXSpan < INT_MAX - nBlockXSize)
458 2818770 : nXSpan += nBlockXSize;
459 : else
460 0 : nXSpan = INT_MAX;
461 2818770 : const int nXRight = nXSpan;
462 2818770 : nXSpan = (nXSpan < nXSpanEnd ? nXSpan : nXSpanEnd) - iSrcX;
463 :
464 : const size_t nXSpanSize =
465 2818770 : CPLUnsanitizedMul(nXSpan, static_cast<size_t>(nPixelSpace));
466 :
467 2818770 : bool bJustInitialize =
468 2042970 : eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
469 38035 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
470 4888110 : nXOff <= nLBlockX * nBlockXSize &&
471 26364 : nXOff + nXSize >= nXRight;
472 :
473 : // Is this a partial tile at right and/or bottom edges of
474 : // the raster, and that is going to be completely written?
475 : // If so, do not load it from storage, but zero it so that
476 : // the content outside of the validity area is initialized.
477 2818770 : bool bMemZeroBuffer = false;
478 2042970 : if (eRWFlag == GF_Write && !bJustInitialize &&
479 2017850 : nXOff <= nLBlockX * nBlockXSize &&
480 2016190 : nYOff <= nLBlockY * nBlockYSize &&
481 12145 : (nXOff + nXSize >= nXRight ||
482 : // cppcheck-suppress knownConditionTrueFalse
483 4864460 : (nXOff + nXSize == GetXSize() && nXRight > GetXSize())) &&
484 11965 : (nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize ||
485 10743 : (nYOff + nYSize == GetYSize() &&
486 1951 : nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)))
487 : {
488 3173 : bJustInitialize = true;
489 3173 : bMemZeroBuffer = true;
490 : }
491 :
492 : /* --------------------------------------------------------------------
493 : */
494 : /* Ensure we have the appropriate block loaded. */
495 : /* --------------------------------------------------------------------
496 : */
497 2818770 : const GUInt32 nErrorCounter = CPLGetErrorCounter();
498 2818770 : poBlock =
499 2818770 : GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
500 2818770 : if (!poBlock)
501 : {
502 73 : if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
503 : nullptr)
504 : {
505 0 : CPLError(CE_Failure, CPLE_AppDefined,
506 : "GetBlockRef failed at X block offset %d, "
507 : "Y block offset %d%s",
508 : nLBlockX, nLBlockY,
509 0 : (nErrorCounter != CPLGetErrorCounter())
510 0 : ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
511 : : "");
512 : }
513 73 : return (CE_Failure);
514 : }
515 :
516 2818700 : if (eRWFlag == GF_Write)
517 2042970 : poBlock->MarkDirty();
518 :
519 2818700 : pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
520 2818700 : if (bMemZeroBuffer)
521 : {
522 3173 : memset(pabySrcBlock, 0,
523 3173 : static_cast<GPtrDiff_t>(nBandDataSize) *
524 3173 : nBlockXSize * nBlockYSize);
525 : }
526 : /* --------------------------------------------------------------------
527 : */
528 : /* Copy over this chunk of data. */
529 : /* --------------------------------------------------------------------
530 : */
531 2818700 : GPtrDiff_t iSrcOffset =
532 2818700 : (static_cast<GPtrDiff_t>(iSrcX) -
533 2818700 : static_cast<GPtrDiff_t>(nLBlockX * nBlockXSize) +
534 2818700 : (static_cast<GPtrDiff_t>(iSrcY) -
535 2818700 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
536 2818700 : nBlockXSize) *
537 2818700 : nBandDataSize;
538 : // Fill up as many rows as possible for the loaded block.
539 5637390 : const int kmax = std::min(nBlockYSize - (iSrcY % nBlockYSize),
540 2818700 : nBufYSize - iBufYOff);
541 60991500 : for (int k = 0; k < kmax; k++)
542 : {
543 58172800 : if (eDataType == eBufType && nPixelSpace == nBufDataSize)
544 : {
545 53770900 : if (eRWFlag == GF_Read)
546 49332700 : memcpy(static_cast<GByte *>(pData) + iBufOffset +
547 49332700 : static_cast<GPtrDiff_t>(k) * nLineSpace,
548 49332700 : pabySrcBlock + iSrcOffset, nXSpanSize);
549 : else
550 4438130 : memcpy(pabySrcBlock + iSrcOffset,
551 4438130 : static_cast<GByte *>(pData) + iBufOffset +
552 4438130 : static_cast<GPtrDiff_t>(k) * nLineSpace,
553 : nXSpanSize);
554 : }
555 : else
556 : {
557 : /* type to type conversion */
558 4401910 : if (eRWFlag == GF_Read)
559 4251700 : GDALCopyWords64(
560 4251700 : pabySrcBlock + iSrcOffset, eDataType,
561 : nBandDataSize,
562 4251700 : static_cast<GByte *>(pData) + iBufOffset +
563 4251700 : static_cast<GPtrDiff_t>(k) * nLineSpace,
564 : eBufType, static_cast<int>(nPixelSpace),
565 : nXSpan);
566 : else
567 150209 : GDALCopyWords64(
568 150209 : static_cast<GByte *>(pData) + iBufOffset +
569 150209 : static_cast<GPtrDiff_t>(k) * nLineSpace,
570 : eBufType, static_cast<int>(nPixelSpace),
571 150209 : pabySrcBlock + iSrcOffset, eDataType,
572 : nBandDataSize, nXSpan);
573 : }
574 :
575 58172800 : iSrcOffset +=
576 58172800 : static_cast<GPtrDiff_t>(nBlockXSize) * nBandDataSize;
577 : }
578 :
579 : iBufOffset =
580 2818700 : CPLUnsanitizedAdd<GPtrDiff_t>(iBufOffset, nXSpanSize);
581 2818700 : nLBlockX++;
582 2818700 : iSrcX += nXSpan;
583 :
584 2818700 : poBlock->DropLock();
585 2818700 : poBlock = nullptr;
586 : }
587 :
588 : /* Compute the increment to go on a block boundary */
589 2544050 : const int nYInc = nBlockYSize - (iSrcY % nBlockYSize);
590 :
591 2545940 : if (psExtraArg->pfnProgress != nullptr &&
592 1884 : !psExtraArg->pfnProgress(
593 2545940 : 1.0 * std::min(nBufYSize, iBufYOff + nYInc) / nBufYSize, "",
594 : psExtraArg->pProgressData))
595 : {
596 0 : return CE_Failure;
597 : }
598 :
599 2544050 : iBufYOff += nYInc;
600 2544050 : if (iBufYOff >= nBufYSize)
601 2503210 : break;
602 : // Only increment iSrcY after above loop end check, to avoid
603 : // potential int overflow.
604 40846 : iSrcY += nYInc;
605 40846 : }
606 :
607 2503210 : return CE_None;
608 : }
609 :
610 : /* ==================================================================== */
611 : /* Loop reading required source blocks to satisfy output */
612 : /* request. This is the most general implementation. */
613 : /* ==================================================================== */
614 :
615 577913 : double dfXOff = nXOff;
616 577913 : double dfYOff = nYOff;
617 577913 : double dfXSize = nXSize;
618 577913 : double dfYSize = nYSize;
619 577913 : if (psExtraArg->bFloatingPointWindowValidity)
620 : {
621 242956 : dfXOff = psExtraArg->dfXOff;
622 242956 : dfYOff = psExtraArg->dfYOff;
623 242956 : dfXSize = psExtraArg->dfXSize;
624 242956 : dfYSize = psExtraArg->dfYSize;
625 : }
626 :
627 : /* -------------------------------------------------------------------- */
628 : /* Compute stepping increment. */
629 : /* -------------------------------------------------------------------- */
630 577913 : const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
631 577913 : const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
632 577913 : CPLErr eErr = CE_None;
633 :
634 577913 : if (eRWFlag == GF_Write)
635 : {
636 : /* --------------------------------------------------------------------
637 : */
638 : /* Write case */
639 : /* Loop over raster window computing source locations in the buffer.
640 : */
641 : /* --------------------------------------------------------------------
642 : */
643 166655 : GByte *pabyDstBlock = nullptr;
644 166655 : int nLBlockX = -1;
645 166655 : int nLBlockY = -1;
646 :
647 1260010 : for (int iDstY = nYOff; iDstY < nYOff + nYSize; iDstY++)
648 : {
649 1093360 : const int iBufYOff = static_cast<int>((iDstY - nYOff) / dfSrcYInc);
650 :
651 12384200 : for (int iDstX = nXOff; iDstX < nXOff + nXSize; iDstX++)
652 : {
653 11290800 : const int iBufXOff =
654 11290800 : static_cast<int>((iDstX - nXOff) / dfSrcXInc);
655 11290800 : GPtrDiff_t iBufOffset =
656 11290800 : static_cast<GPtrDiff_t>(iBufYOff) *
657 : static_cast<GPtrDiff_t>(nLineSpace) +
658 11290800 : iBufXOff * static_cast<GPtrDiff_t>(nPixelSpace);
659 :
660 : // FIXME: this code likely doesn't work if the dirty block gets
661 : // flushed to disk before being completely written.
662 : // In the meantime, bJustInitialize should probably be set to
663 : // FALSE even if it is not ideal performance wise, and for
664 : // lossy compression.
665 :
666 : /* --------------------------------------------------------------------
667 : */
668 : /* Ensure we have the appropriate block loaded. */
669 : /* --------------------------------------------------------------------
670 : */
671 11290800 : if (iDstX < nLBlockX * nBlockXSize ||
672 11041500 : iDstX - nBlockXSize >= nLBlockX * nBlockXSize ||
673 10584800 : iDstY < nLBlockY * nBlockYSize ||
674 10584800 : iDstY - nBlockYSize >= nLBlockY * nBlockYSize)
675 : {
676 738702 : nLBlockX = iDstX / nBlockXSize;
677 738702 : nLBlockY = iDstY / nBlockYSize;
678 :
679 738702 : const bool bJustInitialize =
680 1065990 : nYOff <= nLBlockY * nBlockYSize &&
681 327291 : nYOff + nYSize - nBlockYSize >=
682 327291 : nLBlockY * nBlockYSize &&
683 1116320 : nXOff <= nLBlockX * nBlockXSize &&
684 50325 : nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
685 : /*bool bMemZeroBuffer = FALSE;
686 : if( !bJustInitialize &&
687 : nXOff <= nLBlockX * nBlockXSize &&
688 : nYOff <= nLBlockY * nBlockYSize &&
689 : (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
690 : (nXOff + nXSize == GetXSize() &&
691 : (nLBlockX+1) * nBlockXSize > GetXSize())) &&
692 : (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
693 : (nYOff + nYSize == GetYSize() &&
694 : (nLBlockY+1) * nBlockYSize > GetYSize())) )
695 : {
696 : bJustInitialize = TRUE;
697 : bMemZeroBuffer = TRUE;
698 : }*/
699 738702 : if (poBlock != nullptr)
700 572047 : poBlock->DropLock();
701 :
702 738702 : poBlock =
703 738702 : GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
704 738702 : if (poBlock == nullptr)
705 : {
706 0 : return (CE_Failure);
707 : }
708 :
709 738702 : poBlock->MarkDirty();
710 :
711 738702 : pabyDstBlock = static_cast<GByte *>(poBlock->GetDataRef());
712 : /*if( bMemZeroBuffer )
713 : {
714 : memset(pabyDstBlock, 0,
715 : static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
716 : * nBlockYSize);
717 : }*/
718 : }
719 :
720 : // To make Coverity happy. Should not happen by design.
721 11290800 : if (pabyDstBlock == nullptr)
722 : {
723 0 : CPLAssert(false);
724 : eErr = CE_Failure;
725 : break;
726 : }
727 :
728 : /* --------------------------------------------------------------------
729 : */
730 : /* Copy over this pixel of data. */
731 : /* --------------------------------------------------------------------
732 : */
733 11290800 : GPtrDiff_t iDstOffset =
734 11290800 : (static_cast<GPtrDiff_t>(iDstX) -
735 11290800 : static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
736 11290800 : (static_cast<GPtrDiff_t>(iDstY) -
737 11290800 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
738 11290800 : nBlockXSize) *
739 11290800 : nBandDataSize;
740 :
741 11290800 : if (eDataType == eBufType)
742 : {
743 11287700 : memcpy(pabyDstBlock + iDstOffset,
744 11287700 : static_cast<GByte *>(pData) + iBufOffset,
745 : nBandDataSize);
746 : }
747 : else
748 : {
749 : /* type to type conversion ... ouch, this is expensive way
750 : of handling single words */
751 3096 : GDALCopyWords64(static_cast<GByte *>(pData) + iBufOffset,
752 3096 : eBufType, 0, pabyDstBlock + iDstOffset,
753 : eDataType, 0, 1);
754 : }
755 : }
756 :
757 1093360 : if (psExtraArg->pfnProgress != nullptr &&
758 0 : !psExtraArg->pfnProgress(1.0 * (iDstY - nYOff + 1) / nYSize, "",
759 : psExtraArg->pProgressData))
760 : {
761 0 : eErr = CE_Failure;
762 0 : break;
763 : }
764 : }
765 : }
766 : else
767 : {
768 411258 : if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
769 : {
770 42075 : if ((psExtraArg->eResampleAlg == GRIORA_Cubic ||
771 13559 : psExtraArg->eResampleAlg == GRIORA_CubicSpline ||
772 13506 : psExtraArg->eResampleAlg == GRIORA_Bilinear ||
773 28563 : psExtraArg->eResampleAlg == GRIORA_Lanczos) &&
774 3224 : GetColorTable() != nullptr)
775 : {
776 0 : CPLError(CE_Warning, CPLE_NotSupported,
777 : "Resampling method not supported on paletted band. "
778 : "Falling back to nearest neighbour");
779 : }
780 14261 : else if (psExtraArg->eResampleAlg == GRIORA_Gauss &&
781 3 : GDALDataTypeIsComplex(eDataType))
782 : {
783 0 : CPLError(CE_Warning, CPLE_NotSupported,
784 : "Resampling method not supported on complex data type "
785 : "band. Falling back to nearest neighbour");
786 : }
787 : else
788 : {
789 14258 : return RasterIOResampled(eRWFlag, nXOff, nYOff, nXSize, nYSize,
790 : pData, nBufXSize, nBufYSize, eBufType,
791 14258 : nPixelSpace, nLineSpace, psExtraArg);
792 : }
793 : }
794 :
795 397000 : int nLimitBlockY = 0;
796 397000 : const bool bByteCopy = eDataType == eBufType && nBandDataSize == 1;
797 397000 : int nStartBlockX = -nBlockXSize;
798 397000 : constexpr double EPS = 1e-10;
799 397000 : int nLBlockY = -1;
800 397000 : const double dfSrcXStart = 0.5 * dfSrcXInc + dfXOff + EPS;
801 397000 : const bool bIntegerXFactor =
802 372767 : bUseIntegerRequestCoords &&
803 670836 : static_cast<int>(dfSrcXInc) == dfSrcXInc &&
804 273836 : static_cast<int>(dfSrcXInc) < INT_MAX / nBandDataSize;
805 :
806 : /* --------------------------------------------------------------------
807 : */
808 : /* Read case */
809 : /* Loop over buffer computing source locations. */
810 : /* --------------------------------------------------------------------
811 : */
812 2367100 : for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
813 : {
814 : // Add small epsilon to avoid some numeric precision issues.
815 1970110 : const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
816 1970110 : const int iSrcY = static_cast<int>(std::min(
817 1970110 : std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
818 :
819 1970110 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
820 : static_cast<GPtrDiff_t>(nLineSpace);
821 :
822 1970110 : if (iSrcY >= nLimitBlockY)
823 : {
824 438018 : nLBlockY = iSrcY / nBlockYSize;
825 438018 : nLimitBlockY = nLBlockY * nBlockYSize;
826 438018 : if (nLimitBlockY < INT_MAX - nBlockYSize)
827 438018 : nLimitBlockY += nBlockYSize;
828 : else
829 0 : nLimitBlockY = INT_MAX;
830 : // Make sure a new block is loaded.
831 438018 : nStartBlockX = -nBlockXSize;
832 : }
833 1532090 : else if (static_cast<int>(dfSrcXStart) < nStartBlockX)
834 : {
835 : // Make sure a new block is loaded.
836 437363 : nStartBlockX = -nBlockXSize;
837 : }
838 :
839 1970110 : GPtrDiff_t iSrcOffsetCst = (iSrcY - nLBlockY * nBlockYSize) *
840 1970110 : static_cast<GPtrDiff_t>(nBlockXSize);
841 :
842 1970110 : if (bIntegerXFactor)
843 : {
844 695850 : int iSrcX = static_cast<int>(dfSrcXStart);
845 695850 : const int nSrcXInc = static_cast<int>(dfSrcXInc);
846 695850 : GByte *pabyDstData = static_cast<GByte *>(pData) + iBufOffset;
847 695850 : bool bRet = false;
848 695850 : if (bByteCopy)
849 : {
850 585842 : bRet = DownsamplingIntegerXFactor<true, 1>(
851 : this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
852 : static_cast<int>(nPixelSpace), nBufXSize, GDT_UInt8,
853 : GDT_UInt8, nStartBlockX, nBlockXSize, poBlock,
854 : nLBlockY);
855 : }
856 110008 : else if (eDataType == eBufType)
857 : {
858 109783 : switch (nBandDataSize)
859 : {
860 109630 : case 2:
861 109630 : bRet = DownsamplingIntegerXFactor<true, 2>(
862 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
863 : pabyDstData, static_cast<int>(nPixelSpace),
864 : nBufXSize, eDataType, eDataType, nStartBlockX,
865 : nBlockXSize, poBlock, nLBlockY);
866 109630 : break;
867 55 : case 4:
868 55 : bRet = DownsamplingIntegerXFactor<true, 4>(
869 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
870 : pabyDstData, static_cast<int>(nPixelSpace),
871 : nBufXSize, eDataType, eDataType, nStartBlockX,
872 : nBlockXSize, poBlock, nLBlockY);
873 55 : break;
874 96 : case 8:
875 96 : bRet = DownsamplingIntegerXFactor<true, 8>(
876 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
877 : pabyDstData, static_cast<int>(nPixelSpace),
878 : nBufXSize, eDataType, eDataType, nStartBlockX,
879 : nBlockXSize, poBlock, nLBlockY);
880 96 : break;
881 2 : case 16:
882 2 : bRet = DownsamplingIntegerXFactor<true, 16>(
883 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
884 : pabyDstData, static_cast<int>(nPixelSpace),
885 : nBufXSize, eDataType, eDataType, nStartBlockX,
886 : nBlockXSize, poBlock, nLBlockY);
887 2 : break;
888 0 : default:
889 0 : CPLAssert(false);
890 : break;
891 : }
892 : }
893 : else
894 : {
895 225 : bRet = DownsamplingIntegerXFactor<false, 0>(
896 : this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
897 : static_cast<int>(nPixelSpace), nBufXSize, eDataType,
898 : eBufType, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
899 : }
900 695850 : if (!bRet)
901 1 : eErr = CE_Failure;
902 : }
903 : else
904 : {
905 1274260 : double dfSrcX = dfSrcXStart;
906 503811000 : for (int iBufXOff = 0; iBufXOff < nBufXSize;
907 502537000 : iBufXOff++, dfSrcX += dfSrcXInc)
908 : {
909 : // TODO?: try to avoid the clamping for most iterations
910 : const int iSrcX = static_cast<int>(
911 1005070000 : std::min(std::max(0.0, dfSrcX),
912 502537000 : static_cast<double>(nRasterXSize - 1)));
913 :
914 : /* --------------------------------------------------------------------
915 : */
916 : /* Ensure we have the appropriate block loaded. */
917 : /* --------------------------------------------------------------------
918 : */
919 502537000 : if (iSrcX >= nBlockXSize + nStartBlockX)
920 : {
921 1697820 : const int nLBlockX = iSrcX / nBlockXSize;
922 1697820 : nStartBlockX = nLBlockX * nBlockXSize;
923 :
924 1697820 : if (poBlock != nullptr)
925 1574650 : poBlock->DropLock();
926 :
927 1697820 : poBlock = GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
928 1697820 : if (poBlock == nullptr)
929 : {
930 9 : eErr = CE_Failure;
931 9 : break;
932 : }
933 :
934 : pabySrcBlock =
935 1697810 : static_cast<GByte *>(poBlock->GetDataRef());
936 : }
937 502537000 : const GPtrDiff_t nDiffX =
938 502537000 : static_cast<GPtrDiff_t>(iSrcX - nStartBlockX);
939 :
940 : /* --------------------------------------------------------------------
941 : */
942 : /* Copy over this pixel of data. */
943 : /* --------------------------------------------------------------------
944 : */
945 :
946 502537000 : if (bByteCopy)
947 : {
948 442592000 : GPtrDiff_t iSrcOffset = nDiffX + iSrcOffsetCst;
949 442592000 : static_cast<GByte *>(pData)[iBufOffset] =
950 442592000 : pabySrcBlock[iSrcOffset];
951 : }
952 59944700 : else if (eDataType == eBufType)
953 : {
954 50322800 : GPtrDiff_t iSrcOffset =
955 50322800 : (nDiffX + iSrcOffsetCst) * nBandDataSize;
956 50322800 : memcpy(static_cast<GByte *>(pData) + iBufOffset,
957 50322800 : pabySrcBlock + iSrcOffset, nBandDataSize);
958 : }
959 : else
960 : {
961 : // Type to type conversion ...
962 9621890 : GPtrDiff_t iSrcOffset =
963 9621890 : (nDiffX + iSrcOffsetCst) * nBandDataSize;
964 9621890 : GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
965 : static_cast<GByte *>(pData) +
966 9621890 : iBufOffset,
967 : eBufType, 0, 1);
968 : }
969 :
970 502537000 : iBufOffset += static_cast<int>(nPixelSpace);
971 : }
972 : }
973 1970110 : if (eErr == CE_Failure)
974 11 : break;
975 :
976 2191530 : if (psExtraArg->pfnProgress != nullptr &&
977 221434 : !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
978 : psExtraArg->pProgressData))
979 : {
980 1 : eErr = CE_Failure;
981 1 : break;
982 : }
983 : }
984 : }
985 :
986 563655 : if (poBlock != nullptr)
987 563645 : poBlock->DropLock();
988 :
989 563655 : return eErr;
990 : }
991 :
992 : /************************************************************************/
993 : /* GDALRasterIOTransformer() */
994 : /************************************************************************/
995 :
/** Parameters of the linear pixel-coordinate mapping applied by
 * GDALRasterIOTransformer().
 *
 * The destination-to-source direction computes
 * src = dst * ratio + offset on each axis; the reverse direction inverts it.
 * Members are default-initialized to the identity mapping (offset 0, ratio 1)
 * so that a default-constructed instance is never indeterminate and the
 * inverse mapping never divides by an uninitialized value.
 */
struct GDALRasterIOTransformerStruct
{
    double dfXOff = 0.0;            //!< Offset added on the X axis.
    double dfYOff = 0.0;            //!< Offset added on the Y axis.
    double dfXRatioDstToSrc = 1.0;  //!< X scale applied to destination coords.
    double dfYRatioDstToSrc = 1.0;  //!< Y scale applied to destination coords.
};
1003 :
1004 6897 : static int GDALRasterIOTransformer(void *pTransformerArg, int bDstToSrc,
1005 : int nPointCount, double *x, double *y,
1006 : double * /* z */, int *panSuccess)
1007 : {
1008 6897 : GDALRasterIOTransformerStruct *psParams =
1009 : static_cast<GDALRasterIOTransformerStruct *>(pTransformerArg);
1010 6897 : if (bDstToSrc)
1011 : {
1012 311993 : for (int i = 0; i < nPointCount; i++)
1013 : {
1014 305684 : x[i] = x[i] * psParams->dfXRatioDstToSrc + psParams->dfXOff;
1015 305684 : y[i] = y[i] * psParams->dfYRatioDstToSrc + psParams->dfYOff;
1016 305684 : panSuccess[i] = TRUE;
1017 : }
1018 : }
1019 : else
1020 : {
1021 1176 : for (int i = 0; i < nPointCount; i++)
1022 : {
1023 588 : x[i] = (x[i] - psParams->dfXOff) / psParams->dfXRatioDstToSrc;
1024 588 : y[i] = (y[i] - psParams->dfYOff) / psParams->dfYRatioDstToSrc;
1025 588 : panSuccess[i] = TRUE;
1026 : }
1027 : }
1028 6897 : return TRUE;
1029 : }
1030 :
1031 : /************************************************************************/
1032 : /* RasterIOResampled() */
1033 : /************************************************************************/
1034 :
1035 : //! @cond Doxygen_Suppress
1036 14258 : CPLErr GDALRasterBand::RasterIOResampled(
1037 : GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
1038 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
1039 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
1040 : {
1041 : // Determine if we use warping resampling or overview resampling
1042 : const bool bUseWarp =
1043 14258 : (GDALDataTypeIsComplex(eDataType) &&
1044 14417 : psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
1045 159 : psExtraArg->eResampleAlg != GRIORA_Mode);
1046 :
1047 14258 : double dfXOff = nXOff;
1048 14258 : double dfYOff = nYOff;
1049 14258 : double dfXSize = nXSize;
1050 14258 : double dfYSize = nYSize;
1051 14258 : if (psExtraArg->bFloatingPointWindowValidity)
1052 : {
1053 13512 : dfXOff = psExtraArg->dfXOff;
1054 13512 : dfYOff = psExtraArg->dfYOff;
1055 13512 : dfXSize = psExtraArg->dfXSize;
1056 13512 : dfYSize = psExtraArg->dfYSize;
1057 : }
1058 :
1059 14258 : const double dfXRatioDstToSrc = dfXSize / nBufXSize;
1060 14258 : const double dfYRatioDstToSrc = dfYSize / nBufYSize;
1061 :
1062 : // Determine the coordinates in the "virtual" output raster to see
1063 : // if there are not integers, in which case we will use them as a shift
1064 : // so that subwindow extracts give the exact same results as entire raster
1065 : // scaling.
1066 14258 : double dfDestXOff = dfXOff / dfXRatioDstToSrc;
1067 14258 : bool bHasXOffVirtual = false;
1068 14258 : int nDestXOffVirtual = 0;
1069 14258 : if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
1070 : {
1071 13930 : bHasXOffVirtual = true;
1072 13930 : dfXOff = nXOff;
1073 13930 : nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
1074 : }
1075 :
1076 14258 : double dfDestYOff = dfYOff / dfYRatioDstToSrc;
1077 14258 : bool bHasYOffVirtual = false;
1078 14258 : int nDestYOffVirtual = 0;
1079 14258 : if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
1080 : {
1081 13926 : bHasYOffVirtual = true;
1082 13926 : dfYOff = nYOff;
1083 13926 : nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
1084 : }
1085 :
1086 : // Create a MEM dataset that wraps the output buffer.
1087 : GDALDataset *poMEMDS;
1088 14258 : void *pTempBuffer = nullptr;
1089 14258 : GSpacing nPSMem = nPixelSpace;
1090 14258 : GSpacing nLSMem = nLineSpace;
1091 14258 : void *pDataMem = pData;
1092 14258 : GDALDataType eDTMem = eBufType;
1093 14258 : if (eBufType != eDataType && !GDAL_GET_OPERATE_IN_BUF_TYPE(*psExtraArg))
1094 : {
1095 4 : nPSMem = GDALGetDataTypeSizeBytes(eDataType);
1096 4 : nLSMem = nPSMem * nBufXSize;
1097 : pTempBuffer =
1098 4 : VSI_MALLOC2_VERBOSE(nBufYSize, static_cast<size_t>(nLSMem));
1099 4 : if (pTempBuffer == nullptr)
1100 0 : return CE_Failure;
1101 4 : pDataMem = pTempBuffer;
1102 4 : eDTMem = eDataType;
1103 : }
1104 :
1105 : poMEMDS =
1106 14258 : MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
1107 : nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr);
1108 14258 : GByte *pabyData = static_cast<GByte *>(pDataMem) -
1109 14258 : nPSMem * nDestXOffVirtual - nLSMem * nDestYOffVirtual;
1110 14258 : GDALRasterBandH hMEMBand = MEMCreateRasterBandEx(
1111 : poMEMDS, 1, pabyData, eDTMem, nPSMem, nLSMem, false);
1112 14258 : poMEMDS->SetBand(1, GDALRasterBand::FromHandle(hMEMBand));
1113 :
1114 14258 : const char *pszNBITS = GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
1115 14258 : const int nNBITS = pszNBITS ? atoi(pszNBITS) : 0;
1116 14258 : if (pszNBITS)
1117 6 : GDALRasterBand::FromHandle(hMEMBand)->SetMetadataItem(
1118 6 : "NBITS", pszNBITS, "IMAGE_STRUCTURE");
1119 :
1120 14258 : CPLErr eErr = CE_None;
1121 :
1122 : // Do the resampling.
1123 14258 : if (bUseWarp)
1124 : {
1125 149 : int bHasNoData = FALSE;
1126 149 : double dfNoDataValue = GetNoDataValue(&bHasNoData);
1127 :
1128 149 : VRTDatasetH hVRTDS = nullptr;
1129 149 : GDALRasterBandH hVRTBand = nullptr;
1130 149 : if (GetDataset() == nullptr)
1131 : {
1132 : /* Create VRT dataset that wraps the whole dataset */
1133 0 : hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
1134 0 : VRTAddBand(hVRTDS, eDataType, nullptr);
1135 0 : hVRTBand = GDALGetRasterBand(hVRTDS, 1);
1136 0 : VRTAddSimpleSource(hVRTBand, this, 0, 0, nRasterXSize, nRasterYSize,
1137 : 0, 0, nRasterXSize, nRasterYSize, nullptr,
1138 : VRT_NODATA_UNSET);
1139 :
1140 : /* Add a mask band if needed */
1141 0 : if (GetMaskFlags() != GMF_ALL_VALID)
1142 : {
1143 0 : GDALDataset::FromHandle(hVRTDS)->CreateMaskBand(0);
1144 : VRTSourcedRasterBand *poVRTMaskBand =
1145 : reinterpret_cast<VRTSourcedRasterBand *>(
1146 : reinterpret_cast<GDALRasterBand *>(hVRTBand)
1147 0 : ->GetMaskBand());
1148 0 : poVRTMaskBand->AddMaskBandSource(this, 0, 0, nRasterXSize,
1149 0 : nRasterYSize, 0, 0,
1150 0 : nRasterXSize, nRasterYSize);
1151 : }
1152 : }
1153 :
1154 149 : GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
1155 149 : switch (psExtraArg->eResampleAlg)
1156 : {
1157 0 : case GRIORA_NearestNeighbour:
1158 0 : psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
1159 0 : break;
1160 147 : case GRIORA_Bilinear:
1161 147 : psWarpOptions->eResampleAlg = GRA_Bilinear;
1162 147 : break;
1163 0 : case GRIORA_Cubic:
1164 0 : psWarpOptions->eResampleAlg = GRA_Cubic;
1165 0 : break;
1166 0 : case GRIORA_CubicSpline:
1167 0 : psWarpOptions->eResampleAlg = GRA_CubicSpline;
1168 0 : break;
1169 0 : case GRIORA_Lanczos:
1170 0 : psWarpOptions->eResampleAlg = GRA_Lanczos;
1171 0 : break;
1172 0 : case GRIORA_Average:
1173 0 : psWarpOptions->eResampleAlg = GRA_Average;
1174 0 : break;
1175 2 : case GRIORA_RMS:
1176 2 : psWarpOptions->eResampleAlg = GRA_RMS;
1177 2 : break;
1178 0 : case GRIORA_Mode:
1179 0 : psWarpOptions->eResampleAlg = GRA_Mode;
1180 0 : break;
1181 0 : default:
1182 0 : CPLAssert(false);
1183 : psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
1184 : break;
1185 : }
1186 149 : psWarpOptions->hSrcDS = hVRTDS ? hVRTDS : GetDataset();
1187 149 : psWarpOptions->hDstDS = poMEMDS;
1188 149 : psWarpOptions->nBandCount = 1;
1189 149 : int nSrcBandNumber = hVRTDS ? 1 : nBand;
1190 149 : int nDstBandNumber = 1;
1191 149 : psWarpOptions->panSrcBands = &nSrcBandNumber;
1192 149 : psWarpOptions->panDstBands = &nDstBandNumber;
1193 298 : psWarpOptions->pfnProgress = psExtraArg->pfnProgress
1194 149 : ? psExtraArg->pfnProgress
1195 : : GDALDummyProgress;
1196 149 : psWarpOptions->pProgressArg = psExtraArg->pProgressData;
1197 149 : psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
1198 149 : if (bHasNoData)
1199 : {
1200 0 : psWarpOptions->papszWarpOptions = CSLSetNameValue(
1201 : psWarpOptions->papszWarpOptions, "INIT_DEST", "NO_DATA");
1202 0 : if (psWarpOptions->padfSrcNoDataReal == nullptr)
1203 : {
1204 0 : psWarpOptions->padfSrcNoDataReal =
1205 0 : static_cast<double *>(CPLMalloc(sizeof(double)));
1206 0 : psWarpOptions->padfSrcNoDataReal[0] = dfNoDataValue;
1207 : }
1208 :
1209 0 : if (psWarpOptions->padfDstNoDataReal == nullptr)
1210 : {
1211 0 : psWarpOptions->padfDstNoDataReal =
1212 0 : static_cast<double *>(CPLMalloc(sizeof(double)));
1213 0 : psWarpOptions->padfDstNoDataReal[0] = dfNoDataValue;
1214 : }
1215 : }
1216 :
1217 : GDALRasterIOTransformerStruct sTransformer;
1218 149 : sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
1219 149 : sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
1220 149 : sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
1221 149 : sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
1222 149 : psWarpOptions->pTransformerArg = &sTransformer;
1223 :
1224 : GDALWarpOperationH hWarpOperation =
1225 149 : GDALCreateWarpOperation(psWarpOptions);
1226 149 : eErr = GDALChunkAndWarpImage(hWarpOperation, nDestXOffVirtual,
1227 : nDestYOffVirtual, nBufXSize, nBufYSize);
1228 149 : GDALDestroyWarpOperation(hWarpOperation);
1229 :
1230 149 : psWarpOptions->panSrcBands = nullptr;
1231 149 : psWarpOptions->panDstBands = nullptr;
1232 149 : GDALDestroyWarpOptions(psWarpOptions);
1233 :
1234 149 : if (hVRTDS)
1235 0 : GDALClose(hVRTDS);
1236 : }
1237 : else
1238 : {
1239 : const char *pszResampling =
1240 14109 : GDALRasterIOGetResampleAlg(psExtraArg->eResampleAlg);
1241 14109 : int nKernelRadius = 0;
1242 : GDALResampleFunction pfnResampleFunc =
1243 14109 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
1244 14109 : CPLAssert(pfnResampleFunc);
1245 : GDALDataType eWrkDataType =
1246 14109 : GDALGetOvrWorkDataType(pszResampling, eDataType);
1247 14109 : int nHasNoData = 0;
1248 14109 : double dfNoDataValue = GetNoDataValue(&nHasNoData);
1249 14109 : const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
1250 14109 : if (!bHasNoData)
1251 13977 : dfNoDataValue = 0.0;
1252 :
1253 14109 : int nDstBlockXSize = nBufXSize;
1254 14109 : int nDstBlockYSize = nBufYSize;
1255 14109 : int nFullResXChunk = 0;
1256 14109 : int nFullResYChunk = 0;
1257 : while (true)
1258 : {
1259 14120 : nFullResXChunk =
1260 14120 : 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
1261 14120 : nFullResYChunk =
1262 14120 : 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
1263 14120 : if (nFullResXChunk > nRasterXSize)
1264 4777 : nFullResXChunk = nRasterXSize;
1265 14120 : if (nFullResYChunk > nRasterYSize)
1266 594 : nFullResYChunk = nRasterYSize;
1267 14120 : if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
1268 14062 : (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
1269 : 1024 * 1024))
1270 : break;
1271 : // When operating on the full width of a raster whose block width is
1272 : // the raster width, prefer doing chunks in height.
1273 11 : if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
1274 : nDstBlockYSize > 1)
1275 0 : nDstBlockYSize /= 2;
1276 : /* Otherwise cut the maximal dimension */
1277 11 : else if (nDstBlockXSize > 1 &&
1278 0 : (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
1279 11 : nDstBlockXSize /= 2;
1280 : else
1281 0 : nDstBlockYSize /= 2;
1282 : }
1283 :
1284 14109 : int nOvrXFactor = static_cast<int>(0.5 + dfXRatioDstToSrc);
1285 14109 : int nOvrYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
1286 14109 : if (nOvrXFactor == 0)
1287 2029 : nOvrXFactor = 1;
1288 14109 : if (nOvrYFactor == 0)
1289 2028 : nOvrYFactor = 1;
1290 14109 : int nFullResXSizeQueried =
1291 14109 : nFullResXChunk + 2 * nKernelRadius * nOvrXFactor;
1292 14109 : int nFullResYSizeQueried =
1293 14109 : nFullResYChunk + 2 * nKernelRadius * nOvrYFactor;
1294 :
1295 14109 : if (nFullResXSizeQueried > nRasterXSize)
1296 2734 : nFullResXSizeQueried = nRasterXSize;
1297 14109 : if (nFullResYSizeQueried > nRasterYSize)
1298 332 : nFullResYSizeQueried = nRasterYSize;
1299 :
1300 : void *pChunk =
1301 14109 : VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
1302 : nFullResXSizeQueried, nFullResYSizeQueried);
1303 14109 : GByte *pabyChunkNoDataMask = nullptr;
1304 :
1305 14109 : GDALRasterBand *poMaskBand = GetMaskBand();
1306 14109 : int l_nMaskFlags = GetMaskFlags();
1307 :
1308 14109 : bool bUseNoDataMask = ((l_nMaskFlags & GMF_ALL_VALID) == 0);
1309 14109 : if (bUseNoDataMask)
1310 : {
1311 7525 : pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
1312 : nFullResXSizeQueried, nFullResYSizeQueried));
1313 : }
1314 14109 : if (pChunk == nullptr ||
1315 7525 : (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
1316 : {
1317 0 : GDALClose(poMEMDS);
1318 0 : CPLFree(pChunk);
1319 0 : CPLFree(pabyChunkNoDataMask);
1320 0 : VSIFree(pTempBuffer);
1321 0 : return CE_Failure;
1322 : }
1323 :
1324 14109 : const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
1325 14109 : DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
1326 14109 : int nBlocksDone = 0;
1327 :
1328 : int nDstYOff;
1329 28218 : for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
1330 14109 : nDstYOff += nDstBlockYSize)
1331 : {
1332 : int nDstYCount;
1333 14109 : if (nDstYOff + nDstBlockYSize <= nBufYSize)
1334 14109 : nDstYCount = nDstBlockYSize;
1335 : else
1336 0 : nDstYCount = nBufYSize - nDstYOff;
1337 :
1338 14109 : int nChunkYOff =
1339 14109 : nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
1340 14109 : int nChunkYOff2 = nYOff + 1 +
1341 14109 : static_cast<int>(ceil((nDstYOff + nDstYCount) *
1342 : dfYRatioDstToSrc));
1343 14109 : if (nChunkYOff2 > nRasterYSize)
1344 782 : nChunkYOff2 = nRasterYSize;
1345 14109 : int nYCount = nChunkYOff2 - nChunkYOff;
1346 14109 : CPLAssert(nYCount <= nFullResYChunk);
1347 :
1348 14109 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrYFactor;
1349 14109 : int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrYFactor;
1350 14109 : if (nChunkYOffQueried < 0)
1351 : {
1352 491 : nChunkYSizeQueried += nChunkYOffQueried;
1353 491 : nChunkYOffQueried = 0;
1354 : }
1355 14109 : if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
1356 594 : nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
1357 14109 : CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
1358 :
1359 14109 : int nDstXOff = 0;
1360 28218 : for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
1361 14109 : nDstXOff += nDstBlockXSize)
1362 : {
1363 14109 : int nDstXCount = 0;
1364 14109 : if (nDstXOff + nDstBlockXSize <= nBufXSize)
1365 14109 : nDstXCount = nDstBlockXSize;
1366 : else
1367 0 : nDstXCount = nBufXSize - nDstXOff;
1368 :
1369 14109 : int nChunkXOff =
1370 14109 : nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
1371 14109 : int nChunkXOff2 =
1372 14109 : nXOff + 1 +
1373 14109 : static_cast<int>(
1374 14109 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
1375 14109 : if (nChunkXOff2 > nRasterXSize)
1376 8802 : nChunkXOff2 = nRasterXSize;
1377 14109 : int nXCount = nChunkXOff2 - nChunkXOff;
1378 14109 : CPLAssert(nXCount <= nFullResXChunk);
1379 :
1380 14109 : int nChunkXOffQueried =
1381 14109 : nChunkXOff - nKernelRadius * nOvrXFactor;
1382 14109 : int nChunkXSizeQueried =
1383 14109 : nXCount + 2 * nKernelRadius * nOvrXFactor;
1384 14109 : if (nChunkXOffQueried < 0)
1385 : {
1386 2795 : nChunkXSizeQueried += nChunkXOffQueried;
1387 2795 : nChunkXOffQueried = 0;
1388 : }
1389 14109 : if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
1390 2781 : nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
1391 14109 : CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
1392 :
1393 : // Read the source buffers.
1394 14109 : eErr = RasterIO(GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1395 : nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
1396 : nChunkXSizeQueried, nChunkYSizeQueried,
1397 : eWrkDataType, 0, 0, nullptr);
1398 :
1399 14109 : bool bSkipResample = false;
1400 14109 : bool bNoDataMaskFullyOpaque = false;
1401 14109 : if (eErr == CE_None && bUseNoDataMask)
1402 : {
1403 7525 : eErr = poMaskBand->RasterIO(
1404 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1405 : nChunkXSizeQueried, nChunkYSizeQueried,
1406 : pabyChunkNoDataMask, nChunkXSizeQueried,
1407 : nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
1408 :
1409 : /* Optimizations if mask if fully opaque or transparent */
1410 7525 : int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
1411 7525 : GByte bVal = pabyChunkNoDataMask[0];
1412 7525 : int i = 1;
1413 15237000 : for (; i < nPixels; i++)
1414 : {
1415 15230700 : if (pabyChunkNoDataMask[i] != bVal)
1416 1168 : break;
1417 : }
1418 7525 : if (i == nPixels)
1419 : {
1420 6357 : if (bVal == 0)
1421 : {
1422 12094 : for (int j = 0; j < nDstYCount; j++)
1423 : {
1424 6377 : GDALCopyWords64(&dfNoDataValue, GDT_Float64, 0,
1425 : static_cast<GByte *>(pDataMem) +
1426 6377 : nLSMem * (j + nDstYOff) +
1427 6377 : nDstXOff * nPSMem,
1428 : eDTMem,
1429 : static_cast<int>(nPSMem),
1430 : nDstXCount);
1431 : }
1432 5717 : bSkipResample = true;
1433 : }
1434 : else
1435 : {
1436 640 : bNoDataMaskFullyOpaque = true;
1437 : }
1438 : }
1439 : }
1440 :
1441 14109 : if (!bSkipResample && eErr == CE_None)
1442 : {
1443 8389 : const bool bPropagateNoData = false;
1444 8389 : void *pDstBuffer = nullptr;
1445 8389 : GDALDataType eDstBufferDataType = GDT_Unknown;
1446 : GDALRasterBand *poMEMBand =
1447 8389 : GDALRasterBand::FromHandle(hMEMBand);
1448 8389 : GDALOverviewResampleArgs args;
1449 8389 : args.eSrcDataType = eDataType;
1450 8389 : args.eOvrDataType = poMEMBand->GetRasterDataType();
1451 8389 : args.nOvrXSize = poMEMBand->GetXSize();
1452 8389 : args.nOvrYSize = poMEMBand->GetYSize();
1453 8389 : args.nOvrNBITS = nNBITS;
1454 8389 : args.dfXRatioDstToSrc = dfXRatioDstToSrc;
1455 8389 : args.dfYRatioDstToSrc = dfYRatioDstToSrc;
1456 8389 : args.dfSrcXDelta =
1457 8389 : dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
1458 8389 : args.dfSrcYDelta =
1459 8389 : dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
1460 8389 : args.eWrkDataType = eWrkDataType;
1461 8389 : args.pabyChunkNodataMask =
1462 8389 : bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask;
1463 8389 : args.nChunkXOff =
1464 8389 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
1465 8389 : args.nChunkXSize = nChunkXSizeQueried;
1466 8389 : args.nChunkYOff =
1467 8389 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
1468 8389 : args.nChunkYSize = nChunkYSizeQueried;
1469 8389 : args.nDstXOff = nDstXOff + nDestXOffVirtual;
1470 8389 : args.nDstXOff2 = nDstXOff + nDestXOffVirtual + nDstXCount;
1471 8389 : args.nDstYOff = nDstYOff + nDestYOffVirtual;
1472 8389 : args.nDstYOff2 = nDstYOff + nDestYOffVirtual + nDstYCount;
1473 8389 : args.pszResampling = pszResampling;
1474 8389 : args.bHasNoData = bHasNoData;
1475 8389 : args.dfNoDataValue = dfNoDataValue;
1476 8389 : args.poColorTable = GetColorTable();
1477 8389 : args.bPropagateNoData = bPropagateNoData;
1478 8389 : eErr = pfnResampleFunc(args, pChunk, &pDstBuffer,
1479 : &eDstBufferDataType);
1480 8389 : if (eErr == CE_None)
1481 : {
1482 8389 : eErr = poMEMBand->RasterIO(
1483 : GF_Write, nDstXOff + nDestXOffVirtual,
1484 : nDstYOff + nDestYOffVirtual, nDstXCount, nDstYCount,
1485 : pDstBuffer, nDstXCount, nDstYCount,
1486 : eDstBufferDataType, 0, 0, nullptr);
1487 : }
1488 8389 : CPLFree(pDstBuffer);
1489 : }
1490 :
1491 14109 : nBlocksDone++;
1492 25031 : if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
1493 10922 : !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
1494 : "", psExtraArg->pProgressData))
1495 : {
1496 1 : eErr = CE_Failure;
1497 : }
1498 : }
1499 : }
1500 :
1501 14109 : CPLFree(pChunk);
1502 14109 : CPLFree(pabyChunkNoDataMask);
1503 : }
1504 :
1505 14258 : if (pTempBuffer)
1506 : {
1507 4 : CPL_IGNORE_RET_VAL(poMEMDS->GetRasterBand(1)->RasterIO(
1508 : GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
1509 : pData, nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
1510 : nullptr));
1511 : }
1512 14258 : GDALClose(poMEMDS);
1513 14258 : VSIFree(pTempBuffer);
1514 :
1515 14258 : return eErr;
1516 : }
1517 :
1518 : /************************************************************************/
1519 : /* RasterIOResampled() */
1520 : /************************************************************************/
1521 :
1522 892 : CPLErr GDALDataset::RasterIOResampled(
1523 : GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
1524 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
1525 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
1526 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
1527 :
1528 : {
1529 : #if 0
1530 : // Determine if we use warping resampling or overview resampling
1531 : bool bUseWarp = false;
1532 : if( GDALDataTypeIsComplex( eDataType ) )
1533 : bUseWarp = true;
1534 : #endif
1535 :
1536 892 : double dfXOff = nXOff;
1537 892 : double dfYOff = nYOff;
1538 892 : double dfXSize = nXSize;
1539 892 : double dfYSize = nYSize;
1540 892 : if (psExtraArg->bFloatingPointWindowValidity)
1541 : {
1542 765 : dfXOff = psExtraArg->dfXOff;
1543 765 : dfYOff = psExtraArg->dfYOff;
1544 765 : dfXSize = psExtraArg->dfXSize;
1545 765 : dfYSize = psExtraArg->dfYSize;
1546 : }
1547 :
1548 892 : const double dfXRatioDstToSrc = dfXSize / nBufXSize;
1549 892 : const double dfYRatioDstToSrc = dfYSize / nBufYSize;
1550 :
1551 : // Determine the coordinates in the "virtual" output raster to see
1552 : // if there are not integers, in which case we will use them as a shift
1553 : // so that subwindow extracts give the exact same results as entire raster
1554 : // scaling.
1555 892 : double dfDestXOff = dfXOff / dfXRatioDstToSrc;
1556 892 : bool bHasXOffVirtual = false;
1557 892 : int nDestXOffVirtual = 0;
1558 892 : if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
1559 : {
1560 767 : bHasXOffVirtual = true;
1561 767 : dfXOff = nXOff;
1562 767 : nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
1563 : }
1564 :
1565 892 : double dfDestYOff = dfYOff / dfYRatioDstToSrc;
1566 892 : bool bHasYOffVirtual = false;
1567 892 : int nDestYOffVirtual = 0;
1568 892 : if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
1569 : {
1570 727 : bHasYOffVirtual = true;
1571 727 : dfYOff = nYOff;
1572 727 : nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
1573 : }
1574 :
1575 : // Create a MEM dataset that wraps the output buffer.
1576 892 : std::unique_ptr<void, VSIFreeReleaser> pTempBuffer;
1577 892 : GSpacing nPSMem = nPixelSpace;
1578 892 : GSpacing nLSMem = nLineSpace;
1579 892 : GSpacing nBandSpaceMEM = nBandSpace;
1580 892 : void *pDataMem = pData;
1581 892 : GDALDataType eDTMem = eBufType;
1582 892 : GDALRasterBand *poFirstSrcBand = GetRasterBand(panBandMap[0]);
1583 892 : const GDALDataType eDataType = poFirstSrcBand->GetRasterDataType();
1584 892 : if (eBufType != eDataType && !GDAL_GET_OPERATE_IN_BUF_TYPE(*psExtraArg))
1585 : {
1586 2 : nPSMem = GDALGetDataTypeSizeBytes(eDataType);
1587 2 : nLSMem = nPSMem * nBufXSize;
1588 2 : nBandSpaceMEM = nLSMem * nBandCount;
1589 2 : pTempBuffer.reset(VSI_MALLOC3_VERBOSE(nBandCount, nBufYSize,
1590 : static_cast<size_t>(nLSMem)));
1591 2 : if (pTempBuffer == nullptr)
1592 0 : return CE_Failure;
1593 2 : pDataMem = pTempBuffer.get();
1594 2 : eDTMem = eDataType;
1595 : }
1596 :
1597 : auto poMEMDS = std::unique_ptr<GDALDataset>(
1598 892 : MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
1599 1784 : nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr));
1600 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1601 : std::vector<GDALRasterBand *> apoDstBands(nBandCount);
1602 : #endif
1603 892 : int nNBITS = 0;
1604 2896 : for (int i = 0; i < nBandCount; i++)
1605 : {
1606 2004 : GByte *const pBandData = static_cast<GByte *>(pDataMem) -
1607 2004 : nPSMem * nDestXOffVirtual -
1608 2004 : nLSMem * nDestYOffVirtual + nBandSpaceMEM * i;
1609 2004 : auto poMEMBand = GDALRasterBand::FromHandle(MEMCreateRasterBandEx(
1610 : poMEMDS.get(), i + 1, pBandData, eDTMem, nPSMem, nLSMem, false));
1611 2004 : poMEMDS->SetBand(i + 1, poMEMBand);
1612 :
1613 2004 : GDALRasterBand *poSrcBand = GetRasterBand(panBandMap[i]);
1614 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1615 : apoDstBands[i] = poMEMBand;
1616 : #endif
1617 : const char *pszNBITS =
1618 2004 : poSrcBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
1619 2004 : if (pszNBITS)
1620 : {
1621 0 : nNBITS = atoi(pszNBITS);
1622 0 : poMEMDS->GetRasterBand(i + 1)->SetMetadataItem("NBITS", pszNBITS,
1623 0 : "IMAGE_STRUCTURE");
1624 : }
1625 : }
1626 :
1627 892 : CPLErr eErr = CE_None;
1628 :
1629 : // TODO(schwehr): Why disabled? Why not just delete?
1630 : // Looks like this code was initially added as disable by copying
1631 : // from RasterIO here:
1632 : // https://trac.osgeo.org/gdal/changeset/29572
1633 : #if 0
1634 : // Do the resampling.
1635 : if( bUseWarp )
1636 : {
1637 : VRTDatasetH hVRTDS = nullptr;
1638 : GDALRasterBandH hVRTBand = nullptr;
1639 : if( GetDataset() == nullptr )
1640 : {
1641 : /* Create VRT dataset that wraps the whole dataset */
1642 : hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
1643 : VRTAddBand( hVRTDS, eDataType, nullptr );
1644 : hVRTBand = GDALGetRasterBand(hVRTDS, 1);
1645 : VRTAddSimpleSource( (VRTSourcedRasterBandH)hVRTBand,
1646 : (GDALRasterBandH)this,
1647 : 0, 0,
1648 : nRasterXSize, nRasterYSize,
1649 : 0, 0,
1650 : nRasterXSize, nRasterYSize,
1651 : nullptr, VRT_NODATA_UNSET );
1652 :
1653 : /* Add a mask band if needed */
1654 : if( GetMaskFlags() != GMF_ALL_VALID )
1655 : {
1656 : ((GDALDataset*)hVRTDS)->CreateMaskBand(0);
1657 : VRTSourcedRasterBand* poVRTMaskBand =
1658 : (VRTSourcedRasterBand*)(((GDALRasterBand*)hVRTBand)->GetMaskBand());
1659 : poVRTMaskBand->
1660 : AddMaskBandSource( this,
1661 : 0, 0,
1662 : nRasterXSize, nRasterYSize,
1663 : 0, 0,
1664 : nRasterXSize, nRasterYSize);
1665 : }
1666 : }
1667 :
1668 : GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
1669 : psWarpOptions->eResampleAlg = (GDALResampleAlg)psExtraArg->eResampleAlg;
1670 : psWarpOptions->hSrcDS = (GDALDatasetH) (hVRTDS ? hVRTDS : GetDataset());
1671 : psWarpOptions->hDstDS = (GDALDatasetH) poMEMDS;
1672 : psWarpOptions->nBandCount = 1;
1673 : int nSrcBandNumber = (hVRTDS ? 1 : nBand);
1674 : int nDstBandNumber = 1;
1675 : psWarpOptions->panSrcBands = &nSrcBandNumber;
1676 : psWarpOptions->panDstBands = &nDstBandNumber;
1677 : psWarpOptions->pfnProgress = psExtraArg->pfnProgress ?
1678 : psExtraArg->pfnProgress : GDALDummyProgress;
1679 : psWarpOptions->pProgressArg = psExtraArg->pProgressData;
1680 : psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
1681 : GDALRasterIOTransformerStruct sTransformer;
1682 : sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
1683 : sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
1684 : sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
1685 : sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
1686 : psWarpOptions->pTransformerArg = &sTransformer;
1687 :
1688 : GDALWarpOperationH hWarpOperation = GDALCreateWarpOperation(psWarpOptions);
1689 : eErr = GDALChunkAndWarpImage( hWarpOperation,
1690 : nDestXOffVirtual, nDestYOffVirtual,
1691 : nBufXSize, nBufYSize );
1692 : GDALDestroyWarpOperation( hWarpOperation );
1693 :
1694 : psWarpOptions->panSrcBands = nullptr;
1695 : psWarpOptions->panDstBands = nullptr;
1696 : GDALDestroyWarpOptions( psWarpOptions );
1697 :
1698 : if( hVRTDS )
1699 : GDALClose(hVRTDS);
1700 : }
1701 : else
1702 : #endif
1703 : {
1704 : const char *pszResampling =
1705 892 : GDALRasterIOGetResampleAlg(psExtraArg->eResampleAlg);
1706 :
1707 : int nBlockXSize, nBlockYSize;
1708 892 : poFirstSrcBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
1709 :
1710 : int nKernelRadius;
1711 : GDALResampleFunction pfnResampleFunc =
1712 892 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
1713 892 : CPLAssert(pfnResampleFunc);
1714 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1715 : GDALResampleFunctionMultiBands pfnResampleFuncMultiBands =
1716 : GDALGetResampleFunctionMultiBands(pszResampling, &nKernelRadius);
1717 : #endif
1718 : GDALDataType eWrkDataType =
1719 892 : GDALGetOvrWorkDataType(pszResampling, eDataType);
1720 :
1721 892 : int nDstBlockXSize = nBufXSize;
1722 892 : int nDstBlockYSize = nBufYSize;
1723 : int nFullResXChunk, nFullResYChunk;
1724 : while (true)
1725 : {
1726 892 : nFullResXChunk =
1727 892 : 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
1728 892 : nFullResYChunk =
1729 892 : 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
1730 892 : if (nFullResXChunk > nRasterXSize)
1731 591 : nFullResXChunk = nRasterXSize;
1732 892 : if (nFullResYChunk > nRasterYSize)
1733 57 : nFullResYChunk = nRasterYSize;
1734 892 : if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
1735 890 : (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
1736 : 1024 * 1024))
1737 : break;
1738 : // When operating on the full width of a raster whose block width is
1739 : // the raster width, prefer doing chunks in height.
1740 0 : if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
1741 : nDstBlockYSize > 1)
1742 0 : nDstBlockYSize /= 2;
1743 : /* Otherwise cut the maximal dimension */
1744 0 : else if (nDstBlockXSize > 1 &&
1745 0 : (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
1746 0 : nDstBlockXSize /= 2;
1747 : else
1748 0 : nDstBlockYSize /= 2;
1749 : }
1750 :
1751 1784 : int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
1752 892 : static_cast<int>(0.5 + dfYRatioDstToSrc));
1753 892 : if (nOvrFactor == 0)
1754 104 : nOvrFactor = 1;
1755 892 : int nFullResXSizeQueried =
1756 892 : nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
1757 892 : int nFullResYSizeQueried =
1758 892 : nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
1759 :
1760 892 : if (nFullResXSizeQueried > nRasterXSize)
1761 616 : nFullResXSizeQueried = nRasterXSize;
1762 892 : if (nFullResYSizeQueried > nRasterYSize)
1763 60 : nFullResYSizeQueried = nRasterYSize;
1764 :
1765 892 : void *pChunk = VSI_MALLOC3_VERBOSE(
1766 : cpl::fits_on<int>(GDALGetDataTypeSizeBytes(eWrkDataType) *
1767 : nBandCount),
1768 : nFullResXSizeQueried, nFullResYSizeQueried);
1769 892 : GByte *pabyChunkNoDataMask = nullptr;
1770 :
1771 892 : GDALRasterBand *poMaskBand = poFirstSrcBand->GetMaskBand();
1772 892 : int nMaskFlags = poFirstSrcBand->GetMaskFlags();
1773 :
1774 892 : bool bUseNoDataMask = ((nMaskFlags & GMF_ALL_VALID) == 0);
1775 892 : if (bUseNoDataMask)
1776 : {
1777 617 : pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
1778 : nFullResXSizeQueried, nFullResYSizeQueried));
1779 : }
1780 892 : if (pChunk == nullptr ||
1781 617 : (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
1782 : {
1783 0 : CPLFree(pChunk);
1784 0 : CPLFree(pabyChunkNoDataMask);
1785 0 : return CE_Failure;
1786 : }
1787 :
1788 892 : const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
1789 892 : DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
1790 892 : int nBlocksDone = 0;
1791 :
1792 : int nDstYOff;
1793 1784 : for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
1794 892 : nDstYOff += nDstBlockYSize)
1795 : {
1796 : int nDstYCount;
1797 892 : if (nDstYOff + nDstBlockYSize <= nBufYSize)
1798 892 : nDstYCount = nDstBlockYSize;
1799 : else
1800 0 : nDstYCount = nBufYSize - nDstYOff;
1801 :
1802 892 : int nChunkYOff =
1803 892 : nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
1804 892 : int nChunkYOff2 = nYOff + 1 +
1805 892 : static_cast<int>(ceil((nDstYOff + nDstYCount) *
1806 : dfYRatioDstToSrc));
1807 892 : if (nChunkYOff2 > nRasterYSize)
1808 139 : nChunkYOff2 = nRasterYSize;
1809 892 : int nYCount = nChunkYOff2 - nChunkYOff;
1810 892 : CPLAssert(nYCount <= nFullResYChunk);
1811 :
1812 892 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
1813 892 : int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
1814 892 : if (nChunkYOffQueried < 0)
1815 : {
1816 142 : nChunkYSizeQueried += nChunkYOffQueried;
1817 142 : nChunkYOffQueried = 0;
1818 : }
1819 892 : if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
1820 157 : nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
1821 892 : CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
1822 :
1823 : int nDstXOff;
1824 1784 : for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
1825 892 : nDstXOff += nDstBlockXSize)
1826 : {
1827 : int nDstXCount;
1828 892 : if (nDstXOff + nDstBlockXSize <= nBufXSize)
1829 892 : nDstXCount = nDstBlockXSize;
1830 : else
1831 0 : nDstXCount = nBufXSize - nDstXOff;
1832 :
1833 892 : int nChunkXOff =
1834 892 : nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
1835 892 : int nChunkXOff2 =
1836 892 : nXOff + 1 +
1837 892 : static_cast<int>(
1838 892 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
1839 892 : if (nChunkXOff2 > nRasterXSize)
1840 647 : nChunkXOff2 = nRasterXSize;
1841 892 : int nXCount = nChunkXOff2 - nChunkXOff;
1842 892 : CPLAssert(nXCount <= nFullResXChunk);
1843 :
1844 892 : int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
1845 892 : int nChunkXSizeQueried =
1846 892 : nXCount + 2 * nKernelRadius * nOvrFactor;
1847 892 : if (nChunkXOffQueried < 0)
1848 : {
1849 647 : nChunkXSizeQueried += nChunkXOffQueried;
1850 647 : nChunkXOffQueried = 0;
1851 : }
1852 892 : if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
1853 655 : nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
1854 892 : CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
1855 :
1856 892 : bool bSkipResample = false;
1857 892 : bool bNoDataMaskFullyOpaque = false;
1858 892 : if (eErr == CE_None && bUseNoDataMask)
1859 : {
1860 617 : eErr = poMaskBand->RasterIO(
1861 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1862 : nChunkXSizeQueried, nChunkYSizeQueried,
1863 : pabyChunkNoDataMask, nChunkXSizeQueried,
1864 : nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
1865 :
1866 : /* Optimizations if mask if fully opaque or transparent */
1867 617 : const int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
1868 617 : const GByte bVal = pabyChunkNoDataMask[0];
1869 617 : int i = 1; // Used after for.
1870 48197000 : for (; i < nPixels; i++)
1871 : {
1872 48196500 : if (pabyChunkNoDataMask[i] != bVal)
1873 72 : break;
1874 : }
1875 617 : if (i == nPixels)
1876 : {
1877 545 : if (bVal == 0)
1878 : {
1879 373 : GByte abyZero[16] = {0};
1880 780 : for (int iBand = 0; iBand < nBandCount; iBand++)
1881 : {
1882 3499 : for (int j = 0; j < nDstYCount; j++)
1883 : {
1884 3092 : GDALCopyWords64(
1885 : abyZero, GDT_UInt8, 0,
1886 : static_cast<GByte *>(pDataMem) +
1887 3092 : iBand * nBandSpaceMEM +
1888 3092 : nLSMem * (j + nDstYOff) +
1889 3092 : nDstXOff * nPSMem,
1890 : eBufType, static_cast<int>(nPSMem),
1891 : nDstXCount);
1892 : }
1893 : }
1894 373 : bSkipResample = true;
1895 : }
1896 : else
1897 : {
1898 172 : bNoDataMaskFullyOpaque = true;
1899 : }
1900 : }
1901 : }
1902 :
1903 892 : if (!bSkipResample && eErr == CE_None)
1904 : {
1905 : /* Read the source buffers */
1906 516 : eErr = RasterIO(
1907 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1908 : nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
1909 : nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType,
1910 : nBandCount, panBandMap, 0, 0, 0, nullptr);
1911 : }
1912 :
1913 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1914 : if (pfnResampleFuncMultiBands && !bSkipResample &&
1915 : eErr == CE_None)
1916 : {
1917 : eErr = pfnResampleFuncMultiBands(
1918 : dfXRatioDstToSrc, dfYRatioDstToSrc,
1919 : dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
1920 : dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
1921 : eWrkDataType, (GByte *)pChunk, nBandCount,
1922 : bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
1923 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
1924 : nChunkXSizeQueried,
1925 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
1926 : nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
1927 : nDstXOff + nDestXOffVirtual + nDstXCount,
1928 : nDstYOff + nDestYOffVirtual,
1929 : nDstYOff + nDestYOffVirtual + nDstYCount,
1930 : apoDstBands.data(), pszResampling, FALSE /*bHasNoData*/,
1931 : 0.0 /* dfNoDataValue */, nullptr /* color table*/,
1932 : eDataType);
1933 : }
1934 : else
1935 : #endif
1936 : {
1937 : size_t nChunkBandOffset =
1938 892 : static_cast<size_t>(nChunkXSizeQueried) *
1939 892 : nChunkYSizeQueried *
1940 892 : GDALGetDataTypeSizeBytes(eWrkDataType);
1941 2480 : for (int i = 0;
1942 2480 : i < nBandCount && !bSkipResample && eErr == CE_None;
1943 : i++)
1944 : {
1945 1588 : const bool bPropagateNoData = false;
1946 1588 : void *pDstBuffer = nullptr;
1947 1588 : GDALDataType eDstBufferDataType = GDT_Unknown;
1948 : GDALRasterBand *poMEMBand =
1949 1588 : poMEMDS->GetRasterBand(i + 1);
1950 1588 : GDALOverviewResampleArgs args;
1951 1588 : args.eSrcDataType = eDataType;
1952 1588 : args.eOvrDataType = poMEMBand->GetRasterDataType();
1953 1588 : args.nOvrXSize = poMEMBand->GetXSize();
1954 1588 : args.nOvrYSize = poMEMBand->GetYSize();
1955 1588 : args.nOvrNBITS = nNBITS;
1956 1588 : args.dfXRatioDstToSrc = dfXRatioDstToSrc;
1957 1588 : args.dfYRatioDstToSrc = dfYRatioDstToSrc;
1958 1588 : args.dfSrcXDelta =
1959 1588 : dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
1960 1588 : args.dfSrcYDelta =
1961 1588 : dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
1962 1588 : args.eWrkDataType = eWrkDataType;
1963 1588 : args.pabyChunkNodataMask = bNoDataMaskFullyOpaque
1964 1588 : ? nullptr
1965 : : pabyChunkNoDataMask;
1966 1588 : args.nChunkXOff =
1967 1588 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
1968 1588 : args.nChunkXSize = nChunkXSizeQueried;
1969 1588 : args.nChunkYOff =
1970 1588 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
1971 1588 : args.nChunkYSize = nChunkYSizeQueried;
1972 1588 : args.nDstXOff = nDstXOff + nDestXOffVirtual;
1973 1588 : args.nDstXOff2 =
1974 1588 : nDstXOff + nDestXOffVirtual + nDstXCount;
1975 1588 : args.nDstYOff = nDstYOff + nDestYOffVirtual;
1976 1588 : args.nDstYOff2 =
1977 1588 : nDstYOff + nDestYOffVirtual + nDstYCount;
1978 1588 : args.pszResampling = pszResampling;
1979 1588 : args.bHasNoData = false;
1980 1588 : args.dfNoDataValue = 0.0;
1981 1588 : args.poColorTable = nullptr;
1982 1588 : args.bPropagateNoData = bPropagateNoData;
1983 :
1984 : eErr =
1985 3176 : pfnResampleFunc(args,
1986 1588 : reinterpret_cast<GByte *>(pChunk) +
1987 1588 : i * nChunkBandOffset,
1988 : &pDstBuffer, &eDstBufferDataType);
1989 1588 : if (eErr == CE_None)
1990 : {
1991 1588 : eErr = poMEMBand->RasterIO(
1992 : GF_Write, nDstXOff + nDestXOffVirtual,
1993 : nDstYOff + nDestYOffVirtual, nDstXCount,
1994 : nDstYCount, pDstBuffer, nDstXCount, nDstYCount,
1995 : eDstBufferDataType, 0, 0, nullptr);
1996 : }
1997 1588 : CPLFree(pDstBuffer);
1998 : }
1999 : }
2000 :
2001 892 : nBlocksDone++;
2002 1281 : if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
2003 389 : !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
2004 : "", psExtraArg->pProgressData))
2005 : {
2006 0 : eErr = CE_Failure;
2007 : }
2008 : }
2009 : }
2010 :
2011 892 : CPLFree(pChunk);
2012 892 : CPLFree(pabyChunkNoDataMask);
2013 : }
2014 :
2015 892 : if (pTempBuffer)
2016 : {
2017 2 : CPL_IGNORE_RET_VAL(poMEMDS->RasterIO(
2018 : GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
2019 : pData, nBufXSize, nBufYSize, eBufType, nBandCount, nullptr,
2020 : nPixelSpace, nLineSpace, nBandSpace, nullptr));
2021 : }
2022 :
2023 892 : return eErr;
2024 : }
2025 :
2026 : //! @endcond
2027 :
2028 : /************************************************************************/
2029 : /* GDALSwapWords() */
2030 : /************************************************************************/
2031 :
2032 : /**
2033 : * Byte swap words in-place.
2034 : *
2035 : * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
2036 : * a memory array. No assumption is made that the words being swapped are
2037 : * word aligned in memory. Use the CPL_LSB and CPL_MSB macros from cpl_port.h
2038 : * to determine if the current platform is big endian or little endian. Use
2039 : * The macros like CPL_SWAP32() to byte swap single values without the overhead
2040 : * of a function call.
2041 : *
2042 : * @param pData pointer to start of data buffer.
2043 : * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
2044 : * @param nWordCount the number of words to be swapped in this call.
2045 : * @param nWordSkip the byte offset from the start of one word to the start of
2046 : * the next. For packed buffers this is the same as nWordSize.
2047 : */
2048 :
2049 497149 : void CPL_STDCALL GDALSwapWords(void *pData, int nWordSize, int nWordCount,
2050 : int nWordSkip)
2051 :
2052 : {
2053 497149 : if (nWordCount > 0)
2054 497149 : VALIDATE_POINTER0(pData, "GDALSwapWords");
2055 :
2056 497149 : GByte *pabyData = static_cast<GByte *>(pData);
2057 :
2058 497149 : switch (nWordSize)
2059 : {
2060 7234 : case 1:
2061 7234 : break;
2062 :
2063 476905 : case 2:
2064 476905 : CPLAssert(nWordSkip >= 2 || nWordCount == 1);
2065 228062000 : for (int i = 0; i < nWordCount; i++)
2066 : {
2067 227585000 : CPL_SWAP16PTR(pabyData);
2068 227585000 : pabyData += nWordSkip;
2069 : }
2070 476905 : break;
2071 :
2072 10584 : case 4:
2073 10584 : CPLAssert(nWordSkip >= 4 || nWordCount == 1);
2074 10584 : if (CPL_IS_ALIGNED(pabyData, 4) && (nWordSkip % 4) == 0)
2075 : {
2076 29140600 : for (int i = 0; i < nWordCount; i++)
2077 : {
2078 29130000 : *reinterpret_cast<GUInt32 *>(pabyData) = CPL_SWAP32(
2079 : *reinterpret_cast<const GUInt32 *>(pabyData));
2080 29130000 : pabyData += nWordSkip;
2081 10581 : }
2082 : }
2083 : else
2084 : {
2085 9 : for (int i = 0; i < nWordCount; i++)
2086 : {
2087 6 : CPL_SWAP32PTR(pabyData);
2088 6 : pabyData += nWordSkip;
2089 : }
2090 : }
2091 10584 : break;
2092 :
2093 2426 : case 8:
2094 2426 : CPLAssert(nWordSkip >= 8 || nWordCount == 1);
2095 2426 : if (CPL_IS_ALIGNED(pabyData, 8) && (nWordSkip % 8) == 0)
2096 : {
2097 3356900 : for (int i = 0; i < nWordCount; i++)
2098 : {
2099 3354480 : *reinterpret_cast<GUInt64 *>(pabyData) = CPL_SWAP64(
2100 : *reinterpret_cast<const GUInt64 *>(pabyData));
2101 3354480 : pabyData += nWordSkip;
2102 2425 : }
2103 : }
2104 : else
2105 : {
2106 3 : for (int i = 0; i < nWordCount; i++)
2107 : {
2108 2 : CPL_SWAP64PTR(pabyData);
2109 2 : pabyData += nWordSkip;
2110 : }
2111 : }
2112 2426 : break;
2113 :
2114 0 : default:
2115 0 : CPLAssert(false);
2116 : }
2117 : }
2118 :
2119 : /************************************************************************/
2120 : /* GDALSwapWordsEx() */
2121 : /************************************************************************/
2122 :
2123 : /**
2124 : * Byte swap words in-place.
2125 : *
2126 : * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
2127 : * a memory array. No assumption is made that the words being swapped are
2128 : * word aligned in memory. Use the CPL_LSB and CPL_MSB macros from cpl_port.h
2129 : * to determine if the current platform is big endian or little endian. Use
2130 : * The macros like CPL_SWAP32() to byte swap single values without the overhead
2131 : * of a function call.
2132 : *
2133 : * @param pData pointer to start of data buffer.
2134 : * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
2135 : * @param nWordCount the number of words to be swapped in this call.
2136 : * @param nWordSkip the byte offset from the start of one word to the start of
2137 : * the next. For packed buffers this is the same as nWordSize.
2138 : */
2139 6130 : void CPL_STDCALL GDALSwapWordsEx(void *pData, int nWordSize, size_t nWordCount,
2140 : int nWordSkip)
2141 : {
2142 6130 : GByte *pabyData = static_cast<GByte *>(pData);
2143 12260 : while (nWordCount)
2144 : {
2145 : // Pick-up a multiple of 8 as max chunk size.
2146 6130 : const int nWordCountSmall =
2147 6130 : (nWordCount > (1 << 30)) ? (1 << 30) : static_cast<int>(nWordCount);
2148 6130 : GDALSwapWords(pabyData, nWordSize, nWordCountSmall, nWordSkip);
2149 6130 : pabyData += static_cast<size_t>(nWordSkip) * nWordCountSmall;
2150 6130 : nWordCount -= nWordCountSmall;
2151 : }
2152 6130 : }
2153 :
2154 : // Place the new GDALCopyWords helpers in an anonymous namespace
2155 : namespace
2156 : {
2157 :
2158 : /************************************************************************/
2159 : /* GDALCopyWordsT() */
2160 : /************************************************************************/
2161 : /**
2162 : * Template function, used to copy data from pSrcData into buffer
2163 : * pDstData, with stride nSrcPixelStride in the source data and
2164 : * stride nDstPixelStride in the destination data. This template can
2165 : * deal with the case where the input data type is real or complex and
2166 : * the output is real.
2167 : *
2168 : * @param pSrcData the source data buffer
2169 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
2170 : * of interest.
2171 : * @param pDstData the destination buffer.
2172 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
2173 : * interest.
2174 : * @param nWordCount the total number of pixel words to copy
2175 : *
2176 : * @code
2177 : * // Assume an input buffer of type GUInt16 named pBufferIn
2178 : * GByte *pBufferOut = new GByte[numBytesOut];
2179 : * GDALCopyWordsT<GUInt16, GByte>(pSrcData, 2, pDstData, 1, numBytesOut);
2180 : * @endcode
2181 : * @note
2182 : * This is a private function, and should not be exposed outside of
2183 : * rasterio.cpp. External users should call the GDALCopyWords driver function.
2184 : */
2185 :
2186 : template <class Tin, class Tout>
2187 49013857 : static void inline GDALCopyWordsGenericT(const Tin *const CPL_RESTRICT pSrcData,
2188 : int nSrcPixelStride,
2189 : Tout *const CPL_RESTRICT pDstData,
2190 : int nDstPixelStride,
2191 : GPtrDiff_t nWordCount)
2192 : {
2193 49013857 : decltype(nWordCount) nDstOffset = 0;
2194 :
2195 49013857 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
2196 49013857 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
2197 356655113 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
2198 : {
2199 307641208 : const Tin tValue =
2200 307641208 : *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
2201 307641208 : Tout *const pOutPixel =
2202 307641208 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2203 :
2204 307641208 : GDALCopyWord(tValue, *pOutPixel);
2205 :
2206 307641208 : nDstOffset += nDstPixelStride;
2207 : }
2208 49013857 : }
2209 :
2210 : template <class Tin, class Tout>
2211 29776660 : static void CPL_NOINLINE GDALCopyWordsT(const Tin *const CPL_RESTRICT pSrcData,
2212 : int nSrcPixelStride,
2213 : Tout *const CPL_RESTRICT pDstData,
2214 : int nDstPixelStride,
2215 : GPtrDiff_t nWordCount)
2216 : {
2217 29776660 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData, nDstPixelStride,
2218 : nWordCount);
2219 29776660 : }
2220 :
2221 : template <class Tin, class Tout>
2222 5080936 : static void inline GDALCopyWordsT_8atatime(
2223 : const Tin *const CPL_RESTRICT pSrcData, int nSrcPixelStride,
2224 : Tout *const CPL_RESTRICT pDstData, int nDstPixelStride,
2225 : GPtrDiff_t nWordCount)
2226 : {
2227 5080936 : decltype(nWordCount) nDstOffset = 0;
2228 :
2229 5080936 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
2230 5080936 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
2231 5080936 : decltype(nWordCount) n = 0;
2232 5080936 : if (nSrcPixelStride == static_cast<int>(sizeof(Tin)) &&
2233 : nDstPixelStride == static_cast<int>(sizeof(Tout)))
2234 : {
2235 52932327 : for (; n < nWordCount - 7; n += 8)
2236 : {
2237 52390796 : const Tin *pInValues = reinterpret_cast<const Tin *>(
2238 52390796 : pSrcDataPtr + (n * nSrcPixelStride));
2239 52390796 : Tout *const pOutPixels =
2240 52390796 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2241 :
2242 52390796 : GDALCopy8Words(pInValues, pOutPixels);
2243 :
2244 52390796 : nDstOffset += 8 * nDstPixelStride;
2245 : }
2246 : }
2247 10465999 : for (; n < nWordCount; n++)
2248 : {
2249 5385053 : const Tin tValue =
2250 5385053 : *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
2251 5385053 : Tout *const pOutPixel =
2252 5385053 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2253 :
2254 5385053 : GDALCopyWord(tValue, *pOutPixel);
2255 :
2256 5385053 : nDstOffset += nDstPixelStride;
2257 : }
2258 5080936 : }
2259 :
2260 : #ifdef HAVE_SSE2
2261 :
2262 : template <class Tout>
2263 1042126 : void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
2264 : int nSrcPixelStride,
2265 : Tout *const CPL_RESTRICT pDstData,
2266 : int nDstPixelStride, GPtrDiff_t nWordCount)
2267 : {
2268 : static_assert(std::is_integral<Tout>::value &&
2269 : sizeof(Tout) == sizeof(uint16_t),
2270 : "Bad Tout");
2271 1042126 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2272 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2273 : {
2274 35752 : decltype(nWordCount) n = 0;
2275 35752 : const __m128i xmm_zero = _mm_setzero_si128();
2276 35752 : GByte *CPL_RESTRICT pabyDstDataPtr =
2277 : reinterpret_cast<GByte *>(pDstData);
2278 1478148 : for (; n < nWordCount - 15; n += 16)
2279 : {
2280 1442396 : __m128i xmm = _mm_loadu_si128(
2281 1442396 : reinterpret_cast<const __m128i *>(pSrcData + n));
2282 1442396 : __m128i xmm0 = _mm_unpacklo_epi8(xmm, xmm_zero);
2283 1442396 : __m128i xmm1 = _mm_unpackhi_epi8(xmm, xmm_zero);
2284 : _mm_storeu_si128(
2285 1442396 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2), xmm0);
2286 : _mm_storeu_si128(
2287 1442396 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2 + 16), xmm1);
2288 : }
2289 111662 : for (; n < nWordCount; n++)
2290 : {
2291 75910 : pDstData[n] = pSrcData[n];
2292 35752 : }
2293 : }
2294 : else
2295 : {
2296 1006371 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2297 : nDstPixelStride, nWordCount);
2298 : }
2299 1042126 : }
2300 :
2301 : template <>
2302 1029400 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2303 : int nSrcPixelStride,
2304 : GUInt16 *const CPL_RESTRICT pDstData,
2305 : int nDstPixelStride, GPtrDiff_t nWordCount)
2306 : {
2307 1029400 : GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
2308 : nDstPixelStride, nWordCount);
2309 1029400 : }
2310 :
2311 : template <>
2312 12726 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2313 : int nSrcPixelStride,
2314 : GInt16 *const CPL_RESTRICT pDstData,
2315 : int nDstPixelStride, GPtrDiff_t nWordCount)
2316 : {
2317 12726 : GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
2318 : nDstPixelStride, nWordCount);
2319 12726 : }
2320 :
2321 : template <class Tout>
2322 16237176 : void GDALCopyWordsByteTo32Bit(const GByte *const CPL_RESTRICT pSrcData,
2323 : int nSrcPixelStride,
2324 : Tout *const CPL_RESTRICT pDstData,
2325 : int nDstPixelStride, GPtrDiff_t nWordCount)
2326 : {
2327 : static_assert(std::is_integral<Tout>::value &&
2328 : sizeof(Tout) == sizeof(uint32_t),
2329 : "Bad Tout");
2330 16237176 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2331 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2332 : {
2333 6532686 : decltype(nWordCount) n = 0;
2334 6532686 : const __m128i xmm_zero = _mm_setzero_si128();
2335 6532686 : GByte *CPL_RESTRICT pabyDstDataPtr =
2336 : reinterpret_cast<GByte *>(pDstData);
2337 74248227 : for (; n < nWordCount - 15; n += 16)
2338 : {
2339 67715461 : __m128i xmm = _mm_loadu_si128(
2340 67715461 : reinterpret_cast<const __m128i *>(pSrcData + n));
2341 67715461 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2342 67715461 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2343 67715461 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2344 67715461 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2345 67715461 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2346 67715461 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2347 : _mm_storeu_si128(
2348 67715461 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4), xmm0);
2349 : _mm_storeu_si128(
2350 67715461 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 16), xmm1);
2351 : _mm_storeu_si128(
2352 67715461 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 32), xmm2);
2353 : _mm_storeu_si128(
2354 67715461 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 48), xmm3);
2355 : }
2356 14826316 : for (; n < nWordCount; n++)
2357 : {
2358 8293640 : pDstData[n] = pSrcData[n];
2359 6532686 : }
2360 : }
2361 : else
2362 : {
2363 9704510 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2364 : nDstPixelStride, nWordCount);
2365 : }
2366 16237176 : }
2367 :
2368 : template <>
2369 476 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2370 : int nSrcPixelStride,
2371 : GUInt32 *const CPL_RESTRICT pDstData,
2372 : int nDstPixelStride, GPtrDiff_t nWordCount)
2373 : {
2374 476 : GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
2375 : nDstPixelStride, nWordCount);
2376 476 : }
2377 :
2378 : template <>
2379 16236700 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2380 : int nSrcPixelStride,
2381 : GInt32 *const CPL_RESTRICT pDstData,
2382 : int nDstPixelStride, GPtrDiff_t nWordCount)
2383 : {
2384 16236700 : GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
2385 : nDstPixelStride, nWordCount);
2386 16236700 : }
2387 :
2388 : template <>
2389 2851070 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2390 : int nSrcPixelStride,
2391 : float *const CPL_RESTRICT pDstData,
2392 : int nDstPixelStride, GPtrDiff_t nWordCount)
2393 : {
2394 2851070 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2395 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2396 : {
2397 228189 : decltype(nWordCount) n = 0;
2398 228189 : const __m128i xmm_zero = _mm_setzero_si128();
2399 228189 : GByte *CPL_RESTRICT pabyDstDataPtr =
2400 : reinterpret_cast<GByte *>(pDstData);
2401 2267160 : for (; n < nWordCount - 15; n += 16)
2402 : {
2403 2038970 : __m128i xmm = _mm_loadu_si128(
2404 2038970 : reinterpret_cast<const __m128i *>(pSrcData + n));
2405 2038970 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2406 2038970 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2407 2038970 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2408 2038970 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2409 2038970 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2410 2038970 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2411 2038970 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2412 2038970 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2413 2038970 : __m128 xmm2_f = _mm_cvtepi32_ps(xmm2);
2414 2038970 : __m128 xmm3_f = _mm_cvtepi32_ps(xmm3);
2415 2038970 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2416 : xmm0_f);
2417 : _mm_storeu_ps(
2418 2038970 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2419 : _mm_storeu_ps(
2420 2038970 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 32), xmm2_f);
2421 : _mm_storeu_ps(
2422 2038970 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 48), xmm3_f);
2423 : }
2424 951437 : for (; n < nWordCount; n++)
2425 : {
2426 723248 : pDstData[n] = pSrcData[n];
2427 228189 : }
2428 : }
2429 : else
2430 : {
2431 2622880 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2432 : nDstPixelStride, nWordCount);
2433 : }
2434 2851070 : }
2435 :
2436 : template <>
2437 170938 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2438 : int nSrcPixelStride,
2439 : double *const CPL_RESTRICT pDstData,
2440 : int nDstPixelStride, GPtrDiff_t nWordCount)
2441 : {
2442 170938 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2443 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2444 : {
2445 147140 : decltype(nWordCount) n = 0;
2446 147140 : const __m128i xmm_zero = _mm_setzero_si128();
2447 147140 : GByte *CPL_RESTRICT pabyDstDataPtr =
2448 : reinterpret_cast<GByte *>(pDstData);
2449 3127410 : for (; n < nWordCount - 15; n += 16)
2450 : {
2451 2980270 : __m128i xmm = _mm_loadu_si128(
2452 2980270 : reinterpret_cast<const __m128i *>(pSrcData + n));
2453 2980270 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2454 2980270 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2455 2980270 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2456 2980270 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2457 2980270 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2458 2980270 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2459 :
2460 : #if defined(__AVX2__) && defined(slightly_slower_than_SSE2)
2461 : _mm256_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2462 : _mm256_cvtepi32_pd(xmm0));
2463 : _mm256_storeu_pd(
2464 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2465 : _mm256_cvtepi32_pd(xmm1));
2466 : _mm256_storeu_pd(
2467 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
2468 : _mm256_cvtepi32_pd(xmm2));
2469 : _mm256_storeu_pd(
2470 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
2471 : _mm256_cvtepi32_pd(xmm3));
2472 : #else
2473 2980270 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2474 2980270 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2475 2980270 : __m128d xmm2_low_d = _mm_cvtepi32_pd(xmm2);
2476 2980270 : __m128d xmm3_low_d = _mm_cvtepi32_pd(xmm3);
2477 2980270 : xmm0 = _mm_srli_si128(xmm0, 8);
2478 2980270 : xmm1 = _mm_srli_si128(xmm1, 8);
2479 2980270 : xmm2 = _mm_srli_si128(xmm2, 8);
2480 2980270 : xmm3 = _mm_srli_si128(xmm3, 8);
2481 2980270 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2482 2980270 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2483 2980270 : __m128d xmm2_high_d = _mm_cvtepi32_pd(xmm2);
2484 2980270 : __m128d xmm3_high_d = _mm_cvtepi32_pd(xmm3);
2485 :
2486 2980270 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2487 : xmm0_low_d);
2488 : _mm_storeu_pd(
2489 2980270 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2490 : xmm0_high_d);
2491 : _mm_storeu_pd(
2492 2980270 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2493 : xmm1_low_d);
2494 : _mm_storeu_pd(
2495 2980270 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2496 : xmm1_high_d);
2497 : _mm_storeu_pd(
2498 2980270 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
2499 : xmm2_low_d);
2500 : _mm_storeu_pd(
2501 2980270 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 80),
2502 : xmm2_high_d);
2503 : _mm_storeu_pd(
2504 2980270 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
2505 : xmm3_low_d);
2506 : _mm_storeu_pd(
2507 2980270 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 112),
2508 : xmm3_high_d);
2509 : #endif
2510 : }
2511 280823 : for (; n < nWordCount; n++)
2512 : {
2513 133683 : pDstData[n] = pSrcData[n];
2514 147140 : }
2515 : }
2516 : else
2517 : {
2518 23798 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2519 : nDstPixelStride, nWordCount);
2520 : }
2521 170938 : }
2522 :
2523 : template <>
2524 148 : CPL_NOINLINE void GDALCopyWordsT(const uint8_t *const CPL_RESTRICT pSrcData,
2525 : int nSrcPixelStride,
2526 : int8_t *const CPL_RESTRICT pDstData,
2527 : int nDstPixelStride, GPtrDiff_t nWordCount)
2528 : {
2529 148 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2530 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2531 : {
2532 142 : decltype(nWordCount) n = 0;
2533 142 : const __m128i xmm_127 = _mm_set1_epi8(127);
2534 146 : for (; n < nWordCount - 31; n += 32)
2535 : {
2536 8 : __m128i xmm0 = _mm_loadu_si128(
2537 4 : reinterpret_cast<const __m128i *>(pSrcData + n));
2538 4 : __m128i xmm1 = _mm_loadu_si128(
2539 4 : reinterpret_cast<const __m128i *>(pSrcData + n + 16));
2540 4 : xmm0 = _mm_min_epu8(xmm0, xmm_127);
2541 4 : xmm1 = _mm_min_epu8(xmm1, xmm_127);
2542 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2543 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
2544 : xmm1);
2545 : }
2546 2424 : for (; n < nWordCount; n++)
2547 : {
2548 2282 : pDstData[n] =
2549 2282 : pSrcData[n] >= 127 ? 127 : static_cast<int8_t>(pSrcData[n]);
2550 142 : }
2551 : }
2552 : else
2553 : {
2554 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2555 : nDstPixelStride, nWordCount);
2556 : }
2557 148 : }
2558 :
2559 : template <>
2560 62 : CPL_NOINLINE void GDALCopyWordsT(const int8_t *const CPL_RESTRICT pSrcData,
2561 : int nSrcPixelStride,
2562 : uint8_t *const CPL_RESTRICT pDstData,
2563 : int nDstPixelStride, GPtrDiff_t nWordCount)
2564 : {
2565 62 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2566 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2567 : {
2568 56 : decltype(nWordCount) n = 0;
2569 : #if !(defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS))
2570 56 : const __m128i xmm_INT8_to_UINT8 = _mm_set1_epi8(-128);
2571 : #endif
2572 117 : for (; n < nWordCount - 31; n += 32)
2573 : {
2574 122 : __m128i xmm0 = _mm_loadu_si128(
2575 61 : reinterpret_cast<const __m128i *>(pSrcData + n));
2576 61 : __m128i xmm1 = _mm_loadu_si128(
2577 61 : reinterpret_cast<const __m128i *>(pSrcData + n + 16));
2578 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2579 : xmm0 = _mm_max_epi8(xmm0, _mm_setzero_si128());
2580 : xmm1 = _mm_max_epi8(xmm1, _mm_setzero_si128());
2581 : #else
2582 61 : xmm0 = _mm_add_epi8(xmm0, xmm_INT8_to_UINT8);
2583 61 : xmm1 = _mm_add_epi8(xmm1, xmm_INT8_to_UINT8);
2584 61 : xmm0 = _mm_max_epu8(xmm0, xmm_INT8_to_UINT8);
2585 61 : xmm1 = _mm_max_epu8(xmm1, xmm_INT8_to_UINT8);
2586 61 : xmm0 = _mm_sub_epi8(xmm0, xmm_INT8_to_UINT8);
2587 61 : xmm1 = _mm_sub_epi8(xmm1, xmm_INT8_to_UINT8);
2588 : #endif
2589 61 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2590 61 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
2591 : xmm1);
2592 : }
2593 352 : for (; n < nWordCount; n++)
2594 : {
2595 296 : pDstData[n] =
2596 296 : pSrcData[n] < 0 ? 0 : static_cast<uint8_t>(pSrcData[n]);
2597 56 : }
2598 : }
2599 : else
2600 : {
2601 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2602 : nDstPixelStride, nWordCount);
2603 : }
2604 62 : }
2605 :
2606 : template <>
2607 6037 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2608 : int nSrcPixelStride,
2609 : uint8_t *const CPL_RESTRICT pDstData,
2610 : int nDstPixelStride, GPtrDiff_t nWordCount)
2611 : {
2612 6037 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2613 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2614 : {
2615 5062 : decltype(nWordCount) n = 0;
2616 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2617 : const auto xmm_MAX_INT16 = _mm_set1_epi16(32767);
2618 : #else
2619 : // In SSE2, min_epu16 does not exist, so shift from
2620 : // UInt16 to SInt16 to be able to use min_epi16
2621 5062 : const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
2622 5062 : const __m128i xmm_m255_shifted = _mm_set1_epi16(255 - 32768);
2623 : #endif
2624 71888 : for (; n < nWordCount - 15; n += 16)
2625 : {
2626 133652 : __m128i xmm0 = _mm_loadu_si128(
2627 66826 : reinterpret_cast<const __m128i *>(pSrcData + n));
2628 66826 : __m128i xmm1 = _mm_loadu_si128(
2629 66826 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2630 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2631 : xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
2632 : xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
2633 : #else
2634 66826 : xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
2635 66826 : xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
2636 66826 : xmm0 = _mm_min_epi16(xmm0, xmm_m255_shifted);
2637 66826 : xmm1 = _mm_min_epi16(xmm1, xmm_m255_shifted);
2638 66826 : xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
2639 66826 : xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
2640 : #endif
2641 66826 : xmm0 = _mm_packus_epi16(xmm0, xmm1);
2642 66826 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2643 : }
2644 16403 : for (; n < nWordCount; n++)
2645 : {
2646 11341 : pDstData[n] =
2647 11341 : pSrcData[n] >= 255 ? 255 : static_cast<uint8_t>(pSrcData[n]);
2648 5062 : }
2649 : }
2650 : else
2651 : {
2652 975 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2653 : nDstPixelStride, nWordCount);
2654 : }
2655 6037 : }
2656 :
2657 : template <>
2658 46 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2659 : int nSrcPixelStride,
2660 : int16_t *const CPL_RESTRICT pDstData,
2661 : int nDstPixelStride, GPtrDiff_t nWordCount)
2662 : {
2663 46 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2664 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2665 : {
2666 40 : decltype(nWordCount) n = 0;
2667 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2668 : const __m128i xmm_MAX_INT16 = _mm_set1_epi16(32767);
2669 : #else
2670 : // In SSE2, min_epu16 does not exist, so shift from
2671 : // UInt16 to SInt16 to be able to use min_epi16
2672 40 : const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
2673 40 : const __m128i xmm_32767_shifted = _mm_set1_epi16(32767 - 32768);
2674 : #endif
2675 169 : for (; n < nWordCount - 15; n += 16)
2676 : {
2677 258 : __m128i xmm0 = _mm_loadu_si128(
2678 129 : reinterpret_cast<const __m128i *>(pSrcData + n));
2679 129 : __m128i xmm1 = _mm_loadu_si128(
2680 129 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2681 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2682 : xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
2683 : xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
2684 : #else
2685 129 : xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
2686 129 : xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
2687 129 : xmm0 = _mm_min_epi16(xmm0, xmm_32767_shifted);
2688 129 : xmm1 = _mm_min_epi16(xmm1, xmm_32767_shifted);
2689 129 : xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
2690 129 : xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
2691 : #endif
2692 129 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2693 129 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
2694 : xmm1);
2695 : }
2696 191 : for (; n < nWordCount; n++)
2697 : {
2698 282 : pDstData[n] = pSrcData[n] >= 32767
2699 : ? 32767
2700 131 : : static_cast<int16_t>(pSrcData[n]);
2701 40 : }
2702 : }
2703 : else
2704 : {
2705 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2706 : nDstPixelStride, nWordCount);
2707 : }
2708 46 : }
2709 :
2710 : template <>
2711 136 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2712 : int nSrcPixelStride,
2713 : uint16_t *const CPL_RESTRICT pDstData,
2714 : int nDstPixelStride, GPtrDiff_t nWordCount)
2715 : {
2716 136 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2717 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2718 : {
2719 93 : decltype(nWordCount) n = 0;
2720 93 : const __m128i xmm_zero = _mm_setzero_si128();
2721 278 : for (; n < nWordCount - 15; n += 16)
2722 : {
2723 370 : __m128i xmm0 = _mm_loadu_si128(
2724 185 : reinterpret_cast<const __m128i *>(pSrcData + n));
2725 185 : __m128i xmm1 = _mm_loadu_si128(
2726 185 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2727 185 : xmm0 = _mm_max_epi16(xmm0, xmm_zero);
2728 185 : xmm1 = _mm_max_epi16(xmm1, xmm_zero);
2729 185 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2730 185 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
2731 : xmm1);
2732 : }
2733 471 : for (; n < nWordCount; n++)
2734 : {
2735 378 : pDstData[n] =
2736 378 : pSrcData[n] < 0 ? 0 : static_cast<uint16_t>(pSrcData[n]);
2737 93 : }
2738 : }
2739 : else
2740 : {
2741 43 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2742 : nDstPixelStride, nWordCount);
2743 : }
2744 136 : }
2745 :
2746 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2747 :
2748 : template <>
2749 : CPL_NOINLINE void GDALCopyWordsT(const uint32_t *const CPL_RESTRICT pSrcData,
2750 : int nSrcPixelStride,
2751 : int32_t *const CPL_RESTRICT pDstData,
2752 : int nDstPixelStride, GPtrDiff_t nWordCount)
2753 : {
2754 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2755 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2756 : {
2757 : decltype(nWordCount) n = 0;
2758 : const __m128i xmm_MAX_INT = _mm_set1_epi32(INT_MAX);
2759 : for (; n < nWordCount - 8; n += 7)
2760 : {
2761 : __m128i xmm0 = _mm_loadu_si128(
2762 : reinterpret_cast<const __m128i *>(pSrcData + n));
2763 : __m128i xmm1 = _mm_loadu_si128(
2764 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
2765 : xmm0 = _mm_min_epu32(xmm0, xmm_MAX_INT);
2766 : xmm1 = _mm_min_epu32(xmm1, xmm_MAX_INT);
2767 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2768 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
2769 : xmm1);
2770 : }
2771 : for (; n < nWordCount; n++)
2772 : {
2773 : pDstData[n] = pSrcData[n] >= INT_MAX
2774 : ? INT_MAX
2775 : : static_cast<int32_t>(pSrcData[n]);
2776 : }
2777 : }
2778 : else
2779 : {
2780 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2781 : nDstPixelStride, nWordCount);
2782 : }
2783 : }
2784 :
2785 : template <>
2786 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
2787 : int nSrcPixelStride,
2788 : uint32_t *const CPL_RESTRICT pDstData,
2789 : int nDstPixelStride, GPtrDiff_t nWordCount)
2790 : {
2791 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2792 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2793 : {
2794 : decltype(nWordCount) n = 0;
2795 : const __m128i xmm_zero = _mm_setzero_si128();
2796 : for (; n < nWordCount - 7; n += 8)
2797 : {
2798 : __m128i xmm0 = _mm_loadu_si128(
2799 : reinterpret_cast<const __m128i *>(pSrcData + n));
2800 : __m128i xmm1 = _mm_loadu_si128(
2801 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
2802 : xmm0 = _mm_max_epi32(xmm0, xmm_zero);
2803 : xmm1 = _mm_max_epi32(xmm1, xmm_zero);
2804 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2805 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
2806 : xmm1);
2807 : }
2808 : for (; n < nWordCount; n++)
2809 : {
2810 : pDstData[n] =
2811 : pSrcData[n] < 0 ? 0 : static_cast<uint32_t>(pSrcData[n]);
2812 : }
2813 : }
2814 : else
2815 : {
2816 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2817 : nDstPixelStride, nWordCount);
2818 : }
2819 : }
2820 :
2821 : #endif // defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2822 :
2823 : template <>
2824 403 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2825 : int nSrcPixelStride,
2826 : float *const CPL_RESTRICT pDstData,
2827 : int nDstPixelStride, GPtrDiff_t nWordCount)
2828 : {
2829 403 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2830 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2831 : {
2832 397 : decltype(nWordCount) n = 0;
2833 397 : const __m128i xmm_zero = _mm_setzero_si128();
2834 397 : GByte *CPL_RESTRICT pabyDstDataPtr =
2835 : reinterpret_cast<GByte *>(pDstData);
2836 1688 : for (; n < nWordCount - 7; n += 8)
2837 : {
2838 1291 : __m128i xmm = _mm_loadu_si128(
2839 1291 : reinterpret_cast<const __m128i *>(pSrcData + n));
2840 1291 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
2841 1291 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
2842 1291 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2843 1291 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2844 1291 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2845 : xmm0_f);
2846 : _mm_storeu_ps(
2847 1291 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2848 : }
2849 1415 : for (; n < nWordCount; n++)
2850 : {
2851 1018 : pDstData[n] = pSrcData[n];
2852 397 : }
2853 : }
2854 : else
2855 : {
2856 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2857 : nDstPixelStride, nWordCount);
2858 : }
2859 403 : }
2860 :
2861 : template <>
2862 1076640 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2863 : int nSrcPixelStride,
2864 : float *const CPL_RESTRICT pDstData,
2865 : int nDstPixelStride, GPtrDiff_t nWordCount)
2866 : {
2867 1076640 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2868 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2869 : {
2870 86742 : decltype(nWordCount) n = 0;
2871 86742 : GByte *CPL_RESTRICT pabyDstDataPtr =
2872 : reinterpret_cast<GByte *>(pDstData);
2873 586119 : for (; n < nWordCount - 7; n += 8)
2874 : {
2875 499377 : __m128i xmm = _mm_loadu_si128(
2876 499377 : reinterpret_cast<const __m128i *>(pSrcData + n));
2877 499377 : const auto sign = _mm_srai_epi16(xmm, 15);
2878 499377 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
2879 499377 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
2880 499377 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2881 499377 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2882 499377 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2883 : xmm0_f);
2884 : _mm_storeu_ps(
2885 499377 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2886 : }
2887 253882 : for (; n < nWordCount; n++)
2888 : {
2889 167140 : pDstData[n] = pSrcData[n];
2890 86742 : }
2891 : }
2892 : else
2893 : {
2894 989901 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2895 : nDstPixelStride, nWordCount);
2896 : }
2897 1076640 : }
2898 :
2899 : template <>
2900 449 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2901 : int nSrcPixelStride,
2902 : double *const CPL_RESTRICT pDstData,
2903 : int nDstPixelStride, GPtrDiff_t nWordCount)
2904 : {
2905 449 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2906 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2907 : {
2908 313 : decltype(nWordCount) n = 0;
2909 313 : const __m128i xmm_zero = _mm_setzero_si128();
2910 313 : GByte *CPL_RESTRICT pabyDstDataPtr =
2911 : reinterpret_cast<GByte *>(pDstData);
2912 829 : for (; n < nWordCount - 7; n += 8)
2913 : {
2914 516 : __m128i xmm = _mm_loadu_si128(
2915 516 : reinterpret_cast<const __m128i *>(pSrcData + n));
2916 516 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
2917 516 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
2918 :
2919 516 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2920 516 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2921 516 : xmm0 = _mm_srli_si128(xmm0, 8);
2922 516 : xmm1 = _mm_srli_si128(xmm1, 8);
2923 516 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2924 516 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2925 :
2926 516 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2927 : xmm0_low_d);
2928 : _mm_storeu_pd(
2929 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2930 : xmm0_high_d);
2931 : _mm_storeu_pd(
2932 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2933 : xmm1_low_d);
2934 : _mm_storeu_pd(
2935 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2936 : xmm1_high_d);
2937 : }
2938 1082 : for (; n < nWordCount; n++)
2939 : {
2940 769 : pDstData[n] = pSrcData[n];
2941 313 : }
2942 : }
2943 : else
2944 : {
2945 136 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2946 : nDstPixelStride, nWordCount);
2947 : }
2948 449 : }
2949 :
2950 : template <>
2951 4923280 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2952 : int nSrcPixelStride,
2953 : double *const CPL_RESTRICT pDstData,
2954 : int nDstPixelStride, GPtrDiff_t nWordCount)
2955 : {
2956 4923280 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2957 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2958 : {
2959 34874 : decltype(nWordCount) n = 0;
2960 34874 : GByte *CPL_RESTRICT pabyDstDataPtr =
2961 : reinterpret_cast<GByte *>(pDstData);
2962 403828 : for (; n < nWordCount - 7; n += 8)
2963 : {
2964 368954 : __m128i xmm = _mm_loadu_si128(
2965 368954 : reinterpret_cast<const __m128i *>(pSrcData + n));
2966 368954 : const auto sign = _mm_srai_epi16(xmm, 15);
2967 368954 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
2968 368954 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
2969 :
2970 368954 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2971 368954 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2972 368954 : xmm0 = _mm_srli_si128(xmm0, 8);
2973 368954 : xmm1 = _mm_srli_si128(xmm1, 8);
2974 368954 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2975 368954 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2976 :
2977 368954 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2978 : xmm0_low_d);
2979 : _mm_storeu_pd(
2980 368954 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2981 : xmm0_high_d);
2982 : _mm_storeu_pd(
2983 368954 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2984 : xmm1_low_d);
2985 : _mm_storeu_pd(
2986 368954 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2987 : xmm1_high_d);
2988 : }
2989 255934 : for (; n < nWordCount; n++)
2990 : {
2991 221060 : pDstData[n] = pSrcData[n];
2992 34874 : }
2993 : }
2994 : else
2995 : {
2996 4888400 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2997 : nDstPixelStride, nWordCount);
2998 : }
2999 4923280 : }
3000 :
3001 : // ---- AVX2 helpers for int32 narrowing (runtime dispatch) ----
3002 :
3003 : #if defined(HAVE_AVX2_DISPATCH)
3004 : #if !defined(_MSC_VER)
3005 : __attribute__((target("avx2")))
3006 : #endif
3007 12723 : static void GDALCopyWordsInt32ToUInt8_AVX2(const int32_t *CPL_RESTRICT pSrc,
3008 : uint8_t *CPL_RESTRICT pDst,
3009 : GPtrDiff_t nWordCount)
3010 : {
3011 12723 : const __m256i permuteIdx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
3012 12723 : GPtrDiff_t n = 0;
3013 958119 : for (; n < nWordCount - 31; n += 32)
3014 : {
3015 : __m256i v0 =
3016 945396 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n));
3017 : __m256i v1 =
3018 945396 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n + 8));
3019 945396 : __m256i v2 = _mm256_loadu_si256(
3020 945396 : reinterpret_cast<const __m256i *>(pSrc + n + 16));
3021 945396 : __m256i v3 = _mm256_loadu_si256(
3022 945396 : reinterpret_cast<const __m256i *>(pSrc + n + 24));
3023 : // Clamp to [0, 255]
3024 : // Pack int32 -> int16 -> uint8, then fix cross-lane ordering
3025 945396 : __m256i ab16 = _mm256_packs_epi32(v0, v1);
3026 945396 : __m256i cd16 = _mm256_packs_epi32(v2, v3);
3027 945396 : __m256i bytes = _mm256_packus_epi16(ab16, cd16);
3028 945396 : bytes = _mm256_permutevar8x32_epi32(bytes, permuteIdx);
3029 945396 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(pDst + n), bytes);
3030 : }
3031 68589 : for (; n < nWordCount; n++)
3032 : {
3033 70955 : pDst[n] = pSrc[n] <= 0 ? 0
3034 15089 : : pSrc[n] >= 255 ? 255
3035 1075 : : static_cast<uint8_t>(pSrc[n]);
3036 : }
3037 12723 : }
3038 :
3039 : #if !defined(_MSC_VER)
3040 : __attribute__((target("avx2")))
3041 : #endif
3042 10277 : static void GDALCopyWordsInt32ToUInt16_AVX2(const int32_t *CPL_RESTRICT pSrc,
3043 : uint16_t *CPL_RESTRICT pDst,
3044 : GPtrDiff_t nWordCount)
3045 : {
3046 : // _mm256_packus_epi32(v0, v1) produces per-lane interleaved result:
3047 : // [v0_lo4, v1_lo4, v0_hi4, v1_hi4] (in uint16 pairs per 32-bit lane)
3048 : // Permute to deinterleave: all v0 values first, then all v1 values
3049 10277 : const __m256i permuteIdx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
3050 10277 : GPtrDiff_t n = 0;
3051 670572 : for (; n < nWordCount - 15; n += 16)
3052 : {
3053 : __m256i v0 =
3054 660295 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n));
3055 : __m256i v1 =
3056 1320590 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n + 8));
3057 : // Clamp to [0, 65535]: _mm256_packus_epi32 saturates uint
3058 660295 : __m256i packed = _mm256_packus_epi32(v0, v1);
3059 : // Fix cross-lane interleave from packus
3060 660295 : packed = _mm256_permutevar8x32_epi32(packed, permuteIdx);
3061 660295 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(pDst + n), packed);
3062 : }
3063 163928 : for (; n < nWordCount; n++)
3064 : {
3065 307282 : pDst[n] = pSrc[n] <= 0 ? 0
3066 153631 : : pSrc[n] >= 65535 ? 65535
3067 153599 : : static_cast<uint16_t>(pSrc[n]);
3068 : }
3069 10277 : }
3070 : #endif // HAVE_AVX2_DISPATCH
3071 :
3072 : // ---- int32 -> uint8 with clamping to [0, 255] ----
3073 : template <>
3074 12837 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
3075 : int nSrcPixelStride,
3076 : uint8_t *const CPL_RESTRICT pDstData,
3077 : int nDstPixelStride, GPtrDiff_t nWordCount)
3078 : {
3079 12837 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
3080 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
3081 : {
3082 : #if defined(HAVE_AVX2_DISPATCH)
3083 12723 : if (CPLHaveRuntimeAVX2())
3084 : {
3085 12723 : GDALCopyWordsInt32ToUInt8_AVX2(pSrcData, pDstData, nWordCount);
3086 12723 : return;
3087 : }
3088 : #endif
3089 : #ifdef HAVE_SSE2
3090 : // SSE2 path: 16 pixels per iteration
3091 0 : decltype(nWordCount) n = 0;
3092 0 : for (; n < nWordCount - 15; n += 16)
3093 : {
3094 0 : __m128i v0 = _mm_loadu_si128(
3095 0 : reinterpret_cast<const __m128i *>(pSrcData + n));
3096 0 : __m128i v1 = _mm_loadu_si128(
3097 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
3098 0 : __m128i v2 = _mm_loadu_si128(
3099 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
3100 0 : __m128i v3 = _mm_loadu_si128(
3101 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 12));
3102 : // Pack int32->int16 with signed saturation to [-32768,32767] range
3103 0 : __m128i lo16 = _mm_packs_epi32(v0, v1);
3104 0 : __m128i hi16 = _mm_packs_epi32(v2, v3);
3105 : // Pack int16->uint8 with unsigned saturation to [0,255] range
3106 0 : __m128i bytes = _mm_packus_epi16(lo16, hi16);
3107 0 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), bytes);
3108 : }
3109 0 : for (; n < nWordCount; n++)
3110 : #else
3111 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
3112 : #endif
3113 : {
3114 0 : pDstData[n] = pSrcData[n] <= 0 ? 0
3115 0 : : pSrcData[n] >= 255
3116 : ? 255
3117 0 : : static_cast<uint8_t>(pSrcData[n]);
3118 0 : }
3119 : }
3120 : else
3121 : {
3122 114 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
3123 : nDstPixelStride, nWordCount);
3124 : }
3125 : }
3126 :
3127 : // ---- int32 -> uint16 with clamping to [0, 65535] ----
3128 : template <>
3129 10322 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
3130 : int nSrcPixelStride,
3131 : uint16_t *const CPL_RESTRICT pDstData,
3132 : int nDstPixelStride, GPtrDiff_t nWordCount)
3133 : {
3134 10322 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
3135 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
3136 : {
3137 : #if defined(HAVE_AVX2_DISPATCH)
3138 10277 : if (CPLHaveRuntimeAVX2())
3139 : {
3140 10277 : GDALCopyWordsInt32ToUInt16_AVX2(pSrcData, pDstData, nWordCount);
3141 10277 : return;
3142 : }
3143 : #endif
3144 0 : decltype(nWordCount) n = 0;
3145 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
3146 : // SSE4.1: _mm_packus_epi32 directly handles uint saturation
3147 : for (; n < nWordCount - 7; n += 8)
3148 : {
3149 : __m128i v0 = _mm_loadu_si128(
3150 : reinterpret_cast<const __m128i *>(pSrcData + n));
3151 : __m128i v1 = _mm_loadu_si128(
3152 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
3153 : __m128i packed = _mm_packus_epi32(v0, v1);
3154 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), packed);
3155 : }
3156 : #else
3157 : // SSE2: clamp to [0, 65535], bias to signed range, pack, unbias
3158 0 : const __m128i xmm_65535 = _mm_set1_epi32(65535);
3159 0 : const __m128i xmm_bias32 = _mm_set1_epi32(32768);
3160 0 : const __m128i xmm_bias16 = _mm_set1_epi16(-32768);
3161 0 : for (; n < nWordCount - 7; n += 8)
3162 : {
3163 0 : __m128i v0 = _mm_loadu_si128(
3164 0 : reinterpret_cast<const __m128i *>(pSrcData + n));
3165 0 : __m128i v1 = _mm_loadu_si128(
3166 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
3167 : // max(v, 0)
3168 0 : v0 = _mm_andnot_si128(_mm_srai_epi32(v0, 31), v0);
3169 0 : v1 = _mm_andnot_si128(_mm_srai_epi32(v1, 31), v1);
3170 : // min(v, 65535)
3171 0 : __m128i gt0 = _mm_cmpgt_epi32(v0, xmm_65535);
3172 0 : __m128i gt1 = _mm_cmpgt_epi32(v1, xmm_65535);
3173 0 : v0 = _mm_or_si128(_mm_andnot_si128(gt0, v0),
3174 : _mm_and_si128(gt0, xmm_65535));
3175 0 : v1 = _mm_or_si128(_mm_andnot_si128(gt1, v1),
3176 : _mm_and_si128(gt1, xmm_65535));
3177 : // Shift [0, 65535] -> [-32768, 32767] for _mm_packs_epi32
3178 0 : v0 = _mm_sub_epi32(v0, xmm_bias32);
3179 0 : v1 = _mm_sub_epi32(v1, xmm_bias32);
3180 0 : __m128i packed = _mm_packs_epi32(v0, v1);
3181 : // Shift back: sub_epi16(x, -32768) == add 32768 (mod 2^16)
3182 0 : packed = _mm_sub_epi16(packed, xmm_bias16);
3183 0 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), packed);
3184 : }
3185 : #endif
3186 0 : for (; n < nWordCount; n++)
3187 : {
3188 0 : pDstData[n] = pSrcData[n] <= 0 ? 0
3189 0 : : pSrcData[n] >= 65535
3190 : ? 65535
3191 0 : : static_cast<uint16_t>(pSrcData[n]);
3192 0 : }
3193 : }
3194 : else
3195 : {
3196 45 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
3197 : nDstPixelStride, nWordCount);
3198 : }
3199 : }
3200 :
3201 : #endif // HAVE_SSE2
3202 :
3203 : template <>
3204 4426980 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3205 : int nSrcPixelStride,
3206 : GByte *const CPL_RESTRICT pDstData,
3207 : int nDstPixelStride, GPtrDiff_t nWordCount)
3208 : {
3209 4426980 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3210 : nDstPixelStride, nWordCount);
3211 4426980 : }
3212 :
3213 : template <>
3214 38387 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3215 : int nSrcPixelStride,
3216 : GUInt16 *const CPL_RESTRICT pDstData,
3217 : int nDstPixelStride, GPtrDiff_t nWordCount)
3218 : {
3219 38387 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3220 : nDstPixelStride, nWordCount);
3221 38387 : }
3222 :
3223 : template <>
3224 55671 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3225 : int nSrcPixelStride,
3226 : double *const CPL_RESTRICT pDstData,
3227 : int nDstPixelStride, GPtrDiff_t nWordCount)
3228 : {
3229 55671 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3230 : nDstPixelStride, nWordCount);
3231 55671 : }
3232 :
3233 : template <>
3234 122846 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3235 : int nSrcPixelStride,
3236 : float *const CPL_RESTRICT pDstData,
3237 : int nDstPixelStride, GPtrDiff_t nWordCount)
3238 : {
3239 122846 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3240 : nDstPixelStride, nWordCount);
3241 122846 : }
3242 :
3243 : template <>
3244 412 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
3245 : int nSrcPixelStride,
3246 : float *const CPL_RESTRICT pDstData,
3247 : int nDstPixelStride, GPtrDiff_t nWordCount)
3248 : {
3249 412 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3250 : nDstPixelStride, nWordCount);
3251 412 : }
3252 :
3253 : template <>
3254 544 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
3255 : int nSrcPixelStride,
3256 : double *const CPL_RESTRICT pDstData,
3257 : int nDstPixelStride, GPtrDiff_t nWordCount)
3258 : {
3259 544 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3260 : nDstPixelStride, nWordCount);
3261 544 : }
3262 :
3263 : template <>
3264 314423 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3265 : int nSrcPixelStride,
3266 : GByte *const CPL_RESTRICT pDstData,
3267 : int nDstPixelStride, GPtrDiff_t nWordCount)
3268 : {
3269 314423 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3270 : nDstPixelStride, nWordCount);
3271 314423 : }
3272 :
3273 : template <>
3274 55 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3275 : int nSrcPixelStride,
3276 : GInt8 *const CPL_RESTRICT pDstData,
3277 : int nDstPixelStride, GPtrDiff_t nWordCount)
3278 : {
3279 55 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3280 : nDstPixelStride, nWordCount);
3281 55 : }
3282 :
3283 : template <>
3284 15785 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3285 : int nSrcPixelStride,
3286 : GInt16 *const CPL_RESTRICT pDstData,
3287 : int nDstPixelStride, GPtrDiff_t nWordCount)
3288 : {
3289 15785 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3290 : nDstPixelStride, nWordCount);
3291 15785 : }
3292 :
3293 : template <>
3294 61713 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3295 : int nSrcPixelStride,
3296 : GUInt16 *const CPL_RESTRICT pDstData,
3297 : int nDstPixelStride, GPtrDiff_t nWordCount)
3298 : {
3299 61713 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3300 : nDstPixelStride, nWordCount);
3301 61713 : }
3302 :
3303 : template <>
3304 43985 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3305 : int nSrcPixelStride,
3306 : GInt32 *const CPL_RESTRICT pDstData,
3307 : int nDstPixelStride, GPtrDiff_t nWordCount)
3308 : {
3309 43985 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3310 : nDstPixelStride, nWordCount);
3311 43985 : }
3312 :
3313 : template <>
3314 72 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3315 : int nSrcPixelStride,
3316 : GFloat16 *const CPL_RESTRICT pDstData,
3317 : int nDstPixelStride, GPtrDiff_t nWordCount)
3318 : {
3319 72 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3320 : nDstPixelStride, nWordCount);
3321 72 : }
3322 :
3323 : template <>
3324 63 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3325 : int nSrcPixelStride,
3326 : GFloat16 *const CPL_RESTRICT pDstData,
3327 : int nDstPixelStride, GPtrDiff_t nWordCount)
3328 : {
3329 63 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3330 : nDstPixelStride, nWordCount);
3331 63 : }
3332 :
3333 : /************************************************************************/
3334 : /* GDALCopyWordsComplexT() */
3335 : /************************************************************************/
3336 : /**
3337 : * Template function, used to copy data from pSrcData into buffer
3338 : * pDstData, with stride nSrcPixelStride in the source data and
3339 : * stride nDstPixelStride in the destination data. Deals with the
3340 : * complex case, where input is complex and output is complex.
3341 : *
3342 : * @param pSrcData the source data buffer
3343 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
3344 : * of interest.
3345 : * @param pDstData the destination buffer.
3346 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
3347 : * interest.
3348 : * @param nWordCount the total number of pixel words to copy
3349 : *
3350 : */
3351 : template <class Tin, class Tout>
3352 98788 : inline void GDALCopyWordsComplexT(const Tin *const CPL_RESTRICT pSrcData,
3353 : int nSrcPixelStride,
3354 : Tout *const CPL_RESTRICT pDstData,
3355 : int nDstPixelStride, GPtrDiff_t nWordCount)
3356 : {
3357 98788 : decltype(nWordCount) nDstOffset = 0;
3358 98788 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
3359 98788 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
3360 :
3361 5631239 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
3362 : {
3363 5532446 : const Tin *const pPixelIn =
3364 5532446 : reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
3365 5532446 : Tout *const pPixelOut =
3366 5532446 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
3367 :
3368 5532446 : GDALCopyWord(pPixelIn[0], pPixelOut[0]);
3369 5532446 : GDALCopyWord(pPixelIn[1], pPixelOut[1]);
3370 :
3371 5532446 : nDstOffset += nDstPixelStride;
3372 : }
3373 98788 : }
3374 :
3375 : /************************************************************************/
3376 : /* GDALCopyWordsComplexOutT() */
3377 : /************************************************************************/
3378 : /**
3379 : * Template function, used to copy data from pSrcData into buffer
3380 : * pDstData, with stride nSrcPixelStride in the source data and
3381 : * stride nDstPixelStride in the destination data. Deals with the
3382 : * case where the value is real coming in, but complex going out.
3383 : *
3384 : * @param pSrcData the source data buffer
3385 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
3386 : * of interest, in bytes.
3387 : * @param pDstData the destination buffer.
3388 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
3389 : * interest, in bytes.
3390 : * @param nWordCount the total number of pixel words to copy
3391 : *
3392 : */
3393 : template <class Tin, class Tout>
3394 4762 : inline void GDALCopyWordsComplexOutT(const Tin *const CPL_RESTRICT pSrcData,
3395 : int nSrcPixelStride,
3396 : Tout *const CPL_RESTRICT pDstData,
3397 : int nDstPixelStride, GPtrDiff_t nWordCount)
3398 : {
3399 4762 : decltype(nWordCount) nDstOffset = 0;
3400 :
3401 4762 : const Tout tOutZero = static_cast<Tout>(0);
3402 :
3403 4762 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
3404 4762 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
3405 :
3406 1190408 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
3407 : {
3408 1185646 : const Tin tValue =
3409 1185646 : *reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
3410 1185646 : Tout *const pPixelOut =
3411 1185646 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
3412 1185646 : GDALCopyWord(tValue, *pPixelOut);
3413 :
3414 1185646 : pPixelOut[1] = tOutZero;
3415 :
3416 1185646 : nDstOffset += nDstPixelStride;
3417 : }
3418 4762 : }
3419 :
3420 : /************************************************************************/
3421 : /* GDALCopyWordsFromT() */
3422 : /************************************************************************/
3423 : /**
3424 : * Template driver function. Given the input type T, call the appropriate
3425 : * GDALCopyWordsT function template for the desired output type. You should
3426 : * never call this function directly (call GDALCopyWords instead).
3427 : *
3428 : * @param pSrcData source data buffer
3429 : * @param nSrcPixelStride pixel stride in input buffer, in pixel words
3430 : * @param bInComplex input is complex
3431 : * @param pDstData destination data buffer
3432 : * @param eDstType destination data type
3433 : * @param nDstPixelStride pixel stride in output buffer, in pixel words
3434 : * @param nWordCount number of pixel words to be copied
3435 : */
3436 : template <class T>
3437 61292925 : inline void GDALCopyWordsFromT(const T *const CPL_RESTRICT pSrcData,
3438 : int nSrcPixelStride, bool bInComplex,
3439 : void *CPL_RESTRICT pDstData,
3440 : GDALDataType eDstType, int nDstPixelStride,
3441 : GPtrDiff_t nWordCount)
3442 : {
3443 61292925 : switch (eDstType)
3444 : {
3445 4785549 : case GDT_UInt8:
3446 4785549 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3447 : static_cast<unsigned char *>(pDstData),
3448 : nDstPixelStride, nWordCount);
3449 4785549 : break;
3450 1891 : case GDT_Int8:
3451 1891 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3452 : static_cast<signed char *>(pDstData),
3453 : nDstPixelStride, nWordCount);
3454 1891 : break;
3455 1143544 : case GDT_UInt16:
3456 1143544 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3457 : static_cast<unsigned short *>(pDstData),
3458 : nDstPixelStride, nWordCount);
3459 1143544 : break;
3460 4162728 : case GDT_Int16:
3461 4162728 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3462 : static_cast<short *>(pDstData), nDstPixelStride,
3463 : nWordCount);
3464 4162728 : break;
3465 23084 : case GDT_UInt32:
3466 23084 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3467 : static_cast<unsigned int *>(pDstData),
3468 : nDstPixelStride, nWordCount);
3469 23084 : break;
3470 29460249 : case GDT_Int32:
3471 29460249 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3472 : static_cast<int *>(pDstData), nDstPixelStride,
3473 : nWordCount);
3474 29460249 : break;
3475 1250 : case GDT_UInt64:
3476 1250 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3477 : static_cast<std::uint64_t *>(pDstData),
3478 : nDstPixelStride, nWordCount);
3479 1250 : break;
3480 5957 : case GDT_Int64:
3481 5957 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3482 : static_cast<std::int64_t *>(pDstData),
3483 : nDstPixelStride, nWordCount);
3484 5957 : break;
3485 999 : case GDT_Float16:
3486 999 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3487 : static_cast<GFloat16 *>(pDstData), nDstPixelStride,
3488 : nWordCount);
3489 999 : break;
3490 4216051 : case GDT_Float32:
3491 4216051 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3492 : static_cast<float *>(pDstData), nDstPixelStride,
3493 : nWordCount);
3494 4216051 : break;
3495 17387964 : case GDT_Float64:
3496 17387964 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3497 : static_cast<double *>(pDstData), nDstPixelStride,
3498 : nWordCount);
3499 17387964 : break;
3500 94424 : case GDT_CInt16:
3501 94424 : if (bInComplex)
3502 : {
3503 93170 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3504 : static_cast<short *>(pDstData),
3505 : nDstPixelStride, nWordCount);
3506 : }
3507 : else // input is not complex, so we need to promote to a complex
3508 : // buffer
3509 : {
3510 1254 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3511 : static_cast<short *>(pDstData),
3512 : nDstPixelStride, nWordCount);
3513 : }
3514 94424 : break;
3515 1349 : case GDT_CInt32:
3516 1349 : if (bInComplex)
3517 : {
3518 717 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3519 : static_cast<int *>(pDstData),
3520 : nDstPixelStride, nWordCount);
3521 : }
3522 : else // input is not complex, so we need to promote to a complex
3523 : // buffer
3524 : {
3525 632 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3526 : static_cast<int *>(pDstData),
3527 : nDstPixelStride, nWordCount);
3528 : }
3529 1349 : break;
3530 313 : case GDT_CFloat16:
3531 313 : if (bInComplex)
3532 : {
3533 48 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3534 : static_cast<GFloat16 *>(pDstData),
3535 : nDstPixelStride, nWordCount);
3536 : }
3537 : else // input is not complex, so we need to promote to a complex
3538 : // buffer
3539 : {
3540 265 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3541 : static_cast<GFloat16 *>(pDstData),
3542 : nDstPixelStride, nWordCount);
3543 : }
3544 313 : break;
3545 3924 : case GDT_CFloat32:
3546 3924 : if (bInComplex)
3547 : {
3548 3115 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3549 : static_cast<float *>(pDstData),
3550 : nDstPixelStride, nWordCount);
3551 : }
3552 : else // input is not complex, so we need to promote to a complex
3553 : // buffer
3554 : {
3555 809 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3556 : static_cast<float *>(pDstData),
3557 : nDstPixelStride, nWordCount);
3558 : }
3559 3924 : break;
3560 3540 : case GDT_CFloat64:
3561 3540 : if (bInComplex)
3562 : {
3563 1738 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3564 : static_cast<double *>(pDstData),
3565 : nDstPixelStride, nWordCount);
3566 : }
3567 : else // input is not complex, so we need to promote to a complex
3568 : // buffer
3569 : {
3570 1802 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3571 : static_cast<double *>(pDstData),
3572 : nDstPixelStride, nWordCount);
3573 : }
3574 3540 : break;
3575 0 : case GDT_Unknown:
3576 : case GDT_TypeCount:
3577 0 : CPLAssert(false);
3578 : }
3579 61292925 : }
3580 :
3581 : } // end anonymous namespace
3582 :
3583 : /************************************************************************/
3584 : /* GDALReplicateWord() */
3585 : /************************************************************************/
3586 :
3587 : template <class T>
3588 600405 : inline void GDALReplicateWordT(void *pDstData, int nDstPixelStride,
3589 : GPtrDiff_t nWordCount)
3590 : {
3591 600405 : const T valSet = *static_cast<const T *>(pDstData);
3592 600405 : if (nDstPixelStride == static_cast<int>(sizeof(T)))
3593 : {
3594 570592 : T *pDstPtr = static_cast<T *>(pDstData) + 1;
3595 31990099 : while (nWordCount >= 4)
3596 : {
3597 31419540 : nWordCount -= 4;
3598 31419540 : pDstPtr[0] = valSet;
3599 31419540 : pDstPtr[1] = valSet;
3600 31419540 : pDstPtr[2] = valSet;
3601 31419540 : pDstPtr[3] = valSet;
3602 31419540 : pDstPtr += 4;
3603 : }
3604 1476627 : while (nWordCount > 0)
3605 : {
3606 906035 : --nWordCount;
3607 906035 : *pDstPtr = valSet;
3608 906035 : pDstPtr++;
3609 : }
3610 : }
3611 : else
3612 : {
3613 29813 : GByte *pabyDstPtr = static_cast<GByte *>(pDstData) + nDstPixelStride;
3614 1040984 : while (nWordCount > 0)
3615 : {
3616 1011171 : --nWordCount;
3617 1011171 : *reinterpret_cast<T *>(pabyDstPtr) = valSet;
3618 1011171 : pabyDstPtr += nDstPixelStride;
3619 : }
3620 : }
3621 600405 : }
3622 :
// Fills a destination buffer with nWordCount copies of a single source word.
//
// The source word is first converted to eDstType by GDALCopyWords64() and
// stored in the first destination slot; that converted value is then
// duplicated into the nWordCount - 1 following slots, which are
// nDstPixelStride bytes apart.
static void GDALReplicateWord(const void *CPL_RESTRICT pSrcData,
                              GDALDataType eSrcType,
                              void *CPL_RESTRICT pDstData,
                              GDALDataType eDstType, int nDstPixelStride,
                              GPtrDiff_t nWordCount)
{
    /* -------------------------------------------------------------------- */
    /* Special case when the source data is always the same value           */
    /* (for VRTSourcedRasterBand::IRasterIO and                             */
    /* VRTDerivedRasterBand::IRasterIO for example)                         */
    /* -------------------------------------------------------------------- */
    // Let the general translation case do the necessary conversions
    // on the first destination element.
    GDALCopyWords64(pSrcData, eSrcType, 0, pDstData, eDstType, 0, 1);

    // Now copy the first element to the nWordCount - 1 following destination
    // elements.
    nWordCount--;
    // Address of the second destination slot, i.e. the first one to fill.
    GByte *pabyDstWord = reinterpret_cast<GByte *>(pDstData) + nDstPixelStride;

    switch (eDstType)
    {
        case GDT_UInt8:
        case GDT_Int8:
        {
            if (nDstPixelStride == 1)
            {
                // Packed bytes: a single memset() does the job.
                if (nWordCount > 0)
                    memset(pabyDstWord,
                           *reinterpret_cast<const GByte *>(pDstData),
                           nWordCount);
            }
            else
            {
                // Strided bytes: write one byte per slot.
                GByte valSet = *reinterpret_cast<const GByte *>(pDstData);
                while (nWordCount > 0)
                {
                    --nWordCount;
                    *pabyDstWord = valSet;
                    pabyDstWord += nDstPixelStride;
                }
            }
            break;
        }

// Non-complex types wider than one byte: delegate to GDALReplicateWordT<>,
// instantiated with the C type matching the GDAL data type.
#define CASE_DUPLICATE_SIMPLE(enum_type, c_type)                               \
    case enum_type:                                                            \
    {                                                                          \
        GDALReplicateWordT<c_type>(pDstData, nDstPixelStride, nWordCount);     \
        break;                                                                 \
    }

            CASE_DUPLICATE_SIMPLE(GDT_UInt16, GUInt16)
            CASE_DUPLICATE_SIMPLE(GDT_Int16, GInt16)
            CASE_DUPLICATE_SIMPLE(GDT_UInt32, GUInt32)
            CASE_DUPLICATE_SIMPLE(GDT_Int32, GInt32)
            CASE_DUPLICATE_SIMPLE(GDT_UInt64, std::uint64_t)
            CASE_DUPLICATE_SIMPLE(GDT_Int64, std::int64_t)
            CASE_DUPLICATE_SIMPLE(GDT_Float16, GFloat16)
            CASE_DUPLICATE_SIMPLE(GDT_Float32, float)
            CASE_DUPLICATE_SIMPLE(GDT_Float64, double)

// Complex types: each slot holds two consecutive c_type components, both of
// which are duplicated from the first destination slot.
#define CASE_DUPLICATE_COMPLEX(enum_type, c_type)                              \
    case enum_type:                                                            \
    {                                                                          \
        c_type valSet1 = reinterpret_cast<const c_type *>(pDstData)[0];        \
        c_type valSet2 = reinterpret_cast<const c_type *>(pDstData)[1];        \
        while (nWordCount > 0)                                                 \
        {                                                                      \
            --nWordCount;                                                      \
            reinterpret_cast<c_type *>(pabyDstWord)[0] = valSet1;              \
            reinterpret_cast<c_type *>(pabyDstWord)[1] = valSet2;              \
            pabyDstWord += nDstPixelStride;                                    \
        }                                                                      \
        break;                                                                 \
    }

            CASE_DUPLICATE_COMPLEX(GDT_CInt16, GInt16)
            CASE_DUPLICATE_COMPLEX(GDT_CInt32, GInt32)
            CASE_DUPLICATE_COMPLEX(GDT_CFloat16, GFloat16)
            CASE_DUPLICATE_COMPLEX(GDT_CFloat32, float)
            CASE_DUPLICATE_COMPLEX(GDT_CFloat64, double)

        case GDT_Unknown:
        case GDT_TypeCount:
            CPLAssert(false);
    }
}
3714 :
3715 : /************************************************************************/
3716 : /* GDALUnrolledCopy() */
3717 : /************************************************************************/
3718 :
// Copies nIters elements of type T from pSrc to pDest, where consecutive
// elements are srcStride (resp. dstStride) elements of T apart. Both strides
// are compile-time constants.
//
// When not built with GCC and __AVX2__, the loop is manually unrolled 16
// times and the remaining (nIters % 16) elements are handled by the final
// loop. Otherwise only the final loop is compiled, with the tree-vectorize
// optimize attribute and a "GCC unroll 4" pragma applied.
template <class T, int srcStride, int dstStride>
#if defined(__GNUC__) && defined(__AVX2__)
__attribute__((optimize("tree-vectorize")))
#endif
static inline void GDALUnrolledCopyGeneric(T *CPL_RESTRICT pDest,
                                           const T *CPL_RESTRICT pSrc,
                                           GPtrDiff_t nIters)
{
#if !(defined(__GNUC__) && defined(__AVX2__))
    if (nIters >= 16)
    {
        // Main part: groups of 16 elements per iteration.
        for (GPtrDiff_t i = nIters / 16; i != 0; i--)
        {
            pDest[0 * dstStride] = pSrc[0 * srcStride];
            pDest[1 * dstStride] = pSrc[1 * srcStride];
            pDest[2 * dstStride] = pSrc[2 * srcStride];
            pDest[3 * dstStride] = pSrc[3 * srcStride];
            pDest[4 * dstStride] = pSrc[4 * srcStride];
            pDest[5 * dstStride] = pSrc[5 * srcStride];
            pDest[6 * dstStride] = pSrc[6 * srcStride];
            pDest[7 * dstStride] = pSrc[7 * srcStride];
            pDest[8 * dstStride] = pSrc[8 * srcStride];
            pDest[9 * dstStride] = pSrc[9 * srcStride];
            pDest[10 * dstStride] = pSrc[10 * srcStride];
            pDest[11 * dstStride] = pSrc[11 * srcStride];
            pDest[12 * dstStride] = pSrc[12 * srcStride];
            pDest[13 * dstStride] = pSrc[13 * srcStride];
            pDest[14 * dstStride] = pSrc[14 * srcStride];
            pDest[15 * dstStride] = pSrc[15 * srcStride];
            pDest += 16 * dstStride;
            pSrc += 16 * srcStride;
        }
        // Elements left over for the loop below.
        nIters = nIters % 16;
    }
#else
#pragma GCC unroll 4
#endif
    // Remainder loop (or the whole copy in the GCC + AVX2 build).
    for (GPtrDiff_t i = 0; i < nIters; i++)
    {
        pDest[i * dstStride] = *pSrc;
        pSrc += srcStride;
    }
}
3762 :
// Primary template of GDALUnrolledCopy(): forwards to
// GDALUnrolledCopyGeneric() with the same template and call arguments.
// Explicit specializations for GByte follow in this file.
template <class T, int srcStride, int dstStride>
static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
                                    const T *CPL_RESTRICT pSrc,
                                    GPtrDiff_t nIters)
{
    GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
}
3770 :
3771 : #if defined(__AVX2__) && defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
3772 : (defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS))
3773 :
3774 : template <>
3775 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
3776 : const GByte *CPL_RESTRICT pSrc,
3777 : GPtrDiff_t nIters)
3778 : {
3779 : if (nIters > 16)
3780 : {
3781 : // The SSSE3 variant is slightly faster than what the gcc autovectorizer
3782 : // generates
3783 : GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
3784 : }
3785 : else
3786 : {
3787 : for (GPtrDiff_t i = 0; i < nIters; i++)
3788 : {
3789 : pDest[i] = *pSrc;
3790 : pSrc += 3;
3791 : }
3792 : }
3793 : }
3794 :
3795 : #elif defined(HAVE_SSE2) && !(defined(__GNUC__) && defined(__AVX2__))
3796 :
3797 : template <>
3798 354194 : void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
3799 : const GByte *CPL_RESTRICT pSrc,
3800 : GPtrDiff_t nIters)
3801 : {
3802 354194 : decltype(nIters) i = 0;
3803 354194 : if (nIters > 16)
3804 : {
3805 194667 : const __m128i xmm_mask = _mm_set1_epi16(0xff);
3806 : // If we were sure that there would always be 1 trailing byte, we could
3807 : // check against nIters - 15
3808 2988110 : for (; i < nIters - 16; i += 16)
3809 : {
3810 : __m128i xmm0 =
3811 2793440 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
3812 : __m128i xmm1 =
3813 5586890 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
3814 : // Set higher 8bit of each int16 packed word to 0
3815 2793440 : xmm0 = _mm_and_si128(xmm0, xmm_mask);
3816 2793440 : xmm1 = _mm_and_si128(xmm1, xmm_mask);
3817 : // Pack int16 to uint8 and merge back both vector
3818 2793440 : xmm0 = _mm_packus_epi16(xmm0, xmm1);
3819 :
3820 : // Store result
3821 2793440 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
3822 :
3823 2793440 : pSrc += 2 * 16;
3824 : }
3825 : }
3826 4633800 : for (; i < nIters; i++)
3827 : {
3828 4279610 : pDest[i] = *pSrc;
3829 4279610 : pSrc += 2;
3830 : }
3831 354194 : }
3832 :
// SSE2 implementation of the "one byte out of three" copy.
//
// Each vector iteration loads 48 source bytes in three registers and
// assembles 16 output bytes by shifting each register so that the wanted
// byte lands in its output position, masking that single byte and OR-ing it
// into the result:
//   - output bytes 0..5 come from xmm0 (source bytes 0, 3, ..., 15)
//   - output bytes 6..10 come from xmm1 (source bytes 18, 21, ..., 30)
//   - output bytes 11..15 come from xmm2 (source bytes 33, 36, ..., 45)
// Pixels not handled by the vector loop are copied by the scalar loop.
static void GDALUnrolledCopy_GByte_3_1_SSE2(GByte *CPL_RESTRICT pDest,
                                            const GByte *CPL_RESTRICT pSrc,
                                            GPtrDiff_t nIters)
{
    decltype(nIters) i = 0;
    // Mask selecting byte 0 only; shifted left to select other bytes.
    const __m128i xmm_mask_ori = _mm_set_epi32(0, 0, 0, 255);
    // If we were sure that there would always be 2 trailing bytes, we could
    // check against nIters - 15
    for (; i < nIters - 16; i += 16)
    {
        __m128i xmm0 =
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
        __m128i xmm1 =
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
        __m128i xmm2 =
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));

        // Masks for the first output byte fed by each register:
        // byte 0 (xmm0), byte 6 (xmm1) and byte 11 (xmm2).
        auto xmm_mask0 = xmm_mask_ori;
        auto xmm_mask1 = _mm_slli_si128(xmm_mask_ori, 6);
        auto xmm_mask2 = _mm_slli_si128(xmm_mask_ori, 11);

        // Output byte 0 (from xmm0) and output byte 6 (from xmm1).
        auto xmm = _mm_and_si128(xmm0, xmm_mask0);
        auto xmm_res1 = _mm_and_si128(_mm_slli_si128(xmm1, 4), xmm_mask1);

        // Output bytes 1 and 7.
        xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
        xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
        xmm0 = _mm_srli_si128(xmm0, 2);
        xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
        xmm_res1 = _mm_or_si128(
            xmm_res1, _mm_and_si128(_mm_slli_si128(xmm1, 2), xmm_mask1));

        // Output bytes 2 and 8.
        xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
        xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
        xmm0 = _mm_srli_si128(xmm0, 2);
        xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
        xmm_res1 = _mm_or_si128(xmm_res1, _mm_and_si128(xmm1, xmm_mask1));

        // Output bytes 3 and 9.
        xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
        xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
        xmm0 = _mm_srli_si128(xmm0, 2);
        xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
        xmm_res1 = _mm_or_si128(
            xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 2), xmm_mask1));

        // Output bytes 4 and 10; the xmm1 contribution is then merged in.
        xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
        xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
        xmm0 = _mm_srli_si128(xmm0, 2);
        xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
        xmm_res1 = _mm_or_si128(
            xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 4), xmm_mask1));
        xmm = _mm_or_si128(xmm, xmm_res1);

        // Output byte 5 (last one taken from xmm0).
        xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
        xmm0 = _mm_srli_si128(xmm0, 2);
        xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));

        // Output bytes 11 to 15, all taken from xmm2.
        xmm = _mm_or_si128(xmm,
                           _mm_and_si128(_mm_slli_si128(xmm2, 10), xmm_mask2));

        xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
        xmm = _mm_or_si128(xmm,
                           _mm_and_si128(_mm_slli_si128(xmm2, 8), xmm_mask2));

        xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
        xmm = _mm_or_si128(xmm,
                           _mm_and_si128(_mm_slli_si128(xmm2, 6), xmm_mask2));

        xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
        xmm = _mm_or_si128(xmm,
                           _mm_and_si128(_mm_slli_si128(xmm2, 4), xmm_mask2));

        xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
        xmm = _mm_or_si128(xmm,
                           _mm_and_si128(_mm_slli_si128(xmm2, 2), xmm_mask2));

        _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm);

        pSrc += 3 * 16;
    }
    // Scalar tail: pSrc already points at the first pixel not yet copied.
    for (; i < nIters; i++)
    {
        pDest[i] = *pSrc;
        pSrc += 3;
    }
}
3918 :
3919 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
3920 :
3921 : template <>
3922 192265 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
3923 : const GByte *CPL_RESTRICT pSrc,
3924 : GPtrDiff_t nIters)
3925 : {
3926 192265 : if (nIters > 16)
3927 : {
3928 186142 : if (CPLHaveRuntimeSSSE3())
3929 : {
3930 186141 : GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
3931 : }
3932 : else
3933 : {
3934 1 : GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
3935 : }
3936 : }
3937 : else
3938 : {
3939 20384 : for (GPtrDiff_t i = 0; i < nIters; i++)
3940 : {
3941 14261 : pDest[i] = *pSrc;
3942 14261 : pSrc += 3;
3943 : }
3944 : }
3945 192265 : }
3946 :
3947 : #else
3948 :
// GByte "one byte out of three" specialization used when
// HAVE_SSSE3_AT_COMPILE_TIME is not defined: always forwards to the SSE2
// implementation.
template <>
void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
                                   const GByte *CPL_RESTRICT pSrc,
                                   GPtrDiff_t nIters)
{
    GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
}
3956 : #endif
3957 :
3958 : template <>
3959 332655 : void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
3960 : const GByte *CPL_RESTRICT pSrc,
3961 : GPtrDiff_t nIters)
3962 : {
3963 332655 : decltype(nIters) i = 0;
3964 332655 : if (nIters > 16)
3965 : {
3966 327362 : const __m128i xmm_mask = _mm_set1_epi32(0xff);
3967 : // If we were sure that there would always be 3 trailing bytes, we could
3968 : // check against nIters - 15
3969 28035300 : for (; i < nIters - 16; i += 16)
3970 : {
3971 : __m128i xmm0 =
3972 27707900 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
3973 : __m128i xmm1 =
3974 27707900 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
3975 : __m128i xmm2 =
3976 27707900 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
3977 : __m128i xmm3 =
3978 55415800 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 48));
3979 : // Set higher 24bit of each int32 packed word to 0
3980 27707900 : xmm0 = _mm_and_si128(xmm0, xmm_mask);
3981 27707900 : xmm1 = _mm_and_si128(xmm1, xmm_mask);
3982 27707900 : xmm2 = _mm_and_si128(xmm2, xmm_mask);
3983 27707900 : xmm3 = _mm_and_si128(xmm3, xmm_mask);
3984 : // Pack int32 to int16
3985 27707900 : xmm0 = _mm_packs_epi32(xmm0, xmm1);
3986 27707900 : xmm2 = _mm_packs_epi32(xmm2, xmm3);
3987 : // Pack int16 to uint8
3988 27707900 : xmm0 = _mm_packus_epi16(xmm0, xmm2);
3989 :
3990 : // Store result
3991 27707900 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
3992 :
3993 27707900 : pSrc += 4 * 16;
3994 : }
3995 : }
3996 5048700 : for (; i < nIters; i++)
3997 : {
3998 4716050 : pDest[i] = *pSrc;
3999 4716050 : pSrc += 4;
4000 : }
4001 332655 : }
4002 : #endif // HAVE_SSE2
4003 :
4004 : /************************************************************************/
4005 : /* GDALFastCopy() */
4006 : /************************************************************************/
4007 :
4008 : template <class T>
4009 40101000 : static inline void GDALFastCopy(T *CPL_RESTRICT pDest, int nDestStride,
4010 : const T *CPL_RESTRICT pSrc, int nSrcStride,
4011 : GPtrDiff_t nIters)
4012 : {
4013 40101000 : constexpr int sizeofT = static_cast<int>(sizeof(T));
4014 40101000 : if (nIters == 1)
4015 : {
4016 22540480 : *pDest = *pSrc;
4017 : }
4018 17560432 : else if (nDestStride == sizeofT)
4019 : {
4020 14486789 : if (nSrcStride == sizeofT)
4021 : {
4022 13397974 : memcpy(pDest, pSrc, nIters * sizeof(T));
4023 : }
4024 1088847 : else if (nSrcStride == 2 * sizeofT)
4025 : {
4026 357409 : GDALUnrolledCopy<T, 2, 1>(pDest, pSrc, nIters);
4027 : }
4028 731438 : else if (nSrcStride == 3 * sizeofT)
4029 : {
4030 289245 : GDALUnrolledCopy<T, 3, 1>(pDest, pSrc, nIters);
4031 : }
4032 442193 : else if (nSrcStride == 4 * sizeofT)
4033 : {
4034 336637 : GDALUnrolledCopy<T, 4, 1>(pDest, pSrc, nIters);
4035 : }
4036 : else
4037 : {
4038 17229290 : while (nIters-- > 0)
4039 : {
4040 17123750 : *pDest = *pSrc;
4041 17123750 : pSrc += nSrcStride / sizeofT;
4042 17123750 : pDest++;
4043 : }
4044 : }
4045 : }
4046 3073663 : else if (nSrcStride == sizeofT)
4047 : {
4048 3060667 : if (nDestStride == 2 * sizeofT)
4049 : {
4050 151252 : GDALUnrolledCopy<T, 1, 2>(pDest, pSrc, nIters);
4051 : }
4052 2909415 : else if (nDestStride == 3 * sizeofT)
4053 : {
4054 2131471 : GDALUnrolledCopy<T, 1, 3>(pDest, pSrc, nIters);
4055 : }
4056 777937 : else if (nDestStride == 4 * sizeofT)
4057 : {
4058 613625 : GDALUnrolledCopy<T, 1, 4>(pDest, pSrc, nIters);
4059 : }
4060 : else
4061 : {
4062 17169660 : while (nIters-- > 0)
4063 : {
4064 17005410 : *pDest = *pSrc;
4065 17005410 : pSrc++;
4066 17005410 : pDest += nDestStride / sizeofT;
4067 : }
4068 : }
4069 : }
4070 : else
4071 : {
4072 1220108 : while (nIters-- > 0)
4073 : {
4074 1207102 : *pDest = *pSrc;
4075 1207102 : pSrc += nSrcStride / sizeofT;
4076 1207102 : pDest += nDestStride / sizeofT;
4077 : }
4078 : }
4079 40101000 : }
4080 :
4081 : /************************************************************************/
4082 : /* GDALFastCopyByte() */
4083 : /************************************************************************/
4084 :
// Byte-typed entry point to GDALFastCopy(): takes the source arguments first
// and forwards them to GDALFastCopy(), which takes the destination first.
static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
                             int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
                             int nDstPixelStride, GPtrDiff_t nWordCount)
{
    GDALFastCopy(pDstData, nDstPixelStride, pSrcData, nSrcPixelStride,
                 nWordCount);
}
4092 :
4093 : /************************************************************************/
4094 : /* GDALCopyWords() */
4095 : /************************************************************************/
4096 :
/**
 * Copy pixel words from buffer to buffer.
 *
 * Variant of GDALCopyWords64() taking the word count as an int. All
 * arguments are forwarded unchanged to GDALCopyWords64().
 *
 * @see GDALCopyWords64()
 */
void CPL_STDCALL GDALCopyWords(const void *CPL_RESTRICT pSrcData,
                               GDALDataType eSrcType, int nSrcPixelStride,
                               void *CPL_RESTRICT pDstData,
                               GDALDataType eDstType, int nDstPixelStride,
                               int nWordCount)
{
    GDALCopyWords64(pSrcData, eSrcType, nSrcPixelStride, pDstData, eDstType,
                    nDstPixelStride, nWordCount);
}
4111 :
4112 : /************************************************************************/
4113 : /* GDALCopyWords64() */
4114 : /************************************************************************/
4115 :
4116 : /**
4117 : * Copy pixel words from buffer to buffer.
4118 : *
4119 : * This function is used to copy pixel word values from one memory buffer
4120 : * to another, with support for conversion between data types, and differing
4121 : * step factors. The data type conversion is done using the following
4122 : * rules:
4123 : * <ul>
4124 : * <li>Values assigned to a lower range integer type are clipped. For
4125 : * instance assigning GDT_Int16 values to a GDT_UInt8 buffer will cause values
 * less than 0 to be set to 0, and values larger than 255 to be set to 255.
4127 : * </li>
4128 : * <li>
4129 : * Assignment from floating point to integer rounds to closest integer.
4130 : * +Infinity is mapped to the largest integer. -Infinity is mapped to the
4131 : * smallest integer. NaN is mapped to 0.
4132 : * </li>
4133 : * <li>
4134 : * Assignment from non-complex to complex will result in the imaginary part
4135 : * being set to zero on output.
4136 : * </li>
4137 : * <li> Assignment from complex to
4138 : * non-complex will result in the complex portion being lost and the real
4139 : * component being preserved (<i>not magnitude!</i>).
4140 : * </li>
4141 : * </ul>
4142 : *
4143 : * No assumptions are made about the source or destination words occurring
4144 : * on word boundaries. It is assumed that all values are in native machine
4145 : * byte order.
4146 : *
4147 : * @param pSrcData Pointer to source data to be converted.
4148 : * @param eSrcType the source data type (see GDALDataType enum)
4149 : * @param nSrcPixelStride Source pixel stride (i.e. distance between 2 words),
4150 : * in bytes
4151 : * @param pDstData Pointer to buffer where destination data should go
4152 : * @param eDstType the destination data type (see GDALDataType enum)
4153 : * @param nDstPixelStride Destination pixel stride (i.e. distance between 2
4154 : * words), in bytes
4155 : * @param nWordCount number of words to be copied
4156 : *
4157 : * @note
4158 : * When adding a new data type to GDAL, you must do the following to
4159 : * support it properly within the GDALCopyWords function:
4160 : * 1. Add the data type to the switch on eSrcType in GDALCopyWords.
4161 : * This should invoke the appropriate GDALCopyWordsFromT wrapper.
4162 : * 2. Add the data type to the switch on eDstType in GDALCopyWordsFromT.
4163 : * This should call the appropriate GDALCopyWordsT template.
4164 : * 3. If appropriate, overload the appropriate CopyWord template in the
4165 : * above namespace. This will ensure that any conversion issues are
4166 : * handled (cases like the float -> int32 case, where the min/max)
4167 : * values are subject to roundoff error.
4168 : */
4169 :
4170 116774000 : void CPL_STDCALL GDALCopyWords64(const void *CPL_RESTRICT pSrcData,
4171 : GDALDataType eSrcType, int nSrcPixelStride,
4172 : void *CPL_RESTRICT pDstData,
4173 : GDALDataType eDstType, int nDstPixelStride,
4174 : GPtrDiff_t nWordCount)
4175 :
4176 : {
4177 : // On platforms where alignment matters, be careful
4178 116774000 : const int nSrcDataTypeSize = GDALGetDataTypeSizeBytes(eSrcType);
4179 116774000 : const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(eDstType);
4180 116774000 : if (CPL_UNLIKELY(nSrcDataTypeSize == 0 || nDstDataTypeSize == 0))
4181 : {
4182 2 : CPLError(CE_Failure, CPLE_NotSupported,
4183 : "GDALCopyWords64(): unsupported GDT_Unknown/GDT_TypeCount "
4184 : "argument");
4185 2 : return;
4186 : }
4187 116774000 : if (!(eSrcType == eDstType && nSrcPixelStride == nDstPixelStride) &&
4188 66322800 : ((reinterpret_cast<uintptr_t>(pSrcData) % nSrcDataTypeSize) != 0 ||
4189 66322800 : (reinterpret_cast<uintptr_t>(pDstData) % nDstDataTypeSize) != 0 ||
4190 66322400 : (nSrcPixelStride % nSrcDataTypeSize) != 0 ||
4191 66322300 : (nDstPixelStride % nDstDataTypeSize) != 0))
4192 : {
4193 905 : if (eSrcType == eDstType)
4194 : {
4195 34800 : for (decltype(nWordCount) i = 0; i < nWordCount; i++)
4196 : {
4197 34000 : memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
4198 : static_cast<const GByte *>(pSrcData) +
4199 34000 : nSrcPixelStride * i,
4200 : nDstDataTypeSize);
4201 : }
4202 : }
4203 : else
4204 : {
4205 210 : const auto getAlignedPtr = [](GByte *ptr, int align)
4206 : {
4207 : return ptr +
4208 210 : ((align - (reinterpret_cast<uintptr_t>(ptr) % align)) %
4209 210 : align);
4210 : };
4211 :
4212 : // The largest we need is for CFloat64 (16 bytes), so 32 bytes to
4213 : // be sure to get correctly aligned pointer.
4214 105 : constexpr size_t SIZEOF_CFLOAT64 = 2 * sizeof(double);
4215 : GByte abySrcBuffer[2 * SIZEOF_CFLOAT64];
4216 : GByte abyDstBuffer[2 * SIZEOF_CFLOAT64];
4217 : GByte *pabySrcBuffer =
4218 105 : getAlignedPtr(abySrcBuffer, nSrcDataTypeSize);
4219 : GByte *pabyDstBuffer =
4220 105 : getAlignedPtr(abyDstBuffer, nDstDataTypeSize);
4221 3360 : for (decltype(nWordCount) i = 0; i < nWordCount; i++)
4222 : {
4223 3255 : memcpy(pabySrcBuffer,
4224 : static_cast<const GByte *>(pSrcData) +
4225 3255 : nSrcPixelStride * i,
4226 : nSrcDataTypeSize);
4227 3255 : GDALCopyWords64(pabySrcBuffer, eSrcType, 0, pabyDstBuffer,
4228 : eDstType, 0, 1);
4229 3255 : memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
4230 : pabyDstBuffer, nDstDataTypeSize);
4231 : }
4232 : }
4233 905 : return;
4234 : }
4235 :
4236 : // Deal with the case where we're replicating a single word into the
4237 : // provided buffer
4238 116773000 : if (nSrcPixelStride == 0 && nWordCount > 1)
4239 : {
4240 1068100 : GDALReplicateWord(pSrcData, eSrcType, pDstData, eDstType,
4241 : nDstPixelStride, nWordCount);
4242 1068100 : return;
4243 : }
4244 :
4245 115705000 : if (eSrcType == eDstType)
4246 : {
4247 54673700 : if (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8)
4248 : {
4249 17979000 : GDALFastCopy(static_cast<GByte *>(pDstData), nDstPixelStride,
4250 : static_cast<const GByte *>(pSrcData), nSrcPixelStride,
4251 : nWordCount);
4252 17979000 : return;
4253 : }
4254 :
4255 36694700 : if (nSrcDataTypeSize == 2 && (nSrcPixelStride % 2) == 0 &&
4256 21795700 : (nDstPixelStride % 2) == 0)
4257 : {
4258 21795700 : GDALFastCopy(static_cast<short *>(pDstData), nDstPixelStride,
4259 : static_cast<const short *>(pSrcData), nSrcPixelStride,
4260 : nWordCount);
4261 21795700 : return;
4262 : }
4263 :
4264 14899000 : if (nWordCount == 1)
4265 : {
4266 : #if defined(CSA_BUILD) || defined(__COVERITY__)
4267 : // Avoid false positives...
4268 : memcpy(pDstData, pSrcData, nSrcDataTypeSize);
4269 : #else
4270 14411900 : if (nSrcDataTypeSize == 2)
4271 0 : memcpy(pDstData, pSrcData, 2);
4272 14411900 : else if (nSrcDataTypeSize == 4)
4273 13807600 : memcpy(pDstData, pSrcData, 4);
4274 604283 : else if (nSrcDataTypeSize == 8)
4275 587678 : memcpy(pDstData, pSrcData, 8);
4276 : else /* if( eSrcType == GDT_CFloat64 ) */
4277 16605 : memcpy(pDstData, pSrcData, 16);
4278 : #endif
4279 14411900 : return;
4280 : }
4281 :
4282 : // Let memcpy() handle the case where we're copying a packed buffer
4283 : // of pixels.
4284 487145 : if (nSrcPixelStride == nDstPixelStride)
4285 : {
4286 225301 : if (nSrcPixelStride == nSrcDataTypeSize)
4287 : {
4288 225233 : memcpy(pDstData, pSrcData, nWordCount * nSrcDataTypeSize);
4289 225233 : return;
4290 : }
4291 : }
4292 : }
4293 :
4294 : // Handle the more general case -- deals with conversion of data types
4295 : // directly.
4296 61292900 : switch (eSrcType)
4297 : {
4298 20306300 : case GDT_UInt8:
4299 20306300 : GDALCopyWordsFromT<unsigned char>(
4300 : static_cast<const unsigned char *>(pSrcData), nSrcPixelStride,
4301 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4302 20306300 : break;
4303 1786 : case GDT_Int8:
4304 1786 : GDALCopyWordsFromT<signed char>(
4305 : static_cast<const signed char *>(pSrcData), nSrcPixelStride,
4306 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4307 1786 : break;
4308 55311 : case GDT_UInt16:
4309 55311 : GDALCopyWordsFromT<unsigned short>(
4310 : static_cast<const unsigned short *>(pSrcData), nSrcPixelStride,
4311 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4312 55311 : break;
4313 6519830 : case GDT_Int16:
4314 6519830 : GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
4315 : nSrcPixelStride, false, pDstData,
4316 : eDstType, nDstPixelStride, nWordCount);
4317 6519830 : break;
4318 8016 : case GDT_UInt32:
4319 8016 : GDALCopyWordsFromT<unsigned int>(
4320 : static_cast<const unsigned int *>(pSrcData), nSrcPixelStride,
4321 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4322 8016 : break;
4323 12254800 : case GDT_Int32:
4324 12254800 : GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
4325 : nSrcPixelStride, false, pDstData, eDstType,
4326 : nDstPixelStride, nWordCount);
4327 12254800 : break;
4328 2205 : case GDT_UInt64:
4329 2205 : GDALCopyWordsFromT<std::uint64_t>(
4330 : static_cast<const std::uint64_t *>(pSrcData), nSrcPixelStride,
4331 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4332 2205 : break;
4333 11729 : case GDT_Int64:
4334 11729 : GDALCopyWordsFromT<std::int64_t>(
4335 : static_cast<const std::int64_t *>(pSrcData), nSrcPixelStride,
4336 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4337 11729 : break;
4338 1387 : case GDT_Float16:
4339 1387 : GDALCopyWordsFromT<GFloat16>(
4340 : static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, false,
4341 : pDstData, eDstType, nDstPixelStride, nWordCount);
4342 1387 : break;
4343 654936 : case GDT_Float32:
4344 654936 : GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
4345 : nSrcPixelStride, false, pDstData,
4346 : eDstType, nDstPixelStride, nWordCount);
4347 654936 : break;
4348 20715800 : case GDT_Float64:
4349 20715800 : GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
4350 : nSrcPixelStride, false, pDstData,
4351 : eDstType, nDstPixelStride, nWordCount);
4352 20715800 : break;
4353 478486 : case GDT_CInt16:
4354 478486 : GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
4355 : nSrcPixelStride, true, pDstData, eDstType,
4356 : nDstPixelStride, nWordCount);
4357 478486 : break;
4358 868 : case GDT_CInt32:
4359 868 : GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
4360 : nSrcPixelStride, true, pDstData, eDstType,
4361 : nDstPixelStride, nWordCount);
4362 868 : break;
4363 508 : case GDT_CFloat16:
4364 508 : GDALCopyWordsFromT<GFloat16>(
4365 : static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, true,
4366 : pDstData, eDstType, nDstPixelStride, nWordCount);
4367 508 : break;
4368 2437 : case GDT_CFloat32:
4369 2437 : GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
4370 : nSrcPixelStride, true, pDstData, eDstType,
4371 : nDstPixelStride, nWordCount);
4372 2437 : break;
4373 278520 : case GDT_CFloat64:
4374 278520 : GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
4375 : nSrcPixelStride, true, pDstData,
4376 : eDstType, nDstPixelStride, nWordCount);
4377 278520 : break;
4378 0 : case GDT_Unknown:
4379 : case GDT_TypeCount:
4380 0 : CPLAssert(false);
4381 : }
4382 : }
4383 :
4384 : /************************************************************************/
4385 : /* GDALCopyBits() */
4386 : /************************************************************************/
4387 :
4388 : /**
4389 : * Bitwise word copying.
4390 : *
4391 : * A function for moving sets of partial bytes around. Loosely
4392 : * speaking this is a bitwise analog to GDALCopyWords().
4393 : *
4394 : * It copies nStepCount "words" where each word is nBitCount bits long.
4395 : * The nSrcStep and nDstStep are the number of bits from the start of one
4396 : * word to the next (same as nBitCount if they are packed). The nSrcOffset
4397 : * and nDstOffset are the offset into the source and destination buffers
4398 : * to start at, also measured in bits.
4399 : *
4400 : * All bit offsets are assumed to start from the high order bit in a byte
4401 : * (i.e. most significant bit first). Currently this function is not very
4402 : * optimized, but it may be improved for some common cases in the future
4403 : * as needed.
4404 : *
4405 : * @param pabySrcData the source data buffer.
4406 : * @param nSrcOffset the offset (in bits) in pabySrcData to the start of the
4407 : * first word to copy.
4408 : * @param nSrcStep the offset in bits from the start one source word to the
4409 : * start of the next.
4410 : * @param pabyDstData the destination data buffer.
4411 : * @param nDstOffset the offset (in bits) in pabyDstData to the start of the
4412 : * first word to copy over.
4413 : * @param nDstStep the offset in bits from the start one word to the
4414 : * start of the next.
4415 : * @param nBitCount the number of bits in a word to be copied.
4416 : * @param nStepCount the number of words to copy.
4417 : */
4418 :
4419 0 : void GDALCopyBits(const GByte *pabySrcData, int nSrcOffset, int nSrcStep,
4420 : GByte *pabyDstData, int nDstOffset, int nDstStep,
4421 : int nBitCount, int nStepCount)
4422 :
4423 : {
4424 0 : VALIDATE_POINTER0(pabySrcData, "GDALCopyBits");
4425 :
4426 0 : for (int iStep = 0; iStep < nStepCount; iStep++)
4427 : {
4428 0 : for (int iBit = 0; iBit < nBitCount; iBit++)
4429 : {
4430 0 : if (pabySrcData[nSrcOffset >> 3] & (0x80 >> (nSrcOffset & 7)))
4431 0 : pabyDstData[nDstOffset >> 3] |= (0x80 >> (nDstOffset & 7));
4432 : else
4433 0 : pabyDstData[nDstOffset >> 3] &= ~(0x80 >> (nDstOffset & 7));
4434 :
4435 0 : nSrcOffset++;
4436 0 : nDstOffset++;
4437 : }
4438 :
4439 0 : nSrcOffset += (nSrcStep - nBitCount);
4440 0 : nDstOffset += (nDstStep - nBitCount);
4441 : }
4442 : }
4443 :
4444 : /************************************************************************/
4445 : /* GDALGetBestOverviewLevel() */
4446 : /* */
4447 : /* Returns the best overview level to satisfy the query or -1 if none */
4448 : /* Also updates nXOff, nYOff, nXSize, nYSize and psExtraArg when */
4449 : /* returning a valid overview level */
4450 : /************************************************************************/
4451 :
4452 0 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand, int &nXOff, int &nYOff,
4453 : int &nXSize, int &nYSize, int nBufXSize,
4454 : int nBufYSize)
4455 : {
4456 0 : return GDALBandGetBestOverviewLevel2(poBand, nXOff, nYOff, nXSize, nYSize,
4457 0 : nBufXSize, nBufYSize, nullptr);
4458 : }
4459 :
4460 524017 : int GDALBandGetBestOverviewLevel2(GDALRasterBand *poBand, int &nXOff,
4461 : int &nYOff, int &nXSize, int &nYSize,
4462 : int nBufXSize, int nBufYSize,
4463 : GDALRasterIOExtraArg *psExtraArg)
4464 : {
4465 524017 : if (psExtraArg != nullptr && psExtraArg->nVersion > 1 &&
4466 524017 : psExtraArg->bUseOnlyThisScale)
4467 109 : return -1;
4468 : /* -------------------------------------------------------------------- */
4469 : /* Compute the desired downsampling factor. It is */
4470 : /* based on the least reduced axis, and represents the number */
4471 : /* of source pixels to one destination pixel. */
4472 : /* -------------------------------------------------------------------- */
4473 523908 : const double dfDesiredDownsamplingFactor =
4474 523908 : ((nXSize / static_cast<double>(nBufXSize)) <
4475 361568 : (nYSize / static_cast<double>(nBufYSize)) ||
4476 : nBufYSize == 1)
4477 752297 : ? nXSize / static_cast<double>(nBufXSize)
4478 133179 : : nYSize / static_cast<double>(nBufYSize);
4479 :
4480 : /* -------------------------------------------------------------------- */
4481 : /* Find the overview level that largest downsampling factor (most */
4482 : /* downsampled) that is still less than (or only a little more) */
4483 : /* downsampled than the request. */
4484 : /* -------------------------------------------------------------------- */
4485 523908 : const int nOverviewCount = poBand->GetOverviewCount();
4486 523908 : GDALRasterBand *poBestOverview = nullptr;
4487 523908 : double dfBestDownsamplingFactor = 0;
4488 523908 : int nBestOverviewLevel = -1;
4489 :
4490 : const char *pszOversampligThreshold =
4491 523908 : CPLGetConfigOption("GDAL_OVERVIEW_OVERSAMPLING_THRESHOLD", nullptr);
4492 :
4493 : // Note: keep this logic for overview selection in sync between
4494 : // gdalwarp_lib.cpp and rasterio.cpp
4495 : // Cf https://github.com/OSGeo/gdal/pull/9040#issuecomment-1898524693
4496 : const double dfOversamplingThreshold =
4497 1047810 : pszOversampligThreshold ? CPLAtof(pszOversampligThreshold)
4498 523899 : : psExtraArg && psExtraArg->eResampleAlg != GRIORA_NearestNeighbour
4499 1047800 : ? 1.0
4500 523908 : : 1.2;
4501 526604 : for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
4502 : {
4503 5616 : GDALRasterBand *poOverview = poBand->GetOverview(iOverview);
4504 11232 : if (poOverview == nullptr ||
4505 11231 : poOverview->GetXSize() > poBand->GetXSize() ||
4506 5615 : poOverview->GetYSize() > poBand->GetYSize())
4507 : {
4508 1 : continue;
4509 : }
4510 :
4511 : // Compute downsampling factor of this overview
4512 : const double dfDownsamplingFactor = std::min(
4513 5615 : poBand->GetXSize() / static_cast<double>(poOverview->GetXSize()),
4514 11230 : poBand->GetYSize() / static_cast<double>(poOverview->GetYSize()));
4515 :
4516 : // Is it nearly the requested factor and better (lower) than
4517 : // the current best factor?
4518 : // Use an epsilon because of numerical instability.
4519 5615 : constexpr double EPSILON = 1e-1;
4520 5723 : if (dfDownsamplingFactor >=
4521 5615 : dfDesiredDownsamplingFactor * dfOversamplingThreshold +
4522 5507 : EPSILON ||
4523 : dfDownsamplingFactor <= dfBestDownsamplingFactor)
4524 : {
4525 108 : continue;
4526 : }
4527 :
4528 : // Ignore AVERAGE_BIT2GRAYSCALE overviews for RasterIO purposes.
4529 5507 : const char *pszResampling = poOverview->GetMetadataItem("RESAMPLING");
4530 :
4531 5507 : if (pszResampling != nullptr &&
4532 71 : STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2"))
4533 16 : continue;
4534 :
4535 : // OK, this is our new best overview.
4536 5491 : poBestOverview = poOverview;
4537 5491 : nBestOverviewLevel = iOverview;
4538 5491 : dfBestDownsamplingFactor = dfDownsamplingFactor;
4539 :
4540 5491 : if (std::abs(dfDesiredDownsamplingFactor - dfDownsamplingFactor) <
4541 : EPSILON)
4542 : {
4543 2920 : break;
4544 : }
4545 : }
4546 :
4547 : /* -------------------------------------------------------------------- */
4548 : /* If we didn't find an overview that helps us, just return */
4549 : /* indicating failure and the full resolution image will be used. */
4550 : /* -------------------------------------------------------------------- */
4551 523908 : if (nBestOverviewLevel < 0)
4552 520915 : return -1;
4553 :
4554 : /* -------------------------------------------------------------------- */
4555 : /* Recompute the source window in terms of the selected */
4556 : /* overview. */
4557 : /* -------------------------------------------------------------------- */
4558 : const double dfXFactor =
4559 2993 : poBand->GetXSize() / static_cast<double>(poBestOverview->GetXSize());
4560 : const double dfYFactor =
4561 2993 : poBand->GetYSize() / static_cast<double>(poBestOverview->GetYSize());
4562 2993 : CPLDebug("GDAL", "Selecting overview %d x %d", poBestOverview->GetXSize(),
4563 : poBestOverview->GetYSize());
4564 :
4565 8979 : const int nOXOff = std::min(poBestOverview->GetXSize() - 1,
4566 2993 : static_cast<int>(nXOff / dfXFactor + 0.5));
4567 8979 : const int nOYOff = std::min(poBestOverview->GetYSize() - 1,
4568 2993 : static_cast<int>(nYOff / dfYFactor + 0.5));
4569 2993 : int nOXSize = std::max(1, static_cast<int>(nXSize / dfXFactor + 0.5));
4570 2993 : int nOYSize = std::max(1, static_cast<int>(nYSize / dfYFactor + 0.5));
4571 2993 : if (nOXOff + nOXSize > poBestOverview->GetXSize())
4572 0 : nOXSize = poBestOverview->GetXSize() - nOXOff;
4573 2993 : if (nOYOff + nOYSize > poBestOverview->GetYSize())
4574 2 : nOYSize = poBestOverview->GetYSize() - nOYOff;
4575 :
4576 2993 : if (psExtraArg)
4577 : {
4578 2993 : if (psExtraArg->bFloatingPointWindowValidity)
4579 : {
4580 117 : psExtraArg->dfXOff /= dfXFactor;
4581 117 : psExtraArg->dfXSize /= dfXFactor;
4582 117 : psExtraArg->dfYOff /= dfYFactor;
4583 117 : psExtraArg->dfYSize /= dfYFactor;
4584 : }
4585 2876 : else if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
4586 : {
4587 16 : psExtraArg->bFloatingPointWindowValidity = true;
4588 16 : psExtraArg->dfXOff = nXOff / dfXFactor;
4589 16 : psExtraArg->dfXSize = nXSize / dfXFactor;
4590 16 : psExtraArg->dfYOff = nYOff / dfYFactor;
4591 16 : psExtraArg->dfYSize = nYSize / dfYFactor;
4592 : }
4593 : }
4594 :
4595 2993 : nXOff = nOXOff;
4596 2993 : nYOff = nOYOff;
4597 2993 : nXSize = nOXSize;
4598 2993 : nYSize = nOYSize;
4599 :
4600 2993 : return nBestOverviewLevel;
4601 : }
4602 :
4603 : /************************************************************************/
4604 : /* OverviewRasterIO() */
4605 : /* */
4606 : /* Special work function to utilize available overviews to */
4607 : /* more efficiently satisfy downsampled requests. It will */
4608 : /* return CE_Failure if there are no appropriate overviews */
4609 : /* available but it doesn't emit any error messages. */
4610 : /************************************************************************/
4611 :
4612 : //! @cond Doxygen_Suppress
4613 2 : CPLErr GDALRasterBand::OverviewRasterIO(
4614 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4615 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4616 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
4617 :
4618 : {
4619 : GDALRasterIOExtraArg sExtraArg;
4620 2 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4621 :
4622 2 : const int nOverview = GDALBandGetBestOverviewLevel2(
4623 : this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, &sExtraArg);
4624 2 : if (nOverview < 0)
4625 1 : return CE_Failure;
4626 :
4627 : /* -------------------------------------------------------------------- */
4628 : /* Recast the call in terms of the new raster layer. */
4629 : /* -------------------------------------------------------------------- */
4630 1 : GDALRasterBand *poOverviewBand = GetOverview(nOverview);
4631 1 : if (poOverviewBand == nullptr)
4632 0 : return CE_Failure;
4633 :
4634 1 : return poOverviewBand->RasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
4635 : pData, nBufXSize, nBufYSize, eBufType,
4636 1 : nPixelSpace, nLineSpace, &sExtraArg);
4637 : }
4638 :
4639 : /************************************************************************/
4640 : /* TryOverviewRasterIO() */
4641 : /************************************************************************/
4642 :
4643 362428 : CPLErr GDALRasterBand::TryOverviewRasterIO(
4644 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4645 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4646 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg,
4647 : int *pbTried)
4648 : {
4649 362428 : int nXOffMod = nXOff;
4650 362428 : int nYOffMod = nYOff;
4651 362428 : int nXSizeMod = nXSize;
4652 362428 : int nYSizeMod = nYSize;
4653 : GDALRasterIOExtraArg sExtraArg;
4654 :
4655 362428 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4656 :
4657 362428 : int iOvrLevel = GDALBandGetBestOverviewLevel2(
4658 : this, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize, nBufYSize,
4659 : &sExtraArg);
4660 :
4661 362428 : if (iOvrLevel >= 0)
4662 : {
4663 53 : GDALRasterBand *poOverviewBand = GetOverview(iOvrLevel);
4664 53 : if (poOverviewBand)
4665 : {
4666 53 : *pbTried = TRUE;
4667 53 : return poOverviewBand->RasterIO(
4668 : eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData,
4669 : nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
4670 53 : &sExtraArg);
4671 : }
4672 : }
4673 :
4674 362375 : *pbTried = FALSE;
4675 362375 : return CE_None;
4676 : }
4677 :
4678 : /************************************************************************/
4679 : /* TryOverviewRasterIO() */
4680 : /************************************************************************/
4681 :
4682 158613 : CPLErr GDALDataset::TryOverviewRasterIO(
4683 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4684 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4685 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
4686 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg,
4687 : int *pbTried)
4688 : {
4689 158613 : int nXOffMod = nXOff;
4690 158613 : int nYOffMod = nYOff;
4691 158613 : int nXSizeMod = nXSize;
4692 158613 : int nYSizeMod = nYSize;
4693 : GDALRasterIOExtraArg sExtraArg;
4694 158613 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4695 :
4696 317226 : int iOvrLevel = GDALBandGetBestOverviewLevel2(
4697 158613 : papoBands[0], nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize,
4698 : nBufYSize, &sExtraArg);
4699 :
4700 158655 : if (iOvrLevel >= 0 && papoBands[0]->GetOverview(iOvrLevel) != nullptr &&
4701 42 : papoBands[0]->GetOverview(iOvrLevel)->GetDataset() != nullptr)
4702 : {
4703 42 : *pbTried = TRUE;
4704 42 : return papoBands[0]->GetOverview(iOvrLevel)->GetDataset()->RasterIO(
4705 : eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData, nBufXSize,
4706 : nBufYSize, eBufType, nBandCount, panBandMap, nPixelSpace,
4707 42 : nLineSpace, nBandSpace, &sExtraArg);
4708 : }
4709 : else
4710 : {
4711 158571 : *pbTried = FALSE;
4712 158571 : return CE_None;
4713 : }
4714 : }
4715 :
4716 : /************************************************************************/
4717 : /* GetBestOverviewLevel() */
4718 : /* */
4719 : /* Returns the best overview level to satisfy the query or -1 if none */
4720 : /* Also updates nXOff, nYOff, nXSize, nYSize when returning a valid */
4721 : /* overview level */
4722 : /************************************************************************/
4723 :
4724 4 : static int GDALDatasetGetBestOverviewLevel(GDALDataset *poDS, int &nXOff,
4725 : int &nYOff, int &nXSize, int &nYSize,
4726 : int nBufXSize, int nBufYSize,
4727 : int nBandCount,
4728 : const int *panBandMap,
4729 : GDALRasterIOExtraArg *psExtraArg)
4730 : {
4731 4 : int nOverviewCount = 0;
4732 4 : GDALRasterBand *poFirstBand = nullptr;
4733 :
4734 : /* -------------------------------------------------------------------- */
4735 : /* Check that all bands have the same number of overviews and */
4736 : /* that they have all the same size and block dimensions */
4737 : /* -------------------------------------------------------------------- */
4738 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
4739 : {
4740 8 : GDALRasterBand *poBand = poDS->GetRasterBand(panBandMap[iBand]);
4741 8 : if (poBand == nullptr)
4742 0 : return -1;
4743 8 : if (iBand == 0)
4744 : {
4745 4 : poFirstBand = poBand;
4746 4 : nOverviewCount = poBand->GetOverviewCount();
4747 : }
4748 4 : else if (nOverviewCount != poBand->GetOverviewCount())
4749 : {
4750 0 : CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
4751 : "mismatched overview count, use std method.");
4752 0 : return -1;
4753 : }
4754 : else
4755 : {
4756 4 : for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
4757 : {
4758 0 : GDALRasterBand *poOvrBand = poBand->GetOverview(iOverview);
4759 : GDALRasterBand *poOvrFirstBand =
4760 0 : poFirstBand->GetOverview(iOverview);
4761 0 : if (poOvrBand == nullptr || poOvrFirstBand == nullptr)
4762 0 : continue;
4763 :
4764 0 : if (poOvrFirstBand->GetXSize() != poOvrBand->GetXSize() ||
4765 0 : poOvrFirstBand->GetYSize() != poOvrBand->GetYSize())
4766 : {
4767 0 : CPLDebug("GDAL",
4768 : "GDALDataset::GetBestOverviewLevel() ... "
4769 : "mismatched overview sizes, use std method.");
4770 0 : return -1;
4771 : }
4772 0 : int nBlockXSizeFirst = 0;
4773 0 : int nBlockYSizeFirst = 0;
4774 0 : poOvrFirstBand->GetBlockSize(&nBlockXSizeFirst,
4775 : &nBlockYSizeFirst);
4776 :
4777 0 : int nBlockXSizeCurrent = 0;
4778 0 : int nBlockYSizeCurrent = 0;
4779 0 : poOvrBand->GetBlockSize(&nBlockXSizeCurrent,
4780 : &nBlockYSizeCurrent);
4781 :
4782 0 : if (nBlockXSizeFirst != nBlockXSizeCurrent ||
4783 0 : nBlockYSizeFirst != nBlockYSizeCurrent)
4784 : {
4785 0 : CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
4786 : "mismatched block sizes, use std method.");
4787 0 : return -1;
4788 : }
4789 : }
4790 : }
4791 : }
4792 4 : if (poFirstBand == nullptr)
4793 0 : return -1;
4794 :
4795 4 : return GDALBandGetBestOverviewLevel2(poFirstBand, nXOff, nYOff, nXSize,
4796 : nYSize, nBufXSize, nBufYSize,
4797 4 : psExtraArg);
4798 : }
4799 :
4800 : /************************************************************************/
4801 : /* BlockBasedRasterIO() */
4802 : /* */
4803 : /* This convenience function implements a dataset level */
4804 : /* RasterIO() interface based on calling down to fetch blocks, */
4805 : /* much like the GDALRasterBand::IRasterIO(), but it handles */
4806 : /* all bands at once, so that a format driver that handles a */
4807 : /* request for different bands of the same block efficiently */
4808 : /* (i.e. without re-reading interleaved data) will efficiently. */
4809 : /* */
4810 : /* This method is intended to be called by an overridden */
4811 : /* IRasterIO() method in the driver specific GDALDataset */
4812 : /* derived class. */
4813 : /* */
4814 : /* Default internal implementation of RasterIO() ... utilizes */
4815 : /* the Block access methods to satisfy the request. This would */
4816 : /* normally only be overridden by formats with overviews. */
4817 : /* */
4818 : /* To keep things relatively simple, this method does not */
4819 : /* currently take advantage of some special cases addressed in */
4820 : /* GDALRasterBand::IRasterIO(), so it is likely best to only */
4821 : /* call it when you know it will help. That is in cases where */
4822 : /* data is at 1:1 to the buffer, and you know the driver is */
4823 : /* implementing interleaved IO efficiently on a block by block */
4824 : /* basis. Overviews will be used when possible. */
4825 : /************************************************************************/
4826 :
4827 64982 : CPLErr GDALDataset::BlockBasedRasterIO(
4828 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4829 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4830 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
4831 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
4832 :
4833 : {
4834 64982 : CPLAssert(nullptr != pData);
4835 :
4836 64982 : GByte **papabySrcBlock = nullptr;
4837 64982 : GDALRasterBlock *poBlock = nullptr;
4838 64982 : GDALRasterBlock **papoBlocks = nullptr;
4839 64982 : int nLBlockX = -1;
4840 64982 : int nLBlockY = -1;
4841 : int iBufYOff;
4842 : int iBufXOff;
4843 64982 : int nBlockXSize = 1;
4844 64982 : int nBlockYSize = 1;
4845 64982 : CPLErr eErr = CE_None;
4846 64982 : GDALDataType eDataType = GDT_UInt8;
4847 :
4848 64982 : const bool bUseIntegerRequestCoords =
4849 65020 : (!psExtraArg->bFloatingPointWindowValidity ||
4850 38 : (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
4851 36 : nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
4852 :
4853 : /* -------------------------------------------------------------------- */
4854 : /* Ensure that all bands share a common block size and data type. */
4855 : /* -------------------------------------------------------------------- */
4856 308187 : for (int iBand = 0; iBand < nBandCount; iBand++)
4857 : {
4858 243205 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
4859 :
4860 243205 : if (iBand == 0)
4861 : {
4862 64982 : poBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
4863 64982 : eDataType = poBand->GetRasterDataType();
4864 : }
4865 : else
4866 : {
4867 178223 : int nThisBlockXSize = 0;
4868 178223 : int nThisBlockYSize = 0;
4869 178223 : poBand->GetBlockSize(&nThisBlockXSize, &nThisBlockYSize);
4870 178223 : if (nThisBlockXSize != nBlockXSize ||
4871 178223 : nThisBlockYSize != nBlockYSize)
4872 : {
4873 0 : CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
4874 : "mismatched block sizes, use std method.");
4875 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
4876 : pData, nBufXSize, nBufYSize, eBufType,
4877 : nBandCount, panBandMap, nPixelSpace,
4878 0 : nLineSpace, nBandSpace, psExtraArg);
4879 : }
4880 :
4881 178223 : if (eDataType != poBand->GetRasterDataType() &&
4882 0 : (nXSize != nBufXSize || nYSize != nBufYSize))
4883 : {
4884 0 : CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
4885 : "mismatched band data types, use std method.");
4886 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
4887 : pData, nBufXSize, nBufYSize, eBufType,
4888 : nBandCount, panBandMap, nPixelSpace,
4889 0 : nLineSpace, nBandSpace, psExtraArg);
4890 : }
4891 : }
4892 : }
4893 :
4894 : /* ==================================================================== */
4895 : /* In this special case at full resolution we step through in */
4896 : /* blocks, turning the request over to the per-band */
4897 : /* IRasterIO(), but ensuring that all bands of one block are */
4898 : /* called before proceeding to the next. */
4899 : /* ==================================================================== */
4900 :
4901 64982 : if (nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
4902 : {
4903 : GDALRasterIOExtraArg sDummyExtraArg;
4904 64978 : INIT_RASTERIO_EXTRA_ARG(sDummyExtraArg);
4905 :
4906 64978 : int nChunkYSize = 0;
4907 64978 : int nChunkXSize = 0;
4908 :
4909 213434 : for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff += nChunkYSize)
4910 : {
4911 149472 : const int nChunkYOff = iBufYOff + nYOff;
4912 149472 : nChunkYSize = nBlockYSize - (nChunkYOff % nBlockYSize);
4913 149472 : if (nChunkYOff + nChunkYSize > nYOff + nYSize)
4914 59977 : nChunkYSize = (nYOff + nYSize) - nChunkYOff;
4915 :
4916 822752 : for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff += nChunkXSize)
4917 : {
4918 674295 : const int nChunkXOff = iBufXOff + nXOff;
4919 674295 : nChunkXSize = nBlockXSize - (nChunkXOff % nBlockXSize);
4920 674295 : if (nChunkXOff + nChunkXSize > nXOff + nXSize)
4921 70691 : nChunkXSize = (nXOff + nXSize) - nChunkXOff;
4922 :
4923 674295 : GByte *pabyChunkData =
4924 674295 : static_cast<GByte *>(pData) + iBufXOff * nPixelSpace +
4925 674295 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace;
4926 :
4927 3282490 : for (int iBand = 0; iBand < nBandCount; iBand++)
4928 : {
4929 2609210 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
4930 :
4931 5218420 : eErr = poBand->IRasterIO(
4932 : eRWFlag, nChunkXOff, nChunkYOff, nChunkXSize,
4933 : nChunkYSize,
4934 2609210 : pabyChunkData +
4935 2609210 : static_cast<GPtrDiff_t>(iBand) * nBandSpace,
4936 : nChunkXSize, nChunkYSize, eBufType, nPixelSpace,
4937 2609210 : nLineSpace, &sDummyExtraArg);
4938 2609210 : if (eErr != CE_None)
4939 1015 : return eErr;
4940 : }
4941 : }
4942 :
4943 167371 : if (psExtraArg->pfnProgress != nullptr &&
4944 18914 : !psExtraArg->pfnProgress(
4945 167371 : 1.0 * std::min(nBufYSize, iBufYOff + nChunkYSize) /
4946 : nBufYSize,
4947 : "", psExtraArg->pProgressData))
4948 : {
4949 1 : return CE_Failure;
4950 : }
4951 : }
4952 :
4953 63962 : return CE_None;
4954 : }
4955 :
4956 : /* Below code is not compatible with that case. It would need a complete */
4957 : /* separate code like done in GDALRasterBand::IRasterIO. */
4958 4 : if (eRWFlag == GF_Write && (nBufXSize < nXSize || nBufYSize < nYSize))
4959 : {
4960 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
4961 : nBufXSize, nBufYSize, eBufType, nBandCount,
4962 : panBandMap, nPixelSpace, nLineSpace,
4963 0 : nBandSpace, psExtraArg);
4964 : }
4965 :
4966 : /* We could have a smarter implementation, but that will do for now */
4967 4 : if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
4968 0 : (nBufXSize != nXSize || nBufYSize != nYSize))
4969 : {
4970 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
4971 : nBufXSize, nBufYSize, eBufType, nBandCount,
4972 : panBandMap, nPixelSpace, nLineSpace,
4973 0 : nBandSpace, psExtraArg);
4974 : }
4975 :
4976 : /* ==================================================================== */
4977 : /* Loop reading required source blocks to satisfy output */
4978 : /* request. This is the most general implementation. */
4979 : /* ==================================================================== */
4980 :
4981 4 : const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
4982 :
4983 : papabySrcBlock =
4984 4 : static_cast<GByte **>(CPLCalloc(sizeof(GByte *), nBandCount));
4985 : papoBlocks =
4986 4 : static_cast<GDALRasterBlock **>(CPLCalloc(sizeof(void *), nBandCount));
4987 :
4988 : /* -------------------------------------------------------------------- */
4989 : /* Select an overview level if appropriate. */
4990 : /* -------------------------------------------------------------------- */
4991 :
4992 : GDALRasterIOExtraArg sExtraArg;
4993 4 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4994 4 : const int nOverviewLevel = GDALDatasetGetBestOverviewLevel(
4995 : this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nBandCount,
4996 : panBandMap, &sExtraArg);
4997 4 : if (nOverviewLevel >= 0)
4998 : {
4999 2 : GetRasterBand(panBandMap[0])
5000 2 : ->GetOverview(nOverviewLevel)
5001 2 : ->GetBlockSize(&nBlockXSize, &nBlockYSize);
5002 : }
5003 :
5004 4 : double dfXOff = nXOff;
5005 4 : double dfYOff = nYOff;
5006 4 : double dfXSize = nXSize;
5007 4 : double dfYSize = nYSize;
5008 4 : if (sExtraArg.bFloatingPointWindowValidity)
5009 : {
5010 2 : dfXOff = sExtraArg.dfXOff;
5011 2 : dfYOff = sExtraArg.dfYOff;
5012 2 : dfXSize = sExtraArg.dfXSize;
5013 2 : dfYSize = sExtraArg.dfYSize;
5014 : }
5015 :
5016 : /* -------------------------------------------------------------------- */
5017 : /* Compute stepping increment. */
5018 : /* -------------------------------------------------------------------- */
5019 4 : const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
5020 4 : const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
5021 :
5022 4 : constexpr double EPS = 1e-10;
5023 : /* -------------------------------------------------------------------- */
5024 : /* Loop over buffer computing source locations. */
5025 : /* -------------------------------------------------------------------- */
5026 36 : for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
5027 : {
5028 : GPtrDiff_t iSrcOffset;
5029 :
5030 : // Add small epsilon to avoid some numeric precision issues.
5031 32 : const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
5032 32 : const int iSrcY = static_cast<int>(std::min(
5033 32 : std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
5034 :
5035 32 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
5036 : static_cast<GPtrDiff_t>(nLineSpace);
5037 :
5038 302 : for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff++)
5039 : {
5040 270 : const double dfSrcX = (iBufXOff + 0.5) * dfSrcXInc + dfXOff + EPS;
5041 270 : const int iSrcX = static_cast<int>(std::min(
5042 270 : std::max(0.0, dfSrcX), static_cast<double>(nRasterXSize - 1)));
5043 :
5044 : // FIXME: this code likely doesn't work if the dirty block gets
5045 : // flushed to disk before being completely written. In the meantime,
5046 : // bJustInitialize should probably be set to FALSE even if it is not
5047 : // ideal performance wise, and for lossy compression
5048 :
5049 : /* --------------------------------------------------------------------
5050 : */
5051 : /* Ensure we have the appropriate block loaded. */
5052 : /* --------------------------------------------------------------------
5053 : */
5054 270 : if (iSrcX < nLBlockX * nBlockXSize ||
5055 270 : iSrcX - nBlockXSize >= nLBlockX * nBlockXSize ||
5056 266 : iSrcY < nLBlockY * nBlockYSize ||
5057 266 : iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
5058 : {
5059 4 : nLBlockX = iSrcX / nBlockXSize;
5060 4 : nLBlockY = iSrcY / nBlockYSize;
5061 :
5062 4 : const bool bJustInitialize =
5063 0 : eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
5064 0 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
5065 4 : nXOff <= nLBlockX * nBlockXSize &&
5066 0 : nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
5067 : /*bool bMemZeroBuffer = FALSE;
5068 : if( eRWFlag == GF_Write && !bJustInitialize &&
5069 : nXOff <= nLBlockX * nBlockXSize &&
5070 : nYOff <= nLBlockY * nBlockYSize &&
5071 : (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
5072 : (nXOff + nXSize == GetRasterXSize() &&
5073 : (nLBlockX+1) * nBlockXSize > GetRasterXSize())) &&
5074 : (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
5075 : (nYOff + nYSize == GetRasterYSize() &&
5076 : (nLBlockY+1) * nBlockYSize > GetRasterYSize())) )
5077 : {
5078 : bJustInitialize = TRUE;
5079 : bMemZeroBuffer = TRUE;
5080 : }*/
5081 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
5082 : {
5083 8 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
5084 8 : if (nOverviewLevel >= 0)
5085 2 : poBand = poBand->GetOverview(nOverviewLevel);
5086 16 : poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY,
5087 8 : bJustInitialize);
5088 8 : if (poBlock == nullptr)
5089 : {
5090 0 : eErr = CE_Failure;
5091 0 : goto CleanupAndReturn;
5092 : }
5093 :
5094 8 : if (eRWFlag == GF_Write)
5095 0 : poBlock->MarkDirty();
5096 :
5097 8 : if (papoBlocks[iBand] != nullptr)
5098 0 : papoBlocks[iBand]->DropLock();
5099 :
5100 8 : papoBlocks[iBand] = poBlock;
5101 :
5102 8 : papabySrcBlock[iBand] =
5103 8 : static_cast<GByte *>(poBlock->GetDataRef());
5104 : /*if( bMemZeroBuffer )
5105 : {
5106 : memset(papabySrcBlock[iBand], 0,
5107 : static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
5108 : * nBlockYSize);
5109 : }*/
5110 : }
5111 : }
5112 :
5113 : /* --------------------------------------------------------------------
5114 : */
5115 : /* Copy over this pixel of data. */
5116 : /* --------------------------------------------------------------------
5117 : */
5118 270 : iSrcOffset = (static_cast<GPtrDiff_t>(iSrcX) -
5119 270 : static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
5120 270 : (static_cast<GPtrDiff_t>(iSrcY) -
5121 270 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
5122 270 : nBlockXSize) *
5123 270 : nBandDataSize;
5124 :
5125 980 : for (int iBand = 0; iBand < nBandCount; iBand++)
5126 : {
5127 710 : GByte *pabySrcBlock = papabySrcBlock[iBand];
5128 710 : GPtrDiff_t iBandBufOffset =
5129 710 : iBufOffset + static_cast<GPtrDiff_t>(iBand) *
5130 : static_cast<GPtrDiff_t>(nBandSpace);
5131 :
5132 710 : if (eDataType == eBufType)
5133 : {
5134 710 : if (eRWFlag == GF_Read)
5135 710 : memcpy(static_cast<GByte *>(pData) + iBandBufOffset,
5136 710 : pabySrcBlock + iSrcOffset, nBandDataSize);
5137 : else
5138 0 : memcpy(pabySrcBlock + iSrcOffset,
5139 : static_cast<const GByte *>(pData) +
5140 0 : iBandBufOffset,
5141 : nBandDataSize);
5142 : }
5143 : else
5144 : {
5145 : /* type to type conversion ... ouch, this is expensive way
5146 : of handling single words */
5147 :
5148 0 : if (eRWFlag == GF_Read)
5149 0 : GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
5150 : static_cast<GByte *>(pData) +
5151 0 : iBandBufOffset,
5152 : eBufType, 0, 1);
5153 : else
5154 0 : GDALCopyWords64(static_cast<const GByte *>(pData) +
5155 0 : iBandBufOffset,
5156 0 : eBufType, 0, pabySrcBlock + iSrcOffset,
5157 : eDataType, 0, 1);
5158 : }
5159 : }
5160 :
5161 270 : iBufOffset += static_cast<int>(nPixelSpace);
5162 : }
5163 : }
5164 :
5165 : /* -------------------------------------------------------------------- */
5166 : /* CleanupAndReturn. */
5167 : /* -------------------------------------------------------------------- */
5168 4 : CleanupAndReturn:
5169 4 : CPLFree(papabySrcBlock);
5170 4 : if (papoBlocks != nullptr)
5171 : {
5172 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
5173 : {
5174 8 : if (papoBlocks[iBand] != nullptr)
5175 8 : papoBlocks[iBand]->DropLock();
5176 : }
5177 4 : CPLFree(papoBlocks);
5178 : }
5179 :
5180 4 : return eErr;
5181 : }
5182 :
5183 : //! @endcond
5184 :
5185 : /************************************************************************/
5186 : /* GDALCopyWholeRasterGetSwathSize() */
5187 : /************************************************************************/
5188 :
5189 3375 : static void GDALCopyWholeRasterGetSwathSize(GDALRasterBand *poSrcPrototypeBand,
5190 : GDALRasterBand *poDstPrototypeBand,
5191 : int nBandCount,
5192 : int bDstIsCompressed,
5193 : int bInterleave, int *pnSwathCols,
5194 : int *pnSwathLines)
5195 : {
5196 3375 : GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
5197 3375 : int nSrcBlockXSize = 0;
5198 3375 : int nSrcBlockYSize = 0;
5199 3375 : int nBlockXSize = 0;
5200 3375 : int nBlockYSize = 0;
5201 :
5202 3375 : int nXSize = poSrcPrototypeBand->GetXSize();
5203 3375 : int nYSize = poSrcPrototypeBand->GetYSize();
5204 :
5205 3375 : poSrcPrototypeBand->GetBlockSize(&nSrcBlockXSize, &nSrcBlockYSize);
5206 3375 : poDstPrototypeBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
5207 :
5208 3375 : const int nMaxBlockXSize = std::max(nBlockXSize, nSrcBlockXSize);
5209 3375 : const int nMaxBlockYSize = std::max(nBlockYSize, nSrcBlockYSize);
5210 :
5211 3375 : int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5212 3375 : if (bInterleave)
5213 583 : nPixelSize *= nBandCount;
5214 :
5215 : // aim for one row of blocks. Do not settle for less.
5216 3375 : int nSwathCols = nXSize;
5217 3375 : int nSwathLines = nMaxBlockYSize;
5218 :
5219 : const char *pszSrcCompression =
5220 3375 : poSrcPrototypeBand->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
5221 3375 : if (pszSrcCompression == nullptr)
5222 : {
5223 3355 : auto poSrcDS = poSrcPrototypeBand->GetDataset();
5224 3355 : if (poSrcDS)
5225 : pszSrcCompression =
5226 3349 : poSrcDS->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
5227 : }
5228 :
5229 : /* -------------------------------------------------------------------- */
5230 : /* What will our swath size be? */
5231 : /* -------------------------------------------------------------------- */
5232 : // When writing interleaved data in a compressed format, we want to be sure
5233 : // that each block will only be written once, so the swath size must not be
5234 : // greater than the block cache.
5235 3375 : const char *pszSwathSize = CPLGetConfigOption("GDAL_SWATH_SIZE", nullptr);
5236 : int nTargetSwathSize;
5237 3375 : if (pszSwathSize != nullptr)
5238 0 : nTargetSwathSize = static_cast<int>(
5239 0 : std::min(GIntBig(INT_MAX), CPLAtoGIntBig(pszSwathSize)));
5240 : else
5241 : {
5242 : // As a default, take one 1/4 of the cache size.
5243 3375 : nTargetSwathSize = static_cast<int>(
5244 3375 : std::min(GIntBig(INT_MAX), GDALGetCacheMax64() / 4));
5245 :
5246 : // but if the minimum idal swath buf size is less, then go for it to
5247 : // avoid unnecessarily abusing RAM usage.
5248 : // but try to use 10 MB at least.
5249 3375 : GIntBig nIdealSwathBufSize =
5250 3375 : static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize;
5251 3375 : int nMinTargetSwathSize = 10 * 1000 * 1000;
5252 :
5253 3375 : if ((poSrcPrototypeBand->GetSuggestedBlockAccessPattern() &
5254 3375 : GSBAP_LARGEST_CHUNK_POSSIBLE) != 0)
5255 : {
5256 1 : nMinTargetSwathSize = nTargetSwathSize;
5257 : }
5258 :
5259 3375 : if (nIdealSwathBufSize < nTargetSwathSize &&
5260 3365 : nIdealSwathBufSize < nMinTargetSwathSize)
5261 : {
5262 3362 : nIdealSwathBufSize = nMinTargetSwathSize;
5263 : }
5264 :
5265 3375 : if (pszSrcCompression != nullptr &&
5266 181 : EQUAL(pszSrcCompression, "JPEG2000") &&
5267 0 : (!bDstIsCompressed || ((nSrcBlockXSize % nBlockXSize) == 0 &&
5268 0 : (nSrcBlockYSize % nBlockYSize) == 0)))
5269 : {
5270 2 : nIdealSwathBufSize =
5271 4 : std::max(nIdealSwathBufSize, static_cast<GIntBig>(nSwathCols) *
5272 2 : nSrcBlockYSize * nPixelSize);
5273 : }
5274 3375 : if (nTargetSwathSize > nIdealSwathBufSize)
5275 3362 : nTargetSwathSize = static_cast<int>(
5276 3362 : std::min(GIntBig(INT_MAX), nIdealSwathBufSize));
5277 : }
5278 :
5279 3375 : if (nTargetSwathSize < 1000000)
5280 8 : nTargetSwathSize = 1000000;
5281 :
5282 : /* But let's check that */
5283 3596 : if (bDstIsCompressed && bInterleave &&
5284 221 : nTargetSwathSize > GDALGetCacheMax64())
5285 : {
5286 0 : CPLError(CE_Warning, CPLE_AppDefined,
5287 : "When translating into a compressed interleave format, "
5288 : "the block cache size (" CPL_FRMT_GIB ") "
5289 : "should be at least the size of the swath (%d) "
5290 : "(GDAL_SWATH_SIZE config. option)",
5291 : GDALGetCacheMax64(), nTargetSwathSize);
5292 : }
5293 :
5294 : #define IS_DIVIDER_OF(x, y) ((y) % (x) == 0)
5295 : #define ROUND_TO(x, y) (((x) / (y)) * (y))
5296 :
5297 : // if both input and output datasets are tiled, that the tile dimensions
5298 : // are "compatible", try to stick to a swath dimension that is a multiple
5299 : // of input and output block dimensions.
5300 3375 : if (nBlockXSize != nXSize && nSrcBlockXSize != nXSize &&
5301 47 : IS_DIVIDER_OF(nBlockXSize, nMaxBlockXSize) &&
5302 47 : IS_DIVIDER_OF(nSrcBlockXSize, nMaxBlockXSize) &&
5303 47 : IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
5304 47 : IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
5305 : {
5306 47 : if (static_cast<GIntBig>(nMaxBlockXSize) * nMaxBlockYSize *
5307 47 : nPixelSize <=
5308 47 : static_cast<GIntBig>(nTargetSwathSize))
5309 : {
5310 47 : nSwathCols = nTargetSwathSize / (nMaxBlockYSize * nPixelSize);
5311 47 : nSwathCols = ROUND_TO(nSwathCols, nMaxBlockXSize);
5312 47 : if (nSwathCols == 0)
5313 0 : nSwathCols = nMaxBlockXSize;
5314 47 : if (nSwathCols > nXSize)
5315 45 : nSwathCols = nXSize;
5316 47 : nSwathLines = nMaxBlockYSize;
5317 :
5318 47 : if (static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize >
5319 47 : static_cast<GIntBig>(nTargetSwathSize))
5320 : {
5321 0 : nSwathCols = nXSize;
5322 0 : nSwathLines = nBlockYSize;
5323 : }
5324 : }
5325 : }
5326 :
5327 3375 : const GIntBig nMemoryPerCol = static_cast<GIntBig>(nSwathCols) * nPixelSize;
5328 3375 : const GIntBig nSwathBufSize = nMemoryPerCol * nSwathLines;
5329 3375 : if (nSwathBufSize > static_cast<GIntBig>(nTargetSwathSize))
5330 : {
5331 1 : nSwathLines = static_cast<int>(nTargetSwathSize / nMemoryPerCol);
5332 1 : if (nSwathLines == 0)
5333 1 : nSwathLines = 1;
5334 :
5335 1 : CPLDebug(
5336 : "GDAL",
5337 : "GDALCopyWholeRasterGetSwathSize(): adjusting to %d line swath "
5338 : "since requirement (" CPL_FRMT_GIB " bytes) exceed target swath "
5339 : "size (%d bytes) (GDAL_SWATH_SIZE config. option)",
5340 1 : nSwathLines, nBlockYSize * nMemoryPerCol, nTargetSwathSize);
5341 : }
5342 : // If we are processing single scans, try to handle several at once.
5343 : // If we are handling swaths already, only grow the swath if a row
5344 : // of blocks is substantially less than our target buffer size.
5345 3374 : else if (nSwathLines == 1 ||
5346 2823 : nMemoryPerCol * nSwathLines <
5347 2823 : static_cast<GIntBig>(nTargetSwathSize) / 10)
5348 : {
5349 3346 : nSwathLines = std::min(
5350 : nYSize,
5351 3346 : std::max(1, static_cast<int>(nTargetSwathSize / nMemoryPerCol)));
5352 :
5353 : /* If possible try to align to source and target block height */
5354 3346 : if ((nSwathLines % nMaxBlockYSize) != 0 &&
5355 273 : nSwathLines > nMaxBlockYSize &&
5356 273 : IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
5357 244 : IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
5358 217 : nSwathLines = ROUND_TO(nSwathLines, nMaxBlockYSize);
5359 : }
5360 :
5361 3375 : if (pszSrcCompression != nullptr && EQUAL(pszSrcCompression, "JPEG2000") &&
5362 0 : (!bDstIsCompressed || (IS_DIVIDER_OF(nBlockXSize, nSrcBlockXSize) &&
5363 0 : IS_DIVIDER_OF(nBlockYSize, nSrcBlockYSize))))
5364 : {
5365 : // Typical use case: converting from Pleaiades that is 2048x2048 tiled.
5366 2 : if (nSwathLines < nSrcBlockYSize)
5367 : {
5368 0 : nSwathLines = nSrcBlockYSize;
5369 :
5370 : // Number of pixels that can be read/write simultaneously.
5371 0 : nSwathCols = nTargetSwathSize / (nSrcBlockXSize * nPixelSize);
5372 0 : nSwathCols = ROUND_TO(nSwathCols, nSrcBlockXSize);
5373 0 : if (nSwathCols == 0)
5374 0 : nSwathCols = nSrcBlockXSize;
5375 0 : if (nSwathCols > nXSize)
5376 0 : nSwathCols = nXSize;
5377 :
5378 0 : CPLDebug(
5379 : "GDAL",
5380 : "GDALCopyWholeRasterGetSwathSize(): because of compression and "
5381 : "too high block, "
5382 : "use partial width at one time");
5383 : }
5384 2 : else if ((nSwathLines % nSrcBlockYSize) != 0)
5385 : {
5386 : /* Round on a multiple of nSrcBlockYSize */
5387 0 : nSwathLines = ROUND_TO(nSwathLines, nSrcBlockYSize);
5388 0 : CPLDebug(
5389 : "GDAL",
5390 : "GDALCopyWholeRasterGetSwathSize(): because of compression, "
5391 : "round nSwathLines to block height : %d",
5392 : nSwathLines);
5393 : }
5394 : }
5395 3373 : else if (bDstIsCompressed)
5396 : {
5397 419 : if (nSwathLines < nBlockYSize)
5398 : {
5399 146 : nSwathLines = nBlockYSize;
5400 :
5401 : // Number of pixels that can be read/write simultaneously.
5402 146 : nSwathCols = nTargetSwathSize / (nSwathLines * nPixelSize);
5403 146 : nSwathCols = ROUND_TO(nSwathCols, nBlockXSize);
5404 146 : if (nSwathCols == 0)
5405 0 : nSwathCols = nBlockXSize;
5406 146 : if (nSwathCols > nXSize)
5407 146 : nSwathCols = nXSize;
5408 :
5409 146 : CPLDebug(
5410 : "GDAL",
5411 : "GDALCopyWholeRasterGetSwathSize(): because of compression and "
5412 : "too high block, "
5413 : "use partial width at one time");
5414 : }
5415 273 : else if ((nSwathLines % nBlockYSize) != 0)
5416 : {
5417 : // Round on a multiple of nBlockYSize.
5418 9 : nSwathLines = ROUND_TO(nSwathLines, nBlockYSize);
5419 9 : CPLDebug(
5420 : "GDAL",
5421 : "GDALCopyWholeRasterGetSwathSize(): because of compression, "
5422 : "round nSwathLines to block height : %d",
5423 : nSwathLines);
5424 : }
5425 : }
5426 :
5427 3375 : *pnSwathCols = nSwathCols;
5428 3375 : *pnSwathLines = nSwathLines;
5429 3375 : }
5430 :
5431 : /************************************************************************/
5432 : /* GDALDatasetCopyWholeRaster() */
5433 : /************************************************************************/
5434 :
5435 : /**
5436 : * \brief Copy all dataset raster data.
5437 : *
5438 : * This function copies the complete raster contents of one dataset to
5439 : * another similarly configured dataset. The source and destination
5440 : * dataset must have the same number of bands, and the same width
5441 : * and height. The bands do not have to have the same data type.
5442 : *
5443 : * This function is primarily intended to support implementation of
5444 : * driver specific CreateCopy() functions. It implements efficient copying,
5445 : * in particular "chunking" the copy in substantial blocks and, if appropriate,
5446 : * performing the transfer in a pixel interleaved fashion.
5447 : *
5448 : * Currently the only papszOptions values supported are:
5449 : * <ul>
5450 : * <li>"INTERLEAVE=PIXEL/BAND" to force pixel (resp. band) interleaved read and
5451 : * write access pattern (this does not modify the layout of the destination
5452 : * data)</li>
5453 : * <li>"COMPRESSED=YES" to force alignment on target dataset block
5454 : * sizes to achieve best compression.</li>
5455 : * <li>"SKIP_HOLES=YES" to skip chunks
5456 : * for which GDALGetDataCoverageStatus() returns GDAL_DATA_COVERAGE_STATUS_EMPTY
5457 : * (GDAL >= 2.2)</li>
5458 : * </ul>
5459 : * More options may be supported in the future.
5460 : *
5461 : * @param hSrcDS the source dataset
5462 : * @param hDstDS the destination dataset
5463 : * @param papszOptions transfer hints in "StringList" Name=Value format.
5464 : * @param pfnProgress progress reporting function.
5465 : * @param pProgressData callback data for progress function.
5466 : *
5467 : * @return CE_None on success, or CE_Failure on failure.
5468 : */
5469 :
5470 3347 : CPLErr CPL_STDCALL GDALDatasetCopyWholeRaster(GDALDatasetH hSrcDS,
5471 : GDALDatasetH hDstDS,
5472 : CSLConstList papszOptions,
5473 : GDALProgressFunc pfnProgress,
5474 : void *pProgressData)
5475 :
5476 : {
5477 3347 : VALIDATE_POINTER1(hSrcDS, "GDALDatasetCopyWholeRaster", CE_Failure);
5478 3347 : VALIDATE_POINTER1(hDstDS, "GDALDatasetCopyWholeRaster", CE_Failure);
5479 :
5480 3347 : GDALDataset *poSrcDS = GDALDataset::FromHandle(hSrcDS);
5481 3347 : GDALDataset *poDstDS = GDALDataset::FromHandle(hDstDS);
5482 :
5483 3347 : if (pfnProgress == nullptr)
5484 0 : pfnProgress = GDALDummyProgress;
5485 :
5486 : /* -------------------------------------------------------------------- */
5487 : /* Confirm the datasets match in size and band counts. */
5488 : /* -------------------------------------------------------------------- */
5489 3347 : const int nXSize = poDstDS->GetRasterXSize();
5490 3347 : const int nYSize = poDstDS->GetRasterYSize();
5491 3347 : const int nBandCount = poDstDS->GetRasterCount();
5492 :
5493 3347 : if (poSrcDS->GetRasterXSize() != nXSize ||
5494 6694 : poSrcDS->GetRasterYSize() != nYSize ||
5495 3347 : poSrcDS->GetRasterCount() != nBandCount)
5496 : {
5497 0 : CPLError(CE_Failure, CPLE_AppDefined,
5498 : "Input and output dataset sizes or band counts do not\n"
5499 : "match in GDALDatasetCopyWholeRaster()");
5500 0 : return CE_Failure;
5501 : }
5502 :
5503 : /* -------------------------------------------------------------------- */
5504 : /* Report preliminary (0) progress. */
5505 : /* -------------------------------------------------------------------- */
5506 3347 : if (!pfnProgress(0.0, nullptr, pProgressData))
5507 : {
5508 1 : CPLError(CE_Failure, CPLE_UserInterrupt,
5509 : "User terminated CreateCopy()");
5510 1 : return CE_Failure;
5511 : }
5512 :
5513 : /* -------------------------------------------------------------------- */
5514 : /* Get our prototype band, and assume the others are similarly */
5515 : /* configured. */
5516 : /* -------------------------------------------------------------------- */
5517 3346 : if (nBandCount == 0)
5518 0 : return CE_None;
5519 :
5520 3346 : GDALRasterBand *poSrcPrototypeBand = poSrcDS->GetRasterBand(1);
5521 3346 : GDALRasterBand *poDstPrototypeBand = poDstDS->GetRasterBand(1);
5522 3346 : GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
5523 :
5524 : /* -------------------------------------------------------------------- */
5525 : /* Do we want to try and do the operation in a pixel */
5526 : /* interleaved fashion? */
5527 : /* -------------------------------------------------------------------- */
5528 3346 : bool bInterleave = false;
5529 : const char *pszInterleave =
5530 3346 : poSrcDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
5531 3346 : if (pszInterleave != nullptr &&
5532 2942 : (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
5533 209 : bInterleave = true;
5534 :
5535 3346 : pszInterleave = poDstDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
5536 3346 : if (pszInterleave != nullptr &&
5537 2881 : (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
5538 528 : bInterleave = true;
5539 :
5540 3346 : pszInterleave = CSLFetchNameValue(papszOptions, "INTERLEAVE");
5541 3346 : if (pszInterleave != nullptr && EQUAL(pszInterleave, "PIXEL"))
5542 5 : bInterleave = true;
5543 3341 : else if (pszInterleave != nullptr && EQUAL(pszInterleave, "BAND"))
5544 13 : bInterleave = false;
5545 : // attributes is specific to the TileDB driver
5546 3328 : else if (pszInterleave != nullptr && EQUAL(pszInterleave, "ATTRIBUTES"))
5547 4 : bInterleave = true;
5548 3324 : else if (pszInterleave != nullptr)
5549 : {
5550 0 : CPLError(CE_Warning, CPLE_NotSupported,
5551 : "Unsupported value for option INTERLEAVE");
5552 : }
5553 :
5554 : // If the destination is compressed, we must try to write blocks just once,
5555 : // to save disk space (GTiff case for example), and to avoid data loss
5556 : // (JPEG compression for example).
5557 3346 : bool bDstIsCompressed = false;
5558 : const char *pszDstCompressed =
5559 3346 : CSLFetchNameValue(papszOptions, "COMPRESSED");
5560 3346 : if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
5561 393 : bDstIsCompressed = true;
5562 :
5563 : /* -------------------------------------------------------------------- */
5564 : /* What will our swath size be? */
5565 : /* -------------------------------------------------------------------- */
5566 :
5567 3346 : int nSwathCols = 0;
5568 3346 : int nSwathLines = 0;
5569 3346 : GDALCopyWholeRasterGetSwathSize(poSrcPrototypeBand, poDstPrototypeBand,
5570 : nBandCount, bDstIsCompressed, bInterleave,
5571 : &nSwathCols, &nSwathLines);
5572 :
5573 3346 : int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5574 3346 : if (bInterleave)
5575 583 : nPixelSize *= nBandCount;
5576 :
5577 3346 : void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
5578 3346 : if (pSwathBuf == nullptr)
5579 : {
5580 0 : return CE_Failure;
5581 : }
5582 :
5583 3346 : CPLDebug("GDAL",
5584 : "GDALDatasetCopyWholeRaster(): %d*%d swaths, bInterleave=%d",
5585 : nSwathCols, nSwathLines, static_cast<int>(bInterleave));
5586 :
5587 : // Advise the source raster that we are going to read it completely
5588 : // Note: this might already have been done by GDALCreateCopy() in the
5589 : // likely case this function is indirectly called by it
5590 3346 : poSrcDS->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nBandCount,
5591 3346 : nullptr, nullptr);
5592 :
5593 : /* ==================================================================== */
5594 : /* Band oriented (uninterleaved) case. */
5595 : /* ==================================================================== */
5596 3346 : CPLErr eErr = CE_None;
5597 : const bool bCheckHoles =
5598 3346 : CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
5599 :
5600 3346 : if (!bInterleave)
5601 : {
5602 : GDALRasterIOExtraArg sExtraArg;
5603 2763 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5604 2763 : CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress); // to make cppcheck happy
5605 :
5606 8289 : const GIntBig nTotalBlocks = static_cast<GIntBig>(nBandCount) *
5607 2763 : DIV_ROUND_UP(nYSize, nSwathLines) *
5608 2763 : DIV_ROUND_UP(nXSize, nSwathCols);
5609 2763 : GIntBig nBlocksDone = 0;
5610 :
5611 7969 : for (int iBand = 0; iBand < nBandCount && eErr == CE_None; iBand++)
5612 : {
5613 5206 : int nBand = iBand + 1;
5614 :
5615 10675 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5616 : {
5617 5469 : int nThisLines = nSwathLines;
5618 :
5619 5469 : if (iY + nThisLines > nYSize)
5620 368 : nThisLines = nYSize - iY;
5621 :
5622 10938 : for (int iX = 0; iX < nXSize && eErr == CE_None;
5623 5469 : iX += nSwathCols)
5624 : {
5625 5469 : int nThisCols = nSwathCols;
5626 :
5627 5469 : if (iX + nThisCols > nXSize)
5628 0 : nThisCols = nXSize - iX;
5629 :
5630 5469 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5631 5469 : if (bCheckHoles)
5632 : {
5633 : nStatus = poSrcDS->GetRasterBand(nBand)
5634 3757 : ->GetDataCoverageStatus(
5635 : iX, iY, nThisCols, nThisLines,
5636 : GDAL_DATA_COVERAGE_STATUS_DATA);
5637 : }
5638 5469 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5639 : {
5640 5465 : sExtraArg.pfnProgress = GDALScaledProgress;
5641 10930 : sExtraArg.pProgressData = GDALCreateScaledProgress(
5642 5465 : nBlocksDone / static_cast<double>(nTotalBlocks),
5643 5465 : (nBlocksDone + 0.5) /
5644 5465 : static_cast<double>(nTotalBlocks),
5645 : pfnProgress, pProgressData);
5646 5465 : if (sExtraArg.pProgressData == nullptr)
5647 1682 : sExtraArg.pfnProgress = nullptr;
5648 :
5649 5465 : eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
5650 : nThisLines, pSwathBuf,
5651 : nThisCols, nThisLines, eDT, 1,
5652 : &nBand, 0, 0, 0, &sExtraArg);
5653 :
5654 5465 : GDALDestroyScaledProgress(sExtraArg.pProgressData);
5655 :
5656 5465 : if (eErr == CE_None)
5657 5458 : eErr = poDstDS->RasterIO(
5658 : GF_Write, iX, iY, nThisCols, nThisLines,
5659 : pSwathBuf, nThisCols, nThisLines, eDT, 1,
5660 : &nBand, 0, 0, 0, nullptr);
5661 : }
5662 :
5663 5469 : nBlocksDone++;
5664 10896 : if (eErr == CE_None &&
5665 5427 : !pfnProgress(nBlocksDone /
5666 5427 : static_cast<double>(nTotalBlocks),
5667 : nullptr, pProgressData))
5668 : {
5669 2 : eErr = CE_Failure;
5670 2 : CPLError(CE_Failure, CPLE_UserInterrupt,
5671 : "User terminated CreateCopy()");
5672 : }
5673 : }
5674 : }
5675 : }
5676 : }
5677 :
5678 : /* ==================================================================== */
5679 : /* Pixel interleaved case. */
5680 : /* ==================================================================== */
5681 : else /* if( bInterleave ) */
5682 : {
5683 : GDALRasterIOExtraArg sExtraArg;
5684 583 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5685 583 : CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress); // to make cppcheck happy
5686 :
5687 583 : const GIntBig nTotalBlocks =
5688 583 : static_cast<GIntBig>(DIV_ROUND_UP(nYSize, nSwathLines)) *
5689 583 : DIV_ROUND_UP(nXSize, nSwathCols);
5690 583 : GIntBig nBlocksDone = 0;
5691 :
5692 1388 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5693 : {
5694 805 : int nThisLines = nSwathLines;
5695 :
5696 805 : if (iY + nThisLines > nYSize)
5697 198 : nThisLines = nYSize - iY;
5698 :
5699 1615 : for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
5700 : {
5701 810 : int nThisCols = nSwathCols;
5702 :
5703 810 : if (iX + nThisCols > nXSize)
5704 3 : nThisCols = nXSize - iX;
5705 :
5706 810 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5707 810 : if (bCheckHoles)
5708 : {
5709 551 : nStatus = 0;
5710 604 : for (int iBand = 0; iBand < nBandCount; iBand++)
5711 : {
5712 585 : nStatus |= poSrcDS->GetRasterBand(iBand + 1)
5713 585 : ->GetDataCoverageStatus(
5714 : iX, iY, nThisCols, nThisLines,
5715 : GDAL_DATA_COVERAGE_STATUS_DATA);
5716 585 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5717 532 : break;
5718 : }
5719 : }
5720 810 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5721 : {
5722 791 : sExtraArg.pfnProgress = GDALScaledProgress;
5723 1582 : sExtraArg.pProgressData = GDALCreateScaledProgress(
5724 791 : nBlocksDone / static_cast<double>(nTotalBlocks),
5725 791 : (nBlocksDone + 0.5) / static_cast<double>(nTotalBlocks),
5726 : pfnProgress, pProgressData);
5727 791 : if (sExtraArg.pProgressData == nullptr)
5728 375 : sExtraArg.pfnProgress = nullptr;
5729 :
5730 791 : eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
5731 : nThisLines, pSwathBuf, nThisCols,
5732 : nThisLines, eDT, nBandCount,
5733 : nullptr, 0, 0, 0, &sExtraArg);
5734 :
5735 791 : GDALDestroyScaledProgress(sExtraArg.pProgressData);
5736 :
5737 791 : if (eErr == CE_None)
5738 790 : eErr = poDstDS->RasterIO(
5739 : GF_Write, iX, iY, nThisCols, nThisLines, pSwathBuf,
5740 : nThisCols, nThisLines, eDT, nBandCount, nullptr, 0,
5741 : 0, 0, nullptr);
5742 : }
5743 :
5744 810 : nBlocksDone++;
5745 1615 : if (eErr == CE_None &&
5746 805 : !pfnProgress(nBlocksDone /
5747 805 : static_cast<double>(nTotalBlocks),
5748 : nullptr, pProgressData))
5749 : {
5750 1 : eErr = CE_Failure;
5751 1 : CPLError(CE_Failure, CPLE_UserInterrupt,
5752 : "User terminated CreateCopy()");
5753 : }
5754 : }
5755 : }
5756 : }
5757 :
5758 : /* -------------------------------------------------------------------- */
5759 : /* Cleanup */
5760 : /* -------------------------------------------------------------------- */
5761 3346 : CPLFree(pSwathBuf);
5762 :
5763 3346 : return eErr;
5764 : }
5765 :
5766 : /************************************************************************/
5767 : /* GDALRasterBandCopyWholeRaster() */
5768 : /************************************************************************/
5769 :
5770 : /**
5771 : * \brief Copy a whole raster band
5772 : *
5773 : * This function copies the complete raster contents of one band to
5774 : * another similarly configured band. The source and destination
5775 : * bands must have the same width and height. The bands do not have
5776 : * to have the same data type.
5777 : *
5778 : * It implements efficient copying, in particular "chunking" the copy in
5779 : * substantial blocks.
5780 : *
5781 : * Currently the only papszOptions value supported are :
5782 : * <ul>
5783 : * <li>"COMPRESSED=YES" to force alignment on target dataset block sizes to
5784 : * achieve best compression.</li>
5785 : * <li>"SKIP_HOLES=YES" to skip chunks for which GDALGetDataCoverageStatus()
5786 : * returns GDAL_DATA_COVERAGE_STATUS_EMPTY (GDAL >= 2.2)</li>
5787 : * </ul>
5788 : *
5789 : * @param hSrcBand the source band
5790 : * @param hDstBand the destination band
5791 : * @param papszOptions transfer hints in "StringList" Name=Value format.
5792 : * @param pfnProgress progress reporting function.
5793 : * @param pProgressData callback data for progress function.
5794 : *
5795 : * @return CE_None on success, or CE_Failure on failure.
5796 : */
5797 :
5798 29 : CPLErr CPL_STDCALL GDALRasterBandCopyWholeRaster(
5799 : GDALRasterBandH hSrcBand, GDALRasterBandH hDstBand,
5800 : const char *const *const papszOptions, GDALProgressFunc pfnProgress,
5801 : void *pProgressData)
5802 :
5803 : {
5804 29 : VALIDATE_POINTER1(hSrcBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
5805 29 : VALIDATE_POINTER1(hDstBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
5806 :
5807 29 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
5808 29 : GDALRasterBand *poDstBand = GDALRasterBand::FromHandle(hDstBand);
5809 29 : CPLErr eErr = CE_None;
5810 :
5811 29 : if (pfnProgress == nullptr)
5812 2 : pfnProgress = GDALDummyProgress;
5813 :
5814 : /* -------------------------------------------------------------------- */
5815 : /* Confirm the datasets match in size and band counts. */
5816 : /* -------------------------------------------------------------------- */
5817 29 : int nXSize = poSrcBand->GetXSize();
5818 29 : int nYSize = poSrcBand->GetYSize();
5819 :
5820 29 : if (poDstBand->GetXSize() != nXSize || poDstBand->GetYSize() != nYSize)
5821 : {
5822 0 : CPLError(CE_Failure, CPLE_AppDefined,
5823 : "Input and output band sizes do not\n"
5824 : "match in GDALRasterBandCopyWholeRaster()");
5825 0 : return CE_Failure;
5826 : }
5827 :
5828 : /* -------------------------------------------------------------------- */
5829 : /* Report preliminary (0) progress. */
5830 : /* -------------------------------------------------------------------- */
5831 29 : if (!pfnProgress(0.0, nullptr, pProgressData))
5832 : {
5833 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
5834 : "User terminated CreateCopy()");
5835 0 : return CE_Failure;
5836 : }
5837 :
5838 29 : GDALDataType eDT = poDstBand->GetRasterDataType();
5839 :
5840 : // If the destination is compressed, we must try to write blocks just once,
5841 : // to save disk space (GTiff case for example), and to avoid data loss
5842 : // (JPEG compression for example).
5843 29 : bool bDstIsCompressed = false;
5844 : const char *pszDstCompressed =
5845 29 : CSLFetchNameValue(const_cast<char **>(papszOptions), "COMPRESSED");
5846 29 : if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
5847 26 : bDstIsCompressed = true;
5848 :
5849 : /* -------------------------------------------------------------------- */
5850 : /* What will our swath size be? */
5851 : /* -------------------------------------------------------------------- */
5852 :
5853 29 : int nSwathCols = 0;
5854 29 : int nSwathLines = 0;
5855 29 : GDALCopyWholeRasterGetSwathSize(poSrcBand, poDstBand, 1, bDstIsCompressed,
5856 : FALSE, &nSwathCols, &nSwathLines);
5857 :
5858 29 : const int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5859 :
5860 29 : void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
5861 29 : if (pSwathBuf == nullptr)
5862 : {
5863 0 : return CE_Failure;
5864 : }
5865 :
5866 29 : CPLDebug("GDAL", "GDALRasterBandCopyWholeRaster(): %d*%d swaths",
5867 : nSwathCols, nSwathLines);
5868 :
5869 : const bool bCheckHoles =
5870 29 : CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
5871 :
5872 : // Advise the source raster that we are going to read it completely
5873 29 : poSrcBand->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nullptr);
5874 :
5875 : /* ==================================================================== */
5876 : /* Band oriented (uninterleaved) case. */
5877 : /* ==================================================================== */
5878 :
5879 72 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5880 : {
5881 43 : int nThisLines = nSwathLines;
5882 :
5883 43 : if (iY + nThisLines > nYSize)
5884 8 : nThisLines = nYSize - iY;
5885 :
5886 86 : for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
5887 : {
5888 43 : int nThisCols = nSwathCols;
5889 :
5890 43 : if (iX + nThisCols > nXSize)
5891 0 : nThisCols = nXSize - iX;
5892 :
5893 43 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5894 43 : if (bCheckHoles)
5895 : {
5896 0 : nStatus = poSrcBand->GetDataCoverageStatus(
5897 : iX, iY, nThisCols, nThisLines,
5898 : GDAL_DATA_COVERAGE_STATUS_DATA);
5899 : }
5900 43 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5901 : {
5902 43 : eErr = poSrcBand->RasterIO(GF_Read, iX, iY, nThisCols,
5903 : nThisLines, pSwathBuf, nThisCols,
5904 : nThisLines, eDT, 0, 0, nullptr);
5905 :
5906 43 : if (eErr == CE_None)
5907 43 : eErr = poDstBand->RasterIO(GF_Write, iX, iY, nThisCols,
5908 : nThisLines, pSwathBuf, nThisCols,
5909 : nThisLines, eDT, 0, 0, nullptr);
5910 : }
5911 :
5912 86 : if (eErr == CE_None && !pfnProgress(double(iY + nThisLines) /
5913 43 : static_cast<double>(nYSize),
5914 : nullptr, pProgressData))
5915 : {
5916 0 : eErr = CE_Failure;
5917 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
5918 : "User terminated CreateCopy()");
5919 : }
5920 : }
5921 : }
5922 :
5923 : /* -------------------------------------------------------------------- */
5924 : /* Cleanup */
5925 : /* -------------------------------------------------------------------- */
5926 29 : CPLFree(pSwathBuf);
5927 :
5928 29 : return eErr;
5929 : }
5930 :
5931 : /************************************************************************/
5932 : /* GDALCopyRasterIOExtraArg () */
5933 : /************************************************************************/
5934 :
5935 533484 : void GDALCopyRasterIOExtraArg(GDALRasterIOExtraArg *psDestArg,
5936 : const GDALRasterIOExtraArg *psSrcArg)
5937 : {
5938 533484 : INIT_RASTERIO_EXTRA_ARG(*psDestArg);
5939 533484 : if (psSrcArg)
5940 : {
5941 533484 : psDestArg->eResampleAlg = psSrcArg->eResampleAlg;
5942 533484 : psDestArg->pfnProgress = psSrcArg->pfnProgress;
5943 533484 : psDestArg->pProgressData = psSrcArg->pProgressData;
5944 533484 : psDestArg->bFloatingPointWindowValidity =
5945 533484 : psSrcArg->bFloatingPointWindowValidity;
5946 533484 : if (psSrcArg->bFloatingPointWindowValidity)
5947 : {
5948 210512 : psDestArg->dfXOff = psSrcArg->dfXOff;
5949 210512 : psDestArg->dfYOff = psSrcArg->dfYOff;
5950 210512 : psDestArg->dfXSize = psSrcArg->dfXSize;
5951 210512 : psDestArg->dfYSize = psSrcArg->dfYSize;
5952 : }
5953 533484 : if (psSrcArg->nVersion >= 2)
5954 : {
5955 533484 : psDestArg->bUseOnlyThisScale = psSrcArg->bUseOnlyThisScale;
5956 : }
5957 533484 : if (psSrcArg->nVersion >= 3)
5958 : {
5959 533484 : psDestArg->bOperateInBufType = psSrcArg->bOperateInBufType;
5960 : }
5961 : }
5962 533484 : }
5963 :
5964 : /************************************************************************/
5965 : /* HasOnlyNoData() */
5966 : /************************************************************************/
5967 :
// Tell whether a sample matches the nodata value. The generic version is a
// plain equality test; the floating point types have explicit
// specializations that additionally handle a NaN nodata value.
template <class T> static inline bool IsEqualToNoData(T value, T noDataValue)
{
    const bool bMatches = (value == noDataValue);
    return bMatches;
}
5972 :
5973 5509 : template <> bool IsEqualToNoData<GFloat16>(GFloat16 value, GFloat16 noDataValue)
5974 : {
5975 : using std::isnan;
5976 5509 : return isnan(noDataValue) ? isnan(value) : value == noDataValue;
5977 : }
5978 :
5979 251221 : template <> bool IsEqualToNoData<float>(float value, float noDataValue)
5980 : {
5981 251221 : return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
5982 : }
5983 :
5984 264257 : template <> bool IsEqualToNoData<double>(double value, double noDataValue)
5985 : {
5986 264257 : return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
5987 : }
5988 :
// Return true if every sample of a nWidth x nHeight window equals
// noDataValue (as decided by IsEqualToNoData()).
//
// pBuffer      first sample of the window. Samples are pixel-interleaved:
//              each pixel holds nComponents consecutive values of type T.
// noDataValue  value to compare against.
// nWidth       number of pixels per line to examine.
// nHeight      number of lines to examine.
// nLineStride  distance between the start of two consecutive lines,
//              expressed in pixels (it is multiplied by nComponents to get
//              an offset in samples).
// nComponents  number of values per pixel.
//
// An empty window (nWidth == 0 or nHeight == 0) is reported as containing
// only nodata.
template <class T>
static bool HasOnlyNoDataT(const T *pBuffer, T noDataValue, size_t nWidth,
                           size_t nHeight, size_t nLineStride,
                           size_t nComponents)
{
    // Nothing to inspect. This guard is required: without it, the
    // "nWidth - 1" / "nHeight - 1" computations below would wrap around
    // (size_t is unsigned) and the probes would read far outside the buffer.
    if (nWidth == 0 || nHeight == 0)
        return true;

    const size_t nLastCol = nWidth - 1;
    const size_t nLastRow = nHeight - 1;

    // Fast test: check the 4 corners and the middle pixel. Offsets are
    // expressed in samples, for component 0 of the probed pixel.
    const size_t anProbeOffsets[] = {
        0,                                                          // top-left
        nLastCol * nComponents,                                     // top-right
        (nLastRow / 2 * nLineStride + nLastCol / 2) * nComponents,  // middle
        nLastRow * nLineStride * nComponents,                       // bottom-left
        (nLastRow * nLineStride + nLastCol) * nComponents};         // bottom-right
    for (size_t iBand = 0; iBand < nComponents; iBand++)
    {
        for (const size_t nProbeOffset : anProbeOffsets)
        {
            if (!IsEqualToNoData(pBuffer[nProbeOffset + iBand], noDataValue))
                return false;
        }
    }

    // Test all pixels.
    const size_t nValuesPerLine = nWidth * nComponents;
    for (size_t iY = 0; iY < nHeight; iY++)
    {
        const T *pBufferLine = pBuffer + iY * nLineStride * nComponents;
        for (size_t iX = 0; iX < nValuesPerLine; iX++)
        {
            if (!IsEqualToNoData(pBufferLine[iX], noDataValue))
                return false;
        }
    }
    return true;
}
6032 :
6033 : /************************************************************************/
6034 : /* GDALBufferHasOnlyNoData() */
6035 : /************************************************************************/
6036 :
// Tell whether a buffer of samples contains nothing but the nodata value.
//
// pBuffer         start of the buffer to inspect.
// dfNoDataValue   nodata value, expressed as a double.
// nWidth          number of pixels per line.
// nHeight         number of lines.
// nLineStride     line stride; the fast paths below are only taken when it
//                 equals nWidth, i.e. when lines are contiguous
//                 (NOTE(review): unit is presumably pixels, as consumed by
//                 HasOnlyNoDataT() - confirm).
// nComponents     number of values per pixel.
// nBitsPerSample  size of one sample in bits.
// nSampleFormat   GSF_UNSIGNED_INT, GSF_SIGNED_INT or GSF_FLOATING_POINT.
//
// Returns true if all samples match the nodata value. Returns false as soon
// as one sample differs, when the nodata value is rejected by
// GDALIsValueInRange<>() for the sample type, or when the
// (nBitsPerSample, nSampleFormat) combination is not handled below.
//
// Strategy:
// 1. nodata == 0, contiguous lines, non floating point: the buffer has only
//    nodata if and only if all its bytes are zero, whatever the sample size.
// 2. (SSE2 builds) nodata == 0, contiguous lines, 32 or 64 bit floating
//    point: same idea, but the sign bit is ignored so that -0.0 is accepted.
// 3. Otherwise: typed sample-by-sample comparison through HasOnlyNoDataT().
bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
                             size_t nWidth, size_t nHeight, size_t nLineStride,
                             size_t nComponents, int nBitsPerSample,
                             GDALBufferSampleFormat nSampleFormat)
{
    // In the case where the nodata is 0, we can compare several bytes at
    // once. Select the largest natural integer type for the architecture.
    if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
        // Do not use this optimized code path for floating point numbers,
        // as it can't detect negative zero.
        nSampleFormat != GSF_FLOATING_POINT)
    {
        const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
        // Buffer size in bytes: the total number of bits is rounded up to a
        // whole byte (nBitsPerSample is not required to be a multiple of 8
        // here). The product is computed on 64 bits.
        const size_t nSize =
            static_cast<size_t>((static_cast<uint64_t>(nWidth) * nHeight *
                                     nComponents * nBitsPerSample +
                                 7) /
                                8);
#ifdef HAVE_SSE2
        // n is the number of bytes that remain to be checked.
        size_t n = nSize;
        // Align to 16 bytes
        // (checking byte by byte, since _mm_load_si128() used below requires
        // a 16-byte aligned address)
        while ((reinterpret_cast<uintptr_t>(pabyBuffer) & 15) != 0 && n > 0)
        {
            --n;
            if (*pabyBuffer)
                return false;
            pabyBuffer++;
        }

        const auto zero = _mm_setzero_si128();
        constexpr int UNROLLING = 4;
        // Main loop: 4 registers of 16 bytes, i.e. 64 bytes, per iteration.
        while (n >= UNROLLING * sizeof(zero))
        {
            const auto v0 = _mm_load_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 0 * sizeof(zero)));
            const auto v1 = _mm_load_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 1 * sizeof(zero)));
            const auto v2 = _mm_load_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 2 * sizeof(zero)));
            const auto v3 = _mm_load_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 3 * sizeof(zero)));
            // OR the 4 registers together: the result is all-zero if and
            // only if each of them is all-zero.
            const auto v =
                _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
            if (!_mm_test_all_zeros(v, v))
#else
            // SSE2-only variant: compare each byte with 0 and check that
            // all 16 comparison bits are set.
            if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
#endif
            {
                return false;
            }
            pabyBuffer += UNROLLING * sizeof(zero);
            n -= UNROLLING * sizeof(zero);
        }

        // Remaining bytes (fewer than 64) are checked one by one.
        while (n > 0)
        {
            --n;
            if (*pabyBuffer)
                return false;
            pabyBuffer++;
        }
#else
#if SIZEOF_VOIDP >= 8 || defined(__x86_64__)
        // We test __x86_64__ for x32 arch where SIZEOF_VOIDP == 4
        typedef std::uint64_t WordType;
#else
        typedef std::uint32_t WordType;
#endif

        // Number of leading bytes checked individually before switching to
        // word-sized reads: the distance to the next WordType boundary
        // (a full word when the buffer is already aligned), capped to nSize.
        const size_t nInitialIters =
            std::min(sizeof(WordType) -
                         static_cast<size_t>(
                             reinterpret_cast<std::uintptr_t>(pabyBuffer) %
                             sizeof(WordType)),
                     nSize);
        size_t i = 0;
        for (; i < nInitialIters; i++)
        {
            if (pabyBuffer[i])
                return false;
        }
        // Aligned part: one whole word per iteration.
        for (; i + sizeof(WordType) - 1 < nSize; i += sizeof(WordType))
        {
            if (*(reinterpret_cast<const WordType *>(pabyBuffer + i)))
                return false;
        }
        // Trailing bytes that do not fill a whole word.
        for (; i < nSize; i++)
        {
            if (pabyBuffer[i])
                return false;
        }
#endif
        return true;
    }

#ifdef HAVE_SSE2
    // Float32 with nodata == 0 and contiguous lines: look at the raw bits of
    // the samples, ignoring the sign bit, so that both +0.0 and -0.0 are
    // accepted as nodata.
    else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
             nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
    {
        // Despite its name, this mask keeps every bit except the sign bit of
        // each 32-bit lane.
        const auto signMask = _mm_set1_epi32(0x7FFFFFFF);
        const auto zero = _mm_setzero_si128();
        const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
        // Total number of float values to check.
        const size_t n = nWidth * nHeight * nComponents;

        size_t i = 0;
        constexpr int UNROLLING = 4;
        // 4 registers of 16 bytes hold 16 floats.
        constexpr size_t VALUES_PER_ITER =
            UNROLLING * sizeof(zero) / sizeof(float);
        // Unaligned loads are used here: no alignment prologue is needed.
        for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
        {
            const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 0 * sizeof(zero)));
            const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 1 * sizeof(zero)));
            const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 2 * sizeof(zero)));
            const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 3 * sizeof(zero)));
            auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
            // Clear the sign bit (makes -0.0 become +0.0)
            v = _mm_and_si128(v, signMask);
#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
            if (!_mm_test_all_zeros(v, v))
#else
            if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
#endif
            {
                return false;
            }
            pabyBuffer += UNROLLING * sizeof(zero);
        }

        // Remaining values: same test, one value at a time. memcpy() is used
        // to read the bit pattern of the float.
        for (; i < n; i++)
        {
            uint32_t bits;
            memcpy(&bits, pabyBuffer, sizeof(bits));
            pabyBuffer += sizeof(bits);
            if ((bits & 0x7FFFFFFF) != 0)
                return false;
        }

        return true;
    }

    // Float64 counterpart of the branch above.
    else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
             nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
    {
        // Keeps every bit except the sign bit of each 64-bit lane.
        const auto signMask = _mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL);
        const auto zero = _mm_setzero_si128();
        const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
        // Total number of double values to check.
        const size_t n = nWidth * nHeight * nComponents;

        size_t i = 0;
        constexpr int UNROLLING = 4;
        // 4 registers of 16 bytes hold 8 doubles.
        constexpr size_t VALUES_PER_ITER =
            UNROLLING * sizeof(zero) / sizeof(double);
        for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
        {
            const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 0 * sizeof(zero)));
            const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 1 * sizeof(zero)));
            const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 2 * sizeof(zero)));
            const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 3 * sizeof(zero)));
            auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
            // Clear the sign bit (makes -0.0 become +0.0)
            v = _mm_and_si128(v, signMask);
#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
            if (!_mm_test_all_zeros(v, v))
#else
            if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
#endif
            {
                return false;
            }
            pabyBuffer += UNROLLING * sizeof(zero);
        }

        // Remaining values, one at a time.
        for (; i < n; i++)
        {
            uint64_t bits;
            memcpy(&bits, pabyBuffer, sizeof(bits));
            pabyBuffer += sizeof(bits);
            if ((bits & 0x7FFFFFFFFFFFFFFFULL) != 0)
                return false;
        }

        return true;
    }
#endif

    // General case: dispatch on the sample size and format. For each type,
    // the GDALIsValueInRange<>() check comes first and short-circuits to
    // false, so the nodata value is only cast to the sample type (and the
    // buffer only scanned) when that check passes.
    if (nBitsPerSample == 8 && nSampleFormat == GSF_UNSIGNED_INT)
    {
        return GDALIsValueInRange<uint8_t>(dfNoDataValue) &&
               HasOnlyNoDataT(static_cast<const uint8_t *>(pBuffer),
                              static_cast<uint8_t>(dfNoDataValue), nWidth,
                              nHeight, nLineStride, nComponents);
    }
    if (nBitsPerSample == 8 && nSampleFormat == GSF_SIGNED_INT)
    {
        // Use unsigned implementation by converting the nodatavalue to
        // unsigned
        return GDALIsValueInRange<int8_t>(dfNoDataValue) &&
               HasOnlyNoDataT(
                   static_cast<const uint8_t *>(pBuffer),
                   static_cast<uint8_t>(static_cast<int8_t>(dfNoDataValue)),
                   nWidth, nHeight, nLineStride, nComponents);
    }
    if (nBitsPerSample == 16 && nSampleFormat == GSF_UNSIGNED_INT)
    {
        return GDALIsValueInRange<uint16_t>(dfNoDataValue) &&
               HasOnlyNoDataT(static_cast<const uint16_t *>(pBuffer),
                              static_cast<uint16_t>(dfNoDataValue), nWidth,
                              nHeight, nLineStride, nComponents);
    }
    if (nBitsPerSample == 16 && nSampleFormat == GSF_SIGNED_INT)
    {
        // Use unsigned implementation by converting the nodatavalue to
        // unsigned
        return GDALIsValueInRange<int16_t>(dfNoDataValue) &&
               HasOnlyNoDataT(
                   static_cast<const uint16_t *>(pBuffer),
                   static_cast<uint16_t>(static_cast<int16_t>(dfNoDataValue)),
                   nWidth, nHeight, nLineStride, nComponents);
    }
    if (nBitsPerSample == 32 && nSampleFormat == GSF_UNSIGNED_INT)
    {
        return GDALIsValueInRange<uint32_t>(dfNoDataValue) &&
               HasOnlyNoDataT(static_cast<const uint32_t *>(pBuffer),
                              static_cast<uint32_t>(dfNoDataValue), nWidth,
                              nHeight, nLineStride, nComponents);
    }
    if (nBitsPerSample == 32 && nSampleFormat == GSF_SIGNED_INT)
    {
        // Use unsigned implementation by converting the nodatavalue to
        // unsigned
        return GDALIsValueInRange<int32_t>(dfNoDataValue) &&
               HasOnlyNoDataT(
                   static_cast<const uint32_t *>(pBuffer),
                   static_cast<uint32_t>(static_cast<int32_t>(dfNoDataValue)),
                   nWidth, nHeight, nLineStride, nComponents);
    }
    if (nBitsPerSample == 64 && nSampleFormat == GSF_UNSIGNED_INT)
    {
        return GDALIsValueInRange<uint64_t>(dfNoDataValue) &&
               HasOnlyNoDataT(static_cast<const uint64_t *>(pBuffer),
                              static_cast<uint64_t>(dfNoDataValue), nWidth,
                              nHeight, nLineStride, nComponents);
    }
    if (nBitsPerSample == 64 && nSampleFormat == GSF_SIGNED_INT)
    {
        // Use unsigned implementation by converting the nodatavalue to
        // unsigned
        return GDALIsValueInRange<int64_t>(dfNoDataValue) &&
               HasOnlyNoDataT(
                   static_cast<const uint64_t *>(pBuffer),
                   static_cast<uint64_t>(static_cast<int64_t>(dfNoDataValue)),
                   nWidth, nHeight, nLineStride, nComponents);
    }
    // For the 16 and 32 bit floating point types, a NaN nodata value
    // bypasses the range check.
    if (nBitsPerSample == 16 && nSampleFormat == GSF_FLOATING_POINT)
    {
        return (std::isnan(dfNoDataValue) ||
                GDALIsValueInRange<GFloat16>(dfNoDataValue)) &&
               HasOnlyNoDataT(static_cast<const GFloat16 *>(pBuffer),
                              static_cast<GFloat16>(dfNoDataValue), nWidth,
                              nHeight, nLineStride, nComponents);
    }
    if (nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
    {
        return (std::isnan(dfNoDataValue) ||
                GDALIsValueInRange<float>(dfNoDataValue)) &&
               HasOnlyNoDataT(static_cast<const float *>(pBuffer),
                              static_cast<float>(dfNoDataValue), nWidth,
                              nHeight, nLineStride, nComponents);
    }
    // No range check for double: the nodata value is already a double.
    if (nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
    {
        return HasOnlyNoDataT(static_cast<const double *>(pBuffer),
                              dfNoDataValue, nWidth, nHeight, nLineStride,
                              nComponents);
    }
    // Unhandled sample size / format combination.
    return false;
}
6323 :
6324 : #ifdef HAVE_SSE2
6325 :
6326 : /************************************************************************/
6327 : /* GDALDeinterleave3Byte() */
6328 : /************************************************************************/
6329 :
// De-interleave nIters 3-byte pixels: byte k of pixel i of pabySrc is
// written to pabyDestK[i] (k = 0, 1, 2).
//
// Variant compiled when HAVE_SSE2 is defined. Depending on the build and on
// the CPU, the work is delegated to GDALDeinterleave3Byte_SSSE3(), or done
// here with 32-bit word manipulations followed by a byte-wise tail loop.
//
// With GCC proper (not clang), auto-vectorization is explicitly disabled for
// this function through the "no-tree-vectorize" optimize attribute.
#if defined(__GNUC__) && !defined(__clang__)
__attribute__((optimize("no-tree-vectorize")))
#endif
static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
                                  GByte *CPL_RESTRICT pabyDest0,
                                  GByte *CPL_RESTRICT pabyDest1,
                                  GByte *CPL_RESTRICT pabyDest2, size_t nIters)
#ifdef USE_NEON_OPTIMIZATIONS
{
    // NEON builds: unconditionally use the "SSSE3" implementation
    // (presumably made available on that architecture through the sse2neon
    // header - confirm).
    return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
                                       nIters);
}
#else
{
#ifdef HAVE_SSSE3_AT_COMPILE_TIME
    // Use the SSSE3 implementation when the running CPU supports it.
    if (CPLHaveRuntimeSSSE3())
    {
        return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
                                           pabyDest2, nIters);
    }
#endif

    size_t i = 0;
    // The word-based loop reads and writes through "unsigned int" pointers,
    // so it is only used when all 4 buffers are aligned on
    // sizeof(unsigned int).
    if (((reinterpret_cast<uintptr_t>(pabySrc) |
          reinterpret_cast<uintptr_t>(pabyDest0) |
          reinterpret_cast<uintptr_t>(pabyDest1) |
          reinterpret_cast<uintptr_t>(pabyDest2)) %
         sizeof(unsigned int)) == 0)
    {
        // Slightly better than GCC autovectorizer
        // Each iteration handles 4 pixels: i counts pixels, j counts groups
        // of 4 pixels. The 12 source bytes are read as three 32-bit words.
        // Assuming little-endian byte order (this branch is not compiled for
        // NEON builds), with Cn_p = component n of pixel p:
        //   word0 = C0_0 C1_0 C2_0 C0_1
        //   word1 = C1_1 C2_1 C0_2 C1_2
        //   word2 = C2_2 C0_3 C1_3 C2_3
        // (lowest address / least significant byte first).
        for (size_t j = 0; i + 3 < nIters; i += 4, ++j)
        {
            unsigned int word0 =
                *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i);
            unsigned int word1 =
                *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 4);
            unsigned int word2 =
                *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 8);
            // Component 0 of the 4 pixels, assembled in one word.
            reinterpret_cast<unsigned int *>(pabyDest0)[j] =
                (word0 & 0xff) | ((word0 >> 24) << 8) | (word1 & 0x00ff0000) |
                ((word2 >> 8) << 24);
            // Component 1 of the 4 pixels, assembled in one word.
            reinterpret_cast<unsigned int *>(pabyDest1)[j] =
                ((word0 >> 8) & 0xff) | ((word1 & 0xff) << 8) |
                (((word1 >> 24)) << 16) | ((word2 >> 16) << 24);
            // Component 2 of the 4 pixels, written byte by byte.
            pabyDest2[j * 4] = static_cast<GByte>(word0 >> 16);
            pabyDest2[j * 4 + 1] = static_cast<GByte>(word1 >> 8);
            pabyDest2[j * 4 + 2] = static_cast<GByte>(word2);
            pabyDest2[j * 4 + 3] = static_cast<GByte>(word2 >> 24);
        }
    }
    // Remaining pixels (all of them if the buffers were not aligned).
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
    for (; i < nIters; ++i)
    {
        pabyDest0[i] = pabySrc[3 * i + 0];
        pabyDest1[i] = pabySrc[3 * i + 1];
        pabyDest2[i] = pabySrc[3 * i + 2];
    }
}
#endif
6391 :
6392 : /************************************************************************/
6393 : /* GDALDeinterleave4Byte() */
6394 : /************************************************************************/
6395 :
6396 : #if !defined(__GNUC__) || defined(__clang__)
6397 :
6398 : /************************************************************************/
6399 : /* deinterleave() */
6400 : /************************************************************************/
6401 :
// Extract one byte out of each of the 16 int32 values held by the 4 input
// registers, and return those 16 bytes packed in a single register (values
// of xmm0_ori first, then xmm1_ori, xmm2_ori and xmm3_ori).
//
// SHIFT: when true, the 4 input registers are first shifted right by 8 bits
//        IN PLACE (they are passed by reference), so that successive calls
//        walk through the successive bytes of each int32.
// MASK:  when true, only the lowest byte of each int32 is kept before
//        packing. It may be false when the upper 24 bits are already zero.
template <bool SHIFT, bool MASK>
inline __m128i deinterleave(__m128i &xmm0_ori, __m128i &xmm1_ori,
                            __m128i &xmm2_ori, __m128i &xmm3_ori)
{
    if (SHIFT)
    {
        // Bring the next byte of each int32 into the lowest position.
        constexpr int BITS_PER_BYTE = 8;
        xmm0_ori = _mm_srli_epi32(xmm0_ori, BITS_PER_BYTE);
        xmm1_ori = _mm_srli_epi32(xmm1_ori, BITS_PER_BYTE);
        xmm2_ori = _mm_srli_epi32(xmm2_ori, BITS_PER_BYTE);
        xmm3_ori = _mm_srli_epi32(xmm3_ori, BITS_PER_BYTE);
    }

    __m128i packed01;
    __m128i packed23;
    if (MASK)
    {
        // Set higher 24bit of each int32 packed word to 0, then
        // pack int32 to int16
        const __m128i lowByteMask = _mm_set1_epi32(0xff);
        packed01 = _mm_packs_epi32(_mm_and_si128(xmm0_ori, lowByteMask),
                                   _mm_and_si128(xmm1_ori, lowByteMask));
        packed23 = _mm_packs_epi32(_mm_and_si128(xmm2_ori, lowByteMask),
                                   _mm_and_si128(xmm3_ori, lowByteMask));
    }
    else
    {
        // Pack int32 to int16
        packed01 = _mm_packs_epi32(xmm0_ori, xmm1_ori);
        packed23 = _mm_packs_epi32(xmm2_ori, xmm3_ori);
    }

    // Pack int16 to uint8
    return _mm_packus_epi16(packed01, packed23);
}
6440 :
// De-interleave nIters 4-byte pixels: byte k of pixel i of pabySrc is
// written to pabyDestK[i] (k = 0, 1, 2, 3).
//
// Variant compiled when HAVE_SSE2 is defined and the compiler is not GCC
// proper. Depending on the build and on the CPU, the work is delegated to
// GDALDeinterleave4Byte_SSSE3(), or done here with SSE2 intrinsics followed
// by a byte-wise tail loop.
static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
                                  GByte *CPL_RESTRICT pabyDest0,
                                  GByte *CPL_RESTRICT pabyDest1,
                                  GByte *CPL_RESTRICT pabyDest2,
                                  GByte *CPL_RESTRICT pabyDest3, size_t nIters)
#ifdef USE_NEON_OPTIMIZATIONS
{
    // NEON builds: unconditionally use the "SSSE3" implementation
    // (presumably made available on that architecture through the sse2neon
    // header - confirm).
    return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
                                       pabyDest3, nIters);
}
#else
{
#ifdef HAVE_SSSE3_AT_COMPILE_TIME
    // Use the SSSE3 implementation when the running CPU supports it.
    if (CPLHaveRuntimeSSSE3())
    {
        return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
                                           pabyDest2, pabyDest3, nIters);
    }
#endif

    // Not the optimal SSE2-only code, as gcc auto-vectorizer manages to
    // do something slightly better.
    size_t i = 0;
    // 16 pixels (64 source bytes, 4 registers of 4 int32) per iteration.
    for (; i + 15 < nIters; i += 16)
    {
        __m128i xmm0_ori = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
        __m128i xmm1_ori = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
        __m128i xmm2_ori = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
        __m128i xmm3_ori = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));

        // The order of the 4 calls below matters: deinterleave<true, ...>
        // shifts the xmmN_ori registers right by 8 bits in place, so each
        // call after the first one sees the next byte of every pixel.
        // Component 0: no shift, keep the lowest byte.
        _mm_storeu_si128(
            reinterpret_cast<__m128i *>(pabyDest0 + i),
            deinterleave<false, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
        // Component 1.
        _mm_storeu_si128(
            reinterpret_cast<__m128i *>(pabyDest1 + i),
            deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
        // Component 2.
        _mm_storeu_si128(
            reinterpret_cast<__m128i *>(pabyDest2 + i),
            deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
        // Component 3: after 3 shifts of 8 bits, the upper 24 bits of each
        // int32 are already zero, so no masking is needed.
        _mm_storeu_si128(
            reinterpret_cast<__m128i *>(pabyDest3 + i),
            deinterleave<true, false>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    }

    // Remaining pixels (fewer than 16), one at a time.
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
    for (; i < nIters; ++i)
    {
        pabyDest0[i] = pabySrc[4 * i + 0];
        pabyDest1[i] = pabySrc[4 * i + 1];
        pabyDest2[i] = pabySrc[4 * i + 2];
        pabyDest3[i] = pabySrc[4 * i + 3];
    }
}
#endif
6501 : #else
// GCC autovectorizer does an excellent job
//
// De-interleave nIters 4-byte pixels: byte k of pixel i of pabySrc is
// written to pabyDestK[i] (k = 0, 1, 2, 3).
//
// Variant compiled with GCC proper when HAVE_SSE2 is defined: a plain scalar
// loop, for which vectorization is requested through the "tree-vectorize"
// optimize attribute.
__attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
    const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
    GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2,
    GByte *CPL_RESTRICT pabyDest3, size_t nIters)
{
    for (size_t i = 0; i < nIters; ++i)
    {
        pabyDest0[i] = pabySrc[4 * i + 0];
        pabyDest1[i] = pabySrc[4 * i + 1];
        pabyDest2[i] = pabySrc[4 * i + 2];
        pabyDest3[i] = pabySrc[4 * i + 3];
    }
}
6516 : #endif
6517 :
6518 : #else
6519 :
6520 : /************************************************************************/
6521 : /* GDALDeinterleave3Byte() */
6522 : /************************************************************************/
6523 :
6524 : // TODO: Enabling below could help on non-Intel architectures where GCC knows
6525 : // how to auto-vectorize
6526 : // #if defined(__GNUC__)
6527 : //__attribute__((optimize("tree-vectorize")))
6528 : // #endif
6529 : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
6530 : GByte *CPL_RESTRICT pabyDest0,
6531 : GByte *CPL_RESTRICT pabyDest1,
6532 : GByte *CPL_RESTRICT pabyDest2, size_t nIters)
6533 : {
6534 : for (size_t i = 0; i < nIters; ++i)
6535 : {
6536 : pabyDest0[i] = pabySrc[3 * i + 0];
6537 : pabyDest1[i] = pabySrc[3 * i + 1];
6538 : pabyDest2[i] = pabySrc[3 * i + 2];
6539 : }
6540 : }
6541 :
6542 : /************************************************************************/
6543 : /* GDALDeinterleave4Byte() */
6544 : /************************************************************************/
6545 :
6546 : // TODO: Enabling below could help on non-Intel architectures where gcc knows
6547 : // how to auto-vectorize
6548 : // #if defined(__GNUC__)
6549 : //__attribute__((optimize("tree-vectorize")))
6550 : // #endif
6551 : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
6552 : GByte *CPL_RESTRICT pabyDest0,
6553 : GByte *CPL_RESTRICT pabyDest1,
6554 : GByte *CPL_RESTRICT pabyDest2,
6555 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
6556 : {
6557 : for (size_t i = 0; i < nIters; ++i)
6558 : {
6559 : pabyDest0[i] = pabySrc[4 * i + 0];
6560 : pabyDest1[i] = pabySrc[4 * i + 1];
6561 : pabyDest2[i] = pabySrc[4 * i + 2];
6562 : pabyDest3[i] = pabySrc[4 * i + 3];
6563 : }
6564 : }
6565 :
6566 : #endif
6567 :
6568 : /************************************************************************/
6569 : /* GDALDeinterleave() */
6570 : /************************************************************************/
6571 :
6572 : /*! Copy values from a pixel-interleave buffer to multiple per-component
6573 : buffers.
6574 :
6575 : In pseudo-code
6576 : \verbatim
6577 : for(size_t i = 0; i < nIters; ++i)
6578 : for(int iComp = 0; iComp < nComponents; iComp++ )
6579 : ppDestBuffer[iComp][i] = pSourceBuffer[nComponents * i + iComp]
6580 : \endverbatim
6581 :
6582 : The implementation is optimized for a few cases, like de-interleaving
6583 : of 3 or 4-components Byte buffers.
6584 :
6585 : \since GDAL 3.6
6586 : */
6587 454283 : void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
6588 : int nComponents, void **ppDestBuffer,
6589 : GDALDataType eDestDT, size_t nIters)
6590 : {
6591 454283 : if (eSourceDT == eDestDT)
6592 : {
6593 454261 : if (eSourceDT == GDT_UInt8 || eSourceDT == GDT_Int8)
6594 : {
6595 453940 : if (nComponents == 3)
6596 : {
6597 380714 : const GByte *CPL_RESTRICT pabySrc =
6598 : static_cast<const GByte *>(pSourceBuffer);
6599 380714 : GByte *CPL_RESTRICT pabyDest0 =
6600 : static_cast<GByte *>(ppDestBuffer[0]);
6601 380714 : GByte *CPL_RESTRICT pabyDest1 =
6602 : static_cast<GByte *>(ppDestBuffer[1]);
6603 380714 : GByte *CPL_RESTRICT pabyDest2 =
6604 : static_cast<GByte *>(ppDestBuffer[2]);
6605 380714 : GDALDeinterleave3Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6606 : nIters);
6607 380714 : return;
6608 : }
6609 73226 : else if (nComponents == 4)
6610 : {
6611 73219 : const GByte *CPL_RESTRICT pabySrc =
6612 : static_cast<const GByte *>(pSourceBuffer);
6613 73219 : GByte *CPL_RESTRICT pabyDest0 =
6614 : static_cast<GByte *>(ppDestBuffer[0]);
6615 73219 : GByte *CPL_RESTRICT pabyDest1 =
6616 : static_cast<GByte *>(ppDestBuffer[1]);
6617 73219 : GByte *CPL_RESTRICT pabyDest2 =
6618 : static_cast<GByte *>(ppDestBuffer[2]);
6619 73219 : GByte *CPL_RESTRICT pabyDest3 =
6620 : static_cast<GByte *>(ppDestBuffer[3]);
6621 73219 : GDALDeinterleave4Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6622 : pabyDest3, nIters);
6623 73219 : return;
6624 7 : }
6625 : }
6626 : #if ((defined(__GNUC__) && !defined(__clang__)) || \
6627 : defined(__INTEL_CLANG_COMPILER)) && \
6628 : defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME)
6629 642 : else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&
6630 321 : CPLHaveRuntimeSSSE3())
6631 : {
6632 321 : if (nComponents == 3)
6633 : {
6634 126 : const GUInt16 *CPL_RESTRICT panSrc =
6635 : static_cast<const GUInt16 *>(pSourceBuffer);
6636 126 : GUInt16 *CPL_RESTRICT panDest0 =
6637 : static_cast<GUInt16 *>(ppDestBuffer[0]);
6638 126 : GUInt16 *CPL_RESTRICT panDest1 =
6639 : static_cast<GUInt16 *>(ppDestBuffer[1]);
6640 126 : GUInt16 *CPL_RESTRICT panDest2 =
6641 : static_cast<GUInt16 *>(ppDestBuffer[2]);
6642 126 : GDALDeinterleave3UInt16_SSSE3(panSrc, panDest0, panDest1,
6643 : panDest2, nIters);
6644 126 : return;
6645 : }
6646 : #if !defined(__INTEL_CLANG_COMPILER)
6647 : // ICC autovectorizer doesn't do a good job, at least with icx
6648 : // 2022.1.0.20220316
6649 195 : else if (nComponents == 4)
6650 : {
6651 195 : const GUInt16 *CPL_RESTRICT panSrc =
6652 : static_cast<const GUInt16 *>(pSourceBuffer);
6653 195 : GUInt16 *CPL_RESTRICT panDest0 =
6654 : static_cast<GUInt16 *>(ppDestBuffer[0]);
6655 195 : GUInt16 *CPL_RESTRICT panDest1 =
6656 : static_cast<GUInt16 *>(ppDestBuffer[1]);
6657 195 : GUInt16 *CPL_RESTRICT panDest2 =
6658 : static_cast<GUInt16 *>(ppDestBuffer[2]);
6659 195 : GUInt16 *CPL_RESTRICT panDest3 =
6660 : static_cast<GUInt16 *>(ppDestBuffer[3]);
6661 195 : GDALDeinterleave4UInt16_SSSE3(panSrc, panDest0, panDest1,
6662 : panDest2, panDest3, nIters);
6663 195 : return;
6664 : }
6665 : #endif
6666 : }
6667 : #endif
6668 : }
6669 :
6670 29 : const int nSourceDTSize = GDALGetDataTypeSizeBytes(eSourceDT);
6671 29 : const int nDestDTSize = GDALGetDataTypeSizeBytes(eDestDT);
6672 108 : for (int iComp = 0; iComp < nComponents; iComp++)
6673 : {
6674 79 : GDALCopyWords64(static_cast<const GByte *>(pSourceBuffer) +
6675 79 : iComp * nSourceDTSize,
6676 : eSourceDT, nComponents * nSourceDTSize,
6677 79 : ppDestBuffer[iComp], eDestDT, nDestDTSize, nIters);
6678 : }
6679 : }
6680 :
6681 : /************************************************************************/
6682 : /* GDALTranspose2DSingleToSingle() */
6683 : /************************************************************************/
6684 : /**
6685 : * Transpose a 2D array of non-complex values, in a efficient (cache-oblivious) way.
6686 : *
6687 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6688 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6689 : * @param nSrcWidth Width of pSrc array.
6690 : * @param nSrcHeight Height of pSrc array.
6691 : */
6692 :
6693 : template <class DST, class SRC>
6694 160 : void GDALTranspose2DSingleToSingle(const SRC *CPL_RESTRICT pSrc,
6695 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6696 : size_t nSrcHeight)
6697 : {
6698 160 : constexpr size_t blocksize = 32;
6699 345 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6700 : {
6701 185 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6702 5016 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6703 : {
6704 : // transpose the block beginning at [i,j]
6705 4831 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6706 26185 : for (size_t k = i; k < max_k; ++k)
6707 : {
6708 669282 : for (size_t l = j; l < max_l; ++l)
6709 : {
6710 647928 : GDALCopyWord(pSrc[l + k * nSrcWidth],
6711 647928 : pDst[k + l * nSrcHeight]);
6712 : }
6713 : }
6714 : }
6715 : }
6716 160 : }
6717 :
6718 : /************************************************************************/
6719 : /* GDALTranspose2DComplexToComplex() */
6720 : /************************************************************************/
6721 : /**
6722 : * Transpose a 2D array of complex values into an array of complex values,
6723 : * in a efficient (cache-oblivious) way.
6724 : *
6725 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6726 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6727 : * @param nSrcWidth Width of pSrc array.
6728 : * @param nSrcHeight Height of pSrc array.
6729 : */
6730 : template <class DST, class SRC>
6731 25 : void GDALTranspose2DComplexToComplex(const SRC *CPL_RESTRICT pSrc,
6732 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6733 : size_t nSrcHeight)
6734 : {
6735 25 : constexpr size_t blocksize = 32;
6736 50 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6737 : {
6738 25 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6739 50 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6740 : {
6741 : // transpose the block beginning at [i,j]
6742 25 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6743 75 : for (size_t k = i; k < max_k; ++k)
6744 : {
6745 200 : for (size_t l = j; l < max_l; ++l)
6746 : {
6747 150 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
6748 150 : pDst[2 * (k + l * nSrcHeight) + 0]);
6749 150 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 1],
6750 150 : pDst[2 * (k + l * nSrcHeight) + 1]);
6751 : }
6752 : }
6753 : }
6754 : }
6755 25 : }
6756 :
6757 : /************************************************************************/
6758 : /* GDALTranspose2DComplexToSingle() */
6759 : /************************************************************************/
6760 : /**
6761 : * Transpose a 2D array of complex values into an array of non-complex values,
6762 : * in a efficient (cache-oblivious) way.
6763 : *
6764 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6765 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6766 : * @param nSrcWidth Width of pSrc array.
6767 : * @param nSrcHeight Height of pSrc array.
6768 : */
6769 : template <class DST, class SRC>
6770 55 : void GDALTranspose2DComplexToSingle(const SRC *CPL_RESTRICT pSrc,
6771 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6772 : size_t nSrcHeight)
6773 : {
6774 55 : constexpr size_t blocksize = 32;
6775 110 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6776 : {
6777 55 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6778 110 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6779 : {
6780 : // transpose the block beginning at [i,j]
6781 55 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6782 165 : for (size_t k = i; k < max_k; ++k)
6783 : {
6784 440 : for (size_t l = j; l < max_l; ++l)
6785 : {
6786 330 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
6787 330 : pDst[k + l * nSrcHeight]);
6788 : }
6789 : }
6790 : }
6791 : }
6792 55 : }
6793 :
6794 : /************************************************************************/
6795 : /* GDALTranspose2DSingleToComplex() */
6796 : /************************************************************************/
6797 : /**
6798 : * Transpose a 2D array of non-complex values into an array of complex values,
6799 : * in a efficient (cache-oblivious) way.
6800 : *
6801 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6802 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6803 : * @param nSrcWidth Width of pSrc array.
6804 : * @param nSrcHeight Height of pSrc array.
6805 : */
6806 : template <class DST, class SRC>
6807 55 : void GDALTranspose2DSingleToComplex(const SRC *CPL_RESTRICT pSrc,
6808 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6809 : size_t nSrcHeight)
6810 : {
6811 55 : constexpr size_t blocksize = 32;
6812 110 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6813 : {
6814 55 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6815 110 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6816 : {
6817 : // transpose the block beginning at [i,j]
6818 55 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6819 165 : for (size_t k = i; k < max_k; ++k)
6820 : {
6821 440 : for (size_t l = j; l < max_l; ++l)
6822 : {
6823 330 : GDALCopyWord(pSrc[l + k * nSrcWidth],
6824 330 : pDst[2 * (k + l * nSrcHeight) + 0]);
6825 330 : pDst[2 * (k + l * nSrcHeight) + 1] = 0;
6826 : }
6827 : }
6828 : }
6829 : }
6830 55 : }
6831 :
6832 : /************************************************************************/
6833 : /* GDALTranspose2D() */
6834 : /************************************************************************/
6835 :
6836 : template <class DST, bool DST_IS_COMPLEX>
6837 295 : static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
6838 : size_t nSrcWidth, size_t nSrcHeight)
6839 : {
6840 : #define CALL_GDALTranspose2D_internal(SRC_TYPE) \
6841 : do \
6842 : { \
6843 : if constexpr (DST_IS_COMPLEX) \
6844 : { \
6845 : GDALTranspose2DSingleToComplex( \
6846 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6847 : nSrcHeight); \
6848 : } \
6849 : else \
6850 : { \
6851 : GDALTranspose2DSingleToSingle(static_cast<const SRC_TYPE *>(pSrc), \
6852 : pDst, nSrcWidth, nSrcHeight); \
6853 : } \
6854 : } while (0)
6855 :
6856 : #define CALL_GDALTranspose2DComplex_internal(SRC_TYPE) \
6857 : do \
6858 : { \
6859 : if constexpr (DST_IS_COMPLEX) \
6860 : { \
6861 : GDALTranspose2DComplexToComplex( \
6862 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6863 : nSrcHeight); \
6864 : } \
6865 : else \
6866 : { \
6867 : GDALTranspose2DComplexToSingle( \
6868 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6869 : nSrcHeight); \
6870 : } \
6871 : } while (0)
6872 :
6873 : // clang-format off
6874 295 : switch (eSrcType)
6875 : {
6876 16 : case GDT_UInt8: CALL_GDALTranspose2D_internal(uint8_t); break;
6877 15 : case GDT_Int8: CALL_GDALTranspose2D_internal(int8_t); break;
6878 33 : case GDT_UInt16: CALL_GDALTranspose2D_internal(uint16_t); break;
6879 20 : case GDT_Int16: CALL_GDALTranspose2D_internal(int16_t); break;
6880 24 : case GDT_UInt32: CALL_GDALTranspose2D_internal(uint32_t); break;
6881 16 : case GDT_Int32: CALL_GDALTranspose2D_internal(int32_t); break;
6882 16 : case GDT_UInt64: CALL_GDALTranspose2D_internal(uint64_t); break;
6883 16 : case GDT_Int64: CALL_GDALTranspose2D_internal(int64_t); break;
6884 16 : case GDT_Float16: CALL_GDALTranspose2D_internal(GFloat16); break;
6885 19 : case GDT_Float32: CALL_GDALTranspose2D_internal(float); break;
6886 24 : case GDT_Float64: CALL_GDALTranspose2D_internal(double); break;
6887 16 : case GDT_CInt16: CALL_GDALTranspose2DComplex_internal(int16_t); break;
6888 16 : case GDT_CInt32: CALL_GDALTranspose2DComplex_internal(int32_t); break;
6889 16 : case GDT_CFloat16: CALL_GDALTranspose2DComplex_internal(GFloat16); break;
6890 16 : case GDT_CFloat32: CALL_GDALTranspose2DComplex_internal(float); break;
6891 16 : case GDT_CFloat64: CALL_GDALTranspose2DComplex_internal(double); break;
6892 0 : case GDT_Unknown:
6893 : case GDT_TypeCount:
6894 0 : break;
6895 : }
6896 : // clang-format on
6897 :
6898 : #undef CALL_GDALTranspose2D_internal
6899 : #undef CALL_GDALTranspose2DComplex_internal
6900 295 : }
6901 :
6902 : /************************************************************************/
6903 : /* GDALInterleave2Byte() */
6904 : /************************************************************************/
6905 :
6906 : #if defined(HAVE_SSE2) && \
6907 : (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
6908 :
6909 : // ICC autovectorizer doesn't do a good job at generating good SSE code,
6910 : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
6911 : #if defined(__GNUC__)
6912 : __attribute__((noinline))
6913 : #endif
6914 : static void GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
6915 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
6916 : {
6917 : size_t i = 0;
6918 : constexpr size_t VALS_PER_ITER = 16;
6919 : for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
6920 : {
6921 : __m128i xmm0 =
6922 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + i));
6923 : __m128i xmm1 = _mm_loadu_si128(
6924 : reinterpret_cast<__m128i const *>(pSrc + i + nIters));
6925 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + 2 * i),
6926 : _mm_unpacklo_epi8(xmm0, xmm1));
6927 : _mm_storeu_si128(
6928 : reinterpret_cast<__m128i *>(pDst + 2 * i + VALS_PER_ITER),
6929 : _mm_unpackhi_epi8(xmm0, xmm1));
6930 : }
6931 : #if defined(__clang__)
6932 : #pragma clang loop vectorize(disable)
6933 : #endif
6934 : for (; i < nIters; ++i)
6935 : {
6936 : pDst[2 * i + 0] = pSrc[i + 0 * nIters];
6937 : pDst[2 * i + 1] = pSrc[i + 1 * nIters];
6938 : }
6939 : }
6940 :
6941 : #else
6942 :
6943 : #if defined(__GNUC__) && !defined(__clang__)
6944 : __attribute__((optimize("tree-vectorize")))
6945 : #endif
6946 : #if defined(__GNUC__)
6947 : __attribute__((noinline))
6948 : #endif
6949 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6950 : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
6951 : #pragma clang diagnostic push
6952 : #pragma clang diagnostic ignored "-Wpass-failed"
6953 : #endif
6954 9 : static void GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
6955 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
6956 : {
6957 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6958 : #pragma clang loop vectorize(enable)
6959 : #endif
6960 355429 : for (size_t i = 0; i < nIters; ++i)
6961 : {
6962 355420 : pDst[2 * i + 0] = pSrc[i + 0 * nIters];
6963 355420 : pDst[2 * i + 1] = pSrc[i + 1 * nIters];
6964 : }
6965 9 : }
6966 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6967 : #pragma clang diagnostic pop
6968 : #endif
6969 :
6970 : #endif
6971 :
6972 : /************************************************************************/
6973 : /* GDALInterleave4Byte() */
6974 : /************************************************************************/
6975 :
6976 : #if defined(HAVE_SSE2) && \
6977 : (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
6978 :
6979 : // ICC autovectorizer doesn't do a good job at generating good SSE code,
6980 : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
6981 : #if defined(__GNUC__)
6982 : __attribute__((noinline))
6983 : #endif
6984 : static void GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
6985 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
6986 : {
6987 : size_t i = 0;
6988 : constexpr size_t VALS_PER_ITER = 16;
6989 : for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
6990 : {
6991 : __m128i xmm0 = _mm_loadu_si128(
6992 : reinterpret_cast<__m128i const *>(pSrc + i + 0 * nIters));
6993 : __m128i xmm1 = _mm_loadu_si128(
6994 : reinterpret_cast<__m128i const *>(pSrc + i + 1 * nIters));
6995 : __m128i xmm2 = _mm_loadu_si128(
6996 : reinterpret_cast<__m128i const *>(pSrc + i + 2 * nIters));
6997 : __m128i xmm3 = _mm_loadu_si128(
6998 : reinterpret_cast<__m128i const *>(pSrc + i + 3 * nIters));
6999 : auto tmp0 = _mm_unpacklo_epi8(
7000 : xmm0,
7001 : xmm1); // (xmm0_0, xmm1_0, xmm0_1, xmm1_1, xmm0_2, xmm1_2, ...)
7002 : auto tmp1 = _mm_unpackhi_epi8(
7003 : xmm0,
7004 : xmm1); // (xmm0_8, xmm1_8, xmm0_9, xmm1_9, xmm0_10, xmm1_10, ...)
7005 : auto tmp2 = _mm_unpacklo_epi8(
7006 : xmm2,
7007 : xmm3); // (xmm2_0, xmm3_0, xmm2_1, xmm3_1, xmm2_2, xmm3_2, ...)
7008 : auto tmp3 = _mm_unpackhi_epi8(
7009 : xmm2,
7010 : xmm3); // (xmm2_8, xmm3_8, xmm2_9, xmm3_9, xmm2_10, xmm3_10, ...)
7011 : auto tmp2_0 = _mm_unpacklo_epi16(
7012 : tmp0,
7013 : tmp2); // (xmm0_0, xmm1_0, xmm2_0, xmm3_0, xmm0_1, xmm1_1, xmm2_1, xmm3_1, ...)
7014 : auto tmp2_1 = _mm_unpackhi_epi16(tmp0, tmp2);
7015 : auto tmp2_2 = _mm_unpacklo_epi16(tmp1, tmp3);
7016 : auto tmp2_3 = _mm_unpackhi_epi16(tmp1, tmp3);
7017 : _mm_storeu_si128(
7018 : reinterpret_cast<__m128i *>(pDst + 4 * i + 0 * VALS_PER_ITER),
7019 : tmp2_0);
7020 : _mm_storeu_si128(
7021 : reinterpret_cast<__m128i *>(pDst + 4 * i + 1 * VALS_PER_ITER),
7022 : tmp2_1);
7023 : _mm_storeu_si128(
7024 : reinterpret_cast<__m128i *>(pDst + 4 * i + 2 * VALS_PER_ITER),
7025 : tmp2_2);
7026 : _mm_storeu_si128(
7027 : reinterpret_cast<__m128i *>(pDst + 4 * i + 3 * VALS_PER_ITER),
7028 : tmp2_3);
7029 : }
7030 : #if defined(__clang__)
7031 : #pragma clang loop vectorize(disable)
7032 : #endif
7033 : for (; i < nIters; ++i)
7034 : {
7035 : pDst[4 * i + 0] = pSrc[i + 0 * nIters];
7036 : pDst[4 * i + 1] = pSrc[i + 1 * nIters];
7037 : pDst[4 * i + 2] = pSrc[i + 2 * nIters];
7038 : pDst[4 * i + 3] = pSrc[i + 3 * nIters];
7039 : }
7040 : }
7041 :
7042 : #else
7043 :
7044 : #if defined(__GNUC__) && !defined(__clang__)
7045 : __attribute__((optimize("tree-vectorize")))
7046 : #endif
7047 : #if defined(__GNUC__)
7048 : __attribute__((noinline))
7049 : #endif
7050 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7051 : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
7052 : #pragma clang diagnostic push
7053 : #pragma clang diagnostic ignored "-Wpass-failed"
7054 : #endif
7055 30 : static void GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
7056 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
7057 : {
7058 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7059 : #pragma clang loop vectorize(enable)
7060 : #endif
7061 49620700 : for (size_t i = 0; i < nIters; ++i)
7062 : {
7063 49620600 : pDst[4 * i + 0] = pSrc[i + 0 * nIters];
7064 49620600 : pDst[4 * i + 1] = pSrc[i + 1 * nIters];
7065 49620600 : pDst[4 * i + 2] = pSrc[i + 2 * nIters];
7066 49620600 : pDst[4 * i + 3] = pSrc[i + 3 * nIters];
7067 : }
7068 30 : }
7069 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7070 : #pragma clang diagnostic pop
7071 : #endif
7072 :
7073 : #endif
7074 :
7075 : /************************************************************************/
7076 : /* GDALTranspose2D() */
7077 : /************************************************************************/
7078 :
7079 : /**
7080 : * Transpose a 2D array in a efficient (cache-oblivious) way.
7081 : *
7082 : * @param pSrc Source array of width = nSrcWidth and height = nSrcHeight.
7083 : * @param eSrcType Data type of pSrc.
7084 : * @param pDst Destination transposed array of width = nSrcHeight and height = nSrcWidth.
7085 : * @param eDstType Data type of pDst.
7086 : * @param nSrcWidth Width of pSrc array.
7087 : * @param nSrcHeight Height of pSrc array.
7088 : * @since GDAL 3.11
7089 : */
7090 :
7091 365 : void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
7092 : GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
7093 : {
7094 365 : if (eSrcType == eDstType && (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8))
7095 : {
7096 70 : if (nSrcHeight == 2)
7097 : {
7098 9 : GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
7099 : static_cast<uint8_t *>(pDst), nSrcWidth);
7100 9 : return;
7101 : }
7102 61 : if (nSrcHeight == 4)
7103 : {
7104 30 : GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
7105 : static_cast<uint8_t *>(pDst), nSrcWidth);
7106 30 : return;
7107 : }
7108 : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
7109 : (defined(__x86_64) || defined(_M_X64)))
7110 31 : if (CPLHaveRuntimeSSSE3())
7111 : {
7112 31 : GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
7113 : static_cast<uint8_t *>(pDst), nSrcWidth,
7114 : nSrcHeight);
7115 31 : return;
7116 : }
7117 : #elif defined(USE_NEON_OPTIMIZATIONS)
7118 : {
7119 : GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
7120 : static_cast<uint8_t *>(pDst), nSrcWidth,
7121 : nSrcHeight);
7122 : return;
7123 : }
7124 : #endif
7125 : }
7126 :
7127 : #define CALL_GDALTranspose2D_internal(DST_TYPE, DST_IS_COMPLEX) \
7128 : GDALTranspose2D<DST_TYPE, DST_IS_COMPLEX>( \
7129 : pSrc, eSrcType, static_cast<DST_TYPE *>(pDst), nSrcWidth, nSrcHeight)
7130 :
7131 : // clang-format off
7132 295 : switch (eDstType)
7133 : {
7134 15 : case GDT_UInt8: CALL_GDALTranspose2D_internal(uint8_t, false); break;
7135 15 : case GDT_Int8: CALL_GDALTranspose2D_internal(int8_t, false); break;
7136 33 : case GDT_UInt16: CALL_GDALTranspose2D_internal(uint16_t, false); break;
7137 20 : case GDT_Int16: CALL_GDALTranspose2D_internal(int16_t, false); break;
7138 24 : case GDT_UInt32: CALL_GDALTranspose2D_internal(uint32_t, false); break;
7139 16 : case GDT_Int32: CALL_GDALTranspose2D_internal(int32_t, false); break;
7140 16 : case GDT_UInt64: CALL_GDALTranspose2D_internal(uint64_t, false); break;
7141 16 : case GDT_Int64: CALL_GDALTranspose2D_internal(int64_t, false); break;
7142 16 : case GDT_Float16: CALL_GDALTranspose2D_internal(GFloat16, false); break;
7143 19 : case GDT_Float32: CALL_GDALTranspose2D_internal(float, false); break;
7144 25 : case GDT_Float64: CALL_GDALTranspose2D_internal(double, false); break;
7145 16 : case GDT_CInt16: CALL_GDALTranspose2D_internal(int16_t, true); break;
7146 16 : case GDT_CInt32: CALL_GDALTranspose2D_internal(int32_t, true); break;
7147 16 : case GDT_CFloat16: CALL_GDALTranspose2D_internal(GFloat16, true); break;
7148 16 : case GDT_CFloat32: CALL_GDALTranspose2D_internal(float, true); break;
7149 16 : case GDT_CFloat64: CALL_GDALTranspose2D_internal(double, true); break;
7150 0 : case GDT_Unknown:
7151 : case GDT_TypeCount:
7152 0 : break;
7153 : }
7154 : // clang-format on
7155 :
7156 : #undef CALL_GDALTranspose2D_internal
7157 : }
7158 :
7159 : /************************************************************************/
7160 : /* ExtractBitAndConvertTo255() */
7161 : /************************************************************************/
7162 :
7163 : #if defined(__GNUC__) || defined(_MSC_VER)
7164 : // Signedness of char implementation dependent, so be explicit.
7165 : // Assumes 2-complement integer types and sign extension of right shifting
7166 : // GCC guarantees such:
7167 : // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
7168 143590 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
7169 : {
7170 143590 : return static_cast<GByte>(static_cast<signed char>(byVal << (7 - nBit)) >>
7171 143590 : 7);
7172 : }
7173 : #else
7174 : // Portable way
7175 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
7176 : {
7177 : return (byVal & (1 << nBit)) ? 255 : 0;
7178 : }
7179 : #endif
7180 :
7181 : /************************************************************************/
7182 : /* ExpandEightPackedBitsToByteAt255() */
7183 : /************************************************************************/
7184 :
7185 17813 : static inline void ExpandEightPackedBitsToByteAt255(GByte byVal,
7186 : GByte abyOutput[8])
7187 : {
7188 17813 : abyOutput[0] = ExtractBitAndConvertTo255(byVal, 7);
7189 17813 : abyOutput[1] = ExtractBitAndConvertTo255(byVal, 6);
7190 17813 : abyOutput[2] = ExtractBitAndConvertTo255(byVal, 5);
7191 17813 : abyOutput[3] = ExtractBitAndConvertTo255(byVal, 4);
7192 17813 : abyOutput[4] = ExtractBitAndConvertTo255(byVal, 3);
7193 17813 : abyOutput[5] = ExtractBitAndConvertTo255(byVal, 2);
7194 17813 : abyOutput[6] = ExtractBitAndConvertTo255(byVal, 1);
7195 17813 : abyOutput[7] = ExtractBitAndConvertTo255(byVal, 0);
7196 17813 : }
7197 :
7198 : /************************************************************************/
7199 : /* GDALExpandPackedBitsToByteAt0Or255() */
7200 : /************************************************************************/
7201 :
7202 : /** Expand packed-bits (ordered from most-significant bit to least one)
7203 : into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
7204 : at 1 to a byte at 255.
7205 :
7206 : The function does (in a possibly more optimized way) the following:
7207 : \code{.cpp}
7208 : for (size_t i = 0; i < nInputBits; ++i )
7209 : {
7210 : pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 255 : 0;
7211 : }
7212 : \endcode
7213 :
7214 : @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
7215 : @param pabyOutput Output array of nInputBits bytes.
7216 : @param nInputBits Number of valid bits in pabyInput.
7217 :
7218 : @since 3.11
7219 : */
7220 :
7221 45357 : void GDALExpandPackedBitsToByteAt0Or255(const GByte *CPL_RESTRICT pabyInput,
7222 : GByte *CPL_RESTRICT pabyOutput,
7223 : size_t nInputBits)
7224 : {
7225 45357 : const size_t nInputWholeBytes = nInputBits / 8;
7226 45357 : size_t iByte = 0;
7227 :
7228 : #ifdef HAVE_SSE2
7229 : // Mask to isolate each bit
7230 45357 : const __m128i bit_mask = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4,
7231 : 8, 16, 32, 64, -128);
7232 45357 : const __m128i zero = _mm_setzero_si128();
7233 45357 : const __m128i all_ones = _mm_set1_epi8(-1);
7234 : #ifdef __SSSE3__
7235 : const __m128i dispatch_two_bytes =
7236 : _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
7237 : #endif
7238 45357 : constexpr size_t SSE_REG_SIZE = sizeof(bit_mask);
7239 135866 : for (; iByte + SSE_REG_SIZE <= nInputWholeBytes; iByte += SSE_REG_SIZE)
7240 : {
7241 90509 : __m128i reg_ori = _mm_loadu_si128(
7242 90509 : reinterpret_cast<const __m128i *>(pabyInput + iByte));
7243 :
7244 90509 : constexpr int NUM_PROCESSED_BYTES_PER_REG = 2;
7245 814581 : for (size_t k = 0; k < SSE_REG_SIZE / NUM_PROCESSED_BYTES_PER_REG; ++k)
7246 : {
7247 : // Given reg_ori = (A, B, ... 14 other bytes ...),
7248 : // expand to (A, A, A, A, A, A, A, A, B, B, B, B, B, B, B, B)
7249 : #ifdef __SSSE3__
7250 : __m128i reg = _mm_shuffle_epi8(reg_ori, dispatch_two_bytes);
7251 : #else
7252 724072 : __m128i reg = _mm_unpacklo_epi8(reg_ori, reg_ori);
7253 724072 : reg = _mm_unpacklo_epi16(reg, reg);
7254 724072 : reg = _mm_unpacklo_epi32(reg, reg);
7255 : #endif
7256 :
7257 : // Test if bits of interest are set
7258 724072 : reg = _mm_and_si128(reg, bit_mask);
7259 :
7260 : // Now test if those bits are set, by comparing to zero. So the
7261 : // result will be that bytes where bits are set will be at 0, and
7262 : // ones where they are cleared will be at 0xFF. So the inverse of
7263 : // the end result we want!
7264 724072 : reg = _mm_cmpeq_epi8(reg, zero);
7265 :
7266 : // Invert the result
7267 724072 : reg = _mm_andnot_si128(reg, all_ones);
7268 :
7269 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyOutput), reg);
7270 :
7271 724072 : pabyOutput += SSE_REG_SIZE;
7272 :
7273 : // Right-shift of 2 bytes
7274 724072 : reg_ori = _mm_bsrli_si128(reg_ori, NUM_PROCESSED_BYTES_PER_REG);
7275 : }
7276 : }
7277 :
7278 : #endif // HAVE_SSE2
7279 :
7280 63170 : for (; iByte < nInputWholeBytes; ++iByte)
7281 : {
7282 17813 : ExpandEightPackedBitsToByteAt255(pabyInput[iByte], pabyOutput);
7283 17813 : pabyOutput += 8;
7284 : }
7285 46443 : for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
7286 : {
7287 1086 : *pabyOutput = ExtractBitAndConvertTo255(pabyInput[iByte], 7 - iBit);
7288 1086 : ++pabyOutput;
7289 : }
7290 45357 : }
7291 :
7292 : /************************************************************************/
7293 : /* ExpandEightPackedBitsToByteAt1() */
7294 : /************************************************************************/
7295 :
7296 136113 : static inline void ExpandEightPackedBitsToByteAt1(GByte byVal,
7297 : GByte abyOutput[8])
7298 : {
7299 136113 : abyOutput[0] = (byVal >> 7) & 0x1;
7300 136113 : abyOutput[1] = (byVal >> 6) & 0x1;
7301 136113 : abyOutput[2] = (byVal >> 5) & 0x1;
7302 136113 : abyOutput[3] = (byVal >> 4) & 0x1;
7303 136113 : abyOutput[4] = (byVal >> 3) & 0x1;
7304 136113 : abyOutput[5] = (byVal >> 2) & 0x1;
7305 136113 : abyOutput[6] = (byVal >> 1) & 0x1;
7306 136113 : abyOutput[7] = (byVal >> 0) & 0x1;
7307 136113 : }
7308 :
7309 : /************************************************************************/
7310 : /* GDALExpandPackedBitsToByteAt0Or1() */
7311 : /************************************************************************/
7312 :
7313 : /** Expand packed-bits (ordered from most-significant bit to least one)
7314 : into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
7315 : at 1 to a byte at 1.
7316 :
7317 : The function does (in a possibly more optimized way) the following:
7318 : \code{.cpp}
7319 : for (size_t i = 0; i < nInputBits; ++i )
7320 : {
7321 : pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
7322 : }
7323 : \endcode
7324 :
7325 : @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
7326 : @param pabyOutput Output array of nInputBits bytes.
7327 : @param nInputBits Number of valid bits in pabyInput.
7328 :
7329 : @since 3.11
7330 : */
7331 :
7332 7033 : void GDALExpandPackedBitsToByteAt0Or1(const GByte *CPL_RESTRICT pabyInput,
7333 : GByte *CPL_RESTRICT pabyOutput,
7334 : size_t nInputBits)
7335 : {
7336 7033 : const size_t nInputWholeBytes = nInputBits / 8;
7337 7033 : size_t iByte = 0;
7338 143146 : for (; iByte < nInputWholeBytes; ++iByte)
7339 : {
7340 136113 : ExpandEightPackedBitsToByteAt1(pabyInput[iByte], pabyOutput);
7341 136113 : pabyOutput += 8;
7342 : }
7343 18886 : for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
7344 : {
7345 11853 : *pabyOutput = (pabyInput[iByte] >> (7 - iBit)) & 0x1;
7346 11853 : ++pabyOutput;
7347 : }
7348 7033 : }
|