Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: GDAL Core
4 : * Purpose: Contains default implementation of GDALRasterBand::IRasterIO()
5 : * and supporting functions of broader utility.
6 : * Author: Frank Warmerdam, warmerdam@pobox.com
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 1998, Frank Warmerdam
10 : * Copyright (c) 2007-2014, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * SPDX-License-Identifier: MIT
13 : ****************************************************************************/
14 :
15 : #include "cpl_port.h"
16 : #include "gdal.h"
17 : #include "gdal_priv.h"
18 :
19 : #include <cassert>
20 : #include <climits>
21 : #include <cmath>
22 : #include <cstddef>
23 : #include <cstdio>
24 : #include <cstdlib>
25 : #include <cstring>
26 :
27 : #include <algorithm>
28 : #include <limits>
29 : #include <stdexcept>
30 : #include <type_traits>
31 :
32 : #include "cpl_conv.h"
33 : #include "cpl_cpu_features.h"
34 : #include "cpl_error.h"
35 : #include "cpl_float.h"
36 : #include "cpl_progress.h"
37 : #include "cpl_string.h"
38 : #include "cpl_vsi.h"
39 : #include "gdal_priv_templates.hpp"
40 : #include "gdal_vrt.h"
41 : #include "gdalwarper.h"
42 : #include "memdataset.h"
43 : #include "vrtdataset.h"
44 :
// Compile-time selection of SIMD support for this translation unit:
//  - HAVE_SSE2 is defined on x86-64 builds (GCC/clang __x86_64 or MSVC
//    _M_X64), and also when USE_NEON_OPTIMIZATIONS is set, in which case
//    the intrinsics come from include_sse2neon.h instead of <emmintrin.h>.
//  - HAVE_AVX2_DISPATCH is defined only on x86-64: for GCC/clang when
//    HAVE_AVX2_AT_COMPILE_TIME is set, and unconditionally for MSVC.
45 : #if defined(__x86_64) || defined(_M_X64)
46 : #include <emmintrin.h>
47 : #include <immintrin.h>
48 : #define HAVE_SSE2
49 : // AVX2 dispatch: compile AVX2 code with target attribute, detect at runtime
50 : #if (defined(__GNUC__) || defined(__clang__)) && \
51 : defined(HAVE_AVX2_AT_COMPILE_TIME)
52 : #define HAVE_AVX2_DISPATCH
53 : #elif defined(_MSC_VER)
54 : #include <intrin.h>
55 : #define HAVE_AVX2_DISPATCH
56 : #endif
57 : #elif defined(USE_NEON_OPTIMIZATIONS)
58 : #include "include_sse2neon.h"
59 : #define HAVE_SSE2
60 : #endif
61 :
// SSSE3 helpers are declared in rasterio_ssse3.h whenever the build system
// sets HAVE_SSSE3_AT_COMPILE_TIME; <tmmintrin.h> is only pulled in when this
// translation unit itself is compiled with SSSE3 enabled (__SSSE3__).
62 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
63 : #include "rasterio_ssse3.h"
64 : #ifdef __SSSE3__
65 : #include <tmmintrin.h>
66 : #endif
67 : #endif
68 :
// SSE4.1 intrinsics are included only when the compiler targets SSE4.1.
69 : #ifdef __SSE4_1__
70 : #include <smmintrin.h>
71 : #endif
72 :
// CPL_NOINLINE expands to the GCC-style noinline attribute when __GNUC__ is
// defined, and to nothing on other compilers.
73 : #ifdef __GNUC__
74 : #define CPL_NOINLINE __attribute__((noinline))
75 : #else
76 : #define CPL_NOINLINE
77 : #endif
78 :
// Forward declaration: DownsamplingIntegerXFactor() below calls this before
// its definition appears in the file. Takes a source and a destination byte
// pointer, a pixel stride for each, and a word count.
79 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
80 : int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
81 : int nDstPixelStride, GPtrDiff_t nWordCount);
82 :
83 : /************************************************************************/
84 : /* DownsamplingIntegerXFactor() */
85 : /************************************************************************/
86 :
/**
 * Produce nBufXSize output pixels for one scanline by taking every
 * nSrcXInc-th source pixel of the band, starting at column iSrcX, reading
 * from the blocks of block-row nLBlockY.
 *
 * @tparam bSameDataType  true when source and buffer share the same data
 *                        type: pixels are then copied with memcpy() (or
 *                        GDALFastCopyByte() for 1-byte types). When false,
 *                        GDALCopyWords64() converts eDataType -> eBufType.
 * @tparam DATA_TYPE_SIZE size in bytes of one pixel when bSameDataType is
 *                        true; ignored otherwise (the size is then obtained
 *                        from GDALGetDataTypeSizeBytes(eDataType)).
 *
 * @param poBand        band used to lock blocks and to query the raster width.
 * @param iSrcX         source column of the first pixel to copy.
 * @param nSrcXInc      step, in source pixels, between consecutive samples.
 * @param iSrcOffsetCst offset, in pixels, added to the column-within-block to
 *                      locate the pixel inside the block buffer (the visible
 *                      caller passes the start of the wanted row in the block).
 * @param pabyDstData   where the first output pixel is written.
 * @param nPixelSpace   byte stride between consecutive output pixels.
 * @param nBufXSize     number of output pixels to produce.
 * @param eDataType     data type of the block data.
 * @param eBufType      data type of the output buffer.
 * @param nStartBlockX  [in,out] first column of the block held in poBlock;
 *                      updated each time another block is loaded.
 * @param nBlockXSize   block width in pixels.
 * @param poBlock       [in,out] currently locked block. When another block is
 *                      needed its lock is dropped and it is replaced by the
 *                      newly locked block (nullptr if locking failed).
 * @param nLBlockY      block-row index passed to GetLockedBlockRef().
 *
 * @return false if GetLockedBlockRef() returned nullptr, true otherwise.
 */
87 : template <bool bSameDataType, int DATA_TYPE_SIZE>
88 695850 : static bool DownsamplingIntegerXFactor(
89 : GDALRasterBand *poBand, int iSrcX, int nSrcXInc, GPtrDiff_t iSrcOffsetCst,
90 : GByte *CPL_RESTRICT pabyDstData, int nPixelSpace, int nBufXSize,
91 : GDALDataType eDataType, GDALDataType eBufType, int &nStartBlockX,
92 : int nBlockXSize, GDALRasterBlock *&poBlock, int nLBlockY)
93 : {
94 695850 : const int nBandDataSize =
95 : bSameDataType ? DATA_TYPE_SIZE : GDALGetDataTypeSizeBytes(eDataType);
// Number of output pixels still to produce after the current one.
96 695850 : int nOuterLoopIters = nBufXSize - 1;
// Byte distance between two consecutive samples in the source block.
97 695850 : const int nIncSrcOffset = nSrcXInc * nBandDataSize;
98 : const GByte *CPL_RESTRICT pabySrcData;
// One past the last column covered by the block currently held in poBlock.
99 695850 : int nEndBlockX = nBlockXSize + nStartBlockX;
100 :
// The first pixel is handled by jumping straight into the loop body:
// either the block already held covers iSrcX (no_reload_block), or a block
// has to be locked first (reload_block).
101 695850 : if (iSrcX < nEndBlockX)
102 : {
103 295062 : CPLAssert(poBlock);
104 295062 : goto no_reload_block;
105 : }
106 400788 : goto reload_block;
107 :
108 : // Don't do the last iteration in the loop, as iSrcX might go beyond
109 : // nRasterXSize - 1
110 1265113 : while (--nOuterLoopIters >= 1)
111 : {
// Step to the next sample; the source pointer is only valid if the sample
// is still inside the current block, otherwise it is recomputed below.
112 201834 : iSrcX += nSrcXInc;
113 201834 : pabySrcData += nIncSrcOffset;
114 201834 : pabyDstData += nPixelSpace;
115 :
116 : /* --------------------------------------------------------------------
117 : */
118 : /* Ensure we have the appropriate block loaded. */
119 : /* --------------------------------------------------------------------
120 : */
121 201834 : if (iSrcX >= nEndBlockX)
122 : {
123 201834 : reload_block:
124 : {
// Lock the block containing iSrcX, releasing the one held so far.
125 615212 : const int nLBlockX = iSrcX / nBlockXSize;
126 615212 : nStartBlockX = nLBlockX * nBlockXSize;
127 615212 : nEndBlockX = nStartBlockX + nBlockXSize;
128 :
129 615212 : if (poBlock != nullptr)
130 341376 : poBlock->DropLock();
131 :
132 615212 : poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
133 615212 : if (poBlock == nullptr)
134 : {
135 1 : return false;
136 : }
137 : }
138 :
139 615211 : no_reload_block:
// Recompute the source pointer from the block base: column within the
// block plus the caller-supplied constant pixel offset, scaled to bytes.
140 : const GByte *pabySrcBlock =
141 1265113 : static_cast<const GByte *>(poBlock->GetDataRef());
142 1265113 : GPtrDiff_t iSrcOffset =
143 1265113 : (iSrcX - nStartBlockX + iSrcOffsetCst) * nBandDataSize;
144 1265113 : pabySrcData = pabySrcBlock + iSrcOffset;
145 : }
146 :
147 : /* --------------------------------------------------------------------
148 : */
149 : /* Copy the maximum run of pixels. */
150 : /* --------------------------------------------------------------------
151 : */
152 :
// Count of samples iSrcX, iSrcX + nSrcXInc, ... located before nEndBlockX
// (rounded-up division), capped by the pixels left to produce. The cap
// makes nIters 0 when no pixel remains after the current one.
153 1265113 : const int nIters = std::min(
154 1265113 : (nEndBlockX - iSrcX + (nSrcXInc - 1)) / nSrcXInc, nOuterLoopIters);
155 : if (bSameDataType)
156 : {
// The current pixel is always copied, even when nIters is 0.
157 1264670 : memcpy(pabyDstData, pabySrcData, nBandDataSize);
158 1264670 : if (nIters > 1)
159 : {
160 : if (DATA_TYPE_SIZE == 1)
161 : {
// Copy the remaining nIters - 1 bytes of the run in a single call. The
// pointers were advanced by one pixel before the call and by nIters - 2
// after it, so they end up on the last pixel of the run.
162 326320 : pabySrcData += nIncSrcOffset;
163 326320 : pabyDstData += nPixelSpace;
164 326320 : GDALFastCopyByte(pabySrcData, nIncSrcOffset, pabyDstData,
165 326320 : nPixelSpace, nIters - 1);
166 326320 : pabySrcData +=
167 326320 : static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 2);
168 326320 : pabyDstData +=
169 326320 : static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 2);
170 : }
171 : else
172 : {
// Copy the remaining nIters - 1 pixels of the run one by one.
173 4395716 : for (int i = 0; i < nIters - 1; i++)
174 : {
175 4197550 : pabySrcData += nIncSrcOffset;
176 4197550 : pabyDstData += nPixelSpace;
177 4197550 : memcpy(pabyDstData, pabySrcData, nBandDataSize);
178 : }
179 : }
// Account for the nIters - 1 extra pixels consumed beyond the current one.
180 524490 : iSrcX += nSrcXInc * (nIters - 1);
181 524490 : nOuterLoopIters -= nIters - 1;
182 : }
183 : }
184 : else
185 : {
186 : // Type to type conversion ...
// std::max(1, ...) because the current pixel must be converted even when
// nIters is 0.
187 443 : GDALCopyWords64(pabySrcData, eDataType, nIncSrcOffset, pabyDstData,
188 443 : eBufType, nPixelSpace, std::max(1, nIters));
189 443 : if (nIters > 1)
190 : {
// Leave the pointers on the last converted pixel of the run and account
// for the nIters - 1 extra pixels consumed.
191 216 : pabySrcData +=
192 216 : static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 1);
193 216 : pabyDstData +=
194 216 : static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 1);
195 216 : iSrcX += nSrcXInc * (nIters - 1);
196 216 : nOuterLoopIters -= nIters - 1;
197 : }
198 : }
199 : }
200 :
201 : // Deal with last iteration to avoid iSrcX to go beyond nRasterXSize - 1
// nOuterLoopIters is exactly 0 here only when the last pixel is still to be
// produced. That pixel is copied by jumping back into the loop body; the
// loop test then decrements the counter to -1, so this branch is not taken
// a second time and the function returns.
202 1063279 : if (nOuterLoopIters == 0)
203 : {
204 367430 : const int nRasterXSize = poBand->GetXSize();
// 64-bit addition so that iSrcX + nSrcXInc cannot overflow before clamping.
205 367430 : iSrcX =
206 734860 : static_cast<int>(std::min(static_cast<GInt64>(iSrcX) + nSrcXInc,
207 367430 : static_cast<GInt64>(nRasterXSize - 1)));
208 367430 : pabyDstData += nPixelSpace;
209 367430 : if (iSrcX < nEndBlockX)
210 : {
211 354840 : goto no_reload_block;
212 : }
213 12590 : goto reload_block;
214 : }
215 695849 : return true;
216 : }
217 :
218 : template <class A, class B>
219 2818770 : CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW inline auto CPLUnsanitizedMul(A a, B b)
220 : {
221 2818770 : return a * b;
222 : }
223 :
224 : /************************************************************************/
225 : /* IRasterIO() */
226 : /* */
227 : /* Default internal implementation of RasterIO() ... utilizes */
228 : /* the Block access methods to satisfy the request. This would */
229 : /* normally only be overridden by formats with overviews. */
230 : /************************************************************************/
231 :
232 6180730 : CPLErr GDALRasterBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
233 : int nXSize, int nYSize, void *pData,
234 : int nBufXSize, int nBufYSize,
235 : GDALDataType eBufType, GSpacing nPixelSpace,
236 : GSpacing nLineSpace,
237 : GDALRasterIOExtraArg *psExtraArg)
238 :
239 : {
240 6180730 : if (eRWFlag == GF_Write && eFlushBlockErr != CE_None)
241 : {
242 0 : CPLError(eFlushBlockErr, CPLE_AppDefined,
243 : "An error occurred while writing a dirty block "
244 : "from GDALRasterBand::IRasterIO");
245 0 : CPLErr eErr = eFlushBlockErr;
246 0 : eFlushBlockErr = CE_None;
247 0 : return eErr;
248 : }
249 6180730 : if (nBlockXSize <= 0 || nBlockYSize <= 0)
250 : {
251 0 : CPLError(CE_Failure, CPLE_AppDefined, "Invalid block size");
252 0 : return CE_Failure;
253 : }
254 :
255 6180730 : const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
256 6180730 : const int nBufDataSize = GDALGetDataTypeSizeBytes(eBufType);
257 6180730 : GByte dummyBlock[2] = {0, 0};
258 6180730 : GByte *pabySrcBlock =
259 : dummyBlock; /* to avoid Coverity warning about nullptr dereference */
260 6180730 : GDALRasterBlock *poBlock = nullptr;
261 6180730 : const bool bUseIntegerRequestCoords =
262 6545680 : (!psExtraArg->bFloatingPointWindowValidity ||
263 364948 : (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
264 340016 : nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
265 :
266 : /* ==================================================================== */
267 : /* A common case is the data requested with the destination */
268 : /* is packed, and the block width is the raster width. */
269 : /* ==================================================================== */
270 6088920 : if (nPixelSpace == nBufDataSize && nLineSpace == nPixelSpace * nXSize &&
271 3234440 : nBlockXSize == GetXSize() && nBufXSize == nXSize &&
272 12269600 : nBufYSize == nYSize && bUseIntegerRequestCoords)
273 : {
274 3096640 : CPLErr eErr = CE_None;
275 3096640 : int nLBlockY = -1;
276 :
277 9751800 : for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
278 : {
279 6656240 : const int iSrcY = iBufYOff + nYOff;
280 :
281 6656240 : if (iSrcY < nLBlockY * nBlockYSize ||
282 6656240 : iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
283 : {
284 3365600 : nLBlockY = iSrcY / nBlockYSize;
285 3365600 : bool bJustInitialize =
286 297355 : eRWFlag == GF_Write && nXOff == 0 &&
287 3720870 : nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
288 57921 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize;
289 :
290 : // Is this a partial tile at right and/or bottom edges of
291 : // the raster, and that is going to be completely written?
292 : // If so, do not load it from storage, but zero it so that
293 : // the content outside of the validity area is initialized.
294 3365600 : bool bMemZeroBuffer = false;
295 297355 : if (eRWFlag == GF_Write && !bJustInitialize && nXOff == 0 &&
296 24978 : nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
297 3663040 : nYOff + nYSize == GetYSize() &&
298 90 : nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)
299 : {
300 90 : bJustInitialize = true;
301 90 : bMemZeroBuffer = true;
302 : }
303 :
304 3365600 : if (poBlock)
305 268957 : poBlock->DropLock();
306 :
307 3365600 : const GUInt32 nErrorCounter = CPLGetErrorCounter();
308 3365600 : poBlock = GetLockedBlockRef(0, nLBlockY, bJustInitialize);
309 3365600 : if (poBlock == nullptr)
310 : {
311 1078 : if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
312 : nullptr)
313 : {
314 0 : CPLError(CE_Failure, CPLE_AppDefined,
315 : "GetBlockRef failed at X block offset %d, "
316 : "Y block offset %d%s",
317 : 0, nLBlockY,
318 0 : (nErrorCounter != CPLGetErrorCounter())
319 0 : ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
320 : : "");
321 : }
322 1078 : eErr = CE_Failure;
323 1078 : break;
324 : }
325 :
326 3364520 : if (eRWFlag == GF_Write)
327 297355 : poBlock->MarkDirty();
328 :
329 3364520 : pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
330 3364520 : if (bMemZeroBuffer)
331 : {
332 90 : memset(pabySrcBlock, 0,
333 90 : static_cast<GPtrDiff_t>(nBandDataSize) *
334 90 : nBlockXSize * nBlockYSize);
335 : }
336 : }
337 :
338 6655160 : const auto nSrcByteOffset =
339 6655160 : (static_cast<GPtrDiff_t>(iSrcY - nLBlockY * nBlockYSize) *
340 6655160 : nBlockXSize +
341 6655160 : nXOff) *
342 6655160 : nBandDataSize;
343 :
344 6655160 : if (eDataType == eBufType)
345 : {
346 2991450 : if (eRWFlag == GF_Read)
347 2518870 : memcpy(static_cast<GByte *>(pData) +
348 2518870 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
349 2518870 : pabySrcBlock + nSrcByteOffset,
350 : static_cast<size_t>(nLineSpace));
351 : else
352 472580 : memcpy(pabySrcBlock + nSrcByteOffset,
353 472580 : static_cast<GByte *>(pData) +
354 472580 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
355 : static_cast<size_t>(nLineSpace));
356 : }
357 : else
358 : {
359 : // Type to type conversion.
360 3663710 : if (eRWFlag == GF_Read)
361 3641640 : GDALCopyWords64(
362 3641640 : pabySrcBlock + nSrcByteOffset, eDataType, nBandDataSize,
363 : static_cast<GByte *>(pData) +
364 3641640 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
365 : eBufType, static_cast<int>(nPixelSpace), nBufXSize);
366 : else
367 22065 : GDALCopyWords64(static_cast<GByte *>(pData) +
368 22065 : static_cast<GPtrDiff_t>(iBufYOff) *
369 : nLineSpace,
370 : eBufType, static_cast<int>(nPixelSpace),
371 22065 : pabySrcBlock + nSrcByteOffset, eDataType,
372 : nBandDataSize, nBufXSize);
373 : }
374 :
375 6743170 : if (psExtraArg->pfnProgress != nullptr &&
376 88008 : !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
377 : psExtraArg->pProgressData))
378 : {
379 5 : eErr = CE_Failure;
380 5 : break;
381 : }
382 : }
383 :
384 3096640 : if (poBlock)
385 3095560 : poBlock->DropLock();
386 :
387 3096640 : return eErr;
388 : }
389 :
390 : /* ==================================================================== */
391 : /* Do we have overviews that would be appropriate to satisfy */
392 : /* this request? */
393 : /* ==================================================================== */
394 3084090 : if ((nBufXSize < nXSize || nBufYSize < nYSize) && GetOverviewCount() > 0 &&
395 : eRWFlag == GF_Read)
396 : {
397 : GDALRasterIOExtraArg sExtraArg;
398 2967 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
399 :
400 : const int nOverview =
401 2967 : GDALBandGetBestOverviewLevel2(this, nXOff, nYOff, nXSize, nYSize,
402 : nBufXSize, nBufYSize, &sExtraArg);
403 2967 : if (nOverview >= 0)
404 : {
405 2892 : GDALRasterBand *poOverviewBand = GetOverview(nOverview);
406 2892 : if (poOverviewBand == nullptr)
407 2892 : return CE_Failure;
408 :
409 2892 : return poOverviewBand->RasterIO(
410 : eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
411 2892 : nBufYSize, eBufType, nPixelSpace, nLineSpace, &sExtraArg);
412 : }
413 : }
414 :
415 891713 : if (eRWFlag == GF_Read && nBufXSize < nXSize / 100 &&
416 6 : nBufYSize < nYSize / 100 && nPixelSpace == nBufDataSize &&
417 3972910 : nLineSpace == nPixelSpace * nBufXSize &&
418 6 : CPLTestBool(CPLGetConfigOption("GDAL_NO_COSTLY_OVERVIEW", "NO")))
419 : {
420 0 : memset(pData, 0, static_cast<size_t>(nLineSpace * nBufYSize));
421 0 : return CE_None;
422 : }
423 :
424 : /* ==================================================================== */
425 : /* The second case when we don't need subsample data but likely */
426 : /* need data type conversion. */
427 : /* ==================================================================== */
428 3081200 : if ( // nPixelSpace == nBufDataSize &&
429 3081200 : nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
430 : {
431 : #if DEBUG_VERBOSE
432 : printf("IRasterIO(%d,%d,%d,%d) rw=%d case 2\n", /*ok*/
433 : nXOff, nYOff, nXSize, nYSize, static_cast<int>(eRWFlag));
434 : #endif
435 :
436 : /* --------------------------------------------------------------------
437 : */
438 : /* Loop over buffer computing source locations. */
439 : /* --------------------------------------------------------------------
440 : */
441 : // Calculate starting values out of loop
442 2503280 : const int nLBlockXStart = nXOff / nBlockXSize;
443 2503280 : const int nXSpanEnd = nBufXSize + nXOff;
444 :
445 2503280 : int nYInc = 0;
446 5047340 : for (int iBufYOff = 0, iSrcY = nYOff; iBufYOff < nBufYSize;
447 2544060 : iBufYOff += nYInc, iSrcY += nYInc)
448 : {
449 2544130 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
450 : static_cast<GPtrDiff_t>(nLineSpace);
451 2544130 : int nLBlockY = iSrcY / nBlockYSize;
452 2544130 : int nLBlockX = nLBlockXStart;
453 2544130 : int iSrcX = nXOff;
454 5362830 : while (iSrcX < nXSpanEnd)
455 : {
456 2818770 : int nXSpan = nLBlockX * nBlockXSize;
457 2818770 : if (nXSpan < INT_MAX - nBlockXSize)
458 2818770 : nXSpan += nBlockXSize;
459 : else
460 0 : nXSpan = INT_MAX;
461 2818770 : const int nXRight = nXSpan;
462 2818770 : nXSpan = (nXSpan < nXSpanEnd ? nXSpan : nXSpanEnd) - iSrcX;
463 :
464 : const size_t nXSpanSize =
465 2818770 : CPLUnsanitizedMul(nXSpan, static_cast<size_t>(nPixelSpace));
466 :
467 2818770 : bool bJustInitialize =
468 2042970 : eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
469 38035 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
470 4888110 : nXOff <= nLBlockX * nBlockXSize &&
471 26364 : nXOff + nXSize >= nXRight;
472 :
473 : // Is this a partial tile at right and/or bottom edges of
474 : // the raster, and that is going to be completely written?
475 : // If so, do not load it from storage, but zero it so that
476 : // the content outside of the validity area is initialized.
477 2818770 : bool bMemZeroBuffer = false;
478 2042970 : if (eRWFlag == GF_Write && !bJustInitialize &&
479 2017850 : nXOff <= nLBlockX * nBlockXSize &&
480 2016190 : nYOff <= nLBlockY * nBlockYSize &&
481 12145 : (nXOff + nXSize >= nXRight ||
482 : // cppcheck-suppress knownConditionTrueFalse
483 4864460 : (nXOff + nXSize == GetXSize() && nXRight > GetXSize())) &&
484 11965 : (nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize ||
485 10743 : (nYOff + nYSize == GetYSize() &&
486 1951 : nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)))
487 : {
488 3173 : bJustInitialize = true;
489 3173 : bMemZeroBuffer = true;
490 : }
491 :
492 : /* --------------------------------------------------------------------
493 : */
494 : /* Ensure we have the appropriate block loaded. */
495 : /* --------------------------------------------------------------------
496 : */
497 2818770 : const GUInt32 nErrorCounter = CPLGetErrorCounter();
498 2818770 : poBlock =
499 2818770 : GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
500 2818770 : if (!poBlock)
501 : {
502 73 : if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
503 : nullptr)
504 : {
505 0 : CPLError(CE_Failure, CPLE_AppDefined,
506 : "GetBlockRef failed at X block offset %d, "
507 : "Y block offset %d%s",
508 : nLBlockX, nLBlockY,
509 0 : (nErrorCounter != CPLGetErrorCounter())
510 0 : ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
511 : : "");
512 : }
513 73 : return (CE_Failure);
514 : }
515 :
516 2818700 : if (eRWFlag == GF_Write)
517 2042970 : poBlock->MarkDirty();
518 :
519 2818700 : pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
520 2818700 : if (bMemZeroBuffer)
521 : {
522 3173 : memset(pabySrcBlock, 0,
523 3173 : static_cast<GPtrDiff_t>(nBandDataSize) *
524 3173 : nBlockXSize * nBlockYSize);
525 : }
526 : /* --------------------------------------------------------------------
527 : */
528 : /* Copy over this chunk of data. */
529 : /* --------------------------------------------------------------------
530 : */
531 2818700 : GPtrDiff_t iSrcOffset =
532 2818700 : (static_cast<GPtrDiff_t>(iSrcX) -
533 2818700 : static_cast<GPtrDiff_t>(nLBlockX * nBlockXSize) +
534 2818700 : (static_cast<GPtrDiff_t>(iSrcY) -
535 2818700 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
536 2818700 : nBlockXSize) *
537 2818700 : nBandDataSize;
538 : // Fill up as many rows as possible for the loaded block.
539 5637400 : const int kmax = std::min(nBlockYSize - (iSrcY % nBlockYSize),
540 2818700 : nBufYSize - iBufYOff);
541 60991500 : for (int k = 0; k < kmax; k++)
542 : {
543 58172800 : if (eDataType == eBufType && nPixelSpace == nBufDataSize)
544 : {
545 53770900 : if (eRWFlag == GF_Read)
546 49332800 : memcpy(static_cast<GByte *>(pData) + iBufOffset +
547 49332800 : static_cast<GPtrDiff_t>(k) * nLineSpace,
548 49332800 : pabySrcBlock + iSrcOffset, nXSpanSize);
549 : else
550 4438130 : memcpy(pabySrcBlock + iSrcOffset,
551 4438130 : static_cast<GByte *>(pData) + iBufOffset +
552 4438130 : static_cast<GPtrDiff_t>(k) * nLineSpace,
553 : nXSpanSize);
554 : }
555 : else
556 : {
557 : /* type to type conversion */
558 4401910 : if (eRWFlag == GF_Read)
559 4251700 : GDALCopyWords64(
560 4251700 : pabySrcBlock + iSrcOffset, eDataType,
561 : nBandDataSize,
562 4251700 : static_cast<GByte *>(pData) + iBufOffset +
563 4251700 : static_cast<GPtrDiff_t>(k) * nLineSpace,
564 : eBufType, static_cast<int>(nPixelSpace),
565 : nXSpan);
566 : else
567 150209 : GDALCopyWords64(
568 150209 : static_cast<GByte *>(pData) + iBufOffset +
569 150209 : static_cast<GPtrDiff_t>(k) * nLineSpace,
570 : eBufType, static_cast<int>(nPixelSpace),
571 150209 : pabySrcBlock + iSrcOffset, eDataType,
572 : nBandDataSize, nXSpan);
573 : }
574 :
575 58172800 : iSrcOffset +=
576 58172800 : static_cast<GPtrDiff_t>(nBlockXSize) * nBandDataSize;
577 : }
578 :
579 : iBufOffset =
580 2818700 : CPLUnsanitizedAdd<GPtrDiff_t>(iBufOffset, nXSpanSize);
581 2818700 : nLBlockX++;
582 2818700 : iSrcX += nXSpan;
583 :
584 2818700 : poBlock->DropLock();
585 2818700 : poBlock = nullptr;
586 : }
587 :
588 : /* Compute the increment to go on a block boundary */
589 2544060 : nYInc = nBlockYSize - (iSrcY % nBlockYSize);
590 :
591 2545940 : if (psExtraArg->pfnProgress != nullptr &&
592 1884 : !psExtraArg->pfnProgress(
593 2545940 : 1.0 * std::min(nBufYSize, iBufYOff + nYInc) / nBufYSize, "",
594 : psExtraArg->pProgressData))
595 : {
596 0 : return CE_Failure;
597 : }
598 : }
599 :
600 2503210 : return CE_None;
601 : }
602 :
603 : /* ==================================================================== */
604 : /* Loop reading required source blocks to satisfy output */
605 : /* request. This is the most general implementation. */
606 : /* ==================================================================== */
607 :
608 577913 : double dfXOff = nXOff;
609 577913 : double dfYOff = nYOff;
610 577913 : double dfXSize = nXSize;
611 577913 : double dfYSize = nYSize;
612 577913 : if (psExtraArg->bFloatingPointWindowValidity)
613 : {
614 242956 : dfXOff = psExtraArg->dfXOff;
615 242956 : dfYOff = psExtraArg->dfYOff;
616 242956 : dfXSize = psExtraArg->dfXSize;
617 242956 : dfYSize = psExtraArg->dfYSize;
618 : }
619 :
620 : /* -------------------------------------------------------------------- */
621 : /* Compute stepping increment. */
622 : /* -------------------------------------------------------------------- */
623 577913 : const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
624 577913 : const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
625 577913 : CPLErr eErr = CE_None;
626 :
627 577913 : if (eRWFlag == GF_Write)
628 : {
629 : /* --------------------------------------------------------------------
630 : */
631 : /* Write case */
632 : /* Loop over raster window computing source locations in the buffer.
633 : */
634 : /* --------------------------------------------------------------------
635 : */
636 166655 : GByte *pabyDstBlock = nullptr;
637 166655 : int nLBlockX = -1;
638 166655 : int nLBlockY = -1;
639 :
640 1260010 : for (int iDstY = nYOff; iDstY < nYOff + nYSize; iDstY++)
641 : {
642 1093360 : const int iBufYOff = static_cast<int>((iDstY - nYOff) / dfSrcYInc);
643 :
644 12384200 : for (int iDstX = nXOff; iDstX < nXOff + nXSize; iDstX++)
645 : {
646 11290800 : const int iBufXOff =
647 11290800 : static_cast<int>((iDstX - nXOff) / dfSrcXInc);
648 11290800 : GPtrDiff_t iBufOffset =
649 11290800 : static_cast<GPtrDiff_t>(iBufYOff) *
650 : static_cast<GPtrDiff_t>(nLineSpace) +
651 11290800 : iBufXOff * static_cast<GPtrDiff_t>(nPixelSpace);
652 :
653 : // FIXME: this code likely doesn't work if the dirty block gets
654 : // flushed to disk before being completely written.
655 : // In the meantime, bJustInitialize should probably be set to
656 : // FALSE even if it is not ideal performance wise, and for
657 : // lossy compression.
658 :
659 : /* --------------------------------------------------------------------
660 : */
661 : /* Ensure we have the appropriate block loaded. */
662 : /* --------------------------------------------------------------------
663 : */
664 11290800 : if (iDstX < nLBlockX * nBlockXSize ||
665 11041500 : iDstX - nBlockXSize >= nLBlockX * nBlockXSize ||
666 10584800 : iDstY < nLBlockY * nBlockYSize ||
667 10584800 : iDstY - nBlockYSize >= nLBlockY * nBlockYSize)
668 : {
669 738702 : nLBlockX = iDstX / nBlockXSize;
670 738702 : nLBlockY = iDstY / nBlockYSize;
671 :
672 738702 : const bool bJustInitialize =
673 1065990 : nYOff <= nLBlockY * nBlockYSize &&
674 327291 : nYOff + nYSize - nBlockYSize >=
675 327291 : nLBlockY * nBlockYSize &&
676 1116320 : nXOff <= nLBlockX * nBlockXSize &&
677 50325 : nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
678 : /*bool bMemZeroBuffer = FALSE;
679 : if( !bJustInitialize &&
680 : nXOff <= nLBlockX * nBlockXSize &&
681 : nYOff <= nLBlockY * nBlockYSize &&
682 : (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
683 : (nXOff + nXSize == GetXSize() &&
684 : (nLBlockX+1) * nBlockXSize > GetXSize())) &&
685 : (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
686 : (nYOff + nYSize == GetYSize() &&
687 : (nLBlockY+1) * nBlockYSize > GetYSize())) )
688 : {
689 : bJustInitialize = TRUE;
690 : bMemZeroBuffer = TRUE;
691 : }*/
692 738702 : if (poBlock != nullptr)
693 572047 : poBlock->DropLock();
694 :
695 738702 : poBlock =
696 738702 : GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
697 738702 : if (poBlock == nullptr)
698 : {
699 0 : return (CE_Failure);
700 : }
701 :
702 738702 : poBlock->MarkDirty();
703 :
704 738702 : pabyDstBlock = static_cast<GByte *>(poBlock->GetDataRef());
705 : /*if( bMemZeroBuffer )
706 : {
707 : memset(pabyDstBlock, 0,
708 : static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
709 : * nBlockYSize);
710 : }*/
711 : }
712 :
713 : // To make Coverity happy. Should not happen by design.
714 11290800 : if (pabyDstBlock == nullptr)
715 : {
716 0 : CPLAssert(false);
717 : eErr = CE_Failure;
718 : break;
719 : }
720 :
721 : /* --------------------------------------------------------------------
722 : */
723 : /* Copy over this pixel of data. */
724 : /* --------------------------------------------------------------------
725 : */
726 11290800 : GPtrDiff_t iDstOffset =
727 11290800 : (static_cast<GPtrDiff_t>(iDstX) -
728 11290800 : static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
729 11290800 : (static_cast<GPtrDiff_t>(iDstY) -
730 11290800 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
731 11290800 : nBlockXSize) *
732 11290800 : nBandDataSize;
733 :
734 11290800 : if (eDataType == eBufType)
735 : {
736 11287700 : memcpy(pabyDstBlock + iDstOffset,
737 11287700 : static_cast<GByte *>(pData) + iBufOffset,
738 : nBandDataSize);
739 : }
740 : else
741 : {
742 : /* type to type conversion ... ouch, this is expensive way
743 : of handling single words */
744 3096 : GDALCopyWords64(static_cast<GByte *>(pData) + iBufOffset,
745 3096 : eBufType, 0, pabyDstBlock + iDstOffset,
746 : eDataType, 0, 1);
747 : }
748 : }
749 :
750 1093360 : if (psExtraArg->pfnProgress != nullptr &&
751 0 : !psExtraArg->pfnProgress(1.0 * (iDstY - nYOff + 1) / nYSize, "",
752 : psExtraArg->pProgressData))
753 : {
754 0 : eErr = CE_Failure;
755 0 : break;
756 : }
757 : }
758 : }
759 : else
760 : {
761 411258 : if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
762 : {
763 42075 : if ((psExtraArg->eResampleAlg == GRIORA_Cubic ||
764 13559 : psExtraArg->eResampleAlg == GRIORA_CubicSpline ||
765 13506 : psExtraArg->eResampleAlg == GRIORA_Bilinear ||
766 28563 : psExtraArg->eResampleAlg == GRIORA_Lanczos) &&
767 3224 : GetColorTable() != nullptr)
768 : {
769 0 : CPLError(CE_Warning, CPLE_NotSupported,
770 : "Resampling method not supported on paletted band. "
771 : "Falling back to nearest neighbour");
772 : }
773 14261 : else if (psExtraArg->eResampleAlg == GRIORA_Gauss &&
774 3 : GDALDataTypeIsComplex(eDataType))
775 : {
776 0 : CPLError(CE_Warning, CPLE_NotSupported,
777 : "Resampling method not supported on complex data type "
778 : "band. Falling back to nearest neighbour");
779 : }
780 : else
781 : {
782 14258 : return RasterIOResampled(eRWFlag, nXOff, nYOff, nXSize, nYSize,
783 : pData, nBufXSize, nBufYSize, eBufType,
784 14258 : nPixelSpace, nLineSpace, psExtraArg);
785 : }
786 : }
787 :
788 397000 : int nLimitBlockY = 0;
789 397000 : const bool bByteCopy = eDataType == eBufType && nBandDataSize == 1;
790 397000 : int nStartBlockX = -nBlockXSize;
791 397000 : constexpr double EPS = 1e-10;
792 397000 : int nLBlockY = -1;
793 397000 : const double dfSrcXStart = 0.5 * dfSrcXInc + dfXOff + EPS;
794 397000 : const bool bIntegerXFactor =
795 372767 : bUseIntegerRequestCoords &&
796 670836 : static_cast<int>(dfSrcXInc) == dfSrcXInc &&
797 273836 : static_cast<int>(dfSrcXInc) < INT_MAX / nBandDataSize;
798 :
799 : /* --------------------------------------------------------------------
800 : */
801 : /* Read case */
802 : /* Loop over buffer computing source locations. */
803 : /* --------------------------------------------------------------------
804 : */
805 2367100 : for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
806 : {
807 : // Add small epsilon to avoid some numeric precision issues.
808 1970110 : const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
809 1970110 : const int iSrcY = static_cast<int>(std::min(
810 1970110 : std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
811 :
812 1970110 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
813 : static_cast<GPtrDiff_t>(nLineSpace);
814 :
815 1970110 : if (iSrcY >= nLimitBlockY)
816 : {
817 438018 : nLBlockY = iSrcY / nBlockYSize;
818 438018 : nLimitBlockY = nLBlockY * nBlockYSize;
819 438018 : if (nLimitBlockY < INT_MAX - nBlockYSize)
820 438018 : nLimitBlockY += nBlockYSize;
821 : else
822 0 : nLimitBlockY = INT_MAX;
823 : // Make sure a new block is loaded.
824 438018 : nStartBlockX = -nBlockXSize;
825 : }
826 1532090 : else if (static_cast<int>(dfSrcXStart) < nStartBlockX)
827 : {
828 : // Make sure a new block is loaded.
829 437363 : nStartBlockX = -nBlockXSize;
830 : }
831 :
832 1970110 : GPtrDiff_t iSrcOffsetCst = (iSrcY - nLBlockY * nBlockYSize) *
833 1970110 : static_cast<GPtrDiff_t>(nBlockXSize);
834 :
835 1970110 : if (bIntegerXFactor)
836 : {
837 695850 : int iSrcX = static_cast<int>(dfSrcXStart);
838 695850 : const int nSrcXInc = static_cast<int>(dfSrcXInc);
839 695850 : GByte *pabyDstData = static_cast<GByte *>(pData) + iBufOffset;
840 695850 : bool bRet = false;
841 695850 : if (bByteCopy)
842 : {
843 585842 : bRet = DownsamplingIntegerXFactor<true, 1>(
844 : this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
845 : static_cast<int>(nPixelSpace), nBufXSize, GDT_UInt8,
846 : GDT_UInt8, nStartBlockX, nBlockXSize, poBlock,
847 : nLBlockY);
848 : }
849 110008 : else if (eDataType == eBufType)
850 : {
851 109783 : switch (nBandDataSize)
852 : {
853 109630 : case 2:
854 109630 : bRet = DownsamplingIntegerXFactor<true, 2>(
855 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
856 : pabyDstData, static_cast<int>(nPixelSpace),
857 : nBufXSize, eDataType, eDataType, nStartBlockX,
858 : nBlockXSize, poBlock, nLBlockY);
859 109630 : break;
860 55 : case 4:
861 55 : bRet = DownsamplingIntegerXFactor<true, 4>(
862 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
863 : pabyDstData, static_cast<int>(nPixelSpace),
864 : nBufXSize, eDataType, eDataType, nStartBlockX,
865 : nBlockXSize, poBlock, nLBlockY);
866 55 : break;
867 96 : case 8:
868 96 : bRet = DownsamplingIntegerXFactor<true, 8>(
869 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
870 : pabyDstData, static_cast<int>(nPixelSpace),
871 : nBufXSize, eDataType, eDataType, nStartBlockX,
872 : nBlockXSize, poBlock, nLBlockY);
873 96 : break;
874 2 : case 16:
875 2 : bRet = DownsamplingIntegerXFactor<true, 16>(
876 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
877 : pabyDstData, static_cast<int>(nPixelSpace),
878 : nBufXSize, eDataType, eDataType, nStartBlockX,
879 : nBlockXSize, poBlock, nLBlockY);
880 2 : break;
881 0 : default:
882 0 : CPLAssert(false);
883 : break;
884 : }
885 : }
886 : else
887 : {
888 225 : bRet = DownsamplingIntegerXFactor<false, 0>(
889 : this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
890 : static_cast<int>(nPixelSpace), nBufXSize, eDataType,
891 : eBufType, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
892 : }
893 695850 : if (!bRet)
894 1 : eErr = CE_Failure;
895 : }
896 : else
897 : {
898 1274260 : double dfSrcX = dfSrcXStart;
899 503811000 : for (int iBufXOff = 0; iBufXOff < nBufXSize;
900 502537000 : iBufXOff++, dfSrcX += dfSrcXInc)
901 : {
902 : // TODO?: try to avoid the clamping for most iterations
903 : const int iSrcX = static_cast<int>(
904 1005070000 : std::min(std::max(0.0, dfSrcX),
905 502537000 : static_cast<double>(nRasterXSize - 1)));
906 :
907 : /* --------------------------------------------------------------------
908 : */
909 : /* Ensure we have the appropriate block loaded. */
910 : /* --------------------------------------------------------------------
911 : */
912 502537000 : if (iSrcX >= nBlockXSize + nStartBlockX)
913 : {
914 1697820 : const int nLBlockX = iSrcX / nBlockXSize;
915 1697820 : nStartBlockX = nLBlockX * nBlockXSize;
916 :
917 1697820 : if (poBlock != nullptr)
918 1574650 : poBlock->DropLock();
919 :
920 1697820 : poBlock = GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
921 1697820 : if (poBlock == nullptr)
922 : {
923 9 : eErr = CE_Failure;
924 9 : break;
925 : }
926 :
927 : pabySrcBlock =
928 1697810 : static_cast<GByte *>(poBlock->GetDataRef());
929 : }
930 502537000 : const GPtrDiff_t nDiffX =
931 502537000 : static_cast<GPtrDiff_t>(iSrcX - nStartBlockX);
932 :
933 : /* --------------------------------------------------------------------
934 : */
935 : /* Copy over this pixel of data. */
936 : /* --------------------------------------------------------------------
937 : */
938 :
939 502537000 : if (bByteCopy)
940 : {
941 442592000 : GPtrDiff_t iSrcOffset = nDiffX + iSrcOffsetCst;
942 442592000 : static_cast<GByte *>(pData)[iBufOffset] =
943 442592000 : pabySrcBlock[iSrcOffset];
944 : }
945 59944700 : else if (eDataType == eBufType)
946 : {
947 50322800 : GPtrDiff_t iSrcOffset =
948 50322800 : (nDiffX + iSrcOffsetCst) * nBandDataSize;
949 50322800 : memcpy(static_cast<GByte *>(pData) + iBufOffset,
950 50322800 : pabySrcBlock + iSrcOffset, nBandDataSize);
951 : }
952 : else
953 : {
954 : // Type to type conversion ...
955 9621890 : GPtrDiff_t iSrcOffset =
956 9621890 : (nDiffX + iSrcOffsetCst) * nBandDataSize;
957 9621890 : GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
958 : static_cast<GByte *>(pData) +
959 9621890 : iBufOffset,
960 : eBufType, 0, 1);
961 : }
962 :
963 502537000 : iBufOffset += static_cast<int>(nPixelSpace);
964 : }
965 : }
966 1970110 : if (eErr == CE_Failure)
967 11 : break;
968 :
969 2191530 : if (psExtraArg->pfnProgress != nullptr &&
970 221434 : !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
971 : psExtraArg->pProgressData))
972 : {
973 1 : eErr = CE_Failure;
974 1 : break;
975 : }
976 : }
977 : }
978 :
979 563655 : if (poBlock != nullptr)
980 563645 : poBlock->DropLock();
981 :
982 563655 : return eErr;
983 : }
984 :
985 : /************************************************************************/
986 : /* GDALRasterIOTransformer() */
987 : /************************************************************************/
988 :
// Parameters of the per-axis linear mapping applied by
// GDALRasterIOTransformer():
//   src = dst * ratio + offset
struct GDALRasterIOTransformerStruct
{
    double dfXOff;            // offset added on the X axis
    double dfYOff;            // offset added on the Y axis
    double dfXRatioDstToSrc;  // X scale factor, destination -> source
    double dfYRatioDstToSrc;  // Y scale factor, destination -> source
};

// Transformer callback mapping points between destination and source
// coordinates with a pure scale + offset (no rotation, z is ignored).
//
// pTransformerArg must point to a GDALRasterIOTransformerStruct.
// When bDstToSrc is non-zero each point is mapped as
//   x = x * dfXRatioDstToSrc + dfXOff (same for y),
// otherwise the inverse mapping
//   x = (x - dfXOff) / dfXRatioDstToSrc
// is applied. The nPointCount coordinates in x[] / y[] are updated in place
// and every panSuccess[] entry is set to TRUE. Always returns TRUE.
static int GDALRasterIOTransformer(void *pTransformerArg, int bDstToSrc,
                                   int nPointCount, double *x, double *y,
                                   double * /* z */, int *panSuccess)
{
    const auto *psXform =
        static_cast<const GDALRasterIOTransformerStruct *>(pTransformerArg);
    const double dfOffX = psXform->dfXOff;
    const double dfOffY = psXform->dfYOff;
    const double dfScaleX = psXform->dfXRatioDstToSrc;
    const double dfScaleY = psXform->dfYRatioDstToSrc;

    for (int iPt = 0; iPt < nPointCount; ++iPt)
    {
        if (bDstToSrc)
        {
            // Forward direction: destination pixel -> source pixel.
            x[iPt] = x[iPt] * dfScaleX + dfOffX;
            y[iPt] = y[iPt] * dfScaleY + dfOffY;
        }
        else
        {
            // Inverse direction: source pixel -> destination pixel.
            x[iPt] = (x[iPt] - dfOffX) / dfScaleX;
            y[iPt] = (y[iPt] - dfOffY) / dfScaleY;
        }
        panSuccess[iPt] = TRUE;
    }

    return TRUE;
}
1023 :
1024 : /************************************************************************/
1025 : /* RasterIOResampled() */
1026 : /************************************************************************/
1027 :
1028 : //! @cond Doxygen_Suppress
1029 14258 : CPLErr GDALRasterBand::RasterIOResampled(
1030 : GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
1031 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
1032 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
1033 : {
1034 : // Determine if we use warping resampling or overview resampling
1035 : const bool bUseWarp =
1036 14258 : (GDALDataTypeIsComplex(eDataType) &&
1037 14417 : psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
1038 159 : psExtraArg->eResampleAlg != GRIORA_Mode);
1039 :
1040 14258 : double dfXOff = nXOff;
1041 14258 : double dfYOff = nYOff;
1042 14258 : double dfXSize = nXSize;
1043 14258 : double dfYSize = nYSize;
1044 14258 : if (psExtraArg->bFloatingPointWindowValidity)
1045 : {
1046 13512 : dfXOff = psExtraArg->dfXOff;
1047 13512 : dfYOff = psExtraArg->dfYOff;
1048 13512 : dfXSize = psExtraArg->dfXSize;
1049 13512 : dfYSize = psExtraArg->dfYSize;
1050 : }
1051 :
1052 14258 : const double dfXRatioDstToSrc = dfXSize / nBufXSize;
1053 14258 : const double dfYRatioDstToSrc = dfYSize / nBufYSize;
1054 :
1055 : // Determine the coordinates in the "virtual" output raster to see
1056 : // if there are not integers, in which case we will use them as a shift
1057 : // so that subwindow extracts give the exact same results as entire raster
1058 : // scaling.
1059 14258 : double dfDestXOff = dfXOff / dfXRatioDstToSrc;
1060 14258 : bool bHasXOffVirtual = false;
1061 14258 : int nDestXOffVirtual = 0;
1062 14258 : if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
1063 : {
1064 13930 : bHasXOffVirtual = true;
1065 13930 : dfXOff = nXOff;
1066 13930 : nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
1067 : }
1068 :
1069 14258 : double dfDestYOff = dfYOff / dfYRatioDstToSrc;
1070 14258 : bool bHasYOffVirtual = false;
1071 14258 : int nDestYOffVirtual = 0;
1072 14258 : if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
1073 : {
1074 13926 : bHasYOffVirtual = true;
1075 13926 : dfYOff = nYOff;
1076 13926 : nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
1077 : }
1078 :
1079 : // Create a MEM dataset that wraps the output buffer.
1080 : GDALDataset *poMEMDS;
1081 14258 : void *pTempBuffer = nullptr;
1082 14258 : GSpacing nPSMem = nPixelSpace;
1083 14258 : GSpacing nLSMem = nLineSpace;
1084 14258 : void *pDataMem = pData;
1085 14258 : GDALDataType eDTMem = eBufType;
1086 14258 : if (eBufType != eDataType && !GDAL_GET_OPERATE_IN_BUF_TYPE(*psExtraArg))
1087 : {
1088 4 : nPSMem = GDALGetDataTypeSizeBytes(eDataType);
1089 4 : nLSMem = nPSMem * nBufXSize;
1090 : pTempBuffer =
1091 4 : VSI_MALLOC2_VERBOSE(nBufYSize, static_cast<size_t>(nLSMem));
1092 4 : if (pTempBuffer == nullptr)
1093 0 : return CE_Failure;
1094 4 : pDataMem = pTempBuffer;
1095 4 : eDTMem = eDataType;
1096 : }
1097 :
1098 : poMEMDS =
1099 14258 : MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
1100 : nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr);
1101 14258 : GByte *pabyData = static_cast<GByte *>(pDataMem) -
1102 14258 : nPSMem * nDestXOffVirtual - nLSMem * nDestYOffVirtual;
1103 14258 : GDALRasterBandH hMEMBand = MEMCreateRasterBandEx(
1104 : poMEMDS, 1, pabyData, eDTMem, nPSMem, nLSMem, false);
1105 14258 : poMEMDS->SetBand(1, GDALRasterBand::FromHandle(hMEMBand));
1106 :
1107 14258 : const char *pszNBITS = GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
1108 14258 : const int nNBITS = pszNBITS ? atoi(pszNBITS) : 0;
1109 14258 : if (pszNBITS)
1110 6 : GDALRasterBand::FromHandle(hMEMBand)->SetMetadataItem(
1111 6 : "NBITS", pszNBITS, "IMAGE_STRUCTURE");
1112 :
1113 14258 : CPLErr eErr = CE_None;
1114 :
1115 : // Do the resampling.
1116 14258 : if (bUseWarp)
1117 : {
1118 149 : int bHasNoData = FALSE;
1119 149 : double dfNoDataValue = GetNoDataValue(&bHasNoData);
1120 :
1121 149 : VRTDatasetH hVRTDS = nullptr;
1122 149 : GDALRasterBandH hVRTBand = nullptr;
1123 149 : if (GetDataset() == nullptr)
1124 : {
1125 : /* Create VRT dataset that wraps the whole dataset */
1126 0 : hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
1127 0 : VRTAddBand(hVRTDS, eDataType, nullptr);
1128 0 : hVRTBand = GDALGetRasterBand(hVRTDS, 1);
1129 0 : VRTAddSimpleSource(hVRTBand, this, 0, 0, nRasterXSize, nRasterYSize,
1130 : 0, 0, nRasterXSize, nRasterYSize, nullptr,
1131 : VRT_NODATA_UNSET);
1132 :
1133 : /* Add a mask band if needed */
1134 0 : if (GetMaskFlags() != GMF_ALL_VALID)
1135 : {
1136 0 : GDALDataset::FromHandle(hVRTDS)->CreateMaskBand(0);
1137 : VRTSourcedRasterBand *poVRTMaskBand =
1138 : reinterpret_cast<VRTSourcedRasterBand *>(
1139 : reinterpret_cast<GDALRasterBand *>(hVRTBand)
1140 0 : ->GetMaskBand());
1141 0 : poVRTMaskBand->AddMaskBandSource(this, 0, 0, nRasterXSize,
1142 0 : nRasterYSize, 0, 0,
1143 0 : nRasterXSize, nRasterYSize);
1144 : }
1145 : }
1146 :
1147 149 : GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
1148 149 : switch (psExtraArg->eResampleAlg)
1149 : {
1150 0 : case GRIORA_NearestNeighbour:
1151 0 : psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
1152 0 : break;
1153 147 : case GRIORA_Bilinear:
1154 147 : psWarpOptions->eResampleAlg = GRA_Bilinear;
1155 147 : break;
1156 0 : case GRIORA_Cubic:
1157 0 : psWarpOptions->eResampleAlg = GRA_Cubic;
1158 0 : break;
1159 0 : case GRIORA_CubicSpline:
1160 0 : psWarpOptions->eResampleAlg = GRA_CubicSpline;
1161 0 : break;
1162 0 : case GRIORA_Lanczos:
1163 0 : psWarpOptions->eResampleAlg = GRA_Lanczos;
1164 0 : break;
1165 0 : case GRIORA_Average:
1166 0 : psWarpOptions->eResampleAlg = GRA_Average;
1167 0 : break;
1168 2 : case GRIORA_RMS:
1169 2 : psWarpOptions->eResampleAlg = GRA_RMS;
1170 2 : break;
1171 0 : case GRIORA_Mode:
1172 0 : psWarpOptions->eResampleAlg = GRA_Mode;
1173 0 : break;
1174 0 : default:
1175 0 : CPLAssert(false);
1176 : psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
1177 : break;
1178 : }
1179 149 : psWarpOptions->hSrcDS = hVRTDS ? hVRTDS : GetDataset();
1180 149 : psWarpOptions->hDstDS = poMEMDS;
1181 149 : psWarpOptions->nBandCount = 1;
1182 149 : int nSrcBandNumber = hVRTDS ? 1 : nBand;
1183 149 : int nDstBandNumber = 1;
1184 149 : psWarpOptions->panSrcBands = &nSrcBandNumber;
1185 149 : psWarpOptions->panDstBands = &nDstBandNumber;
1186 298 : psWarpOptions->pfnProgress = psExtraArg->pfnProgress
1187 149 : ? psExtraArg->pfnProgress
1188 : : GDALDummyProgress;
1189 149 : psWarpOptions->pProgressArg = psExtraArg->pProgressData;
1190 149 : psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
1191 149 : if (bHasNoData)
1192 : {
1193 0 : psWarpOptions->papszWarpOptions = CSLSetNameValue(
1194 : psWarpOptions->papszWarpOptions, "INIT_DEST", "NO_DATA");
1195 0 : if (psWarpOptions->padfSrcNoDataReal == nullptr)
1196 : {
1197 0 : psWarpOptions->padfSrcNoDataReal =
1198 0 : static_cast<double *>(CPLMalloc(sizeof(double)));
1199 0 : psWarpOptions->padfSrcNoDataReal[0] = dfNoDataValue;
1200 : }
1201 :
1202 0 : if (psWarpOptions->padfDstNoDataReal == nullptr)
1203 : {
1204 0 : psWarpOptions->padfDstNoDataReal =
1205 0 : static_cast<double *>(CPLMalloc(sizeof(double)));
1206 0 : psWarpOptions->padfDstNoDataReal[0] = dfNoDataValue;
1207 : }
1208 : }
1209 :
1210 : GDALRasterIOTransformerStruct sTransformer;
1211 149 : sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
1212 149 : sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
1213 149 : sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
1214 149 : sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
1215 149 : psWarpOptions->pTransformerArg = &sTransformer;
1216 :
1217 : GDALWarpOperationH hWarpOperation =
1218 149 : GDALCreateWarpOperation(psWarpOptions);
1219 149 : eErr = GDALChunkAndWarpImage(hWarpOperation, nDestXOffVirtual,
1220 : nDestYOffVirtual, nBufXSize, nBufYSize);
1221 149 : GDALDestroyWarpOperation(hWarpOperation);
1222 :
1223 149 : psWarpOptions->panSrcBands = nullptr;
1224 149 : psWarpOptions->panDstBands = nullptr;
1225 149 : GDALDestroyWarpOptions(psWarpOptions);
1226 :
1227 149 : if (hVRTDS)
1228 0 : GDALClose(hVRTDS);
1229 : }
1230 : else
1231 : {
1232 : const char *pszResampling =
1233 14109 : GDALRasterIOGetResampleAlg(psExtraArg->eResampleAlg);
1234 14109 : int nKernelRadius = 0;
1235 : GDALResampleFunction pfnResampleFunc =
1236 14109 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
1237 14109 : CPLAssert(pfnResampleFunc);
1238 : GDALDataType eWrkDataType =
1239 14109 : GDALGetOvrWorkDataType(pszResampling, eDataType);
1240 14109 : int nHasNoData = 0;
1241 14109 : double dfNoDataValue = GetNoDataValue(&nHasNoData);
1242 14109 : const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
1243 14109 : if (!bHasNoData)
1244 13977 : dfNoDataValue = 0.0;
1245 :
1246 14109 : int nDstBlockXSize = nBufXSize;
1247 14109 : int nDstBlockYSize = nBufYSize;
1248 14109 : int nFullResXChunk = 0;
1249 14109 : int nFullResYChunk = 0;
1250 : while (true)
1251 : {
1252 14120 : nFullResXChunk =
1253 14120 : 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
1254 14120 : nFullResYChunk =
1255 14120 : 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
1256 14120 : if (nFullResXChunk > nRasterXSize)
1257 4777 : nFullResXChunk = nRasterXSize;
1258 14120 : if (nFullResYChunk > nRasterYSize)
1259 594 : nFullResYChunk = nRasterYSize;
1260 14120 : if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
1261 14062 : (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
1262 : 1024 * 1024))
1263 : break;
1264 : // When operating on the full width of a raster whose block width is
1265 : // the raster width, prefer doing chunks in height.
1266 11 : if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
1267 : nDstBlockYSize > 1)
1268 0 : nDstBlockYSize /= 2;
1269 : /* Otherwise cut the maximal dimension */
1270 11 : else if (nDstBlockXSize > 1 &&
1271 0 : (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
1272 11 : nDstBlockXSize /= 2;
1273 : else
1274 0 : nDstBlockYSize /= 2;
1275 : }
1276 :
1277 14109 : int nOvrXFactor = static_cast<int>(0.5 + dfXRatioDstToSrc);
1278 14109 : int nOvrYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
1279 14109 : if (nOvrXFactor == 0)
1280 2029 : nOvrXFactor = 1;
1281 14109 : if (nOvrYFactor == 0)
1282 2028 : nOvrYFactor = 1;
1283 14109 : int nFullResXSizeQueried =
1284 14109 : nFullResXChunk + 2 * nKernelRadius * nOvrXFactor;
1285 14109 : int nFullResYSizeQueried =
1286 14109 : nFullResYChunk + 2 * nKernelRadius * nOvrYFactor;
1287 :
1288 14109 : if (nFullResXSizeQueried > nRasterXSize)
1289 2734 : nFullResXSizeQueried = nRasterXSize;
1290 14109 : if (nFullResYSizeQueried > nRasterYSize)
1291 332 : nFullResYSizeQueried = nRasterYSize;
1292 :
1293 : void *pChunk =
1294 14109 : VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
1295 : nFullResXSizeQueried, nFullResYSizeQueried);
1296 14109 : GByte *pabyChunkNoDataMask = nullptr;
1297 :
1298 14109 : GDALRasterBand *poMaskBand = GetMaskBand();
1299 14109 : int l_nMaskFlags = GetMaskFlags();
1300 :
1301 14109 : bool bUseNoDataMask = ((l_nMaskFlags & GMF_ALL_VALID) == 0);
1302 14109 : if (bUseNoDataMask)
1303 : {
1304 7525 : pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
1305 : nFullResXSizeQueried, nFullResYSizeQueried));
1306 : }
1307 14109 : if (pChunk == nullptr ||
1308 7525 : (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
1309 : {
1310 0 : GDALClose(poMEMDS);
1311 0 : CPLFree(pChunk);
1312 0 : CPLFree(pabyChunkNoDataMask);
1313 0 : VSIFree(pTempBuffer);
1314 0 : return CE_Failure;
1315 : }
1316 :
1317 14109 : const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
1318 14109 : DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
1319 14109 : int nBlocksDone = 0;
1320 :
1321 : int nDstYOff;
1322 28218 : for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
1323 14109 : nDstYOff += nDstBlockYSize)
1324 : {
1325 : int nDstYCount;
1326 14109 : if (nDstYOff + nDstBlockYSize <= nBufYSize)
1327 14109 : nDstYCount = nDstBlockYSize;
1328 : else
1329 0 : nDstYCount = nBufYSize - nDstYOff;
1330 :
1331 14109 : int nChunkYOff =
1332 14109 : nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
1333 14109 : int nChunkYOff2 = nYOff + 1 +
1334 14109 : static_cast<int>(ceil((nDstYOff + nDstYCount) *
1335 : dfYRatioDstToSrc));
1336 14109 : if (nChunkYOff2 > nRasterYSize)
1337 782 : nChunkYOff2 = nRasterYSize;
1338 14109 : int nYCount = nChunkYOff2 - nChunkYOff;
1339 14109 : CPLAssert(nYCount <= nFullResYChunk);
1340 :
1341 14109 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrYFactor;
1342 14109 : int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrYFactor;
1343 14109 : if (nChunkYOffQueried < 0)
1344 : {
1345 491 : nChunkYSizeQueried += nChunkYOffQueried;
1346 491 : nChunkYOffQueried = 0;
1347 : }
1348 14109 : if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
1349 594 : nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
1350 14109 : CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
1351 :
1352 14109 : int nDstXOff = 0;
1353 28218 : for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
1354 14109 : nDstXOff += nDstBlockXSize)
1355 : {
1356 14109 : int nDstXCount = 0;
1357 14109 : if (nDstXOff + nDstBlockXSize <= nBufXSize)
1358 14109 : nDstXCount = nDstBlockXSize;
1359 : else
1360 0 : nDstXCount = nBufXSize - nDstXOff;
1361 :
1362 14109 : int nChunkXOff =
1363 14109 : nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
1364 14109 : int nChunkXOff2 =
1365 14109 : nXOff + 1 +
1366 14109 : static_cast<int>(
1367 14109 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
1368 14109 : if (nChunkXOff2 > nRasterXSize)
1369 8802 : nChunkXOff2 = nRasterXSize;
1370 14109 : int nXCount = nChunkXOff2 - nChunkXOff;
1371 14109 : CPLAssert(nXCount <= nFullResXChunk);
1372 :
1373 14109 : int nChunkXOffQueried =
1374 14109 : nChunkXOff - nKernelRadius * nOvrXFactor;
1375 14109 : int nChunkXSizeQueried =
1376 14109 : nXCount + 2 * nKernelRadius * nOvrXFactor;
1377 14109 : if (nChunkXOffQueried < 0)
1378 : {
1379 2795 : nChunkXSizeQueried += nChunkXOffQueried;
1380 2795 : nChunkXOffQueried = 0;
1381 : }
1382 14109 : if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
1383 2781 : nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
1384 14109 : CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
1385 :
1386 : // Read the source buffers.
1387 14109 : eErr = RasterIO(GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1388 : nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
1389 : nChunkXSizeQueried, nChunkYSizeQueried,
1390 : eWrkDataType, 0, 0, nullptr);
1391 :
1392 14109 : bool bSkipResample = false;
1393 14109 : bool bNoDataMaskFullyOpaque = false;
1394 14109 : if (eErr == CE_None && bUseNoDataMask)
1395 : {
1396 7525 : eErr = poMaskBand->RasterIO(
1397 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1398 : nChunkXSizeQueried, nChunkYSizeQueried,
1399 : pabyChunkNoDataMask, nChunkXSizeQueried,
1400 : nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
1401 :
1402 : /* Optimizations if mask if fully opaque or transparent */
1403 7525 : int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
1404 7525 : GByte bVal = pabyChunkNoDataMask[0];
1405 7525 : int i = 1;
1406 15237000 : for (; i < nPixels; i++)
1407 : {
1408 15230700 : if (pabyChunkNoDataMask[i] != bVal)
1409 1168 : break;
1410 : }
1411 7525 : if (i == nPixels)
1412 : {
1413 6357 : if (bVal == 0)
1414 : {
1415 12094 : for (int j = 0; j < nDstYCount; j++)
1416 : {
1417 6377 : GDALCopyWords64(&dfNoDataValue, GDT_Float64, 0,
1418 : static_cast<GByte *>(pDataMem) +
1419 6377 : nLSMem * (j + nDstYOff) +
1420 6377 : nDstXOff * nPSMem,
1421 : eDTMem,
1422 : static_cast<int>(nPSMem),
1423 : nDstXCount);
1424 : }
1425 5717 : bSkipResample = true;
1426 : }
1427 : else
1428 : {
1429 640 : bNoDataMaskFullyOpaque = true;
1430 : }
1431 : }
1432 : }
1433 :
1434 14109 : if (!bSkipResample && eErr == CE_None)
1435 : {
1436 8389 : const bool bPropagateNoData = false;
1437 8389 : void *pDstBuffer = nullptr;
1438 8389 : GDALDataType eDstBufferDataType = GDT_Unknown;
1439 : GDALRasterBand *poMEMBand =
1440 8389 : GDALRasterBand::FromHandle(hMEMBand);
1441 8389 : GDALOverviewResampleArgs args;
1442 8389 : args.eSrcDataType = eDataType;
1443 8389 : args.eOvrDataType = poMEMBand->GetRasterDataType();
1444 8389 : args.nOvrXSize = poMEMBand->GetXSize();
1445 8389 : args.nOvrYSize = poMEMBand->GetYSize();
1446 8389 : args.nOvrNBITS = nNBITS;
1447 8389 : args.dfXRatioDstToSrc = dfXRatioDstToSrc;
1448 8389 : args.dfYRatioDstToSrc = dfYRatioDstToSrc;
1449 8389 : args.dfSrcXDelta =
1450 8389 : dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
1451 8389 : args.dfSrcYDelta =
1452 8389 : dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
1453 8389 : args.eWrkDataType = eWrkDataType;
1454 8389 : args.pabyChunkNodataMask =
1455 8389 : bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask;
1456 8389 : args.nChunkXOff =
1457 8389 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
1458 8389 : args.nChunkXSize = nChunkXSizeQueried;
1459 8389 : args.nChunkYOff =
1460 8389 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
1461 8389 : args.nChunkYSize = nChunkYSizeQueried;
1462 8389 : args.nDstXOff = nDstXOff + nDestXOffVirtual;
1463 8389 : args.nDstXOff2 = nDstXOff + nDestXOffVirtual + nDstXCount;
1464 8389 : args.nDstYOff = nDstYOff + nDestYOffVirtual;
1465 8389 : args.nDstYOff2 = nDstYOff + nDestYOffVirtual + nDstYCount;
1466 8389 : args.pszResampling = pszResampling;
1467 8389 : args.bHasNoData = bHasNoData;
1468 8389 : args.dfNoDataValue = dfNoDataValue;
1469 8389 : args.poColorTable = GetColorTable();
1470 8389 : args.bPropagateNoData = bPropagateNoData;
1471 8389 : eErr = pfnResampleFunc(args, pChunk, &pDstBuffer,
1472 : &eDstBufferDataType);
1473 8389 : if (eErr == CE_None)
1474 : {
1475 8389 : eErr = poMEMBand->RasterIO(
1476 : GF_Write, nDstXOff + nDestXOffVirtual,
1477 : nDstYOff + nDestYOffVirtual, nDstXCount, nDstYCount,
1478 : pDstBuffer, nDstXCount, nDstYCount,
1479 : eDstBufferDataType, 0, 0, nullptr);
1480 : }
1481 8389 : CPLFree(pDstBuffer);
1482 : }
1483 :
1484 14109 : nBlocksDone++;
1485 25031 : if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
1486 10922 : !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
1487 : "", psExtraArg->pProgressData))
1488 : {
1489 1 : eErr = CE_Failure;
1490 : }
1491 : }
1492 : }
1493 :
1494 14109 : CPLFree(pChunk);
1495 14109 : CPLFree(pabyChunkNoDataMask);
1496 : }
1497 :
1498 14258 : if (pTempBuffer)
1499 : {
1500 4 : CPL_IGNORE_RET_VAL(poMEMDS->GetRasterBand(1)->RasterIO(
1501 : GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
1502 : pData, nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
1503 : nullptr));
1504 : }
1505 14258 : GDALClose(poMEMDS);
1506 14258 : VSIFree(pTempBuffer);
1507 :
1508 14258 : return eErr;
1509 : }
1510 :
1511 : /************************************************************************/
1512 : /* RasterIOResampled() */
1513 : /************************************************************************/
1514 :
1515 892 : CPLErr GDALDataset::RasterIOResampled(
1516 : GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
1517 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
1518 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
1519 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
1520 :
1521 : {
1522 : #if 0
1523 : // Determine if we use warping resampling or overview resampling
1524 : bool bUseWarp = false;
1525 : if( GDALDataTypeIsComplex( eDataType ) )
1526 : bUseWarp = true;
1527 : #endif
1528 :
1529 892 : double dfXOff = nXOff;
1530 892 : double dfYOff = nYOff;
1531 892 : double dfXSize = nXSize;
1532 892 : double dfYSize = nYSize;
1533 892 : if (psExtraArg->bFloatingPointWindowValidity)
1534 : {
1535 765 : dfXOff = psExtraArg->dfXOff;
1536 765 : dfYOff = psExtraArg->dfYOff;
1537 765 : dfXSize = psExtraArg->dfXSize;
1538 765 : dfYSize = psExtraArg->dfYSize;
1539 : }
1540 :
1541 892 : const double dfXRatioDstToSrc = dfXSize / nBufXSize;
1542 892 : const double dfYRatioDstToSrc = dfYSize / nBufYSize;
1543 :
1544 : // Determine the coordinates in the "virtual" output raster to see
1545 : // if they are integers, in which case we will use them as a shift
1546 : // so that subwindow extracts give the exact same results as entire raster
1547 : // scaling.
1548 892 : double dfDestXOff = dfXOff / dfXRatioDstToSrc;
1549 892 : bool bHasXOffVirtual = false;
1550 892 : int nDestXOffVirtual = 0;
1551 892 : if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
1552 : {
1553 767 : bHasXOffVirtual = true;
1554 767 : dfXOff = nXOff;
1555 767 : nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
1556 : }
1557 :
1558 892 : double dfDestYOff = dfYOff / dfYRatioDstToSrc;
1559 892 : bool bHasYOffVirtual = false;
1560 892 : int nDestYOffVirtual = 0;
1561 892 : if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
1562 : {
1563 727 : bHasYOffVirtual = true;
1564 727 : dfYOff = nYOff;
1565 727 : nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
1566 : }
1567 :
1568 : // Create a MEM dataset that wraps the output buffer.
1569 892 : std::unique_ptr<void, VSIFreeReleaser> pTempBuffer;
1570 892 : GSpacing nPSMem = nPixelSpace;
1571 892 : GSpacing nLSMem = nLineSpace;
1572 892 : GSpacing nBandSpaceMEM = nBandSpace;
1573 892 : void *pDataMem = pData;
1574 892 : GDALDataType eDTMem = eBufType;
1575 892 : GDALRasterBand *poFirstSrcBand = GetRasterBand(panBandMap[0]);
1576 892 : const GDALDataType eDataType = poFirstSrcBand->GetRasterDataType();
1577 892 : if (eBufType != eDataType && !GDAL_GET_OPERATE_IN_BUF_TYPE(*psExtraArg))
1578 : {
1579 2 : nPSMem = GDALGetDataTypeSizeBytes(eDataType);
1580 2 : nLSMem = nPSMem * nBufXSize;
1581 2 : nBandSpaceMEM = nLSMem * nBandCount;
1582 2 : pTempBuffer.reset(VSI_MALLOC3_VERBOSE(nBandCount, nBufYSize,
1583 : static_cast<size_t>(nLSMem)));
1584 2 : if (pTempBuffer == nullptr)
1585 0 : return CE_Failure;
1586 2 : pDataMem = pTempBuffer.get();
1587 2 : eDTMem = eDataType;
1588 : }
1589 :
1590 : auto poMEMDS = std::unique_ptr<GDALDataset>(
1591 892 : MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
1592 1784 : nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr));
1593 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1594 : std::vector<GDALRasterBand *> apoDstBands(nBandCount);
1595 : #endif
1596 892 : int nNBITS = 0;
1597 2896 : for (int i = 0; i < nBandCount; i++)
1598 : {
1599 2004 : GByte *const pBandData = static_cast<GByte *>(pDataMem) -
1600 2004 : nPSMem * nDestXOffVirtual -
1601 2004 : nLSMem * nDestYOffVirtual + nBandSpaceMEM * i;
1602 2004 : auto poMEMBand = GDALRasterBand::FromHandle(MEMCreateRasterBandEx(
1603 : poMEMDS.get(), i + 1, pBandData, eDTMem, nPSMem, nLSMem, false));
1604 2004 : poMEMDS->SetBand(i + 1, poMEMBand);
1605 :
1606 2004 : GDALRasterBand *poSrcBand = GetRasterBand(panBandMap[i]);
1607 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1608 : apoDstBands[i] = poMEMBand;
1609 : #endif
1610 : const char *pszNBITS =
1611 2004 : poSrcBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
1612 2004 : if (pszNBITS)
1613 : {
1614 0 : nNBITS = atoi(pszNBITS);
1615 0 : poMEMDS->GetRasterBand(i + 1)->SetMetadataItem("NBITS", pszNBITS,
1616 0 : "IMAGE_STRUCTURE");
1617 : }
1618 : }
1619 :
1620 892 : CPLErr eErr = CE_None;
1621 :
1622 : // TODO(schwehr): Why disabled? Why not just delete?
1623 : // Looks like this code was initially added as disable by copying
1624 : // from RasterIO here:
1625 : // https://trac.osgeo.org/gdal/changeset/29572
1626 : #if 0
1627 : // Do the resampling.
1628 : if( bUseWarp )
1629 : {
1630 : VRTDatasetH hVRTDS = nullptr;
1631 : GDALRasterBandH hVRTBand = nullptr;
1632 : if( GetDataset() == nullptr )
1633 : {
1634 : /* Create VRT dataset that wraps the whole dataset */
1635 : hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
1636 : VRTAddBand( hVRTDS, eDataType, nullptr );
1637 : hVRTBand = GDALGetRasterBand(hVRTDS, 1);
1638 : VRTAddSimpleSource( (VRTSourcedRasterBandH)hVRTBand,
1639 : (GDALRasterBandH)this,
1640 : 0, 0,
1641 : nRasterXSize, nRasterYSize,
1642 : 0, 0,
1643 : nRasterXSize, nRasterYSize,
1644 : nullptr, VRT_NODATA_UNSET );
1645 :
1646 : /* Add a mask band if needed */
1647 : if( GetMaskFlags() != GMF_ALL_VALID )
1648 : {
1649 : ((GDALDataset*)hVRTDS)->CreateMaskBand(0);
1650 : VRTSourcedRasterBand* poVRTMaskBand =
1651 : (VRTSourcedRasterBand*)(((GDALRasterBand*)hVRTBand)->GetMaskBand());
1652 : poVRTMaskBand->
1653 : AddMaskBandSource( this,
1654 : 0, 0,
1655 : nRasterXSize, nRasterYSize,
1656 : 0, 0,
1657 : nRasterXSize, nRasterYSize);
1658 : }
1659 : }
1660 :
1661 : GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
1662 : psWarpOptions->eResampleAlg = (GDALResampleAlg)psExtraArg->eResampleAlg;
1663 : psWarpOptions->hSrcDS = (GDALDatasetH) (hVRTDS ? hVRTDS : GetDataset());
1664 : psWarpOptions->hDstDS = (GDALDatasetH) poMEMDS;
1665 : psWarpOptions->nBandCount = 1;
1666 : int nSrcBandNumber = (hVRTDS ? 1 : nBand);
1667 : int nDstBandNumber = 1;
1668 : psWarpOptions->panSrcBands = &nSrcBandNumber;
1669 : psWarpOptions->panDstBands = &nDstBandNumber;
1670 : psWarpOptions->pfnProgress = psExtraArg->pfnProgress ?
1671 : psExtraArg->pfnProgress : GDALDummyProgress;
1672 : psWarpOptions->pProgressArg = psExtraArg->pProgressData;
1673 : psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
1674 : GDALRasterIOTransformerStruct sTransformer;
1675 : sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
1676 : sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
1677 : sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
1678 : sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
1679 : psWarpOptions->pTransformerArg = &sTransformer;
1680 :
1681 : GDALWarpOperationH hWarpOperation = GDALCreateWarpOperation(psWarpOptions);
1682 : eErr = GDALChunkAndWarpImage( hWarpOperation,
1683 : nDestXOffVirtual, nDestYOffVirtual,
1684 : nBufXSize, nBufYSize );
1685 : GDALDestroyWarpOperation( hWarpOperation );
1686 :
1687 : psWarpOptions->panSrcBands = nullptr;
1688 : psWarpOptions->panDstBands = nullptr;
1689 : GDALDestroyWarpOptions( psWarpOptions );
1690 :
1691 : if( hVRTDS )
1692 : GDALClose(hVRTDS);
1693 : }
1694 : else
1695 : #endif
1696 : {
1697 : const char *pszResampling =
1698 892 : GDALRasterIOGetResampleAlg(psExtraArg->eResampleAlg);
1699 :
1700 : int nBlockXSize, nBlockYSize;
1701 892 : poFirstSrcBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
1702 :
1703 : int nKernelRadius;
1704 : GDALResampleFunction pfnResampleFunc =
1705 892 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
1706 892 : CPLAssert(pfnResampleFunc);
1707 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1708 : GDALResampleFunctionMultiBands pfnResampleFuncMultiBands =
1709 : GDALGetResampleFunctionMultiBands(pszResampling, &nKernelRadius);
1710 : #endif
1711 : GDALDataType eWrkDataType =
1712 892 : GDALGetOvrWorkDataType(pszResampling, eDataType);
1713 :
1714 892 : int nDstBlockXSize = nBufXSize;
1715 892 : int nDstBlockYSize = nBufYSize;
1716 : int nFullResXChunk, nFullResYChunk;
1717 : while (true)
1718 : {
1719 892 : nFullResXChunk =
1720 892 : 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
1721 892 : nFullResYChunk =
1722 892 : 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
1723 892 : if (nFullResXChunk > nRasterXSize)
1724 591 : nFullResXChunk = nRasterXSize;
1725 892 : if (nFullResYChunk > nRasterYSize)
1726 57 : nFullResYChunk = nRasterYSize;
1727 892 : if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
1728 890 : (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
1729 : 1024 * 1024))
1730 : break;
1731 : // When operating on the full width of a raster whose block width is
1732 : // the raster width, prefer doing chunks in height.
1733 0 : if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
1734 : nDstBlockYSize > 1)
1735 0 : nDstBlockYSize /= 2;
1736 : /* Otherwise cut the maximal dimension */
1737 0 : else if (nDstBlockXSize > 1 &&
1738 0 : (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
1739 0 : nDstBlockXSize /= 2;
1740 : else
1741 0 : nDstBlockYSize /= 2;
1742 : }
1743 :
1744 1784 : int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
1745 892 : static_cast<int>(0.5 + dfYRatioDstToSrc));
1746 892 : if (nOvrFactor == 0)
1747 104 : nOvrFactor = 1;
1748 892 : int nFullResXSizeQueried =
1749 892 : nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
1750 892 : int nFullResYSizeQueried =
1751 892 : nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
1752 :
1753 892 : if (nFullResXSizeQueried > nRasterXSize)
1754 616 : nFullResXSizeQueried = nRasterXSize;
1755 892 : if (nFullResYSizeQueried > nRasterYSize)
1756 60 : nFullResYSizeQueried = nRasterYSize;
1757 :
1758 892 : void *pChunk = VSI_MALLOC3_VERBOSE(
1759 : cpl::fits_on<int>(GDALGetDataTypeSizeBytes(eWrkDataType) *
1760 : nBandCount),
1761 : nFullResXSizeQueried, nFullResYSizeQueried);
1762 892 : GByte *pabyChunkNoDataMask = nullptr;
1763 :
1764 892 : GDALRasterBand *poMaskBand = poFirstSrcBand->GetMaskBand();
1765 892 : int nMaskFlags = poFirstSrcBand->GetMaskFlags();
1766 :
1767 892 : bool bUseNoDataMask = ((nMaskFlags & GMF_ALL_VALID) == 0);
1768 892 : if (bUseNoDataMask)
1769 : {
1770 617 : pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
1771 : nFullResXSizeQueried, nFullResYSizeQueried));
1772 : }
1773 892 : if (pChunk == nullptr ||
1774 617 : (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
1775 : {
1776 0 : CPLFree(pChunk);
1777 0 : CPLFree(pabyChunkNoDataMask);
1778 0 : return CE_Failure;
1779 : }
1780 :
1781 892 : const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
1782 892 : DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
1783 892 : int nBlocksDone = 0;
1784 :
1785 : int nDstYOff;
1786 1784 : for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
1787 892 : nDstYOff += nDstBlockYSize)
1788 : {
1789 : int nDstYCount;
1790 892 : if (nDstYOff + nDstBlockYSize <= nBufYSize)
1791 892 : nDstYCount = nDstBlockYSize;
1792 : else
1793 0 : nDstYCount = nBufYSize - nDstYOff;
1794 :
1795 892 : int nChunkYOff =
1796 892 : nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
1797 892 : int nChunkYOff2 = nYOff + 1 +
1798 892 : static_cast<int>(ceil((nDstYOff + nDstYCount) *
1799 : dfYRatioDstToSrc));
1800 892 : if (nChunkYOff2 > nRasterYSize)
1801 139 : nChunkYOff2 = nRasterYSize;
1802 892 : int nYCount = nChunkYOff2 - nChunkYOff;
1803 892 : CPLAssert(nYCount <= nFullResYChunk);
1804 :
1805 892 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
1806 892 : int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
1807 892 : if (nChunkYOffQueried < 0)
1808 : {
1809 142 : nChunkYSizeQueried += nChunkYOffQueried;
1810 142 : nChunkYOffQueried = 0;
1811 : }
1812 892 : if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
1813 157 : nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
1814 892 : CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
1815 :
1816 : int nDstXOff;
1817 1784 : for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
1818 892 : nDstXOff += nDstBlockXSize)
1819 : {
1820 : int nDstXCount;
1821 892 : if (nDstXOff + nDstBlockXSize <= nBufXSize)
1822 892 : nDstXCount = nDstBlockXSize;
1823 : else
1824 0 : nDstXCount = nBufXSize - nDstXOff;
1825 :
1826 892 : int nChunkXOff =
1827 892 : nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
1828 892 : int nChunkXOff2 =
1829 892 : nXOff + 1 +
1830 892 : static_cast<int>(
1831 892 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
1832 892 : if (nChunkXOff2 > nRasterXSize)
1833 647 : nChunkXOff2 = nRasterXSize;
1834 892 : int nXCount = nChunkXOff2 - nChunkXOff;
1835 892 : CPLAssert(nXCount <= nFullResXChunk);
1836 :
1837 892 : int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
1838 892 : int nChunkXSizeQueried =
1839 892 : nXCount + 2 * nKernelRadius * nOvrFactor;
1840 892 : if (nChunkXOffQueried < 0)
1841 : {
1842 647 : nChunkXSizeQueried += nChunkXOffQueried;
1843 647 : nChunkXOffQueried = 0;
1844 : }
1845 892 : if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
1846 655 : nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
1847 892 : CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
1848 :
1849 892 : bool bSkipResample = false;
1850 892 : bool bNoDataMaskFullyOpaque = false;
1851 892 : if (eErr == CE_None && bUseNoDataMask)
1852 : {
1853 617 : eErr = poMaskBand->RasterIO(
1854 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1855 : nChunkXSizeQueried, nChunkYSizeQueried,
1856 : pabyChunkNoDataMask, nChunkXSizeQueried,
1857 : nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
1858 :
1859 : /* Optimizations if mask if fully opaque or transparent */
1860 617 : const int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
1861 617 : const GByte bVal = pabyChunkNoDataMask[0];
1862 617 : int i = 1; // Used after for.
1863 48197000 : for (; i < nPixels; i++)
1864 : {
1865 48196500 : if (pabyChunkNoDataMask[i] != bVal)
1866 72 : break;
1867 : }
1868 617 : if (i == nPixels)
1869 : {
1870 545 : if (bVal == 0)
1871 : {
1872 373 : GByte abyZero[16] = {0};
1873 780 : for (int iBand = 0; iBand < nBandCount; iBand++)
1874 : {
1875 3499 : for (int j = 0; j < nDstYCount; j++)
1876 : {
1877 3092 : GDALCopyWords64(
1878 : abyZero, GDT_UInt8, 0,
1879 : static_cast<GByte *>(pDataMem) +
1880 3092 : iBand * nBandSpaceMEM +
1881 3092 : nLSMem * (j + nDstYOff) +
1882 3092 : nDstXOff * nPSMem,
1883 : eBufType, static_cast<int>(nPSMem),
1884 : nDstXCount);
1885 : }
1886 : }
1887 373 : bSkipResample = true;
1888 : }
1889 : else
1890 : {
1891 172 : bNoDataMaskFullyOpaque = true;
1892 : }
1893 : }
1894 : }
1895 :
1896 892 : if (!bSkipResample && eErr == CE_None)
1897 : {
1898 : /* Read the source buffers */
1899 516 : eErr = RasterIO(
1900 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1901 : nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
1902 : nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType,
1903 : nBandCount, panBandMap, 0, 0, 0, nullptr);
1904 : }
1905 :
1906 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1907 : if (pfnResampleFuncMultiBands && !bSkipResample &&
1908 : eErr == CE_None)
1909 : {
1910 : eErr = pfnResampleFuncMultiBands(
1911 : dfXRatioDstToSrc, dfYRatioDstToSrc,
1912 : dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
1913 : dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
1914 : eWrkDataType, (GByte *)pChunk, nBandCount,
1915 : bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
1916 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
1917 : nChunkXSizeQueried,
1918 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
1919 : nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
1920 : nDstXOff + nDestXOffVirtual + nDstXCount,
1921 : nDstYOff + nDestYOffVirtual,
1922 : nDstYOff + nDestYOffVirtual + nDstYCount,
1923 : apoDstBands.data(), pszResampling, FALSE /*bHasNoData*/,
1924 : 0.0 /* dfNoDataValue */, nullptr /* color table*/,
1925 : eDataType);
1926 : }
1927 : else
1928 : #endif
1929 : {
1930 : size_t nChunkBandOffset =
1931 892 : static_cast<size_t>(nChunkXSizeQueried) *
1932 892 : nChunkYSizeQueried *
1933 892 : GDALGetDataTypeSizeBytes(eWrkDataType);
1934 2480 : for (int i = 0;
1935 2480 : i < nBandCount && !bSkipResample && eErr == CE_None;
1936 : i++)
1937 : {
1938 1588 : const bool bPropagateNoData = false;
1939 1588 : void *pDstBuffer = nullptr;
1940 1588 : GDALDataType eDstBufferDataType = GDT_Unknown;
1941 : GDALRasterBand *poMEMBand =
1942 1588 : poMEMDS->GetRasterBand(i + 1);
1943 1588 : GDALOverviewResampleArgs args;
1944 1588 : args.eSrcDataType = eDataType;
1945 1588 : args.eOvrDataType = poMEMBand->GetRasterDataType();
1946 1588 : args.nOvrXSize = poMEMBand->GetXSize();
1947 1588 : args.nOvrYSize = poMEMBand->GetYSize();
1948 1588 : args.nOvrNBITS = nNBITS;
1949 1588 : args.dfXRatioDstToSrc = dfXRatioDstToSrc;
1950 1588 : args.dfYRatioDstToSrc = dfYRatioDstToSrc;
1951 1588 : args.dfSrcXDelta =
1952 1588 : dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
1953 1588 : args.dfSrcYDelta =
1954 1588 : dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
1955 1588 : args.eWrkDataType = eWrkDataType;
1956 1588 : args.pabyChunkNodataMask = bNoDataMaskFullyOpaque
1957 1588 : ? nullptr
1958 : : pabyChunkNoDataMask;
1959 1588 : args.nChunkXOff =
1960 1588 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
1961 1588 : args.nChunkXSize = nChunkXSizeQueried;
1962 1588 : args.nChunkYOff =
1963 1588 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
1964 1588 : args.nChunkYSize = nChunkYSizeQueried;
1965 1588 : args.nDstXOff = nDstXOff + nDestXOffVirtual;
1966 1588 : args.nDstXOff2 =
1967 1588 : nDstXOff + nDestXOffVirtual + nDstXCount;
1968 1588 : args.nDstYOff = nDstYOff + nDestYOffVirtual;
1969 1588 : args.nDstYOff2 =
1970 1588 : nDstYOff + nDestYOffVirtual + nDstYCount;
1971 1588 : args.pszResampling = pszResampling;
1972 1588 : args.bHasNoData = false;
1973 1588 : args.dfNoDataValue = 0.0;
1974 1588 : args.poColorTable = nullptr;
1975 1588 : args.bPropagateNoData = bPropagateNoData;
1976 :
1977 : eErr =
1978 3176 : pfnResampleFunc(args,
1979 1588 : reinterpret_cast<GByte *>(pChunk) +
1980 1588 : i * nChunkBandOffset,
1981 : &pDstBuffer, &eDstBufferDataType);
1982 1588 : if (eErr == CE_None)
1983 : {
1984 1588 : eErr = poMEMBand->RasterIO(
1985 : GF_Write, nDstXOff + nDestXOffVirtual,
1986 : nDstYOff + nDestYOffVirtual, nDstXCount,
1987 : nDstYCount, pDstBuffer, nDstXCount, nDstYCount,
1988 : eDstBufferDataType, 0, 0, nullptr);
1989 : }
1990 1588 : CPLFree(pDstBuffer);
1991 : }
1992 : }
1993 :
1994 892 : nBlocksDone++;
1995 1281 : if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
1996 389 : !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
1997 : "", psExtraArg->pProgressData))
1998 : {
1999 0 : eErr = CE_Failure;
2000 : }
2001 : }
2002 : }
2003 :
2004 892 : CPLFree(pChunk);
2005 892 : CPLFree(pabyChunkNoDataMask);
2006 : }
2007 :
2008 892 : if (pTempBuffer)
2009 : {
2010 2 : CPL_IGNORE_RET_VAL(poMEMDS->RasterIO(
2011 : GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
2012 : pData, nBufXSize, nBufYSize, eBufType, nBandCount, nullptr,
2013 : nPixelSpace, nLineSpace, nBandSpace, nullptr));
2014 : }
2015 :
2016 892 : return eErr;
2017 : }
2018 :
2019 : //! @endcond
2020 :
2021 : /************************************************************************/
2022 : /* GDALSwapWords() */
2023 : /************************************************************************/
2024 :
2025 : /**
2026 : * Byte swap words in-place.
2027 : *
2028 : * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
2029 : * a memory array. No assumption is made that the words being swapped are
2030 : * word aligned in memory. Use the CPL_LSB and CPL_MSB macros from cpl_port.h
2031 : * to determine if the current platform is big endian or little endian. Use
2032 : * The macros like CPL_SWAP32() to byte swap single values without the overhead
2033 : * of a function call.
2034 : *
2035 : * @param pData pointer to start of data buffer.
2036 : * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
2037 : * @param nWordCount the number of words to be swapped in this call.
2038 : * @param nWordSkip the byte offset from the start of one word to the start of
2039 : * the next. For packed buffers this is the same as nWordSize.
2040 : */
2041 :
2042 497149 : void CPL_STDCALL GDALSwapWords(void *pData, int nWordSize, int nWordCount,
2043 : int nWordSkip)
2044 :
2045 : {
2046 497149 : if (nWordCount > 0)
2047 497149 : VALIDATE_POINTER0(pData, "GDALSwapWords");
2048 :
2049 497149 : GByte *pabyData = static_cast<GByte *>(pData);
2050 :
2051 497149 : switch (nWordSize)
2052 : {
2053 7234 : case 1:
2054 7234 : break;
2055 :
2056 476905 : case 2:
2057 476905 : CPLAssert(nWordSkip >= 2 || nWordCount == 1);
2058 228062000 : for (int i = 0; i < nWordCount; i++)
2059 : {
2060 227585000 : CPL_SWAP16PTR(pabyData);
2061 227585000 : pabyData += nWordSkip;
2062 : }
2063 476905 : break;
2064 :
2065 10584 : case 4:
2066 10584 : CPLAssert(nWordSkip >= 4 || nWordCount == 1);
2067 10584 : if (CPL_IS_ALIGNED(pabyData, 4) && (nWordSkip % 4) == 0)
2068 : {
2069 29140600 : for (int i = 0; i < nWordCount; i++)
2070 : {
2071 29130000 : *reinterpret_cast<GUInt32 *>(pabyData) = CPL_SWAP32(
2072 : *reinterpret_cast<const GUInt32 *>(pabyData));
2073 29130000 : pabyData += nWordSkip;
2074 10581 : }
2075 : }
2076 : else
2077 : {
2078 9 : for (int i = 0; i < nWordCount; i++)
2079 : {
2080 6 : CPL_SWAP32PTR(pabyData);
2081 6 : pabyData += nWordSkip;
2082 : }
2083 : }
2084 10584 : break;
2085 :
2086 2426 : case 8:
2087 2426 : CPLAssert(nWordSkip >= 8 || nWordCount == 1);
2088 2426 : if (CPL_IS_ALIGNED(pabyData, 8) && (nWordSkip % 8) == 0)
2089 : {
2090 3356900 : for (int i = 0; i < nWordCount; i++)
2091 : {
2092 3354480 : *reinterpret_cast<GUInt64 *>(pabyData) = CPL_SWAP64(
2093 : *reinterpret_cast<const GUInt64 *>(pabyData));
2094 3354480 : pabyData += nWordSkip;
2095 2425 : }
2096 : }
2097 : else
2098 : {
2099 3 : for (int i = 0; i < nWordCount; i++)
2100 : {
2101 2 : CPL_SWAP64PTR(pabyData);
2102 2 : pabyData += nWordSkip;
2103 : }
2104 : }
2105 2426 : break;
2106 :
2107 0 : default:
2108 0 : CPLAssert(false);
2109 : }
2110 : }
2111 :
2112 : /************************************************************************/
2113 : /* GDALSwapWordsEx() */
2114 : /************************************************************************/
2115 :
2116 : /**
2117 : * Byte swap words in-place.
2118 : *
2119 : * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
2120 : * a memory array. No assumption is made that the words being swapped are
2121 : * word aligned in memory. Use the CPL_LSB and CPL_MSB macros from cpl_port.h
2122 : * to determine if the current platform is big endian or little endian. Use
2123 : * The macros like CPL_SWAP32() to byte swap single values without the overhead
2124 : * of a function call.
2125 : *
2126 : * @param pData pointer to start of data buffer.
2127 : * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
2128 : * @param nWordCount the number of words to be swapped in this call.
2129 : * @param nWordSkip the byte offset from the start of one word to the start of
2130 : * the next. For packed buffers this is the same as nWordSize.
2131 : */
2132 6130 : void CPL_STDCALL GDALSwapWordsEx(void *pData, int nWordSize, size_t nWordCount,
2133 : int nWordSkip)
2134 : {
2135 6130 : GByte *pabyData = static_cast<GByte *>(pData);
2136 12260 : while (nWordCount)
2137 : {
2138 : // Pick-up a multiple of 8 as max chunk size.
2139 6130 : const int nWordCountSmall =
2140 6130 : (nWordCount > (1 << 30)) ? (1 << 30) : static_cast<int>(nWordCount);
2141 6130 : GDALSwapWords(pabyData, nWordSize, nWordCountSmall, nWordSkip);
2142 6130 : pabyData += static_cast<size_t>(nWordSkip) * nWordCountSmall;
2143 6130 : nWordCount -= nWordCountSmall;
2144 : }
2145 6130 : }
2146 :
2147 : // Place the new GDALCopyWords helpers in an anonymous namespace
2148 : namespace
2149 : {
2150 :
2151 : /************************************************************************/
2152 : /* GDALCopyWordsT() */
2153 : /************************************************************************/
2154 : /**
2155 : * Template function, used to copy data from pSrcData into buffer
2156 : * pDstData, with stride nSrcPixelStride in the source data and
2157 : * stride nDstPixelStride in the destination data. This template can
2158 : * deal with the case where the input data type is real or complex and
2159 : * the output is real.
2160 : *
2161 : * @param pSrcData the source data buffer
2162 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
2163 : * of interest.
2164 : * @param pDstData the destination buffer.
2165 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
2166 : * interest.
2167 : * @param nWordCount the total number of pixel words to copy
2168 : *
2169 : * @code
2170 : * // Assume an input buffer of type GUInt16 named pBufferIn
2171 : * GByte *pBufferOut = new GByte[numBytesOut];
2172 : * GDALCopyWordsT<GUInt16, GByte>(pSrcData, 2, pDstData, 1, numBytesOut);
2173 : * @endcode
2174 : * @note
2175 : * This is a private function, and should not be exposed outside of
2176 : * rasterio.cpp. External users should call the GDALCopyWords driver function.
2177 : */
2178 :
2179 : template <class Tin, class Tout>
2180 49013857 : static void inline GDALCopyWordsGenericT(const Tin *const CPL_RESTRICT pSrcData,
2181 : int nSrcPixelStride,
2182 : Tout *const CPL_RESTRICT pDstData,
2183 : int nDstPixelStride,
2184 : GPtrDiff_t nWordCount)
2185 : {
2186 49013857 : decltype(nWordCount) nDstOffset = 0;
2187 :
2188 49013857 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
2189 49013857 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
2190 356655113 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
2191 : {
2192 307641208 : const Tin tValue =
2193 307641208 : *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
2194 307641208 : Tout *const pOutPixel =
2195 307641208 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2196 :
2197 307641208 : GDALCopyWord(tValue, *pOutPixel);
2198 :
2199 307641208 : nDstOffset += nDstPixelStride;
2200 : }
2201 49013857 : }
2202 :
2203 : template <class Tin, class Tout>
2204 29776660 : static void CPL_NOINLINE GDALCopyWordsT(const Tin *const CPL_RESTRICT pSrcData,
2205 : int nSrcPixelStride,
2206 : Tout *const CPL_RESTRICT pDstData,
2207 : int nDstPixelStride,
2208 : GPtrDiff_t nWordCount)
2209 : {
2210 29776660 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData, nDstPixelStride,
2211 : nWordCount);
2212 29776660 : }
2213 :
2214 : template <class Tin, class Tout>
2215 5080935 : static void inline GDALCopyWordsT_8atatime(
2216 : const Tin *const CPL_RESTRICT pSrcData, int nSrcPixelStride,
2217 : Tout *const CPL_RESTRICT pDstData, int nDstPixelStride,
2218 : GPtrDiff_t nWordCount)
2219 : {
2220 5080935 : decltype(nWordCount) nDstOffset = 0;
2221 :
2222 5080935 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
2223 5080935 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
2224 5080935 : decltype(nWordCount) n = 0;
2225 5080935 : if (nSrcPixelStride == static_cast<int>(sizeof(Tin)) &&
2226 : nDstPixelStride == static_cast<int>(sizeof(Tout)))
2227 : {
2228 52932327 : for (; n < nWordCount - 7; n += 8)
2229 : {
2230 52390796 : const Tin *pInValues = reinterpret_cast<const Tin *>(
2231 52390796 : pSrcDataPtr + (n * nSrcPixelStride));
2232 52390796 : Tout *const pOutPixels =
2233 52390796 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2234 :
2235 52390796 : GDALCopy8Words(pInValues, pOutPixels);
2236 :
2237 52390796 : nDstOffset += 8 * nDstPixelStride;
2238 : }
2239 : }
2240 10465987 : for (; n < nWordCount; n++)
2241 : {
2242 5385052 : const Tin tValue =
2243 5385052 : *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
2244 5385052 : Tout *const pOutPixel =
2245 5385052 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2246 :
2247 5385052 : GDALCopyWord(tValue, *pOutPixel);
2248 :
2249 5385052 : nDstOffset += nDstPixelStride;
2250 : }
2251 5080935 : }
2252 :
2253 : #ifdef HAVE_SSE2
2254 :
2255 : template <class Tout>
2256 1042126 : void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
2257 : int nSrcPixelStride,
2258 : Tout *const CPL_RESTRICT pDstData,
2259 : int nDstPixelStride, GPtrDiff_t nWordCount)
2260 : {
2261 : static_assert(std::is_integral<Tout>::value &&
2262 : sizeof(Tout) == sizeof(uint16_t),
2263 : "Bad Tout");
2264 1042126 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2265 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2266 : {
2267 35752 : decltype(nWordCount) n = 0;
2268 35752 : const __m128i xmm_zero = _mm_setzero_si128();
2269 35752 : GByte *CPL_RESTRICT pabyDstDataPtr =
2270 : reinterpret_cast<GByte *>(pDstData);
2271 1478148 : for (; n < nWordCount - 15; n += 16)
2272 : {
2273 1442396 : __m128i xmm = _mm_loadu_si128(
2274 1442396 : reinterpret_cast<const __m128i *>(pSrcData + n));
2275 1442396 : __m128i xmm0 = _mm_unpacklo_epi8(xmm, xmm_zero);
2276 1442396 : __m128i xmm1 = _mm_unpackhi_epi8(xmm, xmm_zero);
2277 : _mm_storeu_si128(
2278 1442396 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2), xmm0);
2279 : _mm_storeu_si128(
2280 1442396 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2 + 16), xmm1);
2281 : }
2282 111662 : for (; n < nWordCount; n++)
2283 : {
2284 75910 : pDstData[n] = pSrcData[n];
2285 35752 : }
2286 : }
2287 : else
2288 : {
2289 1006371 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2290 : nDstPixelStride, nWordCount);
2291 : }
2292 1042126 : }
2293 :
2294 : template <>
2295 1029400 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2296 : int nSrcPixelStride,
2297 : GUInt16 *const CPL_RESTRICT pDstData,
2298 : int nDstPixelStride, GPtrDiff_t nWordCount)
2299 : {
2300 1029400 : GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
2301 : nDstPixelStride, nWordCount);
2302 1029400 : }
2303 :
2304 : template <>
2305 12726 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2306 : int nSrcPixelStride,
2307 : GInt16 *const CPL_RESTRICT pDstData,
2308 : int nDstPixelStride, GPtrDiff_t nWordCount)
2309 : {
2310 12726 : GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
2311 : nDstPixelStride, nWordCount);
2312 12726 : }
2313 :
2314 : template <class Tout>
2315 16237076 : void GDALCopyWordsByteTo32Bit(const GByte *const CPL_RESTRICT pSrcData,
2316 : int nSrcPixelStride,
2317 : Tout *const CPL_RESTRICT pDstData,
2318 : int nDstPixelStride, GPtrDiff_t nWordCount)
2319 : {
2320 : static_assert(std::is_integral<Tout>::value &&
2321 : sizeof(Tout) == sizeof(uint32_t),
2322 : "Bad Tout");
2323 16237076 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2324 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2325 : {
2326 6532586 : decltype(nWordCount) n = 0;
2327 6532586 : const __m128i xmm_zero = _mm_setzero_si128();
2328 6532586 : GByte *CPL_RESTRICT pabyDstDataPtr =
2329 : reinterpret_cast<GByte *>(pDstData);
2330 74248027 : for (; n < nWordCount - 15; n += 16)
2331 : {
2332 67715361 : __m128i xmm = _mm_loadu_si128(
2333 67715361 : reinterpret_cast<const __m128i *>(pSrcData + n));
2334 67715361 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2335 67715361 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2336 67715361 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2337 67715361 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2338 67715361 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2339 67715361 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2340 : _mm_storeu_si128(
2341 67715361 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4), xmm0);
2342 : _mm_storeu_si128(
2343 67715361 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 16), xmm1);
2344 : _mm_storeu_si128(
2345 67715361 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 32), xmm2);
2346 : _mm_storeu_si128(
2347 67715361 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 48), xmm3);
2348 : }
2349 14825816 : for (; n < nWordCount; n++)
2350 : {
2351 8293240 : pDstData[n] = pSrcData[n];
2352 6532586 : }
2353 : }
2354 : else
2355 : {
2356 9704510 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2357 : nDstPixelStride, nWordCount);
2358 : }
2359 16237076 : }
2360 :
2361 : template <>
2362 476 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2363 : int nSrcPixelStride,
2364 : GUInt32 *const CPL_RESTRICT pDstData,
2365 : int nDstPixelStride, GPtrDiff_t nWordCount)
2366 : {
2367 476 : GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
2368 : nDstPixelStride, nWordCount);
2369 476 : }
2370 :
2371 : template <>
2372 16236600 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2373 : int nSrcPixelStride,
2374 : GInt32 *const CPL_RESTRICT pDstData,
2375 : int nDstPixelStride, GPtrDiff_t nWordCount)
2376 : {
2377 16236600 : GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
2378 : nDstPixelStride, nWordCount);
2379 16236600 : }
2380 :
2381 : template <>
2382 2851070 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2383 : int nSrcPixelStride,
2384 : float *const CPL_RESTRICT pDstData,
2385 : int nDstPixelStride, GPtrDiff_t nWordCount)
2386 : {
2387 2851070 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2388 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2389 : {
2390 228189 : decltype(nWordCount) n = 0;
2391 228189 : const __m128i xmm_zero = _mm_setzero_si128();
2392 228189 : GByte *CPL_RESTRICT pabyDstDataPtr =
2393 : reinterpret_cast<GByte *>(pDstData);
2394 2267160 : for (; n < nWordCount - 15; n += 16)
2395 : {
2396 2038970 : __m128i xmm = _mm_loadu_si128(
2397 2038970 : reinterpret_cast<const __m128i *>(pSrcData + n));
2398 2038970 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2399 2038970 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2400 2038970 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2401 2038970 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2402 2038970 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2403 2038970 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2404 2038970 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2405 2038970 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2406 2038970 : __m128 xmm2_f = _mm_cvtepi32_ps(xmm2);
2407 2038970 : __m128 xmm3_f = _mm_cvtepi32_ps(xmm3);
2408 2038970 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2409 : xmm0_f);
2410 : _mm_storeu_ps(
2411 2038970 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2412 : _mm_storeu_ps(
2413 2038970 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 32), xmm2_f);
2414 : _mm_storeu_ps(
2415 2038970 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 48), xmm3_f);
2416 : }
2417 951437 : for (; n < nWordCount; n++)
2418 : {
2419 723248 : pDstData[n] = pSrcData[n];
2420 228189 : }
2421 : }
2422 : else
2423 : {
2424 2622880 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2425 : nDstPixelStride, nWordCount);
2426 : }
2427 2851070 : }
2428 :
2429 : template <>
2430 170938 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2431 : int nSrcPixelStride,
2432 : double *const CPL_RESTRICT pDstData,
2433 : int nDstPixelStride, GPtrDiff_t nWordCount)
2434 : {
2435 170938 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2436 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2437 : {
2438 147140 : decltype(nWordCount) n = 0;
2439 147140 : const __m128i xmm_zero = _mm_setzero_si128();
2440 147140 : GByte *CPL_RESTRICT pabyDstDataPtr =
2441 : reinterpret_cast<GByte *>(pDstData);
2442 3127410 : for (; n < nWordCount - 15; n += 16)
2443 : {
2444 2980270 : __m128i xmm = _mm_loadu_si128(
2445 2980270 : reinterpret_cast<const __m128i *>(pSrcData + n));
2446 2980270 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2447 2980270 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2448 2980270 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2449 2980270 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2450 2980270 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2451 2980270 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2452 :
2453 : #if defined(__AVX2__) && defined(slightly_slower_than_SSE2)
2454 : _mm256_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2455 : _mm256_cvtepi32_pd(xmm0));
2456 : _mm256_storeu_pd(
2457 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2458 : _mm256_cvtepi32_pd(xmm1));
2459 : _mm256_storeu_pd(
2460 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
2461 : _mm256_cvtepi32_pd(xmm2));
2462 : _mm256_storeu_pd(
2463 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
2464 : _mm256_cvtepi32_pd(xmm3));
2465 : #else
2466 2980270 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2467 2980270 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2468 2980270 : __m128d xmm2_low_d = _mm_cvtepi32_pd(xmm2);
2469 2980270 : __m128d xmm3_low_d = _mm_cvtepi32_pd(xmm3);
2470 2980270 : xmm0 = _mm_srli_si128(xmm0, 8);
2471 2980270 : xmm1 = _mm_srli_si128(xmm1, 8);
2472 2980270 : xmm2 = _mm_srli_si128(xmm2, 8);
2473 2980270 : xmm3 = _mm_srli_si128(xmm3, 8);
2474 2980270 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2475 2980270 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2476 2980270 : __m128d xmm2_high_d = _mm_cvtepi32_pd(xmm2);
2477 2980270 : __m128d xmm3_high_d = _mm_cvtepi32_pd(xmm3);
2478 :
2479 2980270 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2480 : xmm0_low_d);
2481 : _mm_storeu_pd(
2482 2980270 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2483 : xmm0_high_d);
2484 : _mm_storeu_pd(
2485 2980270 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2486 : xmm1_low_d);
2487 : _mm_storeu_pd(
2488 2980270 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2489 : xmm1_high_d);
2490 : _mm_storeu_pd(
2491 2980270 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
2492 : xmm2_low_d);
2493 : _mm_storeu_pd(
2494 2980270 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 80),
2495 : xmm2_high_d);
2496 : _mm_storeu_pd(
2497 2980270 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
2498 : xmm3_low_d);
2499 : _mm_storeu_pd(
2500 2980270 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 112),
2501 : xmm3_high_d);
2502 : #endif
2503 : }
2504 280823 : for (; n < nWordCount; n++)
2505 : {
2506 133683 : pDstData[n] = pSrcData[n];
2507 147140 : }
2508 : }
2509 : else
2510 : {
2511 23798 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2512 : nDstPixelStride, nWordCount);
2513 : }
2514 170938 : }
2515 :
2516 : template <>
2517 148 : CPL_NOINLINE void GDALCopyWordsT(const uint8_t *const CPL_RESTRICT pSrcData,
2518 : int nSrcPixelStride,
2519 : int8_t *const CPL_RESTRICT pDstData,
2520 : int nDstPixelStride, GPtrDiff_t nWordCount)
2521 : {
2522 148 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2523 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2524 : {
2525 142 : decltype(nWordCount) n = 0;
2526 142 : const __m128i xmm_127 = _mm_set1_epi8(127);
2527 146 : for (; n < nWordCount - 31; n += 32)
2528 : {
2529 8 : __m128i xmm0 = _mm_loadu_si128(
2530 4 : reinterpret_cast<const __m128i *>(pSrcData + n));
2531 4 : __m128i xmm1 = _mm_loadu_si128(
2532 4 : reinterpret_cast<const __m128i *>(pSrcData + n + 16));
2533 4 : xmm0 = _mm_min_epu8(xmm0, xmm_127);
2534 4 : xmm1 = _mm_min_epu8(xmm1, xmm_127);
2535 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2536 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
2537 : xmm1);
2538 : }
2539 2424 : for (; n < nWordCount; n++)
2540 : {
2541 2282 : pDstData[n] =
2542 2282 : pSrcData[n] >= 127 ? 127 : static_cast<int8_t>(pSrcData[n]);
2543 142 : }
2544 : }
2545 : else
2546 : {
2547 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2548 : nDstPixelStride, nWordCount);
2549 : }
2550 148 : }
2551 :
2552 : template <>
2553 62 : CPL_NOINLINE void GDALCopyWordsT(const int8_t *const CPL_RESTRICT pSrcData,
2554 : int nSrcPixelStride,
2555 : uint8_t *const CPL_RESTRICT pDstData,
2556 : int nDstPixelStride, GPtrDiff_t nWordCount)
2557 : {
2558 62 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2559 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2560 : {
2561 56 : decltype(nWordCount) n = 0;
2562 : #if !(defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS))
2563 56 : const __m128i xmm_INT8_to_UINT8 = _mm_set1_epi8(-128);
2564 : #endif
2565 117 : for (; n < nWordCount - 31; n += 32)
2566 : {
2567 122 : __m128i xmm0 = _mm_loadu_si128(
2568 61 : reinterpret_cast<const __m128i *>(pSrcData + n));
2569 61 : __m128i xmm1 = _mm_loadu_si128(
2570 61 : reinterpret_cast<const __m128i *>(pSrcData + n + 16));
2571 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2572 : xmm0 = _mm_max_epi8(xmm0, _mm_setzero_si128());
2573 : xmm1 = _mm_max_epi8(xmm1, _mm_setzero_si128());
2574 : #else
2575 61 : xmm0 = _mm_add_epi8(xmm0, xmm_INT8_to_UINT8);
2576 61 : xmm1 = _mm_add_epi8(xmm1, xmm_INT8_to_UINT8);
2577 61 : xmm0 = _mm_max_epu8(xmm0, xmm_INT8_to_UINT8);
2578 61 : xmm1 = _mm_max_epu8(xmm1, xmm_INT8_to_UINT8);
2579 61 : xmm0 = _mm_sub_epi8(xmm0, xmm_INT8_to_UINT8);
2580 61 : xmm1 = _mm_sub_epi8(xmm1, xmm_INT8_to_UINT8);
2581 : #endif
2582 61 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2583 61 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
2584 : xmm1);
2585 : }
2586 352 : for (; n < nWordCount; n++)
2587 : {
2588 296 : pDstData[n] =
2589 296 : pSrcData[n] < 0 ? 0 : static_cast<uint8_t>(pSrcData[n]);
2590 56 : }
2591 : }
2592 : else
2593 : {
2594 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2595 : nDstPixelStride, nWordCount);
2596 : }
2597 62 : }
2598 :
2599 : template <>
2600 6037 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2601 : int nSrcPixelStride,
2602 : uint8_t *const CPL_RESTRICT pDstData,
2603 : int nDstPixelStride, GPtrDiff_t nWordCount)
2604 : {
2605 6037 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2606 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2607 : {
2608 5062 : decltype(nWordCount) n = 0;
2609 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2610 : const auto xmm_MAX_INT16 = _mm_set1_epi16(32767);
2611 : #else
2612 : // In SSE2, min_epu16 does not exist, so shift from
2613 : // UInt16 to SInt16 to be able to use min_epi16
2614 5062 : const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
2615 5062 : const __m128i xmm_m255_shifted = _mm_set1_epi16(255 - 32768);
2616 : #endif
2617 71888 : for (; n < nWordCount - 15; n += 16)
2618 : {
2619 133652 : __m128i xmm0 = _mm_loadu_si128(
2620 66826 : reinterpret_cast<const __m128i *>(pSrcData + n));
2621 66826 : __m128i xmm1 = _mm_loadu_si128(
2622 66826 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2623 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2624 : xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
2625 : xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
2626 : #else
2627 66826 : xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
2628 66826 : xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
2629 66826 : xmm0 = _mm_min_epi16(xmm0, xmm_m255_shifted);
2630 66826 : xmm1 = _mm_min_epi16(xmm1, xmm_m255_shifted);
2631 66826 : xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
2632 66826 : xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
2633 : #endif
2634 66826 : xmm0 = _mm_packus_epi16(xmm0, xmm1);
2635 66826 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2636 : }
2637 16403 : for (; n < nWordCount; n++)
2638 : {
2639 11341 : pDstData[n] =
2640 11341 : pSrcData[n] >= 255 ? 255 : static_cast<uint8_t>(pSrcData[n]);
2641 5062 : }
2642 : }
2643 : else
2644 : {
2645 975 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2646 : nDstPixelStride, nWordCount);
2647 : }
2648 6037 : }
2649 :
2650 : template <>
2651 46 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2652 : int nSrcPixelStride,
2653 : int16_t *const CPL_RESTRICT pDstData,
2654 : int nDstPixelStride, GPtrDiff_t nWordCount)
2655 : {
2656 46 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2657 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2658 : {
2659 40 : decltype(nWordCount) n = 0;
2660 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2661 : const __m128i xmm_MAX_INT16 = _mm_set1_epi16(32767);
2662 : #else
2663 : // In SSE2, min_epu16 does not exist, so shift from
2664 : // UInt16 to SInt16 to be able to use min_epi16
2665 40 : const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
2666 40 : const __m128i xmm_32767_shifted = _mm_set1_epi16(32767 - 32768);
2667 : #endif
2668 169 : for (; n < nWordCount - 15; n += 16)
2669 : {
2670 258 : __m128i xmm0 = _mm_loadu_si128(
2671 129 : reinterpret_cast<const __m128i *>(pSrcData + n));
2672 129 : __m128i xmm1 = _mm_loadu_si128(
2673 129 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2674 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2675 : xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
2676 : xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
2677 : #else
2678 129 : xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
2679 129 : xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
2680 129 : xmm0 = _mm_min_epi16(xmm0, xmm_32767_shifted);
2681 129 : xmm1 = _mm_min_epi16(xmm1, xmm_32767_shifted);
2682 129 : xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
2683 129 : xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
2684 : #endif
2685 129 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2686 129 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
2687 : xmm1);
2688 : }
2689 191 : for (; n < nWordCount; n++)
2690 : {
2691 282 : pDstData[n] = pSrcData[n] >= 32767
2692 : ? 32767
2693 131 : : static_cast<int16_t>(pSrcData[n]);
2694 40 : }
2695 : }
2696 : else
2697 : {
2698 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2699 : nDstPixelStride, nWordCount);
2700 : }
2701 46 : }
2702 :
2703 : template <>
2704 136 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2705 : int nSrcPixelStride,
2706 : uint16_t *const CPL_RESTRICT pDstData,
2707 : int nDstPixelStride, GPtrDiff_t nWordCount)
2708 : {
2709 136 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2710 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2711 : {
2712 93 : decltype(nWordCount) n = 0;
2713 93 : const __m128i xmm_zero = _mm_setzero_si128();
2714 278 : for (; n < nWordCount - 15; n += 16)
2715 : {
2716 370 : __m128i xmm0 = _mm_loadu_si128(
2717 185 : reinterpret_cast<const __m128i *>(pSrcData + n));
2718 185 : __m128i xmm1 = _mm_loadu_si128(
2719 185 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2720 185 : xmm0 = _mm_max_epi16(xmm0, xmm_zero);
2721 185 : xmm1 = _mm_max_epi16(xmm1, xmm_zero);
2722 185 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2723 185 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
2724 : xmm1);
2725 : }
2726 471 : for (; n < nWordCount; n++)
2727 : {
2728 378 : pDstData[n] =
2729 378 : pSrcData[n] < 0 ? 0 : static_cast<uint16_t>(pSrcData[n]);
2730 93 : }
2731 : }
2732 : else
2733 : {
2734 43 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2735 : nDstPixelStride, nWordCount);
2736 : }
2737 136 : }
2738 :
2739 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2740 :
2741 : template <>
2742 : CPL_NOINLINE void GDALCopyWordsT(const uint32_t *const CPL_RESTRICT pSrcData,
2743 : int nSrcPixelStride,
2744 : int32_t *const CPL_RESTRICT pDstData,
2745 : int nDstPixelStride, GPtrDiff_t nWordCount)
2746 : {
2747 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2748 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2749 : {
2750 : decltype(nWordCount) n = 0;
2751 : const __m128i xmm_MAX_INT = _mm_set1_epi32(INT_MAX);
2752 : for (; n < nWordCount - 8; n += 7)
2753 : {
2754 : __m128i xmm0 = _mm_loadu_si128(
2755 : reinterpret_cast<const __m128i *>(pSrcData + n));
2756 : __m128i xmm1 = _mm_loadu_si128(
2757 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
2758 : xmm0 = _mm_min_epu32(xmm0, xmm_MAX_INT);
2759 : xmm1 = _mm_min_epu32(xmm1, xmm_MAX_INT);
2760 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2761 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
2762 : xmm1);
2763 : }
2764 : for (; n < nWordCount; n++)
2765 : {
2766 : pDstData[n] = pSrcData[n] >= INT_MAX
2767 : ? INT_MAX
2768 : : static_cast<int32_t>(pSrcData[n]);
2769 : }
2770 : }
2771 : else
2772 : {
2773 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2774 : nDstPixelStride, nWordCount);
2775 : }
2776 : }
2777 :
2778 : template <>
2779 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
2780 : int nSrcPixelStride,
2781 : uint32_t *const CPL_RESTRICT pDstData,
2782 : int nDstPixelStride, GPtrDiff_t nWordCount)
2783 : {
2784 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2785 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2786 : {
2787 : decltype(nWordCount) n = 0;
2788 : const __m128i xmm_zero = _mm_setzero_si128();
2789 : for (; n < nWordCount - 7; n += 8)
2790 : {
2791 : __m128i xmm0 = _mm_loadu_si128(
2792 : reinterpret_cast<const __m128i *>(pSrcData + n));
2793 : __m128i xmm1 = _mm_loadu_si128(
2794 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
2795 : xmm0 = _mm_max_epi32(xmm0, xmm_zero);
2796 : xmm1 = _mm_max_epi32(xmm1, xmm_zero);
2797 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2798 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
2799 : xmm1);
2800 : }
2801 : for (; n < nWordCount; n++)
2802 : {
2803 : pDstData[n] =
2804 : pSrcData[n] < 0 ? 0 : static_cast<uint32_t>(pSrcData[n]);
2805 : }
2806 : }
2807 : else
2808 : {
2809 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2810 : nDstPixelStride, nWordCount);
2811 : }
2812 : }
2813 :
2814 : #endif // defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2815 :
2816 : template <>
2817 403 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2818 : int nSrcPixelStride,
2819 : float *const CPL_RESTRICT pDstData,
2820 : int nDstPixelStride, GPtrDiff_t nWordCount)
2821 : {
2822 403 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2823 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2824 : {
2825 397 : decltype(nWordCount) n = 0;
2826 397 : const __m128i xmm_zero = _mm_setzero_si128();
2827 397 : GByte *CPL_RESTRICT pabyDstDataPtr =
2828 : reinterpret_cast<GByte *>(pDstData);
2829 1688 : for (; n < nWordCount - 7; n += 8)
2830 : {
2831 1291 : __m128i xmm = _mm_loadu_si128(
2832 1291 : reinterpret_cast<const __m128i *>(pSrcData + n));
2833 1291 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
2834 1291 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
2835 1291 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2836 1291 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2837 1291 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2838 : xmm0_f);
2839 : _mm_storeu_ps(
2840 1291 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2841 : }
2842 1415 : for (; n < nWordCount; n++)
2843 : {
2844 1018 : pDstData[n] = pSrcData[n];
2845 397 : }
2846 : }
2847 : else
2848 : {
2849 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2850 : nDstPixelStride, nWordCount);
2851 : }
2852 403 : }
2853 :
2854 : template <>
2855 1076640 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2856 : int nSrcPixelStride,
2857 : float *const CPL_RESTRICT pDstData,
2858 : int nDstPixelStride, GPtrDiff_t nWordCount)
2859 : {
2860 1076640 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2861 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2862 : {
2863 86742 : decltype(nWordCount) n = 0;
2864 86742 : GByte *CPL_RESTRICT pabyDstDataPtr =
2865 : reinterpret_cast<GByte *>(pDstData);
2866 586119 : for (; n < nWordCount - 7; n += 8)
2867 : {
2868 499377 : __m128i xmm = _mm_loadu_si128(
2869 499377 : reinterpret_cast<const __m128i *>(pSrcData + n));
2870 499377 : const auto sign = _mm_srai_epi16(xmm, 15);
2871 499377 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
2872 499377 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
2873 499377 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2874 499377 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2875 499377 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2876 : xmm0_f);
2877 : _mm_storeu_ps(
2878 499377 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2879 : }
2880 253882 : for (; n < nWordCount; n++)
2881 : {
2882 167140 : pDstData[n] = pSrcData[n];
2883 86742 : }
2884 : }
2885 : else
2886 : {
2887 989901 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2888 : nDstPixelStride, nWordCount);
2889 : }
2890 1076640 : }
2891 :
2892 : template <>
2893 449 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2894 : int nSrcPixelStride,
2895 : double *const CPL_RESTRICT pDstData,
2896 : int nDstPixelStride, GPtrDiff_t nWordCount)
2897 : {
2898 449 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2899 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2900 : {
2901 313 : decltype(nWordCount) n = 0;
2902 313 : const __m128i xmm_zero = _mm_setzero_si128();
2903 313 : GByte *CPL_RESTRICT pabyDstDataPtr =
2904 : reinterpret_cast<GByte *>(pDstData);
2905 829 : for (; n < nWordCount - 7; n += 8)
2906 : {
2907 516 : __m128i xmm = _mm_loadu_si128(
2908 516 : reinterpret_cast<const __m128i *>(pSrcData + n));
2909 516 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
2910 516 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
2911 :
2912 516 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2913 516 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2914 516 : xmm0 = _mm_srli_si128(xmm0, 8);
2915 516 : xmm1 = _mm_srli_si128(xmm1, 8);
2916 516 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2917 516 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2918 :
2919 516 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2920 : xmm0_low_d);
2921 : _mm_storeu_pd(
2922 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2923 : xmm0_high_d);
2924 : _mm_storeu_pd(
2925 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2926 : xmm1_low_d);
2927 : _mm_storeu_pd(
2928 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2929 : xmm1_high_d);
2930 : }
2931 1082 : for (; n < nWordCount; n++)
2932 : {
2933 769 : pDstData[n] = pSrcData[n];
2934 313 : }
2935 : }
2936 : else
2937 : {
2938 136 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2939 : nDstPixelStride, nWordCount);
2940 : }
2941 449 : }
2942 :
2943 : template <>
2944 4923280 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2945 : int nSrcPixelStride,
2946 : double *const CPL_RESTRICT pDstData,
2947 : int nDstPixelStride, GPtrDiff_t nWordCount)
2948 : {
2949 4923280 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2950 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2951 : {
2952 34874 : decltype(nWordCount) n = 0;
2953 34874 : GByte *CPL_RESTRICT pabyDstDataPtr =
2954 : reinterpret_cast<GByte *>(pDstData);
2955 403828 : for (; n < nWordCount - 7; n += 8)
2956 : {
2957 368954 : __m128i xmm = _mm_loadu_si128(
2958 368954 : reinterpret_cast<const __m128i *>(pSrcData + n));
2959 368954 : const auto sign = _mm_srai_epi16(xmm, 15);
2960 368954 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
2961 368954 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
2962 :
2963 368954 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2964 368954 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2965 368954 : xmm0 = _mm_srli_si128(xmm0, 8);
2966 368954 : xmm1 = _mm_srli_si128(xmm1, 8);
2967 368954 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2968 368954 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2969 :
2970 368954 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2971 : xmm0_low_d);
2972 : _mm_storeu_pd(
2973 368954 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2974 : xmm0_high_d);
2975 : _mm_storeu_pd(
2976 368954 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2977 : xmm1_low_d);
2978 : _mm_storeu_pd(
2979 368954 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2980 : xmm1_high_d);
2981 : }
2982 255934 : for (; n < nWordCount; n++)
2983 : {
2984 221060 : pDstData[n] = pSrcData[n];
2985 34874 : }
2986 : }
2987 : else
2988 : {
2989 4888400 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2990 : nDstPixelStride, nWordCount);
2991 : }
2992 4923280 : }
2993 :
2994 : // ---- AVX2 helpers for int32 narrowing (runtime dispatch) ----
2995 :
2996 : #if defined(HAVE_AVX2_DISPATCH)
2997 : #if !defined(_MSC_VER)
2998 : __attribute__((target("avx2")))
2999 : #endif
3000 12723 : static void GDALCopyWordsInt32ToUInt8_AVX2(const int32_t *CPL_RESTRICT pSrc,
3001 : uint8_t *CPL_RESTRICT pDst,
3002 : GPtrDiff_t nWordCount)
3003 : {
3004 12723 : const __m256i permuteIdx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
3005 12723 : GPtrDiff_t n = 0;
3006 958119 : for (; n < nWordCount - 31; n += 32)
3007 : {
3008 : __m256i v0 =
3009 945396 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n));
3010 : __m256i v1 =
3011 945396 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n + 8));
3012 945396 : __m256i v2 = _mm256_loadu_si256(
3013 945396 : reinterpret_cast<const __m256i *>(pSrc + n + 16));
3014 945396 : __m256i v3 = _mm256_loadu_si256(
3015 945396 : reinterpret_cast<const __m256i *>(pSrc + n + 24));
3016 : // Clamp to [0, 255]
3017 : // Pack int32 -> int16 -> uint8, then fix cross-lane ordering
3018 945396 : __m256i ab16 = _mm256_packs_epi32(v0, v1);
3019 945396 : __m256i cd16 = _mm256_packs_epi32(v2, v3);
3020 945396 : __m256i bytes = _mm256_packus_epi16(ab16, cd16);
3021 945396 : bytes = _mm256_permutevar8x32_epi32(bytes, permuteIdx);
3022 945396 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(pDst + n), bytes);
3023 : }
3024 68589 : for (; n < nWordCount; n++)
3025 : {
3026 70955 : pDst[n] = pSrc[n] <= 0 ? 0
3027 15089 : : pSrc[n] >= 255 ? 255
3028 1075 : : static_cast<uint8_t>(pSrc[n]);
3029 : }
3030 12723 : }
3031 :
3032 : #if !defined(_MSC_VER)
3033 : __attribute__((target("avx2")))
3034 : #endif
3035 10277 : static void GDALCopyWordsInt32ToUInt16_AVX2(const int32_t *CPL_RESTRICT pSrc,
3036 : uint16_t *CPL_RESTRICT pDst,
3037 : GPtrDiff_t nWordCount)
3038 : {
3039 : // _mm256_packus_epi32(v0, v1) produces per-lane interleaved result:
3040 : // [v0_lo4, v1_lo4, v0_hi4, v1_hi4] (in uint16 pairs per 32-bit lane)
3041 : // Permute to deinterleave: all v0 values first, then all v1 values
3042 10277 : const __m256i permuteIdx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
3043 10277 : GPtrDiff_t n = 0;
3044 670572 : for (; n < nWordCount - 15; n += 16)
3045 : {
3046 : __m256i v0 =
3047 660295 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n));
3048 : __m256i v1 =
3049 1320590 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n + 8));
3050 : // Clamp to [0, 65535]: _mm256_packus_epi32 saturates uint
3051 660295 : __m256i packed = _mm256_packus_epi32(v0, v1);
3052 : // Fix cross-lane interleave from packus
3053 660295 : packed = _mm256_permutevar8x32_epi32(packed, permuteIdx);
3054 660295 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(pDst + n), packed);
3055 : }
3056 163928 : for (; n < nWordCount; n++)
3057 : {
3058 307282 : pDst[n] = pSrc[n] <= 0 ? 0
3059 153631 : : pSrc[n] >= 65535 ? 65535
3060 153599 : : static_cast<uint16_t>(pSrc[n]);
3061 : }
3062 10277 : }
3063 : #endif // HAVE_AVX2_DISPATCH
3064 :
3065 : // ---- int32 -> uint8 with clamping to [0, 255] ----
3066 : template <>
3067 12837 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
3068 : int nSrcPixelStride,
3069 : uint8_t *const CPL_RESTRICT pDstData,
3070 : int nDstPixelStride, GPtrDiff_t nWordCount)
3071 : {
3072 12837 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
3073 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
3074 : {
3075 : #if defined(HAVE_AVX2_DISPATCH)
3076 12723 : if (CPLHaveRuntimeAVX2())
3077 : {
3078 12723 : GDALCopyWordsInt32ToUInt8_AVX2(pSrcData, pDstData, nWordCount);
3079 12723 : return;
3080 : }
3081 : #endif
3082 : #ifdef HAVE_SSE2
3083 : // SSE2 path: 16 pixels per iteration
3084 0 : decltype(nWordCount) n = 0;
3085 0 : for (; n < nWordCount - 15; n += 16)
3086 : {
3087 0 : __m128i v0 = _mm_loadu_si128(
3088 0 : reinterpret_cast<const __m128i *>(pSrcData + n));
3089 0 : __m128i v1 = _mm_loadu_si128(
3090 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
3091 0 : __m128i v2 = _mm_loadu_si128(
3092 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
3093 0 : __m128i v3 = _mm_loadu_si128(
3094 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 12));
3095 : // Values in [0, 255]: pack int32->int16->uint8
3096 0 : __m128i lo16 = _mm_packs_epi32(v0, v1);
3097 0 : __m128i hi16 = _mm_packs_epi32(v2, v3);
3098 0 : __m128i bytes = _mm_packus_epi16(lo16, hi16);
3099 0 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), bytes);
3100 : }
3101 0 : for (; n < nWordCount; n++)
3102 : #else
3103 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
3104 : #endif
3105 : {
3106 0 : pDstData[n] = pSrcData[n] <= 0 ? 0
3107 0 : : pSrcData[n] >= 255
3108 : ? 255
3109 0 : : static_cast<uint8_t>(pSrcData[n]);
3110 0 : }
3111 : }
3112 : else
3113 : {
3114 114 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
3115 : nDstPixelStride, nWordCount);
3116 : }
3117 : }
3118 :
3119 : // ---- int32 -> uint16 with clamping to [0, 65535] ----
3120 : template <>
3121 10322 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
3122 : int nSrcPixelStride,
3123 : uint16_t *const CPL_RESTRICT pDstData,
3124 : int nDstPixelStride, GPtrDiff_t nWordCount)
3125 : {
3126 10322 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
3127 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
3128 : {
3129 : #if defined(HAVE_AVX2_DISPATCH)
3130 10277 : if (CPLHaveRuntimeAVX2())
3131 : {
3132 10277 : GDALCopyWordsInt32ToUInt16_AVX2(pSrcData, pDstData, nWordCount);
3133 10277 : return;
3134 : }
3135 : #endif
3136 0 : decltype(nWordCount) n = 0;
3137 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
3138 : // SSE4.1: _mm_packus_epi32 directly handles uint saturation
3139 : for (; n < nWordCount - 7; n += 8)
3140 : {
3141 : __m128i v0 = _mm_loadu_si128(
3142 : reinterpret_cast<const __m128i *>(pSrcData + n));
3143 : __m128i v1 = _mm_loadu_si128(
3144 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
3145 : __m128i packed = _mm_packus_epi32(v0, v1);
3146 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), packed);
3147 : }
3148 : #else
3149 : // SSE2: clamp to [0, 65535], bias to signed range, pack, unbias
3150 0 : const __m128i xmm_65535 = _mm_set1_epi32(65535);
3151 0 : const __m128i xmm_bias32 = _mm_set1_epi32(32768);
3152 0 : const __m128i xmm_bias16 = _mm_set1_epi16(-32768);
3153 0 : for (; n < nWordCount - 7; n += 8)
3154 : {
3155 0 : __m128i v0 = _mm_loadu_si128(
3156 0 : reinterpret_cast<const __m128i *>(pSrcData + n));
3157 0 : __m128i v1 = _mm_loadu_si128(
3158 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
3159 : // max(v, 0)
3160 0 : v0 = _mm_andnot_si128(_mm_srai_epi32(v0, 31), v0);
3161 0 : v1 = _mm_andnot_si128(_mm_srai_epi32(v1, 31), v1);
3162 : // min(v, 65535)
3163 0 : __m128i gt0 = _mm_cmpgt_epi32(v0, xmm_65535);
3164 0 : __m128i gt1 = _mm_cmpgt_epi32(v1, xmm_65535);
3165 0 : v0 = _mm_or_si128(_mm_andnot_si128(gt0, v0),
3166 : _mm_and_si128(gt0, xmm_65535));
3167 0 : v1 = _mm_or_si128(_mm_andnot_si128(gt1, v1),
3168 : _mm_and_si128(gt1, xmm_65535));
3169 : // Shift [0, 65535] -> [-32768, 32767] for _mm_packs_epi32
3170 0 : v0 = _mm_sub_epi32(v0, xmm_bias32);
3171 0 : v1 = _mm_sub_epi32(v1, xmm_bias32);
3172 0 : __m128i packed = _mm_packs_epi32(v0, v1);
3173 : // Shift back: sub_epi16(x, -32768) == add 32768 (mod 2^16)
3174 0 : packed = _mm_sub_epi16(packed, xmm_bias16);
3175 0 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), packed);
3176 : }
3177 : #endif
3178 0 : for (; n < nWordCount; n++)
3179 : {
3180 0 : pDstData[n] = pSrcData[n] <= 0 ? 0
3181 0 : : pSrcData[n] >= 65535
3182 : ? 65535
3183 0 : : static_cast<uint16_t>(pSrcData[n]);
3184 0 : }
3185 : }
3186 : else
3187 : {
3188 45 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
3189 : nDstPixelStride, nWordCount);
3190 : }
3191 : }
3192 :
3193 : #endif // HAVE_SSE2
3194 :
3195 : template <>
3196 4426980 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3197 : int nSrcPixelStride,
3198 : GByte *const CPL_RESTRICT pDstData,
3199 : int nDstPixelStride, GPtrDiff_t nWordCount)
3200 : {
3201 4426980 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3202 : nDstPixelStride, nWordCount);
3203 4426980 : }
3204 :
3205 : template <>
3206 38387 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3207 : int nSrcPixelStride,
3208 : GUInt16 *const CPL_RESTRICT pDstData,
3209 : int nDstPixelStride, GPtrDiff_t nWordCount)
3210 : {
3211 38387 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3212 : nDstPixelStride, nWordCount);
3213 38387 : }
3214 :
3215 : template <>
3216 55671 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3217 : int nSrcPixelStride,
3218 : double *const CPL_RESTRICT pDstData,
3219 : int nDstPixelStride, GPtrDiff_t nWordCount)
3220 : {
3221 55671 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3222 : nDstPixelStride, nWordCount);
3223 55671 : }
3224 :
3225 : template <>
3226 122845 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3227 : int nSrcPixelStride,
3228 : float *const CPL_RESTRICT pDstData,
3229 : int nDstPixelStride, GPtrDiff_t nWordCount)
3230 : {
3231 122845 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3232 : nDstPixelStride, nWordCount);
3233 122845 : }
3234 :
3235 : template <>
3236 412 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
3237 : int nSrcPixelStride,
3238 : float *const CPL_RESTRICT pDstData,
3239 : int nDstPixelStride, GPtrDiff_t nWordCount)
3240 : {
3241 412 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3242 : nDstPixelStride, nWordCount);
3243 412 : }
3244 :
3245 : template <>
3246 544 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
3247 : int nSrcPixelStride,
3248 : double *const CPL_RESTRICT pDstData,
3249 : int nDstPixelStride, GPtrDiff_t nWordCount)
3250 : {
3251 544 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3252 : nDstPixelStride, nWordCount);
3253 544 : }
3254 :
3255 : template <>
3256 314423 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3257 : int nSrcPixelStride,
3258 : GByte *const CPL_RESTRICT pDstData,
3259 : int nDstPixelStride, GPtrDiff_t nWordCount)
3260 : {
3261 314423 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3262 : nDstPixelStride, nWordCount);
3263 314423 : }
3264 :
3265 : template <>
3266 55 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3267 : int nSrcPixelStride,
3268 : GInt8 *const CPL_RESTRICT pDstData,
3269 : int nDstPixelStride, GPtrDiff_t nWordCount)
3270 : {
3271 55 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3272 : nDstPixelStride, nWordCount);
3273 55 : }
3274 :
3275 : template <>
3276 15785 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3277 : int nSrcPixelStride,
3278 : GInt16 *const CPL_RESTRICT pDstData,
3279 : int nDstPixelStride, GPtrDiff_t nWordCount)
3280 : {
3281 15785 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3282 : nDstPixelStride, nWordCount);
3283 15785 : }
3284 :
3285 : template <>
3286 61713 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3287 : int nSrcPixelStride,
3288 : GUInt16 *const CPL_RESTRICT pDstData,
3289 : int nDstPixelStride, GPtrDiff_t nWordCount)
3290 : {
3291 61713 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3292 : nDstPixelStride, nWordCount);
3293 61713 : }
3294 :
3295 : template <>
3296 43985 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3297 : int nSrcPixelStride,
3298 : GInt32 *const CPL_RESTRICT pDstData,
3299 : int nDstPixelStride, GPtrDiff_t nWordCount)
3300 : {
3301 43985 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3302 : nDstPixelStride, nWordCount);
3303 43985 : }
3304 :
3305 : template <>
3306 72 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3307 : int nSrcPixelStride,
3308 : GFloat16 *const CPL_RESTRICT pDstData,
3309 : int nDstPixelStride, GPtrDiff_t nWordCount)
3310 : {
3311 72 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3312 : nDstPixelStride, nWordCount);
3313 72 : }
3314 :
3315 : template <>
3316 63 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3317 : int nSrcPixelStride,
3318 : GFloat16 *const CPL_RESTRICT pDstData,
3319 : int nDstPixelStride, GPtrDiff_t nWordCount)
3320 : {
3321 63 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3322 : nDstPixelStride, nWordCount);
3323 63 : }
3324 :
3325 : /************************************************************************/
3326 : /* GDALCopyWordsComplexT() */
3327 : /************************************************************************/
3328 : /**
3329 : * Template function, used to copy data from pSrcData into buffer
3330 : * pDstData, with stride nSrcPixelStride in the source data and
3331 : * stride nDstPixelStride in the destination data. Deals with the
3332 : * complex case, where input is complex and output is complex.
3333 : *
3334 : * @param pSrcData the source data buffer
3335 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
3336 : * of interest.
3337 : * @param pDstData the destination buffer.
3338 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
3339 : * interest.
3340 : * @param nWordCount the total number of pixel words to copy
3341 : *
3342 : */
3343 : template <class Tin, class Tout>
3344 98788 : inline void GDALCopyWordsComplexT(const Tin *const CPL_RESTRICT pSrcData,
3345 : int nSrcPixelStride,
3346 : Tout *const CPL_RESTRICT pDstData,
3347 : int nDstPixelStride, GPtrDiff_t nWordCount)
3348 : {
3349 98788 : decltype(nWordCount) nDstOffset = 0;
3350 98788 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
3351 98788 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
3352 :
3353 5631239 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
3354 : {
3355 5532446 : const Tin *const pPixelIn =
3356 5532446 : reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
3357 5532446 : Tout *const pPixelOut =
3358 5532446 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
3359 :
3360 5532446 : GDALCopyWord(pPixelIn[0], pPixelOut[0]);
3361 5532446 : GDALCopyWord(pPixelIn[1], pPixelOut[1]);
3362 :
3363 5532446 : nDstOffset += nDstPixelStride;
3364 : }
3365 98788 : }
3366 :
3367 : /************************************************************************/
3368 : /* GDALCopyWordsComplexOutT() */
3369 : /************************************************************************/
3370 : /**
3371 : * Template function, used to copy data from pSrcData into buffer
3372 : * pDstData, with stride nSrcPixelStride in the source data and
3373 : * stride nDstPixelStride in the destination data. Deals with the
3374 : * case where the value is real coming in, but complex going out.
3375 : *
3376 : * @param pSrcData the source data buffer
3377 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
3378 : * of interest, in bytes.
3379 : * @param pDstData the destination buffer.
3380 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
3381 : * interest, in bytes.
3382 : * @param nWordCount the total number of pixel words to copy
3383 : *
3384 : */
3385 : template <class Tin, class Tout>
3386 4762 : inline void GDALCopyWordsComplexOutT(const Tin *const CPL_RESTRICT pSrcData,
3387 : int nSrcPixelStride,
3388 : Tout *const CPL_RESTRICT pDstData,
3389 : int nDstPixelStride, GPtrDiff_t nWordCount)
3390 : {
3391 4762 : decltype(nWordCount) nDstOffset = 0;
3392 :
3393 4762 : const Tout tOutZero = static_cast<Tout>(0);
3394 :
3395 4762 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
3396 4762 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
3397 :
3398 1190408 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
3399 : {
3400 1185646 : const Tin tValue =
3401 1185646 : *reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
3402 1185646 : Tout *const pPixelOut =
3403 1185646 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
3404 1185646 : GDALCopyWord(tValue, *pPixelOut);
3405 :
3406 1185646 : pPixelOut[1] = tOutZero;
3407 :
3408 1185646 : nDstOffset += nDstPixelStride;
3409 : }
3410 4762 : }
3411 :
3412 : /************************************************************************/
3413 : /* GDALCopyWordsFromT() */
3414 : /************************************************************************/
3415 : /**
3416 : * Template driver function. Given the input type T, call the appropriate
3417 : * GDALCopyWordsT function template for the desired output type. You should
3418 : * never call this function directly (call GDALCopyWords instead).
3419 : *
3420 : * @param pSrcData source data buffer
3421 : * @param nSrcPixelStride pixel stride in input buffer, in pixel words
3422 : * @param bInComplex input is complex
3423 : * @param pDstData destination data buffer
3424 : * @param eDstType destination data type
3425 : * @param nDstPixelStride pixel stride in output buffer, in pixel words
3426 : * @param nWordCount number of pixel words to be copied
3427 : */
3428 : template <class T>
3429 61292825 : inline void GDALCopyWordsFromT(const T *const CPL_RESTRICT pSrcData,
3430 : int nSrcPixelStride, bool bInComplex,
3431 : void *CPL_RESTRICT pDstData,
3432 : GDALDataType eDstType, int nDstPixelStride,
3433 : GPtrDiff_t nWordCount)
3434 : {
3435 61292825 : switch (eDstType)
3436 : {
3437 4785549 : case GDT_UInt8:
3438 4785549 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3439 : static_cast<unsigned char *>(pDstData),
3440 : nDstPixelStride, nWordCount);
3441 4785549 : break;
3442 1891 : case GDT_Int8:
3443 1891 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3444 : static_cast<signed char *>(pDstData),
3445 : nDstPixelStride, nWordCount);
3446 1891 : break;
3447 1143544 : case GDT_UInt16:
3448 1143544 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3449 : static_cast<unsigned short *>(pDstData),
3450 : nDstPixelStride, nWordCount);
3451 1143544 : break;
3452 4162728 : case GDT_Int16:
3453 4162728 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3454 : static_cast<short *>(pDstData), nDstPixelStride,
3455 : nWordCount);
3456 4162728 : break;
3457 23084 : case GDT_UInt32:
3458 23084 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3459 : static_cast<unsigned int *>(pDstData),
3460 : nDstPixelStride, nWordCount);
3461 23084 : break;
3462 29460149 : case GDT_Int32:
3463 29460149 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3464 : static_cast<int *>(pDstData), nDstPixelStride,
3465 : nWordCount);
3466 29460149 : break;
3467 1250 : case GDT_UInt64:
3468 1250 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3469 : static_cast<std::uint64_t *>(pDstData),
3470 : nDstPixelStride, nWordCount);
3471 1250 : break;
3472 5957 : case GDT_Int64:
3473 5957 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3474 : static_cast<std::int64_t *>(pDstData),
3475 : nDstPixelStride, nWordCount);
3476 5957 : break;
3477 999 : case GDT_Float16:
3478 999 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3479 : static_cast<GFloat16 *>(pDstData), nDstPixelStride,
3480 : nWordCount);
3481 999 : break;
3482 4216050 : case GDT_Float32:
3483 4216050 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3484 : static_cast<float *>(pDstData), nDstPixelStride,
3485 : nWordCount);
3486 4216050 : break;
3487 17387964 : case GDT_Float64:
3488 17387964 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3489 : static_cast<double *>(pDstData), nDstPixelStride,
3490 : nWordCount);
3491 17387964 : break;
3492 94424 : case GDT_CInt16:
3493 94424 : if (bInComplex)
3494 : {
3495 93170 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3496 : static_cast<short *>(pDstData),
3497 : nDstPixelStride, nWordCount);
3498 : }
3499 : else // input is not complex, so we need to promote to a complex
3500 : // buffer
3501 : {
3502 1254 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3503 : static_cast<short *>(pDstData),
3504 : nDstPixelStride, nWordCount);
3505 : }
3506 94424 : break;
3507 1349 : case GDT_CInt32:
3508 1349 : if (bInComplex)
3509 : {
3510 717 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3511 : static_cast<int *>(pDstData),
3512 : nDstPixelStride, nWordCount);
3513 : }
3514 : else // input is not complex, so we need to promote to a complex
3515 : // buffer
3516 : {
3517 632 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3518 : static_cast<int *>(pDstData),
3519 : nDstPixelStride, nWordCount);
3520 : }
3521 1349 : break;
3522 313 : case GDT_CFloat16:
3523 313 : if (bInComplex)
3524 : {
3525 48 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3526 : static_cast<GFloat16 *>(pDstData),
3527 : nDstPixelStride, nWordCount);
3528 : }
3529 : else // input is not complex, so we need to promote to a complex
3530 : // buffer
3531 : {
3532 265 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3533 : static_cast<GFloat16 *>(pDstData),
3534 : nDstPixelStride, nWordCount);
3535 : }
3536 313 : break;
3537 3924 : case GDT_CFloat32:
3538 3924 : if (bInComplex)
3539 : {
3540 3115 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3541 : static_cast<float *>(pDstData),
3542 : nDstPixelStride, nWordCount);
3543 : }
3544 : else // input is not complex, so we need to promote to a complex
3545 : // buffer
3546 : {
3547 809 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3548 : static_cast<float *>(pDstData),
3549 : nDstPixelStride, nWordCount);
3550 : }
3551 3924 : break;
3552 3540 : case GDT_CFloat64:
3553 3540 : if (bInComplex)
3554 : {
3555 1738 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3556 : static_cast<double *>(pDstData),
3557 : nDstPixelStride, nWordCount);
3558 : }
3559 : else // input is not complex, so we need to promote to a complex
3560 : // buffer
3561 : {
3562 1802 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3563 : static_cast<double *>(pDstData),
3564 : nDstPixelStride, nWordCount);
3565 : }
3566 3540 : break;
3567 0 : case GDT_Unknown:
3568 : case GDT_TypeCount:
3569 0 : CPLAssert(false);
3570 : }
3571 61292825 : }
3572 :
3573 : } // end anonymous namespace
3574 :
3575 : /************************************************************************/
3576 : /* GDALReplicateWord() */
3577 : /************************************************************************/
3578 :
3579 : template <class T>
3580 600405 : inline void GDALReplicateWordT(void *pDstData, int nDstPixelStride,
3581 : GPtrDiff_t nWordCount)
3582 : {
3583 600405 : const T valSet = *static_cast<const T *>(pDstData);
3584 600405 : if (nDstPixelStride == static_cast<int>(sizeof(T)))
3585 : {
3586 570592 : T *pDstPtr = static_cast<T *>(pDstData) + 1;
3587 31990099 : while (nWordCount >= 4)
3588 : {
3589 31419540 : nWordCount -= 4;
3590 31419540 : pDstPtr[0] = valSet;
3591 31419540 : pDstPtr[1] = valSet;
3592 31419540 : pDstPtr[2] = valSet;
3593 31419540 : pDstPtr[3] = valSet;
3594 31419540 : pDstPtr += 4;
3595 : }
3596 1476627 : while (nWordCount > 0)
3597 : {
3598 906035 : --nWordCount;
3599 906035 : *pDstPtr = valSet;
3600 906035 : pDstPtr++;
3601 : }
3602 : }
3603 : else
3604 : {
3605 29813 : GByte *pabyDstPtr = static_cast<GByte *>(pDstData) + nDstPixelStride;
3606 1040984 : while (nWordCount > 0)
3607 : {
3608 1011171 : --nWordCount;
3609 1011171 : *reinterpret_cast<T *>(pabyDstPtr) = valSet;
3610 1011171 : pabyDstPtr += nDstPixelStride;
3611 : }
3612 : }
3613 600405 : }
3614 :
3615 1068100 : static void GDALReplicateWord(const void *CPL_RESTRICT pSrcData,
3616 : GDALDataType eSrcType,
3617 : void *CPL_RESTRICT pDstData,
3618 : GDALDataType eDstType, int nDstPixelStride,
3619 : GPtrDiff_t nWordCount)
3620 : {
3621 : /* -----------------------------------------------------------------------
3622 : */
3623 : /* Special case when the source data is always the same value */
3624 : /* (for VRTSourcedRasterBand::IRasterIO and
3625 : * VRTDerivedRasterBand::IRasterIO*/
3626 : /* for example) */
3627 : /* -----------------------------------------------------------------------
3628 : */
3629 : // Let the general translation case do the necessary conversions
3630 : // on the first destination element.
3631 1068100 : GDALCopyWords64(pSrcData, eSrcType, 0, pDstData, eDstType, 0, 1);
3632 :
3633 : // Now copy the first element to the nWordCount - 1 following destination
3634 : // elements.
3635 1068100 : nWordCount--;
3636 1068100 : GByte *pabyDstWord = reinterpret_cast<GByte *>(pDstData) + nDstPixelStride;
3637 :
3638 1068100 : switch (eDstType)
3639 : {
3640 467605 : case GDT_UInt8:
3641 : case GDT_Int8:
3642 : {
3643 467605 : if (nDstPixelStride == 1)
3644 : {
3645 369687 : if (nWordCount > 0)
3646 369687 : memset(pabyDstWord,
3647 369687 : *reinterpret_cast<const GByte *>(pDstData),
3648 : nWordCount);
3649 : }
3650 : else
3651 : {
3652 97918 : GByte valSet = *reinterpret_cast<const GByte *>(pDstData);
3653 67697100 : while (nWordCount > 0)
3654 : {
3655 67599200 : --nWordCount;
3656 67599200 : *pabyDstWord = valSet;
3657 67599200 : pabyDstWord += nDstPixelStride;
3658 : }
3659 : }
3660 467605 : break;
3661 : }
3662 :
3663 : #define CASE_DUPLICATE_SIMPLE(enum_type, c_type) \
3664 : case enum_type: \
3665 : { \
3666 : GDALReplicateWordT<c_type>(pDstData, nDstPixelStride, nWordCount); \
3667 : break; \
3668 : }
3669 :
3670 34513 : CASE_DUPLICATE_SIMPLE(GDT_UInt16, GUInt16)
3671 202455 : CASE_DUPLICATE_SIMPLE(GDT_Int16, GInt16)
3672 74 : CASE_DUPLICATE_SIMPLE(GDT_UInt32, GUInt32)
3673 301585 : CASE_DUPLICATE_SIMPLE(GDT_Int32, GInt32)
3674 41 : CASE_DUPLICATE_SIMPLE(GDT_UInt64, std::uint64_t)
3675 1072 : CASE_DUPLICATE_SIMPLE(GDT_Int64, std::int64_t)
3676 2 : CASE_DUPLICATE_SIMPLE(GDT_Float16, GFloat16)
3677 52858 : CASE_DUPLICATE_SIMPLE(GDT_Float32, float)
3678 7805 : CASE_DUPLICATE_SIMPLE(GDT_Float64, double)
3679 :
3680 : #define CASE_DUPLICATE_COMPLEX(enum_type, c_type) \
3681 : case enum_type: \
3682 : { \
3683 : c_type valSet1 = reinterpret_cast<const c_type *>(pDstData)[0]; \
3684 : c_type valSet2 = reinterpret_cast<const c_type *>(pDstData)[1]; \
3685 : while (nWordCount > 0) \
3686 : { \
3687 : --nWordCount; \
3688 : reinterpret_cast<c_type *>(pabyDstWord)[0] = valSet1; \
3689 : reinterpret_cast<c_type *>(pabyDstWord)[1] = valSet2; \
3690 : pabyDstWord += nDstPixelStride; \
3691 : } \
3692 : break; \
3693 : }
3694 :
3695 784 : CASE_DUPLICATE_COMPLEX(GDT_CInt16, GInt16)
3696 784 : CASE_DUPLICATE_COMPLEX(GDT_CInt32, GInt32)
3697 6 : CASE_DUPLICATE_COMPLEX(GDT_CFloat16, GFloat16)
3698 790 : CASE_DUPLICATE_COMPLEX(GDT_CFloat32, float)
3699 790 : CASE_DUPLICATE_COMPLEX(GDT_CFloat64, double)
3700 :
3701 0 : case GDT_Unknown:
3702 : case GDT_TypeCount:
3703 0 : CPLAssert(false);
3704 : }
3705 1068100 : }
3706 :
3707 : /************************************************************************/
3708 : /* GDALUnrolledCopy() */
3709 : /************************************************************************/
3710 :
3711 : template <class T, int srcStride, int dstStride>
3712 : #if defined(__GNUC__) && defined(__AVX2__)
3713 : __attribute__((optimize("tree-vectorize")))
3714 : #endif
3715 3000825 : static inline void GDALUnrolledCopyGeneric(T *CPL_RESTRICT pDest,
3716 : const T *CPL_RESTRICT pSrc,
3717 : GPtrDiff_t nIters)
3718 : {
3719 : #if !(defined(__GNUC__) && defined(__AVX2__))
3720 3000825 : if (nIters >= 16)
3721 : {
3722 132814787 : for (GPtrDiff_t i = nIters / 16; i != 0; i--)
3723 : {
3724 129934645 : pDest[0 * dstStride] = pSrc[0 * srcStride];
3725 129934645 : pDest[1 * dstStride] = pSrc[1 * srcStride];
3726 129934645 : pDest[2 * dstStride] = pSrc[2 * srcStride];
3727 129934645 : pDest[3 * dstStride] = pSrc[3 * srcStride];
3728 129934645 : pDest[4 * dstStride] = pSrc[4 * srcStride];
3729 129934645 : pDest[5 * dstStride] = pSrc[5 * srcStride];
3730 129934645 : pDest[6 * dstStride] = pSrc[6 * srcStride];
3731 129934645 : pDest[7 * dstStride] = pSrc[7 * srcStride];
3732 129934645 : pDest[8 * dstStride] = pSrc[8 * srcStride];
3733 129934645 : pDest[9 * dstStride] = pSrc[9 * srcStride];
3734 129934645 : pDest[10 * dstStride] = pSrc[10 * srcStride];
3735 129934645 : pDest[11 * dstStride] = pSrc[11 * srcStride];
3736 129934645 : pDest[12 * dstStride] = pSrc[12 * srcStride];
3737 129934645 : pDest[13 * dstStride] = pSrc[13 * srcStride];
3738 129934645 : pDest[14 * dstStride] = pSrc[14 * srcStride];
3739 129934645 : pDest[15 * dstStride] = pSrc[15 * srcStride];
3740 129934645 : pDest += 16 * dstStride;
3741 129934645 : pSrc += 16 * srcStride;
3742 : }
3743 2880267 : nIters = nIters % 16;
3744 : }
3745 : #else
3746 : #pragma GCC unroll 4
3747 : #endif
3748 5162269 : for (GPtrDiff_t i = 0; i < nIters; i++)
3749 : {
3750 2161443 : pDest[i * dstStride] = *pSrc;
3751 2161443 : pSrc += srcStride;
3752 : }
3753 3000825 : }
3754 :
3755 : template <class T, int srcStride, int dstStride>
3756 3000825 : static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
3757 : const T *CPL_RESTRICT pSrc,
3758 : GPtrDiff_t nIters)
3759 : {
3760 3000825 : GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
3761 3000825 : }
3762 :
3763 : #if defined(__AVX2__) && defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
3764 : (defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS))
3765 :
3766 : template <>
3767 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
3768 : const GByte *CPL_RESTRICT pSrc,
3769 : GPtrDiff_t nIters)
3770 : {
3771 : if (nIters > 16)
3772 : {
3773 : // The SSSE3 variant is slightly faster than what the gcc autovectorizer
3774 : // generates
3775 : GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
3776 : }
3777 : else
3778 : {
3779 : for (GPtrDiff_t i = 0; i < nIters; i++)
3780 : {
3781 : pDest[i] = *pSrc;
3782 : pSrc += 3;
3783 : }
3784 : }
3785 : }
3786 :
3787 : #elif defined(HAVE_SSE2) && !(defined(__GNUC__) && defined(__AVX2__))
3788 :
3789 : template <>
3790 354194 : void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
3791 : const GByte *CPL_RESTRICT pSrc,
3792 : GPtrDiff_t nIters)
3793 : {
3794 354194 : decltype(nIters) i = 0;
3795 354194 : if (nIters > 16)
3796 : {
3797 194667 : const __m128i xmm_mask = _mm_set1_epi16(0xff);
3798 : // If we were sure that there would always be 1 trailing byte, we could
3799 : // check against nIters - 15
3800 2988110 : for (; i < nIters - 16; i += 16)
3801 : {
3802 : __m128i xmm0 =
3803 2793440 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
3804 : __m128i xmm1 =
3805 5586890 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
3806 : // Set higher 8bit of each int16 packed word to 0
3807 2793440 : xmm0 = _mm_and_si128(xmm0, xmm_mask);
3808 2793440 : xmm1 = _mm_and_si128(xmm1, xmm_mask);
3809 : // Pack int16 to uint8 and merge back both vector
3810 2793440 : xmm0 = _mm_packus_epi16(xmm0, xmm1);
3811 :
3812 : // Store result
3813 2793440 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
3814 :
3815 2793440 : pSrc += 2 * 16;
3816 : }
3817 : }
3818 4633800 : for (; i < nIters; i++)
3819 : {
3820 4279610 : pDest[i] = *pSrc;
3821 4279610 : pSrc += 2;
3822 : }
3823 354194 : }
3824 :
3825 1 : static void GDALUnrolledCopy_GByte_3_1_SSE2(GByte *CPL_RESTRICT pDest,
3826 : const GByte *CPL_RESTRICT pSrc,
3827 : GPtrDiff_t nIters)
3828 : {
3829 1 : decltype(nIters) i = 0;
3830 1 : const __m128i xmm_mask_ori = _mm_set_epi32(0, 0, 0, 255);
3831 : // If we were sure that there would always be 2 trailing bytes, we could
3832 : // check against nIters - 15
3833 2 : for (; i < nIters - 16; i += 16)
3834 : {
3835 : __m128i xmm0 =
3836 1 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
3837 : __m128i xmm1 =
3838 1 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
3839 : __m128i xmm2 =
3840 1 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
3841 :
3842 1 : auto xmm_mask0 = xmm_mask_ori;
3843 1 : auto xmm_mask1 = _mm_slli_si128(xmm_mask_ori, 6);
3844 1 : auto xmm_mask2 = _mm_slli_si128(xmm_mask_ori, 11);
3845 :
3846 1 : auto xmm = _mm_and_si128(xmm0, xmm_mask0);
3847 1 : auto xmm_res1 = _mm_and_si128(_mm_slli_si128(xmm1, 4), xmm_mask1);
3848 :
3849 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3850 1 : xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
3851 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3852 1 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3853 2 : xmm_res1 = _mm_or_si128(
3854 : xmm_res1, _mm_and_si128(_mm_slli_si128(xmm1, 2), xmm_mask1));
3855 :
3856 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3857 1 : xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
3858 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3859 2 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3860 1 : xmm_res1 = _mm_or_si128(xmm_res1, _mm_and_si128(xmm1, xmm_mask1));
3861 :
3862 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3863 1 : xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
3864 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3865 1 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3866 2 : xmm_res1 = _mm_or_si128(
3867 : xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 2), xmm_mask1));
3868 :
3869 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3870 1 : xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
3871 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3872 1 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3873 3 : xmm_res1 = _mm_or_si128(
3874 : xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 4), xmm_mask1));
3875 1 : xmm = _mm_or_si128(xmm, xmm_res1);
3876 :
3877 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3878 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3879 1 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3880 :
3881 2 : xmm = _mm_or_si128(xmm,
3882 : _mm_and_si128(_mm_slli_si128(xmm2, 10), xmm_mask2));
3883 :
3884 1 : xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
3885 2 : xmm = _mm_or_si128(xmm,
3886 : _mm_and_si128(_mm_slli_si128(xmm2, 8), xmm_mask2));
3887 :
3888 1 : xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
3889 2 : xmm = _mm_or_si128(xmm,
3890 : _mm_and_si128(_mm_slli_si128(xmm2, 6), xmm_mask2));
3891 :
3892 1 : xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
3893 2 : xmm = _mm_or_si128(xmm,
3894 : _mm_and_si128(_mm_slli_si128(xmm2, 4), xmm_mask2));
3895 :
3896 1 : xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
3897 2 : xmm = _mm_or_si128(xmm,
3898 : _mm_and_si128(_mm_slli_si128(xmm2, 2), xmm_mask2));
3899 :
3900 1 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm);
3901 :
3902 1 : pSrc += 3 * 16;
3903 : }
3904 2 : for (; i < nIters; i++)
3905 : {
3906 1 : pDest[i] = *pSrc;
3907 1 : pSrc += 3;
3908 : }
3909 1 : }
3910 :
3911 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
3912 :
3913 : template <>
3914 192265 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
3915 : const GByte *CPL_RESTRICT pSrc,
3916 : GPtrDiff_t nIters)
3917 : {
3918 192265 : if (nIters > 16)
3919 : {
3920 186142 : if (CPLHaveRuntimeSSSE3())
3921 : {
3922 186141 : GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
3923 : }
3924 : else
3925 : {
3926 1 : GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
3927 : }
3928 : }
3929 : else
3930 : {
3931 20384 : for (GPtrDiff_t i = 0; i < nIters; i++)
3932 : {
3933 14261 : pDest[i] = *pSrc;
3934 14261 : pSrc += 3;
3935 : }
3936 : }
3937 192265 : }
3938 :
3939 : #else
3940 :
3941 : template <>
3942 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
3943 : const GByte *CPL_RESTRICT pSrc,
3944 : GPtrDiff_t nIters)
3945 : {
3946 : GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
3947 : }
3948 : #endif
3949 :
3950 : template <>
3951 332657 : void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
3952 : const GByte *CPL_RESTRICT pSrc,
3953 : GPtrDiff_t nIters)
3954 : {
3955 332657 : decltype(nIters) i = 0;
3956 332657 : if (nIters > 16)
3957 : {
3958 327364 : const __m128i xmm_mask = _mm_set1_epi32(0xff);
3959 : // If we were sure that there would always be 3 trailing bytes, we could
3960 : // check against nIters - 15
3961 28043500 : for (; i < nIters - 16; i += 16)
3962 : {
3963 : __m128i xmm0 =
3964 27716100 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
3965 : __m128i xmm1 =
3966 27716100 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
3967 : __m128i xmm2 =
3968 27716100 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
3969 : __m128i xmm3 =
3970 55432200 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 48));
3971 : // Set higher 24bit of each int32 packed word to 0
3972 27716100 : xmm0 = _mm_and_si128(xmm0, xmm_mask);
3973 27716100 : xmm1 = _mm_and_si128(xmm1, xmm_mask);
3974 27716100 : xmm2 = _mm_and_si128(xmm2, xmm_mask);
3975 27716100 : xmm3 = _mm_and_si128(xmm3, xmm_mask);
3976 : // Pack int32 to int16
3977 27716100 : xmm0 = _mm_packs_epi32(xmm0, xmm1);
3978 27716100 : xmm2 = _mm_packs_epi32(xmm2, xmm3);
3979 : // Pack int16 to uint8
3980 27716100 : xmm0 = _mm_packus_epi16(xmm0, xmm2);
3981 :
3982 : // Store result
3983 27716100 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
3984 :
3985 27716100 : pSrc += 4 * 16;
3986 : }
3987 : }
3988 5048740 : for (; i < nIters; i++)
3989 : {
3990 4716080 : pDest[i] = *pSrc;
3991 4716080 : pSrc += 4;
3992 : }
3993 332657 : }
3994 : #endif // HAVE_SSE2
3995 :
3996 : /************************************************************************/
3997 : /* GDALFastCopy() */
3998 : /************************************************************************/
3999 :
4000 : template <class T>
4001 40101500 : static inline void GDALFastCopy(T *CPL_RESTRICT pDest, int nDestStride,
4002 : const T *CPL_RESTRICT pSrc, int nSrcStride,
4003 : GPtrDiff_t nIters)
4004 : {
4005 40101500 : constexpr int sizeofT = static_cast<int>(sizeof(T));
4006 40101500 : if (nIters == 1)
4007 : {
4008 22540480 : *pDest = *pSrc;
4009 : }
4010 17560932 : else if (nDestStride == sizeofT)
4011 : {
4012 14486989 : if (nSrcStride == sizeofT)
4013 : {
4014 13398074 : memcpy(pDest, pSrc, nIters * sizeof(T));
4015 : }
4016 1088849 : else if (nSrcStride == 2 * sizeofT)
4017 : {
4018 357409 : GDALUnrolledCopy<T, 2, 1>(pDest, pSrc, nIters);
4019 : }
4020 731440 : else if (nSrcStride == 3 * sizeofT)
4021 : {
4022 289245 : GDALUnrolledCopy<T, 3, 1>(pDest, pSrc, nIters);
4023 : }
4024 442195 : else if (nSrcStride == 4 * sizeofT)
4025 : {
4026 336639 : GDALUnrolledCopy<T, 4, 1>(pDest, pSrc, nIters);
4027 : }
4028 : else
4029 : {
4030 17229290 : while (nIters-- > 0)
4031 : {
4032 17123750 : *pDest = *pSrc;
4033 17123750 : pSrc += nSrcStride / sizeofT;
4034 17123750 : pDest++;
4035 : }
4036 : }
4037 : }
4038 3073963 : else if (nSrcStride == sizeofT)
4039 : {
4040 3060967 : if (nDestStride == 2 * sizeofT)
4041 : {
4042 151252 : GDALUnrolledCopy<T, 1, 2>(pDest, pSrc, nIters);
4043 : }
4044 2909715 : else if (nDestStride == 3 * sizeofT)
4045 : {
4046 2131771 : GDALUnrolledCopy<T, 1, 3>(pDest, pSrc, nIters);
4047 : }
4048 777937 : else if (nDestStride == 4 * sizeofT)
4049 : {
4050 613625 : GDALUnrolledCopy<T, 1, 4>(pDest, pSrc, nIters);
4051 : }
4052 : else
4053 : {
4054 17169660 : while (nIters-- > 0)
4055 : {
4056 17005410 : *pDest = *pSrc;
4057 17005410 : pSrc++;
4058 17005410 : pDest += nDestStride / sizeofT;
4059 : }
4060 : }
4061 : }
4062 : else
4063 : {
4064 1220108 : while (nIters-- > 0)
4065 : {
4066 1207102 : *pDest = *pSrc;
4067 1207102 : pSrc += nSrcStride / sizeofT;
4068 1207102 : pDest += nDestStride / sizeofT;
4069 : }
4070 : }
4071 40101500 : }
4072 :
4073 : /************************************************************************/
4074 : /* GDALFastCopyByte() */
4075 : /************************************************************************/
4076 :
4077 326320 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
4078 : int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
4079 : int nDstPixelStride, GPtrDiff_t nWordCount)
4080 : {
4081 326320 : GDALFastCopy(pDstData, nDstPixelStride, pSrcData, nSrcPixelStride,
4082 : nWordCount);
4083 326320 : }
4084 :
4085 : /************************************************************************/
4086 : /* GDALCopyWords() */
4087 : /************************************************************************/
4088 :
4089 : /**
4090 : * Copy pixel words from buffer to buffer.
4091 : *
4092 : * @see GDALCopyWords64()
4093 : */
4094 80491000 : void CPL_STDCALL GDALCopyWords(const void *CPL_RESTRICT pSrcData,
4095 : GDALDataType eSrcType, int nSrcPixelStride,
4096 : void *CPL_RESTRICT pDstData,
4097 : GDALDataType eDstType, int nDstPixelStride,
4098 : int nWordCount)
4099 : {
4100 80491000 : GDALCopyWords64(pSrcData, eSrcType, nSrcPixelStride, pDstData, eDstType,
4101 : nDstPixelStride, nWordCount);
4102 80491000 : }
4103 :
4104 : /************************************************************************/
4105 : /* GDALCopyWords64() */
4106 : /************************************************************************/
4107 :
4108 : /**
4109 : * Copy pixel words from buffer to buffer.
4110 : *
4111 : * This function is used to copy pixel word values from one memory buffer
4112 : * to another, with support for conversion between data types, and differing
4113 : * step factors. The data type conversion is done using the following
4114 : * rules:
4115 : * <ul>
4116 : * <li>Values assigned to a lower range integer type are clipped. For
4117 : * instance assigning GDT_Int16 values to a GDT_UInt8 buffer will cause values
4118 : * less than 0 to be set to 0, and values larger than 255 to be set to 255.
4119 : * </li>
4120 : * <li>
4121 : * Assignment from floating point to integer rounds to closest integer.
4122 : * +Infinity is mapped to the largest integer. -Infinity is mapped to the
4123 : * smallest integer. NaN is mapped to 0.
4124 : * </li>
4125 : * <li>
4126 : * Assignment from non-complex to complex will result in the imaginary part
4127 : * being set to zero on output.
4128 : * </li>
4129 : * <li> Assignment from complex to
4130 : * non-complex will result in the complex portion being lost and the real
4131 : * component being preserved (<i>not magnitude!</i>).
4132 : * </li>
4133 : * </ul>
4134 : *
4135 : * No assumptions are made about the source or destination words occurring
4136 : * on word boundaries. It is assumed that all values are in native machine
4137 : * byte order.
4138 : *
4139 : * @param pSrcData Pointer to source data to be converted.
4140 : * @param eSrcType the source data type (see GDALDataType enum)
4141 : * @param nSrcPixelStride Source pixel stride (i.e. distance between 2 words),
4142 : * in bytes
4143 : * @param pDstData Pointer to buffer where destination data should go
4144 : * @param eDstType the destination data type (see GDALDataType enum)
4145 : * @param nDstPixelStride Destination pixel stride (i.e. distance between 2
4146 : * words), in bytes
4147 : * @param nWordCount number of words to be copied
4148 : *
4149 : * @note
4150 : * When adding a new data type to GDAL, you must do the following to
4151 : * support it properly within the GDALCopyWords function:
4152 : * 1. Add the data type to the switch on eSrcType in GDALCopyWords64.
4153 : * This should invoke the appropriate GDALCopyWordsFromT wrapper.
4154 : * 2. Add the data type to the switch on eDstType in GDALCopyWordsFromT.
4155 : * This should call the appropriate GDALCopyWordsT template.
4156 : * 3. If appropriate, overload the appropriate CopyWord template in the
4157 : * above namespace. This will ensure that any conversion issues are
4158 : * handled (cases like the float -> int32 case, where the min/max
4159 : * values are subject to roundoff error).
4160 : */
4161 :
4162 116774000 : void CPL_STDCALL GDALCopyWords64(const void *CPL_RESTRICT pSrcData,
4163 : GDALDataType eSrcType, int nSrcPixelStride,
4164 : void *CPL_RESTRICT pDstData,
4165 : GDALDataType eDstType, int nDstPixelStride,
4166 : GPtrDiff_t nWordCount)
4167 :
4168 : {
4169 : // On platforms where alignment matters, be careful
4170 116774000 : const int nSrcDataTypeSize = GDALGetDataTypeSizeBytes(eSrcType);
4171 116774000 : const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(eDstType);
4172 116774000 : if (CPL_UNLIKELY(nSrcDataTypeSize == 0 || nDstDataTypeSize == 0))
4173 : {
4174 2 : CPLError(CE_Failure, CPLE_NotSupported,
4175 : "GDALCopyWords64(): unsupported GDT_Unknown/GDT_TypeCount "
4176 : "argument");
4177 2 : return;
4178 : }
4179 116774000 : if (!(eSrcType == eDstType && nSrcPixelStride == nDstPixelStride) &&
4180 66323000 : ((reinterpret_cast<uintptr_t>(pSrcData) % nSrcDataTypeSize) != 0 ||
4181 66323000 : (reinterpret_cast<uintptr_t>(pDstData) % nDstDataTypeSize) != 0 ||
4182 66322600 : (nSrcPixelStride % nSrcDataTypeSize) != 0 ||
4183 66322500 : (nDstPixelStride % nDstDataTypeSize) != 0))
4184 : {
4185 905 : if (eSrcType == eDstType)
4186 : {
4187 34800 : for (decltype(nWordCount) i = 0; i < nWordCount; i++)
4188 : {
4189 34000 : memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
4190 : static_cast<const GByte *>(pSrcData) +
4191 34000 : nSrcPixelStride * i,
4192 : nDstDataTypeSize);
4193 : }
4194 : }
4195 : else
4196 : {
4197 210 : const auto getAlignedPtr = [](GByte *ptr, int align)
4198 : {
4199 : return ptr +
4200 210 : ((align - (reinterpret_cast<uintptr_t>(ptr) % align)) %
4201 210 : align);
4202 : };
4203 :
4204 : // The largest we need is for CFloat64 (16 bytes), so 32 bytes to
4205 : // be sure to get correctly aligned pointer.
4206 105 : constexpr size_t SIZEOF_CFLOAT64 = 2 * sizeof(double);
4207 : GByte abySrcBuffer[2 * SIZEOF_CFLOAT64];
4208 : GByte abyDstBuffer[2 * SIZEOF_CFLOAT64];
4209 : GByte *pabySrcBuffer =
4210 105 : getAlignedPtr(abySrcBuffer, nSrcDataTypeSize);
4211 : GByte *pabyDstBuffer =
4212 105 : getAlignedPtr(abyDstBuffer, nDstDataTypeSize);
4213 3360 : for (decltype(nWordCount) i = 0; i < nWordCount; i++)
4214 : {
4215 3255 : memcpy(pabySrcBuffer,
4216 : static_cast<const GByte *>(pSrcData) +
4217 3255 : nSrcPixelStride * i,
4218 : nSrcDataTypeSize);
4219 3255 : GDALCopyWords64(pabySrcBuffer, eSrcType, 0, pabyDstBuffer,
4220 : eDstType, 0, 1);
4221 3255 : memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
4222 : pabyDstBuffer, nDstDataTypeSize);
4223 : }
4224 : }
4225 905 : return;
4226 : }
4227 :
4228 : // Deal with the case where we're replicating a single word into the
4229 : // provided buffer
4230 116773000 : if (nSrcPixelStride == 0 && nWordCount > 1)
4231 : {
4232 1068100 : GDALReplicateWord(pSrcData, eSrcType, pDstData, eDstType,
4233 : nDstPixelStride, nWordCount);
4234 1068100 : return;
4235 : }
4236 :
4237 115705000 : if (eSrcType == eDstType)
4238 : {
4239 54674100 : if (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8)
4240 : {
4241 17979400 : GDALFastCopy(static_cast<GByte *>(pDstData), nDstPixelStride,
4242 : static_cast<const GByte *>(pSrcData), nSrcPixelStride,
4243 : nWordCount);
4244 17979400 : return;
4245 : }
4246 :
4247 36694700 : if (nSrcDataTypeSize == 2 && (nSrcPixelStride % 2) == 0 &&
4248 21795700 : (nDstPixelStride % 2) == 0)
4249 : {
4250 21795700 : GDALFastCopy(static_cast<short *>(pDstData), nDstPixelStride,
4251 : static_cast<const short *>(pSrcData), nSrcPixelStride,
4252 : nWordCount);
4253 21795700 : return;
4254 : }
4255 :
4256 14899000 : if (nWordCount == 1)
4257 : {
4258 : #if defined(CSA_BUILD) || defined(__COVERITY__)
4259 : // Avoid false positives...
4260 : memcpy(pDstData, pSrcData, nSrcDataTypeSize);
4261 : #else
4262 14411900 : if (nSrcDataTypeSize == 2)
4263 0 : memcpy(pDstData, pSrcData, 2);
4264 14411900 : else if (nSrcDataTypeSize == 4)
4265 13807600 : memcpy(pDstData, pSrcData, 4);
4266 604283 : else if (nSrcDataTypeSize == 8)
4267 587678 : memcpy(pDstData, pSrcData, 8);
4268 : else /* if( eSrcType == GDT_CFloat64 ) */
4269 16605 : memcpy(pDstData, pSrcData, 16);
4270 : #endif
4271 14411900 : return;
4272 : }
4273 :
4274 : // Let memcpy() handle the case where we're copying a packed buffer
4275 : // of pixels.
4276 487145 : if (nSrcPixelStride == nDstPixelStride)
4277 : {
4278 225301 : if (nSrcPixelStride == nSrcDataTypeSize)
4279 : {
4280 225233 : memcpy(pDstData, pSrcData, nWordCount * nSrcDataTypeSize);
4281 225233 : return;
4282 : }
4283 : }
4284 : }
4285 :
4286 : // Handle the more general case -- deals with conversion of data types
4287 : // directly.
4288 61292800 : switch (eSrcType)
4289 : {
4290 20306200 : case GDT_UInt8:
4291 20306200 : GDALCopyWordsFromT<unsigned char>(
4292 : static_cast<const unsigned char *>(pSrcData), nSrcPixelStride,
4293 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4294 20306200 : break;
4295 1786 : case GDT_Int8:
4296 1786 : GDALCopyWordsFromT<signed char>(
4297 : static_cast<const signed char *>(pSrcData), nSrcPixelStride,
4298 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4299 1786 : break;
4300 55311 : case GDT_UInt16:
4301 55311 : GDALCopyWordsFromT<unsigned short>(
4302 : static_cast<const unsigned short *>(pSrcData), nSrcPixelStride,
4303 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4304 55311 : break;
4305 6519830 : case GDT_Int16:
4306 6519830 : GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
4307 : nSrcPixelStride, false, pDstData,
4308 : eDstType, nDstPixelStride, nWordCount);
4309 6519830 : break;
4310 8016 : case GDT_UInt32:
4311 8016 : GDALCopyWordsFromT<unsigned int>(
4312 : static_cast<const unsigned int *>(pSrcData), nSrcPixelStride,
4313 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4314 8016 : break;
4315 12254800 : case GDT_Int32:
4316 12254800 : GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
4317 : nSrcPixelStride, false, pDstData, eDstType,
4318 : nDstPixelStride, nWordCount);
4319 12254800 : break;
4320 2205 : case GDT_UInt64:
4321 2205 : GDALCopyWordsFromT<std::uint64_t>(
4322 : static_cast<const std::uint64_t *>(pSrcData), nSrcPixelStride,
4323 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4324 2205 : break;
4325 11729 : case GDT_Int64:
4326 11729 : GDALCopyWordsFromT<std::int64_t>(
4327 : static_cast<const std::int64_t *>(pSrcData), nSrcPixelStride,
4328 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4329 11729 : break;
4330 1387 : case GDT_Float16:
4331 1387 : GDALCopyWordsFromT<GFloat16>(
4332 : static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, false,
4333 : pDstData, eDstType, nDstPixelStride, nWordCount);
4334 1387 : break;
4335 654936 : case GDT_Float32:
4336 654936 : GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
4337 : nSrcPixelStride, false, pDstData,
4338 : eDstType, nDstPixelStride, nWordCount);
4339 654936 : break;
4340 20715800 : case GDT_Float64:
4341 20715800 : GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
4342 : nSrcPixelStride, false, pDstData,
4343 : eDstType, nDstPixelStride, nWordCount);
4344 20715800 : break;
4345 478486 : case GDT_CInt16:
4346 478486 : GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
4347 : nSrcPixelStride, true, pDstData, eDstType,
4348 : nDstPixelStride, nWordCount);
4349 478486 : break;
4350 868 : case GDT_CInt32:
4351 868 : GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
4352 : nSrcPixelStride, true, pDstData, eDstType,
4353 : nDstPixelStride, nWordCount);
4354 868 : break;
4355 508 : case GDT_CFloat16:
4356 508 : GDALCopyWordsFromT<GFloat16>(
4357 : static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, true,
4358 : pDstData, eDstType, nDstPixelStride, nWordCount);
4359 508 : break;
4360 2437 : case GDT_CFloat32:
4361 2437 : GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
4362 : nSrcPixelStride, true, pDstData, eDstType,
4363 : nDstPixelStride, nWordCount);
4364 2437 : break;
4365 278517 : case GDT_CFloat64:
4366 278517 : GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
4367 : nSrcPixelStride, true, pDstData,
4368 : eDstType, nDstPixelStride, nWordCount);
4369 278517 : break;
4370 0 : case GDT_Unknown:
4371 : case GDT_TypeCount:
4372 0 : CPLAssert(false);
4373 : }
4374 : }
4375 :
4376 : /************************************************************************/
4377 : /* GDALCopyBits() */
4378 : /************************************************************************/
4379 :
4380 : /**
4381 : * Bitwise word copying.
4382 : *
4383 : * A function for moving sets of partial bytes around. Loosely
4384 : * speaking this is a bitwise analog to GDALCopyWords().
4385 : *
4386 : * It copies nStepCount "words" where each word is nBitCount bits long.
4387 : * The nSrcStep and nDstStep are the number of bits from the start of one
4388 : * word to the next (same as nBitCount if they are packed). The nSrcOffset
4389 : * and nDstOffset are the offset into the source and destination buffers
4390 : * to start at, also measured in bits.
4391 : *
4392 : * All bit offsets are assumed to start from the high order bit in a byte
4393 : * (i.e. most significant bit first). Currently this function is not very
4394 : * optimized, but it may be improved for some common cases in the future
4395 : * as needed.
4396 : *
4397 : * @param pabySrcData the source data buffer.
4398 : * @param nSrcOffset the offset (in bits) in pabySrcData to the start of the
4399 : * first word to copy.
4400 : * @param nSrcStep the offset in bits from the start one source word to the
4401 : * start of the next.
4402 : * @param pabyDstData the destination data buffer.
4403 : * @param nDstOffset the offset (in bits) in pabyDstData to the start of the
4404 : * first word to copy over.
4405 : * @param nDstStep the offset in bits from the start one word to the
4406 : * start of the next.
4407 : * @param nBitCount the number of bits in a word to be copied.
4408 : * @param nStepCount the number of words to copy.
4409 : */
4410 :
4411 0 : void GDALCopyBits(const GByte *pabySrcData, int nSrcOffset, int nSrcStep,
4412 : GByte *pabyDstData, int nDstOffset, int nDstStep,
4413 : int nBitCount, int nStepCount)
4414 :
4415 : {
4416 0 : VALIDATE_POINTER0(pabySrcData, "GDALCopyBits");
4417 :
4418 0 : for (int iStep = 0; iStep < nStepCount; iStep++)
4419 : {
4420 0 : for (int iBit = 0; iBit < nBitCount; iBit++)
4421 : {
4422 0 : if (pabySrcData[nSrcOffset >> 3] & (0x80 >> (nSrcOffset & 7)))
4423 0 : pabyDstData[nDstOffset >> 3] |= (0x80 >> (nDstOffset & 7));
4424 : else
4425 0 : pabyDstData[nDstOffset >> 3] &= ~(0x80 >> (nDstOffset & 7));
4426 :
4427 0 : nSrcOffset++;
4428 0 : nDstOffset++;
4429 : }
4430 :
4431 0 : nSrcOffset += (nSrcStep - nBitCount);
4432 0 : nDstOffset += (nDstStep - nBitCount);
4433 : }
4434 : }
4435 :
4436 : /************************************************************************/
4437 : /* GDALGetBestOverviewLevel() */
4438 : /* */
4439 : /* Returns the best overview level to satisfy the query or -1 if none */
4440 : /* Also updates nXOff, nYOff, nXSize, nYSize and psExtraArg when */
4441 : /* returning a valid overview level */
4442 : /************************************************************************/
4443 :
4444 0 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand, int &nXOff, int &nYOff,
4445 : int &nXSize, int &nYSize, int nBufXSize,
4446 : int nBufYSize)
4447 : {
4448 0 : return GDALBandGetBestOverviewLevel2(poBand, nXOff, nYOff, nXSize, nYSize,
4449 0 : nBufXSize, nBufYSize, nullptr);
4450 : }
4451 :
4452 524017 : int GDALBandGetBestOverviewLevel2(GDALRasterBand *poBand, int &nXOff,
4453 : int &nYOff, int &nXSize, int &nYSize,
4454 : int nBufXSize, int nBufYSize,
4455 : GDALRasterIOExtraArg *psExtraArg)
4456 : {
4457 524017 : if (psExtraArg != nullptr && psExtraArg->nVersion > 1 &&
4458 524017 : psExtraArg->bUseOnlyThisScale)
4459 109 : return -1;
4460 : /* -------------------------------------------------------------------- */
4461 : /* Compute the desired downsampling factor. It is */
4462 : /* based on the least reduced axis, and represents the number */
4463 : /* of source pixels to one destination pixel. */
4464 : /* -------------------------------------------------------------------- */
4465 523908 : const double dfDesiredDownsamplingFactor =
4466 523908 : ((nXSize / static_cast<double>(nBufXSize)) <
4467 361568 : (nYSize / static_cast<double>(nBufYSize)) ||
4468 : nBufYSize == 1)
4469 752297 : ? nXSize / static_cast<double>(nBufXSize)
4470 133179 : : nYSize / static_cast<double>(nBufYSize);
4471 :
4472 : /* -------------------------------------------------------------------- */
4473 : /* Find the overview level that largest downsampling factor (most */
4474 : /* downsampled) that is still less than (or only a little more) */
4475 : /* downsampled than the request. */
4476 : /* -------------------------------------------------------------------- */
4477 523908 : const int nOverviewCount = poBand->GetOverviewCount();
4478 523908 : GDALRasterBand *poBestOverview = nullptr;
4479 523908 : double dfBestDownsamplingFactor = 0;
4480 523908 : int nBestOverviewLevel = -1;
4481 :
4482 : const char *pszOversampligThreshold =
4483 523908 : CPLGetConfigOption("GDAL_OVERVIEW_OVERSAMPLING_THRESHOLD", nullptr);
4484 :
4485 : // Note: keep this logic for overview selection in sync between
4486 : // gdalwarp_lib.cpp and rasterio.cpp
4487 : // Cf https://github.com/OSGeo/gdal/pull/9040#issuecomment-1898524693
4488 : const double dfOversamplingThreshold =
4489 1047810 : pszOversampligThreshold ? CPLAtof(pszOversampligThreshold)
4490 523899 : : psExtraArg && psExtraArg->eResampleAlg != GRIORA_NearestNeighbour
4491 1047800 : ? 1.0
4492 523908 : : 1.2;
4493 526604 : for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
4494 : {
4495 5616 : GDALRasterBand *poOverview = poBand->GetOverview(iOverview);
4496 11232 : if (poOverview == nullptr ||
4497 11231 : poOverview->GetXSize() > poBand->GetXSize() ||
4498 5615 : poOverview->GetYSize() > poBand->GetYSize())
4499 : {
4500 1 : continue;
4501 : }
4502 :
4503 : // Compute downsampling factor of this overview
4504 : const double dfDownsamplingFactor = std::min(
4505 5615 : poBand->GetXSize() / static_cast<double>(poOverview->GetXSize()),
4506 11230 : poBand->GetYSize() / static_cast<double>(poOverview->GetYSize()));
4507 :
4508 : // Is it nearly the requested factor and better (lower) than
4509 : // the current best factor?
4510 : // Use an epsilon because of numerical instability.
4511 5615 : constexpr double EPSILON = 1e-1;
4512 5723 : if (dfDownsamplingFactor >=
4513 5615 : dfDesiredDownsamplingFactor * dfOversamplingThreshold +
4514 5507 : EPSILON ||
4515 : dfDownsamplingFactor <= dfBestDownsamplingFactor)
4516 : {
4517 108 : continue;
4518 : }
4519 :
4520 : // Ignore AVERAGE_BIT2GRAYSCALE overviews for RasterIO purposes.
4521 5507 : const char *pszResampling = poOverview->GetMetadataItem("RESAMPLING");
4522 :
4523 5507 : if (pszResampling != nullptr &&
4524 71 : STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2"))
4525 16 : continue;
4526 :
4527 : // OK, this is our new best overview.
4528 5491 : poBestOverview = poOverview;
4529 5491 : nBestOverviewLevel = iOverview;
4530 5491 : dfBestDownsamplingFactor = dfDownsamplingFactor;
4531 :
4532 5491 : if (std::abs(dfDesiredDownsamplingFactor - dfDownsamplingFactor) <
4533 : EPSILON)
4534 : {
4535 2920 : break;
4536 : }
4537 : }
4538 :
4539 : /* -------------------------------------------------------------------- */
4540 : /* If we didn't find an overview that helps us, just return */
4541 : /* indicating failure and the full resolution image will be used. */
4542 : /* -------------------------------------------------------------------- */
4543 523908 : if (nBestOverviewLevel < 0)
4544 520915 : return -1;
4545 :
4546 : /* -------------------------------------------------------------------- */
4547 : /* Recompute the source window in terms of the selected */
4548 : /* overview. */
4549 : /* -------------------------------------------------------------------- */
4550 : const double dfXFactor =
4551 2993 : poBand->GetXSize() / static_cast<double>(poBestOverview->GetXSize());
4552 : const double dfYFactor =
4553 2993 : poBand->GetYSize() / static_cast<double>(poBestOverview->GetYSize());
4554 2993 : CPLDebug("GDAL", "Selecting overview %d x %d", poBestOverview->GetXSize(),
4555 : poBestOverview->GetYSize());
4556 :
4557 8979 : const int nOXOff = std::min(poBestOverview->GetXSize() - 1,
4558 2993 : static_cast<int>(nXOff / dfXFactor + 0.5));
4559 8979 : const int nOYOff = std::min(poBestOverview->GetYSize() - 1,
4560 2993 : static_cast<int>(nYOff / dfYFactor + 0.5));
4561 2993 : int nOXSize = std::max(1, static_cast<int>(nXSize / dfXFactor + 0.5));
4562 2993 : int nOYSize = std::max(1, static_cast<int>(nYSize / dfYFactor + 0.5));
4563 2993 : if (nOXOff + nOXSize > poBestOverview->GetXSize())
4564 0 : nOXSize = poBestOverview->GetXSize() - nOXOff;
4565 2993 : if (nOYOff + nOYSize > poBestOverview->GetYSize())
4566 2 : nOYSize = poBestOverview->GetYSize() - nOYOff;
4567 :
4568 2993 : if (psExtraArg)
4569 : {
4570 2993 : if (psExtraArg->bFloatingPointWindowValidity)
4571 : {
4572 117 : psExtraArg->dfXOff /= dfXFactor;
4573 117 : psExtraArg->dfXSize /= dfXFactor;
4574 117 : psExtraArg->dfYOff /= dfYFactor;
4575 117 : psExtraArg->dfYSize /= dfYFactor;
4576 : }
4577 2876 : else if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
4578 : {
4579 16 : psExtraArg->bFloatingPointWindowValidity = true;
4580 16 : psExtraArg->dfXOff = nXOff / dfXFactor;
4581 16 : psExtraArg->dfXSize = nXSize / dfXFactor;
4582 16 : psExtraArg->dfYOff = nYOff / dfYFactor;
4583 16 : psExtraArg->dfYSize = nYSize / dfYFactor;
4584 : }
4585 : }
4586 :
4587 2993 : nXOff = nOXOff;
4588 2993 : nYOff = nOYOff;
4589 2993 : nXSize = nOXSize;
4590 2993 : nYSize = nOYSize;
4591 :
4592 2993 : return nBestOverviewLevel;
4593 : }
4594 :
4595 : /************************************************************************/
4596 : /* OverviewRasterIO() */
4597 : /* */
4598 : /* Special work function to utilize available overviews to */
4599 : /* more efficiently satisfy downsampled requests. It will */
4600 : /* return CE_Failure if there are no appropriate overviews */
4601 : /* available but it doesn't emit any error messages. */
4602 : /************************************************************************/
4603 :
4604 : //! @cond Doxygen_Suppress
4605 2 : CPLErr GDALRasterBand::OverviewRasterIO(
4606 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4607 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4608 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
4609 :
4610 : {
4611 : GDALRasterIOExtraArg sExtraArg;
4612 2 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4613 :
4614 2 : const int nOverview = GDALBandGetBestOverviewLevel2(
4615 : this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, &sExtraArg);
4616 2 : if (nOverview < 0)
4617 1 : return CE_Failure;
4618 :
4619 : /* -------------------------------------------------------------------- */
4620 : /* Recast the call in terms of the new raster layer. */
4621 : /* -------------------------------------------------------------------- */
4622 1 : GDALRasterBand *poOverviewBand = GetOverview(nOverview);
4623 1 : if (poOverviewBand == nullptr)
4624 0 : return CE_Failure;
4625 :
4626 1 : return poOverviewBand->RasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
4627 : pData, nBufXSize, nBufYSize, eBufType,
4628 1 : nPixelSpace, nLineSpace, &sExtraArg);
4629 : }
4630 :
4631 : /************************************************************************/
4632 : /* TryOverviewRasterIO() */
4633 : /************************************************************************/
4634 :
4635 362428 : CPLErr GDALRasterBand::TryOverviewRasterIO(
4636 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4637 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4638 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg,
4639 : int *pbTried)
4640 : {
4641 362428 : int nXOffMod = nXOff;
4642 362428 : int nYOffMod = nYOff;
4643 362428 : int nXSizeMod = nXSize;
4644 362428 : int nYSizeMod = nYSize;
4645 : GDALRasterIOExtraArg sExtraArg;
4646 :
4647 362428 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4648 :
4649 362428 : int iOvrLevel = GDALBandGetBestOverviewLevel2(
4650 : this, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize, nBufYSize,
4651 : &sExtraArg);
4652 :
4653 362428 : if (iOvrLevel >= 0)
4654 : {
4655 53 : GDALRasterBand *poOverviewBand = GetOverview(iOvrLevel);
4656 53 : if (poOverviewBand)
4657 : {
4658 53 : *pbTried = TRUE;
4659 53 : return poOverviewBand->RasterIO(
4660 : eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData,
4661 : nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
4662 53 : &sExtraArg);
4663 : }
4664 : }
4665 :
4666 362375 : *pbTried = FALSE;
4667 362375 : return CE_None;
4668 : }
4669 :
4670 : /************************************************************************/
4671 : /* TryOverviewRasterIO() */
4672 : /************************************************************************/
4673 :
4674 158613 : CPLErr GDALDataset::TryOverviewRasterIO(
4675 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4676 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4677 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
4678 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg,
4679 : int *pbTried)
4680 : {
4681 158613 : int nXOffMod = nXOff;
4682 158613 : int nYOffMod = nYOff;
4683 158613 : int nXSizeMod = nXSize;
4684 158613 : int nYSizeMod = nYSize;
4685 : GDALRasterIOExtraArg sExtraArg;
4686 158613 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4687 :
4688 317226 : int iOvrLevel = GDALBandGetBestOverviewLevel2(
4689 158613 : papoBands[0], nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize,
4690 : nBufYSize, &sExtraArg);
4691 :
4692 158655 : if (iOvrLevel >= 0 && papoBands[0]->GetOverview(iOvrLevel) != nullptr &&
4693 42 : papoBands[0]->GetOverview(iOvrLevel)->GetDataset() != nullptr)
4694 : {
4695 42 : *pbTried = TRUE;
4696 42 : return papoBands[0]->GetOverview(iOvrLevel)->GetDataset()->RasterIO(
4697 : eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData, nBufXSize,
4698 : nBufYSize, eBufType, nBandCount, panBandMap, nPixelSpace,
4699 42 : nLineSpace, nBandSpace, &sExtraArg);
4700 : }
4701 : else
4702 : {
4703 158571 : *pbTried = FALSE;
4704 158571 : return CE_None;
4705 : }
4706 : }
4707 :
4708 : /************************************************************************/
4709 : /* GetBestOverviewLevel() */
4710 : /* */
4711 : /* Returns the best overview level to satisfy the query or -1 if none */
4712 : /* Also updates nXOff, nYOff, nXSize, nYSize when returning a valid */
4713 : /* overview level */
4714 : /************************************************************************/
4715 :
4716 4 : static int GDALDatasetGetBestOverviewLevel(GDALDataset *poDS, int &nXOff,
4717 : int &nYOff, int &nXSize, int &nYSize,
4718 : int nBufXSize, int nBufYSize,
4719 : int nBandCount,
4720 : const int *panBandMap,
4721 : GDALRasterIOExtraArg *psExtraArg)
4722 : {
4723 4 : int nOverviewCount = 0;
4724 4 : GDALRasterBand *poFirstBand = nullptr;
4725 :
4726 : /* -------------------------------------------------------------------- */
4727 : /* Check that all bands have the same number of overviews and */
4728 : /* that they have all the same size and block dimensions */
4729 : /* -------------------------------------------------------------------- */
4730 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
4731 : {
4732 8 : GDALRasterBand *poBand = poDS->GetRasterBand(panBandMap[iBand]);
4733 8 : if (poBand == nullptr)
4734 0 : return -1;
4735 8 : if (iBand == 0)
4736 : {
4737 4 : poFirstBand = poBand;
4738 4 : nOverviewCount = poBand->GetOverviewCount();
4739 : }
4740 4 : else if (nOverviewCount != poBand->GetOverviewCount())
4741 : {
4742 0 : CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
4743 : "mismatched overview count, use std method.");
4744 0 : return -1;
4745 : }
4746 : else
4747 : {
4748 4 : for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
4749 : {
4750 0 : GDALRasterBand *poOvrBand = poBand->GetOverview(iOverview);
4751 : GDALRasterBand *poOvrFirstBand =
4752 0 : poFirstBand->GetOverview(iOverview);
4753 0 : if (poOvrBand == nullptr || poOvrFirstBand == nullptr)
4754 0 : continue;
4755 :
4756 0 : if (poOvrFirstBand->GetXSize() != poOvrBand->GetXSize() ||
4757 0 : poOvrFirstBand->GetYSize() != poOvrBand->GetYSize())
4758 : {
4759 0 : CPLDebug("GDAL",
4760 : "GDALDataset::GetBestOverviewLevel() ... "
4761 : "mismatched overview sizes, use std method.");
4762 0 : return -1;
4763 : }
4764 0 : int nBlockXSizeFirst = 0;
4765 0 : int nBlockYSizeFirst = 0;
4766 0 : poOvrFirstBand->GetBlockSize(&nBlockXSizeFirst,
4767 : &nBlockYSizeFirst);
4768 :
4769 0 : int nBlockXSizeCurrent = 0;
4770 0 : int nBlockYSizeCurrent = 0;
4771 0 : poOvrBand->GetBlockSize(&nBlockXSizeCurrent,
4772 : &nBlockYSizeCurrent);
4773 :
4774 0 : if (nBlockXSizeFirst != nBlockXSizeCurrent ||
4775 0 : nBlockYSizeFirst != nBlockYSizeCurrent)
4776 : {
4777 0 : CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
4778 : "mismatched block sizes, use std method.");
4779 0 : return -1;
4780 : }
4781 : }
4782 : }
4783 : }
4784 4 : if (poFirstBand == nullptr)
4785 0 : return -1;
4786 :
4787 4 : return GDALBandGetBestOverviewLevel2(poFirstBand, nXOff, nYOff, nXSize,
4788 : nYSize, nBufXSize, nBufYSize,
4789 4 : psExtraArg);
4790 : }
4791 :
4792 : /************************************************************************/
4793 : /* BlockBasedRasterIO() */
4794 : /* */
4795 : /* This convenience function implements a dataset level */
4796 : /* RasterIO() interface based on calling down to fetch blocks, */
4797 : /* much like the GDALRasterBand::IRasterIO(), but it handles */
4798 : /* all bands at once, so that a format driver that handles a */
4799 : /* request for different bands of the same block efficiently */
4800 : /* (i.e. without re-reading interleaved data) will work efficiently. */
4801 : /* */
4802 : /* This method is intended to be called by an overridden */
4803 : /* IRasterIO() method in the driver specific GDALDataset */
4804 : /* derived class. */
4805 : /* */
4806 : /* Default internal implementation of RasterIO() ... utilizes */
4807 : /* the Block access methods to satisfy the request. This would */
4808 : /* normally only be overridden by formats with overviews. */
4809 : /* */
4810 : /* To keep things relatively simple, this method does not */
4811 : /* currently take advantage of some special cases addressed in */
4812 : /* GDALRasterBand::IRasterIO(), so it is likely best to only */
4813 : /* call it when you know it will help. That is in cases where */
4814 : /* data is at 1:1 to the buffer, and you know the driver is */
4815 : /* implementing interleaved IO efficiently on a block by block */
4816 : /* basis. Overviews will be used when possible. */
4817 : /************************************************************************/
4818 :
4819 64982 : CPLErr GDALDataset::BlockBasedRasterIO(
4820 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4821 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4822 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
4823 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
4824 :
4825 : {
4826 64982 : CPLAssert(nullptr != pData);
4827 :
4828 64982 : GByte **papabySrcBlock = nullptr;
4829 64982 : GDALRasterBlock *poBlock = nullptr;
4830 64982 : GDALRasterBlock **papoBlocks = nullptr;
4831 64982 : int nLBlockX = -1;
4832 64982 : int nLBlockY = -1;
4833 : int iBufYOff;
4834 : int iBufXOff;
4835 64982 : int nBlockXSize = 1;
4836 64982 : int nBlockYSize = 1;
4837 64982 : CPLErr eErr = CE_None;
4838 64982 : GDALDataType eDataType = GDT_UInt8;
4839 :
4840 64982 : const bool bUseIntegerRequestCoords =
4841 65020 : (!psExtraArg->bFloatingPointWindowValidity ||
4842 38 : (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
4843 36 : nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
4844 :
4845 : /* -------------------------------------------------------------------- */
4846 : /* Ensure that all bands share a common block size and data type. */
4847 : /* -------------------------------------------------------------------- */
4848 308187 : for (int iBand = 0; iBand < nBandCount; iBand++)
4849 : {
4850 243205 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
4851 :
4852 243205 : if (iBand == 0)
4853 : {
4854 64982 : poBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
4855 64982 : eDataType = poBand->GetRasterDataType();
4856 : }
4857 : else
4858 : {
4859 178223 : int nThisBlockXSize = 0;
4860 178223 : int nThisBlockYSize = 0;
4861 178223 : poBand->GetBlockSize(&nThisBlockXSize, &nThisBlockYSize);
4862 178223 : if (nThisBlockXSize != nBlockXSize ||
4863 178223 : nThisBlockYSize != nBlockYSize)
4864 : {
4865 0 : CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
4866 : "mismatched block sizes, use std method.");
4867 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
4868 : pData, nBufXSize, nBufYSize, eBufType,
4869 : nBandCount, panBandMap, nPixelSpace,
4870 0 : nLineSpace, nBandSpace, psExtraArg);
4871 : }
4872 :
4873 178223 : if (eDataType != poBand->GetRasterDataType() &&
4874 0 : (nXSize != nBufXSize || nYSize != nBufYSize))
4875 : {
4876 0 : CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
4877 : "mismatched band data types, use std method.");
4878 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
4879 : pData, nBufXSize, nBufYSize, eBufType,
4880 : nBandCount, panBandMap, nPixelSpace,
4881 0 : nLineSpace, nBandSpace, psExtraArg);
4882 : }
4883 : }
4884 : }
4885 :
4886 : /* ==================================================================== */
4887 : /* In this special case at full resolution we step through in */
4888 : /* blocks, turning the request over to the per-band */
4889 : /* IRasterIO(), but ensuring that all bands of one block are */
4890 : /* called before proceeding to the next. */
4891 : /* ==================================================================== */
4892 :
4893 64982 : if (nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
4894 : {
4895 : GDALRasterIOExtraArg sDummyExtraArg;
4896 64978 : INIT_RASTERIO_EXTRA_ARG(sDummyExtraArg);
4897 :
4898 64978 : int nChunkYSize = 0;
4899 64978 : int nChunkXSize = 0;
4900 :
4901 213434 : for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff += nChunkYSize)
4902 : {
4903 149472 : const int nChunkYOff = iBufYOff + nYOff;
4904 149472 : nChunkYSize = nBlockYSize - (nChunkYOff % nBlockYSize);
4905 149472 : if (nChunkYOff + nChunkYSize > nYOff + nYSize)
4906 59977 : nChunkYSize = (nYOff + nYSize) - nChunkYOff;
4907 :
4908 822752 : for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff += nChunkXSize)
4909 : {
4910 674295 : const int nChunkXOff = iBufXOff + nXOff;
4911 674295 : nChunkXSize = nBlockXSize - (nChunkXOff % nBlockXSize);
4912 674295 : if (nChunkXOff + nChunkXSize > nXOff + nXSize)
4913 70691 : nChunkXSize = (nXOff + nXSize) - nChunkXOff;
4914 :
4915 674295 : GByte *pabyChunkData =
4916 674295 : static_cast<GByte *>(pData) + iBufXOff * nPixelSpace +
4917 674295 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace;
4918 :
4919 3282490 : for (int iBand = 0; iBand < nBandCount; iBand++)
4920 : {
4921 2609210 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
4922 :
4923 5218420 : eErr = poBand->IRasterIO(
4924 : eRWFlag, nChunkXOff, nChunkYOff, nChunkXSize,
4925 : nChunkYSize,
4926 2609210 : pabyChunkData +
4927 2609210 : static_cast<GPtrDiff_t>(iBand) * nBandSpace,
4928 : nChunkXSize, nChunkYSize, eBufType, nPixelSpace,
4929 2609210 : nLineSpace, &sDummyExtraArg);
4930 2609210 : if (eErr != CE_None)
4931 1015 : return eErr;
4932 : }
4933 : }
4934 :
4935 167371 : if (psExtraArg->pfnProgress != nullptr &&
4936 18914 : !psExtraArg->pfnProgress(
4937 167371 : 1.0 * std::min(nBufYSize, iBufYOff + nChunkYSize) /
4938 : nBufYSize,
4939 : "", psExtraArg->pProgressData))
4940 : {
4941 1 : return CE_Failure;
4942 : }
4943 : }
4944 :
4945 63962 : return CE_None;
4946 : }
4947 :
4948 : /* Below code is not compatible with that case. It would need a complete */
4949 : /* separate code like done in GDALRasterBand::IRasterIO. */
4950 4 : if (eRWFlag == GF_Write && (nBufXSize < nXSize || nBufYSize < nYSize))
4951 : {
4952 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
4953 : nBufXSize, nBufYSize, eBufType, nBandCount,
4954 : panBandMap, nPixelSpace, nLineSpace,
4955 0 : nBandSpace, psExtraArg);
4956 : }
4957 :
4958 : /* We could have a smarter implementation, but that will do for now */
4959 4 : if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
4960 0 : (nBufXSize != nXSize || nBufYSize != nYSize))
4961 : {
4962 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
4963 : nBufXSize, nBufYSize, eBufType, nBandCount,
4964 : panBandMap, nPixelSpace, nLineSpace,
4965 0 : nBandSpace, psExtraArg);
4966 : }
4967 :
4968 : /* ==================================================================== */
4969 : /* Loop reading required source blocks to satisfy output */
4970 : /* request. This is the most general implementation. */
4971 : /* ==================================================================== */
4972 :
4973 4 : const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
4974 :
4975 : papabySrcBlock =
4976 4 : static_cast<GByte **>(CPLCalloc(sizeof(GByte *), nBandCount));
4977 : papoBlocks =
4978 4 : static_cast<GDALRasterBlock **>(CPLCalloc(sizeof(void *), nBandCount));
4979 :
4980 : /* -------------------------------------------------------------------- */
4981 : /* Select an overview level if appropriate. */
4982 : /* -------------------------------------------------------------------- */
4983 :
4984 : GDALRasterIOExtraArg sExtraArg;
4985 4 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4986 4 : const int nOverviewLevel = GDALDatasetGetBestOverviewLevel(
4987 : this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nBandCount,
4988 : panBandMap, &sExtraArg);
4989 4 : if (nOverviewLevel >= 0)
4990 : {
4991 2 : GetRasterBand(panBandMap[0])
4992 2 : ->GetOverview(nOverviewLevel)
4993 2 : ->GetBlockSize(&nBlockXSize, &nBlockYSize);
4994 : }
4995 :
4996 4 : double dfXOff = nXOff;
4997 4 : double dfYOff = nYOff;
4998 4 : double dfXSize = nXSize;
4999 4 : double dfYSize = nYSize;
5000 4 : if (sExtraArg.bFloatingPointWindowValidity)
5001 : {
5002 2 : dfXOff = sExtraArg.dfXOff;
5003 2 : dfYOff = sExtraArg.dfYOff;
5004 2 : dfXSize = sExtraArg.dfXSize;
5005 2 : dfYSize = sExtraArg.dfYSize;
5006 : }
5007 :
5008 : /* -------------------------------------------------------------------- */
5009 : /* Compute stepping increment. */
5010 : /* -------------------------------------------------------------------- */
5011 4 : const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
5012 4 : const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
5013 :
5014 4 : constexpr double EPS = 1e-10;
5015 : /* -------------------------------------------------------------------- */
5016 : /* Loop over buffer computing source locations. */
5017 : /* -------------------------------------------------------------------- */
5018 36 : for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
5019 : {
5020 : GPtrDiff_t iSrcOffset;
5021 :
5022 : // Add small epsilon to avoid some numeric precision issues.
5023 32 : const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
5024 32 : const int iSrcY = static_cast<int>(std::min(
5025 32 : std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
5026 :
5027 32 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
5028 : static_cast<GPtrDiff_t>(nLineSpace);
5029 :
5030 302 : for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff++)
5031 : {
5032 270 : const double dfSrcX = (iBufXOff + 0.5) * dfSrcXInc + dfXOff + EPS;
5033 270 : const int iSrcX = static_cast<int>(std::min(
5034 270 : std::max(0.0, dfSrcX), static_cast<double>(nRasterXSize - 1)));
5035 :
5036 : // FIXME: this code likely doesn't work if the dirty block gets
5037 : // flushed to disk before being completely written. In the meantime,
5038 : // bJustInitialize should probably be set to FALSE even if it is not
5039 : // ideal performance wise, and for lossy compression
5040 :
5041 : /* --------------------------------------------------------------------
5042 : */
5043 : /* Ensure we have the appropriate block loaded. */
5044 : /* --------------------------------------------------------------------
5045 : */
5046 270 : if (iSrcX < nLBlockX * nBlockXSize ||
5047 270 : iSrcX - nBlockXSize >= nLBlockX * nBlockXSize ||
5048 266 : iSrcY < nLBlockY * nBlockYSize ||
5049 266 : iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
5050 : {
5051 4 : nLBlockX = iSrcX / nBlockXSize;
5052 4 : nLBlockY = iSrcY / nBlockYSize;
5053 :
5054 4 : const bool bJustInitialize =
5055 0 : eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
5056 0 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
5057 4 : nXOff <= nLBlockX * nBlockXSize &&
5058 0 : nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
5059 : /*bool bMemZeroBuffer = FALSE;
5060 : if( eRWFlag == GF_Write && !bJustInitialize &&
5061 : nXOff <= nLBlockX * nBlockXSize &&
5062 : nYOff <= nLBlockY * nBlockYSize &&
5063 : (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
5064 : (nXOff + nXSize == GetRasterXSize() &&
5065 : (nLBlockX+1) * nBlockXSize > GetRasterXSize())) &&
5066 : (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
5067 : (nYOff + nYSize == GetRasterYSize() &&
5068 : (nLBlockY+1) * nBlockYSize > GetRasterYSize())) )
5069 : {
5070 : bJustInitialize = TRUE;
5071 : bMemZeroBuffer = TRUE;
5072 : }*/
5073 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
5074 : {
5075 8 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
5076 8 : if (nOverviewLevel >= 0)
5077 2 : poBand = poBand->GetOverview(nOverviewLevel);
5078 16 : poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY,
5079 8 : bJustInitialize);
5080 8 : if (poBlock == nullptr)
5081 : {
5082 0 : eErr = CE_Failure;
5083 0 : goto CleanupAndReturn;
5084 : }
5085 :
5086 8 : if (eRWFlag == GF_Write)
5087 0 : poBlock->MarkDirty();
5088 :
5089 8 : if (papoBlocks[iBand] != nullptr)
5090 0 : papoBlocks[iBand]->DropLock();
5091 :
5092 8 : papoBlocks[iBand] = poBlock;
5093 :
5094 8 : papabySrcBlock[iBand] =
5095 8 : static_cast<GByte *>(poBlock->GetDataRef());
5096 : /*if( bMemZeroBuffer )
5097 : {
5098 : memset(papabySrcBlock[iBand], 0,
5099 : static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
5100 : * nBlockYSize);
5101 : }*/
5102 : }
5103 : }
5104 :
5105 : /* --------------------------------------------------------------------
5106 : */
5107 : /* Copy over this pixel of data. */
5108 : /* --------------------------------------------------------------------
5109 : */
5110 270 : iSrcOffset = (static_cast<GPtrDiff_t>(iSrcX) -
5111 270 : static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
5112 270 : (static_cast<GPtrDiff_t>(iSrcY) -
5113 270 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
5114 270 : nBlockXSize) *
5115 270 : nBandDataSize;
5116 :
5117 980 : for (int iBand = 0; iBand < nBandCount; iBand++)
5118 : {
5119 710 : GByte *pabySrcBlock = papabySrcBlock[iBand];
5120 710 : GPtrDiff_t iBandBufOffset =
5121 710 : iBufOffset + static_cast<GPtrDiff_t>(iBand) *
5122 : static_cast<GPtrDiff_t>(nBandSpace);
5123 :
5124 710 : if (eDataType == eBufType)
5125 : {
5126 710 : if (eRWFlag == GF_Read)
5127 710 : memcpy(static_cast<GByte *>(pData) + iBandBufOffset,
5128 710 : pabySrcBlock + iSrcOffset, nBandDataSize);
5129 : else
5130 0 : memcpy(pabySrcBlock + iSrcOffset,
5131 : static_cast<const GByte *>(pData) +
5132 0 : iBandBufOffset,
5133 : nBandDataSize);
5134 : }
5135 : else
5136 : {
5137 : /* type to type conversion ... ouch, this is expensive way
5138 : of handling single words */
5139 :
5140 0 : if (eRWFlag == GF_Read)
5141 0 : GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
5142 : static_cast<GByte *>(pData) +
5143 0 : iBandBufOffset,
5144 : eBufType, 0, 1);
5145 : else
5146 0 : GDALCopyWords64(static_cast<const GByte *>(pData) +
5147 0 : iBandBufOffset,
5148 0 : eBufType, 0, pabySrcBlock + iSrcOffset,
5149 : eDataType, 0, 1);
5150 : }
5151 : }
5152 :
5153 270 : iBufOffset += static_cast<int>(nPixelSpace);
5154 : }
5155 : }
5156 :
5157 : /* -------------------------------------------------------------------- */
5158 : /* CleanupAndReturn. */
5159 : /* -------------------------------------------------------------------- */
5160 4 : CleanupAndReturn:
5161 4 : CPLFree(papabySrcBlock);
5162 4 : if (papoBlocks != nullptr)
5163 : {
5164 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
5165 : {
5166 8 : if (papoBlocks[iBand] != nullptr)
5167 8 : papoBlocks[iBand]->DropLock();
5168 : }
5169 4 : CPLFree(papoBlocks);
5170 : }
5171 :
5172 4 : return eErr;
5173 : }
5174 :
5175 : //! @endcond
5176 :
5177 : /************************************************************************/
5178 : /* GDALCopyWholeRasterGetSwathSize() */
5179 : /************************************************************************/
5180 : // Computes the swath size (in columns and lines) used by GDALDatasetCopyWholeRaster() and GDALRasterBandCopyWholeRaster(); the result is returned through pnSwathCols and pnSwathLines.
5181 3376 : static void GDALCopyWholeRasterGetSwathSize(GDALRasterBand *poSrcPrototypeBand,
5182 : GDALRasterBand *poDstPrototypeBand,
5183 : int nBandCount,
5184 : int bDstIsCompressed,
5185 : int bInterleave, int *pnSwathCols,
5186 : int *pnSwathLines)
5187 : {
5188 3376 : GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
5189 3376 : int nSrcBlockXSize = 0;
5190 3376 : int nSrcBlockYSize = 0;
5191 3376 : int nBlockXSize = 0;
5192 3376 : int nBlockYSize = 0;
5193 : // The raster dimensions are taken from the source prototype band.
5194 3376 : int nXSize = poSrcPrototypeBand->GetXSize();
5195 3376 : int nYSize = poSrcPrototypeBand->GetYSize();
5196 :
5197 3376 : poSrcPrototypeBand->GetBlockSize(&nSrcBlockXSize, &nSrcBlockYSize);
5198 3376 : poDstPrototypeBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
5199 :
5200 3376 : const int nMaxBlockXSize = std::max(nBlockXSize, nSrcBlockXSize);
5201 3376 : const int nMaxBlockYSize = std::max(nBlockYSize, nSrcBlockYSize);
5202 :
5203 3376 : int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5204 3376 : if (bInterleave)
5205 583 : nPixelSize *= nBandCount;
5206 :
5207 : // Aim for one full-width row of blocks, using the larger of the source and destination block heights. Do not settle for less.
5208 3376 : int nSwathCols = nXSize;
5209 3376 : int nSwathLines = nMaxBlockYSize;
5210 : // Look up the COMPRESSION item of the IMAGE_STRUCTURE domain, first on the source band, then on its dataset.
5211 : const char *pszSrcCompression =
5212 3376 : poSrcPrototypeBand->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
5213 3376 : if (pszSrcCompression == nullptr)
5214 : {
5215 3356 : auto poSrcDS = poSrcPrototypeBand->GetDataset();
5216 3356 : if (poSrcDS)
5217 : pszSrcCompression =
5218 3350 : poSrcDS->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
5219 : }
5220 :
5221 : /* -------------------------------------------------------------------- */
5222 : /* What will our swath size be? */
5223 : /* -------------------------------------------------------------------- */
5224 : // When writing interleaved data in a compressed format, we want to be sure
5225 : // that each block will only be written once, so the swath size must not be
5226 : // greater than the block cache.
5227 3376 : const char *pszSwathSize = CPLGetConfigOption("GDAL_SWATH_SIZE", nullptr);
5228 : int nTargetSwathSize;
5229 3376 : if (pszSwathSize != nullptr)
5230 0 : nTargetSwathSize = static_cast<int>(
5231 0 : std::min(GIntBig(INT_MAX), CPLAtoGIntBig(pszSwathSize)));
5232 : else
5233 : {
5234 : // As a default, take 1/4 of the cache size (capped to INT_MAX).
5235 3376 : nTargetSwathSize = static_cast<int>(
5236 3376 : std::min(GIntBig(INT_MAX), GDALGetCacheMax64() / 4));
5237 :
5238 : // But if the minimum ideal swath buffer size is less, then go for it to
5239 : // avoid unnecessarily abusing RAM usage,
5240 : // but try to use 10 MB at least.
5241 3376 : GIntBig nIdealSwathBufSize =
5242 3376 : static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize;
5243 3376 : int nMinTargetSwathSize = 10 * 1000 * 1000;
5244 : // Sources advertising GSBAP_LARGEST_CHUNK_POSSIBLE get the whole target size as minimum.
5245 3376 : if ((poSrcPrototypeBand->GetSuggestedBlockAccessPattern() &
5246 3376 : GSBAP_LARGEST_CHUNK_POSSIBLE) != 0)
5247 : {
5248 1 : nMinTargetSwathSize = nTargetSwathSize;
5249 : }
5250 :
5251 3376 : if (nIdealSwathBufSize < nTargetSwathSize &&
5252 3366 : nIdealSwathBufSize < nMinTargetSwathSize)
5253 : {
5254 3363 : nIdealSwathBufSize = nMinTargetSwathSize;
5255 : }
5256 :
5257 3376 : if (pszSrcCompression != nullptr &&
5258 181 : EQUAL(pszSrcCompression, "JPEG2000") &&
5259 0 : (!bDstIsCompressed || ((nSrcBlockXSize % nBlockXSize) == 0 &&
5260 0 : (nSrcBlockYSize % nBlockYSize) == 0)))
5261 : {
5262 2 : nIdealSwathBufSize =
5263 4 : std::max(nIdealSwathBufSize, static_cast<GIntBig>(nSwathCols) *
5264 2 : nSrcBlockYSize * nPixelSize);
5265 : }
5266 3376 : if (nTargetSwathSize > nIdealSwathBufSize)
5267 3363 : nTargetSwathSize = static_cast<int>(
5268 3363 : std::min(GIntBig(INT_MAX), nIdealSwathBufSize));
5269 : }
5270 : // Whatever was selected above, never target less than 1,000,000 bytes.
5271 3376 : if (nTargetSwathSize < 1000000)
5272 8 : nTargetSwathSize = 1000000;
5273 :
5274 : /* But let's check that the swath fits in the block cache, and warn if it does not */
5275 3597 : if (bDstIsCompressed && bInterleave &&
5276 221 : nTargetSwathSize > GDALGetCacheMax64())
5277 : {
5278 0 : CPLError(CE_Warning, CPLE_AppDefined,
5279 : "When translating into a compressed interleave format, "
5280 : "the block cache size (" CPL_FRMT_GIB ") "
5281 : "should be at least the size of the swath (%d) "
5282 : "(GDAL_SWATH_SIZE config. option)",
5283 : GDALGetCacheMax64(), nTargetSwathSize);
5284 : }
5285 : // IS_DIVIDER_OF(x, y): true when x divides y exactly. ROUND_TO(x, y): x truncated to a multiple of y.
5286 : #define IS_DIVIDER_OF(x, y) ((y) % (x) == 0)
5287 : #define ROUND_TO(x, y) (((x) / (y)) * (y))
5288 :
5289 : // If both input and output datasets are tiled, and the tile dimensions
5290 : // are "compatible", try to stick to a swath dimension that is a multiple
5291 : // of input and output block dimensions.
5292 3376 : if (nBlockXSize != nXSize && nSrcBlockXSize != nXSize &&
5293 47 : IS_DIVIDER_OF(nBlockXSize, nMaxBlockXSize) &&
5294 47 : IS_DIVIDER_OF(nSrcBlockXSize, nMaxBlockXSize) &&
5295 47 : IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
5296 47 : IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
5297 : {
5298 47 : if (static_cast<GIntBig>(nMaxBlockXSize) * nMaxBlockYSize *
5299 47 : nPixelSize <=
5300 47 : static_cast<GIntBig>(nTargetSwathSize))
5301 : {
5302 47 : nSwathCols = nTargetSwathSize / (nMaxBlockYSize * nPixelSize);
5303 47 : nSwathCols = ROUND_TO(nSwathCols, nMaxBlockXSize);
5304 47 : if (nSwathCols == 0)
5305 0 : nSwathCols = nMaxBlockXSize;
5306 47 : if (nSwathCols > nXSize)
5307 45 : nSwathCols = nXSize;
5308 47 : nSwathLines = nMaxBlockYSize;
5309 :
5310 47 : if (static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize >
5311 47 : static_cast<GIntBig>(nTargetSwathSize))
5312 : {
5313 0 : nSwathCols = nXSize;
5314 0 : nSwathLines = nBlockYSize;
5315 : }
5316 : }
5317 : }
5318 : // Note: despite its name, nMemoryPerCol is the size in bytes of one swath line (nSwathCols * nPixelSize).
5319 3376 : const GIntBig nMemoryPerCol = static_cast<GIntBig>(nSwathCols) * nPixelSize;
5320 3376 : const GIntBig nSwathBufSize = nMemoryPerCol * nSwathLines;
5321 3376 : if (nSwathBufSize > static_cast<GIntBig>(nTargetSwathSize))
5322 : {
5323 1 : nSwathLines = static_cast<int>(nTargetSwathSize / nMemoryPerCol);
5324 1 : if (nSwathLines == 0)
5325 1 : nSwathLines = 1;
5326 : // NOTE(review): the size reported below is nBlockYSize * nMemoryPerCol, whereas the test above used nSwathBufSize - confirm this is intended.
5327 1 : CPLDebug(
5328 : "GDAL",
5329 : "GDALCopyWholeRasterGetSwathSize(): adjusting to %d line swath "
5330 : "since requirement (" CPL_FRMT_GIB " bytes) exceed target swath "
5331 : "size (%d bytes) (GDAL_SWATH_SIZE config. option)",
5332 1 : nSwathLines, nBlockYSize * nMemoryPerCol, nTargetSwathSize);
5333 : }
5334 : // If we are processing single scans, try to handle several at once.
5335 : // If we are handling swaths already, only grow the swath if a row
5336 : // of blocks is substantially less than our target buffer size.
5337 3375 : else if (nSwathLines == 1 ||
5338 2824 : nMemoryPerCol * nSwathLines <
5339 2824 : static_cast<GIntBig>(nTargetSwathSize) / 10)
5340 : {
5341 3347 : nSwathLines = std::min(
5342 : nYSize,
5343 3347 : std::max(1, static_cast<int>(nTargetSwathSize / nMemoryPerCol)));
5344 :
5345 : /* If possible try to align to source and target block height */
5346 3347 : if ((nSwathLines % nMaxBlockYSize) != 0 &&
5347 273 : nSwathLines > nMaxBlockYSize &&
5348 273 : IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
5349 244 : IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
5350 217 : nSwathLines = ROUND_TO(nSwathLines, nMaxBlockYSize);
5351 : }
5352 : // Final alignment: on the source block height for JPEG2000 sources (under the condition below), otherwise on the destination block height when the destination is compressed.
5353 3376 : if (pszSrcCompression != nullptr && EQUAL(pszSrcCompression, "JPEG2000") &&
5354 0 : (!bDstIsCompressed || (IS_DIVIDER_OF(nBlockXSize, nSrcBlockXSize) &&
5355 0 : IS_DIVIDER_OF(nBlockYSize, nSrcBlockYSize))))
5356 : {
5357 : // Typical use case: converting from Pleiades that is 2048x2048 tiled.
5358 2 : if (nSwathLines < nSrcBlockYSize)
5359 : {
5360 0 : nSwathLines = nSrcBlockYSize;
5361 : // NOTE(review): the divisor below uses nSrcBlockXSize, while the equivalent computation in the bDstIsCompressed branch uses nSwathLines; both agree only for square source blocks - confirm.
5362 : // Number of pixels that can be read/written simultaneously.
5363 0 : nSwathCols = nTargetSwathSize / (nSrcBlockXSize * nPixelSize);
5364 0 : nSwathCols = ROUND_TO(nSwathCols, nSrcBlockXSize);
5365 0 : if (nSwathCols == 0)
5366 0 : nSwathCols = nSrcBlockXSize;
5367 0 : if (nSwathCols > nXSize)
5368 0 : nSwathCols = nXSize;
5369 :
5370 0 : CPLDebug(
5371 : "GDAL",
5372 : "GDALCopyWholeRasterGetSwathSize(): because of compression and "
5373 : "too high block, "
5374 : "use partial width at one time");
5375 : }
5376 2 : else if ((nSwathLines % nSrcBlockYSize) != 0)
5377 : {
5378 : /* Round on a multiple of nSrcBlockYSize */
5379 0 : nSwathLines = ROUND_TO(nSwathLines, nSrcBlockYSize);
5380 0 : CPLDebug(
5381 : "GDAL",
5382 : "GDALCopyWholeRasterGetSwathSize(): because of compression, "
5383 : "round nSwathLines to block height : %d",
5384 : nSwathLines);
5385 : }
5386 : }
5387 3374 : else if (bDstIsCompressed)
5388 : {
5389 419 : if (nSwathLines < nBlockYSize)
5390 : {
5391 146 : nSwathLines = nBlockYSize;
5392 :
5393 : // Number of pixels that can be read/written simultaneously.
5394 146 : nSwathCols = nTargetSwathSize / (nSwathLines * nPixelSize);
5395 146 : nSwathCols = ROUND_TO(nSwathCols, nBlockXSize);
5396 146 : if (nSwathCols == 0)
5397 0 : nSwathCols = nBlockXSize;
5398 146 : if (nSwathCols > nXSize)
5399 146 : nSwathCols = nXSize;
5400 :
5401 146 : CPLDebug(
5402 : "GDAL",
5403 : "GDALCopyWholeRasterGetSwathSize(): because of compression and "
5404 : "too high block, "
5405 : "use partial width at one time");
5406 : }
5407 273 : else if ((nSwathLines % nBlockYSize) != 0)
5408 : {
5409 : // Round on a multiple of nBlockYSize.
5410 9 : nSwathLines = ROUND_TO(nSwathLines, nBlockYSize);
5411 9 : CPLDebug(
5412 : "GDAL",
5413 : "GDALCopyWholeRasterGetSwathSize(): because of compression, "
5414 : "round nSwathLines to block height : %d",
5415 : nSwathLines);
5416 : }
5417 : }
5418 : // Hand the selected swath dimensions back to the caller.
5419 3376 : *pnSwathCols = nSwathCols;
5420 3376 : *pnSwathLines = nSwathLines;
5421 3376 : }
5422 :
5423 : /************************************************************************/
5424 : /* GDALDatasetCopyWholeRaster() */
5425 : /************************************************************************/
5426 :
5427 : /**
5428 : * \brief Copy all dataset raster data.
5429 : *
5430 : * This function copies the complete raster contents of one dataset to
5431 : * another similarly configured dataset. The source and destination
5432 : * dataset must have the same number of bands, and the same width
5433 : * and height. The bands do not have to have the same data type.
5434 : *
5435 : * This function is primarily intended to support implementation of
5436 : * driver specific CreateCopy() functions. It implements efficient copying,
5437 : * in particular "chunking" the copy in substantial blocks and, if appropriate,
5438 : * performing the transfer in a pixel interleaved fashion.
5439 : *
5440 : * Currently the only papszOptions values supported are:
5441 : * <ul>
5442 : * <li>"INTERLEAVE=PIXEL/BAND" to force pixel (resp. band) interleaved read and
5443 : * write access pattern (this does not modify the layout of the destination
5444 : * data)</li>
5445 : * <li>"COMPRESSED=YES" to force alignment on target dataset block
5446 : * sizes to achieve best compression.</li>
5447 : * <li>"SKIP_HOLES=YES" to skip chunks
5448 : * for which GDALGetDataCoverageStatus() returns GDAL_DATA_COVERAGE_STATUS_EMPTY
5449 : * (GDAL >= 2.2)</li>
5450 : * </ul>
5451 : * More options may be supported in the future.
5452 : *
5453 : * @param hSrcDS the source dataset
5454 : * @param hDstDS the destination dataset
5455 : * @param papszOptions transfer hints in "StringList" Name=Value format.
5456 : * @param pfnProgress progress reporting function.
5457 : * @param pProgressData callback data for progress function.
5458 : *
5459 : * @return CE_None on success, or CE_Failure on failure.
5460 : */
5461 :
5462 3348 : CPLErr CPL_STDCALL GDALDatasetCopyWholeRaster(GDALDatasetH hSrcDS,
5463 : GDALDatasetH hDstDS,
5464 : CSLConstList papszOptions,
5465 : GDALProgressFunc pfnProgress,
5466 : void *pProgressData)
5467 :
5468 : {
5469 3348 : VALIDATE_POINTER1(hSrcDS, "GDALDatasetCopyWholeRaster", CE_Failure);
5470 3348 : VALIDATE_POINTER1(hDstDS, "GDALDatasetCopyWholeRaster", CE_Failure);
5471 :
5472 3348 : GDALDataset *poSrcDS = GDALDataset::FromHandle(hSrcDS);
5473 3348 : GDALDataset *poDstDS = GDALDataset::FromHandle(hDstDS);
5474 : // Substitute GDALDummyProgress for a null callback so that pfnProgress can be invoked unconditionally below.
5475 3348 : if (pfnProgress == nullptr)
5476 0 : pfnProgress = GDALDummyProgress;
5477 :
5478 : /* -------------------------------------------------------------------- */
5479 : /* Confirm the datasets match in size and band counts. */
5480 : /* -------------------------------------------------------------------- */
5481 3348 : const int nXSize = poDstDS->GetRasterXSize();
5482 3348 : const int nYSize = poDstDS->GetRasterYSize();
5483 3348 : const int nBandCount = poDstDS->GetRasterCount();
5484 : // The reference values above come from the destination; the source must match all three.
5485 3348 : if (poSrcDS->GetRasterXSize() != nXSize ||
5486 6696 : poSrcDS->GetRasterYSize() != nYSize ||
5487 3348 : poSrcDS->GetRasterCount() != nBandCount)
5488 : {
5489 0 : CPLError(CE_Failure, CPLE_AppDefined,
5490 : "Input and output dataset sizes or band counts do not\n"
5491 : "match in GDALDatasetCopyWholeRaster()");
5492 0 : return CE_Failure;
5493 : }
5494 :
5495 : /* -------------------------------------------------------------------- */
5496 : /* Report preliminary (0) progress. */
5497 : /* -------------------------------------------------------------------- */
5498 3348 : if (!pfnProgress(0.0, nullptr, pProgressData))
5499 : {
5500 1 : CPLError(CE_Failure, CPLE_UserInterrupt,
5501 : "User terminated CreateCopy()");
5502 1 : return CE_Failure;
5503 : }
5504 :
5505 : /* -------------------------------------------------------------------- */
5506 : /* Get our prototype band, and assume the others are similarly */
5507 : /* configured. */
5508 : /* -------------------------------------------------------------------- */
5509 3347 : if (nBandCount == 0)
5510 0 : return CE_None;
5511 : // Band 1 of each dataset is the prototype; the working data type (eDT) is the destination's.
5512 3347 : GDALRasterBand *poSrcPrototypeBand = poSrcDS->GetRasterBand(1);
5513 3347 : GDALRasterBand *poDstPrototypeBand = poDstDS->GetRasterBand(1);
5514 3347 : GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
5515 :
5516 : /* -------------------------------------------------------------------- */
5517 : /* Do we want to try and do the operation in a pixel */
5518 : /* interleaved fashion? */
5519 : /* -------------------------------------------------------------------- */
5520 3347 : bool bInterleave = false;
5521 : const char *pszInterleave =
5522 3347 : poSrcDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
5523 3347 : if (pszInterleave != nullptr &&
5524 2943 : (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
5525 209 : bInterleave = true;
5526 : // Same test on the destination dataset.
5527 3347 : pszInterleave = poDstDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
5528 3347 : if (pszInterleave != nullptr &&
5529 2882 : (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
5530 528 : bInterleave = true;
5531 : // A recognized INTERLEAVE option overrides what was deduced from the metadata above.
5532 3347 : pszInterleave = CSLFetchNameValue(papszOptions, "INTERLEAVE");
5533 3347 : if (pszInterleave != nullptr && EQUAL(pszInterleave, "PIXEL"))
5534 5 : bInterleave = true;
5535 3342 : else if (pszInterleave != nullptr && EQUAL(pszInterleave, "BAND"))
5536 13 : bInterleave = false;
5537 : // ATTRIBUTES is specific to the TileDB driver
5538 3329 : else if (pszInterleave != nullptr && EQUAL(pszInterleave, "ATTRIBUTES"))
5539 4 : bInterleave = true;
5540 3325 : else if (pszInterleave != nullptr)
5541 : {
5542 0 : CPLError(CE_Warning, CPLE_NotSupported,
5543 : "Unsupported value for option INTERLEAVE");
5544 : }
5545 :
5546 : // If the destination is compressed, we must try to write blocks just once,
5547 : // to save disk space (GTiff case for example), and to avoid data loss
5548 : // (JPEG compression for example).
5549 3347 : bool bDstIsCompressed = false;
5550 : const char *pszDstCompressed =
5551 3347 : CSLFetchNameValue(papszOptions, "COMPRESSED");
5552 3347 : if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
5553 393 : bDstIsCompressed = true;
5554 :
5555 : /* -------------------------------------------------------------------- */
5556 : /* What will our swath size be? */
5557 : /* -------------------------------------------------------------------- */
5558 :
5559 3347 : int nSwathCols = 0;
5560 3347 : int nSwathLines = 0;
5561 3347 : GDALCopyWholeRasterGetSwathSize(poSrcPrototypeBand, poDstPrototypeBand,
5562 : nBandCount, bDstIsCompressed, bInterleave,
5563 : &nSwathCols, &nSwathLines);
5564 : // Bytes per pixel in the swath buffer: all bands are held together when interleaving.
5565 3347 : int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5566 3347 : if (bInterleave)
5567 583 : nPixelSize *= nBandCount;
5568 :
5569 3347 : void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
5570 3347 : if (pSwathBuf == nullptr)
5571 : {
5572 0 : return CE_Failure;
5573 : }
5574 :
5575 3347 : CPLDebug("GDAL",
5576 : "GDALDatasetCopyWholeRaster(): %d*%d swaths, bInterleave=%d",
5577 : nSwathCols, nSwathLines, static_cast<int>(bInterleave));
5578 :
5579 : // Advise the source raster that we are going to read it completely
5580 : // Note: this might already have been done by GDALCreateCopy() in the
5581 : // likely case this function is indirectly called by it
5582 3347 : poSrcDS->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nBandCount,
5583 3347 : nullptr, nullptr);
5584 :
5585 : /* ==================================================================== */
5586 : /* Band oriented (uninterleaved) case. */
5587 : /* ==================================================================== */
5588 3347 : CPLErr eErr = CE_None;
5589 : const bool bCheckHoles =
5590 3347 : CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
5591 : // bCheckHoles reflects the SKIP_HOLES option, which defaults to NO.
5592 3347 : if (!bInterleave)
5593 : {
5594 : GDALRasterIOExtraArg sExtraArg;
5595 2764 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5596 2764 : CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress); // to make cppcheck happy
5597 : // One progress unit per (band, swath) chunk.
5598 8292 : const GIntBig nTotalBlocks = static_cast<GIntBig>(nBandCount) *
5599 2764 : DIV_ROUND_UP(nYSize, nSwathLines) *
5600 2764 : DIV_ROUND_UP(nXSize, nSwathCols);
5601 2764 : GIntBig nBlocksDone = 0;
5602 :
5603 7971 : for (int iBand = 0; iBand < nBandCount && eErr == CE_None; iBand++)
5604 : {
5605 5207 : int nBand = iBand + 1;
5606 :
5607 10677 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5608 : {
5609 5470 : int nThisLines = nSwathLines;
5610 : // Clamp the last swath to the raster height.
5611 5470 : if (iY + nThisLines > nYSize)
5612 368 : nThisLines = nYSize - iY;
5613 :
5614 10940 : for (int iX = 0; iX < nXSize && eErr == CE_None;
5615 5470 : iX += nSwathCols)
5616 : {
5617 5470 : int nThisCols = nSwathCols;
5618 : // Clamp the last swath to the raster width.
5619 5470 : if (iX + nThisCols > nXSize)
5620 0 : nThisCols = nXSize - iX;
5621 : // Without SKIP_HOLES, every chunk is treated as containing data.
5622 5470 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5623 5470 : if (bCheckHoles)
5624 : {
5625 : nStatus = poSrcDS->GetRasterBand(nBand)
5626 3758 : ->GetDataCoverageStatus(
5627 : iX, iY, nThisCols, nThisLines,
5628 : GDAL_DATA_COVERAGE_STATUS_DATA);
5629 : }
5630 5470 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5631 : {
5632 5466 : sExtraArg.pfnProgress = GDALScaledProgress;
5633 10932 : sExtraArg.pProgressData = GDALCreateScaledProgress(
5634 5466 : nBlocksDone / static_cast<double>(nTotalBlocks),
5635 5466 : (nBlocksDone + 0.5) /
5636 5466 : static_cast<double>(nTotalBlocks),
5637 : pfnProgress, pProgressData);
5638 5466 : if (sExtraArg.pProgressData == nullptr)
5639 1682 : sExtraArg.pfnProgress = nullptr;
5640 : // The read is given the first half of this chunk's progress range.
5641 5466 : eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
5642 : nThisLines, pSwathBuf,
5643 : nThisCols, nThisLines, eDT, 1,
5644 : &nBand, 0, 0, 0, &sExtraArg);
5645 :
5646 5466 : GDALDestroyScaledProgress(sExtraArg.pProgressData);
5647 : // Write the chunk only if the read succeeded; no extra argument is passed for the write.
5648 5466 : if (eErr == CE_None)
5649 5459 : eErr = poDstDS->RasterIO(
5650 : GF_Write, iX, iY, nThisCols, nThisLines,
5651 : pSwathBuf, nThisCols, nThisLines, eDT, 1,
5652 : &nBand, 0, 0, 0, nullptr);
5653 : }
5654 : // Count the chunk as done (even when it was skipped) and let the progress callback cancel the copy.
5655 5470 : nBlocksDone++;
5656 10898 : if (eErr == CE_None &&
5657 5428 : !pfnProgress(nBlocksDone /
5658 5428 : static_cast<double>(nTotalBlocks),
5659 : nullptr, pProgressData))
5660 : {
5661 2 : eErr = CE_Failure;
5662 2 : CPLError(CE_Failure, CPLE_UserInterrupt,
5663 : "User terminated CreateCopy()");
5664 : }
5665 : }
5666 : }
5667 : }
5668 : }
5669 :
5670 : /* ==================================================================== */
5671 : /* Pixel interleaved case. */
5672 : /* ==================================================================== */
5673 : else /* if( bInterleave ) */
5674 : {
5675 : GDALRasterIOExtraArg sExtraArg;
5676 583 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5677 583 : CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress); // to make cppcheck happy
5678 : // One progress unit per swath chunk: all bands are transferred together.
5679 583 : const GIntBig nTotalBlocks =
5680 583 : static_cast<GIntBig>(DIV_ROUND_UP(nYSize, nSwathLines)) *
5681 583 : DIV_ROUND_UP(nXSize, nSwathCols);
5682 583 : GIntBig nBlocksDone = 0;
5683 :
5684 1388 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5685 : {
5686 805 : int nThisLines = nSwathLines;
5687 : // Clamp the last swath to the raster height.
5688 805 : if (iY + nThisLines > nYSize)
5689 198 : nThisLines = nYSize - iY;
5690 :
5691 1615 : for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
5692 : {
5693 810 : int nThisCols = nSwathCols;
5694 : // Clamp the last swath to the raster width.
5695 810 : if (iX + nThisCols > nXSize)
5696 3 : nThisCols = nXSize - iX;
5697 : // With SKIP_HOLES, bands are queried in turn until one reports data in the chunk.
5698 810 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5699 810 : if (bCheckHoles)
5700 : {
5701 551 : nStatus = 0;
5702 604 : for (int iBand = 0; iBand < nBandCount; iBand++)
5703 : {
5704 585 : nStatus |= poSrcDS->GetRasterBand(iBand + 1)
5705 585 : ->GetDataCoverageStatus(
5706 : iX, iY, nThisCols, nThisLines,
5707 : GDAL_DATA_COVERAGE_STATUS_DATA);
5708 585 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5709 532 : break;
5710 : }
5711 : }
5712 810 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5713 : {
5714 791 : sExtraArg.pfnProgress = GDALScaledProgress;
5715 1582 : sExtraArg.pProgressData = GDALCreateScaledProgress(
5716 791 : nBlocksDone / static_cast<double>(nTotalBlocks),
5717 791 : (nBlocksDone + 0.5) / static_cast<double>(nTotalBlocks),
5718 : pfnProgress, pProgressData);
5719 791 : if (sExtraArg.pProgressData == nullptr)
5720 375 : sExtraArg.pfnProgress = nullptr;
5721 : // The read is given the first half of this chunk's progress range.
5722 791 : eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
5723 : nThisLines, pSwathBuf, nThisCols,
5724 : nThisLines, eDT, nBandCount,
5725 : nullptr, 0, 0, 0, &sExtraArg);
5726 :
5727 791 : GDALDestroyScaledProgress(sExtraArg.pProgressData);
5728 : // Write the chunk only if the read succeeded; no extra argument is passed for the write.
5729 791 : if (eErr == CE_None)
5730 790 : eErr = poDstDS->RasterIO(
5731 : GF_Write, iX, iY, nThisCols, nThisLines, pSwathBuf,
5732 : nThisCols, nThisLines, eDT, nBandCount, nullptr, 0,
5733 : 0, 0, nullptr);
5734 : }
5735 : // Count the chunk as done (even when it was skipped) and let the progress callback cancel the copy.
5736 810 : nBlocksDone++;
5737 1615 : if (eErr == CE_None &&
5738 805 : !pfnProgress(nBlocksDone /
5739 805 : static_cast<double>(nTotalBlocks),
5740 : nullptr, pProgressData))
5741 : {
5742 1 : eErr = CE_Failure;
5743 1 : CPLError(CE_Failure, CPLE_UserInterrupt,
5744 : "User terminated CreateCopy()");
5745 : }
5746 : }
5747 : }
5748 : }
5749 :
5750 : /* -------------------------------------------------------------------- */
5751 : /* Cleanup */
5752 : /* -------------------------------------------------------------------- */
5753 3347 : CPLFree(pSwathBuf);
5754 :
5755 3347 : return eErr;
5756 : }
5757 :
5758 : /************************************************************************/
5759 : /* GDALRasterBandCopyWholeRaster() */
5760 : /************************************************************************/
5761 :
5762 : /**
5763 : * \brief Copy a whole raster band
5764 : *
5765 : * This function copies the complete raster contents of one band to
5766 : * another similarly configured band. The source and destination
5767 : * bands must have the same width and height. The bands do not have
5768 : * to have the same data type.
5769 : *
5770 : * It implements efficient copying, in particular "chunking" the copy in
5771 : * substantial blocks.
5772 : *
5773 : * Currently the only papszOptions values supported are:
5774 : * <ul>
5775 : * <li>"COMPRESSED=YES" to force alignment on target dataset block sizes to
5776 : * achieve best compression.</li>
5777 : * <li>"SKIP_HOLES=YES" to skip chunks for which GDALGetDataCoverageStatus()
5778 : * returns GDAL_DATA_COVERAGE_STATUS_EMPTY (GDAL >= 2.2)</li>
5779 : * </ul>
5780 : *
5781 : * @param hSrcBand the source band
5782 : * @param hDstBand the destination band
5783 : * @param papszOptions transfer hints in "StringList" Name=Value format.
5784 : * @param pfnProgress progress reporting function.
5785 : * @param pProgressData callback data for progress function.
5786 : *
5787 : * @return CE_None on success, or CE_Failure on failure.
5788 : */
5789 :
5790 29 : CPLErr CPL_STDCALL GDALRasterBandCopyWholeRaster(
5791 : GDALRasterBandH hSrcBand, GDALRasterBandH hDstBand,
5792 : const char *const *const papszOptions, GDALProgressFunc pfnProgress,
5793 : void *pProgressData)
5794 :
5795 : {
5796 29 : VALIDATE_POINTER1(hSrcBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
5797 29 : VALIDATE_POINTER1(hDstBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
5798 :
5799 29 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
5800 29 : GDALRasterBand *poDstBand = GDALRasterBand::FromHandle(hDstBand);
5801 29 : CPLErr eErr = CE_None;
5802 :
5803 29 : if (pfnProgress == nullptr)
5804 2 : pfnProgress = GDALDummyProgress;
5805 :
5806 : /* -------------------------------------------------------------------- */
5807 : /* Confirm the datasets match in size and band counts. */
5808 : /* -------------------------------------------------------------------- */
5809 29 : int nXSize = poSrcBand->GetXSize();
5810 29 : int nYSize = poSrcBand->GetYSize();
5811 :
5812 29 : if (poDstBand->GetXSize() != nXSize || poDstBand->GetYSize() != nYSize)
5813 : {
5814 0 : CPLError(CE_Failure, CPLE_AppDefined,
5815 : "Input and output band sizes do not\n"
5816 : "match in GDALRasterBandCopyWholeRaster()");
5817 0 : return CE_Failure;
5818 : }
5819 :
5820 : /* -------------------------------------------------------------------- */
5821 : /* Report preliminary (0) progress. */
5822 : /* -------------------------------------------------------------------- */
5823 29 : if (!pfnProgress(0.0, nullptr, pProgressData))
5824 : {
5825 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
5826 : "User terminated CreateCopy()");
5827 0 : return CE_Failure;
5828 : }
5829 :
5830 29 : GDALDataType eDT = poDstBand->GetRasterDataType();
5831 :
5832 : // If the destination is compressed, we must try to write blocks just once,
5833 : // to save disk space (GTiff case for example), and to avoid data loss
5834 : // (JPEG compression for example).
5835 29 : bool bDstIsCompressed = false;
5836 : const char *pszDstCompressed =
5837 29 : CSLFetchNameValue(const_cast<char **>(papszOptions), "COMPRESSED");
5838 29 : if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
5839 26 : bDstIsCompressed = true;
5840 :
5841 : /* -------------------------------------------------------------------- */
5842 : /* What will our swath size be? */
5843 : /* -------------------------------------------------------------------- */
5844 :
5845 29 : int nSwathCols = 0;
5846 29 : int nSwathLines = 0;
5847 29 : GDALCopyWholeRasterGetSwathSize(poSrcBand, poDstBand, 1, bDstIsCompressed,
5848 : FALSE, &nSwathCols, &nSwathLines);
5849 :
5850 29 : const int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5851 :
5852 29 : void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
5853 29 : if (pSwathBuf == nullptr)
5854 : {
5855 0 : return CE_Failure;
5856 : }
5857 :
5858 29 : CPLDebug("GDAL", "GDALRasterBandCopyWholeRaster(): %d*%d swaths",
5859 : nSwathCols, nSwathLines);
5860 :
5861 : const bool bCheckHoles =
5862 29 : CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
5863 :
5864 : // Advise the source raster that we are going to read it completely
5865 29 : poSrcBand->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nullptr);
5866 :
5867 : /* ==================================================================== */
5868 : /* Band oriented (uninterleaved) case. */
5869 : /* ==================================================================== */
5870 :
5871 72 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5872 : {
5873 43 : int nThisLines = nSwathLines;
5874 :
5875 43 : if (iY + nThisLines > nYSize)
5876 8 : nThisLines = nYSize - iY;
5877 :
5878 86 : for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
5879 : {
5880 43 : int nThisCols = nSwathCols;
5881 :
5882 43 : if (iX + nThisCols > nXSize)
5883 0 : nThisCols = nXSize - iX;
5884 :
5885 43 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5886 43 : if (bCheckHoles)
5887 : {
5888 0 : nStatus = poSrcBand->GetDataCoverageStatus(
5889 : iX, iY, nThisCols, nThisLines,
5890 : GDAL_DATA_COVERAGE_STATUS_DATA);
5891 : }
5892 43 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5893 : {
5894 43 : eErr = poSrcBand->RasterIO(GF_Read, iX, iY, nThisCols,
5895 : nThisLines, pSwathBuf, nThisCols,
5896 : nThisLines, eDT, 0, 0, nullptr);
5897 :
5898 43 : if (eErr == CE_None)
5899 43 : eErr = poDstBand->RasterIO(GF_Write, iX, iY, nThisCols,
5900 : nThisLines, pSwathBuf, nThisCols,
5901 : nThisLines, eDT, 0, 0, nullptr);
5902 : }
5903 :
5904 86 : if (eErr == CE_None && !pfnProgress(double(iY + nThisLines) /
5905 43 : static_cast<double>(nYSize),
5906 : nullptr, pProgressData))
5907 : {
5908 0 : eErr = CE_Failure;
5909 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
5910 : "User terminated CreateCopy()");
5911 : }
5912 : }
5913 : }
5914 :
5915 : /* -------------------------------------------------------------------- */
5916 : /* Cleanup */
5917 : /* -------------------------------------------------------------------- */
5918 29 : CPLFree(pSwathBuf);
5919 :
5920 29 : return eErr;
5921 : }
5922 :
5923 : /************************************************************************/
5924 : /* GDALCopyRasterIOExtraArg () */
5925 : /************************************************************************/
5926 :
5927 533484 : void GDALCopyRasterIOExtraArg(GDALRasterIOExtraArg *psDestArg,
5928 : const GDALRasterIOExtraArg *psSrcArg)
5929 : {
5930 533484 : INIT_RASTERIO_EXTRA_ARG(*psDestArg);
5931 533484 : if (psSrcArg)
5932 : {
5933 533484 : psDestArg->eResampleAlg = psSrcArg->eResampleAlg;
5934 533484 : psDestArg->pfnProgress = psSrcArg->pfnProgress;
5935 533484 : psDestArg->pProgressData = psSrcArg->pProgressData;
5936 533484 : psDestArg->bFloatingPointWindowValidity =
5937 533484 : psSrcArg->bFloatingPointWindowValidity;
5938 533484 : if (psSrcArg->bFloatingPointWindowValidity)
5939 : {
5940 210512 : psDestArg->dfXOff = psSrcArg->dfXOff;
5941 210512 : psDestArg->dfYOff = psSrcArg->dfYOff;
5942 210512 : psDestArg->dfXSize = psSrcArg->dfXSize;
5943 210512 : psDestArg->dfYSize = psSrcArg->dfYSize;
5944 : }
5945 533484 : if (psSrcArg->nVersion >= 2)
5946 : {
5947 533484 : psDestArg->bUseOnlyThisScale = psSrcArg->bUseOnlyThisScale;
5948 : }
5949 533484 : if (psSrcArg->nVersion >= 3)
5950 : {
5951 533484 : psDestArg->bOperateInBufType = psSrcArg->bOperateInBufType;
5952 : }
5953 : }
5954 533484 : }
5955 :
5956 : /************************************************************************/
5957 : /* HasOnlyNoData() */
5958 : /************************************************************************/
5959 :
// Generic sample/nodata comparison: plain equality.
template <class T> static inline bool IsEqualToNoData(T value, T noDataValue)
{
    const bool bIsNoData = (value == noDataValue);
    return bIsNoData;
}
5964 :
5965 5509 : template <> bool IsEqualToNoData<GFloat16>(GFloat16 value, GFloat16 noDataValue)
5966 : {
5967 : using std::isnan;
5968 5509 : return isnan(noDataValue) ? isnan(value) : value == noDataValue;
5969 : }
5970 :
5971 251221 : template <> bool IsEqualToNoData<float>(float value, float noDataValue)
5972 : {
5973 251221 : return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
5974 : }
5975 :
5976 264257 : template <> bool IsEqualToNoData<double>(double value, double noDataValue)
5977 : {
5978 264257 : return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
5979 : }
5980 :
// Return whether all samples of a window of nWidth x nHeight pixels, each
// made of nComponents interleaved values of type T, are equal to noDataValue
// (in the sense of IsEqualToNoData()).
//
// Row iY of the window starts at pBuffer + iY * nLineStride * nComponents,
// i.e. nLineStride is expressed in pixels.
//
// An empty window (zero width, height or component count) contains no sample
// that differs from nodata, so true is returned for it.
template <class T>
static bool HasOnlyNoDataT(const T *pBuffer, T noDataValue, size_t nWidth,
                           size_t nHeight, size_t nLineStride,
                           size_t nComponents)
{
    // Guard against empty windows before probing the corners: with unsigned
    // arithmetic, nWidth - 1 or nHeight - 1 would wrap around and the probes
    // would read far outside of the buffer.
    if (nWidth == 0 || nHeight == 0 || nComponents == 0)
        return true;

    // Fast test: check the 4 corners and the middle pixel.
    const size_t nLastCol = nWidth - 1;
    const size_t nLastRow = nHeight - 1;
    const size_t anProbeOffsets[] = {
        0,                                                          // top-left
        nLastCol * nComponents,                                     // top-right
        (nLastRow / 2 * nLineStride + nLastCol / 2) * nComponents,  // middle
        nLastRow * nLineStride * nComponents,                    // bottom-left
        (nLastRow * nLineStride + nLastCol) * nComponents};     // bottom-right
    for (size_t iBand = 0; iBand < nComponents; iBand++)
    {
        for (const size_t nProbeOffset : anProbeOffsets)
        {
            if (!IsEqualToNoData(pBuffer[nProbeOffset + iBand], noDataValue))
                return false;
        }
    }

    // Test all pixels.
    const size_t nValuesPerLine = nWidth * nComponents;
    for (size_t iY = 0; iY < nHeight; iY++)
    {
        const T *pBufferLine = pBuffer + iY * nLineStride * nComponents;
        for (size_t iX = 0; iX < nValuesPerLine; iX++)
        {
            if (!IsEqualToNoData(pBufferLine[iX], noDataValue))
            {
                return false;
            }
        }
    }
    return true;
}
6024 :
6025 : /************************************************************************/
6026 : /* GDALBufferHasOnlyNoData() */
6027 : /************************************************************************/
6028 :
// Return whether a buffer only contains samples equal to dfNoDataValue.
//
// pBuffer        start of the buffer to inspect.
// dfNoDataValue  nodata value, expressed as a double.
// nWidth         number of pixels per line to inspect.
// nHeight        number of lines to inspect.
// nLineStride    distance between the start of two consecutive lines, in
//                pixels (see the indexing in HasOnlyNoDataT()).
// nComponents    number of interleaved values per pixel.
// nBitsPerSample size of one value, in bits.
// nSampleFormat  GSF_UNSIGNED_INT, GSF_SIGNED_INT or GSF_FLOATING_POINT.
//
// Returns true if all inspected values match the nodata value. Returns false
// if at least one does not, if the nodata value is rejected by
// GDALIsValueInRange<>() for the sample type, or if the
// (nBitsPerSample, nSampleFormat) combination is not handled.
bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
                             size_t nWidth, size_t nHeight, size_t nLineStride,
                             size_t nComponents, int nBitsPerSample,
                             GDALBufferSampleFormat nSampleFormat)
{
    // In the case where the nodata is 0, we can compare several bytes at
    // once. Select the largest natural integer type for the architecture.
    // This requires nWidth == nLineStride, so that the lines follow each
    // other without gap and the buffer can be scanned as a flat byte array.
    if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
        // Do not use this optimized code path for floating point numbers,
        // as it can't detect negative zero.
        nSampleFormat != GSF_FLOATING_POINT)
    {
        const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
        // Total size in bytes, rounded up to a whole byte (nBitsPerSample is
        // not required to be a multiple of 8 on this path).
        const size_t nSize =
            static_cast<size_t>((static_cast<uint64_t>(nWidth) * nHeight *
                                     nComponents * nBitsPerSample +
                                 7) /
                                8);
#ifdef HAVE_SSE2
        size_t n = nSize;
        // Align to 16 bytes
        // (byte per byte, so that the aligned loads below are valid)
        while ((reinterpret_cast<uintptr_t>(pabyBuffer) & 15) != 0 && n > 0)
        {
            --n;
            if (*pabyBuffer)
                return false;
            pabyBuffer++;
        }

        const auto zero = _mm_setzero_si128();
        constexpr int UNROLLING = 4;
        // Main loop: OR together 4 registers of 16 bytes and test the result
        // against zero, i.e. 64 bytes per iteration.
        while (n >= UNROLLING * sizeof(zero))
        {
            const auto v0 = _mm_load_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 0 * sizeof(zero)));
            const auto v1 = _mm_load_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 1 * sizeof(zero)));
            const auto v2 = _mm_load_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 2 * sizeof(zero)));
            const auto v3 = _mm_load_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 3 * sizeof(zero)));
            const auto v =
                _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
            if (!_mm_test_all_zeros(v, v))
#else
            // SSE2-only variant: all 16 bytes must compare equal to zero.
            if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
#endif
            {
                return false;
            }
            pabyBuffer += UNROLLING * sizeof(zero);
            n -= UNROLLING * sizeof(zero);
        }

        // Remaining bytes (less than 64)
        while (n > 0)
        {
            --n;
            if (*pabyBuffer)
                return false;
            pabyBuffer++;
        }
#else
#if SIZEOF_VOIDP >= 8 || defined(__x86_64__)
        // We test __x86_64__ for x32 arch where SIZEOF_VOIDP == 4
        typedef std::uint64_t WordType;
#else
        typedef std::uint32_t WordType;
#endif

        // Number of leading bytes to test one by one before word-sized reads.
        // Note that for an already aligned pointer this evaluates to
        // sizeof(WordType) (capped by nSize), not 0.
        const size_t nInitialIters =
            std::min(sizeof(WordType) -
                         static_cast<size_t>(
                             reinterpret_cast<std::uintptr_t>(pabyBuffer) %
                             sizeof(WordType)),
                     nSize);
        size_t i = 0;
        for (; i < nInitialIters; i++)
        {
            if (pabyBuffer[i])
                return false;
        }
        // Word-sized comparisons
        for (; i + sizeof(WordType) - 1 < nSize; i += sizeof(WordType))
        {
            if (*(reinterpret_cast<const WordType *>(pabyBuffer + i)))
                return false;
        }
        // Trailing bytes
        for (; i < nSize; i++)
        {
            if (pabyBuffer[i])
                return false;
        }
#endif
        return true;
    }

#ifdef HAVE_SSE2
    // Float32 buffer with nodata == 0: test the raw bits with the sign bit
    // masked out, so that both +0.0 and -0.0 are considered as nodata.
    else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
             nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
    {
        // Despite its name, this mask keeps every bit except the sign bit.
        const auto signMask = _mm_set1_epi32(0x7FFFFFFF);
        const auto zero = _mm_setzero_si128();
        const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
        // Number of float values to test
        const size_t n = nWidth * nHeight * nComponents;

        size_t i = 0;
        constexpr int UNROLLING = 4;
        constexpr size_t VALUES_PER_ITER =
            UNROLLING * sizeof(zero) / sizeof(float);
        // Unaligned loads are used: no alignment step is done on this path.
        for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
        {
            const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 0 * sizeof(zero)));
            const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 1 * sizeof(zero)));
            const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 2 * sizeof(zero)));
            const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 3 * sizeof(zero)));
            auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
            // Clear the sign bit (makes -0.0 become +0.0)
            v = _mm_and_si128(v, signMask);
#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
            if (!_mm_test_all_zeros(v, v))
#else
            if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
#endif
            {
                return false;
            }
            pabyBuffer += UNROLLING * sizeof(zero);
        }

        // Remaining values, one at a time, with the same sign-insensitive
        // test. memcpy() is used to read the bits of the float.
        for (; i < n; i++)
        {
            uint32_t bits;
            memcpy(&bits, pabyBuffer, sizeof(bits));
            pabyBuffer += sizeof(bits);
            if ((bits & 0x7FFFFFFF) != 0)
                return false;
        }

        return true;
    }

    // Float64 buffer with nodata == 0: same approach as for Float32 above.
    else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
             nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
    {
        // Despite its name, this mask keeps every bit except the sign bit.
        const auto signMask = _mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL);
        const auto zero = _mm_setzero_si128();
        const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
        // Number of double values to test
        const size_t n = nWidth * nHeight * nComponents;

        size_t i = 0;
        constexpr int UNROLLING = 4;
        constexpr size_t VALUES_PER_ITER =
            UNROLLING * sizeof(zero) / sizeof(double);
        for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
        {
            const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 0 * sizeof(zero)));
            const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 1 * sizeof(zero)));
            const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 2 * sizeof(zero)));
            const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
                pabyBuffer + 3 * sizeof(zero)));
            auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
            // Clear the sign bit (makes -0.0 become +0.0)
            v = _mm_and_si128(v, signMask);
#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
            if (!_mm_test_all_zeros(v, v))
#else
            if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
#endif
            {
                return false;
            }
            pabyBuffer += UNROLLING * sizeof(zero);
        }

        // Remaining values, one at a time
        for (; i < n; i++)
        {
            uint64_t bits;
            memcpy(&bits, pabyBuffer, sizeof(bits));
            pabyBuffer += sizeof(bits);
            if ((bits & 0x7FFFFFFFFFFFFFFFULL) != 0)
                return false;
        }

        return true;
    }
#endif

    // General case: dispatch on (nBitsPerSample, nSampleFormat) to the
    // templated implementation. For integer types, the nodata value is first
    // submitted to GDALIsValueInRange<>() (presumably a check that it fits
    // the range of the type - confirm against its definition): if that
    // fails, false is returned without looking at the buffer.
    if (nBitsPerSample == 8 && nSampleFormat == GSF_UNSIGNED_INT)
    {
        return GDALIsValueInRange<uint8_t>(dfNoDataValue) &&
               HasOnlyNoDataT(static_cast<const uint8_t *>(pBuffer),
                              static_cast<uint8_t>(dfNoDataValue), nWidth,
                              nHeight, nLineStride, nComponents);
    }
    if (nBitsPerSample == 8 && nSampleFormat == GSF_SIGNED_INT)
    {
        // Use unsigned implementation by converting the nodatavalue to
        // unsigned
        return GDALIsValueInRange<int8_t>(dfNoDataValue) &&
               HasOnlyNoDataT(
                   static_cast<const uint8_t *>(pBuffer),
                   static_cast<uint8_t>(static_cast<int8_t>(dfNoDataValue)),
                   nWidth, nHeight, nLineStride, nComponents);
    }
    if (nBitsPerSample == 16 && nSampleFormat == GSF_UNSIGNED_INT)
    {
        return GDALIsValueInRange<uint16_t>(dfNoDataValue) &&
               HasOnlyNoDataT(static_cast<const uint16_t *>(pBuffer),
                              static_cast<uint16_t>(dfNoDataValue), nWidth,
                              nHeight, nLineStride, nComponents);
    }
    if (nBitsPerSample == 16 && nSampleFormat == GSF_SIGNED_INT)
    {
        // Use unsigned implementation by converting the nodatavalue to
        // unsigned
        return GDALIsValueInRange<int16_t>(dfNoDataValue) &&
               HasOnlyNoDataT(
                   static_cast<const uint16_t *>(pBuffer),
                   static_cast<uint16_t>(static_cast<int16_t>(dfNoDataValue)),
                   nWidth, nHeight, nLineStride, nComponents);
    }
    if (nBitsPerSample == 32 && nSampleFormat == GSF_UNSIGNED_INT)
    {
        return GDALIsValueInRange<uint32_t>(dfNoDataValue) &&
               HasOnlyNoDataT(static_cast<const uint32_t *>(pBuffer),
                              static_cast<uint32_t>(dfNoDataValue), nWidth,
                              nHeight, nLineStride, nComponents);
    }
    if (nBitsPerSample == 32 && nSampleFormat == GSF_SIGNED_INT)
    {
        // Use unsigned implementation by converting the nodatavalue to
        // unsigned
        return GDALIsValueInRange<int32_t>(dfNoDataValue) &&
               HasOnlyNoDataT(
                   static_cast<const uint32_t *>(pBuffer),
                   static_cast<uint32_t>(static_cast<int32_t>(dfNoDataValue)),
                   nWidth, nHeight, nLineStride, nComponents);
    }
    if (nBitsPerSample == 64 && nSampleFormat == GSF_UNSIGNED_INT)
    {
        return GDALIsValueInRange<uint64_t>(dfNoDataValue) &&
               HasOnlyNoDataT(static_cast<const uint64_t *>(pBuffer),
                              static_cast<uint64_t>(dfNoDataValue), nWidth,
                              nHeight, nLineStride, nComponents);
    }
    if (nBitsPerSample == 64 && nSampleFormat == GSF_SIGNED_INT)
    {
        // Use unsigned implementation by converting the nodatavalue to
        // unsigned
        return GDALIsValueInRange<int64_t>(dfNoDataValue) &&
               HasOnlyNoDataT(
                   static_cast<const uint64_t *>(pBuffer),
                   static_cast<uint64_t>(static_cast<int64_t>(dfNoDataValue)),
                   nWidth, nHeight, nLineStride, nComponents);
    }
    // Floating point types: a NaN nodata value is accepted without going
    // through GDALIsValueInRange<>().
    if (nBitsPerSample == 16 && nSampleFormat == GSF_FLOATING_POINT)
    {
        return (std::isnan(dfNoDataValue) ||
                GDALIsValueInRange<GFloat16>(dfNoDataValue)) &&
               HasOnlyNoDataT(static_cast<const GFloat16 *>(pBuffer),
                              static_cast<GFloat16>(dfNoDataValue), nWidth,
                              nHeight, nLineStride, nComponents);
    }
    if (nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
    {
        return (std::isnan(dfNoDataValue) ||
                GDALIsValueInRange<float>(dfNoDataValue)) &&
               HasOnlyNoDataT(static_cast<const float *>(pBuffer),
                              static_cast<float>(dfNoDataValue), nWidth,
                              nHeight, nLineStride, nComponents);
    }
    if (nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
    {
        // The nodata value is already a double: no conversion nor range
        // check is done.
        return HasOnlyNoDataT(static_cast<const double *>(pBuffer),
                              dfNoDataValue, nWidth, nHeight, nLineStride,
                              nComponents);
    }
    // Combination of bit depth and sample format that is not handled above.
    return false;
}
6315 :
6316 : #ifdef HAVE_SSE2
6317 :
6318 : /************************************************************************/
6319 : /* GDALDeinterleave3Byte() */
6320 : /************************************************************************/
6321 :
6322 : #if defined(__GNUC__) && !defined(__clang__)
6323 : __attribute__((optimize("no-tree-vectorize")))
6324 : #endif
6325 380714 : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
6326 : GByte *CPL_RESTRICT pabyDest0,
6327 : GByte *CPL_RESTRICT pabyDest1,
6328 : GByte *CPL_RESTRICT pabyDest2, size_t nIters)
6329 : #ifdef USE_NEON_OPTIMIZATIONS
6330 : {
6331 : return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6332 : nIters);
6333 : }
6334 : #else
6335 : {
6336 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
6337 380714 : if (CPLHaveRuntimeSSSE3())
6338 : {
6339 380712 : return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
6340 380712 : pabyDest2, nIters);
6341 : }
6342 : #endif
6343 :
6344 2 : size_t i = 0;
6345 2 : if (((reinterpret_cast<uintptr_t>(pabySrc) |
6346 2 : reinterpret_cast<uintptr_t>(pabyDest0) |
6347 2 : reinterpret_cast<uintptr_t>(pabyDest1) |
6348 2 : reinterpret_cast<uintptr_t>(pabyDest2)) %
6349 : sizeof(unsigned int)) == 0)
6350 : {
6351 : // Slightly better than GCC autovectorizer
6352 17 : for (size_t j = 0; i + 3 < nIters; i += 4, ++j)
6353 : {
6354 15 : unsigned int word0 =
6355 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i);
6356 15 : unsigned int word1 =
6357 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 4);
6358 15 : unsigned int word2 =
6359 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 8);
6360 15 : reinterpret_cast<unsigned int *>(pabyDest0)[j] =
6361 15 : (word0 & 0xff) | ((word0 >> 24) << 8) | (word1 & 0x00ff0000) |
6362 15 : ((word2 >> 8) << 24);
6363 15 : reinterpret_cast<unsigned int *>(pabyDest1)[j] =
6364 15 : ((word0 >> 8) & 0xff) | ((word1 & 0xff) << 8) |
6365 15 : (((word1 >> 24)) << 16) | ((word2 >> 16) << 24);
6366 15 : pabyDest2[j * 4] = static_cast<GByte>(word0 >> 16);
6367 15 : pabyDest2[j * 4 + 1] = static_cast<GByte>(word1 >> 8);
6368 15 : pabyDest2[j * 4 + 2] = static_cast<GByte>(word2);
6369 15 : pabyDest2[j * 4 + 3] = static_cast<GByte>(word2 >> 24);
6370 : }
6371 : }
6372 : #if defined(__clang__)
6373 : #pragma clang loop vectorize(disable)
6374 : #endif
6375 3 : for (; i < nIters; ++i)
6376 : {
6377 1 : pabyDest0[i] = pabySrc[3 * i + 0];
6378 1 : pabyDest1[i] = pabySrc[3 * i + 1];
6379 1 : pabyDest2[i] = pabySrc[3 * i + 2];
6380 : }
6381 : }
6382 : #endif
6383 :
6384 : /************************************************************************/
6385 : /* GDALDeinterleave4Byte() */
6386 : /************************************************************************/
6387 :
6388 : #if !defined(__GNUC__) || defined(__clang__)
6389 :
6390 : /************************************************************************/
6391 : /* deinterleave() */
6392 : /************************************************************************/
6393 :
// Gather one byte of each of the 16 32-bit words held in the four registers
// into a single register of 16 bytes.
//
// When SHIFT is true, the four registers are first shifted right by 8 bits.
// They are passed by reference and modified in place, so successive calls
// walk through the successive bytes of each word.
// When MASK is true, only the lowest byte of each word is kept before
// packing. The packing steps saturate, so MASK may only be false when the
// words are already known to hold values that fit on a byte.
template <bool SHIFT, bool MASK>
inline __m128i deinterleave(__m128i &xmm0_ori, __m128i &xmm1_ori,
                            __m128i &xmm2_ori, __m128i &xmm3_ori)
{
    if (SHIFT)
    {
        // Bring the next byte of each int32 word in lowest position
        xmm0_ori = _mm_srli_epi32(xmm0_ori, 8);
        xmm1_ori = _mm_srli_epi32(xmm1_ori, 8);
        xmm2_ori = _mm_srli_epi32(xmm2_ori, 8);
        xmm3_ori = _mm_srli_epi32(xmm3_ori, 8);
    }

    __m128i words0 = xmm0_ori;
    __m128i words1 = xmm1_ori;
    __m128i words2 = xmm2_ori;
    __m128i words3 = xmm3_ori;
    if (MASK)
    {
        // Set higher 24bit of each int32 packed word to 0
        const __m128i lowByteMask = _mm_set1_epi32(0xff);
        words0 = _mm_and_si128(words0, lowByteMask);
        words1 = _mm_and_si128(words1, lowByteMask);
        words2 = _mm_and_si128(words2, lowByteMask);
        words3 = _mm_and_si128(words3, lowByteMask);
    }

    // Pack int32 to int16
    const __m128i shortsLow = _mm_packs_epi32(words0, words1);
    const __m128i shortsHigh = _mm_packs_epi32(words2, words3);
    // Pack int16 to uint8
    return _mm_packus_epi16(shortsLow, shortsHigh);
}
6432 :
6433 : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
6434 : GByte *CPL_RESTRICT pabyDest0,
6435 : GByte *CPL_RESTRICT pabyDest1,
6436 : GByte *CPL_RESTRICT pabyDest2,
6437 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
6438 : #ifdef USE_NEON_OPTIMIZATIONS
6439 : {
6440 : return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6441 : pabyDest3, nIters);
6442 : }
6443 : #else
6444 : {
6445 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
6446 : if (CPLHaveRuntimeSSSE3())
6447 : {
6448 : return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
6449 : pabyDest2, pabyDest3, nIters);
6450 : }
6451 : #endif
6452 :
6453 : // Not the optimal SSE2-only code, as gcc auto-vectorizer manages to
6454 : // do something slightly better.
6455 : size_t i = 0;
6456 : for (; i + 15 < nIters; i += 16)
6457 : {
6458 : __m128i xmm0_ori = _mm_loadu_si128(
6459 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
6460 : __m128i xmm1_ori = _mm_loadu_si128(
6461 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
6462 : __m128i xmm2_ori = _mm_loadu_si128(
6463 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
6464 : __m128i xmm3_ori = _mm_loadu_si128(
6465 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
6466 :
6467 : _mm_storeu_si128(
6468 : reinterpret_cast<__m128i *>(pabyDest0 + i),
6469 : deinterleave<false, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
6470 : _mm_storeu_si128(
6471 : reinterpret_cast<__m128i *>(pabyDest1 + i),
6472 : deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
6473 : _mm_storeu_si128(
6474 : reinterpret_cast<__m128i *>(pabyDest2 + i),
6475 : deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
6476 : _mm_storeu_si128(
6477 : reinterpret_cast<__m128i *>(pabyDest3 + i),
6478 : deinterleave<true, false>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
6479 : }
6480 :
6481 : #if defined(__clang__)
6482 : #pragma clang loop vectorize(disable)
6483 : #endif
6484 : for (; i < nIters; ++i)
6485 : {
6486 : pabyDest0[i] = pabySrc[4 * i + 0];
6487 : pabyDest1[i] = pabySrc[4 * i + 1];
6488 : pabyDest2[i] = pabySrc[4 * i + 2];
6489 : pabyDest3[i] = pabySrc[4 * i + 3];
6490 : }
6491 : }
6492 : #endif
6493 : #else
// GCC autovectorizer does an excellent job
// Split nIters 4-byte interleaved pixels of pabySrc into four per-component
// byte arrays. This variant is a plain scalar loop on purpose: the function
// is compiled with the "tree-vectorize" optimization attribute, so that GCC
// vectorizes it.
__attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
    const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
    GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2,
    GByte *CPL_RESTRICT pabyDest3, size_t nIters)
{
    for (size_t i = 0; i < nIters; ++i)
    {
        // Component k of pixel i is at byte 4 * i + k of the source
        pabyDest0[i] = pabySrc[4 * i + 0];
        pabyDest1[i] = pabySrc[4 * i + 1];
        pabyDest2[i] = pabySrc[4 * i + 2];
        pabyDest3[i] = pabySrc[4 * i + 3];
    }
}
6508 : #endif
6509 :
6510 : #else
6511 :
6512 : /************************************************************************/
6513 : /* GDALDeinterleave3Byte() */
6514 : /************************************************************************/
6515 :
6516 : // TODO: Enabling below could help on non-Intel architectures where GCC knows
6517 : // how to auto-vectorize
6518 : // #if defined(__GNUC__)
6519 : //__attribute__((optimize("tree-vectorize")))
6520 : // #endif
6521 : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
6522 : GByte *CPL_RESTRICT pabyDest0,
6523 : GByte *CPL_RESTRICT pabyDest1,
6524 : GByte *CPL_RESTRICT pabyDest2, size_t nIters)
6525 : {
6526 : for (size_t i = 0; i < nIters; ++i)
6527 : {
6528 : pabyDest0[i] = pabySrc[3 * i + 0];
6529 : pabyDest1[i] = pabySrc[3 * i + 1];
6530 : pabyDest2[i] = pabySrc[3 * i + 2];
6531 : }
6532 : }
6533 :
6534 : /************************************************************************/
6535 : /* GDALDeinterleave4Byte() */
6536 : /************************************************************************/
6537 :
6538 : // TODO: Enabling below could help on non-Intel architectures where gcc knows
6539 : // how to auto-vectorize
6540 : // #if defined(__GNUC__)
6541 : //__attribute__((optimize("tree-vectorize")))
6542 : // #endif
6543 : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
6544 : GByte *CPL_RESTRICT pabyDest0,
6545 : GByte *CPL_RESTRICT pabyDest1,
6546 : GByte *CPL_RESTRICT pabyDest2,
6547 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
6548 : {
6549 : for (size_t i = 0; i < nIters; ++i)
6550 : {
6551 : pabyDest0[i] = pabySrc[4 * i + 0];
6552 : pabyDest1[i] = pabySrc[4 * i + 1];
6553 : pabyDest2[i] = pabySrc[4 * i + 2];
6554 : pabyDest3[i] = pabySrc[4 * i + 3];
6555 : }
6556 : }
6557 :
6558 : #endif
6559 :
6560 : /************************************************************************/
6561 : /* GDALDeinterleave() */
6562 : /************************************************************************/
6563 :
6564 : /*! Copy values from a pixel-interleave buffer to multiple per-component
6565 : buffers.
6566 :
6567 : In pseudo-code
6568 : \verbatim
6569 : for(size_t i = 0; i < nIters; ++i)
6570 : for(int iComp = 0; iComp < nComponents; iComp++ )
6571 : ppDestBuffer[iComp][i] = pSourceBuffer[nComponents * i + iComp]
6572 : \endverbatim
6573 :
6574 : The implementation is optimized for a few cases, like de-interleaving
6575 : of 3 or 4-components Byte buffers.
6576 :
6577 : \since GDAL 3.6
6578 : */
6579 454286 : void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
6580 : int nComponents, void **ppDestBuffer,
6581 : GDALDataType eDestDT, size_t nIters)
6582 : {
6583 454286 : if (eSourceDT == eDestDT)
6584 : {
6585 454264 : if (eSourceDT == GDT_UInt8 || eSourceDT == GDT_Int8)
6586 : {
6587 453943 : if (nComponents == 3)
6588 : {
6589 380714 : const GByte *CPL_RESTRICT pabySrc =
6590 : static_cast<const GByte *>(pSourceBuffer);
6591 380714 : GByte *CPL_RESTRICT pabyDest0 =
6592 : static_cast<GByte *>(ppDestBuffer[0]);
6593 380714 : GByte *CPL_RESTRICT pabyDest1 =
6594 : static_cast<GByte *>(ppDestBuffer[1]);
6595 380714 : GByte *CPL_RESTRICT pabyDest2 =
6596 : static_cast<GByte *>(ppDestBuffer[2]);
6597 380714 : GDALDeinterleave3Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6598 : nIters);
6599 380714 : return;
6600 : }
6601 73229 : else if (nComponents == 4)
6602 : {
6603 73222 : const GByte *CPL_RESTRICT pabySrc =
6604 : static_cast<const GByte *>(pSourceBuffer);
6605 73222 : GByte *CPL_RESTRICT pabyDest0 =
6606 : static_cast<GByte *>(ppDestBuffer[0]);
6607 73222 : GByte *CPL_RESTRICT pabyDest1 =
6608 : static_cast<GByte *>(ppDestBuffer[1]);
6609 73222 : GByte *CPL_RESTRICT pabyDest2 =
6610 : static_cast<GByte *>(ppDestBuffer[2]);
6611 73222 : GByte *CPL_RESTRICT pabyDest3 =
6612 : static_cast<GByte *>(ppDestBuffer[3]);
6613 73222 : GDALDeinterleave4Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6614 : pabyDest3, nIters);
6615 73222 : return;
6616 7 : }
6617 : }
6618 : #if ((defined(__GNUC__) && !defined(__clang__)) || \
6619 : defined(__INTEL_CLANG_COMPILER)) && \
6620 : defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME)
6621 642 : else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&
6622 321 : CPLHaveRuntimeSSSE3())
6623 : {
6624 321 : if (nComponents == 3)
6625 : {
6626 126 : const GUInt16 *CPL_RESTRICT panSrc =
6627 : static_cast<const GUInt16 *>(pSourceBuffer);
6628 126 : GUInt16 *CPL_RESTRICT panDest0 =
6629 : static_cast<GUInt16 *>(ppDestBuffer[0]);
6630 126 : GUInt16 *CPL_RESTRICT panDest1 =
6631 : static_cast<GUInt16 *>(ppDestBuffer[1]);
6632 126 : GUInt16 *CPL_RESTRICT panDest2 =
6633 : static_cast<GUInt16 *>(ppDestBuffer[2]);
6634 126 : GDALDeinterleave3UInt16_SSSE3(panSrc, panDest0, panDest1,
6635 : panDest2, nIters);
6636 126 : return;
6637 : }
6638 : #if !defined(__INTEL_CLANG_COMPILER)
6639 : // ICC autovectorizer doesn't do a good job, at least with icx
6640 : // 2022.1.0.20220316
6641 195 : else if (nComponents == 4)
6642 : {
6643 195 : const GUInt16 *CPL_RESTRICT panSrc =
6644 : static_cast<const GUInt16 *>(pSourceBuffer);
6645 195 : GUInt16 *CPL_RESTRICT panDest0 =
6646 : static_cast<GUInt16 *>(ppDestBuffer[0]);
6647 195 : GUInt16 *CPL_RESTRICT panDest1 =
6648 : static_cast<GUInt16 *>(ppDestBuffer[1]);
6649 195 : GUInt16 *CPL_RESTRICT panDest2 =
6650 : static_cast<GUInt16 *>(ppDestBuffer[2]);
6651 195 : GUInt16 *CPL_RESTRICT panDest3 =
6652 : static_cast<GUInt16 *>(ppDestBuffer[3]);
6653 195 : GDALDeinterleave4UInt16_SSSE3(panSrc, panDest0, panDest1,
6654 : panDest2, panDest3, nIters);
6655 195 : return;
6656 : }
6657 : #endif
6658 : }
6659 : #endif
6660 : }
6661 :
6662 29 : const int nSourceDTSize = GDALGetDataTypeSizeBytes(eSourceDT);
6663 29 : const int nDestDTSize = GDALGetDataTypeSizeBytes(eDestDT);
6664 108 : for (int iComp = 0; iComp < nComponents; iComp++)
6665 : {
6666 79 : GDALCopyWords64(static_cast<const GByte *>(pSourceBuffer) +
6667 79 : iComp * nSourceDTSize,
6668 : eSourceDT, nComponents * nSourceDTSize,
6669 79 : ppDestBuffer[iComp], eDestDT, nDestDTSize, nIters);
6670 : }
6671 : }
6672 :
6673 : /************************************************************************/
6674 : /* GDALTranspose2DSingleToSingle() */
6675 : /************************************************************************/
6676 : /**
6677 : * Transpose a 2D array of non-complex values, in a efficient (cache-oblivious) way.
6678 : *
6679 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6680 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6681 : * @param nSrcWidth Width of pSrc array.
6682 : * @param nSrcHeight Height of pSrc array.
6683 : */
6684 :
6685 : template <class DST, class SRC>
6686 160 : void GDALTranspose2DSingleToSingle(const SRC *CPL_RESTRICT pSrc,
6687 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6688 : size_t nSrcHeight)
6689 : {
6690 160 : constexpr size_t blocksize = 32;
6691 345 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6692 : {
6693 185 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6694 5016 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6695 : {
6696 : // transpose the block beginning at [i,j]
6697 4831 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6698 26185 : for (size_t k = i; k < max_k; ++k)
6699 : {
6700 669282 : for (size_t l = j; l < max_l; ++l)
6701 : {
6702 647928 : GDALCopyWord(pSrc[l + k * nSrcWidth],
6703 647928 : pDst[k + l * nSrcHeight]);
6704 : }
6705 : }
6706 : }
6707 : }
6708 160 : }
6709 :
6710 : /************************************************************************/
6711 : /* GDALTranspose2DComplexToComplex() */
6712 : /************************************************************************/
6713 : /**
6714 : * Transpose a 2D array of complex values into an array of complex values,
6715 : * in a efficient (cache-oblivious) way.
6716 : *
6717 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6718 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6719 : * @param nSrcWidth Width of pSrc array.
6720 : * @param nSrcHeight Height of pSrc array.
6721 : */
6722 : template <class DST, class SRC>
6723 25 : void GDALTranspose2DComplexToComplex(const SRC *CPL_RESTRICT pSrc,
6724 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6725 : size_t nSrcHeight)
6726 : {
6727 25 : constexpr size_t blocksize = 32;
6728 50 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6729 : {
6730 25 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6731 50 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6732 : {
6733 : // transpose the block beginning at [i,j]
6734 25 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6735 75 : for (size_t k = i; k < max_k; ++k)
6736 : {
6737 200 : for (size_t l = j; l < max_l; ++l)
6738 : {
6739 150 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
6740 150 : pDst[2 * (k + l * nSrcHeight) + 0]);
6741 150 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 1],
6742 150 : pDst[2 * (k + l * nSrcHeight) + 1]);
6743 : }
6744 : }
6745 : }
6746 : }
6747 25 : }
6748 :
6749 : /************************************************************************/
6750 : /* GDALTranspose2DComplexToSingle() */
6751 : /************************************************************************/
6752 : /**
6753 : * Transpose a 2D array of complex values into an array of non-complex values,
6754 : * in a efficient (cache-oblivious) way.
6755 : *
6756 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6757 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6758 : * @param nSrcWidth Width of pSrc array.
6759 : * @param nSrcHeight Height of pSrc array.
6760 : */
6761 : template <class DST, class SRC>
6762 55 : void GDALTranspose2DComplexToSingle(const SRC *CPL_RESTRICT pSrc,
6763 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6764 : size_t nSrcHeight)
6765 : {
6766 55 : constexpr size_t blocksize = 32;
6767 110 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6768 : {
6769 55 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6770 110 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6771 : {
6772 : // transpose the block beginning at [i,j]
6773 55 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6774 165 : for (size_t k = i; k < max_k; ++k)
6775 : {
6776 440 : for (size_t l = j; l < max_l; ++l)
6777 : {
6778 330 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
6779 330 : pDst[k + l * nSrcHeight]);
6780 : }
6781 : }
6782 : }
6783 : }
6784 55 : }
6785 :
6786 : /************************************************************************/
6787 : /* GDALTranspose2DSingleToComplex() */
6788 : /************************************************************************/
6789 : /**
6790 : * Transpose a 2D array of non-complex values into an array of complex values,
6791 : * in a efficient (cache-oblivious) way.
6792 : *
6793 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6794 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6795 : * @param nSrcWidth Width of pSrc array.
6796 : * @param nSrcHeight Height of pSrc array.
6797 : */
6798 : template <class DST, class SRC>
6799 55 : void GDALTranspose2DSingleToComplex(const SRC *CPL_RESTRICT pSrc,
6800 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6801 : size_t nSrcHeight)
6802 : {
6803 55 : constexpr size_t blocksize = 32;
6804 110 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6805 : {
6806 55 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6807 110 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6808 : {
6809 : // transpose the block beginning at [i,j]
6810 55 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6811 165 : for (size_t k = i; k < max_k; ++k)
6812 : {
6813 440 : for (size_t l = j; l < max_l; ++l)
6814 : {
6815 330 : GDALCopyWord(pSrc[l + k * nSrcWidth],
6816 330 : pDst[2 * (k + l * nSrcHeight) + 0]);
6817 330 : pDst[2 * (k + l * nSrcHeight) + 1] = 0;
6818 : }
6819 : }
6820 : }
6821 : }
6822 55 : }
6823 :
6824 : /************************************************************************/
6825 : /* GDALTranspose2D() */
6826 : /************************************************************************/
6827 :
6828 : template <class DST, bool DST_IS_COMPLEX>
6829 295 : static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
6830 : size_t nSrcWidth, size_t nSrcHeight)
6831 : {
6832 : #define CALL_GDALTranspose2D_internal(SRC_TYPE) \
6833 : do \
6834 : { \
6835 : if constexpr (DST_IS_COMPLEX) \
6836 : { \
6837 : GDALTranspose2DSingleToComplex( \
6838 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6839 : nSrcHeight); \
6840 : } \
6841 : else \
6842 : { \
6843 : GDALTranspose2DSingleToSingle(static_cast<const SRC_TYPE *>(pSrc), \
6844 : pDst, nSrcWidth, nSrcHeight); \
6845 : } \
6846 : } while (0)
6847 :
6848 : #define CALL_GDALTranspose2DComplex_internal(SRC_TYPE) \
6849 : do \
6850 : { \
6851 : if constexpr (DST_IS_COMPLEX) \
6852 : { \
6853 : GDALTranspose2DComplexToComplex( \
6854 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6855 : nSrcHeight); \
6856 : } \
6857 : else \
6858 : { \
6859 : GDALTranspose2DComplexToSingle( \
6860 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6861 : nSrcHeight); \
6862 : } \
6863 : } while (0)
6864 :
6865 : // clang-format off
6866 295 : switch (eSrcType)
6867 : {
6868 16 : case GDT_UInt8: CALL_GDALTranspose2D_internal(uint8_t); break;
6869 15 : case GDT_Int8: CALL_GDALTranspose2D_internal(int8_t); break;
6870 33 : case GDT_UInt16: CALL_GDALTranspose2D_internal(uint16_t); break;
6871 20 : case GDT_Int16: CALL_GDALTranspose2D_internal(int16_t); break;
6872 24 : case GDT_UInt32: CALL_GDALTranspose2D_internal(uint32_t); break;
6873 16 : case GDT_Int32: CALL_GDALTranspose2D_internal(int32_t); break;
6874 16 : case GDT_UInt64: CALL_GDALTranspose2D_internal(uint64_t); break;
6875 16 : case GDT_Int64: CALL_GDALTranspose2D_internal(int64_t); break;
6876 16 : case GDT_Float16: CALL_GDALTranspose2D_internal(GFloat16); break;
6877 19 : case GDT_Float32: CALL_GDALTranspose2D_internal(float); break;
6878 24 : case GDT_Float64: CALL_GDALTranspose2D_internal(double); break;
6879 16 : case GDT_CInt16: CALL_GDALTranspose2DComplex_internal(int16_t); break;
6880 16 : case GDT_CInt32: CALL_GDALTranspose2DComplex_internal(int32_t); break;
6881 16 : case GDT_CFloat16: CALL_GDALTranspose2DComplex_internal(GFloat16); break;
6882 16 : case GDT_CFloat32: CALL_GDALTranspose2DComplex_internal(float); break;
6883 16 : case GDT_CFloat64: CALL_GDALTranspose2DComplex_internal(double); break;
6884 0 : case GDT_Unknown:
6885 : case GDT_TypeCount:
6886 0 : break;
6887 : }
6888 : // clang-format on
6889 :
6890 : #undef CALL_GDALTranspose2D_internal
6891 : #undef CALL_GDALTranspose2DComplex_internal
6892 295 : }
6893 :
6894 : /************************************************************************/
6895 : /* GDALInterleave2Byte() */
6896 : /************************************************************************/
6897 :
6898 : #if defined(HAVE_SSE2) && \
6899 : (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
6900 :
6901 : // ICC autovectorizer doesn't do a good job at generating good SSE code,
6902 : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
6903 : #if defined(__GNUC__)
6904 : __attribute__((noinline))
6905 : #endif
6906 : static void GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
6907 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
6908 : {
6909 : size_t i = 0;
6910 : constexpr size_t VALS_PER_ITER = 16;
6911 : for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
6912 : {
6913 : __m128i xmm0 =
6914 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + i));
6915 : __m128i xmm1 = _mm_loadu_si128(
6916 : reinterpret_cast<__m128i const *>(pSrc + i + nIters));
6917 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + 2 * i),
6918 : _mm_unpacklo_epi8(xmm0, xmm1));
6919 : _mm_storeu_si128(
6920 : reinterpret_cast<__m128i *>(pDst + 2 * i + VALS_PER_ITER),
6921 : _mm_unpackhi_epi8(xmm0, xmm1));
6922 : }
6923 : #if defined(__clang__)
6924 : #pragma clang loop vectorize(disable)
6925 : #endif
6926 : for (; i < nIters; ++i)
6927 : {
6928 : pDst[2 * i + 0] = pSrc[i + 0 * nIters];
6929 : pDst[2 * i + 1] = pSrc[i + 1 * nIters];
6930 : }
6931 : }
6932 :
6933 : #else
6934 :
6935 : #if defined(__GNUC__) && !defined(__clang__)
6936 : __attribute__((optimize("tree-vectorize")))
6937 : #endif
6938 : #if defined(__GNUC__)
6939 : __attribute__((noinline))
6940 : #endif
6941 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6942 : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
6943 : #pragma clang diagnostic push
6944 : #pragma clang diagnostic ignored "-Wpass-failed"
6945 : #endif
6946 9 : static void GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
6947 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
6948 : {
6949 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6950 : #pragma clang loop vectorize(enable)
6951 : #endif
6952 355429 : for (size_t i = 0; i < nIters; ++i)
6953 : {
6954 355420 : pDst[2 * i + 0] = pSrc[i + 0 * nIters];
6955 355420 : pDst[2 * i + 1] = pSrc[i + 1 * nIters];
6956 : }
6957 9 : }
6958 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6959 : #pragma clang diagnostic pop
6960 : #endif
6961 :
6962 : #endif
6963 :
6964 : /************************************************************************/
6965 : /* GDALInterleave4Byte() */
6966 : /************************************************************************/
6967 :
6968 : #if defined(HAVE_SSE2) && \
6969 : (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
6970 :
6971 : // ICC autovectorizer doesn't do a good job at generating good SSE code,
6972 : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
6973 : #if defined(__GNUC__)
6974 : __attribute__((noinline))
6975 : #endif
6976 : static void GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
6977 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
6978 : {
6979 : size_t i = 0;
6980 : constexpr size_t VALS_PER_ITER = 16;
6981 : for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
6982 : {
6983 : __m128i xmm0 = _mm_loadu_si128(
6984 : reinterpret_cast<__m128i const *>(pSrc + i + 0 * nIters));
6985 : __m128i xmm1 = _mm_loadu_si128(
6986 : reinterpret_cast<__m128i const *>(pSrc + i + 1 * nIters));
6987 : __m128i xmm2 = _mm_loadu_si128(
6988 : reinterpret_cast<__m128i const *>(pSrc + i + 2 * nIters));
6989 : __m128i xmm3 = _mm_loadu_si128(
6990 : reinterpret_cast<__m128i const *>(pSrc + i + 3 * nIters));
6991 : auto tmp0 = _mm_unpacklo_epi8(
6992 : xmm0,
6993 : xmm1); // (xmm0_0, xmm1_0, xmm0_1, xmm1_1, xmm0_2, xmm1_2, ...)
6994 : auto tmp1 = _mm_unpackhi_epi8(
6995 : xmm0,
6996 : xmm1); // (xmm0_8, xmm1_8, xmm0_9, xmm1_9, xmm0_10, xmm1_10, ...)
6997 : auto tmp2 = _mm_unpacklo_epi8(
6998 : xmm2,
6999 : xmm3); // (xmm2_0, xmm3_0, xmm2_1, xmm3_1, xmm2_2, xmm3_2, ...)
7000 : auto tmp3 = _mm_unpackhi_epi8(
7001 : xmm2,
7002 : xmm3); // (xmm2_8, xmm3_8, xmm2_9, xmm3_9, xmm2_10, xmm3_10, ...)
7003 : auto tmp2_0 = _mm_unpacklo_epi16(
7004 : tmp0,
7005 : tmp2); // (xmm0_0, xmm1_0, xmm2_0, xmm3_0, xmm0_1, xmm1_1, xmm2_1, xmm3_1, ...)
7006 : auto tmp2_1 = _mm_unpackhi_epi16(tmp0, tmp2);
7007 : auto tmp2_2 = _mm_unpacklo_epi16(tmp1, tmp3);
7008 : auto tmp2_3 = _mm_unpackhi_epi16(tmp1, tmp3);
7009 : _mm_storeu_si128(
7010 : reinterpret_cast<__m128i *>(pDst + 4 * i + 0 * VALS_PER_ITER),
7011 : tmp2_0);
7012 : _mm_storeu_si128(
7013 : reinterpret_cast<__m128i *>(pDst + 4 * i + 1 * VALS_PER_ITER),
7014 : tmp2_1);
7015 : _mm_storeu_si128(
7016 : reinterpret_cast<__m128i *>(pDst + 4 * i + 2 * VALS_PER_ITER),
7017 : tmp2_2);
7018 : _mm_storeu_si128(
7019 : reinterpret_cast<__m128i *>(pDst + 4 * i + 3 * VALS_PER_ITER),
7020 : tmp2_3);
7021 : }
7022 : #if defined(__clang__)
7023 : #pragma clang loop vectorize(disable)
7024 : #endif
7025 : for (; i < nIters; ++i)
7026 : {
7027 : pDst[4 * i + 0] = pSrc[i + 0 * nIters];
7028 : pDst[4 * i + 1] = pSrc[i + 1 * nIters];
7029 : pDst[4 * i + 2] = pSrc[i + 2 * nIters];
7030 : pDst[4 * i + 3] = pSrc[i + 3 * nIters];
7031 : }
7032 : }
7033 :
7034 : #else
7035 :
7036 : #if defined(__GNUC__) && !defined(__clang__)
7037 : __attribute__((optimize("tree-vectorize")))
7038 : #endif
7039 : #if defined(__GNUC__)
7040 : __attribute__((noinline))
7041 : #endif
7042 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7043 : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
7044 : #pragma clang diagnostic push
7045 : #pragma clang diagnostic ignored "-Wpass-failed"
7046 : #endif
7047 30 : static void GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
7048 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
7049 : {
7050 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7051 : #pragma clang loop vectorize(enable)
7052 : #endif
7053 49620700 : for (size_t i = 0; i < nIters; ++i)
7054 : {
7055 49620600 : pDst[4 * i + 0] = pSrc[i + 0 * nIters];
7056 49620600 : pDst[4 * i + 1] = pSrc[i + 1 * nIters];
7057 49620600 : pDst[4 * i + 2] = pSrc[i + 2 * nIters];
7058 49620600 : pDst[4 * i + 3] = pSrc[i + 3 * nIters];
7059 : }
7060 30 : }
7061 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7062 : #pragma clang diagnostic pop
7063 : #endif
7064 :
7065 : #endif
7066 :
7067 : /************************************************************************/
7068 : /* GDALTranspose2D() */
7069 : /************************************************************************/
7070 :
7071 : /**
7072 : * Transpose a 2D array in a efficient (cache-oblivious) way.
7073 : *
7074 : * @param pSrc Source array of width = nSrcWidth and height = nSrcHeight.
7075 : * @param eSrcType Data type of pSrc.
7076 : * @param pDst Destination transposed array of width = nSrcHeight and height = nSrcWidth.
7077 : * @param eDstType Data type of pDst.
7078 : * @param nSrcWidth Width of pSrc array.
7079 : * @param nSrcHeight Height of pSrc array.
7080 : * @since GDAL 3.11
7081 : */
7082 :
7083 365 : void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
7084 : GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
7085 : {
7086 365 : if (eSrcType == eDstType && (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8))
7087 : {
7088 70 : if (nSrcHeight == 2)
7089 : {
7090 9 : GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
7091 : static_cast<uint8_t *>(pDst), nSrcWidth);
7092 9 : return;
7093 : }
7094 61 : if (nSrcHeight == 4)
7095 : {
7096 30 : GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
7097 : static_cast<uint8_t *>(pDst), nSrcWidth);
7098 30 : return;
7099 : }
7100 : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
7101 : (defined(__x86_64) || defined(_M_X64)))
7102 31 : if (CPLHaveRuntimeSSSE3())
7103 : {
7104 31 : GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
7105 : static_cast<uint8_t *>(pDst), nSrcWidth,
7106 : nSrcHeight);
7107 31 : return;
7108 : }
7109 : #elif defined(USE_NEON_OPTIMIZATIONS)
7110 : {
7111 : GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
7112 : static_cast<uint8_t *>(pDst), nSrcWidth,
7113 : nSrcHeight);
7114 : return;
7115 : }
7116 : #endif
7117 : }
7118 :
7119 : #define CALL_GDALTranspose2D_internal(DST_TYPE, DST_IS_COMPLEX) \
7120 : GDALTranspose2D<DST_TYPE, DST_IS_COMPLEX>( \
7121 : pSrc, eSrcType, static_cast<DST_TYPE *>(pDst), nSrcWidth, nSrcHeight)
7122 :
7123 : // clang-format off
7124 295 : switch (eDstType)
7125 : {
7126 15 : case GDT_UInt8: CALL_GDALTranspose2D_internal(uint8_t, false); break;
7127 15 : case GDT_Int8: CALL_GDALTranspose2D_internal(int8_t, false); break;
7128 33 : case GDT_UInt16: CALL_GDALTranspose2D_internal(uint16_t, false); break;
7129 20 : case GDT_Int16: CALL_GDALTranspose2D_internal(int16_t, false); break;
7130 24 : case GDT_UInt32: CALL_GDALTranspose2D_internal(uint32_t, false); break;
7131 16 : case GDT_Int32: CALL_GDALTranspose2D_internal(int32_t, false); break;
7132 16 : case GDT_UInt64: CALL_GDALTranspose2D_internal(uint64_t, false); break;
7133 16 : case GDT_Int64: CALL_GDALTranspose2D_internal(int64_t, false); break;
7134 16 : case GDT_Float16: CALL_GDALTranspose2D_internal(GFloat16, false); break;
7135 19 : case GDT_Float32: CALL_GDALTranspose2D_internal(float, false); break;
7136 25 : case GDT_Float64: CALL_GDALTranspose2D_internal(double, false); break;
7137 16 : case GDT_CInt16: CALL_GDALTranspose2D_internal(int16_t, true); break;
7138 16 : case GDT_CInt32: CALL_GDALTranspose2D_internal(int32_t, true); break;
7139 16 : case GDT_CFloat16: CALL_GDALTranspose2D_internal(GFloat16, true); break;
7140 16 : case GDT_CFloat32: CALL_GDALTranspose2D_internal(float, true); break;
7141 16 : case GDT_CFloat64: CALL_GDALTranspose2D_internal(double, true); break;
7142 0 : case GDT_Unknown:
7143 : case GDT_TypeCount:
7144 0 : break;
7145 : }
7146 : // clang-format on
7147 :
7148 : #undef CALL_GDALTranspose2D_internal
7149 : }
7150 :
7151 : /************************************************************************/
7152 : /* ExtractBitAndConvertTo255() */
7153 : /************************************************************************/
7154 :
7155 : #if defined(__GNUC__) || defined(_MSC_VER)
7156 : // Signedness of char implementation dependent, so be explicit.
7157 : // Assumes 2-complement integer types and sign extension of right shifting
7158 : // GCC guarantees such:
7159 : // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
7160 143590 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
7161 : {
7162 143590 : return static_cast<GByte>(static_cast<signed char>(byVal << (7 - nBit)) >>
7163 143590 : 7);
7164 : }
7165 : #else
7166 : // Portable way
7167 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
7168 : {
7169 : return (byVal & (1 << nBit)) ? 255 : 0;
7170 : }
7171 : #endif
7172 :
7173 : /************************************************************************/
7174 : /* ExpandEightPackedBitsToByteAt255() */
7175 : /************************************************************************/
7176 :
7177 17813 : static inline void ExpandEightPackedBitsToByteAt255(GByte byVal,
7178 : GByte abyOutput[8])
7179 : {
7180 17813 : abyOutput[0] = ExtractBitAndConvertTo255(byVal, 7);
7181 17813 : abyOutput[1] = ExtractBitAndConvertTo255(byVal, 6);
7182 17813 : abyOutput[2] = ExtractBitAndConvertTo255(byVal, 5);
7183 17813 : abyOutput[3] = ExtractBitAndConvertTo255(byVal, 4);
7184 17813 : abyOutput[4] = ExtractBitAndConvertTo255(byVal, 3);
7185 17813 : abyOutput[5] = ExtractBitAndConvertTo255(byVal, 2);
7186 17813 : abyOutput[6] = ExtractBitAndConvertTo255(byVal, 1);
7187 17813 : abyOutput[7] = ExtractBitAndConvertTo255(byVal, 0);
7188 17813 : }
7189 :
7190 : /************************************************************************/
7191 : /* GDALExpandPackedBitsToByteAt0Or255() */
7192 : /************************************************************************/
7193 :
7194 : /** Expand packed-bits (ordered from most-significant bit to least one)
7195 : into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
7196 : at 1 to a byte at 255.
7197 :
7198 : The function does (in a possibly more optimized way) the following:
7199 : \code{.cpp}
7200 : for (size_t i = 0; i < nInputBits; ++i )
7201 : {
7202 : pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 255 : 0;
7203 : }
7204 : \endcode
7205 :
7206 : @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
7207 : @param pabyOutput Output array of nInputBits bytes.
7208 : @param nInputBits Number of valid bits in pabyInput.
7209 :
7210 : @since 3.11
7211 : */
7212 :
7213 45357 : void GDALExpandPackedBitsToByteAt0Or255(const GByte *CPL_RESTRICT pabyInput,
7214 : GByte *CPL_RESTRICT pabyOutput,
7215 : size_t nInputBits)
7216 : {
7217 45357 : const size_t nInputWholeBytes = nInputBits / 8;
7218 45357 : size_t iByte = 0;
7219 :
7220 : #ifdef HAVE_SSE2
7221 : // Mask to isolate each bit
7222 45357 : const __m128i bit_mask = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4,
7223 : 8, 16, 32, 64, -128);
7224 45357 : const __m128i zero = _mm_setzero_si128();
7225 45357 : const __m128i all_ones = _mm_set1_epi8(-1);
7226 : #ifdef __SSSE3__
7227 : const __m128i dispatch_two_bytes =
7228 : _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
7229 : #endif
7230 45357 : constexpr size_t SSE_REG_SIZE = sizeof(bit_mask);
7231 135866 : for (; iByte + SSE_REG_SIZE <= nInputWholeBytes; iByte += SSE_REG_SIZE)
7232 : {
7233 90509 : __m128i reg_ori = _mm_loadu_si128(
7234 90509 : reinterpret_cast<const __m128i *>(pabyInput + iByte));
7235 :
7236 90509 : constexpr int NUM_PROCESSED_BYTES_PER_REG = 2;
7237 814581 : for (size_t k = 0; k < SSE_REG_SIZE / NUM_PROCESSED_BYTES_PER_REG; ++k)
7238 : {
7239 : // Given reg_ori = (A, B, ... 14 other bytes ...),
7240 : // expand to (A, A, A, A, A, A, A, A, B, B, B, B, B, B, B, B)
7241 : #ifdef __SSSE3__
7242 : __m128i reg = _mm_shuffle_epi8(reg_ori, dispatch_two_bytes);
7243 : #else
7244 724072 : __m128i reg = _mm_unpacklo_epi8(reg_ori, reg_ori);
7245 724072 : reg = _mm_unpacklo_epi16(reg, reg);
7246 724072 : reg = _mm_unpacklo_epi32(reg, reg);
7247 : #endif
7248 :
7249 : // Test if bits of interest are set
7250 724072 : reg = _mm_and_si128(reg, bit_mask);
7251 :
7252 : // Now test if those bits are set, by comparing to zero. So the
7253 : // result will be that bytes where bits are set will be at 0, and
7254 : // ones where they are cleared will be at 0xFF. So the inverse of
7255 : // the end result we want!
7256 724072 : reg = _mm_cmpeq_epi8(reg, zero);
7257 :
7258 : // Invert the result
7259 724072 : reg = _mm_andnot_si128(reg, all_ones);
7260 :
7261 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyOutput), reg);
7262 :
7263 724072 : pabyOutput += SSE_REG_SIZE;
7264 :
7265 : // Right-shift of 2 bytes
7266 724072 : reg_ori = _mm_bsrli_si128(reg_ori, NUM_PROCESSED_BYTES_PER_REG);
7267 : }
7268 : }
7269 :
7270 : #endif // HAVE_SSE2
7271 :
7272 63170 : for (; iByte < nInputWholeBytes; ++iByte)
7273 : {
7274 17813 : ExpandEightPackedBitsToByteAt255(pabyInput[iByte], pabyOutput);
7275 17813 : pabyOutput += 8;
7276 : }
7277 46443 : for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
7278 : {
7279 1086 : *pabyOutput = ExtractBitAndConvertTo255(pabyInput[iByte], 7 - iBit);
7280 1086 : ++pabyOutput;
7281 : }
7282 45357 : }
7283 :
7284 : /************************************************************************/
7285 : /* ExpandEightPackedBitsToByteAt1() */
7286 : /************************************************************************/
7287 :
7288 136113 : static inline void ExpandEightPackedBitsToByteAt1(GByte byVal,
7289 : GByte abyOutput[8])
7290 : {
7291 136113 : abyOutput[0] = (byVal >> 7) & 0x1;
7292 136113 : abyOutput[1] = (byVal >> 6) & 0x1;
7293 136113 : abyOutput[2] = (byVal >> 5) & 0x1;
7294 136113 : abyOutput[3] = (byVal >> 4) & 0x1;
7295 136113 : abyOutput[4] = (byVal >> 3) & 0x1;
7296 136113 : abyOutput[5] = (byVal >> 2) & 0x1;
7297 136113 : abyOutput[6] = (byVal >> 1) & 0x1;
7298 136113 : abyOutput[7] = (byVal >> 0) & 0x1;
7299 136113 : }
7300 :
7301 : /************************************************************************/
7302 : /* GDALExpandPackedBitsToByteAt0Or1() */
7303 : /************************************************************************/
7304 :
7305 : /** Expand packed-bits (ordered from most-significant bit to least one)
7306 : into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
7307 : at 1 to a byte at 1.
7308 :
7309 : The function does (in a possibly more optimized way) the following:
7310 : \code{.cpp}
7311 : for (size_t i = 0; i < nInputBits; ++i )
7312 : {
7313 : pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
7314 : }
7315 : \endcode
7316 :
7317 : @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
7318 : @param pabyOutput Output array of nInputBits bytes.
7319 : @param nInputBits Number of valid bits in pabyInput.
7320 :
7321 : @since 3.11
7322 : */
7323 :
7324 7033 : void GDALExpandPackedBitsToByteAt0Or1(const GByte *CPL_RESTRICT pabyInput,
7325 : GByte *CPL_RESTRICT pabyOutput,
7326 : size_t nInputBits)
7327 : {
7328 7033 : const size_t nInputWholeBytes = nInputBits / 8;
7329 7033 : size_t iByte = 0;
7330 143146 : for (; iByte < nInputWholeBytes; ++iByte)
7331 : {
7332 136113 : ExpandEightPackedBitsToByteAt1(pabyInput[iByte], pabyOutput);
7333 136113 : pabyOutput += 8;
7334 : }
7335 18886 : for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
7336 : {
7337 11853 : *pabyOutput = (pabyInput[iByte] >> (7 - iBit)) & 0x1;
7338 11853 : ++pabyOutput;
7339 : }
7340 7033 : }
|