Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: GDAL Core
4 : * Purpose: Contains default implementation of GDALRasterBand::IRasterIO()
5 : * and supporting functions of broader utility.
6 : * Author: Frank Warmerdam, warmerdam@pobox.com
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 1998, Frank Warmerdam
10 : * Copyright (c) 2007-2014, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * SPDX-License-Identifier: MIT
13 : ****************************************************************************/
14 :
15 : #include "cpl_port.h"
16 : #include "gdal.h"
17 : #include "gdal_priv.h"
18 :
19 : #include <cassert>
20 : #include <climits>
21 : #include <cmath>
22 : #include <cstddef>
23 : #include <cstdio>
24 : #include <cstdlib>
25 : #include <cstring>
26 :
27 : #include <algorithm>
28 : #include <limits>
29 : #include <stdexcept>
30 : #include <type_traits>
31 :
32 : #include "cpl_conv.h"
33 : #include "cpl_cpu_features.h"
34 : #include "cpl_error.h"
35 : #include "cpl_float.h"
36 : #include "cpl_progress.h"
37 : #include "cpl_string.h"
38 : #include "cpl_vsi.h"
39 : #include "gdal_priv_templates.hpp"
40 : #include "gdal_vrt.h"
41 : #include "gdalwarper.h"
42 : #include "memdataset.h"
43 : #include "vrtdataset.h"
44 :
45 : #if defined(__x86_64) || defined(_M_X64)
46 : #include <emmintrin.h>
47 : #include <immintrin.h>
48 : #define HAVE_SSE2
49 : // AVX2 dispatch: compile AVX2 code with target attribute, detect at runtime
50 : #if defined(__GNUC__) || defined(__clang__)
51 : #define HAVE_AVX2_DISPATCH
52 : #elif defined(_MSC_VER)
53 : #include <intrin.h>
54 : #define HAVE_AVX2_DISPATCH
55 : #define HAVE_AVX2_DISPATCH_MSVC
56 : #elif defined(__AVX2__)
57 : #define HAVE_AVX2_NATIVELY
58 : #endif
59 : #elif defined(USE_NEON_OPTIMIZATIONS)
60 : #include "include_sse2neon.h"
61 : #define HAVE_SSE2
62 : #endif
63 :
64 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
65 : #include "rasterio_ssse3.h"
66 : #ifdef __SSSE3__
67 : #include <tmmintrin.h>
68 : #endif
69 : #endif
70 :
71 : #ifdef __SSE4_1__
72 : #include <smmintrin.h>
73 : #endif
74 :
75 : #ifdef __GNUC__
76 : #define CPL_NOINLINE __attribute__((noinline))
77 : #else
78 : #define CPL_NOINLINE
79 : #endif
80 :
// Forward declaration of the strided byte copy helper. It is called by
// DownsamplingIntegerXFactor() for 1-byte pixels with a source stride, a
// destination stride and a word count; its definition is not part of the
// lines shown here.
81 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
82 : int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
83 : int nDstPixelStride, GPtrDiff_t nWordCount);
84 :
85 : /************************************************************************/
86 : /* DownsamplingIntegerXFactor() */
87 : /************************************************************************/
88 :
// Fills one row of the destination buffer by picking one source pixel every
// nSrcXInc columns of poBand, starting at column iSrcX, and reloading the
// locked source block whenever the sampled column leaves the block currently
// held in poBlock.
//
// Template parameters:
//   bSameDataType  - when true, pixels are copied verbatim (memcpy, or
//                    GDALFastCopyByte() when DATA_TYPE_SIZE is 1) and the
//                    pixel size is DATA_TYPE_SIZE. When false, the pixel size
//                    comes from GDALGetDataTypeSizeBytes(eDataType) and
//                    GDALCopyWords64() converts from eDataType to eBufType.
//   DATA_TYPE_SIZE - pixel size in bytes; only used when bSameDataType is true.
//
// Parameters:
//   poBand        - band whose blocks are fetched with GetLockedBlockRef().
//   iSrcX         - source column of the first output pixel.
//   nSrcXInc      - number of source columns between two output pixels.
//   iSrcOffsetCst - offset, in pixels, added to the in-block column before
//                   scaling by the pixel size (IRasterIO() passes the in-block
//                   row index multiplied by nBlockXSize).
//   pabyDstData   - where the first output pixel is written; advanced by
//                   nPixelSpace bytes for each following output pixel.
//   nPixelSpace   - byte distance between consecutive output pixels.
//   nBufXSize     - number of output pixels to produce.
//   eDataType     - source pixel type (only used when bSameDataType is false).
//   eBufType      - destination pixel type (only used when bSameDataType is
//                   false).
//   nStartBlockX  - [in,out] first column of the block held in poBlock;
//                   rewritten each time a block is reloaded.
//   nBlockXSize   - block width in pixels.
//   poBlock       - [in,out] currently locked block. It must be non-null when
//                   iSrcX < nStartBlockX + nBlockXSize on entry. On reload the
//                   previous lock is dropped and a new block is locked. The
//                   lock is not released by this function on success.
//   nLBlockY      - block row index passed to GetLockedBlockRef().
//
// Returns true on success, false as soon as GetLockedBlockRef() returns
// nullptr (poBlock is nullptr in that case).
//
// Control flow note: the body of the while loop is entered through the
// reload_block / no_reload_block labels for the first output pixel, and
// entered again the same way after the loop for the last output pixel.
89 : template <bool bSameDataType, int DATA_TYPE_SIZE>
90 695780 : static bool DownsamplingIntegerXFactor(
91 : GDALRasterBand *poBand, int iSrcX, int nSrcXInc, GPtrDiff_t iSrcOffsetCst,
92 : GByte *CPL_RESTRICT pabyDstData, int nPixelSpace, int nBufXSize,
93 : GDALDataType eDataType, GDALDataType eBufType, int &nStartBlockX,
94 : int nBlockXSize, GDALRasterBlock *&poBlock, int nLBlockY)
95 : {
96 695780 : const int nBandDataSize =
97 : bSameDataType ? DATA_TYPE_SIZE : GDALGetDataTypeSizeBytes(eDataType);
// Output pixels still to produce after the current one. The last output
// pixel is kept out of the while loop and handled after it.
98 695780 : int nOuterLoopIters = nBufXSize - 1;
// Byte distance between two sampled source pixels.
99 695780 : const int nIncSrcOffset = nSrcXInc * nBandDataSize;
100 : const GByte *CPL_RESTRICT pabySrcData;
101 695780 : int nEndBlockX = nBlockXSize + nStartBlockX;
102 :
// First output pixel: jump straight into the loop body, reusing the block
// already held in poBlock when iSrcX falls inside it.
103 695780 : if (iSrcX < nEndBlockX)
104 : {
105 294999 : CPLAssert(poBlock);
106 294999 : goto no_reload_block;
107 : }
108 400781 : goto reload_block;
109 :
110 : // Don't do the last iteration in the loop, as iSrcX might go beyond
111 : // nRasterXSize - 1
112 1264973 : while (--nOuterLoopIters >= 1)
113 : {
114 201834 : iSrcX += nSrcXInc;
115 201834 : pabySrcData += nIncSrcOffset;
116 201834 : pabyDstData += nPixelSpace;
117 :
118 : /* --------------------------------------------------------------------
119 : */
120 : /* Ensure we have the appropriate block loaded. */
121 : /* --------------------------------------------------------------------
122 : */
123 201834 : if (iSrcX >= nEndBlockX)
124 : {
125 201834 : reload_block:
126 : {
127 615205 : const int nLBlockX = iSrcX / nBlockXSize;
128 615205 : nStartBlockX = nLBlockX * nBlockXSize;
129 615205 : nEndBlockX = nStartBlockX + nBlockXSize;
130 :
131 615205 : if (poBlock != nullptr)
132 341376 : poBlock->DropLock();
133 :
134 615205 : poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
135 615205 : if (poBlock == nullptr)
136 : {
137 1 : return false;
138 : }
139 : }
140 :
// Recompute the source pointer from the block base. This is reached after a
// reload, and also directly by goto for the first and last output pixels.
141 615204 : no_reload_block:
142 : const GByte *pabySrcBlock =
143 1264973 : static_cast<const GByte *>(poBlock->GetDataRef());
144 1264973 : GPtrDiff_t iSrcOffset =
145 1264973 : (iSrcX - nStartBlockX + iSrcOffsetCst) * nBandDataSize;
146 1264973 : pabySrcData = pabySrcBlock + iSrcOffset;
147 : }
148 :
149 : /* --------------------------------------------------------------------
150 : */
151 : /* Copy the maximum run of pixels. */
152 : /* --------------------------------------------------------------------
153 : */
154 :
// Number of sampled pixels that still fall inside the current block (rounded
// up), capped by the number of pixels left to the main loop.
155 1264973 : const int nIters = std::min(
156 1264973 : (nEndBlockX - iSrcX + (nSrcXInc - 1)) / nSrcXInc, nOuterLoopIters);
157 : if (bSameDataType)
158 : {
// The current pixel is always copied, even when nIters is 0.
159 1264530 : memcpy(pabyDstData, pabySrcData, nBandDataSize);
160 1264530 : if (nIters > 1)
161 : {
162 : if (DATA_TYPE_SIZE == 1)
163 : {
// Step past the pixel copied above, hand the nIters - 1 following pixels to
// GDALFastCopyByte(), then leave both pointers on the last pixel of the run.
164 326250 : pabySrcData += nIncSrcOffset;
165 326250 : pabyDstData += nPixelSpace;
166 326250 : GDALFastCopyByte(pabySrcData, nIncSrcOffset, pabyDstData,
167 326250 : nPixelSpace, nIters - 1);
168 326250 : pabySrcData +=
169 326250 : static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 2);
170 326250 : pabyDstData +=
171 326250 : static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 2);
172 : }
173 : else
174 : {
175 4395716 : for (int i = 0; i < nIters - 1; i++)
176 : {
177 4197550 : pabySrcData += nIncSrcOffset;
178 4197550 : pabyDstData += nPixelSpace;
179 4197550 : memcpy(pabyDstData, pabySrcData, nBandDataSize);
180 : }
181 : }
// Account for the nIters - 1 extra pixels consumed in this pass.
182 524420 : iSrcX += nSrcXInc * (nIters - 1);
183 524420 : nOuterLoopIters -= nIters - 1;
184 : }
185 : }
186 : else
187 : {
188 : // Type to type conversion ...
// nIters can be 0 here (nOuterLoopIters is 0 on the last-pixel path), but one
// pixel must still be converted, hence std::max(1, nIters).
189 443 : GDALCopyWords64(pabySrcData, eDataType, nIncSrcOffset, pabyDstData,
190 443 : eBufType, nPixelSpace, std::max(1, nIters));
191 443 : if (nIters > 1)
192 : {
193 216 : pabySrcData +=
194 216 : static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 1);
195 216 : pabyDstData +=
196 216 : static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 1);
197 216 : iSrcX += nSrcXInc * (nIters - 1);
198 216 : nOuterLoopIters -= nIters - 1;
199 : }
200 : }
201 : }
202 :
203 : // Handle the last output pixel separately so that iSrcX never goes beyond nRasterXSize - 1
// nOuterLoopIters is exactly 0 only the first time the loop exits; after the
// last pixel has been processed it is negative and the function returns.
204 1063139 : if (nOuterLoopIters == 0)
205 : {
206 367360 : const int nRasterXSize = poBand->GetXSize();
207 367360 : iSrcX =
208 734720 : static_cast<int>(std::min(static_cast<GInt64>(iSrcX) + nSrcXInc,
209 367360 : static_cast<GInt64>(nRasterXSize - 1)));
210 367360 : pabyDstData += nPixelSpace;
211 367360 : if (iSrcX < nEndBlockX)
212 : {
213 354770 : goto no_reload_block;
214 : }
215 12590 : goto reload_block;
216 : }
217 695779 : return true;
218 : }
219 :
// Returns a * b, with the result type deduced from the operands.
// The function is tagged with CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW, so the
// multiplication is presumably exempt from the unsigned-integer-overflow
// sanitizer check (the macro is defined outside this listing - confirm).
// IRasterIO() uses it to compute a span size in bytes as a size_t.
220 : template <class A, class B>
221 2818450 : CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW inline auto CPLUnsanitizedMul(A a, B b)
222 : {
223 2818450 : return a * b;
224 : }
225 :
226 : /************************************************************************/
227 : /* IRasterIO() */
228 : /* */
229 : /* Default internal implementation of RasterIO() ... utilizes */
230 : /* the Block access methods to satisfy the request. This would */
231 : /* normally only be overridden by formats with overviews. */
232 : /************************************************************************/
233 :
234 6180460 : CPLErr GDALRasterBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
235 : int nXSize, int nYSize, void *pData,
236 : int nBufXSize, int nBufYSize,
237 : GDALDataType eBufType, GSpacing nPixelSpace,
238 : GSpacing nLineSpace,
239 : GDALRasterIOExtraArg *psExtraArg)
240 :
241 : {
242 6180460 : if (eRWFlag == GF_Write && eFlushBlockErr != CE_None)
243 : {
244 0 : CPLError(eFlushBlockErr, CPLE_AppDefined,
245 : "An error occurred while writing a dirty block "
246 : "from GDALRasterBand::IRasterIO");
247 0 : CPLErr eErr = eFlushBlockErr;
248 0 : eFlushBlockErr = CE_None;
249 0 : return eErr;
250 : }
251 6180460 : if (nBlockXSize <= 0 || nBlockYSize <= 0)
252 : {
253 0 : CPLError(CE_Failure, CPLE_AppDefined, "Invalid block size");
254 0 : return CE_Failure;
255 : }
256 :
257 6180460 : const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
258 6180460 : const int nBufDataSize = GDALGetDataTypeSizeBytes(eBufType);
259 6180460 : GByte dummyBlock[2] = {0, 0};
260 6180460 : GByte *pabySrcBlock =
261 : dummyBlock; /* to avoid Coverity warning about nullptr dereference */
262 6180460 : GDALRasterBlock *poBlock = nullptr;
263 6180460 : const bool bUseIntegerRequestCoords =
264 6545380 : (!psExtraArg->bFloatingPointWindowValidity ||
265 364919 : (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
266 339987 : nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
267 :
268 : /* ==================================================================== */
269 : /* A common case is the data requested with the destination */
270 : /* is packed, and the block width is the raster width. */
271 : /* ==================================================================== */
272 6088650 : if (nPixelSpace == nBufDataSize && nLineSpace == nPixelSpace * nXSize &&
273 3234250 : nBlockXSize == GetXSize() && nBufXSize == nXSize &&
274 12269100 : nBufYSize == nYSize && bUseIntegerRequestCoords)
275 : {
276 3096600 : CPLErr eErr = CE_None;
277 3096600 : int nLBlockY = -1;
278 :
279 9750600 : for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
280 : {
281 6655090 : const int iSrcY = iBufYOff + nYOff;
282 :
283 6655090 : if (iSrcY < nLBlockY * nBlockYSize ||
284 6655090 : iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
285 : {
286 3365560 : nLBlockY = iSrcY / nBlockYSize;
287 3365560 : bool bJustInitialize =
288 297352 : eRWFlag == GF_Write && nXOff == 0 &&
289 3720830 : nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
290 57919 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize;
291 :
292 : // Is this a partial tile at right and/or bottom edges of
293 : // the raster, and that is going to be completely written?
294 : // If so, do not load it from storage, but zero it so that
295 : // the content outside of the validity area is initialized.
296 3365560 : bool bMemZeroBuffer = false;
297 297352 : if (eRWFlag == GF_Write && !bJustInitialize && nXOff == 0 &&
298 24975 : nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
299 3663000 : nYOff + nYSize == GetYSize() &&
300 89 : nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)
301 : {
302 89 : bJustInitialize = true;
303 89 : bMemZeroBuffer = true;
304 : }
305 :
306 3365560 : if (poBlock)
307 268959 : poBlock->DropLock();
308 :
309 3365560 : const GUInt32 nErrorCounter = CPLGetErrorCounter();
310 3365560 : poBlock = GetLockedBlockRef(0, nLBlockY, bJustInitialize);
311 3365560 : if (poBlock == nullptr)
312 : {
313 1078 : if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
314 : nullptr)
315 : {
316 0 : CPLError(CE_Failure, CPLE_AppDefined,
317 : "GetBlockRef failed at X block offset %d, "
318 : "Y block offset %d%s",
319 : 0, nLBlockY,
320 0 : (nErrorCounter != CPLGetErrorCounter())
321 0 : ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
322 : : "");
323 : }
324 1078 : eErr = CE_Failure;
325 1078 : break;
326 : }
327 :
328 3364480 : if (eRWFlag == GF_Write)
329 297352 : poBlock->MarkDirty();
330 :
331 3364480 : pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
332 3364480 : if (bMemZeroBuffer)
333 : {
334 89 : memset(pabySrcBlock, 0,
335 89 : static_cast<GPtrDiff_t>(nBandDataSize) *
336 89 : nBlockXSize * nBlockYSize);
337 : }
338 : }
339 :
340 6654010 : const auto nSrcByteOffset =
341 6654010 : (static_cast<GPtrDiff_t>(iSrcY - nLBlockY * nBlockYSize) *
342 6654010 : nBlockXSize +
343 6654010 : nXOff) *
344 6654010 : nBandDataSize;
345 :
346 6654010 : if (eDataType == eBufType)
347 : {
348 2990760 : if (eRWFlag == GF_Read)
349 2518250 : memcpy(static_cast<GByte *>(pData) +
350 2518250 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
351 2518250 : pabySrcBlock + nSrcByteOffset,
352 : static_cast<size_t>(nLineSpace));
353 : else
354 472505 : memcpy(pabySrcBlock + nSrcByteOffset,
355 472505 : static_cast<GByte *>(pData) +
356 472505 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
357 : static_cast<size_t>(nLineSpace));
358 : }
359 : else
360 : {
361 : // Type to type conversion.
362 3663250 : if (eRWFlag == GF_Read)
363 3641190 : GDALCopyWords64(
364 3641190 : pabySrcBlock + nSrcByteOffset, eDataType, nBandDataSize,
365 : static_cast<GByte *>(pData) +
366 3641190 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
367 : eBufType, static_cast<int>(nPixelSpace), nBufXSize);
368 : else
369 22065 : GDALCopyWords64(static_cast<GByte *>(pData) +
370 22065 : static_cast<GPtrDiff_t>(iBufYOff) *
371 : nLineSpace,
372 : eBufType, static_cast<int>(nPixelSpace),
373 22065 : pabySrcBlock + nSrcByteOffset, eDataType,
374 : nBandDataSize, nBufXSize);
375 : }
376 :
377 6741880 : if (psExtraArg->pfnProgress != nullptr &&
378 87868 : !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
379 : psExtraArg->pProgressData))
380 : {
381 5 : eErr = CE_Failure;
382 5 : break;
383 : }
384 : }
385 :
386 3096600 : if (poBlock)
387 3095520 : poBlock->DropLock();
388 :
389 3096600 : return eErr;
390 : }
391 :
392 : /* ==================================================================== */
393 : /* Do we have overviews that would be appropriate to satisfy */
394 : /* this request? */
395 : /* ==================================================================== */
396 3083860 : if ((nBufXSize < nXSize || nBufYSize < nYSize) && GetOverviewCount() > 0 &&
397 : eRWFlag == GF_Read)
398 : {
399 : GDALRasterIOExtraArg sExtraArg;
400 2967 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
401 :
402 : const int nOverview =
403 2967 : GDALBandGetBestOverviewLevel2(this, nXOff, nYOff, nXSize, nYSize,
404 : nBufXSize, nBufYSize, &sExtraArg);
405 2967 : if (nOverview >= 0)
406 : {
407 2892 : GDALRasterBand *poOverviewBand = GetOverview(nOverview);
408 2892 : if (poOverviewBand == nullptr)
409 2892 : return CE_Failure;
410 :
411 2892 : return poOverviewBand->RasterIO(
412 : eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
413 2892 : nBufYSize, eBufType, nPixelSpace, nLineSpace, &sExtraArg);
414 : }
415 : }
416 :
417 891491 : if (eRWFlag == GF_Read && nBufXSize < nXSize / 100 &&
418 6 : nBufYSize < nYSize / 100 && nPixelSpace == nBufDataSize &&
419 3972470 : nLineSpace == nPixelSpace * nBufXSize &&
420 6 : CPLTestBool(CPLGetConfigOption("GDAL_NO_COSTLY_OVERVIEW", "NO")))
421 : {
422 0 : memset(pData, 0, static_cast<size_t>(nLineSpace * nBufYSize));
423 0 : return CE_None;
424 : }
425 :
426 : /* ==================================================================== */
427 : /* The second case when we don't need subsample data but likely */
428 : /* need data type conversion. */
429 : /* ==================================================================== */
430 3080970 : if ( // nPixelSpace == nBufDataSize &&
431 3080970 : nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
432 : {
433 : #if DEBUG_VERBOSE
434 : printf("IRasterIO(%d,%d,%d,%d) rw=%d case 2\n", /*ok*/
435 : nXOff, nYOff, nXSize, nYSize, static_cast<int>(eRWFlag));
436 : #endif
437 :
438 : /* --------------------------------------------------------------------
439 : */
440 : /* Loop over buffer computing source locations. */
441 : /* --------------------------------------------------------------------
442 : */
443 : // Calculate starting values out of loop
444 2503120 : const int nLBlockXStart = nXOff / nBlockXSize;
445 2503120 : const int nXSpanEnd = nBufXSize + nXOff;
446 :
447 2503120 : int nYInc = 0;
448 5046990 : for (int iBufYOff = 0, iSrcY = nYOff; iBufYOff < nBufYSize;
449 2543870 : iBufYOff += nYInc, iSrcY += nYInc)
450 : {
451 2543940 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
452 : static_cast<GPtrDiff_t>(nLineSpace);
453 2543940 : int nLBlockY = iSrcY / nBlockYSize;
454 2543940 : int nLBlockX = nLBlockXStart;
455 2543940 : int iSrcX = nXOff;
456 5362320 : while (iSrcX < nXSpanEnd)
457 : {
458 2818450 : int nXSpan = nLBlockX * nBlockXSize;
459 2818450 : if (nXSpan < INT_MAX - nBlockXSize)
460 2818450 : nXSpan += nBlockXSize;
461 : else
462 0 : nXSpan = INT_MAX;
463 2818450 : const int nXRight = nXSpan;
464 2818450 : nXSpan = (nXSpan < nXSpanEnd ? nXSpan : nXSpanEnd) - iSrcX;
465 :
466 : const size_t nXSpanSize =
467 2818450 : CPLUnsanitizedMul(nXSpan, static_cast<size_t>(nPixelSpace));
468 :
469 2818450 : bool bJustInitialize =
470 2042970 : eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
471 38035 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
472 4887790 : nXOff <= nLBlockX * nBlockXSize &&
473 26364 : nXOff + nXSize >= nXRight;
474 :
475 : // Is this a partial tile at right and/or bottom edges of
476 : // the raster, and that is going to be completely written?
477 : // If so, do not load it from storage, but zero it so that
478 : // the content outside of the validity area is initialized.
479 2818450 : bool bMemZeroBuffer = false;
480 2042970 : if (eRWFlag == GF_Write && !bJustInitialize &&
481 2017850 : nXOff <= nLBlockX * nBlockXSize &&
482 2016190 : nYOff <= nLBlockY * nBlockYSize &&
483 12145 : (nXOff + nXSize >= nXRight ||
484 : // cppcheck-suppress knownConditionTrueFalse
485 4864140 : (nXOff + nXSize == GetXSize() && nXRight > GetXSize())) &&
486 11965 : (nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize ||
487 10743 : (nYOff + nYSize == GetYSize() &&
488 1951 : nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)))
489 : {
490 3173 : bJustInitialize = true;
491 3173 : bMemZeroBuffer = true;
492 : }
493 :
494 : /* --------------------------------------------------------------------
495 : */
496 : /* Ensure we have the appropriate block loaded. */
497 : /* --------------------------------------------------------------------
498 : */
499 2818450 : const GUInt32 nErrorCounter = CPLGetErrorCounter();
500 2818450 : poBlock =
501 2818450 : GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
502 2818450 : if (!poBlock)
503 : {
504 73 : if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
505 : nullptr)
506 : {
507 0 : CPLError(CE_Failure, CPLE_AppDefined,
508 : "GetBlockRef failed at X block offset %d, "
509 : "Y block offset %d%s",
510 : nLBlockX, nLBlockY,
511 0 : (nErrorCounter != CPLGetErrorCounter())
512 0 : ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
513 : : "");
514 : }
515 73 : return (CE_Failure);
516 : }
517 :
518 2818380 : if (eRWFlag == GF_Write)
519 2042970 : poBlock->MarkDirty();
520 :
521 2818380 : pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
522 2818380 : if (bMemZeroBuffer)
523 : {
524 3173 : memset(pabySrcBlock, 0,
525 3173 : static_cast<GPtrDiff_t>(nBandDataSize) *
526 3173 : nBlockXSize * nBlockYSize);
527 : }
528 : /* --------------------------------------------------------------------
529 : */
530 : /* Copy over this chunk of data. */
531 : /* --------------------------------------------------------------------
532 : */
533 2818380 : GPtrDiff_t iSrcOffset =
534 2818380 : (static_cast<GPtrDiff_t>(iSrcX) -
535 2818380 : static_cast<GPtrDiff_t>(nLBlockX * nBlockXSize) +
536 2818380 : (static_cast<GPtrDiff_t>(iSrcY) -
537 2818380 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
538 2818380 : nBlockXSize) *
539 2818380 : nBandDataSize;
540 : // Fill up as many rows as possible for the loaded block.
541 5636750 : const int kmax = std::min(nBlockYSize - (iSrcY % nBlockYSize),
542 2818380 : nBufYSize - iBufYOff);
543 60959400 : for (int k = 0; k < kmax; k++)
544 : {
545 58141000 : if (eDataType == eBufType && nPixelSpace == nBufDataSize)
546 : {
547 53739300 : if (eRWFlag == GF_Read)
548 49301200 : memcpy(static_cast<GByte *>(pData) + iBufOffset +
549 49301200 : static_cast<GPtrDiff_t>(k) * nLineSpace,
550 49301200 : pabySrcBlock + iSrcOffset, nXSpanSize);
551 : else
552 4438130 : memcpy(pabySrcBlock + iSrcOffset,
553 4438130 : static_cast<GByte *>(pData) + iBufOffset +
554 4438130 : static_cast<GPtrDiff_t>(k) * nLineSpace,
555 : nXSpanSize);
556 : }
557 : else
558 : {
559 : /* type to type conversion */
560 4401720 : if (eRWFlag == GF_Read)
561 4251510 : GDALCopyWords64(
562 4251510 : pabySrcBlock + iSrcOffset, eDataType,
563 : nBandDataSize,
564 4251510 : static_cast<GByte *>(pData) + iBufOffset +
565 4251510 : static_cast<GPtrDiff_t>(k) * nLineSpace,
566 : eBufType, static_cast<int>(nPixelSpace),
567 : nXSpan);
568 : else
569 150209 : GDALCopyWords64(
570 150209 : static_cast<GByte *>(pData) + iBufOffset +
571 150209 : static_cast<GPtrDiff_t>(k) * nLineSpace,
572 : eBufType, static_cast<int>(nPixelSpace),
573 150209 : pabySrcBlock + iSrcOffset, eDataType,
574 : nBandDataSize, nXSpan);
575 : }
576 :
577 58141000 : iSrcOffset +=
578 58141000 : static_cast<GPtrDiff_t>(nBlockXSize) * nBandDataSize;
579 : }
580 :
581 : iBufOffset =
582 2818380 : CPLUnsanitizedAdd<GPtrDiff_t>(iBufOffset, nXSpanSize);
583 2818380 : nLBlockX++;
584 2818380 : iSrcX += nXSpan;
585 :
586 2818380 : poBlock->DropLock();
587 2818380 : poBlock = nullptr;
588 : }
589 :
590 : /* Compute the increment to go on a block boundary */
591 2543870 : nYInc = nBlockYSize - (iSrcY % nBlockYSize);
592 :
593 2545760 : if (psExtraArg->pfnProgress != nullptr &&
594 1884 : !psExtraArg->pfnProgress(
595 2545760 : 1.0 * std::min(nBufYSize, iBufYOff + nYInc) / nBufYSize, "",
596 : psExtraArg->pProgressData))
597 : {
598 0 : return CE_Failure;
599 : }
600 : }
601 :
602 2503040 : return CE_None;
603 : }
604 :
605 : /* ==================================================================== */
606 : /* Loop reading required source blocks to satisfy output */
607 : /* request. This is the most general implementation. */
608 : /* ==================================================================== */
609 :
610 577855 : double dfXOff = nXOff;
611 577855 : double dfYOff = nYOff;
612 577855 : double dfXSize = nXSize;
613 577855 : double dfYSize = nYSize;
614 577855 : if (psExtraArg->bFloatingPointWindowValidity)
615 : {
616 242949 : dfXOff = psExtraArg->dfXOff;
617 242949 : dfYOff = psExtraArg->dfYOff;
618 242949 : dfXSize = psExtraArg->dfXSize;
619 242949 : dfYSize = psExtraArg->dfYSize;
620 : }
621 :
622 : /* -------------------------------------------------------------------- */
623 : /* Compute stepping increment. */
624 : /* -------------------------------------------------------------------- */
625 577855 : const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
626 577855 : const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
627 577855 : CPLErr eErr = CE_None;
628 :
629 577855 : if (eRWFlag == GF_Write)
630 : {
631 : /* --------------------------------------------------------------------
632 : */
633 : /* Write case */
634 : /* Loop over raster window computing source locations in the buffer.
635 : */
636 : /* --------------------------------------------------------------------
637 : */
638 166655 : GByte *pabyDstBlock = nullptr;
639 166655 : int nLBlockX = -1;
640 166655 : int nLBlockY = -1;
641 :
642 1260010 : for (int iDstY = nYOff; iDstY < nYOff + nYSize; iDstY++)
643 : {
644 1093360 : const int iBufYOff = static_cast<int>((iDstY - nYOff) / dfSrcYInc);
645 :
646 12384200 : for (int iDstX = nXOff; iDstX < nXOff + nXSize; iDstX++)
647 : {
648 11290800 : const int iBufXOff =
649 11290800 : static_cast<int>((iDstX - nXOff) / dfSrcXInc);
650 11290800 : GPtrDiff_t iBufOffset =
651 11290800 : static_cast<GPtrDiff_t>(iBufYOff) *
652 : static_cast<GPtrDiff_t>(nLineSpace) +
653 11290800 : iBufXOff * static_cast<GPtrDiff_t>(nPixelSpace);
654 :
655 : // FIXME: this code likely doesn't work if the dirty block gets
656 : // flushed to disk before being completely written.
657 : // In the meantime, bJustInitialize should probably be set to
658 : // FALSE even if it is not ideal performance wise, and for
659 : // lossy compression.
660 :
661 : /* --------------------------------------------------------------------
662 : */
663 : /* Ensure we have the appropriate block loaded. */
664 : /* --------------------------------------------------------------------
665 : */
666 11290800 : if (iDstX < nLBlockX * nBlockXSize ||
667 11041500 : iDstX - nBlockXSize >= nLBlockX * nBlockXSize ||
668 10584800 : iDstY < nLBlockY * nBlockYSize ||
669 10584800 : iDstY - nBlockYSize >= nLBlockY * nBlockYSize)
670 : {
671 738702 : nLBlockX = iDstX / nBlockXSize;
672 738702 : nLBlockY = iDstY / nBlockYSize;
673 :
674 738702 : const bool bJustInitialize =
675 1065990 : nYOff <= nLBlockY * nBlockYSize &&
676 327291 : nYOff + nYSize - nBlockYSize >=
677 327291 : nLBlockY * nBlockYSize &&
678 1116320 : nXOff <= nLBlockX * nBlockXSize &&
679 50325 : nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
680 : /*bool bMemZeroBuffer = FALSE;
681 : if( !bJustInitialize &&
682 : nXOff <= nLBlockX * nBlockXSize &&
683 : nYOff <= nLBlockY * nBlockYSize &&
684 : (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
685 : (nXOff + nXSize == GetXSize() &&
686 : (nLBlockX+1) * nBlockXSize > GetXSize())) &&
687 : (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
688 : (nYOff + nYSize == GetYSize() &&
689 : (nLBlockY+1) * nBlockYSize > GetYSize())) )
690 : {
691 : bJustInitialize = TRUE;
692 : bMemZeroBuffer = TRUE;
693 : }*/
694 738702 : if (poBlock != nullptr)
695 572047 : poBlock->DropLock();
696 :
697 738702 : poBlock =
698 738702 : GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
699 738702 : if (poBlock == nullptr)
700 : {
701 0 : return (CE_Failure);
702 : }
703 :
704 738702 : poBlock->MarkDirty();
705 :
706 738702 : pabyDstBlock = static_cast<GByte *>(poBlock->GetDataRef());
707 : /*if( bMemZeroBuffer )
708 : {
709 : memset(pabyDstBlock, 0,
710 : static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
711 : * nBlockYSize);
712 : }*/
713 : }
714 :
715 : // To make Coverity happy. Should not happen by design.
716 11290800 : if (pabyDstBlock == nullptr)
717 : {
718 0 : CPLAssert(false);
719 : eErr = CE_Failure;
720 : break;
721 : }
722 :
723 : /* --------------------------------------------------------------------
724 : */
725 : /* Copy over this pixel of data. */
726 : /* --------------------------------------------------------------------
727 : */
728 11290800 : GPtrDiff_t iDstOffset =
729 11290800 : (static_cast<GPtrDiff_t>(iDstX) -
730 11290800 : static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
731 11290800 : (static_cast<GPtrDiff_t>(iDstY) -
732 11290800 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
733 11290800 : nBlockXSize) *
734 11290800 : nBandDataSize;
735 :
736 11290800 : if (eDataType == eBufType)
737 : {
738 11287700 : memcpy(pabyDstBlock + iDstOffset,
739 11287700 : static_cast<GByte *>(pData) + iBufOffset,
740 : nBandDataSize);
741 : }
742 : else
743 : {
744 : /* type to type conversion ... ouch, this is an expensive way
745 : of handling single words */
746 3096 : GDALCopyWords64(static_cast<GByte *>(pData) + iBufOffset,
747 3096 : eBufType, 0, pabyDstBlock + iDstOffset,
748 : eDataType, 0, 1);
749 : }
750 : }
751 :
752 1093360 : if (psExtraArg->pfnProgress != nullptr &&
753 0 : !psExtraArg->pfnProgress(1.0 * (iDstY - nYOff + 1) / nYSize, "",
754 : psExtraArg->pProgressData))
755 : {
756 0 : eErr = CE_Failure;
757 0 : break;
758 : }
759 : }
760 : }
761 : else
762 : {
763 411200 : if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
764 : {
765 41928 : if ((psExtraArg->eResampleAlg == GRIORA_Cubic ||
766 13514 : psExtraArg->eResampleAlg == GRIORA_CubicSpline ||
767 13476 : psExtraArg->eResampleAlg == GRIORA_Bilinear ||
768 28455 : psExtraArg->eResampleAlg == GRIORA_Lanczos) &&
769 3191 : GetColorTable() != nullptr)
770 : {
771 0 : CPLError(CE_Warning, CPLE_NotSupported,
772 : "Resampling method not supported on paletted band. "
773 : "Falling back to nearest neighbour");
774 : }
775 14210 : else if (psExtraArg->eResampleAlg == GRIORA_Gauss &&
776 3 : GDALDataTypeIsComplex(eDataType))
777 : {
778 0 : CPLError(CE_Warning, CPLE_NotSupported,
779 : "Resampling method not supported on complex data type "
780 : "band. Falling back to nearest neighbour");
781 : }
782 : else
783 : {
784 14207 : return RasterIOResampled(eRWFlag, nXOff, nYOff, nXSize, nYSize,
785 : pData, nBufXSize, nBufYSize, eBufType,
786 14207 : nPixelSpace, nLineSpace, psExtraArg);
787 : }
788 : }
789 :
790 396993 : int nLimitBlockY = 0;
791 396993 : const bool bByteCopy = eDataType == eBufType && nBandDataSize == 1;
792 396993 : int nStartBlockX = -nBlockXSize;
793 396993 : constexpr double EPS = 1e-10;
794 396993 : int nLBlockY = -1;
795 396993 : const double dfSrcXStart = 0.5 * dfSrcXInc + dfXOff + EPS;
796 396993 : const bool bIntegerXFactor =
797 372760 : bUseIntegerRequestCoords &&
798 670822 : static_cast<int>(dfSrcXInc) == dfSrcXInc &&
799 273829 : static_cast<int>(dfSrcXInc) < INT_MAX / nBandDataSize;
800 :
801 : /* --------------------------------------------------------------------
802 : */
803 : /* Read case */
804 : /* Loop over buffer computing source locations. */
805 : /* --------------------------------------------------------------------
806 : */
807 2367020 : for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
808 : {
809 : // Add small epsilon to avoid some numeric precision issues.
810 1970040 : const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
811 1970040 : const int iSrcY = static_cast<int>(std::min(
812 1970040 : std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
813 :
814 1970040 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
815 : static_cast<GPtrDiff_t>(nLineSpace);
816 :
817 1970040 : if (iSrcY >= nLimitBlockY)
818 : {
819 438011 : nLBlockY = iSrcY / nBlockYSize;
820 438011 : nLimitBlockY = nLBlockY * nBlockYSize;
821 438011 : if (nLimitBlockY < INT_MAX - nBlockYSize)
822 438011 : nLimitBlockY += nBlockYSize;
823 : else
824 0 : nLimitBlockY = INT_MAX;
825 : // Make sure a new block is loaded.
826 438011 : nStartBlockX = -nBlockXSize;
827 : }
828 1532030 : else if (static_cast<int>(dfSrcXStart) < nStartBlockX)
829 : {
830 : // Make sure a new block is loaded.
831 437363 : nStartBlockX = -nBlockXSize;
832 : }
833 :
834 1970040 : GPtrDiff_t iSrcOffsetCst = (iSrcY - nLBlockY * nBlockYSize) *
835 1970040 : static_cast<GPtrDiff_t>(nBlockXSize);
836 :
837 1970040 : if (bIntegerXFactor)
838 : {
839 695780 : int iSrcX = static_cast<int>(dfSrcXStart);
840 695780 : const int nSrcXInc = static_cast<int>(dfSrcXInc);
841 695780 : GByte *pabyDstData = static_cast<GByte *>(pData) + iBufOffset;
842 695780 : bool bRet = false;
843 695780 : if (bByteCopy)
844 : {
845 585772 : bRet = DownsamplingIntegerXFactor<true, 1>(
846 : this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
847 : static_cast<int>(nPixelSpace), nBufXSize, GDT_UInt8,
848 : GDT_UInt8, nStartBlockX, nBlockXSize, poBlock,
849 : nLBlockY);
850 : }
851 110008 : else if (eDataType == eBufType)
852 : {
853 109783 : switch (nBandDataSize)
854 : {
855 109630 : case 2:
856 109630 : bRet = DownsamplingIntegerXFactor<true, 2>(
857 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
858 : pabyDstData, static_cast<int>(nPixelSpace),
859 : nBufXSize, eDataType, eDataType, nStartBlockX,
860 : nBlockXSize, poBlock, nLBlockY);
861 109630 : break;
862 55 : case 4:
863 55 : bRet = DownsamplingIntegerXFactor<true, 4>(
864 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
865 : pabyDstData, static_cast<int>(nPixelSpace),
866 : nBufXSize, eDataType, eDataType, nStartBlockX,
867 : nBlockXSize, poBlock, nLBlockY);
868 55 : break;
869 96 : case 8:
870 96 : bRet = DownsamplingIntegerXFactor<true, 8>(
871 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
872 : pabyDstData, static_cast<int>(nPixelSpace),
873 : nBufXSize, eDataType, eDataType, nStartBlockX,
874 : nBlockXSize, poBlock, nLBlockY);
875 96 : break;
876 2 : case 16:
877 2 : bRet = DownsamplingIntegerXFactor<true, 16>(
878 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
879 : pabyDstData, static_cast<int>(nPixelSpace),
880 : nBufXSize, eDataType, eDataType, nStartBlockX,
881 : nBlockXSize, poBlock, nLBlockY);
882 2 : break;
883 0 : default:
884 0 : CPLAssert(false);
885 : break;
886 : }
887 : }
888 : else
889 : {
890 225 : bRet = DownsamplingIntegerXFactor<false, 0>(
891 : this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
892 : static_cast<int>(nPixelSpace), nBufXSize, eDataType,
893 : eBufType, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
894 : }
895 695780 : if (!bRet)
896 1 : eErr = CE_Failure;
897 : }
898 : else
899 : {
900 1274260 : double dfSrcX = dfSrcXStart;
901 503811000 : for (int iBufXOff = 0; iBufXOff < nBufXSize;
902 502537000 : iBufXOff++, dfSrcX += dfSrcXInc)
903 : {
904 : // TODO?: try to avoid the clamping for most iterations
905 : const int iSrcX = static_cast<int>(
906 1005070000 : std::min(std::max(0.0, dfSrcX),
907 502537000 : static_cast<double>(nRasterXSize - 1)));
908 :
909 : /* --------------------------------------------------------------------
910 : */
911 : /* Ensure we have the appropriate block loaded. */
912 : /* --------------------------------------------------------------------
913 : */
914 502537000 : if (iSrcX >= nBlockXSize + nStartBlockX)
915 : {
916 1697820 : const int nLBlockX = iSrcX / nBlockXSize;
917 1697820 : nStartBlockX = nLBlockX * nBlockXSize;
918 :
919 1697820 : if (poBlock != nullptr)
920 1574650 : poBlock->DropLock();
921 :
922 1697820 : poBlock = GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
923 1697820 : if (poBlock == nullptr)
924 : {
925 9 : eErr = CE_Failure;
926 9 : break;
927 : }
928 :
929 : pabySrcBlock =
930 1697810 : static_cast<GByte *>(poBlock->GetDataRef());
931 : }
932 502537000 : const GPtrDiff_t nDiffX =
933 502537000 : static_cast<GPtrDiff_t>(iSrcX - nStartBlockX);
934 :
935 : /* --------------------------------------------------------------------
936 : */
937 : /* Copy over this pixel of data. */
938 : /* --------------------------------------------------------------------
939 : */
940 :
941 502537000 : if (bByteCopy)
942 : {
943 442592000 : GPtrDiff_t iSrcOffset = nDiffX + iSrcOffsetCst;
944 442592000 : static_cast<GByte *>(pData)[iBufOffset] =
945 442592000 : pabySrcBlock[iSrcOffset];
946 : }
947 59944700 : else if (eDataType == eBufType)
948 : {
949 50322800 : GPtrDiff_t iSrcOffset =
950 50322800 : (nDiffX + iSrcOffsetCst) * nBandDataSize;
951 50322800 : memcpy(static_cast<GByte *>(pData) + iBufOffset,
952 50322800 : pabySrcBlock + iSrcOffset, nBandDataSize);
953 : }
954 : else
955 : {
956 : // Type to type conversion ...
957 9621890 : GPtrDiff_t iSrcOffset =
958 9621890 : (nDiffX + iSrcOffsetCst) * nBandDataSize;
959 9621890 : GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
960 : static_cast<GByte *>(pData) +
961 9621890 : iBufOffset,
962 : eBufType, 0, 1);
963 : }
964 :
965 502537000 : iBufOffset += static_cast<int>(nPixelSpace);
966 : }
967 : }
968 1970040 : if (eErr == CE_Failure)
969 11 : break;
970 :
971 2191390 : if (psExtraArg->pfnProgress != nullptr &&
972 221364 : !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
973 : psExtraArg->pProgressData))
974 : {
975 1 : eErr = CE_Failure;
976 1 : break;
977 : }
978 : }
979 : }
980 :
981 563648 : if (poBlock != nullptr)
982 563638 : poBlock->DropLock();
983 :
984 563648 : return eErr;
985 : }
986 :
987 : /************************************************************************/
988 : /* GDALRasterIOTransformer() */
989 : /************************************************************************/
990 :
// Parameters of the linear mapping applied by GDALRasterIOTransformer().
// In the destination-to-source direction a coordinate is converted as
//     src = dst * df{X,Y}RatioDstToSrc + df{X,Y}Off
// and the reverse direction applies the inverse of that expression.
//
// Members default to the identity mapping (offset 0, ratio 1) so that a
// default-constructed instance never exposes indeterminate values and the
// inverse direction never divides by an uninitialized ratio.
struct GDALRasterIOTransformerStruct
{
    // Offset added on the X axis after scaling (dst -> src direction).
    double dfXOff = 0.0;
    // Offset added on the Y axis after scaling (dst -> src direction).
    double dfYOff = 0.0;
    // Multiplicative factor applied to destination X to get source X.
    double dfXRatioDstToSrc = 1.0;
    // Multiplicative factor applied to destination Y to get source Y.
    double dfYRatioDstToSrc = 1.0;
};
998 :
999 6897 : static int GDALRasterIOTransformer(void *pTransformerArg, int bDstToSrc,
1000 : int nPointCount, double *x, double *y,
1001 : double * /* z */, int *panSuccess)
1002 : {
1003 6897 : GDALRasterIOTransformerStruct *psParams =
1004 : static_cast<GDALRasterIOTransformerStruct *>(pTransformerArg);
1005 6897 : if (bDstToSrc)
1006 : {
1007 311993 : for (int i = 0; i < nPointCount; i++)
1008 : {
1009 305684 : x[i] = x[i] * psParams->dfXRatioDstToSrc + psParams->dfXOff;
1010 305684 : y[i] = y[i] * psParams->dfYRatioDstToSrc + psParams->dfYOff;
1011 305684 : panSuccess[i] = TRUE;
1012 : }
1013 : }
1014 : else
1015 : {
1016 1176 : for (int i = 0; i < nPointCount; i++)
1017 : {
1018 588 : x[i] = (x[i] - psParams->dfXOff) / psParams->dfXRatioDstToSrc;
1019 588 : y[i] = (y[i] - psParams->dfYOff) / psParams->dfYRatioDstToSrc;
1020 588 : panSuccess[i] = TRUE;
1021 : }
1022 : }
1023 6897 : return TRUE;
1024 : }
1025 :
1026 : /************************************************************************/
1027 : /* RasterIOResampled() */
1028 : /************************************************************************/
1029 :
1030 : //! @cond Doxygen_Suppress
1031 14207 : CPLErr GDALRasterBand::RasterIOResampled(
1032 : GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
1033 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
1034 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
1035 : {
1036 : // Determine if we use warping resampling or overview resampling
1037 : const bool bUseWarp =
1038 14207 : (GDALDataTypeIsComplex(eDataType) &&
1039 14366 : psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
1040 159 : psExtraArg->eResampleAlg != GRIORA_Mode);
1041 :
1042 14207 : double dfXOff = nXOff;
1043 14207 : double dfYOff = nYOff;
1044 14207 : double dfXSize = nXSize;
1045 14207 : double dfYSize = nYSize;
1046 14207 : if (psExtraArg->bFloatingPointWindowValidity)
1047 : {
1048 13512 : dfXOff = psExtraArg->dfXOff;
1049 13512 : dfYOff = psExtraArg->dfYOff;
1050 13512 : dfXSize = psExtraArg->dfXSize;
1051 13512 : dfYSize = psExtraArg->dfYSize;
1052 : }
1053 :
1054 14207 : const double dfXRatioDstToSrc = dfXSize / nBufXSize;
1055 14207 : const double dfYRatioDstToSrc = dfYSize / nBufYSize;
1056 :
1057 : // Determine the coordinates in the "virtual" output raster to see
1058 : // if there are not integers, in which case we will use them as a shift
1059 : // so that subwindow extracts give the exact same results as entire raster
1060 : // scaling.
1061 14207 : double dfDestXOff = dfXOff / dfXRatioDstToSrc;
1062 14207 : bool bHasXOffVirtual = false;
1063 14207 : int nDestXOffVirtual = 0;
1064 14207 : if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
1065 : {
1066 13879 : bHasXOffVirtual = true;
1067 13879 : dfXOff = nXOff;
1068 13879 : nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
1069 : }
1070 :
1071 14207 : double dfDestYOff = dfYOff / dfYRatioDstToSrc;
1072 14207 : bool bHasYOffVirtual = false;
1073 14207 : int nDestYOffVirtual = 0;
1074 14207 : if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
1075 : {
1076 13875 : bHasYOffVirtual = true;
1077 13875 : dfYOff = nYOff;
1078 13875 : nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
1079 : }
1080 :
1081 : // Create a MEM dataset that wraps the output buffer.
1082 : GDALDataset *poMEMDS;
1083 14207 : void *pTempBuffer = nullptr;
1084 14207 : GSpacing nPSMem = nPixelSpace;
1085 14207 : GSpacing nLSMem = nLineSpace;
1086 14207 : void *pDataMem = pData;
1087 14207 : GDALDataType eDTMem = eBufType;
1088 14207 : if (eBufType != eDataType)
1089 : {
1090 44 : nPSMem = GDALGetDataTypeSizeBytes(eDataType);
1091 44 : nLSMem = nPSMem * nBufXSize;
1092 : pTempBuffer =
1093 44 : VSI_MALLOC2_VERBOSE(nBufYSize, static_cast<size_t>(nLSMem));
1094 44 : if (pTempBuffer == nullptr)
1095 0 : return CE_Failure;
1096 44 : pDataMem = pTempBuffer;
1097 44 : eDTMem = eDataType;
1098 : }
1099 :
1100 : poMEMDS =
1101 14207 : MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
1102 : nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr);
1103 14207 : GByte *pabyData = static_cast<GByte *>(pDataMem) -
1104 14207 : nPSMem * nDestXOffVirtual - nLSMem * nDestYOffVirtual;
1105 14207 : GDALRasterBandH hMEMBand = MEMCreateRasterBandEx(
1106 : poMEMDS, 1, pabyData, eDTMem, nPSMem, nLSMem, false);
1107 14207 : poMEMDS->SetBand(1, GDALRasterBand::FromHandle(hMEMBand));
1108 :
1109 14207 : const char *pszNBITS = GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
1110 14207 : const int nNBITS = pszNBITS ? atoi(pszNBITS) : 0;
1111 14207 : if (pszNBITS)
1112 6 : GDALRasterBand::FromHandle(hMEMBand)->SetMetadataItem(
1113 6 : "NBITS", pszNBITS, "IMAGE_STRUCTURE");
1114 :
1115 14207 : CPLErr eErr = CE_None;
1116 :
1117 : // Do the resampling.
1118 14207 : if (bUseWarp)
1119 : {
1120 149 : int bHasNoData = FALSE;
1121 149 : double dfNoDataValue = GetNoDataValue(&bHasNoData);
1122 :
1123 149 : VRTDatasetH hVRTDS = nullptr;
1124 149 : GDALRasterBandH hVRTBand = nullptr;
1125 149 : if (GetDataset() == nullptr)
1126 : {
1127 : /* Create VRT dataset that wraps the whole dataset */
1128 0 : hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
1129 0 : VRTAddBand(hVRTDS, eDataType, nullptr);
1130 0 : hVRTBand = GDALGetRasterBand(hVRTDS, 1);
1131 0 : VRTAddSimpleSource(hVRTBand, this, 0, 0, nRasterXSize, nRasterYSize,
1132 : 0, 0, nRasterXSize, nRasterYSize, nullptr,
1133 : VRT_NODATA_UNSET);
1134 :
1135 : /* Add a mask band if needed */
1136 0 : if (GetMaskFlags() != GMF_ALL_VALID)
1137 : {
1138 0 : GDALDataset::FromHandle(hVRTDS)->CreateMaskBand(0);
1139 : VRTSourcedRasterBand *poVRTMaskBand =
1140 : reinterpret_cast<VRTSourcedRasterBand *>(
1141 : reinterpret_cast<GDALRasterBand *>(hVRTBand)
1142 0 : ->GetMaskBand());
1143 0 : poVRTMaskBand->AddMaskBandSource(this, 0, 0, nRasterXSize,
1144 0 : nRasterYSize, 0, 0,
1145 0 : nRasterXSize, nRasterYSize);
1146 : }
1147 : }
1148 :
1149 149 : GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
1150 149 : switch (psExtraArg->eResampleAlg)
1151 : {
1152 0 : case GRIORA_NearestNeighbour:
1153 0 : psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
1154 0 : break;
1155 147 : case GRIORA_Bilinear:
1156 147 : psWarpOptions->eResampleAlg = GRA_Bilinear;
1157 147 : break;
1158 0 : case GRIORA_Cubic:
1159 0 : psWarpOptions->eResampleAlg = GRA_Cubic;
1160 0 : break;
1161 0 : case GRIORA_CubicSpline:
1162 0 : psWarpOptions->eResampleAlg = GRA_CubicSpline;
1163 0 : break;
1164 0 : case GRIORA_Lanczos:
1165 0 : psWarpOptions->eResampleAlg = GRA_Lanczos;
1166 0 : break;
1167 0 : case GRIORA_Average:
1168 0 : psWarpOptions->eResampleAlg = GRA_Average;
1169 0 : break;
1170 2 : case GRIORA_RMS:
1171 2 : psWarpOptions->eResampleAlg = GRA_RMS;
1172 2 : break;
1173 0 : case GRIORA_Mode:
1174 0 : psWarpOptions->eResampleAlg = GRA_Mode;
1175 0 : break;
1176 0 : default:
1177 0 : CPLAssert(false);
1178 : psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
1179 : break;
1180 : }
1181 149 : psWarpOptions->hSrcDS = hVRTDS ? hVRTDS : GetDataset();
1182 149 : psWarpOptions->hDstDS = poMEMDS;
1183 149 : psWarpOptions->nBandCount = 1;
1184 149 : int nSrcBandNumber = hVRTDS ? 1 : nBand;
1185 149 : int nDstBandNumber = 1;
1186 149 : psWarpOptions->panSrcBands = &nSrcBandNumber;
1187 149 : psWarpOptions->panDstBands = &nDstBandNumber;
1188 298 : psWarpOptions->pfnProgress = psExtraArg->pfnProgress
1189 149 : ? psExtraArg->pfnProgress
1190 : : GDALDummyProgress;
1191 149 : psWarpOptions->pProgressArg = psExtraArg->pProgressData;
1192 149 : psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
1193 149 : if (bHasNoData)
1194 : {
1195 0 : psWarpOptions->papszWarpOptions = CSLSetNameValue(
1196 : psWarpOptions->papszWarpOptions, "INIT_DEST", "NO_DATA");
1197 0 : if (psWarpOptions->padfSrcNoDataReal == nullptr)
1198 : {
1199 0 : psWarpOptions->padfSrcNoDataReal =
1200 0 : static_cast<double *>(CPLMalloc(sizeof(double)));
1201 0 : psWarpOptions->padfSrcNoDataReal[0] = dfNoDataValue;
1202 : }
1203 :
1204 0 : if (psWarpOptions->padfDstNoDataReal == nullptr)
1205 : {
1206 0 : psWarpOptions->padfDstNoDataReal =
1207 0 : static_cast<double *>(CPLMalloc(sizeof(double)));
1208 0 : psWarpOptions->padfDstNoDataReal[0] = dfNoDataValue;
1209 : }
1210 : }
1211 :
1212 : GDALRasterIOTransformerStruct sTransformer;
1213 149 : sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
1214 149 : sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
1215 149 : sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
1216 149 : sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
1217 149 : psWarpOptions->pTransformerArg = &sTransformer;
1218 :
1219 : GDALWarpOperationH hWarpOperation =
1220 149 : GDALCreateWarpOperation(psWarpOptions);
1221 149 : eErr = GDALChunkAndWarpImage(hWarpOperation, nDestXOffVirtual,
1222 : nDestYOffVirtual, nBufXSize, nBufYSize);
1223 149 : GDALDestroyWarpOperation(hWarpOperation);
1224 :
1225 149 : psWarpOptions->panSrcBands = nullptr;
1226 149 : psWarpOptions->panDstBands = nullptr;
1227 149 : GDALDestroyWarpOptions(psWarpOptions);
1228 :
1229 149 : if (hVRTDS)
1230 0 : GDALClose(hVRTDS);
1231 : }
1232 : else
1233 : {
1234 14058 : const char *pszResampling =
1235 25844 : (psExtraArg->eResampleAlg == GRIORA_Bilinear) ? "BILINEAR"
1236 22879 : : (psExtraArg->eResampleAlg == GRIORA_Cubic) ? "CUBIC"
1237 22148 : : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
1238 22069 : : (psExtraArg->eResampleAlg == GRIORA_Lanczos) ? "LANCZOS"
1239 11137 : : (psExtraArg->eResampleAlg == GRIORA_Average) ? "AVERAGE"
1240 199 : : (psExtraArg->eResampleAlg == GRIORA_RMS) ? "RMS"
1241 79 : : (psExtraArg->eResampleAlg == GRIORA_Mode) ? "MODE"
1242 3 : : (psExtraArg->eResampleAlg == GRIORA_Gauss) ? "GAUSS"
1243 : : "UNKNOWN";
1244 :
1245 14058 : int nKernelRadius = 0;
1246 : GDALResampleFunction pfnResampleFunc =
1247 14058 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
1248 14058 : CPLAssert(pfnResampleFunc);
1249 : GDALDataType eWrkDataType =
1250 14058 : GDALGetOvrWorkDataType(pszResampling, eDataType);
1251 14058 : int nHasNoData = 0;
1252 14058 : double dfNoDataValue = GetNoDataValue(&nHasNoData);
1253 14058 : const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
1254 14058 : if (!bHasNoData)
1255 13968 : dfNoDataValue = 0.0;
1256 :
1257 14058 : int nDstBlockXSize = nBufXSize;
1258 14058 : int nDstBlockYSize = nBufYSize;
1259 14058 : int nFullResXChunk = 0;
1260 14058 : int nFullResYChunk = 0;
1261 : while (true)
1262 : {
1263 14069 : nFullResXChunk =
1264 14069 : 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
1265 14069 : nFullResYChunk =
1266 14069 : 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
1267 14069 : if (nFullResXChunk > nRasterXSize)
1268 4726 : nFullResXChunk = nRasterXSize;
1269 14069 : if (nFullResYChunk > nRasterYSize)
1270 543 : nFullResYChunk = nRasterYSize;
1271 14069 : if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
1272 14011 : (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
1273 : 1024 * 1024))
1274 : break;
1275 : // When operating on the full width of a raster whose block width is
1276 : // the raster width, prefer doing chunks in height.
1277 11 : if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
1278 : nDstBlockYSize > 1)
1279 0 : nDstBlockYSize /= 2;
1280 : /* Otherwise cut the maximal dimension */
1281 11 : else if (nDstBlockXSize > 1 &&
1282 0 : (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
1283 11 : nDstBlockXSize /= 2;
1284 : else
1285 0 : nDstBlockYSize /= 2;
1286 : }
1287 :
1288 14058 : int nOvrXFactor = static_cast<int>(0.5 + dfXRatioDstToSrc);
1289 14058 : int nOvrYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
1290 14058 : if (nOvrXFactor == 0)
1291 2029 : nOvrXFactor = 1;
1292 14058 : if (nOvrYFactor == 0)
1293 2028 : nOvrYFactor = 1;
1294 14058 : int nFullResXSizeQueried =
1295 14058 : nFullResXChunk + 2 * nKernelRadius * nOvrXFactor;
1296 14058 : int nFullResYSizeQueried =
1297 14058 : nFullResYChunk + 2 * nKernelRadius * nOvrYFactor;
1298 :
1299 14058 : if (nFullResXSizeQueried > nRasterXSize)
1300 2701 : nFullResXSizeQueried = nRasterXSize;
1301 14058 : if (nFullResYSizeQueried > nRasterYSize)
1302 299 : nFullResYSizeQueried = nRasterYSize;
1303 :
1304 : void *pChunk =
1305 14058 : VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
1306 : nFullResXSizeQueried, nFullResYSizeQueried);
1307 14058 : GByte *pabyChunkNoDataMask = nullptr;
1308 :
1309 14058 : GDALRasterBand *poMaskBand = GetMaskBand();
1310 14058 : int l_nMaskFlags = GetMaskFlags();
1311 :
1312 14058 : bool bUseNoDataMask = ((l_nMaskFlags & GMF_ALL_VALID) == 0);
1313 14058 : if (bUseNoDataMask)
1314 : {
1315 7483 : pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
1316 : nFullResXSizeQueried, nFullResYSizeQueried));
1317 : }
1318 14058 : if (pChunk == nullptr ||
1319 7483 : (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
1320 : {
1321 0 : GDALClose(poMEMDS);
1322 0 : CPLFree(pChunk);
1323 0 : CPLFree(pabyChunkNoDataMask);
1324 0 : VSIFree(pTempBuffer);
1325 0 : return CE_Failure;
1326 : }
1327 :
1328 14058 : const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
1329 14058 : DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
1330 14058 : int nBlocksDone = 0;
1331 :
1332 : int nDstYOff;
1333 28116 : for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
1334 14058 : nDstYOff += nDstBlockYSize)
1335 : {
1336 : int nDstYCount;
1337 14058 : if (nDstYOff + nDstBlockYSize <= nBufYSize)
1338 14058 : nDstYCount = nDstBlockYSize;
1339 : else
1340 0 : nDstYCount = nBufYSize - nDstYOff;
1341 :
1342 14058 : int nChunkYOff =
1343 14058 : nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
1344 14058 : int nChunkYOff2 = nYOff + 1 +
1345 14058 : static_cast<int>(ceil((nDstYOff + nDstYCount) *
1346 : dfYRatioDstToSrc));
1347 14058 : if (nChunkYOff2 > nRasterYSize)
1348 731 : nChunkYOff2 = nRasterYSize;
1349 14058 : int nYCount = nChunkYOff2 - nChunkYOff;
1350 14058 : CPLAssert(nYCount <= nFullResYChunk);
1351 :
1352 14058 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrYFactor;
1353 14058 : int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrYFactor;
1354 14058 : if (nChunkYOffQueried < 0)
1355 : {
1356 458 : nChunkYSizeQueried += nChunkYOffQueried;
1357 458 : nChunkYOffQueried = 0;
1358 : }
1359 14058 : if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
1360 561 : nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
1361 14058 : CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
1362 :
1363 14058 : int nDstXOff = 0;
1364 28116 : for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
1365 14058 : nDstXOff += nDstBlockXSize)
1366 : {
1367 14058 : int nDstXCount = 0;
1368 14058 : if (nDstXOff + nDstBlockXSize <= nBufXSize)
1369 14058 : nDstXCount = nDstBlockXSize;
1370 : else
1371 0 : nDstXCount = nBufXSize - nDstXOff;
1372 :
1373 14058 : int nChunkXOff =
1374 14058 : nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
1375 14058 : int nChunkXOff2 =
1376 14058 : nXOff + 1 +
1377 14058 : static_cast<int>(
1378 14058 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
1379 14058 : if (nChunkXOff2 > nRasterXSize)
1380 8751 : nChunkXOff2 = nRasterXSize;
1381 14058 : int nXCount = nChunkXOff2 - nChunkXOff;
1382 14058 : CPLAssert(nXCount <= nFullResXChunk);
1383 :
1384 14058 : int nChunkXOffQueried =
1385 14058 : nChunkXOff - nKernelRadius * nOvrXFactor;
1386 14058 : int nChunkXSizeQueried =
1387 14058 : nXCount + 2 * nKernelRadius * nOvrXFactor;
1388 14058 : if (nChunkXOffQueried < 0)
1389 : {
1390 2762 : nChunkXSizeQueried += nChunkXOffQueried;
1391 2762 : nChunkXOffQueried = 0;
1392 : }
1393 14058 : if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
1394 2748 : nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
1395 14058 : CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
1396 :
1397 : // Read the source buffers.
1398 14058 : eErr = RasterIO(GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1399 : nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
1400 : nChunkXSizeQueried, nChunkYSizeQueried,
1401 : eWrkDataType, 0, 0, nullptr);
1402 :
1403 14058 : bool bSkipResample = false;
1404 14058 : bool bNoDataMaskFullyOpaque = false;
1405 14058 : if (eErr == CE_None && bUseNoDataMask)
1406 : {
1407 7483 : eErr = poMaskBand->RasterIO(
1408 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1409 : nChunkXSizeQueried, nChunkYSizeQueried,
1410 : pabyChunkNoDataMask, nChunkXSizeQueried,
1411 : nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
1412 :
1413 : /* Optimizations if mask if fully opaque or transparent */
1414 7483 : int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
1415 7483 : GByte bVal = pabyChunkNoDataMask[0];
1416 7483 : int i = 1;
1417 15232100 : for (; i < nPixels; i++)
1418 : {
1419 15225700 : if (pabyChunkNoDataMask[i] != bVal)
1420 1126 : break;
1421 : }
1422 7483 : if (i == nPixels)
1423 : {
1424 6357 : if (bVal == 0)
1425 : {
1426 12094 : for (int j = 0; j < nDstYCount; j++)
1427 : {
1428 6377 : GDALCopyWords64(&dfNoDataValue, GDT_Float64, 0,
1429 : static_cast<GByte *>(pDataMem) +
1430 6377 : nLSMem * (j + nDstYOff) +
1431 6377 : nDstXOff * nPSMem,
1432 : eDTMem,
1433 : static_cast<int>(nPSMem),
1434 : nDstXCount);
1435 : }
1436 5717 : bSkipResample = true;
1437 : }
1438 : else
1439 : {
1440 640 : bNoDataMaskFullyOpaque = true;
1441 : }
1442 : }
1443 : }
1444 :
1445 14058 : if (!bSkipResample && eErr == CE_None)
1446 : {
1447 8338 : const bool bPropagateNoData = false;
1448 8338 : void *pDstBuffer = nullptr;
1449 8338 : GDALDataType eDstBufferDataType = GDT_Unknown;
1450 : GDALRasterBand *poMEMBand =
1451 8338 : GDALRasterBand::FromHandle(hMEMBand);
1452 8338 : GDALOverviewResampleArgs args;
1453 8338 : args.eSrcDataType = eDataType;
1454 8338 : args.eOvrDataType = poMEMBand->GetRasterDataType();
1455 8338 : args.nOvrXSize = poMEMBand->GetXSize();
1456 8338 : args.nOvrYSize = poMEMBand->GetYSize();
1457 8338 : args.nOvrNBITS = nNBITS;
1458 8338 : args.dfXRatioDstToSrc = dfXRatioDstToSrc;
1459 8338 : args.dfYRatioDstToSrc = dfYRatioDstToSrc;
1460 8338 : args.dfSrcXDelta =
1461 8338 : dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
1462 8338 : args.dfSrcYDelta =
1463 8338 : dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
1464 8338 : args.eWrkDataType = eWrkDataType;
1465 8338 : args.pabyChunkNodataMask =
1466 8338 : bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask;
1467 8338 : args.nChunkXOff =
1468 8338 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
1469 8338 : args.nChunkXSize = nChunkXSizeQueried;
1470 8338 : args.nChunkYOff =
1471 8338 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
1472 8338 : args.nChunkYSize = nChunkYSizeQueried;
1473 8338 : args.nDstXOff = nDstXOff + nDestXOffVirtual;
1474 8338 : args.nDstXOff2 = nDstXOff + nDestXOffVirtual + nDstXCount;
1475 8338 : args.nDstYOff = nDstYOff + nDestYOffVirtual;
1476 8338 : args.nDstYOff2 = nDstYOff + nDestYOffVirtual + nDstYCount;
1477 8338 : args.pszResampling = pszResampling;
1478 8338 : args.bHasNoData = bHasNoData;
1479 8338 : args.dfNoDataValue = dfNoDataValue;
1480 8338 : args.poColorTable = GetColorTable();
1481 8338 : args.bPropagateNoData = bPropagateNoData;
1482 8338 : eErr = pfnResampleFunc(args, pChunk, &pDstBuffer,
1483 : &eDstBufferDataType);
1484 8338 : if (eErr == CE_None)
1485 : {
1486 8338 : eErr = poMEMBand->RasterIO(
1487 : GF_Write, nDstXOff + nDestXOffVirtual,
1488 : nDstYOff + nDestYOffVirtual, nDstXCount, nDstYCount,
1489 : pDstBuffer, nDstXCount, nDstYCount,
1490 : eDstBufferDataType, 0, 0, nullptr);
1491 : }
1492 8338 : CPLFree(pDstBuffer);
1493 : }
1494 :
1495 14058 : nBlocksDone++;
1496 24980 : if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
1497 10922 : !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
1498 : "", psExtraArg->pProgressData))
1499 : {
1500 1 : eErr = CE_Failure;
1501 : }
1502 : }
1503 : }
1504 :
1505 14058 : CPLFree(pChunk);
1506 14058 : CPLFree(pabyChunkNoDataMask);
1507 : }
1508 :
1509 14207 : if (eBufType != eDataType)
1510 : {
1511 44 : CPL_IGNORE_RET_VAL(poMEMDS->GetRasterBand(1)->RasterIO(
1512 : GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
1513 : pData, nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
1514 : nullptr));
1515 : }
1516 14207 : GDALClose(poMEMDS);
1517 14207 : VSIFree(pTempBuffer);
1518 :
1519 14207 : return eErr;
1520 : }
1521 :
1522 : /************************************************************************/
1523 : /* RasterIOResampled() */
1524 : /************************************************************************/
1525 :
1526 886 : CPLErr GDALDataset::RasterIOResampled(
1527 : GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
1528 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
1529 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
1530 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
1531 :
1532 : {
1533 : #if 0
1534 : // Determine if we use warping resampling or overview resampling
1535 : bool bUseWarp = false;
1536 : if( GDALDataTypeIsComplex( eDataType ) )
1537 : bUseWarp = true;
1538 : #endif
1539 :
1540 886 : double dfXOff = nXOff;
1541 886 : double dfYOff = nYOff;
1542 886 : double dfXSize = nXSize;
1543 886 : double dfYSize = nYSize;
1544 886 : if (psExtraArg->bFloatingPointWindowValidity)
1545 : {
1546 765 : dfXOff = psExtraArg->dfXOff;
1547 765 : dfYOff = psExtraArg->dfYOff;
1548 765 : dfXSize = psExtraArg->dfXSize;
1549 765 : dfYSize = psExtraArg->dfYSize;
1550 : }
1551 :
1552 886 : const double dfXRatioDstToSrc = dfXSize / nBufXSize;
1553 886 : const double dfYRatioDstToSrc = dfYSize / nBufYSize;
1554 :
1555 : // Determine the coordinates in the "virtual" output raster to see
1556 : // if there are not integers, in which case we will use them as a shift
1557 : // so that subwindow extracts give the exact same results as entire raster
1558 : // scaling.
1559 886 : double dfDestXOff = dfXOff / dfXRatioDstToSrc;
1560 886 : bool bHasXOffVirtual = false;
1561 886 : int nDestXOffVirtual = 0;
1562 886 : if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
1563 : {
1564 761 : bHasXOffVirtual = true;
1565 761 : dfXOff = nXOff;
1566 761 : nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
1567 : }
1568 :
1569 886 : double dfDestYOff = dfYOff / dfYRatioDstToSrc;
1570 886 : bool bHasYOffVirtual = false;
1571 886 : int nDestYOffVirtual = 0;
1572 886 : if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
1573 : {
1574 721 : bHasYOffVirtual = true;
1575 721 : dfYOff = nYOff;
1576 721 : nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
1577 : }
1578 :
1579 : // Create a MEM dataset that wraps the output buffer.
1580 : GDALDataset *poMEMDS =
1581 886 : MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
1582 : nDestYOffVirtual + nBufYSize, 0, eBufType, nullptr);
1583 : GDALRasterBand **papoDstBands = static_cast<GDALRasterBand **>(
1584 886 : CPLMalloc(nBandCount * sizeof(GDALRasterBand *)));
1585 886 : int nNBITS = 0;
1586 2878 : for (int i = 0; i < nBandCount; i++)
1587 : {
1588 1992 : char szBuffer[32] = {'\0'};
1589 3984 : int nRet = CPLPrintPointer(
1590 : szBuffer,
1591 1992 : static_cast<GByte *>(pData) - nPixelSpace * nDestXOffVirtual -
1592 1992 : nLineSpace * nDestYOffVirtual + nBandSpace * i,
1593 : sizeof(szBuffer));
1594 1992 : szBuffer[nRet] = 0;
1595 :
1596 1992 : char szBuffer0[64] = {'\0'};
1597 1992 : snprintf(szBuffer0, sizeof(szBuffer0), "DATAPOINTER=%s", szBuffer);
1598 :
1599 1992 : char szBuffer1[64] = {'\0'};
1600 1992 : snprintf(szBuffer1, sizeof(szBuffer1), "PIXELOFFSET=" CPL_FRMT_GIB,
1601 : static_cast<GIntBig>(nPixelSpace));
1602 :
1603 1992 : char szBuffer2[64] = {'\0'};
1604 1992 : snprintf(szBuffer2, sizeof(szBuffer2), "LINEOFFSET=" CPL_FRMT_GIB,
1605 : static_cast<GIntBig>(nLineSpace));
1606 :
1607 1992 : char *apszOptions[4] = {szBuffer0, szBuffer1, szBuffer2, nullptr};
1608 :
1609 1992 : poMEMDS->AddBand(eBufType, apszOptions);
1610 :
1611 1992 : GDALRasterBand *poSrcBand = GetRasterBand(panBandMap[i]);
1612 1992 : papoDstBands[i] = poMEMDS->GetRasterBand(i + 1);
1613 : const char *pszNBITS =
1614 1992 : poSrcBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
1615 1992 : if (pszNBITS)
1616 : {
1617 0 : nNBITS = atoi(pszNBITS);
1618 0 : poMEMDS->GetRasterBand(i + 1)->SetMetadataItem("NBITS", pszNBITS,
1619 0 : "IMAGE_STRUCTURE");
1620 : }
1621 : }
1622 :
1623 886 : CPLErr eErr = CE_None;
1624 :
1625 : // TODO(schwehr): Why disabled? Why not just delete?
1626 : // Looks like this code was initially added as disable by copying
1627 : // from RasterIO here:
1628 : // https://trac.osgeo.org/gdal/changeset/29572
1629 : #if 0
1630 : // Do the resampling.
1631 : if( bUseWarp )
1632 : {
1633 : VRTDatasetH hVRTDS = nullptr;
1634 : GDALRasterBandH hVRTBand = nullptr;
1635 : if( GetDataset() == nullptr )
1636 : {
1637 : /* Create VRT dataset that wraps the whole dataset */
1638 : hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
1639 : VRTAddBand( hVRTDS, eDataType, nullptr );
1640 : hVRTBand = GDALGetRasterBand(hVRTDS, 1);
1641 : VRTAddSimpleSource( (VRTSourcedRasterBandH)hVRTBand,
1642 : (GDALRasterBandH)this,
1643 : 0, 0,
1644 : nRasterXSize, nRasterYSize,
1645 : 0, 0,
1646 : nRasterXSize, nRasterYSize,
1647 : nullptr, VRT_NODATA_UNSET );
1648 :
1649 : /* Add a mask band if needed */
1650 : if( GetMaskFlags() != GMF_ALL_VALID )
1651 : {
1652 : ((GDALDataset*)hVRTDS)->CreateMaskBand(0);
1653 : VRTSourcedRasterBand* poVRTMaskBand =
1654 : (VRTSourcedRasterBand*)(((GDALRasterBand*)hVRTBand)->GetMaskBand());
1655 : poVRTMaskBand->
1656 : AddMaskBandSource( this,
1657 : 0, 0,
1658 : nRasterXSize, nRasterYSize,
1659 : 0, 0,
1660 : nRasterXSize, nRasterYSize);
1661 : }
1662 : }
1663 :
1664 : GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
1665 : psWarpOptions->eResampleAlg = (GDALResampleAlg)psExtraArg->eResampleAlg;
1666 : psWarpOptions->hSrcDS = (GDALDatasetH) (hVRTDS ? hVRTDS : GetDataset());
1667 : psWarpOptions->hDstDS = (GDALDatasetH) poMEMDS;
1668 : psWarpOptions->nBandCount = 1;
1669 : int nSrcBandNumber = (hVRTDS ? 1 : nBand);
1670 : int nDstBandNumber = 1;
1671 : psWarpOptions->panSrcBands = &nSrcBandNumber;
1672 : psWarpOptions->panDstBands = &nDstBandNumber;
1673 : psWarpOptions->pfnProgress = psExtraArg->pfnProgress ?
1674 : psExtraArg->pfnProgress : GDALDummyProgress;
1675 : psWarpOptions->pProgressArg = psExtraArg->pProgressData;
1676 : psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
1677 : GDALRasterIOTransformerStruct sTransformer;
1678 : sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
1679 : sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
1680 : sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
1681 : sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
1682 : psWarpOptions->pTransformerArg = &sTransformer;
1683 :
1684 : GDALWarpOperationH hWarpOperation = GDALCreateWarpOperation(psWarpOptions);
1685 : eErr = GDALChunkAndWarpImage( hWarpOperation,
1686 : nDestXOffVirtual, nDestYOffVirtual,
1687 : nBufXSize, nBufYSize );
1688 : GDALDestroyWarpOperation( hWarpOperation );
1689 :
1690 : psWarpOptions->panSrcBands = nullptr;
1691 : psWarpOptions->panDstBands = nullptr;
1692 : GDALDestroyWarpOptions( psWarpOptions );
1693 :
1694 : if( hVRTDS )
1695 : GDALClose(hVRTDS);
1696 : }
1697 : else
1698 : #endif
1699 : {
1700 886 : const char *pszResampling =
1701 1653 : (psExtraArg->eResampleAlg == GRIORA_Bilinear) ? "BILINEAR"
1702 767 : : (psExtraArg->eResampleAlg == GRIORA_Cubic) ? "CUBIC"
1703 0 : : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
1704 0 : : (psExtraArg->eResampleAlg == GRIORA_Lanczos) ? "LANCZOS"
1705 0 : : (psExtraArg->eResampleAlg == GRIORA_Average) ? "AVERAGE"
1706 0 : : (psExtraArg->eResampleAlg == GRIORA_RMS) ? "RMS"
1707 0 : : (psExtraArg->eResampleAlg == GRIORA_Mode) ? "MODE"
1708 0 : : (psExtraArg->eResampleAlg == GRIORA_Gauss) ? "GAUSS"
1709 : : "UNKNOWN";
1710 :
1711 886 : GDALRasterBand *poFirstSrcBand = GetRasterBand(panBandMap[0]);
1712 886 : GDALDataType eDataType = poFirstSrcBand->GetRasterDataType();
1713 : int nBlockXSize, nBlockYSize;
1714 886 : poFirstSrcBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
1715 :
1716 : int nKernelRadius;
1717 : GDALResampleFunction pfnResampleFunc =
1718 886 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
1719 886 : CPLAssert(pfnResampleFunc);
1720 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1721 : GDALResampleFunctionMultiBands pfnResampleFuncMultiBands =
1722 : GDALGetResampleFunctionMultiBands(pszResampling, &nKernelRadius);
1723 : #endif
1724 : GDALDataType eWrkDataType =
1725 886 : GDALGetOvrWorkDataType(pszResampling, eDataType);
1726 :
1727 886 : int nDstBlockXSize = nBufXSize;
1728 886 : int nDstBlockYSize = nBufYSize;
1729 : int nFullResXChunk, nFullResYChunk;
1730 : while (true)
1731 : {
1732 886 : nFullResXChunk =
1733 886 : 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
1734 886 : nFullResYChunk =
1735 886 : 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
1736 886 : if (nFullResXChunk > nRasterXSize)
1737 585 : nFullResXChunk = nRasterXSize;
1738 886 : if (nFullResYChunk > nRasterYSize)
1739 51 : nFullResYChunk = nRasterYSize;
1740 886 : if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
1741 884 : (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
1742 : 1024 * 1024))
1743 : break;
1744 : // When operating on the full width of a raster whose block width is
1745 : // the raster width, prefer doing chunks in height.
1746 0 : if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
1747 : nDstBlockYSize > 1)
1748 0 : nDstBlockYSize /= 2;
1749 : /* Otherwise cut the maximal dimension */
1750 0 : else if (nDstBlockXSize > 1 &&
1751 0 : (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
1752 0 : nDstBlockXSize /= 2;
1753 : else
1754 0 : nDstBlockYSize /= 2;
1755 : }
1756 :
1757 1772 : int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
1758 886 : static_cast<int>(0.5 + dfYRatioDstToSrc));
1759 886 : if (nOvrFactor == 0)
1760 104 : nOvrFactor = 1;
1761 886 : int nFullResXSizeQueried =
1762 886 : nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
1763 886 : int nFullResYSizeQueried =
1764 886 : nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
1765 :
1766 886 : if (nFullResXSizeQueried > nRasterXSize)
1767 610 : nFullResXSizeQueried = nRasterXSize;
1768 886 : if (nFullResYSizeQueried > nRasterYSize)
1769 54 : nFullResYSizeQueried = nRasterYSize;
1770 :
1771 886 : void *pChunk = VSI_MALLOC3_VERBOSE(
1772 : cpl::fits_on<int>(GDALGetDataTypeSizeBytes(eWrkDataType) *
1773 : nBandCount),
1774 : nFullResXSizeQueried, nFullResYSizeQueried);
1775 886 : GByte *pabyChunkNoDataMask = nullptr;
1776 :
1777 886 : GDALRasterBand *poMaskBand = poFirstSrcBand->GetMaskBand();
1778 886 : int nMaskFlags = poFirstSrcBand->GetMaskFlags();
1779 :
1780 886 : bool bUseNoDataMask = ((nMaskFlags & GMF_ALL_VALID) == 0);
1781 886 : if (bUseNoDataMask)
1782 : {
1783 617 : pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
1784 : nFullResXSizeQueried, nFullResYSizeQueried));
1785 : }
1786 886 : if (pChunk == nullptr ||
1787 617 : (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
1788 : {
1789 0 : GDALClose(poMEMDS);
1790 0 : CPLFree(pChunk);
1791 0 : CPLFree(pabyChunkNoDataMask);
1792 0 : CPLFree(papoDstBands);
1793 0 : return CE_Failure;
1794 : }
1795 :
1796 886 : const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
1797 886 : DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
1798 886 : int nBlocksDone = 0;
1799 :
1800 : int nDstYOff;
1801 1772 : for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
1802 886 : nDstYOff += nDstBlockYSize)
1803 : {
1804 : int nDstYCount;
1805 886 : if (nDstYOff + nDstBlockYSize <= nBufYSize)
1806 886 : nDstYCount = nDstBlockYSize;
1807 : else
1808 0 : nDstYCount = nBufYSize - nDstYOff;
1809 :
1810 886 : int nChunkYOff =
1811 886 : nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
1812 886 : int nChunkYOff2 = nYOff + 1 +
1813 886 : static_cast<int>(ceil((nDstYOff + nDstYCount) *
1814 : dfYRatioDstToSrc));
1815 886 : if (nChunkYOff2 > nRasterYSize)
1816 133 : nChunkYOff2 = nRasterYSize;
1817 886 : int nYCount = nChunkYOff2 - nChunkYOff;
1818 886 : CPLAssert(nYCount <= nFullResYChunk);
1819 :
1820 886 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
1821 886 : int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
1822 886 : if (nChunkYOffQueried < 0)
1823 : {
1824 136 : nChunkYSizeQueried += nChunkYOffQueried;
1825 136 : nChunkYOffQueried = 0;
1826 : }
1827 886 : if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
1828 151 : nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
1829 886 : CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
1830 :
1831 : int nDstXOff;
1832 1772 : for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
1833 886 : nDstXOff += nDstBlockXSize)
1834 : {
1835 : int nDstXCount;
1836 886 : if (nDstXOff + nDstBlockXSize <= nBufXSize)
1837 886 : nDstXCount = nDstBlockXSize;
1838 : else
1839 0 : nDstXCount = nBufXSize - nDstXOff;
1840 :
1841 886 : int nChunkXOff =
1842 886 : nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
1843 886 : int nChunkXOff2 =
1844 886 : nXOff + 1 +
1845 886 : static_cast<int>(
1846 886 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
1847 886 : if (nChunkXOff2 > nRasterXSize)
1848 641 : nChunkXOff2 = nRasterXSize;
1849 886 : int nXCount = nChunkXOff2 - nChunkXOff;
1850 886 : CPLAssert(nXCount <= nFullResXChunk);
1851 :
1852 886 : int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
1853 886 : int nChunkXSizeQueried =
1854 886 : nXCount + 2 * nKernelRadius * nOvrFactor;
1855 886 : if (nChunkXOffQueried < 0)
1856 : {
1857 641 : nChunkXSizeQueried += nChunkXOffQueried;
1858 641 : nChunkXOffQueried = 0;
1859 : }
1860 886 : if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
1861 649 : nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
1862 886 : CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
1863 :
1864 886 : bool bSkipResample = false;
1865 886 : bool bNoDataMaskFullyOpaque = false;
1866 886 : if (eErr == CE_None && bUseNoDataMask)
1867 : {
1868 617 : eErr = poMaskBand->RasterIO(
1869 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1870 : nChunkXSizeQueried, nChunkYSizeQueried,
1871 : pabyChunkNoDataMask, nChunkXSizeQueried,
1872 : nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
1873 :
1874 : /* Optimizations if mask is fully opaque or transparent */
1875 617 : const int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
1876 617 : const GByte bVal = pabyChunkNoDataMask[0];
1877 617 : int i = 1; // Used after for.
1878 48197000 : for (; i < nPixels; i++)
1879 : {
1880 48196500 : if (pabyChunkNoDataMask[i] != bVal)
1881 72 : break;
1882 : }
1883 617 : if (i == nPixels)
1884 : {
1885 545 : if (bVal == 0)
1886 : {
1887 373 : GByte abyZero[16] = {0};
1888 780 : for (int iBand = 0; iBand < nBandCount; iBand++)
1889 : {
1890 3499 : for (int j = 0; j < nDstYCount; j++)
1891 : {
1892 3092 : GDALCopyWords64(
1893 : abyZero, GDT_UInt8, 0,
1894 : static_cast<GByte *>(pData) +
1895 3092 : iBand * nBandSpace +
1896 3092 : nLineSpace * (j + nDstYOff) +
1897 3092 : nDstXOff * nPixelSpace,
1898 : eBufType, static_cast<int>(nPixelSpace),
1899 : nDstXCount);
1900 : }
1901 : }
1902 373 : bSkipResample = true;
1903 : }
1904 : else
1905 : {
1906 172 : bNoDataMaskFullyOpaque = true;
1907 : }
1908 : }
1909 : }
1910 :
1911 886 : if (!bSkipResample && eErr == CE_None)
1912 : {
1913 : /* Read the source buffers */
1914 510 : eErr = RasterIO(
1915 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1916 : nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
1917 : nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType,
1918 : nBandCount, panBandMap, 0, 0, 0, nullptr);
1919 : }
1920 :
1921 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1922 : if (pfnResampleFuncMultiBands && !bSkipResample &&
1923 : eErr == CE_None)
1924 : {
1925 : eErr = pfnResampleFuncMultiBands(
1926 : dfXRatioDstToSrc, dfYRatioDstToSrc,
1927 : dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
1928 : dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
1929 : eWrkDataType, (GByte *)pChunk, nBandCount,
1930 : bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
1931 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
1932 : nChunkXSizeQueried,
1933 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
1934 : nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
1935 : nDstXOff + nDestXOffVirtual + nDstXCount,
1936 : nDstYOff + nDestYOffVirtual,
1937 : nDstYOff + nDestYOffVirtual + nDstYCount, papoDstBands,
1938 : pszResampling, FALSE /*bHasNoData*/,
1939 : 0.0 /* dfNoDataValue */, nullptr /* color table*/,
1940 : eDataType);
1941 : }
1942 : else
1943 : #endif
1944 : {
1945 : size_t nChunkBandOffset =
1946 886 : static_cast<size_t>(nChunkXSizeQueried) *
1947 886 : nChunkYSizeQueried *
1948 886 : GDALGetDataTypeSizeBytes(eWrkDataType);
1949 2462 : for (int i = 0;
1950 2462 : i < nBandCount && !bSkipResample && eErr == CE_None;
1951 : i++)
1952 : {
1953 1576 : const bool bPropagateNoData = false;
1954 1576 : void *pDstBuffer = nullptr;
1955 1576 : GDALDataType eDstBufferDataType = GDT_Unknown;
1956 : GDALRasterBand *poMEMBand =
1957 1576 : poMEMDS->GetRasterBand(i + 1);
1958 1576 : GDALOverviewResampleArgs args;
1959 1576 : args.eSrcDataType = eDataType;
1960 1576 : args.eOvrDataType = poMEMBand->GetRasterDataType();
1961 1576 : args.nOvrXSize = poMEMBand->GetXSize();
1962 1576 : args.nOvrYSize = poMEMBand->GetYSize();
1963 1576 : args.nOvrNBITS = nNBITS;
1964 1576 : args.dfXRatioDstToSrc = dfXRatioDstToSrc;
1965 1576 : args.dfYRatioDstToSrc = dfYRatioDstToSrc;
1966 1576 : args.dfSrcXDelta =
1967 1576 : dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
1968 1576 : args.dfSrcYDelta =
1969 1576 : dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
1970 1576 : args.eWrkDataType = eWrkDataType;
1971 1576 : args.pabyChunkNodataMask = bNoDataMaskFullyOpaque
1972 1576 : ? nullptr
1973 : : pabyChunkNoDataMask;
1974 1576 : args.nChunkXOff =
1975 1576 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
1976 1576 : args.nChunkXSize = nChunkXSizeQueried;
1977 1576 : args.nChunkYOff =
1978 1576 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
1979 1576 : args.nChunkYSize = nChunkYSizeQueried;
1980 1576 : args.nDstXOff = nDstXOff + nDestXOffVirtual;
1981 1576 : args.nDstXOff2 =
1982 1576 : nDstXOff + nDestXOffVirtual + nDstXCount;
1983 1576 : args.nDstYOff = nDstYOff + nDestYOffVirtual;
1984 1576 : args.nDstYOff2 =
1985 1576 : nDstYOff + nDestYOffVirtual + nDstYCount;
1986 1576 : args.pszResampling = pszResampling;
1987 1576 : args.bHasNoData = false;
1988 1576 : args.dfNoDataValue = 0.0;
1989 1576 : args.poColorTable = nullptr;
1990 1576 : args.bPropagateNoData = bPropagateNoData;
1991 :
1992 : eErr =
1993 3152 : pfnResampleFunc(args,
1994 1576 : reinterpret_cast<GByte *>(pChunk) +
1995 1576 : i * nChunkBandOffset,
1996 : &pDstBuffer, &eDstBufferDataType);
1997 1576 : if (eErr == CE_None)
1998 : {
1999 1576 : eErr = poMEMBand->RasterIO(
2000 : GF_Write, nDstXOff + nDestXOffVirtual,
2001 : nDstYOff + nDestYOffVirtual, nDstXCount,
2002 : nDstYCount, pDstBuffer, nDstXCount, nDstYCount,
2003 : eDstBufferDataType, 0, 0, nullptr);
2004 : }
2005 1576 : CPLFree(pDstBuffer);
2006 : }
2007 : }
2008 :
2009 886 : nBlocksDone++;
2010 1275 : if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
2011 389 : !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
2012 : "", psExtraArg->pProgressData))
2013 : {
2014 0 : eErr = CE_Failure;
2015 : }
2016 : }
2017 : }
2018 :
2019 886 : CPLFree(pChunk);
2020 886 : CPLFree(pabyChunkNoDataMask);
2021 : }
2022 :
2023 886 : CPLFree(papoDstBands);
2024 886 : GDALClose(poMEMDS);
2025 :
2026 886 : return eErr;
2027 : }
2028 :
2029 : //! @endcond
2030 :
2031 : /************************************************************************/
2032 : /* GDALSwapWords() */
2033 : /************************************************************************/
2034 :
2035 : /**
2036 : * Byte swap words in-place.
2037 : *
2038 : * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
2039 : * a memory array. No assumption is made that the words being swapped are
2040 : * word aligned in memory. Use the CPL_LSB and CPL_MSB macros from cpl_port.h
2041 : * to determine if the current platform is big endian or little endian. Use
2042 : * The macros like CPL_SWAP32() to byte swap single values without the overhead
2043 : * of a function call.
2044 : *
2045 : * @param pData pointer to start of data buffer.
2046 : * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
2047 : * @param nWordCount the number of words to be swapped in this call.
2048 : * @param nWordSkip the byte offset from the start of one word to the start of
2049 : * the next. For packed buffers this is the same as nWordSize.
2050 : */
2051 :
2052 497143 : void CPL_STDCALL GDALSwapWords(void *pData, int nWordSize, int nWordCount,
2053 : int nWordSkip)
2054 :
2055 : {
2056 497143 : if (nWordCount > 0)
2057 497143 : VALIDATE_POINTER0(pData, "GDALSwapWords");
2058 :
2059 497143 : GByte *pabyData = static_cast<GByte *>(pData);
2060 :
2061 497143 : switch (nWordSize)
2062 : {
2063 7234 : case 1:
2064 7234 : break;
2065 :
2066 476903 : case 2:
2067 476903 : CPLAssert(nWordSkip >= 2 || nWordCount == 1);
2068 228062000 : for (int i = 0; i < nWordCount; i++)
2069 : {
2070 227585000 : CPL_SWAP16PTR(pabyData);
2071 227585000 : pabyData += nWordSkip;
2072 : }
2073 476903 : break;
2074 :
2075 10580 : case 4:
2076 10580 : CPLAssert(nWordSkip >= 4 || nWordCount == 1);
2077 10580 : if (CPL_IS_ALIGNED(pabyData, 4) && (nWordSkip % 4) == 0)
2078 : {
2079 29140500 : for (int i = 0; i < nWordCount; i++)
2080 : {
2081 29130000 : *reinterpret_cast<GUInt32 *>(pabyData) = CPL_SWAP32(
2082 : *reinterpret_cast<const GUInt32 *>(pabyData));
2083 29130000 : pabyData += nWordSkip;
2084 10577 : }
2085 : }
2086 : else
2087 : {
2088 9 : for (int i = 0; i < nWordCount; i++)
2089 : {
2090 6 : CPL_SWAP32PTR(pabyData);
2091 6 : pabyData += nWordSkip;
2092 : }
2093 : }
2094 10580 : break;
2095 :
2096 2426 : case 8:
2097 2426 : CPLAssert(nWordSkip >= 8 || nWordCount == 1);
2098 2426 : if (CPL_IS_ALIGNED(pabyData, 8) && (nWordSkip % 8) == 0)
2099 : {
2100 3356900 : for (int i = 0; i < nWordCount; i++)
2101 : {
2102 3354480 : *reinterpret_cast<GUInt64 *>(pabyData) = CPL_SWAP64(
2103 : *reinterpret_cast<const GUInt64 *>(pabyData));
2104 3354480 : pabyData += nWordSkip;
2105 2425 : }
2106 : }
2107 : else
2108 : {
2109 3 : for (int i = 0; i < nWordCount; i++)
2110 : {
2111 2 : CPL_SWAP64PTR(pabyData);
2112 2 : pabyData += nWordSkip;
2113 : }
2114 : }
2115 2426 : break;
2116 :
2117 0 : default:
2118 0 : CPLAssert(false);
2119 : }
2120 : }
2121 :
2122 : /************************************************************************/
2123 : /* GDALSwapWordsEx() */
2124 : /************************************************************************/
2125 :
2126 : /**
2127 : * Byte swap words in-place.
2128 : *
2129 : * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
2130 : * a memory array. No assumption is made that the words being swapped are
2131 : * word aligned in memory. Use the CPL_LSB and CPL_MSB macros from cpl_port.h
2132 : * to determine if the current platform is big endian or little endian. Use
2133 : * The macros like CPL_SWAP32() to byte swap single values without the overhead
2134 : * of a function call.
2135 : *
2136 : * @param pData pointer to start of data buffer.
2137 : * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
2138 : * @param nWordCount the number of words to be swapped in this call.
2139 : * @param nWordSkip the byte offset from the start of one word to the start of
2140 : * the next. For packed buffers this is the same as nWordSize.
2141 : */
2142 6124 : void CPL_STDCALL GDALSwapWordsEx(void *pData, int nWordSize, size_t nWordCount,
2143 : int nWordSkip)
2144 : {
2145 6124 : GByte *pabyData = static_cast<GByte *>(pData);
2146 12248 : while (nWordCount)
2147 : {
2148 : // Pick-up a multiple of 8 as max chunk size.
2149 6124 : const int nWordCountSmall =
2150 6124 : (nWordCount > (1 << 30)) ? (1 << 30) : static_cast<int>(nWordCount);
2151 6124 : GDALSwapWords(pabyData, nWordSize, nWordCountSmall, nWordSkip);
2152 6124 : pabyData += static_cast<size_t>(nWordSkip) * nWordCountSmall;
2153 6124 : nWordCount -= nWordCountSmall;
2154 : }
2155 6124 : }
2156 :
2157 : // Place the new GDALCopyWords helpers in an anonymous namespace
2158 : namespace
2159 : {
2160 :
2161 : /************************************************************************/
2162 : /* GDALCopyWordsT() */
2163 : /************************************************************************/
2164 : /**
2165 : * Template function, used to copy data from pSrcData into buffer
2166 : * pDstData, with stride nSrcPixelStride in the source data and
2167 : * stride nDstPixelStride in the destination data. This template can
2168 : * deal with the case where the input data type is real or complex and
2169 : * the output is real.
2170 : *
2171 : * @param pSrcData the source data buffer
2172 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
2173 : * of interest.
2174 : * @param pDstData the destination buffer.
2175 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
2176 : * interest.
2177 : * @param nWordCount the total number of pixel words to copy
2178 : *
2179 : * @code
2180 : * // Assume an input buffer of type GUInt16 named pBufferIn
2181 : * GByte *pBufferOut = new GByte[numBytesOut];
2182 : * GDALCopyWordsT<GUInt16, GByte>(pSrcData, 2, pDstData, 1, numBytesOut);
2183 : * @endcode
2184 : * @note
2185 : * This is a private function, and should not be exposed outside of
2186 : * rasterio.cpp. External users should call the GDALCopyWords driver function.
2187 : */
2188 :
2189 : template <class Tin, class Tout>
2190 49004022 : static void inline GDALCopyWordsGenericT(const Tin *const CPL_RESTRICT pSrcData,
2191 : int nSrcPixelStride,
2192 : Tout *const CPL_RESTRICT pDstData,
2193 : int nDstPixelStride,
2194 : GPtrDiff_t nWordCount)
2195 : {
2196 49004022 : decltype(nWordCount) nDstOffset = 0;
2197 :
2198 49004022 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
2199 49004022 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
2200 356635480 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
2201 : {
2202 307631416 : const Tin tValue =
2203 307631416 : *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
2204 307631416 : Tout *const pOutPixel =
2205 307631416 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2206 :
2207 307631416 : GDALCopyWord(tValue, *pOutPixel);
2208 :
2209 307631416 : nDstOffset += nDstPixelStride;
2210 : }
2211 49004022 : }
2212 :
2213 : template <class Tin, class Tout>
2214 29766045 : static void CPL_NOINLINE GDALCopyWordsT(const Tin *const CPL_RESTRICT pSrcData,
2215 : int nSrcPixelStride,
2216 : Tout *const CPL_RESTRICT pDstData,
2217 : int nDstPixelStride,
2218 : GPtrDiff_t nWordCount)
2219 : {
2220 29766045 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData, nDstPixelStride,
2221 : nWordCount);
2222 29766045 : }
2223 :
2224 : template <class Tin, class Tout>
2225 5094306 : static void inline GDALCopyWordsT_8atatime(
2226 : const Tin *const CPL_RESTRICT pSrcData, int nSrcPixelStride,
2227 : Tout *const CPL_RESTRICT pDstData, int nDstPixelStride,
2228 : GPtrDiff_t nWordCount)
2229 : {
2230 5094306 : decltype(nWordCount) nDstOffset = 0;
2231 :
2232 5094306 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
2233 5094306 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
2234 5094306 : decltype(nWordCount) n = 0;
2235 5094306 : if (nSrcPixelStride == static_cast<int>(sizeof(Tin)) &&
2236 : nDstPixelStride == static_cast<int>(sizeof(Tout)))
2237 : {
2238 57871497 : for (; n < nWordCount - 7; n += 8)
2239 : {
2240 57326656 : const Tin *pInValues = reinterpret_cast<const Tin *>(
2241 57326656 : pSrcDataPtr + (n * nSrcPixelStride));
2242 57326656 : Tout *const pOutPixels =
2243 57326656 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2244 :
2245 57326656 : GDALCopy8Words(pInValues, pOutPixels);
2246 :
2247 57326656 : nDstOffset += 8 * nDstPixelStride;
2248 : }
2249 : }
2250 10491671 : for (; n < nWordCount; n++)
2251 : {
2252 5397365 : const Tin tValue =
2253 5397365 : *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
2254 5397365 : Tout *const pOutPixel =
2255 5397365 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2256 :
2257 5397365 : GDALCopyWord(tValue, *pOutPixel);
2258 :
2259 5397365 : nDstOffset += nDstPixelStride;
2260 : }
2261 5094306 : }
2262 :
2263 : #ifdef HAVE_SSE2
2264 :
2265 : template <class Tout>
2266 1042120 : void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
2267 : int nSrcPixelStride,
2268 : Tout *const CPL_RESTRICT pDstData,
2269 : int nDstPixelStride, GPtrDiff_t nWordCount)
2270 : {
2271 : static_assert(std::is_integral<Tout>::value &&
2272 : sizeof(Tout) == sizeof(uint16_t),
2273 : "Bad Tout");
2274 1042120 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2275 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2276 : {
2277 35766 : decltype(nWordCount) n = 0;
2278 35766 : const __m128i xmm_zero = _mm_setzero_si128();
2279 35766 : GByte *CPL_RESTRICT pabyDstDataPtr =
2280 : reinterpret_cast<GByte *>(pDstData);
2281 1478162 : for (; n < nWordCount - 15; n += 16)
2282 : {
2283 1442396 : __m128i xmm = _mm_loadu_si128(
2284 1442396 : reinterpret_cast<const __m128i *>(pSrcData + n));
2285 1442396 : __m128i xmm0 = _mm_unpacklo_epi8(xmm, xmm_zero);
2286 1442396 : __m128i xmm1 = _mm_unpackhi_epi8(xmm, xmm_zero);
2287 : _mm_storeu_si128(
2288 1442396 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2), xmm0);
2289 : _mm_storeu_si128(
2290 1442396 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2 + 16), xmm1);
2291 : }
2292 111789 : for (; n < nWordCount; n++)
2293 : {
2294 76023 : pDstData[n] = pSrcData[n];
2295 35766 : }
2296 : }
2297 : else
2298 : {
2299 1006351 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2300 : nDstPixelStride, nWordCount);
2301 : }
2302 1042120 : }
2303 :
2304 : template <>
2305 1029380 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2306 : int nSrcPixelStride,
2307 : GUInt16 *const CPL_RESTRICT pDstData,
2308 : int nDstPixelStride, GPtrDiff_t nWordCount)
2309 : {
2310 1029380 : GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
2311 : nDstPixelStride, nWordCount);
2312 1029380 : }
2313 :
2314 : template <>
2315 12740 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2316 : int nSrcPixelStride,
2317 : GInt16 *const CPL_RESTRICT pDstData,
2318 : int nDstPixelStride, GPtrDiff_t nWordCount)
2319 : {
2320 12740 : GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
2321 : nDstPixelStride, nWordCount);
2322 12740 : }
2323 :
2324 : template <class Tout>
2325 16237076 : void GDALCopyWordsByteTo32Bit(const GByte *const CPL_RESTRICT pSrcData,
2326 : int nSrcPixelStride,
2327 : Tout *const CPL_RESTRICT pDstData,
2328 : int nDstPixelStride, GPtrDiff_t nWordCount)
2329 : {
2330 : static_assert(std::is_integral<Tout>::value &&
2331 : sizeof(Tout) == sizeof(uint32_t),
2332 : "Bad Tout");
2333 16237076 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2334 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2335 : {
2336 6532606 : decltype(nWordCount) n = 0;
2337 6532606 : const __m128i xmm_zero = _mm_setzero_si128();
2338 6532606 : GByte *CPL_RESTRICT pabyDstDataPtr =
2339 : reinterpret_cast<GByte *>(pDstData);
2340 74248627 : for (; n < nWordCount - 15; n += 16)
2341 : {
2342 67715961 : __m128i xmm = _mm_loadu_si128(
2343 67715961 : reinterpret_cast<const __m128i *>(pSrcData + n));
2344 67715961 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2345 67715961 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2346 67715961 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2347 67715961 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2348 67715961 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2349 67715961 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2350 : _mm_storeu_si128(
2351 67715961 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4), xmm0);
2352 : _mm_storeu_si128(
2353 67715961 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 16), xmm1);
2354 : _mm_storeu_si128(
2355 67715961 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 32), xmm2);
2356 : _mm_storeu_si128(
2357 67715961 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 48), xmm3);
2358 : }
2359 14825316 : for (; n < nWordCount; n++)
2360 : {
2361 8292760 : pDstData[n] = pSrcData[n];
2362 6532606 : }
2363 : }
2364 : else
2365 : {
2366 9704490 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2367 : nDstPixelStride, nWordCount);
2368 : }
2369 16237076 : }
2370 :
2371 : template <>
2372 476 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2373 : int nSrcPixelStride,
2374 : GUInt32 *const CPL_RESTRICT pDstData,
2375 : int nDstPixelStride, GPtrDiff_t nWordCount)
2376 : {
2377 476 : GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
2378 : nDstPixelStride, nWordCount);
2379 476 : }
2380 :
2381 : template <>
2382 16236600 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2383 : int nSrcPixelStride,
2384 : GInt32 *const CPL_RESTRICT pDstData,
2385 : int nDstPixelStride, GPtrDiff_t nWordCount)
2386 : {
2387 16236600 : GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
2388 : nDstPixelStride, nWordCount);
2389 16236600 : }
2390 :
2391 : template <>
2392 2856070 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2393 : int nSrcPixelStride,
2394 : float *const CPL_RESTRICT pDstData,
2395 : int nDstPixelStride, GPtrDiff_t nWordCount)
2396 : {
2397 2856070 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2398 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2399 : {
2400 233183 : decltype(nWordCount) n = 0;
2401 233183 : const __m128i xmm_zero = _mm_setzero_si128();
2402 233183 : GByte *CPL_RESTRICT pabyDstDataPtr =
2403 : reinterpret_cast<GByte *>(pDstData);
2404 4780370 : for (; n < nWordCount - 15; n += 16)
2405 : {
2406 4547190 : __m128i xmm = _mm_loadu_si128(
2407 4547190 : reinterpret_cast<const __m128i *>(pSrcData + n));
2408 4547190 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2409 4547190 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2410 4547190 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2411 4547190 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2412 4547190 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2413 4547190 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2414 4547190 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2415 4547190 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2416 4547190 : __m128 xmm2_f = _mm_cvtepi32_ps(xmm2);
2417 4547190 : __m128 xmm3_f = _mm_cvtepi32_ps(xmm3);
2418 4547190 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2419 : xmm0_f);
2420 : _mm_storeu_ps(
2421 4547190 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2422 : _mm_storeu_ps(
2423 4547190 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 32), xmm2_f);
2424 : _mm_storeu_ps(
2425 4547190 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 48), xmm3_f);
2426 : }
2427 957106 : for (; n < nWordCount; n++)
2428 : {
2429 723923 : pDstData[n] = pSrcData[n];
2430 233183 : }
2431 : }
2432 : else
2433 : {
2434 2622880 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2435 : nDstPixelStride, nWordCount);
2436 : }
2437 2856070 : }
2438 :
2439 : template <>
2440 170733 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2441 : int nSrcPixelStride,
2442 : double *const CPL_RESTRICT pDstData,
2443 : int nDstPixelStride, GPtrDiff_t nWordCount)
2444 : {
2445 170733 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2446 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2447 : {
2448 146935 : decltype(nWordCount) n = 0;
2449 146935 : const __m128i xmm_zero = _mm_setzero_si128();
2450 146935 : GByte *CPL_RESTRICT pabyDstDataPtr =
2451 : reinterpret_cast<GByte *>(pDstData);
2452 3126930 : for (; n < nWordCount - 15; n += 16)
2453 : {
2454 2979990 : __m128i xmm = _mm_loadu_si128(
2455 2979990 : reinterpret_cast<const __m128i *>(pSrcData + n));
2456 2979990 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2457 2979990 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2458 2979990 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2459 2979990 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2460 2979990 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2461 2979990 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2462 :
2463 : #if defined(__AVX2__) && defined(slightly_slower_than_SSE2)
2464 : _mm256_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2465 : _mm256_cvtepi32_pd(xmm0));
2466 : _mm256_storeu_pd(
2467 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2468 : _mm256_cvtepi32_pd(xmm1));
2469 : _mm256_storeu_pd(
2470 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
2471 : _mm256_cvtepi32_pd(xmm2));
2472 : _mm256_storeu_pd(
2473 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
2474 : _mm256_cvtepi32_pd(xmm3));
2475 : #else
2476 2979990 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2477 2979990 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2478 2979990 : __m128d xmm2_low_d = _mm_cvtepi32_pd(xmm2);
2479 2979990 : __m128d xmm3_low_d = _mm_cvtepi32_pd(xmm3);
2480 2979990 : xmm0 = _mm_srli_si128(xmm0, 8);
2481 2979990 : xmm1 = _mm_srli_si128(xmm1, 8);
2482 2979990 : xmm2 = _mm_srli_si128(xmm2, 8);
2483 2979990 : xmm3 = _mm_srli_si128(xmm3, 8);
2484 2979990 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2485 2979990 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2486 2979990 : __m128d xmm2_high_d = _mm_cvtepi32_pd(xmm2);
2487 2979990 : __m128d xmm3_high_d = _mm_cvtepi32_pd(xmm3);
2488 :
2489 2979990 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2490 : xmm0_low_d);
2491 : _mm_storeu_pd(
2492 2979990 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2493 : xmm0_high_d);
2494 : _mm_storeu_pd(
2495 2979990 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2496 : xmm1_low_d);
2497 : _mm_storeu_pd(
2498 2979990 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2499 : xmm1_high_d);
2500 : _mm_storeu_pd(
2501 2979990 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
2502 : xmm2_low_d);
2503 : _mm_storeu_pd(
2504 2979990 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 80),
2505 : xmm2_high_d);
2506 : _mm_storeu_pd(
2507 2979990 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
2508 : xmm3_low_d);
2509 : _mm_storeu_pd(
2510 2979990 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 112),
2511 : xmm3_high_d);
2512 : #endif
2513 : }
2514 280278 : for (; n < nWordCount; n++)
2515 : {
2516 133343 : pDstData[n] = pSrcData[n];
2517 146935 : }
2518 : }
2519 : else
2520 : {
2521 23798 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2522 : nDstPixelStride, nWordCount);
2523 : }
2524 170733 : }
2525 :
2526 : template <>
2527 148 : CPL_NOINLINE void GDALCopyWordsT(const uint8_t *const CPL_RESTRICT pSrcData,
2528 : int nSrcPixelStride,
2529 : int8_t *const CPL_RESTRICT pDstData,
2530 : int nDstPixelStride, GPtrDiff_t nWordCount)
2531 : {
2532 148 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2533 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2534 : {
2535 142 : decltype(nWordCount) n = 0;
2536 142 : const __m128i xmm_127 = _mm_set1_epi8(127);
2537 146 : for (; n < nWordCount - 31; n += 32)
2538 : {
2539 8 : __m128i xmm0 = _mm_loadu_si128(
2540 4 : reinterpret_cast<const __m128i *>(pSrcData + n));
2541 4 : __m128i xmm1 = _mm_loadu_si128(
2542 4 : reinterpret_cast<const __m128i *>(pSrcData + n + 16));
2543 4 : xmm0 = _mm_min_epu8(xmm0, xmm_127);
2544 4 : xmm1 = _mm_min_epu8(xmm1, xmm_127);
2545 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2546 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
2547 : xmm1);
2548 : }
2549 2424 : for (; n < nWordCount; n++)
2550 : {
2551 2282 : pDstData[n] =
2552 2282 : pSrcData[n] >= 127 ? 127 : static_cast<int8_t>(pSrcData[n]);
2553 142 : }
2554 : }
2555 : else
2556 : {
2557 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2558 : nDstPixelStride, nWordCount);
2559 : }
2560 148 : }
2561 :
2562 : template <>
2563 82 : CPL_NOINLINE void GDALCopyWordsT(const int8_t *const CPL_RESTRICT pSrcData,
2564 : int nSrcPixelStride,
2565 : uint8_t *const CPL_RESTRICT pDstData,
2566 : int nDstPixelStride, GPtrDiff_t nWordCount)
2567 : {
2568 82 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2569 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2570 : {
2571 56 : decltype(nWordCount) n = 0;
2572 : #if !(defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS))
2573 56 : const __m128i xmm_INT8_to_UINT8 = _mm_set1_epi8(-128);
2574 : #endif
2575 117 : for (; n < nWordCount - 31; n += 32)
2576 : {
2577 122 : __m128i xmm0 = _mm_loadu_si128(
2578 61 : reinterpret_cast<const __m128i *>(pSrcData + n));
2579 61 : __m128i xmm1 = _mm_loadu_si128(
2580 61 : reinterpret_cast<const __m128i *>(pSrcData + n + 16));
2581 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2582 : xmm0 = _mm_max_epi8(xmm0, _mm_setzero_si128());
2583 : xmm1 = _mm_max_epi8(xmm1, _mm_setzero_si128());
2584 : #else
2585 61 : xmm0 = _mm_add_epi8(xmm0, xmm_INT8_to_UINT8);
2586 61 : xmm1 = _mm_add_epi8(xmm1, xmm_INT8_to_UINT8);
2587 61 : xmm0 = _mm_max_epu8(xmm0, xmm_INT8_to_UINT8);
2588 61 : xmm1 = _mm_max_epu8(xmm1, xmm_INT8_to_UINT8);
2589 61 : xmm0 = _mm_sub_epi8(xmm0, xmm_INT8_to_UINT8);
2590 61 : xmm1 = _mm_sub_epi8(xmm1, xmm_INT8_to_UINT8);
2591 : #endif
2592 61 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2593 61 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
2594 : xmm1);
2595 : }
2596 352 : for (; n < nWordCount; n++)
2597 : {
2598 296 : pDstData[n] =
2599 296 : pSrcData[n] < 0 ? 0 : static_cast<uint8_t>(pSrcData[n]);
2600 56 : }
2601 : }
2602 : else
2603 : {
2604 26 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2605 : nDstPixelStride, nWordCount);
2606 : }
2607 82 : }
2608 :
2609 : template <>
2610 6037 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2611 : int nSrcPixelStride,
2612 : uint8_t *const CPL_RESTRICT pDstData,
2613 : int nDstPixelStride, GPtrDiff_t nWordCount)
2614 : {
2615 6037 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2616 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2617 : {
2618 5062 : decltype(nWordCount) n = 0;
2619 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2620 : const auto xmm_MAX_INT16 = _mm_set1_epi16(32767);
2621 : #else
2622 : // In SSE2, min_epu16 does not exist, so shift from
2623 : // UInt16 to SInt16 to be able to use min_epi16
2624 5062 : const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
2625 5062 : const __m128i xmm_m255_shifted = _mm_set1_epi16(255 - 32768);
2626 : #endif
2627 71888 : for (; n < nWordCount - 15; n += 16)
2628 : {
2629 133652 : __m128i xmm0 = _mm_loadu_si128(
2630 66826 : reinterpret_cast<const __m128i *>(pSrcData + n));
2631 66826 : __m128i xmm1 = _mm_loadu_si128(
2632 66826 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2633 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2634 : xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
2635 : xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
2636 : #else
2637 66826 : xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
2638 66826 : xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
2639 66826 : xmm0 = _mm_min_epi16(xmm0, xmm_m255_shifted);
2640 66826 : xmm1 = _mm_min_epi16(xmm1, xmm_m255_shifted);
2641 66826 : xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
2642 66826 : xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
2643 : #endif
2644 66826 : xmm0 = _mm_packus_epi16(xmm0, xmm1);
2645 66826 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2646 : }
2647 16403 : for (; n < nWordCount; n++)
2648 : {
2649 11341 : pDstData[n] =
2650 11341 : pSrcData[n] >= 255 ? 255 : static_cast<uint8_t>(pSrcData[n]);
2651 5062 : }
2652 : }
2653 : else
2654 : {
2655 975 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2656 : nDstPixelStride, nWordCount);
2657 : }
2658 6037 : }
2659 :
2660 : template <>
2661 46 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2662 : int nSrcPixelStride,
2663 : int16_t *const CPL_RESTRICT pDstData,
2664 : int nDstPixelStride, GPtrDiff_t nWordCount)
2665 : {
2666 46 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2667 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2668 : {
2669 40 : decltype(nWordCount) n = 0;
2670 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2671 : const __m128i xmm_MAX_INT16 = _mm_set1_epi16(32767);
2672 : #else
2673 : // In SSE2, min_epu16 does not exist, so shift from
2674 : // UInt16 to SInt16 to be able to use min_epi16
2675 40 : const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
2676 40 : const __m128i xmm_32767_shifted = _mm_set1_epi16(32767 - 32768);
2677 : #endif
2678 169 : for (; n < nWordCount - 15; n += 16)
2679 : {
2680 258 : __m128i xmm0 = _mm_loadu_si128(
2681 129 : reinterpret_cast<const __m128i *>(pSrcData + n));
2682 129 : __m128i xmm1 = _mm_loadu_si128(
2683 129 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2684 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2685 : xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
2686 : xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
2687 : #else
2688 129 : xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
2689 129 : xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
2690 129 : xmm0 = _mm_min_epi16(xmm0, xmm_32767_shifted);
2691 129 : xmm1 = _mm_min_epi16(xmm1, xmm_32767_shifted);
2692 129 : xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
2693 129 : xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
2694 : #endif
2695 129 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2696 129 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
2697 : xmm1);
2698 : }
2699 191 : for (; n < nWordCount; n++)
2700 : {
2701 282 : pDstData[n] = pSrcData[n] >= 32767
2702 : ? 32767
2703 131 : : static_cast<int16_t>(pSrcData[n]);
2704 40 : }
2705 : }
2706 : else
2707 : {
2708 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2709 : nDstPixelStride, nWordCount);
2710 : }
2711 46 : }
2712 :
2713 : template <>
2714 136 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2715 : int nSrcPixelStride,
2716 : uint16_t *const CPL_RESTRICT pDstData,
2717 : int nDstPixelStride, GPtrDiff_t nWordCount)
2718 : {
2719 136 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2720 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2721 : {
2722 93 : decltype(nWordCount) n = 0;
2723 93 : const __m128i xmm_zero = _mm_setzero_si128();
2724 278 : for (; n < nWordCount - 15; n += 16)
2725 : {
2726 370 : __m128i xmm0 = _mm_loadu_si128(
2727 185 : reinterpret_cast<const __m128i *>(pSrcData + n));
2728 185 : __m128i xmm1 = _mm_loadu_si128(
2729 185 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2730 185 : xmm0 = _mm_max_epi16(xmm0, xmm_zero);
2731 185 : xmm1 = _mm_max_epi16(xmm1, xmm_zero);
2732 185 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2733 185 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
2734 : xmm1);
2735 : }
2736 471 : for (; n < nWordCount; n++)
2737 : {
2738 378 : pDstData[n] =
2739 378 : pSrcData[n] < 0 ? 0 : static_cast<uint16_t>(pSrcData[n]);
2740 93 : }
2741 : }
2742 : else
2743 : {
2744 43 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2745 : nDstPixelStride, nWordCount);
2746 : }
2747 136 : }
2748 :
2749 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2750 :
2751 : template <>
2752 : CPL_NOINLINE void GDALCopyWordsT(const uint32_t *const CPL_RESTRICT pSrcData,
2753 : int nSrcPixelStride,
2754 : int32_t *const CPL_RESTRICT pDstData,
2755 : int nDstPixelStride, GPtrDiff_t nWordCount)
2756 : {
2757 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2758 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2759 : {
2760 : decltype(nWordCount) n = 0;
2761 : const __m128i xmm_MAX_INT = _mm_set1_epi32(INT_MAX);
2762 : for (; n < nWordCount - 8; n += 7)
2763 : {
2764 : __m128i xmm0 = _mm_loadu_si128(
2765 : reinterpret_cast<const __m128i *>(pSrcData + n));
2766 : __m128i xmm1 = _mm_loadu_si128(
2767 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
2768 : xmm0 = _mm_min_epu32(xmm0, xmm_MAX_INT);
2769 : xmm1 = _mm_min_epu32(xmm1, xmm_MAX_INT);
2770 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2771 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
2772 : xmm1);
2773 : }
2774 : for (; n < nWordCount; n++)
2775 : {
2776 : pDstData[n] = pSrcData[n] >= INT_MAX
2777 : ? INT_MAX
2778 : : static_cast<int32_t>(pSrcData[n]);
2779 : }
2780 : }
2781 : else
2782 : {
2783 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2784 : nDstPixelStride, nWordCount);
2785 : }
2786 : }
2787 :
2788 : template <>
2789 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
2790 : int nSrcPixelStride,
2791 : uint32_t *const CPL_RESTRICT pDstData,
2792 : int nDstPixelStride, GPtrDiff_t nWordCount)
2793 : {
2794 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2795 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2796 : {
2797 : decltype(nWordCount) n = 0;
2798 : const __m128i xmm_zero = _mm_setzero_si128();
2799 : for (; n < nWordCount - 7; n += 8)
2800 : {
2801 : __m128i xmm0 = _mm_loadu_si128(
2802 : reinterpret_cast<const __m128i *>(pSrcData + n));
2803 : __m128i xmm1 = _mm_loadu_si128(
2804 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
2805 : xmm0 = _mm_max_epi32(xmm0, xmm_zero);
2806 : xmm1 = _mm_max_epi32(xmm1, xmm_zero);
2807 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2808 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
2809 : xmm1);
2810 : }
2811 : for (; n < nWordCount; n++)
2812 : {
2813 : pDstData[n] =
2814 : pSrcData[n] < 0 ? 0 : static_cast<uint32_t>(pSrcData[n]);
2815 : }
2816 : }
2817 : else
2818 : {
2819 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2820 : nDstPixelStride, nWordCount);
2821 : }
2822 : }
2823 :
2824 : #endif // defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2825 :
2826 : template <>
2827 343 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2828 : int nSrcPixelStride,
2829 : float *const CPL_RESTRICT pDstData,
2830 : int nDstPixelStride, GPtrDiff_t nWordCount)
2831 : {
2832 343 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2833 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2834 : {
2835 337 : decltype(nWordCount) n = 0;
2836 337 : const __m128i xmm_zero = _mm_setzero_si128();
2837 337 : GByte *CPL_RESTRICT pabyDstDataPtr =
2838 : reinterpret_cast<GByte *>(pDstData);
2839 1508 : for (; n < nWordCount - 7; n += 8)
2840 : {
2841 1171 : __m128i xmm = _mm_loadu_si128(
2842 1171 : reinterpret_cast<const __m128i *>(pSrcData + n));
2843 1171 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
2844 1171 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
2845 1171 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2846 1171 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2847 1171 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2848 : xmm0_f);
2849 : _mm_storeu_ps(
2850 1171 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2851 : }
2852 1115 : for (; n < nWordCount; n++)
2853 : {
2854 778 : pDstData[n] = pSrcData[n];
2855 337 : }
2856 : }
2857 : else
2858 : {
2859 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2860 : nDstPixelStride, nWordCount);
2861 : }
2862 343 : }
2863 :
2864 : template <>
2865 1076640 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2866 : int nSrcPixelStride,
2867 : float *const CPL_RESTRICT pDstData,
2868 : int nDstPixelStride, GPtrDiff_t nWordCount)
2869 : {
2870 1076640 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2871 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2872 : {
2873 86739 : decltype(nWordCount) n = 0;
2874 86739 : GByte *CPL_RESTRICT pabyDstDataPtr =
2875 : reinterpret_cast<GByte *>(pDstData);
2876 586116 : for (; n < nWordCount - 7; n += 8)
2877 : {
2878 499377 : __m128i xmm = _mm_loadu_si128(
2879 499377 : reinterpret_cast<const __m128i *>(pSrcData + n));
2880 499377 : const auto sign = _mm_srai_epi16(xmm, 15);
2881 499377 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
2882 499377 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
2883 499377 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2884 499377 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2885 499377 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2886 : xmm0_f);
2887 : _mm_storeu_ps(
2888 499377 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2889 : }
2890 253879 : for (; n < nWordCount; n++)
2891 : {
2892 167140 : pDstData[n] = pSrcData[n];
2893 86739 : }
2894 : }
2895 : else
2896 : {
2897 989901 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2898 : nDstPixelStride, nWordCount);
2899 : }
2900 1076640 : }
2901 :
2902 : template <>
2903 449 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2904 : int nSrcPixelStride,
2905 : double *const CPL_RESTRICT pDstData,
2906 : int nDstPixelStride, GPtrDiff_t nWordCount)
2907 : {
2908 449 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2909 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2910 : {
2911 313 : decltype(nWordCount) n = 0;
2912 313 : const __m128i xmm_zero = _mm_setzero_si128();
2913 313 : GByte *CPL_RESTRICT pabyDstDataPtr =
2914 : reinterpret_cast<GByte *>(pDstData);
2915 829 : for (; n < nWordCount - 7; n += 8)
2916 : {
2917 516 : __m128i xmm = _mm_loadu_si128(
2918 516 : reinterpret_cast<const __m128i *>(pSrcData + n));
2919 516 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
2920 516 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
2921 :
2922 516 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2923 516 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2924 516 : xmm0 = _mm_srli_si128(xmm0, 8);
2925 516 : xmm1 = _mm_srli_si128(xmm1, 8);
2926 516 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2927 516 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2928 :
2929 516 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2930 : xmm0_low_d);
2931 : _mm_storeu_pd(
2932 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2933 : xmm0_high_d);
2934 : _mm_storeu_pd(
2935 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2936 : xmm1_low_d);
2937 : _mm_storeu_pd(
2938 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2939 : xmm1_high_d);
2940 : }
2941 1082 : for (; n < nWordCount; n++)
2942 : {
2943 769 : pDstData[n] = pSrcData[n];
2944 313 : }
2945 : }
2946 : else
2947 : {
2948 136 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2949 : nDstPixelStride, nWordCount);
2950 : }
2951 449 : }
2952 :
2953 : template <>
2954 4923020 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2955 : int nSrcPixelStride,
2956 : double *const CPL_RESTRICT pDstData,
2957 : int nDstPixelStride, GPtrDiff_t nWordCount)
2958 : {
2959 4923020 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2960 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2961 : {
2962 34621 : decltype(nWordCount) n = 0;
2963 34621 : GByte *CPL_RESTRICT pabyDstDataPtr =
2964 : reinterpret_cast<GByte *>(pDstData);
2965 403422 : for (; n < nWordCount - 7; n += 8)
2966 : {
2967 368801 : __m128i xmm = _mm_loadu_si128(
2968 368801 : reinterpret_cast<const __m128i *>(pSrcData + n));
2969 368801 : const auto sign = _mm_srai_epi16(xmm, 15);
2970 368801 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
2971 368801 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
2972 :
2973 368801 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2974 368801 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2975 368801 : xmm0 = _mm_srli_si128(xmm0, 8);
2976 368801 : xmm1 = _mm_srli_si128(xmm1, 8);
2977 368801 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2978 368801 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2979 :
2980 368801 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2981 : xmm0_low_d);
2982 : _mm_storeu_pd(
2983 368801 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2984 : xmm0_high_d);
2985 : _mm_storeu_pd(
2986 368801 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2987 : xmm1_low_d);
2988 : _mm_storeu_pd(
2989 368801 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2990 : xmm1_high_d);
2991 : }
2992 254691 : for (; n < nWordCount; n++)
2993 : {
2994 220070 : pDstData[n] = pSrcData[n];
2995 34621 : }
2996 : }
2997 : else
2998 : {
2999 4888400 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
3000 : nDstPixelStride, nWordCount);
3001 : }
3002 4923020 : }
3003 :
3004 : // ---- AVX2 helpers for int32 narrowing (runtime dispatch) ----
3005 :
#if defined(HAVE_AVX2_DISPATCH) || defined(HAVE_AVX2_NATIVELY)
// AVX2 kernel: copies nWordCount contiguous int32 values from pSrc to pDst
// as uint8, clamping each value to [0, 255]. 32 values are processed per
// vector iteration; the remainder is handled by a scalar loop.
#if defined(HAVE_AVX2_DISPATCH) && !defined(HAVE_AVX2_DISPATCH_MSVC)
__attribute__((target("avx2")))
#endif
static void GDALCopyWordsInt32ToUInt8_AVX2(const int32_t *CPL_RESTRICT pSrc,
                                           uint8_t *CPL_RESTRICT pDst,
                                           GPtrDiff_t nWordCount)
{
    const __m256i lowerBound = _mm256_setzero_si256();
    const __m256i upperBound = _mm256_set1_epi32(255);
    // The 256-bit packs work within each 128-bit half, so the packed bytes
    // come out as [a0-3 b0-3 c0-3 d0-3 | a4-7 b4-7 c4-7 d4-7]; this 32-bit
    // permutation puts them back into source order.
    const __m256i dwordOrder = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
    GPtrDiff_t i = 0;
    while (i < nWordCount - 31)
    {
        __m256i a =
            _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + i));
        __m256i b = _mm256_loadu_si256(
            reinterpret_cast<const __m256i *>(pSrc + i + 8));
        __m256i c = _mm256_loadu_si256(
            reinterpret_cast<const __m256i *>(pSrc + i + 16));
        __m256i d = _mm256_loadu_si256(
            reinterpret_cast<const __m256i *>(pSrc + i + 24));
        // Clamp to [0, 255]
        a = _mm256_min_epi32(_mm256_max_epi32(a, lowerBound), upperBound);
        b = _mm256_min_epi32(_mm256_max_epi32(b, lowerBound), upperBound);
        c = _mm256_min_epi32(_mm256_max_epi32(c, lowerBound), upperBound);
        d = _mm256_min_epi32(_mm256_max_epi32(d, lowerBound), upperBound);
        // Narrow int32 -> int16 -> uint8, then restore source order
        const __m256i packed = _mm256_packus_epi16(_mm256_packs_epi32(a, b),
                                                   _mm256_packs_epi32(c, d));
        _mm256_storeu_si256(
            reinterpret_cast<__m256i *>(pDst + i),
            _mm256_permutevar8x32_epi32(packed, dwordOrder));
        i += 32;
    }
    for (; i < nWordCount; ++i)
    {
        const int32_t v = pSrc[i];
        if (v <= 0)
            pDst[i] = 0;
        else if (v >= 255)
            pDst[i] = 255;
        else
            pDst[i] = static_cast<uint8_t>(v);
    }
}
#endif  // HAVE_AVX2_DISPATCH || HAVE_AVX2_NATIVELY
3052 :
#if defined(HAVE_AVX2_DISPATCH) || defined(HAVE_AVX2_NATIVELY)
// AVX2 kernel: copies nWordCount contiguous int32 values from pSrc to pDst
// as uint16, clamping each value to [0, 65535]. 16 values are processed per
// vector iteration; the remainder is handled by a scalar loop.
#if defined(HAVE_AVX2_DISPATCH) && !defined(HAVE_AVX2_DISPATCH_MSVC)
__attribute__((target("avx2")))
#endif
static void GDALCopyWordsInt32ToUInt16_AVX2(const int32_t *CPL_RESTRICT pSrc,
                                            uint16_t *CPL_RESTRICT pDst,
                                            GPtrDiff_t nWordCount)
{
    const __m256i lowerBound = _mm256_setzero_si256();
    // _mm256_packus_epi32(a, b) packs within each 128-bit half, giving
    // [a0-3 b0-3 | a4-7 b4-7]. This 32-bit permutation regroups the result
    // so that all values of a come first, followed by all values of b.
    const __m256i dwordOrder = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
    GPtrDiff_t i = 0;
    while (i < nWordCount - 15)
    {
        __m256i a =
            _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + i));
        __m256i b = _mm256_loadu_si256(
            reinterpret_cast<const __m256i *>(pSrc + i + 8));
        // Lower clamp here; the unsigned-saturating pack caps at 65535
        a = _mm256_max_epi32(a, lowerBound);
        b = _mm256_max_epi32(b, lowerBound);
        const __m256i packed = _mm256_packus_epi32(a, b);
        _mm256_storeu_si256(
            reinterpret_cast<__m256i *>(pDst + i),
            _mm256_permutevar8x32_epi32(packed, dwordOrder));
        i += 16;
    }
    for (; i < nWordCount; ++i)
    {
        const int32_t v = pSrc[i];
        if (v <= 0)
            pDst[i] = 0;
        else if (v >= 65535)
            pDst[i] = 65535;
        else
            pDst[i] = static_cast<uint16_t>(v);
    }
}
#endif  // HAVE_AVX2_DISPATCH || HAVE_AVX2_NATIVELY
3089 :
3090 : // ---- int32 -> uint8 with clamping to [0, 255] ----
3091 : template <>
3092 13641 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
3093 : int nSrcPixelStride,
3094 : uint8_t *const CPL_RESTRICT pDstData,
3095 : int nDstPixelStride, GPtrDiff_t nWordCount)
3096 : {
3097 13641 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
3098 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
3099 : {
3100 : #if defined(HAVE_AVX2_DISPATCH)
3101 12727 : if (CPLHaveRuntimeAVX2())
3102 : {
3103 12727 : GDALCopyWordsInt32ToUInt8_AVX2(pSrcData, pDstData, nWordCount);
3104 12727 : return;
3105 : }
3106 : #elif defined(HAVE_AVX2_NATIVELY)
3107 : GDALCopyWordsInt32ToUInt8_AVX2(pSrcData, pDstData, nWordCount);
3108 : return;
3109 : #endif
3110 : #ifdef HAVE_SSE2
3111 : // SSE2 path: 16 pixels per iteration
3112 0 : decltype(nWordCount) n = 0;
3113 0 : const __m128i xmm_255 = _mm_set1_epi32(255);
3114 0 : for (; n < nWordCount - 15; n += 16)
3115 : {
3116 0 : __m128i v0 = _mm_loadu_si128(
3117 0 : reinterpret_cast<const __m128i *>(pSrcData + n));
3118 0 : __m128i v1 = _mm_loadu_si128(
3119 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
3120 0 : __m128i v2 = _mm_loadu_si128(
3121 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
3122 0 : __m128i v3 = _mm_loadu_si128(
3123 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 12));
3124 : // Clamp to [0, 255] using SSE2 arithmetic:
3125 : // max(v, 0): zero out negatives via sign bit mask
3126 0 : v0 = _mm_andnot_si128(_mm_srai_epi32(v0, 31), v0);
3127 0 : v1 = _mm_andnot_si128(_mm_srai_epi32(v1, 31), v1);
3128 0 : v2 = _mm_andnot_si128(_mm_srai_epi32(v2, 31), v2);
3129 0 : v3 = _mm_andnot_si128(_mm_srai_epi32(v3, 31), v3);
3130 : // min(v, 255): blend 255 where v > 255
3131 0 : __m128i gt0 = _mm_cmpgt_epi32(v0, xmm_255);
3132 0 : __m128i gt1 = _mm_cmpgt_epi32(v1, xmm_255);
3133 0 : __m128i gt2 = _mm_cmpgt_epi32(v2, xmm_255);
3134 0 : __m128i gt3 = _mm_cmpgt_epi32(v3, xmm_255);
3135 0 : v0 = _mm_or_si128(_mm_andnot_si128(gt0, v0),
3136 : _mm_and_si128(gt0, xmm_255));
3137 0 : v1 = _mm_or_si128(_mm_andnot_si128(gt1, v1),
3138 : _mm_and_si128(gt1, xmm_255));
3139 0 : v2 = _mm_or_si128(_mm_andnot_si128(gt2, v2),
3140 : _mm_and_si128(gt2, xmm_255));
3141 0 : v3 = _mm_or_si128(_mm_andnot_si128(gt3, v3),
3142 : _mm_and_si128(gt3, xmm_255));
3143 : // Values in [0, 255]: pack int32→int16→uint8
3144 0 : __m128i lo16 = _mm_packs_epi32(v0, v1);
3145 0 : __m128i hi16 = _mm_packs_epi32(v2, v3);
3146 0 : __m128i bytes = _mm_packus_epi16(lo16, hi16);
3147 0 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), bytes);
3148 : }
3149 0 : for (; n < nWordCount; n++)
3150 : #else
3151 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
3152 : #endif
3153 : {
3154 0 : pDstData[n] = pSrcData[n] <= 0 ? 0
3155 0 : : pSrcData[n] >= 255
3156 : ? 255
3157 0 : : static_cast<uint8_t>(pSrcData[n]);
3158 0 : }
3159 : }
3160 : else
3161 : {
3162 914 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
3163 : nDstPixelStride, nWordCount);
3164 : }
3165 : }
3166 :
3167 : // ---- int32 -> uint16 with clamping to [0, 65535] ----
3168 : template <>
3169 10322 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
3170 : int nSrcPixelStride,
3171 : uint16_t *const CPL_RESTRICT pDstData,
3172 : int nDstPixelStride, GPtrDiff_t nWordCount)
3173 : {
3174 10322 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
3175 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
3176 : {
3177 : #if defined(HAVE_AVX2_DISPATCH)
3178 10277 : if (CPLHaveRuntimeAVX2())
3179 : {
3180 10277 : GDALCopyWordsInt32ToUInt16_AVX2(pSrcData, pDstData, nWordCount);
3181 10277 : return;
3182 : }
3183 : #elif defined(HAVE_AVX2_NATIVELY)
3184 : GDALCopyWordsInt32ToUInt16_AVX2(pSrcData, pDstData, nWordCount);
3185 : return;
3186 : #endif
3187 0 : decltype(nWordCount) n = 0;
3188 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
3189 : // SSE4.1: _mm_packus_epi32 directly handles uint saturation
3190 : for (; n < nWordCount - 7; n += 8)
3191 : {
3192 : __m128i v0 = _mm_loadu_si128(
3193 : reinterpret_cast<const __m128i *>(pSrcData + n));
3194 : __m128i v1 = _mm_loadu_si128(
3195 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
3196 : v0 = _mm_max_epi32(v0, _mm_setzero_si128());
3197 : v1 = _mm_max_epi32(v1, _mm_setzero_si128());
3198 : __m128i packed = _mm_packus_epi32(v0, v1);
3199 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), packed);
3200 : }
3201 : #else
3202 : // SSE2: clamp to [0, 65535], bias to signed range, pack, unbias
3203 0 : const __m128i xmm_65535 = _mm_set1_epi32(65535);
3204 0 : const __m128i xmm_bias32 = _mm_set1_epi32(32768);
3205 0 : const __m128i xmm_bias16 = _mm_set1_epi16(-32768);
3206 0 : for (; n < nWordCount - 7; n += 8)
3207 : {
3208 0 : __m128i v0 = _mm_loadu_si128(
3209 0 : reinterpret_cast<const __m128i *>(pSrcData + n));
3210 0 : __m128i v1 = _mm_loadu_si128(
3211 0 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
3212 : // max(v, 0)
3213 0 : v0 = _mm_andnot_si128(_mm_srai_epi32(v0, 31), v0);
3214 0 : v1 = _mm_andnot_si128(_mm_srai_epi32(v1, 31), v1);
3215 : // min(v, 65535)
3216 0 : __m128i gt0 = _mm_cmpgt_epi32(v0, xmm_65535);
3217 0 : __m128i gt1 = _mm_cmpgt_epi32(v1, xmm_65535);
3218 0 : v0 = _mm_or_si128(_mm_andnot_si128(gt0, v0),
3219 : _mm_and_si128(gt0, xmm_65535));
3220 0 : v1 = _mm_or_si128(_mm_andnot_si128(gt1, v1),
3221 : _mm_and_si128(gt1, xmm_65535));
3222 : // Shift [0, 65535] → [-32768, 32767] for _mm_packs_epi32
3223 0 : v0 = _mm_sub_epi32(v0, xmm_bias32);
3224 0 : v1 = _mm_sub_epi32(v1, xmm_bias32);
3225 0 : __m128i packed = _mm_packs_epi32(v0, v1);
3226 : // Shift back: sub_epi16(x, -32768) == add 32768 (mod 2^16)
3227 0 : packed = _mm_sub_epi16(packed, xmm_bias16);
3228 0 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), packed);
3229 : }
3230 : #endif
3231 0 : for (; n < nWordCount; n++)
3232 : {
3233 0 : pDstData[n] = pSrcData[n] <= 0 ? 0
3234 0 : : pSrcData[n] >= 65535
3235 : ? 65535
3236 0 : : static_cast<uint16_t>(pSrcData[n]);
3237 0 : }
3238 : }
3239 : else
3240 : {
3241 45 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
3242 : nDstPixelStride, nWordCount);
3243 : }
3244 : }
3245 :
3246 : #endif // HAVE_SSE2
3247 :
3248 : template <>
3249 4436800 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3250 : int nSrcPixelStride,
3251 : GByte *const CPL_RESTRICT pDstData,
3252 : int nDstPixelStride, GPtrDiff_t nWordCount)
3253 : {
3254 4436800 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3255 : nDstPixelStride, nWordCount);
3256 4436800 : }
3257 :
3258 : template <>
3259 38387 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3260 : int nSrcPixelStride,
3261 : GUInt16 *const CPL_RESTRICT pDstData,
3262 : int nDstPixelStride, GPtrDiff_t nWordCount)
3263 : {
3264 38387 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3265 : nDstPixelStride, nWordCount);
3266 38387 : }
3267 :
3268 : template <>
3269 55620 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3270 : int nSrcPixelStride,
3271 : double *const CPL_RESTRICT pDstData,
3272 : int nDstPixelStride, GPtrDiff_t nWordCount)
3273 : {
3274 55620 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3275 : nDstPixelStride, nWordCount);
3276 55620 : }
3277 :
3278 : template <>
3279 122818 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3280 : int nSrcPixelStride,
3281 : float *const CPL_RESTRICT pDstData,
3282 : int nDstPixelStride, GPtrDiff_t nWordCount)
3283 : {
3284 122818 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3285 : nDstPixelStride, nWordCount);
3286 122818 : }
3287 :
3288 : template <>
3289 412 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
3290 : int nSrcPixelStride,
3291 : float *const CPL_RESTRICT pDstData,
3292 : int nDstPixelStride, GPtrDiff_t nWordCount)
3293 : {
3294 412 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3295 : nDstPixelStride, nWordCount);
3296 412 : }
3297 :
3298 : template <>
3299 544 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
3300 : int nSrcPixelStride,
3301 : double *const CPL_RESTRICT pDstData,
3302 : int nDstPixelStride, GPtrDiff_t nWordCount)
3303 : {
3304 544 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3305 : nDstPixelStride, nWordCount);
3306 544 : }
3307 :
3308 : template <>
3309 318163 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3310 : int nSrcPixelStride,
3311 : GByte *const CPL_RESTRICT pDstData,
3312 : int nDstPixelStride, GPtrDiff_t nWordCount)
3313 : {
3314 318163 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3315 : nDstPixelStride, nWordCount);
3316 318163 : }
3317 :
3318 : template <>
3319 55 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3320 : int nSrcPixelStride,
3321 : GInt8 *const CPL_RESTRICT pDstData,
3322 : int nDstPixelStride, GPtrDiff_t nWordCount)
3323 : {
3324 55 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3325 : nDstPixelStride, nWordCount);
3326 55 : }
3327 :
3328 : template <>
3329 15775 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3330 : int nSrcPixelStride,
3331 : GInt16 *const CPL_RESTRICT pDstData,
3332 : int nDstPixelStride, GPtrDiff_t nWordCount)
3333 : {
3334 15775 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3335 : nDstPixelStride, nWordCount);
3336 15775 : }
3337 :
3338 : template <>
3339 61713 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3340 : int nSrcPixelStride,
3341 : GUInt16 *const CPL_RESTRICT pDstData,
3342 : int nDstPixelStride, GPtrDiff_t nWordCount)
3343 : {
3344 61713 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3345 : nDstPixelStride, nWordCount);
3346 61713 : }
3347 :
// float -> GInt32 specialization of GDALCopyWordsT(): forwards all of its
// arguments, unchanged, to GDALCopyWordsT_8atatime().
// NOTE(review): GDALCopyWordsT_8atatime() is defined outside this excerpt;
// presumably it converts eight words per iteration (per its name) - confirm.
3348 : template <>
3349 43884 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3350 : int nSrcPixelStride,
3351 : GInt32 *const CPL_RESTRICT pDstData,
3352 : int nDstPixelStride, GPtrDiff_t nWordCount)
3353 : {
3354 43884 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3355 : nDstPixelStride, nWordCount);
3356 43884 : }
3357 :
// float -> GFloat16 specialization of GDALCopyWordsT(): forwards all of its
// arguments, unchanged, to GDALCopyWordsT_8atatime().
// NOTE(review): GDALCopyWordsT_8atatime() is defined outside this excerpt;
// presumably it converts eight words per iteration (per its name) - confirm.
3358 : template <>
3359 72 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3360 : int nSrcPixelStride,
3361 : GFloat16 *const CPL_RESTRICT pDstData,
3362 : int nDstPixelStride, GPtrDiff_t nWordCount)
3363 : {
3364 72 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3365 : nDstPixelStride, nWordCount);
3366 72 : }
3367 :
// double -> GFloat16 specialization of GDALCopyWordsT(): forwards all of its
// arguments, unchanged, to GDALCopyWordsT_8atatime().
// NOTE(review): GDALCopyWordsT_8atatime() is defined outside this excerpt;
// presumably it converts eight words per iteration (per its name) - confirm.
3368 : template <>
3369 63 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3370 : int nSrcPixelStride,
3371 : GFloat16 *const CPL_RESTRICT pDstData,
3372 : int nDstPixelStride, GPtrDiff_t nWordCount)
3373 : {
3374 63 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3375 : nDstPixelStride, nWordCount);
3376 63 : }
3377 :
3378 : /************************************************************************/
3379 : /* GDALCopyWordsComplexT() */
3380 : /************************************************************************/
3381 : /**
3382 : * Template function, used to copy data from pSrcData into buffer
3383 : * pDstData, with stride nSrcPixelStride in the source data and
3384 : * stride nDstPixelStride in the destination data. Deals with the
3385 : * complex case, where input is complex and output is complex.
3386 : *
3387 : * @param pSrcData the source data buffer
3388 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
3389 : * of interest, in bytes.
3390 : * @param pDstData the destination buffer.
3391 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
3392 : * interest, in bytes.
3393 : * @param nWordCount the total number of pixel words to copy
3394 : *
3395 : */
3396 : template <class Tin, class Tout>
3397 98787 : inline void GDALCopyWordsComplexT(const Tin *const CPL_RESTRICT pSrcData,
3398 : int nSrcPixelStride,
3399 : Tout *const CPL_RESTRICT pDstData,
3400 : int nDstPixelStride, GPtrDiff_t nWordCount)
3401 : {
3402 98787 : decltype(nWordCount) nDstOffset = 0;
3403 98787 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
3404 98787 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
3405 :
3406 5631237 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
3407 : {
3408 5532445 : const Tin *const pPixelIn =
3409 5532445 : reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
3410 5532445 : Tout *const pPixelOut =
3411 5532445 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
3412 :
3413 5532445 : GDALCopyWord(pPixelIn[0], pPixelOut[0]);
3414 5532445 : GDALCopyWord(pPixelIn[1], pPixelOut[1]);
3415 :
3416 5532445 : nDstOffset += nDstPixelStride;
3417 : }
3418 98787 : }
3419 :
3420 : /************************************************************************/
3421 : /* GDALCopyWordsComplexOutT() */
3422 : /************************************************************************/
3423 : /**
3424 : * Template function, used to copy data from pSrcData into buffer
3425 : * pDstData, with stride nSrcPixelStride in the source data and
3426 : * stride nDstPixelStride in the destination data. Deals with the
3427 : * case where the value is real coming in, but complex going out.
3428 : *
3429 : * @param pSrcData the source data buffer
3430 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
3431 : * of interest, in bytes.
3432 : * @param pDstData the destination buffer.
3433 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
3434 : * interest, in bytes.
3435 : * @param nWordCount the total number of pixel words to copy
3436 : *
3437 : */
3438 : template <class Tin, class Tout>
3439 4762 : inline void GDALCopyWordsComplexOutT(const Tin *const CPL_RESTRICT pSrcData,
3440 : int nSrcPixelStride,
3441 : Tout *const CPL_RESTRICT pDstData,
3442 : int nDstPixelStride, GPtrDiff_t nWordCount)
3443 : {
3444 4762 : decltype(nWordCount) nDstOffset = 0;
3445 :
3446 4762 : const Tout tOutZero = static_cast<Tout>(0);
3447 :
3448 4762 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
3449 4762 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
3450 :
3451 1190408 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
3452 : {
3453 1185646 : const Tin tValue =
3454 1185646 : *reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
3455 1185646 : Tout *const pPixelOut =
3456 1185646 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
3457 1185646 : GDALCopyWord(tValue, *pPixelOut);
3458 :
3459 1185646 : pPixelOut[1] = tOutZero;
3460 :
3461 1185646 : nDstOffset += nDstPixelStride;
3462 : }
3463 4762 : }
3464 :
3465 : /************************************************************************/
3466 : /* GDALCopyWordsFromT() */
3467 : /************************************************************************/
3468 : /**
3469 : * Template driver function. Given the input type T, call the appropriate
3470 : * GDALCopyWordsT function template for the desired output type. You should
3471 : * never call this function directly (call GDALCopyWords instead).
3472 : *
3473 : * @param pSrcData source data buffer
3474 : * @param nSrcPixelStride pixel stride in input buffer, in bytes
3475 : * @param bInComplex input is complex
3476 : * @param pDstData destination data buffer
3477 : * @param eDstType destination data type
3478 : * @param nDstPixelStride pixel stride in output buffer, in bytes
3479 : * @param nWordCount number of pixel words to be copied
3480 : */
3481 : template <class T>
3482 61300899 : inline void GDALCopyWordsFromT(const T *const CPL_RESTRICT pSrcData,
3483 : int nSrcPixelStride, bool bInComplex,
3484 : void *CPL_RESTRICT pDstData,
3485 : GDALDataType eDstType, int nDstPixelStride,
3486 : GPtrDiff_t nWordCount)
3487 : {
3488 61300899 : switch (eDstType)
3489 : {
3490 4799947 : case GDT_UInt8:
3491 4799947 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3492 : static_cast<unsigned char *>(pDstData),
3493 : nDstPixelStride, nWordCount);
3494 4799947 : break;
3495 1891 : case GDT_Int8:
3496 1891 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3497 : static_cast<signed char *>(pDstData),
3498 : nDstPixelStride, nWordCount);
3499 1891 : break;
3500 1143524 : case GDT_UInt16:
3501 1143524 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3502 : static_cast<unsigned short *>(pDstData),
3503 : nDstPixelStride, nWordCount);
3504 1143524 : break;
3505 4162742 : case GDT_Int16:
3506 4162742 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3507 : static_cast<short *>(pDstData), nDstPixelStride,
3508 : nWordCount);
3509 4162742 : break;
3510 23084 : case GDT_UInt32:
3511 23084 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3512 : static_cast<unsigned int *>(pDstData),
3513 : nDstPixelStride, nWordCount);
3514 23084 : break;
3515 29449431 : case GDT_Int32:
3516 29449431 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3517 : static_cast<int *>(pDstData), nDstPixelStride,
3518 : nWordCount);
3519 29449431 : break;
3520 1250 : case GDT_UInt64:
3521 1250 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3522 : static_cast<std::uint64_t *>(pDstData),
3523 : nDstPixelStride, nWordCount);
3524 1250 : break;
3525 5957 : case GDT_Int64:
3526 5957 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3527 : static_cast<std::int64_t *>(pDstData),
3528 : nDstPixelStride, nWordCount);
3529 5957 : break;
3530 999 : case GDT_Float16:
3531 999 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3532 : static_cast<GFloat16 *>(pDstData), nDstPixelStride,
3533 : nWordCount);
3534 999 : break;
3535 4220966 : case GDT_Float32:
3536 4220966 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3537 : static_cast<float *>(pDstData), nDstPixelStride,
3538 : nWordCount);
3539 4220966 : break;
3540 17387423 : case GDT_Float64:
3541 17387423 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3542 : static_cast<double *>(pDstData), nDstPixelStride,
3543 : nWordCount);
3544 17387423 : break;
3545 94424 : case GDT_CInt16:
3546 94424 : if (bInComplex)
3547 : {
3548 93170 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3549 : static_cast<short *>(pDstData),
3550 : nDstPixelStride, nWordCount);
3551 : }
3552 : else // input is not complex, so we need to promote to a complex
3553 : // buffer
3554 : {
3555 1254 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3556 : static_cast<short *>(pDstData),
3557 : nDstPixelStride, nWordCount);
3558 : }
3559 94424 : break;
3560 1349 : case GDT_CInt32:
3561 1349 : if (bInComplex)
3562 : {
3563 717 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3564 : static_cast<int *>(pDstData),
3565 : nDstPixelStride, nWordCount);
3566 : }
3567 : else // input is not complex, so we need to promote to a complex
3568 : // buffer
3569 : {
3570 632 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3571 : static_cast<int *>(pDstData),
3572 : nDstPixelStride, nWordCount);
3573 : }
3574 1349 : break;
3575 313 : case GDT_CFloat16:
3576 313 : if (bInComplex)
3577 : {
3578 48 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3579 : static_cast<GFloat16 *>(pDstData),
3580 : nDstPixelStride, nWordCount);
3581 : }
3582 : else // input is not complex, so we need to promote to a complex
3583 : // buffer
3584 : {
3585 265 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3586 : static_cast<GFloat16 *>(pDstData),
3587 : nDstPixelStride, nWordCount);
3588 : }
3589 313 : break;
3590 3923 : case GDT_CFloat32:
3591 3923 : if (bInComplex)
3592 : {
3593 3114 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3594 : static_cast<float *>(pDstData),
3595 : nDstPixelStride, nWordCount);
3596 : }
3597 : else // input is not complex, so we need to promote to a complex
3598 : // buffer
3599 : {
3600 809 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3601 : static_cast<float *>(pDstData),
3602 : nDstPixelStride, nWordCount);
3603 : }
3604 3923 : break;
3605 3540 : case GDT_CFloat64:
3606 3540 : if (bInComplex)
3607 : {
3608 1738 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3609 : static_cast<double *>(pDstData),
3610 : nDstPixelStride, nWordCount);
3611 : }
3612 : else // input is not complex, so we need to promote to a complex
3613 : // buffer
3614 : {
3615 1802 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3616 : static_cast<double *>(pDstData),
3617 : nDstPixelStride, nWordCount);
3618 : }
3619 3540 : break;
3620 0 : case GDT_Unknown:
3621 : case GDT_TypeCount:
3622 0 : CPLAssert(false);
3623 : }
3624 61300899 : }
3625 :
3626 : } // end anonymous namespace
3627 :
3628 : /************************************************************************/
3629 : /* GDALReplicateWord() */
3630 : /************************************************************************/
3631 :
3632 : template <class T>
3633 600347 : inline void GDALReplicateWordT(void *pDstData, int nDstPixelStride,
3634 : GPtrDiff_t nWordCount)
3635 : {
3636 600347 : const T valSet = *static_cast<const T *>(pDstData);
3637 600347 : if (nDstPixelStride == static_cast<int>(sizeof(T)))
3638 : {
3639 570534 : T *pDstPtr = static_cast<T *>(pDstData) + 1;
3640 31989919 : while (nWordCount >= 4)
3641 : {
3642 31419380 : nWordCount -= 4;
3643 31419380 : pDstPtr[0] = valSet;
3644 31419380 : pDstPtr[1] = valSet;
3645 31419380 : pDstPtr[2] = valSet;
3646 31419380 : pDstPtr[3] = valSet;
3647 31419380 : pDstPtr += 4;
3648 : }
3649 1476400 : while (nWordCount > 0)
3650 : {
3651 905866 : --nWordCount;
3652 905866 : *pDstPtr = valSet;
3653 905866 : pDstPtr++;
3654 : }
3655 : }
3656 : else
3657 : {
3658 29813 : GByte *pabyDstPtr = static_cast<GByte *>(pDstData) + nDstPixelStride;
3659 1040984 : while (nWordCount > 0)
3660 : {
3661 1011171 : --nWordCount;
3662 1011171 : *reinterpret_cast<T *>(pabyDstPtr) = valSet;
3663 1011171 : pabyDstPtr += nDstPixelStride;
3664 : }
3665 : }
3666 600347 : }
3667 :
3668 1067780 : static void GDALReplicateWord(const void *CPL_RESTRICT pSrcData,
3669 : GDALDataType eSrcType,
3670 : void *CPL_RESTRICT pDstData,
3671 : GDALDataType eDstType, int nDstPixelStride,
3672 : GPtrDiff_t nWordCount)
3673 : {
3674 : /* -----------------------------------------------------------------------
3675 : */
3676 : /* Special case when the source data is always the same value */
3677 : /* (for VRTSourcedRasterBand::IRasterIO and
3678 : * VRTDerivedRasterBand::IRasterIO*/
3679 : /* for example) */
3680 : /* -----------------------------------------------------------------------
3681 : */
3682 : // Let the general translation case do the necessary conversions
3683 : // on the first destination element.
3684 1067780 : GDALCopyWords64(pSrcData, eSrcType, 0, pDstData, eDstType, 0, 1);
3685 :
3686 : // Now copy the first element to the nWordCount - 1 following destination
3687 : // elements.
3688 1067780 : nWordCount--;
3689 1067780 : GByte *pabyDstWord = reinterpret_cast<GByte *>(pDstData) + nDstPixelStride;
3690 :
3691 1067780 : switch (eDstType)
3692 : {
3693 467342 : case GDT_UInt8:
3694 : case GDT_Int8:
3695 : {
3696 467342 : if (nDstPixelStride == 1)
3697 : {
3698 369424 : if (nWordCount > 0)
3699 369424 : memset(pabyDstWord,
3700 369424 : *reinterpret_cast<const GByte *>(pDstData),
3701 : nWordCount);
3702 : }
3703 : else
3704 : {
3705 97918 : GByte valSet = *reinterpret_cast<const GByte *>(pDstData);
3706 67697100 : while (nWordCount > 0)
3707 : {
3708 67599200 : --nWordCount;
3709 67599200 : *pabyDstWord = valSet;
3710 67599200 : pabyDstWord += nDstPixelStride;
3711 : }
3712 : }
3713 467342 : break;
3714 : }
3715 :
3716 : #define CASE_DUPLICATE_SIMPLE(enum_type, c_type) \
3717 : case enum_type: \
3718 : { \
3719 : GDALReplicateWordT<c_type>(pDstData, nDstPixelStride, nWordCount); \
3720 : break; \
3721 : }
3722 :
3723 34513 : CASE_DUPLICATE_SIMPLE(GDT_UInt16, GUInt16)
3724 202461 : CASE_DUPLICATE_SIMPLE(GDT_Int16, GInt16)
3725 74 : CASE_DUPLICATE_SIMPLE(GDT_UInt32, GUInt32)
3726 301585 : CASE_DUPLICATE_SIMPLE(GDT_Int32, GInt32)
3727 41 : CASE_DUPLICATE_SIMPLE(GDT_UInt64, std::uint64_t)
3728 1072 : CASE_DUPLICATE_SIMPLE(GDT_Int64, std::int64_t)
3729 2 : CASE_DUPLICATE_SIMPLE(GDT_Float16, GFloat16)
3730 52827 : CASE_DUPLICATE_SIMPLE(GDT_Float32, float)
3731 7772 : CASE_DUPLICATE_SIMPLE(GDT_Float64, double)
3732 :
3733 : #define CASE_DUPLICATE_COMPLEX(enum_type, c_type) \
3734 : case enum_type: \
3735 : { \
3736 : c_type valSet1 = reinterpret_cast<const c_type *>(pDstData)[0]; \
3737 : c_type valSet2 = reinterpret_cast<const c_type *>(pDstData)[1]; \
3738 : while (nWordCount > 0) \
3739 : { \
3740 : --nWordCount; \
3741 : reinterpret_cast<c_type *>(pabyDstWord)[0] = valSet1; \
3742 : reinterpret_cast<c_type *>(pabyDstWord)[1] = valSet2; \
3743 : pabyDstWord += nDstPixelStride; \
3744 : } \
3745 : break; \
3746 : }
3747 :
3748 784 : CASE_DUPLICATE_COMPLEX(GDT_CInt16, GInt16)
3749 784 : CASE_DUPLICATE_COMPLEX(GDT_CInt32, GInt32)
3750 6 : CASE_DUPLICATE_COMPLEX(GDT_CFloat16, GFloat16)
3751 790 : CASE_DUPLICATE_COMPLEX(GDT_CFloat32, float)
3752 790 : CASE_DUPLICATE_COMPLEX(GDT_CFloat64, double)
3753 :
3754 0 : case GDT_Unknown:
3755 : case GDT_TypeCount:
3756 0 : CPLAssert(false);
3757 : }
3758 1067780 : }
3759 :
3760 : /************************************************************************/
3761 : /* GDALUnrolledCopy() */
3762 : /************************************************************************/
3763 :
// Copies nIters words of type T from pSrc to pDest. srcStride and dstStride
// are compile-time strides expressed in elements of T (they scale indices of
// T pointers).
//
// Two build flavours exist:
//  - GCC-compatible compiler with __AVX2__ defined: the function carries the
//    optimize("tree-vectorize") attribute and consists only of the simple
//    loop at the bottom, preceded by "#pragma GCC unroll 4".
//  - otherwise: a hand-unrolled loop moves 16 words per iteration, and the
//    simple loop at the bottom handles the remaining nIters % 16 words.
3764 : template <class T, int srcStride, int dstStride>
3765 : #if defined(__GNUC__) && defined(__AVX2__)
3766 : __attribute__((optimize("tree-vectorize")))
3767 : #endif
3768 3000975 : static inline void GDALUnrolledCopyGeneric(T *CPL_RESTRICT pDest,
3769 : const T *CPL_RESTRICT pSrc,
3770 : GPtrDiff_t nIters)
3771 : {
3772 : #if !(defined(__GNUC__) && defined(__AVX2__))
3773 3000975 : if (nIters >= 16)
3774 : {
// Counts down the number of full 16-word groups.
3775 132815387 : for (GPtrDiff_t i = nIters / 16; i != 0; i--)
3776 : {
3777 129935045 : pDest[0 * dstStride] = pSrc[0 * srcStride];
3778 129935045 : pDest[1 * dstStride] = pSrc[1 * srcStride];
3779 129935045 : pDest[2 * dstStride] = pSrc[2 * srcStride];
3780 129935045 : pDest[3 * dstStride] = pSrc[3 * srcStride];
3781 129935045 : pDest[4 * dstStride] = pSrc[4 * srcStride];
3782 129935045 : pDest[5 * dstStride] = pSrc[5 * srcStride];
3783 129935045 : pDest[6 * dstStride] = pSrc[6 * srcStride];
3784 129935045 : pDest[7 * dstStride] = pSrc[7 * srcStride];
3785 129935045 : pDest[8 * dstStride] = pSrc[8 * srcStride];
3786 129935045 : pDest[9 * dstStride] = pSrc[9 * srcStride];
3787 129935045 : pDest[10 * dstStride] = pSrc[10 * srcStride];
3788 129935045 : pDest[11 * dstStride] = pSrc[11 * srcStride];
3789 129935045 : pDest[12 * dstStride] = pSrc[12 * srcStride];
3790 129935045 : pDest[13 * dstStride] = pSrc[13 * srcStride];
3791 129935045 : pDest[14 * dstStride] = pSrc[14 * srcStride];
3792 129935045 : pDest[15 * dstStride] = pSrc[15 * srcStride];
3793 129935045 : pDest += 16 * dstStride;
3794 129935045 : pSrc += 16 * srcStride;
3795 : }
// Only the leftover words remain for the loop below.
3796 2880417 : nIters = nIters % 16;
3797 : }
3798 : #else
3799 : #pragma GCC unroll 4
3800 : #endif
3801 5162719 : for (GPtrDiff_t i = 0; i < nIters; i++)
3802 : {
// The destination is indexed while the source pointer is advanced.
3803 2161743 : pDest[i * dstStride] = *pSrc;
3804 2161743 : pSrc += srcStride;
3805 : }
3806 3000975 : }
3807 :
// Primary template of GDALUnrolledCopy(): forwards to
// GDALUnrolledCopyGeneric() with the same template arguments. Explicit
// specializations for GByte with source strides 2, 3 and 4 follow below when
// the corresponding SIMD support is compiled in.
3808 : template <class T, int srcStride, int dstStride>
3809 3000975 : static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
3810 : const T *CPL_RESTRICT pSrc,
3811 : GPtrDiff_t nIters)
3812 : {
3813 3000975 : GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
3814 3000975 : }
3815 :
3816 : #if defined(__AVX2__) && defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
3817 : (defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS))
3818 :
3819 : template <>
3820 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
3821 : const GByte *CPL_RESTRICT pSrc,
3822 : GPtrDiff_t nIters)
3823 : {
3824 : if (nIters > 16)
3825 : {
3826 : // The SSSE3 variant is slightly faster than what the gcc autovectorizer
3827 : // generates
3828 : GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
3829 : }
3830 : else
3831 : {
3832 : for (GPtrDiff_t i = 0; i < nIters; i++)
3833 : {
3834 : pDest[i] = *pSrc;
3835 : pSrc += 3;
3836 : }
3837 : }
3838 : }
3839 :
3840 : #elif defined(HAVE_SSE2) && !(defined(__GNUC__) && defined(__AVX2__))
3841 :
3842 : template <>
3843 354124 : void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
3844 : const GByte *CPL_RESTRICT pSrc,
3845 : GPtrDiff_t nIters)
3846 : {
3847 354124 : decltype(nIters) i = 0;
3848 354124 : if (nIters > 16)
3849 : {
3850 194667 : const __m128i xmm_mask = _mm_set1_epi16(0xff);
3851 : // If we were sure that there would always be 1 trailing byte, we could
3852 : // check against nIters - 15
3853 2988110 : for (; i < nIters - 16; i += 16)
3854 : {
3855 : __m128i xmm0 =
3856 2793440 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
3857 : __m128i xmm1 =
3858 5586890 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
3859 : // Set higher 8bit of each int16 packed word to 0
3860 2793440 : xmm0 = _mm_and_si128(xmm0, xmm_mask);
3861 2793440 : xmm1 = _mm_and_si128(xmm1, xmm_mask);
3862 : // Pack int16 to uint8 and merge back both vector
3863 2793440 : xmm0 = _mm_packus_epi16(xmm0, xmm1);
3864 :
3865 : // Store result
3866 2793440 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
3867 :
3868 2793440 : pSrc += 2 * 16;
3869 : }
3870 : }
3871 4633170 : for (; i < nIters; i++)
3872 : {
3873 4279050 : pDest[i] = *pSrc;
3874 4279050 : pSrc += 2;
3875 : }
3876 354124 : }
3877 :
// SSE2 copy of every third source byte into a packed destination.
//
// Each vector iteration loads 48 source bytes (three 16-byte registers),
// assembles the 16 wanted bytes with byte shifts and single-byte masks, and
// stores them at pDest + i. The loop condition (i < nIters - 16) always
// leaves at least one word for the scalar loop at the end, which also handles
// the whole job when nIters <= 16.
3878 1 : static void GDALUnrolledCopy_GByte_3_1_SSE2(GByte *CPL_RESTRICT pDest,
3879 : const GByte *CPL_RESTRICT pSrc,
3880 : GPtrDiff_t nIters)
3881 : {
3882 1 : decltype(nIters) i = 0;
// Mask with only byte 0 set; the other single-byte masks are derived from it
// by byte shifts.
3883 1 : const __m128i xmm_mask_ori = _mm_set_epi32(0, 0, 0, 255);
3884 : // If we were sure that there would always be 2 trailing bytes, we could
3885 : // check against nIters - 15
3886 2 : for (; i < nIters - 16; i += 16)
3887 : {
3888 : __m128i xmm0 =
3889 1 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
3890 : __m128i xmm1 =
3891 1 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
3892 : __m128i xmm2 =
3893 1 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
3894 :
// xmm_mask0 starts at output byte 0 (fed from xmm0), xmm_mask1 at output
// byte 6 (fed from xmm1) and xmm_mask2 at output byte 11 (fed from xmm2).
3895 1 : auto xmm_mask0 = xmm_mask_ori;
3896 1 : auto xmm_mask1 = _mm_slli_si128(xmm_mask_ori, 6);
3897 1 : auto xmm_mask2 = _mm_slli_si128(xmm_mask_ori, 11);
3898 :
3899 1 : auto xmm = _mm_and_si128(xmm0, xmm_mask0);
3900 1 : auto xmm_res1 = _mm_and_si128(_mm_slli_si128(xmm1, 4), xmm_mask1);
3901 :
// From here on each step moves the masks one byte up; xmm0 is shifted down by
// two bytes so that its next wanted byte lands under xmm_mask0.
3902 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3903 1 : xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
3904 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3905 1 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3906 2 : xmm_res1 = _mm_or_si128(
3907 : xmm_res1, _mm_and_si128(_mm_slli_si128(xmm1, 2), xmm_mask1));
3908 :
3909 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3910 1 : xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
3911 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3912 2 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3913 1 : xmm_res1 = _mm_or_si128(xmm_res1, _mm_and_si128(xmm1, xmm_mask1));
3914 :
3915 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3916 1 : xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
3917 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3918 1 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3919 2 : xmm_res1 = _mm_or_si128(
3920 : xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 2), xmm_mask1));
3921 :
3922 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3923 1 : xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
3924 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3925 1 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3926 3 : xmm_res1 = _mm_or_si128(
3927 : xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 4), xmm_mask1));
// Output bytes 6..10 (gathered from xmm1) are complete: merge them.
3928 1 : xmm = _mm_or_si128(xmm, xmm_res1);
3929 :
// Last byte taken from xmm0 (output byte 5).
3930 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3931 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3932 1 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3933 :
// Output bytes 11..15 come from xmm2.
3934 2 : xmm = _mm_or_si128(xmm,
3935 : _mm_and_si128(_mm_slli_si128(xmm2, 10), xmm_mask2));
3936 :
3937 1 : xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
3938 2 : xmm = _mm_or_si128(xmm,
3939 : _mm_and_si128(_mm_slli_si128(xmm2, 8), xmm_mask2));
3940 :
3941 1 : xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
3942 2 : xmm = _mm_or_si128(xmm,
3943 : _mm_and_si128(_mm_slli_si128(xmm2, 6), xmm_mask2));
3944 :
3945 1 : xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
3946 2 : xmm = _mm_or_si128(xmm,
3947 : _mm_and_si128(_mm_slli_si128(xmm2, 4), xmm_mask2));
3948 :
3949 1 : xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
3950 2 : xmm = _mm_or_si128(xmm,
3951 : _mm_and_si128(_mm_slli_si128(xmm2, 2), xmm_mask2));
3952 :
3953 1 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm);
3954 :
3955 1 : pSrc += 3 * 16;
3956 : }
// Scalar tail for the words the vector loop did not process.
3957 2 : for (; i < nIters; i++)
3958 : {
3959 1 : pDest[i] = *pSrc;
3960 1 : pSrc += 3;
3961 : }
3962 1 : }
3963 :
3964 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
3965 :
3966 : template <>
3967 192265 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
3968 : const GByte *CPL_RESTRICT pSrc,
3969 : GPtrDiff_t nIters)
3970 : {
3971 192265 : if (nIters > 16)
3972 : {
3973 186142 : if (CPLHaveRuntimeSSSE3())
3974 : {
3975 186141 : GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
3976 : }
3977 : else
3978 : {
3979 1 : GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
3980 : }
3981 : }
3982 : else
3983 : {
3984 20384 : for (GPtrDiff_t i = 0; i < nIters; i++)
3985 : {
3986 14261 : pDest[i] = *pSrc;
3987 14261 : pSrc += 3;
3988 : }
3989 : }
3990 192265 : }
3991 :
3992 : #else
3993 :
// GByte specialization for a source stride of 3 bytes and a packed
// destination, used when HAVE_SSSE3_AT_COMPILE_TIME is not defined: always
// delegates to the SSE2 kernel, whatever the value of nIters.
3994 : template <>
3995 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
3996 : const GByte *CPL_RESTRICT pSrc,
3997 : GPtrDiff_t nIters)
3998 : {
3999 : GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
4000 : }
4001 : #endif
4002 :
4003 : template <>
4004 332657 : void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
4005 : const GByte *CPL_RESTRICT pSrc,
4006 : GPtrDiff_t nIters)
4007 : {
4008 332657 : decltype(nIters) i = 0;
4009 332657 : if (nIters > 16)
4010 : {
4011 327364 : const __m128i xmm_mask = _mm_set1_epi32(0xff);
4012 : // If we were sure that there would always be 3 trailing bytes, we could
4013 : // check against nIters - 15
4014 28043500 : for (; i < nIters - 16; i += 16)
4015 : {
4016 : __m128i xmm0 =
4017 27716100 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
4018 : __m128i xmm1 =
4019 27716100 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
4020 : __m128i xmm2 =
4021 27716100 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
4022 : __m128i xmm3 =
4023 55432200 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 48));
4024 : // Set higher 24bit of each int32 packed word to 0
4025 27716100 : xmm0 = _mm_and_si128(xmm0, xmm_mask);
4026 27716100 : xmm1 = _mm_and_si128(xmm1, xmm_mask);
4027 27716100 : xmm2 = _mm_and_si128(xmm2, xmm_mask);
4028 27716100 : xmm3 = _mm_and_si128(xmm3, xmm_mask);
4029 : // Pack int32 to int16
4030 27716100 : xmm0 = _mm_packs_epi32(xmm0, xmm1);
4031 27716100 : xmm2 = _mm_packs_epi32(xmm2, xmm3);
4032 : // Pack int16 to uint8
4033 27716100 : xmm0 = _mm_packus_epi16(xmm0, xmm2);
4034 :
4035 : // Store result
4036 27716100 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
4037 :
4038 27716100 : pSrc += 4 * 16;
4039 : }
4040 : }
4041 5048740 : for (; i < nIters; i++)
4042 : {
4043 4716080 : pDest[i] = *pSrc;
4044 4716080 : pSrc += 4;
4045 : }
4046 332657 : }
4047 : #endif // HAVE_SSE2
4048 :
4049 : /************************************************************************/
4050 : /* GDALFastCopy() */
4051 : /************************************************************************/
4052 :
4053 : template <class T>
4054 40098900 : static inline void GDALFastCopy(T *CPL_RESTRICT pDest, int nDestStride,
4055 : const T *CPL_RESTRICT pSrc, int nSrcStride,
4056 : GPtrDiff_t nIters)
4057 : {
4058 40098900 : constexpr int sizeofT = static_cast<int>(sizeof(T));
4059 40098900 : if (nIters == 1)
4060 : {
4061 22540680 : *pDest = *pSrc;
4062 : }
4063 17558245 : else if (nDestStride == sizeofT)
4064 : {
4065 14484102 : if (nSrcStride == sizeofT)
4066 : {
4067 13395287 : memcpy(pDest, pSrc, nIters * sizeof(T));
4068 : }
4069 1088779 : else if (nSrcStride == 2 * sizeofT)
4070 : {
4071 357339 : GDALUnrolledCopy<T, 2, 1>(pDest, pSrc, nIters);
4072 : }
4073 731440 : else if (nSrcStride == 3 * sizeofT)
4074 : {
4075 289245 : GDALUnrolledCopy<T, 3, 1>(pDest, pSrc, nIters);
4076 : }
4077 442195 : else if (nSrcStride == 4 * sizeofT)
4078 : {
4079 336639 : GDALUnrolledCopy<T, 4, 1>(pDest, pSrc, nIters);
4080 : }
4081 : else
4082 : {
4083 17229290 : while (nIters-- > 0)
4084 : {
4085 17123750 : *pDest = *pSrc;
4086 17123750 : pSrc += nSrcStride / sizeofT;
4087 17123750 : pDest++;
4088 : }
4089 : }
4090 : }
4091 3074113 : else if (nSrcStride == sizeofT)
4092 : {
4093 3061117 : if (nDestStride == 2 * sizeofT)
4094 : {
4095 151252 : GDALUnrolledCopy<T, 1, 2>(pDest, pSrc, nIters);
4096 : }
4097 2909865 : else if (nDestStride == 3 * sizeofT)
4098 : {
4099 2131921 : GDALUnrolledCopy<T, 1, 3>(pDest, pSrc, nIters);
4100 : }
4101 777937 : else if (nDestStride == 4 * sizeofT)
4102 : {
4103 613625 : GDALUnrolledCopy<T, 1, 4>(pDest, pSrc, nIters);
4104 : }
4105 : else
4106 : {
4107 17169660 : while (nIters-- > 0)
4108 : {
4109 17005410 : *pDest = *pSrc;
4110 17005410 : pSrc++;
4111 17005410 : pDest += nDestStride / sizeofT;
4112 : }
4113 : }
4114 : }
4115 : else
4116 : {
4117 1220108 : while (nIters-- > 0)
4118 : {
4119 1207102 : *pDest = *pSrc;
4120 1207102 : pSrc += nSrcStride / sizeofT;
4121 1207102 : pDest += nDestStride / sizeofT;
4122 : }
4123 : }
4124 40098900 : }
4125 :
4126 : /************************************************************************/
4127 : /* GDALFastCopyByte() */
4128 : /************************************************************************/
4129 :
// Byte-typed entry point to GDALFastCopy(). Note the argument order change:
// this function takes (source, destination) while GDALFastCopy() takes
// (destination, source).
4130 326250 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
4131 : int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
4132 : int nDstPixelStride, GPtrDiff_t nWordCount)
4133 : {
4134 326250 : GDALFastCopy(pDstData, nDstPixelStride, pSrcData, nSrcPixelStride,
4135 : nWordCount);
4136 326250 : }
4137 :
4138 : /************************************************************************/
4139 : /* GDALCopyWords() */
4140 : /************************************************************************/
4141 :
4142 : /**
4143 : * Copy pixel words from buffer to buffer.
4144 : *
4145 : * @see GDALCopyWords64()
4146 : */
// Variant taking an int word count: every argument is forwarded unchanged to
// GDALCopyWords64(), whose nWordCount parameter is a GPtrDiff_t.
4147 80502400 : void CPL_STDCALL GDALCopyWords(const void *CPL_RESTRICT pSrcData,
4148 : GDALDataType eSrcType, int nSrcPixelStride,
4149 : void *CPL_RESTRICT pDstData,
4150 : GDALDataType eDstType, int nDstPixelStride,
4151 : int nWordCount)
4152 : {
4153 80502400 : GDALCopyWords64(pSrcData, eSrcType, nSrcPixelStride, pDstData, eDstType,
4154 : nDstPixelStride, nWordCount);
4155 80502400 : }
4156 :
4157 : /************************************************************************/
4158 : /* GDALCopyWords64() */
4159 : /************************************************************************/
4160 :
4161 : /**
4162 : * Copy pixel words from buffer to buffer.
4163 : *
4164 : * This function is used to copy pixel word values from one memory buffer
4165 : * to another, with support for conversion between data types, and differing
4166 : * step factors. The data type conversion is done using the following
4167 : * rules:
4168 : * <ul>
4169 : * <li>Values assigned to a lower range integer type are clipped. For
4170 : * instance assigning GDT_Int16 values to a GDT_UInt8 buffer will cause values
4171 : * less than 0 to be set to 0, and values larger than 255 to be set to 255.
4172 : * </li>
4173 : * <li>
4174 : * Assignment from floating point to integer rounds to closest integer.
4175 : * +Infinity is mapped to the largest integer. -Infinity is mapped to the
4176 : * smallest integer. NaN is mapped to 0.
4177 : * </li>
4178 : * <li>
4179 : * Assignment from non-complex to complex will result in the imaginary part
4180 : * being set to zero on output.
4181 : * </li>
4182 : * <li> Assignment from complex to
4183 : * non-complex will result in the complex portion being lost and the real
4184 : * component being preserved (<i>not magnitude!</i>).
4185 : * </li>
4186 : * </ul>
4187 : *
4188 : * No assumptions are made about the source or destination words occurring
4189 : * on word boundaries. It is assumed that all values are in native machine
4190 : * byte order.
4191 : *
4192 : * @param pSrcData Pointer to source data to be converted.
4193 : * @param eSrcType the source data type (see GDALDataType enum)
4194 : * @param nSrcPixelStride Source pixel stride (i.e. distance between 2 words),
4195 : * in bytes
4196 : * @param pDstData Pointer to buffer where destination data should go
4197 : * @param eDstType the destination data type (see GDALDataType enum)
4198 : * @param nDstPixelStride Destination pixel stride (i.e. distance between 2
4199 : * words), in bytes
4200 : * @param nWordCount number of words to be copied
4201 : *
4202 : * @note
4203 : * When adding a new data type to GDAL, you must do the following to
4204 : * support it properly within the GDALCopyWords function:
4205 : * 1. Add the data type to the switch on eSrcType in GDALCopyWords.
4206 : * This should invoke the appropriate GDALCopyWordsFromT wrapper.
4207 : * 2. Add the data type to the switch on eDstType in GDALCopyWordsFromT.
4208 : * This should call the appropriate GDALCopyWordsT template.
4209 : * 3. If appropriate, overload the appropriate CopyWord template in the
4210 : * above namespace. This will ensure that any conversion issues are
4211 : * handled (cases like the float -> int32 case, where the min/max
4212 : * values are subject to roundoff error).
4213 : */
4214 :
4215 116786000 : void CPL_STDCALL GDALCopyWords64(const void *CPL_RESTRICT pSrcData,
4216 : GDALDataType eSrcType, int nSrcPixelStride,
4217 : void *CPL_RESTRICT pDstData,
4218 : GDALDataType eDstType, int nDstPixelStride,
4219 : GPtrDiff_t nWordCount)
4220 :
4221 : {
4222 : // On platforms where alignment matters, be careful
4223 116786000 : const int nSrcDataTypeSize = GDALGetDataTypeSizeBytes(eSrcType);
4224 116786000 : const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(eDstType);
4225 116786000 : if (CPL_UNLIKELY(nSrcDataTypeSize == 0 || nDstDataTypeSize == 0))
4226 : {
4227 2 : CPLError(CE_Failure, CPLE_NotSupported,
4228 : "GDALCopyWords64(): unsupported GDT_Unknown/GDT_TypeCount "
4229 : "argument");
4230 2 : return;
4231 : }
4232 116786000 : if (!(eSrcType == eDstType && nSrcPixelStride == nDstPixelStride) &&
4233 66330800 : ((reinterpret_cast<uintptr_t>(pSrcData) % nSrcDataTypeSize) != 0 ||
4234 66330800 : (reinterpret_cast<uintptr_t>(pDstData) % nDstDataTypeSize) != 0 ||
4235 66330400 : (nSrcPixelStride % nSrcDataTypeSize) != 0 ||
4236 66330400 : (nDstPixelStride % nDstDataTypeSize) != 0))
4237 : {
4238 905 : if (eSrcType == eDstType)
4239 : {
4240 34800 : for (decltype(nWordCount) i = 0; i < nWordCount; i++)
4241 : {
4242 34000 : memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
4243 : static_cast<const GByte *>(pSrcData) +
4244 34000 : nSrcPixelStride * i,
4245 : nDstDataTypeSize);
4246 : }
4247 : }
4248 : else
4249 : {
4250 210 : const auto getAlignedPtr = [](GByte *ptr, int align)
4251 : {
4252 : return ptr +
4253 210 : ((align - (reinterpret_cast<uintptr_t>(ptr) % align)) %
4254 210 : align);
4255 : };
4256 :
4257 : // The largest we need is for CFloat64 (16 bytes), so 32 bytes to
4258 : // be sure to get correctly aligned pointer.
4259 105 : constexpr size_t SIZEOF_CFLOAT64 = 2 * sizeof(double);
4260 : GByte abySrcBuffer[2 * SIZEOF_CFLOAT64];
4261 : GByte abyDstBuffer[2 * SIZEOF_CFLOAT64];
4262 : GByte *pabySrcBuffer =
4263 105 : getAlignedPtr(abySrcBuffer, nSrcDataTypeSize);
4264 : GByte *pabyDstBuffer =
4265 105 : getAlignedPtr(abyDstBuffer, nDstDataTypeSize);
4266 3360 : for (decltype(nWordCount) i = 0; i < nWordCount; i++)
4267 : {
4268 3255 : memcpy(pabySrcBuffer,
4269 : static_cast<const GByte *>(pSrcData) +
4270 3255 : nSrcPixelStride * i,
4271 : nSrcDataTypeSize);
4272 3255 : GDALCopyWords64(pabySrcBuffer, eSrcType, 0, pabyDstBuffer,
4273 : eDstType, 0, 1);
4274 3255 : memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
4275 : pabyDstBuffer, nDstDataTypeSize);
4276 : }
4277 : }
4278 905 : return;
4279 : }
4280 :
4281 : // Deal with the case where we're replicating a single word into the
4282 : // provided buffer
4283 116785000 : if (nSrcPixelStride == 0 && nWordCount > 1)
4284 : {
4285 1067780 : GDALReplicateWord(pSrcData, eSrcType, pDstData, eDstType,
4286 : nDstPixelStride, nWordCount);
4287 1067780 : return;
4288 : }
4289 :
4290 115717000 : if (eSrcType == eDstType)
4291 : {
4292 54678100 : if (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8)
4293 : {
4294 17976000 : GDALFastCopy(static_cast<GByte *>(pDstData), nDstPixelStride,
4295 : static_cast<const GByte *>(pSrcData), nSrcPixelStride,
4296 : nWordCount);
4297 17976000 : return;
4298 : }
4299 :
4300 36702100 : if (nSrcDataTypeSize == 2 && (nSrcPixelStride % 2) == 0 &&
4301 21796600 : (nDstPixelStride % 2) == 0)
4302 : {
4303 21796600 : GDALFastCopy(static_cast<short *>(pDstData), nDstPixelStride,
4304 : static_cast<const short *>(pSrcData), nSrcPixelStride,
4305 : nWordCount);
4306 21796600 : return;
4307 : }
4308 :
4309 14905500 : if (nWordCount == 1)
4310 : {
4311 : #if defined(CSA_BUILD) || defined(__COVERITY__)
4312 : // Avoid false positives...
4313 : memcpy(pDstData, pSrcData, nSrcDataTypeSize);
4314 : #else
4315 14418400 : if (nSrcDataTypeSize == 2)
4316 0 : memcpy(pDstData, pSrcData, 2);
4317 14418400 : else if (nSrcDataTypeSize == 4)
4318 13814200 : memcpy(pDstData, pSrcData, 4);
4319 604143 : else if (nSrcDataTypeSize == 8)
4320 587538 : memcpy(pDstData, pSrcData, 8);
4321 : else /* if( eSrcType == GDT_CFloat64 ) */
4322 16605 : memcpy(pDstData, pSrcData, 16);
4323 : #endif
4324 14418400 : return;
4325 : }
4326 :
4327 : // Let memcpy() handle the case where we're copying a packed buffer
4328 : // of pixels.
4329 487079 : if (nSrcPixelStride == nDstPixelStride)
4330 : {
4331 225235 : if (nSrcPixelStride == nSrcDataTypeSize)
4332 : {
4333 225167 : memcpy(pDstData, pSrcData, nWordCount * nSrcDataTypeSize);
4334 225167 : return;
4335 : }
4336 : }
4337 : }
4338 :
4339 : // Handle the more general case -- deals with conversion of data types
4340 : // directly.
4341 61300800 : switch (eSrcType)
4342 : {
4343 20311000 : case GDT_UInt8:
4344 20311000 : GDALCopyWordsFromT<unsigned char>(
4345 : static_cast<const unsigned char *>(pSrcData), nSrcPixelStride,
4346 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4347 20311000 : break;
4348 1802 : case GDT_Int8:
4349 1802 : GDALCopyWordsFromT<signed char>(
4350 : static_cast<const signed char *>(pSrcData), nSrcPixelStride,
4351 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4352 1802 : break;
4353 54651 : case GDT_UInt16:
4354 54651 : GDALCopyWordsFromT<unsigned short>(
4355 : static_cast<const unsigned short *>(pSrcData), nSrcPixelStride,
4356 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4357 54651 : break;
4358 6519570 : case GDT_Int16:
4359 6519570 : GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
4360 : nSrcPixelStride, false, pDstData,
4361 : eDstType, nDstPixelStride, nWordCount);
4362 6519570 : break;
4363 8016 : case GDT_UInt32:
4364 8016 : GDALCopyWordsFromT<unsigned int>(
4365 : static_cast<const unsigned int *>(pSrcData), nSrcPixelStride,
4366 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4367 8016 : break;
4368 12255600 : case GDT_Int32:
4369 12255600 : GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
4370 : nSrcPixelStride, false, pDstData, eDstType,
4371 : nDstPixelStride, nWordCount);
4372 12255600 : break;
4373 2205 : case GDT_UInt64:
4374 2205 : GDALCopyWordsFromT<std::uint64_t>(
4375 : static_cast<const std::uint64_t *>(pSrcData), nSrcPixelStride,
4376 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4377 2205 : break;
4378 11729 : case GDT_Int64:
4379 11729 : GDALCopyWordsFromT<std::int64_t>(
4380 : static_cast<const std::int64_t *>(pSrcData), nSrcPixelStride,
4381 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4382 11729 : break;
4383 1387 : case GDT_Float16:
4384 1387 : GDALCopyWordsFromT<GFloat16>(
4385 : static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, false,
4386 : pDstData, eDstType, nDstPixelStride, nWordCount);
4387 1387 : break;
4388 658514 : case GDT_Float32:
4389 658514 : GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
4390 : nSrcPixelStride, false, pDstData,
4391 : eDstType, nDstPixelStride, nWordCount);
4392 658514 : break;
4393 20715700 : case GDT_Float64:
4394 20715700 : GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
4395 : nSrcPixelStride, false, pDstData,
4396 : eDstType, nDstPixelStride, nWordCount);
4397 20715700 : break;
4398 478485 : case GDT_CInt16:
4399 478485 : GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
4400 : nSrcPixelStride, true, pDstData, eDstType,
4401 : nDstPixelStride, nWordCount);
4402 478485 : break;
4403 868 : case GDT_CInt32:
4404 868 : GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
4405 : nSrcPixelStride, true, pDstData, eDstType,
4406 : nDstPixelStride, nWordCount);
4407 868 : break;
4408 508 : case GDT_CFloat16:
4409 508 : GDALCopyWordsFromT<GFloat16>(
4410 : static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, true,
4411 : pDstData, eDstType, nDstPixelStride, nWordCount);
4412 508 : break;
4413 2437 : case GDT_CFloat32:
4414 2437 : GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
4415 : nSrcPixelStride, true, pDstData, eDstType,
4416 : nDstPixelStride, nWordCount);
4417 2437 : break;
4418 278404 : case GDT_CFloat64:
4419 278404 : GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
4420 : nSrcPixelStride, true, pDstData,
4421 : eDstType, nDstPixelStride, nWordCount);
4422 278404 : break;
4423 0 : case GDT_Unknown:
4424 : case GDT_TypeCount:
4425 0 : CPLAssert(false);
4426 : }
4427 : }
4428 :
4429 : /************************************************************************/
4430 : /* GDALCopyBits() */
4431 : /************************************************************************/
4432 :
4433 : /**
4434 : * Bitwise word copying.
4435 : *
4436 : * A function for moving sets of partial bytes around. Loosely
4437 : * speaking this is a bitwise analog to GDALCopyWords().
4438 : *
4439 : * It copies nStepCount "words" where each word is nBitCount bits long.
4440 : * The nSrcStep and nDstStep are the number of bits from the start of one
4441 : * word to the next (same as nBitCount if they are packed). The nSrcOffset
4442 : * and nDstOffset are the offset into the source and destination buffers
4443 : * to start at, also measured in bits.
4444 : *
4445 : * All bit offsets are assumed to start from the high order bit in a byte
4446 : * (i.e. most significant bit first). Currently this function is not very
4447 : * optimized, but it may be improved for some common cases in the future
4448 : * as needed.
4449 : *
4450 : * @param pabySrcData the source data buffer.
4451 : * @param nSrcOffset the offset (in bits) in pabySrcData to the start of the
4452 : * first word to copy.
4453 : * @param nSrcStep the offset in bits from the start of one source word to the
4454 : * start of the next.
4455 : * @param pabyDstData the destination data buffer.
4456 : * @param nDstOffset the offset (in bits) in pabyDstData to the start of the
4457 : * first word to copy over.
4458 : * @param nDstStep the offset in bits from the start of one word to the
4459 : * start of the next.
4460 : * @param nBitCount the number of bits in a word to be copied.
4461 : * @param nStepCount the number of words to copy.
4462 : */
4463 :
4464 0 : void GDALCopyBits(const GByte *pabySrcData, int nSrcOffset, int nSrcStep,
4465 : GByte *pabyDstData, int nDstOffset, int nDstStep,
4466 : int nBitCount, int nStepCount)
4467 :
4468 : {
4469 0 : VALIDATE_POINTER0(pabySrcData, "GDALCopyBits");
4470 :
4471 0 : for (int iStep = 0; iStep < nStepCount; iStep++)
4472 : {
4473 0 : for (int iBit = 0; iBit < nBitCount; iBit++)
4474 : {
4475 0 : if (pabySrcData[nSrcOffset >> 3] & (0x80 >> (nSrcOffset & 7)))
4476 0 : pabyDstData[nDstOffset >> 3] |= (0x80 >> (nDstOffset & 7));
4477 : else
4478 0 : pabyDstData[nDstOffset >> 3] &= ~(0x80 >> (nDstOffset & 7));
4479 :
4480 0 : nSrcOffset++;
4481 0 : nDstOffset++;
4482 : }
4483 :
4484 0 : nSrcOffset += (nSrcStep - nBitCount);
4485 0 : nDstOffset += (nDstStep - nBitCount);
4486 : }
4487 : }
4488 :
4489 : /************************************************************************/
4490 : /* GDALBandGetBestOverviewLevel() */
4491 : /* */
4492 : /* Returns the best overview level to satisfy the query or -1 if none */
4493 : /* Also updates nXOff, nYOff, nXSize, nYSize and psExtraArg when */
4494 : /* returning a valid overview level */
4495 : /************************************************************************/
4496 :
4497 0 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand, int &nXOff, int &nYOff,
4498 : int &nXSize, int &nYSize, int nBufXSize,
4499 : int nBufYSize)
4500 : {
4501 0 : return GDALBandGetBestOverviewLevel2(poBand, nXOff, nYOff, nXSize, nYSize,
4502 0 : nBufXSize, nBufYSize, nullptr);
4503 : }
4504 :
4505 524002 : int GDALBandGetBestOverviewLevel2(GDALRasterBand *poBand, int &nXOff,
4506 : int &nYOff, int &nXSize, int &nYSize,
4507 : int nBufXSize, int nBufYSize,
4508 : GDALRasterIOExtraArg *psExtraArg)
4509 : {
4510 524002 : if (psExtraArg != nullptr && psExtraArg->nVersion > 1 &&
4511 524002 : psExtraArg->bUseOnlyThisScale)
4512 109 : return -1;
4513 : /* -------------------------------------------------------------------- */
4514 : /* Compute the desired downsampling factor. It is */
4515 : /* based on the least reduced axis, and represents the number */
4516 : /* of source pixels to one destination pixel. */
4517 : /* -------------------------------------------------------------------- */
4518 523893 : const double dfDesiredDownsamplingFactor =
4519 523893 : ((nXSize / static_cast<double>(nBufXSize)) <
4520 361553 : (nYSize / static_cast<double>(nBufYSize)) ||
4521 : nBufYSize == 1)
4522 752282 : ? nXSize / static_cast<double>(nBufXSize)
4523 133164 : : nYSize / static_cast<double>(nBufYSize);
4524 :
4525 : /* -------------------------------------------------------------------- */
4526 : /* Find the overview level that largest downsampling factor (most */
4527 : /* downsampled) that is still less than (or only a little more) */
4528 : /* downsampled than the request. */
4529 : /* -------------------------------------------------------------------- */
4530 523893 : const int nOverviewCount = poBand->GetOverviewCount();
4531 523893 : GDALRasterBand *poBestOverview = nullptr;
4532 523893 : double dfBestDownsamplingFactor = 0;
4533 523893 : int nBestOverviewLevel = -1;
4534 :
4535 : const char *pszOversampligThreshold =
4536 523893 : CPLGetConfigOption("GDAL_OVERVIEW_OVERSAMPLING_THRESHOLD", nullptr);
4537 :
4538 : // Note: keep this logic for overview selection in sync between
4539 : // gdalwarp_lib.cpp and rasterio.cpp
4540 : // Cf https://github.com/OSGeo/gdal/pull/9040#issuecomment-1898524693
4541 : const double dfOversamplingThreshold =
4542 1047780 : pszOversampligThreshold ? CPLAtof(pszOversampligThreshold)
4543 523884 : : psExtraArg && psExtraArg->eResampleAlg != GRIORA_NearestNeighbour
4544 1047770 : ? 1.0
4545 523893 : : 1.2;
4546 526589 : for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
4547 : {
4548 5614 : GDALRasterBand *poOverview = poBand->GetOverview(iOverview);
4549 11228 : if (poOverview == nullptr ||
4550 11227 : poOverview->GetXSize() > poBand->GetXSize() ||
4551 5613 : poOverview->GetYSize() > poBand->GetYSize())
4552 : {
4553 1 : continue;
4554 : }
4555 :
4556 : // Compute downsampling factor of this overview
4557 : const double dfDownsamplingFactor = std::min(
4558 5613 : poBand->GetXSize() / static_cast<double>(poOverview->GetXSize()),
4559 11226 : poBand->GetYSize() / static_cast<double>(poOverview->GetYSize()));
4560 :
4561 : // Is it nearly the requested factor and better (lower) than
4562 : // the current best factor?
4563 : // Use an epsilon because of numerical instability.
4564 5613 : constexpr double EPSILON = 1e-1;
4565 5721 : if (dfDownsamplingFactor >=
4566 5613 : dfDesiredDownsamplingFactor * dfOversamplingThreshold +
4567 5505 : EPSILON ||
4568 : dfDownsamplingFactor <= dfBestDownsamplingFactor)
4569 : {
4570 108 : continue;
4571 : }
4572 :
4573 : // Ignore AVERAGE_BIT2GRAYSCALE overviews for RasterIO purposes.
4574 5505 : const char *pszResampling = poOverview->GetMetadataItem("RESAMPLING");
4575 :
4576 5505 : if (pszResampling != nullptr &&
4577 71 : STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2"))
4578 16 : continue;
4579 :
4580 : // OK, this is our new best overview.
4581 5489 : poBestOverview = poOverview;
4582 5489 : nBestOverviewLevel = iOverview;
4583 5489 : dfBestDownsamplingFactor = dfDownsamplingFactor;
4584 :
4585 5489 : if (std::abs(dfDesiredDownsamplingFactor - dfDownsamplingFactor) <
4586 : EPSILON)
4587 : {
4588 2918 : break;
4589 : }
4590 : }
4591 :
4592 : /* -------------------------------------------------------------------- */
4593 : /* If we didn't find an overview that helps us, just return */
4594 : /* indicating failure and the full resolution image will be used. */
4595 : /* -------------------------------------------------------------------- */
4596 523893 : if (nBestOverviewLevel < 0)
4597 520902 : return -1;
4598 :
4599 : /* -------------------------------------------------------------------- */
4600 : /* Recompute the source window in terms of the selected */
4601 : /* overview. */
4602 : /* -------------------------------------------------------------------- */
4603 : const double dfXFactor =
4604 2991 : poBand->GetXSize() / static_cast<double>(poBestOverview->GetXSize());
4605 : const double dfYFactor =
4606 2991 : poBand->GetYSize() / static_cast<double>(poBestOverview->GetYSize());
4607 2991 : CPLDebug("GDAL", "Selecting overview %d x %d", poBestOverview->GetXSize(),
4608 : poBestOverview->GetYSize());
4609 :
4610 8973 : const int nOXOff = std::min(poBestOverview->GetXSize() - 1,
4611 2991 : static_cast<int>(nXOff / dfXFactor + 0.5));
4612 8973 : const int nOYOff = std::min(poBestOverview->GetYSize() - 1,
4613 2991 : static_cast<int>(nYOff / dfYFactor + 0.5));
4614 2991 : int nOXSize = std::max(1, static_cast<int>(nXSize / dfXFactor + 0.5));
4615 2991 : int nOYSize = std::max(1, static_cast<int>(nYSize / dfYFactor + 0.5));
4616 2991 : if (nOXOff + nOXSize > poBestOverview->GetXSize())
4617 0 : nOXSize = poBestOverview->GetXSize() - nOXOff;
4618 2991 : if (nOYOff + nOYSize > poBestOverview->GetYSize())
4619 2 : nOYSize = poBestOverview->GetYSize() - nOYOff;
4620 :
4621 2991 : if (psExtraArg)
4622 : {
4623 2991 : if (psExtraArg->bFloatingPointWindowValidity)
4624 : {
4625 117 : psExtraArg->dfXOff /= dfXFactor;
4626 117 : psExtraArg->dfXSize /= dfXFactor;
4627 117 : psExtraArg->dfYOff /= dfYFactor;
4628 117 : psExtraArg->dfYSize /= dfYFactor;
4629 : }
4630 2874 : else if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
4631 : {
4632 16 : psExtraArg->bFloatingPointWindowValidity = true;
4633 16 : psExtraArg->dfXOff = nXOff / dfXFactor;
4634 16 : psExtraArg->dfXSize = nXSize / dfXFactor;
4635 16 : psExtraArg->dfYOff = nYOff / dfYFactor;
4636 16 : psExtraArg->dfYSize = nYSize / dfYFactor;
4637 : }
4638 : }
4639 :
4640 2991 : nXOff = nOXOff;
4641 2991 : nYOff = nOYOff;
4642 2991 : nXSize = nOXSize;
4643 2991 : nYSize = nOYSize;
4644 :
4645 2991 : return nBestOverviewLevel;
4646 : }
4647 :
4648 : /************************************************************************/
4649 : /* OverviewRasterIO() */
4650 : /* */
4651 : /* Special work function to utilize available overviews to */
4652 : /* more efficiently satisfy downsampled requests. It will */
4653 : /* return CE_Failure if there are no appropriate overviews */
4654 : /* available but it doesn't emit any error messages. */
4655 : /************************************************************************/
4656 :
4657 : //! @cond Doxygen_Suppress
4658 2 : CPLErr GDALRasterBand::OverviewRasterIO(
4659 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4660 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4661 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
4662 :
4663 : {
4664 : GDALRasterIOExtraArg sExtraArg;
4665 2 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4666 :
4667 2 : const int nOverview = GDALBandGetBestOverviewLevel2(
4668 : this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, &sExtraArg);
4669 2 : if (nOverview < 0)
4670 1 : return CE_Failure;
4671 :
4672 : /* -------------------------------------------------------------------- */
4673 : /* Recast the call in terms of the new raster layer. */
4674 : /* -------------------------------------------------------------------- */
4675 1 : GDALRasterBand *poOverviewBand = GetOverview(nOverview);
4676 1 : if (poOverviewBand == nullptr)
4677 0 : return CE_Failure;
4678 :
4679 1 : return poOverviewBand->RasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
4680 : pData, nBufXSize, nBufYSize, eBufType,
4681 1 : nPixelSpace, nLineSpace, &sExtraArg);
4682 : }
4683 :
4684 : /************************************************************************/
4685 : /* TryOverviewRasterIO() */
4686 : /************************************************************************/
4687 :
4688 362420 : CPLErr GDALRasterBand::TryOverviewRasterIO(
4689 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4690 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4691 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg,
4692 : int *pbTried)
4693 : {
4694 362420 : int nXOffMod = nXOff;
4695 362420 : int nYOffMod = nYOff;
4696 362420 : int nXSizeMod = nXSize;
4697 362420 : int nYSizeMod = nYSize;
4698 : GDALRasterIOExtraArg sExtraArg;
4699 :
4700 362420 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4701 :
4702 362420 : int iOvrLevel = GDALBandGetBestOverviewLevel2(
4703 : this, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize, nBufYSize,
4704 : &sExtraArg);
4705 :
4706 362420 : if (iOvrLevel >= 0)
4707 : {
4708 52 : GDALRasterBand *poOverviewBand = GetOverview(iOvrLevel);
4709 52 : if (poOverviewBand)
4710 : {
4711 52 : *pbTried = TRUE;
4712 52 : return poOverviewBand->RasterIO(
4713 : eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData,
4714 : nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
4715 52 : &sExtraArg);
4716 : }
4717 : }
4718 :
4719 362368 : *pbTried = FALSE;
4720 362368 : return CE_None;
4721 : }
4722 :
4723 : /************************************************************************/
4724 : /* TryOverviewRasterIO() */
4725 : /************************************************************************/
4726 :
4727 158606 : CPLErr GDALDataset::TryOverviewRasterIO(
4728 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4729 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4730 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
4731 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg,
4732 : int *pbTried)
4733 : {
4734 158606 : int nXOffMod = nXOff;
4735 158606 : int nYOffMod = nYOff;
4736 158606 : int nXSizeMod = nXSize;
4737 158606 : int nYSizeMod = nYSize;
4738 : GDALRasterIOExtraArg sExtraArg;
4739 158606 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4740 :
4741 317212 : int iOvrLevel = GDALBandGetBestOverviewLevel2(
4742 158606 : papoBands[0], nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize,
4743 : nBufYSize, &sExtraArg);
4744 :
4745 158647 : if (iOvrLevel >= 0 && papoBands[0]->GetOverview(iOvrLevel) != nullptr &&
4746 41 : papoBands[0]->GetOverview(iOvrLevel)->GetDataset() != nullptr)
4747 : {
4748 41 : *pbTried = TRUE;
4749 41 : return papoBands[0]->GetOverview(iOvrLevel)->GetDataset()->RasterIO(
4750 : eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData, nBufXSize,
4751 : nBufYSize, eBufType, nBandCount, panBandMap, nPixelSpace,
4752 41 : nLineSpace, nBandSpace, &sExtraArg);
4753 : }
4754 : else
4755 : {
4756 158565 : *pbTried = FALSE;
4757 158565 : return CE_None;
4758 : }
4759 : }
4760 :
4761 : /************************************************************************/
4762 : /* GDALDatasetGetBestOverviewLevel() */
4763 : /* */
4764 : /* Returns the best overview level to satisfy the query or -1 if none */
4765 : /* Also updates nXOff, nYOff, nXSize, nYSize when returning a valid */
4766 : /* overview level */
4767 : /************************************************************************/
4768 :
4769 4 : static int GDALDatasetGetBestOverviewLevel(GDALDataset *poDS, int &nXOff,
4770 : int &nYOff, int &nXSize, int &nYSize,
4771 : int nBufXSize, int nBufYSize,
4772 : int nBandCount,
4773 : const int *panBandMap,
4774 : GDALRasterIOExtraArg *psExtraArg)
4775 : {
4776 4 : int nOverviewCount = 0;
4777 4 : GDALRasterBand *poFirstBand = nullptr;
4778 :
4779 : /* -------------------------------------------------------------------- */
4780 : /* Check that all bands have the same number of overviews and */
4781 : /* that they have all the same size and block dimensions */
4782 : /* -------------------------------------------------------------------- */
4783 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
4784 : {
4785 8 : GDALRasterBand *poBand = poDS->GetRasterBand(panBandMap[iBand]);
4786 8 : if (poBand == nullptr)
4787 0 : return -1;
4788 8 : if (iBand == 0)
4789 : {
4790 4 : poFirstBand = poBand;
4791 4 : nOverviewCount = poBand->GetOverviewCount();
4792 : }
4793 4 : else if (nOverviewCount != poBand->GetOverviewCount())
4794 : {
4795 0 : CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
4796 : "mismatched overview count, use std method.");
4797 0 : return -1;
4798 : }
4799 : else
4800 : {
4801 4 : for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
4802 : {
4803 0 : GDALRasterBand *poOvrBand = poBand->GetOverview(iOverview);
4804 : GDALRasterBand *poOvrFirstBand =
4805 0 : poFirstBand->GetOverview(iOverview);
4806 0 : if (poOvrBand == nullptr || poOvrFirstBand == nullptr)
4807 0 : continue;
4808 :
4809 0 : if (poOvrFirstBand->GetXSize() != poOvrBand->GetXSize() ||
4810 0 : poOvrFirstBand->GetYSize() != poOvrBand->GetYSize())
4811 : {
4812 0 : CPLDebug("GDAL",
4813 : "GDALDataset::GetBestOverviewLevel() ... "
4814 : "mismatched overview sizes, use std method.");
4815 0 : return -1;
4816 : }
4817 0 : int nBlockXSizeFirst = 0;
4818 0 : int nBlockYSizeFirst = 0;
4819 0 : poOvrFirstBand->GetBlockSize(&nBlockXSizeFirst,
4820 : &nBlockYSizeFirst);
4821 :
4822 0 : int nBlockXSizeCurrent = 0;
4823 0 : int nBlockYSizeCurrent = 0;
4824 0 : poOvrBand->GetBlockSize(&nBlockXSizeCurrent,
4825 : &nBlockYSizeCurrent);
4826 :
4827 0 : if (nBlockXSizeFirst != nBlockXSizeCurrent ||
4828 0 : nBlockYSizeFirst != nBlockYSizeCurrent)
4829 : {
4830 0 : CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
4831 : "mismatched block sizes, use std method.");
4832 0 : return -1;
4833 : }
4834 : }
4835 : }
4836 : }
4837 4 : if (poFirstBand == nullptr)
4838 0 : return -1;
4839 :
4840 4 : return GDALBandGetBestOverviewLevel2(poFirstBand, nXOff, nYOff, nXSize,
4841 : nYSize, nBufXSize, nBufYSize,
4842 4 : psExtraArg);
4843 : }
4844 :
4845 : /************************************************************************/
4846 : /* BlockBasedRasterIO() */
4847 : /* */
4848 : /* This convenience function implements a dataset level */
4849 : /* RasterIO() interface based on calling down to fetch blocks, */
4850 : /* much like the GDALRasterBand::IRasterIO(), but it handles */
4851 : /* all bands at once, so that a format driver that handles a */
4852 : /* request for different bands of the same block efficiently */
4853 : /* (i.e. without re-reading interleaved data) will work efficiently. */
4854 : /* */
4855 : /* This method is intended to be called by an overridden */
4856 : /* IRasterIO() method in the driver specific GDALDataset */
4857 : /* derived class. */
4858 : /* */
4859 : /* Default internal implementation of RasterIO() ... utilizes */
4860 : /* the Block access methods to satisfy the request. This would */
4861 : /* normally only be overridden by formats with overviews. */
4862 : /* */
4863 : /* To keep things relatively simple, this method does not */
4864 : /* currently take advantage of some special cases addressed in */
4865 : /* GDALRasterBand::IRasterIO(), so it is likely best to only */
4866 : /* call it when you know it will help. That is in cases where */
4867 : /* data is at 1:1 to the buffer, and you know the driver is */
4868 : /* implementing interleaved IO efficiently on a block by block */
4869 : /* basis. Overviews will be used when possible. */
4870 : /************************************************************************/
4871 :
4872 64982 : CPLErr GDALDataset::BlockBasedRasterIO(
4873 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4874 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4875 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
4876 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
4877 :
4878 : {
4879 64982 : CPLAssert(nullptr != pData);
4880 :
4881 64982 : GByte **papabySrcBlock = nullptr;
4882 64982 : GDALRasterBlock *poBlock = nullptr;
4883 64982 : GDALRasterBlock **papoBlocks = nullptr;
4884 64982 : int nLBlockX = -1;
4885 64982 : int nLBlockY = -1;
4886 : int iBufYOff;
4887 : int iBufXOff;
4888 64982 : int nBlockXSize = 1;
4889 64982 : int nBlockYSize = 1;
4890 64982 : CPLErr eErr = CE_None;
4891 64982 : GDALDataType eDataType = GDT_UInt8;
4892 :
4893 64982 : const bool bUseIntegerRequestCoords =
4894 65020 : (!psExtraArg->bFloatingPointWindowValidity ||
4895 38 : (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
4896 36 : nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
4897 :
4898 : /* -------------------------------------------------------------------- */
4899 : /* Ensure that all bands share a common block size and data type. */
4900 : /* -------------------------------------------------------------------- */
4901 308187 : for (int iBand = 0; iBand < nBandCount; iBand++)
4902 : {
4903 243205 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
4904 :
4905 243205 : if (iBand == 0)
4906 : {
4907 64982 : poBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
4908 64982 : eDataType = poBand->GetRasterDataType();
4909 : }
4910 : else
4911 : {
4912 178223 : int nThisBlockXSize = 0;
4913 178223 : int nThisBlockYSize = 0;
4914 178223 : poBand->GetBlockSize(&nThisBlockXSize, &nThisBlockYSize);
4915 178223 : if (nThisBlockXSize != nBlockXSize ||
4916 178223 : nThisBlockYSize != nBlockYSize)
4917 : {
4918 0 : CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
4919 : "mismatched block sizes, use std method.");
4920 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
4921 : pData, nBufXSize, nBufYSize, eBufType,
4922 : nBandCount, panBandMap, nPixelSpace,
4923 0 : nLineSpace, nBandSpace, psExtraArg);
4924 : }
4925 :
4926 178223 : if (eDataType != poBand->GetRasterDataType() &&
4927 0 : (nXSize != nBufXSize || nYSize != nBufYSize))
4928 : {
4929 0 : CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
4930 : "mismatched band data types, use std method.");
4931 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
4932 : pData, nBufXSize, nBufYSize, eBufType,
4933 : nBandCount, panBandMap, nPixelSpace,
4934 0 : nLineSpace, nBandSpace, psExtraArg);
4935 : }
4936 : }
4937 : }
4938 :
4939 : /* ==================================================================== */
4940 : /* In this special case at full resolution we step through in */
4941 : /* blocks, turning the request over to the per-band */
4942 : /* IRasterIO(), but ensuring that all bands of one block are */
4943 : /* called before proceeding to the next. */
4944 : /* ==================================================================== */
4945 :
4946 64982 : if (nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
4947 : {
4948 : GDALRasterIOExtraArg sDummyExtraArg;
4949 64978 : INIT_RASTERIO_EXTRA_ARG(sDummyExtraArg);
4950 :
4951 64978 : int nChunkYSize = 0;
4952 64978 : int nChunkXSize = 0;
4953 :
4954 213434 : for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff += nChunkYSize)
4955 : {
4956 149472 : const int nChunkYOff = iBufYOff + nYOff;
4957 149472 : nChunkYSize = nBlockYSize - (nChunkYOff % nBlockYSize);
4958 149472 : if (nChunkYOff + nChunkYSize > nYOff + nYSize)
4959 59977 : nChunkYSize = (nYOff + nYSize) - nChunkYOff;
4960 :
4961 822752 : for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff += nChunkXSize)
4962 : {
4963 674295 : const int nChunkXOff = iBufXOff + nXOff;
4964 674295 : nChunkXSize = nBlockXSize - (nChunkXOff % nBlockXSize);
4965 674295 : if (nChunkXOff + nChunkXSize > nXOff + nXSize)
4966 70691 : nChunkXSize = (nXOff + nXSize) - nChunkXOff;
4967 :
4968 674295 : GByte *pabyChunkData =
4969 674295 : static_cast<GByte *>(pData) + iBufXOff * nPixelSpace +
4970 674295 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace;
4971 :
4972 3282490 : for (int iBand = 0; iBand < nBandCount; iBand++)
4973 : {
4974 2609210 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
4975 :
4976 5218420 : eErr = poBand->IRasterIO(
4977 : eRWFlag, nChunkXOff, nChunkYOff, nChunkXSize,
4978 : nChunkYSize,
4979 2609210 : pabyChunkData +
4980 2609210 : static_cast<GPtrDiff_t>(iBand) * nBandSpace,
4981 : nChunkXSize, nChunkYSize, eBufType, nPixelSpace,
4982 2609210 : nLineSpace, &sDummyExtraArg);
4983 2609210 : if (eErr != CE_None)
4984 1015 : return eErr;
4985 : }
4986 : }
4987 :
4988 167371 : if (psExtraArg->pfnProgress != nullptr &&
4989 18914 : !psExtraArg->pfnProgress(
4990 167371 : 1.0 * std::min(nBufYSize, iBufYOff + nChunkYSize) /
4991 : nBufYSize,
4992 : "", psExtraArg->pProgressData))
4993 : {
4994 1 : return CE_Failure;
4995 : }
4996 : }
4997 :
4998 63962 : return CE_None;
4999 : }
5000 :
5001 : /* Below code is not compatible with that case. It would need a complete */
5002 : /* separate code like done in GDALRasterBand::IRasterIO. */
5003 4 : if (eRWFlag == GF_Write && (nBufXSize < nXSize || nBufYSize < nYSize))
5004 : {
5005 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
5006 : nBufXSize, nBufYSize, eBufType, nBandCount,
5007 : panBandMap, nPixelSpace, nLineSpace,
5008 0 : nBandSpace, psExtraArg);
5009 : }
5010 :
5011 : /* We could have a smarter implementation, but that will do for now */
5012 4 : if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
5013 0 : (nBufXSize != nXSize || nBufYSize != nYSize))
5014 : {
5015 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
5016 : nBufXSize, nBufYSize, eBufType, nBandCount,
5017 : panBandMap, nPixelSpace, nLineSpace,
5018 0 : nBandSpace, psExtraArg);
5019 : }
5020 :
5021 : /* ==================================================================== */
5022 : /* Loop reading required source blocks to satisfy output */
5023 : /* request. This is the most general implementation. */
5024 : /* ==================================================================== */
5025 :
5026 4 : const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
5027 :
5028 : papabySrcBlock =
5029 4 : static_cast<GByte **>(CPLCalloc(sizeof(GByte *), nBandCount));
5030 : papoBlocks =
5031 4 : static_cast<GDALRasterBlock **>(CPLCalloc(sizeof(void *), nBandCount));
5032 :
5033 : /* -------------------------------------------------------------------- */
5034 : /* Select an overview level if appropriate. */
5035 : /* -------------------------------------------------------------------- */
5036 :
5037 : GDALRasterIOExtraArg sExtraArg;
5038 4 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
5039 4 : const int nOverviewLevel = GDALDatasetGetBestOverviewLevel(
5040 : this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nBandCount,
5041 : panBandMap, &sExtraArg);
5042 4 : if (nOverviewLevel >= 0)
5043 : {
5044 2 : GetRasterBand(panBandMap[0])
5045 2 : ->GetOverview(nOverviewLevel)
5046 2 : ->GetBlockSize(&nBlockXSize, &nBlockYSize);
5047 : }
5048 :
5049 4 : double dfXOff = nXOff;
5050 4 : double dfYOff = nYOff;
5051 4 : double dfXSize = nXSize;
5052 4 : double dfYSize = nYSize;
5053 4 : if (sExtraArg.bFloatingPointWindowValidity)
5054 : {
5055 2 : dfXOff = sExtraArg.dfXOff;
5056 2 : dfYOff = sExtraArg.dfYOff;
5057 2 : dfXSize = sExtraArg.dfXSize;
5058 2 : dfYSize = sExtraArg.dfYSize;
5059 : }
5060 :
5061 : /* -------------------------------------------------------------------- */
5062 : /* Compute stepping increment. */
5063 : /* -------------------------------------------------------------------- */
5064 4 : const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
5065 4 : const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
5066 :
5067 4 : constexpr double EPS = 1e-10;
5068 : /* -------------------------------------------------------------------- */
5069 : /* Loop over buffer computing source locations. */
5070 : /* -------------------------------------------------------------------- */
5071 36 : for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
5072 : {
5073 : GPtrDiff_t iSrcOffset;
5074 :
5075 : // Add small epsilon to avoid some numeric precision issues.
5076 32 : const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
5077 32 : const int iSrcY = static_cast<int>(std::min(
5078 32 : std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
5079 :
5080 32 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
5081 : static_cast<GPtrDiff_t>(nLineSpace);
5082 :
5083 302 : for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff++)
5084 : {
5085 270 : const double dfSrcX = (iBufXOff + 0.5) * dfSrcXInc + dfXOff + EPS;
5086 270 : const int iSrcX = static_cast<int>(std::min(
5087 270 : std::max(0.0, dfSrcX), static_cast<double>(nRasterXSize - 1)));
5088 :
5089 : // FIXME: this code likely doesn't work if the dirty block gets
5090 : // flushed to disk before being completely written. In the meantime,
5091 : // bJustInitialize should probably be set to FALSE even if it is not
5092 : // ideal performance wise, and for lossy compression
5093 :
5094 : /* --------------------------------------------------------------------
5095 : */
5096 : /* Ensure we have the appropriate block loaded. */
5097 : /* --------------------------------------------------------------------
5098 : */
5099 270 : if (iSrcX < nLBlockX * nBlockXSize ||
5100 270 : iSrcX - nBlockXSize >= nLBlockX * nBlockXSize ||
5101 266 : iSrcY < nLBlockY * nBlockYSize ||
5102 266 : iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
5103 : {
5104 4 : nLBlockX = iSrcX / nBlockXSize;
5105 4 : nLBlockY = iSrcY / nBlockYSize;
5106 :
5107 4 : const bool bJustInitialize =
5108 0 : eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
5109 0 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
5110 4 : nXOff <= nLBlockX * nBlockXSize &&
5111 0 : nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
5112 : /*bool bMemZeroBuffer = FALSE;
5113 : if( eRWFlag == GF_Write && !bJustInitialize &&
5114 : nXOff <= nLBlockX * nBlockXSize &&
5115 : nYOff <= nLBlockY * nBlockYSize &&
5116 : (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
5117 : (nXOff + nXSize == GetRasterXSize() &&
5118 : (nLBlockX+1) * nBlockXSize > GetRasterXSize())) &&
5119 : (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
5120 : (nYOff + nYSize == GetRasterYSize() &&
5121 : (nLBlockY+1) * nBlockYSize > GetRasterYSize())) )
5122 : {
5123 : bJustInitialize = TRUE;
5124 : bMemZeroBuffer = TRUE;
5125 : }*/
5126 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
5127 : {
5128 8 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
5129 8 : if (nOverviewLevel >= 0)
5130 2 : poBand = poBand->GetOverview(nOverviewLevel);
5131 16 : poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY,
5132 8 : bJustInitialize);
5133 8 : if (poBlock == nullptr)
5134 : {
5135 0 : eErr = CE_Failure;
5136 0 : goto CleanupAndReturn;
5137 : }
5138 :
5139 8 : if (eRWFlag == GF_Write)
5140 0 : poBlock->MarkDirty();
5141 :
5142 8 : if (papoBlocks[iBand] != nullptr)
5143 0 : papoBlocks[iBand]->DropLock();
5144 :
5145 8 : papoBlocks[iBand] = poBlock;
5146 :
5147 8 : papabySrcBlock[iBand] =
5148 8 : static_cast<GByte *>(poBlock->GetDataRef());
5149 : /*if( bMemZeroBuffer )
5150 : {
5151 : memset(papabySrcBlock[iBand], 0,
5152 : static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
5153 : * nBlockYSize);
5154 : }*/
5155 : }
5156 : }
5157 :
5158 : /* --------------------------------------------------------------------
5159 : */
5160 : /* Copy over this pixel of data. */
5161 : /* --------------------------------------------------------------------
5162 : */
5163 270 : iSrcOffset = (static_cast<GPtrDiff_t>(iSrcX) -
5164 270 : static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
5165 270 : (static_cast<GPtrDiff_t>(iSrcY) -
5166 270 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
5167 270 : nBlockXSize) *
5168 270 : nBandDataSize;
5169 :
5170 980 : for (int iBand = 0; iBand < nBandCount; iBand++)
5171 : {
5172 710 : GByte *pabySrcBlock = papabySrcBlock[iBand];
5173 710 : GPtrDiff_t iBandBufOffset =
5174 710 : iBufOffset + static_cast<GPtrDiff_t>(iBand) *
5175 : static_cast<GPtrDiff_t>(nBandSpace);
5176 :
5177 710 : if (eDataType == eBufType)
5178 : {
5179 710 : if (eRWFlag == GF_Read)
5180 710 : memcpy(static_cast<GByte *>(pData) + iBandBufOffset,
5181 710 : pabySrcBlock + iSrcOffset, nBandDataSize);
5182 : else
5183 0 : memcpy(pabySrcBlock + iSrcOffset,
5184 : static_cast<const GByte *>(pData) +
5185 0 : iBandBufOffset,
5186 : nBandDataSize);
5187 : }
5188 : else
5189 : {
5190 : /* type to type conversion ... ouch, this is expensive way
5191 : of handling single words */
5192 :
5193 0 : if (eRWFlag == GF_Read)
5194 0 : GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
5195 : static_cast<GByte *>(pData) +
5196 0 : iBandBufOffset,
5197 : eBufType, 0, 1);
5198 : else
5199 0 : GDALCopyWords64(static_cast<const GByte *>(pData) +
5200 0 : iBandBufOffset,
5201 0 : eBufType, 0, pabySrcBlock + iSrcOffset,
5202 : eDataType, 0, 1);
5203 : }
5204 : }
5205 :
5206 270 : iBufOffset += static_cast<int>(nPixelSpace);
5207 : }
5208 : }
5209 :
5210 : /* -------------------------------------------------------------------- */
5211 : /* CleanupAndReturn. */
5212 : /* -------------------------------------------------------------------- */
5213 4 : CleanupAndReturn:
5214 4 : CPLFree(papabySrcBlock);
5215 4 : if (papoBlocks != nullptr)
5216 : {
5217 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
5218 : {
5219 8 : if (papoBlocks[iBand] != nullptr)
5220 8 : papoBlocks[iBand]->DropLock();
5221 : }
5222 4 : CPLFree(papoBlocks);
5223 : }
5224 :
5225 4 : return eErr;
5226 : }
5227 :
5228 : //! @endcond
5229 :
5230 : /************************************************************************/
5231 : /* GDALCopyWholeRasterGetSwathSize() */
5232 : /************************************************************************/
5233 :
5234 3359 : static void GDALCopyWholeRasterGetSwathSize(GDALRasterBand *poSrcPrototypeBand,
5235 : GDALRasterBand *poDstPrototypeBand,
5236 : int nBandCount,
5237 : int bDstIsCompressed,
5238 : int bInterleave, int *pnSwathCols,
5239 : int *pnSwathLines)
5240 : {
5241 3359 : GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
5242 3359 : int nSrcBlockXSize = 0;
5243 3359 : int nSrcBlockYSize = 0;
5244 3359 : int nBlockXSize = 0;
5245 3359 : int nBlockYSize = 0;
5246 :
5247 3359 : int nXSize = poSrcPrototypeBand->GetXSize();
5248 3359 : int nYSize = poSrcPrototypeBand->GetYSize();
5249 :
5250 3359 : poSrcPrototypeBand->GetBlockSize(&nSrcBlockXSize, &nSrcBlockYSize);
5251 3359 : poDstPrototypeBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
5252 :
5253 3359 : const int nMaxBlockXSize = std::max(nBlockXSize, nSrcBlockXSize);
5254 3359 : const int nMaxBlockYSize = std::max(nBlockYSize, nSrcBlockYSize);
5255 :
5256 3359 : int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5257 3359 : if (bInterleave)
5258 583 : nPixelSize *= nBandCount;
5259 :
5260 : // aim for one row of blocks. Do not settle for less.
5261 3359 : int nSwathCols = nXSize;
5262 3359 : int nSwathLines = nMaxBlockYSize;
5263 :
5264 : const char *pszSrcCompression =
5265 3359 : poSrcPrototypeBand->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
5266 3359 : if (pszSrcCompression == nullptr)
5267 : {
5268 3339 : auto poSrcDS = poSrcPrototypeBand->GetDataset();
5269 3339 : if (poSrcDS)
5270 : pszSrcCompression =
5271 3333 : poSrcDS->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
5272 : }
5273 :
5274 : /* -------------------------------------------------------------------- */
5275 : /* What will our swath size be? */
5276 : /* -------------------------------------------------------------------- */
5277 : // When writing interleaved data in a compressed format, we want to be sure
5278 : // that each block will only be written once, so the swath size must not be
5279 : // greater than the block cache.
5280 3359 : const char *pszSwathSize = CPLGetConfigOption("GDAL_SWATH_SIZE", nullptr);
5281 : int nTargetSwathSize;
5282 3359 : if (pszSwathSize != nullptr)
5283 0 : nTargetSwathSize = static_cast<int>(
5284 0 : std::min(GIntBig(INT_MAX), CPLAtoGIntBig(pszSwathSize)));
5285 : else
5286 : {
5287 : // As a default, take one 1/4 of the cache size.
5288 3359 : nTargetSwathSize = static_cast<int>(
5289 3359 : std::min(GIntBig(INT_MAX), GDALGetCacheMax64() / 4));
5290 :
5291 : // but if the minimum idal swath buf size is less, then go for it to
5292 : // avoid unnecessarily abusing RAM usage.
5293 : // but try to use 10 MB at least.
5294 3359 : GIntBig nIdealSwathBufSize =
5295 3359 : static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize;
5296 3359 : int nMinTargetSwathSize = 10 * 1000 * 1000;
5297 :
5298 3359 : if ((poSrcPrototypeBand->GetSuggestedBlockAccessPattern() &
5299 3359 : GSBAP_LARGEST_CHUNK_POSSIBLE) != 0)
5300 : {
5301 1 : nMinTargetSwathSize = nTargetSwathSize;
5302 : }
5303 :
5304 3359 : if (nIdealSwathBufSize < nTargetSwathSize &&
5305 3349 : nIdealSwathBufSize < nMinTargetSwathSize)
5306 : {
5307 3346 : nIdealSwathBufSize = nMinTargetSwathSize;
5308 : }
5309 :
5310 3359 : if (pszSrcCompression != nullptr &&
5311 178 : EQUAL(pszSrcCompression, "JPEG2000") &&
5312 0 : (!bDstIsCompressed || ((nSrcBlockXSize % nBlockXSize) == 0 &&
5313 0 : (nSrcBlockYSize % nBlockYSize) == 0)))
5314 : {
5315 2 : nIdealSwathBufSize =
5316 4 : std::max(nIdealSwathBufSize, static_cast<GIntBig>(nSwathCols) *
5317 2 : nSrcBlockYSize * nPixelSize);
5318 : }
5319 3359 : if (nTargetSwathSize > nIdealSwathBufSize)
5320 3346 : nTargetSwathSize = static_cast<int>(
5321 3346 : std::min(GIntBig(INT_MAX), nIdealSwathBufSize));
5322 : }
5323 :
5324 3359 : if (nTargetSwathSize < 1000000)
5325 8 : nTargetSwathSize = 1000000;
5326 :
5327 : /* But let's check that */
5328 3580 : if (bDstIsCompressed && bInterleave &&
5329 221 : nTargetSwathSize > GDALGetCacheMax64())
5330 : {
5331 0 : CPLError(CE_Warning, CPLE_AppDefined,
5332 : "When translating into a compressed interleave format, "
5333 : "the block cache size (" CPL_FRMT_GIB ") "
5334 : "should be at least the size of the swath (%d) "
5335 : "(GDAL_SWATH_SIZE config. option)",
5336 : GDALGetCacheMax64(), nTargetSwathSize);
5337 : }
5338 :
5339 : #define IS_DIVIDER_OF(x, y) ((y) % (x) == 0)
5340 : #define ROUND_TO(x, y) (((x) / (y)) * (y))
5341 :
5342 : // if both input and output datasets are tiled, that the tile dimensions
5343 : // are "compatible", try to stick to a swath dimension that is a multiple
5344 : // of input and output block dimensions.
5345 3359 : if (nBlockXSize != nXSize && nSrcBlockXSize != nXSize &&
5346 47 : IS_DIVIDER_OF(nBlockXSize, nMaxBlockXSize) &&
5347 47 : IS_DIVIDER_OF(nSrcBlockXSize, nMaxBlockXSize) &&
5348 47 : IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
5349 47 : IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
5350 : {
5351 47 : if (static_cast<GIntBig>(nMaxBlockXSize) * nMaxBlockYSize *
5352 47 : nPixelSize <=
5353 47 : static_cast<GIntBig>(nTargetSwathSize))
5354 : {
5355 47 : nSwathCols = nTargetSwathSize / (nMaxBlockYSize * nPixelSize);
5356 47 : nSwathCols = ROUND_TO(nSwathCols, nMaxBlockXSize);
5357 47 : if (nSwathCols == 0)
5358 0 : nSwathCols = nMaxBlockXSize;
5359 47 : if (nSwathCols > nXSize)
5360 45 : nSwathCols = nXSize;
5361 47 : nSwathLines = nMaxBlockYSize;
5362 :
5363 47 : if (static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize >
5364 47 : static_cast<GIntBig>(nTargetSwathSize))
5365 : {
5366 0 : nSwathCols = nXSize;
5367 0 : nSwathLines = nBlockYSize;
5368 : }
5369 : }
5370 : }
5371 :
5372 3359 : const GIntBig nMemoryPerCol = static_cast<GIntBig>(nSwathCols) * nPixelSize;
5373 3359 : const GIntBig nSwathBufSize = nMemoryPerCol * nSwathLines;
5374 3359 : if (nSwathBufSize > static_cast<GIntBig>(nTargetSwathSize))
5375 : {
5376 1 : nSwathLines = static_cast<int>(nTargetSwathSize / nMemoryPerCol);
5377 1 : if (nSwathLines == 0)
5378 1 : nSwathLines = 1;
5379 :
5380 1 : CPLDebug(
5381 : "GDAL",
5382 : "GDALCopyWholeRasterGetSwathSize(): adjusting to %d line swath "
5383 : "since requirement (" CPL_FRMT_GIB " bytes) exceed target swath "
5384 : "size (%d bytes) (GDAL_SWATH_SIZE config. option)",
5385 1 : nSwathLines, nBlockYSize * nMemoryPerCol, nTargetSwathSize);
5386 : }
5387 : // If we are processing single scans, try to handle several at once.
5388 : // If we are handling swaths already, only grow the swath if a row
5389 : // of blocks is substantially less than our target buffer size.
5390 3358 : else if (nSwathLines == 1 ||
5391 2807 : nMemoryPerCol * nSwathLines <
5392 2807 : static_cast<GIntBig>(nTargetSwathSize) / 10)
5393 : {
5394 3330 : nSwathLines = std::min(
5395 : nYSize,
5396 3330 : std::max(1, static_cast<int>(nTargetSwathSize / nMemoryPerCol)));
5397 :
5398 : /* If possible try to align to source and target block height */
5399 3330 : if ((nSwathLines % nMaxBlockYSize) != 0 &&
5400 273 : nSwathLines > nMaxBlockYSize &&
5401 273 : IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
5402 244 : IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
5403 217 : nSwathLines = ROUND_TO(nSwathLines, nMaxBlockYSize);
5404 : }
5405 :
5406 3359 : if (pszSrcCompression != nullptr && EQUAL(pszSrcCompression, "JPEG2000") &&
5407 0 : (!bDstIsCompressed || (IS_DIVIDER_OF(nBlockXSize, nSrcBlockXSize) &&
5408 0 : IS_DIVIDER_OF(nBlockYSize, nSrcBlockYSize))))
5409 : {
5410 : // Typical use case: converting from Pleaiades that is 2048x2048 tiled.
5411 2 : if (nSwathLines < nSrcBlockYSize)
5412 : {
5413 0 : nSwathLines = nSrcBlockYSize;
5414 :
5415 : // Number of pixels that can be read/write simultaneously.
5416 0 : nSwathCols = nTargetSwathSize / (nSrcBlockXSize * nPixelSize);
5417 0 : nSwathCols = ROUND_TO(nSwathCols, nSrcBlockXSize);
5418 0 : if (nSwathCols == 0)
5419 0 : nSwathCols = nSrcBlockXSize;
5420 0 : if (nSwathCols > nXSize)
5421 0 : nSwathCols = nXSize;
5422 :
5423 0 : CPLDebug(
5424 : "GDAL",
5425 : "GDALCopyWholeRasterGetSwathSize(): because of compression and "
5426 : "too high block, "
5427 : "use partial width at one time");
5428 : }
5429 2 : else if ((nSwathLines % nSrcBlockYSize) != 0)
5430 : {
5431 : /* Round on a multiple of nSrcBlockYSize */
5432 0 : nSwathLines = ROUND_TO(nSwathLines, nSrcBlockYSize);
5433 0 : CPLDebug(
5434 : "GDAL",
5435 : "GDALCopyWholeRasterGetSwathSize(): because of compression, "
5436 : "round nSwathLines to block height : %d",
5437 : nSwathLines);
5438 : }
5439 : }
5440 3357 : else if (bDstIsCompressed)
5441 : {
5442 419 : if (nSwathLines < nBlockYSize)
5443 : {
5444 146 : nSwathLines = nBlockYSize;
5445 :
5446 : // Number of pixels that can be read/write simultaneously.
5447 146 : nSwathCols = nTargetSwathSize / (nSwathLines * nPixelSize);
5448 146 : nSwathCols = ROUND_TO(nSwathCols, nBlockXSize);
5449 146 : if (nSwathCols == 0)
5450 0 : nSwathCols = nBlockXSize;
5451 146 : if (nSwathCols > nXSize)
5452 146 : nSwathCols = nXSize;
5453 :
5454 146 : CPLDebug(
5455 : "GDAL",
5456 : "GDALCopyWholeRasterGetSwathSize(): because of compression and "
5457 : "too high block, "
5458 : "use partial width at one time");
5459 : }
5460 273 : else if ((nSwathLines % nBlockYSize) != 0)
5461 : {
5462 : // Round on a multiple of nBlockYSize.
5463 9 : nSwathLines = ROUND_TO(nSwathLines, nBlockYSize);
5464 9 : CPLDebug(
5465 : "GDAL",
5466 : "GDALCopyWholeRasterGetSwathSize(): because of compression, "
5467 : "round nSwathLines to block height : %d",
5468 : nSwathLines);
5469 : }
5470 : }
5471 :
5472 3359 : *pnSwathCols = nSwathCols;
5473 3359 : *pnSwathLines = nSwathLines;
5474 3359 : }
5475 :
5476 : /************************************************************************/
5477 : /* GDALDatasetCopyWholeRaster() */
5478 : /************************************************************************/
5479 :
5480 : /**
5481 : * \brief Copy all dataset raster data.
5482 : *
5483 : * This function copies the complete raster contents of one dataset to
5484 : * another similarly configured dataset. The source and destination
5485 : * dataset must have the same number of bands, and the same width
5486 : * and height. The bands do not have to have the same data type.
5487 : *
5488 : * This function is primarily intended to support implementation of
5489 : * driver specific CreateCopy() functions. It implements efficient copying,
5490 : * in particular "chunking" the copy in substantial blocks and, if appropriate,
5491 : * performing the transfer in a pixel interleaved fashion.
5492 : *
5493 : * Currently the only papszOptions value supported are :
5494 : * <ul>
5495 : * <li>"INTERLEAVE=PIXEL/BAND" to force pixel (resp. band) interleaved read and
5496 : * write access pattern (this does not modify the layout of the destination
5497 : * data)</li>
5498 : * <li>"COMPRESSED=YES" to force alignment on target dataset block
5499 : * sizes to achieve best compression.</li>
5500 : * <li>"SKIP_HOLES=YES" to skip chunks
5501 : * for which GDALGetDataCoverageStatus() returns GDAL_DATA_COVERAGE_STATUS_EMPTY
5502 : * (GDAL >= 2.2)</li>
5503 : * </ul>
5504 : * More options may be supported in the future.
5505 : *
5506 : * @param hSrcDS the source dataset
5507 : * @param hDstDS the destination dataset
5508 : * @param papszOptions transfer hints in "StringList" Name=Value format.
5509 : * @param pfnProgress progress reporting function.
5510 : * @param pProgressData callback data for progress function.
5511 : *
5512 : * @return CE_None on success, or CE_Failure on failure.
5513 : */
5514 :
5515 3331 : CPLErr CPL_STDCALL GDALDatasetCopyWholeRaster(GDALDatasetH hSrcDS,
5516 : GDALDatasetH hDstDS,
5517 : CSLConstList papszOptions,
5518 : GDALProgressFunc pfnProgress,
5519 : void *pProgressData)
5520 :
5521 : {
5522 3331 : VALIDATE_POINTER1(hSrcDS, "GDALDatasetCopyWholeRaster", CE_Failure);
5523 3331 : VALIDATE_POINTER1(hDstDS, "GDALDatasetCopyWholeRaster", CE_Failure);
5524 :
5525 3331 : GDALDataset *poSrcDS = GDALDataset::FromHandle(hSrcDS);
5526 3331 : GDALDataset *poDstDS = GDALDataset::FromHandle(hDstDS);
5527 :
5528 3331 : if (pfnProgress == nullptr)
5529 0 : pfnProgress = GDALDummyProgress;
5530 :
5531 : /* -------------------------------------------------------------------- */
5532 : /* Confirm the datasets match in size and band counts. */
5533 : /* -------------------------------------------------------------------- */
5534 3331 : const int nXSize = poDstDS->GetRasterXSize();
5535 3331 : const int nYSize = poDstDS->GetRasterYSize();
5536 3331 : const int nBandCount = poDstDS->GetRasterCount();
5537 :
5538 3331 : if (poSrcDS->GetRasterXSize() != nXSize ||
5539 6662 : poSrcDS->GetRasterYSize() != nYSize ||
5540 3331 : poSrcDS->GetRasterCount() != nBandCount)
5541 : {
5542 0 : CPLError(CE_Failure, CPLE_AppDefined,
5543 : "Input and output dataset sizes or band counts do not\n"
5544 : "match in GDALDatasetCopyWholeRaster()");
5545 0 : return CE_Failure;
5546 : }
5547 :
5548 : /* -------------------------------------------------------------------- */
5549 : /* Report preliminary (0) progress. */
5550 : /* -------------------------------------------------------------------- */
5551 3331 : if (!pfnProgress(0.0, nullptr, pProgressData))
5552 : {
5553 1 : CPLError(CE_Failure, CPLE_UserInterrupt,
5554 : "User terminated CreateCopy()");
5555 1 : return CE_Failure;
5556 : }
5557 :
5558 : /* -------------------------------------------------------------------- */
5559 : /* Get our prototype band, and assume the others are similarly */
5560 : /* configured. */
5561 : /* -------------------------------------------------------------------- */
5562 3330 : if (nBandCount == 0)
5563 0 : return CE_None;
5564 :
5565 3330 : GDALRasterBand *poSrcPrototypeBand = poSrcDS->GetRasterBand(1);
5566 3330 : GDALRasterBand *poDstPrototypeBand = poDstDS->GetRasterBand(1);
5567 3330 : GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
5568 :
5569 : /* -------------------------------------------------------------------- */
5570 : /* Do we want to try and do the operation in a pixel */
5571 : /* interleaved fashion? */
5572 : /* -------------------------------------------------------------------- */
5573 3330 : bool bInterleave = false;
5574 : const char *pszInterleave =
5575 3330 : poSrcDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
5576 3330 : if (pszInterleave != nullptr &&
5577 2926 : (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
5578 209 : bInterleave = true;
5579 :
5580 3330 : pszInterleave = poDstDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
5581 3330 : if (pszInterleave != nullptr &&
5582 2865 : (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
5583 528 : bInterleave = true;
5584 :
5585 3330 : pszInterleave = CSLFetchNameValue(papszOptions, "INTERLEAVE");
5586 3330 : if (pszInterleave != nullptr && EQUAL(pszInterleave, "PIXEL"))
5587 5 : bInterleave = true;
5588 3325 : else if (pszInterleave != nullptr && EQUAL(pszInterleave, "BAND"))
5589 13 : bInterleave = false;
5590 : // attributes is specific to the TileDB driver
5591 3312 : else if (pszInterleave != nullptr && EQUAL(pszInterleave, "ATTRIBUTES"))
5592 4 : bInterleave = true;
5593 3308 : else if (pszInterleave != nullptr)
5594 : {
5595 0 : CPLError(CE_Warning, CPLE_NotSupported,
5596 : "Unsupported value for option INTERLEAVE");
5597 : }
5598 :
5599 : // If the destination is compressed, we must try to write blocks just once,
5600 : // to save disk space (GTiff case for example), and to avoid data loss
5601 : // (JPEG compression for example).
5602 3330 : bool bDstIsCompressed = false;
5603 : const char *pszDstCompressed =
5604 3330 : CSLFetchNameValue(papszOptions, "COMPRESSED");
5605 3330 : if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
5606 393 : bDstIsCompressed = true;
5607 :
5608 : /* -------------------------------------------------------------------- */
5609 : /* What will our swath size be? */
5610 : /* -------------------------------------------------------------------- */
5611 :
5612 3330 : int nSwathCols = 0;
5613 3330 : int nSwathLines = 0;
5614 3330 : GDALCopyWholeRasterGetSwathSize(poSrcPrototypeBand, poDstPrototypeBand,
5615 : nBandCount, bDstIsCompressed, bInterleave,
5616 : &nSwathCols, &nSwathLines);
5617 :
5618 3330 : int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5619 3330 : if (bInterleave)
5620 583 : nPixelSize *= nBandCount;
5621 :
5622 3330 : void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
5623 3330 : if (pSwathBuf == nullptr)
5624 : {
5625 0 : return CE_Failure;
5626 : }
5627 :
5628 3330 : CPLDebug("GDAL",
5629 : "GDALDatasetCopyWholeRaster(): %d*%d swaths, bInterleave=%d",
5630 : nSwathCols, nSwathLines, static_cast<int>(bInterleave));
5631 :
5632 : // Advise the source raster that we are going to read it completely
5633 : // Note: this might already have been done by GDALCreateCopy() in the
5634 : // likely case this function is indirectly called by it
5635 3330 : poSrcDS->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nBandCount,
5636 3330 : nullptr, nullptr);
5637 :
5638 : /* ==================================================================== */
5639 : /* Band oriented (uninterleaved) case. */
5640 : /* ==================================================================== */
5641 3330 : CPLErr eErr = CE_None;
5642 : const bool bCheckHoles =
5643 3330 : CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
5644 :
5645 3330 : if (!bInterleave)
5646 : {
5647 : GDALRasterIOExtraArg sExtraArg;
5648 2747 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5649 2747 : CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress); // to make cppcheck happy
5650 :
5651 8241 : const GIntBig nTotalBlocks = static_cast<GIntBig>(nBandCount) *
5652 2747 : DIV_ROUND_UP(nYSize, nSwathLines) *
5653 2747 : DIV_ROUND_UP(nXSize, nSwathCols);
5654 2747 : GIntBig nBlocksDone = 0;
5655 :
5656 7934 : for (int iBand = 0; iBand < nBandCount && eErr == CE_None; iBand++)
5657 : {
5658 5187 : int nBand = iBand + 1;
5659 :
5660 10637 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5661 : {
5662 5450 : int nThisLines = nSwathLines;
5663 :
5664 5450 : if (iY + nThisLines > nYSize)
5665 368 : nThisLines = nYSize - iY;
5666 :
5667 10900 : for (int iX = 0; iX < nXSize && eErr == CE_None;
5668 5450 : iX += nSwathCols)
5669 : {
5670 5450 : int nThisCols = nSwathCols;
5671 :
5672 5450 : if (iX + nThisCols > nXSize)
5673 0 : nThisCols = nXSize - iX;
5674 :
5675 5450 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5676 5450 : if (bCheckHoles)
5677 : {
5678 : nStatus = poSrcDS->GetRasterBand(nBand)
5679 3744 : ->GetDataCoverageStatus(
5680 : iX, iY, nThisCols, nThisLines,
5681 : GDAL_DATA_COVERAGE_STATUS_DATA);
5682 : }
5683 5450 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5684 : {
5685 5446 : sExtraArg.pfnProgress = GDALScaledProgress;
5686 10892 : sExtraArg.pProgressData = GDALCreateScaledProgress(
5687 5446 : nBlocksDone / static_cast<double>(nTotalBlocks),
5688 5446 : (nBlocksDone + 0.5) /
5689 5446 : static_cast<double>(nTotalBlocks),
5690 : pfnProgress, pProgressData);
5691 5446 : if (sExtraArg.pProgressData == nullptr)
5692 1676 : sExtraArg.pfnProgress = nullptr;
5693 :
5694 5446 : eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
5695 : nThisLines, pSwathBuf,
5696 : nThisCols, nThisLines, eDT, 1,
5697 : &nBand, 0, 0, 0, &sExtraArg);
5698 :
5699 5446 : GDALDestroyScaledProgress(sExtraArg.pProgressData);
5700 :
5701 5446 : if (eErr == CE_None)
5702 5439 : eErr = poDstDS->RasterIO(
5703 : GF_Write, iX, iY, nThisCols, nThisLines,
5704 : pSwathBuf, nThisCols, nThisLines, eDT, 1,
5705 : &nBand, 0, 0, 0, nullptr);
5706 : }
5707 :
5708 5450 : nBlocksDone++;
5709 10858 : if (eErr == CE_None &&
5710 5408 : !pfnProgress(nBlocksDone /
5711 5408 : static_cast<double>(nTotalBlocks),
5712 : nullptr, pProgressData))
5713 : {
5714 2 : eErr = CE_Failure;
5715 2 : CPLError(CE_Failure, CPLE_UserInterrupt,
5716 : "User terminated CreateCopy()");
5717 : }
5718 : }
5719 : }
5720 : }
5721 : }
5722 :
5723 : /* ==================================================================== */
5724 : /* Pixel interleaved case. */
5725 : /* ==================================================================== */
5726 : else /* if( bInterleave ) */
5727 : {
5728 : GDALRasterIOExtraArg sExtraArg;
5729 583 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5730 583 : CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress); // to make cppcheck happy
5731 :
5732 583 : const GIntBig nTotalBlocks =
5733 583 : static_cast<GIntBig>(DIV_ROUND_UP(nYSize, nSwathLines)) *
5734 583 : DIV_ROUND_UP(nXSize, nSwathCols);
5735 583 : GIntBig nBlocksDone = 0;
5736 :
5737 1388 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5738 : {
5739 805 : int nThisLines = nSwathLines;
5740 :
5741 805 : if (iY + nThisLines > nYSize)
5742 198 : nThisLines = nYSize - iY;
5743 :
5744 1615 : for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
5745 : {
5746 810 : int nThisCols = nSwathCols;
5747 :
5748 810 : if (iX + nThisCols > nXSize)
5749 3 : nThisCols = nXSize - iX;
5750 :
5751 810 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5752 810 : if (bCheckHoles)
5753 : {
5754 551 : nStatus = 0;
5755 604 : for (int iBand = 0; iBand < nBandCount; iBand++)
5756 : {
5757 585 : nStatus |= poSrcDS->GetRasterBand(iBand + 1)
5758 585 : ->GetDataCoverageStatus(
5759 : iX, iY, nThisCols, nThisLines,
5760 : GDAL_DATA_COVERAGE_STATUS_DATA);
5761 585 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5762 532 : break;
5763 : }
5764 : }
5765 810 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5766 : {
5767 791 : sExtraArg.pfnProgress = GDALScaledProgress;
5768 1582 : sExtraArg.pProgressData = GDALCreateScaledProgress(
5769 791 : nBlocksDone / static_cast<double>(nTotalBlocks),
5770 791 : (nBlocksDone + 0.5) / static_cast<double>(nTotalBlocks),
5771 : pfnProgress, pProgressData);
5772 791 : if (sExtraArg.pProgressData == nullptr)
5773 375 : sExtraArg.pfnProgress = nullptr;
5774 :
5775 791 : eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
5776 : nThisLines, pSwathBuf, nThisCols,
5777 : nThisLines, eDT, nBandCount,
5778 : nullptr, 0, 0, 0, &sExtraArg);
5779 :
5780 791 : GDALDestroyScaledProgress(sExtraArg.pProgressData);
5781 :
5782 791 : if (eErr == CE_None)
5783 790 : eErr = poDstDS->RasterIO(
5784 : GF_Write, iX, iY, nThisCols, nThisLines, pSwathBuf,
5785 : nThisCols, nThisLines, eDT, nBandCount, nullptr, 0,
5786 : 0, 0, nullptr);
5787 : }
5788 :
5789 810 : nBlocksDone++;
5790 1615 : if (eErr == CE_None &&
5791 805 : !pfnProgress(nBlocksDone /
5792 805 : static_cast<double>(nTotalBlocks),
5793 : nullptr, pProgressData))
5794 : {
5795 1 : eErr = CE_Failure;
5796 1 : CPLError(CE_Failure, CPLE_UserInterrupt,
5797 : "User terminated CreateCopy()");
5798 : }
5799 : }
5800 : }
5801 : }
5802 :
5803 : /* -------------------------------------------------------------------- */
5804 : /* Cleanup */
5805 : /* -------------------------------------------------------------------- */
5806 3330 : CPLFree(pSwathBuf);
5807 :
5808 3330 : return eErr;
5809 : }
5810 :
5811 : /************************************************************************/
5812 : /* GDALRasterBandCopyWholeRaster() */
5813 : /************************************************************************/
5814 :
5815 : /**
5816 : * \brief Copy a whole raster band
5817 : *
5818 : * This function copies the complete raster contents of one band to
5819 : * another similarly configured band. The source and destination
5820 : * bands must have the same width and height. The bands do not have
5821 : * to have the same data type.
5822 : *
5823 : * It implements efficient copying, in particular "chunking" the copy in
5824 : * substantial blocks.
5825 : *
5826 : * Currently the only papszOptions value supported are :
5827 : * <ul>
5828 : * <li>"COMPRESSED=YES" to force alignment on target dataset block sizes to
5829 : * achieve best compression.</li>
5830 : * <li>"SKIP_HOLES=YES" to skip chunks for which GDALGetDataCoverageStatus()
5831 : * returns GDAL_DATA_COVERAGE_STATUS_EMPTY (GDAL >= 2.2)</li>
5832 : * </ul>
5833 : *
5834 : * @param hSrcBand the source band
5835 : * @param hDstBand the destination band
5836 : * @param papszOptions transfer hints in "StringList" Name=Value format.
5837 : * @param pfnProgress progress reporting function.
5838 : * @param pProgressData callback data for progress function.
5839 : *
5840 : * @return CE_None on success, or CE_Failure on failure.
5841 : */
5842 :
5843 29 : CPLErr CPL_STDCALL GDALRasterBandCopyWholeRaster(
5844 : GDALRasterBandH hSrcBand, GDALRasterBandH hDstBand,
5845 : const char *const *const papszOptions, GDALProgressFunc pfnProgress,
5846 : void *pProgressData)
5847 :
5848 : {
5849 29 : VALIDATE_POINTER1(hSrcBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
5850 29 : VALIDATE_POINTER1(hDstBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
5851 :
5852 29 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
5853 29 : GDALRasterBand *poDstBand = GDALRasterBand::FromHandle(hDstBand);
5854 29 : CPLErr eErr = CE_None;
5855 :
5856 29 : if (pfnProgress == nullptr)
5857 2 : pfnProgress = GDALDummyProgress;
5858 :
5859 : /* -------------------------------------------------------------------- */
5860 : /* Confirm the datasets match in size and band counts. */
5861 : /* -------------------------------------------------------------------- */
5862 29 : int nXSize = poSrcBand->GetXSize();
5863 29 : int nYSize = poSrcBand->GetYSize();
5864 :
5865 29 : if (poDstBand->GetXSize() != nXSize || poDstBand->GetYSize() != nYSize)
5866 : {
5867 0 : CPLError(CE_Failure, CPLE_AppDefined,
5868 : "Input and output band sizes do not\n"
5869 : "match in GDALRasterBandCopyWholeRaster()");
5870 0 : return CE_Failure;
5871 : }
5872 :
5873 : /* -------------------------------------------------------------------- */
5874 : /* Report preliminary (0) progress. */
5875 : /* -------------------------------------------------------------------- */
5876 29 : if (!pfnProgress(0.0, nullptr, pProgressData))
5877 : {
5878 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
5879 : "User terminated CreateCopy()");
5880 0 : return CE_Failure;
5881 : }
5882 :
5883 29 : GDALDataType eDT = poDstBand->GetRasterDataType();
5884 :
5885 : // If the destination is compressed, we must try to write blocks just once,
5886 : // to save disk space (GTiff case for example), and to avoid data loss
5887 : // (JPEG compression for example).
5888 29 : bool bDstIsCompressed = false;
5889 : const char *pszDstCompressed =
5890 29 : CSLFetchNameValue(const_cast<char **>(papszOptions), "COMPRESSED");
5891 29 : if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
5892 26 : bDstIsCompressed = true;
5893 :
5894 : /* -------------------------------------------------------------------- */
5895 : /* What will our swath size be? */
5896 : /* -------------------------------------------------------------------- */
5897 :
5898 29 : int nSwathCols = 0;
5899 29 : int nSwathLines = 0;
5900 29 : GDALCopyWholeRasterGetSwathSize(poSrcBand, poDstBand, 1, bDstIsCompressed,
5901 : FALSE, &nSwathCols, &nSwathLines);
5902 :
5903 29 : const int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5904 :
5905 29 : void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
5906 29 : if (pSwathBuf == nullptr)
5907 : {
5908 0 : return CE_Failure;
5909 : }
5910 :
5911 29 : CPLDebug("GDAL", "GDALRasterBandCopyWholeRaster(): %d*%d swaths",
5912 : nSwathCols, nSwathLines);
5913 :
5914 : const bool bCheckHoles =
5915 29 : CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
5916 :
5917 : // Advise the source raster that we are going to read it completely
5918 29 : poSrcBand->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nullptr);
5919 :
5920 : /* ==================================================================== */
5921 : /* Band oriented (uninterleaved) case. */
5922 : /* ==================================================================== */
5923 :
5924 72 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5925 : {
5926 43 : int nThisLines = nSwathLines;
5927 :
5928 43 : if (iY + nThisLines > nYSize)
5929 8 : nThisLines = nYSize - iY;
5930 :
5931 86 : for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
5932 : {
5933 43 : int nThisCols = nSwathCols;
5934 :
5935 43 : if (iX + nThisCols > nXSize)
5936 0 : nThisCols = nXSize - iX;
5937 :
5938 43 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5939 43 : if (bCheckHoles)
5940 : {
5941 0 : nStatus = poSrcBand->GetDataCoverageStatus(
5942 : iX, iY, nThisCols, nThisLines,
5943 : GDAL_DATA_COVERAGE_STATUS_DATA);
5944 : }
5945 43 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5946 : {
5947 43 : eErr = poSrcBand->RasterIO(GF_Read, iX, iY, nThisCols,
5948 : nThisLines, pSwathBuf, nThisCols,
5949 : nThisLines, eDT, 0, 0, nullptr);
5950 :
5951 43 : if (eErr == CE_None)
5952 43 : eErr = poDstBand->RasterIO(GF_Write, iX, iY, nThisCols,
5953 : nThisLines, pSwathBuf, nThisCols,
5954 : nThisLines, eDT, 0, 0, nullptr);
5955 : }
5956 :
5957 86 : if (eErr == CE_None && !pfnProgress(double(iY + nThisLines) /
5958 43 : static_cast<double>(nYSize),
5959 : nullptr, pProgressData))
5960 : {
5961 0 : eErr = CE_Failure;
5962 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
5963 : "User terminated CreateCopy()");
5964 : }
5965 : }
5966 : }
5967 :
5968 : /* -------------------------------------------------------------------- */
5969 : /* Cleanup */
5970 : /* -------------------------------------------------------------------- */
5971 29 : CPLFree(pSwathBuf);
5972 :
5973 29 : return eErr;
5974 : }
5975 :
5976 : /************************************************************************/
5977 : /* GDALCopyRasterIOExtraArg () */
5978 : /************************************************************************/
5979 :
5980 527312 : void GDALCopyRasterIOExtraArg(GDALRasterIOExtraArg *psDestArg,
5981 : GDALRasterIOExtraArg *psSrcArg)
5982 : {
5983 527312 : INIT_RASTERIO_EXTRA_ARG(*psDestArg);
5984 527312 : if (psSrcArg)
5985 : {
5986 527312 : psDestArg->eResampleAlg = psSrcArg->eResampleAlg;
5987 527312 : psDestArg->pfnProgress = psSrcArg->pfnProgress;
5988 527312 : psDestArg->pProgressData = psSrcArg->pProgressData;
5989 527312 : psDestArg->bFloatingPointWindowValidity =
5990 527312 : psSrcArg->bFloatingPointWindowValidity;
5991 527312 : if (psSrcArg->bFloatingPointWindowValidity)
5992 : {
5993 204393 : psDestArg->dfXOff = psSrcArg->dfXOff;
5994 204393 : psDestArg->dfYOff = psSrcArg->dfYOff;
5995 204393 : psDestArg->dfXSize = psSrcArg->dfXSize;
5996 204393 : psDestArg->dfYSize = psSrcArg->dfYSize;
5997 : }
5998 527312 : if (psSrcArg->nVersion >= 2)
5999 : {
6000 527312 : psDestArg->bUseOnlyThisScale = psSrcArg->bUseOnlyThisScale;
6001 : }
6002 : }
6003 527312 : }
6004 :
6005 : /************************************************************************/
6006 : /* HasOnlyNoData() */
6007 : /************************************************************************/
6008 :
// Generic nodata test: a value matches when it compares equal to the nodata
// value with operator==.
template <class T> static inline bool IsEqualToNoData(T value, T noDataValue)
{
    const bool bMatches = (value == noDataValue);
    return bMatches;
}
6013 :
6014 5509 : template <> bool IsEqualToNoData<GFloat16>(GFloat16 value, GFloat16 noDataValue)
6015 : {
6016 : using std::isnan;
6017 5509 : return isnan(noDataValue) ? isnan(value) : value == noDataValue;
6018 : }
6019 :
6020 251221 : template <> bool IsEqualToNoData<float>(float value, float noDataValue)
6021 : {
6022 251221 : return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
6023 : }
6024 :
6025 264257 : template <> bool IsEqualToNoData<double>(double value, double noDataValue)
6026 : {
6027 264257 : return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
6028 : }
6029 :
// Return true when every value of a pixel-interleaved window equals the
// nodata value, as decided by IsEqualToNoData().
//
// pBuffer      first value of the window
// noDataValue  nodata value, already converted to the buffer type
// nWidth       window width in pixels
// nHeight      window height in lines
// nLineStride  distance between two consecutive lines, in pixels
// nComponents  number of interleaved values per pixel
//
// An empty window (zero width, height or component count) is reported as
// containing only nodata.
template <class T>
static bool HasOnlyNoDataT(const T *pBuffer, T noDataValue, size_t nWidth,
                           size_t nHeight, size_t nLineStride,
                           size_t nComponents)
{
    // Nothing to inspect. Returning early also prevents "nWidth - 1" and
    // "nHeight - 1" below from wrapping around and indexing out of bounds.
    if (nWidth == 0 || nHeight == 0 || nComponents == 0)
        return true;

    // Fast test: check the 4 corners and the middle pixel.
    const size_t nLastCol = nWidth - 1;
    const size_t nLastRow = nHeight - 1;
    const size_t anProbePixels[] = {
        0,                                             // top-left
        nLastCol,                                      // top-right
        nLastRow / 2 * nLineStride + nLastCol / 2,     // middle
        nLastRow * nLineStride,                        // bottom-left
        nLastRow * nLineStride + nLastCol,             // bottom-right
    };
    for (size_t iBand = 0; iBand < nComponents; iBand++)
    {
        for (const size_t nPixel : anProbePixels)
        {
            if (!IsEqualToNoData(pBuffer[nPixel * nComponents + iBand],
                                 noDataValue))
            {
                return false;
            }
        }
    }

    // Test all pixels.
    const size_t nValuesPerLine = nWidth * nComponents;
    for (size_t iY = 0; iY < nHeight; iY++)
    {
        const T *pBufferLine = pBuffer + iY * nLineStride * nComponents;
        for (size_t iX = 0; iX < nValuesPerLine; iX++)
        {
            if (!IsEqualToNoData(pBufferLine[iX], noDataValue))
            {
                return false;
            }
        }
    }
    return true;
}
6073 :
6074 : /************************************************************************/
6075 : /* GDALBufferHasOnlyNoData() */
6076 : /************************************************************************/
6077 :
6078 43882 : bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
6079 : size_t nWidth, size_t nHeight, size_t nLineStride,
6080 : size_t nComponents, int nBitsPerSample,
6081 : GDALBufferSampleFormat nSampleFormat)
6082 : {
6083 : // In the case where the nodata is 0, we can compare several bytes at
6084 : // once. Select the largest natural integer type for the architecture.
6085 43882 : if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
6086 : // Do not use this optimized code path for floating point numbers,
6087 : // as it can't detect negative zero.
6088 : nSampleFormat != GSF_FLOATING_POINT)
6089 : {
6090 27247 : const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
6091 27247 : const size_t nSize =
6092 27247 : static_cast<size_t>((static_cast<uint64_t>(nWidth) * nHeight *
6093 27247 : nComponents * nBitsPerSample +
6094 : 7) /
6095 : 8);
6096 : #ifdef HAVE_SSE2
6097 27247 : size_t n = nSize;
6098 : // Align to 16 bytes
6099 27310 : while ((reinterpret_cast<uintptr_t>(pabyBuffer) & 15) != 0 && n > 0)
6100 : {
6101 73 : --n;
6102 73 : if (*pabyBuffer)
6103 10 : return false;
6104 63 : pabyBuffer++;
6105 : }
6106 :
6107 27237 : const auto zero = _mm_setzero_si128();
6108 27237 : constexpr int UNROLLING = 4;
6109 2217660 : while (n >= UNROLLING * sizeof(zero))
6110 : {
6111 2202420 : const auto v0 = _mm_load_si128(reinterpret_cast<const __m128i *>(
6112 : pabyBuffer + 0 * sizeof(zero)));
6113 2202420 : const auto v1 = _mm_load_si128(reinterpret_cast<const __m128i *>(
6114 2202420 : pabyBuffer + 1 * sizeof(zero)));
6115 2202420 : const auto v2 = _mm_load_si128(reinterpret_cast<const __m128i *>(
6116 2202420 : pabyBuffer + 2 * sizeof(zero)));
6117 2202420 : const auto v3 = _mm_load_si128(reinterpret_cast<const __m128i *>(
6118 2202420 : pabyBuffer + 3 * sizeof(zero)));
6119 : const auto v =
6120 6607260 : _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
6121 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
6122 : if (!_mm_test_all_zeros(v, v))
6123 : #else
6124 4404840 : if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
6125 : #endif
6126 : {
6127 12001 : return false;
6128 : }
6129 2190420 : pabyBuffer += UNROLLING * sizeof(zero);
6130 2190420 : n -= UNROLLING * sizeof(zero);
6131 : }
6132 :
6133 233657 : while (n > 0)
6134 : {
6135 218525 : --n;
6136 218525 : if (*pabyBuffer)
6137 104 : return false;
6138 218421 : pabyBuffer++;
6139 : }
6140 : #else
6141 : #if SIZEOF_VOIDP >= 8 || defined(__x86_64__)
6142 : // We test __x86_64__ for x32 arch where SIZEOF_VOIDP == 4
6143 : typedef std::uint64_t WordType;
6144 : #else
6145 : typedef std::uint32_t WordType;
6146 : #endif
6147 :
6148 : const size_t nInitialIters =
6149 : std::min(sizeof(WordType) -
6150 : static_cast<size_t>(
6151 : reinterpret_cast<std::uintptr_t>(pabyBuffer) %
6152 : sizeof(WordType)),
6153 : nSize);
6154 : size_t i = 0;
6155 : for (; i < nInitialIters; i++)
6156 : {
6157 : if (pabyBuffer[i])
6158 : return false;
6159 : }
6160 : for (; i + sizeof(WordType) - 1 < nSize; i += sizeof(WordType))
6161 : {
6162 : if (*(reinterpret_cast<const WordType *>(pabyBuffer + i)))
6163 : return false;
6164 : }
6165 : for (; i < nSize; i++)
6166 : {
6167 : if (pabyBuffer[i])
6168 : return false;
6169 : }
6170 : #endif
6171 15132 : return true;
6172 : }
6173 :
6174 : #ifdef HAVE_SSE2
6175 16635 : else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
6176 708 : nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
6177 : {
6178 708 : const auto signMask = _mm_set1_epi32(0x7FFFFFFF);
6179 708 : const auto zero = _mm_setzero_si128();
6180 708 : const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
6181 708 : const size_t n = nWidth * nHeight * nComponents;
6182 :
6183 708 : size_t i = 0;
6184 708 : constexpr int UNROLLING = 4;
6185 708 : constexpr size_t VALUES_PER_ITER =
6186 : UNROLLING * sizeof(zero) / sizeof(float);
6187 24983 : for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
6188 : {
6189 24934 : const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6190 : pabyBuffer + 0 * sizeof(zero)));
6191 24934 : const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6192 24934 : pabyBuffer + 1 * sizeof(zero)));
6193 24934 : const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6194 24934 : pabyBuffer + 2 * sizeof(zero)));
6195 24934 : const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6196 24934 : pabyBuffer + 3 * sizeof(zero)));
6197 74802 : auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
6198 : // Clear the sign bit (makes -0.0 become +0.0)
6199 24934 : v = _mm_and_si128(v, signMask);
6200 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
6201 : if (!_mm_test_all_zeros(v, v))
6202 : #else
6203 49868 : if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
6204 : #endif
6205 : {
6206 659 : return false;
6207 : }
6208 24275 : pabyBuffer += UNROLLING * sizeof(zero);
6209 : }
6210 :
6211 304 : for (; i < n; i++)
6212 : {
6213 : uint32_t bits;
6214 272 : memcpy(&bits, pabyBuffer, sizeof(bits));
6215 272 : pabyBuffer += sizeof(bits);
6216 272 : if ((bits & 0x7FFFFFFF) != 0)
6217 17 : return false;
6218 : }
6219 :
6220 32 : return true;
6221 : }
6222 :
6223 15927 : else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
6224 3905 : nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
6225 : {
6226 3905 : const auto signMask = _mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL);
6227 3905 : const auto zero = _mm_setzero_si128();
6228 3905 : const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
6229 3905 : const size_t n = nWidth * nHeight * nComponents;
6230 :
6231 3905 : size_t i = 0;
6232 3905 : constexpr int UNROLLING = 4;
6233 3905 : constexpr size_t VALUES_PER_ITER =
6234 : UNROLLING * sizeof(zero) / sizeof(double);
6235 1664570 : for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
6236 : {
6237 1660950 : const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6238 : pabyBuffer + 0 * sizeof(zero)));
6239 1660950 : const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6240 1660950 : pabyBuffer + 1 * sizeof(zero)));
6241 1660950 : const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6242 1660950 : pabyBuffer + 2 * sizeof(zero)));
6243 1660950 : const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
6244 1660950 : pabyBuffer + 3 * sizeof(zero)));
6245 4982850 : auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
6246 : // Clear the sign bit (makes -0.0 become +0.0)
6247 1660950 : v = _mm_and_si128(v, signMask);
6248 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
6249 : if (!_mm_test_all_zeros(v, v))
6250 : #else
6251 3321900 : if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
6252 : #endif
6253 : {
6254 289 : return false;
6255 : }
6256 1660660 : pabyBuffer += UNROLLING * sizeof(zero);
6257 : }
6258 :
6259 3643 : for (; i < n; i++)
6260 : {
6261 : uint64_t bits;
6262 34 : memcpy(&bits, pabyBuffer, sizeof(bits));
6263 34 : pabyBuffer += sizeof(bits);
6264 34 : if ((bits & 0x7FFFFFFFFFFFFFFFULL) != 0)
6265 7 : return false;
6266 : }
6267 :
6268 3609 : return true;
6269 : }
6270 : #endif
6271 :
6272 12022 : if (nBitsPerSample == 8 && nSampleFormat == GSF_UNSIGNED_INT)
6273 : {
6274 22406 : return GDALIsValueInRange<uint8_t>(dfNoDataValue) &&
6275 11203 : HasOnlyNoDataT(static_cast<const uint8_t *>(pBuffer),
6276 11203 : static_cast<uint8_t>(dfNoDataValue), nWidth,
6277 11203 : nHeight, nLineStride, nComponents);
6278 : }
6279 819 : if (nBitsPerSample == 8 && nSampleFormat == GSF_SIGNED_INT)
6280 : {
6281 : // Use unsigned implementation by converting the nodatavalue to
6282 : // unsigned
6283 119 : return GDALIsValueInRange<int8_t>(dfNoDataValue) &&
6284 59 : HasOnlyNoDataT(
6285 : static_cast<const uint8_t *>(pBuffer),
6286 59 : static_cast<uint8_t>(static_cast<int8_t>(dfNoDataValue)),
6287 60 : nWidth, nHeight, nLineStride, nComponents);
6288 : }
6289 759 : if (nBitsPerSample == 16 && nSampleFormat == GSF_UNSIGNED_INT)
6290 : {
6291 23 : return GDALIsValueInRange<uint16_t>(dfNoDataValue) &&
6292 11 : HasOnlyNoDataT(static_cast<const uint16_t *>(pBuffer),
6293 11 : static_cast<uint16_t>(dfNoDataValue), nWidth,
6294 12 : nHeight, nLineStride, nComponents);
6295 : }
6296 747 : if (nBitsPerSample == 16 && nSampleFormat == GSF_SIGNED_INT)
6297 : {
6298 : // Use unsigned implementation by converting the nodatavalue to
6299 : // unsigned
6300 111 : return GDALIsValueInRange<int16_t>(dfNoDataValue) &&
6301 55 : HasOnlyNoDataT(
6302 : static_cast<const uint16_t *>(pBuffer),
6303 55 : static_cast<uint16_t>(static_cast<int16_t>(dfNoDataValue)),
6304 56 : nWidth, nHeight, nLineStride, nComponents);
6305 : }
6306 691 : if (nBitsPerSample == 32 && nSampleFormat == GSF_UNSIGNED_INT)
6307 : {
6308 129 : return GDALIsValueInRange<uint32_t>(dfNoDataValue) &&
6309 64 : HasOnlyNoDataT(static_cast<const uint32_t *>(pBuffer),
6310 : static_cast<uint32_t>(dfNoDataValue), nWidth,
6311 65 : nHeight, nLineStride, nComponents);
6312 : }
6313 626 : if (nBitsPerSample == 32 && nSampleFormat == GSF_SIGNED_INT)
6314 : {
6315 : // Use unsigned implementation by converting the nodatavalue to
6316 : // unsigned
6317 23 : return GDALIsValueInRange<int32_t>(dfNoDataValue) &&
6318 11 : HasOnlyNoDataT(
6319 : static_cast<const uint32_t *>(pBuffer),
6320 11 : static_cast<uint32_t>(static_cast<int32_t>(dfNoDataValue)),
6321 12 : nWidth, nHeight, nLineStride, nComponents);
6322 : }
6323 614 : if (nBitsPerSample == 64 && nSampleFormat == GSF_UNSIGNED_INT)
6324 : {
6325 112 : return GDALIsValueInRange<uint64_t>(dfNoDataValue) &&
6326 56 : HasOnlyNoDataT(static_cast<const uint64_t *>(pBuffer),
6327 : static_cast<uint64_t>(dfNoDataValue), nWidth,
6328 56 : nHeight, nLineStride, nComponents);
6329 : }
6330 558 : if (nBitsPerSample == 64 && nSampleFormat == GSF_SIGNED_INT)
6331 : {
6332 : // Use unsigned implementation by converting the nodatavalue to
6333 : // unsigned
6334 0 : return GDALIsValueInRange<int64_t>(dfNoDataValue) &&
6335 0 : HasOnlyNoDataT(
6336 : static_cast<const uint64_t *>(pBuffer),
6337 0 : static_cast<uint64_t>(static_cast<int64_t>(dfNoDataValue)),
6338 0 : nWidth, nHeight, nLineStride, nComponents);
6339 : }
6340 558 : if (nBitsPerSample == 16 && nSampleFormat == GSF_FLOATING_POINT)
6341 : {
6342 106 : return (std::isnan(dfNoDataValue) ||
6343 211 : GDALIsValueInRange<GFloat16>(dfNoDataValue)) &&
6344 105 : HasOnlyNoDataT(static_cast<const GFloat16 *>(pBuffer),
6345 : static_cast<GFloat16>(dfNoDataValue), nWidth,
6346 106 : nHeight, nLineStride, nComponents);
6347 : }
6348 452 : if (nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
6349 : {
6350 268 : return (std::isnan(dfNoDataValue) ||
6351 535 : GDALIsValueInRange<float>(dfNoDataValue)) &&
6352 267 : HasOnlyNoDataT(static_cast<const float *>(pBuffer),
6353 : static_cast<float>(dfNoDataValue), nWidth,
6354 268 : nHeight, nLineStride, nComponents);
6355 : }
6356 184 : if (nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
6357 : {
6358 184 : return HasOnlyNoDataT(static_cast<const double *>(pBuffer),
6359 : dfNoDataValue, nWidth, nHeight, nLineStride,
6360 184 : nComponents);
6361 : }
6362 0 : return false;
6363 : }
6364 :
6365 : #ifdef HAVE_SSE2
6366 :
6367 : /************************************************************************/
6368 : /* GDALDeinterleave3Byte() */
6369 : /************************************************************************/
6370 :
6371 : #if defined(__GNUC__) && !defined(__clang__)
6372 : __attribute__((optimize("no-tree-vectorize")))
6373 : #endif
6374 380714 : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
6375 : GByte *CPL_RESTRICT pabyDest0,
6376 : GByte *CPL_RESTRICT pabyDest1,
6377 : GByte *CPL_RESTRICT pabyDest2, size_t nIters)
6378 : #ifdef USE_NEON_OPTIMIZATIONS
6379 : {
6380 : return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6381 : nIters);
6382 : }
6383 : #else
6384 : {
6385 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
6386 380714 : if (CPLHaveRuntimeSSSE3())
6387 : {
6388 380712 : return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
6389 380712 : pabyDest2, nIters);
6390 : }
6391 : #endif
6392 :
6393 2 : size_t i = 0;
6394 2 : if (((reinterpret_cast<uintptr_t>(pabySrc) |
6395 2 : reinterpret_cast<uintptr_t>(pabyDest0) |
6396 2 : reinterpret_cast<uintptr_t>(pabyDest1) |
6397 2 : reinterpret_cast<uintptr_t>(pabyDest2)) %
6398 : sizeof(unsigned int)) == 0)
6399 : {
6400 : // Slightly better than GCC autovectorizer
6401 17 : for (size_t j = 0; i + 3 < nIters; i += 4, ++j)
6402 : {
6403 15 : unsigned int word0 =
6404 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i);
6405 15 : unsigned int word1 =
6406 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 4);
6407 15 : unsigned int word2 =
6408 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 8);
6409 15 : reinterpret_cast<unsigned int *>(pabyDest0)[j] =
6410 15 : (word0 & 0xff) | ((word0 >> 24) << 8) | (word1 & 0x00ff0000) |
6411 15 : ((word2 >> 8) << 24);
6412 15 : reinterpret_cast<unsigned int *>(pabyDest1)[j] =
6413 15 : ((word0 >> 8) & 0xff) | ((word1 & 0xff) << 8) |
6414 15 : (((word1 >> 24)) << 16) | ((word2 >> 16) << 24);
6415 15 : pabyDest2[j * 4] = static_cast<GByte>(word0 >> 16);
6416 15 : pabyDest2[j * 4 + 1] = static_cast<GByte>(word1 >> 8);
6417 15 : pabyDest2[j * 4 + 2] = static_cast<GByte>(word2);
6418 15 : pabyDest2[j * 4 + 3] = static_cast<GByte>(word2 >> 24);
6419 : }
6420 : }
6421 : #if defined(__clang__)
6422 : #pragma clang loop vectorize(disable)
6423 : #endif
6424 3 : for (; i < nIters; ++i)
6425 : {
6426 1 : pabyDest0[i] = pabySrc[3 * i + 0];
6427 1 : pabyDest1[i] = pabySrc[3 * i + 1];
6428 1 : pabyDest2[i] = pabySrc[3 * i + 2];
6429 : }
6430 : }
6431 : #endif
6432 :
6433 : /************************************************************************/
6434 : /* GDALDeinterleave4Byte() */
6435 : /************************************************************************/
6436 :
6437 : #if !defined(__GNUC__) || defined(__clang__)
6438 :
6439 : /************************************************************************/
6440 : /* deinterleave() */
6441 : /************************************************************************/
6442 :
6443 : template <bool SHIFT, bool MASK>
6444 : inline __m128i deinterleave(__m128i &xmm0_ori, __m128i &xmm1_ori,
6445 : __m128i &xmm2_ori, __m128i &xmm3_ori)
6446 : {
6447 : // Set higher 24bit of each int32 packed word to 0
6448 : if (SHIFT)
6449 : {
6450 : xmm0_ori = _mm_srli_epi32(xmm0_ori, 8);
6451 : xmm1_ori = _mm_srli_epi32(xmm1_ori, 8);
6452 : xmm2_ori = _mm_srli_epi32(xmm2_ori, 8);
6453 : xmm3_ori = _mm_srli_epi32(xmm3_ori, 8);
6454 : }
6455 : __m128i xmm0;
6456 : __m128i xmm1;
6457 : __m128i xmm2;
6458 : __m128i xmm3;
6459 : if (MASK)
6460 : {
6461 : const __m128i xmm_mask = _mm_set1_epi32(0xff);
6462 : xmm0 = _mm_and_si128(xmm0_ori, xmm_mask);
6463 : xmm1 = _mm_and_si128(xmm1_ori, xmm_mask);
6464 : xmm2 = _mm_and_si128(xmm2_ori, xmm_mask);
6465 : xmm3 = _mm_and_si128(xmm3_ori, xmm_mask);
6466 : }
6467 : else
6468 : {
6469 : xmm0 = xmm0_ori;
6470 : xmm1 = xmm1_ori;
6471 : xmm2 = xmm2_ori;
6472 : xmm3 = xmm3_ori;
6473 : }
6474 : // Pack int32 to int16
6475 : xmm0 = _mm_packs_epi32(xmm0, xmm1);
6476 : xmm2 = _mm_packs_epi32(xmm2, xmm3);
6477 : // Pack int16 to uint8
6478 : xmm0 = _mm_packus_epi16(xmm0, xmm2);
6479 : return xmm0;
6480 : }
6481 :
6482 : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
6483 : GByte *CPL_RESTRICT pabyDest0,
6484 : GByte *CPL_RESTRICT pabyDest1,
6485 : GByte *CPL_RESTRICT pabyDest2,
6486 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
6487 : #ifdef USE_NEON_OPTIMIZATIONS
6488 : {
6489 : return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6490 : pabyDest3, nIters);
6491 : }
6492 : #else
6493 : {
6494 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
6495 : if (CPLHaveRuntimeSSSE3())
6496 : {
6497 : return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
6498 : pabyDest2, pabyDest3, nIters);
6499 : }
6500 : #endif
6501 :
6502 : // Not the optimal SSE2-only code, as gcc auto-vectorizer manages to
6503 : // do something slightly better.
6504 : size_t i = 0;
6505 : for (; i + 15 < nIters; i += 16)
6506 : {
6507 : __m128i xmm0_ori = _mm_loadu_si128(
6508 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
6509 : __m128i xmm1_ori = _mm_loadu_si128(
6510 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
6511 : __m128i xmm2_ori = _mm_loadu_si128(
6512 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
6513 : __m128i xmm3_ori = _mm_loadu_si128(
6514 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
6515 :
6516 : _mm_storeu_si128(
6517 : reinterpret_cast<__m128i *>(pabyDest0 + i),
6518 : deinterleave<false, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
6519 : _mm_storeu_si128(
6520 : reinterpret_cast<__m128i *>(pabyDest1 + i),
6521 : deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
6522 : _mm_storeu_si128(
6523 : reinterpret_cast<__m128i *>(pabyDest2 + i),
6524 : deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
6525 : _mm_storeu_si128(
6526 : reinterpret_cast<__m128i *>(pabyDest3 + i),
6527 : deinterleave<true, false>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
6528 : }
6529 :
6530 : #if defined(__clang__)
6531 : #pragma clang loop vectorize(disable)
6532 : #endif
6533 : for (; i < nIters; ++i)
6534 : {
6535 : pabyDest0[i] = pabySrc[4 * i + 0];
6536 : pabyDest1[i] = pabySrc[4 * i + 1];
6537 : pabyDest2[i] = pabySrc[4 * i + 2];
6538 : pabyDest3[i] = pabySrc[4 * i + 3];
6539 : }
6540 : }
6541 : #endif
6542 : #else
6543 : // GCC autovectorizer does an excellent job
6544 73229 : __attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
6545 : const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
6546 : GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2,
6547 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
6548 : {
6549 540369000 : for (size_t i = 0; i < nIters; ++i)
6550 : {
6551 540295000 : pabyDest0[i] = pabySrc[4 * i + 0];
6552 540295000 : pabyDest1[i] = pabySrc[4 * i + 1];
6553 540295000 : pabyDest2[i] = pabySrc[4 * i + 2];
6554 540295000 : pabyDest3[i] = pabySrc[4 * i + 3];
6555 : }
6556 73229 : }
6557 : #endif
6558 :
6559 : #else
6560 :
6561 : /************************************************************************/
6562 : /* GDALDeinterleave3Byte() */
6563 : /************************************************************************/
6564 :
6565 : // TODO: Enabling below could help on non-Intel architectures where GCC knows
6566 : // how to auto-vectorize
6567 : // #if defined(__GNUC__)
6568 : //__attribute__((optimize("tree-vectorize")))
6569 : // #endif
6570 : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
6571 : GByte *CPL_RESTRICT pabyDest0,
6572 : GByte *CPL_RESTRICT pabyDest1,
6573 : GByte *CPL_RESTRICT pabyDest2, size_t nIters)
6574 : {
6575 : for (size_t i = 0; i < nIters; ++i)
6576 : {
6577 : pabyDest0[i] = pabySrc[3 * i + 0];
6578 : pabyDest1[i] = pabySrc[3 * i + 1];
6579 : pabyDest2[i] = pabySrc[3 * i + 2];
6580 : }
6581 : }
6582 :
6583 : /************************************************************************/
6584 : /* GDALDeinterleave4Byte() */
6585 : /************************************************************************/
6586 :
6587 : // TODO: Enabling below could help on non-Intel architectures where gcc knows
6588 : // how to auto-vectorize
6589 : // #if defined(__GNUC__)
6590 : //__attribute__((optimize("tree-vectorize")))
6591 : // #endif
6592 : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
6593 : GByte *CPL_RESTRICT pabyDest0,
6594 : GByte *CPL_RESTRICT pabyDest1,
6595 : GByte *CPL_RESTRICT pabyDest2,
6596 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
6597 : {
6598 : for (size_t i = 0; i < nIters; ++i)
6599 : {
6600 : pabyDest0[i] = pabySrc[4 * i + 0];
6601 : pabyDest1[i] = pabySrc[4 * i + 1];
6602 : pabyDest2[i] = pabySrc[4 * i + 2];
6603 : pabyDest3[i] = pabySrc[4 * i + 3];
6604 : }
6605 : }
6606 :
6607 : #endif
6608 :
6609 : /************************************************************************/
6610 : /* GDALDeinterleave() */
6611 : /************************************************************************/
6612 :
6613 : /*! Copy values from a pixel-interleave buffer to multiple per-component
6614 : buffers.
6615 :
6616 : In pseudo-code
6617 : \verbatim
6618 : for(size_t i = 0; i < nIters; ++i)
6619 : for(int iComp = 0; iComp < nComponents; iComp++ )
6620 : ppDestBuffer[iComp][i] = pSourceBuffer[nComponents * i + iComp]
6621 : \endverbatim
6622 :
6623 : The implementation is optimized for a few cases, like de-interleaving
6624 : of 3 or 4-components Byte buffers.
6625 :
6626 : \since GDAL 3.6
6627 : */
6628 454293 : void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
6629 : int nComponents, void **ppDestBuffer,
6630 : GDALDataType eDestDT, size_t nIters)
6631 : {
6632 454293 : if (eSourceDT == eDestDT)
6633 : {
6634 454271 : if (eSourceDT == GDT_UInt8 || eSourceDT == GDT_Int8)
6635 : {
6636 453950 : if (nComponents == 3)
6637 : {
6638 380714 : const GByte *CPL_RESTRICT pabySrc =
6639 : static_cast<const GByte *>(pSourceBuffer);
6640 380714 : GByte *CPL_RESTRICT pabyDest0 =
6641 : static_cast<GByte *>(ppDestBuffer[0]);
6642 380714 : GByte *CPL_RESTRICT pabyDest1 =
6643 : static_cast<GByte *>(ppDestBuffer[1]);
6644 380714 : GByte *CPL_RESTRICT pabyDest2 =
6645 : static_cast<GByte *>(ppDestBuffer[2]);
6646 380714 : GDALDeinterleave3Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6647 : nIters);
6648 380714 : return;
6649 : }
6650 73236 : else if (nComponents == 4)
6651 : {
6652 73229 : const GByte *CPL_RESTRICT pabySrc =
6653 : static_cast<const GByte *>(pSourceBuffer);
6654 73229 : GByte *CPL_RESTRICT pabyDest0 =
6655 : static_cast<GByte *>(ppDestBuffer[0]);
6656 73229 : GByte *CPL_RESTRICT pabyDest1 =
6657 : static_cast<GByte *>(ppDestBuffer[1]);
6658 73229 : GByte *CPL_RESTRICT pabyDest2 =
6659 : static_cast<GByte *>(ppDestBuffer[2]);
6660 73229 : GByte *CPL_RESTRICT pabyDest3 =
6661 : static_cast<GByte *>(ppDestBuffer[3]);
6662 73229 : GDALDeinterleave4Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6663 : pabyDest3, nIters);
6664 73229 : return;
6665 7 : }
6666 : }
6667 : #if ((defined(__GNUC__) && !defined(__clang__)) || \
6668 : defined(__INTEL_CLANG_COMPILER)) && \
6669 : defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME)
6670 642 : else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&
6671 321 : CPLHaveRuntimeSSSE3())
6672 : {
6673 321 : if (nComponents == 3)
6674 : {
6675 126 : const GUInt16 *CPL_RESTRICT panSrc =
6676 : static_cast<const GUInt16 *>(pSourceBuffer);
6677 126 : GUInt16 *CPL_RESTRICT panDest0 =
6678 : static_cast<GUInt16 *>(ppDestBuffer[0]);
6679 126 : GUInt16 *CPL_RESTRICT panDest1 =
6680 : static_cast<GUInt16 *>(ppDestBuffer[1]);
6681 126 : GUInt16 *CPL_RESTRICT panDest2 =
6682 : static_cast<GUInt16 *>(ppDestBuffer[2]);
6683 126 : GDALDeinterleave3UInt16_SSSE3(panSrc, panDest0, panDest1,
6684 : panDest2, nIters);
6685 126 : return;
6686 : }
6687 : #if !defined(__INTEL_CLANG_COMPILER)
6688 : // ICC autovectorizer doesn't do a good job, at least with icx
6689 : // 2022.1.0.20220316
6690 195 : else if (nComponents == 4)
6691 : {
6692 195 : const GUInt16 *CPL_RESTRICT panSrc =
6693 : static_cast<const GUInt16 *>(pSourceBuffer);
6694 195 : GUInt16 *CPL_RESTRICT panDest0 =
6695 : static_cast<GUInt16 *>(ppDestBuffer[0]);
6696 195 : GUInt16 *CPL_RESTRICT panDest1 =
6697 : static_cast<GUInt16 *>(ppDestBuffer[1]);
6698 195 : GUInt16 *CPL_RESTRICT panDest2 =
6699 : static_cast<GUInt16 *>(ppDestBuffer[2]);
6700 195 : GUInt16 *CPL_RESTRICT panDest3 =
6701 : static_cast<GUInt16 *>(ppDestBuffer[3]);
6702 195 : GDALDeinterleave4UInt16_SSSE3(panSrc, panDest0, panDest1,
6703 : panDest2, panDest3, nIters);
6704 195 : return;
6705 : }
6706 : #endif
6707 : }
6708 : #endif
6709 : }
6710 :
6711 29 : const int nSourceDTSize = GDALGetDataTypeSizeBytes(eSourceDT);
6712 29 : const int nDestDTSize = GDALGetDataTypeSizeBytes(eDestDT);
6713 108 : for (int iComp = 0; iComp < nComponents; iComp++)
6714 : {
6715 79 : GDALCopyWords64(static_cast<const GByte *>(pSourceBuffer) +
6716 79 : iComp * nSourceDTSize,
6717 : eSourceDT, nComponents * nSourceDTSize,
6718 79 : ppDestBuffer[iComp], eDestDT, nDestDTSize, nIters);
6719 : }
6720 : }
6721 :
6722 : /************************************************************************/
6723 : /* GDALTranspose2DSingleToSingle() */
6724 : /************************************************************************/
6725 : /**
6726 : * Transpose a 2D array of non-complex values, in a efficient (cache-oblivious) way.
6727 : *
6728 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6729 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6730 : * @param nSrcWidth Width of pSrc array.
6731 : * @param nSrcHeight Height of pSrc array.
6732 : */
6733 :
6734 : template <class DST, class SRC>
6735 160 : void GDALTranspose2DSingleToSingle(const SRC *CPL_RESTRICT pSrc,
6736 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6737 : size_t nSrcHeight)
6738 : {
6739 160 : constexpr size_t blocksize = 32;
6740 345 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6741 : {
6742 185 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6743 5016 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6744 : {
6745 : // transpose the block beginning at [i,j]
6746 4831 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6747 26185 : for (size_t k = i; k < max_k; ++k)
6748 : {
6749 669282 : for (size_t l = j; l < max_l; ++l)
6750 : {
6751 647928 : GDALCopyWord(pSrc[l + k * nSrcWidth],
6752 647928 : pDst[k + l * nSrcHeight]);
6753 : }
6754 : }
6755 : }
6756 : }
6757 160 : }
6758 :
6759 : /************************************************************************/
6760 : /* GDALTranspose2DComplexToComplex() */
6761 : /************************************************************************/
6762 : /**
6763 : * Transpose a 2D array of complex values into an array of complex values,
6764 : * in a efficient (cache-oblivious) way.
6765 : *
6766 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6767 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6768 : * @param nSrcWidth Width of pSrc array.
6769 : * @param nSrcHeight Height of pSrc array.
6770 : */
6771 : template <class DST, class SRC>
6772 25 : void GDALTranspose2DComplexToComplex(const SRC *CPL_RESTRICT pSrc,
6773 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6774 : size_t nSrcHeight)
6775 : {
6776 25 : constexpr size_t blocksize = 32;
6777 50 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6778 : {
6779 25 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6780 50 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6781 : {
6782 : // transpose the block beginning at [i,j]
6783 25 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6784 75 : for (size_t k = i; k < max_k; ++k)
6785 : {
6786 200 : for (size_t l = j; l < max_l; ++l)
6787 : {
6788 150 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
6789 150 : pDst[2 * (k + l * nSrcHeight) + 0]);
6790 150 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 1],
6791 150 : pDst[2 * (k + l * nSrcHeight) + 1]);
6792 : }
6793 : }
6794 : }
6795 : }
6796 25 : }
6797 :
6798 : /************************************************************************/
6799 : /* GDALTranspose2DComplexToSingle() */
6800 : /************************************************************************/
6801 : /**
6802 : * Transpose a 2D array of complex values into an array of non-complex values,
6803 : * in a efficient (cache-oblivious) way.
6804 : *
6805 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6806 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6807 : * @param nSrcWidth Width of pSrc array.
6808 : * @param nSrcHeight Height of pSrc array.
6809 : */
6810 : template <class DST, class SRC>
6811 55 : void GDALTranspose2DComplexToSingle(const SRC *CPL_RESTRICT pSrc,
6812 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6813 : size_t nSrcHeight)
6814 : {
6815 55 : constexpr size_t blocksize = 32;
6816 110 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6817 : {
6818 55 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6819 110 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6820 : {
6821 : // transpose the block beginning at [i,j]
6822 55 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6823 165 : for (size_t k = i; k < max_k; ++k)
6824 : {
6825 440 : for (size_t l = j; l < max_l; ++l)
6826 : {
6827 330 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
6828 330 : pDst[k + l * nSrcHeight]);
6829 : }
6830 : }
6831 : }
6832 : }
6833 55 : }
6834 :
6835 : /************************************************************************/
6836 : /* GDALTranspose2DSingleToComplex() */
6837 : /************************************************************************/
6838 : /**
6839 : * Transpose a 2D array of non-complex values into an array of complex values,
6840 : * in a efficient (cache-oblivious) way.
6841 : *
6842 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6843 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6844 : * @param nSrcWidth Width of pSrc array.
6845 : * @param nSrcHeight Height of pSrc array.
6846 : */
6847 : template <class DST, class SRC>
6848 55 : void GDALTranspose2DSingleToComplex(const SRC *CPL_RESTRICT pSrc,
6849 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6850 : size_t nSrcHeight)
6851 : {
6852 55 : constexpr size_t blocksize = 32;
6853 110 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6854 : {
6855 55 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6856 110 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6857 : {
6858 : // transpose the block beginning at [i,j]
6859 55 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6860 165 : for (size_t k = i; k < max_k; ++k)
6861 : {
6862 440 : for (size_t l = j; l < max_l; ++l)
6863 : {
6864 330 : GDALCopyWord(pSrc[l + k * nSrcWidth],
6865 330 : pDst[2 * (k + l * nSrcHeight) + 0]);
6866 330 : pDst[2 * (k + l * nSrcHeight) + 1] = 0;
6867 : }
6868 : }
6869 : }
6870 : }
6871 55 : }
6872 :
6873 : /************************************************************************/
6874 : /* GDALTranspose2D() */
6875 : /************************************************************************/
6876 :
6877 : template <class DST, bool DST_IS_COMPLEX>
6878 295 : static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
6879 : size_t nSrcWidth, size_t nSrcHeight)
6880 : {
6881 : #define CALL_GDALTranspose2D_internal(SRC_TYPE) \
6882 : do \
6883 : { \
6884 : if constexpr (DST_IS_COMPLEX) \
6885 : { \
6886 : GDALTranspose2DSingleToComplex( \
6887 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6888 : nSrcHeight); \
6889 : } \
6890 : else \
6891 : { \
6892 : GDALTranspose2DSingleToSingle(static_cast<const SRC_TYPE *>(pSrc), \
6893 : pDst, nSrcWidth, nSrcHeight); \
6894 : } \
6895 : } while (0)
6896 :
6897 : #define CALL_GDALTranspose2DComplex_internal(SRC_TYPE) \
6898 : do \
6899 : { \
6900 : if constexpr (DST_IS_COMPLEX) \
6901 : { \
6902 : GDALTranspose2DComplexToComplex( \
6903 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6904 : nSrcHeight); \
6905 : } \
6906 : else \
6907 : { \
6908 : GDALTranspose2DComplexToSingle( \
6909 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6910 : nSrcHeight); \
6911 : } \
6912 : } while (0)
6913 :
6914 : // clang-format off
6915 295 : switch (eSrcType)
6916 : {
6917 16 : case GDT_UInt8: CALL_GDALTranspose2D_internal(uint8_t); break;
6918 15 : case GDT_Int8: CALL_GDALTranspose2D_internal(int8_t); break;
6919 33 : case GDT_UInt16: CALL_GDALTranspose2D_internal(uint16_t); break;
6920 20 : case GDT_Int16: CALL_GDALTranspose2D_internal(int16_t); break;
6921 24 : case GDT_UInt32: CALL_GDALTranspose2D_internal(uint32_t); break;
6922 16 : case GDT_Int32: CALL_GDALTranspose2D_internal(int32_t); break;
6923 16 : case GDT_UInt64: CALL_GDALTranspose2D_internal(uint64_t); break;
6924 16 : case GDT_Int64: CALL_GDALTranspose2D_internal(int64_t); break;
6925 16 : case GDT_Float16: CALL_GDALTranspose2D_internal(GFloat16); break;
6926 19 : case GDT_Float32: CALL_GDALTranspose2D_internal(float); break;
6927 24 : case GDT_Float64: CALL_GDALTranspose2D_internal(double); break;
6928 16 : case GDT_CInt16: CALL_GDALTranspose2DComplex_internal(int16_t); break;
6929 16 : case GDT_CInt32: CALL_GDALTranspose2DComplex_internal(int32_t); break;
6930 16 : case GDT_CFloat16: CALL_GDALTranspose2DComplex_internal(GFloat16); break;
6931 16 : case GDT_CFloat32: CALL_GDALTranspose2DComplex_internal(float); break;
6932 16 : case GDT_CFloat64: CALL_GDALTranspose2DComplex_internal(double); break;
6933 0 : case GDT_Unknown:
6934 : case GDT_TypeCount:
6935 0 : break;
6936 : }
6937 : // clang-format on
6938 :
6939 : #undef CALL_GDALTranspose2D_internal
6940 : #undef CALL_GDALTranspose2DComplex_internal
6941 295 : }
6942 :
6943 : /************************************************************************/
6944 : /* GDALInterleave2Byte() */
6945 : /************************************************************************/
6946 :
6947 : #if defined(HAVE_SSE2) && \
6948 : (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
6949 :
6950 : // ICC autovectorizer doesn't do a good job at generating good SSE code,
6951 : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
6952 : #if defined(__GNUC__)
6953 : __attribute__((noinline))
6954 : #endif
6955 : static void GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
6956 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
6957 : {
6958 : size_t i = 0;
6959 : constexpr size_t VALS_PER_ITER = 16;
6960 : for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
6961 : {
6962 : __m128i xmm0 =
6963 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + i));
6964 : __m128i xmm1 = _mm_loadu_si128(
6965 : reinterpret_cast<__m128i const *>(pSrc + i + nIters));
6966 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + 2 * i),
6967 : _mm_unpacklo_epi8(xmm0, xmm1));
6968 : _mm_storeu_si128(
6969 : reinterpret_cast<__m128i *>(pDst + 2 * i + VALS_PER_ITER),
6970 : _mm_unpackhi_epi8(xmm0, xmm1));
6971 : }
6972 : #if defined(__clang__)
6973 : #pragma clang loop vectorize(disable)
6974 : #endif
6975 : for (; i < nIters; ++i)
6976 : {
6977 : pDst[2 * i + 0] = pSrc[i + 0 * nIters];
6978 : pDst[2 * i + 1] = pSrc[i + 1 * nIters];
6979 : }
6980 : }
6981 :
6982 : #else
6983 :
6984 : #if defined(__GNUC__) && !defined(__clang__)
6985 : __attribute__((optimize("tree-vectorize")))
6986 : #endif
6987 : #if defined(__GNUC__)
6988 : __attribute__((noinline))
6989 : #endif
6990 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6991 : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
6992 : #pragma clang diagnostic push
6993 : #pragma clang diagnostic ignored "-Wpass-failed"
6994 : #endif
6995 9 : static void GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
6996 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
6997 : {
6998 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6999 : #pragma clang loop vectorize(enable)
7000 : #endif
7001 355429 : for (size_t i = 0; i < nIters; ++i)
7002 : {
7003 355420 : pDst[2 * i + 0] = pSrc[i + 0 * nIters];
7004 355420 : pDst[2 * i + 1] = pSrc[i + 1 * nIters];
7005 : }
7006 9 : }
7007 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7008 : #pragma clang diagnostic pop
7009 : #endif
7010 :
7011 : #endif
7012 :
7013 : /************************************************************************/
7014 : /* GDALInterleave4Byte() */
7015 : /************************************************************************/
7016 :
7017 : #if defined(HAVE_SSE2) && \
7018 : (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
7019 :
7020 : // ICC autovectorizer doesn't do a good job at generating good SSE code,
7021 : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
7022 : #if defined(__GNUC__)
7023 : __attribute__((noinline))
7024 : #endif
7025 : static void GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
7026 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
7027 : {
7028 : size_t i = 0;
7029 : constexpr size_t VALS_PER_ITER = 16;
7030 : for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
7031 : {
7032 : __m128i xmm0 = _mm_loadu_si128(
7033 : reinterpret_cast<__m128i const *>(pSrc + i + 0 * nIters));
7034 : __m128i xmm1 = _mm_loadu_si128(
7035 : reinterpret_cast<__m128i const *>(pSrc + i + 1 * nIters));
7036 : __m128i xmm2 = _mm_loadu_si128(
7037 : reinterpret_cast<__m128i const *>(pSrc + i + 2 * nIters));
7038 : __m128i xmm3 = _mm_loadu_si128(
7039 : reinterpret_cast<__m128i const *>(pSrc + i + 3 * nIters));
7040 : auto tmp0 = _mm_unpacklo_epi8(
7041 : xmm0,
7042 : xmm1); // (xmm0_0, xmm1_0, xmm0_1, xmm1_1, xmm0_2, xmm1_2, ...)
7043 : auto tmp1 = _mm_unpackhi_epi8(
7044 : xmm0,
7045 : xmm1); // (xmm0_8, xmm1_8, xmm0_9, xmm1_9, xmm0_10, xmm1_10, ...)
7046 : auto tmp2 = _mm_unpacklo_epi8(
7047 : xmm2,
7048 : xmm3); // (xmm2_0, xmm3_0, xmm2_1, xmm3_1, xmm2_2, xmm3_2, ...)
7049 : auto tmp3 = _mm_unpackhi_epi8(
7050 : xmm2,
7051 : xmm3); // (xmm2_8, xmm3_8, xmm2_9, xmm3_9, xmm2_10, xmm3_10, ...)
7052 : auto tmp2_0 = _mm_unpacklo_epi16(
7053 : tmp0,
7054 : tmp2); // (xmm0_0, xmm1_0, xmm2_0, xmm3_0, xmm0_1, xmm1_1, xmm2_1, xmm3_1, ...)
7055 : auto tmp2_1 = _mm_unpackhi_epi16(tmp0, tmp2);
7056 : auto tmp2_2 = _mm_unpacklo_epi16(tmp1, tmp3);
7057 : auto tmp2_3 = _mm_unpackhi_epi16(tmp1, tmp3);
7058 : _mm_storeu_si128(
7059 : reinterpret_cast<__m128i *>(pDst + 4 * i + 0 * VALS_PER_ITER),
7060 : tmp2_0);
7061 : _mm_storeu_si128(
7062 : reinterpret_cast<__m128i *>(pDst + 4 * i + 1 * VALS_PER_ITER),
7063 : tmp2_1);
7064 : _mm_storeu_si128(
7065 : reinterpret_cast<__m128i *>(pDst + 4 * i + 2 * VALS_PER_ITER),
7066 : tmp2_2);
7067 : _mm_storeu_si128(
7068 : reinterpret_cast<__m128i *>(pDst + 4 * i + 3 * VALS_PER_ITER),
7069 : tmp2_3);
7070 : }
7071 : #if defined(__clang__)
7072 : #pragma clang loop vectorize(disable)
7073 : #endif
7074 : for (; i < nIters; ++i)
7075 : {
7076 : pDst[4 * i + 0] = pSrc[i + 0 * nIters];
7077 : pDst[4 * i + 1] = pSrc[i + 1 * nIters];
7078 : pDst[4 * i + 2] = pSrc[i + 2 * nIters];
7079 : pDst[4 * i + 3] = pSrc[i + 3 * nIters];
7080 : }
7081 : }
7082 :
7083 : #else
7084 :
7085 : #if defined(__GNUC__) && !defined(__clang__)
7086 : __attribute__((optimize("tree-vectorize")))
7087 : #endif
7088 : #if defined(__GNUC__)
7089 : __attribute__((noinline))
7090 : #endif
7091 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7092 : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
7093 : #pragma clang diagnostic push
7094 : #pragma clang diagnostic ignored "-Wpass-failed"
7095 : #endif
7096 30 : static void GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
7097 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
7098 : {
7099 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7100 : #pragma clang loop vectorize(enable)
7101 : #endif
7102 49620700 : for (size_t i = 0; i < nIters; ++i)
7103 : {
7104 49620600 : pDst[4 * i + 0] = pSrc[i + 0 * nIters];
7105 49620600 : pDst[4 * i + 1] = pSrc[i + 1 * nIters];
7106 49620600 : pDst[4 * i + 2] = pSrc[i + 2 * nIters];
7107 49620600 : pDst[4 * i + 3] = pSrc[i + 3 * nIters];
7108 : }
7109 30 : }
7110 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
7111 : #pragma clang diagnostic pop
7112 : #endif
7113 :
7114 : #endif
7115 :
7116 : /************************************************************************/
7117 : /* GDALTranspose2D() */
7118 : /************************************************************************/
7119 :
7120 : /**
7121 : * Transpose a 2D array in a efficient (cache-oblivious) way.
7122 : *
7123 : * @param pSrc Source array of width = nSrcWidth and height = nSrcHeight.
7124 : * @param eSrcType Data type of pSrc.
7125 : * @param pDst Destination transposed array of width = nSrcHeight and height = nSrcWidth.
7126 : * @param eDstType Data type of pDst.
7127 : * @param nSrcWidth Width of pSrc array.
7128 : * @param nSrcHeight Height of pSrc array.
7129 : * @since GDAL 3.11
7130 : */
7131 :
7132 365 : void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
7133 : GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
7134 : {
7135 365 : if (eSrcType == eDstType && (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8))
7136 : {
7137 70 : if (nSrcHeight == 2)
7138 : {
7139 9 : GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
7140 : static_cast<uint8_t *>(pDst), nSrcWidth);
7141 9 : return;
7142 : }
7143 61 : if (nSrcHeight == 4)
7144 : {
7145 30 : GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
7146 : static_cast<uint8_t *>(pDst), nSrcWidth);
7147 30 : return;
7148 : }
7149 : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
7150 : (defined(__x86_64) || defined(_M_X64)))
7151 31 : if (CPLHaveRuntimeSSSE3())
7152 : {
7153 31 : GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
7154 : static_cast<uint8_t *>(pDst), nSrcWidth,
7155 : nSrcHeight);
7156 31 : return;
7157 : }
7158 : #elif defined(USE_NEON_OPTIMIZATIONS)
7159 : {
7160 : GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
7161 : static_cast<uint8_t *>(pDst), nSrcWidth,
7162 : nSrcHeight);
7163 : return;
7164 : }
7165 : #endif
7166 : }
7167 :
7168 : #define CALL_GDALTranspose2D_internal(DST_TYPE, DST_IS_COMPLEX) \
7169 : GDALTranspose2D<DST_TYPE, DST_IS_COMPLEX>( \
7170 : pSrc, eSrcType, static_cast<DST_TYPE *>(pDst), nSrcWidth, nSrcHeight)
7171 :
7172 : // clang-format off
7173 295 : switch (eDstType)
7174 : {
7175 15 : case GDT_UInt8: CALL_GDALTranspose2D_internal(uint8_t, false); break;
7176 15 : case GDT_Int8: CALL_GDALTranspose2D_internal(int8_t, false); break;
7177 33 : case GDT_UInt16: CALL_GDALTranspose2D_internal(uint16_t, false); break;
7178 20 : case GDT_Int16: CALL_GDALTranspose2D_internal(int16_t, false); break;
7179 24 : case GDT_UInt32: CALL_GDALTranspose2D_internal(uint32_t, false); break;
7180 16 : case GDT_Int32: CALL_GDALTranspose2D_internal(int32_t, false); break;
7181 16 : case GDT_UInt64: CALL_GDALTranspose2D_internal(uint64_t, false); break;
7182 16 : case GDT_Int64: CALL_GDALTranspose2D_internal(int64_t, false); break;
7183 16 : case GDT_Float16: CALL_GDALTranspose2D_internal(GFloat16, false); break;
7184 19 : case GDT_Float32: CALL_GDALTranspose2D_internal(float, false); break;
7185 25 : case GDT_Float64: CALL_GDALTranspose2D_internal(double, false); break;
7186 16 : case GDT_CInt16: CALL_GDALTranspose2D_internal(int16_t, true); break;
7187 16 : case GDT_CInt32: CALL_GDALTranspose2D_internal(int32_t, true); break;
7188 16 : case GDT_CFloat16: CALL_GDALTranspose2D_internal(GFloat16, true); break;
7189 16 : case GDT_CFloat32: CALL_GDALTranspose2D_internal(float, true); break;
7190 16 : case GDT_CFloat64: CALL_GDALTranspose2D_internal(double, true); break;
7191 0 : case GDT_Unknown:
7192 : case GDT_TypeCount:
7193 0 : break;
7194 : }
7195 : // clang-format on
7196 :
7197 : #undef CALL_GDALTranspose2D_internal
7198 : }
7199 :
7200 : /************************************************************************/
7201 : /* ExtractBitAndConvertTo255() */
7202 : /************************************************************************/
7203 :
7204 : #if defined(__GNUC__) || defined(_MSC_VER)
7205 : // Signedness of char implementation dependent, so be explicit.
7206 : // Assumes 2-complement integer types and sign extension of right shifting
7207 : // GCC guarantees such:
7208 : // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
7209 124890 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
7210 : {
7211 124890 : return static_cast<GByte>(static_cast<signed char>(byVal << (7 - nBit)) >>
7212 124890 : 7);
7213 : }
7214 : #else
7215 : // Portable way
7216 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
7217 : {
7218 : return (byVal & (1 << nBit)) ? 255 : 0;
7219 : }
7220 : #endif
7221 :
7222 : /************************************************************************/
7223 : /* ExpandEightPackedBitsToByteAt255() */
7224 : /************************************************************************/
7225 :
7226 15569 : static inline void ExpandEightPackedBitsToByteAt255(GByte byVal,
7227 : GByte abyOutput[8])
7228 : {
7229 15569 : abyOutput[0] = ExtractBitAndConvertTo255(byVal, 7);
7230 15569 : abyOutput[1] = ExtractBitAndConvertTo255(byVal, 6);
7231 15569 : abyOutput[2] = ExtractBitAndConvertTo255(byVal, 5);
7232 15569 : abyOutput[3] = ExtractBitAndConvertTo255(byVal, 4);
7233 15569 : abyOutput[4] = ExtractBitAndConvertTo255(byVal, 3);
7234 15569 : abyOutput[5] = ExtractBitAndConvertTo255(byVal, 2);
7235 15569 : abyOutput[6] = ExtractBitAndConvertTo255(byVal, 1);
7236 15569 : abyOutput[7] = ExtractBitAndConvertTo255(byVal, 0);
7237 15569 : }
7238 :
7239 : /************************************************************************/
7240 : /* GDALExpandPackedBitsToByteAt0Or255() */
7241 : /************************************************************************/
7242 :
7243 : /** Expand packed-bits (ordered from most-significant bit to least one)
7244 : into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
7245 : at 1 to a byte at 255.
7246 :
7247 : The function does (in a possibly more optimized way) the following:
7248 : \code{.cpp}
7249 : for (size_t i = 0; i < nInputBits; ++i )
7250 : {
7251 : pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 255 : 0;
7252 : }
7253 : \endcode
7254 :
7255 : @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
7256 : @param pabyOutput Output array of nInputBits bytes.
7257 : @param nInputBits Number of valid bits in pabyInput.
7258 :
7259 : @since 3.11
7260 : */
7261 :
7262 45145 : void GDALExpandPackedBitsToByteAt0Or255(const GByte *CPL_RESTRICT pabyInput,
7263 : GByte *CPL_RESTRICT pabyOutput,
7264 : size_t nInputBits)
7265 : {
7266 45145 : const size_t nInputWholeBytes = nInputBits / 8;
7267 45145 : size_t iByte = 0;
7268 :
7269 : #ifdef HAVE_SSE2
7270 : // Mask to isolate each bit
7271 45145 : const __m128i bit_mask = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4,
7272 : 8, 16, 32, 64, -128);
7273 45145 : const __m128i zero = _mm_setzero_si128();
7274 45145 : const __m128i all_ones = _mm_set1_epi8(-1);
7275 : #ifdef __SSSE3__
7276 : const __m128i dispatch_two_bytes =
7277 : _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
7278 : #endif
7279 45145 : constexpr size_t SSE_REG_SIZE = sizeof(bit_mask);
7280 135654 : for (; iByte + SSE_REG_SIZE <= nInputWholeBytes; iByte += SSE_REG_SIZE)
7281 : {
7282 90509 : __m128i reg_ori = _mm_loadu_si128(
7283 90509 : reinterpret_cast<const __m128i *>(pabyInput + iByte));
7284 :
7285 90509 : constexpr int NUM_PROCESSED_BYTES_PER_REG = 2;
7286 814581 : for (size_t k = 0; k < SSE_REG_SIZE / NUM_PROCESSED_BYTES_PER_REG; ++k)
7287 : {
7288 : // Given reg_ori = (A, B, ... 14 other bytes ...),
7289 : // expand to (A, A, A, A, A, A, A, A, B, B, B, B, B, B, B, B)
7290 : #ifdef __SSSE3__
7291 : __m128i reg = _mm_shuffle_epi8(reg_ori, dispatch_two_bytes);
7292 : #else
7293 724072 : __m128i reg = _mm_unpacklo_epi8(reg_ori, reg_ori);
7294 724072 : reg = _mm_unpacklo_epi16(reg, reg);
7295 724072 : reg = _mm_unpacklo_epi32(reg, reg);
7296 : #endif
7297 :
7298 : // Test if bits of interest are set
7299 724072 : reg = _mm_and_si128(reg, bit_mask);
7300 :
7301 : // Now test if those bits are set, by comparing to zero. So the
7302 : // result will be that bytes where bits are set will be at 0, and
7303 : // ones where they are cleared will be at 0xFF. So the inverse of
7304 : // the end result we want!
7305 724072 : reg = _mm_cmpeq_epi8(reg, zero);
7306 :
7307 : // Invert the result
7308 724072 : reg = _mm_andnot_si128(reg, all_ones);
7309 :
7310 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyOutput), reg);
7311 :
7312 724072 : pabyOutput += SSE_REG_SIZE;
7313 :
7314 : // Right-shift of 2 bytes
7315 724072 : reg_ori = _mm_bsrli_si128(reg_ori, NUM_PROCESSED_BYTES_PER_REG);
7316 : }
7317 : }
7318 :
7319 : #endif // HAVE_SSE2
7320 :
7321 60714 : for (; iByte < nInputWholeBytes; ++iByte)
7322 : {
7323 15569 : ExpandEightPackedBitsToByteAt255(pabyInput[iByte], pabyOutput);
7324 15569 : pabyOutput += 8;
7325 : }
7326 45483 : for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
7327 : {
7328 338 : *pabyOutput = ExtractBitAndConvertTo255(pabyInput[iByte], 7 - iBit);
7329 338 : ++pabyOutput;
7330 : }
7331 45145 : }
7332 :
7333 : /************************************************************************/
7334 : /* ExpandEightPackedBitsToByteAt1() */
7335 : /************************************************************************/
7336 :
7337 136113 : static inline void ExpandEightPackedBitsToByteAt1(GByte byVal,
7338 : GByte abyOutput[8])
7339 : {
7340 136113 : abyOutput[0] = (byVal >> 7) & 0x1;
7341 136113 : abyOutput[1] = (byVal >> 6) & 0x1;
7342 136113 : abyOutput[2] = (byVal >> 5) & 0x1;
7343 136113 : abyOutput[3] = (byVal >> 4) & 0x1;
7344 136113 : abyOutput[4] = (byVal >> 3) & 0x1;
7345 136113 : abyOutput[5] = (byVal >> 2) & 0x1;
7346 136113 : abyOutput[6] = (byVal >> 1) & 0x1;
7347 136113 : abyOutput[7] = (byVal >> 0) & 0x1;
7348 136113 : }
7349 :
7350 : /************************************************************************/
7351 : /* GDALExpandPackedBitsToByteAt0Or1() */
7352 : /************************************************************************/
7353 :
7354 : /** Expand packed-bits (ordered from most-significant bit to least one)
7355 : into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
7356 : at 1 to a byte at 1.
7357 :
7358 : The function does (in a possibly more optimized way) the following:
7359 : \code{.cpp}
7360 : for (size_t i = 0; i < nInputBits; ++i )
7361 : {
7362 : pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
7363 : }
7364 : \endcode
7365 :
7366 : @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
7367 : @param pabyOutput Output array of nInputBits bytes.
7368 : @param nInputBits Number of valid bits in pabyInput.
7369 :
7370 : @since 3.11
7371 : */
7372 :
7373 7033 : void GDALExpandPackedBitsToByteAt0Or1(const GByte *CPL_RESTRICT pabyInput,
7374 : GByte *CPL_RESTRICT pabyOutput,
7375 : size_t nInputBits)
7376 : {
7377 7033 : const size_t nInputWholeBytes = nInputBits / 8;
7378 7033 : size_t iByte = 0;
7379 143146 : for (; iByte < nInputWholeBytes; ++iByte)
7380 : {
7381 136113 : ExpandEightPackedBitsToByteAt1(pabyInput[iByte], pabyOutput);
7382 136113 : pabyOutput += 8;
7383 : }
7384 18886 : for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
7385 : {
7386 11853 : *pabyOutput = (pabyInput[iByte] >> (7 - iBit)) & 0x1;
7387 11853 : ++pabyOutput;
7388 : }
7389 7033 : }
|