Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: GDAL Core
4 : * Purpose: Contains default implementation of GDALRasterBand::IRasterIO()
5 : * and supporting functions of broader utility.
6 : * Author: Frank Warmerdam, warmerdam@pobox.com
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 1998, Frank Warmerdam
10 : * Copyright (c) 2007-2014, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * SPDX-License-Identifier: MIT
13 : ****************************************************************************/
14 :
15 : #include "cpl_port.h"
16 : #include "gdal.h"
17 : #include "gdal_priv.h"
18 :
19 : #include <cassert>
20 : #include <climits>
21 : #include <cmath>
22 : #include <cstddef>
23 : #include <cstdio>
24 : #include <cstdlib>
25 : #include <cstring>
26 :
27 : #include <algorithm>
28 : #include <limits>
29 : #include <stdexcept>
30 : #include <type_traits>
31 :
32 : #include "cpl_conv.h"
33 : #include "cpl_cpu_features.h"
34 : #include "cpl_error.h"
35 : #include "cpl_float.h"
36 : #include "cpl_progress.h"
37 : #include "cpl_string.h"
38 : #include "cpl_vsi.h"
39 : #include "gdal_priv_templates.hpp"
40 : #include "gdal_vrt.h"
41 : #include "gdalwarper.h"
42 : #include "memdataset.h"
43 : #include "vrtdataset.h"
44 :
45 : #if defined(__x86_64) || defined(_M_X64)
46 : #include <emmintrin.h>
47 : #define HAVE_SSE2
48 : #elif defined(USE_NEON_OPTIMIZATIONS)
49 : #include "include_sse2neon.h"
50 : #define HAVE_SSE2
51 : #endif
52 :
53 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
54 : #include "rasterio_ssse3.h"
55 : #ifdef __SSSE3__
56 : #include <tmmintrin.h>
57 : #endif
58 : #endif
59 :
60 : #ifdef __SSE4_1__
61 : #include <smmintrin.h>
62 : #endif
63 :
// CPL_NOINLINE: expands to the GCC-style noinline function attribute when
// __GNUC__ is defined, and to nothing for every other compiler.
64 : #ifdef __GNUC__
65 : #define CPL_NOINLINE __attribute__((noinline))
66 : #else
67 : #define CPL_NOINLINE
68 : #endif
69 :
// Forward declaration: DownsamplingIntegerXFactor() below calls this helper
// before its definition appears in the file.
// NOTE(review): the parameter names suggest a strided copy of nWordCount bytes
// (source stride nSrcPixelStride, destination stride nDstPixelStride) --
// confirm against the definition.
70 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
71 : int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
72 : int nDstPixelStride, GPtrDiff_t nWordCount);
73 :
74 : /************************************************************************/
75 : /* DownsamplingIntegerXFactor() */
76 : /************************************************************************/
77 :
// Copies nBufXSize pixels into pabyDstData (advancing nPixelSpace bytes per
// pixel), reading every nSrcXInc-th source pixel starting at column iSrcX from
// the blocks of poBand that lie on block row nLBlockY.
//
// Template parameters:
//   bSameDataType  - when true each pixel is memcpy()'d as nBandDataSize
//                    (== DATA_TYPE_SIZE) bytes; when false GDALCopyWords64()
//                    converts from eDataType to eBufType and the source pixel
//                    size comes from GDALGetDataTypeSizeBytes(eDataType).
//   DATA_TYPE_SIZE - source pixel size in bytes, only used when bSameDataType
//                    is true; the value 1 selects the GDALFastCopyByte() path.
//
// Parameters of note:
//   iSrcOffsetCst - pixel offset added to (iSrcX - nStartBlockX) when indexing
//                   the block buffer (the IRasterIO() caller passes the offset
//                   of the source row inside the block).
//   nStartBlockX  - in/out: column of the first pixel of the block held in
//                   poBlock; rewritten each time another block is loaded.
//   poBlock       - in/out: currently locked block. It must be non-null on
//                   entry when iSrcX is already below
//                   nStartBlockX + nBlockXSize (checked with CPLAssert). The
//                   previous lock is dropped before another block is locked;
//                   this function does not drop the lock of the block it
//                   leaves in poBlock on a successful return.
//
// Returns false when GetLockedBlockRef() yields nullptr (poBlock is nullptr in
// that case), true otherwise.
//
// Control flow: the first and the last pixel enter the body of the while loop
// through goto (labels reload_block / no_reload_block) rather than through the
// loop condition.
78 : template <bool bSameDataType, int DATA_TYPE_SIZE>
79 695677 : static bool DownsamplingIntegerXFactor(
80 : GDALRasterBand *poBand, int iSrcX, int nSrcXInc, GPtrDiff_t iSrcOffsetCst,
81 : GByte *CPL_RESTRICT pabyDstData, int nPixelSpace, int nBufXSize,
82 : GDALDataType eDataType, GDALDataType eBufType, int &nStartBlockX,
83 : int nBlockXSize, GDALRasterBlock *&poBlock, int nLBlockY)
84 : {
85 695677 : const int nBandDataSize =
86 : bSameDataType ? DATA_TYPE_SIZE : GDALGetDataTypeSizeBytes(eDataType);
// Number of pixels still to copy once the first one has been handled.
87 695677 : int nOuterLoopIters = nBufXSize - 1;
// Byte distance between two consecutively sampled source pixels.
88 695677 : const int nIncSrcOffset = nSrcXInc * nBandDataSize;
// Left uninitialized here: both entry gotos below reach no_reload_block, which
// assigns it, before it is ever read.
89 : const GByte *CPL_RESTRICT pabySrcData;
90 695677 : int nEndBlockX = nBlockXSize + nStartBlockX;
91 :
// First pixel: jump directly into the loop body, bypassing the loop condition
// and the increments at the top of the body.
92 695677 : if (iSrcX < nEndBlockX)
93 : {
94 294999 : CPLAssert(poBlock);
95 294999 : goto no_reload_block;
96 : }
97 400678 : goto reload_block;
98 :
99 : // Don't do the last iteration in the loop, as iSrcX might go beyond
100 : // nRasterXSize - 1
101 1264772 : while (--nOuterLoopIters >= 1)
102 : {
103 201834 : iSrcX += nSrcXInc;
104 201834 : pabySrcData += nIncSrcOffset;
105 201834 : pabyDstData += nPixelSpace;
106 :
107 : /* --------------------------------------------------------------------
108 : */
109 : /* Ensure we have the appropriate block loaded. */
110 : /* --------------------------------------------------------------------
111 : */
112 201834 : if (iSrcX >= nEndBlockX)
113 : {
114 201834 : reload_block:
115 : {
116 615102 : const int nLBlockX = iSrcX / nBlockXSize;
117 615102 : nStartBlockX = nLBlockX * nBlockXSize;
118 615102 : nEndBlockX = nStartBlockX + nBlockXSize;
119 :
120 615102 : if (poBlock != nullptr)
121 341314 : poBlock->DropLock();
122 :
123 615102 : poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
124 615102 : if (poBlock == nullptr)
125 : {
126 1 : return false;
127 : }
128 : }
129 :
// Recompute the source pointer from iSrcX inside the block now in poBlock.
130 615101 : no_reload_block:
131 : const GByte *pabySrcBlock =
132 1264772 : static_cast<const GByte *>(poBlock->GetDataRef());
133 1264772 : GPtrDiff_t iSrcOffset =
134 1264772 : (iSrcX - nStartBlockX + iSrcOffsetCst) * nBandDataSize;
135 1264772 : pabySrcData = pabySrcBlock + iSrcOffset;
136 : }
137 :
138 : /* --------------------------------------------------------------------
139 : */
140 : /* Copy the maximum run of pixels. */
141 : /* --------------------------------------------------------------------
142 : */
143 :
// Pixels obtainable from the current block at stride nSrcXInc starting at
// iSrcX (ceiling division), capped by nOuterLoopIters so that the final pixel
// is never part of a run and is left to the clamped path after the loop.
144 1264772 : const int nIters = std::min(
145 1264772 : (nEndBlockX - iSrcX + (nSrcXInc - 1)) / nSrcXInc, nOuterLoopIters);
146 : if (bSameDataType)
147 : {
// The current pixel is always copied, even when nIters is 0.
148 1264367 : memcpy(pabyDstData, pabySrcData, nBandDataSize);
149 1264367 : if (nIters > 1)
150 : {
151 : if (DATA_TYPE_SIZE == 1)
152 : {
// Advance one stride, copy the nIters - 1 remaining bytes of the run, then
// advance nIters - 2 more strides so both pointers rest on the last pixel
// copied.
153 326246 : pabySrcData += nIncSrcOffset;
154 326246 : pabyDstData += nPixelSpace;
155 326246 : GDALFastCopyByte(pabySrcData, nIncSrcOffset, pabyDstData,
156 326246 : nPixelSpace, nIters - 1);
157 326246 : pabySrcData +=
158 326246 : static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 2);
159 326246 : pabyDstData +=
160 326246 : static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 2);
161 : }
162 : else
163 : {
164 4395158 : for (int i = 0; i < nIters - 1; i++)
165 : {
166 4197064 : pabySrcData += nIncSrcOffset;
167 4197064 : pabyDstData += nPixelSpace;
168 4197064 : memcpy(pabyDstData, pabySrcData, nBandDataSize);
169 : }
170 : }
// Account for the nIters - 1 pixels consumed beyond the current one.
171 524340 : iSrcX += nSrcXInc * (nIters - 1);
172 524340 : nOuterLoopIters -= nIters - 1;
173 : }
174 : }
175 : else
176 : {
177 : // Type to type conversion ...
// std::max(1, nIters): nIters is 0 when nOuterLoopIters is 0, and the current
// pixel must still be converted.
178 405 : GDALCopyWords64(pabySrcData, eDataType, nIncSrcOffset, pabyDstData,
179 405 : eBufType, nPixelSpace, std::max(1, nIters));
180 405 : if (nIters > 1)
181 : {
182 198 : pabySrcData +=
183 198 : static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 1);
184 198 : pabyDstData +=
185 198 : static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 1);
186 198 : iSrcX += nSrcXInc * (nIters - 1);
187 198 : nOuterLoopIters -= nIters - 1;
188 : }
189 : }
190 : }
191 :
192 : // Deal with last iteration to avoid iSrcX to go beyond nRasterXSize - 1
// nOuterLoopIters is 0 here when exactly one pixel remains. After the jump back
// into the loop body, the loop condition decrements the counter to -1, so the
// loop exits and this test is false the second time.
193 1062938 : if (nOuterLoopIters == 0)
194 : {
195 367262 : const int nRasterXSize = poBand->GetXSize();
// The sum is formed in 64 bits before being clamped to the last raster column.
196 367262 : iSrcX =
197 734524 : static_cast<int>(std::min(static_cast<GInt64>(iSrcX) + nSrcXInc,
198 367262 : static_cast<GInt64>(nRasterXSize - 1)));
199 367262 : pabyDstData += nPixelSpace;
200 367262 : if (iSrcX < nEndBlockX)
201 : {
202 354672 : goto no_reload_block;
203 : }
204 12590 : goto reload_block;
205 : }
206 695676 : return true;
207 : }
208 :
// Returns a * b, with the result type deduced from the operand types.
// The function is tagged CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW.
// NOTE(review): presumably that macro exempts the multiplication from the
// unsigned-integer-overflow sanitizer; it is defined outside this file --
// confirm. IRasterIO() uses this helper to multiply an int span by a size_t
// pixel spacing.
209 : template <class A, class B>
210 2726210 : CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW inline auto CPLUnsanitizedMul(A a, B b)
211 : {
212 2726210 : return a * b;
213 : }
214 :
215 : /************************************************************************/
216 : /* IRasterIO() */
217 : /* */
218 : /* Default internal implementation of RasterIO() ... utilizes */
219 : /* the Block access methods to satisfy the request. This would */
220 : /* normally only be overridden by formats with overviews. */
221 : /************************************************************************/
222 :
223 6127840 : CPLErr GDALRasterBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
224 : int nXSize, int nYSize, void *pData,
225 : int nBufXSize, int nBufYSize,
226 : GDALDataType eBufType, GSpacing nPixelSpace,
227 : GSpacing nLineSpace,
228 : GDALRasterIOExtraArg *psExtraArg)
229 :
230 : {
231 6127840 : if (eRWFlag == GF_Write && eFlushBlockErr != CE_None)
232 : {
233 0 : CPLError(eFlushBlockErr, CPLE_AppDefined,
234 : "An error occurred while writing a dirty block "
235 : "from GDALRasterBand::IRasterIO");
236 0 : CPLErr eErr = eFlushBlockErr;
237 0 : eFlushBlockErr = CE_None;
238 0 : return eErr;
239 : }
240 6127840 : if (nBlockXSize <= 0 || nBlockYSize <= 0)
241 : {
242 106 : CPLError(CE_Failure, CPLE_AppDefined, "Invalid block size");
243 0 : return CE_Failure;
244 : }
245 :
246 6127740 : const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
247 6127740 : const int nBufDataSize = GDALGetDataTypeSizeBytes(eBufType);
248 6127700 : GByte dummyBlock[2] = {0, 0};
249 6127700 : GByte *pabySrcBlock =
250 : dummyBlock; /* to avoid Coverity warning about nullptr dereference */
251 6127700 : GDALRasterBlock *poBlock = nullptr;
252 6127700 : const bool bUseIntegerRequestCoords =
253 6474390 : (!psExtraArg->bFloatingPointWindowValidity ||
254 346685 : (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
255 323303 : nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
256 :
257 : /* ==================================================================== */
258 : /* A common case is the data requested with the destination */
259 : /* is packed, and the block width is the raster width. */
260 : /* ==================================================================== */
261 6049950 : if (nPixelSpace == nBufDataSize && nLineSpace == nPixelSpace * nXSize &&
262 3203720 : nBlockXSize == GetXSize() && nBufXSize == nXSize &&
263 12177600 : nBufYSize == nYSize && bUseIntegerRequestCoords)
264 : {
265 3091450 : CPLErr eErr = CE_None;
266 3091450 : int nLBlockY = -1;
267 :
268 9283750 : for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
269 : {
270 6193190 : const int iSrcY = iBufYOff + nYOff;
271 :
272 6193190 : if (iSrcY < nLBlockY * nBlockYSize ||
273 6192950 : iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
274 : {
275 3353740 : nLBlockY = iSrcY / nBlockYSize;
276 3353740 : bool bJustInitialize =
277 295403 : eRWFlag == GF_Write && nXOff == 0 &&
278 3706200 : nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
279 57065 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize;
280 :
281 : // Is this a partial tile at right and/or bottom edges of
282 : // the raster, and that is going to be completely written?
283 : // If so, do not load it from storage, but zero it so that
284 : // the content outside of the validity area is initialized.
285 3353740 : bool bMemZeroBuffer = false;
286 295403 : if (eRWFlag == GF_Write && !bJustInitialize && nXOff == 0 &&
287 23861 : nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
288 3649230 : nYOff + nYSize == GetYSize() &&
289 89 : nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)
290 : {
291 89 : bJustInitialize = true;
292 89 : bMemZeroBuffer = true;
293 : }
294 :
295 3353740 : if (poBlock)
296 262257 : poBlock->DropLock();
297 :
298 3353730 : const GUInt32 nErrorCounter = CPLGetErrorCounter();
299 3353810 : poBlock = GetLockedBlockRef(0, nLBlockY, bJustInitialize);
300 3354070 : if (poBlock == nullptr)
301 : {
302 1078 : if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
303 : nullptr)
304 : {
305 0 : CPLError(CE_Failure, CPLE_AppDefined,
306 : "GetBlockRef failed at X block offset %d, "
307 : "Y block offset %d%s",
308 : 0, nLBlockY,
309 0 : (nErrorCounter != CPLGetErrorCounter())
310 0 : ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
311 : : "");
312 : }
313 1078 : eErr = CE_Failure;
314 1078 : break;
315 : }
316 :
317 3352990 : if (eRWFlag == GF_Write)
318 295403 : poBlock->MarkDirty();
319 :
320 3352990 : pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
321 3352900 : if (bMemZeroBuffer)
322 : {
323 89 : memset(pabySrcBlock, 0,
324 89 : static_cast<GPtrDiff_t>(nBandDataSize) *
325 89 : nBlockXSize * nBlockYSize);
326 : }
327 : }
328 :
329 6192350 : const auto nSrcByteOffset =
330 6192350 : (static_cast<GPtrDiff_t>(iSrcY - nLBlockY * nBlockYSize) *
331 6192350 : nBlockXSize +
332 6192350 : nXOff) *
333 6192350 : nBandDataSize;
334 :
335 6192350 : if (eDataType == eBufType)
336 : {
337 2555580 : if (eRWFlag == GF_Read)
338 2084750 : memcpy(static_cast<GByte *>(pData) +
339 2084750 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
340 2084750 : pabySrcBlock + nSrcByteOffset,
341 : static_cast<size_t>(nLineSpace));
342 : else
343 470829 : memcpy(pabySrcBlock + nSrcByteOffset,
344 470829 : static_cast<GByte *>(pData) +
345 470829 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
346 : static_cast<size_t>(nLineSpace));
347 : }
348 : else
349 : {
350 : // Type to type conversion.
351 3636770 : if (eRWFlag == GF_Read)
352 3614870 : GDALCopyWords64(
353 3614870 : pabySrcBlock + nSrcByteOffset, eDataType, nBandDataSize,
354 : static_cast<GByte *>(pData) +
355 3614870 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
356 : eBufType, static_cast<int>(nPixelSpace), nBufXSize);
357 : else
358 21901 : GDALCopyWords64(static_cast<GByte *>(pData) +
359 21901 : static_cast<GPtrDiff_t>(iBufYOff) *
360 : nLineSpace,
361 : eBufType, static_cast<int>(nPixelSpace),
362 21901 : pabySrcBlock + nSrcByteOffset, eDataType,
363 : nBandDataSize, nBufXSize);
364 : }
365 :
366 6265860 : if (psExtraArg->pfnProgress != nullptr &&
367 73559 : !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
368 : psExtraArg->pProgressData))
369 : {
370 5 : eErr = CE_Failure;
371 5 : break;
372 : }
373 : }
374 :
375 3091650 : if (poBlock)
376 3090730 : poBlock->DropLock();
377 :
378 3091800 : return eErr;
379 : }
380 :
381 : /* ==================================================================== */
382 : /* Do we have overviews that would be appropriate to satisfy */
383 : /* this request? */
384 : /* ==================================================================== */
385 3036210 : if ((nBufXSize < nXSize || nBufYSize < nYSize) && GetOverviewCount() > 0 &&
386 : eRWFlag == GF_Read)
387 : {
388 : GDALRasterIOExtraArg sExtraArg;
389 2902 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
390 :
391 : const int nOverview =
392 2902 : GDALBandGetBestOverviewLevel2(this, nXOff, nYOff, nXSize, nYSize,
393 : nBufXSize, nBufYSize, &sExtraArg);
394 2902 : if (nOverview >= 0)
395 : {
396 2827 : GDALRasterBand *poOverviewBand = GetOverview(nOverview);
397 2827 : if (poOverviewBand == nullptr)
398 2827 : return CE_Failure;
399 :
400 2827 : return poOverviewBand->RasterIO(
401 : eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
402 2827 : nBufYSize, eBufType, nPixelSpace, nLineSpace, &sExtraArg);
403 : }
404 : }
405 :
406 844560 : if (eRWFlag == GF_Read && nBufXSize < nXSize / 100 &&
407 6 : nBufYSize < nYSize / 100 && nPixelSpace == nBufDataSize &&
408 3877860 : nLineSpace == nPixelSpace * nBufXSize &&
409 6 : CPLTestBool(CPLGetConfigOption("GDAL_NO_COSTLY_OVERVIEW", "NO")))
410 : {
411 0 : memset(pData, 0, static_cast<size_t>(nLineSpace * nBufYSize));
412 0 : return CE_None;
413 : }
414 :
415 : /* ==================================================================== */
416 : /* The second case when we don't need subsample data but likely */
417 : /* need data type conversion. */
418 : /* ==================================================================== */
419 3033300 : if ( // nPixelSpace == nBufDataSize &&
420 3033300 : nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
421 : {
422 : #if DEBUG_VERBOSE
423 : printf("IRasterIO(%d,%d,%d,%d) rw=%d case 2\n", /*ok*/
424 : nXOff, nYOff, nXSize, nYSize, static_cast<int>(eRWFlag));
425 : #endif
426 :
427 : /* --------------------------------------------------------------------
428 : */
429 : /* Loop over buffer computing source locations. */
430 : /* --------------------------------------------------------------------
431 : */
432 : // Calculate starting values out of loop
433 2468080 : const int nLBlockXStart = nXOff / nBlockXSize;
434 2468080 : const int nXSpanEnd = nBufXSize + nXOff;
435 :
436 2468080 : int nYInc = 0;
437 4975050 : for (int iBufYOff = 0, iSrcY = nYOff; iBufYOff < nBufYSize;
438 2506970 : iBufYOff += nYInc, iSrcY += nYInc)
439 : {
440 2507040 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
441 : static_cast<GPtrDiff_t>(nLineSpace);
442 2507040 : int nLBlockY = iSrcY / nBlockYSize;
443 2507040 : int nLBlockX = nLBlockXStart;
444 2507040 : int iSrcX = nXOff;
445 5233190 : while (iSrcX < nXSpanEnd)
446 : {
447 2726210 : int nXSpan = nLBlockX * nBlockXSize;
448 2726210 : if (nXSpan < INT_MAX - nBlockXSize)
449 2726210 : nXSpan += nBlockXSize;
450 : else
451 0 : nXSpan = INT_MAX;
452 2726210 : const int nXRight = nXSpan;
453 2726210 : nXSpan = (nXSpan < nXSpanEnd ? nXSpan : nXSpanEnd) - iSrcX;
454 :
455 : const size_t nXSpanSize =
456 2726210 : CPLUnsanitizedMul(nXSpan, static_cast<size_t>(nPixelSpace));
457 :
458 2726210 : bool bJustInitialize =
459 2042160 : eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
460 37248 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
461 4793940 : nXOff <= nLBlockX * nBlockXSize &&
462 25577 : nXOff + nXSize >= nXRight;
463 :
464 : // Is this a partial tile at right and/or bottom edges of
465 : // the raster, and that is going to be completely written?
466 : // If so, do not load it from storage, but zero it so that
467 : // the content outside of the validity area is initialized.
468 2726210 : bool bMemZeroBuffer = false;
469 2042160 : if (eRWFlag == GF_Write && !bJustInitialize &&
470 2017820 : nXOff <= nLBlockX * nBlockXSize &&
471 2016190 : nYOff <= nLBlockY * nBlockYSize &&
472 12145 : (nXOff + nXSize >= nXRight ||
473 : // cppcheck-suppress knownConditionTrueFalse
474 4771070 : (nXOff + nXSize == GetXSize() && nXRight > GetXSize())) &&
475 11965 : (nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize ||
476 10743 : (nYOff + nYSize == GetYSize() &&
477 1955 : nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)))
478 : {
479 3177 : bJustInitialize = true;
480 3177 : bMemZeroBuffer = true;
481 : }
482 :
483 : /* --------------------------------------------------------------------
484 : */
485 : /* Ensure we have the appropriate block loaded. */
486 : /* --------------------------------------------------------------------
487 : */
488 2726210 : const GUInt32 nErrorCounter = CPLGetErrorCounter();
489 2726220 : poBlock =
490 2726210 : GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
491 2726220 : if (!poBlock)
492 : {
493 71 : if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
494 : nullptr)
495 : {
496 0 : CPLError(CE_Failure, CPLE_AppDefined,
497 : "GetBlockRef failed at X block offset %d, "
498 : "Y block offset %d%s",
499 : nLBlockX, nLBlockY,
500 0 : (nErrorCounter != CPLGetErrorCounter())
501 0 : ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
502 : : "");
503 : }
504 71 : return (CE_Failure);
505 : }
506 :
507 2726150 : if (eRWFlag == GF_Write)
508 2042160 : poBlock->MarkDirty();
509 :
510 2726150 : pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
511 2726150 : if (bMemZeroBuffer)
512 : {
513 3177 : memset(pabySrcBlock, 0,
514 3177 : static_cast<GPtrDiff_t>(nBandDataSize) *
515 3177 : nBlockXSize * nBlockYSize);
516 : }
517 : /* --------------------------------------------------------------------
518 : */
519 : /* Copy over this chunk of data. */
520 : /* --------------------------------------------------------------------
521 : */
522 2726150 : GPtrDiff_t iSrcOffset =
523 2726150 : (static_cast<GPtrDiff_t>(iSrcX) -
524 2726150 : static_cast<GPtrDiff_t>(nLBlockX * nBlockXSize) +
525 2726150 : (static_cast<GPtrDiff_t>(iSrcY) -
526 2726150 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
527 2726150 : nBlockXSize) *
528 2726150 : nBandDataSize;
529 : // Fill up as many rows as possible for the loaded block.
530 5452300 : const int kmax = std::min(nBlockYSize - (iSrcY % nBlockYSize),
531 2726150 : nBufYSize - iBufYOff);
532 58961300 : for (int k = 0; k < kmax; k++)
533 : {
534 56235200 : if (eDataType == eBufType && nPixelSpace == nBufDataSize)
535 : {
536 52305800 : if (eRWFlag == GF_Read)
537 47862400 : memcpy(static_cast<GByte *>(pData) + iBufOffset +
538 47862400 : static_cast<GPtrDiff_t>(k) * nLineSpace,
539 47862400 : pabySrcBlock + iSrcOffset, nXSpanSize);
540 : else
541 4443410 : memcpy(pabySrcBlock + iSrcOffset,
542 4443410 : static_cast<GByte *>(pData) + iBufOffset +
543 4443410 : static_cast<GPtrDiff_t>(k) * nLineSpace,
544 : nXSpanSize);
545 : }
546 : else
547 : {
548 : /* type to type conversion */
549 3929370 : if (eRWFlag == GF_Read)
550 3898000 : GDALCopyWords64(
551 3898000 : pabySrcBlock + iSrcOffset, eDataType,
552 : nBandDataSize,
553 3898000 : static_cast<GByte *>(pData) + iBufOffset +
554 3898000 : static_cast<GPtrDiff_t>(k) * nLineSpace,
555 : eBufType, static_cast<int>(nPixelSpace),
556 : nXSpan);
557 : else
558 31365 : GDALCopyWords64(
559 31365 : static_cast<GByte *>(pData) + iBufOffset +
560 31365 : static_cast<GPtrDiff_t>(k) * nLineSpace,
561 : eBufType, static_cast<int>(nPixelSpace),
562 31365 : pabySrcBlock + iSrcOffset, eDataType,
563 : nBandDataSize, nXSpan);
564 : }
565 :
566 56235200 : iSrcOffset +=
567 56235200 : static_cast<GPtrDiff_t>(nBlockXSize) * nBandDataSize;
568 : }
569 :
570 : iBufOffset =
571 2726130 : CPLUnsanitizedAdd<GPtrDiff_t>(iBufOffset, nXSpanSize);
572 2726020 : nLBlockX++;
573 2726020 : iSrcX += nXSpan;
574 :
575 2726020 : poBlock->DropLock();
576 2726140 : poBlock = nullptr;
577 : }
578 :
579 : /* Compute the increment to go on a block boundary */
580 2506980 : nYInc = nBlockYSize - (iSrcY % nBlockYSize);
581 :
582 2508840 : if (psExtraArg->pfnProgress != nullptr &&
583 1853 : !psExtraArg->pfnProgress(
584 2508840 : 1.0 * std::min(nBufYSize, iBufYOff + nYInc) / nBufYSize, "",
585 : psExtraArg->pProgressData))
586 : {
587 5 : return CE_Failure;
588 : }
589 : }
590 :
591 2468010 : return CE_None;
592 : }
593 :
594 : /* ==================================================================== */
595 : /* Loop reading required source blocks to satisfy output */
596 : /* request. This is the most general implementation. */
597 : /* ==================================================================== */
598 :
599 565222 : double dfXOff = nXOff;
600 565222 : double dfYOff = nYOff;
601 565222 : double dfXSize = nXSize;
602 565222 : double dfYSize = nYSize;
603 565222 : if (psExtraArg->bFloatingPointWindowValidity)
604 : {
605 230510 : dfXOff = psExtraArg->dfXOff;
606 230510 : dfYOff = psExtraArg->dfYOff;
607 230510 : dfXSize = psExtraArg->dfXSize;
608 230510 : dfYSize = psExtraArg->dfYSize;
609 : }
610 :
611 : /* -------------------------------------------------------------------- */
612 : /* Compute stepping increment. */
613 : /* -------------------------------------------------------------------- */
614 565222 : const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
615 565222 : const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
616 565222 : CPLErr eErr = CE_None;
617 :
618 565222 : if (eRWFlag == GF_Write)
619 : {
620 : /* --------------------------------------------------------------------
621 : */
622 : /* Write case */
623 : /* Loop over raster window computing source locations in the buffer.
624 : */
625 : /* --------------------------------------------------------------------
626 : */
627 166651 : GByte *pabyDstBlock = nullptr;
628 166651 : int nLBlockX = -1;
629 166651 : int nLBlockY = -1;
630 :
631 1259990 : for (int iDstY = nYOff; iDstY < nYOff + nYSize; iDstY++)
632 : {
633 1093340 : const int iBufYOff = static_cast<int>((iDstY - nYOff) / dfSrcYInc);
634 :
635 12384000 : for (int iDstX = nXOff; iDstX < nXOff + nXSize; iDstX++)
636 : {
637 11290600 : const int iBufXOff =
638 11290600 : static_cast<int>((iDstX - nXOff) / dfSrcXInc);
639 11290600 : GPtrDiff_t iBufOffset =
640 11290600 : static_cast<GPtrDiff_t>(iBufYOff) *
641 : static_cast<GPtrDiff_t>(nLineSpace) +
642 11290600 : iBufXOff * static_cast<GPtrDiff_t>(nPixelSpace);
643 :
644 : // FIXME: this code likely doesn't work if the dirty block gets
645 : // flushed to disk before being completely written.
646 : // In the meantime, bJustInitialize should probably be set to
647 : // FALSE even if it is not ideal performance wise, and for
648 : // lossy compression.
649 :
650 : /* --------------------------------------------------------------------
651 : */
652 : /* Ensure we have the appropriate block loaded. */
653 : /* --------------------------------------------------------------------
654 : */
655 11290600 : if (iDstX < nLBlockX * nBlockXSize ||
656 11041300 : iDstX - nBlockXSize >= nLBlockX * nBlockXSize ||
657 10584600 : iDstY < nLBlockY * nBlockYSize ||
658 10584600 : iDstY - nBlockYSize >= nLBlockY * nBlockYSize)
659 : {
660 738682 : nLBlockX = iDstX / nBlockXSize;
661 738682 : nLBlockY = iDstY / nBlockYSize;
662 :
663 738682 : const bool bJustInitialize =
664 1065950 : nYOff <= nLBlockY * nBlockYSize &&
665 327271 : nYOff + nYSize - nBlockYSize >=
666 327271 : nLBlockY * nBlockYSize &&
667 1116260 : nXOff <= nLBlockX * nBlockXSize &&
668 50305 : nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
669 : /*bool bMemZeroBuffer = FALSE;
670 : if( !bJustInitialize &&
671 : nXOff <= nLBlockX * nBlockXSize &&
672 : nYOff <= nLBlockY * nBlockYSize &&
673 : (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
674 : (nXOff + nXSize == GetXSize() &&
675 : (nLBlockX+1) * nBlockXSize > GetXSize())) &&
676 : (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
677 : (nYOff + nYSize == GetYSize() &&
678 : (nLBlockY+1) * nBlockYSize > GetYSize())) )
679 : {
680 : bJustInitialize = TRUE;
681 : bMemZeroBuffer = TRUE;
682 : }*/
683 738682 : if (poBlock != nullptr)
684 572031 : poBlock->DropLock();
685 :
686 738682 : poBlock =
687 738682 : GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
688 738682 : if (poBlock == nullptr)
689 : {
690 0 : return (CE_Failure);
691 : }
692 :
693 738682 : poBlock->MarkDirty();
694 :
695 738682 : pabyDstBlock = static_cast<GByte *>(poBlock->GetDataRef());
696 : /*if( bMemZeroBuffer )
697 : {
698 : memset(pabyDstBlock, 0,
699 : static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
700 : * nBlockYSize);
701 : }*/
702 : }
703 :
704 : // To make Coverity happy. Should not happen by design.
705 11290600 : if (pabyDstBlock == nullptr)
706 : {
707 0 : CPLAssert(false);
708 : eErr = CE_Failure;
709 : break;
710 : }
711 :
712 : /* --------------------------------------------------------------------
713 : */
714 : /* Copy over this pixel of data. */
715 : /* --------------------------------------------------------------------
716 : */
717 11290600 : GPtrDiff_t iDstOffset =
718 11290600 : (static_cast<GPtrDiff_t>(iDstX) -
719 11290600 : static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
720 11290600 : (static_cast<GPtrDiff_t>(iDstY) -
721 11290600 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
722 11290600 : nBlockXSize) *
723 11290600 : nBandDataSize;
724 :
725 11290600 : if (eDataType == eBufType)
726 : {
727 11287500 : memcpy(pabyDstBlock + iDstOffset,
728 11287500 : static_cast<GByte *>(pData) + iBufOffset,
729 : nBandDataSize);
730 : }
731 : else
732 : {
733 : /* type to type conversion ... ouch, this is expensive way
734 : of handling single words */
735 3096 : GDALCopyWords64(static_cast<GByte *>(pData) + iBufOffset,
736 3096 : eBufType, 0, pabyDstBlock + iDstOffset,
737 : eDataType, 0, 1);
738 : }
739 : }
740 :
741 1093340 : if (psExtraArg->pfnProgress != nullptr &&
742 0 : !psExtraArg->pfnProgress(1.0 * (iDstY - nYOff + 1) / nYSize, "",
743 : psExtraArg->pProgressData))
744 : {
745 0 : eErr = CE_Failure;
746 0 : break;
747 : }
748 : }
749 : }
750 : else
751 : {
752 398571 : if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
753 : {
754 8687 : if ((psExtraArg->eResampleAlg == GRIORA_Cubic ||
755 2513 : psExtraArg->eResampleAlg == GRIORA_CubicSpline ||
756 2511 : psExtraArg->eResampleAlg == GRIORA_Bilinear ||
757 6179 : psExtraArg->eResampleAlg == GRIORA_Lanczos) &&
758 2968 : GetColorTable() != nullptr)
759 : {
760 0 : CPLError(CE_Warning, CPLE_NotSupported,
761 : "Resampling method not supported on paletted band. "
762 : "Falling back to nearest neighbour");
763 : }
764 3090 : else if (psExtraArg->eResampleAlg == GRIORA_Gauss &&
765 3 : GDALDataTypeIsComplex(eDataType))
766 : {
767 0 : CPLError(CE_Warning, CPLE_NotSupported,
768 : "Resampling method not supported on complex data type "
769 : "band. Falling back to nearest neighbour");
770 : }
771 : else
772 : {
773 3087 : return RasterIOResampled(eRWFlag, nXOff, nYOff, nXSize, nYSize,
774 : pData, nBufXSize, nBufYSize, eBufType,
775 3087 : nPixelSpace, nLineSpace, psExtraArg);
776 : }
777 : }
778 :
779 395483 : int nLimitBlockY = 0;
780 395483 : const bool bByteCopy = eDataType == eBufType && nBandDataSize == 1;
781 395483 : int nStartBlockX = -nBlockXSize;
782 395483 : const double EPS = 1e-10;
783 395483 : int nLBlockY = -1;
784 395483 : const double dfSrcXStart = 0.5 * dfSrcXInc + dfXOff + EPS;
785 395483 : const bool bIntegerXFactor =
786 372806 : bUseIntegerRequestCoords &&
787 669271 : static_cast<int>(dfSrcXInc) == dfSrcXInc &&
788 273788 : static_cast<int>(dfSrcXInc) < INT_MAX / nBandDataSize;
789 :
790 : /* --------------------------------------------------------------------
791 : */
792 : /* Read case */
793 : /* Loop over buffer computing source locations. */
794 : /* --------------------------------------------------------------------
795 : */
796 2451410 : for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
797 : {
798 : // Add small epsilon to avoid some numeric precision issues.
799 2055940 : const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
800 2055940 : const int iSrcY = static_cast<int>(std::min(
801 2055940 : std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
802 :
803 2055940 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
804 : static_cast<GPtrDiff_t>(nLineSpace);
805 :
806 2055940 : if (iSrcY >= nLimitBlockY)
807 : {
808 433624 : nLBlockY = iSrcY / nBlockYSize;
809 433624 : nLimitBlockY = nLBlockY * nBlockYSize;
810 433624 : if (nLimitBlockY < INT_MAX - nBlockYSize)
811 433624 : nLimitBlockY += nBlockYSize;
812 : else
813 0 : nLimitBlockY = INT_MAX;
814 : // Make sure a new block is loaded.
815 433624 : nStartBlockX = -nBlockXSize;
816 : }
817 1622320 : else if (static_cast<int>(dfSrcXStart) < nStartBlockX)
818 : {
819 : // Make sure a new block is loaded.
820 441987 : nStartBlockX = -nBlockXSize;
821 : }
822 :
823 2055940 : GPtrDiff_t iSrcOffsetCst = (iSrcY - nLBlockY * nBlockYSize) *
824 2055940 : static_cast<GPtrDiff_t>(nBlockXSize);
825 :
826 2055940 : if (bIntegerXFactor)
827 : {
828 695677 : int iSrcX = static_cast<int>(dfSrcXStart);
829 695677 : const int nSrcXInc = static_cast<int>(dfSrcXInc);
830 695677 : GByte *pabyDstData = static_cast<GByte *>(pData) + iBufOffset;
831 695677 : bool bRet = false;
832 695677 : if (bByteCopy)
833 : {
834 585768 : bRet = DownsamplingIntegerXFactor<true, 1>(
835 : this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
836 : static_cast<int>(nPixelSpace), nBufXSize, GDT_Byte,
837 : GDT_Byte, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
838 : }
839 109909 : else if (eDataType == eBufType)
840 : {
841 109704 : switch (nBandDataSize)
842 : {
843 109624 : case 2:
844 109624 : bRet = DownsamplingIntegerXFactor<true, 2>(
845 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
846 : pabyDstData, static_cast<int>(nPixelSpace),
847 : nBufXSize, eDataType, eDataType, nStartBlockX,
848 : nBlockXSize, poBlock, nLBlockY);
849 109624 : break;
850 22 : case 4:
851 22 : bRet = DownsamplingIntegerXFactor<true, 4>(
852 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
853 : pabyDstData, static_cast<int>(nPixelSpace),
854 : nBufXSize, eDataType, eDataType, nStartBlockX,
855 : nBlockXSize, poBlock, nLBlockY);
856 22 : break;
857 56 : case 8:
858 56 : bRet = DownsamplingIntegerXFactor<true, 8>(
859 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
860 : pabyDstData, static_cast<int>(nPixelSpace),
861 : nBufXSize, eDataType, eDataType, nStartBlockX,
862 : nBlockXSize, poBlock, nLBlockY);
863 56 : break;
864 2 : case 16:
865 2 : bRet = DownsamplingIntegerXFactor<true, 16>(
866 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
867 : pabyDstData, static_cast<int>(nPixelSpace),
868 : nBufXSize, eDataType, eDataType, nStartBlockX,
869 : nBlockXSize, poBlock, nLBlockY);
870 2 : break;
871 0 : default:
872 0 : CPLAssert(false);
873 : break;
874 : }
875 : }
876 : else
877 : {
878 205 : bRet = DownsamplingIntegerXFactor<false, 0>(
879 : this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
880 : static_cast<int>(nPixelSpace), nBufXSize, eDataType,
881 : eBufType, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
882 : }
883 695677 : if (!bRet)
884 1 : eErr = CE_Failure;
885 : }
886 : else
887 : {
888 1360260 : double dfSrcX = dfSrcXStart;
889 582293000 : for (int iBufXOff = 0; iBufXOff < nBufXSize;
890 580933000 : iBufXOff++, dfSrcX += dfSrcXInc)
891 : {
892 : // TODO?: try to avoid the clamping for most iterations
893 : const int iSrcX = static_cast<int>(
894 1161870000 : std::min(std::max(0.0, dfSrcX),
895 580933000 : static_cast<double>(nRasterXSize - 1)));
896 :
897 : /* --------------------------------------------------------------------
898 : */
899 : /* Ensure we have the appropriate block loaded. */
900 : /* --------------------------------------------------------------------
901 : */
902 580933000 : if (iSrcX >= nBlockXSize + nStartBlockX)
903 : {
904 1702800 : const int nLBlockX = iSrcX / nBlockXSize;
905 1702800 : nStartBlockX = nLBlockX * nBlockXSize;
906 :
907 1702800 : if (poBlock != nullptr)
908 1581100 : poBlock->DropLock();
909 :
910 1702800 : poBlock = GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
911 1702800 : if (poBlock == nullptr)
912 : {
913 9 : eErr = CE_Failure;
914 9 : break;
915 : }
916 :
917 : pabySrcBlock =
918 1702790 : static_cast<GByte *>(poBlock->GetDataRef());
919 : }
920 580933000 : const GPtrDiff_t nDiffX =
921 580933000 : static_cast<GPtrDiff_t>(iSrcX - nStartBlockX);
922 :
923 : /* --------------------------------------------------------------------
924 : */
925 : /* Copy over this pixel of data. */
926 : /* --------------------------------------------------------------------
927 : */
928 :
929 580933000 : if (bByteCopy)
930 : {
931 527231000 : GPtrDiff_t iSrcOffset = nDiffX + iSrcOffsetCst;
932 527231000 : static_cast<GByte *>(pData)[iBufOffset] =
933 527231000 : pabySrcBlock[iSrcOffset];
934 : }
935 53701600 : else if (eDataType == eBufType)
936 : {
937 48225600 : GPtrDiff_t iSrcOffset =
938 48225600 : (nDiffX + iSrcOffsetCst) * nBandDataSize;
939 48225600 : memcpy(static_cast<GByte *>(pData) + iBufOffset,
940 48225600 : pabySrcBlock + iSrcOffset, nBandDataSize);
941 : }
942 : else
943 : {
944 : // Type to type conversion ...
945 5476050 : GPtrDiff_t iSrcOffset =
946 5476050 : (nDiffX + iSrcOffsetCst) * nBandDataSize;
947 5476050 : GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
948 : static_cast<GByte *>(pData) +
949 5476050 : iBufOffset,
950 : eBufType, 0, 1);
951 : }
952 :
953 580933000 : iBufOffset += static_cast<int>(nPixelSpace);
954 : }
955 : }
956 2055940 : if (eErr == CE_Failure)
957 11 : break;
958 :
959 2287020 : if (psExtraArg->pfnProgress != nullptr &&
960 231086 : !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
961 : psExtraArg->pProgressData))
962 : {
963 1 : eErr = CE_Failure;
964 1 : break;
965 : }
966 : }
967 : }
968 :
969 562134 : if (poBlock != nullptr)
970 562124 : poBlock->DropLock();
971 :
972 562134 : return eErr;
973 : }
974 :
975 : /************************************************************************/
976 : /* GDALRasterIOTransformer() */
977 : /************************************************************************/
978 :
979 : struct GDALRasterIOTransformerStruct
980 : {
981 : double dfXOff;
982 : double dfYOff;
983 : double dfXRatioDstToSrc;
984 : double dfYRatioDstToSrc;
985 : };
986 :
987 6748 : static int GDALRasterIOTransformer(void *pTransformerArg, int bDstToSrc,
988 : int nPointCount, double *x, double *y,
989 : double * /* z */, int *panSuccess)
990 : {
991 6748 : GDALRasterIOTransformerStruct *psParams =
992 : static_cast<GDALRasterIOTransformerStruct *>(pTransformerArg);
993 6748 : if (bDstToSrc)
994 : {
995 252996 : for (int i = 0; i < nPointCount; i++)
996 : {
997 246836 : x[i] = x[i] * psParams->dfXRatioDstToSrc + psParams->dfXOff;
998 246836 : y[i] = y[i] * psParams->dfYRatioDstToSrc + psParams->dfYOff;
999 246836 : panSuccess[i] = TRUE;
1000 : }
1001 : }
1002 : else
1003 : {
1004 1176 : for (int i = 0; i < nPointCount; i++)
1005 : {
1006 588 : x[i] = (x[i] - psParams->dfXOff) / psParams->dfXRatioDstToSrc;
1007 588 : y[i] = (y[i] - psParams->dfYOff) / psParams->dfYRatioDstToSrc;
1008 588 : panSuccess[i] = TRUE;
1009 : }
1010 : }
1011 6748 : return TRUE;
1012 : }
1013 :
1014 : /************************************************************************/
1015 : /* RasterIOResampled() */
1016 : /************************************************************************/
1017 :
1018 : //! @cond Doxygen_Suppress
1019 3087 : CPLErr GDALRasterBand::RasterIOResampled(
1020 : GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
1021 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
1022 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
1023 : {
1024 : // Determine if we use warping resampling or overview resampling
1025 : const bool bUseWarp =
1026 3087 : (GDALDataTypeIsComplex(eDataType) &&
1027 3244 : psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
1028 157 : psExtraArg->eResampleAlg != GRIORA_Mode);
1029 :
1030 3087 : double dfXOff = nXOff;
1031 3087 : double dfYOff = nYOff;
1032 3087 : double dfXSize = nXSize;
1033 3087 : double dfYSize = nYSize;
1034 3087 : if (psExtraArg->bFloatingPointWindowValidity)
1035 : {
1036 2628 : dfXOff = psExtraArg->dfXOff;
1037 2628 : dfYOff = psExtraArg->dfYOff;
1038 2628 : dfXSize = psExtraArg->dfXSize;
1039 2628 : dfYSize = psExtraArg->dfYSize;
1040 : }
1041 :
1042 3087 : const double dfXRatioDstToSrc = dfXSize / nBufXSize;
1043 3087 : const double dfYRatioDstToSrc = dfYSize / nBufYSize;
1044 :
1045 : // Determine the coordinates in the "virtual" output raster to see
1046 : // if there are not integers, in which case we will use them as a shift
1047 : // so that subwindow extracts give the exact same results as entire raster
1048 : // scaling.
1049 3087 : double dfDestXOff = dfXOff / dfXRatioDstToSrc;
1050 3087 : bool bHasXOffVirtual = false;
1051 3087 : int nDestXOffVirtual = 0;
1052 3087 : if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
1053 : {
1054 2759 : bHasXOffVirtual = true;
1055 2759 : dfXOff = nXOff;
1056 2759 : nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
1057 : }
1058 :
1059 3087 : double dfDestYOff = dfYOff / dfYRatioDstToSrc;
1060 3087 : bool bHasYOffVirtual = false;
1061 3087 : int nDestYOffVirtual = 0;
1062 3087 : if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
1063 : {
1064 2755 : bHasYOffVirtual = true;
1065 2755 : dfYOff = nYOff;
1066 2755 : nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
1067 : }
1068 :
1069 : // Create a MEM dataset that wraps the output buffer.
1070 : GDALDataset *poMEMDS;
1071 3087 : void *pTempBuffer = nullptr;
1072 3087 : GSpacing nPSMem = nPixelSpace;
1073 3087 : GSpacing nLSMem = nLineSpace;
1074 3087 : void *pDataMem = pData;
1075 3087 : GDALDataType eDTMem = eBufType;
1076 3087 : if (eBufType != eDataType)
1077 : {
1078 40 : nPSMem = GDALGetDataTypeSizeBytes(eDataType);
1079 40 : nLSMem = nPSMem * nBufXSize;
1080 : pTempBuffer =
1081 40 : VSI_MALLOC2_VERBOSE(nBufYSize, static_cast<size_t>(nLSMem));
1082 40 : if (pTempBuffer == nullptr)
1083 0 : return CE_Failure;
1084 40 : pDataMem = pTempBuffer;
1085 40 : eDTMem = eDataType;
1086 : }
1087 :
1088 : poMEMDS =
1089 3087 : MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
1090 : nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr);
1091 3087 : GByte *pabyData = static_cast<GByte *>(pDataMem) -
1092 3087 : nPSMem * nDestXOffVirtual - nLSMem * nDestYOffVirtual;
1093 3087 : GDALRasterBandH hMEMBand = MEMCreateRasterBandEx(
1094 : poMEMDS, 1, pabyData, eDTMem, nPSMem, nLSMem, false);
1095 3087 : poMEMDS->SetBand(1, GDALRasterBand::FromHandle(hMEMBand));
1096 :
1097 3087 : const char *pszNBITS = GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
1098 3087 : const int nNBITS = pszNBITS ? atoi(pszNBITS) : 0;
1099 3087 : if (pszNBITS)
1100 6 : GDALRasterBand::FromHandle(hMEMBand)->SetMetadataItem(
1101 6 : "NBITS", pszNBITS, "IMAGE_STRUCTURE");
1102 :
1103 3087 : CPLErr eErr = CE_None;
1104 :
1105 : // Do the resampling.
1106 3087 : if (bUseWarp)
1107 : {
1108 149 : int bHasNoData = FALSE;
1109 149 : double dfNoDataValue = GetNoDataValue(&bHasNoData);
1110 :
1111 149 : VRTDatasetH hVRTDS = nullptr;
1112 149 : GDALRasterBandH hVRTBand = nullptr;
1113 149 : if (GetDataset() == nullptr)
1114 : {
1115 : /* Create VRT dataset that wraps the whole dataset */
1116 0 : hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
1117 0 : VRTAddBand(hVRTDS, eDataType, nullptr);
1118 0 : hVRTBand = GDALGetRasterBand(hVRTDS, 1);
1119 0 : VRTAddSimpleSource(hVRTBand, this, 0, 0, nRasterXSize, nRasterYSize,
1120 : 0, 0, nRasterXSize, nRasterYSize, nullptr,
1121 : VRT_NODATA_UNSET);
1122 :
1123 : /* Add a mask band if needed */
1124 0 : if (GetMaskFlags() != GMF_ALL_VALID)
1125 : {
1126 0 : GDALDataset::FromHandle(hVRTDS)->CreateMaskBand(0);
1127 : VRTSourcedRasterBand *poVRTMaskBand =
1128 : reinterpret_cast<VRTSourcedRasterBand *>(
1129 : reinterpret_cast<GDALRasterBand *>(hVRTBand)
1130 0 : ->GetMaskBand());
1131 0 : poVRTMaskBand->AddMaskBandSource(this, 0, 0, nRasterXSize,
1132 0 : nRasterYSize, 0, 0,
1133 0 : nRasterXSize, nRasterYSize);
1134 : }
1135 : }
1136 :
1137 149 : GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
1138 149 : switch (psExtraArg->eResampleAlg)
1139 : {
1140 0 : case GRIORA_NearestNeighbour:
1141 0 : psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
1142 0 : break;
1143 147 : case GRIORA_Bilinear:
1144 147 : psWarpOptions->eResampleAlg = GRA_Bilinear;
1145 147 : break;
1146 0 : case GRIORA_Cubic:
1147 0 : psWarpOptions->eResampleAlg = GRA_Cubic;
1148 0 : break;
1149 0 : case GRIORA_CubicSpline:
1150 0 : psWarpOptions->eResampleAlg = GRA_CubicSpline;
1151 0 : break;
1152 0 : case GRIORA_Lanczos:
1153 0 : psWarpOptions->eResampleAlg = GRA_Lanczos;
1154 0 : break;
1155 0 : case GRIORA_Average:
1156 0 : psWarpOptions->eResampleAlg = GRA_Average;
1157 0 : break;
1158 2 : case GRIORA_RMS:
1159 2 : psWarpOptions->eResampleAlg = GRA_RMS;
1160 2 : break;
1161 0 : case GRIORA_Mode:
1162 0 : psWarpOptions->eResampleAlg = GRA_Mode;
1163 0 : break;
1164 0 : default:
1165 0 : CPLAssert(false);
1166 : psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
1167 : break;
1168 : }
1169 149 : psWarpOptions->hSrcDS = hVRTDS ? hVRTDS : GetDataset();
1170 149 : psWarpOptions->hDstDS = poMEMDS;
1171 149 : psWarpOptions->nBandCount = 1;
1172 149 : int nSrcBandNumber = hVRTDS ? 1 : nBand;
1173 149 : int nDstBandNumber = 1;
1174 149 : psWarpOptions->panSrcBands = &nSrcBandNumber;
1175 149 : psWarpOptions->panDstBands = &nDstBandNumber;
1176 298 : psWarpOptions->pfnProgress = psExtraArg->pfnProgress
1177 149 : ? psExtraArg->pfnProgress
1178 : : GDALDummyProgress;
1179 149 : psWarpOptions->pProgressArg = psExtraArg->pProgressData;
1180 149 : psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
1181 149 : if (bHasNoData)
1182 : {
1183 0 : psWarpOptions->papszWarpOptions = CSLSetNameValue(
1184 : psWarpOptions->papszWarpOptions, "INIT_DEST", "NO_DATA");
1185 0 : if (psWarpOptions->padfSrcNoDataReal == nullptr)
1186 : {
1187 0 : psWarpOptions->padfSrcNoDataReal =
1188 0 : static_cast<double *>(CPLMalloc(sizeof(double)));
1189 0 : psWarpOptions->padfSrcNoDataReal[0] = dfNoDataValue;
1190 : }
1191 :
1192 0 : if (psWarpOptions->padfDstNoDataReal == nullptr)
1193 : {
1194 0 : psWarpOptions->padfDstNoDataReal =
1195 0 : static_cast<double *>(CPLMalloc(sizeof(double)));
1196 0 : psWarpOptions->padfDstNoDataReal[0] = dfNoDataValue;
1197 : }
1198 : }
1199 :
1200 : GDALRasterIOTransformerStruct sTransformer;
1201 149 : sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
1202 149 : sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
1203 149 : sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
1204 149 : sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
1205 149 : psWarpOptions->pTransformerArg = &sTransformer;
1206 :
1207 : GDALWarpOperationH hWarpOperation =
1208 149 : GDALCreateWarpOperation(psWarpOptions);
1209 149 : eErr = GDALChunkAndWarpImage(hWarpOperation, nDestXOffVirtual,
1210 : nDestYOffVirtual, nBufXSize, nBufYSize);
1211 149 : GDALDestroyWarpOperation(hWarpOperation);
1212 :
1213 149 : psWarpOptions->panSrcBands = nullptr;
1214 149 : psWarpOptions->panDstBands = nullptr;
1215 149 : GDALDestroyWarpOptions(psWarpOptions);
1216 :
1217 149 : if (hVRTDS)
1218 0 : GDALClose(hVRTDS);
1219 : }
1220 : else
1221 : {
1222 2938 : const char *pszResampling =
1223 3636 : (psExtraArg->eResampleAlg == GRIORA_Bilinear) ? "BILINEAR"
1224 822 : : (psExtraArg->eResampleAlg == GRIORA_Cubic) ? "CUBIC"
1225 246 : : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
1226 239 : : (psExtraArg->eResampleAlg == GRIORA_Lanczos) ? "LANCZOS"
1227 172 : : (psExtraArg->eResampleAlg == GRIORA_Average) ? "AVERAGE"
1228 95 : : (psExtraArg->eResampleAlg == GRIORA_RMS) ? "RMS"
1229 43 : : (psExtraArg->eResampleAlg == GRIORA_Mode) ? "MODE"
1230 3 : : (psExtraArg->eResampleAlg == GRIORA_Gauss) ? "GAUSS"
1231 : : "UNKNOWN";
1232 :
1233 2938 : int nKernelRadius = 0;
1234 : GDALResampleFunction pfnResampleFunc =
1235 2938 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
1236 2938 : CPLAssert(pfnResampleFunc);
1237 : GDALDataType eWrkDataType =
1238 2938 : GDALGetOvrWorkDataType(pszResampling, eDataType);
1239 2938 : int nHasNoData = 0;
1240 2938 : double dfNoDataValue = GetNoDataValue(&nHasNoData);
1241 2938 : const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
1242 2938 : if (!bHasNoData)
1243 2848 : dfNoDataValue = 0.0;
1244 :
1245 2938 : int nDstBlockXSize = nBufXSize;
1246 2938 : int nDstBlockYSize = nBufYSize;
1247 2938 : int nFullResXChunk = 0;
1248 2938 : int nFullResYChunk = 0;
1249 : while (true)
1250 : {
1251 2949 : nFullResXChunk =
1252 2949 : 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
1253 2949 : nFullResYChunk =
1254 2949 : 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
1255 2949 : if (nFullResXChunk > nRasterXSize)
1256 2666 : nFullResXChunk = nRasterXSize;
1257 2949 : if (nFullResYChunk > nRasterYSize)
1258 267 : nFullResYChunk = nRasterYSize;
1259 2949 : if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
1260 2895 : (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
1261 : 1024 * 1024))
1262 : break;
1263 : // When operating on the full width of a raster whose block width is
1264 : // the raster width, prefer doing chunks in height.
1265 11 : if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
1266 : nDstBlockYSize > 1)
1267 0 : nDstBlockYSize /= 2;
1268 : /* Otherwise cut the maximal dimension */
1269 11 : else if (nDstBlockXSize > 1 &&
1270 0 : (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
1271 11 : nDstBlockXSize /= 2;
1272 : else
1273 0 : nDstBlockYSize /= 2;
1274 : }
1275 :
1276 2938 : int nOvrXFactor = static_cast<int>(0.5 + dfXRatioDstToSrc);
1277 2938 : int nOvrYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
1278 2938 : if (nOvrXFactor == 0)
1279 2029 : nOvrXFactor = 1;
1280 2938 : if (nOvrYFactor == 0)
1281 2028 : nOvrYFactor = 1;
1282 2938 : int nFullResXSizeQueried =
1283 2938 : nFullResXChunk + 2 * nKernelRadius * nOvrXFactor;
1284 2938 : int nFullResYSizeQueried =
1285 2938 : nFullResYChunk + 2 * nKernelRadius * nOvrYFactor;
1286 :
1287 2938 : if (nFullResXSizeQueried > nRasterXSize)
1288 2558 : nFullResXSizeQueried = nRasterXSize;
1289 2938 : if (nFullResYSizeQueried > nRasterYSize)
1290 156 : nFullResYSizeQueried = nRasterYSize;
1291 :
1292 : void *pChunk =
1293 2938 : VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
1294 : nFullResXSizeQueried, nFullResYSizeQueried);
1295 2938 : GByte *pabyChunkNoDataMask = nullptr;
1296 :
1297 2938 : GDALRasterBand *poMaskBand = GetMaskBand();
1298 2938 : int l_nMaskFlags = GetMaskFlags();
1299 :
1300 2938 : bool bUseNoDataMask = ((l_nMaskFlags & GMF_ALL_VALID) == 0);
1301 2938 : if (bUseNoDataMask)
1302 : {
1303 158 : pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
1304 : nFullResXSizeQueried, nFullResYSizeQueried));
1305 : }
1306 2938 : if (pChunk == nullptr ||
1307 158 : (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
1308 : {
1309 0 : GDALClose(poMEMDS);
1310 0 : CPLFree(pChunk);
1311 0 : CPLFree(pabyChunkNoDataMask);
1312 0 : VSIFree(pTempBuffer);
1313 0 : return CE_Failure;
1314 : }
1315 :
1316 2938 : const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
1317 2938 : DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
1318 2938 : int nBlocksDone = 0;
1319 :
1320 : int nDstYOff;
1321 5876 : for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
1322 2938 : nDstYOff += nDstBlockYSize)
1323 : {
1324 : int nDstYCount;
1325 2938 : if (nDstYOff + nDstBlockYSize <= nBufYSize)
1326 2938 : nDstYCount = nDstBlockYSize;
1327 : else
1328 0 : nDstYCount = nBufYSize - nDstYOff;
1329 :
1330 2938 : int nChunkYOff =
1331 2938 : nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
1332 2938 : int nChunkYOff2 = nYOff + 1 +
1333 2938 : static_cast<int>(ceil((nDstYOff + nDstYCount) *
1334 : dfYRatioDstToSrc));
1335 2938 : if (nChunkYOff2 > nRasterYSize)
1336 391 : nChunkYOff2 = nRasterYSize;
1337 2938 : int nYCount = nChunkYOff2 - nChunkYOff;
1338 2938 : CPLAssert(nYCount <= nFullResYChunk);
1339 :
1340 2938 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrYFactor;
1341 2938 : int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrYFactor;
1342 2938 : if (nChunkYOffQueried < 0)
1343 : {
1344 291 : nChunkYSizeQueried += nChunkYOffQueried;
1345 291 : nChunkYOffQueried = 0;
1346 : }
1347 2938 : if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
1348 394 : nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
1349 2938 : CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
1350 :
1351 2938 : int nDstXOff = 0;
1352 5876 : for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
1353 2938 : nDstXOff += nDstBlockXSize)
1354 : {
1355 2938 : int nDstXCount = 0;
1356 2938 : if (nDstXOff + nDstBlockXSize <= nBufXSize)
1357 2938 : nDstXCount = nDstBlockXSize;
1358 : else
1359 0 : nDstXCount = nBufXSize - nDstXOff;
1360 :
1361 2938 : int nChunkXOff =
1362 2938 : nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
1363 2938 : int nChunkXOff2 =
1364 2938 : nXOff + 1 +
1365 2938 : static_cast<int>(
1366 2938 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
1367 2938 : if (nChunkXOff2 > nRasterXSize)
1368 2691 : nChunkXOff2 = nRasterXSize;
1369 2938 : int nXCount = nChunkXOff2 - nChunkXOff;
1370 2938 : CPLAssert(nXCount <= nFullResXChunk);
1371 :
1372 2938 : int nChunkXOffQueried =
1373 2938 : nChunkXOff - nKernelRadius * nOvrXFactor;
1374 2938 : int nChunkXSizeQueried =
1375 2938 : nXCount + 2 * nKernelRadius * nOvrXFactor;
1376 2938 : if (nChunkXOffQueried < 0)
1377 : {
1378 2595 : nChunkXSizeQueried += nChunkXOffQueried;
1379 2595 : nChunkXOffQueried = 0;
1380 : }
1381 2938 : if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
1382 2581 : nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
1383 2938 : CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
1384 :
1385 : // Read the source buffers.
1386 2938 : eErr = RasterIO(GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1387 : nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
1388 : nChunkXSizeQueried, nChunkYSizeQueried,
1389 : eWrkDataType, 0, 0, nullptr);
1390 :
1391 2938 : bool bSkipResample = false;
1392 2938 : bool bNoDataMaskFullyOpaque = false;
1393 2938 : if (eErr == CE_None && bUseNoDataMask)
1394 : {
1395 158 : eErr = poMaskBand->RasterIO(
1396 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1397 : nChunkXSizeQueried, nChunkYSizeQueried,
1398 : pabyChunkNoDataMask, nChunkXSizeQueried,
1399 : nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
1400 :
1401 : /* Optimizations if mask if fully opaque or transparent */
1402 158 : int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
1403 158 : GByte bVal = pabyChunkNoDataMask[0];
1404 158 : int i = 1;
1405 3751650 : for (; i < nPixels; i++)
1406 : {
1407 3751590 : if (pabyChunkNoDataMask[i] != bVal)
1408 104 : break;
1409 : }
1410 158 : if (i == nPixels)
1411 : {
1412 54 : if (bVal == 0)
1413 : {
1414 712 : for (int j = 0; j < nDstYCount; j++)
1415 : {
1416 686 : GDALCopyWords64(&dfNoDataValue, GDT_Float64, 0,
1417 : static_cast<GByte *>(pDataMem) +
1418 686 : nLSMem * (j + nDstYOff) +
1419 686 : nDstXOff * nPSMem,
1420 : eDTMem,
1421 : static_cast<int>(nPSMem),
1422 : nDstXCount);
1423 : }
1424 26 : bSkipResample = true;
1425 : }
1426 : else
1427 : {
1428 28 : bNoDataMaskFullyOpaque = true;
1429 : }
1430 : }
1431 : }
1432 :
1433 2938 : if (!bSkipResample && eErr == CE_None)
1434 : {
1435 2909 : const bool bPropagateNoData = false;
1436 2909 : void *pDstBuffer = nullptr;
1437 2909 : GDALDataType eDstBufferDataType = GDT_Unknown;
1438 : GDALRasterBand *poMEMBand =
1439 2909 : GDALRasterBand::FromHandle(hMEMBand);
1440 2909 : GDALOverviewResampleArgs args;
1441 2909 : args.eSrcDataType = eDataType;
1442 2909 : args.eOvrDataType = poMEMBand->GetRasterDataType();
1443 2909 : args.nOvrXSize = poMEMBand->GetXSize();
1444 2909 : args.nOvrYSize = poMEMBand->GetYSize();
1445 2909 : args.nOvrNBITS = nNBITS;
1446 2909 : args.dfXRatioDstToSrc = dfXRatioDstToSrc;
1447 2909 : args.dfYRatioDstToSrc = dfYRatioDstToSrc;
1448 2909 : args.dfSrcXDelta =
1449 2909 : dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
1450 2909 : args.dfSrcYDelta =
1451 2909 : dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
1452 2909 : args.eWrkDataType = eWrkDataType;
1453 2909 : args.pabyChunkNodataMask =
1454 2909 : bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask;
1455 2909 : args.nChunkXOff =
1456 2909 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
1457 2909 : args.nChunkXSize = nChunkXSizeQueried;
1458 2909 : args.nChunkYOff =
1459 2909 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
1460 2909 : args.nChunkYSize = nChunkYSizeQueried;
1461 2909 : args.nDstXOff = nDstXOff + nDestXOffVirtual;
1462 2909 : args.nDstXOff2 = nDstXOff + nDestXOffVirtual + nDstXCount;
1463 2909 : args.nDstYOff = nDstYOff + nDestYOffVirtual;
1464 2909 : args.nDstYOff2 = nDstYOff + nDestYOffVirtual + nDstYCount;
1465 2909 : args.pszResampling = pszResampling;
1466 2909 : args.bHasNoData = bHasNoData;
1467 2909 : args.dfNoDataValue = dfNoDataValue;
1468 2909 : args.poColorTable = GetColorTable();
1469 2909 : args.bPropagateNoData = bPropagateNoData;
1470 2909 : eErr = pfnResampleFunc(args, pChunk, &pDstBuffer,
1471 : &eDstBufferDataType);
1472 2909 : if (eErr == CE_None)
1473 : {
1474 2909 : eErr = poMEMBand->RasterIO(
1475 : GF_Write, nDstXOff + nDestXOffVirtual,
1476 : nDstYOff + nDestYOffVirtual, nDstXCount, nDstYCount,
1477 : pDstBuffer, nDstXCount, nDstYCount,
1478 : eDstBufferDataType, 0, 0, nullptr);
1479 : }
1480 2909 : CPLFree(pDstBuffer);
1481 : }
1482 :
1483 2938 : nBlocksDone++;
1484 3363 : if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
1485 425 : !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
1486 : "", psExtraArg->pProgressData))
1487 : {
1488 1 : eErr = CE_Failure;
1489 : }
1490 : }
1491 : }
1492 :
1493 2938 : CPLFree(pChunk);
1494 2938 : CPLFree(pabyChunkNoDataMask);
1495 : }
1496 :
1497 3087 : if (eBufType != eDataType)
1498 : {
1499 40 : CPL_IGNORE_RET_VAL(poMEMDS->GetRasterBand(1)->RasterIO(
1500 : GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
1501 : pData, nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
1502 : nullptr));
1503 : }
1504 3087 : GDALClose(poMEMDS);
1505 3087 : VSIFree(pTempBuffer);
1506 :
1507 3087 : return eErr;
1508 : }
1509 :
1510 : /************************************************************************/
1511 : /* RasterIOResampled() */
1512 : /************************************************************************/
1513 :
1514 798 : CPLErr GDALDataset::RasterIOResampled(
1515 : GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
1516 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
1517 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
1518 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
1519 :
1520 : {
1521 : #if 0
1522 : // Determine if we use warping resampling or overview resampling
1523 : bool bUseWarp = false;
1524 : if( GDALDataTypeIsComplex( eDataType ) )
1525 : bUseWarp = true;
1526 : #endif
1527 :
1528 798 : double dfXOff = nXOff;
1529 798 : double dfYOff = nYOff;
1530 798 : double dfXSize = nXSize;
1531 798 : double dfYSize = nYSize;
1532 798 : if (psExtraArg->bFloatingPointWindowValidity)
1533 : {
1534 678 : dfXOff = psExtraArg->dfXOff;
1535 678 : dfYOff = psExtraArg->dfYOff;
1536 678 : dfXSize = psExtraArg->dfXSize;
1537 678 : dfYSize = psExtraArg->dfYSize;
1538 : }
1539 :
1540 798 : const double dfXRatioDstToSrc = dfXSize / nBufXSize;
1541 798 : const double dfYRatioDstToSrc = dfYSize / nBufYSize;
1542 :
1543 : // Determine the coordinates in the "virtual" output raster to see
1544 : // if there are not integers, in which case we will use them as a shift
1545 : // so that subwindow extracts give the exact same results as entire raster
1546 : // scaling.
1547 798 : double dfDestXOff = dfXOff / dfXRatioDstToSrc;
1548 798 : bool bHasXOffVirtual = false;
1549 798 : int nDestXOffVirtual = 0;
1550 798 : if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
1551 : {
1552 670 : bHasXOffVirtual = true;
1553 670 : dfXOff = nXOff;
1554 670 : nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
1555 : }
1556 :
1557 798 : double dfDestYOff = dfYOff / dfYRatioDstToSrc;
1558 798 : bool bHasYOffVirtual = false;
1559 798 : int nDestYOffVirtual = 0;
1560 798 : if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
1561 : {
1562 631 : bHasYOffVirtual = true;
1563 631 : dfYOff = nYOff;
1564 631 : nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
1565 : }
1566 :
1567 : // Create a MEM dataset that wraps the output buffer.
1568 : GDALDataset *poMEMDS =
1569 798 : MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
1570 : nDestYOffVirtual + nBufYSize, 0, eBufType, nullptr);
1571 : GDALRasterBand **papoDstBands = static_cast<GDALRasterBand **>(
1572 796 : CPLMalloc(nBandCount * sizeof(GDALRasterBand *)));
1573 797 : int nNBITS = 0;
1574 2516 : for (int i = 0; i < nBandCount; i++)
1575 : {
1576 1719 : char szBuffer[32] = {'\0'};
1577 3445 : int nRet = CPLPrintPointer(
1578 : szBuffer,
1579 1719 : static_cast<GByte *>(pData) - nPixelSpace * nDestXOffVirtual -
1580 1719 : nLineSpace * nDestYOffVirtual + nBandSpace * i,
1581 : sizeof(szBuffer));
1582 1726 : szBuffer[nRet] = 0;
1583 :
1584 1726 : char szBuffer0[64] = {'\0'};
1585 1726 : snprintf(szBuffer0, sizeof(szBuffer0), "DATAPOINTER=%s", szBuffer);
1586 :
1587 1726 : char szBuffer1[64] = {'\0'};
1588 1726 : snprintf(szBuffer1, sizeof(szBuffer1), "PIXELOFFSET=" CPL_FRMT_GIB,
1589 : static_cast<GIntBig>(nPixelSpace));
1590 :
1591 1726 : char szBuffer2[64] = {'\0'};
1592 1726 : snprintf(szBuffer2, sizeof(szBuffer2), "LINEOFFSET=" CPL_FRMT_GIB,
1593 : static_cast<GIntBig>(nLineSpace));
1594 :
1595 1726 : char *apszOptions[4] = {szBuffer0, szBuffer1, szBuffer2, nullptr};
1596 :
1597 1726 : poMEMDS->AddBand(eBufType, apszOptions);
1598 :
1599 1732 : GDALRasterBand *poSrcBand = GetRasterBand(panBandMap[i]);
1600 1728 : papoDstBands[i] = poMEMDS->GetRasterBand(i + 1);
1601 : const char *pszNBITS =
1602 1724 : poSrcBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
1603 1719 : if (pszNBITS)
1604 : {
1605 0 : nNBITS = atoi(pszNBITS);
1606 0 : poMEMDS->GetRasterBand(i + 1)->SetMetadataItem("NBITS", pszNBITS,
1607 0 : "IMAGE_STRUCTURE");
1608 : }
1609 : }
1610 :
1611 797 : CPLErr eErr = CE_None;
1612 :
1613 : // TODO(schwehr): Why disabled? Why not just delete?
1614 : // Looks like this code was initially added as disable by copying
1615 : // from RasterIO here:
1616 : // https://trac.osgeo.org/gdal/changeset/29572
1617 : #if 0
1618 : // Do the resampling.
1619 : if( bUseWarp )
1620 : {
1621 : VRTDatasetH hVRTDS = nullptr;
1622 : GDALRasterBandH hVRTBand = nullptr;
1623 : if( GetDataset() == nullptr )
1624 : {
1625 : /* Create VRT dataset that wraps the whole dataset */
1626 : hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
1627 : VRTAddBand( hVRTDS, eDataType, nullptr );
1628 : hVRTBand = GDALGetRasterBand(hVRTDS, 1);
1629 : VRTAddSimpleSource( (VRTSourcedRasterBandH)hVRTBand,
1630 : (GDALRasterBandH)this,
1631 : 0, 0,
1632 : nRasterXSize, nRasterYSize,
1633 : 0, 0,
1634 : nRasterXSize, nRasterYSize,
1635 : nullptr, VRT_NODATA_UNSET );
1636 :
1637 : /* Add a mask band if needed */
1638 : if( GetMaskFlags() != GMF_ALL_VALID )
1639 : {
1640 : ((GDALDataset*)hVRTDS)->CreateMaskBand(0);
1641 : VRTSourcedRasterBand* poVRTMaskBand =
1642 : (VRTSourcedRasterBand*)(((GDALRasterBand*)hVRTBand)->GetMaskBand());
1643 : poVRTMaskBand->
1644 : AddMaskBandSource( this,
1645 : 0, 0,
1646 : nRasterXSize, nRasterYSize,
1647 : 0, 0,
1648 : nRasterXSize, nRasterYSize);
1649 : }
1650 : }
1651 :
1652 : GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
1653 : psWarpOptions->eResampleAlg = (GDALResampleAlg)psExtraArg->eResampleAlg;
1654 : psWarpOptions->hSrcDS = (GDALDatasetH) (hVRTDS ? hVRTDS : GetDataset());
1655 : psWarpOptions->hDstDS = (GDALDatasetH) poMEMDS;
1656 : psWarpOptions->nBandCount = 1;
1657 : int nSrcBandNumber = (hVRTDS ? 1 : nBand);
1658 : int nDstBandNumber = 1;
1659 : psWarpOptions->panSrcBands = &nSrcBandNumber;
1660 : psWarpOptions->panDstBands = &nDstBandNumber;
1661 : psWarpOptions->pfnProgress = psExtraArg->pfnProgress ?
1662 : psExtraArg->pfnProgress : GDALDummyProgress;
1663 : psWarpOptions->pProgressArg = psExtraArg->pProgressData;
1664 : psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
1665 : GDALRasterIOTransformerStruct sTransformer;
1666 : sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
1667 : sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
1668 : sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
1669 : sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
1670 : psWarpOptions->pTransformerArg = &sTransformer;
1671 :
1672 : GDALWarpOperationH hWarpOperation = GDALCreateWarpOperation(psWarpOptions);
1673 : eErr = GDALChunkAndWarpImage( hWarpOperation,
1674 : nDestXOffVirtual, nDestYOffVirtual,
1675 : nBufXSize, nBufYSize );
1676 : GDALDestroyWarpOperation( hWarpOperation );
1677 :
1678 : psWarpOptions->panSrcBands = nullptr;
1679 : psWarpOptions->panDstBands = nullptr;
1680 : GDALDestroyWarpOptions( psWarpOptions );
1681 :
1682 : if( hVRTDS )
1683 : GDALClose(hVRTDS);
1684 : }
1685 : else
1686 : #endif
1687 : {
1688 797 : const char *pszResampling =
1689 1471 : (psExtraArg->eResampleAlg == GRIORA_Bilinear) ? "BILINEAR"
1690 674 : : (psExtraArg->eResampleAlg == GRIORA_Cubic) ? "CUBIC"
1691 0 : : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
1692 0 : : (psExtraArg->eResampleAlg == GRIORA_Lanczos) ? "LANCZOS"
1693 0 : : (psExtraArg->eResampleAlg == GRIORA_Average) ? "AVERAGE"
1694 0 : : (psExtraArg->eResampleAlg == GRIORA_RMS) ? "RMS"
1695 0 : : (psExtraArg->eResampleAlg == GRIORA_Mode) ? "MODE"
1696 0 : : (psExtraArg->eResampleAlg == GRIORA_Gauss) ? "GAUSS"
1697 : : "UNKNOWN";
1698 :
1699 797 : GDALRasterBand *poFirstSrcBand = GetRasterBand(panBandMap[0]);
1700 795 : GDALDataType eDataType = poFirstSrcBand->GetRasterDataType();
1701 : int nBlockXSize, nBlockYSize;
1702 794 : poFirstSrcBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
1703 :
1704 : int nKernelRadius;
1705 : GDALResampleFunction pfnResampleFunc =
1706 793 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
1707 794 : CPLAssert(pfnResampleFunc);
1708 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1709 : GDALResampleFunctionMultiBands pfnResampleFuncMultiBands =
1710 : GDALGetResampleFunctionMultiBands(pszResampling, &nKernelRadius);
1711 : #endif
1712 : GDALDataType eWrkDataType =
1713 794 : GDALGetOvrWorkDataType(pszResampling, eDataType);
1714 :
1715 792 : int nDstBlockXSize = nBufXSize;
1716 792 : int nDstBlockYSize = nBufYSize;
1717 : int nFullResXChunk, nFullResYChunk;
1718 : while (true)
1719 : {
1720 792 : nFullResXChunk =
1721 792 : 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
1722 792 : nFullResYChunk =
1723 792 : 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
1724 792 : if (nFullResXChunk > nRasterXSize)
1725 574 : nFullResXChunk = nRasterXSize;
1726 792 : if (nFullResYChunk > nRasterYSize)
1727 45 : nFullResYChunk = nRasterYSize;
1728 792 : if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
1729 790 : (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
1730 : 1024 * 1024))
1731 : break;
1732 : // When operating on the full width of a raster whose block width is
1733 : // the raster width, prefer doing chunks in height.
1734 0 : if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
1735 : nDstBlockYSize > 1)
1736 0 : nDstBlockYSize /= 2;
1737 : /* Otherwise cut the maximal dimension */
1738 0 : else if (nDstBlockXSize > 1 &&
1739 0 : (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
1740 0 : nDstBlockXSize /= 2;
1741 : else
1742 0 : nDstBlockYSize /= 2;
1743 : }
1744 :
1745 1581 : int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
1746 792 : static_cast<int>(0.5 + dfYRatioDstToSrc));
1747 789 : if (nOvrFactor == 0)
1748 98 : nOvrFactor = 1;
1749 789 : int nFullResXSizeQueried =
1750 789 : nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
1751 789 : int nFullResYSizeQueried =
1752 789 : nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
1753 :
1754 789 : if (nFullResXSizeQueried > nRasterXSize)
1755 599 : nFullResXSizeQueried = nRasterXSize;
1756 789 : if (nFullResYSizeQueried > nRasterYSize)
1757 48 : nFullResYSizeQueried = nRasterYSize;
1758 :
1759 789 : void *pChunk = VSI_MALLOC3_VERBOSE(
1760 : cpl::fits_on<int>(GDALGetDataTypeSizeBytes(eWrkDataType) *
1761 : nBandCount),
1762 : nFullResXSizeQueried, nFullResYSizeQueried);
1763 800 : GByte *pabyChunkNoDataMask = nullptr;
1764 :
1765 800 : GDALRasterBand *poMaskBand = poFirstSrcBand->GetMaskBand();
1766 798 : int nMaskFlags = poFirstSrcBand->GetMaskFlags();
1767 :
1768 796 : bool bUseNoDataMask = ((nMaskFlags & GMF_ALL_VALID) == 0);
1769 796 : if (bUseNoDataMask)
1770 : {
1771 531 : pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
1772 : nFullResXSizeQueried, nFullResYSizeQueried));
1773 : }
1774 796 : if (pChunk == nullptr ||
1775 531 : (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
1776 : {
1777 2 : GDALClose(poMEMDS);
1778 0 : CPLFree(pChunk);
1779 0 : CPLFree(pabyChunkNoDataMask);
1780 0 : CPLFree(papoDstBands);
1781 0 : return CE_Failure;
1782 : }
1783 :
1784 794 : const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
1785 794 : DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
1786 794 : int nBlocksDone = 0;
1787 :
1788 : int nDstYOff;
1789 1599 : for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
1790 805 : nDstYOff += nDstBlockYSize)
1791 : {
1792 : int nDstYCount;
1793 796 : if (nDstYOff + nDstBlockYSize <= nBufYSize)
1794 798 : nDstYCount = nDstBlockYSize;
1795 : else
1796 0 : nDstYCount = nBufYSize - nDstYOff;
1797 :
1798 796 : int nChunkYOff =
1799 796 : nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
1800 796 : int nChunkYOff2 = nYOff + 1 +
1801 796 : static_cast<int>(ceil((nDstYOff + nDstYCount) *
1802 : dfYRatioDstToSrc));
1803 796 : if (nChunkYOff2 > nRasterYSize)
1804 103 : nChunkYOff2 = nRasterYSize;
1805 796 : int nYCount = nChunkYOff2 - nChunkYOff;
1806 796 : CPLAssert(nYCount <= nFullResYChunk);
1807 :
1808 796 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
1809 796 : int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
1810 796 : if (nChunkYOffQueried < 0)
1811 : {
1812 106 : nChunkYSizeQueried += nChunkYOffQueried;
1813 106 : nChunkYOffQueried = 0;
1814 : }
1815 796 : if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
1816 121 : nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
1817 796 : CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
1818 :
1819 : int nDstXOff;
1820 1597 : for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
1821 801 : nDstXOff += nDstBlockXSize)
1822 : {
1823 : int nDstXCount;
1824 792 : if (nDstXOff + nDstBlockXSize <= nBufXSize)
1825 794 : nDstXCount = nDstBlockXSize;
1826 : else
1827 0 : nDstXCount = nBufXSize - nDstXOff;
1828 :
1829 792 : int nChunkXOff =
1830 792 : nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
1831 792 : int nChunkXOff2 =
1832 792 : nXOff + 1 +
1833 792 : static_cast<int>(
1834 792 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
1835 792 : if (nChunkXOff2 > nRasterXSize)
1836 606 : nChunkXOff2 = nRasterXSize;
1837 792 : int nXCount = nChunkXOff2 - nChunkXOff;
1838 792 : CPLAssert(nXCount <= nFullResXChunk);
1839 :
1840 792 : int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
1841 792 : int nChunkXSizeQueried =
1842 792 : nXCount + 2 * nKernelRadius * nOvrFactor;
1843 792 : if (nChunkXOffQueried < 0)
1844 : {
1845 600 : nChunkXSizeQueried += nChunkXOffQueried;
1846 600 : nChunkXOffQueried = 0;
1847 : }
1848 792 : if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
1849 610 : nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
1850 792 : CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
1851 :
1852 792 : bool bSkipResample = false;
1853 792 : bool bNoDataMaskFullyOpaque = false;
1854 792 : if (eErr == CE_None && bUseNoDataMask)
1855 : {
1856 531 : eErr = poMaskBand->RasterIO(
1857 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1858 : nChunkXSizeQueried, nChunkYSizeQueried,
1859 : pabyChunkNoDataMask, nChunkXSizeQueried,
1860 : nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
1861 :
1862 : /* Optimizations if mask if fully opaque or transparent */
1863 531 : const int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
1864 531 : const GByte bVal = pabyChunkNoDataMask[0];
1865 531 : int i = 1; // Used after for.
1866 23952400 : for (; i < nPixels; i++)
1867 : {
1868 23951900 : if (pabyChunkNoDataMask[i] != bVal)
1869 72 : break;
1870 : }
1871 531 : if (i == nPixels)
1872 : {
1873 459 : if (bVal == 0)
1874 : {
1875 373 : GByte abyZero[16] = {0};
1876 780 : for (int iBand = 0; iBand < nBandCount; iBand++)
1877 : {
1878 3499 : for (int j = 0; j < nDstYCount; j++)
1879 : {
1880 3092 : GDALCopyWords64(
1881 : abyZero, GDT_Byte, 0,
1882 : static_cast<GByte *>(pData) +
1883 3092 : iBand * nBandSpace +
1884 3092 : nLineSpace * (j + nDstYOff) +
1885 3092 : nDstXOff * nPixelSpace,
1886 : eBufType, static_cast<int>(nPixelSpace),
1887 : nDstXCount);
1888 : }
1889 : }
1890 373 : bSkipResample = true;
1891 : }
1892 : else
1893 : {
1894 86 : bNoDataMaskFullyOpaque = true;
1895 : }
1896 : }
1897 : }
1898 :
1899 792 : if (!bSkipResample && eErr == CE_None)
1900 : {
1901 : /* Read the source buffers */
1902 414 : eErr = RasterIO(
1903 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1904 : nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
1905 : nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType,
1906 : nBandCount, panBandMap, 0, 0, 0, nullptr);
1907 : }
1908 :
1909 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1910 : if (pfnResampleFuncMultiBands && !bSkipResample &&
1911 : eErr == CE_None)
1912 : {
1913 : eErr = pfnResampleFuncMultiBands(
1914 : dfXRatioDstToSrc, dfYRatioDstToSrc,
1915 : dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
1916 : dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
1917 : eWrkDataType, (GByte *)pChunk, nBandCount,
1918 : bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
1919 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
1920 : nChunkXSizeQueried,
1921 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
1922 : nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
1923 : nDstXOff + nDestXOffVirtual + nDstXCount,
1924 : nDstYOff + nDestYOffVirtual,
1925 : nDstYOff + nDestYOffVirtual + nDstYCount, papoDstBands,
1926 : pszResampling, FALSE /*bHasNoData*/,
1927 : 0.0 /* dfNoDataValue */, nullptr /* color table*/,
1928 : eDataType);
1929 : }
1930 : else
1931 : #endif
1932 : {
1933 : size_t nChunkBandOffset =
1934 802 : static_cast<size_t>(nChunkXSizeQueried) *
1935 802 : nChunkYSizeQueried *
1936 802 : GDALGetDataTypeSizeBytes(eWrkDataType);
1937 2118 : for (int i = 0;
1938 2118 : i < nBandCount && !bSkipResample && eErr == CE_None;
1939 : i++)
1940 : {
1941 1317 : const bool bPropagateNoData = false;
1942 1317 : void *pDstBuffer = nullptr;
1943 1317 : GDALDataType eDstBufferDataType = GDT_Unknown;
1944 : GDALRasterBand *poMEMBand =
1945 1317 : poMEMDS->GetRasterBand(i + 1);
1946 1316 : GDALOverviewResampleArgs args;
1947 1316 : args.eSrcDataType = eDataType;
1948 1316 : args.eOvrDataType = poMEMBand->GetRasterDataType();
1949 1317 : args.nOvrXSize = poMEMBand->GetXSize();
1950 1315 : args.nOvrYSize = poMEMBand->GetYSize();
1951 1315 : args.nOvrNBITS = nNBITS;
1952 1315 : args.dfXRatioDstToSrc = dfXRatioDstToSrc;
1953 1315 : args.dfYRatioDstToSrc = dfYRatioDstToSrc;
1954 1315 : args.dfSrcXDelta =
1955 1315 : dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
1956 1315 : args.dfSrcYDelta =
1957 1315 : dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
1958 1315 : args.eWrkDataType = eWrkDataType;
1959 1315 : args.pabyChunkNodataMask = bNoDataMaskFullyOpaque
1960 1315 : ? nullptr
1961 : : pabyChunkNoDataMask;
1962 1315 : args.nChunkXOff =
1963 1315 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
1964 1315 : args.nChunkXSize = nChunkXSizeQueried;
1965 1315 : args.nChunkYOff =
1966 1315 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
1967 1315 : args.nChunkYSize = nChunkYSizeQueried;
1968 1315 : args.nDstXOff = nDstXOff + nDestXOffVirtual;
1969 1315 : args.nDstXOff2 =
1970 1315 : nDstXOff + nDestXOffVirtual + nDstXCount;
1971 1315 : args.nDstYOff = nDstYOff + nDestYOffVirtual;
1972 1315 : args.nDstYOff2 =
1973 1315 : nDstYOff + nDestYOffVirtual + nDstYCount;
1974 1315 : args.pszResampling = pszResampling;
1975 1315 : args.bHasNoData = false;
1976 1315 : args.dfNoDataValue = 0.0;
1977 1315 : args.poColorTable = nullptr;
1978 1315 : args.bPropagateNoData = bPropagateNoData;
1979 :
1980 : eErr =
1981 2633 : pfnResampleFunc(args,
1982 1315 : reinterpret_cast<GByte *>(pChunk) +
1983 1315 : i * nChunkBandOffset,
1984 : &pDstBuffer, &eDstBufferDataType);
1985 1318 : if (eErr == CE_None)
1986 : {
1987 1318 : eErr = poMEMBand->RasterIO(
1988 : GF_Write, nDstXOff + nDestXOffVirtual,
1989 : nDstYOff + nDestYOffVirtual, nDstXCount,
1990 : nDstYCount, pDstBuffer, nDstXCount, nDstYCount,
1991 : eDstBufferDataType, 0, 0, nullptr);
1992 : }
1993 1318 : CPLFree(pDstBuffer);
1994 : }
1995 : }
1996 :
1997 801 : nBlocksDone++;
1998 1190 : if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
1999 389 : !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
2000 : "", psExtraArg->pProgressData))
2001 : {
2002 0 : eErr = CE_Failure;
2003 : }
2004 : }
2005 : }
2006 :
2007 803 : CPLFree(pChunk);
2008 800 : CPLFree(pabyChunkNoDataMask);
2009 : }
2010 :
2011 800 : CPLFree(papoDstBands);
2012 800 : GDALClose(poMEMDS);
2013 :
2014 800 : return eErr;
2015 : }
2016 :
2017 : //! @endcond
2018 :
2019 : /************************************************************************/
2020 : /* GDALSwapWords() */
2021 : /************************************************************************/
2022 :
2023 : /**
2024 : * Byte swap words in-place.
2025 : *
2026 : * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
2027 : * a memory array. No assumption is made that the words being swapped are
2028 : * word aligned in memory. Use the CPL_LSB and CPL_MSB macros from cpl_port.h
2029 : * to determine if the current platform is big endian or little endian. Use
2030 : * The macros like CPL_SWAP32() to byte swap single values without the overhead
2031 : * of a function call.
2032 : *
2033 : * @param pData pointer to start of data buffer.
2034 : * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
2035 : * @param nWordCount the number of words to be swapped in this call.
2036 : * @param nWordSkip the byte offset from the start of one word to the start of
2037 : * the next. For packed buffers this is the same as nWordSize.
2038 : */
2039 :
2040 497137 : void CPL_STDCALL GDALSwapWords(void *pData, int nWordSize, int nWordCount,
2041 : int nWordSkip)
2042 :
2043 : {
2044 497137 : if (nWordCount > 0)
2045 497137 : VALIDATE_POINTER0(pData, "GDALSwapWords");
2046 :
2047 497137 : GByte *pabyData = static_cast<GByte *>(pData);
2048 :
2049 497137 : switch (nWordSize)
2050 : {
2051 7234 : case 1:
2052 7234 : break;
2053 :
2054 476903 : case 2:
2055 476903 : CPLAssert(nWordSkip >= 2 || nWordCount == 1);
2056 228062000 : for (int i = 0; i < nWordCount; i++)
2057 : {
2058 227585000 : CPL_SWAP16PTR(pabyData);
2059 227585000 : pabyData += nWordSkip;
2060 : }
2061 476903 : break;
2062 :
2063 10574 : case 4:
2064 10574 : CPLAssert(nWordSkip >= 4 || nWordCount == 1);
2065 10574 : if (CPL_IS_ALIGNED(pabyData, 4) && (nWordSkip % 4) == 0)
2066 : {
2067 29140500 : for (int i = 0; i < nWordCount; i++)
2068 : {
2069 29130000 : *reinterpret_cast<GUInt32 *>(pabyData) = CPL_SWAP32(
2070 : *reinterpret_cast<const GUInt32 *>(pabyData));
2071 29130000 : pabyData += nWordSkip;
2072 10571 : }
2073 : }
2074 : else
2075 : {
2076 9 : for (int i = 0; i < nWordCount; i++)
2077 : {
2078 6 : CPL_SWAP32PTR(pabyData);
2079 6 : pabyData += nWordSkip;
2080 : }
2081 : }
2082 10574 : break;
2083 :
2084 2426 : case 8:
2085 2426 : CPLAssert(nWordSkip >= 8 || nWordCount == 1);
2086 2426 : if (CPL_IS_ALIGNED(pabyData, 8) && (nWordSkip % 8) == 0)
2087 : {
2088 3356900 : for (int i = 0; i < nWordCount; i++)
2089 : {
2090 3354480 : *reinterpret_cast<GUInt64 *>(pabyData) = CPL_SWAP64(
2091 : *reinterpret_cast<const GUInt64 *>(pabyData));
2092 3354480 : pabyData += nWordSkip;
2093 2425 : }
2094 : }
2095 : else
2096 : {
2097 3 : for (int i = 0; i < nWordCount; i++)
2098 : {
2099 2 : CPL_SWAP64PTR(pabyData);
2100 2 : pabyData += nWordSkip;
2101 : }
2102 : }
2103 2426 : break;
2104 :
2105 0 : default:
2106 0 : CPLAssert(false);
2107 : }
2108 : }
2109 :
2110 : /************************************************************************/
2111 : /* GDALSwapWordsEx() */
2112 : /************************************************************************/
2113 :
2114 : /**
2115 : * Byte swap words in-place.
2116 : *
2117 : * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
2118 : * a memory array. No assumption is made that the words being swapped are
2119 : * word aligned in memory. Use the CPL_LSB and CPL_MSB macros from cpl_port.h
2120 : * to determine if the current platform is big endian or little endian. Use
2121 : * The macros like CPL_SWAP32() to byte swap single values without the overhead
2122 : * of a function call.
2123 : *
2124 : * @param pData pointer to start of data buffer.
2125 : * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
2126 : * @param nWordCount the number of words to be swapped in this call.
2127 : * @param nWordSkip the byte offset from the start of one word to the start of
2128 : * the next. For packed buffers this is the same as nWordSize.
2129 : * @since GDAL 2.1
2130 : */
2131 6118 : void CPL_STDCALL GDALSwapWordsEx(void *pData, int nWordSize, size_t nWordCount,
2132 : int nWordSkip)
2133 : {
2134 6118 : GByte *pabyData = static_cast<GByte *>(pData);
2135 12236 : while (nWordCount)
2136 : {
2137 : // Pick-up a multiple of 8 as max chunk size.
2138 6118 : const int nWordCountSmall =
2139 6118 : (nWordCount > (1 << 30)) ? (1 << 30) : static_cast<int>(nWordCount);
2140 6118 : GDALSwapWords(pabyData, nWordSize, nWordCountSmall, nWordSkip);
2141 6118 : pabyData += static_cast<size_t>(nWordSkip) * nWordCountSmall;
2142 6118 : nWordCount -= nWordCountSmall;
2143 : }
2144 6118 : }
2145 :
2146 : // Place the new GDALCopyWords helpers in an anonymous namespace
2147 : namespace
2148 : {
2149 :
2150 : /************************************************************************/
2151 : /* GDALCopyWordsT() */
2152 : /************************************************************************/
2153 : /**
2154 : * Template function, used to copy data from pSrcData into buffer
2155 : * pDstData, with stride nSrcPixelStride in the source data and
2156 : * stride nDstPixelStride in the destination data. This template can
2157 : * deal with the case where the input data type is real or complex and
2158 : * the output is real.
2159 : *
2160 : * @param pSrcData the source data buffer
2161 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
2162 : * of interest.
2163 : * @param pDstData the destination buffer.
2164 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
2165 : * interest.
2166 : * @param nWordCount the total number of pixel words to copy
2167 : *
2168 : * @code
2169 : * // Assume an input buffer of type GUInt16 named pBufferIn
2170 : * GByte *pBufferOut = new GByte[numBytesOut];
 * GDALCopyWordsT<GUInt16, GByte>(pBufferIn, 2, pBufferOut, 1, numBytesOut);
2172 : * @endcode
2173 : * @note
2174 : * This is a private function, and should not be exposed outside of
2175 : * rasterio.cpp. External users should call the GDALCopyWords driver function.
2176 : */
2177 :
2178 : template <class Tin, class Tout>
2179 42349621 : static void inline GDALCopyWordsGenericT(const Tin *const CPL_RESTRICT pSrcData,
2180 : int nSrcPixelStride,
2181 : Tout *const CPL_RESTRICT pDstData,
2182 : int nDstPixelStride,
2183 : GPtrDiff_t nWordCount)
2184 : {
2185 42349621 : decltype(nWordCount) nDstOffset = 0;
2186 :
2187 42349621 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
2188 42349621 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
2189 519453428 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
2190 : {
2191 477101082 : const Tin tValue =
2192 477101082 : *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
2193 477101082 : Tout *const pOutPixel =
2194 477101082 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2195 :
2196 477101082 : GDALCopyWord(tValue, *pOutPixel);
2197 :
2198 477103982 : nDstOffset += nDstPixelStride;
2199 : }
2200 42352531 : }
2201 :
2202 : template <class Tin, class Tout>
2203 29693930 : static void CPL_NOINLINE GDALCopyWordsT(const Tin *const CPL_RESTRICT pSrcData,
2204 : int nSrcPixelStride,
2205 : Tout *const CPL_RESTRICT pDstData,
2206 : int nDstPixelStride,
2207 : GPtrDiff_t nWordCount)
2208 : {
2209 29693930 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData, nDstPixelStride,
2210 : nWordCount);
2211 29693940 : }
2212 :
2213 : template <class Tin, class Tout>
2214 4912315 : static void inline GDALCopyWordsT_8atatime(
2215 : const Tin *const CPL_RESTRICT pSrcData, int nSrcPixelStride,
2216 : Tout *const CPL_RESTRICT pDstData, int nDstPixelStride,
2217 : GPtrDiff_t nWordCount)
2218 : {
2219 4912315 : decltype(nWordCount) nDstOffset = 0;
2220 :
2221 4912315 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
2222 4912315 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
2223 4912315 : decltype(nWordCount) n = 0;
2224 4912315 : if (nSrcPixelStride == static_cast<int>(sizeof(Tin)) &&
2225 : nDstPixelStride == static_cast<int>(sizeof(Tout)))
2226 : {
2227 34922092 : for (; n < nWordCount - 7; n += 8)
2228 : {
2229 34514600 : const Tin *pInValues = reinterpret_cast<const Tin *>(
2230 34514600 : pSrcDataPtr + (n * nSrcPixelStride));
2231 34514600 : Tout *const pOutPixels =
2232 34514600 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2233 :
2234 34514600 : GDALCopy8Words(pInValues, pOutPixels);
2235 :
2236 34507214 : nDstOffset += 8 * nDstPixelStride;
2237 : }
2238 : }
2239 10211744 : for (; n < nWordCount; n++)
2240 : {
2241 5299367 : const Tin tValue =
2242 5299367 : *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
2243 5299367 : Tout *const pOutPixel =
2244 5299367 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2245 :
2246 5299367 : GDALCopyWord(tValue, *pOutPixel);
2247 :
2248 5306811 : nDstOffset += nDstPixelStride;
2249 : }
2250 4912367 : }
2251 :
2252 : #ifdef HAVE_SSE2
2253 :
2254 : template <class Tout>
2255 39675 : void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
2256 : int nSrcPixelStride,
2257 : Tout *const CPL_RESTRICT pDstData,
2258 : int nDstPixelStride, GPtrDiff_t nWordCount)
2259 : {
2260 : static_assert(std::is_integral<Tout>::value &&
2261 : sizeof(Tout) == sizeof(uint16_t),
2262 : "Bad Tout");
2263 39675 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2264 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2265 : {
2266 33324 : decltype(nWordCount) n = 0;
2267 33324 : const __m128i xmm_zero = _mm_setzero_si128();
2268 33324 : GByte *CPL_RESTRICT pabyDstDataPtr =
2269 : reinterpret_cast<GByte *>(pDstData);
2270 1415668 : for (; n < nWordCount - 15; n += 16)
2271 : {
2272 1382344 : __m128i xmm = _mm_loadu_si128(
2273 1382344 : reinterpret_cast<const __m128i *>(pSrcData + n));
2274 1382344 : __m128i xmm0 = _mm_unpacklo_epi8(xmm, xmm_zero);
2275 1382344 : __m128i xmm1 = _mm_unpackhi_epi8(xmm, xmm_zero);
2276 : _mm_storeu_si128(
2277 1382344 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2), xmm0);
2278 : _mm_storeu_si128(
2279 1382344 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2 + 16), xmm1);
2280 : }
2281 109209 : for (; n < nWordCount; n++)
2282 : {
2283 75885 : pDstData[n] = pSrcData[n];
2284 33324 : }
2285 : }
2286 : else
2287 : {
2288 6351 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2289 : nDstPixelStride, nWordCount);
2290 : }
2291 39675 : }
2292 :
2293 : template <>
2294 26942 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2295 : int nSrcPixelStride,
2296 : GUInt16 *const CPL_RESTRICT pDstData,
2297 : int nDstPixelStride, GPtrDiff_t nWordCount)
2298 : {
2299 26942 : GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
2300 : nDstPixelStride, nWordCount);
2301 26942 : }
2302 :
2303 : template <>
2304 12733 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2305 : int nSrcPixelStride,
2306 : GInt16 *const CPL_RESTRICT pDstData,
2307 : int nDstPixelStride, GPtrDiff_t nWordCount)
2308 : {
2309 12733 : GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
2310 : nDstPixelStride, nWordCount);
2311 12733 : }
2312 :
2313 : template <class Tout>
2314 12821065 : void GDALCopyWordsByteTo32Bit(const GByte *const CPL_RESTRICT pSrcData,
2315 : int nSrcPixelStride,
2316 : Tout *const CPL_RESTRICT pDstData,
2317 : int nDstPixelStride, GPtrDiff_t nWordCount)
2318 : {
2319 : static_assert(std::is_integral<Tout>::value &&
2320 : sizeof(Tout) == sizeof(uint32_t),
2321 : "Bad Tout");
2322 12821065 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2323 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2324 : {
2325 6246775 : decltype(nWordCount) n = 0;
2326 6246775 : const __m128i xmm_zero = _mm_setzero_si128();
2327 6246775 : GByte *CPL_RESTRICT pabyDstDataPtr =
2328 : reinterpret_cast<GByte *>(pDstData);
2329 70559900 : for (; n < nWordCount - 15; n += 16)
2330 : {
2331 64300945 : __m128i xmm = _mm_loadu_si128(
2332 64300945 : reinterpret_cast<const __m128i *>(pSrcData + n));
2333 64334745 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2334 64336845 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2335 64230445 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2336 64195545 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2337 64232945 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2338 64313145 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2339 : _mm_storeu_si128(
2340 64313145 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4), xmm0);
2341 : _mm_storeu_si128(
2342 64313145 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 16), xmm1);
2343 : _mm_storeu_si128(
2344 64313145 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 32), xmm2);
2345 : _mm_storeu_si128(
2346 64313145 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 48), xmm3);
2347 : }
2348 14493179 : for (; n < nWordCount; n++)
2349 : {
2350 8234284 : pDstData[n] = pSrcData[n];
2351 6258945 : }
2352 : }
2353 : else
2354 : {
2355 6574250 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2356 : nDstPixelStride, nWordCount);
2357 : }
2358 12811365 : }
2359 :
2360 : template <>
2361 465 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2362 : int nSrcPixelStride,
2363 : GUInt32 *const CPL_RESTRICT pDstData,
2364 : int nDstPixelStride, GPtrDiff_t nWordCount)
2365 : {
2366 465 : GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
2367 : nDstPixelStride, nWordCount);
2368 465 : }
2369 :
2370 : template <>
2371 12832600 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2372 : int nSrcPixelStride,
2373 : GInt32 *const CPL_RESTRICT pDstData,
2374 : int nDstPixelStride, GPtrDiff_t nWordCount)
2375 : {
2376 12832600 : GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
2377 : nDstPixelStride, nWordCount);
2378 12840700 : }
2379 :
2380 : template <>
2381 2471810 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2382 : int nSrcPixelStride,
2383 : float *const CPL_RESTRICT pDstData,
2384 : int nDstPixelStride, GPtrDiff_t nWordCount)
2385 : {
2386 2471810 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2387 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2388 : {
2389 112368 : decltype(nWordCount) n = 0;
2390 112368 : const __m128i xmm_zero = _mm_setzero_si128();
2391 112368 : GByte *CPL_RESTRICT pabyDstDataPtr =
2392 : reinterpret_cast<GByte *>(pDstData);
2393 3261800 : for (; n < nWordCount - 15; n += 16)
2394 : {
2395 3149440 : __m128i xmm = _mm_loadu_si128(
2396 3149440 : reinterpret_cast<const __m128i *>(pSrcData + n));
2397 3149440 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2398 3149440 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2399 3149440 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2400 3149440 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2401 3149440 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2402 3149440 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2403 3149440 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2404 3149440 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2405 3149440 : __m128 xmm2_f = _mm_cvtepi32_ps(xmm2);
2406 3149440 : __m128 xmm3_f = _mm_cvtepi32_ps(xmm3);
2407 3149440 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2408 : xmm0_f);
2409 : _mm_storeu_ps(
2410 3149440 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2411 : _mm_storeu_ps(
2412 3149440 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 32), xmm2_f);
2413 : _mm_storeu_ps(
2414 3149440 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 48), xmm3_f);
2415 : }
2416 480044 : for (; n < nWordCount; n++)
2417 : {
2418 367676 : pDstData[n] = pSrcData[n];
2419 112368 : }
2420 : }
2421 : else
2422 : {
2423 2359440 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2424 : nDstPixelStride, nWordCount);
2425 : }
2426 2471810 : }
2427 :
2428 : template <>
2429 147675 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2430 : int nSrcPixelStride,
2431 : double *const CPL_RESTRICT pDstData,
2432 : int nDstPixelStride, GPtrDiff_t nWordCount)
2433 : {
2434 147675 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2435 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2436 : {
2437 124619 : decltype(nWordCount) n = 0;
2438 124619 : const __m128i xmm_zero = _mm_setzero_si128();
2439 124619 : GByte *CPL_RESTRICT pabyDstDataPtr =
2440 : reinterpret_cast<GByte *>(pDstData);
2441 1424200 : for (; n < nWordCount - 15; n += 16)
2442 : {
2443 1299580 : __m128i xmm = _mm_loadu_si128(
2444 1299580 : reinterpret_cast<const __m128i *>(pSrcData + n));
2445 1299580 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2446 1299580 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2447 1299580 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2448 1299580 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2449 1299580 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2450 1299580 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2451 :
2452 : #if defined(__AVX2__) && defined(slightly_slower_than_SSE2)
2453 : _mm256_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2454 : _mm256_cvtepi32_pd(xmm0));
2455 : _mm256_storeu_pd(
2456 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2457 : _mm256_cvtepi32_pd(xmm1));
2458 : _mm256_storeu_pd(
2459 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
2460 : _mm256_cvtepi32_pd(xmm2));
2461 : _mm256_storeu_pd(
2462 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
2463 : _mm256_cvtepi32_pd(xmm3));
2464 : #else
2465 1299580 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2466 1299580 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2467 1299580 : __m128d xmm2_low_d = _mm_cvtepi32_pd(xmm2);
2468 1299580 : __m128d xmm3_low_d = _mm_cvtepi32_pd(xmm3);
2469 1299580 : xmm0 = _mm_srli_si128(xmm0, 8);
2470 1299580 : xmm1 = _mm_srli_si128(xmm1, 8);
2471 1299580 : xmm2 = _mm_srli_si128(xmm2, 8);
2472 1299580 : xmm3 = _mm_srli_si128(xmm3, 8);
2473 1299580 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2474 1299580 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2475 1299580 : __m128d xmm2_high_d = _mm_cvtepi32_pd(xmm2);
2476 1299580 : __m128d xmm3_high_d = _mm_cvtepi32_pd(xmm3);
2477 :
2478 1299580 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2479 : xmm0_low_d);
2480 : _mm_storeu_pd(
2481 1299580 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2482 : xmm0_high_d);
2483 : _mm_storeu_pd(
2484 1299580 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2485 : xmm1_low_d);
2486 : _mm_storeu_pd(
2487 1299580 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2488 : xmm1_high_d);
2489 : _mm_storeu_pd(
2490 1299580 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
2491 : xmm2_low_d);
2492 : _mm_storeu_pd(
2493 1299580 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 80),
2494 : xmm2_high_d);
2495 : _mm_storeu_pd(
2496 1299580 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
2497 : xmm3_low_d);
2498 : _mm_storeu_pd(
2499 1299580 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 112),
2500 : xmm3_high_d);
2501 : #endif
2502 : }
2503 236049 : for (; n < nWordCount; n++)
2504 : {
2505 111430 : pDstData[n] = pSrcData[n];
2506 124619 : }
2507 : }
2508 : else
2509 : {
2510 23056 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2511 : nDstPixelStride, nWordCount);
2512 : }
2513 147675 : }
2514 :
2515 : template <>
2516 148 : CPL_NOINLINE void GDALCopyWordsT(const uint8_t *const CPL_RESTRICT pSrcData,
2517 : int nSrcPixelStride,
2518 : int8_t *const CPL_RESTRICT pDstData,
2519 : int nDstPixelStride, GPtrDiff_t nWordCount)
2520 : {
2521 148 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2522 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2523 : {
2524 142 : decltype(nWordCount) n = 0;
2525 142 : const __m128i xmm_127 = _mm_set1_epi8(127);
2526 146 : for (; n < nWordCount - 31; n += 32)
2527 : {
2528 8 : __m128i xmm0 = _mm_loadu_si128(
2529 4 : reinterpret_cast<const __m128i *>(pSrcData + n));
2530 4 : __m128i xmm1 = _mm_loadu_si128(
2531 4 : reinterpret_cast<const __m128i *>(pSrcData + n + 16));
2532 4 : xmm0 = _mm_min_epu8(xmm0, xmm_127);
2533 4 : xmm1 = _mm_min_epu8(xmm1, xmm_127);
2534 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2535 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
2536 : xmm1);
2537 : }
2538 2422 : for (; n < nWordCount; n++)
2539 : {
2540 2280 : pDstData[n] =
2541 2280 : pSrcData[n] >= 127 ? 127 : static_cast<int8_t>(pSrcData[n]);
2542 142 : }
2543 : }
2544 : else
2545 : {
2546 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2547 : nDstPixelStride, nWordCount);
2548 : }
2549 148 : }
2550 :
2551 : template <>
2552 82 : CPL_NOINLINE void GDALCopyWordsT(const int8_t *const CPL_RESTRICT pSrcData,
2553 : int nSrcPixelStride,
2554 : uint8_t *const CPL_RESTRICT pDstData,
2555 : int nDstPixelStride, GPtrDiff_t nWordCount)
2556 : {
2557 82 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2558 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2559 : {
2560 56 : decltype(nWordCount) n = 0;
2561 : #if !(defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS))
2562 56 : const __m128i xmm_INT8_to_UINT8 = _mm_set1_epi8(-128);
2563 : #endif
2564 117 : for (; n < nWordCount - 31; n += 32)
2565 : {
2566 122 : __m128i xmm0 = _mm_loadu_si128(
2567 61 : reinterpret_cast<const __m128i *>(pSrcData + n));
2568 61 : __m128i xmm1 = _mm_loadu_si128(
2569 61 : reinterpret_cast<const __m128i *>(pSrcData + n + 16));
2570 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2571 : xmm0 = _mm_max_epi8(xmm0, _mm_setzero_si128());
2572 : xmm1 = _mm_max_epi8(xmm1, _mm_setzero_si128());
2573 : #else
2574 61 : xmm0 = _mm_add_epi8(xmm0, xmm_INT8_to_UINT8);
2575 61 : xmm1 = _mm_add_epi8(xmm1, xmm_INT8_to_UINT8);
2576 61 : xmm0 = _mm_max_epu8(xmm0, xmm_INT8_to_UINT8);
2577 61 : xmm1 = _mm_max_epu8(xmm1, xmm_INT8_to_UINT8);
2578 61 : xmm0 = _mm_sub_epi8(xmm0, xmm_INT8_to_UINT8);
2579 61 : xmm1 = _mm_sub_epi8(xmm1, xmm_INT8_to_UINT8);
2580 : #endif
2581 61 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2582 61 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
2583 : xmm1);
2584 : }
2585 352 : for (; n < nWordCount; n++)
2586 : {
2587 296 : pDstData[n] =
2588 296 : pSrcData[n] < 0 ? 0 : static_cast<uint8_t>(pSrcData[n]);
2589 56 : }
2590 : }
2591 : else
2592 : {
2593 26 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2594 : nDstPixelStride, nWordCount);
2595 : }
2596 82 : }
2597 :
2598 : template <>
2599 6037 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2600 : int nSrcPixelStride,
2601 : uint8_t *const CPL_RESTRICT pDstData,
2602 : int nDstPixelStride, GPtrDiff_t nWordCount)
2603 : {
2604 6037 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2605 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2606 : {
2607 5062 : decltype(nWordCount) n = 0;
2608 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2609 : const auto xmm_MAX_INT16 = _mm_set1_epi16(32767);
2610 : #else
2611 : // In SSE2, min_epu16 does not exist, so shift from
2612 : // UInt16 to SInt16 to be able to use min_epi16
2613 5062 : const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
2614 5062 : const __m128i xmm_m255_shifted = _mm_set1_epi16(255 - 32768);
2615 : #endif
2616 71888 : for (; n < nWordCount - 15; n += 16)
2617 : {
2618 133652 : __m128i xmm0 = _mm_loadu_si128(
2619 66826 : reinterpret_cast<const __m128i *>(pSrcData + n));
2620 66826 : __m128i xmm1 = _mm_loadu_si128(
2621 66826 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2622 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2623 : xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
2624 : xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
2625 : #else
2626 66826 : xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
2627 66826 : xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
2628 66826 : xmm0 = _mm_min_epi16(xmm0, xmm_m255_shifted);
2629 66826 : xmm1 = _mm_min_epi16(xmm1, xmm_m255_shifted);
2630 66826 : xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
2631 66826 : xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
2632 : #endif
2633 66826 : xmm0 = _mm_packus_epi16(xmm0, xmm1);
2634 66826 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2635 : }
2636 16403 : for (; n < nWordCount; n++)
2637 : {
2638 11341 : pDstData[n] =
2639 11341 : pSrcData[n] >= 255 ? 255 : static_cast<uint8_t>(pSrcData[n]);
2640 5062 : }
2641 : }
2642 : else
2643 : {
2644 975 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2645 : nDstPixelStride, nWordCount);
2646 : }
2647 6037 : }
2648 :
2649 : template <>
2650 46 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2651 : int nSrcPixelStride,
2652 : int16_t *const CPL_RESTRICT pDstData,
2653 : int nDstPixelStride, GPtrDiff_t nWordCount)
2654 : {
2655 46 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2656 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2657 : {
2658 40 : decltype(nWordCount) n = 0;
2659 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2660 : const __m128i xmm_MAX_INT16 = _mm_set1_epi16(32767);
2661 : #else
2662 : // In SSE2, min_epu16 does not exist, so shift from
2663 : // UInt16 to SInt16 to be able to use min_epi16
2664 40 : const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
2665 40 : const __m128i xmm_32767_shifted = _mm_set1_epi16(32767 - 32768);
2666 : #endif
2667 169 : for (; n < nWordCount - 15; n += 16)
2668 : {
2669 258 : __m128i xmm0 = _mm_loadu_si128(
2670 129 : reinterpret_cast<const __m128i *>(pSrcData + n));
2671 129 : __m128i xmm1 = _mm_loadu_si128(
2672 129 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2673 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2674 : xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
2675 : xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
2676 : #else
2677 129 : xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
2678 129 : xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
2679 129 : xmm0 = _mm_min_epi16(xmm0, xmm_32767_shifted);
2680 129 : xmm1 = _mm_min_epi16(xmm1, xmm_32767_shifted);
2681 129 : xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
2682 129 : xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
2683 : #endif
2684 129 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2685 129 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
2686 : xmm1);
2687 : }
2688 191 : for (; n < nWordCount; n++)
2689 : {
2690 282 : pDstData[n] = pSrcData[n] >= 32767
2691 : ? 32767
2692 131 : : static_cast<int16_t>(pSrcData[n]);
2693 40 : }
2694 : }
2695 : else
2696 : {
2697 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2698 : nDstPixelStride, nWordCount);
2699 : }
2700 46 : }
2701 :
2702 : template <>
2703 135 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2704 : int nSrcPixelStride,
2705 : uint16_t *const CPL_RESTRICT pDstData,
2706 : int nDstPixelStride, GPtrDiff_t nWordCount)
2707 : {
2708 135 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2709 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2710 : {
2711 92 : decltype(nWordCount) n = 0;
2712 92 : const __m128i xmm_zero = _mm_setzero_si128();
2713 277 : for (; n < nWordCount - 15; n += 16)
2714 : {
2715 370 : __m128i xmm0 = _mm_loadu_si128(
2716 185 : reinterpret_cast<const __m128i *>(pSrcData + n));
2717 185 : __m128i xmm1 = _mm_loadu_si128(
2718 185 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2719 185 : xmm0 = _mm_max_epi16(xmm0, xmm_zero);
2720 185 : xmm1 = _mm_max_epi16(xmm1, xmm_zero);
2721 185 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2722 185 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
2723 : xmm1);
2724 : }
2725 468 : for (; n < nWordCount; n++)
2726 : {
2727 376 : pDstData[n] =
2728 376 : pSrcData[n] < 0 ? 0 : static_cast<uint16_t>(pSrcData[n]);
2729 92 : }
2730 : }
2731 : else
2732 : {
2733 43 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2734 : nDstPixelStride, nWordCount);
2735 : }
2736 135 : }
2737 :
2738 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2739 :
2740 : template <>
2741 : CPL_NOINLINE void GDALCopyWordsT(const uint32_t *const CPL_RESTRICT pSrcData,
2742 : int nSrcPixelStride,
2743 : int32_t *const CPL_RESTRICT pDstData,
2744 : int nDstPixelStride, GPtrDiff_t nWordCount)
2745 : {
2746 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2747 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2748 : {
2749 : decltype(nWordCount) n = 0;
2750 : const __m128i xmm_MAX_INT = _mm_set1_epi32(INT_MAX);
2751 : for (; n < nWordCount - 8; n += 7)
2752 : {
2753 : __m128i xmm0 = _mm_loadu_si128(
2754 : reinterpret_cast<const __m128i *>(pSrcData + n));
2755 : __m128i xmm1 = _mm_loadu_si128(
2756 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
2757 : xmm0 = _mm_min_epu32(xmm0, xmm_MAX_INT);
2758 : xmm1 = _mm_min_epu32(xmm1, xmm_MAX_INT);
2759 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2760 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
2761 : xmm1);
2762 : }
2763 : for (; n < nWordCount; n++)
2764 : {
2765 : pDstData[n] = pSrcData[n] >= INT_MAX
2766 : ? INT_MAX
2767 : : static_cast<int32_t>(pSrcData[n]);
2768 : }
2769 : }
2770 : else
2771 : {
2772 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2773 : nDstPixelStride, nWordCount);
2774 : }
2775 : }
2776 :
2777 : template <>
2778 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
2779 : int nSrcPixelStride,
2780 : uint32_t *const CPL_RESTRICT pDstData,
2781 : int nDstPixelStride, GPtrDiff_t nWordCount)
2782 : {
2783 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2784 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2785 : {
2786 : decltype(nWordCount) n = 0;
2787 : const __m128i xmm_zero = _mm_setzero_si128();
2788 : for (; n < nWordCount - 7; n += 8)
2789 : {
2790 : __m128i xmm0 = _mm_loadu_si128(
2791 : reinterpret_cast<const __m128i *>(pSrcData + n));
2792 : __m128i xmm1 = _mm_loadu_si128(
2793 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
2794 : xmm0 = _mm_max_epi32(xmm0, xmm_zero);
2795 : xmm1 = _mm_max_epi32(xmm1, xmm_zero);
2796 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2797 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
2798 : xmm1);
2799 : }
2800 : for (; n < nWordCount; n++)
2801 : {
2802 : pDstData[n] =
2803 : pSrcData[n] < 0 ? 0 : static_cast<uint32_t>(pSrcData[n]);
2804 : }
2805 : }
2806 : else
2807 : {
2808 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2809 : nDstPixelStride, nWordCount);
2810 : }
2811 : }
2812 :
2813 : #endif // defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2814 :
2815 : template <>
2816 339 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2817 : int nSrcPixelStride,
2818 : float *const CPL_RESTRICT pDstData,
2819 : int nDstPixelStride, GPtrDiff_t nWordCount)
2820 : {
2821 339 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2822 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2823 : {
2824 333 : decltype(nWordCount) n = 0;
2825 333 : const __m128i xmm_zero = _mm_setzero_si128();
2826 333 : GByte *CPL_RESTRICT pabyDstDataPtr =
2827 : reinterpret_cast<GByte *>(pDstData);
2828 1472 : for (; n < nWordCount - 7; n += 8)
2829 : {
2830 1139 : __m128i xmm = _mm_loadu_si128(
2831 1139 : reinterpret_cast<const __m128i *>(pSrcData + n));
2832 1139 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
2833 1139 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
2834 1139 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2835 1139 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2836 1139 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2837 : xmm0_f);
2838 : _mm_storeu_ps(
2839 1139 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2840 : }
2841 1099 : for (; n < nWordCount; n++)
2842 : {
2843 766 : pDstData[n] = pSrcData[n];
2844 333 : }
2845 : }
2846 : else
2847 : {
2848 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2849 : nDstPixelStride, nWordCount);
2850 : }
2851 339 : }
2852 :
2853 : template <>
2854 1072750 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2855 : int nSrcPixelStride,
2856 : float *const CPL_RESTRICT pDstData,
2857 : int nDstPixelStride, GPtrDiff_t nWordCount)
2858 : {
2859 1072750 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2860 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2861 : {
2862 82850 : decltype(nWordCount) n = 0;
2863 82850 : GByte *CPL_RESTRICT pabyDstDataPtr =
2864 : reinterpret_cast<GByte *>(pDstData);
2865 553615 : for (; n < nWordCount - 7; n += 8)
2866 : {
2867 470765 : __m128i xmm = _mm_loadu_si128(
2868 470765 : reinterpret_cast<const __m128i *>(pSrcData + n));
2869 470765 : const auto sign = _mm_srai_epi16(xmm, 15);
2870 470765 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
2871 470765 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
2872 470765 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2873 470765 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2874 470765 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2875 : xmm0_f);
2876 : _mm_storeu_ps(
2877 470765 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2878 : }
2879 242713 : for (; n < nWordCount; n++)
2880 : {
2881 159863 : pDstData[n] = pSrcData[n];
2882 82850 : }
2883 : }
2884 : else
2885 : {
2886 989901 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2887 : nDstPixelStride, nWordCount);
2888 : }
2889 1072750 : }
2890 :
2891 : template <>
2892 380 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2893 : int nSrcPixelStride,
2894 : double *const CPL_RESTRICT pDstData,
2895 : int nDstPixelStride, GPtrDiff_t nWordCount)
2896 : {
2897 380 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2898 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2899 : {
2900 269 : decltype(nWordCount) n = 0;
2901 269 : const __m128i xmm_zero = _mm_setzero_si128();
2902 269 : GByte *CPL_RESTRICT pabyDstDataPtr =
2903 : reinterpret_cast<GByte *>(pDstData);
2904 713 : for (; n < nWordCount - 7; n += 8)
2905 : {
2906 444 : __m128i xmm = _mm_loadu_si128(
2907 444 : reinterpret_cast<const __m128i *>(pSrcData + n));
2908 444 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
2909 444 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
2910 :
2911 444 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2912 444 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2913 444 : xmm0 = _mm_srli_si128(xmm0, 8);
2914 444 : xmm1 = _mm_srli_si128(xmm1, 8);
2915 444 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2916 444 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2917 :
2918 444 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2919 : xmm0_low_d);
2920 : _mm_storeu_pd(
2921 444 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2922 : xmm0_high_d);
2923 : _mm_storeu_pd(
2924 444 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2925 : xmm1_low_d);
2926 : _mm_storeu_pd(
2927 444 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2928 : xmm1_high_d);
2929 : }
2930 918 : for (; n < nWordCount; n++)
2931 : {
2932 649 : pDstData[n] = pSrcData[n];
2933 269 : }
2934 : }
2935 : else
2936 : {
2937 111 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2938 : nDstPixelStride, nWordCount);
2939 : }
2940 380 : }
2941 :
2942 : template <>
2943 2760110 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2944 : int nSrcPixelStride,
2945 : double *const CPL_RESTRICT pDstData,
2946 : int nDstPixelStride, GPtrDiff_t nWordCount)
2947 : {
2948 2760110 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2949 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2950 : {
2951 34418 : decltype(nWordCount) n = 0;
2952 34418 : GByte *CPL_RESTRICT pabyDstDataPtr =
2953 : reinterpret_cast<GByte *>(pDstData);
2954 400839 : for (; n < nWordCount - 7; n += 8)
2955 : {
2956 366562 : __m128i xmm = _mm_loadu_si128(
2957 366562 : reinterpret_cast<const __m128i *>(pSrcData + n));
2958 366589 : const auto sign = _mm_srai_epi16(xmm, 15);
2959 366498 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
2960 366545 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
2961 :
2962 366515 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2963 366444 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2964 366444 : xmm0 = _mm_srli_si128(xmm0, 8);
2965 366520 : xmm1 = _mm_srli_si128(xmm1, 8);
2966 366560 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2967 366421 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2968 :
2969 366421 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2970 : xmm0_low_d);
2971 : _mm_storeu_pd(
2972 366421 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2973 : xmm0_high_d);
2974 : _mm_storeu_pd(
2975 366421 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2976 : xmm1_low_d);
2977 : _mm_storeu_pd(
2978 366421 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2979 : xmm1_high_d);
2980 : }
2981 252070 : for (; n < nWordCount; n++)
2982 : {
2983 217793 : pDstData[n] = pSrcData[n];
2984 34277 : }
2985 : }
2986 : else
2987 : {
2988 2725690 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2989 : nDstPixelStride, nWordCount);
2990 : }
2991 2759970 : }
2992 :
2993 : template <>
2994 4388770 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
2995 : int nSrcPixelStride,
2996 : GByte *const CPL_RESTRICT pDstData,
2997 : int nDstPixelStride, GPtrDiff_t nWordCount)
2998 : {
2999 4388770 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3000 : nDstPixelStride, nWordCount);
3001 4388800 : }
3002 :
3003 : template <>
3004 38353 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3005 : int nSrcPixelStride,
3006 : GUInt16 *const CPL_RESTRICT pDstData,
3007 : int nDstPixelStride, GPtrDiff_t nWordCount)
3008 : {
3009 38353 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3010 : nDstPixelStride, nWordCount);
3011 38353 : }
3012 :
3013 : template <>
3014 51557 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3015 : int nSrcPixelStride,
3016 : double *const CPL_RESTRICT pDstData,
3017 : int nDstPixelStride, GPtrDiff_t nWordCount)
3018 : {
3019 51557 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3020 : nDstPixelStride, nWordCount);
3021 51557 : }
3022 :
3023 : template <>
3024 122467 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3025 : int nSrcPixelStride,
3026 : float *const CPL_RESTRICT pDstData,
3027 : int nDstPixelStride, GPtrDiff_t nWordCount)
3028 : {
3029 122467 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3030 : nDstPixelStride, nWordCount);
3031 122467 : }
3032 :
3033 : template <>
3034 111 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
3035 : int nSrcPixelStride,
3036 : float *const CPL_RESTRICT pDstData,
3037 : int nDstPixelStride, GPtrDiff_t nWordCount)
3038 : {
3039 111 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3040 : nDstPixelStride, nWordCount);
3041 111 : }
3042 :
3043 : template <>
3044 532 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
3045 : int nSrcPixelStride,
3046 : double *const CPL_RESTRICT pDstData,
3047 : int nDstPixelStride, GPtrDiff_t nWordCount)
3048 : {
3049 532 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3050 : nDstPixelStride, nWordCount);
3051 532 : }
3052 :
3053 : #ifdef __F16C__
3054 :
3055 : template <>
3056 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3057 : int nSrcPixelStride,
3058 : GFloat16 *const CPL_RESTRICT pDstData,
3059 : int nDstPixelStride, GPtrDiff_t nWordCount)
3060 : {
3061 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3062 : nDstPixelStride, nWordCount);
3063 : }
3064 :
3065 : template <>
3066 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3067 : int nSrcPixelStride,
3068 : GFloat16 *const CPL_RESTRICT pDstData,
3069 : int nDstPixelStride, GPtrDiff_t nWordCount)
3070 : {
3071 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3072 : nDstPixelStride, nWordCount);
3073 : }
3074 :
3075 : #endif // __F16C__
3076 :
3077 : #endif // HAVE_SSE2
3078 :
3079 : template <>
3080 233093 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3081 : int nSrcPixelStride,
3082 : GByte *const CPL_RESTRICT pDstData,
3083 : int nDstPixelStride, GPtrDiff_t nWordCount)
3084 : {
3085 233093 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3086 : nDstPixelStride, nWordCount);
3087 233094 : }
3088 :
3089 : template <>
3090 15775 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3091 : int nSrcPixelStride,
3092 : GInt16 *const CPL_RESTRICT pDstData,
3093 : int nDstPixelStride, GPtrDiff_t nWordCount)
3094 : {
3095 15775 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3096 : nDstPixelStride, nWordCount);
3097 15775 : }
3098 :
3099 : template <>
3100 61692 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3101 : int nSrcPixelStride,
3102 : GUInt16 *const CPL_RESTRICT pDstData,
3103 : int nDstPixelStride, GPtrDiff_t nWordCount)
3104 : {
3105 61692 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3106 : nDstPixelStride, nWordCount);
3107 61688 : }
3108 :
3109 : /************************************************************************/
3110 : /* GDALCopyWordsComplexT() */
3111 : /************************************************************************/
3112 : /**
3113 : * Template function, used to copy data from pSrcData into buffer
3114 : * pDstData, with stride nSrcPixelStride in the source data and
3115 : * stride nDstPixelStride in the destination data. Deals with the
3116 : * complex case, where input is complex and output is complex.
3117 : *
3118 : * @param pSrcData the source data buffer
3119 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
3120 : * of interest.
3121 : * @param pDstData the destination buffer.
3122 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
3123 : * interest.
3124 : * @param nWordCount the total number of pixel words to copy
3125 : *
3126 : */
3127 : template <class Tin, class Tout>
3128 96717 : inline void GDALCopyWordsComplexT(const Tin *const CPL_RESTRICT pSrcData,
3129 : int nSrcPixelStride,
3130 : Tout *const CPL_RESTRICT pDstData,
3131 : int nDstPixelStride, GPtrDiff_t nWordCount)
3132 : {
3133 96717 : decltype(nWordCount) nDstOffset = 0;
3134 96717 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
3135 96717 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
3136 :
3137 5243171 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
3138 : {
3139 5146449 : const Tin *const pPixelIn =
3140 5146449 : reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
3141 5146449 : Tout *const pPixelOut =
3142 5146449 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
3143 :
3144 5146449 : GDALCopyWord(pPixelIn[0], pPixelOut[0]);
3145 5146449 : GDALCopyWord(pPixelIn[1], pPixelOut[1]);
3146 :
3147 5146449 : nDstOffset += nDstPixelStride;
3148 : }
3149 96717 : }
3150 :
3151 : /************************************************************************/
3152 : /* GDALCopyWordsComplexOutT() */
3153 : /************************************************************************/
3154 : /**
3155 : * Template function, used to copy data from pSrcData into buffer
3156 : * pDstData, with stride nSrcPixelStride in the source data and
3157 : * stride nDstPixelStride in the destination data. Deals with the
3158 : * case where the value is real coming in, but complex going out.
3159 : *
3160 : * @param pSrcData the source data buffer
3161 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
3162 : * of interest, in bytes.
3163 : * @param pDstData the destination buffer.
3164 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
3165 : * interest, in bytes.
3166 : * @param nWordCount the total number of pixel words to copy
3167 : *
3168 : */
3169 : template <class Tin, class Tout>
3170 3877 : inline void GDALCopyWordsComplexOutT(const Tin *const CPL_RESTRICT pSrcData,
3171 : int nSrcPixelStride,
3172 : Tout *const CPL_RESTRICT pDstData,
3173 : int nDstPixelStride, GPtrDiff_t nWordCount)
3174 : {
3175 3877 : decltype(nWordCount) nDstOffset = 0;
3176 :
3177 3877 : const Tout tOutZero = static_cast<Tout>(0);
3178 :
3179 3877 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
3180 3877 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
3181 :
3182 1155414 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
3183 : {
3184 1151537 : const Tin tValue =
3185 1151537 : *reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
3186 1151537 : Tout *const pPixelOut =
3187 1151537 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
3188 1151537 : GDALCopyWord(tValue, *pPixelOut);
3189 :
3190 1151537 : pPixelOut[1] = tOutZero;
3191 :
3192 1151537 : nDstOffset += nDstPixelStride;
3193 : }
3194 3877 : }
3195 :
3196 : /************************************************************************/
3197 : /* GDALCopyWordsFromT() */
3198 : /************************************************************************/
3199 : /**
3200 : * Template driver function. Given the input type T, call the appropriate
3201 : * GDALCopyWordsT function template for the desired output type. You should
3202 : * never call this function directly (call GDALCopyWords instead).
3203 : *
3204 : * @param pSrcData source data buffer
3205 : * @param nSrcPixelStride pixel stride in input buffer, in pixel words
3206 : * @param bInComplex input is complex
3207 : * @param pDstData destination data buffer
3208 : * @param eDstType destination data type
3209 : * @param nDstPixelStride pixel stride in output buffer, in pixel words
3210 : * @param nWordCount number of pixel words to be copied
3211 : */
3212 : template <class T>
3213 54031222 : inline void GDALCopyWordsFromT(const T *const CPL_RESTRICT pSrcData,
3214 : int nSrcPixelStride, bool bInComplex,
3215 : void *CPL_RESTRICT pDstData,
3216 : GDALDataType eDstType, int nDstPixelStride,
3217 : GPtrDiff_t nWordCount)
3218 : {
3219 54031222 : switch (eDstType)
3220 : {
3221 4666567 : case GDT_Byte:
3222 4666567 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3223 : static_cast<unsigned char *>(pDstData),
3224 : nDstPixelStride, nWordCount);
3225 4666608 : break;
3226 751 : case GDT_Int8:
3227 751 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3228 : static_cast<signed char *>(pDstData),
3229 : nDstPixelStride, nWordCount);
3230 751 : break;
3231 140708 : case GDT_UInt16:
3232 140708 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3233 : static_cast<unsigned short *>(pDstData),
3234 : nDstPixelStride, nWordCount);
3235 140705 : break;
3236 4162813 : case GDT_Int16:
3237 4162813 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3238 : static_cast<short *>(pDstData), nDstPixelStride,
3239 : nWordCount);
3240 4162813 : break;
3241 22239 : case GDT_UInt32:
3242 22239 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3243 : static_cast<unsigned int *>(pDstData),
3244 : nDstPixelStride, nWordCount);
3245 22239 : break;
3246 26023167 : case GDT_Int32:
3247 26023167 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3248 : static_cast<int *>(pDstData), nDstPixelStride,
3249 : nWordCount);
3250 26050969 : break;
3251 809 : case GDT_UInt64:
3252 809 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3253 : static_cast<std::uint64_t *>(pDstData),
3254 : nDstPixelStride, nWordCount);
3255 809 : break;
3256 5181 : case GDT_Int64:
3257 5181 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3258 : static_cast<std::int64_t *>(pDstData),
3259 : nDstPixelStride, nWordCount);
3260 5181 : break;
3261 940 : case GDT_Float16:
3262 940 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3263 : static_cast<GFloat16 *>(pDstData), nDstPixelStride,
3264 : nWordCount);
3265 940 : break;
3266 3700690 : case GDT_Float32:
3267 3700690 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3268 : static_cast<float *>(pDstData), nDstPixelStride,
3269 : nWordCount);
3270 3700690 : break;
3271 15195964 : case GDT_Float64:
3272 15195964 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3273 : static_cast<double *>(pDstData), nDstPixelStride,
3274 : nWordCount);
3275 15195974 : break;
3276 94123 : case GDT_CInt16:
3277 94123 : if (bInComplex)
3278 : {
3279 92870 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3280 : static_cast<short *>(pDstData),
3281 : nDstPixelStride, nWordCount);
3282 : }
3283 : else // input is not complex, so we need to promote to a complex
3284 : // buffer
3285 : {
3286 1253 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3287 : static_cast<short *>(pDstData),
3288 : nDstPixelStride, nWordCount);
3289 : }
3290 94123 : break;
3291 1052 : case GDT_CInt32:
3292 1052 : if (bInComplex)
3293 : {
3294 421 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3295 : static_cast<int *>(pDstData),
3296 : nDstPixelStride, nWordCount);
3297 : }
3298 : else // input is not complex, so we need to promote to a complex
3299 : // buffer
3300 : {
3301 631 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3302 : static_cast<int *>(pDstData),
3303 : nDstPixelStride, nWordCount);
3304 : }
3305 1052 : break;
3306 281 : case GDT_CFloat16:
3307 281 : if (bInComplex)
3308 : {
3309 16 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3310 : static_cast<GFloat16 *>(pDstData),
3311 : nDstPixelStride, nWordCount);
3312 : }
3313 : else // input is not complex, so we need to promote to a complex
3314 : // buffer
3315 : {
3316 265 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3317 : static_cast<GFloat16 *>(pDstData),
3318 : nDstPixelStride, nWordCount);
3319 : }
3320 281 : break;
3321 3359 : case GDT_CFloat32:
3322 3359 : if (bInComplex)
3323 : {
3324 2564 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3325 : static_cast<float *>(pDstData),
3326 : nDstPixelStride, nWordCount);
3327 : }
3328 : else // input is not complex, so we need to promote to a complex
3329 : // buffer
3330 : {
3331 795 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3332 : static_cast<float *>(pDstData),
3333 : nDstPixelStride, nWordCount);
3334 : }
3335 3359 : break;
3336 1779 : case GDT_CFloat64:
3337 1779 : if (bInComplex)
3338 : {
3339 846 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3340 : static_cast<double *>(pDstData),
3341 : nDstPixelStride, nWordCount);
3342 : }
3343 : else // input is not complex, so we need to promote to a complex
3344 : // buffer
3345 : {
3346 933 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3347 : static_cast<double *>(pDstData),
3348 : nDstPixelStride, nWordCount);
3349 : }
3350 1779 : break;
3351 0 : case GDT_Unknown:
3352 : case GDT_TypeCount:
3353 0 : CPLAssert(false);
3354 : }
3355 54058932 : }
3356 :
3357 : } // end anonymous namespace
3358 :
3359 : /************************************************************************/
3360 : /* GDALReplicateWord() */
3361 : /************************************************************************/
3362 :
3363 : template <class T>
3364 590394 : inline void GDALReplicateWordT(void *pDstData, int nDstPixelStride,
3365 : GPtrDiff_t nWordCount)
3366 : {
3367 590394 : const T valSet = *static_cast<const T *>(pDstData);
3368 590394 : if (nDstPixelStride == static_cast<int>(sizeof(T)))
3369 : {
3370 560663 : T *pDstPtr = static_cast<T *>(pDstData) + 1;
3371 21001485 : while (nWordCount >= 4)
3372 : {
3373 20440821 : nWordCount -= 4;
3374 20440821 : pDstPtr[0] = valSet;
3375 20440821 : pDstPtr[1] = valSet;
3376 20440821 : pDstPtr[2] = valSet;
3377 20440821 : pDstPtr[3] = valSet;
3378 20440821 : pDstPtr += 4;
3379 : }
3380 1461515 : while (nWordCount > 0)
3381 : {
3382 900852 : --nWordCount;
3383 900852 : *pDstPtr = valSet;
3384 900852 : pDstPtr++;
3385 : }
3386 : }
3387 : else
3388 : {
3389 29751 : GByte *pabyDstPtr = static_cast<GByte *>(pDstData) + nDstPixelStride;
3390 1040338 : while (nWordCount > 0)
3391 : {
3392 1010587 : --nWordCount;
3393 1010587 : *reinterpret_cast<T *>(pabyDstPtr) = valSet;
3394 1010587 : pabyDstPtr += nDstPixelStride;
3395 : }
3396 : }
3397 590394 : }
3398 :
3399 1010670 : static void GDALReplicateWord(const void *CPL_RESTRICT pSrcData,
3400 : GDALDataType eSrcType,
3401 : void *CPL_RESTRICT pDstData,
3402 : GDALDataType eDstType, int nDstPixelStride,
3403 : GPtrDiff_t nWordCount)
3404 : {
3405 : /* -----------------------------------------------------------------------
3406 : */
3407 : /* Special case when the source data is always the same value */
3408 : /* (for VRTSourcedRasterBand::IRasterIO and
3409 : * VRTDerivedRasterBand::IRasterIO*/
3410 : /* for example) */
3411 : /* -----------------------------------------------------------------------
3412 : */
3413 : // Let the general translation case do the necessary conversions
3414 : // on the first destination element.
3415 1010670 : GDALCopyWords64(pSrcData, eSrcType, 0, pDstData, eDstType, 0, 1);
3416 :
3417 : // Now copy the first element to the nWordCount - 1 following destination
3418 : // elements.
3419 1006230 : nWordCount--;
3420 1006230 : GByte *pabyDstWord = reinterpret_cast<GByte *>(pDstData) + nDstPixelStride;
3421 :
3422 1006230 : switch (eDstType)
3423 : {
3424 415592 : case GDT_Byte:
3425 : case GDT_Int8:
3426 : {
3427 415592 : if (nDstPixelStride == 1)
3428 : {
3429 375174 : if (nWordCount > 0)
3430 375174 : memset(pabyDstWord,
3431 375174 : *reinterpret_cast<const GByte *>(pDstData),
3432 : nWordCount);
3433 : }
3434 : else
3435 : {
3436 40418 : GByte valSet = *reinterpret_cast<const GByte *>(pDstData);
3437 31325800 : while (nWordCount > 0)
3438 : {
3439 31285300 : --nWordCount;
3440 31285300 : *pabyDstWord = valSet;
3441 31285300 : pabyDstWord += nDstPixelStride;
3442 : }
3443 : }
3444 415592 : break;
3445 : }
3446 :
3447 : #define CASE_DUPLICATE_SIMPLE(enum_type, c_type) \
3448 : case enum_type: \
3449 : { \
3450 : GDALReplicateWordT<c_type>(pDstData, nDstPixelStride, nWordCount); \
3451 : break; \
3452 : }
3453 :
3454 34497 : CASE_DUPLICATE_SIMPLE(GDT_UInt16, GUInt16)
3455 202438 : CASE_DUPLICATE_SIMPLE(GDT_Int16, GInt16)
3456 56 : CASE_DUPLICATE_SIMPLE(GDT_UInt32, GUInt32)
3457 294683 : CASE_DUPLICATE_SIMPLE(GDT_Int32, GInt32)
3458 21 : CASE_DUPLICATE_SIMPLE(GDT_UInt64, std::uint64_t)
3459 1064 : CASE_DUPLICATE_SIMPLE(GDT_Int64, std::int64_t)
3460 0 : CASE_DUPLICATE_SIMPLE(GDT_Float16, GFloat16)
3461 52564 : CASE_DUPLICATE_SIMPLE(GDT_Float32, float)
3462 5171 : CASE_DUPLICATE_SIMPLE(GDT_Float64, double)
3463 :
3464 : #define CASE_DUPLICATE_COMPLEX(enum_type, c_type) \
3465 : case enum_type: \
3466 : { \
3467 : c_type valSet1 = reinterpret_cast<const c_type *>(pDstData)[0]; \
3468 : c_type valSet2 = reinterpret_cast<const c_type *>(pDstData)[1]; \
3469 : while (nWordCount > 0) \
3470 : { \
3471 : --nWordCount; \
3472 : reinterpret_cast<c_type *>(pabyDstWord)[0] = valSet1; \
3473 : reinterpret_cast<c_type *>(pabyDstWord)[1] = valSet2; \
3474 : pabyDstWord += nDstPixelStride; \
3475 : } \
3476 : break; \
3477 : }
3478 :
3479 784 : CASE_DUPLICATE_COMPLEX(GDT_CInt16, GInt16)
3480 784 : CASE_DUPLICATE_COMPLEX(GDT_CInt32, GInt32)
3481 0 : CASE_DUPLICATE_COMPLEX(GDT_CFloat16, GFloat16)
3482 784 : CASE_DUPLICATE_COMPLEX(GDT_CFloat32, float)
3483 784 : CASE_DUPLICATE_COMPLEX(GDT_CFloat64, double)
3484 :
3485 0 : case GDT_Unknown:
3486 : case GDT_TypeCount:
3487 0 : CPLAssert(false);
3488 : }
3489 1011740 : }
3490 :
3491 : /************************************************************************/
3492 : /* GDALUnrolledCopy() */
3493 : /************************************************************************/
3494 :
3495 : template <class T, int srcStride, int dstStride>
3496 3266290 : static inline void GDALUnrolledCopyGeneric(T *CPL_RESTRICT pDest,
3497 : const T *CPL_RESTRICT pSrc,
3498 : GPtrDiff_t nIters)
3499 : {
3500 3266290 : if (nIters >= 16)
3501 : {
3502 137565157 : for (GPtrDiff_t i = nIters / 16; i != 0; i--)
3503 : {
3504 134425950 : pDest[0 * dstStride] = pSrc[0 * srcStride];
3505 134425950 : pDest[1 * dstStride] = pSrc[1 * srcStride];
3506 134425950 : pDest[2 * dstStride] = pSrc[2 * srcStride];
3507 134425950 : pDest[3 * dstStride] = pSrc[3 * srcStride];
3508 134425950 : pDest[4 * dstStride] = pSrc[4 * srcStride];
3509 134425950 : pDest[5 * dstStride] = pSrc[5 * srcStride];
3510 134425950 : pDest[6 * dstStride] = pSrc[6 * srcStride];
3511 134425950 : pDest[7 * dstStride] = pSrc[7 * srcStride];
3512 134425950 : pDest[8 * dstStride] = pSrc[8 * srcStride];
3513 134425950 : pDest[9 * dstStride] = pSrc[9 * srcStride];
3514 134425950 : pDest[10 * dstStride] = pSrc[10 * srcStride];
3515 134425950 : pDest[11 * dstStride] = pSrc[11 * srcStride];
3516 134425950 : pDest[12 * dstStride] = pSrc[12 * srcStride];
3517 134425950 : pDest[13 * dstStride] = pSrc[13 * srcStride];
3518 134425950 : pDest[14 * dstStride] = pSrc[14 * srcStride];
3519 134425950 : pDest[15 * dstStride] = pSrc[15 * srcStride];
3520 134425950 : pDest += 16 * dstStride;
3521 134425950 : pSrc += 16 * srcStride;
3522 : }
3523 3139185 : nIters = nIters % 16;
3524 : }
3525 5508278 : for (GPtrDiff_t i = 0; i < nIters; i++)
3526 : {
3527 2241985 : pDest[i * dstStride] = *pSrc;
3528 2241985 : pSrc += srcStride;
3529 : }
3530 3266290 : }
3531 :
3532 : template <class T, int srcStride, int dstStride>
3533 3260190 : static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
3534 : const T *CPL_RESTRICT pSrc,
3535 : GPtrDiff_t nIters)
3536 : {
3537 3260190 : GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
3538 3260200 : }
3539 :
3540 : #ifdef HAVE_SSE2
3541 :
3542 : template <>
3543 352920 : void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
3544 : const GByte *CPL_RESTRICT pSrc,
3545 : GPtrDiff_t nIters)
3546 : {
3547 352920 : decltype(nIters) i = 0;
3548 352920 : if (nIters > 16)
3549 : {
3550 194667 : const __m128i xmm_mask = _mm_set1_epi16(0xff);
3551 : // If we were sure that there would always be 1 trailing byte, we could
3552 : // check against nIters - 15
3553 2988110 : for (; i < nIters - 16; i += 16)
3554 : {
3555 : __m128i xmm0 =
3556 2793440 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
3557 : __m128i xmm1 =
3558 5586890 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
3559 : // Set higher 8bit of each int16 packed word to 0
3560 2793440 : xmm0 = _mm_and_si128(xmm0, xmm_mask);
3561 2793440 : xmm1 = _mm_and_si128(xmm1, xmm_mask);
3562 : // Pack int16 to uint8 and merge back both vector
3563 2793440 : xmm0 = _mm_packus_epi16(xmm0, xmm1);
3564 :
3565 : // Store result
3566 2793440 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
3567 :
3568 2793440 : pSrc += 2 * 16;
3569 : }
3570 : }
3571 4619940 : for (; i < nIters; i++)
3572 : {
3573 4267020 : pDest[i] = *pSrc;
3574 4267020 : pSrc += 2;
3575 : }
3576 352920 : }
3577 :
3578 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
3579 :
3580 : template <>
3581 191860 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
3582 : const GByte *CPL_RESTRICT pSrc,
3583 : GPtrDiff_t nIters)
3584 : {
3585 191860 : if (nIters > 16 && CPLHaveRuntimeSSSE3())
3586 : {
3587 185760 : GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
3588 : }
3589 : else
3590 : {
3591 6100 : GDALUnrolledCopyGeneric<GByte, 3, 1>(pDest, pSrc, nIters);
3592 : }
3593 191860 : }
3594 :
3595 : #endif
3596 :
3597 : template <>
3598 106509 : void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
3599 : const GByte *CPL_RESTRICT pSrc,
3600 : GPtrDiff_t nIters)
3601 : {
3602 106509 : decltype(nIters) i = 0;
3603 106509 : if (nIters > 16)
3604 : {
3605 101217 : const __m128i xmm_mask = _mm_set1_epi32(0xff);
3606 : // If we were sure that there would always be 3 trailing bytes, we could
3607 : // check against nIters - 15
3608 10393300 : for (; i < nIters - 16; i += 16)
3609 : {
3610 : __m128i xmm0 =
3611 10289700 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
3612 : __m128i xmm1 =
3613 10289700 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
3614 : __m128i xmm2 =
3615 10289700 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
3616 : __m128i xmm3 =
3617 20579500 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 48));
3618 : // Set higher 24bit of each int32 packed word to 0
3619 10289700 : xmm0 = _mm_and_si128(xmm0, xmm_mask);
3620 10289700 : xmm1 = _mm_and_si128(xmm1, xmm_mask);
3621 10289700 : xmm2 = _mm_and_si128(xmm2, xmm_mask);
3622 10289700 : xmm3 = _mm_and_si128(xmm3, xmm_mask);
3623 : // Pack int32 to int16
3624 10291000 : xmm0 = _mm_packs_epi32(xmm0, xmm1);
3625 10290600 : xmm2 = _mm_packs_epi32(xmm2, xmm3);
3626 : // Pack int16 to uint8
3627 10292100 : xmm0 = _mm_packus_epi16(xmm0, xmm2);
3628 :
3629 : // Store result
3630 10292100 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
3631 :
3632 10292100 : pSrc += 4 * 16;
3633 : }
3634 : }
3635 1141460 : for (; i < nIters; i++)
3636 : {
3637 1032610 : pDest[i] = *pSrc;
3638 1032610 : pSrc += 4;
3639 : }
3640 108848 : }
3641 : #endif // HAVE_SSE2
3642 :
3643 : /************************************************************************/
3644 : /* GDALFastCopy() */
3645 : /************************************************************************/
3646 :
3647 : template <class T>
3648 40426400 : static inline void GDALFastCopy(T *CPL_RESTRICT pDest, int nDestStride,
3649 : const T *CPL_RESTRICT pSrc, int nSrcStride,
3650 : GPtrDiff_t nIters)
3651 : {
3652 40426400 : constexpr int sizeofT = static_cast<int>(sizeof(T));
3653 40426400 : if (nIters == 1)
3654 : {
3655 22109490 : *pDest = *pSrc;
3656 : }
3657 18317022 : else if (nDestStride == sizeofT)
3658 : {
3659 14977990 : if (nSrcStride == sizeofT)
3660 : {
3661 14126028 : memcpy(pDest, pSrc, nIters * sizeof(T));
3662 : }
3663 852021 : else if (nSrcStride == 2 * sizeofT)
3664 : {
3665 355879 : GDALUnrolledCopy<T, 2, 1>(pDest, pSrc, nIters);
3666 : }
3667 496142 : else if (nSrcStride == 3 * sizeofT)
3668 : {
3669 288432 : GDALUnrolledCopy<T, 3, 1>(pDest, pSrc, nIters);
3670 : }
3671 207710 : else if (nSrcStride == 4 * sizeofT)
3672 : {
3673 110491 : GDALUnrolledCopy<T, 4, 1>(pDest, pSrc, nIters);
3674 : }
3675 : else
3676 : {
3677 17153700 : while (nIters-- > 0)
3678 : {
3679 17056450 : *pDest = *pSrc;
3680 17056450 : pSrc += nSrcStride / sizeofT;
3681 17056450 : pDest++;
3682 : }
3683 : }
3684 : }
3685 3338962 : else if (nSrcStride == sizeofT)
3686 : {
3687 3320206 : if (nDestStride == 2 * sizeofT)
3688 : {
3689 158669 : GDALUnrolledCopy<T, 1, 2>(pDest, pSrc, nIters);
3690 : }
3691 3161535 : else if (nDestStride == 3 * sizeofT)
3692 : {
3693 2275351 : GDALUnrolledCopy<T, 1, 3>(pDest, pSrc, nIters);
3694 : }
3695 886185 : else if (nDestStride == 4 * sizeofT)
3696 : {
3697 722657 : GDALUnrolledCopy<T, 1, 4>(pDest, pSrc, nIters);
3698 : }
3699 : else
3700 : {
3701 17105960 : while (nIters-- > 0)
3702 : {
3703 16942410 : *pDest = *pSrc;
3704 16942410 : pSrc++;
3705 16942410 : pDest += nDestStride / sizeofT;
3706 : }
3707 : }
3708 : }
3709 : else
3710 : {
3711 1225858 : while (nIters-- > 0)
3712 : {
3713 1207102 : *pDest = *pSrc;
3714 1207102 : pSrc += nSrcStride / sizeofT;
3715 1207102 : pDest += nDestStride / sizeofT;
3716 : }
3717 : }
3718 40426500 : }
3719 :
3720 : /************************************************************************/
3721 : /* GDALFastCopyByte() */
3722 : /************************************************************************/
3723 :
3724 326246 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
3725 : int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
3726 : int nDstPixelStride, GPtrDiff_t nWordCount)
3727 : {
3728 326246 : GDALFastCopy(pDstData, nDstPixelStride, pSrcData, nSrcPixelStride,
3729 : nWordCount);
3730 326246 : }
3731 :
3732 : /************************************************************************/
3733 : /* GDALCopyWords() */
3734 : /************************************************************************/
3735 :
3736 : /**
3737 : * Copy pixel words from buffer to buffer.
3738 : *
3739 : * @see GDALCopyWords64()
3740 : */
3741 87227200 : void CPL_STDCALL GDALCopyWords(const void *CPL_RESTRICT pSrcData,
3742 : GDALDataType eSrcType, int nSrcPixelStride,
3743 : void *CPL_RESTRICT pDstData,
3744 : GDALDataType eDstType, int nDstPixelStride,
3745 : int nWordCount)
3746 : {
3747 87227200 : GDALCopyWords64(pSrcData, eSrcType, nSrcPixelStride, pDstData, eDstType,
3748 : nDstPixelStride, nWordCount);
3749 87219000 : }
3750 :
3751 : /************************************************************************/
3752 : /* GDALCopyWords64() */
3753 : /************************************************************************/
3754 :
3755 : /**
3756 : * Copy pixel words from buffer to buffer.
3757 : *
3758 : * This function is used to copy pixel word values from one memory buffer
3759 : * to another, with support for conversion between data types, and differing
3760 : * step factors. The data type conversion is done using the following
3761 : * rules:
3762 : * <ul>
3763 : * <li>Values assigned to a lower range integer type are clipped. For
3764 : * instance assigning GDT_Int16 values to a GDT_Byte buffer will cause values
3765 : * less than 0 to be set to 0, and values larger than 255 to be set to 255.
3766 : * </li>
3767 : * <li>
3768 : * Assignment from floating point to integer rounds to closest integer.
3769 : * +Infinity is mapped to the largest integer. -Infinity is mapped to the
3770 : * smallest integer. NaN is mapped to 0.
3771 : * </li>
3772 : * <li>
3773 : * Assignment from non-complex to complex will result in the imaginary part
3774 : * being set to zero on output.
3775 : * </li>
3776 : * <li> Assignment from complex to
3777 : * non-complex will result in the complex portion being lost and the real
3778 : * component being preserved (<i>not magnitude!</i>).
3779 : * </li>
3780 : * </ul>
3781 : *
3782 : * No assumptions are made about the source or destination words occurring
3783 : * on word boundaries. It is assumed that all values are in native machine
3784 : * byte order.
3785 : *
3786 : * @param pSrcData Pointer to source data to be converted.
3787 : * @param eSrcType the source data type (see GDALDataType enum)
3788 : * @param nSrcPixelStride Source pixel stride (i.e. distance between 2 words),
3789 : * in bytes
3790 : * @param pDstData Pointer to buffer where destination data should go
3791 : * @param eDstType the destination data type (see GDALDataType enum)
3792 : * @param nDstPixelStride Destination pixel stride (i.e. distance between 2
3793 : * words), in bytes
3794 : * @param nWordCount number of words to be copied
3795 : *
3796 : * @note
3797 : * When adding a new data type to GDAL, you must do the following to
3798 : * support it properly within the GDALCopyWords function:
3799 : * 1. Add the data type to the switch on eSrcType in GDALCopyWords.
3800 : * This should invoke the appropriate GDALCopyWordsFromT wrapper.
3801 : * 2. Add the data type to the switch on eDstType in GDALCopyWordsFromT.
3802 : * This should call the appropriate GDALCopyWordsT template.
3803 : * 3. If appropriate, overload the appropriate CopyWord template in the
3804 : * above namespace. This will ensure that any conversion issues are
3805 : * handled (cases like the float -> int32 case, where the min/max
3806 : * values are subject to roundoff error).
3807 : */
3808 :
3809 109318000 : void CPL_STDCALL GDALCopyWords64(const void *CPL_RESTRICT pSrcData,
3810 : GDALDataType eSrcType, int nSrcPixelStride,
3811 : void *CPL_RESTRICT pDstData,
3812 : GDALDataType eDstType, int nDstPixelStride,
3813 : GPtrDiff_t nWordCount)
3814 :
3815 : {
3816 : // On platforms where alignment matters, be careful
3817 109318000 : const int nSrcDataTypeSize = GDALGetDataTypeSizeBytes(eSrcType);
3818 109283000 : const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(eDstType);
3819 109287000 : if (CPL_UNLIKELY(nSrcDataTypeSize == 0 || nDstDataTypeSize == 0))
3820 : {
3821 2 : CPLError(CE_Failure, CPLE_NotSupported,
3822 : "GDALCopyWords64(): unsupported GDT_Unknown/GDT_TypeCount "
3823 : "argument");
3824 2 : return;
3825 : }
3826 109287000 : if (!(eSrcType == eDstType && nSrcPixelStride == nDstPixelStride) &&
3827 59019900 : ((reinterpret_cast<uintptr_t>(pSrcData) % nSrcDataTypeSize) != 0 ||
3828 59027600 : (reinterpret_cast<uintptr_t>(pDstData) % nDstDataTypeSize) != 0 ||
3829 59018000 : (nSrcPixelStride % nSrcDataTypeSize) != 0 ||
3830 59017300 : (nDstPixelStride % nDstDataTypeSize) != 0))
3831 : {
3832 905 : if (eSrcType == eDstType)
3833 : {
3834 34800 : for (decltype(nWordCount) i = 0; i < nWordCount; i++)
3835 : {
3836 34000 : memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
3837 : static_cast<const GByte *>(pSrcData) +
3838 34000 : nSrcPixelStride * i,
3839 : nDstDataTypeSize);
3840 : }
3841 : }
3842 : else
3843 : {
3844 210 : const auto getAlignedPtr = [](GByte *ptr, int align)
3845 : {
3846 : return ptr +
3847 210 : ((align - (reinterpret_cast<uintptr_t>(ptr) % align)) %
3848 210 : align);
3849 : };
3850 :
3851 : // The largest we need is for CFloat64 (16 bytes), so 32 bytes to
3852 : // be sure to get correctly aligned pointer.
3853 105 : constexpr size_t SIZEOF_CFLOAT64 = 2 * sizeof(double);
3854 : GByte abySrcBuffer[2 * SIZEOF_CFLOAT64];
3855 : GByte abyDstBuffer[2 * SIZEOF_CFLOAT64];
3856 : GByte *pabySrcBuffer =
3857 105 : getAlignedPtr(abySrcBuffer, nSrcDataTypeSize);
3858 : GByte *pabyDstBuffer =
3859 105 : getAlignedPtr(abyDstBuffer, nDstDataTypeSize);
3860 3360 : for (decltype(nWordCount) i = 0; i < nWordCount; i++)
3861 : {
3862 3255 : memcpy(pabySrcBuffer,
3863 : static_cast<const GByte *>(pSrcData) +
3864 3255 : nSrcPixelStride * i,
3865 : nSrcDataTypeSize);
3866 3255 : GDALCopyWords64(pabySrcBuffer, eSrcType, 0, pabyDstBuffer,
3867 : eDstType, 0, 1);
3868 3255 : memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
3869 : pabyDstBuffer, nDstDataTypeSize);
3870 : }
3871 : }
3872 905 : return;
3873 : }
3874 :
3875 : // Deal with the case where we're replicating a single word into the
3876 : // provided buffer
3877 109287000 : if (nSrcPixelStride == 0 && nWordCount > 1)
3878 : {
3879 1011010 : GDALReplicateWord(pSrcData, eSrcType, pDstData, eDstType,
3880 : nDstPixelStride, nWordCount);
3881 1011700 : return;
3882 : }
3883 :
3884 108276000 : if (eSrcType == eDstType)
3885 : {
3886 54380600 : if (eSrcType == GDT_Byte || eSrcType == GDT_Int8)
3887 : {
3888 19286300 : GDALFastCopy(static_cast<GByte *>(pDstData), nDstPixelStride,
3889 : static_cast<const GByte *>(pSrcData), nSrcPixelStride,
3890 : nWordCount);
3891 19289300 : return;
3892 : }
3893 :
3894 35094400 : if (nSrcDataTypeSize == 2 && (nSrcPixelStride % 2) == 0 &&
3895 20809900 : (nDstPixelStride % 2) == 0)
3896 : {
3897 20809900 : GDALFastCopy(static_cast<short *>(pDstData), nDstPixelStride,
3898 : static_cast<const short *>(pSrcData), nSrcPixelStride,
3899 : nWordCount);
3900 20809700 : return;
3901 : }
3902 :
3903 14284500 : if (nWordCount == 1)
3904 : {
3905 : #if defined(CSA_BUILD) || defined(__COVERITY__)
3906 : // Avoid false positives...
3907 : memcpy(pDstData, pSrcData, nSrcDataTypeSize);
3908 : #else
3909 13892900 : if (nSrcDataTypeSize == 2)
3910 0 : memcpy(pDstData, pSrcData, 2);
3911 13892900 : else if (nSrcDataTypeSize == 4)
3912 13809200 : memcpy(pDstData, pSrcData, 4);
3913 83682 : else if (nSrcDataTypeSize == 8)
3914 67162 : memcpy(pDstData, pSrcData, 8);
3915 : else /* if( eSrcType == GDT_CFloat64 ) */
3916 16520 : memcpy(pDstData, pSrcData, 16);
3917 : #endif
3918 13892900 : return;
3919 : }
3920 :
3921 : // Let memcpy() handle the case where we're copying a packed buffer
3922 : // of pixels.
3923 391667 : if (nSrcPixelStride == nDstPixelStride)
3924 : {
3925 263680 : if (nSrcPixelStride == nSrcDataTypeSize)
3926 : {
3927 263612 : memcpy(pDstData, pSrcData, nWordCount * nSrcDataTypeSize);
3928 263612 : return;
3929 : }
3930 : }
3931 : }
3932 :
3933 : // Handle the more general case -- deals with conversion of data types
3934 : // directly.
3935 54023000 : switch (eSrcType)
3936 : {
3937 15463800 : case GDT_Byte:
3938 15463800 : GDALCopyWordsFromT<unsigned char>(
3939 : static_cast<const unsigned char *>(pSrcData), nSrcPixelStride,
3940 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
3941 15505600 : break;
3942 1225 : case GDT_Int8:
3943 1225 : GDALCopyWordsFromT<signed char>(
3944 : static_cast<const signed char *>(pSrcData), nSrcPixelStride,
3945 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
3946 1225 : break;
3947 53348 : case GDT_UInt16:
3948 53348 : GDALCopyWordsFromT<unsigned short>(
3949 : static_cast<const unsigned short *>(pSrcData), nSrcPixelStride,
3950 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
3951 53348 : break;
3952 4350270 : case GDT_Int16:
3953 4350270 : GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
3954 : nSrcPixelStride, false, pDstData,
3955 : eDstType, nDstPixelStride, nWordCount);
3956 4350270 : break;
3957 7092 : case GDT_UInt32:
3958 7092 : GDALCopyWordsFromT<unsigned int>(
3959 : static_cast<const unsigned int *>(pSrcData), nSrcPixelStride,
3960 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
3961 7092 : break;
3962 12255000 : case GDT_Int32:
3963 12255000 : GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
3964 : nSrcPixelStride, false, pDstData, eDstType,
3965 : nDstPixelStride, nWordCount);
3966 12255000 : break;
3967 1635 : case GDT_UInt64:
3968 1635 : GDALCopyWordsFromT<std::uint64_t>(
3969 : static_cast<const std::uint64_t *>(pSrcData), nSrcPixelStride,
3970 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
3971 1635 : break;
3972 10978 : case GDT_Int64:
3973 10978 : GDALCopyWordsFromT<std::int64_t>(
3974 : static_cast<const std::int64_t *>(pSrcData), nSrcPixelStride,
3975 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
3976 10978 : break;
3977 1074 : case GDT_Float16:
3978 1074 : GDALCopyWordsFromT<GFloat16>(
3979 : static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, false,
3980 : pDstData, eDstType, nDstPixelStride, nWordCount);
3981 1074 : break;
3982 438218 : case GDT_Float32:
3983 438218 : GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
3984 : nSrcPixelStride, false, pDstData,
3985 : eDstType, nDstPixelStride, nWordCount);
3986 438210 : break;
3987 20666800 : case GDT_Float64:
3988 20666800 : GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
3989 : nSrcPixelStride, false, pDstData,
3990 : eDstType, nDstPixelStride, nWordCount);
3991 20666800 : break;
3992 478141 : case GDT_CInt16:
3993 478141 : GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
3994 : nSrcPixelStride, true, pDstData, eDstType,
3995 : nDstPixelStride, nWordCount);
3996 478141 : break;
3997 556 : case GDT_CInt32:
3998 556 : GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
3999 : nSrcPixelStride, true, pDstData, eDstType,
4000 : nDstPixelStride, nWordCount);
4001 556 : break;
4002 396 : case GDT_CFloat16:
4003 396 : GDALCopyWordsFromT<GFloat16>(
4004 : static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, true,
4005 : pDstData, eDstType, nDstPixelStride, nWordCount);
4006 396 : break;
4007 1547 : case GDT_CFloat32:
4008 1547 : GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
4009 : nSrcPixelStride, true, pDstData, eDstType,
4010 : nDstPixelStride, nWordCount);
4011 1547 : break;
4012 276418 : case GDT_CFloat64:
4013 276418 : GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
4014 : nSrcPixelStride, true, pDstData,
4015 : eDstType, nDstPixelStride, nWordCount);
4016 276418 : break;
4017 0 : case GDT_Unknown:
4018 : case GDT_TypeCount:
4019 0 : CPLAssert(false);
4020 : }
4021 : }
4022 :
4023 : /************************************************************************/
4024 : /* GDALCopyBits() */
4025 : /************************************************************************/
4026 :
4027 : /**
4028 : * Bitwise word copying.
4029 : *
4030 : * A function for moving sets of partial bytes around. Loosely
4031 : * speaking this is a bitwise analog to GDALCopyWords().
4032 : *
4033 : * It copies nStepCount "words" where each word is nBitCount bits long.
4034 : * The nSrcStep and nDstStep are the number of bits from the start of one
4035 : * word to the next (same as nBitCount if they are packed). The nSrcOffset
4036 : * and nDstOffset are the offset into the source and destination buffers
4037 : * to start at, also measured in bits.
4038 : *
4039 : * All bit offsets are assumed to start from the high order bit in a byte
4040 : * (i.e. most significant bit first). Currently this function is not very
4041 : * optimized, but it may be improved for some common cases in the future
4042 : * as needed.
4043 : *
4044 : * @param pabySrcData the source data buffer.
4045 : * @param nSrcOffset the offset (in bits) in pabySrcData to the start of the
4046 : * first word to copy.
4047 : * @param nSrcStep the offset in bits from the start of one source word to the
4048 : * start of the next.
4049 : * @param pabyDstData the destination data buffer.
4050 : * @param nDstOffset the offset (in bits) in pabyDstData to the start of the
4051 : * first word to copy over.
4052 : * @param nDstStep the offset in bits from the start of one word to the
4053 : * start of the next.
4054 : * @param nBitCount the number of bits in a word to be copied.
4055 : * @param nStepCount the number of words to copy.
4056 : */
4057 :
4058 0 : void GDALCopyBits(const GByte *pabySrcData, int nSrcOffset, int nSrcStep,
4059 : GByte *pabyDstData, int nDstOffset, int nDstStep,
4060 : int nBitCount, int nStepCount)
4061 :
4062 : {
4063 0 : VALIDATE_POINTER0(pabySrcData, "GDALCopyBits");
4064 :
4065 0 : for (int iStep = 0; iStep < nStepCount; iStep++)
4066 : {
4067 0 : for (int iBit = 0; iBit < nBitCount; iBit++)
4068 : {
4069 0 : if (pabySrcData[nSrcOffset >> 3] & (0x80 >> (nSrcOffset & 7)))
4070 0 : pabyDstData[nDstOffset >> 3] |= (0x80 >> (nDstOffset & 7));
4071 : else
4072 0 : pabyDstData[nDstOffset >> 3] &= ~(0x80 >> (nDstOffset & 7));
4073 :
4074 0 : nSrcOffset++;
4075 0 : nDstOffset++;
4076 : }
4077 :
4078 0 : nSrcOffset += (nSrcStep - nBitCount);
4079 0 : nDstOffset += (nDstStep - nBitCount);
4080 : }
4081 : }
4082 :
4083 : /************************************************************************/
4084 : /* GDALGetBestOverviewLevel() */
4085 : /* */
4086 : /* Returns the best overview level to satisfy the query or -1 if none */
4087 : /* Also updates nXOff, nYOff, nXSize, nYSize and psExtraArg when */
4088 : /* returning a valid overview level */
4089 : /************************************************************************/
4090 :
4091 0 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand, int &nXOff, int &nYOff,
4092 : int &nXSize, int &nYSize, int nBufXSize,
4093 : int nBufYSize)
4094 : {
4095 0 : return GDALBandGetBestOverviewLevel2(poBand, nXOff, nYOff, nXSize, nYSize,
4096 0 : nBufXSize, nBufYSize, nullptr);
4097 : }
4098 :
4099 523846 : int GDALBandGetBestOverviewLevel2(GDALRasterBand *poBand, int &nXOff,
4100 : int &nYOff, int &nXSize, int &nYSize,
4101 : int nBufXSize, int nBufYSize,
4102 : GDALRasterIOExtraArg *psExtraArg)
4103 : {
4104 523846 : if (psExtraArg != nullptr && psExtraArg->nVersion > 1 &&
4105 523846 : psExtraArg->bUseOnlyThisScale)
4106 109 : return -1;
4107 : /* -------------------------------------------------------------------- */
4108 : /* Compute the desired downsampling factor. It is */
4109 : /* based on the least reduced axis, and represents the number */
4110 : /* of source pixels to one destination pixel. */
4111 : /* -------------------------------------------------------------------- */
4112 523737 : const double dfDesiredDownsamplingFactor =
4113 523737 : ((nXSize / static_cast<double>(nBufXSize)) <
4114 361399 : (nYSize / static_cast<double>(nBufYSize)) ||
4115 : nBufYSize == 1)
4116 752122 : ? nXSize / static_cast<double>(nBufXSize)
4117 133014 : : nYSize / static_cast<double>(nBufYSize);
4118 :
4119 : /* -------------------------------------------------------------------- */
4120 : /* Find the overview level that largest downsampling factor (most */
4121 : /* downsampled) that is still less than (or only a little more) */
4122 : /* downsampled than the request. */
4123 : /* -------------------------------------------------------------------- */
4124 523737 : const int nOverviewCount = poBand->GetOverviewCount();
4125 523737 : GDALRasterBand *poBestOverview = nullptr;
4126 523737 : double dfBestDownsamplingFactor = 0;
4127 523737 : int nBestOverviewLevel = -1;
4128 :
4129 : const char *pszOversampligThreshold =
4130 523737 : CPLGetConfigOption("GDAL_OVERVIEW_OVERSAMPLING_THRESHOLD", nullptr);
4131 :
4132 : // Note: keep this logic for overview selection in sync between
4133 : // gdalwarp_lib.cpp and rasterio.cpp
4134 : // Cf https://github.com/OSGeo/gdal/pull/9040#issuecomment-1898524693
4135 : const double dfOversamplingThreshold =
4136 1047460 : pszOversampligThreshold ? CPLAtof(pszOversampligThreshold)
4137 523728 : : psExtraArg && psExtraArg->eResampleAlg != GRIORA_NearestNeighbour
4138 1047460 : ? 1.0
4139 523737 : : 1.2;
4140 526432 : for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
4141 : {
4142 5547 : GDALRasterBand *poOverview = poBand->GetOverview(iOverview);
4143 11094 : if (poOverview == nullptr ||
4144 11093 : poOverview->GetXSize() > poBand->GetXSize() ||
4145 5546 : poOverview->GetYSize() > poBand->GetYSize())
4146 : {
4147 1 : continue;
4148 : }
4149 :
4150 : // Compute downsampling factor of this overview
4151 : const double dfDownsamplingFactor = std::min(
4152 5546 : poBand->GetXSize() / static_cast<double>(poOverview->GetXSize()),
4153 11092 : poBand->GetYSize() / static_cast<double>(poOverview->GetYSize()));
4154 :
4155 : // Is it nearly the requested factor and better (lower) than
4156 : // the current best factor?
4157 : // Use an epsilon because of numerical instability.
4158 5546 : constexpr double EPSILON = 1e-1;
4159 5654 : if (dfDownsamplingFactor >=
4160 5546 : dfDesiredDownsamplingFactor * dfOversamplingThreshold +
4161 5438 : EPSILON ||
4162 : dfDownsamplingFactor <= dfBestDownsamplingFactor)
4163 : {
4164 108 : continue;
4165 : }
4166 :
4167 : // Ignore AVERAGE_BIT2GRAYSCALE overviews for RasterIO purposes.
4168 5438 : const char *pszResampling = poOverview->GetMetadataItem("RESAMPLING");
4169 :
4170 5438 : if (pszResampling != nullptr &&
4171 71 : STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2"))
4172 16 : continue;
4173 :
4174 : // OK, this is our new best overview.
4175 5422 : poBestOverview = poOverview;
4176 5422 : nBestOverviewLevel = iOverview;
4177 5422 : dfBestDownsamplingFactor = dfDownsamplingFactor;
4178 :
4179 5422 : if (std::abs(dfDesiredDownsamplingFactor - dfDownsamplingFactor) <
4180 : EPSILON)
4181 : {
4182 2852 : break;
4183 : }
4184 : }
4185 :
4186 : /* -------------------------------------------------------------------- */
4187 : /* If we didn't find an overview that helps us, just return */
4188 : /* indicating failure and the full resolution image will be used. */
4189 : /* -------------------------------------------------------------------- */
4190 523737 : if (nBestOverviewLevel < 0)
4191 520813 : return -1;
4192 :
4193 : /* -------------------------------------------------------------------- */
4194 : /* Recompute the source window in terms of the selected */
4195 : /* overview. */
4196 : /* -------------------------------------------------------------------- */
4197 : const double dfXFactor =
4198 2924 : poBand->GetXSize() / static_cast<double>(poBestOverview->GetXSize());
4199 : const double dfYFactor =
4200 2924 : poBand->GetYSize() / static_cast<double>(poBestOverview->GetYSize());
4201 2924 : CPLDebug("GDAL", "Selecting overview %d x %d", poBestOverview->GetXSize(),
4202 : poBestOverview->GetYSize());
4203 :
4204 8772 : const int nOXOff = std::min(poBestOverview->GetXSize() - 1,
4205 2924 : static_cast<int>(nXOff / dfXFactor + 0.5));
4206 8772 : const int nOYOff = std::min(poBestOverview->GetYSize() - 1,
4207 2924 : static_cast<int>(nYOff / dfYFactor + 0.5));
4208 2924 : int nOXSize = std::max(1, static_cast<int>(nXSize / dfXFactor + 0.5));
4209 2924 : int nOYSize = std::max(1, static_cast<int>(nYSize / dfYFactor + 0.5));
4210 2924 : if (nOXOff + nOXSize > poBestOverview->GetXSize())
4211 0 : nOXSize = poBestOverview->GetXSize() - nOXOff;
4212 2924 : if (nOYOff + nOYSize > poBestOverview->GetYSize())
4213 2 : nOYSize = poBestOverview->GetYSize() - nOYOff;
4214 :
4215 2924 : if (psExtraArg)
4216 : {
4217 2924 : if (psExtraArg->bFloatingPointWindowValidity)
4218 : {
4219 50 : psExtraArg->dfXOff /= dfXFactor;
4220 50 : psExtraArg->dfXSize /= dfXFactor;
4221 50 : psExtraArg->dfYOff /= dfYFactor;
4222 50 : psExtraArg->dfYSize /= dfYFactor;
4223 : }
4224 2874 : else if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
4225 : {
4226 16 : psExtraArg->bFloatingPointWindowValidity = true;
4227 16 : psExtraArg->dfXOff = nXOff / dfXFactor;
4228 16 : psExtraArg->dfXSize = nXSize / dfXFactor;
4229 16 : psExtraArg->dfYOff = nYOff / dfYFactor;
4230 16 : psExtraArg->dfYSize = nYSize / dfYFactor;
4231 : }
4232 : }
4233 :
4234 2924 : nXOff = nOXOff;
4235 2924 : nYOff = nOYOff;
4236 2924 : nXSize = nOXSize;
4237 2924 : nYSize = nOYSize;
4238 :
4239 2924 : return nBestOverviewLevel;
4240 : }
4241 :
4242 : /************************************************************************/
4243 : /* OverviewRasterIO() */
4244 : /* */
4245 : /* Special work function to utilize available overviews to */
4246 : /* more efficiently satisfy downsampled requests. It will */
4247 : /* return CE_Failure if there are no appropriate overviews */
4248 : /* available but it doesn't emit any error messages. */
4249 : /************************************************************************/
4250 :
4251 : //! @cond Doxygen_Suppress
4252 2 : CPLErr GDALRasterBand::OverviewRasterIO(
4253 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4254 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4255 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
4256 :
4257 : {
4258 : GDALRasterIOExtraArg sExtraArg;
4259 2 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4260 :
4261 2 : const int nOverview = GDALBandGetBestOverviewLevel2(
4262 : this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, &sExtraArg);
4263 2 : if (nOverview < 0)
4264 1 : return CE_Failure;
4265 :
4266 : /* -------------------------------------------------------------------- */
4267 : /* Recast the call in terms of the new raster layer. */
4268 : /* -------------------------------------------------------------------- */
4269 1 : GDALRasterBand *poOverviewBand = GetOverview(nOverview);
4270 1 : if (poOverviewBand == nullptr)
4271 0 : return CE_Failure;
4272 :
4273 1 : return poOverviewBand->RasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
4274 : pData, nBufXSize, nBufYSize, eBufType,
4275 1 : nPixelSpace, nLineSpace, &sExtraArg);
4276 : }
4277 :
4278 : /************************************************************************/
4279 : /* TryOverviewRasterIO() */
4280 : /************************************************************************/
4281 :
4282 362416 : CPLErr GDALRasterBand::TryOverviewRasterIO(
4283 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4284 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4285 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg,
4286 : int *pbTried)
4287 : {
4288 362416 : int nXOffMod = nXOff;
4289 362416 : int nYOffMod = nYOff;
4290 362416 : int nXSizeMod = nXSize;
4291 362416 : int nYSizeMod = nYSize;
4292 : GDALRasterIOExtraArg sExtraArg;
4293 :
4294 362416 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4295 :
4296 362416 : int iOvrLevel = GDALBandGetBestOverviewLevel2(
4297 : this, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize, nBufYSize,
4298 : &sExtraArg);
4299 :
4300 362416 : if (iOvrLevel >= 0)
4301 : {
4302 50 : GDALRasterBand *poOverviewBand = GetOverview(iOvrLevel);
4303 50 : if (poOverviewBand)
4304 : {
4305 50 : *pbTried = TRUE;
4306 50 : return poOverviewBand->RasterIO(
4307 : eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData,
4308 : nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
4309 50 : &sExtraArg);
4310 : }
4311 : }
4312 :
4313 362366 : *pbTried = FALSE;
4314 362366 : return CE_None;
4315 : }
4316 :
4317 : /************************************************************************/
4318 : /* TryOverviewRasterIO() */
4319 : /************************************************************************/
4320 :
4321 158519 : CPLErr GDALDataset::TryOverviewRasterIO(
4322 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4323 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4324 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
4325 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg,
4326 : int *pbTried)
4327 : {
4328 158519 : int nXOffMod = nXOff;
4329 158519 : int nYOffMod = nYOff;
4330 158519 : int nXSizeMod = nXSize;
4331 158519 : int nYSizeMod = nYSize;
4332 : GDALRasterIOExtraArg sExtraArg;
4333 158519 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4334 :
4335 317038 : int iOvrLevel = GDALBandGetBestOverviewLevel2(
4336 158519 : papoBands[0], nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize,
4337 : nBufYSize, &sExtraArg);
4338 :
4339 158560 : if (iOvrLevel >= 0 && papoBands[0]->GetOverview(iOvrLevel) != nullptr &&
4340 41 : papoBands[0]->GetOverview(iOvrLevel)->GetDataset() != nullptr)
4341 : {
4342 41 : *pbTried = TRUE;
4343 41 : return papoBands[0]->GetOverview(iOvrLevel)->GetDataset()->RasterIO(
4344 : eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData, nBufXSize,
4345 : nBufYSize, eBufType, nBandCount, panBandMap, nPixelSpace,
4346 41 : nLineSpace, nBandSpace, &sExtraArg);
4347 : }
4348 : else
4349 : {
4350 158478 : *pbTried = FALSE;
4351 158478 : return CE_None;
4352 : }
4353 : }
4354 :
4355 : /************************************************************************/
4356 : /* GetBestOverviewLevel() */
4357 : /* */
4358 : /* Returns the best overview level to satisfy the query or -1 if none */
4359 : /* Also updates nXOff, nYOff, nXSize, nYSize when returning a valid */
4360 : /* overview level */
4361 : /************************************************************************/
4362 :
4363 4 : static int GDALDatasetGetBestOverviewLevel(GDALDataset *poDS, int &nXOff,
4364 : int &nYOff, int &nXSize, int &nYSize,
4365 : int nBufXSize, int nBufYSize,
4366 : int nBandCount,
4367 : const int *panBandMap,
4368 : GDALRasterIOExtraArg *psExtraArg)
4369 : {
4370 4 : int nOverviewCount = 0;
4371 4 : GDALRasterBand *poFirstBand = nullptr;
4372 :
4373 : /* -------------------------------------------------------------------- */
4374 : /* Check that all bands have the same number of overviews and */
4375 : /* that they have all the same size and block dimensions */
4376 : /* -------------------------------------------------------------------- */
4377 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
4378 : {
4379 8 : GDALRasterBand *poBand = poDS->GetRasterBand(panBandMap[iBand]);
4380 8 : if (poBand == nullptr)
4381 0 : return -1;
4382 8 : if (iBand == 0)
4383 : {
4384 4 : poFirstBand = poBand;
4385 4 : nOverviewCount = poBand->GetOverviewCount();
4386 : }
4387 4 : else if (nOverviewCount != poBand->GetOverviewCount())
4388 : {
4389 0 : CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
4390 : "mismatched overview count, use std method.");
4391 0 : return -1;
4392 : }
4393 : else
4394 : {
4395 4 : for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
4396 : {
4397 0 : GDALRasterBand *poOvrBand = poBand->GetOverview(iOverview);
4398 : GDALRasterBand *poOvrFirstBand =
4399 0 : poFirstBand->GetOverview(iOverview);
4400 0 : if (poOvrBand == nullptr || poOvrFirstBand == nullptr)
4401 0 : continue;
4402 :
4403 0 : if (poOvrFirstBand->GetXSize() != poOvrBand->GetXSize() ||
4404 0 : poOvrFirstBand->GetYSize() != poOvrBand->GetYSize())
4405 : {
4406 0 : CPLDebug("GDAL",
4407 : "GDALDataset::GetBestOverviewLevel() ... "
4408 : "mismatched overview sizes, use std method.");
4409 0 : return -1;
4410 : }
4411 0 : int nBlockXSizeFirst = 0;
4412 0 : int nBlockYSizeFirst = 0;
4413 0 : poOvrFirstBand->GetBlockSize(&nBlockXSizeFirst,
4414 : &nBlockYSizeFirst);
4415 :
4416 0 : int nBlockXSizeCurrent = 0;
4417 0 : int nBlockYSizeCurrent = 0;
4418 0 : poOvrBand->GetBlockSize(&nBlockXSizeCurrent,
4419 : &nBlockYSizeCurrent);
4420 :
4421 0 : if (nBlockXSizeFirst != nBlockXSizeCurrent ||
4422 0 : nBlockYSizeFirst != nBlockYSizeCurrent)
4423 : {
4424 0 : CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
4425 : "mismatched block sizes, use std method.");
4426 0 : return -1;
4427 : }
4428 : }
4429 : }
4430 : }
4431 4 : if (poFirstBand == nullptr)
4432 0 : return -1;
4433 :
4434 4 : return GDALBandGetBestOverviewLevel2(poFirstBand, nXOff, nYOff, nXSize,
4435 : nYSize, nBufXSize, nBufYSize,
4436 4 : psExtraArg);
4437 : }
4438 :
4439 : /************************************************************************/
4440 : /* BlockBasedRasterIO() */
4441 : /* */
4442 : /* This convenience function implements a dataset level */
4443 : /* RasterIO() interface based on calling down to fetch blocks, */
4444 : /* much like the GDALRasterBand::IRasterIO(), but it handles */
4445 : /* all bands at once, so that a format driver that handles a */
4446 : /* request for different bands of the same block efficiently */
/*      (i.e. without re-reading interleaved data) will benefit.        */
4448 : /* */
4449 : /* This method is intended to be called by an overridden */
4450 : /* IRasterIO() method in the driver specific GDALDataset */
4451 : /* derived class. */
4452 : /* */
4453 : /* Default internal implementation of RasterIO() ... utilizes */
4454 : /* the Block access methods to satisfy the request. This would */
4455 : /* normally only be overridden by formats with overviews. */
4456 : /* */
4457 : /* To keep things relatively simple, this method does not */
4458 : /* currently take advantage of some special cases addressed in */
4459 : /* GDALRasterBand::IRasterIO(), so it is likely best to only */
4460 : /* call it when you know it will help. That is in cases where */
4461 : /* data is at 1:1 to the buffer, and you know the driver is */
4462 : /* implementing interleaved IO efficiently on a block by block */
4463 : /* basis. Overviews will be used when possible. */
4464 : /************************************************************************/
4465 :
4466 63953 : CPLErr GDALDataset::BlockBasedRasterIO(
4467 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4468 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4469 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
4470 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
4471 :
4472 : {
4473 63953 : CPLAssert(nullptr != pData);
4474 :
4475 63953 : GByte **papabySrcBlock = nullptr;
4476 63953 : GDALRasterBlock *poBlock = nullptr;
4477 63953 : GDALRasterBlock **papoBlocks = nullptr;
4478 63953 : int nLBlockX = -1;
4479 63953 : int nLBlockY = -1;
4480 : int iBufYOff;
4481 : int iBufXOff;
4482 63953 : int nBlockXSize = 1;
4483 63953 : int nBlockYSize = 1;
4484 63953 : CPLErr eErr = CE_None;
4485 63953 : GDALDataType eDataType = GDT_Byte;
4486 :
4487 63953 : const bool bUseIntegerRequestCoords =
4488 63983 : (!psExtraArg->bFloatingPointWindowValidity ||
4489 30 : (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
4490 28 : nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
4491 :
4492 : /* -------------------------------------------------------------------- */
4493 : /* Ensure that all bands share a common block size and data type. */
4494 : /* -------------------------------------------------------------------- */
4495 303180 : for (int iBand = 0; iBand < nBandCount; iBand++)
4496 : {
4497 239225 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
4498 :
4499 239226 : if (iBand == 0)
4500 : {
4501 63954 : poBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
4502 63953 : eDataType = poBand->GetRasterDataType();
4503 : }
4504 : else
4505 : {
4506 175272 : int nThisBlockXSize = 0;
4507 175272 : int nThisBlockYSize = 0;
4508 175272 : poBand->GetBlockSize(&nThisBlockXSize, &nThisBlockYSize);
4509 175272 : if (nThisBlockXSize != nBlockXSize ||
4510 175272 : nThisBlockYSize != nBlockYSize)
4511 : {
4512 0 : CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
4513 : "mismatched block sizes, use std method.");
4514 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
4515 : pData, nBufXSize, nBufYSize, eBufType,
4516 : nBandCount, panBandMap, nPixelSpace,
4517 0 : nLineSpace, nBandSpace, psExtraArg);
4518 : }
4519 :
4520 175272 : if (eDataType != poBand->GetRasterDataType() &&
4521 0 : (nXSize != nBufXSize || nYSize != nBufYSize))
4522 : {
4523 0 : CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
4524 : "mismatched band data types, use std method.");
4525 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
4526 : pData, nBufXSize, nBufYSize, eBufType,
4527 : nBandCount, panBandMap, nPixelSpace,
4528 0 : nLineSpace, nBandSpace, psExtraArg);
4529 : }
4530 : }
4531 : }
4532 :
4533 : /* ==================================================================== */
4534 : /* In this special case at full resolution we step through in */
4535 : /* blocks, turning the request over to the per-band */
4536 : /* IRasterIO(), but ensuring that all bands of one block are */
4537 : /* called before proceeding to the next. */
4538 : /* ==================================================================== */
4539 :
4540 63955 : if (nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
4541 : {
4542 : GDALRasterIOExtraArg sDummyExtraArg;
4543 63951 : INIT_RASTERIO_EXTRA_ARG(sDummyExtraArg);
4544 :
4545 63951 : int nChunkYSize = 0;
4546 63951 : int nChunkXSize = 0;
4547 :
4548 210228 : for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff += nChunkYSize)
4549 : {
4550 147292 : const int nChunkYOff = iBufYOff + nYOff;
4551 147292 : nChunkYSize = nBlockYSize - (nChunkYOff % nBlockYSize);
4552 147292 : if (nChunkYOff + nChunkYSize > nYOff + nYSize)
4553 59131 : nChunkYSize = (nYOff + nYSize) - nChunkYOff;
4554 :
4555 817191 : for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff += nChunkXSize)
4556 : {
4557 670913 : const int nChunkXOff = iBufXOff + nXOff;
4558 670913 : nChunkXSize = nBlockXSize - (nChunkXOff % nBlockXSize);
4559 670913 : if (nChunkXOff + nChunkXSize > nXOff + nXSize)
4560 70247 : nChunkXSize = (nXOff + nXSize) - nChunkXOff;
4561 :
4562 670913 : GByte *pabyChunkData =
4563 670913 : static_cast<GByte *>(pData) + iBufXOff * nPixelSpace +
4564 670913 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace;
4565 :
4566 3267300 : for (int iBand = 0; iBand < nBandCount; iBand++)
4567 : {
4568 2597400 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
4569 :
4570 5194780 : eErr = poBand->IRasterIO(
4571 : eRWFlag, nChunkXOff, nChunkYOff, nChunkXSize,
4572 : nChunkYSize,
4573 2597400 : pabyChunkData +
4574 2597400 : static_cast<GPtrDiff_t>(iBand) * nBandSpace,
4575 : nChunkXSize, nChunkYSize, eBufType, nPixelSpace,
4576 2597400 : nLineSpace, &sDummyExtraArg);
4577 2597390 : if (eErr != CE_None)
4578 1006 : return eErr;
4579 : }
4580 : }
4581 :
4582 165068 : if (psExtraArg->pfnProgress != nullptr &&
4583 18790 : !psExtraArg->pfnProgress(
4584 165068 : 1.0 * std::min(nBufYSize, iBufYOff + nChunkYSize) /
4585 : nBufYSize,
4586 : "", psExtraArg->pProgressData))
4587 : {
4588 1 : return CE_Failure;
4589 : }
4590 : }
4591 :
4592 62936 : return CE_None;
4593 : }
4594 :
4595 : /* Below code is not compatible with that case. It would need a complete */
4596 : /* separate code like done in GDALRasterBand::IRasterIO. */
4597 4 : if (eRWFlag == GF_Write && (nBufXSize < nXSize || nBufYSize < nYSize))
4598 : {
4599 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
4600 : nBufXSize, nBufYSize, eBufType, nBandCount,
4601 : panBandMap, nPixelSpace, nLineSpace,
4602 0 : nBandSpace, psExtraArg);
4603 : }
4604 :
4605 : /* We could have a smarter implementation, but that will do for now */
4606 4 : if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
4607 0 : (nBufXSize != nXSize || nBufYSize != nYSize))
4608 : {
4609 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
4610 : nBufXSize, nBufYSize, eBufType, nBandCount,
4611 : panBandMap, nPixelSpace, nLineSpace,
4612 0 : nBandSpace, psExtraArg);
4613 : }
4614 :
4615 : /* ==================================================================== */
4616 : /* Loop reading required source blocks to satisfy output */
4617 : /* request. This is the most general implementation. */
4618 : /* ==================================================================== */
4619 :
4620 4 : const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
4621 :
4622 : papabySrcBlock =
4623 4 : static_cast<GByte **>(CPLCalloc(sizeof(GByte *), nBandCount));
4624 : papoBlocks =
4625 4 : static_cast<GDALRasterBlock **>(CPLCalloc(sizeof(void *), nBandCount));
4626 :
4627 : /* -------------------------------------------------------------------- */
4628 : /* Select an overview level if appropriate. */
4629 : /* -------------------------------------------------------------------- */
4630 :
4631 : GDALRasterIOExtraArg sExtraArg;
4632 4 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4633 4 : const int nOverviewLevel = GDALDatasetGetBestOverviewLevel(
4634 : this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nBandCount,
4635 : panBandMap, &sExtraArg);
4636 4 : if (nOverviewLevel >= 0)
4637 : {
4638 2 : GetRasterBand(panBandMap[0])
4639 2 : ->GetOverview(nOverviewLevel)
4640 2 : ->GetBlockSize(&nBlockXSize, &nBlockYSize);
4641 : }
4642 :
4643 4 : double dfXOff = nXOff;
4644 4 : double dfYOff = nYOff;
4645 4 : double dfXSize = nXSize;
4646 4 : double dfYSize = nYSize;
4647 4 : if (sExtraArg.bFloatingPointWindowValidity)
4648 : {
4649 2 : dfXOff = sExtraArg.dfXOff;
4650 2 : dfYOff = sExtraArg.dfYOff;
4651 2 : dfXSize = sExtraArg.dfXSize;
4652 2 : dfYSize = sExtraArg.dfYSize;
4653 : }
4654 :
4655 : /* -------------------------------------------------------------------- */
4656 : /* Compute stepping increment. */
4657 : /* -------------------------------------------------------------------- */
4658 4 : const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
4659 4 : const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
4660 :
4661 4 : constexpr double EPS = 1e-10;
4662 : /* -------------------------------------------------------------------- */
4663 : /* Loop over buffer computing source locations. */
4664 : /* -------------------------------------------------------------------- */
4665 36 : for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
4666 : {
4667 : GPtrDiff_t iSrcOffset;
4668 :
4669 : // Add small epsilon to avoid some numeric precision issues.
4670 32 : const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
4671 32 : const int iSrcY = static_cast<int>(std::min(
4672 32 : std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
4673 :
4674 32 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
4675 : static_cast<GPtrDiff_t>(nLineSpace);
4676 :
4677 302 : for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff++)
4678 : {
4679 270 : const double dfSrcX = (iBufXOff + 0.5) * dfSrcXInc + dfXOff + EPS;
4680 270 : const int iSrcX = static_cast<int>(std::min(
4681 270 : std::max(0.0, dfSrcX), static_cast<double>(nRasterXSize - 1)));
4682 :
4683 : // FIXME: this code likely doesn't work if the dirty block gets
4684 : // flushed to disk before being completely written. In the meantime,
4685 : // bJustInitialize should probably be set to FALSE even if it is not
4686 : // ideal performance wise, and for lossy compression
4687 :
4688 : /* --------------------------------------------------------------------
4689 : */
4690 : /* Ensure we have the appropriate block loaded. */
4691 : /* --------------------------------------------------------------------
4692 : */
4693 270 : if (iSrcX < nLBlockX * nBlockXSize ||
4694 270 : iSrcX - nBlockXSize >= nLBlockX * nBlockXSize ||
4695 266 : iSrcY < nLBlockY * nBlockYSize ||
4696 266 : iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
4697 : {
4698 4 : nLBlockX = iSrcX / nBlockXSize;
4699 4 : nLBlockY = iSrcY / nBlockYSize;
4700 :
4701 4 : const bool bJustInitialize =
4702 0 : eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
4703 0 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
4704 4 : nXOff <= nLBlockX * nBlockXSize &&
4705 0 : nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
4706 : /*bool bMemZeroBuffer = FALSE;
4707 : if( eRWFlag == GF_Write && !bJustInitialize &&
4708 : nXOff <= nLBlockX * nBlockXSize &&
4709 : nYOff <= nLBlockY * nBlockYSize &&
4710 : (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
4711 : (nXOff + nXSize == GetRasterXSize() &&
4712 : (nLBlockX+1) * nBlockXSize > GetRasterXSize())) &&
4713 : (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
4714 : (nYOff + nYSize == GetRasterYSize() &&
4715 : (nLBlockY+1) * nBlockYSize > GetRasterYSize())) )
4716 : {
4717 : bJustInitialize = TRUE;
4718 : bMemZeroBuffer = TRUE;
4719 : }*/
4720 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
4721 : {
4722 8 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
4723 8 : if (nOverviewLevel >= 0)
4724 2 : poBand = poBand->GetOverview(nOverviewLevel);
4725 16 : poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY,
4726 8 : bJustInitialize);
4727 8 : if (poBlock == nullptr)
4728 : {
4729 0 : eErr = CE_Failure;
4730 0 : goto CleanupAndReturn;
4731 : }
4732 :
4733 8 : if (eRWFlag == GF_Write)
4734 0 : poBlock->MarkDirty();
4735 :
4736 8 : if (papoBlocks[iBand] != nullptr)
4737 0 : papoBlocks[iBand]->DropLock();
4738 :
4739 8 : papoBlocks[iBand] = poBlock;
4740 :
4741 8 : papabySrcBlock[iBand] =
4742 8 : static_cast<GByte *>(poBlock->GetDataRef());
4743 : /*if( bMemZeroBuffer )
4744 : {
4745 : memset(papabySrcBlock[iBand], 0,
4746 : static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
4747 : * nBlockYSize);
4748 : }*/
4749 : }
4750 : }
4751 :
4752 : /* --------------------------------------------------------------------
4753 : */
4754 : /* Copy over this pixel of data. */
4755 : /* --------------------------------------------------------------------
4756 : */
4757 270 : iSrcOffset = (static_cast<GPtrDiff_t>(iSrcX) -
4758 270 : static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
4759 270 : (static_cast<GPtrDiff_t>(iSrcY) -
4760 270 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
4761 270 : nBlockXSize) *
4762 270 : nBandDataSize;
4763 :
4764 980 : for (int iBand = 0; iBand < nBandCount; iBand++)
4765 : {
4766 710 : GByte *pabySrcBlock = papabySrcBlock[iBand];
4767 710 : GPtrDiff_t iBandBufOffset =
4768 710 : iBufOffset + static_cast<GPtrDiff_t>(iBand) *
4769 : static_cast<GPtrDiff_t>(nBandSpace);
4770 :
4771 710 : if (eDataType == eBufType)
4772 : {
4773 710 : if (eRWFlag == GF_Read)
4774 710 : memcpy(static_cast<GByte *>(pData) + iBandBufOffset,
4775 710 : pabySrcBlock + iSrcOffset, nBandDataSize);
4776 : else
4777 0 : memcpy(pabySrcBlock + iSrcOffset,
4778 : static_cast<const GByte *>(pData) +
4779 0 : iBandBufOffset,
4780 : nBandDataSize);
4781 : }
4782 : else
4783 : {
4784 : /* type to type conversion ... ouch, this is expensive way
4785 : of handling single words */
4786 :
4787 0 : if (eRWFlag == GF_Read)
4788 0 : GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
4789 : static_cast<GByte *>(pData) +
4790 0 : iBandBufOffset,
4791 : eBufType, 0, 1);
4792 : else
4793 0 : GDALCopyWords64(static_cast<const GByte *>(pData) +
4794 0 : iBandBufOffset,
4795 0 : eBufType, 0, pabySrcBlock + iSrcOffset,
4796 : eDataType, 0, 1);
4797 : }
4798 : }
4799 :
4800 270 : iBufOffset += static_cast<int>(nPixelSpace);
4801 : }
4802 : }
4803 :
4804 : /* -------------------------------------------------------------------- */
4805 : /* CleanupAndReturn. */
4806 : /* -------------------------------------------------------------------- */
4807 4 : CleanupAndReturn:
4808 4 : CPLFree(papabySrcBlock);
4809 4 : if (papoBlocks != nullptr)
4810 : {
4811 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
4812 : {
4813 8 : if (papoBlocks[iBand] != nullptr)
4814 8 : papoBlocks[iBand]->DropLock();
4815 : }
4816 4 : CPLFree(papoBlocks);
4817 : }
4818 :
4819 4 : return eErr;
4820 : }
4821 :
4822 : //! @endcond
4823 :
4824 : /************************************************************************/
4825 : /* GDALCopyWholeRasterGetSwathSize() */
4826 : /************************************************************************/
4827 :
4828 3162 : static void GDALCopyWholeRasterGetSwathSize(GDALRasterBand *poSrcPrototypeBand,
4829 : GDALRasterBand *poDstPrototypeBand,
4830 : int nBandCount,
4831 : int bDstIsCompressed,
4832 : int bInterleave, int *pnSwathCols,
4833 : int *pnSwathLines)
4834 : {
4835 3162 : GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
4836 3162 : int nSrcBlockXSize = 0;
4837 3162 : int nSrcBlockYSize = 0;
4838 3162 : int nBlockXSize = 0;
4839 3162 : int nBlockYSize = 0;
4840 :
4841 3162 : int nXSize = poSrcPrototypeBand->GetXSize();
4842 3162 : int nYSize = poSrcPrototypeBand->GetYSize();
4843 :
4844 3162 : poSrcPrototypeBand->GetBlockSize(&nSrcBlockXSize, &nSrcBlockYSize);
4845 3162 : poDstPrototypeBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
4846 :
4847 3162 : const int nMaxBlockXSize = std::max(nBlockXSize, nSrcBlockXSize);
4848 3162 : const int nMaxBlockYSize = std::max(nBlockYSize, nSrcBlockYSize);
4849 :
4850 3162 : int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
4851 3162 : if (bInterleave)
4852 550 : nPixelSize *= nBandCount;
4853 :
4854 : // aim for one row of blocks. Do not settle for less.
4855 3162 : int nSwathCols = nXSize;
4856 3162 : int nSwathLines = nMaxBlockYSize;
4857 :
4858 : const char *pszSrcCompression =
4859 3162 : poSrcPrototypeBand->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
4860 3162 : if (pszSrcCompression == nullptr)
4861 : {
4862 3136 : auto poSrcDS = poSrcPrototypeBand->GetDataset();
4863 3136 : if (poSrcDS)
4864 : pszSrcCompression =
4865 3130 : poSrcDS->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
4866 : }
4867 :
4868 : /* -------------------------------------------------------------------- */
4869 : /* What will our swath size be? */
4870 : /* -------------------------------------------------------------------- */
4871 : // When writing interleaved data in a compressed format, we want to be sure
4872 : // that each block will only be written once, so the swath size must not be
4873 : // greater than the block cache.
4874 3162 : const char *pszSwathSize = CPLGetConfigOption("GDAL_SWATH_SIZE", nullptr);
4875 : int nTargetSwathSize;
4876 3162 : if (pszSwathSize != nullptr)
4877 0 : nTargetSwathSize = static_cast<int>(
4878 0 : std::min(GIntBig(INT_MAX), CPLAtoGIntBig(pszSwathSize)));
4879 : else
4880 : {
4881 : // As a default, take one 1/4 of the cache size.
4882 3162 : nTargetSwathSize = static_cast<int>(
4883 3162 : std::min(GIntBig(INT_MAX), GDALGetCacheMax64() / 4));
4884 :
4885 : // but if the minimum idal swath buf size is less, then go for it to
4886 : // avoid unnecessarily abusing RAM usage.
4887 : // but try to use 10 MB at least.
4888 3162 : GIntBig nIdealSwathBufSize =
4889 3162 : static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize;
4890 3162 : int nMinTargetSwathSize = 10 * 1000 * 1000;
4891 :
4892 3162 : if ((poSrcPrototypeBand->GetSuggestedBlockAccessPattern() &
4893 3162 : GSBAP_LARGEST_CHUNK_POSSIBLE) != 0)
4894 : {
4895 2 : nMinTargetSwathSize = nTargetSwathSize;
4896 : }
4897 :
4898 3162 : if (nIdealSwathBufSize < nTargetSwathSize &&
4899 3152 : nIdealSwathBufSize < nMinTargetSwathSize)
4900 : {
4901 3149 : nIdealSwathBufSize = nMinTargetSwathSize;
4902 : }
4903 :
4904 3162 : if (pszSrcCompression != nullptr &&
4905 180 : EQUAL(pszSrcCompression, "JPEG2000") &&
4906 0 : (!bDstIsCompressed || ((nSrcBlockXSize % nBlockXSize) == 0 &&
4907 0 : (nSrcBlockYSize % nBlockYSize) == 0)))
4908 : {
4909 2 : nIdealSwathBufSize =
4910 4 : std::max(nIdealSwathBufSize, static_cast<GIntBig>(nSwathCols) *
4911 2 : nSrcBlockYSize * nPixelSize);
4912 : }
4913 3162 : if (nTargetSwathSize > nIdealSwathBufSize)
4914 3148 : nTargetSwathSize = static_cast<int>(
4915 3148 : std::min(GIntBig(INT_MAX), nIdealSwathBufSize));
4916 : }
4917 :
4918 3162 : if (nTargetSwathSize < 1000000)
4919 8 : nTargetSwathSize = 1000000;
4920 :
4921 : /* But let's check that */
4922 3380 : if (bDstIsCompressed && bInterleave &&
4923 218 : nTargetSwathSize > GDALGetCacheMax64())
4924 : {
4925 0 : CPLError(CE_Warning, CPLE_AppDefined,
4926 : "When translating into a compressed interleave format, "
4927 : "the block cache size (" CPL_FRMT_GIB ") "
4928 : "should be at least the size of the swath (%d) "
4929 : "(GDAL_SWATH_SIZE config. option)",
4930 : GDALGetCacheMax64(), nTargetSwathSize);
4931 : }
4932 :
4933 : #define IS_DIVIDER_OF(x, y) ((y) % (x) == 0)
4934 : #define ROUND_TO(x, y) (((x) / (y)) * (y))
4935 :
4936 : // if both input and output datasets are tiled, that the tile dimensions
4937 : // are "compatible", try to stick to a swath dimension that is a multiple
4938 : // of input and output block dimensions.
4939 3162 : if (nBlockXSize != nXSize && nSrcBlockXSize != nXSize &&
4940 38 : IS_DIVIDER_OF(nBlockXSize, nMaxBlockXSize) &&
4941 38 : IS_DIVIDER_OF(nSrcBlockXSize, nMaxBlockXSize) &&
4942 38 : IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
4943 38 : IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
4944 : {
4945 38 : if (static_cast<GIntBig>(nMaxBlockXSize) * nMaxBlockYSize *
4946 38 : nPixelSize <=
4947 38 : static_cast<GIntBig>(nTargetSwathSize))
4948 : {
4949 38 : nSwathCols = nTargetSwathSize / (nMaxBlockYSize * nPixelSize);
4950 38 : nSwathCols = ROUND_TO(nSwathCols, nMaxBlockXSize);
4951 38 : if (nSwathCols == 0)
4952 0 : nSwathCols = nMaxBlockXSize;
4953 38 : if (nSwathCols > nXSize)
4954 36 : nSwathCols = nXSize;
4955 38 : nSwathLines = nMaxBlockYSize;
4956 :
4957 38 : if (static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize >
4958 38 : static_cast<GIntBig>(nTargetSwathSize))
4959 : {
4960 0 : nSwathCols = nXSize;
4961 0 : nSwathLines = nBlockYSize;
4962 : }
4963 : }
4964 : }
4965 :
4966 3162 : const GIntBig nMemoryPerCol = static_cast<GIntBig>(nSwathCols) * nPixelSize;
4967 3162 : const GIntBig nSwathBufSize = nMemoryPerCol * nSwathLines;
4968 3162 : if (nSwathBufSize > static_cast<GIntBig>(nTargetSwathSize))
4969 : {
4970 1 : nSwathLines = static_cast<int>(nTargetSwathSize / nMemoryPerCol);
4971 1 : if (nSwathLines == 0)
4972 1 : nSwathLines = 1;
4973 :
4974 1 : CPLDebug(
4975 : "GDAL",
4976 : "GDALCopyWholeRasterGetSwathSize(): adjusting to %d line swath "
4977 : "since requirement (" CPL_FRMT_GIB " bytes) exceed target swath "
4978 : "size (%d bytes) (GDAL_SWATH_SIZE config. option)",
4979 1 : nSwathLines, nBlockYSize * nMemoryPerCol, nTargetSwathSize);
4980 : }
4981 : // If we are processing single scans, try to handle several at once.
4982 : // If we are handling swaths already, only grow the swath if a row
4983 : // of blocks is substantially less than our target buffer size.
4984 3161 : else if (nSwathLines == 1 ||
4985 2620 : nMemoryPerCol * nSwathLines <
4986 2620 : static_cast<GIntBig>(nTargetSwathSize) / 10)
4987 : {
4988 3133 : nSwathLines = std::min(
4989 : nYSize,
4990 3133 : std::max(1, static_cast<int>(nTargetSwathSize / nMemoryPerCol)));
4991 :
4992 : /* If possible try to align to source and target block height */
4993 3133 : if ((nSwathLines % nMaxBlockYSize) != 0 &&
4994 251 : nSwathLines > nMaxBlockYSize &&
4995 251 : IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
4996 222 : IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
4997 202 : nSwathLines = ROUND_TO(nSwathLines, nMaxBlockYSize);
4998 : }
4999 :
5000 3162 : if (pszSrcCompression != nullptr && EQUAL(pszSrcCompression, "JPEG2000") &&
5001 0 : (!bDstIsCompressed || (IS_DIVIDER_OF(nBlockXSize, nSrcBlockXSize) &&
5002 0 : IS_DIVIDER_OF(nBlockYSize, nSrcBlockYSize))))
5003 : {
5004 : // Typical use case: converting from Pleaiades that is 2048x2048 tiled.
5005 2 : if (nSwathLines < nSrcBlockYSize)
5006 : {
5007 0 : nSwathLines = nSrcBlockYSize;
5008 :
5009 : // Number of pixels that can be read/write simultaneously.
5010 0 : nSwathCols = nTargetSwathSize / (nSrcBlockXSize * nPixelSize);
5011 0 : nSwathCols = ROUND_TO(nSwathCols, nSrcBlockXSize);
5012 0 : if (nSwathCols == 0)
5013 0 : nSwathCols = nSrcBlockXSize;
5014 0 : if (nSwathCols > nXSize)
5015 0 : nSwathCols = nXSize;
5016 :
5017 0 : CPLDebug(
5018 : "GDAL",
5019 : "GDALCopyWholeRasterGetSwathSize(): because of compression and "
5020 : "too high block, "
5021 : "use partial width at one time");
5022 : }
5023 2 : else if ((nSwathLines % nSrcBlockYSize) != 0)
5024 : {
5025 : /* Round on a multiple of nSrcBlockYSize */
5026 0 : nSwathLines = ROUND_TO(nSwathLines, nSrcBlockYSize);
5027 0 : CPLDebug(
5028 : "GDAL",
5029 : "GDALCopyWholeRasterGetSwathSize(): because of compression, "
5030 : "round nSwathLines to block height : %d",
5031 : nSwathLines);
5032 : }
5033 : }
5034 3160 : else if (bDstIsCompressed)
5035 : {
5036 408 : if (nSwathLines < nBlockYSize)
5037 : {
5038 146 : nSwathLines = nBlockYSize;
5039 :
5040 : // Number of pixels that can be read/write simultaneously.
5041 146 : nSwathCols = nTargetSwathSize / (nSwathLines * nPixelSize);
5042 146 : nSwathCols = ROUND_TO(nSwathCols, nBlockXSize);
5043 146 : if (nSwathCols == 0)
5044 0 : nSwathCols = nBlockXSize;
5045 146 : if (nSwathCols > nXSize)
5046 146 : nSwathCols = nXSize;
5047 :
5048 146 : CPLDebug(
5049 : "GDAL",
5050 : "GDALCopyWholeRasterGetSwathSize(): because of compression and "
5051 : "too high block, "
5052 : "use partial width at one time");
5053 : }
5054 262 : else if ((nSwathLines % nBlockYSize) != 0)
5055 : {
5056 : // Round on a multiple of nBlockYSize.
5057 9 : nSwathLines = ROUND_TO(nSwathLines, nBlockYSize);
5058 9 : CPLDebug(
5059 : "GDAL",
5060 : "GDALCopyWholeRasterGetSwathSize(): because of compression, "
5061 : "round nSwathLines to block height : %d",
5062 : nSwathLines);
5063 : }
5064 : }
5065 :
5066 3162 : *pnSwathCols = nSwathCols;
5067 3162 : *pnSwathLines = nSwathLines;
5068 3162 : }
5069 :
5070 : /************************************************************************/
5071 : /* GDALDatasetCopyWholeRaster() */
5072 : /************************************************************************/
5073 :
5074 : /**
5075 : * \brief Copy all dataset raster data.
5076 : *
5077 : * This function copies the complete raster contents of one dataset to
5078 : * another similarly configured dataset. The source and destination
5079 : * dataset must have the same number of bands, and the same width
5080 : * and height. The bands do not have to have the same data type.
5081 : *
5082 : * This function is primarily intended to support implementation of
5083 : * driver specific CreateCopy() functions. It implements efficient copying,
5084 : * in particular "chunking" the copy in substantial blocks and, if appropriate,
5085 : * performing the transfer in a pixel interleaved fashion.
5086 : *
5087 : * Currently the only papszOptions value supported are :
5088 : * <ul>
5089 : * <li>"INTERLEAVE=PIXEL/BAND" to force pixel (resp. band) interleaved read and
5090 : * write access pattern (this does not modify the layout of the destination
5091 : * data)</li> <li>"COMPRESSED=YES" to force alignment on target dataset block
5092 : * sizes to achieve best compression.</li> <li>"SKIP_HOLES=YES" to skip chunks
5093 : * for which GDALGetDataCoverageStatus() returns GDAL_DATA_COVERAGE_STATUS_EMPTY
5094 : * (GDAL >= 2.2)</li>
5095 : * </ul>
5096 : * More options may be supported in the future.
5097 : *
5098 : * @param hSrcDS the source dataset
5099 : * @param hDstDS the destination dataset
5100 : * @param papszOptions transfer hints in "StringList" Name=Value format.
5101 : * @param pfnProgress progress reporting function.
5102 : * @param pProgressData callback data for progress function.
5103 : *
5104 : * @return CE_None on success, or CE_Failure on failure.
5105 : */
5106 :
5107 3135 : CPLErr CPL_STDCALL GDALDatasetCopyWholeRaster(GDALDatasetH hSrcDS,
5108 : GDALDatasetH hDstDS,
5109 : CSLConstList papszOptions,
5110 : GDALProgressFunc pfnProgress,
5111 : void *pProgressData)
5112 :
5113 : {
5114 3135 : VALIDATE_POINTER1(hSrcDS, "GDALDatasetCopyWholeRaster", CE_Failure);
5115 3135 : VALIDATE_POINTER1(hDstDS, "GDALDatasetCopyWholeRaster", CE_Failure);
5116 :
5117 3135 : GDALDataset *poSrcDS = GDALDataset::FromHandle(hSrcDS);
5118 3135 : GDALDataset *poDstDS = GDALDataset::FromHandle(hDstDS);
5119 :
5120 3135 : if (pfnProgress == nullptr)
5121 0 : pfnProgress = GDALDummyProgress;
5122 :
5123 : /* -------------------------------------------------------------------- */
5124 : /* Confirm the datasets match in size and band counts. */
5125 : /* -------------------------------------------------------------------- */
5126 3135 : const int nXSize = poDstDS->GetRasterXSize();
5127 3135 : const int nYSize = poDstDS->GetRasterYSize();
5128 3135 : const int nBandCount = poDstDS->GetRasterCount();
5129 :
5130 3135 : if (poSrcDS->GetRasterXSize() != nXSize ||
5131 6270 : poSrcDS->GetRasterYSize() != nYSize ||
5132 3135 : poSrcDS->GetRasterCount() != nBandCount)
5133 : {
5134 0 : CPLError(CE_Failure, CPLE_AppDefined,
5135 : "Input and output dataset sizes or band counts do not\n"
5136 : "match in GDALDatasetCopyWholeRaster()");
5137 0 : return CE_Failure;
5138 : }
5139 :
5140 : /* -------------------------------------------------------------------- */
5141 : /* Report preliminary (0) progress. */
5142 : /* -------------------------------------------------------------------- */
5143 3135 : if (!pfnProgress(0.0, nullptr, pProgressData))
5144 : {
5145 1 : CPLError(CE_Failure, CPLE_UserInterrupt,
5146 : "User terminated CreateCopy()");
5147 1 : return CE_Failure;
5148 : }
5149 :
5150 : /* -------------------------------------------------------------------- */
5151 : /* Get our prototype band, and assume the others are similarly */
5152 : /* configured. */
5153 : /* -------------------------------------------------------------------- */
5154 3134 : if (nBandCount == 0)
5155 0 : return CE_None;
5156 :
5157 3134 : GDALRasterBand *poSrcPrototypeBand = poSrcDS->GetRasterBand(1);
5158 3134 : GDALRasterBand *poDstPrototypeBand = poDstDS->GetRasterBand(1);
5159 3134 : GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
5160 :
5161 : /* -------------------------------------------------------------------- */
5162 : /* Do we want to try and do the operation in a pixel */
5163 : /* interleaved fashion? */
5164 : /* -------------------------------------------------------------------- */
5165 3134 : bool bInterleave = false;
5166 : const char *pszInterleave =
5167 3134 : poSrcDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
5168 3134 : if (pszInterleave != nullptr &&
5169 2779 : (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
5170 184 : bInterleave = true;
5171 :
5172 3134 : pszInterleave = poDstDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
5173 3134 : if (pszInterleave != nullptr &&
5174 2679 : (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
5175 497 : bInterleave = true;
5176 :
5177 3134 : pszInterleave = CSLFetchNameValue(papszOptions, "INTERLEAVE");
5178 3134 : if (pszInterleave != nullptr && EQUAL(pszInterleave, "PIXEL"))
5179 5 : bInterleave = true;
5180 3129 : else if (pszInterleave != nullptr && EQUAL(pszInterleave, "BAND"))
5181 13 : bInterleave = false;
5182 : // attributes is specific to the TileDB driver
5183 3116 : else if (pszInterleave != nullptr && EQUAL(pszInterleave, "ATTRIBUTES"))
5184 4 : bInterleave = true;
5185 3112 : else if (pszInterleave != nullptr)
5186 : {
5187 0 : CPLError(CE_Warning, CPLE_NotSupported,
5188 : "Unsupported value for option INTERLEAVE");
5189 : }
5190 :
5191 : // If the destination is compressed, we must try to write blocks just once,
5192 : // to save disk space (GTiff case for example), and to avoid data loss
5193 : // (JPEG compression for example).
5194 3134 : bool bDstIsCompressed = false;
5195 : const char *pszDstCompressed =
5196 3134 : CSLFetchNameValue(papszOptions, "COMPRESSED");
5197 3134 : if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
5198 383 : bDstIsCompressed = true;
5199 :
5200 : /* -------------------------------------------------------------------- */
5201 : /* What will our swath size be? */
5202 : /* -------------------------------------------------------------------- */
5203 :
5204 3134 : int nSwathCols = 0;
5205 3134 : int nSwathLines = 0;
5206 3134 : GDALCopyWholeRasterGetSwathSize(poSrcPrototypeBand, poDstPrototypeBand,
5207 : nBandCount, bDstIsCompressed, bInterleave,
5208 : &nSwathCols, &nSwathLines);
5209 :
5210 3134 : int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5211 3134 : if (bInterleave)
5212 550 : nPixelSize *= nBandCount;
5213 :
5214 3134 : void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
5215 3134 : if (pSwathBuf == nullptr)
5216 : {
5217 0 : return CE_Failure;
5218 : }
5219 :
5220 3134 : CPLDebug("GDAL",
5221 : "GDALDatasetCopyWholeRaster(): %d*%d swaths, bInterleave=%d",
5222 : nSwathCols, nSwathLines, static_cast<int>(bInterleave));
5223 :
5224 : // Advise the source raster that we are going to read it completely
5225 : // Note: this might already have been done by GDALCreateCopy() in the
5226 : // likely case this function is indirectly called by it
5227 3134 : poSrcDS->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nBandCount,
5228 3134 : nullptr, nullptr);
5229 :
5230 : /* ==================================================================== */
5231 : /* Band oriented (uninterleaved) case. */
5232 : /* ==================================================================== */
5233 3134 : CPLErr eErr = CE_None;
5234 : const bool bCheckHoles =
5235 3134 : CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
5236 :
5237 3134 : if (!bInterleave)
5238 : {
5239 : GDALRasterIOExtraArg sExtraArg;
5240 2584 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5241 2584 : CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress); // to make cppcheck happy
5242 :
5243 7752 : const GIntBig nTotalBlocks = static_cast<GIntBig>(nBandCount) *
5244 2584 : DIV_ROUND_UP(nYSize, nSwathLines) *
5245 2584 : DIV_ROUND_UP(nXSize, nSwathCols);
5246 2584 : GIntBig nBlocksDone = 0;
5247 :
5248 7561 : for (int iBand = 0; iBand < nBandCount && eErr == CE_None; iBand++)
5249 : {
5250 4977 : int nBand = iBand + 1;
5251 :
5252 10269 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5253 : {
5254 5292 : int nThisLines = nSwathLines;
5255 :
5256 5292 : if (iY + nThisLines > nYSize)
5257 375 : nThisLines = nYSize - iY;
5258 :
5259 10584 : for (int iX = 0; iX < nXSize && eErr == CE_None;
5260 5292 : iX += nSwathCols)
5261 : {
5262 5292 : int nThisCols = nSwathCols;
5263 :
5264 5292 : if (iX + nThisCols > nXSize)
5265 0 : nThisCols = nXSize - iX;
5266 :
5267 5292 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5268 5292 : if (bCheckHoles)
5269 : {
5270 : nStatus = poSrcDS->GetRasterBand(nBand)
5271 3640 : ->GetDataCoverageStatus(
5272 : iX, iY, nThisCols, nThisLines,
5273 : GDAL_DATA_COVERAGE_STATUS_DATA);
5274 : }
5275 5292 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5276 : {
5277 5288 : sExtraArg.pfnProgress = GDALScaledProgress;
5278 10576 : sExtraArg.pProgressData = GDALCreateScaledProgress(
5279 5288 : nBlocksDone / static_cast<double>(nTotalBlocks),
5280 5288 : (nBlocksDone + 0.5) /
5281 5288 : static_cast<double>(nTotalBlocks),
5282 : pfnProgress, pProgressData);
5283 5288 : if (sExtraArg.pProgressData == nullptr)
5284 1622 : sExtraArg.pfnProgress = nullptr;
5285 :
5286 5288 : eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
5287 : nThisLines, pSwathBuf,
5288 : nThisCols, nThisLines, eDT, 1,
5289 : &nBand, 0, 0, 0, &sExtraArg);
5290 :
5291 5288 : GDALDestroyScaledProgress(sExtraArg.pProgressData);
5292 :
5293 5288 : if (eErr == CE_None)
5294 5281 : eErr = poDstDS->RasterIO(
5295 : GF_Write, iX, iY, nThisCols, nThisLines,
5296 : pSwathBuf, nThisCols, nThisLines, eDT, 1,
5297 : &nBand, 0, 0, 0, nullptr);
5298 : }
5299 :
5300 5292 : nBlocksDone++;
5301 10542 : if (eErr == CE_None &&
5302 5250 : !pfnProgress(nBlocksDone /
5303 5250 : static_cast<double>(nTotalBlocks),
5304 : nullptr, pProgressData))
5305 : {
5306 2 : eErr = CE_Failure;
5307 2 : CPLError(CE_Failure, CPLE_UserInterrupt,
5308 : "User terminated CreateCopy()");
5309 : }
5310 : }
5311 : }
5312 : }
5313 : }
5314 :
5315 : /* ==================================================================== */
5316 : /* Pixel interleaved case. */
5317 : /* ==================================================================== */
5318 : else /* if( bInterleave ) */
5319 : {
5320 : GDALRasterIOExtraArg sExtraArg;
5321 550 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5322 550 : CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress); // to make cppcheck happy
5323 :
5324 550 : const GIntBig nTotalBlocks =
5325 550 : static_cast<GIntBig>(DIV_ROUND_UP(nYSize, nSwathLines)) *
5326 550 : DIV_ROUND_UP(nXSize, nSwathCols);
5327 550 : GIntBig nBlocksDone = 0;
5328 :
5329 1315 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5330 : {
5331 765 : int nThisLines = nSwathLines;
5332 :
5333 765 : if (iY + nThisLines > nYSize)
5334 191 : nThisLines = nYSize - iY;
5335 :
5336 1535 : for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
5337 : {
5338 770 : int nThisCols = nSwathCols;
5339 :
5340 770 : if (iX + nThisCols > nXSize)
5341 3 : nThisCols = nXSize - iX;
5342 :
5343 770 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5344 770 : if (bCheckHoles)
5345 : {
5346 540 : nStatus = 0;
5347 593 : for (int iBand = 0; iBand < nBandCount; iBand++)
5348 : {
5349 574 : nStatus |= poSrcDS->GetRasterBand(iBand + 1)
5350 574 : ->GetDataCoverageStatus(
5351 : iX, iY, nThisCols, nThisLines,
5352 : GDAL_DATA_COVERAGE_STATUS_DATA);
5353 574 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5354 521 : break;
5355 : }
5356 : }
5357 770 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5358 : {
5359 751 : sExtraArg.pfnProgress = GDALScaledProgress;
5360 1502 : sExtraArg.pProgressData = GDALCreateScaledProgress(
5361 751 : nBlocksDone / static_cast<double>(nTotalBlocks),
5362 751 : (nBlocksDone + 0.5) / static_cast<double>(nTotalBlocks),
5363 : pfnProgress, pProgressData);
5364 751 : if (sExtraArg.pProgressData == nullptr)
5365 343 : sExtraArg.pfnProgress = nullptr;
5366 :
5367 751 : eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
5368 : nThisLines, pSwathBuf, nThisCols,
5369 : nThisLines, eDT, nBandCount,
5370 : nullptr, 0, 0, 0, &sExtraArg);
5371 :
5372 751 : GDALDestroyScaledProgress(sExtraArg.pProgressData);
5373 :
5374 751 : if (eErr == CE_None)
5375 750 : eErr = poDstDS->RasterIO(
5376 : GF_Write, iX, iY, nThisCols, nThisLines, pSwathBuf,
5377 : nThisCols, nThisLines, eDT, nBandCount, nullptr, 0,
5378 : 0, 0, nullptr);
5379 : }
5380 :
5381 770 : nBlocksDone++;
5382 1536 : if (eErr == CE_None &&
5383 766 : !pfnProgress(nBlocksDone /
5384 766 : static_cast<double>(nTotalBlocks),
5385 : nullptr, pProgressData))
5386 : {
5387 1 : eErr = CE_Failure;
5388 1 : CPLError(CE_Failure, CPLE_UserInterrupt,
5389 : "User terminated CreateCopy()");
5390 : }
5391 : }
5392 : }
5393 : }
5394 :
5395 : /* -------------------------------------------------------------------- */
5396 : /* Cleanup */
5397 : /* -------------------------------------------------------------------- */
5398 3134 : CPLFree(pSwathBuf);
5399 :
5400 3134 : return eErr;
5401 : }
5402 :
5403 : /************************************************************************/
5404 : /* GDALRasterBandCopyWholeRaster() */
5405 : /************************************************************************/
5406 :
5407 : /**
5408 : * \brief Copy a whole raster band
5409 : *
5410 : * This function copies the complete raster contents of one band to
5411 : * another similarly configured band. The source and destination
5412 : * bands must have the same width and height. The bands do not have
5413 : * to have the same data type.
5414 : *
5415 : * It implements efficient copying, in particular "chunking" the copy in
5416 : * substantial blocks.
5417 : *
5418 : * Currently the only papszOptions value supported are :
5419 : * <ul>
5420 : * <li>"COMPRESSED=YES" to force alignment on target dataset block sizes to
5421 : * achieve best compression.</li>
5422 : * <li>"SKIP_HOLES=YES" to skip chunks for which GDALGetDataCoverageStatus()
5423 : * returns GDAL_DATA_COVERAGE_STATUS_EMPTY (GDAL >= 2.2)</li>
5424 : * </ul>
5425 : *
5426 : * @param hSrcBand the source band
5427 : * @param hDstBand the destination band
5428 : * @param papszOptions transfer hints in "StringList" Name=Value format.
5429 : * @param pfnProgress progress reporting function.
5430 : * @param pProgressData callback data for progress function.
5431 : *
5432 : * @return CE_None on success, or CE_Failure on failure.
5433 : */
5434 :
5435 28 : CPLErr CPL_STDCALL GDALRasterBandCopyWholeRaster(
5436 : GDALRasterBandH hSrcBand, GDALRasterBandH hDstBand,
5437 : const char *const *const papszOptions, GDALProgressFunc pfnProgress,
5438 : void *pProgressData)
5439 :
5440 : {
5441 28 : VALIDATE_POINTER1(hSrcBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
5442 28 : VALIDATE_POINTER1(hDstBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
5443 :
5444 28 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
5445 28 : GDALRasterBand *poDstBand = GDALRasterBand::FromHandle(hDstBand);
5446 28 : CPLErr eErr = CE_None;
5447 :
5448 28 : if (pfnProgress == nullptr)
5449 2 : pfnProgress = GDALDummyProgress;
5450 :
5451 : /* -------------------------------------------------------------------- */
5452 : /* Confirm the datasets match in size and band counts. */
5453 : /* -------------------------------------------------------------------- */
5454 28 : int nXSize = poSrcBand->GetXSize();
5455 28 : int nYSize = poSrcBand->GetYSize();
5456 :
5457 28 : if (poDstBand->GetXSize() != nXSize || poDstBand->GetYSize() != nYSize)
5458 : {
5459 0 : CPLError(CE_Failure, CPLE_AppDefined,
5460 : "Input and output band sizes do not\n"
5461 : "match in GDALRasterBandCopyWholeRaster()");
5462 0 : return CE_Failure;
5463 : }
5464 :
5465 : /* -------------------------------------------------------------------- */
5466 : /* Report preliminary (0) progress. */
5467 : /* -------------------------------------------------------------------- */
5468 28 : if (!pfnProgress(0.0, nullptr, pProgressData))
5469 : {
5470 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
5471 : "User terminated CreateCopy()");
5472 0 : return CE_Failure;
5473 : }
5474 :
5475 28 : GDALDataType eDT = poDstBand->GetRasterDataType();
5476 :
5477 : // If the destination is compressed, we must try to write blocks just once,
5478 : // to save disk space (GTiff case for example), and to avoid data loss
5479 : // (JPEG compression for example).
5480 28 : bool bDstIsCompressed = false;
5481 : const char *pszDstCompressed =
5482 28 : CSLFetchNameValue(const_cast<char **>(papszOptions), "COMPRESSED");
5483 28 : if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
5484 25 : bDstIsCompressed = true;
5485 :
5486 : /* -------------------------------------------------------------------- */
5487 : /* What will our swath size be? */
5488 : /* -------------------------------------------------------------------- */
5489 :
5490 28 : int nSwathCols = 0;
5491 28 : int nSwathLines = 0;
5492 28 : GDALCopyWholeRasterGetSwathSize(poSrcBand, poDstBand, 1, bDstIsCompressed,
5493 : FALSE, &nSwathCols, &nSwathLines);
5494 :
5495 28 : const int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5496 :
5497 28 : void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
5498 28 : if (pSwathBuf == nullptr)
5499 : {
5500 0 : return CE_Failure;
5501 : }
5502 :
5503 28 : CPLDebug("GDAL", "GDALRasterBandCopyWholeRaster(): %d*%d swaths",
5504 : nSwathCols, nSwathLines);
5505 :
5506 : const bool bCheckHoles =
5507 28 : CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
5508 :
5509 : // Advise the source raster that we are going to read it completely
5510 28 : poSrcBand->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nullptr);
5511 :
5512 : /* ==================================================================== */
5513 : /* Band oriented (uninterleaved) case. */
5514 : /* ==================================================================== */
5515 :
5516 70 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5517 : {
5518 42 : int nThisLines = nSwathLines;
5519 :
5520 42 : if (iY + nThisLines > nYSize)
5521 8 : nThisLines = nYSize - iY;
5522 :
5523 84 : for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
5524 : {
5525 42 : int nThisCols = nSwathCols;
5526 :
5527 42 : if (iX + nThisCols > nXSize)
5528 0 : nThisCols = nXSize - iX;
5529 :
5530 42 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5531 42 : if (bCheckHoles)
5532 : {
5533 0 : nStatus = poSrcBand->GetDataCoverageStatus(
5534 : iX, iY, nThisCols, nThisLines,
5535 : GDAL_DATA_COVERAGE_STATUS_DATA);
5536 : }
5537 42 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5538 : {
5539 42 : eErr = poSrcBand->RasterIO(GF_Read, iX, iY, nThisCols,
5540 : nThisLines, pSwathBuf, nThisCols,
5541 : nThisLines, eDT, 0, 0, nullptr);
5542 :
5543 42 : if (eErr == CE_None)
5544 42 : eErr = poDstBand->RasterIO(GF_Write, iX, iY, nThisCols,
5545 : nThisLines, pSwathBuf, nThisCols,
5546 : nThisLines, eDT, 0, 0, nullptr);
5547 : }
5548 :
5549 84 : if (eErr == CE_None &&
5550 42 : !pfnProgress((iY + nThisLines) / static_cast<float>(nYSize),
5551 : nullptr, pProgressData))
5552 : {
5553 0 : eErr = CE_Failure;
5554 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
5555 : "User terminated CreateCopy()");
5556 : }
5557 : }
5558 : }
5559 :
5560 : /* -------------------------------------------------------------------- */
5561 : /* Cleanup */
5562 : /* -------------------------------------------------------------------- */
5563 28 : CPLFree(pSwathBuf);
5564 :
5565 28 : return eErr;
5566 : }
5567 :
5568 : /************************************************************************/
5569 : /* GDALCopyRasterIOExtraArg () */
5570 : /************************************************************************/
5571 :
5572 526974 : void GDALCopyRasterIOExtraArg(GDALRasterIOExtraArg *psDestArg,
5573 : GDALRasterIOExtraArg *psSrcArg)
5574 : {
5575 526974 : INIT_RASTERIO_EXTRA_ARG(*psDestArg);
5576 526974 : if (psSrcArg)
5577 : {
5578 526974 : psDestArg->eResampleAlg = psSrcArg->eResampleAlg;
5579 526974 : psDestArg->pfnProgress = psSrcArg->pfnProgress;
5580 526974 : psDestArg->pProgressData = psSrcArg->pProgressData;
5581 526974 : psDestArg->bFloatingPointWindowValidity =
5582 526974 : psSrcArg->bFloatingPointWindowValidity;
5583 526974 : if (psSrcArg->bFloatingPointWindowValidity)
5584 : {
5585 204182 : psDestArg->dfXOff = psSrcArg->dfXOff;
5586 204182 : psDestArg->dfYOff = psSrcArg->dfYOff;
5587 204182 : psDestArg->dfXSize = psSrcArg->dfXSize;
5588 204182 : psDestArg->dfYSize = psSrcArg->dfYSize;
5589 : }
5590 526974 : if (psSrcArg->nVersion >= 2)
5591 : {
5592 526974 : psDestArg->bUseOnlyThisScale = psSrcArg->bUseOnlyThisScale;
5593 : }
5594 : }
5595 526974 : }
5596 :
5597 : /************************************************************************/
5598 : /* HasOnlyNoData() */
5599 : /************************************************************************/
5600 :
/** Generic nodata test: a value is nodata when it compares equal to the
 * nodata value. Floating point types have dedicated specializations. */
template <class T> static inline bool IsEqualToNoData(T value, T noDataValue)
{
    const bool bIsNoData = (value == noDataValue);
    return bIsNoData;
}
5605 :
5606 0 : template <> bool IsEqualToNoData<GFloat16>(GFloat16 value, GFloat16 noDataValue)
5607 : {
5608 : using std::isnan;
5609 0 : return isnan(noDataValue) ? isnan(value) : value == noDataValue;
5610 : }
5611 :
5612 560462 : template <> bool IsEqualToNoData<float>(float value, float noDataValue)
5613 : {
5614 560462 : return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
5615 : }
5616 :
5617 13481900 : template <> bool IsEqualToNoData<double>(double value, double noDataValue)
5618 : {
5619 13481900 : return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
5620 : }
5621 :
/**
 * Tell whether every value of a window of a typed buffer is nodata.
 *
 * The buffer is addressed as lines of nLineStride pixels, each pixel holding
 * nComponents consecutive values; only the first nWidth pixels of each of
 * the nHeight lines are examined. Values are compared with
 * IsEqualToNoData().
 *
 * @param pBuffer first value of the window.
 * @param noDataValue nodata value, in the buffer type.
 * @param nWidth number of pixels examined per line.
 * @param nHeight number of lines examined.
 * @param nLineStride distance, in pixels, between the starts of two lines.
 * @param nComponents number of values per pixel.
 *
 * @return true if no examined value differs from nodata (which is trivially
 * the case for an empty window), false otherwise.
 */
template <class T>
static bool HasOnlyNoDataT(const T *pBuffer, T noDataValue, size_t nWidth,
                           size_t nHeight, size_t nLineStride,
                           size_t nComponents)
{
    // Nothing to examine. Returning here also keeps the "- 1" below from
    // wrapping around and the probes from reading outside of the buffer.
    if (nWidth == 0 || nHeight == 0)
        return true;

    // Fast test: check the 4 corners and the middle pixel.
    const size_t nLastCol = nWidth - 1;
    const size_t nLastRow = nHeight - 1;
    const size_t anProbeOffsets[] = {
        0,                                                          // top-left
        nLastCol * nComponents,                                     // top-right
        (nLastRow / 2 * nLineStride + nLastCol / 2) * nComponents,  // middle
        nLastRow * nLineStride * nComponents,                   // bottom-left
        (nLastRow * nLineStride + nLastCol) * nComponents};     // bottom-right
    for (size_t iBand = 0; iBand < nComponents; iBand++)
    {
        for (const size_t nOffset : anProbeOffsets)
        {
            if (!IsEqualToNoData(pBuffer[nOffset + iBand], noDataValue))
                return false;
        }
    }

    // Test all pixels.
    const size_t nValuesPerLine = nWidth * nComponents;
    for (size_t iY = 0; iY < nHeight; iY++)
    {
        const T *pBufferLine = pBuffer + iY * nLineStride * nComponents;
        for (size_t iX = 0; iX < nValuesPerLine; iX++)
        {
            if (!IsEqualToNoData(pBufferLine[iX], noDataValue))
                return false;
        }
    }
    return true;
}
5665 :
5666 : /************************************************************************/
5667 : /* GDALBufferHasOnlyNoData() */
5668 : /************************************************************************/
5669 :
5670 42607 : bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
5671 : size_t nWidth, size_t nHeight, size_t nLineStride,
5672 : size_t nComponents, int nBitsPerSample,
5673 : GDALBufferSampleFormat nSampleFormat)
5674 : {
5675 : // In the case where the nodata is 0, we can compare several bytes at
5676 : // once. Select the largest natural integer type for the architecture.
5677 : #if SIZEOF_VOIDP >= 8 || defined(__x86_64__)
5678 : // We test __x86_64__ for x32 arch where SIZEOF_VOIDP == 4
5679 : typedef std::uint64_t WordType;
5680 : #else
5681 : typedef std::uint32_t WordType;
5682 : #endif
5683 42607 : if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
5684 : // Do not use this optimized code path for floating point numbers,
5685 : // as it can't detect negative zero.
5686 : nSampleFormat != GSF_FLOATING_POINT)
5687 : {
5688 26716 : const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
5689 26716 : const size_t nSize =
5690 26716 : (nWidth * nHeight * nComponents * nBitsPerSample + 7) / 8;
5691 26716 : size_t i = 0;
5692 : const size_t nInitialIters =
5693 53432 : std::min(sizeof(WordType) -
5694 26716 : static_cast<size_t>(
5695 : reinterpret_cast<std::uintptr_t>(pabyBuffer) %
5696 : sizeof(WordType)),
5697 26716 : nSize);
5698 220408 : for (; i < nInitialIters; i++)
5699 : {
5700 198066 : if (pabyBuffer[i])
5701 4374 : return false;
5702 : }
5703 16522900 : for (; i + sizeof(WordType) - 1 < nSize; i += sizeof(WordType))
5704 : {
5705 16507800 : if (*(reinterpret_cast<const WordType *>(pabyBuffer + i)))
5706 7198 : return false;
5707 : }
5708 52533 : for (; i < nSize; i++)
5709 : {
5710 37394 : if (pabyBuffer[i])
5711 5 : return false;
5712 : }
5713 15139 : return true;
5714 : }
5715 :
5716 15891 : if (nBitsPerSample == 8 && nSampleFormat == GSF_UNSIGNED_INT)
5717 : {
5718 22272 : return GDALIsValueInRange<uint8_t>(dfNoDataValue) &&
5719 11136 : HasOnlyNoDataT(static_cast<const uint8_t *>(pBuffer),
5720 11136 : static_cast<uint8_t>(dfNoDataValue), nWidth,
5721 11136 : nHeight, nLineStride, nComponents);
5722 : }
5723 4755 : if (nBitsPerSample == 8 && nSampleFormat == GSF_SIGNED_INT)
5724 : {
5725 : // Use unsigned implementation by converting the nodatavalue to
5726 : // unsigned
5727 63 : return GDALIsValueInRange<int8_t>(dfNoDataValue) &&
5728 31 : HasOnlyNoDataT(
5729 : static_cast<const uint8_t *>(pBuffer),
5730 31 : static_cast<uint8_t>(static_cast<int8_t>(dfNoDataValue)),
5731 32 : nWidth, nHeight, nLineStride, nComponents);
5732 : }
5733 4723 : if (nBitsPerSample == 16 && nSampleFormat == GSF_UNSIGNED_INT)
5734 : {
5735 21 : return GDALIsValueInRange<uint16_t>(dfNoDataValue) &&
5736 10 : HasOnlyNoDataT(static_cast<const uint16_t *>(pBuffer),
5737 10 : static_cast<uint16_t>(dfNoDataValue), nWidth,
5738 11 : nHeight, nLineStride, nComponents);
5739 : }
5740 4712 : if (nBitsPerSample == 16 && nSampleFormat == GSF_SIGNED_INT)
5741 : {
5742 : // Use unsigned implementation by converting the nodatavalue to
5743 : // unsigned
5744 99 : return GDALIsValueInRange<int16_t>(dfNoDataValue) &&
5745 49 : HasOnlyNoDataT(
5746 : static_cast<const uint16_t *>(pBuffer),
5747 49 : static_cast<uint16_t>(static_cast<int16_t>(dfNoDataValue)),
5748 50 : nWidth, nHeight, nLineStride, nComponents);
5749 : }
5750 4662 : if (nBitsPerSample == 32 && nSampleFormat == GSF_UNSIGNED_INT)
5751 : {
5752 73 : return GDALIsValueInRange<uint32_t>(dfNoDataValue) &&
5753 36 : HasOnlyNoDataT(static_cast<const uint32_t *>(pBuffer),
5754 : static_cast<uint32_t>(dfNoDataValue), nWidth,
5755 37 : nHeight, nLineStride, nComponents);
5756 : }
5757 4625 : if (nBitsPerSample == 32 && nSampleFormat == GSF_SIGNED_INT)
5758 : {
5759 : // Use unsigned implementation by converting the nodatavalue to
5760 : // unsigned
5761 23 : return GDALIsValueInRange<int32_t>(dfNoDataValue) &&
5762 11 : HasOnlyNoDataT(
5763 : static_cast<const uint32_t *>(pBuffer),
5764 11 : static_cast<uint32_t>(static_cast<int32_t>(dfNoDataValue)),
5765 12 : nWidth, nHeight, nLineStride, nComponents);
5766 : }
5767 4613 : if (nBitsPerSample == 64 && nSampleFormat == GSF_UNSIGNED_INT)
5768 : {
5769 56 : return GDALIsValueInRange<uint64_t>(dfNoDataValue) &&
5770 28 : HasOnlyNoDataT(static_cast<const uint64_t *>(pBuffer),
5771 : static_cast<uint64_t>(dfNoDataValue), nWidth,
5772 28 : nHeight, nLineStride, nComponents);
5773 : }
5774 4585 : if (nBitsPerSample == 64 && nSampleFormat == GSF_SIGNED_INT)
5775 : {
5776 : // Use unsigned implementation by converting the nodatavalue to
5777 : // unsigned
5778 0 : return GDALIsValueInRange<int64_t>(dfNoDataValue) &&
5779 0 : HasOnlyNoDataT(
5780 : static_cast<const uint64_t *>(pBuffer),
5781 0 : static_cast<uint64_t>(static_cast<int64_t>(dfNoDataValue)),
5782 0 : nWidth, nHeight, nLineStride, nComponents);
5783 : }
5784 4585 : if (nBitsPerSample == 16 && nSampleFormat == GSF_FLOATING_POINT)
5785 : {
5786 0 : return (std::isnan(dfNoDataValue) ||
5787 0 : GDALIsValueInRange<GFloat16>(dfNoDataValue)) &&
5788 0 : HasOnlyNoDataT(static_cast<const GFloat16 *>(pBuffer),
5789 : static_cast<GFloat16>(dfNoDataValue), nWidth,
5790 0 : nHeight, nLineStride, nComponents);
5791 : }
5792 4585 : if (nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
5793 : {
5794 754 : return (std::isnan(dfNoDataValue) ||
5795 1507 : GDALIsValueInRange<float>(dfNoDataValue)) &&
5796 753 : HasOnlyNoDataT(static_cast<const float *>(pBuffer),
5797 : static_cast<float>(dfNoDataValue), nWidth,
5798 754 : nHeight, nLineStride, nComponents);
5799 : }
5800 3831 : if (nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
5801 : {
5802 3831 : return HasOnlyNoDataT(static_cast<const double *>(pBuffer),
5803 : dfNoDataValue, nWidth, nHeight, nLineStride,
5804 3831 : nComponents);
5805 : }
5806 0 : return false;
5807 : }
5808 :
5809 : #ifdef HAVE_SSE2
5810 :
5811 : /************************************************************************/
5812 : /* GDALDeinterleave3Byte() */
5813 : /************************************************************************/
5814 :
5815 : #if defined(__GNUC__) && !defined(__clang__)
5816 : __attribute__((optimize("no-tree-vectorize")))
5817 : #endif
5818 : static void
5819 202475 : GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
5820 : GByte *CPL_RESTRICT pabyDest0,
5821 : GByte *CPL_RESTRICT pabyDest1,
5822 : GByte *CPL_RESTRICT pabyDest2, size_t nIters)
5823 : #ifdef USE_NEON_OPTIMIZATIONS
5824 : {
5825 : return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
5826 : nIters);
5827 : }
5828 : #else
5829 : {
5830 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
5831 202475 : if (CPLHaveRuntimeSSSE3())
5832 : {
5833 202487 : return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
5834 202481 : pabyDest2, nIters);
5835 : }
5836 : #endif
5837 :
5838 1 : size_t i = 0;
5839 1 : if (((reinterpret_cast<uintptr_t>(pabySrc) |
5840 1 : reinterpret_cast<uintptr_t>(pabyDest0) |
5841 1 : reinterpret_cast<uintptr_t>(pabyDest1) |
5842 1 : reinterpret_cast<uintptr_t>(pabyDest2)) %
5843 : sizeof(unsigned int)) == 0)
5844 : {
5845 : // Slightly better than GCC autovectorizer
5846 17 : for (size_t j = 0; i + 3 < nIters; i += 4, ++j)
5847 : {
5848 15 : unsigned int word0 =
5849 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i);
5850 15 : unsigned int word1 =
5851 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 4);
5852 15 : unsigned int word2 =
5853 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 8);
5854 15 : reinterpret_cast<unsigned int *>(pabyDest0)[j] =
5855 15 : (word0 & 0xff) | ((word0 >> 24) << 8) | (word1 & 0x00ff0000) |
5856 15 : ((word2 >> 8) << 24);
5857 15 : reinterpret_cast<unsigned int *>(pabyDest1)[j] =
5858 15 : ((word0 >> 8) & 0xff) | ((word1 & 0xff) << 8) |
5859 15 : (((word1 >> 24)) << 16) | ((word2 >> 16) << 24);
5860 15 : pabyDest2[j * 4] = static_cast<GByte>(word0 >> 16);
5861 15 : pabyDest2[j * 4 + 1] = static_cast<GByte>(word1 >> 8);
5862 15 : pabyDest2[j * 4 + 2] = static_cast<GByte>(word2);
5863 15 : pabyDest2[j * 4 + 3] = static_cast<GByte>(word2 >> 24);
5864 : }
5865 : }
5866 : #if defined(__clang__)
5867 : #pragma clang loop vectorize(disable)
5868 : #endif
5869 2 : for (; i < nIters; ++i)
5870 : {
5871 1 : pabyDest0[i] = pabySrc[3 * i + 0];
5872 1 : pabyDest1[i] = pabySrc[3 * i + 1];
5873 1 : pabyDest2[i] = pabySrc[3 * i + 2];
5874 : }
5875 : }
5876 : #endif
5877 :
5878 : /************************************************************************/
5879 : /* GDALDeinterleave4Byte() */
5880 : /************************************************************************/
5881 :
5882 : #if !defined(__GNUC__) || defined(__clang__)
5883 :
5884 : /************************************************************************/
5885 : /* deinterleave() */
5886 : /************************************************************************/
5887 :
5888 : template <bool SHIFT, bool MASK>
5889 : inline __m128i deinterleave(__m128i &xmm0_ori, __m128i &xmm1_ori,
5890 : __m128i &xmm2_ori, __m128i &xmm3_ori)
5891 : {
5892 : // Set higher 24bit of each int32 packed word to 0
5893 : if (SHIFT)
5894 : {
5895 : xmm0_ori = _mm_srli_epi32(xmm0_ori, 8);
5896 : xmm1_ori = _mm_srli_epi32(xmm1_ori, 8);
5897 : xmm2_ori = _mm_srli_epi32(xmm2_ori, 8);
5898 : xmm3_ori = _mm_srli_epi32(xmm3_ori, 8);
5899 : }
5900 : __m128i xmm0;
5901 : __m128i xmm1;
5902 : __m128i xmm2;
5903 : __m128i xmm3;
5904 : if (MASK)
5905 : {
5906 : const __m128i xmm_mask = _mm_set1_epi32(0xff);
5907 : xmm0 = _mm_and_si128(xmm0_ori, xmm_mask);
5908 : xmm1 = _mm_and_si128(xmm1_ori, xmm_mask);
5909 : xmm2 = _mm_and_si128(xmm2_ori, xmm_mask);
5910 : xmm3 = _mm_and_si128(xmm3_ori, xmm_mask);
5911 : }
5912 : else
5913 : {
5914 : xmm0 = xmm0_ori;
5915 : xmm1 = xmm1_ori;
5916 : xmm2 = xmm2_ori;
5917 : xmm3 = xmm3_ori;
5918 : }
5919 : // Pack int32 to int16
5920 : xmm0 = _mm_packs_epi32(xmm0, xmm1);
5921 : xmm2 = _mm_packs_epi32(xmm2, xmm3);
5922 : // Pack int16 to uint8
5923 : xmm0 = _mm_packus_epi16(xmm0, xmm2);
5924 : return xmm0;
5925 : }
5926 :
5927 : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
5928 : GByte *CPL_RESTRICT pabyDest0,
5929 : GByte *CPL_RESTRICT pabyDest1,
5930 : GByte *CPL_RESTRICT pabyDest2,
5931 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
5932 : #ifdef USE_NEON_OPTIMIZATIONS
5933 : {
5934 : return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
5935 : pabyDest3, nIters);
5936 : }
5937 : #else
5938 : {
5939 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
5940 : if (CPLHaveRuntimeSSSE3())
5941 : {
5942 : return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
5943 : pabyDest2, pabyDest3, nIters);
5944 : }
5945 : #endif
5946 :
5947 : // Not the optimal SSE2-only code, as gcc auto-vectorizer manages to
5948 : // do something slightly better.
5949 : size_t i = 0;
5950 : for (; i + 15 < nIters; i += 16)
5951 : {
5952 : __m128i xmm0_ori = _mm_loadu_si128(
5953 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
5954 : __m128i xmm1_ori = _mm_loadu_si128(
5955 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
5956 : __m128i xmm2_ori = _mm_loadu_si128(
5957 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
5958 : __m128i xmm3_ori = _mm_loadu_si128(
5959 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
5960 :
5961 : _mm_storeu_si128(
5962 : reinterpret_cast<__m128i *>(pabyDest0 + i),
5963 : deinterleave<false, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
5964 : _mm_storeu_si128(
5965 : reinterpret_cast<__m128i *>(pabyDest1 + i),
5966 : deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
5967 : _mm_storeu_si128(
5968 : reinterpret_cast<__m128i *>(pabyDest2 + i),
5969 : deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
5970 : _mm_storeu_si128(
5971 : reinterpret_cast<__m128i *>(pabyDest3 + i),
5972 : deinterleave<true, false>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
5973 : }
5974 :
5975 : #if defined(__clang__)
5976 : #pragma clang loop vectorize(disable)
5977 : #endif
5978 : for (; i < nIters; ++i)
5979 : {
5980 : pabyDest0[i] = pabySrc[4 * i + 0];
5981 : pabyDest1[i] = pabySrc[4 * i + 1];
5982 : pabyDest2[i] = pabySrc[4 * i + 2];
5983 : pabyDest3[i] = pabySrc[4 * i + 3];
5984 : }
5985 : }
5986 : #endif
5987 : #else
5988 : // GCC autovectorizer does an excellent job
5989 61563 : __attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
5990 : const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
5991 : GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2,
5992 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
5993 : {
5994 527625000 : for (size_t i = 0; i < nIters; ++i)
5995 : {
5996 527564000 : pabyDest0[i] = pabySrc[4 * i + 0];
5997 527564000 : pabyDest1[i] = pabySrc[4 * i + 1];
5998 527564000 : pabyDest2[i] = pabySrc[4 * i + 2];
5999 527564000 : pabyDest3[i] = pabySrc[4 * i + 3];
6000 : }
6001 61563 : }
6002 : #endif
6003 :
6004 : #else
6005 :
6006 : /************************************************************************/
6007 : /* GDALDeinterleave3Byte() */
6008 : /************************************************************************/
6009 :
6010 : // TODO: Enabling below could help on non-Intel architectures where GCC knows
6011 : // how to auto-vectorize
6012 : // #if defined(__GNUC__)
6013 : //__attribute__((optimize("tree-vectorize")))
6014 : // #endif
6015 : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
6016 : GByte *CPL_RESTRICT pabyDest0,
6017 : GByte *CPL_RESTRICT pabyDest1,
6018 : GByte *CPL_RESTRICT pabyDest2, size_t nIters)
6019 : {
6020 : for (size_t i = 0; i < nIters; ++i)
6021 : {
6022 : pabyDest0[i] = pabySrc[3 * i + 0];
6023 : pabyDest1[i] = pabySrc[3 * i + 1];
6024 : pabyDest2[i] = pabySrc[3 * i + 2];
6025 : }
6026 : }
6027 :
6028 : /************************************************************************/
6029 : /* GDALDeinterleave4Byte() */
6030 : /************************************************************************/
6031 :
6032 : // TODO: Enabling below could help on non-Intel architectures where gcc knows
6033 : // how to auto-vectorize
6034 : // #if defined(__GNUC__)
6035 : //__attribute__((optimize("tree-vectorize")))
6036 : // #endif
6037 : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
6038 : GByte *CPL_RESTRICT pabyDest0,
6039 : GByte *CPL_RESTRICT pabyDest1,
6040 : GByte *CPL_RESTRICT pabyDest2,
6041 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
6042 : {
6043 : for (size_t i = 0; i < nIters; ++i)
6044 : {
6045 : pabyDest0[i] = pabySrc[4 * i + 0];
6046 : pabyDest1[i] = pabySrc[4 * i + 1];
6047 : pabyDest2[i] = pabySrc[4 * i + 2];
6048 : pabyDest3[i] = pabySrc[4 * i + 3];
6049 : }
6050 : }
6051 :
6052 : #endif
6053 :
6054 : /************************************************************************/
6055 : /* GDALDeinterleave() */
6056 : /************************************************************************/
6057 :
6058 : /*! Copy values from a pixel-interleave buffer to multiple per-component
6059 : buffers.
6060 :
6061 : In pseudo-code
6062 : \verbatim
6063 : for(size_t i = 0; i < nIters; ++i)
6064 : for(int iComp = 0; iComp < nComponents; iComp++ )
6065 : ppDestBuffer[iComp][i] = pSourceBuffer[nComponents * i + iComp]
6066 : \endverbatim
6067 :
6068 : The implementation is optimized for a few cases, like de-interleaving
6069 : of 3 or 4-components Byte buffers.
6070 :
6071 : \since GDAL 3.6
6072 : */
6073 264393 : void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
6074 : int nComponents, void **ppDestBuffer,
6075 : GDALDataType eDestDT, size_t nIters)
6076 : {
6077 264393 : if (eSourceDT == eDestDT)
6078 : {
6079 264372 : if (eSourceDT == GDT_Byte || eSourceDT == GDT_Int8)
6080 : {
6081 264051 : if (nComponents == 3)
6082 : {
6083 202478 : const GByte *CPL_RESTRICT pabySrc =
6084 : static_cast<const GByte *>(pSourceBuffer);
6085 202478 : GByte *CPL_RESTRICT pabyDest0 =
6086 : static_cast<GByte *>(ppDestBuffer[0]);
6087 202478 : GByte *CPL_RESTRICT pabyDest1 =
6088 : static_cast<GByte *>(ppDestBuffer[1]);
6089 202478 : GByte *CPL_RESTRICT pabyDest2 =
6090 : static_cast<GByte *>(ppDestBuffer[2]);
6091 202478 : GDALDeinterleave3Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6092 : nIters);
6093 202485 : return;
6094 : }
6095 61573 : else if (nComponents == 4)
6096 : {
6097 61563 : const GByte *CPL_RESTRICT pabySrc =
6098 : static_cast<const GByte *>(pSourceBuffer);
6099 61563 : GByte *CPL_RESTRICT pabyDest0 =
6100 : static_cast<GByte *>(ppDestBuffer[0]);
6101 61563 : GByte *CPL_RESTRICT pabyDest1 =
6102 : static_cast<GByte *>(ppDestBuffer[1]);
6103 61563 : GByte *CPL_RESTRICT pabyDest2 =
6104 : static_cast<GByte *>(ppDestBuffer[2]);
6105 61563 : GByte *CPL_RESTRICT pabyDest3 =
6106 : static_cast<GByte *>(ppDestBuffer[3]);
6107 61563 : GDALDeinterleave4Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6108 : pabyDest3, nIters);
6109 61563 : return;
6110 10 : }
6111 : }
6112 : #if ((defined(__GNUC__) && !defined(__clang__)) || \
6113 : defined(__INTEL_CLANG_COMPILER)) && \
6114 : defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME)
6115 642 : else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&
6116 321 : CPLHaveRuntimeSSSE3())
6117 : {
6118 321 : if (nComponents == 3)
6119 : {
6120 126 : const GUInt16 *CPL_RESTRICT panSrc =
6121 : static_cast<const GUInt16 *>(pSourceBuffer);
6122 126 : GUInt16 *CPL_RESTRICT panDest0 =
6123 : static_cast<GUInt16 *>(ppDestBuffer[0]);
6124 126 : GUInt16 *CPL_RESTRICT panDest1 =
6125 : static_cast<GUInt16 *>(ppDestBuffer[1]);
6126 126 : GUInt16 *CPL_RESTRICT panDest2 =
6127 : static_cast<GUInt16 *>(ppDestBuffer[2]);
6128 126 : GDALDeinterleave3UInt16_SSSE3(panSrc, panDest0, panDest1,
6129 : panDest2, nIters);
6130 126 : return;
6131 : }
6132 : #if !defined(__INTEL_CLANG_COMPILER)
6133 : // ICC autovectorizer doesn't do a good job, at least with icx
6134 : // 2022.1.0.20220316
6135 195 : else if (nComponents == 4)
6136 : {
6137 195 : const GUInt16 *CPL_RESTRICT panSrc =
6138 : static_cast<const GUInt16 *>(pSourceBuffer);
6139 195 : GUInt16 *CPL_RESTRICT panDest0 =
6140 : static_cast<GUInt16 *>(ppDestBuffer[0]);
6141 195 : GUInt16 *CPL_RESTRICT panDest1 =
6142 : static_cast<GUInt16 *>(ppDestBuffer[1]);
6143 195 : GUInt16 *CPL_RESTRICT panDest2 =
6144 : static_cast<GUInt16 *>(ppDestBuffer[2]);
6145 195 : GUInt16 *CPL_RESTRICT panDest3 =
6146 : static_cast<GUInt16 *>(ppDestBuffer[3]);
6147 195 : GDALDeinterleave4UInt16_SSSE3(panSrc, panDest0, panDest1,
6148 : panDest2, panDest3, nIters);
6149 195 : return;
6150 : }
6151 : #endif
6152 : }
6153 : #endif
6154 : }
6155 :
6156 31 : const int nSourceDTSize = GDALGetDataTypeSizeBytes(eSourceDT);
6157 29 : const int nDestDTSize = GDALGetDataTypeSizeBytes(eDestDT);
6158 108 : for (int iComp = 0; iComp < nComponents; iComp++)
6159 : {
6160 79 : GDALCopyWords64(static_cast<const GByte *>(pSourceBuffer) +
6161 79 : iComp * nSourceDTSize,
6162 : eSourceDT, nComponents * nSourceDTSize,
6163 79 : ppDestBuffer[iComp], eDestDT, nDestDTSize, nIters);
6164 : }
6165 : }
6166 :
6167 : /************************************************************************/
6168 : /* GDALTranspose2DSingleToSingle() */
6169 : /************************************************************************/
6170 : /**
6171 : * Transpose a 2D array of non-complex values, in a efficient (cache-oblivious) way.
6172 : *
6173 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6174 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6175 : * @param nSrcWidth Width of pSrc array.
6176 : * @param nSrcHeight Height of pSrc array.
6177 : */
6178 :
6179 : template <class DST, class SRC>
6180 145 : void GDALTranspose2DSingleToSingle(const SRC *CPL_RESTRICT pSrc,
6181 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6182 : size_t nSrcHeight)
6183 : {
6184 145 : constexpr size_t blocksize = 32;
6185 315 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6186 : {
6187 170 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6188 390 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6189 : {
6190 : // transpose the block beginning at [i,j]
6191 220 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6192 2509 : for (size_t k = i; k < max_k; ++k)
6193 : {
6194 41017 : for (size_t l = j; l < max_l; ++l)
6195 : {
6196 38728 : GDALCopyWord(pSrc[l + k * nSrcWidth],
6197 38728 : pDst[k + l * nSrcHeight]);
6198 : }
6199 : }
6200 : }
6201 : }
6202 145 : }
6203 :
6204 : /************************************************************************/
6205 : /* GDALTranspose2DComplexToComplex() */
6206 : /************************************************************************/
6207 : /**
6208 : * Transpose a 2D array of complex values into an array of complex values,
6209 : * in a efficient (cache-oblivious) way.
6210 : *
6211 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6212 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6213 : * @param nSrcWidth Width of pSrc array.
6214 : * @param nSrcHeight Height of pSrc array.
6215 : */
6216 : template <class DST, class SRC>
6217 25 : void GDALTranspose2DComplexToComplex(const SRC *CPL_RESTRICT pSrc,
6218 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6219 : size_t nSrcHeight)
6220 : {
6221 25 : constexpr size_t blocksize = 32;
6222 50 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6223 : {
6224 25 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6225 50 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6226 : {
6227 : // transpose the block beginning at [i,j]
6228 25 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6229 75 : for (size_t k = i; k < max_k; ++k)
6230 : {
6231 200 : for (size_t l = j; l < max_l; ++l)
6232 : {
6233 150 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
6234 150 : pDst[2 * (k + l * nSrcHeight) + 0]);
6235 150 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 1],
6236 150 : pDst[2 * (k + l * nSrcHeight) + 1]);
6237 : }
6238 : }
6239 : }
6240 : }
6241 25 : }
6242 :
6243 : /************************************************************************/
6244 : /* GDALTranspose2DComplexToSingle() */
6245 : /************************************************************************/
6246 : /**
6247 : * Transpose a 2D array of complex values into an array of non-complex values,
6248 : * in a efficient (cache-oblivious) way.
6249 : *
6250 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6251 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6252 : * @param nSrcWidth Width of pSrc array.
6253 : * @param nSrcHeight Height of pSrc array.
6254 : */
6255 : template <class DST, class SRC>
6256 55 : void GDALTranspose2DComplexToSingle(const SRC *CPL_RESTRICT pSrc,
6257 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6258 : size_t nSrcHeight)
6259 : {
6260 55 : constexpr size_t blocksize = 32;
6261 110 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6262 : {
6263 55 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6264 110 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6265 : {
6266 : // transpose the block beginning at [i,j]
6267 55 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6268 165 : for (size_t k = i; k < max_k; ++k)
6269 : {
6270 440 : for (size_t l = j; l < max_l; ++l)
6271 : {
6272 330 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
6273 330 : pDst[k + l * nSrcHeight]);
6274 : }
6275 : }
6276 : }
6277 : }
6278 55 : }
6279 :
6280 : /************************************************************************/
6281 : /* GDALTranspose2DSingleToComplex() */
6282 : /************************************************************************/
6283 : /**
6284 : * Transpose a 2D array of non-complex values into an array of complex values,
6285 : * in a efficient (cache-oblivious) way.
6286 : *
6287 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6288 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6289 : * @param nSrcWidth Width of pSrc array.
6290 : * @param nSrcHeight Height of pSrc array.
6291 : */
6292 : template <class DST, class SRC>
6293 55 : void GDALTranspose2DSingleToComplex(const SRC *CPL_RESTRICT pSrc,
6294 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6295 : size_t nSrcHeight)
6296 : {
6297 55 : constexpr size_t blocksize = 32;
6298 110 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6299 : {
6300 55 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6301 110 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6302 : {
6303 : // transpose the block beginning at [i,j]
6304 55 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6305 165 : for (size_t k = i; k < max_k; ++k)
6306 : {
6307 440 : for (size_t l = j; l < max_l; ++l)
6308 : {
6309 330 : GDALCopyWord(pSrc[l + k * nSrcWidth],
6310 330 : pDst[2 * (k + l * nSrcHeight) + 0]);
6311 330 : pDst[2 * (k + l * nSrcHeight) + 1] = 0;
6312 : }
6313 : }
6314 : }
6315 : }
6316 55 : }
6317 :
6318 : /************************************************************************/
6319 : /* GDALTranspose2D() */
6320 : /************************************************************************/
6321 :
6322 : template <class DST, bool DST_IS_COMPLEX>
6323 280 : static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
6324 : size_t nSrcWidth, size_t nSrcHeight)
6325 : {
6326 : #define CALL_GDALTranspose2D_internal(SRC_TYPE) \
6327 : do \
6328 : { \
6329 : if constexpr (DST_IS_COMPLEX) \
6330 : { \
6331 : GDALTranspose2DSingleToComplex( \
6332 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6333 : nSrcHeight); \
6334 : } \
6335 : else \
6336 : { \
6337 : GDALTranspose2DSingleToSingle(static_cast<const SRC_TYPE *>(pSrc), \
6338 : pDst, nSrcWidth, nSrcHeight); \
6339 : } \
6340 : } while (0)
6341 :
6342 : #define CALL_GDALTranspose2DComplex_internal(SRC_TYPE) \
6343 : do \
6344 : { \
6345 : if constexpr (DST_IS_COMPLEX) \
6346 : { \
6347 : GDALTranspose2DComplexToComplex( \
6348 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6349 : nSrcHeight); \
6350 : } \
6351 : else \
6352 : { \
6353 : GDALTranspose2DComplexToSingle( \
6354 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6355 : nSrcHeight); \
6356 : } \
6357 : } while (0)
6358 :
6359 : // clang-format off
6360 280 : switch (eSrcType)
6361 : {
6362 16 : case GDT_Byte: CALL_GDALTranspose2D_internal(uint8_t); break;
6363 15 : case GDT_Int8: CALL_GDALTranspose2D_internal(int8_t); break;
6364 24 : case GDT_UInt16: CALL_GDALTranspose2D_internal(uint16_t); break;
6365 16 : case GDT_Int16: CALL_GDALTranspose2D_internal(int16_t); break;
6366 24 : case GDT_UInt32: CALL_GDALTranspose2D_internal(uint32_t); break;
6367 16 : case GDT_Int32: CALL_GDALTranspose2D_internal(int32_t); break;
6368 16 : case GDT_UInt64: CALL_GDALTranspose2D_internal(uint64_t); break;
6369 16 : case GDT_Int64: CALL_GDALTranspose2D_internal(int64_t); break;
6370 16 : case GDT_Float16: CALL_GDALTranspose2D_internal(GFloat16); break;
6371 17 : case GDT_Float32: CALL_GDALTranspose2D_internal(float); break;
6372 24 : case GDT_Float64: CALL_GDALTranspose2D_internal(double); break;
6373 16 : case GDT_CInt16: CALL_GDALTranspose2DComplex_internal(int16_t); break;
6374 16 : case GDT_CInt32: CALL_GDALTranspose2DComplex_internal(int32_t); break;
6375 16 : case GDT_CFloat16: CALL_GDALTranspose2DComplex_internal(GFloat16); break;
6376 16 : case GDT_CFloat32: CALL_GDALTranspose2DComplex_internal(float); break;
6377 16 : case GDT_CFloat64: CALL_GDALTranspose2DComplex_internal(double); break;
6378 0 : case GDT_Unknown:
6379 : case GDT_TypeCount:
6380 0 : break;
6381 : }
6382 : // clang-format on
6383 :
6384 : #undef CALL_GDALTranspose2D_internal
6385 : #undef CALL_GDALTranspose2DComplex_internal
6386 280 : }
6387 :
6388 : /************************************************************************/
6389 : /* GDALInterleave2Byte() */
6390 : /************************************************************************/
6391 :
6392 : #if defined(HAVE_SSE2) && \
6393 : (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
6394 :
6395 : // ICC autovectorizer doesn't do a good job at generating good SSE code,
6396 : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
6397 : #if defined(__GNUC__)
6398 : __attribute__((noinline))
6399 : #endif
6400 : static void
6401 : GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
6402 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
6403 : {
6404 : size_t i = 0;
6405 : constexpr size_t VALS_PER_ITER = 16;
6406 : for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
6407 : {
6408 : __m128i xmm0 =
6409 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + i));
6410 : __m128i xmm1 = _mm_loadu_si128(
6411 : reinterpret_cast<__m128i const *>(pSrc + i + nIters));
6412 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + 2 * i),
6413 : _mm_unpacklo_epi8(xmm0, xmm1));
6414 : _mm_storeu_si128(
6415 : reinterpret_cast<__m128i *>(pDst + 2 * i + VALS_PER_ITER),
6416 : _mm_unpackhi_epi8(xmm0, xmm1));
6417 : }
6418 : #if defined(__clang__)
6419 : #pragma clang loop vectorize(disable)
6420 : #endif
6421 : for (; i < nIters; ++i)
6422 : {
6423 : pDst[2 * i + 0] = pSrc[i + 0 * nIters];
6424 : pDst[2 * i + 1] = pSrc[i + 1 * nIters];
6425 : }
6426 : }
6427 :
6428 : #else
6429 :
6430 : #if defined(__GNUC__) && !defined(__clang__)
6431 : __attribute__((optimize("tree-vectorize")))
6432 : #endif
6433 : #if defined(__GNUC__)
6434 : __attribute__((noinline))
6435 : #endif
6436 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6437 : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
6438 : #pragma clang diagnostic push
6439 : #pragma clang diagnostic ignored "-Wpass-failed"
6440 : #endif
6441 : static void
6442 4 : GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
6443 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
6444 : {
6445 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6446 : #pragma clang loop vectorize(enable)
6447 : #endif
6448 44 : for (size_t i = 0; i < nIters; ++i)
6449 : {
6450 40 : pDst[2 * i + 0] = pSrc[i + 0 * nIters];
6451 40 : pDst[2 * i + 1] = pSrc[i + 1 * nIters];
6452 : }
6453 4 : }
6454 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6455 : #pragma clang diagnostic pop
6456 : #endif
6457 :
6458 : #endif
6459 :
6460 : /************************************************************************/
6461 : /* GDALInterleave4Byte() */
6462 : /************************************************************************/
6463 :
6464 : #if defined(HAVE_SSE2) && \
6465 : (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
6466 :
6467 : // ICC autovectorizer doesn't do a good job at generating good SSE code,
6468 : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
6469 : #if defined(__GNUC__)
6470 : __attribute__((noinline))
6471 : #endif
6472 : static void
6473 : GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
6474 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
6475 : {
6476 : size_t i = 0;
6477 : constexpr size_t VALS_PER_ITER = 16;
6478 : for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
6479 : {
6480 : __m128i xmm0 = _mm_loadu_si128(
6481 : reinterpret_cast<__m128i const *>(pSrc + i + 0 * nIters));
6482 : __m128i xmm1 = _mm_loadu_si128(
6483 : reinterpret_cast<__m128i const *>(pSrc + i + 1 * nIters));
6484 : __m128i xmm2 = _mm_loadu_si128(
6485 : reinterpret_cast<__m128i const *>(pSrc + i + 2 * nIters));
6486 : __m128i xmm3 = _mm_loadu_si128(
6487 : reinterpret_cast<__m128i const *>(pSrc + i + 3 * nIters));
6488 : auto tmp0 = _mm_unpacklo_epi8(
6489 : xmm0,
6490 : xmm1); // (xmm0_0, xmm1_0, xmm0_1, xmm1_1, xmm0_2, xmm1_2, ...)
6491 : auto tmp1 = _mm_unpackhi_epi8(
6492 : xmm0,
6493 : xmm1); // (xmm0_8, xmm1_8, xmm0_9, xmm1_9, xmm0_10, xmm1_10, ...)
6494 : auto tmp2 = _mm_unpacklo_epi8(
6495 : xmm2,
6496 : xmm3); // (xmm2_0, xmm3_0, xmm2_1, xmm3_1, xmm2_2, xmm3_2, ...)
6497 : auto tmp3 = _mm_unpackhi_epi8(
6498 : xmm2,
6499 : xmm3); // (xmm2_8, xmm3_8, xmm2_9, xmm3_9, xmm2_10, xmm3_10, ...)
6500 : auto tmp2_0 = _mm_unpacklo_epi16(
6501 : tmp0,
6502 : tmp2); // (xmm0_0, xmm1_0, xmm2_0, xmm3_0, xmm0_1, xmm1_1, xmm2_1, xmm3_1, ...)
6503 : auto tmp2_1 = _mm_unpackhi_epi16(tmp0, tmp2);
6504 : auto tmp2_2 = _mm_unpacklo_epi16(tmp1, tmp3);
6505 : auto tmp2_3 = _mm_unpackhi_epi16(tmp1, tmp3);
6506 : _mm_storeu_si128(
6507 : reinterpret_cast<__m128i *>(pDst + 4 * i + 0 * VALS_PER_ITER),
6508 : tmp2_0);
6509 : _mm_storeu_si128(
6510 : reinterpret_cast<__m128i *>(pDst + 4 * i + 1 * VALS_PER_ITER),
6511 : tmp2_1);
6512 : _mm_storeu_si128(
6513 : reinterpret_cast<__m128i *>(pDst + 4 * i + 2 * VALS_PER_ITER),
6514 : tmp2_2);
6515 : _mm_storeu_si128(
6516 : reinterpret_cast<__m128i *>(pDst + 4 * i + 3 * VALS_PER_ITER),
6517 : tmp2_3);
6518 : }
6519 : #if defined(__clang__)
6520 : #pragma clang loop vectorize(disable)
6521 : #endif
6522 : for (; i < nIters; ++i)
6523 : {
6524 : pDst[4 * i + 0] = pSrc[i + 0 * nIters];
6525 : pDst[4 * i + 1] = pSrc[i + 1 * nIters];
6526 : pDst[4 * i + 2] = pSrc[i + 2 * nIters];
6527 : pDst[4 * i + 3] = pSrc[i + 3 * nIters];
6528 : }
6529 : }
6530 :
6531 : #else
6532 :
6533 : #if defined(__GNUC__) && !defined(__clang__)
6534 : __attribute__((optimize("tree-vectorize")))
6535 : #endif
6536 : #if defined(__GNUC__)
6537 : __attribute__((noinline))
6538 : #endif
6539 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6540 : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
6541 : #pragma clang diagnostic push
6542 : #pragma clang diagnostic ignored "-Wpass-failed"
6543 : #endif
6544 : static void
6545 2 : GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
6546 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
6547 : {
6548 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6549 : #pragma clang loop vectorize(enable)
6550 : #endif
6551 36 : for (size_t i = 0; i < nIters; ++i)
6552 : {
6553 34 : pDst[4 * i + 0] = pSrc[i + 0 * nIters];
6554 34 : pDst[4 * i + 1] = pSrc[i + 1 * nIters];
6555 34 : pDst[4 * i + 2] = pSrc[i + 2 * nIters];
6556 34 : pDst[4 * i + 3] = pSrc[i + 3 * nIters];
6557 : }
6558 2 : }
6559 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6560 : #pragma clang diagnostic pop
6561 : #endif
6562 :
6563 : #endif
6564 :
6565 : /************************************************************************/
6566 : /* GDALTranspose2D() */
6567 : /************************************************************************/
6568 :
6569 : /**
6570 : * Transpose a 2D array in a efficient (cache-oblivious) way.
6571 : *
6572 : * @param pSrc Source array of width = nSrcWidth and height = nSrcHeight.
6573 : * @param eSrcType Data type of pSrc.
6574 : * @param pDst Destination transposed array of width = nSrcHeight and height = nSrcWidth.
6575 : * @param eDstType Data type of pDst.
6576 : * @param nSrcWidth Width of pSrc array.
6577 : * @param nSrcHeight Height of pSrc array.
6578 : * @since GDAL 3.11
6579 : */
6580 :
6581 305 : void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
6582 : GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
6583 : {
6584 305 : if (eSrcType == eDstType && (eSrcType == GDT_Byte || eSrcType == GDT_Int8))
6585 : {
6586 25 : if (nSrcHeight == 2)
6587 : {
6588 4 : GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
6589 : static_cast<uint8_t *>(pDst), nSrcWidth);
6590 4 : return;
6591 : }
6592 21 : if (nSrcHeight == 4)
6593 : {
6594 2 : GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
6595 : static_cast<uint8_t *>(pDst), nSrcWidth);
6596 2 : return;
6597 : }
6598 : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
6599 : (defined(__x86_64) || defined(_M_X64)))
6600 19 : if (CPLHaveRuntimeSSSE3())
6601 : {
6602 19 : GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
6603 : static_cast<uint8_t *>(pDst), nSrcWidth,
6604 : nSrcHeight);
6605 19 : return;
6606 : }
6607 : #elif defined(USE_NEON_OPTIMIZATIONS)
6608 : {
6609 : GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
6610 : static_cast<uint8_t *>(pDst), nSrcWidth,
6611 : nSrcHeight);
6612 : return;
6613 : }
6614 : #endif
6615 : }
6616 :
6617 : #define CALL_GDALTranspose2D_internal(DST_TYPE, DST_IS_COMPLEX) \
6618 : GDALTranspose2D<DST_TYPE, DST_IS_COMPLEX>( \
6619 : pSrc, eSrcType, static_cast<DST_TYPE *>(pDst), nSrcWidth, nSrcHeight)
6620 :
6621 : // clang-format off
6622 280 : switch (eDstType)
6623 : {
6624 15 : case GDT_Byte: CALL_GDALTranspose2D_internal(uint8_t, false); break;
6625 15 : case GDT_Int8: CALL_GDALTranspose2D_internal(int8_t, false); break;
6626 24 : case GDT_UInt16: CALL_GDALTranspose2D_internal(uint16_t, false); break;
6627 16 : case GDT_Int16: CALL_GDALTranspose2D_internal(int16_t, false); break;
6628 24 : case GDT_UInt32: CALL_GDALTranspose2D_internal(uint32_t, false); break;
6629 16 : case GDT_Int32: CALL_GDALTranspose2D_internal(int32_t, false); break;
6630 16 : case GDT_UInt64: CALL_GDALTranspose2D_internal(uint64_t, false); break;
6631 16 : case GDT_Int64: CALL_GDALTranspose2D_internal(int64_t, false); break;
6632 16 : case GDT_Float16: CALL_GDALTranspose2D_internal(GFloat16, false); break;
6633 17 : case GDT_Float32: CALL_GDALTranspose2D_internal(float, false); break;
6634 25 : case GDT_Float64: CALL_GDALTranspose2D_internal(double, false); break;
6635 16 : case GDT_CInt16: CALL_GDALTranspose2D_internal(int16_t, true); break;
6636 16 : case GDT_CInt32: CALL_GDALTranspose2D_internal(int32_t, true); break;
6637 16 : case GDT_CFloat16: CALL_GDALTranspose2D_internal(GFloat16, true); break;
6638 16 : case GDT_CFloat32: CALL_GDALTranspose2D_internal(float, true); break;
6639 16 : case GDT_CFloat64: CALL_GDALTranspose2D_internal(double, true); break;
6640 0 : case GDT_Unknown:
6641 : case GDT_TypeCount:
6642 0 : break;
6643 : }
6644 : // clang-format on
6645 :
6646 : #undef CALL_GDALTranspose2D_internal
6647 : }
6648 :
6649 : /************************************************************************/
6650 : /* ExtractBitAndConvertTo255() */
6651 : /************************************************************************/
6652 :
6653 : #if defined(__GNUC__) || defined(_MSC_VER)
6654 : // Signedness of char implementation dependent, so be explicit.
6655 : // Assumes 2-complement integer types and sign extension of right shifting
6656 : // GCC guarantees such:
6657 : // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
6658 157290 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
6659 : {
6660 157290 : return static_cast<GByte>(static_cast<signed char>(byVal << (7 - nBit)) >>
6661 157290 : 7);
6662 : }
6663 : #else
6664 : // Portable way
6665 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
6666 : {
6667 : return (byVal & (1 << nBit)) ? 255 : 0;
6668 : }
6669 : #endif
6670 :
6671 : /************************************************************************/
6672 : /* ExpandEightPackedBitsToByteAt255() */
6673 : /************************************************************************/
6674 :
6675 19457 : static inline void ExpandEightPackedBitsToByteAt255(GByte byVal,
6676 : GByte abyOutput[8])
6677 : {
6678 19457 : abyOutput[0] = ExtractBitAndConvertTo255(byVal, 7);
6679 19457 : abyOutput[1] = ExtractBitAndConvertTo255(byVal, 6);
6680 19457 : abyOutput[2] = ExtractBitAndConvertTo255(byVal, 5);
6681 19457 : abyOutput[3] = ExtractBitAndConvertTo255(byVal, 4);
6682 19457 : abyOutput[4] = ExtractBitAndConvertTo255(byVal, 3);
6683 19457 : abyOutput[5] = ExtractBitAndConvertTo255(byVal, 2);
6684 19457 : abyOutput[6] = ExtractBitAndConvertTo255(byVal, 1);
6685 19457 : abyOutput[7] = ExtractBitAndConvertTo255(byVal, 0);
6686 19457 : }
6687 :
6688 : /************************************************************************/
6689 : /* GDALExpandPackedBitsToByteAt0Or255() */
6690 : /************************************************************************/
6691 :
6692 : /** Expand packed-bits (ordered from most-significant bit to least one)
6693 : into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
6694 : at 1 to a byte at 255.
6695 :
6696 : The function does (in a possibly more optimized way) the following:
6697 : \code{.cpp}
6698 : for (size_t i = 0; i < nInputBits; ++i )
6699 : {
6700 : pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 255 : 0;
6701 : }
6702 : \endcode
6703 :
6704 : @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
6705 : @param pabyOutput Output array of nInputBits bytes.
6706 : @param nInputBits Number of valid bits in pabyInput.
6707 :
6708 : @since 3.11
6709 : */
6710 :
6711 44445 : void GDALExpandPackedBitsToByteAt0Or255(const GByte *CPL_RESTRICT pabyInput,
6712 : GByte *CPL_RESTRICT pabyOutput,
6713 : size_t nInputBits)
6714 : {
6715 44445 : const size_t nInputWholeBytes = nInputBits / 8;
6716 44445 : size_t iByte = 0;
6717 :
6718 : #ifdef HAVE_SSE2
6719 : // Mask to isolate each bit
6720 44445 : const __m128i bit_mask = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4,
6721 : 8, 16, 32, 64, -128);
6722 44445 : const __m128i zero = _mm_setzero_si128();
6723 44445 : const __m128i all_ones = _mm_set1_epi8(-1);
6724 : #ifdef __SSSE3__
6725 : const __m128i dispatch_two_bytes =
6726 : _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
6727 : #endif
6728 44445 : constexpr size_t SSE_REG_SIZE = sizeof(bit_mask);
6729 132394 : for (; iByte + SSE_REG_SIZE <= nInputWholeBytes; iByte += SSE_REG_SIZE)
6730 : {
6731 87949 : __m128i reg_ori = _mm_loadu_si128(
6732 87949 : reinterpret_cast<const __m128i *>(pabyInput + iByte));
6733 :
6734 87949 : constexpr int NUM_PROCESSED_BYTES_PER_REG = 2;
6735 791541 : for (size_t k = 0; k < SSE_REG_SIZE / NUM_PROCESSED_BYTES_PER_REG; ++k)
6736 : {
6737 : // Given reg_ori = (A, B, ... 14 other bytes ...),
6738 : // expand to (A, A, A, A, A, A, A, A, B, B, B, B, B, B, B, B)
6739 : #ifdef __SSSE3__
6740 : __m128i reg = _mm_shuffle_epi8(reg_ori, dispatch_two_bytes);
6741 : #else
6742 703592 : __m128i reg = _mm_unpacklo_epi8(reg_ori, reg_ori);
6743 703592 : reg = _mm_unpacklo_epi16(reg, reg);
6744 703592 : reg = _mm_unpacklo_epi32(reg, reg);
6745 : #endif
6746 :
6747 : // Test if bits of interest are set
6748 703592 : reg = _mm_and_si128(reg, bit_mask);
6749 :
6750 : // Now test if those bits are set, by comparing to zero. So the
6751 : // result will be that bytes where bits are set will be at 0, and
6752 : // ones where they are cleared will be at 0xFF. So the inverse of
6753 : // the end result we want!
6754 703592 : reg = _mm_cmpeq_epi8(reg, zero);
6755 :
6756 : // Invert the result
6757 703592 : reg = _mm_andnot_si128(reg, all_ones);
6758 :
6759 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyOutput), reg);
6760 :
6761 703592 : pabyOutput += SSE_REG_SIZE;
6762 :
6763 : // Right-shift of 2 bytes
6764 703592 : reg_ori = _mm_bsrli_si128(reg_ori, NUM_PROCESSED_BYTES_PER_REG);
6765 : }
6766 : }
6767 :
6768 : #endif // HAVE_SSE2
6769 :
6770 63902 : for (; iByte < nInputWholeBytes; ++iByte)
6771 : {
6772 19457 : ExpandEightPackedBitsToByteAt255(pabyInput[iByte], pabyOutput);
6773 19457 : pabyOutput += 8;
6774 : }
6775 46079 : for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
6776 : {
6777 1634 : *pabyOutput = ExtractBitAndConvertTo255(pabyInput[iByte], 7 - iBit);
6778 1634 : ++pabyOutput;
6779 : }
6780 44445 : }
6781 :
6782 : /************************************************************************/
6783 : /* ExpandEightPackedBitsToByteAt1() */
6784 : /************************************************************************/
6785 :
6786 136113 : static inline void ExpandEightPackedBitsToByteAt1(GByte byVal,
6787 : GByte abyOutput[8])
6788 : {
6789 136113 : abyOutput[0] = (byVal >> 7) & 0x1;
6790 136113 : abyOutput[1] = (byVal >> 6) & 0x1;
6791 136113 : abyOutput[2] = (byVal >> 5) & 0x1;
6792 136113 : abyOutput[3] = (byVal >> 4) & 0x1;
6793 136113 : abyOutput[4] = (byVal >> 3) & 0x1;
6794 136113 : abyOutput[5] = (byVal >> 2) & 0x1;
6795 136113 : abyOutput[6] = (byVal >> 1) & 0x1;
6796 136113 : abyOutput[7] = (byVal >> 0) & 0x1;
6797 136113 : }
6798 :
6799 : /************************************************************************/
6800 : /* GDALExpandPackedBitsToByteAt0Or1() */
6801 : /************************************************************************/
6802 :
6803 : /** Expand packed-bits (ordered from most-significant bit to least one)
6804 : into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
6805 : at 1 to a byte at 1.
6806 :
6807 : The function does (in a possibly more optimized way) the following:
6808 : \code{.cpp}
6809 : for (size_t i = 0; i < nInputBits; ++i )
6810 : {
6811 : pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
6812 : }
6813 : \endcode
6814 :
6815 : @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
6816 : @param pabyOutput Output array of nInputBits bytes.
6817 : @param nInputBits Number of valid bits in pabyInput.
6818 :
6819 : @since 3.11
6820 : */
6821 :
6822 7041 : void GDALExpandPackedBitsToByteAt0Or1(const GByte *CPL_RESTRICT pabyInput,
6823 : GByte *CPL_RESTRICT pabyOutput,
6824 : size_t nInputBits)
6825 : {
6826 7041 : const size_t nInputWholeBytes = nInputBits / 8;
6827 7041 : size_t iByte = 0;
6828 143154 : for (; iByte < nInputWholeBytes; ++iByte)
6829 : {
6830 136113 : ExpandEightPackedBitsToByteAt1(pabyInput[iByte], pabyOutput);
6831 136113 : pabyOutput += 8;
6832 : }
6833 18902 : for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
6834 : {
6835 11861 : *pabyOutput = (pabyInput[iByte] >> (7 - iBit)) & 0x1;
6836 11861 : ++pabyOutput;
6837 : }
6838 7041 : }
|