Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: GDAL Core
4 : * Purpose: Contains default implementation of GDALRasterBand::IRasterIO()
5 : * and supporting functions of broader utility.
6 : * Author: Frank Warmerdam, warmerdam@pobox.com
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 1998, Frank Warmerdam
10 : * Copyright (c) 2007-2014, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * SPDX-License-Identifier: MIT
13 : ****************************************************************************/
14 :
15 : #include "cpl_port.h"
16 : #include "gdal.h"
17 : #include "gdal_priv.h"
18 :
19 : #include <cassert>
20 : #include <climits>
21 : #include <cmath>
22 : #include <cstddef>
23 : #include <cstdio>
24 : #include <cstdlib>
25 : #include <cstring>
26 :
27 : #include <algorithm>
28 : #include <limits>
29 : #include <stdexcept>
30 : #include <type_traits>
31 :
32 : #include "cpl_conv.h"
33 : #include "cpl_cpu_features.h"
34 : #include "cpl_error.h"
35 : #include "cpl_float.h"
36 : #include "cpl_progress.h"
37 : #include "cpl_string.h"
38 : #include "cpl_vsi.h"
39 : #include "gdal_priv_templates.hpp"
40 : #include "gdal_vrt.h"
41 : #include "gdalwarper.h"
42 : #include "memdataset.h"
43 : #include "vrtdataset.h"
44 :
45 : #if defined(__x86_64) || defined(_M_X64)
46 : #include <emmintrin.h>
47 : #define HAVE_SSE2
48 : #elif defined(USE_NEON_OPTIMIZATIONS)
49 : #include "include_sse2neon.h"
50 : #define HAVE_SSE2
51 : #endif
52 :
53 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
54 : #include "rasterio_ssse3.h"
55 : #ifdef __SSSE3__
56 : #include <tmmintrin.h>
57 : #endif
58 : #endif
59 :
60 : #ifdef __SSE4_1__
61 : #include <smmintrin.h>
62 : #endif
63 :
64 : #ifdef __GNUC__
65 : #define CPL_NOINLINE __attribute__((noinline))
66 : #else
67 : #define CPL_NOINLINE
68 : #endif
69 :
// Forward declaration of the strided byte-copy helper. It is called by
// DownsamplingIntegerXFactor() below for the one-byte, same-data-type case,
// with a source stride, a destination stride and a count of bytes to copy.
// The definition is not part of this portion of the listing.
70 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
71 : int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
72 : int nDstPixelStride, GPtrDiff_t nWordCount);
73 :
74 : /************************************************************************/
75 : /* DownsamplingIntegerXFactor() */
76 : /************************************************************************/
77 :
// Fills one output line by picking one source pixel every nSrcXInc source
// columns (integer X step, no interpolation) from the cached blocks of poBand.
//
// Template parameters:
//   bSameDataType  - selects the copy strategy: when true, pixels are copied
//                    raw (memcpy(), or GDALFastCopyByte() for 1-byte pixels);
//                    when false, GDALCopyWords64() converts from eDataType
//                    to eBufType.
//   DATA_TYPE_SIZE - pixel size in bytes; only used when bSameDataType is
//                    true (otherwise the size comes from eDataType).
//
// Parameters:
//   poBand        - band whose blocks are obtained with GetLockedBlockRef().
//   iSrcX         - source column of the first output pixel.
//   nSrcXInc      - source column step between consecutive output pixels.
//   iSrcOffsetCst - offset, in pixels, added to the in-block column offset
//                   (iSrcX - nStartBlockX) to locate a pixel in the block
//                   buffer.
//   pabyDstData   - where the first output pixel is written.
//   nPixelSpace   - byte stride between consecutive output pixels.
//   nBufXSize     - number of output pixels to produce.
//   eDataType     - source data type (element size / conversion source).
//   eBufType      - destination data type (conversion target).
//   nStartBlockX  - [in,out] first column covered by the block held in
//                   poBlock; rewritten each time another block is loaded.
//   nBlockXSize   - block width in pixels.
//   poBlock       - [in,out] currently locked block. Must be non-null on
//                   entry when iSrcX < nStartBlockX + nBlockXSize (asserted).
//                   When another block is needed, the held one is unlocked
//                   and replaced.
//   nLBlockY      - block row index passed to GetLockedBlockRef().
//
// Returns false when a block cannot be obtained (poBlock is then null),
// true otherwise. On success the last block used is still locked in poBlock:
// this function never drops it itself.
// NOTE(review): presumably the caller releases that lock - confirm.
78 : template <bool bSameDataType, int DATA_TYPE_SIZE>
79 695780 : static bool DownsamplingIntegerXFactor(
80 : GDALRasterBand *poBand, int iSrcX, int nSrcXInc, GPtrDiff_t iSrcOffsetCst,
81 : GByte *CPL_RESTRICT pabyDstData, int nPixelSpace, int nBufXSize,
82 : GDALDataType eDataType, GDALDataType eBufType, int &nStartBlockX,
83 : int nBlockXSize, GDALRasterBlock *&poBlock, int nLBlockY)
84 : {
85 695780 : const int nBandDataSize =
86 : bSameDataType ? DATA_TYPE_SIZE : GDALGetDataTypeSizeBytes(eDataType);
// Counts the output pixels still to produce, excluding the final one, which
// is handled separately after the loop.
87 695780 : int nOuterLoopIters = nBufXSize - 1;
// Byte distance in the source block between two consecutive picked pixels.
88 695780 : const int nIncSrcOffset = nSrcXInc * nBandDataSize;
// Left uninitialized on purpose: both entry paths below assign it at the
// no_reload_block label before it is read.
89 : const GByte *CPL_RESTRICT pabySrcData;
90 695780 : int nEndBlockX = nBlockXSize + nStartBlockX;
91 :
// Entry: jump directly into the loop body to emit the first output pixel,
// without applying the per-iteration increments. The block already held in
// poBlock is reused when iSrcX lies before its end column.
92 695780 : if (iSrcX < nEndBlockX)
93 : {
94 294999 : CPLAssert(poBlock);
95 294999 : goto no_reload_block;
96 : }
97 400781 : goto reload_block;
98 :
99 : // Don't do the last iteration in the loop, as iSrcX might go beyond
100 : // nRasterXSize - 1
101 1264973 : while (--nOuterLoopIters >= 1)
102 : {
103 201834 : iSrcX += nSrcXInc;
104 201834 : pabySrcData += nIncSrcOffset;
105 201834 : pabyDstData += nPixelSpace;
106 :
107 : /* --------------------------------------------------------------------
108 : */
109 : /* Ensure we have the appropriate block loaded. */
110 : /* --------------------------------------------------------------------
111 : */
112 201834 : if (iSrcX >= nEndBlockX)
113 : {
// Also reached by goto for the first and last output pixels.
114 201834 : reload_block:
115 : {
116 615205 : const int nLBlockX = iSrcX / nBlockXSize;
117 615205 : nStartBlockX = nLBlockX * nBlockXSize;
118 615205 : nEndBlockX = nStartBlockX + nBlockXSize;
119 :
// Unlock the block held so far before locking the one containing iSrcX.
120 615205 : if (poBlock != nullptr)
121 341376 : poBlock->DropLock();
122 :
123 615205 : poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
124 615205 : if (poBlock == nullptr)
125 : {
// The block could not be obtained: poBlock stays null and the caller is
// told to fail.
126 1 : return false;
127 : }
128 : }
129 :
// Also reached by goto when the block already held covers iSrcX: recompute
// the source pointer from the start of the block buffer.
130 615204 : no_reload_block:
131 : const GByte *pabySrcBlock =
132 1264973 : static_cast<const GByte *>(poBlock->GetDataRef());
133 1264973 : GPtrDiff_t iSrcOffset =
134 1264973 : (iSrcX - nStartBlockX + iSrcOffsetCst) * nBandDataSize;
135 1264973 : pabySrcData = pabySrcBlock + iSrcOffset;
136 : }
137 :
138 : /* --------------------------------------------------------------------
139 : */
140 : /* Copy the maximum run of pixels. */
141 : /* --------------------------------------------------------------------
142 : */
143 :
// Number of picked pixels whose source column lies in the current block
// (distance to the block end divided by the step, rounded up), capped by
// nOuterLoopIters. It is 0 when the body was entered by goto with
// nOuterLoopIters == 0; one pixel is still copied below in that case.
144 1264973 : const int nIters = std::min(
145 1264973 : (nEndBlockX - iSrcX + (nSrcXInc - 1)) / nSrcXInc, nOuterLoopIters);
146 : if (bSameDataType)
147 : {
// The current pixel is always copied.
148 1264530 : memcpy(pabyDstData, pabySrcData, nBandDataSize);
149 1264530 : if (nIters > 1)
150 : {
151 : if (DATA_TYPE_SIZE == 1)
152 : {
// Step to the second pixel of the run and copy the remaining nIters - 1
// bytes in one call.
153 326250 : pabySrcData += nIncSrcOffset;
154 326250 : pabyDstData += nPixelSpace;
155 326250 : GDALFastCopyByte(pabySrcData, nIncSrcOffset, pabyDstData,
156 326250 : nPixelSpace, nIters - 1);
// The pointers were passed by value, so advance them nIters - 2 further
// strides: with the single step above they end on the last pixel copied.
157 326250 : pabySrcData +=
158 326250 : static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 2);
159 326250 : pabyDstData +=
160 326250 : static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 2);
161 : }
162 : else
163 : {
// Copy the remaining nIters - 1 pixels one by one; the pointers end on the
// last pixel copied.
164 4395716 : for (int i = 0; i < nIters - 1; i++)
165 : {
166 4197550 : pabySrcData += nIncSrcOffset;
167 4197550 : pabyDstData += nPixelSpace;
168 4197550 : memcpy(pabyDstData, pabySrcData, nBandDataSize);
169 : }
170 : }
// Account for the nIters - 1 pixels emitted beyond the current one.
171 524420 : iSrcX += nSrcXInc * (nIters - 1);
172 524420 : nOuterLoopIters -= nIters - 1;
173 : }
174 : }
175 : else
176 : {
177 : // Type to type conversion ...
// std::max(1, nIters): the current pixel is always converted, even when
// nIters is 0.
178 443 : GDALCopyWords64(pabySrcData, eDataType, nIncSrcOffset, pabyDstData,
179 443 : eBufType, nPixelSpace, std::max(1, nIters));
180 443 : if (nIters > 1)
181 : {
// Leave the pointers on the last pixel converted and account for the extra
// nIters - 1 pixels.
182 216 : pabySrcData +=
183 216 : static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 1);
184 216 : pabyDstData +=
185 216 : static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 1);
186 216 : iSrcX += nSrcXInc * (nIters - 1);
187 216 : nOuterLoopIters -= nIters - 1;
188 : }
189 : }
190 : }
191 :
192 : // Deal with last iteration to avoid iSrcX to go beyond nRasterXSize - 1
// After the goto below re-enters the loop body, the loop condition
// decrements nOuterLoopIters to -1, so this branch runs at most once.
193 1063139 : if (nOuterLoopIters == 0)
194 : {
195 367360 : const int nRasterXSize = poBand->GetXSize();
// The next source column is computed in 64 bits and clamped to the last
// column of the band.
196 367360 : iSrcX =
197 734720 : static_cast<int>(std::min(static_cast<GInt64>(iSrcX) + nSrcXInc,
198 367360 : static_cast<GInt64>(nRasterXSize - 1)));
199 367360 : pabyDstData += nPixelSpace;
200 367360 : if (iSrcX < nEndBlockX)
201 : {
202 354770 : goto no_reload_block;
203 : }
204 12590 : goto reload_block;
205 : }
206 695779 : return true;
207 : }
208 :
// Returns the product a * b, evaluated in the type the built-in (or
// overloaded) operator* yields for the two argument types.
// The function is tagged with CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW, a macro
// defined outside this file; judging by its name it keeps the
// unsigned-integer-overflow sanitizer from flagging a wrapping product here.
template <class TLhs, class TRhs>
CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW inline auto CPLUnsanitizedMul(TLhs a,
                                                                   TRhs b)
{
    auto product = a * b;
    return product;
}
214 :
215 : /************************************************************************/
216 : /* IRasterIO() */
217 : /* */
218 : /* Default internal implementation of RasterIO() ... utilizes */
219 : /* the Block access methods to satisfy the request. This would */
220 : /* normally only be overridden by formats with overviews. */
221 : /************************************************************************/
222 :
223 6118970 : CPLErr GDALRasterBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
224 : int nXSize, int nYSize, void *pData,
225 : int nBufXSize, int nBufYSize,
226 : GDALDataType eBufType, GSpacing nPixelSpace,
227 : GSpacing nLineSpace,
228 : GDALRasterIOExtraArg *psExtraArg)
229 :
230 : {
231 6118970 : if (eRWFlag == GF_Write && eFlushBlockErr != CE_None)
232 : {
233 0 : CPLError(eFlushBlockErr, CPLE_AppDefined,
234 : "An error occurred while writing a dirty block "
235 : "from GDALRasterBand::IRasterIO");
236 0 : CPLErr eErr = eFlushBlockErr;
237 0 : eFlushBlockErr = CE_None;
238 0 : return eErr;
239 : }
240 6118970 : if (nBlockXSize <= 0 || nBlockYSize <= 0)
241 : {
242 0 : CPLError(CE_Failure, CPLE_AppDefined, "Invalid block size");
243 0 : return CE_Failure;
244 : }
245 :
246 6118970 : const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
247 6118970 : const int nBufDataSize = GDALGetDataTypeSizeBytes(eBufType);
248 6118970 : GByte dummyBlock[2] = {0, 0};
249 6118970 : GByte *pabySrcBlock =
250 : dummyBlock; /* to avoid Coverity warning about nullptr dereference */
251 6118970 : GDALRasterBlock *poBlock = nullptr;
252 6118970 : const bool bUseIntegerRequestCoords =
253 6466180 : (!psExtraArg->bFloatingPointWindowValidity ||
254 347211 : (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
255 323819 : nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
256 :
257 : /* ==================================================================== */
258 : /* A common case is the data requested with the destination */
259 : /* is packed, and the block width is the raster width. */
260 : /* ==================================================================== */
261 6041470 : if (nPixelSpace == nBufDataSize && nLineSpace == nPixelSpace * nXSize &&
262 3191750 : nBlockXSize == GetXSize() && nBufXSize == nXSize &&
263 12160400 : nBufYSize == nYSize && bUseIntegerRequestCoords)
264 : {
265 3079220 : CPLErr eErr = CE_None;
266 3079220 : int nLBlockY = -1;
267 :
268 9619620 : for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
269 : {
270 6541480 : const int iSrcY = iBufYOff + nYOff;
271 :
272 6541480 : if (iSrcY < nLBlockY * nBlockYSize ||
273 6541480 : iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
274 : {
275 3338900 : nLBlockY = iSrcY / nBlockYSize;
276 3338900 : bool bJustInitialize =
277 295478 : eRWFlag == GF_Write && nXOff == 0 &&
278 3691510 : nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
279 57137 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize;
280 :
281 : // Is this a partial tile at right and/or bottom edges of
282 : // the raster, and that is going to be completely written?
283 : // If so, do not load it from storage, but zero it so that
284 : // the content outside of the validity area is initialized.
285 3338900 : bool bMemZeroBuffer = false;
286 295478 : if (eRWFlag == GF_Write && !bJustInitialize && nXOff == 0 &&
287 23861 : nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
288 3634460 : nYOff + nYSize == GetYSize() &&
289 89 : nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)
290 : {
291 89 : bJustInitialize = true;
292 89 : bMemZeroBuffer = true;
293 : }
294 :
295 3338900 : if (poBlock)
296 259673 : poBlock->DropLock();
297 :
298 3338900 : const GUInt32 nErrorCounter = CPLGetErrorCounter();
299 3338900 : poBlock = GetLockedBlockRef(0, nLBlockY, bJustInitialize);
300 3338900 : if (poBlock == nullptr)
301 : {
302 1079 : if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
303 : nullptr)
304 : {
305 0 : CPLError(CE_Failure, CPLE_AppDefined,
306 : "GetBlockRef failed at X block offset %d, "
307 : "Y block offset %d%s",
308 : 0, nLBlockY,
309 0 : (nErrorCounter != CPLGetErrorCounter())
310 0 : ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
311 : : "");
312 : }
313 1079 : eErr = CE_Failure;
314 1079 : break;
315 : }
316 :
317 3337820 : if (eRWFlag == GF_Write)
318 295478 : poBlock->MarkDirty();
319 :
320 3337820 : pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
321 3337820 : if (bMemZeroBuffer)
322 : {
323 89 : memset(pabySrcBlock, 0,
324 89 : static_cast<GPtrDiff_t>(nBandDataSize) *
325 89 : nBlockXSize * nBlockYSize);
326 : }
327 : }
328 :
329 6540400 : const auto nSrcByteOffset =
330 6540400 : (static_cast<GPtrDiff_t>(iSrcY - nLBlockY * nBlockYSize) *
331 6540400 : nBlockXSize +
332 6540400 : nXOff) *
333 6540400 : nBandDataSize;
334 :
335 6540400 : if (eDataType == eBufType)
336 : {
337 2892660 : if (eRWFlag == GF_Read)
338 2422040 : memcpy(static_cast<GByte *>(pData) +
339 2422040 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
340 2422040 : pabySrcBlock + nSrcByteOffset,
341 : static_cast<size_t>(nLineSpace));
342 : else
343 470615 : memcpy(pabySrcBlock + nSrcByteOffset,
344 470615 : static_cast<GByte *>(pData) +
345 470615 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
346 : static_cast<size_t>(nLineSpace));
347 : }
348 : else
349 : {
350 : // Type to type conversion.
351 3647740 : if (eRWFlag == GF_Read)
352 3626140 : GDALCopyWords64(
353 3626140 : pabySrcBlock + nSrcByteOffset, eDataType, nBandDataSize,
354 : static_cast<GByte *>(pData) +
355 3626140 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
356 : eBufType, static_cast<int>(nPixelSpace), nBufXSize);
357 : else
358 21603 : GDALCopyWords64(static_cast<GByte *>(pData) +
359 21603 : static_cast<GPtrDiff_t>(iBufYOff) *
360 : nLineSpace,
361 : eBufType, static_cast<int>(nPixelSpace),
362 21603 : pabySrcBlock + nSrcByteOffset, eDataType,
363 : nBandDataSize, nBufXSize);
364 : }
365 :
366 6624000 : if (psExtraArg->pfnProgress != nullptr &&
367 83604 : !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
368 : psExtraArg->pProgressData))
369 : {
370 5 : eErr = CE_Failure;
371 5 : break;
372 : }
373 : }
374 :
375 3079220 : if (poBlock)
376 3078140 : poBlock->DropLock();
377 :
378 3079220 : return eErr;
379 : }
380 :
381 : /* ==================================================================== */
382 : /* Do we have overviews that would be appropriate to satisfy */
383 : /* this request? */
384 : /* ==================================================================== */
385 3039740 : if ((nBufXSize < nXSize || nBufYSize < nYSize) && GetOverviewCount() > 0 &&
386 : eRWFlag == GF_Read)
387 : {
388 : GDALRasterIOExtraArg sExtraArg;
389 2967 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
390 :
391 : const int nOverview =
392 2967 : GDALBandGetBestOverviewLevel2(this, nXOff, nYOff, nXSize, nYSize,
393 : nBufXSize, nBufYSize, &sExtraArg);
394 2967 : if (nOverview >= 0)
395 : {
396 2892 : GDALRasterBand *poOverviewBand = GetOverview(nOverview);
397 2892 : if (poOverviewBand == nullptr)
398 2892 : return CE_Failure;
399 :
400 2892 : return poOverviewBand->RasterIO(
401 : eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
402 2892 : nBufYSize, eBufType, nPixelSpace, nLineSpace, &sExtraArg);
403 : }
404 : }
405 :
406 848087 : if (eRWFlag == GF_Read && nBufXSize < nXSize / 100 &&
407 6 : nBufYSize < nYSize / 100 && nPixelSpace == nBufDataSize &&
408 3884940 : nLineSpace == nPixelSpace * nBufXSize &&
409 6 : CPLTestBool(CPLGetConfigOption("GDAL_NO_COSTLY_OVERVIEW", "NO")))
410 : {
411 0 : memset(pData, 0, static_cast<size_t>(nLineSpace * nBufYSize));
412 0 : return CE_None;
413 : }
414 :
415 : /* ==================================================================== */
416 : /* The second case when we don't need subsample data but likely */
417 : /* need data type conversion. */
418 : /* ==================================================================== */
419 3036850 : if ( // nPixelSpace == nBufDataSize &&
420 3036850 : nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
421 : {
422 : #if DEBUG_VERBOSE
423 : printf("IRasterIO(%d,%d,%d,%d) rw=%d case 2\n", /*ok*/
424 : nXOff, nYOff, nXSize, nYSize, static_cast<int>(eRWFlag));
425 : #endif
426 :
427 : /* --------------------------------------------------------------------
428 : */
429 : /* Loop over buffer computing source locations. */
430 : /* --------------------------------------------------------------------
431 : */
432 : // Calculate starting values out of loop
433 2471220 : const int nLBlockXStart = nXOff / nBlockXSize;
434 2471220 : const int nXSpanEnd = nBufXSize + nXOff;
435 :
436 2471220 : int nYInc = 0;
437 4981980 : for (int iBufYOff = 0, iSrcY = nYOff; iBufYOff < nBufYSize;
438 2510760 : iBufYOff += nYInc, iSrcY += nYInc)
439 : {
440 2510830 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
441 : static_cast<GPtrDiff_t>(nLineSpace);
442 2510830 : int nLBlockY = iSrcY / nBlockYSize;
443 2510830 : int nLBlockX = nLBlockXStart;
444 2510830 : int iSrcX = nXOff;
445 5242260 : while (iSrcX < nXSpanEnd)
446 : {
447 2731490 : int nXSpan = nLBlockX * nBlockXSize;
448 2731490 : if (nXSpan < INT_MAX - nBlockXSize)
449 2731490 : nXSpan += nBlockXSize;
450 : else
451 0 : nXSpan = INT_MAX;
452 2731490 : const int nXRight = nXSpan;
453 2731490 : nXSpan = (nXSpan < nXSpanEnd ? nXSpan : nXSpanEnd) - iSrcX;
454 :
455 : const size_t nXSpanSize =
456 2731490 : CPLUnsanitizedMul(nXSpan, static_cast<size_t>(nPixelSpace));
457 :
458 2731490 : bool bJustInitialize =
459 2042260 : eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
460 37317 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
461 4799390 : nXOff <= nLBlockX * nBlockXSize &&
462 25639 : nXOff + nXSize >= nXRight;
463 :
464 : // Is this a partial tile at right and/or bottom edges of
465 : // the raster, and that is going to be completely written?
466 : // If so, do not load it from storage, but zero it so that
467 : // the content outside of the validity area is initialized.
468 2731490 : bool bMemZeroBuffer = false;
469 2042260 : if (eRWFlag == GF_Write && !bJustInitialize &&
470 2017850 : nXOff <= nLBlockX * nBlockXSize &&
471 2016200 : nYOff <= nLBlockY * nBlockYSize &&
472 12152 : (nXOff + nXSize >= nXRight ||
473 : // cppcheck-suppress knownConditionTrueFalse
474 4776460 : (nXOff + nXSize == GetXSize() && nXRight > GetXSize())) &&
475 11972 : (nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize ||
476 10750 : (nYOff + nYSize == GetYSize() &&
477 1958 : nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)))
478 : {
479 3180 : bJustInitialize = true;
480 3180 : bMemZeroBuffer = true;
481 : }
482 :
483 : /* --------------------------------------------------------------------
484 : */
485 : /* Ensure we have the appropriate block loaded. */
486 : /* --------------------------------------------------------------------
487 : */
488 2731490 : const GUInt32 nErrorCounter = CPLGetErrorCounter();
489 2731490 : poBlock =
490 2731490 : GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
491 2731490 : if (!poBlock)
492 : {
493 71 : if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
494 : nullptr)
495 : {
496 0 : CPLError(CE_Failure, CPLE_AppDefined,
497 : "GetBlockRef failed at X block offset %d, "
498 : "Y block offset %d%s",
499 : nLBlockX, nLBlockY,
500 0 : (nErrorCounter != CPLGetErrorCounter())
501 0 : ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
502 : : "");
503 : }
504 71 : return (CE_Failure);
505 : }
506 :
507 2731420 : if (eRWFlag == GF_Write)
508 2042260 : poBlock->MarkDirty();
509 :
510 2731420 : pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
511 2731420 : if (bMemZeroBuffer)
512 : {
513 3180 : memset(pabySrcBlock, 0,
514 3180 : static_cast<GPtrDiff_t>(nBandDataSize) *
515 3180 : nBlockXSize * nBlockYSize);
516 : }
517 : /* --------------------------------------------------------------------
518 : */
519 : /* Copy over this chunk of data. */
520 : /* --------------------------------------------------------------------
521 : */
522 2731420 : GPtrDiff_t iSrcOffset =
523 2731420 : (static_cast<GPtrDiff_t>(iSrcX) -
524 2731420 : static_cast<GPtrDiff_t>(nLBlockX * nBlockXSize) +
525 2731420 : (static_cast<GPtrDiff_t>(iSrcY) -
526 2731420 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
527 2731420 : nBlockXSize) *
528 2731420 : nBandDataSize;
529 : // Fill up as many rows as possible for the loaded block.
530 5462850 : const int kmax = std::min(nBlockYSize - (iSrcY % nBlockYSize),
531 2731420 : nBufYSize - iBufYOff);
532 59780200 : for (int k = 0; k < kmax; k++)
533 : {
534 57048800 : if (eDataType == eBufType && nPixelSpace == nBufDataSize)
535 : {
536 53095600 : if (eRWFlag == GF_Read)
537 48657500 : memcpy(static_cast<GByte *>(pData) + iBufOffset +
538 48657500 : static_cast<GPtrDiff_t>(k) * nLineSpace,
539 48657500 : pabySrcBlock + iSrcOffset, nXSpanSize);
540 : else
541 4438030 : memcpy(pabySrcBlock + iSrcOffset,
542 4438030 : static_cast<GByte *>(pData) + iBufOffset +
543 4438030 : static_cast<GPtrDiff_t>(k) * nLineSpace,
544 : nXSpanSize);
545 : }
546 : else
547 : {
548 : /* type to type conversion */
549 3953230 : if (eRWFlag == GF_Read)
550 3896460 : GDALCopyWords64(
551 3896460 : pabySrcBlock + iSrcOffset, eDataType,
552 : nBandDataSize,
553 3896460 : static_cast<GByte *>(pData) + iBufOffset +
554 3896460 : static_cast<GPtrDiff_t>(k) * nLineSpace,
555 : eBufType, static_cast<int>(nPixelSpace),
556 : nXSpan);
557 : else
558 56776 : GDALCopyWords64(
559 56776 : static_cast<GByte *>(pData) + iBufOffset +
560 56776 : static_cast<GPtrDiff_t>(k) * nLineSpace,
561 : eBufType, static_cast<int>(nPixelSpace),
562 56776 : pabySrcBlock + iSrcOffset, eDataType,
563 : nBandDataSize, nXSpan);
564 : }
565 :
566 57048800 : iSrcOffset +=
567 57048800 : static_cast<GPtrDiff_t>(nBlockXSize) * nBandDataSize;
568 : }
569 :
570 : iBufOffset =
571 2731420 : CPLUnsanitizedAdd<GPtrDiff_t>(iBufOffset, nXSpanSize);
572 2731420 : nLBlockX++;
573 2731420 : iSrcX += nXSpan;
574 :
575 2731420 : poBlock->DropLock();
576 2731420 : poBlock = nullptr;
577 : }
578 :
579 : /* Compute the increment to go on a block boundary */
580 2510760 : nYInc = nBlockYSize - (iSrcY % nBlockYSize);
581 :
582 2512620 : if (psExtraArg->pfnProgress != nullptr &&
583 1856 : !psExtraArg->pfnProgress(
584 2512620 : 1.0 * std::min(nBufYSize, iBufYOff + nYInc) / nBufYSize, "",
585 : psExtraArg->pProgressData))
586 : {
587 0 : return CE_Failure;
588 : }
589 : }
590 :
591 2471150 : return CE_None;
592 : }
593 :
594 : /* ==================================================================== */
595 : /* Loop reading required source blocks to satisfy output */
596 : /* request. This is the most general implementation. */
597 : /* ==================================================================== */
598 :
599 565633 : double dfXOff = nXOff;
600 565633 : double dfYOff = nYOff;
601 565633 : double dfXSize = nXSize;
602 565633 : double dfYSize = nYSize;
603 565633 : if (psExtraArg->bFloatingPointWindowValidity)
604 : {
605 230638 : dfXOff = psExtraArg->dfXOff;
606 230638 : dfYOff = psExtraArg->dfYOff;
607 230638 : dfXSize = psExtraArg->dfXSize;
608 230638 : dfYSize = psExtraArg->dfYSize;
609 : }
610 :
611 : /* -------------------------------------------------------------------- */
612 : /* Compute stepping increment. */
613 : /* -------------------------------------------------------------------- */
614 565633 : const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
615 565633 : const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
616 565633 : CPLErr eErr = CE_None;
617 :
618 565633 : if (eRWFlag == GF_Write)
619 : {
620 : /* --------------------------------------------------------------------
621 : */
622 : /* Write case */
623 : /* Loop over raster window computing source locations in the buffer.
624 : */
625 : /* --------------------------------------------------------------------
626 : */
627 166655 : GByte *pabyDstBlock = nullptr;
628 166655 : int nLBlockX = -1;
629 166655 : int nLBlockY = -1;
630 :
631 1260010 : for (int iDstY = nYOff; iDstY < nYOff + nYSize; iDstY++)
632 : {
633 1093360 : const int iBufYOff = static_cast<int>((iDstY - nYOff) / dfSrcYInc);
634 :
635 12384200 : for (int iDstX = nXOff; iDstX < nXOff + nXSize; iDstX++)
636 : {
637 11290800 : const int iBufXOff =
638 11290800 : static_cast<int>((iDstX - nXOff) / dfSrcXInc);
639 11290800 : GPtrDiff_t iBufOffset =
640 11290800 : static_cast<GPtrDiff_t>(iBufYOff) *
641 : static_cast<GPtrDiff_t>(nLineSpace) +
642 11290800 : iBufXOff * static_cast<GPtrDiff_t>(nPixelSpace);
643 :
644 : // FIXME: this code likely doesn't work if the dirty block gets
645 : // flushed to disk before being completely written.
646 : // In the meantime, bJustInitialize should probably be set to
647 : // FALSE even if it is not ideal performance wise, and for
648 : // lossy compression.
649 :
650 : /* --------------------------------------------------------------------
651 : */
652 : /* Ensure we have the appropriate block loaded. */
653 : /* --------------------------------------------------------------------
654 : */
655 11290800 : if (iDstX < nLBlockX * nBlockXSize ||
656 11041500 : iDstX - nBlockXSize >= nLBlockX * nBlockXSize ||
657 10584800 : iDstY < nLBlockY * nBlockYSize ||
658 10584800 : iDstY - nBlockYSize >= nLBlockY * nBlockYSize)
659 : {
660 738702 : nLBlockX = iDstX / nBlockXSize;
661 738702 : nLBlockY = iDstY / nBlockYSize;
662 :
663 738702 : const bool bJustInitialize =
664 1065990 : nYOff <= nLBlockY * nBlockYSize &&
665 327291 : nYOff + nYSize - nBlockYSize >=
666 327291 : nLBlockY * nBlockYSize &&
667 1116320 : nXOff <= nLBlockX * nBlockXSize &&
668 50325 : nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
669 : /*bool bMemZeroBuffer = FALSE;
670 : if( !bJustInitialize &&
671 : nXOff <= nLBlockX * nBlockXSize &&
672 : nYOff <= nLBlockY * nBlockYSize &&
673 : (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
674 : (nXOff + nXSize == GetXSize() &&
675 : (nLBlockX+1) * nBlockXSize > GetXSize())) &&
676 : (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
677 : (nYOff + nYSize == GetYSize() &&
678 : (nLBlockY+1) * nBlockYSize > GetYSize())) )
679 : {
680 : bJustInitialize = TRUE;
681 : bMemZeroBuffer = TRUE;
682 : }*/
683 738702 : if (poBlock != nullptr)
684 572047 : poBlock->DropLock();
685 :
686 738702 : poBlock =
687 738702 : GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
688 738702 : if (poBlock == nullptr)
689 : {
690 0 : return (CE_Failure);
691 : }
692 :
693 738702 : poBlock->MarkDirty();
694 :
695 738702 : pabyDstBlock = static_cast<GByte *>(poBlock->GetDataRef());
696 : /*if( bMemZeroBuffer )
697 : {
698 : memset(pabyDstBlock, 0,
699 : static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
700 : * nBlockYSize);
701 : }*/
702 : }
703 :
704 : // To make Coverity happy. Should not happen by design.
705 11290800 : if (pabyDstBlock == nullptr)
706 : {
707 0 : CPLAssert(false);
708 : eErr = CE_Failure;
709 : break;
710 : }
711 :
712 : /* --------------------------------------------------------------------
713 : */
714 : /* Copy over this pixel of data. */
715 : /* --------------------------------------------------------------------
716 : */
717 11290800 : GPtrDiff_t iDstOffset =
718 11290800 : (static_cast<GPtrDiff_t>(iDstX) -
719 11290800 : static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
720 11290800 : (static_cast<GPtrDiff_t>(iDstY) -
721 11290800 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
722 11290800 : nBlockXSize) *
723 11290800 : nBandDataSize;
724 :
725 11290800 : if (eDataType == eBufType)
726 : {
727 11287700 : memcpy(pabyDstBlock + iDstOffset,
728 11287700 : static_cast<GByte *>(pData) + iBufOffset,
729 : nBandDataSize);
730 : }
731 : else
732 : {
733 : /* type to type conversion ... ouch, this is expensive way
734 : of handling single words */
735 3096 : GDALCopyWords64(static_cast<GByte *>(pData) + iBufOffset,
736 3096 : eBufType, 0, pabyDstBlock + iDstOffset,
737 : eDataType, 0, 1);
738 : }
739 : }
740 :
741 1093360 : if (psExtraArg->pfnProgress != nullptr &&
742 0 : !psExtraArg->pfnProgress(1.0 * (iDstY - nYOff + 1) / nYSize, "",
743 : psExtraArg->pProgressData))
744 : {
745 0 : eErr = CE_Failure;
746 0 : break;
747 : }
748 : }
749 : }
750 : else
751 : {
752 398978 : if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
753 : {
754 9543 : if ((psExtraArg->eResampleAlg == GRIORA_Cubic ||
755 2719 : psExtraArg->eResampleAlg == GRIORA_CubicSpline ||
756 2681 : psExtraArg->eResampleAlg == GRIORA_Bilinear ||
757 6865 : psExtraArg->eResampleAlg == GRIORA_Lanczos) &&
758 3191 : GetColorTable() != nullptr)
759 : {
760 0 : CPLError(CE_Warning, CPLE_NotSupported,
761 : "Resampling method not supported on paletted band. "
762 : "Falling back to nearest neighbour");
763 : }
764 3415 : else if (psExtraArg->eResampleAlg == GRIORA_Gauss &&
765 3 : GDALDataTypeIsComplex(eDataType))
766 : {
767 0 : CPLError(CE_Warning, CPLE_NotSupported,
768 : "Resampling method not supported on complex data type "
769 : "band. Falling back to nearest neighbour");
770 : }
771 : else
772 : {
773 3412 : return RasterIOResampled(eRWFlag, nXOff, nYOff, nXSize, nYSize,
774 : pData, nBufXSize, nBufYSize, eBufType,
775 3412 : nPixelSpace, nLineSpace, psExtraArg);
776 : }
777 : }
778 :
779 395566 : int nLimitBlockY = 0;
780 395566 : const bool bByteCopy = eDataType == eBufType && nBandDataSize == 1;
781 395566 : int nStartBlockX = -nBlockXSize;
782 395566 : constexpr double EPS = 1e-10;
783 395566 : int nLBlockY = -1;
784 395566 : const double dfSrcXStart = 0.5 * dfSrcXInc + dfXOff + EPS;
785 395566 : const bool bIntegerXFactor =
786 372877 : bUseIntegerRequestCoords &&
787 669395 : static_cast<int>(dfSrcXInc) == dfSrcXInc &&
788 273829 : static_cast<int>(dfSrcXInc) < INT_MAX / nBandDataSize;
789 :
790 : /* --------------------------------------------------------------------
791 : */
792 : /* Read case */
793 : /* Loop over buffer computing source locations. */
794 : /* --------------------------------------------------------------------
795 : */
796 2469430 : for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
797 : {
798 : // Add small epsilon to avoid some numeric precision issues.
799 2073880 : const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
800 2073880 : const int iSrcY = static_cast<int>(std::min(
801 2073880 : std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
802 :
803 2073880 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
804 : static_cast<GPtrDiff_t>(nLineSpace);
805 :
806 2073880 : if (iSrcY >= nLimitBlockY)
807 : {
808 437822 : nLBlockY = iSrcY / nBlockYSize;
809 437822 : nLimitBlockY = nLBlockY * nBlockYSize;
810 437822 : if (nLimitBlockY < INT_MAX - nBlockYSize)
811 437822 : nLimitBlockY += nBlockYSize;
812 : else
813 0 : nLimitBlockY = INT_MAX;
814 : // Make sure a new block is loaded.
815 437822 : nStartBlockX = -nBlockXSize;
816 : }
817 1636050 : else if (static_cast<int>(dfSrcXStart) < nStartBlockX)
818 : {
819 : // Make sure a new block is loaded.
820 441987 : nStartBlockX = -nBlockXSize;
821 : }
822 :
823 2073880 : GPtrDiff_t iSrcOffsetCst = (iSrcY - nLBlockY * nBlockYSize) *
824 2073880 : static_cast<GPtrDiff_t>(nBlockXSize);
825 :
826 2073880 : if (bIntegerXFactor)
827 : {
828 695780 : int iSrcX = static_cast<int>(dfSrcXStart);
829 695780 : const int nSrcXInc = static_cast<int>(dfSrcXInc);
830 695780 : GByte *pabyDstData = static_cast<GByte *>(pData) + iBufOffset;
831 695780 : bool bRet = false;
832 695780 : if (bByteCopy)
833 : {
834 585773 : bRet = DownsamplingIntegerXFactor<true, 1>(
835 : this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
836 : static_cast<int>(nPixelSpace), nBufXSize, GDT_UInt8,
837 : GDT_UInt8, nStartBlockX, nBlockXSize, poBlock,
838 : nLBlockY);
839 : }
840 110007 : else if (eDataType == eBufType)
841 : {
842 109782 : switch (nBandDataSize)
843 : {
844 109630 : case 2:
845 109630 : bRet = DownsamplingIntegerXFactor<true, 2>(
846 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
847 : pabyDstData, static_cast<int>(nPixelSpace),
848 : nBufXSize, eDataType, eDataType, nStartBlockX,
849 : nBlockXSize, poBlock, nLBlockY);
850 109630 : break;
851 54 : case 4:
852 54 : bRet = DownsamplingIntegerXFactor<true, 4>(
853 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
854 : pabyDstData, static_cast<int>(nPixelSpace),
855 : nBufXSize, eDataType, eDataType, nStartBlockX,
856 : nBlockXSize, poBlock, nLBlockY);
857 54 : break;
858 96 : case 8:
859 96 : bRet = DownsamplingIntegerXFactor<true, 8>(
860 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
861 : pabyDstData, static_cast<int>(nPixelSpace),
862 : nBufXSize, eDataType, eDataType, nStartBlockX,
863 : nBlockXSize, poBlock, nLBlockY);
864 96 : break;
865 2 : case 16:
866 2 : bRet = DownsamplingIntegerXFactor<true, 16>(
867 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
868 : pabyDstData, static_cast<int>(nPixelSpace),
869 : nBufXSize, eDataType, eDataType, nStartBlockX,
870 : nBlockXSize, poBlock, nLBlockY);
871 2 : break;
872 0 : default:
873 0 : CPLAssert(false);
874 : break;
875 : }
876 : }
877 : else
878 : {
879 225 : bRet = DownsamplingIntegerXFactor<false, 0>(
880 : this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
881 : static_cast<int>(nPixelSpace), nBufXSize, eDataType,
882 : eBufType, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
883 : }
884 695780 : if (!bRet)
885 1 : eErr = CE_Failure;
886 : }
887 : else
888 : {
889 1378100 : double dfSrcX = dfSrcXStart;
890 598175000 : for (int iBufXOff = 0; iBufXOff < nBufXSize;
891 596797000 : iBufXOff++, dfSrcX += dfSrcXInc)
892 : {
893 : // TODO?: try to avoid the clamping for most iterations
894 : const int iSrcX = static_cast<int>(
895 1193590000 : std::min(std::max(0.0, dfSrcX),
896 596797000 : static_cast<double>(nRasterXSize - 1)));
897 :
898 : /* --------------------------------------------------------------------
899 : */
900 : /* Ensure we have the appropriate block loaded. */
901 : /* --------------------------------------------------------------------
902 : */
903 596797000 : if (iSrcX >= nBlockXSize + nStartBlockX)
904 : {
905 1706900 : const int nLBlockX = iSrcX / nBlockXSize;
906 1706900 : nStartBlockX = nLBlockX * nBlockXSize;
907 :
908 1706900 : if (poBlock != nullptr)
909 1585160 : poBlock->DropLock();
910 :
911 1706900 : poBlock = GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
912 1706900 : if (poBlock == nullptr)
913 : {
914 9 : eErr = CE_Failure;
915 9 : break;
916 : }
917 :
918 : pabySrcBlock =
919 1706890 : static_cast<GByte *>(poBlock->GetDataRef());
920 : }
921 596797000 : const GPtrDiff_t nDiffX =
922 596797000 : static_cast<GPtrDiff_t>(iSrcX - nStartBlockX);
923 :
924 : /* --------------------------------------------------------------------
925 : */
926 : /* Copy over this pixel of data. */
927 : /* --------------------------------------------------------------------
928 : */
929 :
930 596797000 : if (bByteCopy)
931 : {
932 540998000 : GPtrDiff_t iSrcOffset = nDiffX + iSrcOffsetCst;
933 540998000 : static_cast<GByte *>(pData)[iBufOffset] =
934 540998000 : pabySrcBlock[iSrcOffset];
935 : }
936 55799000 : else if (eDataType == eBufType)
937 : {
938 50322800 : GPtrDiff_t iSrcOffset =
939 50322800 : (nDiffX + iSrcOffsetCst) * nBandDataSize;
940 50322800 : memcpy(static_cast<GByte *>(pData) + iBufOffset,
941 50322800 : pabySrcBlock + iSrcOffset, nBandDataSize);
942 : }
943 : else
944 : {
945 : // Type to type conversion ...
946 5476160 : GPtrDiff_t iSrcOffset =
947 5476160 : (nDiffX + iSrcOffsetCst) * nBandDataSize;
948 5476160 : GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
949 : static_cast<GByte *>(pData) +
950 5476160 : iBufOffset,
951 : eBufType, 0, 1);
952 : }
953 :
954 596797000 : iBufOffset += static_cast<int>(nPixelSpace);
955 : }
956 : }
957 2073880 : if (eErr == CE_Failure)
958 11 : break;
959 :
960 2315150 : if (psExtraArg->pfnProgress != nullptr &&
961 241284 : !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
962 : psExtraArg->pProgressData))
963 : {
964 1 : eErr = CE_Failure;
965 1 : break;
966 : }
967 : }
968 : }
969 :
970 562221 : if (poBlock != nullptr)
971 562211 : poBlock->DropLock();
972 :
973 562221 : return eErr;
974 : }
975 :
976 : /************************************************************************/
977 : /* GDALRasterIOTransformer() */
978 : /************************************************************************/
979 :
// Parameters of the per-axis scale + offset mapping applied by
// GDALRasterIOTransformer() between destination and source coordinates:
//   src = dst * ratio + offset   (and the inverse in the other direction).
980 : struct GDALRasterIOTransformerStruct
981 : {
982 : double dfXOff;  // offset added to the scaled X (dst -> src direction)
983 : double dfYOff;  // offset added to the scaled Y (dst -> src direction)
984 : double dfXRatioDstToSrc;  // multiplier applied to destination X
985 : double dfYRatioDstToSrc;  // multiplier applied to destination Y
986 : };
987 :
// Coordinate transformer callback (installed as
// GDALWarpOptions::pfnTransformer by RasterIOResampled()).
//
// pTransformerArg: pointer to a GDALRasterIOTransformerStruct.
// bDstToSrc:       non-zero -> x = x * ratio + off (destination to source);
//                  zero     -> x = (x - off) / ratio (source to destination).
// nPointCount:     number of entries in x, y and panSuccess.
// x, y:            coordinates, transformed in place.
// (z):             unnamed and ignored.
// panSuccess:      every entry is set to TRUE.
//
// Always returns TRUE.
988 6748 : static int GDALRasterIOTransformer(void *pTransformerArg, int bDstToSrc,
989 : int nPointCount, double *x, double *y,
990 : double * /* z */, int *panSuccess)
991 : {
992 6748 : GDALRasterIOTransformerStruct *psParams =
993 : static_cast<GDALRasterIOTransformerStruct *>(pTransformerArg);
994 6748 : if (bDstToSrc)
995 : {
// Forward mapping: scale then shift.
996 252996 : for (int i = 0; i < nPointCount; i++)
997 : {
998 246836 : x[i] = x[i] * psParams->dfXRatioDstToSrc + psParams->dfXOff;
999 246836 : y[i] = y[i] * psParams->dfYRatioDstToSrc + psParams->dfYOff;
1000 246836 : panSuccess[i] = TRUE;
1001 : }
1002 : }
1003 : else
1004 : {
// Inverse mapping: unshift then unscale.
1005 1176 : for (int i = 0; i < nPointCount; i++)
1006 : {
1007 588 : x[i] = (x[i] - psParams->dfXOff) / psParams->dfXRatioDstToSrc;
1008 588 : y[i] = (y[i] - psParams->dfYOff) / psParams->dfYRatioDstToSrc;
1009 588 : panSuccess[i] = TRUE;
1010 : }
1011 : }
1012 6748 : return TRUE;
1013 : }
1014 :
1015 : /************************************************************************/
1016 : /* RasterIOResampled() */
1017 : /************************************************************************/
1018 :
1019 : //! @cond Doxygen_Suppress
// Reads a window of this band into the caller's buffer, resampling it with
// the algorithm selected in psExtraArg->eResampleAlg.
//
// The output buffer (or a temporary buffer of the band's native type when
// eBufType differs from eDataType) is wrapped in a MEM dataset. Then:
//  - for complex data types with an algorithm other than nearest neighbour
//    or mode, the warper is run with GDALRasterIOTransformer as transformer;
//  - otherwise the source is read chunk by chunk and each chunk is passed to
//    the overview resampling function returned by GDALGetResampleFunction().
//
// (eRWFlag):                unnamed and unused.
// nXOff/nYOff/nXSize/nYSize: source window in pixels; replaced by the
//                           floating point window of psExtraArg when
//                           psExtraArg->bFloatingPointWindowValidity is set.
// pData:                    output buffer.
// nBufXSize/nBufYSize:      output size in pixels.
// eBufType:                 data type of pData.
// nPixelSpace/nLineSpace:   byte spacing of pData.
// psExtraArg:               resampling algorithm, optional floating point
//                           window and optional progress callback.
//
// Returns CE_None on success. CE_Failure is returned on allocation failure
// or when the progress callback returns false; other errors are those
// reported by the RasterIO / warp / resample calls.
1020 3412 : CPLErr GDALRasterBand::RasterIOResampled(
1021 : GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
1022 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
1023 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
1024 : {
1025 : // Determine if we use warping resampling or overview resampling
1026 : const bool bUseWarp =
1027 3412 : (GDALDataTypeIsComplex(eDataType) &&
1028 3571 : psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
1029 159 : psExtraArg->eResampleAlg != GRIORA_Mode);
1030 :
// Source window: integer by default, overridden by the sub-pixel window
// supplied in psExtraArg when it is flagged as valid.
1031 3412 : double dfXOff = nXOff;
1032 3412 : double dfYOff = nYOff;
1033 3412 : double dfXSize = nXSize;
1034 3412 : double dfYSize = nYSize;
1035 3412 : if (psExtraArg->bFloatingPointWindowValidity)
1036 : {
1037 2717 : dfXOff = psExtraArg->dfXOff;
1038 2717 : dfYOff = psExtraArg->dfYOff;
1039 2717 : dfXSize = psExtraArg->dfXSize;
1040 2717 : dfYSize = psExtraArg->dfYSize;
1041 : }
1042 :
// Number of source pixels per destination pixel along each axis.
1043 3412 : const double dfXRatioDstToSrc = dfXSize / nBufXSize;
1044 3412 : const double dfYRatioDstToSrc = dfYSize / nBufYSize;
1045 :
1046 : // Determine the coordinates in the "virtual" output raster to see
1047 : // if they are integers (within 1e-8), in which case we will use them as
1048 : // a shift so that subwindow extracts give the exact same results as
1049 : // entire raster scaling.
1050 3412 : double dfDestXOff = dfXOff / dfXRatioDstToSrc;
1051 3412 : bool bHasXOffVirtual = false;
1052 3412 : int nDestXOffVirtual = 0;
1053 3412 : if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
1054 : {
1055 3084 : bHasXOffVirtual = true;
1056 3084 : dfXOff = nXOff;
1057 3084 : nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
1058 : }
1059 :
1060 3412 : double dfDestYOff = dfYOff / dfYRatioDstToSrc;
1061 3412 : bool bHasYOffVirtual = false;
1062 3412 : int nDestYOffVirtual = 0;
1063 3412 : if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
1064 : {
1065 3080 : bHasYOffVirtual = true;
1066 3080 : dfYOff = nYOff;
1067 3080 : nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
1068 : }
1069 :
1070 : // Create a MEM dataset that wraps the output buffer.
1071 : GDALDataset *poMEMDS;
1072 3412 : void *pTempBuffer = nullptr;
1073 3412 : GSpacing nPSMem = nPixelSpace;
1074 3412 : GSpacing nLSMem = nLineSpace;
1075 3412 : void *pDataMem = pData;
1076 3412 : GDALDataType eDTMem = eBufType;
// When the requested buffer type differs from the band type, resample into
// a packed temporary buffer of the band type; it is converted into pData at
// the end of the function.
1077 3412 : if (eBufType != eDataType)
1078 : {
1079 44 : nPSMem = GDALGetDataTypeSizeBytes(eDataType);
1080 44 : nLSMem = nPSMem * nBufXSize;
1081 : pTempBuffer =
1082 44 : VSI_MALLOC2_VERBOSE(nBufYSize, static_cast<size_t>(nLSMem));
1083 44 : if (pTempBuffer == nullptr)
1084 0 : return CE_Failure;
1085 44 : pDataMem = pTempBuffer;
1086 44 : eDTMem = eDataType;
1087 : }
1088 :
// The MEM dataset is enlarged by the virtual offsets, and the band data
// pointer is moved back accordingly, so that pixel
// (nDestXOffVirtual, nDestYOffVirtual) of the MEM band is the first pixel
// of pDataMem.
1089 : poMEMDS =
1090 3412 : MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
1091 : nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr);
1092 3412 : GByte *pabyData = static_cast<GByte *>(pDataMem) -
1093 3412 : nPSMem * nDestXOffVirtual - nLSMem * nDestYOffVirtual;
1094 3412 : GDALRasterBandH hMEMBand = MEMCreateRasterBandEx(
1095 : poMEMDS, 1, pabyData, eDTMem, nPSMem, nLSMem, false);
1096 3412 : poMEMDS->SetBand(1, GDALRasterBand::FromHandle(hMEMBand));
1097 :
// Propagate the NBITS image structure metadata item, if any, to the MEM band.
1098 3412 : const char *pszNBITS = GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
1099 3412 : const int nNBITS = pszNBITS ? atoi(pszNBITS) : 0;
1100 3412 : if (pszNBITS)
1101 6 : GDALRasterBand::FromHandle(hMEMBand)->SetMetadataItem(
1102 6 : "NBITS", pszNBITS, "IMAGE_STRUCTURE");
1103 :
1104 3412 : CPLErr eErr = CE_None;
1105 :
1106 : // Do the resampling.
1107 3412 : if (bUseWarp)
1108 : {
1109 149 : int bHasNoData = FALSE;
1110 149 : double dfNoDataValue = GetNoDataValue(&bHasNoData);
1111 :
1112 149 : VRTDatasetH hVRTDS = nullptr;
1113 149 : GDALRasterBandH hVRTBand = nullptr;
// The warper works on datasets: a band without an owning dataset is wrapped
// in a one-band VRT (plus a mask band source when the mask is not all-valid).
1114 149 : if (GetDataset() == nullptr)
1115 : {
1116 : /* Create VRT dataset that wraps the whole dataset */
1117 0 : hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
1118 0 : VRTAddBand(hVRTDS, eDataType, nullptr);
1119 0 : hVRTBand = GDALGetRasterBand(hVRTDS, 1);
1120 0 : VRTAddSimpleSource(hVRTBand, this, 0, 0, nRasterXSize, nRasterYSize,
1121 : 0, 0, nRasterXSize, nRasterYSize, nullptr,
1122 : VRT_NODATA_UNSET);
1123 :
1124 : /* Add a mask band if needed */
1125 0 : if (GetMaskFlags() != GMF_ALL_VALID)
1126 : {
1127 0 : GDALDataset::FromHandle(hVRTDS)->CreateMaskBand(0);
1128 : VRTSourcedRasterBand *poVRTMaskBand =
1129 : reinterpret_cast<VRTSourcedRasterBand *>(
1130 : reinterpret_cast<GDALRasterBand *>(hVRTBand)
1131 0 : ->GetMaskBand());
1132 0 : poVRTMaskBand->AddMaskBandSource(this, 0, 0, nRasterXSize,
1133 0 : nRasterYSize, 0, 0,
1134 0 : nRasterXSize, nRasterYSize);
1135 : }
1136 : }
1137 :
// Translate the RasterIO resampling enum into the warper's enum.
1138 149 : GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
1139 149 : switch (psExtraArg->eResampleAlg)
1140 : {
1141 0 : case GRIORA_NearestNeighbour:
1142 0 : psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
1143 0 : break;
1144 147 : case GRIORA_Bilinear:
1145 147 : psWarpOptions->eResampleAlg = GRA_Bilinear;
1146 147 : break;
1147 0 : case GRIORA_Cubic:
1148 0 : psWarpOptions->eResampleAlg = GRA_Cubic;
1149 0 : break;
1150 0 : case GRIORA_CubicSpline:
1151 0 : psWarpOptions->eResampleAlg = GRA_CubicSpline;
1152 0 : break;
1153 0 : case GRIORA_Lanczos:
1154 0 : psWarpOptions->eResampleAlg = GRA_Lanczos;
1155 0 : break;
1156 0 : case GRIORA_Average:
1157 0 : psWarpOptions->eResampleAlg = GRA_Average;
1158 0 : break;
1159 2 : case GRIORA_RMS:
1160 2 : psWarpOptions->eResampleAlg = GRA_RMS;
1161 2 : break;
1162 0 : case GRIORA_Mode:
1163 0 : psWarpOptions->eResampleAlg = GRA_Mode;
1164 0 : break;
1165 0 : default:
// Any other value asserts in debug builds and falls back to nearest neighbour.
1166 0 : CPLAssert(false);
1167 : psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
1168 : break;
1169 : }
1170 149 : psWarpOptions->hSrcDS = hVRTDS ? hVRTDS : GetDataset();
1171 149 : psWarpOptions->hDstDS = poMEMDS;
1172 149 : psWarpOptions->nBandCount = 1;
1173 149 : int nSrcBandNumber = hVRTDS ? 1 : nBand;
1174 149 : int nDstBandNumber = 1;
// The band lists point at the two stack variables above; they are reset to
// nullptr before GDALDestroyWarpOptions() is called further down.
1175 149 : psWarpOptions->panSrcBands = &nSrcBandNumber;
1176 149 : psWarpOptions->panDstBands = &nDstBandNumber;
1177 298 : psWarpOptions->pfnProgress = psExtraArg->pfnProgress
1178 149 : ? psExtraArg->pfnProgress
1179 : : GDALDummyProgress;
1180 149 : psWarpOptions->pProgressArg = psExtraArg->pProgressData;
1181 149 : psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
// With a nodata value, initialize the destination to nodata and use the
// same value as source and destination nodata.
1182 149 : if (bHasNoData)
1183 : {
1184 0 : psWarpOptions->papszWarpOptions = CSLSetNameValue(
1185 : psWarpOptions->papszWarpOptions, "INIT_DEST", "NO_DATA");
1186 0 : if (psWarpOptions->padfSrcNoDataReal == nullptr)
1187 : {
1188 0 : psWarpOptions->padfSrcNoDataReal =
1189 0 : static_cast<double *>(CPLMalloc(sizeof(double)));
1190 0 : psWarpOptions->padfSrcNoDataReal[0] = dfNoDataValue;
1191 : }
1192 :
1193 0 : if (psWarpOptions->padfDstNoDataReal == nullptr)
1194 : {
1195 0 : psWarpOptions->padfDstNoDataReal =
1196 0 : static_cast<double *>(CPLMalloc(sizeof(double)));
1197 0 : psWarpOptions->padfDstNoDataReal[0] = dfNoDataValue;
1198 : }
1199 : }
1200 :
// When a virtual offset is in use, the MEM dataset is already expressed in
// the virtual output raster, so the transformer offset is 0 on that axis.
1201 : GDALRasterIOTransformerStruct sTransformer;
1202 149 : sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
1203 149 : sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
1204 149 : sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
1205 149 : sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
1206 149 : psWarpOptions->pTransformerArg = &sTransformer;
1207 :
1208 : GDALWarpOperationH hWarpOperation =
1209 149 : GDALCreateWarpOperation(psWarpOptions);
1210 149 : eErr = GDALChunkAndWarpImage(hWarpOperation, nDestXOffVirtual,
1211 : nDestYOffVirtual, nBufXSize, nBufYSize);
1212 149 : GDALDestroyWarpOperation(hWarpOperation);
1213 :
// Detach the stack-allocated band lists (presumably so that
// GDALDestroyWarpOptions() does not try to free them -- TODO confirm).
1214 149 : psWarpOptions->panSrcBands = nullptr;
1215 149 : psWarpOptions->panDstBands = nullptr;
1216 149 : GDALDestroyWarpOptions(psWarpOptions);
1217 :
1218 149 : if (hVRTDS)
1219 0 : GDALClose(hVRTDS);
1220 : }
1221 : else
1222 : {
// Name of the resampling method, as expected by GDALGetResampleFunction()
// and GDALGetOvrWorkDataType().
1223 3263 : const char *pszResampling =
1224 4254 : (psExtraArg->eResampleAlg == GRIORA_Bilinear) ? "BILINEAR"
1225 1289 : : (psExtraArg->eResampleAlg == GRIORA_Cubic) ? "CUBIC"
1226 558 : : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
1227 479 : : (psExtraArg->eResampleAlg == GRIORA_Lanczos) ? "LANCZOS"
1228 342 : : (psExtraArg->eResampleAlg == GRIORA_Average) ? "AVERAGE"
1229 199 : : (psExtraArg->eResampleAlg == GRIORA_RMS) ? "RMS"
1230 79 : : (psExtraArg->eResampleAlg == GRIORA_Mode) ? "MODE"
1231 3 : : (psExtraArg->eResampleAlg == GRIORA_Gauss) ? "GAUSS"
1232 : : "UNKNOWN";
1233 :
1234 3263 : int nKernelRadius = 0;
1235 : GDALResampleFunction pfnResampleFunc =
1236 3263 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
1237 3263 : CPLAssert(pfnResampleFunc);
1238 : GDALDataType eWrkDataType =
1239 3263 : GDALGetOvrWorkDataType(pszResampling, eDataType);
1240 3263 : int nHasNoData = 0;
1241 3263 : double dfNoDataValue = GetNoDataValue(&nHasNoData);
1242 3263 : const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
1243 3263 : if (!bHasNoData)
1244 3173 : dfNoDataValue = 0.0;
1245 :
// Choose the destination block size: start with the whole output buffer and
// halve one dimension at a time until the matching full resolution chunk
// holds at most 1024 * 1024 pixels (or the destination block is 1x1).
1246 3263 : int nDstBlockXSize = nBufXSize;
1247 3263 : int nDstBlockYSize = nBufYSize;
1248 3263 : int nFullResXChunk = 0;
1249 3263 : int nFullResYChunk = 0;
1250 : while (true)
1251 : {
1252 3274 : nFullResXChunk =
1253 3274 : 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
1254 3274 : nFullResYChunk =
1255 3274 : 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
1256 3274 : if (nFullResXChunk > nRasterXSize)
1257 2911 : nFullResXChunk = nRasterXSize;
1258 3274 : if (nFullResYChunk > nRasterYSize)
1259 512 : nFullResYChunk = nRasterYSize;
1260 3274 : if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
1261 3216 : (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
1262 : 1024 * 1024))
1263 : break;
1264 : // When operating on the full width of a raster whose block width is
1265 : // the raster width, prefer doing chunks in height.
1266 11 : if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
1267 : nDstBlockYSize > 1)
1268 0 : nDstBlockYSize /= 2;
1269 : /* Otherwise cut the maximal dimension */
1270 11 : else if (nDstBlockXSize > 1 &&
1271 0 : (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
1272 11 : nDstBlockXSize /= 2;
1273 : else
1274 0 : nDstBlockYSize /= 2;
1275 : }
1276 :
// Rounded subsampling factors (at least 1). The source window queried for a
// chunk is padded by nKernelRadius * factor on each side, then clamped to
// the raster dimensions.
1277 3263 : int nOvrXFactor = static_cast<int>(0.5 + dfXRatioDstToSrc);
1278 3263 : int nOvrYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
1279 3263 : if (nOvrXFactor == 0)
1280 2029 : nOvrXFactor = 1;
1281 3263 : if (nOvrYFactor == 0)
1282 2028 : nOvrYFactor = 1;
1283 3263 : int nFullResXSizeQueried =
1284 3263 : nFullResXChunk + 2 * nKernelRadius * nOvrXFactor;
1285 3263 : int nFullResYSizeQueried =
1286 3263 : nFullResYChunk + 2 * nKernelRadius * nOvrYFactor;
1287 :
1288 3263 : if (nFullResXSizeQueried > nRasterXSize)
1289 2701 : nFullResXSizeQueried = nRasterXSize;
1290 3263 : if (nFullResYSizeQueried > nRasterYSize)
1291 299 : nFullResYSizeQueried = nRasterYSize;
1292 :
// Working buffer for the largest chunk read from the source band.
1293 : void *pChunk =
1294 3263 : VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
1295 : nFullResXSizeQueried, nFullResYSizeQueried);
1296 3263 : GByte *pabyChunkNoDataMask = nullptr;
1297 :
1298 3263 : GDALRasterBand *poMaskBand = GetMaskBand();
1299 3263 : int l_nMaskFlags = GetMaskFlags();
1300 :
// A per-pixel validity mask is only needed when the band is not flagged as
// all-valid.
1301 3263 : bool bUseNoDataMask = ((l_nMaskFlags & GMF_ALL_VALID) == 0);
1302 3263 : if (bUseNoDataMask)
1303 : {
1304 158 : pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
1305 : nFullResXSizeQueried, nFullResYSizeQueried));
1306 : }
1307 3263 : if (pChunk == nullptr ||
1308 158 : (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
1309 : {
// Allocation failure: release everything acquired so far.
1310 0 : GDALClose(poMEMDS);
1311 0 : CPLFree(pChunk);
1312 0 : CPLFree(pabyChunkNoDataMask);
1313 0 : VSIFree(pTempBuffer);
1314 0 : return CE_Failure;
1315 : }
1316 :
// Block count, used only to scale the progress reports.
1317 3263 : const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
1318 3263 : DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
1319 3263 : int nBlocksDone = 0;
1320 :
// Iterate over destination blocks, rows first; both loops stop as soon as
// eErr is no longer CE_None.
1321 : int nDstYOff;
1322 6526 : for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
1323 3263 : nDstYOff += nDstBlockYSize)
1324 : {
1325 : int nDstYCount;
1326 3263 : if (nDstYOff + nDstBlockYSize <= nBufYSize)
1327 3263 : nDstYCount = nDstBlockYSize;
1328 : else
1329 0 : nDstYCount = nBufYSize - nDstYOff;
1330 :
// Source rows covered by this destination block: [nChunkYOff, nChunkYOff2),
// clamped to the raster height.
1331 3263 : int nChunkYOff =
1332 3263 : nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
1333 3263 : int nChunkYOff2 = nYOff + 1 +
1334 3263 : static_cast<int>(ceil((nDstYOff + nDstYCount) *
1335 : dfYRatioDstToSrc));
1336 3263 : if (nChunkYOff2 > nRasterYSize)
1337 660 : nChunkYOff2 = nRasterYSize;
1338 3263 : int nYCount = nChunkYOff2 - nChunkYOff;
1339 3263 : CPLAssert(nYCount <= nFullResYChunk);
1340 :
// Extend the row range by the kernel margin and clamp it to the raster.
1341 3263 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrYFactor;
1342 3263 : int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrYFactor;
1343 3263 : if (nChunkYOffQueried < 0)
1344 : {
1345 458 : nChunkYSizeQueried += nChunkYOffQueried;
1346 458 : nChunkYOffQueried = 0;
1347 : }
1348 3263 : if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
1349 561 : nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
1350 3263 : CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
1351 :
1352 3263 : int nDstXOff = 0;
1353 6526 : for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
1354 3263 : nDstXOff += nDstBlockXSize)
1355 : {
1356 3263 : int nDstXCount = 0;
1357 3263 : if (nDstXOff + nDstBlockXSize <= nBufXSize)
1358 3263 : nDstXCount = nDstBlockXSize;
1359 : else
1360 0 : nDstXCount = nBufXSize - nDstXOff;
1361 :
// Same computation as for rows, along the X axis.
1362 3263 : int nChunkXOff =
1363 3263 : nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
1364 3263 : int nChunkXOff2 =
1365 3263 : nXOff + 1 +
1366 3263 : static_cast<int>(
1367 3263 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
1368 3263 : if (nChunkXOff2 > nRasterXSize)
1369 2960 : nChunkXOff2 = nRasterXSize;
1370 3263 : int nXCount = nChunkXOff2 - nChunkXOff;
1371 3263 : CPLAssert(nXCount <= nFullResXChunk);
1372 :
1373 3263 : int nChunkXOffQueried =
1374 3263 : nChunkXOff - nKernelRadius * nOvrXFactor;
1375 3263 : int nChunkXSizeQueried =
1376 3263 : nXCount + 2 * nKernelRadius * nOvrXFactor;
1377 3263 : if (nChunkXOffQueried < 0)
1378 : {
1379 2762 : nChunkXSizeQueried += nChunkXOffQueried;
1380 2762 : nChunkXOffQueried = 0;
1381 : }
1382 3263 : if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
1383 2748 : nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
1384 3263 : CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
1385 :
1386 : // Read the source buffers.
1387 3263 : eErr = RasterIO(GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1388 : nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
1389 : nChunkXSizeQueried, nChunkYSizeQueried,
1390 : eWrkDataType, 0, 0, nullptr);
1391 :
1392 3263 : bool bSkipResample = false;
1393 3263 : bool bNoDataMaskFullyOpaque = false;
1394 3263 : if (eErr == CE_None && bUseNoDataMask)
1395 : {
1396 158 : eErr = poMaskBand->RasterIO(
1397 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1398 : nChunkXSizeQueried, nChunkYSizeQueried,
1399 : pabyChunkNoDataMask, nChunkXSizeQueried,
1400 : nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
1401 :
// NOTE(review): the mask buffer is scanned below without first checking the
// eErr returned by the mask read above; on failure the buffer content may
// not be meaningful (the loop still terminates on eErr afterwards).
// NOTE(review): nPixels is an int product of two ints -- confirm it cannot
// overflow when the destination block has been reduced to 1x1.
1402 : /* Optimizations if mask is fully opaque or transparent */
1403 158 : int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
1404 158 : GByte bVal = pabyChunkNoDataMask[0];
1405 158 : int i = 1;
1406 3751650 : for (; i < nPixels; i++)
1407 : {
1408 3751590 : if (pabyChunkNoDataMask[i] != bVal)
1409 104 : break;
1410 : }
1411 158 : if (i == nPixels)
1412 : {
1413 54 : if (bVal == 0)
1414 : {
// Mask is uniformly 0: fill the destination block with the nodata value
// (0.0 when the band has none) and skip resampling.
1415 712 : for (int j = 0; j < nDstYCount; j++)
1416 : {
1417 686 : GDALCopyWords64(&dfNoDataValue, GDT_Float64, 0,
1418 : static_cast<GByte *>(pDataMem) +
1419 686 : nLSMem * (j + nDstYOff) +
1420 686 : nDstXOff * nPSMem,
1421 : eDTMem,
1422 : static_cast<int>(nPSMem),
1423 : nDstXCount);
1424 : }
1425 26 : bSkipResample = true;
1426 : }
1427 : else
1428 : {
// Mask is uniform and non-zero: no mask is passed to the resampler.
1429 28 : bNoDataMaskFullyOpaque = true;
1430 : }
1431 : }
1432 : }
1433 :
1434 3263 : if (!bSkipResample && eErr == CE_None)
1435 : {
// Describe the chunk to the resampling function, then write the buffer it
// returns into the MEM band (i.e. into the output or temporary buffer).
1436 3234 : const bool bPropagateNoData = false;
1437 3234 : void *pDstBuffer = nullptr;
1438 3234 : GDALDataType eDstBufferDataType = GDT_Unknown;
1439 : GDALRasterBand *poMEMBand =
1440 3234 : GDALRasterBand::FromHandle(hMEMBand);
1441 3234 : GDALOverviewResampleArgs args;
1442 3234 : args.eSrcDataType = eDataType;
1443 3234 : args.eOvrDataType = poMEMBand->GetRasterDataType();
1444 3234 : args.nOvrXSize = poMEMBand->GetXSize();
1445 3234 : args.nOvrYSize = poMEMBand->GetYSize();
1446 3234 : args.nOvrNBITS = nNBITS;
1447 3234 : args.dfXRatioDstToSrc = dfXRatioDstToSrc;
1448 3234 : args.dfYRatioDstToSrc = dfYRatioDstToSrc;
1449 3234 : args.dfSrcXDelta =
1450 3234 : dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
1451 3234 : args.dfSrcYDelta =
1452 3234 : dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
1453 3234 : args.eWrkDataType = eWrkDataType;
1454 3234 : args.pabyChunkNodataMask =
1455 3234 : bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask;
// Chunk offsets are absolute source coordinates when a virtual offset is in
// use, and relative to the requested window origin otherwise.
1456 3234 : args.nChunkXOff =
1457 3234 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
1458 3234 : args.nChunkXSize = nChunkXSizeQueried;
1459 3234 : args.nChunkYOff =
1460 3234 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
1461 3234 : args.nChunkYSize = nChunkYSizeQueried;
1462 3234 : args.nDstXOff = nDstXOff + nDestXOffVirtual;
1463 3234 : args.nDstXOff2 = nDstXOff + nDestXOffVirtual + nDstXCount;
1464 3234 : args.nDstYOff = nDstYOff + nDestYOffVirtual;
1465 3234 : args.nDstYOff2 = nDstYOff + nDestYOffVirtual + nDstYCount;
1466 3234 : args.pszResampling = pszResampling;
1467 3234 : args.bHasNoData = bHasNoData;
1468 3234 : args.dfNoDataValue = dfNoDataValue;
1469 3234 : args.poColorTable = GetColorTable();
1470 3234 : args.bPropagateNoData = bPropagateNoData;
1471 3234 : eErr = pfnResampleFunc(args, pChunk, &pDstBuffer,
1472 : &eDstBufferDataType);
1473 3234 : if (eErr == CE_None)
1474 : {
1475 3234 : eErr = poMEMBand->RasterIO(
1476 : GF_Write, nDstXOff + nDestXOffVirtual,
1477 : nDstYOff + nDestYOffVirtual, nDstXCount, nDstYCount,
1478 : pDstBuffer, nDstXCount, nDstYCount,
1479 : eDstBufferDataType, 0, 0, nullptr);
1480 : }
1481 3234 : CPLFree(pDstBuffer);
1482 : }
1483 :
// Report progress; a false return from the callback becomes CE_Failure,
// which ends both loops.
1484 3263 : nBlocksDone++;
1485 3689 : if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
1486 426 : !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
1487 : "", psExtraArg->pProgressData))
1488 : {
1489 1 : eErr = CE_Failure;
1490 : }
1491 : }
1492 : }
1493 :
1494 3263 : CPLFree(pChunk);
1495 3263 : CPLFree(pabyChunkNoDataMask);
1496 : }
1497 :
// Convert the temporary native-type buffer into the caller's buffer. This
// runs whatever the value of eErr, and its own result is ignored.
1498 3412 : if (eBufType != eDataType)
1499 : {
1500 44 : CPL_IGNORE_RET_VAL(poMEMDS->GetRasterBand(1)->RasterIO(
1501 : GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
1502 : pData, nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
1503 : nullptr));
1504 : }
1505 3412 : GDALClose(poMEMDS);
1506 3412 : VSIFree(pTempBuffer);
1507 :
1508 3412 : return eErr;
1509 : }
1510 :
1511 : /************************************************************************/
1512 : /* RasterIOResampled() */
1513 : /************************************************************************/
1514 :
1515 886 : CPLErr GDALDataset::RasterIOResampled(
1516 : GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
1517 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
1518 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
1519 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
1520 :
1521 : {
1522 : #if 0
1523 : // Determine if we use warping resampling or overview resampling
1524 : bool bUseWarp = false;
1525 : if( GDALDataTypeIsComplex( eDataType ) )
1526 : bUseWarp = true;
1527 : #endif
1528 :
1529 886 : double dfXOff = nXOff;
1530 886 : double dfYOff = nYOff;
1531 886 : double dfXSize = nXSize;
1532 886 : double dfYSize = nYSize;
1533 886 : if (psExtraArg->bFloatingPointWindowValidity)
1534 : {
1535 765 : dfXOff = psExtraArg->dfXOff;
1536 765 : dfYOff = psExtraArg->dfYOff;
1537 765 : dfXSize = psExtraArg->dfXSize;
1538 765 : dfYSize = psExtraArg->dfYSize;
1539 : }
1540 :
1541 886 : const double dfXRatioDstToSrc = dfXSize / nBufXSize;
1542 886 : const double dfYRatioDstToSrc = dfYSize / nBufYSize;
1543 :
1544 : // Determine the coordinates in the "virtual" output raster to see
1545 : // if there are not integers, in which case we will use them as a shift
1546 : // so that subwindow extracts give the exact same results as entire raster
1547 : // scaling.
1548 886 : double dfDestXOff = dfXOff / dfXRatioDstToSrc;
1549 886 : bool bHasXOffVirtual = false;
1550 886 : int nDestXOffVirtual = 0;
1551 886 : if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
1552 : {
1553 761 : bHasXOffVirtual = true;
1554 761 : dfXOff = nXOff;
1555 761 : nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
1556 : }
1557 :
1558 886 : double dfDestYOff = dfYOff / dfYRatioDstToSrc;
1559 886 : bool bHasYOffVirtual = false;
1560 886 : int nDestYOffVirtual = 0;
1561 886 : if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
1562 : {
1563 721 : bHasYOffVirtual = true;
1564 721 : dfYOff = nYOff;
1565 721 : nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
1566 : }
1567 :
1568 : // Create a MEM dataset that wraps the output buffer.
1569 : GDALDataset *poMEMDS =
1570 886 : MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
1571 : nDestYOffVirtual + nBufYSize, 0, eBufType, nullptr);
1572 : GDALRasterBand **papoDstBands = static_cast<GDALRasterBand **>(
1573 886 : CPLMalloc(nBandCount * sizeof(GDALRasterBand *)));
1574 886 : int nNBITS = 0;
1575 2878 : for (int i = 0; i < nBandCount; i++)
1576 : {
1577 1992 : char szBuffer[32] = {'\0'};
1578 3984 : int nRet = CPLPrintPointer(
1579 : szBuffer,
1580 1992 : static_cast<GByte *>(pData) - nPixelSpace * nDestXOffVirtual -
1581 1992 : nLineSpace * nDestYOffVirtual + nBandSpace * i,
1582 : sizeof(szBuffer));
1583 1992 : szBuffer[nRet] = 0;
1584 :
1585 1992 : char szBuffer0[64] = {'\0'};
1586 1992 : snprintf(szBuffer0, sizeof(szBuffer0), "DATAPOINTER=%s", szBuffer);
1587 :
1588 1992 : char szBuffer1[64] = {'\0'};
1589 1992 : snprintf(szBuffer1, sizeof(szBuffer1), "PIXELOFFSET=" CPL_FRMT_GIB,
1590 : static_cast<GIntBig>(nPixelSpace));
1591 :
1592 1992 : char szBuffer2[64] = {'\0'};
1593 1992 : snprintf(szBuffer2, sizeof(szBuffer2), "LINEOFFSET=" CPL_FRMT_GIB,
1594 : static_cast<GIntBig>(nLineSpace));
1595 :
1596 1992 : char *apszOptions[4] = {szBuffer0, szBuffer1, szBuffer2, nullptr};
1597 :
1598 1992 : poMEMDS->AddBand(eBufType, apszOptions);
1599 :
1600 1992 : GDALRasterBand *poSrcBand = GetRasterBand(panBandMap[i]);
1601 1992 : papoDstBands[i] = poMEMDS->GetRasterBand(i + 1);
1602 : const char *pszNBITS =
1603 1992 : poSrcBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
1604 1992 : if (pszNBITS)
1605 : {
1606 0 : nNBITS = atoi(pszNBITS);
1607 0 : poMEMDS->GetRasterBand(i + 1)->SetMetadataItem("NBITS", pszNBITS,
1608 0 : "IMAGE_STRUCTURE");
1609 : }
1610 : }
1611 :
1612 886 : CPLErr eErr = CE_None;
1613 :
1614 : // TODO(schwehr): Why disabled? Why not just delete?
1615 : // Looks like this code was initially added as disable by copying
1616 : // from RasterIO here:
1617 : // https://trac.osgeo.org/gdal/changeset/29572
1618 : #if 0
1619 : // Do the resampling.
1620 : if( bUseWarp )
1621 : {
1622 : VRTDatasetH hVRTDS = nullptr;
1623 : GDALRasterBandH hVRTBand = nullptr;
1624 : if( GetDataset() == nullptr )
1625 : {
1626 : /* Create VRT dataset that wraps the whole dataset */
1627 : hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
1628 : VRTAddBand( hVRTDS, eDataType, nullptr );
1629 : hVRTBand = GDALGetRasterBand(hVRTDS, 1);
1630 : VRTAddSimpleSource( (VRTSourcedRasterBandH)hVRTBand,
1631 : (GDALRasterBandH)this,
1632 : 0, 0,
1633 : nRasterXSize, nRasterYSize,
1634 : 0, 0,
1635 : nRasterXSize, nRasterYSize,
1636 : nullptr, VRT_NODATA_UNSET );
1637 :
1638 : /* Add a mask band if needed */
1639 : if( GetMaskFlags() != GMF_ALL_VALID )
1640 : {
1641 : ((GDALDataset*)hVRTDS)->CreateMaskBand(0);
1642 : VRTSourcedRasterBand* poVRTMaskBand =
1643 : (VRTSourcedRasterBand*)(((GDALRasterBand*)hVRTBand)->GetMaskBand());
1644 : poVRTMaskBand->
1645 : AddMaskBandSource( this,
1646 : 0, 0,
1647 : nRasterXSize, nRasterYSize,
1648 : 0, 0,
1649 : nRasterXSize, nRasterYSize);
1650 : }
1651 : }
1652 :
1653 : GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
1654 : psWarpOptions->eResampleAlg = (GDALResampleAlg)psExtraArg->eResampleAlg;
1655 : psWarpOptions->hSrcDS = (GDALDatasetH) (hVRTDS ? hVRTDS : GetDataset());
1656 : psWarpOptions->hDstDS = (GDALDatasetH) poMEMDS;
1657 : psWarpOptions->nBandCount = 1;
1658 : int nSrcBandNumber = (hVRTDS ? 1 : nBand);
1659 : int nDstBandNumber = 1;
1660 : psWarpOptions->panSrcBands = &nSrcBandNumber;
1661 : psWarpOptions->panDstBands = &nDstBandNumber;
1662 : psWarpOptions->pfnProgress = psExtraArg->pfnProgress ?
1663 : psExtraArg->pfnProgress : GDALDummyProgress;
1664 : psWarpOptions->pProgressArg = psExtraArg->pProgressData;
1665 : psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
1666 : GDALRasterIOTransformerStruct sTransformer;
1667 : sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
1668 : sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
1669 : sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
1670 : sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
1671 : psWarpOptions->pTransformerArg = &sTransformer;
1672 :
1673 : GDALWarpOperationH hWarpOperation = GDALCreateWarpOperation(psWarpOptions);
1674 : eErr = GDALChunkAndWarpImage( hWarpOperation,
1675 : nDestXOffVirtual, nDestYOffVirtual,
1676 : nBufXSize, nBufYSize );
1677 : GDALDestroyWarpOperation( hWarpOperation );
1678 :
1679 : psWarpOptions->panSrcBands = nullptr;
1680 : psWarpOptions->panDstBands = nullptr;
1681 : GDALDestroyWarpOptions( psWarpOptions );
1682 :
1683 : if( hVRTDS )
1684 : GDALClose(hVRTDS);
1685 : }
1686 : else
1687 : #endif
1688 : {
1689 886 : const char *pszResampling =
1690 1653 : (psExtraArg->eResampleAlg == GRIORA_Bilinear) ? "BILINEAR"
1691 767 : : (psExtraArg->eResampleAlg == GRIORA_Cubic) ? "CUBIC"
1692 0 : : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
1693 0 : : (psExtraArg->eResampleAlg == GRIORA_Lanczos) ? "LANCZOS"
1694 0 : : (psExtraArg->eResampleAlg == GRIORA_Average) ? "AVERAGE"
1695 0 : : (psExtraArg->eResampleAlg == GRIORA_RMS) ? "RMS"
1696 0 : : (psExtraArg->eResampleAlg == GRIORA_Mode) ? "MODE"
1697 0 : : (psExtraArg->eResampleAlg == GRIORA_Gauss) ? "GAUSS"
1698 : : "UNKNOWN";
1699 :
1700 886 : GDALRasterBand *poFirstSrcBand = GetRasterBand(panBandMap[0]);
1701 886 : GDALDataType eDataType = poFirstSrcBand->GetRasterDataType();
1702 : int nBlockXSize, nBlockYSize;
1703 886 : poFirstSrcBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
1704 :
1705 : int nKernelRadius;
1706 : GDALResampleFunction pfnResampleFunc =
1707 886 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
1708 886 : CPLAssert(pfnResampleFunc);
1709 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1710 : GDALResampleFunctionMultiBands pfnResampleFuncMultiBands =
1711 : GDALGetResampleFunctionMultiBands(pszResampling, &nKernelRadius);
1712 : #endif
1713 : GDALDataType eWrkDataType =
1714 886 : GDALGetOvrWorkDataType(pszResampling, eDataType);
1715 :
1716 886 : int nDstBlockXSize = nBufXSize;
1717 886 : int nDstBlockYSize = nBufYSize;
1718 : int nFullResXChunk, nFullResYChunk;
1719 : while (true)
1720 : {
1721 886 : nFullResXChunk =
1722 886 : 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
1723 886 : nFullResYChunk =
1724 886 : 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
1725 886 : if (nFullResXChunk > nRasterXSize)
1726 585 : nFullResXChunk = nRasterXSize;
1727 886 : if (nFullResYChunk > nRasterYSize)
1728 51 : nFullResYChunk = nRasterYSize;
1729 886 : if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
1730 884 : (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
1731 : 1024 * 1024))
1732 : break;
1733 : // When operating on the full width of a raster whose block width is
1734 : // the raster width, prefer doing chunks in height.
1735 0 : if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
1736 : nDstBlockYSize > 1)
1737 0 : nDstBlockYSize /= 2;
1738 : /* Otherwise cut the maximal dimension */
1739 0 : else if (nDstBlockXSize > 1 &&
1740 0 : (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
1741 0 : nDstBlockXSize /= 2;
1742 : else
1743 0 : nDstBlockYSize /= 2;
1744 : }
1745 :
1746 1772 : int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
1747 886 : static_cast<int>(0.5 + dfYRatioDstToSrc));
1748 886 : if (nOvrFactor == 0)
1749 104 : nOvrFactor = 1;
1750 886 : int nFullResXSizeQueried =
1751 886 : nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
1752 886 : int nFullResYSizeQueried =
1753 886 : nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
1754 :
1755 886 : if (nFullResXSizeQueried > nRasterXSize)
1756 610 : nFullResXSizeQueried = nRasterXSize;
1757 886 : if (nFullResYSizeQueried > nRasterYSize)
1758 54 : nFullResYSizeQueried = nRasterYSize;
1759 :
1760 886 : void *pChunk = VSI_MALLOC3_VERBOSE(
1761 : cpl::fits_on<int>(GDALGetDataTypeSizeBytes(eWrkDataType) *
1762 : nBandCount),
1763 : nFullResXSizeQueried, nFullResYSizeQueried);
1764 886 : GByte *pabyChunkNoDataMask = nullptr;
1765 :
1766 886 : GDALRasterBand *poMaskBand = poFirstSrcBand->GetMaskBand();
1767 886 : int nMaskFlags = poFirstSrcBand->GetMaskFlags();
1768 :
1769 886 : bool bUseNoDataMask = ((nMaskFlags & GMF_ALL_VALID) == 0);
1770 886 : if (bUseNoDataMask)
1771 : {
1772 617 : pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
1773 : nFullResXSizeQueried, nFullResYSizeQueried));
1774 : }
1775 886 : if (pChunk == nullptr ||
1776 617 : (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
1777 : {
1778 0 : GDALClose(poMEMDS);
1779 0 : CPLFree(pChunk);
1780 0 : CPLFree(pabyChunkNoDataMask);
1781 0 : CPLFree(papoDstBands);
1782 0 : return CE_Failure;
1783 : }
1784 :
1785 886 : const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
1786 886 : DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
1787 886 : int nBlocksDone = 0;
1788 :
1789 : int nDstYOff;
1790 1772 : for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
1791 886 : nDstYOff += nDstBlockYSize)
1792 : {
1793 : int nDstYCount;
1794 886 : if (nDstYOff + nDstBlockYSize <= nBufYSize)
1795 886 : nDstYCount = nDstBlockYSize;
1796 : else
1797 0 : nDstYCount = nBufYSize - nDstYOff;
1798 :
1799 886 : int nChunkYOff =
1800 886 : nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
1801 886 : int nChunkYOff2 = nYOff + 1 +
1802 886 : static_cast<int>(ceil((nDstYOff + nDstYCount) *
1803 : dfYRatioDstToSrc));
1804 886 : if (nChunkYOff2 > nRasterYSize)
1805 133 : nChunkYOff2 = nRasterYSize;
1806 886 : int nYCount = nChunkYOff2 - nChunkYOff;
1807 886 : CPLAssert(nYCount <= nFullResYChunk);
1808 :
1809 886 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
1810 886 : int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
1811 886 : if (nChunkYOffQueried < 0)
1812 : {
1813 136 : nChunkYSizeQueried += nChunkYOffQueried;
1814 136 : nChunkYOffQueried = 0;
1815 : }
1816 886 : if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
1817 151 : nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
1818 886 : CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
1819 :
1820 : int nDstXOff;
1821 1772 : for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
1822 886 : nDstXOff += nDstBlockXSize)
1823 : {
1824 : int nDstXCount;
1825 886 : if (nDstXOff + nDstBlockXSize <= nBufXSize)
1826 886 : nDstXCount = nDstBlockXSize;
1827 : else
1828 0 : nDstXCount = nBufXSize - nDstXOff;
1829 :
1830 886 : int nChunkXOff =
1831 886 : nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
1832 886 : int nChunkXOff2 =
1833 886 : nXOff + 1 +
1834 886 : static_cast<int>(
1835 886 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
1836 886 : if (nChunkXOff2 > nRasterXSize)
1837 641 : nChunkXOff2 = nRasterXSize;
1838 886 : int nXCount = nChunkXOff2 - nChunkXOff;
1839 886 : CPLAssert(nXCount <= nFullResXChunk);
1840 :
1841 886 : int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
1842 886 : int nChunkXSizeQueried =
1843 886 : nXCount + 2 * nKernelRadius * nOvrFactor;
1844 886 : if (nChunkXOffQueried < 0)
1845 : {
1846 641 : nChunkXSizeQueried += nChunkXOffQueried;
1847 641 : nChunkXOffQueried = 0;
1848 : }
1849 886 : if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
1850 649 : nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
1851 886 : CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
1852 :
1853 886 : bool bSkipResample = false;
1854 886 : bool bNoDataMaskFullyOpaque = false;
1855 886 : if (eErr == CE_None && bUseNoDataMask)
1856 : {
1857 617 : eErr = poMaskBand->RasterIO(
1858 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1859 : nChunkXSizeQueried, nChunkYSizeQueried,
1860 : pabyChunkNoDataMask, nChunkXSizeQueried,
1861 : nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
1862 :
1863 : /* Optimizations if mask is fully opaque or transparent */
1864 617 : const int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
1865 617 : const GByte bVal = pabyChunkNoDataMask[0];
1866 617 : int i = 1; // Used after for.
1867 48197000 : for (; i < nPixels; i++)
1868 : {
1869 48196500 : if (pabyChunkNoDataMask[i] != bVal)
1870 72 : break;
1871 : }
1872 617 : if (i == nPixels)
1873 : {
1874 545 : if (bVal == 0)
1875 : {
1876 373 : GByte abyZero[16] = {0};
1877 780 : for (int iBand = 0; iBand < nBandCount; iBand++)
1878 : {
1879 3499 : for (int j = 0; j < nDstYCount; j++)
1880 : {
1881 3092 : GDALCopyWords64(
1882 : abyZero, GDT_UInt8, 0,
1883 : static_cast<GByte *>(pData) +
1884 3092 : iBand * nBandSpace +
1885 3092 : nLineSpace * (j + nDstYOff) +
1886 3092 : nDstXOff * nPixelSpace,
1887 : eBufType, static_cast<int>(nPixelSpace),
1888 : nDstXCount);
1889 : }
1890 : }
1891 373 : bSkipResample = true;
1892 : }
1893 : else
1894 : {
1895 172 : bNoDataMaskFullyOpaque = true;
1896 : }
1897 : }
1898 : }
1899 :
1900 886 : if (!bSkipResample && eErr == CE_None)
1901 : {
1902 : /* Read the source buffers */
1903 510 : eErr = RasterIO(
1904 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1905 : nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
1906 : nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType,
1907 : nBandCount, panBandMap, 0, 0, 0, nullptr);
1908 : }
1909 :
1910 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1911 : if (pfnResampleFuncMultiBands && !bSkipResample &&
1912 : eErr == CE_None)
1913 : {
1914 : eErr = pfnResampleFuncMultiBands(
1915 : dfXRatioDstToSrc, dfYRatioDstToSrc,
1916 : dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
1917 : dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
1918 : eWrkDataType, (GByte *)pChunk, nBandCount,
1919 : bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
1920 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
1921 : nChunkXSizeQueried,
1922 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
1923 : nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
1924 : nDstXOff + nDestXOffVirtual + nDstXCount,
1925 : nDstYOff + nDestYOffVirtual,
1926 : nDstYOff + nDestYOffVirtual + nDstYCount, papoDstBands,
1927 : pszResampling, FALSE /*bHasNoData*/,
1928 : 0.0 /* dfNoDataValue */, nullptr /* color table*/,
1929 : eDataType);
1930 : }
1931 : else
1932 : #endif
1933 : {
1934 : size_t nChunkBandOffset =
1935 886 : static_cast<size_t>(nChunkXSizeQueried) *
1936 886 : nChunkYSizeQueried *
1937 886 : GDALGetDataTypeSizeBytes(eWrkDataType);
1938 2462 : for (int i = 0;
1939 2462 : i < nBandCount && !bSkipResample && eErr == CE_None;
1940 : i++)
1941 : {
1942 1576 : const bool bPropagateNoData = false;
1943 1576 : void *pDstBuffer = nullptr;
1944 1576 : GDALDataType eDstBufferDataType = GDT_Unknown;
1945 : GDALRasterBand *poMEMBand =
1946 1576 : poMEMDS->GetRasterBand(i + 1);
1947 1576 : GDALOverviewResampleArgs args;
1948 1576 : args.eSrcDataType = eDataType;
1949 1576 : args.eOvrDataType = poMEMBand->GetRasterDataType();
1950 1576 : args.nOvrXSize = poMEMBand->GetXSize();
1951 1576 : args.nOvrYSize = poMEMBand->GetYSize();
1952 1576 : args.nOvrNBITS = nNBITS;
1953 1576 : args.dfXRatioDstToSrc = dfXRatioDstToSrc;
1954 1576 : args.dfYRatioDstToSrc = dfYRatioDstToSrc;
1955 1576 : args.dfSrcXDelta =
1956 1576 : dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
1957 1576 : args.dfSrcYDelta =
1958 1576 : dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
1959 1576 : args.eWrkDataType = eWrkDataType;
1960 1576 : args.pabyChunkNodataMask = bNoDataMaskFullyOpaque
1961 1576 : ? nullptr
1962 : : pabyChunkNoDataMask;
1963 1576 : args.nChunkXOff =
1964 1576 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
1965 1576 : args.nChunkXSize = nChunkXSizeQueried;
1966 1576 : args.nChunkYOff =
1967 1576 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
1968 1576 : args.nChunkYSize = nChunkYSizeQueried;
1969 1576 : args.nDstXOff = nDstXOff + nDestXOffVirtual;
1970 1576 : args.nDstXOff2 =
1971 1576 : nDstXOff + nDestXOffVirtual + nDstXCount;
1972 1576 : args.nDstYOff = nDstYOff + nDestYOffVirtual;
1973 1576 : args.nDstYOff2 =
1974 1576 : nDstYOff + nDestYOffVirtual + nDstYCount;
1975 1576 : args.pszResampling = pszResampling;
1976 1576 : args.bHasNoData = false;
1977 1576 : args.dfNoDataValue = 0.0;
1978 1576 : args.poColorTable = nullptr;
1979 1576 : args.bPropagateNoData = bPropagateNoData;
1980 :
1981 : eErr =
1982 3152 : pfnResampleFunc(args,
1983 1576 : reinterpret_cast<GByte *>(pChunk) +
1984 1576 : i * nChunkBandOffset,
1985 : &pDstBuffer, &eDstBufferDataType);
1986 1576 : if (eErr == CE_None)
1987 : {
1988 1576 : eErr = poMEMBand->RasterIO(
1989 : GF_Write, nDstXOff + nDestXOffVirtual,
1990 : nDstYOff + nDestYOffVirtual, nDstXCount,
1991 : nDstYCount, pDstBuffer, nDstXCount, nDstYCount,
1992 : eDstBufferDataType, 0, 0, nullptr);
1993 : }
1994 1576 : CPLFree(pDstBuffer);
1995 : }
1996 : }
1997 :
1998 886 : nBlocksDone++;
1999 1275 : if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
2000 389 : !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
2001 : "", psExtraArg->pProgressData))
2002 : {
2003 0 : eErr = CE_Failure;
2004 : }
2005 : }
2006 : }
2007 :
2008 886 : CPLFree(pChunk);
2009 886 : CPLFree(pabyChunkNoDataMask);
2010 : }
2011 :
2012 886 : CPLFree(papoDstBands);
2013 886 : GDALClose(poMEMDS);
2014 :
2015 886 : return eErr;
2016 : }
2017 :
2018 : //! @endcond
2019 :
2020 : /************************************************************************/
2021 : /* GDALSwapWords() */
2022 : /************************************************************************/
2023 :
2024 : /**
2025 : * Byte swap words in-place.
2026 : *
2027 : * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
2028 : * a memory array. No assumption is made that the words being swapped are
2029 : * word aligned in memory. Use the CPL_LSB and CPL_MSB macros from cpl_port.h
2030 : * to determine if the current platform is big endian or little endian. Use
2031 : * The macros like CPL_SWAP32() to byte swap single values without the overhead
2032 : * of a function call.
2033 : *
2034 : * @param pData pointer to start of data buffer.
2035 : * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
2036 : * @param nWordCount the number of words to be swapped in this call.
2037 : * @param nWordSkip the byte offset from the start of one word to the start of
2038 : * the next. For packed buffers this is the same as nWordSize.
2039 : */
2040 :
2041 497143 : void CPL_STDCALL GDALSwapWords(void *pData, int nWordSize, int nWordCount,
2042 : int nWordSkip)
2043 :
2044 : {
2045 497143 : if (nWordCount > 0)
2046 497143 : VALIDATE_POINTER0(pData, "GDALSwapWords");
2047 :
2048 497143 : GByte *pabyData = static_cast<GByte *>(pData);
2049 :
2050 497143 : switch (nWordSize)
2051 : {
2052 7234 : case 1:
2053 7234 : break;
2054 :
2055 476903 : case 2:
2056 476903 : CPLAssert(nWordSkip >= 2 || nWordCount == 1);
2057 228062000 : for (int i = 0; i < nWordCount; i++)
2058 : {
2059 227585000 : CPL_SWAP16PTR(pabyData);
2060 227585000 : pabyData += nWordSkip;
2061 : }
2062 476903 : break;
2063 :
2064 10580 : case 4:
2065 10580 : CPLAssert(nWordSkip >= 4 || nWordCount == 1);
2066 10580 : if (CPL_IS_ALIGNED(pabyData, 4) && (nWordSkip % 4) == 0)
2067 : {
2068 29140500 : for (int i = 0; i < nWordCount; i++)
2069 : {
2070 29130000 : *reinterpret_cast<GUInt32 *>(pabyData) = CPL_SWAP32(
2071 : *reinterpret_cast<const GUInt32 *>(pabyData));
2072 29130000 : pabyData += nWordSkip;
2073 10577 : }
2074 : }
2075 : else
2076 : {
2077 9 : for (int i = 0; i < nWordCount; i++)
2078 : {
2079 6 : CPL_SWAP32PTR(pabyData);
2080 6 : pabyData += nWordSkip;
2081 : }
2082 : }
2083 10580 : break;
2084 :
2085 2426 : case 8:
2086 2426 : CPLAssert(nWordSkip >= 8 || nWordCount == 1);
2087 2426 : if (CPL_IS_ALIGNED(pabyData, 8) && (nWordSkip % 8) == 0)
2088 : {
2089 3356900 : for (int i = 0; i < nWordCount; i++)
2090 : {
2091 3354480 : *reinterpret_cast<GUInt64 *>(pabyData) = CPL_SWAP64(
2092 : *reinterpret_cast<const GUInt64 *>(pabyData));
2093 3354480 : pabyData += nWordSkip;
2094 2425 : }
2095 : }
2096 : else
2097 : {
2098 3 : for (int i = 0; i < nWordCount; i++)
2099 : {
2100 2 : CPL_SWAP64PTR(pabyData);
2101 2 : pabyData += nWordSkip;
2102 : }
2103 : }
2104 2426 : break;
2105 :
2106 0 : default:
2107 0 : CPLAssert(false);
2108 : }
2109 : }
2110 :
2111 : /************************************************************************/
2112 : /* GDALSwapWordsEx() */
2113 : /************************************************************************/
2114 :
2115 : /**
2116 : * Byte swap words in-place.
2117 : *
2118 : * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
2119 : * a memory array. No assumption is made that the words being swapped are
2120 : * word aligned in memory. Use the CPL_LSB and CPL_MSB macros from cpl_port.h
2121 : * to determine if the current platform is big endian or little endian. Use
2122 : * The macros like CPL_SWAP32() to byte swap single values without the overhead
2123 : * of a function call.
2124 : *
2125 : * @param pData pointer to start of data buffer.
2126 : * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
2127 : * @param nWordCount the number of words to be swapped in this call.
2128 : * @param nWordSkip the byte offset from the start of one word to the start of
2129 : * the next. For packed buffers this is the same as nWordSize.
2130 : */
2131 6124 : void CPL_STDCALL GDALSwapWordsEx(void *pData, int nWordSize, size_t nWordCount,
2132 : int nWordSkip)
2133 : {
2134 6124 : GByte *pabyData = static_cast<GByte *>(pData);
2135 12248 : while (nWordCount)
2136 : {
2137 : // Pick-up a multiple of 8 as max chunk size.
2138 6124 : const int nWordCountSmall =
2139 6124 : (nWordCount > (1 << 30)) ? (1 << 30) : static_cast<int>(nWordCount);
2140 6124 : GDALSwapWords(pabyData, nWordSize, nWordCountSmall, nWordSkip);
2141 6124 : pabyData += static_cast<size_t>(nWordSkip) * nWordCountSmall;
2142 6124 : nWordCount -= nWordCountSmall;
2143 : }
2144 6124 : }
2145 :
2146 : // Place the new GDALCopyWords helpers in an anonymous namespace
2147 : namespace
2148 : {
2149 :
2150 : /************************************************************************/
2151 : /* GDALCopyWordsT() */
2152 : /************************************************************************/
2153 : /**
2154 : * Template function, used to copy data from pSrcData into buffer
2155 : * pDstData, with stride nSrcPixelStride in the source data and
2156 : * stride nDstPixelStride in the destination data. This template can
2157 : * deal with the case where the input data type is real or complex and
2158 : * the output is real.
2159 : *
2160 : * @param pSrcData the source data buffer
2161 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
2162 : * of interest.
2163 : * @param pDstData the destination buffer.
2164 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
2165 : * interest.
2166 : * @param nWordCount the total number of pixel words to copy
2167 : *
2168 : * @code
2169 : * // Assume an input buffer of type GUInt16 named pBufferIn
2170 : * GByte *pBufferOut = new GByte[numBytesOut];
2171 : * GDALCopyWordsT<GUInt16, GByte>(pSrcData, 2, pDstData, 1, numBytesOut);
2172 : * @endcode
2173 : * @note
2174 : * This is a private function, and should not be exposed outside of
2175 : * rasterio.cpp. External users should call the GDALCopyWords driver function.
2176 : */
2177 :
2178 : template <class Tin, class Tout>
2179 42454209 : static void inline GDALCopyWordsGenericT(const Tin *const CPL_RESTRICT pSrcData,
2180 : int nSrcPixelStride,
2181 : Tout *const CPL_RESTRICT pDstData,
2182 : int nDstPixelStride,
2183 : GPtrDiff_t nWordCount)
2184 : {
2185 42454209 : decltype(nWordCount) nDstOffset = 0;
2186 :
2187 42454209 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
2188 42454209 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
2189 384343861 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
2190 : {
2191 341889564 : const Tin tValue =
2192 341889564 : *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
2193 341889564 : Tout *const pOutPixel =
2194 341889564 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2195 :
2196 341889564 : GDALCopyWord(tValue, *pOutPixel);
2197 :
2198 341889564 : nDstOffset += nDstPixelStride;
2199 : }
2200 42454209 : }
2201 :
2202 : template <class Tin, class Tout>
2203 29786219 : static void CPL_NOINLINE GDALCopyWordsT(const Tin *const CPL_RESTRICT pSrcData,
2204 : int nSrcPixelStride,
2205 : Tout *const CPL_RESTRICT pDstData,
2206 : int nDstPixelStride,
2207 : GPtrDiff_t nWordCount)
2208 : {
2209 29786219 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData, nDstPixelStride,
2210 : nWordCount);
2211 29786219 : }
2212 :
2213 : template <class Tin, class Tout>
2214 5076549 : static void inline GDALCopyWordsT_8atatime(
2215 : const Tin *const CPL_RESTRICT pSrcData, int nSrcPixelStride,
2216 : Tout *const CPL_RESTRICT pDstData, int nDstPixelStride,
2217 : GPtrDiff_t nWordCount)
2218 : {
2219 5076549 : decltype(nWordCount) nDstOffset = 0;
2220 :
2221 5076549 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
2222 5076549 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
2223 5076549 : decltype(nWordCount) n = 0;
2224 5076549 : if (nSrcPixelStride == static_cast<int>(sizeof(Tin)) &&
2225 : nDstPixelStride == static_cast<int>(sizeof(Tout)))
2226 : {
2227 57868365 : for (; n < nWordCount - 7; n += 8)
2228 : {
2229 57324286 : const Tin *pInValues = reinterpret_cast<const Tin *>(
2230 57324286 : pSrcDataPtr + (n * nSrcPixelStride));
2231 57324286 : Tout *const pOutPixels =
2232 57324286 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2233 :
2234 57324286 : GDALCopy8Words(pInValues, pOutPixels);
2235 :
2236 57324286 : nDstOffset += 8 * nDstPixelStride;
2237 : }
2238 : }
2239 10454616 : for (; n < nWordCount; n++)
2240 : {
2241 5378067 : const Tin tValue =
2242 5378067 : *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
2243 5378067 : Tout *const pOutPixel =
2244 5378067 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2245 :
2246 5378067 : GDALCopyWord(tValue, *pOutPixel);
2247 :
2248 5378067 : nDstOffset += nDstPixelStride;
2249 : }
2250 5076549 : }
2251 :
2252 : #ifdef HAVE_SSE2
2253 :
2254 : template <class Tout>
2255 39717 : void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
2256 : int nSrcPixelStride,
2257 : Tout *const CPL_RESTRICT pDstData,
2258 : int nDstPixelStride, GPtrDiff_t nWordCount)
2259 : {
2260 : static_assert(std::is_integral<Tout>::value &&
2261 : sizeof(Tout) == sizeof(uint16_t),
2262 : "Bad Tout");
2263 39717 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2264 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2265 : {
2266 33366 : decltype(nWordCount) n = 0;
2267 33366 : const __m128i xmm_zero = _mm_setzero_si128();
2268 33366 : GByte *CPL_RESTRICT pabyDstDataPtr =
2269 : reinterpret_cast<GByte *>(pDstData);
2270 1415762 : for (; n < nWordCount - 15; n += 16)
2271 : {
2272 1382396 : __m128i xmm = _mm_loadu_si128(
2273 1382396 : reinterpret_cast<const __m128i *>(pSrcData + n));
2274 1382396 : __m128i xmm0 = _mm_unpacklo_epi8(xmm, xmm_zero);
2275 1382396 : __m128i xmm1 = _mm_unpackhi_epi8(xmm, xmm_zero);
2276 : _mm_storeu_si128(
2277 1382396 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2), xmm0);
2278 : _mm_storeu_si128(
2279 1382396 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2 + 16), xmm1);
2280 : }
2281 109389 : for (; n < nWordCount; n++)
2282 : {
2283 76023 : pDstData[n] = pSrcData[n];
2284 33366 : }
2285 : }
2286 : else
2287 : {
2288 6351 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2289 : nDstPixelStride, nWordCount);
2290 : }
2291 39717 : }
2292 :
2293 : template <>
2294 26977 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2295 : int nSrcPixelStride,
2296 : GUInt16 *const CPL_RESTRICT pDstData,
2297 : int nDstPixelStride, GPtrDiff_t nWordCount)
2298 : {
2299 26977 : GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
2300 : nDstPixelStride, nWordCount);
2301 26977 : }
2302 :
2303 : template <>
2304 12740 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2305 : int nSrcPixelStride,
2306 : GInt16 *const CPL_RESTRICT pDstData,
2307 : int nDstPixelStride, GPtrDiff_t nWordCount)
2308 : {
2309 12740 : GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
2310 : nDstPixelStride, nWordCount);
2311 12740 : }
2312 :
2313 : template <class Tout>
2314 12854276 : void GDALCopyWordsByteTo32Bit(const GByte *const CPL_RESTRICT pSrcData,
2315 : int nSrcPixelStride,
2316 : Tout *const CPL_RESTRICT pDstData,
2317 : int nDstPixelStride, GPtrDiff_t nWordCount)
2318 : {
2319 : static_assert(std::is_integral<Tout>::value &&
2320 : sizeof(Tout) == sizeof(uint32_t),
2321 : "Bad Tout");
2322 12854276 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2323 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2324 : {
2325 6293656 : decltype(nWordCount) n = 0;
2326 6293656 : const __m128i xmm_zero = _mm_setzero_si128();
2327 6293656 : GByte *CPL_RESTRICT pabyDstDataPtr =
2328 : reinterpret_cast<GByte *>(pDstData);
2329 70192227 : for (; n < nWordCount - 15; n += 16)
2330 : {
2331 63898561 : __m128i xmm = _mm_loadu_si128(
2332 63898561 : reinterpret_cast<const __m128i *>(pSrcData + n));
2333 63898561 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2334 63898561 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2335 63898561 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2336 63898561 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2337 63898561 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2338 63898561 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2339 : _mm_storeu_si128(
2340 63898561 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4), xmm0);
2341 : _mm_storeu_si128(
2342 63898561 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 16), xmm1);
2343 : _mm_storeu_si128(
2344 63898561 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 32), xmm2);
2345 : _mm_storeu_si128(
2346 63898561 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 48), xmm3);
2347 : }
2348 14580816 : for (; n < nWordCount; n++)
2349 : {
2350 8287210 : pDstData[n] = pSrcData[n];
2351 6293656 : }
2352 : }
2353 : else
2354 : {
2355 6560670 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2356 : nDstPixelStride, nWordCount);
2357 : }
2358 12854276 : }
2359 :
2360 : template <>
2361 476 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2362 : int nSrcPixelStride,
2363 : GUInt32 *const CPL_RESTRICT pDstData,
2364 : int nDstPixelStride, GPtrDiff_t nWordCount)
2365 : {
2366 476 : GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
2367 : nDstPixelStride, nWordCount);
2368 476 : }
2369 :
2370 : template <>
2371 12853800 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2372 : int nSrcPixelStride,
2373 : GInt32 *const CPL_RESTRICT pDstData,
2374 : int nDstPixelStride, GPtrDiff_t nWordCount)
2375 : {
2376 12853800 : GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
2377 : nDstPixelStride, nWordCount);
2378 12853800 : }
2379 :
2380 : template <>
2381 2476020 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2382 : int nSrcPixelStride,
2383 : float *const CPL_RESTRICT pDstData,
2384 : int nDstPixelStride, GPtrDiff_t nWordCount)
2385 : {
2386 2476020 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2387 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2388 : {
2389 115285 : decltype(nWordCount) n = 0;
2390 115285 : const __m128i xmm_zero = _mm_setzero_si128();
2391 115285 : GByte *CPL_RESTRICT pabyDstDataPtr =
2392 : reinterpret_cast<GByte *>(pDstData);
2393 3324090 : for (; n < nWordCount - 15; n += 16)
2394 : {
2395 3208800 : __m128i xmm = _mm_loadu_si128(
2396 3208800 : reinterpret_cast<const __m128i *>(pSrcData + n));
2397 3208800 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2398 3208800 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2399 3208800 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2400 3208800 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2401 3208800 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2402 3208800 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2403 3208800 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2404 3208800 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2405 3208800 : __m128 xmm2_f = _mm_cvtepi32_ps(xmm2);
2406 3208800 : __m128 xmm3_f = _mm_cvtepi32_ps(xmm3);
2407 3208800 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2408 : xmm0_f);
2409 : _mm_storeu_ps(
2410 3208800 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2411 : _mm_storeu_ps(
2412 3208800 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 32), xmm2_f);
2413 : _mm_storeu_ps(
2414 3208800 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 48), xmm3_f);
2415 : }
2416 502808 : for (; n < nWordCount; n++)
2417 : {
2418 387523 : pDstData[n] = pSrcData[n];
2419 115285 : }
2420 : }
2421 : else
2422 : {
2423 2360740 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2424 : nDstPixelStride, nWordCount);
2425 : }
2426 2476020 : }
2427 :
2428 : template <>
2429 169970 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2430 : int nSrcPixelStride,
2431 : double *const CPL_RESTRICT pDstData,
2432 : int nDstPixelStride, GPtrDiff_t nWordCount)
2433 : {
2434 169970 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2435 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2436 : {
2437 146506 : decltype(nWordCount) n = 0;
2438 146506 : const __m128i xmm_zero = _mm_setzero_si128();
2439 146506 : GByte *CPL_RESTRICT pabyDstDataPtr =
2440 : reinterpret_cast<GByte *>(pDstData);
2441 3126180 : for (; n < nWordCount - 15; n += 16)
2442 : {
2443 2979670 : __m128i xmm = _mm_loadu_si128(
2444 2979670 : reinterpret_cast<const __m128i *>(pSrcData + n));
2445 2979670 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2446 2979670 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2447 2979670 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2448 2979670 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2449 2979670 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2450 2979670 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2451 :
2452 : #if defined(__AVX2__) && defined(slightly_slower_than_SSE2)
2453 : _mm256_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2454 : _mm256_cvtepi32_pd(xmm0));
2455 : _mm256_storeu_pd(
2456 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2457 : _mm256_cvtepi32_pd(xmm1));
2458 : _mm256_storeu_pd(
2459 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
2460 : _mm256_cvtepi32_pd(xmm2));
2461 : _mm256_storeu_pd(
2462 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
2463 : _mm256_cvtepi32_pd(xmm3));
2464 : #else
2465 2979670 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2466 2979670 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2467 2979670 : __m128d xmm2_low_d = _mm_cvtepi32_pd(xmm2);
2468 2979670 : __m128d xmm3_low_d = _mm_cvtepi32_pd(xmm3);
2469 2979670 : xmm0 = _mm_srli_si128(xmm0, 8);
2470 2979670 : xmm1 = _mm_srli_si128(xmm1, 8);
2471 2979670 : xmm2 = _mm_srli_si128(xmm2, 8);
2472 2979670 : xmm3 = _mm_srli_si128(xmm3, 8);
2473 2979670 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2474 2979670 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2475 2979670 : __m128d xmm2_high_d = _mm_cvtepi32_pd(xmm2);
2476 2979670 : __m128d xmm3_high_d = _mm_cvtepi32_pd(xmm3);
2477 :
2478 2979670 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2479 : xmm0_low_d);
2480 : _mm_storeu_pd(
2481 2979670 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2482 : xmm0_high_d);
2483 : _mm_storeu_pd(
2484 2979670 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2485 : xmm1_low_d);
2486 : _mm_storeu_pd(
2487 2979670 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2488 : xmm1_high_d);
2489 : _mm_storeu_pd(
2490 2979670 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
2491 : xmm2_low_d);
2492 : _mm_storeu_pd(
2493 2979670 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 80),
2494 : xmm2_high_d);
2495 : _mm_storeu_pd(
2496 2979670 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
2497 : xmm3_low_d);
2498 : _mm_storeu_pd(
2499 2979670 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 112),
2500 : xmm3_high_d);
2501 : #endif
2502 : }
2503 278002 : for (; n < nWordCount; n++)
2504 : {
2505 131496 : pDstData[n] = pSrcData[n];
2506 146506 : }
2507 : }
2508 : else
2509 : {
2510 23464 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2511 : nDstPixelStride, nWordCount);
2512 : }
2513 169970 : }
2514 :
2515 : template <>
2516 148 : CPL_NOINLINE void GDALCopyWordsT(const uint8_t *const CPL_RESTRICT pSrcData,
2517 : int nSrcPixelStride,
2518 : int8_t *const CPL_RESTRICT pDstData,
2519 : int nDstPixelStride, GPtrDiff_t nWordCount)
2520 : {
2521 148 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2522 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2523 : {
2524 142 : decltype(nWordCount) n = 0;
2525 142 : const __m128i xmm_127 = _mm_set1_epi8(127);
2526 146 : for (; n < nWordCount - 31; n += 32)
2527 : {
2528 8 : __m128i xmm0 = _mm_loadu_si128(
2529 4 : reinterpret_cast<const __m128i *>(pSrcData + n));
2530 4 : __m128i xmm1 = _mm_loadu_si128(
2531 4 : reinterpret_cast<const __m128i *>(pSrcData + n + 16));
2532 4 : xmm0 = _mm_min_epu8(xmm0, xmm_127);
2533 4 : xmm1 = _mm_min_epu8(xmm1, xmm_127);
2534 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2535 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
2536 : xmm1);
2537 : }
2538 2424 : for (; n < nWordCount; n++)
2539 : {
2540 2282 : pDstData[n] =
2541 2282 : pSrcData[n] >= 127 ? 127 : static_cast<int8_t>(pSrcData[n]);
2542 142 : }
2543 : }
2544 : else
2545 : {
2546 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2547 : nDstPixelStride, nWordCount);
2548 : }
2549 148 : }
2550 :
2551 : template <>
2552 82 : CPL_NOINLINE void GDALCopyWordsT(const int8_t *const CPL_RESTRICT pSrcData,
2553 : int nSrcPixelStride,
2554 : uint8_t *const CPL_RESTRICT pDstData,
2555 : int nDstPixelStride, GPtrDiff_t nWordCount)
2556 : {
2557 82 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2558 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2559 : {
2560 56 : decltype(nWordCount) n = 0;
2561 : #if !(defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS))
2562 56 : const __m128i xmm_INT8_to_UINT8 = _mm_set1_epi8(-128);
2563 : #endif
2564 117 : for (; n < nWordCount - 31; n += 32)
2565 : {
2566 122 : __m128i xmm0 = _mm_loadu_si128(
2567 61 : reinterpret_cast<const __m128i *>(pSrcData + n));
2568 61 : __m128i xmm1 = _mm_loadu_si128(
2569 61 : reinterpret_cast<const __m128i *>(pSrcData + n + 16));
2570 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2571 : xmm0 = _mm_max_epi8(xmm0, _mm_setzero_si128());
2572 : xmm1 = _mm_max_epi8(xmm1, _mm_setzero_si128());
2573 : #else
2574 61 : xmm0 = _mm_add_epi8(xmm0, xmm_INT8_to_UINT8);
2575 61 : xmm1 = _mm_add_epi8(xmm1, xmm_INT8_to_UINT8);
2576 61 : xmm0 = _mm_max_epu8(xmm0, xmm_INT8_to_UINT8);
2577 61 : xmm1 = _mm_max_epu8(xmm1, xmm_INT8_to_UINT8);
2578 61 : xmm0 = _mm_sub_epi8(xmm0, xmm_INT8_to_UINT8);
2579 61 : xmm1 = _mm_sub_epi8(xmm1, xmm_INT8_to_UINT8);
2580 : #endif
2581 61 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2582 61 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
2583 : xmm1);
2584 : }
2585 352 : for (; n < nWordCount; n++)
2586 : {
2587 296 : pDstData[n] =
2588 296 : pSrcData[n] < 0 ? 0 : static_cast<uint8_t>(pSrcData[n]);
2589 56 : }
2590 : }
2591 : else
2592 : {
2593 26 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2594 : nDstPixelStride, nWordCount);
2595 : }
2596 82 : }
2597 :
2598 : template <>
2599 6037 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2600 : int nSrcPixelStride,
2601 : uint8_t *const CPL_RESTRICT pDstData,
2602 : int nDstPixelStride, GPtrDiff_t nWordCount)
2603 : {
2604 6037 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2605 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2606 : {
2607 5062 : decltype(nWordCount) n = 0;
2608 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2609 : const auto xmm_MAX_INT16 = _mm_set1_epi16(32767);
2610 : #else
2611 : // In SSE2, min_epu16 does not exist, so shift from
2612 : // UInt16 to SInt16 to be able to use min_epi16
2613 5062 : const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
2614 5062 : const __m128i xmm_m255_shifted = _mm_set1_epi16(255 - 32768);
2615 : #endif
2616 71888 : for (; n < nWordCount - 15; n += 16)
2617 : {
2618 133652 : __m128i xmm0 = _mm_loadu_si128(
2619 66826 : reinterpret_cast<const __m128i *>(pSrcData + n));
2620 66826 : __m128i xmm1 = _mm_loadu_si128(
2621 66826 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2622 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2623 : xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
2624 : xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
2625 : #else
2626 66826 : xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
2627 66826 : xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
2628 66826 : xmm0 = _mm_min_epi16(xmm0, xmm_m255_shifted);
2629 66826 : xmm1 = _mm_min_epi16(xmm1, xmm_m255_shifted);
2630 66826 : xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
2631 66826 : xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
2632 : #endif
2633 66826 : xmm0 = _mm_packus_epi16(xmm0, xmm1);
2634 66826 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2635 : }
2636 16403 : for (; n < nWordCount; n++)
2637 : {
2638 11341 : pDstData[n] =
2639 11341 : pSrcData[n] >= 255 ? 255 : static_cast<uint8_t>(pSrcData[n]);
2640 5062 : }
2641 : }
2642 : else
2643 : {
2644 975 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2645 : nDstPixelStride, nWordCount);
2646 : }
2647 6037 : }
2648 :
2649 : template <>
2650 46 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2651 : int nSrcPixelStride,
2652 : int16_t *const CPL_RESTRICT pDstData,
2653 : int nDstPixelStride, GPtrDiff_t nWordCount)
2654 : {
2655 46 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2656 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2657 : {
2658 40 : decltype(nWordCount) n = 0;
2659 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2660 : const __m128i xmm_MAX_INT16 = _mm_set1_epi16(32767);
2661 : #else
2662 : // In SSE2, min_epu16 does not exist, so shift from
2663 : // UInt16 to SInt16 to be able to use min_epi16
2664 40 : const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
2665 40 : const __m128i xmm_32767_shifted = _mm_set1_epi16(32767 - 32768);
2666 : #endif
2667 169 : for (; n < nWordCount - 15; n += 16)
2668 : {
2669 258 : __m128i xmm0 = _mm_loadu_si128(
2670 129 : reinterpret_cast<const __m128i *>(pSrcData + n));
2671 129 : __m128i xmm1 = _mm_loadu_si128(
2672 129 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2673 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2674 : xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
2675 : xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
2676 : #else
2677 129 : xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
2678 129 : xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
2679 129 : xmm0 = _mm_min_epi16(xmm0, xmm_32767_shifted);
2680 129 : xmm1 = _mm_min_epi16(xmm1, xmm_32767_shifted);
2681 129 : xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
2682 129 : xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
2683 : #endif
2684 129 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2685 129 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
2686 : xmm1);
2687 : }
2688 191 : for (; n < nWordCount; n++)
2689 : {
2690 282 : pDstData[n] = pSrcData[n] >= 32767
2691 : ? 32767
2692 131 : : static_cast<int16_t>(pSrcData[n]);
2693 40 : }
2694 : }
2695 : else
2696 : {
2697 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2698 : nDstPixelStride, nWordCount);
2699 : }
2700 46 : }
2701 :
2702 : template <>
2703 135 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2704 : int nSrcPixelStride,
2705 : uint16_t *const CPL_RESTRICT pDstData,
2706 : int nDstPixelStride, GPtrDiff_t nWordCount)
2707 : {
2708 135 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2709 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2710 : {
2711 92 : decltype(nWordCount) n = 0;
2712 92 : const __m128i xmm_zero = _mm_setzero_si128();
2713 277 : for (; n < nWordCount - 15; n += 16)
2714 : {
2715 370 : __m128i xmm0 = _mm_loadu_si128(
2716 185 : reinterpret_cast<const __m128i *>(pSrcData + n));
2717 185 : __m128i xmm1 = _mm_loadu_si128(
2718 185 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2719 185 : xmm0 = _mm_max_epi16(xmm0, xmm_zero);
2720 185 : xmm1 = _mm_max_epi16(xmm1, xmm_zero);
2721 185 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2722 185 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
2723 : xmm1);
2724 : }
2725 468 : for (; n < nWordCount; n++)
2726 : {
2727 376 : pDstData[n] =
2728 376 : pSrcData[n] < 0 ? 0 : static_cast<uint16_t>(pSrcData[n]);
2729 92 : }
2730 : }
2731 : else
2732 : {
2733 43 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2734 : nDstPixelStride, nWordCount);
2735 : }
2736 135 : }
2737 :
2738 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2739 :
2740 : template <>
2741 : CPL_NOINLINE void GDALCopyWordsT(const uint32_t *const CPL_RESTRICT pSrcData,
2742 : int nSrcPixelStride,
2743 : int32_t *const CPL_RESTRICT pDstData,
2744 : int nDstPixelStride, GPtrDiff_t nWordCount)
2745 : {
2746 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2747 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2748 : {
2749 : decltype(nWordCount) n = 0;
2750 : const __m128i xmm_MAX_INT = _mm_set1_epi32(INT_MAX);
2751 : for (; n < nWordCount - 8; n += 7)
2752 : {
2753 : __m128i xmm0 = _mm_loadu_si128(
2754 : reinterpret_cast<const __m128i *>(pSrcData + n));
2755 : __m128i xmm1 = _mm_loadu_si128(
2756 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
2757 : xmm0 = _mm_min_epu32(xmm0, xmm_MAX_INT);
2758 : xmm1 = _mm_min_epu32(xmm1, xmm_MAX_INT);
2759 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2760 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
2761 : xmm1);
2762 : }
2763 : for (; n < nWordCount; n++)
2764 : {
2765 : pDstData[n] = pSrcData[n] >= INT_MAX
2766 : ? INT_MAX
2767 : : static_cast<int32_t>(pSrcData[n]);
2768 : }
2769 : }
2770 : else
2771 : {
2772 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2773 : nDstPixelStride, nWordCount);
2774 : }
2775 : }
2776 :
2777 : template <>
2778 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
2779 : int nSrcPixelStride,
2780 : uint32_t *const CPL_RESTRICT pDstData,
2781 : int nDstPixelStride, GPtrDiff_t nWordCount)
2782 : {
2783 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2784 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2785 : {
2786 : decltype(nWordCount) n = 0;
2787 : const __m128i xmm_zero = _mm_setzero_si128();
2788 : for (; n < nWordCount - 7; n += 8)
2789 : {
2790 : __m128i xmm0 = _mm_loadu_si128(
2791 : reinterpret_cast<const __m128i *>(pSrcData + n));
2792 : __m128i xmm1 = _mm_loadu_si128(
2793 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
2794 : xmm0 = _mm_max_epi32(xmm0, xmm_zero);
2795 : xmm1 = _mm_max_epi32(xmm1, xmm_zero);
2796 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2797 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
2798 : xmm1);
2799 : }
2800 : for (; n < nWordCount; n++)
2801 : {
2802 : pDstData[n] =
2803 : pSrcData[n] < 0 ? 0 : static_cast<uint32_t>(pSrcData[n]);
2804 : }
2805 : }
2806 : else
2807 : {
2808 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2809 : nDstPixelStride, nWordCount);
2810 : }
2811 : }
2812 :
2813 : #endif // defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2814 :
2815 : template <>
2816 343 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2817 : int nSrcPixelStride,
2818 : float *const CPL_RESTRICT pDstData,
2819 : int nDstPixelStride, GPtrDiff_t nWordCount)
2820 : {
2821 343 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2822 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2823 : {
2824 337 : decltype(nWordCount) n = 0;
2825 337 : const __m128i xmm_zero = _mm_setzero_si128();
2826 337 : GByte *CPL_RESTRICT pabyDstDataPtr =
2827 : reinterpret_cast<GByte *>(pDstData);
2828 1508 : for (; n < nWordCount - 7; n += 8)
2829 : {
2830 1171 : __m128i xmm = _mm_loadu_si128(
2831 1171 : reinterpret_cast<const __m128i *>(pSrcData + n));
2832 1171 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
2833 1171 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
2834 1171 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2835 1171 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2836 1171 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2837 : xmm0_f);
2838 : _mm_storeu_ps(
2839 1171 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2840 : }
2841 1115 : for (; n < nWordCount; n++)
2842 : {
2843 778 : pDstData[n] = pSrcData[n];
2844 337 : }
2845 : }
2846 : else
2847 : {
2848 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2849 : nDstPixelStride, nWordCount);
2850 : }
2851 343 : }
2852 :
2853 : template <>
2854 1073480 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2855 : int nSrcPixelStride,
2856 : float *const CPL_RESTRICT pDstData,
2857 : int nDstPixelStride, GPtrDiff_t nWordCount)
2858 : {
2859 1073480 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2860 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2861 : {
2862 83580 : decltype(nWordCount) n = 0;
2863 83580 : GByte *CPL_RESTRICT pabyDstDataPtr =
2864 : reinterpret_cast<GByte *>(pDstData);
2865 565267 : for (; n < nWordCount - 7; n += 8)
2866 : {
2867 481687 : __m128i xmm = _mm_loadu_si128(
2868 481687 : reinterpret_cast<const __m128i *>(pSrcData + n));
2869 481687 : const auto sign = _mm_srai_epi16(xmm, 15);
2870 481687 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
2871 481687 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
2872 481687 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2873 481687 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2874 481687 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2875 : xmm0_f);
2876 : _mm_storeu_ps(
2877 481687 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2878 : }
2879 244181 : for (; n < nWordCount; n++)
2880 : {
2881 160601 : pDstData[n] = pSrcData[n];
2882 83580 : }
2883 : }
2884 : else
2885 : {
2886 989901 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2887 : nDstPixelStride, nWordCount);
2888 : }
2889 1073480 : }
2890 :
2891 : template <>
2892 405 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2893 : int nSrcPixelStride,
2894 : double *const CPL_RESTRICT pDstData,
2895 : int nDstPixelStride, GPtrDiff_t nWordCount)
2896 : {
2897 405 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2898 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2899 : {
2900 293 : decltype(nWordCount) n = 0;
2901 293 : const __m128i xmm_zero = _mm_setzero_si128();
2902 293 : GByte *CPL_RESTRICT pabyDstDataPtr =
2903 : reinterpret_cast<GByte *>(pDstData);
2904 809 : for (; n < nWordCount - 7; n += 8)
2905 : {
2906 516 : __m128i xmm = _mm_loadu_si128(
2907 516 : reinterpret_cast<const __m128i *>(pSrcData + n));
2908 516 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
2909 516 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
2910 :
2911 516 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2912 516 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2913 516 : xmm0 = _mm_srli_si128(xmm0, 8);
2914 516 : xmm1 = _mm_srli_si128(xmm1, 8);
2915 516 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2916 516 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2917 :
2918 516 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2919 : xmm0_low_d);
2920 : _mm_storeu_pd(
2921 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2922 : xmm0_high_d);
2923 : _mm_storeu_pd(
2924 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2925 : xmm1_low_d);
2926 : _mm_storeu_pd(
2927 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2928 : xmm1_high_d);
2929 : }
2930 1034 : for (; n < nWordCount; n++)
2931 : {
2932 741 : pDstData[n] = pSrcData[n];
2933 293 : }
2934 : }
2935 : else
2936 : {
2937 112 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2938 : nDstPixelStride, nWordCount);
2939 : }
2940 405 : }
2941 :
2942 : template <>
2943 2760350 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2944 : int nSrcPixelStride,
2945 : double *const CPL_RESTRICT pDstData,
2946 : int nDstPixelStride, GPtrDiff_t nWordCount)
2947 : {
2948 2760350 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2949 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2950 : {
2951 34660 : decltype(nWordCount) n = 0;
2952 34660 : GByte *CPL_RESTRICT pabyDstDataPtr =
2953 : reinterpret_cast<GByte *>(pDstData);
2954 401770 : for (; n < nWordCount - 7; n += 8)
2955 : {
2956 367110 : __m128i xmm = _mm_loadu_si128(
2957 367110 : reinterpret_cast<const __m128i *>(pSrcData + n));
2958 367110 : const auto sign = _mm_srai_epi16(xmm, 15);
2959 367110 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
2960 367110 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
2961 :
2962 367110 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2963 367110 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2964 367110 : xmm0 = _mm_srli_si128(xmm0, 8);
2965 367110 : xmm1 = _mm_srli_si128(xmm1, 8);
2966 367110 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2967 367110 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2968 :
2969 367110 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2970 : xmm0_low_d);
2971 : _mm_storeu_pd(
2972 367110 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2973 : xmm0_high_d);
2974 : _mm_storeu_pd(
2975 367110 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2976 : xmm1_low_d);
2977 : _mm_storeu_pd(
2978 367110 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2979 : xmm1_high_d);
2980 : }
2981 253693 : for (; n < nWordCount; n++)
2982 : {
2983 219033 : pDstData[n] = pSrcData[n];
2984 34660 : }
2985 : }
2986 : else
2987 : {
2988 2725690 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2989 : nDstPixelStride, nWordCount);
2990 : }
2991 2760350 : }
2992 :
2993 : #endif // HAVE_SSE2
2994 :
/**
 * Specialization for double -> GByte copies: forwards all arguments
 * unchanged to GDALCopyWordsT_8atatime().
 */
template <>
CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
                                 int nSrcPixelStride,
                                 GByte *const CPL_RESTRICT pDstData,
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
{
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
                            nDstPixelStride, nWordCount);
}
3004 :
/**
 * Specialization for double -> GUInt16 copies: forwards all arguments
 * unchanged to GDALCopyWordsT_8atatime().
 */
template <>
CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
                                 int nSrcPixelStride,
                                 GUInt16 *const CPL_RESTRICT pDstData,
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
{
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
                            nDstPixelStride, nWordCount);
}
3014 :
/**
 * Specialization for float -> double copies: forwards all arguments
 * unchanged to GDALCopyWordsT_8atatime().
 */
template <>
CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
                                 int nSrcPixelStride,
                                 double *const CPL_RESTRICT pDstData,
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
{
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
                            nDstPixelStride, nWordCount);
}
3024 :
/**
 * Specialization for double -> float copies: forwards all arguments
 * unchanged to GDALCopyWordsT_8atatime().
 */
template <>
CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
                                 int nSrcPixelStride,
                                 float *const CPL_RESTRICT pDstData,
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
{
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
                            nDstPixelStride, nWordCount);
}
3034 :
/**
 * Specialization for GFloat16 -> float copies: forwards all arguments
 * unchanged to GDALCopyWordsT_8atatime().
 */
template <>
CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
                                 int nSrcPixelStride,
                                 float *const CPL_RESTRICT pDstData,
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
{
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
                            nDstPixelStride, nWordCount);
}
3044 :
/**
 * Specialization for GFloat16 -> double copies: forwards all arguments
 * unchanged to GDALCopyWordsT_8atatime().
 */
template <>
CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
                                 int nSrcPixelStride,
                                 double *const CPL_RESTRICT pDstData,
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
{
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
                            nDstPixelStride, nWordCount);
}
3054 :
/**
 * Specialization for float -> GByte copies: forwards all arguments
 * unchanged to GDALCopyWordsT_8atatime().
 */
template <>
CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
                                 int nSrcPixelStride,
                                 GByte *const CPL_RESTRICT pDstData,
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
{
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
                            nDstPixelStride, nWordCount);
}
3064 :
/**
 * Specialization for float -> GInt8 copies: forwards all arguments
 * unchanged to GDALCopyWordsT_8atatime().
 */
template <>
CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
                                 int nSrcPixelStride,
                                 GInt8 *const CPL_RESTRICT pDstData,
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
{
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
                            nDstPixelStride, nWordCount);
}
3074 :
/**
 * Specialization for float -> GInt16 copies: forwards all arguments
 * unchanged to GDALCopyWordsT_8atatime().
 */
template <>
CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
                                 int nSrcPixelStride,
                                 GInt16 *const CPL_RESTRICT pDstData,
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
{
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
                            nDstPixelStride, nWordCount);
}
3084 :
/**
 * Specialization for float -> GUInt16 copies: forwards all arguments
 * unchanged to GDALCopyWordsT_8atatime().
 */
template <>
CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
                                 int nSrcPixelStride,
                                 GUInt16 *const CPL_RESTRICT pDstData,
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
{
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
                            nDstPixelStride, nWordCount);
}
3094 :
/**
 * Specialization for float -> GInt32 copies: forwards all arguments
 * unchanged to GDALCopyWordsT_8atatime().
 */
template <>
CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
                                 int nSrcPixelStride,
                                 GInt32 *const CPL_RESTRICT pDstData,
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
{
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
                            nDstPixelStride, nWordCount);
}
3104 :
/**
 * Specialization for float -> GFloat16 copies: forwards all arguments
 * unchanged to GDALCopyWordsT_8atatime().
 */
template <>
CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
                                 int nSrcPixelStride,
                                 GFloat16 *const CPL_RESTRICT pDstData,
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
{
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
                            nDstPixelStride, nWordCount);
}
3114 :
/**
 * Specialization for double -> GFloat16 copies: forwards all arguments
 * unchanged to GDALCopyWordsT_8atatime().
 */
template <>
CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
                                 int nSrcPixelStride,
                                 GFloat16 *const CPL_RESTRICT pDstData,
                                 int nDstPixelStride, GPtrDiff_t nWordCount)
{
    GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
                            nDstPixelStride, nWordCount);
}
3124 :
3125 : /************************************************************************/
3126 : /* GDALCopyWordsComplexT() */
3127 : /************************************************************************/
3128 : /**
3129 : * Template function, used to copy data from pSrcData into buffer
3130 : * pDstData, with stride nSrcPixelStride in the source data and
3131 : * stride nDstPixelStride in the destination data. Deals with the
3132 : * complex case, where input is complex and output is complex.
3133 : *
3134 : * @param pSrcData the source data buffer
3135 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
3136 : * of interest.
3137 : * @param pDstData the destination buffer.
3138 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
3139 : * interest.
3140 : * @param nWordCount the total number of pixel words to copy
3141 : *
3142 : */
3143 : template <class Tin, class Tout>
3144 98631 : inline void GDALCopyWordsComplexT(const Tin *const CPL_RESTRICT pSrcData,
3145 : int nSrcPixelStride,
3146 : Tout *const CPL_RESTRICT pDstData,
3147 : int nDstPixelStride, GPtrDiff_t nWordCount)
3148 : {
3149 98631 : decltype(nWordCount) nDstOffset = 0;
3150 98631 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
3151 98631 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
3152 :
3153 5630497 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
3154 : {
3155 5531861 : const Tin *const pPixelIn =
3156 5531861 : reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
3157 5531861 : Tout *const pPixelOut =
3158 5531861 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
3159 :
3160 5531861 : GDALCopyWord(pPixelIn[0], pPixelOut[0]);
3161 5531861 : GDALCopyWord(pPixelIn[1], pPixelOut[1]);
3162 :
3163 5531861 : nDstOffset += nDstPixelStride;
3164 : }
3165 98631 : }
3166 :
3167 : /************************************************************************/
3168 : /* GDALCopyWordsComplexOutT() */
3169 : /************************************************************************/
3170 : /**
3171 : * Template function, used to copy data from pSrcData into buffer
3172 : * pDstData, with stride nSrcPixelStride in the source data and
3173 : * stride nDstPixelStride in the destination data. Deals with the
3174 : * case where the value is real coming in, but complex going out.
3175 : *
3176 : * @param pSrcData the source data buffer
3177 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
3178 : * of interest, in bytes.
3179 : * @param pDstData the destination buffer.
3180 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
3181 : * interest, in bytes.
3182 : * @param nWordCount the total number of pixel words to copy
3183 : *
3184 : */
3185 : template <class Tin, class Tout>
3186 4394 : inline void GDALCopyWordsComplexOutT(const Tin *const CPL_RESTRICT pSrcData,
3187 : int nSrcPixelStride,
3188 : Tout *const CPL_RESTRICT pDstData,
3189 : int nDstPixelStride, GPtrDiff_t nWordCount)
3190 : {
3191 4394 : decltype(nWordCount) nDstOffset = 0;
3192 :
3193 4394 : const Tout tOutZero = static_cast<Tout>(0);
3194 :
3195 4394 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
3196 4394 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
3197 :
3198 1188704 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
3199 : {
3200 1184310 : const Tin tValue =
3201 1184310 : *reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
3202 1184310 : Tout *const pPixelOut =
3203 1184310 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
3204 1184310 : GDALCopyWord(tValue, *pPixelOut);
3205 :
3206 1184310 : pPixelOut[1] = tOutZero;
3207 :
3208 1184310 : nDstOffset += nDstPixelStride;
3209 : }
3210 4394 : }
3211 :
3212 : /************************************************************************/
3213 : /* GDALCopyWordsFromT() */
3214 : /************************************************************************/
3215 : /**
3216 : * Template driver function. Given the input type T, call the appropriate
3217 : * GDALCopyWordsT function template for the desired output type. You should
3218 : * never call this function directly (call GDALCopyWords instead).
3219 : *
3220 : * @param pSrcData source data buffer
3221 : * @param nSrcPixelStride pixel stride in input buffer, in pixel words
3222 : * @param bInComplex input is complex
3223 : * @param pDstData destination data buffer
3224 : * @param eDstType destination data type
3225 : * @param nDstPixelStride pixel stride in output buffer, in pixel words
3226 : * @param nWordCount number of pixel words to be copied
3227 : */
3228 : template <class T>
3229 54346773 : inline void GDALCopyWordsFromT(const T *const CPL_RESTRICT pSrcData,
3230 : int nSrcPixelStride, bool bInComplex,
3231 : void *CPL_RESTRICT pDstData,
3232 : GDALDataType eDstType, int nDstPixelStride,
3233 : GPtrDiff_t nWordCount)
3234 : {
3235 54346773 : switch (eDstType)
3236 : {
3237 4783834 : case GDT_UInt8:
3238 4783834 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3239 : static_cast<unsigned char *>(pDstData),
3240 : nDstPixelStride, nWordCount);
3241 4783834 : break;
3242 753 : case GDT_Int8:
3243 753 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3244 : static_cast<signed char *>(pDstData),
3245 : nDstPixelStride, nWordCount);
3246 753 : break;
3247 140646 : case GDT_UInt16:
3248 140646 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3249 : static_cast<unsigned short *>(pDstData),
3250 : nDstPixelStride, nWordCount);
3251 140646 : break;
3252 4162591 : case GDT_Int16:
3253 4162591 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3254 : static_cast<short *>(pDstData), nDstPixelStride,
3255 : nWordCount);
3256 4162591 : break;
3257 22554 : case GDT_UInt32:
3258 22554 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3259 : static_cast<unsigned int *>(pDstData),
3260 : nDstPixelStride, nWordCount);
3261 22554 : break;
3262 26066531 : case GDT_Int32:
3263 26066531 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3264 : static_cast<int *>(pDstData), nDstPixelStride,
3265 : nWordCount);
3266 26066531 : break;
3267 1110 : case GDT_UInt64:
3268 1110 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3269 : static_cast<std::uint64_t *>(pDstData),
3270 : nDstPixelStride, nWordCount);
3271 1110 : break;
3272 5754 : case GDT_Int64:
3273 5754 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3274 : static_cast<std::int64_t *>(pDstData),
3275 : nDstPixelStride, nWordCount);
3276 5754 : break;
3277 997 : case GDT_Float16:
3278 997 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3279 : static_cast<GFloat16 *>(pDstData), nDstPixelStride,
3280 : nWordCount);
3281 997 : break;
3282 3836699 : case GDT_Float32:
3283 3836699 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3284 : static_cast<float *>(pDstData), nDstPixelStride,
3285 : nWordCount);
3286 3836699 : break;
3287 15222308 : case GDT_Float64:
3288 15222308 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3289 : static_cast<double *>(pDstData), nDstPixelStride,
3290 : nWordCount);
3291 15222308 : break;
3292 94424 : case GDT_CInt16:
3293 94424 : if (bInComplex)
3294 : {
3295 93170 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3296 : static_cast<short *>(pDstData),
3297 : nDstPixelStride, nWordCount);
3298 : }
3299 : else // input is not complex, so we need to promote to a complex
3300 : // buffer
3301 : {
3302 1254 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3303 : static_cast<short *>(pDstData),
3304 : nDstPixelStride, nWordCount);
3305 : }
3306 94424 : break;
3307 1349 : case GDT_CInt32:
3308 1349 : if (bInComplex)
3309 : {
3310 717 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3311 : static_cast<int *>(pDstData),
3312 : nDstPixelStride, nWordCount);
3313 : }
3314 : else // input is not complex, so we need to promote to a complex
3315 : // buffer
3316 : {
3317 632 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3318 : static_cast<int *>(pDstData),
3319 : nDstPixelStride, nWordCount);
3320 : }
3321 1349 : break;
3322 313 : case GDT_CFloat16:
3323 313 : if (bInComplex)
3324 : {
3325 48 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3326 : static_cast<GFloat16 *>(pDstData),
3327 : nDstPixelStride, nWordCount);
3328 : }
3329 : else // input is not complex, so we need to promote to a complex
3330 : // buffer
3331 : {
3332 265 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3333 : static_cast<GFloat16 *>(pDstData),
3334 : nDstPixelStride, nWordCount);
3335 : }
3336 313 : break;
3337 3791 : case GDT_CFloat32:
3338 3791 : if (bInComplex)
3339 : {
3340 2994 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3341 : static_cast<float *>(pDstData),
3342 : nDstPixelStride, nWordCount);
3343 : }
3344 : else // input is not complex, so we need to promote to a complex
3345 : // buffer
3346 : {
3347 797 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3348 : static_cast<float *>(pDstData),
3349 : nDstPixelStride, nWordCount);
3350 : }
3351 3791 : break;
3352 3148 : case GDT_CFloat64:
3353 3148 : if (bInComplex)
3354 : {
3355 1702 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3356 : static_cast<double *>(pDstData),
3357 : nDstPixelStride, nWordCount);
3358 : }
3359 : else // input is not complex, so we need to promote to a complex
3360 : // buffer
3361 : {
3362 1446 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3363 : static_cast<double *>(pDstData),
3364 : nDstPixelStride, nWordCount);
3365 : }
3366 3148 : break;
3367 0 : case GDT_Unknown:
3368 : case GDT_TypeCount:
3369 0 : CPLAssert(false);
3370 : }
3371 54346773 : }
3372 :
3373 : } // end anonymous namespace
3374 :
3375 : /************************************************************************/
3376 : /* GDALReplicateWord() */
3377 : /************************************************************************/
3378 :
3379 : template <class T>
3380 598663 : inline void GDALReplicateWordT(void *pDstData, int nDstPixelStride,
3381 : GPtrDiff_t nWordCount)
3382 : {
3383 598663 : const T valSet = *static_cast<const T *>(pDstData);
3384 598663 : if (nDstPixelStride == static_cast<int>(sizeof(T)))
3385 : {
3386 568912 : T *pDstPtr = static_cast<T *>(pDstData) + 1;
3387 31900003 : while (nWordCount >= 4)
3388 : {
3389 31331068 : nWordCount -= 4;
3390 31331068 : pDstPtr[0] = valSet;
3391 31331068 : pDstPtr[1] = valSet;
3392 31331068 : pDstPtr[2] = valSet;
3393 31331068 : pDstPtr[3] = valSet;
3394 31331068 : pDstPtr += 4;
3395 : }
3396 1470357 : while (nWordCount > 0)
3397 : {
3398 901445 : --nWordCount;
3399 901445 : *pDstPtr = valSet;
3400 901445 : pDstPtr++;
3401 : }
3402 : }
3403 : else
3404 : {
3405 29751 : GByte *pabyDstPtr = static_cast<GByte *>(pDstData) + nDstPixelStride;
3406 1040338 : while (nWordCount > 0)
3407 : {
3408 1010587 : --nWordCount;
3409 1010587 : *reinterpret_cast<T *>(pabyDstPtr) = valSet;
3410 1010587 : pabyDstPtr += nDstPixelStride;
3411 : }
3412 : }
3413 598663 : }
3414 :
3415 1050460 : static void GDALReplicateWord(const void *CPL_RESTRICT pSrcData,
3416 : GDALDataType eSrcType,
3417 : void *CPL_RESTRICT pDstData,
3418 : GDALDataType eDstType, int nDstPixelStride,
3419 : GPtrDiff_t nWordCount)
3420 : {
3421 : /* -----------------------------------------------------------------------
3422 : */
3423 : /* Special case when the source data is always the same value */
3424 : /* (for VRTSourcedRasterBand::IRasterIO and
3425 : * VRTDerivedRasterBand::IRasterIO*/
3426 : /* for example) */
3427 : /* -----------------------------------------------------------------------
3428 : */
3429 : // Let the general translation case do the necessary conversions
3430 : // on the first destination element.
3431 1050460 : GDALCopyWords64(pSrcData, eSrcType, 0, pDstData, eDstType, 0, 1);
3432 :
3433 : // Now copy the first element to the nWordCount - 1 following destination
3434 : // elements.
3435 1050460 : nWordCount--;
3436 1050460 : GByte *pabyDstWord = reinterpret_cast<GByte *>(pDstData) + nDstPixelStride;
3437 :
3438 1050460 : switch (eDstType)
3439 : {
3440 451704 : case GDT_UInt8:
3441 : case GDT_Int8:
3442 : {
3443 451704 : if (nDstPixelStride == 1)
3444 : {
3445 380124 : if (nWordCount > 0)
3446 380124 : memset(pabyDstWord,
3447 380124 : *reinterpret_cast<const GByte *>(pDstData),
3448 : nWordCount);
3449 : }
3450 : else
3451 : {
3452 71580 : GByte valSet = *reinterpret_cast<const GByte *>(pDstData);
3453 54467500 : while (nWordCount > 0)
3454 : {
3455 54395900 : --nWordCount;
3456 54395900 : *pabyDstWord = valSet;
3457 54395900 : pabyDstWord += nDstPixelStride;
3458 : }
3459 : }
3460 451704 : break;
3461 : }
3462 :
3463 : #define CASE_DUPLICATE_SIMPLE(enum_type, c_type) \
3464 : case enum_type: \
3465 : { \
3466 : GDALReplicateWordT<c_type>(pDstData, nDstPixelStride, nWordCount); \
3467 : break; \
3468 : }
3469 :
3470 34507 : CASE_DUPLICATE_SIMPLE(GDT_UInt16, GUInt16)
3471 202447 : CASE_DUPLICATE_SIMPLE(GDT_Int16, GInt16)
3472 56 : CASE_DUPLICATE_SIMPLE(GDT_UInt32, GUInt32)
3473 300535 : CASE_DUPLICATE_SIMPLE(GDT_Int32, GInt32)
3474 23 : CASE_DUPLICATE_SIMPLE(GDT_UInt64, std::uint64_t)
3475 1066 : CASE_DUPLICATE_SIMPLE(GDT_Int64, std::int64_t)
3476 0 : CASE_DUPLICATE_SIMPLE(GDT_Float16, GFloat16)
3477 52668 : CASE_DUPLICATE_SIMPLE(GDT_Float32, float)
3478 7361 : CASE_DUPLICATE_SIMPLE(GDT_Float64, double)
3479 :
3480 : #define CASE_DUPLICATE_COMPLEX(enum_type, c_type) \
3481 : case enum_type: \
3482 : { \
3483 : c_type valSet1 = reinterpret_cast<const c_type *>(pDstData)[0]; \
3484 : c_type valSet2 = reinterpret_cast<const c_type *>(pDstData)[1]; \
3485 : while (nWordCount > 0) \
3486 : { \
3487 : --nWordCount; \
3488 : reinterpret_cast<c_type *>(pabyDstWord)[0] = valSet1; \
3489 : reinterpret_cast<c_type *>(pabyDstWord)[1] = valSet2; \
3490 : pabyDstWord += nDstPixelStride; \
3491 : } \
3492 : break; \
3493 : }
3494 :
3495 784 : CASE_DUPLICATE_COMPLEX(GDT_CInt16, GInt16)
3496 784 : CASE_DUPLICATE_COMPLEX(GDT_CInt32, GInt32)
3497 6 : CASE_DUPLICATE_COMPLEX(GDT_CFloat16, GFloat16)
3498 790 : CASE_DUPLICATE_COMPLEX(GDT_CFloat32, float)
3499 790 : CASE_DUPLICATE_COMPLEX(GDT_CFloat64, double)
3500 :
3501 0 : case GDT_Unknown:
3502 : case GDT_TypeCount:
3503 0 : CPLAssert(false);
3504 : }
3505 1050460 : }
3506 :
3507 : /************************************************************************/
3508 : /* GDALUnrolledCopy() */
3509 : /************************************************************************/
3510 :
3511 : template <class T, int srcStride, int dstStride>
3512 3037055 : static inline void GDALUnrolledCopyGeneric(T *CPL_RESTRICT pDest,
3513 : const T *CPL_RESTRICT pSrc,
3514 : GPtrDiff_t nIters)
3515 : {
3516 3037055 : if (nIters >= 16)
3517 : {
3518 133208509 : for (GPtrDiff_t i = nIters / 16; i != 0; i--)
3519 : {
3520 130298156 : pDest[0 * dstStride] = pSrc[0 * srcStride];
3521 130298156 : pDest[1 * dstStride] = pSrc[1 * srcStride];
3522 130298156 : pDest[2 * dstStride] = pSrc[2 * srcStride];
3523 130298156 : pDest[3 * dstStride] = pSrc[3 * srcStride];
3524 130298156 : pDest[4 * dstStride] = pSrc[4 * srcStride];
3525 130298156 : pDest[5 * dstStride] = pSrc[5 * srcStride];
3526 130298156 : pDest[6 * dstStride] = pSrc[6 * srcStride];
3527 130298156 : pDest[7 * dstStride] = pSrc[7 * srcStride];
3528 130298156 : pDest[8 * dstStride] = pSrc[8 * srcStride];
3529 130298156 : pDest[9 * dstStride] = pSrc[9 * srcStride];
3530 130298156 : pDest[10 * dstStride] = pSrc[10 * srcStride];
3531 130298156 : pDest[11 * dstStride] = pSrc[11 * srcStride];
3532 130298156 : pDest[12 * dstStride] = pSrc[12 * srcStride];
3533 130298156 : pDest[13 * dstStride] = pSrc[13 * srcStride];
3534 130298156 : pDest[14 * dstStride] = pSrc[14 * srcStride];
3535 130298156 : pDest[15 * dstStride] = pSrc[15 * srcStride];
3536 130298156 : pDest += 16 * dstStride;
3537 130298156 : pSrc += 16 * srcStride;
3538 : }
3539 2910399 : nIters = nIters % 16;
3540 : }
3541 5199161 : for (GPtrDiff_t i = 0; i < nIters; i++)
3542 : {
3543 2162107 : pDest[i * dstStride] = *pSrc;
3544 2162107 : pSrc += srcStride;
3545 : }
3546 3037055 : }
3547 :
3548 : template <class T, int srcStride, int dstStride>
3549 3030955 : static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
3550 : const T *CPL_RESTRICT pSrc,
3551 : GPtrDiff_t nIters)
3552 : {
3553 3030955 : GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
3554 3030955 : }
3555 :
3556 : #ifdef HAVE_SSE2
3557 :
3558 : template <>
3559 353436 : void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
3560 : const GByte *CPL_RESTRICT pSrc,
3561 : GPtrDiff_t nIters)
3562 : {
3563 353436 : decltype(nIters) i = 0;
3564 353436 : if (nIters > 16)
3565 : {
3566 195179 : const __m128i xmm_mask = _mm_set1_epi16(0xff);
3567 : // If we were sure that there would always be 1 trailing byte, we could
3568 : // check against nIters - 15
3569 2996300 : for (; i < nIters - 16; i += 16)
3570 : {
3571 : __m128i xmm0 =
3572 2801120 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
3573 : __m128i xmm1 =
3574 5602250 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
3575 : // Set higher 8bit of each int16 packed word to 0
3576 2801120 : xmm0 = _mm_and_si128(xmm0, xmm_mask);
3577 2801120 : xmm1 = _mm_and_si128(xmm1, xmm_mask);
3578 : // Pack int16 to uint8 and merge back both vector
3579 2801120 : xmm0 = _mm_packus_epi16(xmm0, xmm1);
3580 :
3581 : // Store result
3582 2801120 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
3583 :
3584 2801120 : pSrc += 2 * 16;
3585 : }
3586 : }
3587 4628680 : for (; i < nIters; i++)
3588 : {
3589 4275240 : pDest[i] = *pSrc;
3590 4275240 : pSrc += 2;
3591 : }
3592 353436 : }
3593 :
3594 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
3595 :
3596 : template <>
3597 192064 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
3598 : const GByte *CPL_RESTRICT pSrc,
3599 : GPtrDiff_t nIters)
3600 : {
3601 192064 : if (nIters > 16 && CPLHaveRuntimeSSSE3())
3602 : {
3603 185964 : GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
3604 : }
3605 : else
3606 : {
3607 6100 : GDALUnrolledCopyGeneric<GByte, 3, 1>(pDest, pSrc, nIters);
3608 : }
3609 192064 : }
3610 :
3611 : #endif
3612 :
3613 : template <>
3614 106698 : void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
3615 : const GByte *CPL_RESTRICT pSrc,
3616 : GPtrDiff_t nIters)
3617 : {
3618 106698 : decltype(nIters) i = 0;
3619 106698 : if (nIters > 16)
3620 : {
3621 101405 : const __m128i xmm_mask = _mm_set1_epi32(0xff);
3622 : // If we were sure that there would always be 3 trailing bytes, we could
3623 : // check against nIters - 15
3624 11580500 : for (; i < nIters - 16; i += 16)
3625 : {
3626 : __m128i xmm0 =
3627 11479100 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
3628 : __m128i xmm1 =
3629 11479100 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
3630 : __m128i xmm2 =
3631 11479100 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
3632 : __m128i xmm3 =
3633 22958200 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 48));
3634 : // Set higher 24bit of each int32 packed word to 0
3635 11479100 : xmm0 = _mm_and_si128(xmm0, xmm_mask);
3636 11479100 : xmm1 = _mm_and_si128(xmm1, xmm_mask);
3637 11479100 : xmm2 = _mm_and_si128(xmm2, xmm_mask);
3638 11479100 : xmm3 = _mm_and_si128(xmm3, xmm_mask);
3639 : // Pack int32 to int16
3640 11479100 : xmm0 = _mm_packs_epi32(xmm0, xmm1);
3641 11479100 : xmm2 = _mm_packs_epi32(xmm2, xmm3);
3642 : // Pack int16 to uint8
3643 11479100 : xmm0 = _mm_packus_epi16(xmm0, xmm2);
3644 :
3645 : // Store result
3646 11479100 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
3647 :
3648 11479100 : pSrc += 4 * 16;
3649 : }
3650 : }
3651 1143150 : for (; i < nIters; i++)
3652 : {
3653 1036450 : pDest[i] = *pSrc;
3654 1036450 : pSrc += 4;
3655 : }
3656 106698 : }
3657 : #endif // HAVE_SSE2
3658 :
3659 : /************************************************************************/
3660 : /* GDALFastCopy() */
3661 : /************************************************************************/
3662 :
3663 : template <class T>
3664 39774200 : static inline void GDALFastCopy(T *CPL_RESTRICT pDest, int nDestStride,
3665 : const T *CPL_RESTRICT pSrc, int nSrcStride,
3666 : GPtrDiff_t nIters)
3667 : {
3668 39774200 : constexpr int sizeofT = static_cast<int>(sizeof(T));
3669 39774200 : if (nIters == 1)
3670 : {
3671 22297220 : *pDest = *pSrc;
3672 : }
3673 17476914 : else if (nDestStride == sizeofT)
3674 : {
3675 14372472 : if (nSrcStride == sizeofT)
3676 : {
3677 13513459 : memcpy(pDest, pSrc, nIters * sizeof(T));
3678 : }
3679 859029 : else if (nSrcStride == 2 * sizeofT)
3680 : {
3681 356651 : GDALUnrolledCopy<T, 2, 1>(pDest, pSrc, nIters);
3682 : }
3683 502378 : else if (nSrcStride == 3 * sizeofT)
3684 : {
3685 288642 : GDALUnrolledCopy<T, 3, 1>(pDest, pSrc, nIters);
3686 : }
3687 213736 : else if (nSrcStride == 4 * sizeofT)
3688 : {
3689 110680 : GDALUnrolledCopy<T, 4, 1>(pDest, pSrc, nIters);
3690 : }
3691 : else
3692 : {
3693 17219290 : while (nIters-- > 0)
3694 : {
3695 17116250 : *pDest = *pSrc;
3696 17116250 : pSrc += nSrcStride / sizeofT;
3697 17116250 : pDest++;
3698 : }
3699 : }
3700 : }
3701 3104492 : else if (nSrcStride == sizeofT)
3702 : {
3703 3091496 : if (nDestStride == 2 * sizeofT)
3704 : {
3705 150268 : GDALUnrolledCopy<T, 1, 2>(pDest, pSrc, nIters);
3706 : }
3707 2941225 : else if (nDestStride == 3 * sizeofT)
3708 : {
3709 2113491 : GDALUnrolledCopy<T, 1, 3>(pDest, pSrc, nIters);
3710 : }
3711 827733 : else if (nDestStride == 4 * sizeofT)
3712 : {
3713 663421 : GDALUnrolledCopy<T, 1, 4>(pDest, pSrc, nIters);
3714 : }
3715 : else
3716 : {
3717 17169660 : while (nIters-- > 0)
3718 : {
3719 17005410 : *pDest = *pSrc;
3720 17005410 : pSrc++;
3721 17005410 : pDest += nDestStride / sizeofT;
3722 : }
3723 : }
3724 : }
3725 : else
3726 : {
3727 1220108 : while (nIters-- > 0)
3728 : {
3729 1207102 : *pDest = *pSrc;
3730 1207102 : pSrc += nSrcStride / sizeofT;
3731 1207102 : pDest += nDestStride / sizeofT;
3732 : }
3733 : }
3734 39774200 : }
3735 :
3736 : /************************************************************************/
3737 : /* GDALFastCopyByte() */
3738 : /************************************************************************/
3739 :
3740 326250 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
3741 : int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
3742 : int nDstPixelStride, GPtrDiff_t nWordCount)
3743 : {
3744 326250 : GDALFastCopy(pDstData, nDstPixelStride, pSrcData, nSrcPixelStride,
3745 : nWordCount);
3746 326250 : }
3747 :
3748 : /************************************************************************/
3749 : /* GDALCopyWords() */
3750 : /************************************************************************/
3751 :
3752 : /**
3753 : * Copy pixel words from buffer to buffer.
3754 : *
3755 : * @see GDALCopyWords64()
3756 : */
3757 78066600 : void CPL_STDCALL GDALCopyWords(const void *CPL_RESTRICT pSrcData,
3758 : GDALDataType eSrcType, int nSrcPixelStride,
3759 : void *CPL_RESTRICT pDstData,
3760 : GDALDataType eDstType, int nDstPixelStride,
3761 : int nWordCount)
3762 : {
3763 78066600 : GDALCopyWords64(pSrcData, eSrcType, nSrcPixelStride, pDstData, eDstType,
3764 : nDstPixelStride, nWordCount);
3765 78066600 : }
3766 :
3767 : /************************************************************************/
3768 : /* GDALCopyWords64() */
3769 : /************************************************************************/
3770 :
3771 : /**
3772 : * Copy pixel words from buffer to buffer.
3773 : *
3774 : * This function is used to copy pixel word values from one memory buffer
3775 : * to another, with support for conversion between data types, and differing
3776 : * step factors. The data type conversion is done using the following
3777 : * rules:
3778 : * <ul>
3779 : * <li>Values assigned to a lower range integer type are clipped. For
3780 : * instance assigning GDT_Int16 values to a GDT_UInt8 buffer will cause values
3781 : * less the 0 to be set to 0, and values larger than 255 to be set to 255.
3782 : * </li>
3783 : * <li>
3784 : * Assignment from floating point to integer rounds to closest integer.
3785 : * +Infinity is mapped to the largest integer. -Infinity is mapped to the
3786 : * smallest integer. NaN is mapped to 0.
3787 : * </li>
3788 : * <li>
3789 : * Assignment from non-complex to complex will result in the imaginary part
3790 : * being set to zero on output.
3791 : * </li>
3792 : * <li> Assignment from complex to
3793 : * non-complex will result in the complex portion being lost and the real
3794 : * component being preserved (<i>not magnitude!</i>).
3795 : * </li>
3796 : * </ul>
3797 : *
3798 : * No assumptions are made about the source or destination words occurring
3799 : * on word boundaries. It is assumed that all values are in native machine
3800 : * byte order.
3801 : *
3802 : * @param pSrcData Pointer to source data to be converted.
3803 : * @param eSrcType the source data type (see GDALDataType enum)
3804 : * @param nSrcPixelStride Source pixel stride (i.e. distance between 2 words),
3805 : * in bytes
3806 : * @param pDstData Pointer to buffer where destination data should go
3807 : * @param eDstType the destination data type (see GDALDataType enum)
3808 : * @param nDstPixelStride Destination pixel stride (i.e. distance between 2
3809 : * words), in bytes
3810 : * @param nWordCount number of words to be copied
3811 : *
3812 : * @note
3813 : * When adding a new data type to GDAL, you must do the following to
3814 : * support it properly within the GDALCopyWords function:
3815 : * 1. Add the data type to the switch on eSrcType in GDALCopyWords.
3816 : * This should invoke the appropriate GDALCopyWordsFromT wrapper.
3817 : * 2. Add the data type to the switch on eDstType in GDALCopyWordsFromT.
3818 : * This should call the appropriate GDALCopyWordsT template.
3819 : * 3. If appropriate, overload the appropriate CopyWord template in the
3820 : * above namespace. This will ensure that any conversion issues are
3821 : * handled (cases like the float -> int32 case, where the min/max)
3822 : * values are subject to roundoff error.
3823 : */
3824 :
3825 108950000 : void CPL_STDCALL GDALCopyWords64(const void *CPL_RESTRICT pSrcData,
3826 : GDALDataType eSrcType, int nSrcPixelStride,
3827 : void *CPL_RESTRICT pDstData,
3828 : GDALDataType eDstType, int nDstPixelStride,
3829 : GPtrDiff_t nWordCount)
3830 :
3831 : {
3832 : // On platforms where alignment matters, be careful
3833 108950000 : const int nSrcDataTypeSize = GDALGetDataTypeSizeBytes(eSrcType);
3834 108950000 : const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(eDstType);
3835 108950000 : if (CPL_UNLIKELY(nSrcDataTypeSize == 0 || nDstDataTypeSize == 0))
3836 : {
3837 2 : CPLError(CE_Failure, CPLE_NotSupported,
3838 : "GDALCopyWords64(): unsupported GDT_Unknown/GDT_TypeCount "
3839 : "argument");
3840 2 : return;
3841 : }
3842 108950000 : if (!(eSrcType == eDstType && nSrcPixelStride == nDstPixelStride) &&
3843 59159600 : ((reinterpret_cast<uintptr_t>(pSrcData) % nSrcDataTypeSize) != 0 ||
3844 59159600 : (reinterpret_cast<uintptr_t>(pDstData) % nDstDataTypeSize) != 0 ||
3845 59159200 : (nSrcPixelStride % nSrcDataTypeSize) != 0 ||
3846 59159100 : (nDstPixelStride % nDstDataTypeSize) != 0))
3847 : {
3848 905 : if (eSrcType == eDstType)
3849 : {
3850 34800 : for (decltype(nWordCount) i = 0; i < nWordCount; i++)
3851 : {
3852 34000 : memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
3853 : static_cast<const GByte *>(pSrcData) +
3854 34000 : nSrcPixelStride * i,
3855 : nDstDataTypeSize);
3856 : }
3857 : }
3858 : else
3859 : {
3860 210 : const auto getAlignedPtr = [](GByte *ptr, int align)
3861 : {
3862 : return ptr +
3863 210 : ((align - (reinterpret_cast<uintptr_t>(ptr) % align)) %
3864 210 : align);
3865 : };
3866 :
3867 : // The largest we need is for CFloat64 (16 bytes), so 32 bytes to
3868 : // be sure to get correctly aligned pointer.
3869 105 : constexpr size_t SIZEOF_CFLOAT64 = 2 * sizeof(double);
3870 : GByte abySrcBuffer[2 * SIZEOF_CFLOAT64];
3871 : GByte abyDstBuffer[2 * SIZEOF_CFLOAT64];
3872 : GByte *pabySrcBuffer =
3873 105 : getAlignedPtr(abySrcBuffer, nSrcDataTypeSize);
3874 : GByte *pabyDstBuffer =
3875 105 : getAlignedPtr(abyDstBuffer, nDstDataTypeSize);
3876 3360 : for (decltype(nWordCount) i = 0; i < nWordCount; i++)
3877 : {
3878 3255 : memcpy(pabySrcBuffer,
3879 : static_cast<const GByte *>(pSrcData) +
3880 3255 : nSrcPixelStride * i,
3881 : nSrcDataTypeSize);
3882 3255 : GDALCopyWords64(pabySrcBuffer, eSrcType, 0, pabyDstBuffer,
3883 : eDstType, 0, 1);
3884 3255 : memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
3885 : pabyDstBuffer, nDstDataTypeSize);
3886 : }
3887 : }
3888 905 : return;
3889 : }
3890 :
3891 : // Deal with the case where we're replicating a single word into the
3892 : // provided buffer
3893 108949000 : if (nSrcPixelStride == 0 && nWordCount > 1)
3894 : {
3895 1050460 : GDALReplicateWord(pSrcData, eSrcType, pDstData, eDstType,
3896 : nDstPixelStride, nWordCount);
3897 1050460 : return;
3898 : }
3899 :
3900 107899000 : if (eSrcType == eDstType)
3901 : {
3902 53814000 : if (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8)
3903 : {
3904 17996900 : GDALFastCopy(static_cast<GByte *>(pDstData), nDstPixelStride,
3905 : static_cast<const GByte *>(pSrcData), nSrcPixelStride,
3906 : nWordCount);
3907 17996900 : return;
3908 : }
3909 :
3910 35817100 : if (nSrcDataTypeSize == 2 && (nSrcPixelStride % 2) == 0 &&
3911 21451000 : (nDstPixelStride % 2) == 0)
3912 : {
3913 21451000 : GDALFastCopy(static_cast<short *>(pDstData), nDstPixelStride,
3914 : static_cast<const short *>(pSrcData), nSrcPixelStride,
3915 : nWordCount);
3916 21451000 : return;
3917 : }
3918 :
3919 14366100 : if (nWordCount == 1)
3920 : {
3921 : #if defined(CSA_BUILD) || defined(__COVERITY__)
3922 : // Avoid false positives...
3923 : memcpy(pDstData, pSrcData, nSrcDataTypeSize);
3924 : #else
3925 13908600 : if (nSrcDataTypeSize == 2)
3926 0 : memcpy(pDstData, pSrcData, 2);
3927 13908600 : else if (nSrcDataTypeSize == 4)
3928 13813500 : memcpy(pDstData, pSrcData, 4);
3929 95125 : else if (nSrcDataTypeSize == 8)
3930 78520 : memcpy(pDstData, pSrcData, 8);
3931 : else /* if( eSrcType == GDT_CFloat64 ) */
3932 16605 : memcpy(pDstData, pSrcData, 16);
3933 : #endif
3934 13908600 : return;
3935 : }
3936 :
3937 : // Let memcpy() handle the case where we're copying a packed buffer
3938 : // of pixels.
3939 457421 : if (nSrcPixelStride == nDstPixelStride)
3940 : {
3941 195607 : if (nSrcPixelStride == nSrcDataTypeSize)
3942 : {
3943 195539 : memcpy(pDstData, pSrcData, nWordCount * nSrcDataTypeSize);
3944 195539 : return;
3945 : }
3946 : }
3947 : }
3948 :
3949 : // Handle the more general case -- deals with conversion of data types
3950 : // directly.
3951 54346800 : switch (eSrcType)
3952 : {
3953 15544900 : case GDT_UInt8:
3954 15544900 : GDALCopyWordsFromT<unsigned char>(
3955 : static_cast<const unsigned char *>(pSrcData), nSrcPixelStride,
3956 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
3957 15544900 : break;
3958 1291 : case GDT_Int8:
3959 1291 : GDALCopyWordsFromT<signed char>(
3960 : static_cast<const signed char *>(pSrcData), nSrcPixelStride,
3961 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
3962 1291 : break;
3963 54285 : case GDT_UInt16:
3964 54285 : GDALCopyWordsFromT<unsigned short>(
3965 : static_cast<const unsigned short *>(pSrcData), nSrcPixelStride,
3966 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
3967 54285 : break;
3968 4353740 : case GDT_Int16:
3969 4353740 : GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
3970 : nSrcPixelStride, false, pDstData,
3971 : eDstType, nDstPixelStride, nWordCount);
3972 4353740 : break;
3973 7432 : case GDT_UInt32:
3974 7432 : GDALCopyWordsFromT<unsigned int>(
3975 : static_cast<const unsigned int *>(pSrcData), nSrcPixelStride,
3976 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
3977 7432 : break;
3978 12255400 : case GDT_Int32:
3979 12255400 : GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
3980 : nSrcPixelStride, false, pDstData, eDstType,
3981 : nDstPixelStride, nWordCount);
3982 12255400 : break;
3983 1957 : case GDT_UInt64:
3984 1957 : GDALCopyWordsFromT<std::uint64_t>(
3985 : static_cast<const std::uint64_t *>(pSrcData), nSrcPixelStride,
3986 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
3987 1957 : break;
3988 11578 : case GDT_Int64:
3989 11578 : GDALCopyWordsFromT<std::int64_t>(
3990 : static_cast<const std::int64_t *>(pSrcData), nSrcPixelStride,
3991 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
3992 11578 : break;
3993 1371 : case GDT_Float16:
3994 1371 : GDALCopyWordsFromT<GFloat16>(
3995 : static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, false,
3996 : pDstData, eDstType, nDstPixelStride, nWordCount);
3997 1371 : break;
3998 657732 : case GDT_Float32:
3999 657732 : GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
4000 : nSrcPixelStride, false, pDstData,
4001 : eDstType, nDstPixelStride, nWordCount);
4002 657732 : break;
4003 20697400 : case GDT_Float64:
4004 20697400 : GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
4005 : nSrcPixelStride, false, pDstData,
4006 : eDstType, nDstPixelStride, nWordCount);
4007 20697400 : break;
4008 478485 : case GDT_CInt16:
4009 478485 : GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
4010 : nSrcPixelStride, true, pDstData, eDstType,
4011 : nDstPixelStride, nWordCount);
4012 478485 : break;
4013 868 : case GDT_CInt32:
4014 868 : GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
4015 : nSrcPixelStride, true, pDstData, eDstType,
4016 : nDstPixelStride, nWordCount);
4017 868 : break;
4018 508 : case GDT_CFloat16:
4019 508 : GDALCopyWordsFromT<GFloat16>(
4020 : static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, true,
4021 : pDstData, eDstType, nDstPixelStride, nWordCount);
4022 508 : break;
4023 2389 : case GDT_CFloat32:
4024 2389 : GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
4025 : nSrcPixelStride, true, pDstData, eDstType,
4026 : nDstPixelStride, nWordCount);
4027 2389 : break;
4028 277349 : case GDT_CFloat64:
4029 277349 : GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
4030 : nSrcPixelStride, true, pDstData,
4031 : eDstType, nDstPixelStride, nWordCount);
4032 277349 : break;
4033 0 : case GDT_Unknown:
4034 : case GDT_TypeCount:
4035 0 : CPLAssert(false);
4036 : }
4037 : }
4038 :
4039 : /************************************************************************/
4040 : /* GDALCopyBits() */
4041 : /************************************************************************/
4042 :
4043 : /**
4044 : * Bitwise word copying.
4045 : *
4046 : * A function for moving sets of partial bytes around. Loosely
4047 : * speaking this is a bitwise analog to GDALCopyWords().
4048 : *
4049 : * It copies nStepCount "words" where each word is nBitCount bits long.
4050 : * The nSrcStep and nDstStep are the number of bits from the start of one
4051 : * word to the next (same as nBitCount if they are packed). The nSrcOffset
4052 : * and nDstOffset are the offset into the source and destination buffers
4053 : * to start at, also measured in bits.
4054 : *
4055 : * All bit offsets are assumed to start from the high order bit in a byte
4056 : * (i.e. most significant bit first). Currently this function is not very
4057 : * optimized, but it may be improved for some common cases in the future
4058 : * as needed.
4059 : *
4060 : * @param pabySrcData the source data buffer.
4061 : * @param nSrcOffset the offset (in bits) in pabySrcData to the start of the
4062 : * first word to copy.
4063 : * @param nSrcStep the offset in bits from the start one source word to the
4064 : * start of the next.
4065 : * @param pabyDstData the destination data buffer.
4066 : * @param nDstOffset the offset (in bits) in pabyDstData to the start of the
4067 : * first word to copy over.
4068 : * @param nDstStep the offset in bits from the start one word to the
4069 : * start of the next.
4070 : * @param nBitCount the number of bits in a word to be copied.
4071 : * @param nStepCount the number of words to copy.
4072 : */
4073 :
4074 0 : void GDALCopyBits(const GByte *pabySrcData, int nSrcOffset, int nSrcStep,
4075 : GByte *pabyDstData, int nDstOffset, int nDstStep,
4076 : int nBitCount, int nStepCount)
4077 :
4078 : {
4079 0 : VALIDATE_POINTER0(pabySrcData, "GDALCopyBits");
4080 :
4081 0 : for (int iStep = 0; iStep < nStepCount; iStep++)
4082 : {
4083 0 : for (int iBit = 0; iBit < nBitCount; iBit++)
4084 : {
4085 0 : if (pabySrcData[nSrcOffset >> 3] & (0x80 >> (nSrcOffset & 7)))
4086 0 : pabyDstData[nDstOffset >> 3] |= (0x80 >> (nDstOffset & 7));
4087 : else
4088 0 : pabyDstData[nDstOffset >> 3] &= ~(0x80 >> (nDstOffset & 7));
4089 :
4090 0 : nSrcOffset++;
4091 0 : nDstOffset++;
4092 : }
4093 :
4094 0 : nSrcOffset += (nSrcStep - nBitCount);
4095 0 : nDstOffset += (nDstStep - nBitCount);
4096 : }
4097 : }
4098 :
4099 : /************************************************************************/
4100 : /* GDALGetBestOverviewLevel() */
4101 : /* */
4102 : /* Returns the best overview level to satisfy the query or -1 if none */
4103 : /* Also updates nXOff, nYOff, nXSize, nYSize and psExtraArg when */
4104 : /* returning a valid overview level */
4105 : /************************************************************************/
4106 :
4107 0 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand, int &nXOff, int &nYOff,
4108 : int &nXSize, int &nYSize, int nBufXSize,
4109 : int nBufYSize)
4110 : {
4111 0 : return GDALBandGetBestOverviewLevel2(poBand, nXOff, nYOff, nXSize, nYSize,
4112 0 : nBufXSize, nBufYSize, nullptr);
4113 : }
4114 :
4115 523998 : int GDALBandGetBestOverviewLevel2(GDALRasterBand *poBand, int &nXOff,
4116 : int &nYOff, int &nXSize, int &nYSize,
4117 : int nBufXSize, int nBufYSize,
4118 : GDALRasterIOExtraArg *psExtraArg)
4119 : {
4120 523998 : if (psExtraArg != nullptr && psExtraArg->nVersion > 1 &&
4121 523998 : psExtraArg->bUseOnlyThisScale)
4122 109 : return -1;
4123 : /* -------------------------------------------------------------------- */
4124 : /* Compute the desired downsampling factor. It is */
4125 : /* based on the least reduced axis, and represents the number */
4126 : /* of source pixels to one destination pixel. */
4127 : /* -------------------------------------------------------------------- */
4128 523889 : const double dfDesiredDownsamplingFactor =
4129 523889 : ((nXSize / static_cast<double>(nBufXSize)) <
4130 361551 : (nYSize / static_cast<double>(nBufYSize)) ||
4131 : nBufYSize == 1)
4132 752276 : ? nXSize / static_cast<double>(nBufXSize)
4133 133164 : : nYSize / static_cast<double>(nBufYSize);
4134 :
4135 : /* -------------------------------------------------------------------- */
4136 : /* Find the overview level that largest downsampling factor (most */
4137 : /* downsampled) that is still less than (or only a little more) */
4138 : /* downsampled than the request. */
4139 : /* -------------------------------------------------------------------- */
4140 523889 : const int nOverviewCount = poBand->GetOverviewCount();
4141 523889 : GDALRasterBand *poBestOverview = nullptr;
4142 523889 : double dfBestDownsamplingFactor = 0;
4143 523889 : int nBestOverviewLevel = -1;
4144 :
4145 : const char *pszOversampligThreshold =
4146 523889 : CPLGetConfigOption("GDAL_OVERVIEW_OVERSAMPLING_THRESHOLD", nullptr);
4147 :
4148 : // Note: keep this logic for overview selection in sync between
4149 : // gdalwarp_lib.cpp and rasterio.cpp
4150 : // Cf https://github.com/OSGeo/gdal/pull/9040#issuecomment-1898524693
4151 : const double dfOversamplingThreshold =
4152 1047770 : pszOversampligThreshold ? CPLAtof(pszOversampligThreshold)
4153 523880 : : psExtraArg && psExtraArg->eResampleAlg != GRIORA_NearestNeighbour
4154 1047760 : ? 1.0
4155 523889 : : 1.2;
4156 526585 : for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
4157 : {
4158 5612 : GDALRasterBand *poOverview = poBand->GetOverview(iOverview);
4159 11224 : if (poOverview == nullptr ||
4160 11223 : poOverview->GetXSize() > poBand->GetXSize() ||
4161 5611 : poOverview->GetYSize() > poBand->GetYSize())
4162 : {
4163 1 : continue;
4164 : }
4165 :
4166 : // Compute downsampling factor of this overview
4167 : const double dfDownsamplingFactor = std::min(
4168 5611 : poBand->GetXSize() / static_cast<double>(poOverview->GetXSize()),
4169 11222 : poBand->GetYSize() / static_cast<double>(poOverview->GetYSize()));
4170 :
4171 : // Is it nearly the requested factor and better (lower) than
4172 : // the current best factor?
4173 : // Use an epsilon because of numerical instability.
4174 5611 : constexpr double EPSILON = 1e-1;
4175 5719 : if (dfDownsamplingFactor >=
4176 5611 : dfDesiredDownsamplingFactor * dfOversamplingThreshold +
4177 5503 : EPSILON ||
4178 : dfDownsamplingFactor <= dfBestDownsamplingFactor)
4179 : {
4180 108 : continue;
4181 : }
4182 :
4183 : // Ignore AVERAGE_BIT2GRAYSCALE overviews for RasterIO purposes.
4184 5503 : const char *pszResampling = poOverview->GetMetadataItem("RESAMPLING");
4185 :
4186 5503 : if (pszResampling != nullptr &&
4187 71 : STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2"))
4188 16 : continue;
4189 :
4190 : // OK, this is our new best overview.
4191 5487 : poBestOverview = poOverview;
4192 5487 : nBestOverviewLevel = iOverview;
4193 5487 : dfBestDownsamplingFactor = dfDownsamplingFactor;
4194 :
4195 5487 : if (std::abs(dfDesiredDownsamplingFactor - dfDownsamplingFactor) <
4196 : EPSILON)
4197 : {
4198 2916 : break;
4199 : }
4200 : }
4201 :
4202 : /* -------------------------------------------------------------------- */
4203 : /* If we didn't find an overview that helps us, just return */
4204 : /* indicating failure and the full resolution image will be used. */
4205 : /* -------------------------------------------------------------------- */
4206 523889 : if (nBestOverviewLevel < 0)
4207 520900 : return -1;
4208 :
4209 : /* -------------------------------------------------------------------- */
4210 : /* Recompute the source window in terms of the selected */
4211 : /* overview. */
4212 : /* -------------------------------------------------------------------- */
4213 : const double dfXFactor =
4214 2989 : poBand->GetXSize() / static_cast<double>(poBestOverview->GetXSize());
4215 : const double dfYFactor =
4216 2989 : poBand->GetYSize() / static_cast<double>(poBestOverview->GetYSize());
4217 2989 : CPLDebug("GDAL", "Selecting overview %d x %d", poBestOverview->GetXSize(),
4218 : poBestOverview->GetYSize());
4219 :
4220 8967 : const int nOXOff = std::min(poBestOverview->GetXSize() - 1,
4221 2989 : static_cast<int>(nXOff / dfXFactor + 0.5));
4222 8967 : const int nOYOff = std::min(poBestOverview->GetYSize() - 1,
4223 2989 : static_cast<int>(nYOff / dfYFactor + 0.5));
4224 2989 : int nOXSize = std::max(1, static_cast<int>(nXSize / dfXFactor + 0.5));
4225 2989 : int nOYSize = std::max(1, static_cast<int>(nYSize / dfYFactor + 0.5));
4226 2989 : if (nOXOff + nOXSize > poBestOverview->GetXSize())
4227 0 : nOXSize = poBestOverview->GetXSize() - nOXOff;
4228 2989 : if (nOYOff + nOYSize > poBestOverview->GetYSize())
4229 2 : nOYSize = poBestOverview->GetYSize() - nOYOff;
4230 :
4231 2989 : if (psExtraArg)
4232 : {
4233 2989 : if (psExtraArg->bFloatingPointWindowValidity)
4234 : {
4235 115 : psExtraArg->dfXOff /= dfXFactor;
4236 115 : psExtraArg->dfXSize /= dfXFactor;
4237 115 : psExtraArg->dfYOff /= dfYFactor;
4238 115 : psExtraArg->dfYSize /= dfYFactor;
4239 : }
4240 2874 : else if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
4241 : {
4242 16 : psExtraArg->bFloatingPointWindowValidity = true;
4243 16 : psExtraArg->dfXOff = nXOff / dfXFactor;
4244 16 : psExtraArg->dfXSize = nXSize / dfXFactor;
4245 16 : psExtraArg->dfYOff = nYOff / dfYFactor;
4246 16 : psExtraArg->dfYSize = nYSize / dfYFactor;
4247 : }
4248 : }
4249 :
4250 2989 : nXOff = nOXOff;
4251 2989 : nYOff = nOYOff;
4252 2989 : nXSize = nOXSize;
4253 2989 : nYSize = nOYSize;
4254 :
4255 2989 : return nBestOverviewLevel;
4256 : }
4257 :
4258 : /************************************************************************/
4259 : /* OverviewRasterIO() */
4260 : /* */
4261 : /* Special work function to utilize available overviews to */
4262 : /* more efficiently satisfy downsampled requests. It will */
4263 : /* return CE_Failure if there are no appropriate overviews */
4264 : /* available but it doesn't emit any error messages. */
4265 : /************************************************************************/
4266 :
4267 : //! @cond Doxygen_Suppress
4268 2 : CPLErr GDALRasterBand::OverviewRasterIO(
4269 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4270 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4271 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
4272 :
4273 : {
4274 : GDALRasterIOExtraArg sExtraArg;
4275 2 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4276 :
4277 2 : const int nOverview = GDALBandGetBestOverviewLevel2(
4278 : this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, &sExtraArg);
4279 2 : if (nOverview < 0)
4280 1 : return CE_Failure;
4281 :
4282 : /* -------------------------------------------------------------------- */
4283 : /* Recast the call in terms of the new raster layer. */
4284 : /* -------------------------------------------------------------------- */
4285 1 : GDALRasterBand *poOverviewBand = GetOverview(nOverview);
4286 1 : if (poOverviewBand == nullptr)
4287 0 : return CE_Failure;
4288 :
4289 1 : return poOverviewBand->RasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
4290 : pData, nBufXSize, nBufYSize, eBufType,
4291 1 : nPixelSpace, nLineSpace, &sExtraArg);
4292 : }
4293 :
4294 : /************************************************************************/
4295 : /* TryOverviewRasterIO() */
4296 : /************************************************************************/
4297 :
4298 362417 : CPLErr GDALRasterBand::TryOverviewRasterIO(
4299 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4300 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4301 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg,
4302 : int *pbTried)
4303 : {
4304 362417 : int nXOffMod = nXOff;
4305 362417 : int nYOffMod = nYOff;
4306 362417 : int nXSizeMod = nXSize;
4307 362417 : int nYSizeMod = nYSize;
4308 : GDALRasterIOExtraArg sExtraArg;
4309 :
4310 362417 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4311 :
4312 362417 : int iOvrLevel = GDALBandGetBestOverviewLevel2(
4313 : this, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize, nBufYSize,
4314 : &sExtraArg);
4315 :
4316 362417 : if (iOvrLevel >= 0)
4317 : {
4318 50 : GDALRasterBand *poOverviewBand = GetOverview(iOvrLevel);
4319 50 : if (poOverviewBand)
4320 : {
4321 50 : *pbTried = TRUE;
4322 50 : return poOverviewBand->RasterIO(
4323 : eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData,
4324 : nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
4325 50 : &sExtraArg);
4326 : }
4327 : }
4328 :
4329 362367 : *pbTried = FALSE;
4330 362367 : return CE_None;
4331 : }
4332 :
4333 : /************************************************************************/
4334 : /* TryOverviewRasterIO() */
4335 : /************************************************************************/
4336 :
4337 158605 : CPLErr GDALDataset::TryOverviewRasterIO(
4338 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4339 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4340 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
4341 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg,
4342 : int *pbTried)
4343 : {
4344 158605 : int nXOffMod = nXOff;
4345 158605 : int nYOffMod = nYOff;
4346 158605 : int nXSizeMod = nXSize;
4347 158605 : int nYSizeMod = nYSize;
4348 : GDALRasterIOExtraArg sExtraArg;
4349 158605 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4350 :
4351 317210 : int iOvrLevel = GDALBandGetBestOverviewLevel2(
4352 158605 : papoBands[0], nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize,
4353 : nBufYSize, &sExtraArg);
4354 :
4355 158646 : if (iOvrLevel >= 0 && papoBands[0]->GetOverview(iOvrLevel) != nullptr &&
4356 41 : papoBands[0]->GetOverview(iOvrLevel)->GetDataset() != nullptr)
4357 : {
4358 41 : *pbTried = TRUE;
4359 41 : return papoBands[0]->GetOverview(iOvrLevel)->GetDataset()->RasterIO(
4360 : eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData, nBufXSize,
4361 : nBufYSize, eBufType, nBandCount, panBandMap, nPixelSpace,
4362 41 : nLineSpace, nBandSpace, &sExtraArg);
4363 : }
4364 : else
4365 : {
4366 158564 : *pbTried = FALSE;
4367 158564 : return CE_None;
4368 : }
4369 : }
4370 :
4371 : /************************************************************************/
4372 : /* GetBestOverviewLevel() */
4373 : /* */
4374 : /* Returns the best overview level to satisfy the query or -1 if none */
4375 : /* Also updates nXOff, nYOff, nXSize, nYSize when returning a valid */
4376 : /* overview level */
4377 : /************************************************************************/
4378 :
4379 4 : static int GDALDatasetGetBestOverviewLevel(GDALDataset *poDS, int &nXOff,
4380 : int &nYOff, int &nXSize, int &nYSize,
4381 : int nBufXSize, int nBufYSize,
4382 : int nBandCount,
4383 : const int *panBandMap,
4384 : GDALRasterIOExtraArg *psExtraArg)
4385 : {
4386 4 : int nOverviewCount = 0;
4387 4 : GDALRasterBand *poFirstBand = nullptr;
4388 :
4389 : /* -------------------------------------------------------------------- */
4390 : /* Check that all bands have the same number of overviews and */
4391 : /* that they have all the same size and block dimensions */
4392 : /* -------------------------------------------------------------------- */
4393 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
4394 : {
4395 8 : GDALRasterBand *poBand = poDS->GetRasterBand(panBandMap[iBand]);
4396 8 : if (poBand == nullptr)
4397 0 : return -1;
4398 8 : if (iBand == 0)
4399 : {
4400 4 : poFirstBand = poBand;
4401 4 : nOverviewCount = poBand->GetOverviewCount();
4402 : }
4403 4 : else if (nOverviewCount != poBand->GetOverviewCount())
4404 : {
4405 0 : CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
4406 : "mismatched overview count, use std method.");
4407 0 : return -1;
4408 : }
4409 : else
4410 : {
4411 4 : for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
4412 : {
4413 0 : GDALRasterBand *poOvrBand = poBand->GetOverview(iOverview);
4414 : GDALRasterBand *poOvrFirstBand =
4415 0 : poFirstBand->GetOverview(iOverview);
4416 0 : if (poOvrBand == nullptr || poOvrFirstBand == nullptr)
4417 0 : continue;
4418 :
4419 0 : if (poOvrFirstBand->GetXSize() != poOvrBand->GetXSize() ||
4420 0 : poOvrFirstBand->GetYSize() != poOvrBand->GetYSize())
4421 : {
4422 0 : CPLDebug("GDAL",
4423 : "GDALDataset::GetBestOverviewLevel() ... "
4424 : "mismatched overview sizes, use std method.");
4425 0 : return -1;
4426 : }
4427 0 : int nBlockXSizeFirst = 0;
4428 0 : int nBlockYSizeFirst = 0;
4429 0 : poOvrFirstBand->GetBlockSize(&nBlockXSizeFirst,
4430 : &nBlockYSizeFirst);
4431 :
4432 0 : int nBlockXSizeCurrent = 0;
4433 0 : int nBlockYSizeCurrent = 0;
4434 0 : poOvrBand->GetBlockSize(&nBlockXSizeCurrent,
4435 : &nBlockYSizeCurrent);
4436 :
4437 0 : if (nBlockXSizeFirst != nBlockXSizeCurrent ||
4438 0 : nBlockYSizeFirst != nBlockYSizeCurrent)
4439 : {
4440 0 : CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
4441 : "mismatched block sizes, use std method.");
4442 0 : return -1;
4443 : }
4444 : }
4445 : }
4446 : }
4447 4 : if (poFirstBand == nullptr)
4448 0 : return -1;
4449 :
4450 4 : return GDALBandGetBestOverviewLevel2(poFirstBand, nXOff, nYOff, nXSize,
4451 : nYSize, nBufXSize, nBufYSize,
4452 4 : psExtraArg);
4453 : }
4454 :
4455 : /************************************************************************/
4456 : /* BlockBasedRasterIO() */
4457 : /* */
4458 : /* This convenience function implements a dataset level */
4459 : /* RasterIO() interface based on calling down to fetch blocks, */
4460 : /* much like the GDALRasterBand::IRasterIO(), but it handles */
4461 : /* all bands at once, so that a format driver that handles a */
4462 : /* request for different bands of the same block efficiently */
4463 : /* (i.e. without re-reading interleaved data) will do so efficiently. */
4464 : /* */
4465 : /* This method is intended to be called by an overridden */
4466 : /* IRasterIO() method in the driver specific GDALDataset */
4467 : /* derived class. */
4468 : /* */
4469 : /* Default internal implementation of RasterIO() ... utilizes */
4470 : /* the Block access methods to satisfy the request. This would */
4471 : /* normally only be overridden by formats with overviews. */
4472 : /* */
4473 : /* To keep things relatively simple, this method does not */
4474 : /* currently take advantage of some special cases addressed in */
4475 : /* GDALRasterBand::IRasterIO(), so it is likely best to only */
4476 : /* call it when you know it will help. That is in cases where */
4477 : /* data is at 1:1 to the buffer, and you know the driver is */
4478 : /* implementing interleaved IO efficiently on a block by block */
4479 : /* basis. Overviews will be used when possible. */
4480 : /************************************************************************/
4481 :
4482 64158 : CPLErr GDALDataset::BlockBasedRasterIO(
4483 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4484 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4485 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
4486 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
4487 :
4488 : {
4489 64158 : CPLAssert(nullptr != pData);
4490 :
4491 64158 : GByte **papabySrcBlock = nullptr;
4492 64158 : GDALRasterBlock *poBlock = nullptr;
4493 64158 : GDALRasterBlock **papoBlocks = nullptr;
4494 64158 : int nLBlockX = -1;
4495 64158 : int nLBlockY = -1;
4496 : int iBufYOff;
4497 : int iBufXOff;
4498 64158 : int nBlockXSize = 1;
4499 64158 : int nBlockYSize = 1;
4500 64158 : CPLErr eErr = CE_None;
4501 64158 : GDALDataType eDataType = GDT_UInt8;
4502 :
4503 64158 : const bool bUseIntegerRequestCoords =
4504 64188 : (!psExtraArg->bFloatingPointWindowValidity ||
4505 30 : (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
4506 28 : nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
4507 :
4508 : /* -------------------------------------------------------------------- */
4509 : /* Ensure that all bands share a common block size and data type. */
4510 : /* -------------------------------------------------------------------- */
4511 304098 : for (int iBand = 0; iBand < nBandCount; iBand++)
4512 : {
4513 239940 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
4514 :
4515 239940 : if (iBand == 0)
4516 : {
4517 64158 : poBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
4518 64158 : eDataType = poBand->GetRasterDataType();
4519 : }
4520 : else
4521 : {
4522 175782 : int nThisBlockXSize = 0;
4523 175782 : int nThisBlockYSize = 0;
4524 175782 : poBand->GetBlockSize(&nThisBlockXSize, &nThisBlockYSize);
4525 175782 : if (nThisBlockXSize != nBlockXSize ||
4526 175782 : nThisBlockYSize != nBlockYSize)
4527 : {
4528 0 : CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
4529 : "mismatched block sizes, use std method.");
4530 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
4531 : pData, nBufXSize, nBufYSize, eBufType,
4532 : nBandCount, panBandMap, nPixelSpace,
4533 0 : nLineSpace, nBandSpace, psExtraArg);
4534 : }
4535 :
4536 175782 : if (eDataType != poBand->GetRasterDataType() &&
4537 0 : (nXSize != nBufXSize || nYSize != nBufYSize))
4538 : {
4539 0 : CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
4540 : "mismatched band data types, use std method.");
4541 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
4542 : pData, nBufXSize, nBufYSize, eBufType,
4543 : nBandCount, panBandMap, nPixelSpace,
4544 0 : nLineSpace, nBandSpace, psExtraArg);
4545 : }
4546 : }
4547 : }
4548 :
4549 : /* ==================================================================== */
4550 : /* In this special case at full resolution we step through in */
4551 : /* blocks, turning the request over to the per-band */
4552 : /* IRasterIO(), but ensuring that all bands of one block are */
4553 : /* called before proceeding to the next. */
4554 : /* ==================================================================== */
4555 :
4556 64158 : if (nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
4557 : {
4558 : GDALRasterIOExtraArg sDummyExtraArg;
4559 64154 : INIT_RASTERIO_EXTRA_ARG(sDummyExtraArg);
4560 :
4561 64154 : int nChunkYSize = 0;
4562 64154 : int nChunkXSize = 0;
4563 :
4564 210793 : for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff += nChunkYSize)
4565 : {
4566 147654 : const int nChunkYOff = iBufYOff + nYOff;
4567 147654 : nChunkYSize = nBlockYSize - (nChunkYOff % nBlockYSize);
4568 147654 : if (nChunkYOff + nChunkYSize > nYOff + nYSize)
4569 59192 : nChunkYSize = (nYOff + nYSize) - nChunkYOff;
4570 :
4571 818583 : for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff += nChunkXSize)
4572 : {
4573 671943 : const int nChunkXOff = iBufXOff + nXOff;
4574 671943 : nChunkXSize = nBlockXSize - (nChunkXOff % nBlockXSize);
4575 671943 : if (nChunkXOff + nChunkXSize > nXOff + nXSize)
4576 70386 : nChunkXSize = (nXOff + nXSize) - nChunkXOff;
4577 :
4578 671943 : GByte *pabyChunkData =
4579 671943 : static_cast<GByte *>(pData) + iBufXOff * nPixelSpace +
4580 671943 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace;
4581 :
4582 3271560 : for (int iBand = 0; iBand < nBandCount; iBand++)
4583 : {
4584 2600630 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
4585 :
4586 5201250 : eErr = poBand->IRasterIO(
4587 : eRWFlag, nChunkXOff, nChunkYOff, nChunkXSize,
4588 : nChunkYSize,
4589 2600630 : pabyChunkData +
4590 2600630 : static_cast<GPtrDiff_t>(iBand) * nBandSpace,
4591 : nChunkXSize, nChunkYSize, eBufType, nPixelSpace,
4592 2600630 : nLineSpace, &sDummyExtraArg);
4593 2600630 : if (eErr != CE_None)
4594 1014 : return eErr;
4595 : }
4596 : }
4597 :
4598 165469 : if (psExtraArg->pfnProgress != nullptr &&
4599 18829 : !psExtraArg->pfnProgress(
4600 165469 : 1.0 * std::min(nBufYSize, iBufYOff + nChunkYSize) /
4601 : nBufYSize,
4602 : "", psExtraArg->pProgressData))
4603 : {
4604 1 : return CE_Failure;
4605 : }
4606 : }
4607 :
4608 63139 : return CE_None;
4609 : }
4610 :
4611 : /* Below code is not compatible with that case. It would need a complete */
4612 : /* separate code like done in GDALRasterBand::IRasterIO. */
4613 4 : if (eRWFlag == GF_Write && (nBufXSize < nXSize || nBufYSize < nYSize))
4614 : {
4615 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
4616 : nBufXSize, nBufYSize, eBufType, nBandCount,
4617 : panBandMap, nPixelSpace, nLineSpace,
4618 0 : nBandSpace, psExtraArg);
4619 : }
4620 :
4621 : /* We could have a smarter implementation, but that will do for now */
4622 4 : if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
4623 0 : (nBufXSize != nXSize || nBufYSize != nYSize))
4624 : {
4625 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
4626 : nBufXSize, nBufYSize, eBufType, nBandCount,
4627 : panBandMap, nPixelSpace, nLineSpace,
4628 0 : nBandSpace, psExtraArg);
4629 : }
4630 :
4631 : /* ==================================================================== */
4632 : /* Loop reading required source blocks to satisfy output */
4633 : /* request. This is the most general implementation. */
4634 : /* ==================================================================== */
4635 :
4636 4 : const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
4637 :
4638 : papabySrcBlock =
4639 4 : static_cast<GByte **>(CPLCalloc(sizeof(GByte *), nBandCount));
4640 : papoBlocks =
4641 4 : static_cast<GDALRasterBlock **>(CPLCalloc(sizeof(void *), nBandCount));
4642 :
4643 : /* -------------------------------------------------------------------- */
4644 : /* Select an overview level if appropriate. */
4645 : /* -------------------------------------------------------------------- */
4646 :
4647 : GDALRasterIOExtraArg sExtraArg;
4648 4 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4649 4 : const int nOverviewLevel = GDALDatasetGetBestOverviewLevel(
4650 : this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nBandCount,
4651 : panBandMap, &sExtraArg);
4652 4 : if (nOverviewLevel >= 0)
4653 : {
4654 2 : GetRasterBand(panBandMap[0])
4655 2 : ->GetOverview(nOverviewLevel)
4656 2 : ->GetBlockSize(&nBlockXSize, &nBlockYSize);
4657 : }
4658 :
4659 4 : double dfXOff = nXOff;
4660 4 : double dfYOff = nYOff;
4661 4 : double dfXSize = nXSize;
4662 4 : double dfYSize = nYSize;
4663 4 : if (sExtraArg.bFloatingPointWindowValidity)
4664 : {
4665 2 : dfXOff = sExtraArg.dfXOff;
4666 2 : dfYOff = sExtraArg.dfYOff;
4667 2 : dfXSize = sExtraArg.dfXSize;
4668 2 : dfYSize = sExtraArg.dfYSize;
4669 : }
4670 :
4671 : /* -------------------------------------------------------------------- */
4672 : /* Compute stepping increment. */
4673 : /* -------------------------------------------------------------------- */
4674 4 : const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
4675 4 : const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
4676 :
4677 4 : constexpr double EPS = 1e-10;
4678 : /* -------------------------------------------------------------------- */
4679 : /* Loop over buffer computing source locations. */
4680 : /* -------------------------------------------------------------------- */
4681 36 : for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
4682 : {
4683 : GPtrDiff_t iSrcOffset;
4684 :
4685 : // Add small epsilon to avoid some numeric precision issues.
4686 32 : const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
4687 32 : const int iSrcY = static_cast<int>(std::min(
4688 32 : std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
4689 :
4690 32 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
4691 : static_cast<GPtrDiff_t>(nLineSpace);
4692 :
4693 302 : for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff++)
4694 : {
4695 270 : const double dfSrcX = (iBufXOff + 0.5) * dfSrcXInc + dfXOff + EPS;
4696 270 : const int iSrcX = static_cast<int>(std::min(
4697 270 : std::max(0.0, dfSrcX), static_cast<double>(nRasterXSize - 1)));
4698 :
4699 : // FIXME: this code likely doesn't work if the dirty block gets
4700 : // flushed to disk before being completely written. In the meantime,
4701 : // bJustInitialize should probably be set to FALSE even if it is not
4702 : // ideal performance wise, and for lossy compression
4703 :
4704 : /* --------------------------------------------------------------------
4705 : */
4706 : /* Ensure we have the appropriate block loaded. */
4707 : /* --------------------------------------------------------------------
4708 : */
4709 270 : if (iSrcX < nLBlockX * nBlockXSize ||
4710 270 : iSrcX - nBlockXSize >= nLBlockX * nBlockXSize ||
4711 266 : iSrcY < nLBlockY * nBlockYSize ||
4712 266 : iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
4713 : {
4714 4 : nLBlockX = iSrcX / nBlockXSize;
4715 4 : nLBlockY = iSrcY / nBlockYSize;
4716 :
4717 4 : const bool bJustInitialize =
4718 0 : eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
4719 0 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
4720 4 : nXOff <= nLBlockX * nBlockXSize &&
4721 0 : nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
4722 : /*bool bMemZeroBuffer = FALSE;
4723 : if( eRWFlag == GF_Write && !bJustInitialize &&
4724 : nXOff <= nLBlockX * nBlockXSize &&
4725 : nYOff <= nLBlockY * nBlockYSize &&
4726 : (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
4727 : (nXOff + nXSize == GetRasterXSize() &&
4728 : (nLBlockX+1) * nBlockXSize > GetRasterXSize())) &&
4729 : (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
4730 : (nYOff + nYSize == GetRasterYSize() &&
4731 : (nLBlockY+1) * nBlockYSize > GetRasterYSize())) )
4732 : {
4733 : bJustInitialize = TRUE;
4734 : bMemZeroBuffer = TRUE;
4735 : }*/
4736 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
4737 : {
4738 8 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
4739 8 : if (nOverviewLevel >= 0)
4740 2 : poBand = poBand->GetOverview(nOverviewLevel);
4741 16 : poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY,
4742 8 : bJustInitialize);
4743 8 : if (poBlock == nullptr)
4744 : {
4745 0 : eErr = CE_Failure;
4746 0 : goto CleanupAndReturn;
4747 : }
4748 :
4749 8 : if (eRWFlag == GF_Write)
4750 0 : poBlock->MarkDirty();
4751 :
4752 8 : if (papoBlocks[iBand] != nullptr)
4753 0 : papoBlocks[iBand]->DropLock();
4754 :
4755 8 : papoBlocks[iBand] = poBlock;
4756 :
4757 8 : papabySrcBlock[iBand] =
4758 8 : static_cast<GByte *>(poBlock->GetDataRef());
4759 : /*if( bMemZeroBuffer )
4760 : {
4761 : memset(papabySrcBlock[iBand], 0,
4762 : static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
4763 : * nBlockYSize);
4764 : }*/
4765 : }
4766 : }
4767 :
4768 : /* --------------------------------------------------------------------
4769 : */
4770 : /* Copy over this pixel of data. */
4771 : /* --------------------------------------------------------------------
4772 : */
4773 270 : iSrcOffset = (static_cast<GPtrDiff_t>(iSrcX) -
4774 270 : static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
4775 270 : (static_cast<GPtrDiff_t>(iSrcY) -
4776 270 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
4777 270 : nBlockXSize) *
4778 270 : nBandDataSize;
4779 :
4780 980 : for (int iBand = 0; iBand < nBandCount; iBand++)
4781 : {
4782 710 : GByte *pabySrcBlock = papabySrcBlock[iBand];
4783 710 : GPtrDiff_t iBandBufOffset =
4784 710 : iBufOffset + static_cast<GPtrDiff_t>(iBand) *
4785 : static_cast<GPtrDiff_t>(nBandSpace);
4786 :
4787 710 : if (eDataType == eBufType)
4788 : {
4789 710 : if (eRWFlag == GF_Read)
4790 710 : memcpy(static_cast<GByte *>(pData) + iBandBufOffset,
4791 710 : pabySrcBlock + iSrcOffset, nBandDataSize);
4792 : else
4793 0 : memcpy(pabySrcBlock + iSrcOffset,
4794 : static_cast<const GByte *>(pData) +
4795 0 : iBandBufOffset,
4796 : nBandDataSize);
4797 : }
4798 : else
4799 : {
4800 : /* type to type conversion ... ouch, this is expensive way
4801 : of handling single words */
4802 :
4803 0 : if (eRWFlag == GF_Read)
4804 0 : GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
4805 : static_cast<GByte *>(pData) +
4806 0 : iBandBufOffset,
4807 : eBufType, 0, 1);
4808 : else
4809 0 : GDALCopyWords64(static_cast<const GByte *>(pData) +
4810 0 : iBandBufOffset,
4811 0 : eBufType, 0, pabySrcBlock + iSrcOffset,
4812 : eDataType, 0, 1);
4813 : }
4814 : }
4815 :
4816 270 : iBufOffset += static_cast<int>(nPixelSpace);
4817 : }
4818 : }
4819 :
4820 : /* -------------------------------------------------------------------- */
4821 : /* CleanupAndReturn. */
4822 : /* -------------------------------------------------------------------- */
4823 4 : CleanupAndReturn:
4824 4 : CPLFree(papabySrcBlock);
4825 4 : if (papoBlocks != nullptr)
4826 : {
4827 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
4828 : {
4829 8 : if (papoBlocks[iBand] != nullptr)
4830 8 : papoBlocks[iBand]->DropLock();
4831 : }
4832 4 : CPLFree(papoBlocks);
4833 : }
4834 :
4835 4 : return eErr;
4836 : }
4837 :
4838 : //! @endcond
4839 :
4840 : /************************************************************************/
4841 : /* GDALCopyWholeRasterGetSwathSize() */
4842 : /************************************************************************/
4843 :
4844 3293 : static void GDALCopyWholeRasterGetSwathSize(GDALRasterBand *poSrcPrototypeBand,
4845 : GDALRasterBand *poDstPrototypeBand,
4846 : int nBandCount,
4847 : int bDstIsCompressed,
4848 : int bInterleave, int *pnSwathCols,
4849 : int *pnSwathLines)
4850 : {
4851 3293 : GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
4852 3293 : int nSrcBlockXSize = 0;
4853 3293 : int nSrcBlockYSize = 0;
4854 3293 : int nBlockXSize = 0;
4855 3293 : int nBlockYSize = 0;
4856 :
4857 3293 : int nXSize = poSrcPrototypeBand->GetXSize();
4858 3293 : int nYSize = poSrcPrototypeBand->GetYSize();
4859 :
4860 3293 : poSrcPrototypeBand->GetBlockSize(&nSrcBlockXSize, &nSrcBlockYSize);
4861 3293 : poDstPrototypeBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
4862 :
4863 3293 : const int nMaxBlockXSize = std::max(nBlockXSize, nSrcBlockXSize);
4864 3293 : const int nMaxBlockYSize = std::max(nBlockYSize, nSrcBlockYSize);
4865 :
4866 3293 : int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
4867 3293 : if (bInterleave)
4868 556 : nPixelSize *= nBandCount;
4869 :
4870 : // aim for one row of blocks. Do not settle for less.
4871 3293 : int nSwathCols = nXSize;
4872 3293 : int nSwathLines = nMaxBlockYSize;
4873 :
4874 : const char *pszSrcCompression =
4875 3293 : poSrcPrototypeBand->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
4876 3293 : if (pszSrcCompression == nullptr)
4877 : {
4878 3267 : auto poSrcDS = poSrcPrototypeBand->GetDataset();
4879 3267 : if (poSrcDS)
4880 : pszSrcCompression =
4881 3261 : poSrcDS->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
4882 : }
4883 :
4884 : /* -------------------------------------------------------------------- */
4885 : /* What will our swath size be? */
4886 : /* -------------------------------------------------------------------- */
4887 : // When writing interleaved data in a compressed format, we want to be sure
4888 : // that each block will only be written once, so the swath size must not be
4889 : // greater than the block cache.
4890 3293 : const char *pszSwathSize = CPLGetConfigOption("GDAL_SWATH_SIZE", nullptr);
4891 : int nTargetSwathSize;
4892 3293 : if (pszSwathSize != nullptr)
4893 0 : nTargetSwathSize = static_cast<int>(
4894 0 : std::min(GIntBig(INT_MAX), CPLAtoGIntBig(pszSwathSize)));
4895 : else
4896 : {
4897 : // As a default, take one 1/4 of the cache size.
4898 3293 : nTargetSwathSize = static_cast<int>(
4899 3293 : std::min(GIntBig(INT_MAX), GDALGetCacheMax64() / 4));
4900 :
4901 : // but if the minimum ideal swath buf size is less, then go for it to
4902 : // avoid unnecessarily abusing RAM usage.
4903 : // but try to use 10 MB at least.
4904 3293 : GIntBig nIdealSwathBufSize =
4905 3293 : static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize;
4906 3293 : int nMinTargetSwathSize = 10 * 1000 * 1000;
4907 :
4908 3293 : if ((poSrcPrototypeBand->GetSuggestedBlockAccessPattern() &
4909 3293 : GSBAP_LARGEST_CHUNK_POSSIBLE) != 0)
4910 : {
4911 1 : nMinTargetSwathSize = nTargetSwathSize;
4912 : }
4913 :
4914 3293 : if (nIdealSwathBufSize < nTargetSwathSize &&
4915 3283 : nIdealSwathBufSize < nMinTargetSwathSize)
4916 : {
4917 3280 : nIdealSwathBufSize = nMinTargetSwathSize;
4918 : }
4919 :
4920 3293 : if (pszSrcCompression != nullptr &&
4921 184 : EQUAL(pszSrcCompression, "JPEG2000") &&
4922 0 : (!bDstIsCompressed || ((nSrcBlockXSize % nBlockXSize) == 0 &&
4923 0 : (nSrcBlockYSize % nBlockYSize) == 0)))
4924 : {
4925 2 : nIdealSwathBufSize =
4926 4 : std::max(nIdealSwathBufSize, static_cast<GIntBig>(nSwathCols) *
4927 2 : nSrcBlockYSize * nPixelSize);
4928 : }
4929 3293 : if (nTargetSwathSize > nIdealSwathBufSize)
4930 3280 : nTargetSwathSize = static_cast<int>(
4931 3280 : std::min(GIntBig(INT_MAX), nIdealSwathBufSize));
4932 : }
4933 :
4934 3293 : if (nTargetSwathSize < 1000000)
4935 8 : nTargetSwathSize = 1000000;
4936 :
4937 : /* But let's check that */
4938 3514 : if (bDstIsCompressed && bInterleave &&
4939 221 : nTargetSwathSize > GDALGetCacheMax64())
4940 : {
4941 0 : CPLError(CE_Warning, CPLE_AppDefined,
4942 : "When translating into a compressed interleave format, "
4943 : "the block cache size (" CPL_FRMT_GIB ") "
4944 : "should be at least the size of the swath (%d) "
4945 : "(GDAL_SWATH_SIZE config. option)",
4946 : GDALGetCacheMax64(), nTargetSwathSize);
4947 : }
4948 :
4949 : #define IS_DIVIDER_OF(x, y) ((y) % (x) == 0)
4950 : #define ROUND_TO(x, y) (((x) / (y)) * (y))
4951 :
4952 : // if both input and output datasets are tiled, that the tile dimensions
4953 : // are "compatible", try to stick to a swath dimension that is a multiple
4954 : // of input and output block dimensions.
4955 3293 : if (nBlockXSize != nXSize && nSrcBlockXSize != nXSize &&
4956 43 : IS_DIVIDER_OF(nBlockXSize, nMaxBlockXSize) &&
4957 43 : IS_DIVIDER_OF(nSrcBlockXSize, nMaxBlockXSize) &&
4958 43 : IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
4959 43 : IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
4960 : {
4961 43 : if (static_cast<GIntBig>(nMaxBlockXSize) * nMaxBlockYSize *
4962 43 : nPixelSize <=
4963 43 : static_cast<GIntBig>(nTargetSwathSize))
4964 : {
4965 43 : nSwathCols = nTargetSwathSize / (nMaxBlockYSize * nPixelSize);
4966 43 : nSwathCols = ROUND_TO(nSwathCols, nMaxBlockXSize);
4967 43 : if (nSwathCols == 0)
4968 0 : nSwathCols = nMaxBlockXSize;
4969 43 : if (nSwathCols > nXSize)
4970 41 : nSwathCols = nXSize;
4971 43 : nSwathLines = nMaxBlockYSize;
4972 :
4973 43 : if (static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize >
4974 43 : static_cast<GIntBig>(nTargetSwathSize))
4975 : {
4976 0 : nSwathCols = nXSize;
4977 0 : nSwathLines = nBlockYSize;
4978 : }
4979 : }
4980 : }
4981 :
4982 3293 : const GIntBig nMemoryPerCol = static_cast<GIntBig>(nSwathCols) * nPixelSize;
4983 3293 : const GIntBig nSwathBufSize = nMemoryPerCol * nSwathLines;
4984 3293 : if (nSwathBufSize > static_cast<GIntBig>(nTargetSwathSize))
4985 : {
4986 1 : nSwathLines = static_cast<int>(nTargetSwathSize / nMemoryPerCol);
4987 1 : if (nSwathLines == 0)
4988 1 : nSwathLines = 1;
4989 :
4990 1 : CPLDebug(
4991 : "GDAL",
4992 : "GDALCopyWholeRasterGetSwathSize(): adjusting to %d line swath "
4993 : "since requirement (" CPL_FRMT_GIB " bytes) exceed target swath "
4994 : "size (%d bytes) (GDAL_SWATH_SIZE config. option)",
4995 1 : nSwathLines, nBlockYSize * nMemoryPerCol, nTargetSwathSize);
4996 : }
4997 : // If we are processing single scans, try to handle several at once.
4998 : // If we are handling swaths already, only grow the swath if a row
4999 : // of blocks is substantially less than our target buffer size.
5000 3292 : else if (nSwathLines == 1 ||
5001 2737 : nMemoryPerCol * nSwathLines <
5002 2737 : static_cast<GIntBig>(nTargetSwathSize) / 10)
5003 : {
5004 3264 : nSwathLines = std::min(
5005 : nYSize,
5006 3264 : std::max(1, static_cast<int>(nTargetSwathSize / nMemoryPerCol)));
5007 :
5008 : /* If possible try to align to source and target block height */
5009 3264 : if ((nSwathLines % nMaxBlockYSize) != 0 &&
5010 260 : nSwathLines > nMaxBlockYSize &&
5011 260 : IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
5012 231 : IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
5013 209 : nSwathLines = ROUND_TO(nSwathLines, nMaxBlockYSize);
5014 : }
5015 :
5016 3293 : if (pszSrcCompression != nullptr && EQUAL(pszSrcCompression, "JPEG2000") &&
5017 0 : (!bDstIsCompressed || (IS_DIVIDER_OF(nBlockXSize, nSrcBlockXSize) &&
5018 0 : IS_DIVIDER_OF(nBlockYSize, nSrcBlockYSize))))
5019 : {
5020 : // Typical use case: converting from Pleaiades that is 2048x2048 tiled.
5021 2 : if (nSwathLines < nSrcBlockYSize)
5022 : {
5023 0 : nSwathLines = nSrcBlockYSize;
5024 :
5025 : // Number of pixels that can be read/write simultaneously.
5026 0 : nSwathCols = nTargetSwathSize / (nSrcBlockXSize * nPixelSize);
5027 0 : nSwathCols = ROUND_TO(nSwathCols, nSrcBlockXSize);
5028 0 : if (nSwathCols == 0)
5029 0 : nSwathCols = nSrcBlockXSize;
5030 0 : if (nSwathCols > nXSize)
5031 0 : nSwathCols = nXSize;
5032 :
5033 0 : CPLDebug(
5034 : "GDAL",
5035 : "GDALCopyWholeRasterGetSwathSize(): because of compression and "
5036 : "too high block, "
5037 : "use partial width at one time");
5038 : }
5039 2 : else if ((nSwathLines % nSrcBlockYSize) != 0)
5040 : {
5041 : /* Round on a multiple of nSrcBlockYSize */
5042 0 : nSwathLines = ROUND_TO(nSwathLines, nSrcBlockYSize);
5043 0 : CPLDebug(
5044 : "GDAL",
5045 : "GDALCopyWholeRasterGetSwathSize(): because of compression, "
5046 : "round nSwathLines to block height : %d",
5047 : nSwathLines);
5048 : }
5049 : }
5050 3291 : else if (bDstIsCompressed)
5051 : {
5052 415 : if (nSwathLines < nBlockYSize)
5053 : {
5054 146 : nSwathLines = nBlockYSize;
5055 :
5056 : // Number of pixels that can be read/write simultaneously.
5057 146 : nSwathCols = nTargetSwathSize / (nSwathLines * nPixelSize);
5058 146 : nSwathCols = ROUND_TO(nSwathCols, nBlockXSize);
5059 146 : if (nSwathCols == 0)
5060 0 : nSwathCols = nBlockXSize;
5061 146 : if (nSwathCols > nXSize)
5062 146 : nSwathCols = nXSize;
5063 :
5064 146 : CPLDebug(
5065 : "GDAL",
5066 : "GDALCopyWholeRasterGetSwathSize(): because of compression and "
5067 : "too high block, "
5068 : "use partial width at one time");
5069 : }
5070 269 : else if ((nSwathLines % nBlockYSize) != 0)
5071 : {
5072 : // Round on a multiple of nBlockYSize.
5073 9 : nSwathLines = ROUND_TO(nSwathLines, nBlockYSize);
5074 9 : CPLDebug(
5075 : "GDAL",
5076 : "GDALCopyWholeRasterGetSwathSize(): because of compression, "
5077 : "round nSwathLines to block height : %d",
5078 : nSwathLines);
5079 : }
5080 : }
5081 :
5082 3293 : *pnSwathCols = nSwathCols;
5083 3293 : *pnSwathLines = nSwathLines;
5084 3293 : }
5085 :
5086 : /************************************************************************/
5087 : /* GDALDatasetCopyWholeRaster() */
5088 : /************************************************************************/
5089 :
5090 : /**
5091 : * \brief Copy all dataset raster data.
5092 : *
5093 : * This function copies the complete raster contents of one dataset to
5094 : * another similarly configured dataset. The source and destination
5095 : * dataset must have the same number of bands, and the same width
5096 : * and height. The bands do not have to have the same data type.
5097 : *
5098 : * This function is primarily intended to support implementation of
5099 : * driver specific CreateCopy() functions. It implements efficient copying,
5100 : * in particular "chunking" the copy in substantial blocks and, if appropriate,
5101 : * performing the transfer in a pixel interleaved fashion.
5102 : *
5103 : * Currently the only papszOptions value supported are :
5104 : * <ul>
5105 : * <li>"INTERLEAVE=PIXEL/BAND" to force pixel (resp. band) interleaved read and
5106 : * write access pattern (this does not modify the layout of the destination
5107 : * data)</li> <li>"COMPRESSED=YES" to force alignment on target dataset block
5108 : * sizes to achieve best compression.</li> <li>"SKIP_HOLES=YES" to skip chunks
5109 : * for which GDALGetDataCoverageStatus() returns GDAL_DATA_COVERAGE_STATUS_EMPTY
5110 : * (GDAL >= 2.2)</li>
5111 : * </ul>
5112 : * More options may be supported in the future.
5113 : *
5114 : * @param hSrcDS the source dataset
5115 : * @param hDstDS the destination dataset
5116 : * @param papszOptions transfer hints in "StringList" Name=Value format.
5117 : * @param pfnProgress progress reporting function.
5118 : * @param pProgressData callback data for progress function.
5119 : *
5120 : * @return CE_None on success, or CE_Failure on failure.
5121 : */
5122 :
5123 3265 : CPLErr CPL_STDCALL GDALDatasetCopyWholeRaster(GDALDatasetH hSrcDS,
5124 : GDALDatasetH hDstDS,
5125 : CSLConstList papszOptions,
5126 : GDALProgressFunc pfnProgress,
5127 : void *pProgressData)
5128 :
5129 : {
5130 3265 : VALIDATE_POINTER1(hSrcDS, "GDALDatasetCopyWholeRaster", CE_Failure);
5131 3265 : VALIDATE_POINTER1(hDstDS, "GDALDatasetCopyWholeRaster", CE_Failure);
5132 :
5133 3265 : GDALDataset *poSrcDS = GDALDataset::FromHandle(hSrcDS);
5134 3265 : GDALDataset *poDstDS = GDALDataset::FromHandle(hDstDS);
5135 :
5136 3265 : if (pfnProgress == nullptr)
5137 0 : pfnProgress = GDALDummyProgress;
5138 :
5139 : /* -------------------------------------------------------------------- */
5140 : /* Confirm the datasets match in size and band counts. */
5141 : /* -------------------------------------------------------------------- */
5142 3265 : const int nXSize = poDstDS->GetRasterXSize();
5143 3265 : const int nYSize = poDstDS->GetRasterYSize();
5144 3265 : const int nBandCount = poDstDS->GetRasterCount();
5145 :
5146 3265 : if (poSrcDS->GetRasterXSize() != nXSize ||
5147 6530 : poSrcDS->GetRasterYSize() != nYSize ||
5148 3265 : poSrcDS->GetRasterCount() != nBandCount)
5149 : {
5150 0 : CPLError(CE_Failure, CPLE_AppDefined,
5151 : "Input and output dataset sizes or band counts do not\n"
5152 : "match in GDALDatasetCopyWholeRaster()");
5153 0 : return CE_Failure;
5154 : }
5155 :
5156 : /* -------------------------------------------------------------------- */
5157 : /* Report preliminary (0) progress. */
5158 : /* -------------------------------------------------------------------- */
5159 3265 : if (!pfnProgress(0.0, nullptr, pProgressData))
5160 : {
5161 1 : CPLError(CE_Failure, CPLE_UserInterrupt,
5162 : "User terminated CreateCopy()");
5163 1 : return CE_Failure;
5164 : }
5165 :
5166 : /* -------------------------------------------------------------------- */
5167 : /* Get our prototype band, and assume the others are similarly */
5168 : /* configured. */
5169 : /* -------------------------------------------------------------------- */
5170 3264 : if (nBandCount == 0)
5171 0 : return CE_None;
5172 :
5173 3264 : GDALRasterBand *poSrcPrototypeBand = poSrcDS->GetRasterBand(1);
5174 3264 : GDALRasterBand *poDstPrototypeBand = poDstDS->GetRasterBand(1);
5175 3264 : GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
5176 :
5177 : /* -------------------------------------------------------------------- */
5178 : /* Do we want to try and do the operation in a pixel */
5179 : /* interleaved fashion? */
5180 : /* -------------------------------------------------------------------- */
5181 3264 : bool bInterleave = false;
5182 : const char *pszInterleave =
5183 3264 : poSrcDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
5184 3264 : if (pszInterleave != nullptr &&
5185 2873 : (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
5186 189 : bInterleave = true;
5187 :
5188 3264 : pszInterleave = poDstDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
5189 3264 : if (pszInterleave != nullptr &&
5190 2797 : (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
5191 503 : bInterleave = true;
5192 :
5193 3264 : pszInterleave = CSLFetchNameValue(papszOptions, "INTERLEAVE");
5194 3264 : if (pszInterleave != nullptr && EQUAL(pszInterleave, "PIXEL"))
5195 5 : bInterleave = true;
5196 3259 : else if (pszInterleave != nullptr && EQUAL(pszInterleave, "BAND"))
5197 13 : bInterleave = false;
5198 : // attributes is specific to the TileDB driver
5199 3246 : else if (pszInterleave != nullptr && EQUAL(pszInterleave, "ATTRIBUTES"))
5200 4 : bInterleave = true;
5201 3242 : else if (pszInterleave != nullptr)
5202 : {
5203 0 : CPLError(CE_Warning, CPLE_NotSupported,
5204 : "Unsupported value for option INTERLEAVE");
5205 : }
5206 :
5207 : // If the destination is compressed, we must try to write blocks just once,
5208 : // to save disk space (GTiff case for example), and to avoid data loss
5209 : // (JPEG compression for example).
5210 3264 : bool bDstIsCompressed = false;
5211 : const char *pszDstCompressed =
5212 3264 : CSLFetchNameValue(papszOptions, "COMPRESSED");
5213 3264 : if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
5214 389 : bDstIsCompressed = true;
5215 :
5216 : /* -------------------------------------------------------------------- */
5217 : /* What will our swath size be? */
5218 : /* -------------------------------------------------------------------- */
5219 :
5220 3264 : int nSwathCols = 0;
5221 3264 : int nSwathLines = 0;
5222 3264 : GDALCopyWholeRasterGetSwathSize(poSrcPrototypeBand, poDstPrototypeBand,
5223 : nBandCount, bDstIsCompressed, bInterleave,
5224 : &nSwathCols, &nSwathLines);
5225 :
5226 3264 : int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5227 3264 : if (bInterleave)
5228 556 : nPixelSize *= nBandCount;
5229 :
5230 3264 : void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
5231 3264 : if (pSwathBuf == nullptr)
5232 : {
5233 0 : return CE_Failure;
5234 : }
5235 :
5236 3264 : CPLDebug("GDAL",
5237 : "GDALDatasetCopyWholeRaster(): %d*%d swaths, bInterleave=%d",
5238 : nSwathCols, nSwathLines, static_cast<int>(bInterleave));
5239 :
5240 : // Advise the source raster that we are going to read it completely
5241 : // Note: this might already have been done by GDALCreateCopy() in the
5242 : // likely case this function is indirectly called by it
5243 3264 : poSrcDS->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nBandCount,
5244 3264 : nullptr, nullptr);
5245 :
5246 : /* ==================================================================== */
5247 : /* Band oriented (uninterleaved) case. */
5248 : /* ==================================================================== */
5249 3264 : CPLErr eErr = CE_None;
5250 : const bool bCheckHoles =
5251 3264 : CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
5252 :
5253 3264 : if (!bInterleave)
5254 : {
5255 : GDALRasterIOExtraArg sExtraArg;
5256 2708 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5257 2708 : CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress); // to make cppcheck happy
5258 :
5259 8124 : const GIntBig nTotalBlocks = static_cast<GIntBig>(nBandCount) *
5260 2708 : DIV_ROUND_UP(nYSize, nSwathLines) *
5261 2708 : DIV_ROUND_UP(nXSize, nSwathCols);
5262 2708 : GIntBig nBlocksDone = 0;
5263 :
5264 7832 : for (int iBand = 0; iBand < nBandCount && eErr == CE_None; iBand++)
5265 : {
5266 5124 : int nBand = iBand + 1;
5267 :
5268 10506 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5269 : {
5270 5382 : int nThisLines = nSwathLines;
5271 :
5272 5382 : if (iY + nThisLines > nYSize)
5273 363 : nThisLines = nYSize - iY;
5274 :
5275 10764 : for (int iX = 0; iX < nXSize && eErr == CE_None;
5276 5382 : iX += nSwathCols)
5277 : {
5278 5382 : int nThisCols = nSwathCols;
5279 :
5280 5382 : if (iX + nThisCols > nXSize)
5281 0 : nThisCols = nXSize - iX;
5282 :
5283 5382 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5284 5382 : if (bCheckHoles)
5285 : {
5286 : nStatus = poSrcDS->GetRasterBand(nBand)
5287 3722 : ->GetDataCoverageStatus(
5288 : iX, iY, nThisCols, nThisLines,
5289 : GDAL_DATA_COVERAGE_STATUS_DATA);
5290 : }
5291 5382 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5292 : {
5293 5378 : sExtraArg.pfnProgress = GDALScaledProgress;
5294 10756 : sExtraArg.pProgressData = GDALCreateScaledProgress(
5295 5378 : nBlocksDone / static_cast<double>(nTotalBlocks),
5296 5378 : (nBlocksDone + 0.5) /
5297 5378 : static_cast<double>(nTotalBlocks),
5298 : pfnProgress, pProgressData);
5299 5378 : if (sExtraArg.pProgressData == nullptr)
5300 1630 : sExtraArg.pfnProgress = nullptr;
5301 :
5302 5378 : eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
5303 : nThisLines, pSwathBuf,
5304 : nThisCols, nThisLines, eDT, 1,
5305 : &nBand, 0, 0, 0, &sExtraArg);
5306 :
5307 5378 : GDALDestroyScaledProgress(sExtraArg.pProgressData);
5308 :
5309 5378 : if (eErr == CE_None)
5310 5371 : eErr = poDstDS->RasterIO(
5311 : GF_Write, iX, iY, nThisCols, nThisLines,
5312 : pSwathBuf, nThisCols, nThisLines, eDT, 1,
5313 : &nBand, 0, 0, 0, nullptr);
5314 : }
5315 :
5316 5382 : nBlocksDone++;
5317 10722 : if (eErr == CE_None &&
5318 5340 : !pfnProgress(nBlocksDone /
5319 5340 : static_cast<double>(nTotalBlocks),
5320 : nullptr, pProgressData))
5321 : {
5322 2 : eErr = CE_Failure;
5323 2 : CPLError(CE_Failure, CPLE_UserInterrupt,
5324 : "User terminated CreateCopy()");
5325 : }
5326 : }
5327 : }
5328 : }
5329 : }
5330 :
5331 : /* ==================================================================== */
5332 : /* Pixel interleaved case. */
5333 : /* ==================================================================== */
5334 : else /* if( bInterleave ) */
5335 : {
5336 : GDALRasterIOExtraArg sExtraArg;
5337 556 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5338 556 : CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress); // to make cppcheck happy
5339 :
5340 556 : const GIntBig nTotalBlocks =
5341 556 : static_cast<GIntBig>(DIV_ROUND_UP(nYSize, nSwathLines)) *
5342 556 : DIV_ROUND_UP(nXSize, nSwathCols);
5343 556 : GIntBig nBlocksDone = 0;
5344 :
5345 1331 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5346 : {
5347 775 : int nThisLines = nSwathLines;
5348 :
5349 775 : if (iY + nThisLines > nYSize)
5350 195 : nThisLines = nYSize - iY;
5351 :
5352 1555 : for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
5353 : {
5354 780 : int nThisCols = nSwathCols;
5355 :
5356 780 : if (iX + nThisCols > nXSize)
5357 3 : nThisCols = nXSize - iX;
5358 :
5359 780 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5360 780 : if (bCheckHoles)
5361 : {
5362 547 : nStatus = 0;
5363 600 : for (int iBand = 0; iBand < nBandCount; iBand++)
5364 : {
5365 581 : nStatus |= poSrcDS->GetRasterBand(iBand + 1)
5366 581 : ->GetDataCoverageStatus(
5367 : iX, iY, nThisCols, nThisLines,
5368 : GDAL_DATA_COVERAGE_STATUS_DATA);
5369 581 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5370 528 : break;
5371 : }
5372 : }
5373 780 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5374 : {
5375 761 : sExtraArg.pfnProgress = GDALScaledProgress;
5376 1522 : sExtraArg.pProgressData = GDALCreateScaledProgress(
5377 761 : nBlocksDone / static_cast<double>(nTotalBlocks),
5378 761 : (nBlocksDone + 0.5) / static_cast<double>(nTotalBlocks),
5379 : pfnProgress, pProgressData);
5380 761 : if (sExtraArg.pProgressData == nullptr)
5381 348 : sExtraArg.pfnProgress = nullptr;
5382 :
5383 761 : eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
5384 : nThisLines, pSwathBuf, nThisCols,
5385 : nThisLines, eDT, nBandCount,
5386 : nullptr, 0, 0, 0, &sExtraArg);
5387 :
5388 761 : GDALDestroyScaledProgress(sExtraArg.pProgressData);
5389 :
5390 761 : if (eErr == CE_None)
5391 760 : eErr = poDstDS->RasterIO(
5392 : GF_Write, iX, iY, nThisCols, nThisLines, pSwathBuf,
5393 : nThisCols, nThisLines, eDT, nBandCount, nullptr, 0,
5394 : 0, 0, nullptr);
5395 : }
5396 :
5397 780 : nBlocksDone++;
5398 1556 : if (eErr == CE_None &&
5399 776 : !pfnProgress(nBlocksDone /
5400 776 : static_cast<double>(nTotalBlocks),
5401 : nullptr, pProgressData))
5402 : {
5403 1 : eErr = CE_Failure;
5404 1 : CPLError(CE_Failure, CPLE_UserInterrupt,
5405 : "User terminated CreateCopy()");
5406 : }
5407 : }
5408 : }
5409 : }
5410 :
5411 : /* -------------------------------------------------------------------- */
5412 : /* Cleanup */
5413 : /* -------------------------------------------------------------------- */
5414 3264 : CPLFree(pSwathBuf);
5415 :
5416 3264 : return eErr;
5417 : }
5418 :
5419 : /************************************************************************/
5420 : /* GDALRasterBandCopyWholeRaster() */
5421 : /************************************************************************/
5422 :
5423 : /**
5424 : * \brief Copy a whole raster band
5425 : *
5426 : * This function copies the complete raster contents of one band to
5427 : * another similarly configured band. The source and destination
5428 : * bands must have the same width and height. The bands do not have
5429 : * to have the same data type.
5430 : *
5431 : * It implements efficient copying, in particular "chunking" the copy in
5432 : * substantial blocks.
5433 : *
5434 : * Currently the only papszOptions value supported are :
5435 : * <ul>
5436 : * <li>"COMPRESSED=YES" to force alignment on target dataset block sizes to
5437 : * achieve best compression.</li>
5438 : * <li>"SKIP_HOLES=YES" to skip chunks for which GDALGetDataCoverageStatus()
5439 : * returns GDAL_DATA_COVERAGE_STATUS_EMPTY (GDAL >= 2.2)</li>
5440 : * </ul>
5441 : *
5442 : * @param hSrcBand the source band
5443 : * @param hDstBand the destination band
5444 : * @param papszOptions transfer hints in "StringList" Name=Value format.
5445 : * @param pfnProgress progress reporting function.
5446 : * @param pProgressData callback data for progress function.
5447 : *
5448 : * @return CE_None on success, or CE_Failure on failure.
5449 : */
5450 :
5451 29 : CPLErr CPL_STDCALL GDALRasterBandCopyWholeRaster(
5452 : GDALRasterBandH hSrcBand, GDALRasterBandH hDstBand,
5453 : const char *const *const papszOptions, GDALProgressFunc pfnProgress,
5454 : void *pProgressData)
5455 :
5456 : {
5457 29 : VALIDATE_POINTER1(hSrcBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
5458 29 : VALIDATE_POINTER1(hDstBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
5459 :
5460 29 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
5461 29 : GDALRasterBand *poDstBand = GDALRasterBand::FromHandle(hDstBand);
5462 29 : CPLErr eErr = CE_None;
5463 :
5464 29 : if (pfnProgress == nullptr)
5465 2 : pfnProgress = GDALDummyProgress;
5466 :
5467 : /* -------------------------------------------------------------------- */
5468 : /* Confirm the datasets match in size and band counts. */
5469 : /* -------------------------------------------------------------------- */
5470 29 : int nXSize = poSrcBand->GetXSize();
5471 29 : int nYSize = poSrcBand->GetYSize();
5472 :
5473 29 : if (poDstBand->GetXSize() != nXSize || poDstBand->GetYSize() != nYSize)
5474 : {
5475 0 : CPLError(CE_Failure, CPLE_AppDefined,
5476 : "Input and output band sizes do not\n"
5477 : "match in GDALRasterBandCopyWholeRaster()");
5478 0 : return CE_Failure;
5479 : }
5480 :
5481 : /* -------------------------------------------------------------------- */
5482 : /* Report preliminary (0) progress. */
5483 : /* -------------------------------------------------------------------- */
5484 29 : if (!pfnProgress(0.0, nullptr, pProgressData))
5485 : {
5486 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
5487 : "User terminated CreateCopy()");
5488 0 : return CE_Failure;
5489 : }
5490 :
5491 29 : GDALDataType eDT = poDstBand->GetRasterDataType();
5492 :
5493 : // If the destination is compressed, we must try to write blocks just once,
5494 : // to save disk space (GTiff case for example), and to avoid data loss
5495 : // (JPEG compression for example).
5496 29 : bool bDstIsCompressed = false;
5497 : const char *pszDstCompressed =
5498 29 : CSLFetchNameValue(const_cast<char **>(papszOptions), "COMPRESSED");
5499 29 : if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
5500 26 : bDstIsCompressed = true;
5501 :
5502 : /* -------------------------------------------------------------------- */
5503 : /* What will our swath size be? */
5504 : /* -------------------------------------------------------------------- */
5505 :
5506 29 : int nSwathCols = 0;
5507 29 : int nSwathLines = 0;
5508 29 : GDALCopyWholeRasterGetSwathSize(poSrcBand, poDstBand, 1, bDstIsCompressed,
5509 : FALSE, &nSwathCols, &nSwathLines);
5510 :
5511 29 : const int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5512 :
5513 29 : void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
5514 29 : if (pSwathBuf == nullptr)
5515 : {
5516 0 : return CE_Failure;
5517 : }
5518 :
5519 29 : CPLDebug("GDAL", "GDALRasterBandCopyWholeRaster(): %d*%d swaths",
5520 : nSwathCols, nSwathLines);
5521 :
5522 : const bool bCheckHoles =
5523 29 : CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
5524 :
5525 : // Advise the source raster that we are going to read it completely
5526 29 : poSrcBand->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nullptr);
5527 :
5528 : /* ==================================================================== */
5529 : /* Band oriented (uninterleaved) case. */
5530 : /* ==================================================================== */
5531 :
5532 72 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5533 : {
5534 43 : int nThisLines = nSwathLines;
5535 :
5536 43 : if (iY + nThisLines > nYSize)
5537 8 : nThisLines = nYSize - iY;
5538 :
5539 86 : for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
5540 : {
5541 43 : int nThisCols = nSwathCols;
5542 :
5543 43 : if (iX + nThisCols > nXSize)
5544 0 : nThisCols = nXSize - iX;
5545 :
5546 43 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5547 43 : if (bCheckHoles)
5548 : {
5549 0 : nStatus = poSrcBand->GetDataCoverageStatus(
5550 : iX, iY, nThisCols, nThisLines,
5551 : GDAL_DATA_COVERAGE_STATUS_DATA);
5552 : }
5553 43 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5554 : {
5555 43 : eErr = poSrcBand->RasterIO(GF_Read, iX, iY, nThisCols,
5556 : nThisLines, pSwathBuf, nThisCols,
5557 : nThisLines, eDT, 0, 0, nullptr);
5558 :
5559 43 : if (eErr == CE_None)
5560 43 : eErr = poDstBand->RasterIO(GF_Write, iX, iY, nThisCols,
5561 : nThisLines, pSwathBuf, nThisCols,
5562 : nThisLines, eDT, 0, 0, nullptr);
5563 : }
5564 :
5565 86 : if (eErr == CE_None && !pfnProgress(double(iY + nThisLines) /
5566 43 : static_cast<double>(nYSize),
5567 : nullptr, pProgressData))
5568 : {
5569 0 : eErr = CE_Failure;
5570 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
5571 : "User terminated CreateCopy()");
5572 : }
5573 : }
5574 : }
5575 :
5576 : /* -------------------------------------------------------------------- */
5577 : /* Cleanup */
5578 : /* -------------------------------------------------------------------- */
5579 29 : CPLFree(pSwathBuf);
5580 :
5581 29 : return eErr;
5582 : }
5583 :
5584 : /************************************************************************/
5585 : /* GDALCopyRasterIOExtraArg () */
5586 : /************************************************************************/
5587 :
5588 527273 : void GDALCopyRasterIOExtraArg(GDALRasterIOExtraArg *psDestArg,
5589 : GDALRasterIOExtraArg *psSrcArg)
5590 : {
5591 527273 : INIT_RASTERIO_EXTRA_ARG(*psDestArg);
5592 527273 : if (psSrcArg)
5593 : {
5594 527273 : psDestArg->eResampleAlg = psSrcArg->eResampleAlg;
5595 527273 : psDestArg->pfnProgress = psSrcArg->pfnProgress;
5596 527273 : psDestArg->pProgressData = psSrcArg->pProgressData;
5597 527273 : psDestArg->bFloatingPointWindowValidity =
5598 527273 : psSrcArg->bFloatingPointWindowValidity;
5599 527273 : if (psSrcArg->bFloatingPointWindowValidity)
5600 : {
5601 204391 : psDestArg->dfXOff = psSrcArg->dfXOff;
5602 204391 : psDestArg->dfYOff = psSrcArg->dfYOff;
5603 204391 : psDestArg->dfXSize = psSrcArg->dfXSize;
5604 204391 : psDestArg->dfYSize = psSrcArg->dfYSize;
5605 : }
5606 527273 : if (psSrcArg->nVersion >= 2)
5607 : {
5608 527273 : psDestArg->bUseOnlyThisScale = psSrcArg->bUseOnlyThisScale;
5609 : }
5610 : }
5611 527273 : }
5612 :
5613 : /************************************************************************/
5614 : /* HasOnlyNoData() */
5615 : /************************************************************************/
5616 :
/** Generic case: a sample matches the nodata value when both compare equal. */
template <class T> static inline bool IsEqualToNoData(T value, T noDataValue)
{
    const bool bMatches = (value == noDataValue);
    return bMatches;
}
5621 :
5622 0 : template <> bool IsEqualToNoData<GFloat16>(GFloat16 value, GFloat16 noDataValue)
5623 : {
5624 : using std::isnan;
5625 0 : return isnan(noDataValue) ? isnan(value) : value == noDataValue;
5626 : }
5627 :
5628 625510 : template <> bool IsEqualToNoData<float>(float value, float noDataValue)
5629 : {
5630 625510 : return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
5631 : }
5632 :
5633 13560300 : template <> bool IsEqualToNoData<double>(double value, double noDataValue)
5634 : {
5635 13560300 : return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
5636 : }
5637 :
/**
 * Tell whether every sample of a window equals the nodata value.
 *
 * Samples are addressed as
 * pBuffer[(iY * nLineStride + iX) * nComponents + iBand], and compared with
 * IsEqualToNoData().
 *
 * @param pBuffer first sample of the window.
 * @param noDataValue nodata value, already converted to the sample type.
 * @param nWidth number of pixels per line of the window.
 * @param nHeight number of lines of the window.
 * @param nLineStride distance, in pixels, between the starts of two
 *                    consecutive lines.
 * @param nComponents number of interleaved samples per pixel.
 *
 * @return true if all samples match the nodata value, which includes the
 *         case of an empty window (nWidth or nHeight equal to 0).
 */
template <class T>
static bool HasOnlyNoDataT(const T *pBuffer, T noDataValue, size_t nWidth,
                           size_t nHeight, size_t nLineStride,
                           size_t nComponents)
{
    // An empty window contains no sample that differs from nodata. This
    // guard is also needed because nWidth - 1 and nHeight - 1 below would
    // wrap around for a zero dimension and index far outside the buffer.
    if (nWidth == 0 || nHeight == 0)
        return true;

    const size_t nLastCol = nWidth - 1;
    const size_t nLastLine = nHeight - 1;

    // Fast test: check the 4 corners and the middle pixel of each component,
    // which rejects most windows holding data without a full scan.
    for (size_t iBand = 0; iBand < nComponents; iBand++)
    {
        const size_t iTopLeft = iBand;
        const size_t iTopRight = nLastCol * nComponents + iBand;
        const size_t iMiddle =
            (nLastLine / 2 * nLineStride + nLastCol / 2) * nComponents + iBand;
        const size_t iBottomLeft = nLastLine * nLineStride * nComponents + iBand;
        const size_t iBottomRight =
            (nLastLine * nLineStride + nLastCol) * nComponents + iBand;

        if (!(IsEqualToNoData(pBuffer[iTopLeft], noDataValue) &&
              IsEqualToNoData(pBuffer[iTopRight], noDataValue) &&
              IsEqualToNoData(pBuffer[iMiddle], noDataValue) &&
              IsEqualToNoData(pBuffer[iBottomLeft], noDataValue) &&
              IsEqualToNoData(pBuffer[iBottomRight], noDataValue)))
        {
            return false;
        }
    }

    // Test all pixels.
    for (size_t iY = 0; iY < nHeight; iY++)
    {
        const T *pBufferLine = pBuffer + iY * nLineStride * nComponents;
        for (size_t iX = 0; iX < nWidth * nComponents; iX++)
        {
            if (!IsEqualToNoData(pBufferLine[iX], noDataValue))
            {
                return false;
            }
        }
    }
    return true;
}
5681 :
5682 : /************************************************************************/
5683 : /* GDALBufferHasOnlyNoData() */
5684 : /************************************************************************/
5685 :
5686 43026 : bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
5687 : size_t nWidth, size_t nHeight, size_t nLineStride,
5688 : size_t nComponents, int nBitsPerSample,
5689 : GDALBufferSampleFormat nSampleFormat)
5690 : {
5691 : // In the case where the nodata is 0, we can compare several bytes at
5692 : // once. Select the largest natural integer type for the architecture.
5693 : #if SIZEOF_VOIDP >= 8 || defined(__x86_64__)
5694 : // We test __x86_64__ for x32 arch where SIZEOF_VOIDP == 4
5695 : typedef std::uint64_t WordType;
5696 : #else
5697 : typedef std::uint32_t WordType;
5698 : #endif
5699 43026 : if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
5700 : // Do not use this optimized code path for floating point numbers,
5701 : // as it can't detect negative zero.
5702 : nSampleFormat != GSF_FLOATING_POINT)
5703 : {
5704 27118 : const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
5705 27118 : const size_t nSize =
5706 27118 : (nWidth * nHeight * nComponents * nBitsPerSample + 7) / 8;
5707 27118 : size_t i = 0;
5708 : const size_t nInitialIters =
5709 54236 : std::min(sizeof(WordType) -
5710 27118 : static_cast<size_t>(
5711 : reinterpret_cast<std::uintptr_t>(pabyBuffer) %
5712 : sizeof(WordType)),
5713 27118 : nSize);
5714 223902 : for (; i < nInitialIters; i++)
5715 : {
5716 201171 : if (pabyBuffer[i])
5717 4387 : return false;
5718 : }
5719 16513300 : for (; i + sizeof(WordType) - 1 < nSize; i += sizeof(WordType))
5720 : {
5721 16498200 : if (*(reinterpret_cast<const WordType *>(pabyBuffer + i)))
5722 7600 : return false;
5723 : }
5724 52520 : for (; i < nSize; i++)
5725 : {
5726 37394 : if (pabyBuffer[i])
5727 5 : return false;
5728 : }
5729 15126 : return true;
5730 : }
5731 :
5732 15908 : if (nBitsPerSample == 8 && nSampleFormat == GSF_UNSIGNED_INT)
5733 : {
5734 22274 : return GDALIsValueInRange<uint8_t>(dfNoDataValue) &&
5735 11137 : HasOnlyNoDataT(static_cast<const uint8_t *>(pBuffer),
5736 11137 : static_cast<uint8_t>(dfNoDataValue), nWidth,
5737 11137 : nHeight, nLineStride, nComponents);
5738 : }
5739 4771 : if (nBitsPerSample == 8 && nSampleFormat == GSF_SIGNED_INT)
5740 : {
5741 : // Use unsigned implementation by converting the nodatavalue to
5742 : // unsigned
5743 63 : return GDALIsValueInRange<int8_t>(dfNoDataValue) &&
5744 31 : HasOnlyNoDataT(
5745 : static_cast<const uint8_t *>(pBuffer),
5746 31 : static_cast<uint8_t>(static_cast<int8_t>(dfNoDataValue)),
5747 32 : nWidth, nHeight, nLineStride, nComponents);
5748 : }
5749 4739 : if (nBitsPerSample == 16 && nSampleFormat == GSF_UNSIGNED_INT)
5750 : {
5751 23 : return GDALIsValueInRange<uint16_t>(dfNoDataValue) &&
5752 11 : HasOnlyNoDataT(static_cast<const uint16_t *>(pBuffer),
5753 11 : static_cast<uint16_t>(dfNoDataValue), nWidth,
5754 12 : nHeight, nLineStride, nComponents);
5755 : }
5756 4727 : if (nBitsPerSample == 16 && nSampleFormat == GSF_SIGNED_INT)
5757 : {
5758 : // Use unsigned implementation by converting the nodatavalue to
5759 : // unsigned
5760 99 : return GDALIsValueInRange<int16_t>(dfNoDataValue) &&
5761 49 : HasOnlyNoDataT(
5762 : static_cast<const uint16_t *>(pBuffer),
5763 49 : static_cast<uint16_t>(static_cast<int16_t>(dfNoDataValue)),
5764 50 : nWidth, nHeight, nLineStride, nComponents);
5765 : }
5766 4677 : if (nBitsPerSample == 32 && nSampleFormat == GSF_UNSIGNED_INT)
5767 : {
5768 73 : return GDALIsValueInRange<uint32_t>(dfNoDataValue) &&
5769 36 : HasOnlyNoDataT(static_cast<const uint32_t *>(pBuffer),
5770 : static_cast<uint32_t>(dfNoDataValue), nWidth,
5771 37 : nHeight, nLineStride, nComponents);
5772 : }
5773 4640 : if (nBitsPerSample == 32 && nSampleFormat == GSF_SIGNED_INT)
5774 : {
5775 : // Use unsigned implementation by converting the nodatavalue to
5776 : // unsigned
5777 23 : return GDALIsValueInRange<int32_t>(dfNoDataValue) &&
5778 11 : HasOnlyNoDataT(
5779 : static_cast<const uint32_t *>(pBuffer),
5780 11 : static_cast<uint32_t>(static_cast<int32_t>(dfNoDataValue)),
5781 12 : nWidth, nHeight, nLineStride, nComponents);
5782 : }
5783 4628 : if (nBitsPerSample == 64 && nSampleFormat == GSF_UNSIGNED_INT)
5784 : {
5785 56 : return GDALIsValueInRange<uint64_t>(dfNoDataValue) &&
5786 28 : HasOnlyNoDataT(static_cast<const uint64_t *>(pBuffer),
5787 : static_cast<uint64_t>(dfNoDataValue), nWidth,
5788 28 : nHeight, nLineStride, nComponents);
5789 : }
5790 4600 : if (nBitsPerSample == 64 && nSampleFormat == GSF_SIGNED_INT)
5791 : {
5792 : // Use unsigned implementation by converting the nodatavalue to
5793 : // unsigned
5794 0 : return GDALIsValueInRange<int64_t>(dfNoDataValue) &&
5795 0 : HasOnlyNoDataT(
5796 : static_cast<const uint64_t *>(pBuffer),
5797 0 : static_cast<uint64_t>(static_cast<int64_t>(dfNoDataValue)),
5798 0 : nWidth, nHeight, nLineStride, nComponents);
5799 : }
5800 4600 : if (nBitsPerSample == 16 && nSampleFormat == GSF_FLOATING_POINT)
5801 : {
5802 0 : return (std::isnan(dfNoDataValue) ||
5803 0 : GDALIsValueInRange<GFloat16>(dfNoDataValue)) &&
5804 0 : HasOnlyNoDataT(static_cast<const GFloat16 *>(pBuffer),
5805 : static_cast<GFloat16>(dfNoDataValue), nWidth,
5806 0 : nHeight, nLineStride, nComponents);
5807 : }
5808 4600 : if (nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
5809 : {
5810 760 : return (std::isnan(dfNoDataValue) ||
5811 1519 : GDALIsValueInRange<float>(dfNoDataValue)) &&
5812 759 : HasOnlyNoDataT(static_cast<const float *>(pBuffer),
5813 : static_cast<float>(dfNoDataValue), nWidth,
5814 760 : nHeight, nLineStride, nComponents);
5815 : }
5816 3840 : if (nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
5817 : {
5818 3840 : return HasOnlyNoDataT(static_cast<const double *>(pBuffer),
5819 : dfNoDataValue, nWidth, nHeight, nLineStride,
5820 3840 : nComponents);
5821 : }
5822 0 : return false;
5823 : }
5824 :
5825 : #ifdef HAVE_SSE2
5826 :
5827 : /************************************************************************/
5828 : /* GDALDeinterleave3Byte() */
5829 : /************************************************************************/
5830 :
5831 : #if defined(__GNUC__) && !defined(__clang__)
5832 : __attribute__((optimize("no-tree-vectorize")))
5833 : #endif
5834 : static void
5835 361281 : GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
5836 : GByte *CPL_RESTRICT pabyDest0,
5837 : GByte *CPL_RESTRICT pabyDest1,
5838 : GByte *CPL_RESTRICT pabyDest2, size_t nIters)
5839 : #ifdef USE_NEON_OPTIMIZATIONS
5840 : {
5841 : return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
5842 : nIters);
5843 : }
5844 : #else
5845 : {
5846 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
5847 361281 : if (CPLHaveRuntimeSSSE3())
5848 : {
5849 361279 : return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
5850 361279 : pabyDest2, nIters);
5851 : }
5852 : #endif
5853 :
5854 2 : size_t i = 0;
5855 2 : if (((reinterpret_cast<uintptr_t>(pabySrc) |
5856 2 : reinterpret_cast<uintptr_t>(pabyDest0) |
5857 2 : reinterpret_cast<uintptr_t>(pabyDest1) |
5858 2 : reinterpret_cast<uintptr_t>(pabyDest2)) %
5859 : sizeof(unsigned int)) == 0)
5860 : {
5861 : // Slightly better than GCC autovectorizer
5862 17 : for (size_t j = 0; i + 3 < nIters; i += 4, ++j)
5863 : {
5864 15 : unsigned int word0 =
5865 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i);
5866 15 : unsigned int word1 =
5867 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 4);
5868 15 : unsigned int word2 =
5869 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 8);
5870 15 : reinterpret_cast<unsigned int *>(pabyDest0)[j] =
5871 15 : (word0 & 0xff) | ((word0 >> 24) << 8) | (word1 & 0x00ff0000) |
5872 15 : ((word2 >> 8) << 24);
5873 15 : reinterpret_cast<unsigned int *>(pabyDest1)[j] =
5874 15 : ((word0 >> 8) & 0xff) | ((word1 & 0xff) << 8) |
5875 15 : (((word1 >> 24)) << 16) | ((word2 >> 16) << 24);
5876 15 : pabyDest2[j * 4] = static_cast<GByte>(word0 >> 16);
5877 15 : pabyDest2[j * 4 + 1] = static_cast<GByte>(word1 >> 8);
5878 15 : pabyDest2[j * 4 + 2] = static_cast<GByte>(word2);
5879 15 : pabyDest2[j * 4 + 3] = static_cast<GByte>(word2 >> 24);
5880 : }
5881 : }
5882 : #if defined(__clang__)
5883 : #pragma clang loop vectorize(disable)
5884 : #endif
5885 3 : for (; i < nIters; ++i)
5886 : {
5887 1 : pabyDest0[i] = pabySrc[3 * i + 0];
5888 1 : pabyDest1[i] = pabySrc[3 * i + 1];
5889 1 : pabyDest2[i] = pabySrc[3 * i + 2];
5890 : }
5891 : }
5892 : #endif
5893 :
5894 : /************************************************************************/
5895 : /* GDALDeinterleave4Byte() */
5896 : /************************************************************************/
5897 :
5898 : #if !defined(__GNUC__) || defined(__clang__)
5899 :
5900 : /************************************************************************/
5901 : /* deinterleave() */
5902 : /************************************************************************/
5903 :
/** Extract one byte out of each of the 16 32-bit lanes held in four SSE2
 * registers and return them packed as 16 consecutive bytes.
 *
 * The four registers are taken by reference: when SHIFT is true they are
 * shifted right by 8 bits in place, so that successive calls walk through
 * the successive bytes of each lane.
 *
 * @tparam SHIFT if true, shift every 32-bit lane of the four input registers
 *               right by 8 bits (modifying the caller's registers) before
 *               extracting.
 * @tparam MASK  if true, keep only the low 8 bits of every lane before
 *               packing. The pack instructions saturate, so when MASK is
 *               false each lane is expected to already be in [0, 255].
 * @param xmm0_ori lanes 0 to 3 (possibly updated in place).
 * @param xmm1_ori lanes 4 to 7 (possibly updated in place).
 * @param xmm2_ori lanes 8 to 11 (possibly updated in place).
 * @param xmm3_ori lanes 12 to 15 (possibly updated in place).
 * @return register whose byte n is the low byte of lane n.
 */
template <bool SHIFT, bool MASK>
inline __m128i deinterleave(__m128i &xmm0_ori, __m128i &xmm1_ori,
                            __m128i &xmm2_ori, __m128i &xmm3_ori)
{
    if constexpr (SHIFT)
    {
        // Move the next byte of each int32 packed word into the low 8 bits.
        xmm0_ori = _mm_srli_epi32(xmm0_ori, 8);
        xmm1_ori = _mm_srli_epi32(xmm1_ori, 8);
        xmm2_ori = _mm_srli_epi32(xmm2_ori, 8);
        xmm3_ori = _mm_srli_epi32(xmm3_ori, 8);
    }

    // Work on copies so that masking never alters the caller's registers.
    __m128i xmm0 = xmm0_ori;
    __m128i xmm1 = xmm1_ori;
    __m128i xmm2 = xmm2_ori;
    __m128i xmm3 = xmm3_ori;
    if constexpr (MASK)
    {
        // Set higher 24bit of each int32 packed word to 0
        const __m128i xmm_mask = _mm_set1_epi32(0xff);
        xmm0 = _mm_and_si128(xmm0, xmm_mask);
        xmm1 = _mm_and_si128(xmm1, xmm_mask);
        xmm2 = _mm_and_si128(xmm2, xmm_mask);
        xmm3 = _mm_and_si128(xmm3, xmm_mask);
    }

    // Pack int32 to int16
    xmm0 = _mm_packs_epi32(xmm0, xmm1);
    xmm2 = _mm_packs_epi32(xmm2, xmm3);
    // Pack int16 to uint8
    return _mm_packus_epi16(xmm0, xmm2);
}
5942 :
5943 : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
5944 : GByte *CPL_RESTRICT pabyDest0,
5945 : GByte *CPL_RESTRICT pabyDest1,
5946 : GByte *CPL_RESTRICT pabyDest2,
5947 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
5948 : #ifdef USE_NEON_OPTIMIZATIONS
5949 : {
5950 : return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
5951 : pabyDest3, nIters);
5952 : }
5953 : #else
5954 : {
5955 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
5956 : if (CPLHaveRuntimeSSSE3())
5957 : {
5958 : return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
5959 : pabyDest2, pabyDest3, nIters);
5960 : }
5961 : #endif
5962 :
5963 : // Not the optimal SSE2-only code, as gcc auto-vectorizer manages to
5964 : // do something slightly better.
5965 : size_t i = 0;
5966 : for (; i + 15 < nIters; i += 16)
5967 : {
5968 : __m128i xmm0_ori = _mm_loadu_si128(
5969 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
5970 : __m128i xmm1_ori = _mm_loadu_si128(
5971 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
5972 : __m128i xmm2_ori = _mm_loadu_si128(
5973 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
5974 : __m128i xmm3_ori = _mm_loadu_si128(
5975 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
5976 :
5977 : _mm_storeu_si128(
5978 : reinterpret_cast<__m128i *>(pabyDest0 + i),
5979 : deinterleave<false, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
5980 : _mm_storeu_si128(
5981 : reinterpret_cast<__m128i *>(pabyDest1 + i),
5982 : deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
5983 : _mm_storeu_si128(
5984 : reinterpret_cast<__m128i *>(pabyDest2 + i),
5985 : deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
5986 : _mm_storeu_si128(
5987 : reinterpret_cast<__m128i *>(pabyDest3 + i),
5988 : deinterleave<true, false>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
5989 : }
5990 :
5991 : #if defined(__clang__)
5992 : #pragma clang loop vectorize(disable)
5993 : #endif
5994 : for (; i < nIters; ++i)
5995 : {
5996 : pabyDest0[i] = pabySrc[4 * i + 0];
5997 : pabyDest1[i] = pabySrc[4 * i + 1];
5998 : pabyDest2[i] = pabySrc[4 * i + 2];
5999 : pabyDest3[i] = pabySrc[4 * i + 3];
6000 : }
6001 : }
6002 : #endif
6003 : #else
6004 : // GCC autovectorizer does an excellent job
6005 62351 : __attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
6006 : const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
6007 : GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2,
6008 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
6009 : {
6010 536364000 : for (size_t i = 0; i < nIters; ++i)
6011 : {
6012 536302000 : pabyDest0[i] = pabySrc[4 * i + 0];
6013 536302000 : pabyDest1[i] = pabySrc[4 * i + 1];
6014 536302000 : pabyDest2[i] = pabySrc[4 * i + 2];
6015 536302000 : pabyDest3[i] = pabySrc[4 * i + 3];
6016 : }
6017 62351 : }
6018 : #endif
6019 :
6020 : #else
6021 :
6022 : /************************************************************************/
6023 : /* GDALDeinterleave3Byte() */
6024 : /************************************************************************/
6025 :
6026 : // TODO: Enabling below could help on non-Intel architectures where GCC knows
6027 : // how to auto-vectorize
6028 : // #if defined(__GNUC__)
6029 : //__attribute__((optimize("tree-vectorize")))
6030 : // #endif
6031 : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
6032 : GByte *CPL_RESTRICT pabyDest0,
6033 : GByte *CPL_RESTRICT pabyDest1,
6034 : GByte *CPL_RESTRICT pabyDest2, size_t nIters)
6035 : {
6036 : for (size_t i = 0; i < nIters; ++i)
6037 : {
6038 : pabyDest0[i] = pabySrc[3 * i + 0];
6039 : pabyDest1[i] = pabySrc[3 * i + 1];
6040 : pabyDest2[i] = pabySrc[3 * i + 2];
6041 : }
6042 : }
6043 :
6044 : /************************************************************************/
6045 : /* GDALDeinterleave4Byte() */
6046 : /************************************************************************/
6047 :
6048 : // TODO: Enabling below could help on non-Intel architectures where gcc knows
6049 : // how to auto-vectorize
6050 : // #if defined(__GNUC__)
6051 : //__attribute__((optimize("tree-vectorize")))
6052 : // #endif
6053 : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
6054 : GByte *CPL_RESTRICT pabyDest0,
6055 : GByte *CPL_RESTRICT pabyDest1,
6056 : GByte *CPL_RESTRICT pabyDest2,
6057 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
6058 : {
6059 : for (size_t i = 0; i < nIters; ++i)
6060 : {
6061 : pabyDest0[i] = pabySrc[4 * i + 0];
6062 : pabyDest1[i] = pabySrc[4 * i + 1];
6063 : pabyDest2[i] = pabySrc[4 * i + 2];
6064 : pabyDest3[i] = pabySrc[4 * i + 3];
6065 : }
6066 : }
6067 :
6068 : #endif
6069 :
6070 : /************************************************************************/
6071 : /* GDALDeinterleave() */
6072 : /************************************************************************/
6073 :
6074 : /*! Copy values from a pixel-interleave buffer to multiple per-component
6075 : buffers.
6076 :
6077 : In pseudo-code
6078 : \verbatim
6079 : for(size_t i = 0; i < nIters; ++i)
6080 : for(int iComp = 0; iComp < nComponents; iComp++ )
6081 : ppDestBuffer[iComp][i] = pSourceBuffer[nComponents * i + iComp]
6082 : \endverbatim
6083 :
6084 : The implementation is optimized for a few cases, like de-interleaving
6085 : of 3 or 4-components Byte buffers.
6086 :
6087 : \since GDAL 3.6
6088 : */
6089 423982 : void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
6090 : int nComponents, void **ppDestBuffer,
6091 : GDALDataType eDestDT, size_t nIters)
6092 : {
6093 423982 : if (eSourceDT == eDestDT)
6094 : {
6095 423960 : if (eSourceDT == GDT_UInt8 || eSourceDT == GDT_Int8)
6096 : {
6097 423639 : if (nComponents == 3)
6098 : {
6099 361281 : const GByte *CPL_RESTRICT pabySrc =
6100 : static_cast<const GByte *>(pSourceBuffer);
6101 361281 : GByte *CPL_RESTRICT pabyDest0 =
6102 : static_cast<GByte *>(ppDestBuffer[0]);
6103 361281 : GByte *CPL_RESTRICT pabyDest1 =
6104 : static_cast<GByte *>(ppDestBuffer[1]);
6105 361281 : GByte *CPL_RESTRICT pabyDest2 =
6106 : static_cast<GByte *>(ppDestBuffer[2]);
6107 361281 : GDALDeinterleave3Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6108 : nIters);
6109 361281 : return;
6110 : }
6111 62358 : else if (nComponents == 4)
6112 : {
6113 62351 : const GByte *CPL_RESTRICT pabySrc =
6114 : static_cast<const GByte *>(pSourceBuffer);
6115 62351 : GByte *CPL_RESTRICT pabyDest0 =
6116 : static_cast<GByte *>(ppDestBuffer[0]);
6117 62351 : GByte *CPL_RESTRICT pabyDest1 =
6118 : static_cast<GByte *>(ppDestBuffer[1]);
6119 62351 : GByte *CPL_RESTRICT pabyDest2 =
6120 : static_cast<GByte *>(ppDestBuffer[2]);
6121 62351 : GByte *CPL_RESTRICT pabyDest3 =
6122 : static_cast<GByte *>(ppDestBuffer[3]);
6123 62351 : GDALDeinterleave4Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6124 : pabyDest3, nIters);
6125 62351 : return;
6126 7 : }
6127 : }
6128 : #if ((defined(__GNUC__) && !defined(__clang__)) || \
6129 : defined(__INTEL_CLANG_COMPILER)) && \
6130 : defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME)
6131 642 : else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&
6132 321 : CPLHaveRuntimeSSSE3())
6133 : {
6134 321 : if (nComponents == 3)
6135 : {
6136 126 : const GUInt16 *CPL_RESTRICT panSrc =
6137 : static_cast<const GUInt16 *>(pSourceBuffer);
6138 126 : GUInt16 *CPL_RESTRICT panDest0 =
6139 : static_cast<GUInt16 *>(ppDestBuffer[0]);
6140 126 : GUInt16 *CPL_RESTRICT panDest1 =
6141 : static_cast<GUInt16 *>(ppDestBuffer[1]);
6142 126 : GUInt16 *CPL_RESTRICT panDest2 =
6143 : static_cast<GUInt16 *>(ppDestBuffer[2]);
6144 126 : GDALDeinterleave3UInt16_SSSE3(panSrc, panDest0, panDest1,
6145 : panDest2, nIters);
6146 126 : return;
6147 : }
6148 : #if !defined(__INTEL_CLANG_COMPILER)
6149 : // ICC autovectorizer doesn't do a good job, at least with icx
6150 : // 2022.1.0.20220316
6151 195 : else if (nComponents == 4)
6152 : {
6153 195 : const GUInt16 *CPL_RESTRICT panSrc =
6154 : static_cast<const GUInt16 *>(pSourceBuffer);
6155 195 : GUInt16 *CPL_RESTRICT panDest0 =
6156 : static_cast<GUInt16 *>(ppDestBuffer[0]);
6157 195 : GUInt16 *CPL_RESTRICT panDest1 =
6158 : static_cast<GUInt16 *>(ppDestBuffer[1]);
6159 195 : GUInt16 *CPL_RESTRICT panDest2 =
6160 : static_cast<GUInt16 *>(ppDestBuffer[2]);
6161 195 : GUInt16 *CPL_RESTRICT panDest3 =
6162 : static_cast<GUInt16 *>(ppDestBuffer[3]);
6163 195 : GDALDeinterleave4UInt16_SSSE3(panSrc, panDest0, panDest1,
6164 : panDest2, panDest3, nIters);
6165 195 : return;
6166 : }
6167 : #endif
6168 : }
6169 : #endif
6170 : }
6171 :
6172 29 : const int nSourceDTSize = GDALGetDataTypeSizeBytes(eSourceDT);
6173 29 : const int nDestDTSize = GDALGetDataTypeSizeBytes(eDestDT);
6174 108 : for (int iComp = 0; iComp < nComponents; iComp++)
6175 : {
6176 79 : GDALCopyWords64(static_cast<const GByte *>(pSourceBuffer) +
6177 79 : iComp * nSourceDTSize,
6178 : eSourceDT, nComponents * nSourceDTSize,
6179 79 : ppDestBuffer[iComp], eDestDT, nDestDTSize, nIters);
6180 : }
6181 : }
6182 :
6183 : /************************************************************************/
6184 : /* GDALTranspose2DSingleToSingle() */
6185 : /************************************************************************/
6186 : /**
6187 : * Transpose a 2D array of non-complex values, in a efficient (cache-oblivious) way.
6188 : *
6189 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6190 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6191 : * @param nSrcWidth Width of pSrc array.
6192 : * @param nSrcHeight Height of pSrc array.
6193 : */
6194 :
6195 : template <class DST, class SRC>
6196 160 : void GDALTranspose2DSingleToSingle(const SRC *CPL_RESTRICT pSrc,
6197 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6198 : size_t nSrcHeight)
6199 : {
6200 160 : constexpr size_t blocksize = 32;
6201 345 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6202 : {
6203 185 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6204 5016 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6205 : {
6206 : // transpose the block beginning at [i,j]
6207 4831 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6208 26185 : for (size_t k = i; k < max_k; ++k)
6209 : {
6210 669282 : for (size_t l = j; l < max_l; ++l)
6211 : {
6212 647928 : GDALCopyWord(pSrc[l + k * nSrcWidth],
6213 647928 : pDst[k + l * nSrcHeight]);
6214 : }
6215 : }
6216 : }
6217 : }
6218 160 : }
6219 :
6220 : /************************************************************************/
6221 : /* GDALTranspose2DComplexToComplex() */
6222 : /************************************************************************/
6223 : /**
6224 : * Transpose a 2D array of complex values into an array of complex values,
6225 : * in a efficient (cache-oblivious) way.
6226 : *
6227 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6228 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6229 : * @param nSrcWidth Width of pSrc array.
6230 : * @param nSrcHeight Height of pSrc array.
6231 : */
6232 : template <class DST, class SRC>
6233 25 : void GDALTranspose2DComplexToComplex(const SRC *CPL_RESTRICT pSrc,
6234 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6235 : size_t nSrcHeight)
6236 : {
6237 25 : constexpr size_t blocksize = 32;
6238 50 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6239 : {
6240 25 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6241 50 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6242 : {
6243 : // transpose the block beginning at [i,j]
6244 25 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6245 75 : for (size_t k = i; k < max_k; ++k)
6246 : {
6247 200 : for (size_t l = j; l < max_l; ++l)
6248 : {
6249 150 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
6250 150 : pDst[2 * (k + l * nSrcHeight) + 0]);
6251 150 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 1],
6252 150 : pDst[2 * (k + l * nSrcHeight) + 1]);
6253 : }
6254 : }
6255 : }
6256 : }
6257 25 : }
6258 :
6259 : /************************************************************************/
6260 : /* GDALTranspose2DComplexToSingle() */
6261 : /************************************************************************/
6262 : /**
6263 : * Transpose a 2D array of complex values into an array of non-complex values,
6264 : * in a efficient (cache-oblivious) way.
6265 : *
6266 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6267 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6268 : * @param nSrcWidth Width of pSrc array.
6269 : * @param nSrcHeight Height of pSrc array.
6270 : */
6271 : template <class DST, class SRC>
6272 55 : void GDALTranspose2DComplexToSingle(const SRC *CPL_RESTRICT pSrc,
6273 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6274 : size_t nSrcHeight)
6275 : {
6276 55 : constexpr size_t blocksize = 32;
6277 110 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6278 : {
6279 55 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6280 110 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6281 : {
6282 : // transpose the block beginning at [i,j]
6283 55 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6284 165 : for (size_t k = i; k < max_k; ++k)
6285 : {
6286 440 : for (size_t l = j; l < max_l; ++l)
6287 : {
6288 330 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
6289 330 : pDst[k + l * nSrcHeight]);
6290 : }
6291 : }
6292 : }
6293 : }
6294 55 : }
6295 :
6296 : /************************************************************************/
6297 : /* GDALTranspose2DSingleToComplex() */
6298 : /************************************************************************/
6299 : /**
6300 : * Transpose a 2D array of non-complex values into an array of complex values,
6301 : * in a efficient (cache-oblivious) way.
6302 : *
6303 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6304 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6305 : * @param nSrcWidth Width of pSrc array.
6306 : * @param nSrcHeight Height of pSrc array.
6307 : */
6308 : template <class DST, class SRC>
6309 55 : void GDALTranspose2DSingleToComplex(const SRC *CPL_RESTRICT pSrc,
6310 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6311 : size_t nSrcHeight)
6312 : {
6313 55 : constexpr size_t blocksize = 32;
6314 110 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6315 : {
6316 55 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6317 110 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6318 : {
6319 : // transpose the block beginning at [i,j]
6320 55 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6321 165 : for (size_t k = i; k < max_k; ++k)
6322 : {
6323 440 : for (size_t l = j; l < max_l; ++l)
6324 : {
6325 330 : GDALCopyWord(pSrc[l + k * nSrcWidth],
6326 330 : pDst[2 * (k + l * nSrcHeight) + 0]);
6327 330 : pDst[2 * (k + l * nSrcHeight) + 1] = 0;
6328 : }
6329 : }
6330 : }
6331 : }
6332 55 : }
6333 :
6334 : /************************************************************************/
6335 : /* GDALTranspose2D() */
6336 : /************************************************************************/
6337 :
6338 : template <class DST, bool DST_IS_COMPLEX>
6339 295 : static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
6340 : size_t nSrcWidth, size_t nSrcHeight)
6341 : {
6342 : #define CALL_GDALTranspose2D_internal(SRC_TYPE) \
6343 : do \
6344 : { \
6345 : if constexpr (DST_IS_COMPLEX) \
6346 : { \
6347 : GDALTranspose2DSingleToComplex( \
6348 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6349 : nSrcHeight); \
6350 : } \
6351 : else \
6352 : { \
6353 : GDALTranspose2DSingleToSingle(static_cast<const SRC_TYPE *>(pSrc), \
6354 : pDst, nSrcWidth, nSrcHeight); \
6355 : } \
6356 : } while (0)
6357 :
6358 : #define CALL_GDALTranspose2DComplex_internal(SRC_TYPE) \
6359 : do \
6360 : { \
6361 : if constexpr (DST_IS_COMPLEX) \
6362 : { \
6363 : GDALTranspose2DComplexToComplex( \
6364 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6365 : nSrcHeight); \
6366 : } \
6367 : else \
6368 : { \
6369 : GDALTranspose2DComplexToSingle( \
6370 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6371 : nSrcHeight); \
6372 : } \
6373 : } while (0)
6374 :
6375 : // clang-format off
6376 295 : switch (eSrcType)
6377 : {
6378 16 : case GDT_UInt8: CALL_GDALTranspose2D_internal(uint8_t); break;
6379 15 : case GDT_Int8: CALL_GDALTranspose2D_internal(int8_t); break;
6380 33 : case GDT_UInt16: CALL_GDALTranspose2D_internal(uint16_t); break;
6381 20 : case GDT_Int16: CALL_GDALTranspose2D_internal(int16_t); break;
6382 24 : case GDT_UInt32: CALL_GDALTranspose2D_internal(uint32_t); break;
6383 16 : case GDT_Int32: CALL_GDALTranspose2D_internal(int32_t); break;
6384 16 : case GDT_UInt64: CALL_GDALTranspose2D_internal(uint64_t); break;
6385 16 : case GDT_Int64: CALL_GDALTranspose2D_internal(int64_t); break;
6386 16 : case GDT_Float16: CALL_GDALTranspose2D_internal(GFloat16); break;
6387 19 : case GDT_Float32: CALL_GDALTranspose2D_internal(float); break;
6388 24 : case GDT_Float64: CALL_GDALTranspose2D_internal(double); break;
6389 16 : case GDT_CInt16: CALL_GDALTranspose2DComplex_internal(int16_t); break;
6390 16 : case GDT_CInt32: CALL_GDALTranspose2DComplex_internal(int32_t); break;
6391 16 : case GDT_CFloat16: CALL_GDALTranspose2DComplex_internal(GFloat16); break;
6392 16 : case GDT_CFloat32: CALL_GDALTranspose2DComplex_internal(float); break;
6393 16 : case GDT_CFloat64: CALL_GDALTranspose2DComplex_internal(double); break;
6394 0 : case GDT_Unknown:
6395 : case GDT_TypeCount:
6396 0 : break;
6397 : }
6398 : // clang-format on
6399 :
6400 : #undef CALL_GDALTranspose2D_internal
6401 : #undef CALL_GDALTranspose2DComplex_internal
6402 295 : }
6403 :
6404 : /************************************************************************/
6405 : /* GDALInterleave2Byte() */
6406 : /************************************************************************/
6407 :
6408 : #if defined(HAVE_SSE2) && \
6409 : (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
6410 :
6411 : // ICC autovectorizer doesn't do a good job at generating good SSE code,
6412 : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
6413 : #if defined(__GNUC__)
6414 : __attribute__((noinline))
6415 : #endif
6416 : static void
6417 : GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
6418 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
6419 : {
6420 : size_t i = 0;
6421 : constexpr size_t VALS_PER_ITER = 16;
6422 : for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
6423 : {
6424 : __m128i xmm0 =
6425 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + i));
6426 : __m128i xmm1 = _mm_loadu_si128(
6427 : reinterpret_cast<__m128i const *>(pSrc + i + nIters));
6428 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + 2 * i),
6429 : _mm_unpacklo_epi8(xmm0, xmm1));
6430 : _mm_storeu_si128(
6431 : reinterpret_cast<__m128i *>(pDst + 2 * i + VALS_PER_ITER),
6432 : _mm_unpackhi_epi8(xmm0, xmm1));
6433 : }
6434 : #if defined(__clang__)
6435 : #pragma clang loop vectorize(disable)
6436 : #endif
6437 : for (; i < nIters; ++i)
6438 : {
6439 : pDst[2 * i + 0] = pSrc[i + 0 * nIters];
6440 : pDst[2 * i + 1] = pSrc[i + 1 * nIters];
6441 : }
6442 : }
6443 :
6444 : #else
6445 :
6446 : #if defined(__GNUC__) && !defined(__clang__)
6447 : __attribute__((optimize("tree-vectorize")))
6448 : #endif
6449 : #if defined(__GNUC__)
6450 : __attribute__((noinline))
6451 : #endif
6452 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6453 : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
6454 : #pragma clang diagnostic push
6455 : #pragma clang diagnostic ignored "-Wpass-failed"
6456 : #endif
6457 : static void
6458 9 : GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
6459 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
6460 : {
6461 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6462 : #pragma clang loop vectorize(enable)
6463 : #endif
6464 355429 : for (size_t i = 0; i < nIters; ++i)
6465 : {
6466 355420 : pDst[2 * i + 0] = pSrc[i + 0 * nIters];
6467 355420 : pDst[2 * i + 1] = pSrc[i + 1 * nIters];
6468 : }
6469 9 : }
6470 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6471 : #pragma clang diagnostic pop
6472 : #endif
6473 :
6474 : #endif
6475 :
6476 : /************************************************************************/
6477 : /* GDALInterleave4Byte() */
6478 : /************************************************************************/
6479 :
6480 : #if defined(HAVE_SSE2) && \
6481 : (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
6482 :
6483 : // ICC autovectorizer doesn't do a good job at generating good SSE code,
6484 : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
6485 : #if defined(__GNUC__)
6486 : __attribute__((noinline))
6487 : #endif
6488 : static void
6489 : GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
6490 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
6491 : {
6492 : size_t i = 0;
6493 : constexpr size_t VALS_PER_ITER = 16;
6494 : for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
6495 : {
6496 : __m128i xmm0 = _mm_loadu_si128(
6497 : reinterpret_cast<__m128i const *>(pSrc + i + 0 * nIters));
6498 : __m128i xmm1 = _mm_loadu_si128(
6499 : reinterpret_cast<__m128i const *>(pSrc + i + 1 * nIters));
6500 : __m128i xmm2 = _mm_loadu_si128(
6501 : reinterpret_cast<__m128i const *>(pSrc + i + 2 * nIters));
6502 : __m128i xmm3 = _mm_loadu_si128(
6503 : reinterpret_cast<__m128i const *>(pSrc + i + 3 * nIters));
6504 : auto tmp0 = _mm_unpacklo_epi8(
6505 : xmm0,
6506 : xmm1); // (xmm0_0, xmm1_0, xmm0_1, xmm1_1, xmm0_2, xmm1_2, ...)
6507 : auto tmp1 = _mm_unpackhi_epi8(
6508 : xmm0,
6509 : xmm1); // (xmm0_8, xmm1_8, xmm0_9, xmm1_9, xmm0_10, xmm1_10, ...)
6510 : auto tmp2 = _mm_unpacklo_epi8(
6511 : xmm2,
6512 : xmm3); // (xmm2_0, xmm3_0, xmm2_1, xmm3_1, xmm2_2, xmm3_2, ...)
6513 : auto tmp3 = _mm_unpackhi_epi8(
6514 : xmm2,
6515 : xmm3); // (xmm2_8, xmm3_8, xmm2_9, xmm3_9, xmm2_10, xmm3_10, ...)
6516 : auto tmp2_0 = _mm_unpacklo_epi16(
6517 : tmp0,
6518 : tmp2); // (xmm0_0, xmm1_0, xmm2_0, xmm3_0, xmm0_1, xmm1_1, xmm2_1, xmm3_1, ...)
6519 : auto tmp2_1 = _mm_unpackhi_epi16(tmp0, tmp2);
6520 : auto tmp2_2 = _mm_unpacklo_epi16(tmp1, tmp3);
6521 : auto tmp2_3 = _mm_unpackhi_epi16(tmp1, tmp3);
6522 : _mm_storeu_si128(
6523 : reinterpret_cast<__m128i *>(pDst + 4 * i + 0 * VALS_PER_ITER),
6524 : tmp2_0);
6525 : _mm_storeu_si128(
6526 : reinterpret_cast<__m128i *>(pDst + 4 * i + 1 * VALS_PER_ITER),
6527 : tmp2_1);
6528 : _mm_storeu_si128(
6529 : reinterpret_cast<__m128i *>(pDst + 4 * i + 2 * VALS_PER_ITER),
6530 : tmp2_2);
6531 : _mm_storeu_si128(
6532 : reinterpret_cast<__m128i *>(pDst + 4 * i + 3 * VALS_PER_ITER),
6533 : tmp2_3);
6534 : }
6535 : #if defined(__clang__)
6536 : #pragma clang loop vectorize(disable)
6537 : #endif
6538 : for (; i < nIters; ++i)
6539 : {
6540 : pDst[4 * i + 0] = pSrc[i + 0 * nIters];
6541 : pDst[4 * i + 1] = pSrc[i + 1 * nIters];
6542 : pDst[4 * i + 2] = pSrc[i + 2 * nIters];
6543 : pDst[4 * i + 3] = pSrc[i + 3 * nIters];
6544 : }
6545 : }
6546 :
6547 : #else
6548 :
6549 : #if defined(__GNUC__) && !defined(__clang__)
6550 : __attribute__((optimize("tree-vectorize")))
6551 : #endif
6552 : #if defined(__GNUC__)
6553 : __attribute__((noinline))
6554 : #endif
6555 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6556 : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
6557 : #pragma clang diagnostic push
6558 : #pragma clang diagnostic ignored "-Wpass-failed"
6559 : #endif
6560 : static void
6561 9 : GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
6562 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
6563 : {
6564 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6565 : #pragma clang loop vectorize(enable)
6566 : #endif
6567 75443 : for (size_t i = 0; i < nIters; ++i)
6568 : {
6569 75434 : pDst[4 * i + 0] = pSrc[i + 0 * nIters];
6570 75434 : pDst[4 * i + 1] = pSrc[i + 1 * nIters];
6571 75434 : pDst[4 * i + 2] = pSrc[i + 2 * nIters];
6572 75434 : pDst[4 * i + 3] = pSrc[i + 3 * nIters];
6573 : }
6574 9 : }
6575 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6576 : #pragma clang diagnostic pop
6577 : #endif
6578 :
6579 : #endif
6580 :
6581 : /************************************************************************/
6582 : /* GDALTranspose2D() */
6583 : /************************************************************************/
6584 :
6585 : /**
6586 : * Transpose a 2D array in a efficient (cache-oblivious) way.
6587 : *
6588 : * @param pSrc Source array of width = nSrcWidth and height = nSrcHeight.
6589 : * @param eSrcType Data type of pSrc.
6590 : * @param pDst Destination transposed array of width = nSrcHeight and height = nSrcWidth.
6591 : * @param eDstType Data type of pDst.
6592 : * @param nSrcWidth Width of pSrc array.
6593 : * @param nSrcHeight Height of pSrc array.
6594 : * @since GDAL 3.11
6595 : */
6596 :
6597 346 : void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
6598 : GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
6599 : {
6600 346 : if (eSrcType == eDstType && (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8))
6601 : {
6602 51 : if (nSrcHeight == 2)
6603 : {
6604 9 : GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
6605 : static_cast<uint8_t *>(pDst), nSrcWidth);
6606 9 : return;
6607 : }
6608 42 : if (nSrcHeight == 4)
6609 : {
6610 9 : GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
6611 : static_cast<uint8_t *>(pDst), nSrcWidth);
6612 9 : return;
6613 : }
6614 : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
6615 : (defined(__x86_64) || defined(_M_X64)))
6616 33 : if (CPLHaveRuntimeSSSE3())
6617 : {
6618 33 : GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
6619 : static_cast<uint8_t *>(pDst), nSrcWidth,
6620 : nSrcHeight);
6621 33 : return;
6622 : }
6623 : #elif defined(USE_NEON_OPTIMIZATIONS)
6624 : {
6625 : GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
6626 : static_cast<uint8_t *>(pDst), nSrcWidth,
6627 : nSrcHeight);
6628 : return;
6629 : }
6630 : #endif
6631 : }
6632 :
6633 : #define CALL_GDALTranspose2D_internal(DST_TYPE, DST_IS_COMPLEX) \
6634 : GDALTranspose2D<DST_TYPE, DST_IS_COMPLEX>( \
6635 : pSrc, eSrcType, static_cast<DST_TYPE *>(pDst), nSrcWidth, nSrcHeight)
6636 :
6637 : // clang-format off
6638 295 : switch (eDstType)
6639 : {
6640 15 : case GDT_UInt8: CALL_GDALTranspose2D_internal(uint8_t, false); break;
6641 15 : case GDT_Int8: CALL_GDALTranspose2D_internal(int8_t, false); break;
6642 33 : case GDT_UInt16: CALL_GDALTranspose2D_internal(uint16_t, false); break;
6643 20 : case GDT_Int16: CALL_GDALTranspose2D_internal(int16_t, false); break;
6644 24 : case GDT_UInt32: CALL_GDALTranspose2D_internal(uint32_t, false); break;
6645 16 : case GDT_Int32: CALL_GDALTranspose2D_internal(int32_t, false); break;
6646 16 : case GDT_UInt64: CALL_GDALTranspose2D_internal(uint64_t, false); break;
6647 16 : case GDT_Int64: CALL_GDALTranspose2D_internal(int64_t, false); break;
6648 16 : case GDT_Float16: CALL_GDALTranspose2D_internal(GFloat16, false); break;
6649 19 : case GDT_Float32: CALL_GDALTranspose2D_internal(float, false); break;
6650 25 : case GDT_Float64: CALL_GDALTranspose2D_internal(double, false); break;
6651 16 : case GDT_CInt16: CALL_GDALTranspose2D_internal(int16_t, true); break;
6652 16 : case GDT_CInt32: CALL_GDALTranspose2D_internal(int32_t, true); break;
6653 16 : case GDT_CFloat16: CALL_GDALTranspose2D_internal(GFloat16, true); break;
6654 16 : case GDT_CFloat32: CALL_GDALTranspose2D_internal(float, true); break;
6655 16 : case GDT_CFloat64: CALL_GDALTranspose2D_internal(double, true); break;
6656 0 : case GDT_Unknown:
6657 : case GDT_TypeCount:
6658 0 : break;
6659 : }
6660 : // clang-format on
6661 :
6662 : #undef CALL_GDALTranspose2D_internal
6663 : }
6664 :
6665 : /************************************************************************/
6666 : /* ExtractBitAndConvertTo255() */
6667 : /************************************************************************/
6668 :
6669 : #if defined(__GNUC__) || defined(_MSC_VER)
6670 : // Signedness of char implementation dependent, so be explicit.
6671 : // Assumes 2-complement integer types and sign extension of right shifting
6672 : // GCC guarantees such:
6673 : // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
6674 124890 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
6675 : {
6676 124890 : return static_cast<GByte>(static_cast<signed char>(byVal << (7 - nBit)) >>
6677 124890 : 7);
6678 : }
6679 : #else
6680 : // Portable way
6681 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
6682 : {
6683 : return (byVal & (1 << nBit)) ? 255 : 0;
6684 : }
6685 : #endif
6686 :
6687 : /************************************************************************/
6688 : /* ExpandEightPackedBitsToByteAt255() */
6689 : /************************************************************************/
6690 :
6691 15569 : static inline void ExpandEightPackedBitsToByteAt255(GByte byVal,
6692 : GByte abyOutput[8])
6693 : {
6694 15569 : abyOutput[0] = ExtractBitAndConvertTo255(byVal, 7);
6695 15569 : abyOutput[1] = ExtractBitAndConvertTo255(byVal, 6);
6696 15569 : abyOutput[2] = ExtractBitAndConvertTo255(byVal, 5);
6697 15569 : abyOutput[3] = ExtractBitAndConvertTo255(byVal, 4);
6698 15569 : abyOutput[4] = ExtractBitAndConvertTo255(byVal, 3);
6699 15569 : abyOutput[5] = ExtractBitAndConvertTo255(byVal, 2);
6700 15569 : abyOutput[6] = ExtractBitAndConvertTo255(byVal, 1);
6701 15569 : abyOutput[7] = ExtractBitAndConvertTo255(byVal, 0);
6702 15569 : }
6703 :
6704 : /************************************************************************/
6705 : /* GDALExpandPackedBitsToByteAt0Or255() */
6706 : /************************************************************************/
6707 :
6708 : /** Expand packed-bits (ordered from most-significant bit to least one)
6709 : into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
6710 : at 1 to a byte at 255.
6711 :
6712 : The function does (in a possibly more optimized way) the following:
6713 : \code{.cpp}
6714 : for (size_t i = 0; i < nInputBits; ++i )
6715 : {
6716 : pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 255 : 0;
6717 : }
6718 : \endcode
6719 :
6720 : @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
6721 : @param pabyOutput Output array of nInputBits bytes.
6722 : @param nInputBits Number of valid bits in pabyInput.
6723 :
6724 : @since 3.11
6725 : */
6726 :
6727 45145 : void GDALExpandPackedBitsToByteAt0Or255(const GByte *CPL_RESTRICT pabyInput,
6728 : GByte *CPL_RESTRICT pabyOutput,
6729 : size_t nInputBits)
6730 : {
6731 45145 : const size_t nInputWholeBytes = nInputBits / 8;
6732 45145 : size_t iByte = 0;
6733 :
6734 : #ifdef HAVE_SSE2
6735 : // Mask to isolate each bit
6736 45145 : const __m128i bit_mask = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4,
6737 : 8, 16, 32, 64, -128);
6738 45145 : const __m128i zero = _mm_setzero_si128();
6739 45145 : const __m128i all_ones = _mm_set1_epi8(-1);
6740 : #ifdef __SSSE3__
6741 : const __m128i dispatch_two_bytes =
6742 : _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
6743 : #endif
6744 45145 : constexpr size_t SSE_REG_SIZE = sizeof(bit_mask);
6745 135654 : for (; iByte + SSE_REG_SIZE <= nInputWholeBytes; iByte += SSE_REG_SIZE)
6746 : {
6747 90509 : __m128i reg_ori = _mm_loadu_si128(
6748 90509 : reinterpret_cast<const __m128i *>(pabyInput + iByte));
6749 :
6750 90509 : constexpr int NUM_PROCESSED_BYTES_PER_REG = 2;
6751 814581 : for (size_t k = 0; k < SSE_REG_SIZE / NUM_PROCESSED_BYTES_PER_REG; ++k)
6752 : {
6753 : // Given reg_ori = (A, B, ... 14 other bytes ...),
6754 : // expand to (A, A, A, A, A, A, A, A, B, B, B, B, B, B, B, B)
6755 : #ifdef __SSSE3__
6756 : __m128i reg = _mm_shuffle_epi8(reg_ori, dispatch_two_bytes);
6757 : #else
6758 724072 : __m128i reg = _mm_unpacklo_epi8(reg_ori, reg_ori);
6759 724072 : reg = _mm_unpacklo_epi16(reg, reg);
6760 724072 : reg = _mm_unpacklo_epi32(reg, reg);
6761 : #endif
6762 :
6763 : // Test if bits of interest are set
6764 724072 : reg = _mm_and_si128(reg, bit_mask);
6765 :
6766 : // Now test if those bits are set, by comparing to zero. So the
6767 : // result will be that bytes where bits are set will be at 0, and
6768 : // ones where they are cleared will be at 0xFF. So the inverse of
6769 : // the end result we want!
6770 724072 : reg = _mm_cmpeq_epi8(reg, zero);
6771 :
6772 : // Invert the result
6773 724072 : reg = _mm_andnot_si128(reg, all_ones);
6774 :
6775 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyOutput), reg);
6776 :
6777 724072 : pabyOutput += SSE_REG_SIZE;
6778 :
6779 : // Right-shift of 2 bytes
6780 724072 : reg_ori = _mm_bsrli_si128(reg_ori, NUM_PROCESSED_BYTES_PER_REG);
6781 : }
6782 : }
6783 :
6784 : #endif // HAVE_SSE2
6785 :
6786 60714 : for (; iByte < nInputWholeBytes; ++iByte)
6787 : {
6788 15569 : ExpandEightPackedBitsToByteAt255(pabyInput[iByte], pabyOutput);
6789 15569 : pabyOutput += 8;
6790 : }
6791 45483 : for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
6792 : {
6793 338 : *pabyOutput = ExtractBitAndConvertTo255(pabyInput[iByte], 7 - iBit);
6794 338 : ++pabyOutput;
6795 : }
6796 45145 : }
6797 :
6798 : /************************************************************************/
6799 : /* ExpandEightPackedBitsToByteAt1() */
6800 : /************************************************************************/
6801 :
6802 136113 : static inline void ExpandEightPackedBitsToByteAt1(GByte byVal,
6803 : GByte abyOutput[8])
6804 : {
6805 136113 : abyOutput[0] = (byVal >> 7) & 0x1;
6806 136113 : abyOutput[1] = (byVal >> 6) & 0x1;
6807 136113 : abyOutput[2] = (byVal >> 5) & 0x1;
6808 136113 : abyOutput[3] = (byVal >> 4) & 0x1;
6809 136113 : abyOutput[4] = (byVal >> 3) & 0x1;
6810 136113 : abyOutput[5] = (byVal >> 2) & 0x1;
6811 136113 : abyOutput[6] = (byVal >> 1) & 0x1;
6812 136113 : abyOutput[7] = (byVal >> 0) & 0x1;
6813 136113 : }
6814 :
6815 : /************************************************************************/
6816 : /* GDALExpandPackedBitsToByteAt0Or1() */
6817 : /************************************************************************/
6818 :
6819 : /** Expand packed-bits (ordered from most-significant bit to least one)
6820 : into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
6821 : at 1 to a byte at 1.
6822 :
6823 : The function does (in a possibly more optimized way) the following:
6824 : \code{.cpp}
6825 : for (size_t i = 0; i < nInputBits; ++i )
6826 : {
6827 : pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
6828 : }
6829 : \endcode
6830 :
6831 : @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
6832 : @param pabyOutput Output array of nInputBits bytes.
6833 : @param nInputBits Number of valid bits in pabyInput.
6834 :
6835 : @since 3.11
6836 : */
6837 :
6838 7041 : void GDALExpandPackedBitsToByteAt0Or1(const GByte *CPL_RESTRICT pabyInput,
6839 : GByte *CPL_RESTRICT pabyOutput,
6840 : size_t nInputBits)
6841 : {
6842 7041 : const size_t nInputWholeBytes = nInputBits / 8;
6843 7041 : size_t iByte = 0;
6844 143154 : for (; iByte < nInputWholeBytes; ++iByte)
6845 : {
6846 136113 : ExpandEightPackedBitsToByteAt1(pabyInput[iByte], pabyOutput);
6847 136113 : pabyOutput += 8;
6848 : }
6849 18902 : for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
6850 : {
6851 11861 : *pabyOutput = (pabyInput[iByte] >> (7 - iBit)) & 0x1;
6852 11861 : ++pabyOutput;
6853 : }
6854 7041 : }
|