Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: GDAL Core
4 : * Purpose: Contains default implementation of GDALRasterBand::IRasterIO()
5 : * and supporting functions of broader utility.
6 : * Author: Frank Warmerdam, warmerdam@pobox.com
7 : *
8 : ******************************************************************************
9 : * Copyright (c) 1998, Frank Warmerdam
10 : * Copyright (c) 2007-2014, Even Rouault <even dot rouault at spatialys.com>
11 : *
12 : * SPDX-License-Identifier: MIT
13 : ****************************************************************************/
14 :
15 : #include "cpl_port.h"
16 : #include "gdal.h"
17 : #include "gdal_priv.h"
18 :
19 : #include <cassert>
20 : #include <climits>
21 : #include <cmath>
22 : #include <cstddef>
23 : #include <cstdio>
24 : #include <cstdlib>
25 : #include <cstring>
26 :
27 : #include <algorithm>
28 : #include <limits>
29 : #include <stdexcept>
30 : #include <type_traits>
31 :
32 : #include "cpl_conv.h"
33 : #include "cpl_cpu_features.h"
34 : #include "cpl_error.h"
35 : #include "cpl_float.h"
36 : #include "cpl_progress.h"
37 : #include "cpl_string.h"
38 : #include "cpl_vsi.h"
39 : #include "gdal_priv_templates.hpp"
40 : #include "gdal_vrt.h"
41 : #include "gdalwarper.h"
42 : #include "memdataset.h"
43 : #include "vrtdataset.h"
44 :
45 : #if defined(__x86_64) || defined(_M_X64)
46 : #include <emmintrin.h>
47 : #define HAVE_SSE2
48 : #elif defined(USE_NEON_OPTIMIZATIONS)
49 : #include "include_sse2neon.h"
50 : #define HAVE_SSE2
51 : #endif
52 :
53 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
54 : #include "rasterio_ssse3.h"
55 : #ifdef __SSSE3__
56 : #include <tmmintrin.h>
57 : #endif
58 : #endif
59 :
60 : #ifdef __SSE4_1__
61 : #include <smmintrin.h>
62 : #endif
63 :
64 : #ifdef __GNUC__
65 : #define CPL_NOINLINE __attribute__((noinline))
66 : #else
67 : #define CPL_NOINLINE
68 : #endif
69 :
70 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
71 : int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
72 : int nDstPixelStride, GPtrDiff_t nWordCount);
73 :
74 : /************************************************************************/
75 : /* DownsamplingIntegerXFactor() */
76 : /************************************************************************/
77 :
78 : template <bool bSameDataType, int DATA_TYPE_SIZE>
79 695780 : static bool DownsamplingIntegerXFactor(
80 : GDALRasterBand *poBand, int iSrcX, int nSrcXInc, GPtrDiff_t iSrcOffsetCst,
81 : GByte *CPL_RESTRICT pabyDstData, int nPixelSpace, int nBufXSize,
82 : GDALDataType eDataType, GDALDataType eBufType, int &nStartBlockX,
83 : int nBlockXSize, GDALRasterBlock *&poBlock, int nLBlockY)
84 : {
85 695780 : const int nBandDataSize =
86 : bSameDataType ? DATA_TYPE_SIZE : GDALGetDataTypeSizeBytes(eDataType);
87 695780 : int nOuterLoopIters = nBufXSize - 1;
88 695780 : const int nIncSrcOffset = nSrcXInc * nBandDataSize;
89 : const GByte *CPL_RESTRICT pabySrcData;
90 695780 : int nEndBlockX = nBlockXSize + nStartBlockX;
91 :
92 695780 : if (iSrcX < nEndBlockX)
93 : {
94 294999 : CPLAssert(poBlock);
95 294999 : goto no_reload_block;
96 : }
97 400781 : goto reload_block;
98 :
99 : // Don't do the last iteration in the loop, as iSrcX might go beyond
100 : // nRasterXSize - 1
101 1264973 : while (--nOuterLoopIters >= 1)
102 : {
103 201834 : iSrcX += nSrcXInc;
104 201834 : pabySrcData += nIncSrcOffset;
105 201834 : pabyDstData += nPixelSpace;
106 :
107 : /* --------------------------------------------------------------------
108 : */
109 : /* Ensure we have the appropriate block loaded. */
110 : /* --------------------------------------------------------------------
111 : */
112 201834 : if (iSrcX >= nEndBlockX)
113 : {
114 201834 : reload_block:
115 : {
116 615205 : const int nLBlockX = iSrcX / nBlockXSize;
117 615205 : nStartBlockX = nLBlockX * nBlockXSize;
118 615205 : nEndBlockX = nStartBlockX + nBlockXSize;
119 :
120 615205 : if (poBlock != nullptr)
121 341376 : poBlock->DropLock();
122 :
123 615205 : poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
124 615205 : if (poBlock == nullptr)
125 : {
126 1 : return false;
127 : }
128 : }
129 :
130 615204 : no_reload_block:
131 : const GByte *pabySrcBlock =
132 1264973 : static_cast<const GByte *>(poBlock->GetDataRef());
133 1264973 : GPtrDiff_t iSrcOffset =
134 1264973 : (iSrcX - nStartBlockX + iSrcOffsetCst) * nBandDataSize;
135 1264973 : pabySrcData = pabySrcBlock + iSrcOffset;
136 : }
137 :
138 : /* --------------------------------------------------------------------
139 : */
140 : /* Copy the maximum run of pixels. */
141 : /* --------------------------------------------------------------------
142 : */
143 :
144 1264973 : const int nIters = std::min(
145 1264973 : (nEndBlockX - iSrcX + (nSrcXInc - 1)) / nSrcXInc, nOuterLoopIters);
146 : if (bSameDataType)
147 : {
148 1264530 : memcpy(pabyDstData, pabySrcData, nBandDataSize);
149 1264530 : if (nIters > 1)
150 : {
151 : if (DATA_TYPE_SIZE == 1)
152 : {
153 326250 : pabySrcData += nIncSrcOffset;
154 326250 : pabyDstData += nPixelSpace;
155 326250 : GDALFastCopyByte(pabySrcData, nIncSrcOffset, pabyDstData,
156 326250 : nPixelSpace, nIters - 1);
157 326250 : pabySrcData +=
158 326250 : static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 2);
159 326250 : pabyDstData +=
160 326250 : static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 2);
161 : }
162 : else
163 : {
164 4395716 : for (int i = 0; i < nIters - 1; i++)
165 : {
166 4197550 : pabySrcData += nIncSrcOffset;
167 4197550 : pabyDstData += nPixelSpace;
168 4197550 : memcpy(pabyDstData, pabySrcData, nBandDataSize);
169 : }
170 : }
171 524420 : iSrcX += nSrcXInc * (nIters - 1);
172 524420 : nOuterLoopIters -= nIters - 1;
173 : }
174 : }
175 : else
176 : {
177 : // Type to type conversion ...
178 443 : GDALCopyWords64(pabySrcData, eDataType, nIncSrcOffset, pabyDstData,
179 443 : eBufType, nPixelSpace, std::max(1, nIters));
180 443 : if (nIters > 1)
181 : {
182 216 : pabySrcData +=
183 216 : static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 1);
184 216 : pabyDstData +=
185 216 : static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 1);
186 216 : iSrcX += nSrcXInc * (nIters - 1);
187 216 : nOuterLoopIters -= nIters - 1;
188 : }
189 : }
190 : }
191 :
192 : // Deal with last iteration to avoid iSrcX to go beyond nRasterXSize - 1
193 1063139 : if (nOuterLoopIters == 0)
194 : {
195 367360 : const int nRasterXSize = poBand->GetXSize();
196 367360 : iSrcX =
197 734720 : static_cast<int>(std::min(static_cast<GInt64>(iSrcX) + nSrcXInc,
198 367360 : static_cast<GInt64>(nRasterXSize - 1)));
199 367360 : pabyDstData += nPixelSpace;
200 367360 : if (iSrcX < nEndBlockX)
201 : {
202 354770 : goto no_reload_block;
203 : }
204 12590 : goto reload_block;
205 : }
206 695779 : return true;
207 : }
208 :
/** Return a * b, evaluated with the usual arithmetic conversions of the two
 * operand types.
 *
 * The function is tagged with CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW,
 * presumably so that an intentional unsigned wrap-around of the product is
 * not reported by the unsigned-integer-overflow sanitizer (the macro is
 * defined outside of this file -- confirm there).
 *
 * @param a left operand.
 * @param b right operand.
 * @return the product, of the type of the expression a * b.
 */
template <class A, class B>
CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW inline auto CPLUnsanitizedMul(A a, B b)
{
    const auto product = a * b;
    return product;
}
214 :
215 : /************************************************************************/
216 : /* IRasterIO() */
217 : /* */
218 : /* Default internal implementation of RasterIO() ... utilizes */
219 : /* the Block access methods to satisfy the request. This would */
220 : /* normally only be overridden by formats with overviews. */
221 : /************************************************************************/
222 :
223 6119250 : CPLErr GDALRasterBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
224 : int nXSize, int nYSize, void *pData,
225 : int nBufXSize, int nBufYSize,
226 : GDALDataType eBufType, GSpacing nPixelSpace,
227 : GSpacing nLineSpace,
228 : GDALRasterIOExtraArg *psExtraArg)
229 :
230 : {
231 6119250 : if (eRWFlag == GF_Write && eFlushBlockErr != CE_None)
232 : {
233 0 : CPLError(eFlushBlockErr, CPLE_AppDefined,
234 : "An error occurred while writing a dirty block "
235 : "from GDALRasterBand::IRasterIO");
236 0 : CPLErr eErr = eFlushBlockErr;
237 0 : eFlushBlockErr = CE_None;
238 0 : return eErr;
239 : }
240 6119250 : if (nBlockXSize <= 0 || nBlockYSize <= 0)
241 : {
242 0 : CPLError(CE_Failure, CPLE_AppDefined, "Invalid block size");
243 0 : return CE_Failure;
244 : }
245 :
246 6119250 : const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
247 6119250 : const int nBufDataSize = GDALGetDataTypeSizeBytes(eBufType);
248 6119250 : GByte dummyBlock[2] = {0, 0};
249 6119250 : GByte *pabySrcBlock =
250 : dummyBlock; /* to avoid Coverity warning about nullptr dereference */
251 6119250 : GDALRasterBlock *poBlock = nullptr;
252 6119250 : const bool bUseIntegerRequestCoords =
253 6466460 : (!psExtraArg->bFloatingPointWindowValidity ||
254 347211 : (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
255 323819 : nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
256 :
257 : /* ==================================================================== */
258 : /* A common case is the data requested with the destination */
259 : /* is packed, and the block width is the raster width. */
260 : /* ==================================================================== */
261 6041750 : if (nPixelSpace == nBufDataSize && nLineSpace == nPixelSpace * nXSize &&
262 3191810 : nBlockXSize == GetXSize() && nBufXSize == nXSize &&
263 12161000 : nBufYSize == nYSize && bUseIntegerRequestCoords)
264 : {
265 3079270 : CPLErr eErr = CE_None;
266 3079270 : int nLBlockY = -1;
267 :
268 9621000 : for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
269 : {
270 6542810 : const int iSrcY = iBufYOff + nYOff;
271 :
272 6542810 : if (iSrcY < nLBlockY * nBlockYSize ||
273 6542810 : iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
274 : {
275 3339130 : nLBlockY = iSrcY / nBlockYSize;
276 3339130 : bool bJustInitialize =
277 295478 : eRWFlag == GF_Write && nXOff == 0 &&
278 3691750 : nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
279 57137 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize;
280 :
281 : // Is this a partial tile at right and/or bottom edges of
282 : // the raster, and that is going to be completely written?
283 : // If so, do not load it from storage, but zero it so that
284 : // the content outside of the validity area is initialized.
285 3339130 : bool bMemZeroBuffer = false;
286 295478 : if (eRWFlag == GF_Write && !bJustInitialize && nXOff == 0 &&
287 23861 : nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
288 3634700 : nYOff + nYSize == GetYSize() &&
289 89 : nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)
290 : {
291 89 : bJustInitialize = true;
292 89 : bMemZeroBuffer = true;
293 : }
294 :
295 3339130 : if (poBlock)
296 259858 : poBlock->DropLock();
297 :
298 3339130 : const GUInt32 nErrorCounter = CPLGetErrorCounter();
299 3339130 : poBlock = GetLockedBlockRef(0, nLBlockY, bJustInitialize);
300 3339130 : if (poBlock == nullptr)
301 : {
302 1079 : if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
303 : nullptr)
304 : {
305 0 : CPLError(CE_Failure, CPLE_AppDefined,
306 : "GetBlockRef failed at X block offset %d, "
307 : "Y block offset %d%s",
308 : 0, nLBlockY,
309 0 : (nErrorCounter != CPLGetErrorCounter())
310 0 : ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
311 : : "");
312 : }
313 1079 : eErr = CE_Failure;
314 1079 : break;
315 : }
316 :
317 3338050 : if (eRWFlag == GF_Write)
318 295478 : poBlock->MarkDirty();
319 :
320 3338050 : pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
321 3338050 : if (bMemZeroBuffer)
322 : {
323 89 : memset(pabySrcBlock, 0,
324 89 : static_cast<GPtrDiff_t>(nBandDataSize) *
325 89 : nBlockXSize * nBlockYSize);
326 : }
327 : }
328 :
329 6541730 : const auto nSrcByteOffset =
330 6541730 : (static_cast<GPtrDiff_t>(iSrcY - nLBlockY * nBlockYSize) *
331 6541730 : nBlockXSize +
332 6541730 : nXOff) *
333 6541730 : nBandDataSize;
334 :
335 6541730 : if (eDataType == eBufType)
336 : {
337 2893900 : if (eRWFlag == GF_Read)
338 2423290 : memcpy(static_cast<GByte *>(pData) +
339 2423290 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
340 2423290 : pabySrcBlock + nSrcByteOffset,
341 : static_cast<size_t>(nLineSpace));
342 : else
343 470615 : memcpy(pabySrcBlock + nSrcByteOffset,
344 470615 : static_cast<GByte *>(pData) +
345 470615 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
346 : static_cast<size_t>(nLineSpace));
347 : }
348 : else
349 : {
350 : // Type to type conversion.
351 3647820 : if (eRWFlag == GF_Read)
352 3626220 : GDALCopyWords64(
353 3626220 : pabySrcBlock + nSrcByteOffset, eDataType, nBandDataSize,
354 : static_cast<GByte *>(pData) +
355 3626220 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
356 : eBufType, static_cast<int>(nPixelSpace), nBufXSize);
357 : else
358 21603 : GDALCopyWords64(static_cast<GByte *>(pData) +
359 21603 : static_cast<GPtrDiff_t>(iBufYOff) *
360 : nLineSpace,
361 : eBufType, static_cast<int>(nPixelSpace),
362 21603 : pabySrcBlock + nSrcByteOffset, eDataType,
363 : nBandDataSize, nBufXSize);
364 : }
365 :
366 6625330 : if (psExtraArg->pfnProgress != nullptr &&
367 83604 : !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
368 : psExtraArg->pProgressData))
369 : {
370 5 : eErr = CE_Failure;
371 5 : break;
372 : }
373 : }
374 :
375 3079270 : if (poBlock)
376 3078190 : poBlock->DropLock();
377 :
378 3079270 : return eErr;
379 : }
380 :
381 : /* ==================================================================== */
382 : /* Do we have overviews that would be appropriate to satisfy */
383 : /* this request? */
384 : /* ==================================================================== */
385 3039970 : if ((nBufXSize < nXSize || nBufYSize < nYSize) && GetOverviewCount() > 0 &&
386 : eRWFlag == GF_Read)
387 : {
388 : GDALRasterIOExtraArg sExtraArg;
389 2967 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
390 :
391 : const int nOverview =
392 2967 : GDALBandGetBestOverviewLevel2(this, nXOff, nYOff, nXSize, nYSize,
393 : nBufXSize, nBufYSize, &sExtraArg);
394 2967 : if (nOverview >= 0)
395 : {
396 2892 : GDALRasterBand *poOverviewBand = GetOverview(nOverview);
397 2892 : if (poOverviewBand == nullptr)
398 2892 : return CE_Failure;
399 :
400 2892 : return poOverviewBand->RasterIO(
401 : eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
402 2892 : nBufYSize, eBufType, nPixelSpace, nLineSpace, &sExtraArg);
403 : }
404 : }
405 :
406 848318 : if (eRWFlag == GF_Read && nBufXSize < nXSize / 100 &&
407 6 : nBufYSize < nYSize / 100 && nPixelSpace == nBufDataSize &&
408 3885410 : nLineSpace == nPixelSpace * nBufXSize &&
409 6 : CPLTestBool(CPLGetConfigOption("GDAL_NO_COSTLY_OVERVIEW", "NO")))
410 : {
411 0 : memset(pData, 0, static_cast<size_t>(nLineSpace * nBufYSize));
412 0 : return CE_None;
413 : }
414 :
415 : /* ==================================================================== */
416 : /* The second case when we don't need subsample data but likely */
417 : /* need data type conversion. */
418 : /* ==================================================================== */
419 3037080 : if ( // nPixelSpace == nBufDataSize &&
420 3037080 : nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
421 : {
422 : #if DEBUG_VERBOSE
423 : printf("IRasterIO(%d,%d,%d,%d) rw=%d case 2\n", /*ok*/
424 : nXOff, nYOff, nXSize, nYSize, static_cast<int>(eRWFlag));
425 : #endif
426 :
427 : /* --------------------------------------------------------------------
428 : */
429 : /* Loop over buffer computing source locations. */
430 : /* --------------------------------------------------------------------
431 : */
432 : // Calculate starting values out of loop
433 2471450 : const int nLBlockXStart = nXOff / nBlockXSize;
434 2471450 : const int nXSpanEnd = nBufXSize + nXOff;
435 :
436 2471450 : int nYInc = 0;
437 4982650 : for (int iBufYOff = 0, iSrcY = nYOff; iBufYOff < nBufYSize;
438 2511200 : iBufYOff += nYInc, iSrcY += nYInc)
439 : {
440 2511280 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
441 : static_cast<GPtrDiff_t>(nLineSpace);
442 2511280 : int nLBlockY = iSrcY / nBlockYSize;
443 2511280 : int nLBlockX = nLBlockXStart;
444 2511280 : int iSrcX = nXOff;
445 5243200 : while (iSrcX < nXSpanEnd)
446 : {
447 2732000 : int nXSpan = nLBlockX * nBlockXSize;
448 2732000 : if (nXSpan < INT_MAX - nBlockXSize)
449 2732000 : nXSpan += nBlockXSize;
450 : else
451 0 : nXSpan = INT_MAX;
452 2732000 : const int nXRight = nXSpan;
453 2732000 : nXSpan = (nXSpan < nXSpanEnd ? nXSpan : nXSpanEnd) - iSrcX;
454 :
455 : const size_t nXSpanSize =
456 2732000 : CPLUnsanitizedMul(nXSpan, static_cast<size_t>(nPixelSpace));
457 :
458 2732000 : bool bJustInitialize =
459 2042260 : eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
460 37317 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
461 4799900 : nXOff <= nLBlockX * nBlockXSize &&
462 25639 : nXOff + nXSize >= nXRight;
463 :
464 : // Is this a partial tile at right and/or bottom edges of
465 : // the raster, and that is going to be completely written?
466 : // If so, do not load it from storage, but zero it so that
467 : // the content outside of the validity area is initialized.
468 2732000 : bool bMemZeroBuffer = false;
469 2042260 : if (eRWFlag == GF_Write && !bJustInitialize &&
470 2017850 : nXOff <= nLBlockX * nBlockXSize &&
471 2016200 : nYOff <= nLBlockY * nBlockYSize &&
472 12152 : (nXOff + nXSize >= nXRight ||
473 : // cppcheck-suppress knownConditionTrueFalse
474 4776970 : (nXOff + nXSize == GetXSize() && nXRight > GetXSize())) &&
475 11972 : (nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize ||
476 10750 : (nYOff + nYSize == GetYSize() &&
477 1958 : nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)))
478 : {
479 3180 : bJustInitialize = true;
480 3180 : bMemZeroBuffer = true;
481 : }
482 :
483 : /* --------------------------------------------------------------------
484 : */
485 : /* Ensure we have the appropriate block loaded. */
486 : /* --------------------------------------------------------------------
487 : */
488 2732000 : const GUInt32 nErrorCounter = CPLGetErrorCounter();
489 2732000 : poBlock =
490 2732000 : GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
491 2732000 : if (!poBlock)
492 : {
493 73 : if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
494 : nullptr)
495 : {
496 0 : CPLError(CE_Failure, CPLE_AppDefined,
497 : "GetBlockRef failed at X block offset %d, "
498 : "Y block offset %d%s",
499 : nLBlockX, nLBlockY,
500 0 : (nErrorCounter != CPLGetErrorCounter())
501 0 : ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
502 : : "");
503 : }
504 73 : return (CE_Failure);
505 : }
506 :
507 2731930 : if (eRWFlag == GF_Write)
508 2042260 : poBlock->MarkDirty();
509 :
510 2731930 : pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
511 2731930 : if (bMemZeroBuffer)
512 : {
513 3180 : memset(pabySrcBlock, 0,
514 3180 : static_cast<GPtrDiff_t>(nBandDataSize) *
515 3180 : nBlockXSize * nBlockYSize);
516 : }
517 : /* --------------------------------------------------------------------
518 : */
519 : /* Copy over this chunk of data. */
520 : /* --------------------------------------------------------------------
521 : */
522 2731930 : GPtrDiff_t iSrcOffset =
523 2731930 : (static_cast<GPtrDiff_t>(iSrcX) -
524 2731930 : static_cast<GPtrDiff_t>(nLBlockX * nBlockXSize) +
525 2731930 : (static_cast<GPtrDiff_t>(iSrcY) -
526 2731930 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
527 2731930 : nBlockXSize) *
528 2731930 : nBandDataSize;
529 : // Fill up as many rows as possible for the loaded block.
530 5463860 : const int kmax = std::min(nBlockYSize - (iSrcY % nBlockYSize),
531 2731930 : nBufYSize - iBufYOff);
532 59818000 : for (int k = 0; k < kmax; k++)
533 : {
534 57086100 : if (eDataType == eBufType && nPixelSpace == nBufDataSize)
535 : {
536 53132800 : if (eRWFlag == GF_Read)
537 48694800 : memcpy(static_cast<GByte *>(pData) + iBufOffset +
538 48694800 : static_cast<GPtrDiff_t>(k) * nLineSpace,
539 48694800 : pabySrcBlock + iSrcOffset, nXSpanSize);
540 : else
541 4438030 : memcpy(pabySrcBlock + iSrcOffset,
542 4438030 : static_cast<GByte *>(pData) + iBufOffset +
543 4438030 : static_cast<GPtrDiff_t>(k) * nLineSpace,
544 : nXSpanSize);
545 : }
546 : else
547 : {
548 : /* type to type conversion */
549 3953230 : if (eRWFlag == GF_Read)
550 3896460 : GDALCopyWords64(
551 3896460 : pabySrcBlock + iSrcOffset, eDataType,
552 : nBandDataSize,
553 3896460 : static_cast<GByte *>(pData) + iBufOffset +
554 3896460 : static_cast<GPtrDiff_t>(k) * nLineSpace,
555 : eBufType, static_cast<int>(nPixelSpace),
556 : nXSpan);
557 : else
558 56776 : GDALCopyWords64(
559 56776 : static_cast<GByte *>(pData) + iBufOffset +
560 56776 : static_cast<GPtrDiff_t>(k) * nLineSpace,
561 : eBufType, static_cast<int>(nPixelSpace),
562 56776 : pabySrcBlock + iSrcOffset, eDataType,
563 : nBandDataSize, nXSpan);
564 : }
565 :
566 57086100 : iSrcOffset +=
567 57086100 : static_cast<GPtrDiff_t>(nBlockXSize) * nBandDataSize;
568 : }
569 :
570 : iBufOffset =
571 2731930 : CPLUnsanitizedAdd<GPtrDiff_t>(iBufOffset, nXSpanSize);
572 2731930 : nLBlockX++;
573 2731930 : iSrcX += nXSpan;
574 :
575 2731930 : poBlock->DropLock();
576 2731930 : poBlock = nullptr;
577 : }
578 :
579 : /* Compute the increment to go on a block boundary */
580 2511200 : nYInc = nBlockYSize - (iSrcY % nBlockYSize);
581 :
582 2513060 : if (psExtraArg->pfnProgress != nullptr &&
583 1856 : !psExtraArg->pfnProgress(
584 2513060 : 1.0 * std::min(nBufYSize, iBufYOff + nYInc) / nBufYSize, "",
585 : psExtraArg->pProgressData))
586 : {
587 0 : return CE_Failure;
588 : }
589 : }
590 :
591 2471380 : return CE_None;
592 : }
593 :
594 : /* ==================================================================== */
595 : /* Loop reading required source blocks to satisfy output */
596 : /* request. This is the most general implementation. */
597 : /* ==================================================================== */
598 :
599 565633 : double dfXOff = nXOff;
600 565633 : double dfYOff = nYOff;
601 565633 : double dfXSize = nXSize;
602 565633 : double dfYSize = nYSize;
603 565633 : if (psExtraArg->bFloatingPointWindowValidity)
604 : {
605 230638 : dfXOff = psExtraArg->dfXOff;
606 230638 : dfYOff = psExtraArg->dfYOff;
607 230638 : dfXSize = psExtraArg->dfXSize;
608 230638 : dfYSize = psExtraArg->dfYSize;
609 : }
610 :
611 : /* -------------------------------------------------------------------- */
612 : /* Compute stepping increment. */
613 : /* -------------------------------------------------------------------- */
614 565633 : const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
615 565633 : const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
616 565633 : CPLErr eErr = CE_None;
617 :
618 565633 : if (eRWFlag == GF_Write)
619 : {
620 : /* --------------------------------------------------------------------
621 : */
622 : /* Write case */
623 : /* Loop over raster window computing source locations in the buffer.
624 : */
625 : /* --------------------------------------------------------------------
626 : */
627 166655 : GByte *pabyDstBlock = nullptr;
628 166655 : int nLBlockX = -1;
629 166655 : int nLBlockY = -1;
630 :
631 1260010 : for (int iDstY = nYOff; iDstY < nYOff + nYSize; iDstY++)
632 : {
633 1093360 : const int iBufYOff = static_cast<int>((iDstY - nYOff) / dfSrcYInc);
634 :
635 12384200 : for (int iDstX = nXOff; iDstX < nXOff + nXSize; iDstX++)
636 : {
637 11290800 : const int iBufXOff =
638 11290800 : static_cast<int>((iDstX - nXOff) / dfSrcXInc);
639 11290800 : GPtrDiff_t iBufOffset =
640 11290800 : static_cast<GPtrDiff_t>(iBufYOff) *
641 : static_cast<GPtrDiff_t>(nLineSpace) +
642 11290800 : iBufXOff * static_cast<GPtrDiff_t>(nPixelSpace);
643 :
644 : // FIXME: this code likely doesn't work if the dirty block gets
645 : // flushed to disk before being completely written.
646 : // In the meantime, bJustInitialize should probably be set to
647 : // FALSE even if it is not ideal performance wise, and for
648 : // lossy compression.
649 :
650 : /* --------------------------------------------------------------------
651 : */
652 : /* Ensure we have the appropriate block loaded. */
653 : /* --------------------------------------------------------------------
654 : */
655 11290800 : if (iDstX < nLBlockX * nBlockXSize ||
656 11041500 : iDstX - nBlockXSize >= nLBlockX * nBlockXSize ||
657 10584800 : iDstY < nLBlockY * nBlockYSize ||
658 10584800 : iDstY - nBlockYSize >= nLBlockY * nBlockYSize)
659 : {
660 738702 : nLBlockX = iDstX / nBlockXSize;
661 738702 : nLBlockY = iDstY / nBlockYSize;
662 :
663 738702 : const bool bJustInitialize =
664 1065990 : nYOff <= nLBlockY * nBlockYSize &&
665 327291 : nYOff + nYSize - nBlockYSize >=
666 327291 : nLBlockY * nBlockYSize &&
667 1116320 : nXOff <= nLBlockX * nBlockXSize &&
668 50325 : nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
669 : /*bool bMemZeroBuffer = FALSE;
670 : if( !bJustInitialize &&
671 : nXOff <= nLBlockX * nBlockXSize &&
672 : nYOff <= nLBlockY * nBlockYSize &&
673 : (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
674 : (nXOff + nXSize == GetXSize() &&
675 : (nLBlockX+1) * nBlockXSize > GetXSize())) &&
676 : (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
677 : (nYOff + nYSize == GetYSize() &&
678 : (nLBlockY+1) * nBlockYSize > GetYSize())) )
679 : {
680 : bJustInitialize = TRUE;
681 : bMemZeroBuffer = TRUE;
682 : }*/
683 738702 : if (poBlock != nullptr)
684 572047 : poBlock->DropLock();
685 :
686 738702 : poBlock =
687 738702 : GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
688 738702 : if (poBlock == nullptr)
689 : {
690 0 : return (CE_Failure);
691 : }
692 :
693 738702 : poBlock->MarkDirty();
694 :
695 738702 : pabyDstBlock = static_cast<GByte *>(poBlock->GetDataRef());
696 : /*if( bMemZeroBuffer )
697 : {
698 : memset(pabyDstBlock, 0,
699 : static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
700 : * nBlockYSize);
701 : }*/
702 : }
703 :
704 : // To make Coverity happy. Should not happen by design.
705 11290800 : if (pabyDstBlock == nullptr)
706 : {
707 0 : CPLAssert(false);
708 : eErr = CE_Failure;
709 : break;
710 : }
711 :
712 : /* --------------------------------------------------------------------
713 : */
714 : /* Copy over this pixel of data. */
715 : /* --------------------------------------------------------------------
716 : */
717 11290800 : GPtrDiff_t iDstOffset =
718 11290800 : (static_cast<GPtrDiff_t>(iDstX) -
719 11290800 : static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
720 11290800 : (static_cast<GPtrDiff_t>(iDstY) -
721 11290800 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
722 11290800 : nBlockXSize) *
723 11290800 : nBandDataSize;
724 :
725 11290800 : if (eDataType == eBufType)
726 : {
727 11287700 : memcpy(pabyDstBlock + iDstOffset,
728 11287700 : static_cast<GByte *>(pData) + iBufOffset,
729 : nBandDataSize);
730 : }
731 : else
732 : {
733 : /* type to type conversion ... ouch, this is expensive way
734 : of handling single words */
735 3096 : GDALCopyWords64(static_cast<GByte *>(pData) + iBufOffset,
736 3096 : eBufType, 0, pabyDstBlock + iDstOffset,
737 : eDataType, 0, 1);
738 : }
739 : }
740 :
741 1093360 : if (psExtraArg->pfnProgress != nullptr &&
742 0 : !psExtraArg->pfnProgress(1.0 * (iDstY - nYOff + 1) / nYSize, "",
743 : psExtraArg->pProgressData))
744 : {
745 0 : eErr = CE_Failure;
746 0 : break;
747 : }
748 : }
749 : }
750 : else
751 : {
752 398978 : if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
753 : {
754 9543 : if ((psExtraArg->eResampleAlg == GRIORA_Cubic ||
755 2719 : psExtraArg->eResampleAlg == GRIORA_CubicSpline ||
756 2681 : psExtraArg->eResampleAlg == GRIORA_Bilinear ||
757 6865 : psExtraArg->eResampleAlg == GRIORA_Lanczos) &&
758 3191 : GetColorTable() != nullptr)
759 : {
760 0 : CPLError(CE_Warning, CPLE_NotSupported,
761 : "Resampling method not supported on paletted band. "
762 : "Falling back to nearest neighbour");
763 : }
764 3415 : else if (psExtraArg->eResampleAlg == GRIORA_Gauss &&
765 3 : GDALDataTypeIsComplex(eDataType))
766 : {
767 0 : CPLError(CE_Warning, CPLE_NotSupported,
768 : "Resampling method not supported on complex data type "
769 : "band. Falling back to nearest neighbour");
770 : }
771 : else
772 : {
773 3412 : return RasterIOResampled(eRWFlag, nXOff, nYOff, nXSize, nYSize,
774 : pData, nBufXSize, nBufYSize, eBufType,
775 3412 : nPixelSpace, nLineSpace, psExtraArg);
776 : }
777 : }
778 :
779 395566 : int nLimitBlockY = 0;
780 395566 : const bool bByteCopy = eDataType == eBufType && nBandDataSize == 1;
781 395566 : int nStartBlockX = -nBlockXSize;
782 395566 : constexpr double EPS = 1e-10;
783 395566 : int nLBlockY = -1;
784 395566 : const double dfSrcXStart = 0.5 * dfSrcXInc + dfXOff + EPS;
785 395566 : const bool bIntegerXFactor =
786 372877 : bUseIntegerRequestCoords &&
787 669395 : static_cast<int>(dfSrcXInc) == dfSrcXInc &&
788 273829 : static_cast<int>(dfSrcXInc) < INT_MAX / nBandDataSize;
789 :
790 : /* --------------------------------------------------------------------
791 : */
792 : /* Read case */
793 : /* Loop over buffer computing source locations. */
794 : /* --------------------------------------------------------------------
795 : */
796 2469430 : for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
797 : {
798 : // Add small epsilon to avoid some numeric precision issues.
799 2073880 : const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
800 2073880 : const int iSrcY = static_cast<int>(std::min(
801 2073880 : std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
802 :
803 2073880 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
804 : static_cast<GPtrDiff_t>(nLineSpace);
805 :
806 2073880 : if (iSrcY >= nLimitBlockY)
807 : {
808 437822 : nLBlockY = iSrcY / nBlockYSize;
809 437822 : nLimitBlockY = nLBlockY * nBlockYSize;
810 437822 : if (nLimitBlockY < INT_MAX - nBlockYSize)
811 437822 : nLimitBlockY += nBlockYSize;
812 : else
813 0 : nLimitBlockY = INT_MAX;
814 : // Make sure a new block is loaded.
815 437822 : nStartBlockX = -nBlockXSize;
816 : }
817 1636050 : else if (static_cast<int>(dfSrcXStart) < nStartBlockX)
818 : {
819 : // Make sure a new block is loaded.
820 441987 : nStartBlockX = -nBlockXSize;
821 : }
822 :
823 2073880 : GPtrDiff_t iSrcOffsetCst = (iSrcY - nLBlockY * nBlockYSize) *
824 2073880 : static_cast<GPtrDiff_t>(nBlockXSize);
825 :
826 2073880 : if (bIntegerXFactor)
827 : {
828 695780 : int iSrcX = static_cast<int>(dfSrcXStart);
829 695780 : const int nSrcXInc = static_cast<int>(dfSrcXInc);
830 695780 : GByte *pabyDstData = static_cast<GByte *>(pData) + iBufOffset;
831 695780 : bool bRet = false;
832 695780 : if (bByteCopy)
833 : {
834 585773 : bRet = DownsamplingIntegerXFactor<true, 1>(
835 : this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
836 : static_cast<int>(nPixelSpace), nBufXSize, GDT_UInt8,
837 : GDT_UInt8, nStartBlockX, nBlockXSize, poBlock,
838 : nLBlockY);
839 : }
840 110007 : else if (eDataType == eBufType)
841 : {
842 109782 : switch (nBandDataSize)
843 : {
844 109630 : case 2:
845 109630 : bRet = DownsamplingIntegerXFactor<true, 2>(
846 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
847 : pabyDstData, static_cast<int>(nPixelSpace),
848 : nBufXSize, eDataType, eDataType, nStartBlockX,
849 : nBlockXSize, poBlock, nLBlockY);
850 109630 : break;
851 54 : case 4:
852 54 : bRet = DownsamplingIntegerXFactor<true, 4>(
853 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
854 : pabyDstData, static_cast<int>(nPixelSpace),
855 : nBufXSize, eDataType, eDataType, nStartBlockX,
856 : nBlockXSize, poBlock, nLBlockY);
857 54 : break;
858 96 : case 8:
859 96 : bRet = DownsamplingIntegerXFactor<true, 8>(
860 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
861 : pabyDstData, static_cast<int>(nPixelSpace),
862 : nBufXSize, eDataType, eDataType, nStartBlockX,
863 : nBlockXSize, poBlock, nLBlockY);
864 96 : break;
865 2 : case 16:
866 2 : bRet = DownsamplingIntegerXFactor<true, 16>(
867 : this, iSrcX, nSrcXInc, iSrcOffsetCst,
868 : pabyDstData, static_cast<int>(nPixelSpace),
869 : nBufXSize, eDataType, eDataType, nStartBlockX,
870 : nBlockXSize, poBlock, nLBlockY);
871 2 : break;
872 0 : default:
873 0 : CPLAssert(false);
874 : break;
875 : }
876 : }
877 : else
878 : {
879 225 : bRet = DownsamplingIntegerXFactor<false, 0>(
880 : this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
881 : static_cast<int>(nPixelSpace), nBufXSize, eDataType,
882 : eBufType, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
883 : }
884 695780 : if (!bRet)
885 1 : eErr = CE_Failure;
886 : }
887 : else
888 : {
889 1378100 : double dfSrcX = dfSrcXStart;
890 598175000 : for (int iBufXOff = 0; iBufXOff < nBufXSize;
891 596797000 : iBufXOff++, dfSrcX += dfSrcXInc)
892 : {
893 : // TODO?: try to avoid the clamping for most iterations
894 : const int iSrcX = static_cast<int>(
895 1193590000 : std::min(std::max(0.0, dfSrcX),
896 596797000 : static_cast<double>(nRasterXSize - 1)));
897 :
898 : /* --------------------------------------------------------------------
899 : */
900 : /* Ensure we have the appropriate block loaded. */
901 : /* --------------------------------------------------------------------
902 : */
903 596797000 : if (iSrcX >= nBlockXSize + nStartBlockX)
904 : {
905 1706900 : const int nLBlockX = iSrcX / nBlockXSize;
906 1706900 : nStartBlockX = nLBlockX * nBlockXSize;
907 :
908 1706900 : if (poBlock != nullptr)
909 1585160 : poBlock->DropLock();
910 :
911 1706900 : poBlock = GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
912 1706900 : if (poBlock == nullptr)
913 : {
914 9 : eErr = CE_Failure;
915 9 : break;
916 : }
917 :
918 : pabySrcBlock =
919 1706890 : static_cast<GByte *>(poBlock->GetDataRef());
920 : }
921 596797000 : const GPtrDiff_t nDiffX =
922 596797000 : static_cast<GPtrDiff_t>(iSrcX - nStartBlockX);
923 :
924 : /* --------------------------------------------------------------------
925 : */
926 : /* Copy over this pixel of data. */
927 : /* --------------------------------------------------------------------
928 : */
929 :
930 596797000 : if (bByteCopy)
931 : {
932 540998000 : GPtrDiff_t iSrcOffset = nDiffX + iSrcOffsetCst;
933 540998000 : static_cast<GByte *>(pData)[iBufOffset] =
934 540998000 : pabySrcBlock[iSrcOffset];
935 : }
936 55799000 : else if (eDataType == eBufType)
937 : {
938 50322800 : GPtrDiff_t iSrcOffset =
939 50322800 : (nDiffX + iSrcOffsetCst) * nBandDataSize;
940 50322800 : memcpy(static_cast<GByte *>(pData) + iBufOffset,
941 50322800 : pabySrcBlock + iSrcOffset, nBandDataSize);
942 : }
943 : else
944 : {
945 : // Type to type conversion ...
946 5476160 : GPtrDiff_t iSrcOffset =
947 5476160 : (nDiffX + iSrcOffsetCst) * nBandDataSize;
948 5476160 : GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
949 : static_cast<GByte *>(pData) +
950 5476160 : iBufOffset,
951 : eBufType, 0, 1);
952 : }
953 :
954 596797000 : iBufOffset += static_cast<int>(nPixelSpace);
955 : }
956 : }
957 2073880 : if (eErr == CE_Failure)
958 11 : break;
959 :
960 2315150 : if (psExtraArg->pfnProgress != nullptr &&
961 241284 : !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
962 : psExtraArg->pProgressData))
963 : {
964 1 : eErr = CE_Failure;
965 1 : break;
966 : }
967 : }
968 : }
969 :
970 562221 : if (poBlock != nullptr)
971 562211 : poBlock->DropLock();
972 :
973 562221 : return eErr;
974 : }
975 :
976 : /************************************************************************/
977 : /* GDALRasterIOTransformer() */
978 : /************************************************************************/
979 :
/** Per-axis scale and offset consumed by GDALRasterIOTransformer().
 *
 *  In the destination-to-source direction a coordinate is mapped as
 *  src = dst * ratio + offset; the reverse direction applies the inverse.
 */
struct GDALRasterIOTransformerStruct
{
    double dfXOff;           //!< Offset added on the X axis (dst -> src).
    double dfYOff;           //!< Offset added on the Y axis (dst -> src).
    double dfXRatioDstToSrc; //!< X scale factor applied to dst coordinates.
    double dfYRatioDstToSrc; //!< Y scale factor applied to dst coordinates.
};
987 :
988 6748 : static int GDALRasterIOTransformer(void *pTransformerArg, int bDstToSrc,
989 : int nPointCount, double *x, double *y,
990 : double * /* z */, int *panSuccess)
991 : {
992 6748 : GDALRasterIOTransformerStruct *psParams =
993 : static_cast<GDALRasterIOTransformerStruct *>(pTransformerArg);
994 6748 : if (bDstToSrc)
995 : {
996 252996 : for (int i = 0; i < nPointCount; i++)
997 : {
998 246836 : x[i] = x[i] * psParams->dfXRatioDstToSrc + psParams->dfXOff;
999 246836 : y[i] = y[i] * psParams->dfYRatioDstToSrc + psParams->dfYOff;
1000 246836 : panSuccess[i] = TRUE;
1001 : }
1002 : }
1003 : else
1004 : {
1005 1176 : for (int i = 0; i < nPointCount; i++)
1006 : {
1007 588 : x[i] = (x[i] - psParams->dfXOff) / psParams->dfXRatioDstToSrc;
1008 588 : y[i] = (y[i] - psParams->dfYOff) / psParams->dfYRatioDstToSrc;
1009 588 : panSuccess[i] = TRUE;
1010 : }
1011 : }
1012 6748 : return TRUE;
1013 : }
1014 :
1015 : /************************************************************************/
1016 : /* RasterIOResampled() */
1017 : /************************************************************************/
1018 :
1019 : //! @cond Doxygen_Suppress
1020 3412 : CPLErr GDALRasterBand::RasterIOResampled(
1021 : GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
1022 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
1023 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
1024 : {
1025 : // Determine if we use warping resampling or overview resampling
1026 : const bool bUseWarp =
1027 3412 : (GDALDataTypeIsComplex(eDataType) &&
1028 3571 : psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
1029 159 : psExtraArg->eResampleAlg != GRIORA_Mode);
1030 :
1031 3412 : double dfXOff = nXOff;
1032 3412 : double dfYOff = nYOff;
1033 3412 : double dfXSize = nXSize;
1034 3412 : double dfYSize = nYSize;
1035 3412 : if (psExtraArg->bFloatingPointWindowValidity)
1036 : {
1037 2717 : dfXOff = psExtraArg->dfXOff;
1038 2717 : dfYOff = psExtraArg->dfYOff;
1039 2717 : dfXSize = psExtraArg->dfXSize;
1040 2717 : dfYSize = psExtraArg->dfYSize;
1041 : }
1042 :
1043 3412 : const double dfXRatioDstToSrc = dfXSize / nBufXSize;
1044 3412 : const double dfYRatioDstToSrc = dfYSize / nBufYSize;
1045 :
1046 : // Determine the coordinates in the "virtual" output raster to see
1047 : // if there are not integers, in which case we will use them as a shift
1048 : // so that subwindow extracts give the exact same results as entire raster
1049 : // scaling.
1050 3412 : double dfDestXOff = dfXOff / dfXRatioDstToSrc;
1051 3412 : bool bHasXOffVirtual = false;
1052 3412 : int nDestXOffVirtual = 0;
1053 3412 : if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
1054 : {
1055 3084 : bHasXOffVirtual = true;
1056 3084 : dfXOff = nXOff;
1057 3084 : nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
1058 : }
1059 :
1060 3412 : double dfDestYOff = dfYOff / dfYRatioDstToSrc;
1061 3412 : bool bHasYOffVirtual = false;
1062 3412 : int nDestYOffVirtual = 0;
1063 3412 : if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
1064 : {
1065 3080 : bHasYOffVirtual = true;
1066 3080 : dfYOff = nYOff;
1067 3080 : nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
1068 : }
1069 :
1070 : // Create a MEM dataset that wraps the output buffer.
1071 : GDALDataset *poMEMDS;
1072 3412 : void *pTempBuffer = nullptr;
1073 3412 : GSpacing nPSMem = nPixelSpace;
1074 3412 : GSpacing nLSMem = nLineSpace;
1075 3412 : void *pDataMem = pData;
1076 3412 : GDALDataType eDTMem = eBufType;
1077 3412 : if (eBufType != eDataType)
1078 : {
1079 44 : nPSMem = GDALGetDataTypeSizeBytes(eDataType);
1080 44 : nLSMem = nPSMem * nBufXSize;
1081 : pTempBuffer =
1082 44 : VSI_MALLOC2_VERBOSE(nBufYSize, static_cast<size_t>(nLSMem));
1083 44 : if (pTempBuffer == nullptr)
1084 0 : return CE_Failure;
1085 44 : pDataMem = pTempBuffer;
1086 44 : eDTMem = eDataType;
1087 : }
1088 :
1089 : poMEMDS =
1090 3412 : MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
1091 : nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr);
1092 3412 : GByte *pabyData = static_cast<GByte *>(pDataMem) -
1093 3412 : nPSMem * nDestXOffVirtual - nLSMem * nDestYOffVirtual;
1094 3412 : GDALRasterBandH hMEMBand = MEMCreateRasterBandEx(
1095 : poMEMDS, 1, pabyData, eDTMem, nPSMem, nLSMem, false);
1096 3412 : poMEMDS->SetBand(1, GDALRasterBand::FromHandle(hMEMBand));
1097 :
1098 3412 : const char *pszNBITS = GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
1099 3412 : const int nNBITS = pszNBITS ? atoi(pszNBITS) : 0;
1100 3412 : if (pszNBITS)
1101 6 : GDALRasterBand::FromHandle(hMEMBand)->SetMetadataItem(
1102 6 : "NBITS", pszNBITS, "IMAGE_STRUCTURE");
1103 :
1104 3412 : CPLErr eErr = CE_None;
1105 :
1106 : // Do the resampling.
1107 3412 : if (bUseWarp)
1108 : {
1109 149 : int bHasNoData = FALSE;
1110 149 : double dfNoDataValue = GetNoDataValue(&bHasNoData);
1111 :
1112 149 : VRTDatasetH hVRTDS = nullptr;
1113 149 : GDALRasterBandH hVRTBand = nullptr;
1114 149 : if (GetDataset() == nullptr)
1115 : {
1116 : /* Create VRT dataset that wraps the whole dataset */
1117 0 : hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
1118 0 : VRTAddBand(hVRTDS, eDataType, nullptr);
1119 0 : hVRTBand = GDALGetRasterBand(hVRTDS, 1);
1120 0 : VRTAddSimpleSource(hVRTBand, this, 0, 0, nRasterXSize, nRasterYSize,
1121 : 0, 0, nRasterXSize, nRasterYSize, nullptr,
1122 : VRT_NODATA_UNSET);
1123 :
1124 : /* Add a mask band if needed */
1125 0 : if (GetMaskFlags() != GMF_ALL_VALID)
1126 : {
1127 0 : GDALDataset::FromHandle(hVRTDS)->CreateMaskBand(0);
1128 : VRTSourcedRasterBand *poVRTMaskBand =
1129 : reinterpret_cast<VRTSourcedRasterBand *>(
1130 : reinterpret_cast<GDALRasterBand *>(hVRTBand)
1131 0 : ->GetMaskBand());
1132 0 : poVRTMaskBand->AddMaskBandSource(this, 0, 0, nRasterXSize,
1133 0 : nRasterYSize, 0, 0,
1134 0 : nRasterXSize, nRasterYSize);
1135 : }
1136 : }
1137 :
1138 149 : GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
1139 149 : switch (psExtraArg->eResampleAlg)
1140 : {
1141 0 : case GRIORA_NearestNeighbour:
1142 0 : psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
1143 0 : break;
1144 147 : case GRIORA_Bilinear:
1145 147 : psWarpOptions->eResampleAlg = GRA_Bilinear;
1146 147 : break;
1147 0 : case GRIORA_Cubic:
1148 0 : psWarpOptions->eResampleAlg = GRA_Cubic;
1149 0 : break;
1150 0 : case GRIORA_CubicSpline:
1151 0 : psWarpOptions->eResampleAlg = GRA_CubicSpline;
1152 0 : break;
1153 0 : case GRIORA_Lanczos:
1154 0 : psWarpOptions->eResampleAlg = GRA_Lanczos;
1155 0 : break;
1156 0 : case GRIORA_Average:
1157 0 : psWarpOptions->eResampleAlg = GRA_Average;
1158 0 : break;
1159 2 : case GRIORA_RMS:
1160 2 : psWarpOptions->eResampleAlg = GRA_RMS;
1161 2 : break;
1162 0 : case GRIORA_Mode:
1163 0 : psWarpOptions->eResampleAlg = GRA_Mode;
1164 0 : break;
1165 0 : default:
1166 0 : CPLAssert(false);
1167 : psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
1168 : break;
1169 : }
1170 149 : psWarpOptions->hSrcDS = hVRTDS ? hVRTDS : GetDataset();
1171 149 : psWarpOptions->hDstDS = poMEMDS;
1172 149 : psWarpOptions->nBandCount = 1;
1173 149 : int nSrcBandNumber = hVRTDS ? 1 : nBand;
1174 149 : int nDstBandNumber = 1;
1175 149 : psWarpOptions->panSrcBands = &nSrcBandNumber;
1176 149 : psWarpOptions->panDstBands = &nDstBandNumber;
1177 298 : psWarpOptions->pfnProgress = psExtraArg->pfnProgress
1178 149 : ? psExtraArg->pfnProgress
1179 : : GDALDummyProgress;
1180 149 : psWarpOptions->pProgressArg = psExtraArg->pProgressData;
1181 149 : psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
1182 149 : if (bHasNoData)
1183 : {
1184 0 : psWarpOptions->papszWarpOptions = CSLSetNameValue(
1185 : psWarpOptions->papszWarpOptions, "INIT_DEST", "NO_DATA");
1186 0 : if (psWarpOptions->padfSrcNoDataReal == nullptr)
1187 : {
1188 0 : psWarpOptions->padfSrcNoDataReal =
1189 0 : static_cast<double *>(CPLMalloc(sizeof(double)));
1190 0 : psWarpOptions->padfSrcNoDataReal[0] = dfNoDataValue;
1191 : }
1192 :
1193 0 : if (psWarpOptions->padfDstNoDataReal == nullptr)
1194 : {
1195 0 : psWarpOptions->padfDstNoDataReal =
1196 0 : static_cast<double *>(CPLMalloc(sizeof(double)));
1197 0 : psWarpOptions->padfDstNoDataReal[0] = dfNoDataValue;
1198 : }
1199 : }
1200 :
1201 : GDALRasterIOTransformerStruct sTransformer;
1202 149 : sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
1203 149 : sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
1204 149 : sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
1205 149 : sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
1206 149 : psWarpOptions->pTransformerArg = &sTransformer;
1207 :
1208 : GDALWarpOperationH hWarpOperation =
1209 149 : GDALCreateWarpOperation(psWarpOptions);
1210 149 : eErr = GDALChunkAndWarpImage(hWarpOperation, nDestXOffVirtual,
1211 : nDestYOffVirtual, nBufXSize, nBufYSize);
1212 149 : GDALDestroyWarpOperation(hWarpOperation);
1213 :
1214 149 : psWarpOptions->panSrcBands = nullptr;
1215 149 : psWarpOptions->panDstBands = nullptr;
1216 149 : GDALDestroyWarpOptions(psWarpOptions);
1217 :
1218 149 : if (hVRTDS)
1219 0 : GDALClose(hVRTDS);
1220 : }
1221 : else
1222 : {
1223 3263 : const char *pszResampling =
1224 4254 : (psExtraArg->eResampleAlg == GRIORA_Bilinear) ? "BILINEAR"
1225 1289 : : (psExtraArg->eResampleAlg == GRIORA_Cubic) ? "CUBIC"
1226 558 : : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
1227 479 : : (psExtraArg->eResampleAlg == GRIORA_Lanczos) ? "LANCZOS"
1228 342 : : (psExtraArg->eResampleAlg == GRIORA_Average) ? "AVERAGE"
1229 199 : : (psExtraArg->eResampleAlg == GRIORA_RMS) ? "RMS"
1230 79 : : (psExtraArg->eResampleAlg == GRIORA_Mode) ? "MODE"
1231 3 : : (psExtraArg->eResampleAlg == GRIORA_Gauss) ? "GAUSS"
1232 : : "UNKNOWN";
1233 :
1234 3263 : int nKernelRadius = 0;
1235 : GDALResampleFunction pfnResampleFunc =
1236 3263 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
1237 3263 : CPLAssert(pfnResampleFunc);
1238 : GDALDataType eWrkDataType =
1239 3263 : GDALGetOvrWorkDataType(pszResampling, eDataType);
1240 3263 : int nHasNoData = 0;
1241 3263 : double dfNoDataValue = GetNoDataValue(&nHasNoData);
1242 3263 : const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
1243 3263 : if (!bHasNoData)
1244 3173 : dfNoDataValue = 0.0;
1245 :
1246 3263 : int nDstBlockXSize = nBufXSize;
1247 3263 : int nDstBlockYSize = nBufYSize;
1248 3263 : int nFullResXChunk = 0;
1249 3263 : int nFullResYChunk = 0;
1250 : while (true)
1251 : {
1252 3274 : nFullResXChunk =
1253 3274 : 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
1254 3274 : nFullResYChunk =
1255 3274 : 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
1256 3274 : if (nFullResXChunk > nRasterXSize)
1257 2911 : nFullResXChunk = nRasterXSize;
1258 3274 : if (nFullResYChunk > nRasterYSize)
1259 512 : nFullResYChunk = nRasterYSize;
1260 3274 : if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
1261 3216 : (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
1262 : 1024 * 1024))
1263 : break;
1264 : // When operating on the full width of a raster whose block width is
1265 : // the raster width, prefer doing chunks in height.
1266 11 : if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
1267 : nDstBlockYSize > 1)
1268 0 : nDstBlockYSize /= 2;
1269 : /* Otherwise cut the maximal dimension */
1270 11 : else if (nDstBlockXSize > 1 &&
1271 0 : (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
1272 11 : nDstBlockXSize /= 2;
1273 : else
1274 0 : nDstBlockYSize /= 2;
1275 : }
1276 :
1277 3263 : int nOvrXFactor = static_cast<int>(0.5 + dfXRatioDstToSrc);
1278 3263 : int nOvrYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
1279 3263 : if (nOvrXFactor == 0)
1280 2029 : nOvrXFactor = 1;
1281 3263 : if (nOvrYFactor == 0)
1282 2028 : nOvrYFactor = 1;
1283 3263 : int nFullResXSizeQueried =
1284 3263 : nFullResXChunk + 2 * nKernelRadius * nOvrXFactor;
1285 3263 : int nFullResYSizeQueried =
1286 3263 : nFullResYChunk + 2 * nKernelRadius * nOvrYFactor;
1287 :
1288 3263 : if (nFullResXSizeQueried > nRasterXSize)
1289 2701 : nFullResXSizeQueried = nRasterXSize;
1290 3263 : if (nFullResYSizeQueried > nRasterYSize)
1291 299 : nFullResYSizeQueried = nRasterYSize;
1292 :
1293 : void *pChunk =
1294 3263 : VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
1295 : nFullResXSizeQueried, nFullResYSizeQueried);
1296 3263 : GByte *pabyChunkNoDataMask = nullptr;
1297 :
1298 3263 : GDALRasterBand *poMaskBand = GetMaskBand();
1299 3263 : int l_nMaskFlags = GetMaskFlags();
1300 :
1301 3263 : bool bUseNoDataMask = ((l_nMaskFlags & GMF_ALL_VALID) == 0);
1302 3263 : if (bUseNoDataMask)
1303 : {
1304 158 : pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
1305 : nFullResXSizeQueried, nFullResYSizeQueried));
1306 : }
1307 3263 : if (pChunk == nullptr ||
1308 158 : (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
1309 : {
1310 0 : GDALClose(poMEMDS);
1311 0 : CPLFree(pChunk);
1312 0 : CPLFree(pabyChunkNoDataMask);
1313 0 : VSIFree(pTempBuffer);
1314 0 : return CE_Failure;
1315 : }
1316 :
1317 3263 : const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
1318 3263 : DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
1319 3263 : int nBlocksDone = 0;
1320 :
1321 : int nDstYOff;
1322 6526 : for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
1323 3263 : nDstYOff += nDstBlockYSize)
1324 : {
1325 : int nDstYCount;
1326 3263 : if (nDstYOff + nDstBlockYSize <= nBufYSize)
1327 3263 : nDstYCount = nDstBlockYSize;
1328 : else
1329 0 : nDstYCount = nBufYSize - nDstYOff;
1330 :
1331 3263 : int nChunkYOff =
1332 3263 : nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
1333 3263 : int nChunkYOff2 = nYOff + 1 +
1334 3263 : static_cast<int>(ceil((nDstYOff + nDstYCount) *
1335 : dfYRatioDstToSrc));
1336 3263 : if (nChunkYOff2 > nRasterYSize)
1337 660 : nChunkYOff2 = nRasterYSize;
1338 3263 : int nYCount = nChunkYOff2 - nChunkYOff;
1339 3263 : CPLAssert(nYCount <= nFullResYChunk);
1340 :
1341 3263 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrYFactor;
1342 3263 : int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrYFactor;
1343 3263 : if (nChunkYOffQueried < 0)
1344 : {
1345 458 : nChunkYSizeQueried += nChunkYOffQueried;
1346 458 : nChunkYOffQueried = 0;
1347 : }
1348 3263 : if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
1349 561 : nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
1350 3263 : CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
1351 :
1352 3263 : int nDstXOff = 0;
1353 6526 : for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
1354 3263 : nDstXOff += nDstBlockXSize)
1355 : {
1356 3263 : int nDstXCount = 0;
1357 3263 : if (nDstXOff + nDstBlockXSize <= nBufXSize)
1358 3263 : nDstXCount = nDstBlockXSize;
1359 : else
1360 0 : nDstXCount = nBufXSize - nDstXOff;
1361 :
1362 3263 : int nChunkXOff =
1363 3263 : nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
1364 3263 : int nChunkXOff2 =
1365 3263 : nXOff + 1 +
1366 3263 : static_cast<int>(
1367 3263 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
1368 3263 : if (nChunkXOff2 > nRasterXSize)
1369 2960 : nChunkXOff2 = nRasterXSize;
1370 3263 : int nXCount = nChunkXOff2 - nChunkXOff;
1371 3263 : CPLAssert(nXCount <= nFullResXChunk);
1372 :
1373 3263 : int nChunkXOffQueried =
1374 3263 : nChunkXOff - nKernelRadius * nOvrXFactor;
1375 3263 : int nChunkXSizeQueried =
1376 3263 : nXCount + 2 * nKernelRadius * nOvrXFactor;
1377 3263 : if (nChunkXOffQueried < 0)
1378 : {
1379 2762 : nChunkXSizeQueried += nChunkXOffQueried;
1380 2762 : nChunkXOffQueried = 0;
1381 : }
1382 3263 : if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
1383 2748 : nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
1384 3263 : CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
1385 :
1386 : // Read the source buffers.
1387 3263 : eErr = RasterIO(GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1388 : nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
1389 : nChunkXSizeQueried, nChunkYSizeQueried,
1390 : eWrkDataType, 0, 0, nullptr);
1391 :
1392 3263 : bool bSkipResample = false;
1393 3263 : bool bNoDataMaskFullyOpaque = false;
1394 3263 : if (eErr == CE_None && bUseNoDataMask)
1395 : {
1396 158 : eErr = poMaskBand->RasterIO(
1397 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1398 : nChunkXSizeQueried, nChunkYSizeQueried,
1399 : pabyChunkNoDataMask, nChunkXSizeQueried,
1400 : nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
1401 :
1402 : /* Optimizations if mask if fully opaque or transparent */
1403 158 : int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
1404 158 : GByte bVal = pabyChunkNoDataMask[0];
1405 158 : int i = 1;
1406 3751650 : for (; i < nPixels; i++)
1407 : {
1408 3751590 : if (pabyChunkNoDataMask[i] != bVal)
1409 104 : break;
1410 : }
1411 158 : if (i == nPixels)
1412 : {
1413 54 : if (bVal == 0)
1414 : {
1415 712 : for (int j = 0; j < nDstYCount; j++)
1416 : {
1417 686 : GDALCopyWords64(&dfNoDataValue, GDT_Float64, 0,
1418 : static_cast<GByte *>(pDataMem) +
1419 686 : nLSMem * (j + nDstYOff) +
1420 686 : nDstXOff * nPSMem,
1421 : eDTMem,
1422 : static_cast<int>(nPSMem),
1423 : nDstXCount);
1424 : }
1425 26 : bSkipResample = true;
1426 : }
1427 : else
1428 : {
1429 28 : bNoDataMaskFullyOpaque = true;
1430 : }
1431 : }
1432 : }
1433 :
1434 3263 : if (!bSkipResample && eErr == CE_None)
1435 : {
1436 3234 : const bool bPropagateNoData = false;
1437 3234 : void *pDstBuffer = nullptr;
1438 3234 : GDALDataType eDstBufferDataType = GDT_Unknown;
1439 : GDALRasterBand *poMEMBand =
1440 3234 : GDALRasterBand::FromHandle(hMEMBand);
1441 3234 : GDALOverviewResampleArgs args;
1442 3234 : args.eSrcDataType = eDataType;
1443 3234 : args.eOvrDataType = poMEMBand->GetRasterDataType();
1444 3234 : args.nOvrXSize = poMEMBand->GetXSize();
1445 3234 : args.nOvrYSize = poMEMBand->GetYSize();
1446 3234 : args.nOvrNBITS = nNBITS;
1447 3234 : args.dfXRatioDstToSrc = dfXRatioDstToSrc;
1448 3234 : args.dfYRatioDstToSrc = dfYRatioDstToSrc;
1449 3234 : args.dfSrcXDelta =
1450 3234 : dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
1451 3234 : args.dfSrcYDelta =
1452 3234 : dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
1453 3234 : args.eWrkDataType = eWrkDataType;
1454 3234 : args.pabyChunkNodataMask =
1455 3234 : bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask;
1456 3234 : args.nChunkXOff =
1457 3234 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
1458 3234 : args.nChunkXSize = nChunkXSizeQueried;
1459 3234 : args.nChunkYOff =
1460 3234 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
1461 3234 : args.nChunkYSize = nChunkYSizeQueried;
1462 3234 : args.nDstXOff = nDstXOff + nDestXOffVirtual;
1463 3234 : args.nDstXOff2 = nDstXOff + nDestXOffVirtual + nDstXCount;
1464 3234 : args.nDstYOff = nDstYOff + nDestYOffVirtual;
1465 3234 : args.nDstYOff2 = nDstYOff + nDestYOffVirtual + nDstYCount;
1466 3234 : args.pszResampling = pszResampling;
1467 3234 : args.bHasNoData = bHasNoData;
1468 3234 : args.dfNoDataValue = dfNoDataValue;
1469 3234 : args.poColorTable = GetColorTable();
1470 3234 : args.bPropagateNoData = bPropagateNoData;
1471 3234 : eErr = pfnResampleFunc(args, pChunk, &pDstBuffer,
1472 : &eDstBufferDataType);
1473 3234 : if (eErr == CE_None)
1474 : {
1475 3234 : eErr = poMEMBand->RasterIO(
1476 : GF_Write, nDstXOff + nDestXOffVirtual,
1477 : nDstYOff + nDestYOffVirtual, nDstXCount, nDstYCount,
1478 : pDstBuffer, nDstXCount, nDstYCount,
1479 : eDstBufferDataType, 0, 0, nullptr);
1480 : }
1481 3234 : CPLFree(pDstBuffer);
1482 : }
1483 :
1484 3263 : nBlocksDone++;
1485 3689 : if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
1486 426 : !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
1487 : "", psExtraArg->pProgressData))
1488 : {
1489 1 : eErr = CE_Failure;
1490 : }
1491 : }
1492 : }
1493 :
1494 3263 : CPLFree(pChunk);
1495 3263 : CPLFree(pabyChunkNoDataMask);
1496 : }
1497 :
1498 3412 : if (eBufType != eDataType)
1499 : {
1500 44 : CPL_IGNORE_RET_VAL(poMEMDS->GetRasterBand(1)->RasterIO(
1501 : GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
1502 : pData, nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
1503 : nullptr));
1504 : }
1505 3412 : GDALClose(poMEMDS);
1506 3412 : VSIFree(pTempBuffer);
1507 :
1508 3412 : return eErr;
1509 : }
1510 :
1511 : /************************************************************************/
1512 : /* RasterIOResampled() */
1513 : /************************************************************************/
1514 :
1515 886 : CPLErr GDALDataset::RasterIOResampled(
1516 : GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
1517 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
1518 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
1519 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
1520 :
1521 : {
1522 : #if 0
1523 : // Determine if we use warping resampling or overview resampling
1524 : bool bUseWarp = false;
1525 : if( GDALDataTypeIsComplex( eDataType ) )
1526 : bUseWarp = true;
1527 : #endif
1528 :
1529 886 : double dfXOff = nXOff;
1530 886 : double dfYOff = nYOff;
1531 886 : double dfXSize = nXSize;
1532 886 : double dfYSize = nYSize;
1533 886 : if (psExtraArg->bFloatingPointWindowValidity)
1534 : {
1535 765 : dfXOff = psExtraArg->dfXOff;
1536 765 : dfYOff = psExtraArg->dfYOff;
1537 765 : dfXSize = psExtraArg->dfXSize;
1538 765 : dfYSize = psExtraArg->dfYSize;
1539 : }
1540 :
1541 886 : const double dfXRatioDstToSrc = dfXSize / nBufXSize;
1542 886 : const double dfYRatioDstToSrc = dfYSize / nBufYSize;
1543 :
1544 : // Determine the coordinates in the "virtual" output raster to see
1545 : // if there are not integers, in which case we will use them as a shift
1546 : // so that subwindow extracts give the exact same results as entire raster
1547 : // scaling.
1548 886 : double dfDestXOff = dfXOff / dfXRatioDstToSrc;
1549 886 : bool bHasXOffVirtual = false;
1550 886 : int nDestXOffVirtual = 0;
1551 886 : if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
1552 : {
1553 761 : bHasXOffVirtual = true;
1554 761 : dfXOff = nXOff;
1555 761 : nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
1556 : }
1557 :
1558 886 : double dfDestYOff = dfYOff / dfYRatioDstToSrc;
1559 886 : bool bHasYOffVirtual = false;
1560 886 : int nDestYOffVirtual = 0;
1561 886 : if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
1562 : {
1563 721 : bHasYOffVirtual = true;
1564 721 : dfYOff = nYOff;
1565 721 : nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
1566 : }
1567 :
1568 : // Create a MEM dataset that wraps the output buffer.
1569 : GDALDataset *poMEMDS =
1570 886 : MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
1571 : nDestYOffVirtual + nBufYSize, 0, eBufType, nullptr);
1572 : GDALRasterBand **papoDstBands = static_cast<GDALRasterBand **>(
1573 886 : CPLMalloc(nBandCount * sizeof(GDALRasterBand *)));
1574 886 : int nNBITS = 0;
1575 2878 : for (int i = 0; i < nBandCount; i++)
1576 : {
1577 1992 : char szBuffer[32] = {'\0'};
1578 3984 : int nRet = CPLPrintPointer(
1579 : szBuffer,
1580 1992 : static_cast<GByte *>(pData) - nPixelSpace * nDestXOffVirtual -
1581 1992 : nLineSpace * nDestYOffVirtual + nBandSpace * i,
1582 : sizeof(szBuffer));
1583 1992 : szBuffer[nRet] = 0;
1584 :
1585 1992 : char szBuffer0[64] = {'\0'};
1586 1992 : snprintf(szBuffer0, sizeof(szBuffer0), "DATAPOINTER=%s", szBuffer);
1587 :
1588 1992 : char szBuffer1[64] = {'\0'};
1589 1992 : snprintf(szBuffer1, sizeof(szBuffer1), "PIXELOFFSET=" CPL_FRMT_GIB,
1590 : static_cast<GIntBig>(nPixelSpace));
1591 :
1592 1992 : char szBuffer2[64] = {'\0'};
1593 1992 : snprintf(szBuffer2, sizeof(szBuffer2), "LINEOFFSET=" CPL_FRMT_GIB,
1594 : static_cast<GIntBig>(nLineSpace));
1595 :
1596 1992 : char *apszOptions[4] = {szBuffer0, szBuffer1, szBuffer2, nullptr};
1597 :
1598 1992 : poMEMDS->AddBand(eBufType, apszOptions);
1599 :
1600 1992 : GDALRasterBand *poSrcBand = GetRasterBand(panBandMap[i]);
1601 1992 : papoDstBands[i] = poMEMDS->GetRasterBand(i + 1);
1602 : const char *pszNBITS =
1603 1992 : poSrcBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
1604 1992 : if (pszNBITS)
1605 : {
1606 0 : nNBITS = atoi(pszNBITS);
1607 0 : poMEMDS->GetRasterBand(i + 1)->SetMetadataItem("NBITS", pszNBITS,
1608 0 : "IMAGE_STRUCTURE");
1609 : }
1610 : }
1611 :
1612 886 : CPLErr eErr = CE_None;
1613 :
1614 : // TODO(schwehr): Why disabled? Why not just delete?
1615 : // Looks like this code was initially added as disable by copying
1616 : // from RasterIO here:
1617 : // https://trac.osgeo.org/gdal/changeset/29572
1618 : #if 0
1619 : // Do the resampling.
1620 : if( bUseWarp )
1621 : {
1622 : VRTDatasetH hVRTDS = nullptr;
1623 : GDALRasterBandH hVRTBand = nullptr;
1624 : if( GetDataset() == nullptr )
1625 : {
1626 : /* Create VRT dataset that wraps the whole dataset */
1627 : hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
1628 : VRTAddBand( hVRTDS, eDataType, nullptr );
1629 : hVRTBand = GDALGetRasterBand(hVRTDS, 1);
1630 : VRTAddSimpleSource( (VRTSourcedRasterBandH)hVRTBand,
1631 : (GDALRasterBandH)this,
1632 : 0, 0,
1633 : nRasterXSize, nRasterYSize,
1634 : 0, 0,
1635 : nRasterXSize, nRasterYSize,
1636 : nullptr, VRT_NODATA_UNSET );
1637 :
1638 : /* Add a mask band if needed */
1639 : if( GetMaskFlags() != GMF_ALL_VALID )
1640 : {
1641 : ((GDALDataset*)hVRTDS)->CreateMaskBand(0);
1642 : VRTSourcedRasterBand* poVRTMaskBand =
1643 : (VRTSourcedRasterBand*)(((GDALRasterBand*)hVRTBand)->GetMaskBand());
1644 : poVRTMaskBand->
1645 : AddMaskBandSource( this,
1646 : 0, 0,
1647 : nRasterXSize, nRasterYSize,
1648 : 0, 0,
1649 : nRasterXSize, nRasterYSize);
1650 : }
1651 : }
1652 :
1653 : GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
1654 : psWarpOptions->eResampleAlg = (GDALResampleAlg)psExtraArg->eResampleAlg;
1655 : psWarpOptions->hSrcDS = (GDALDatasetH) (hVRTDS ? hVRTDS : GetDataset());
1656 : psWarpOptions->hDstDS = (GDALDatasetH) poMEMDS;
1657 : psWarpOptions->nBandCount = 1;
1658 : int nSrcBandNumber = (hVRTDS ? 1 : nBand);
1659 : int nDstBandNumber = 1;
1660 : psWarpOptions->panSrcBands = &nSrcBandNumber;
1661 : psWarpOptions->panDstBands = &nDstBandNumber;
1662 : psWarpOptions->pfnProgress = psExtraArg->pfnProgress ?
1663 : psExtraArg->pfnProgress : GDALDummyProgress;
1664 : psWarpOptions->pProgressArg = psExtraArg->pProgressData;
1665 : psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
1666 : GDALRasterIOTransformerStruct sTransformer;
1667 : sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
1668 : sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
1669 : sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
1670 : sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
1671 : psWarpOptions->pTransformerArg = &sTransformer;
1672 :
1673 : GDALWarpOperationH hWarpOperation = GDALCreateWarpOperation(psWarpOptions);
1674 : eErr = GDALChunkAndWarpImage( hWarpOperation,
1675 : nDestXOffVirtual, nDestYOffVirtual,
1676 : nBufXSize, nBufYSize );
1677 : GDALDestroyWarpOperation( hWarpOperation );
1678 :
1679 : psWarpOptions->panSrcBands = nullptr;
1680 : psWarpOptions->panDstBands = nullptr;
1681 : GDALDestroyWarpOptions( psWarpOptions );
1682 :
1683 : if( hVRTDS )
1684 : GDALClose(hVRTDS);
1685 : }
1686 : else
1687 : #endif
1688 : {
1689 886 : const char *pszResampling =
1690 1653 : (psExtraArg->eResampleAlg == GRIORA_Bilinear) ? "BILINEAR"
1691 767 : : (psExtraArg->eResampleAlg == GRIORA_Cubic) ? "CUBIC"
1692 0 : : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
1693 0 : : (psExtraArg->eResampleAlg == GRIORA_Lanczos) ? "LANCZOS"
1694 0 : : (psExtraArg->eResampleAlg == GRIORA_Average) ? "AVERAGE"
1695 0 : : (psExtraArg->eResampleAlg == GRIORA_RMS) ? "RMS"
1696 0 : : (psExtraArg->eResampleAlg == GRIORA_Mode) ? "MODE"
1697 0 : : (psExtraArg->eResampleAlg == GRIORA_Gauss) ? "GAUSS"
1698 : : "UNKNOWN";
1699 :
1700 886 : GDALRasterBand *poFirstSrcBand = GetRasterBand(panBandMap[0]);
1701 886 : GDALDataType eDataType = poFirstSrcBand->GetRasterDataType();
1702 : int nBlockXSize, nBlockYSize;
1703 886 : poFirstSrcBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
1704 :
1705 : int nKernelRadius;
1706 : GDALResampleFunction pfnResampleFunc =
1707 886 : GDALGetResampleFunction(pszResampling, &nKernelRadius);
1708 886 : CPLAssert(pfnResampleFunc);
1709 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1710 : GDALResampleFunctionMultiBands pfnResampleFuncMultiBands =
1711 : GDALGetResampleFunctionMultiBands(pszResampling, &nKernelRadius);
1712 : #endif
1713 : GDALDataType eWrkDataType =
1714 886 : GDALGetOvrWorkDataType(pszResampling, eDataType);
1715 :
1716 886 : int nDstBlockXSize = nBufXSize;
1717 886 : int nDstBlockYSize = nBufYSize;
1718 : int nFullResXChunk, nFullResYChunk;
1719 : while (true)
1720 : {
1721 886 : nFullResXChunk =
1722 886 : 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
1723 886 : nFullResYChunk =
1724 886 : 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
1725 886 : if (nFullResXChunk > nRasterXSize)
1726 585 : nFullResXChunk = nRasterXSize;
1727 886 : if (nFullResYChunk > nRasterYSize)
1728 51 : nFullResYChunk = nRasterYSize;
1729 886 : if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
1730 884 : (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
1731 : 1024 * 1024))
1732 : break;
1733 : // When operating on the full width of a raster whose block width is
1734 : // the raster width, prefer doing chunks in height.
1735 0 : if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
1736 : nDstBlockYSize > 1)
1737 0 : nDstBlockYSize /= 2;
1738 : /* Otherwise cut the maximal dimension */
1739 0 : else if (nDstBlockXSize > 1 &&
1740 0 : (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
1741 0 : nDstBlockXSize /= 2;
1742 : else
1743 0 : nDstBlockYSize /= 2;
1744 : }
1745 :
1746 1772 : int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
1747 886 : static_cast<int>(0.5 + dfYRatioDstToSrc));
1748 886 : if (nOvrFactor == 0)
1749 104 : nOvrFactor = 1;
1750 886 : int nFullResXSizeQueried =
1751 886 : nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
1752 886 : int nFullResYSizeQueried =
1753 886 : nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
1754 :
1755 886 : if (nFullResXSizeQueried > nRasterXSize)
1756 610 : nFullResXSizeQueried = nRasterXSize;
1757 886 : if (nFullResYSizeQueried > nRasterYSize)
1758 54 : nFullResYSizeQueried = nRasterYSize;
1759 :
1760 886 : void *pChunk = VSI_MALLOC3_VERBOSE(
1761 : cpl::fits_on<int>(GDALGetDataTypeSizeBytes(eWrkDataType) *
1762 : nBandCount),
1763 : nFullResXSizeQueried, nFullResYSizeQueried);
1764 886 : GByte *pabyChunkNoDataMask = nullptr;
1765 :
1766 886 : GDALRasterBand *poMaskBand = poFirstSrcBand->GetMaskBand();
1767 886 : int nMaskFlags = poFirstSrcBand->GetMaskFlags();
1768 :
1769 886 : bool bUseNoDataMask = ((nMaskFlags & GMF_ALL_VALID) == 0);
1770 886 : if (bUseNoDataMask)
1771 : {
1772 617 : pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
1773 : nFullResXSizeQueried, nFullResYSizeQueried));
1774 : }
1775 886 : if (pChunk == nullptr ||
1776 617 : (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
1777 : {
1778 0 : GDALClose(poMEMDS);
1779 0 : CPLFree(pChunk);
1780 0 : CPLFree(pabyChunkNoDataMask);
1781 0 : CPLFree(papoDstBands);
1782 0 : return CE_Failure;
1783 : }
1784 :
1785 886 : const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
1786 886 : DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
1787 886 : int nBlocksDone = 0;
1788 :
1789 : int nDstYOff;
1790 1772 : for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
1791 886 : nDstYOff += nDstBlockYSize)
1792 : {
1793 : int nDstYCount;
1794 886 : if (nDstYOff + nDstBlockYSize <= nBufYSize)
1795 886 : nDstYCount = nDstBlockYSize;
1796 : else
1797 0 : nDstYCount = nBufYSize - nDstYOff;
1798 :
1799 886 : int nChunkYOff =
1800 886 : nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
1801 886 : int nChunkYOff2 = nYOff + 1 +
1802 886 : static_cast<int>(ceil((nDstYOff + nDstYCount) *
1803 : dfYRatioDstToSrc));
1804 886 : if (nChunkYOff2 > nRasterYSize)
1805 133 : nChunkYOff2 = nRasterYSize;
1806 886 : int nYCount = nChunkYOff2 - nChunkYOff;
1807 886 : CPLAssert(nYCount <= nFullResYChunk);
1808 :
1809 886 : int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
1810 886 : int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
1811 886 : if (nChunkYOffQueried < 0)
1812 : {
1813 136 : nChunkYSizeQueried += nChunkYOffQueried;
1814 136 : nChunkYOffQueried = 0;
1815 : }
1816 886 : if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
1817 151 : nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
1818 886 : CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
1819 :
1820 : int nDstXOff;
1821 1772 : for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
1822 886 : nDstXOff += nDstBlockXSize)
1823 : {
1824 : int nDstXCount;
1825 886 : if (nDstXOff + nDstBlockXSize <= nBufXSize)
1826 886 : nDstXCount = nDstBlockXSize;
1827 : else
1828 0 : nDstXCount = nBufXSize - nDstXOff;
1829 :
1830 886 : int nChunkXOff =
1831 886 : nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
1832 886 : int nChunkXOff2 =
1833 886 : nXOff + 1 +
1834 886 : static_cast<int>(
1835 886 : ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
1836 886 : if (nChunkXOff2 > nRasterXSize)
1837 641 : nChunkXOff2 = nRasterXSize;
1838 886 : int nXCount = nChunkXOff2 - nChunkXOff;
1839 886 : CPLAssert(nXCount <= nFullResXChunk);
1840 :
1841 886 : int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
1842 886 : int nChunkXSizeQueried =
1843 886 : nXCount + 2 * nKernelRadius * nOvrFactor;
1844 886 : if (nChunkXOffQueried < 0)
1845 : {
1846 641 : nChunkXSizeQueried += nChunkXOffQueried;
1847 641 : nChunkXOffQueried = 0;
1848 : }
1849 886 : if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
1850 649 : nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
1851 886 : CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
1852 :
1853 886 : bool bSkipResample = false;
1854 886 : bool bNoDataMaskFullyOpaque = false;
1855 886 : if (eErr == CE_None && bUseNoDataMask)
1856 : {
1857 617 : eErr = poMaskBand->RasterIO(
1858 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1859 : nChunkXSizeQueried, nChunkYSizeQueried,
1860 : pabyChunkNoDataMask, nChunkXSizeQueried,
1861 : nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
1862 :
1863 : /* Optimizations if mask if fully opaque or transparent */
1864 617 : const int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
1865 617 : const GByte bVal = pabyChunkNoDataMask[0];
1866 617 : int i = 1; // Used after for.
1867 48197000 : for (; i < nPixels; i++)
1868 : {
1869 48196500 : if (pabyChunkNoDataMask[i] != bVal)
1870 72 : break;
1871 : }
1872 617 : if (i == nPixels)
1873 : {
1874 545 : if (bVal == 0)
1875 : {
1876 373 : GByte abyZero[16] = {0};
1877 780 : for (int iBand = 0; iBand < nBandCount; iBand++)
1878 : {
1879 3499 : for (int j = 0; j < nDstYCount; j++)
1880 : {
1881 3092 : GDALCopyWords64(
1882 : abyZero, GDT_UInt8, 0,
1883 : static_cast<GByte *>(pData) +
1884 3092 : iBand * nBandSpace +
1885 3092 : nLineSpace * (j + nDstYOff) +
1886 3092 : nDstXOff * nPixelSpace,
1887 : eBufType, static_cast<int>(nPixelSpace),
1888 : nDstXCount);
1889 : }
1890 : }
1891 373 : bSkipResample = true;
1892 : }
1893 : else
1894 : {
1895 172 : bNoDataMaskFullyOpaque = true;
1896 : }
1897 : }
1898 : }
1899 :
1900 886 : if (!bSkipResample && eErr == CE_None)
1901 : {
1902 : /* Read the source buffers */
1903 510 : eErr = RasterIO(
1904 : GF_Read, nChunkXOffQueried, nChunkYOffQueried,
1905 : nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
1906 : nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType,
1907 : nBandCount, panBandMap, 0, 0, 0, nullptr);
1908 : }
1909 :
1910 : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
1911 : if (pfnResampleFuncMultiBands && !bSkipResample &&
1912 : eErr == CE_None)
1913 : {
1914 : eErr = pfnResampleFuncMultiBands(
1915 : dfXRatioDstToSrc, dfYRatioDstToSrc,
1916 : dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
1917 : dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
1918 : eWrkDataType, (GByte *)pChunk, nBandCount,
1919 : bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
1920 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
1921 : nChunkXSizeQueried,
1922 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
1923 : nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
1924 : nDstXOff + nDestXOffVirtual + nDstXCount,
1925 : nDstYOff + nDestYOffVirtual,
1926 : nDstYOff + nDestYOffVirtual + nDstYCount, papoDstBands,
1927 : pszResampling, FALSE /*bHasNoData*/,
1928 : 0.0 /* dfNoDataValue */, nullptr /* color table*/,
1929 : eDataType);
1930 : }
1931 : else
1932 : #endif
1933 : {
1934 : size_t nChunkBandOffset =
1935 886 : static_cast<size_t>(nChunkXSizeQueried) *
1936 886 : nChunkYSizeQueried *
1937 886 : GDALGetDataTypeSizeBytes(eWrkDataType);
1938 2462 : for (int i = 0;
1939 2462 : i < nBandCount && !bSkipResample && eErr == CE_None;
1940 : i++)
1941 : {
1942 1576 : const bool bPropagateNoData = false;
1943 1576 : void *pDstBuffer = nullptr;
1944 1576 : GDALDataType eDstBufferDataType = GDT_Unknown;
1945 : GDALRasterBand *poMEMBand =
1946 1576 : poMEMDS->GetRasterBand(i + 1);
1947 1576 : GDALOverviewResampleArgs args;
1948 1576 : args.eSrcDataType = eDataType;
1949 1576 : args.eOvrDataType = poMEMBand->GetRasterDataType();
1950 1576 : args.nOvrXSize = poMEMBand->GetXSize();
1951 1576 : args.nOvrYSize = poMEMBand->GetYSize();
1952 1576 : args.nOvrNBITS = nNBITS;
1953 1576 : args.dfXRatioDstToSrc = dfXRatioDstToSrc;
1954 1576 : args.dfYRatioDstToSrc = dfYRatioDstToSrc;
1955 1576 : args.dfSrcXDelta =
1956 1576 : dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
1957 1576 : args.dfSrcYDelta =
1958 1576 : dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
1959 1576 : args.eWrkDataType = eWrkDataType;
1960 1576 : args.pabyChunkNodataMask = bNoDataMaskFullyOpaque
1961 1576 : ? nullptr
1962 : : pabyChunkNoDataMask;
1963 1576 : args.nChunkXOff =
1964 1576 : nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
1965 1576 : args.nChunkXSize = nChunkXSizeQueried;
1966 1576 : args.nChunkYOff =
1967 1576 : nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
1968 1576 : args.nChunkYSize = nChunkYSizeQueried;
1969 1576 : args.nDstXOff = nDstXOff + nDestXOffVirtual;
1970 1576 : args.nDstXOff2 =
1971 1576 : nDstXOff + nDestXOffVirtual + nDstXCount;
1972 1576 : args.nDstYOff = nDstYOff + nDestYOffVirtual;
1973 1576 : args.nDstYOff2 =
1974 1576 : nDstYOff + nDestYOffVirtual + nDstYCount;
1975 1576 : args.pszResampling = pszResampling;
1976 1576 : args.bHasNoData = false;
1977 1576 : args.dfNoDataValue = 0.0;
1978 1576 : args.poColorTable = nullptr;
1979 1576 : args.bPropagateNoData = bPropagateNoData;
1980 :
1981 : eErr =
1982 3152 : pfnResampleFunc(args,
1983 1576 : reinterpret_cast<GByte *>(pChunk) +
1984 1576 : i * nChunkBandOffset,
1985 : &pDstBuffer, &eDstBufferDataType);
1986 1576 : if (eErr == CE_None)
1987 : {
1988 1576 : eErr = poMEMBand->RasterIO(
1989 : GF_Write, nDstXOff + nDestXOffVirtual,
1990 : nDstYOff + nDestYOffVirtual, nDstXCount,
1991 : nDstYCount, pDstBuffer, nDstXCount, nDstYCount,
1992 : eDstBufferDataType, 0, 0, nullptr);
1993 : }
1994 1576 : CPLFree(pDstBuffer);
1995 : }
1996 : }
1997 :
1998 886 : nBlocksDone++;
1999 1275 : if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
2000 389 : !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
2001 : "", psExtraArg->pProgressData))
2002 : {
2003 0 : eErr = CE_Failure;
2004 : }
2005 : }
2006 : }
2007 :
2008 886 : CPLFree(pChunk);
2009 886 : CPLFree(pabyChunkNoDataMask);
2010 : }
2011 :
2012 886 : CPLFree(papoDstBands);
2013 886 : GDALClose(poMEMDS);
2014 :
2015 886 : return eErr;
2016 : }
2017 :
2018 : //! @endcond
2019 :
2020 : /************************************************************************/
2021 : /* GDALSwapWords() */
2022 : /************************************************************************/
2023 :
2024 : /**
2025 : * Byte swap words in-place.
2026 : *
2027 : * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
2028 : * a memory array. No assumption is made that the words being swapped are
2029 : * word aligned in memory. Use the CPL_LSB and CPL_MSB macros from cpl_port.h
2030 : * to determine if the current platform is big endian or little endian. Use
2031 : * The macros like CPL_SWAP32() to byte swap single values without the overhead
2032 : * of a function call.
2033 : *
2034 : * @param pData pointer to start of data buffer.
2035 : * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
2036 : * @param nWordCount the number of words to be swapped in this call.
2037 : * @param nWordSkip the byte offset from the start of one word to the start of
2038 : * the next. For packed buffers this is the same as nWordSize.
2039 : */
2040 :
2041 497143 : void CPL_STDCALL GDALSwapWords(void *pData, int nWordSize, int nWordCount,
2042 : int nWordSkip)
2043 :
2044 : {
2045 497143 : if (nWordCount > 0)
2046 497143 : VALIDATE_POINTER0(pData, "GDALSwapWords");
2047 :
2048 497143 : GByte *pabyData = static_cast<GByte *>(pData);
2049 :
2050 497143 : switch (nWordSize)
2051 : {
2052 7234 : case 1:
2053 7234 : break;
2054 :
2055 476903 : case 2:
2056 476903 : CPLAssert(nWordSkip >= 2 || nWordCount == 1);
2057 228062000 : for (int i = 0; i < nWordCount; i++)
2058 : {
2059 227585000 : CPL_SWAP16PTR(pabyData);
2060 227585000 : pabyData += nWordSkip;
2061 : }
2062 476903 : break;
2063 :
2064 10580 : case 4:
2065 10580 : CPLAssert(nWordSkip >= 4 || nWordCount == 1);
2066 10580 : if (CPL_IS_ALIGNED(pabyData, 4) && (nWordSkip % 4) == 0)
2067 : {
2068 29140500 : for (int i = 0; i < nWordCount; i++)
2069 : {
2070 29130000 : *reinterpret_cast<GUInt32 *>(pabyData) = CPL_SWAP32(
2071 : *reinterpret_cast<const GUInt32 *>(pabyData));
2072 29130000 : pabyData += nWordSkip;
2073 10577 : }
2074 : }
2075 : else
2076 : {
2077 9 : for (int i = 0; i < nWordCount; i++)
2078 : {
2079 6 : CPL_SWAP32PTR(pabyData);
2080 6 : pabyData += nWordSkip;
2081 : }
2082 : }
2083 10580 : break;
2084 :
2085 2426 : case 8:
2086 2426 : CPLAssert(nWordSkip >= 8 || nWordCount == 1);
2087 2426 : if (CPL_IS_ALIGNED(pabyData, 8) && (nWordSkip % 8) == 0)
2088 : {
2089 3356900 : for (int i = 0; i < nWordCount; i++)
2090 : {
2091 3354480 : *reinterpret_cast<GUInt64 *>(pabyData) = CPL_SWAP64(
2092 : *reinterpret_cast<const GUInt64 *>(pabyData));
2093 3354480 : pabyData += nWordSkip;
2094 2425 : }
2095 : }
2096 : else
2097 : {
2098 3 : for (int i = 0; i < nWordCount; i++)
2099 : {
2100 2 : CPL_SWAP64PTR(pabyData);
2101 2 : pabyData += nWordSkip;
2102 : }
2103 : }
2104 2426 : break;
2105 :
2106 0 : default:
2107 0 : CPLAssert(false);
2108 : }
2109 : }
2110 :
2111 : /************************************************************************/
2112 : /* GDALSwapWordsEx() */
2113 : /************************************************************************/
2114 :
2115 : /**
2116 : * Byte swap words in-place.
2117 : *
2118 : * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
2119 : * a memory array. No assumption is made that the words being swapped are
2120 : * word aligned in memory. Use the CPL_LSB and CPL_MSB macros from cpl_port.h
2121 : * to determine if the current platform is big endian or little endian. Use
2122 : * The macros like CPL_SWAP32() to byte swap single values without the overhead
2123 : * of a function call.
2124 : *
2125 : * @param pData pointer to start of data buffer.
2126 : * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
2127 : * @param nWordCount the number of words to be swapped in this call.
2128 : * @param nWordSkip the byte offset from the start of one word to the start of
2129 : * the next. For packed buffers this is the same as nWordSize.
2130 : */
2131 6124 : void CPL_STDCALL GDALSwapWordsEx(void *pData, int nWordSize, size_t nWordCount,
2132 : int nWordSkip)
2133 : {
2134 6124 : GByte *pabyData = static_cast<GByte *>(pData);
2135 12248 : while (nWordCount)
2136 : {
2137 : // Pick-up a multiple of 8 as max chunk size.
2138 6124 : const int nWordCountSmall =
2139 6124 : (nWordCount > (1 << 30)) ? (1 << 30) : static_cast<int>(nWordCount);
2140 6124 : GDALSwapWords(pabyData, nWordSize, nWordCountSmall, nWordSkip);
2141 6124 : pabyData += static_cast<size_t>(nWordSkip) * nWordCountSmall;
2142 6124 : nWordCount -= nWordCountSmall;
2143 : }
2144 6124 : }
2145 :
2146 : // Place the new GDALCopyWords helpers in an anonymous namespace
2147 : namespace
2148 : {
2149 :
2150 : /************************************************************************/
2151 : /* GDALCopyWordsT() */
2152 : /************************************************************************/
2153 : /**
2154 : * Template function, used to copy data from pSrcData into buffer
2155 : * pDstData, with stride nSrcPixelStride in the source data and
2156 : * stride nDstPixelStride in the destination data. This template can
2157 : * deal with the case where the input data type is real or complex and
2158 : * the output is real.
2159 : *
2160 : * @param pSrcData the source data buffer
2161 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
2162 : * of interest.
2163 : * @param pDstData the destination buffer.
2164 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
2165 : * interest.
2166 : * @param nWordCount the total number of pixel words to copy
2167 : *
2168 : * @code
2169 : * // Assume an input buffer of type GUInt16 named pBufferIn
2170 : * GByte *pBufferOut = new GByte[numBytesOut];
2171 : * GDALCopyWordsT<GUInt16, GByte>(pSrcData, 2, pDstData, 1, numBytesOut);
2172 : * @endcode
2173 : * @note
2174 : * This is a private function, and should not be exposed outside of
2175 : * rasterio.cpp. External users should call the GDALCopyWords driver function.
2176 : */
2177 :
2178 : template <class Tin, class Tout>
2179 42454229 : static void inline GDALCopyWordsGenericT(const Tin *const CPL_RESTRICT pSrcData,
2180 : int nSrcPixelStride,
2181 : Tout *const CPL_RESTRICT pDstData,
2182 : int nDstPixelStride,
2183 : GPtrDiff_t nWordCount)
2184 : {
2185 42454229 : decltype(nWordCount) nDstOffset = 0;
2186 :
2187 42454229 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
2188 42454229 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
2189 384343861 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
2190 : {
2191 341889564 : const Tin tValue =
2192 341889564 : *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
2193 341889564 : Tout *const pOutPixel =
2194 341889564 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2195 :
2196 341889564 : GDALCopyWord(tValue, *pOutPixel);
2197 :
2198 341889564 : nDstOffset += nDstPixelStride;
2199 : }
2200 42454229 : }
2201 :
2202 : template <class Tin, class Tout>
2203 29786219 : static void CPL_NOINLINE GDALCopyWordsT(const Tin *const CPL_RESTRICT pSrcData,
2204 : int nSrcPixelStride,
2205 : Tout *const CPL_RESTRICT pDstData,
2206 : int nDstPixelStride,
2207 : GPtrDiff_t nWordCount)
2208 : {
2209 29786219 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData, nDstPixelStride,
2210 : nWordCount);
2211 29786219 : }
2212 :
2213 : template <class Tin, class Tout>
2214 5076559 : static void inline GDALCopyWordsT_8atatime(
2215 : const Tin *const CPL_RESTRICT pSrcData, int nSrcPixelStride,
2216 : Tout *const CPL_RESTRICT pDstData, int nDstPixelStride,
2217 : GPtrDiff_t nWordCount)
2218 : {
2219 5076559 : decltype(nWordCount) nDstOffset = 0;
2220 :
2221 5076559 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
2222 5076559 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
2223 5076559 : decltype(nWordCount) n = 0;
2224 5076559 : if (nSrcPixelStride == static_cast<int>(sizeof(Tin)) &&
2225 : nDstPixelStride == static_cast<int>(sizeof(Tout)))
2226 : {
2227 57868365 : for (; n < nWordCount - 7; n += 8)
2228 : {
2229 57324286 : const Tin *pInValues = reinterpret_cast<const Tin *>(
2230 57324286 : pSrcDataPtr + (n * nSrcPixelStride));
2231 57324286 : Tout *const pOutPixels =
2232 57324286 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2233 :
2234 57324286 : GDALCopy8Words(pInValues, pOutPixels);
2235 :
2236 57324286 : nDstOffset += 8 * nDstPixelStride;
2237 : }
2238 : }
2239 10454636 : for (; n < nWordCount; n++)
2240 : {
2241 5378077 : const Tin tValue =
2242 5378077 : *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
2243 5378077 : Tout *const pOutPixel =
2244 5378077 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
2245 :
2246 5378077 : GDALCopyWord(tValue, *pOutPixel);
2247 :
2248 5378077 : nDstOffset += nDstPixelStride;
2249 : }
2250 5076559 : }
2251 :
2252 : #ifdef HAVE_SSE2
2253 :
2254 : template <class Tout>
2255 39717 : void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
2256 : int nSrcPixelStride,
2257 : Tout *const CPL_RESTRICT pDstData,
2258 : int nDstPixelStride, GPtrDiff_t nWordCount)
2259 : {
2260 : static_assert(std::is_integral<Tout>::value &&
2261 : sizeof(Tout) == sizeof(uint16_t),
2262 : "Bad Tout");
2263 39717 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2264 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2265 : {
2266 33366 : decltype(nWordCount) n = 0;
2267 33366 : const __m128i xmm_zero = _mm_setzero_si128();
2268 33366 : GByte *CPL_RESTRICT pabyDstDataPtr =
2269 : reinterpret_cast<GByte *>(pDstData);
2270 1415762 : for (; n < nWordCount - 15; n += 16)
2271 : {
2272 1382396 : __m128i xmm = _mm_loadu_si128(
2273 1382396 : reinterpret_cast<const __m128i *>(pSrcData + n));
2274 1382396 : __m128i xmm0 = _mm_unpacklo_epi8(xmm, xmm_zero);
2275 1382396 : __m128i xmm1 = _mm_unpackhi_epi8(xmm, xmm_zero);
2276 : _mm_storeu_si128(
2277 1382396 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2), xmm0);
2278 : _mm_storeu_si128(
2279 1382396 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2 + 16), xmm1);
2280 : }
2281 109389 : for (; n < nWordCount; n++)
2282 : {
2283 76023 : pDstData[n] = pSrcData[n];
2284 33366 : }
2285 : }
2286 : else
2287 : {
2288 6351 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2289 : nDstPixelStride, nWordCount);
2290 : }
2291 39717 : }
2292 :
2293 : template <>
2294 26977 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2295 : int nSrcPixelStride,
2296 : GUInt16 *const CPL_RESTRICT pDstData,
2297 : int nDstPixelStride, GPtrDiff_t nWordCount)
2298 : {
2299 26977 : GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
2300 : nDstPixelStride, nWordCount);
2301 26977 : }
2302 :
2303 : template <>
2304 12740 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2305 : int nSrcPixelStride,
2306 : GInt16 *const CPL_RESTRICT pDstData,
2307 : int nDstPixelStride, GPtrDiff_t nWordCount)
2308 : {
2309 12740 : GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
2310 : nDstPixelStride, nWordCount);
2311 12740 : }
2312 :
2313 : template <class Tout>
2314 12854476 : void GDALCopyWordsByteTo32Bit(const GByte *const CPL_RESTRICT pSrcData,
2315 : int nSrcPixelStride,
2316 : Tout *const CPL_RESTRICT pDstData,
2317 : int nDstPixelStride, GPtrDiff_t nWordCount)
2318 : {
2319 : static_assert(std::is_integral<Tout>::value &&
2320 : sizeof(Tout) == sizeof(uint32_t),
2321 : "Bad Tout");
2322 12854476 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2323 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2324 : {
2325 6293756 : decltype(nWordCount) n = 0;
2326 6293756 : const __m128i xmm_zero = _mm_setzero_si128();
2327 6293756 : GByte *CPL_RESTRICT pabyDstDataPtr =
2328 : reinterpret_cast<GByte *>(pDstData);
2329 70192427 : for (; n < nWordCount - 15; n += 16)
2330 : {
2331 63898661 : __m128i xmm = _mm_loadu_si128(
2332 63898661 : reinterpret_cast<const __m128i *>(pSrcData + n));
2333 63898661 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2334 63898661 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2335 63898661 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2336 63898661 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2337 63898661 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2338 63898661 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2339 : _mm_storeu_si128(
2340 63898661 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4), xmm0);
2341 : _mm_storeu_si128(
2342 63898661 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 16), xmm1);
2343 : _mm_storeu_si128(
2344 63898661 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 32), xmm2);
2345 : _mm_storeu_si128(
2346 63898661 : reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 48), xmm3);
2347 : }
2348 14581316 : for (; n < nWordCount; n++)
2349 : {
2350 8287610 : pDstData[n] = pSrcData[n];
2351 6293756 : }
2352 : }
2353 : else
2354 : {
2355 6560690 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2356 : nDstPixelStride, nWordCount);
2357 : }
2358 12854476 : }
2359 :
2360 : template <>
2361 476 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2362 : int nSrcPixelStride,
2363 : GUInt32 *const CPL_RESTRICT pDstData,
2364 : int nDstPixelStride, GPtrDiff_t nWordCount)
2365 : {
2366 476 : GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
2367 : nDstPixelStride, nWordCount);
2368 476 : }
2369 :
2370 : template <>
2371 12854000 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2372 : int nSrcPixelStride,
2373 : GInt32 *const CPL_RESTRICT pDstData,
2374 : int nDstPixelStride, GPtrDiff_t nWordCount)
2375 : {
2376 12854000 : GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
2377 : nDstPixelStride, nWordCount);
2378 12854000 : }
2379 :
2380 : template <>
2381 2476020 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2382 : int nSrcPixelStride,
2383 : float *const CPL_RESTRICT pDstData,
2384 : int nDstPixelStride, GPtrDiff_t nWordCount)
2385 : {
2386 2476020 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2387 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2388 : {
2389 115285 : decltype(nWordCount) n = 0;
2390 115285 : const __m128i xmm_zero = _mm_setzero_si128();
2391 115285 : GByte *CPL_RESTRICT pabyDstDataPtr =
2392 : reinterpret_cast<GByte *>(pDstData);
2393 3324090 : for (; n < nWordCount - 15; n += 16)
2394 : {
2395 3208800 : __m128i xmm = _mm_loadu_si128(
2396 3208800 : reinterpret_cast<const __m128i *>(pSrcData + n));
2397 3208800 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2398 3208800 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2399 3208800 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2400 3208800 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2401 3208800 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2402 3208800 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2403 3208800 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2404 3208800 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2405 3208800 : __m128 xmm2_f = _mm_cvtepi32_ps(xmm2);
2406 3208800 : __m128 xmm3_f = _mm_cvtepi32_ps(xmm3);
2407 3208800 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2408 : xmm0_f);
2409 : _mm_storeu_ps(
2410 3208800 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2411 : _mm_storeu_ps(
2412 3208800 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 32), xmm2_f);
2413 : _mm_storeu_ps(
2414 3208800 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 48), xmm3_f);
2415 : }
2416 502808 : for (; n < nWordCount; n++)
2417 : {
2418 387523 : pDstData[n] = pSrcData[n];
2419 115285 : }
2420 : }
2421 : else
2422 : {
2423 2360740 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2424 : nDstPixelStride, nWordCount);
2425 : }
2426 2476020 : }
2427 :
2428 : template <>
2429 169970 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
2430 : int nSrcPixelStride,
2431 : double *const CPL_RESTRICT pDstData,
2432 : int nDstPixelStride, GPtrDiff_t nWordCount)
2433 : {
2434 169970 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2435 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2436 : {
2437 146506 : decltype(nWordCount) n = 0;
2438 146506 : const __m128i xmm_zero = _mm_setzero_si128();
2439 146506 : GByte *CPL_RESTRICT pabyDstDataPtr =
2440 : reinterpret_cast<GByte *>(pDstData);
2441 3126180 : for (; n < nWordCount - 15; n += 16)
2442 : {
2443 2979670 : __m128i xmm = _mm_loadu_si128(
2444 2979670 : reinterpret_cast<const __m128i *>(pSrcData + n));
2445 2979670 : __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
2446 2979670 : __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
2447 2979670 : __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
2448 2979670 : __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
2449 2979670 : __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
2450 2979670 : __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
2451 :
2452 : #if defined(__AVX2__) && defined(slightly_slower_than_SSE2)
2453 : _mm256_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2454 : _mm256_cvtepi32_pd(xmm0));
2455 : _mm256_storeu_pd(
2456 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2457 : _mm256_cvtepi32_pd(xmm1));
2458 : _mm256_storeu_pd(
2459 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
2460 : _mm256_cvtepi32_pd(xmm2));
2461 : _mm256_storeu_pd(
2462 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
2463 : _mm256_cvtepi32_pd(xmm3));
2464 : #else
2465 2979670 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2466 2979670 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2467 2979670 : __m128d xmm2_low_d = _mm_cvtepi32_pd(xmm2);
2468 2979670 : __m128d xmm3_low_d = _mm_cvtepi32_pd(xmm3);
2469 2979670 : xmm0 = _mm_srli_si128(xmm0, 8);
2470 2979670 : xmm1 = _mm_srli_si128(xmm1, 8);
2471 2979670 : xmm2 = _mm_srli_si128(xmm2, 8);
2472 2979670 : xmm3 = _mm_srli_si128(xmm3, 8);
2473 2979670 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2474 2979670 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2475 2979670 : __m128d xmm2_high_d = _mm_cvtepi32_pd(xmm2);
2476 2979670 : __m128d xmm3_high_d = _mm_cvtepi32_pd(xmm3);
2477 :
2478 2979670 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2479 : xmm0_low_d);
2480 : _mm_storeu_pd(
2481 2979670 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2482 : xmm0_high_d);
2483 : _mm_storeu_pd(
2484 2979670 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2485 : xmm1_low_d);
2486 : _mm_storeu_pd(
2487 2979670 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2488 : xmm1_high_d);
2489 : _mm_storeu_pd(
2490 2979670 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
2491 : xmm2_low_d);
2492 : _mm_storeu_pd(
2493 2979670 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 80),
2494 : xmm2_high_d);
2495 : _mm_storeu_pd(
2496 2979670 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
2497 : xmm3_low_d);
2498 : _mm_storeu_pd(
2499 2979670 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 112),
2500 : xmm3_high_d);
2501 : #endif
2502 : }
2503 278002 : for (; n < nWordCount; n++)
2504 : {
2505 131496 : pDstData[n] = pSrcData[n];
2506 146506 : }
2507 : }
2508 : else
2509 : {
2510 23464 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2511 : nDstPixelStride, nWordCount);
2512 : }
2513 169970 : }
2514 :
2515 : template <>
2516 148 : CPL_NOINLINE void GDALCopyWordsT(const uint8_t *const CPL_RESTRICT pSrcData,
2517 : int nSrcPixelStride,
2518 : int8_t *const CPL_RESTRICT pDstData,
2519 : int nDstPixelStride, GPtrDiff_t nWordCount)
2520 : {
2521 148 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2522 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2523 : {
2524 142 : decltype(nWordCount) n = 0;
2525 142 : const __m128i xmm_127 = _mm_set1_epi8(127);
2526 146 : for (; n < nWordCount - 31; n += 32)
2527 : {
2528 8 : __m128i xmm0 = _mm_loadu_si128(
2529 4 : reinterpret_cast<const __m128i *>(pSrcData + n));
2530 4 : __m128i xmm1 = _mm_loadu_si128(
2531 4 : reinterpret_cast<const __m128i *>(pSrcData + n + 16));
2532 4 : xmm0 = _mm_min_epu8(xmm0, xmm_127);
2533 4 : xmm1 = _mm_min_epu8(xmm1, xmm_127);
2534 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2535 4 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
2536 : xmm1);
2537 : }
2538 2424 : for (; n < nWordCount; n++)
2539 : {
2540 2282 : pDstData[n] =
2541 2282 : pSrcData[n] >= 127 ? 127 : static_cast<int8_t>(pSrcData[n]);
2542 142 : }
2543 : }
2544 : else
2545 : {
2546 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2547 : nDstPixelStride, nWordCount);
2548 : }
2549 148 : }
2550 :
2551 : template <>
2552 82 : CPL_NOINLINE void GDALCopyWordsT(const int8_t *const CPL_RESTRICT pSrcData,
2553 : int nSrcPixelStride,
2554 : uint8_t *const CPL_RESTRICT pDstData,
2555 : int nDstPixelStride, GPtrDiff_t nWordCount)
2556 : {
2557 82 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2558 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2559 : {
2560 56 : decltype(nWordCount) n = 0;
2561 : #if !(defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS))
2562 56 : const __m128i xmm_INT8_to_UINT8 = _mm_set1_epi8(-128);
2563 : #endif
2564 117 : for (; n < nWordCount - 31; n += 32)
2565 : {
2566 122 : __m128i xmm0 = _mm_loadu_si128(
2567 61 : reinterpret_cast<const __m128i *>(pSrcData + n));
2568 61 : __m128i xmm1 = _mm_loadu_si128(
2569 61 : reinterpret_cast<const __m128i *>(pSrcData + n + 16));
2570 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2571 : xmm0 = _mm_max_epi8(xmm0, _mm_setzero_si128());
2572 : xmm1 = _mm_max_epi8(xmm1, _mm_setzero_si128());
2573 : #else
2574 61 : xmm0 = _mm_add_epi8(xmm0, xmm_INT8_to_UINT8);
2575 61 : xmm1 = _mm_add_epi8(xmm1, xmm_INT8_to_UINT8);
2576 61 : xmm0 = _mm_max_epu8(xmm0, xmm_INT8_to_UINT8);
2577 61 : xmm1 = _mm_max_epu8(xmm1, xmm_INT8_to_UINT8);
2578 61 : xmm0 = _mm_sub_epi8(xmm0, xmm_INT8_to_UINT8);
2579 61 : xmm1 = _mm_sub_epi8(xmm1, xmm_INT8_to_UINT8);
2580 : #endif
2581 61 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2582 61 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
2583 : xmm1);
2584 : }
2585 352 : for (; n < nWordCount; n++)
2586 : {
2587 296 : pDstData[n] =
2588 296 : pSrcData[n] < 0 ? 0 : static_cast<uint8_t>(pSrcData[n]);
2589 56 : }
2590 : }
2591 : else
2592 : {
2593 26 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2594 : nDstPixelStride, nWordCount);
2595 : }
2596 82 : }
2597 :
2598 : template <>
2599 6037 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2600 : int nSrcPixelStride,
2601 : uint8_t *const CPL_RESTRICT pDstData,
2602 : int nDstPixelStride, GPtrDiff_t nWordCount)
2603 : {
2604 6037 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2605 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2606 : {
2607 5062 : decltype(nWordCount) n = 0;
2608 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2609 : const auto xmm_MAX_INT16 = _mm_set1_epi16(32767);
2610 : #else
2611 : // In SSE2, min_epu16 does not exist, so shift from
2612 : // UInt16 to SInt16 to be able to use min_epi16
2613 5062 : const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
2614 5062 : const __m128i xmm_m255_shifted = _mm_set1_epi16(255 - 32768);
2615 : #endif
2616 71888 : for (; n < nWordCount - 15; n += 16)
2617 : {
2618 133652 : __m128i xmm0 = _mm_loadu_si128(
2619 66826 : reinterpret_cast<const __m128i *>(pSrcData + n));
2620 66826 : __m128i xmm1 = _mm_loadu_si128(
2621 66826 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2622 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2623 : xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
2624 : xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
2625 : #else
2626 66826 : xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
2627 66826 : xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
2628 66826 : xmm0 = _mm_min_epi16(xmm0, xmm_m255_shifted);
2629 66826 : xmm1 = _mm_min_epi16(xmm1, xmm_m255_shifted);
2630 66826 : xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
2631 66826 : xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
2632 : #endif
2633 66826 : xmm0 = _mm_packus_epi16(xmm0, xmm1);
2634 66826 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2635 : }
2636 16403 : for (; n < nWordCount; n++)
2637 : {
2638 11341 : pDstData[n] =
2639 11341 : pSrcData[n] >= 255 ? 255 : static_cast<uint8_t>(pSrcData[n]);
2640 5062 : }
2641 : }
2642 : else
2643 : {
2644 975 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2645 : nDstPixelStride, nWordCount);
2646 : }
2647 6037 : }
2648 :
2649 : template <>
2650 46 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2651 : int nSrcPixelStride,
2652 : int16_t *const CPL_RESTRICT pDstData,
2653 : int nDstPixelStride, GPtrDiff_t nWordCount)
2654 : {
2655 46 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2656 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2657 : {
2658 40 : decltype(nWordCount) n = 0;
2659 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2660 : const __m128i xmm_MAX_INT16 = _mm_set1_epi16(32767);
2661 : #else
2662 : // In SSE2, min_epu16 does not exist, so shift from
2663 : // UInt16 to SInt16 to be able to use min_epi16
2664 40 : const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
2665 40 : const __m128i xmm_32767_shifted = _mm_set1_epi16(32767 - 32768);
2666 : #endif
2667 169 : for (; n < nWordCount - 15; n += 16)
2668 : {
2669 258 : __m128i xmm0 = _mm_loadu_si128(
2670 129 : reinterpret_cast<const __m128i *>(pSrcData + n));
2671 129 : __m128i xmm1 = _mm_loadu_si128(
2672 129 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2673 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2674 : xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
2675 : xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
2676 : #else
2677 129 : xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
2678 129 : xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
2679 129 : xmm0 = _mm_min_epi16(xmm0, xmm_32767_shifted);
2680 129 : xmm1 = _mm_min_epi16(xmm1, xmm_32767_shifted);
2681 129 : xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
2682 129 : xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
2683 : #endif
2684 129 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2685 129 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
2686 : xmm1);
2687 : }
2688 191 : for (; n < nWordCount; n++)
2689 : {
2690 282 : pDstData[n] = pSrcData[n] >= 32767
2691 : ? 32767
2692 131 : : static_cast<int16_t>(pSrcData[n]);
2693 40 : }
2694 : }
2695 : else
2696 : {
2697 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2698 : nDstPixelStride, nWordCount);
2699 : }
2700 46 : }
2701 :
2702 : template <>
2703 135 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2704 : int nSrcPixelStride,
2705 : uint16_t *const CPL_RESTRICT pDstData,
2706 : int nDstPixelStride, GPtrDiff_t nWordCount)
2707 : {
2708 135 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2709 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2710 : {
2711 92 : decltype(nWordCount) n = 0;
2712 92 : const __m128i xmm_zero = _mm_setzero_si128();
2713 277 : for (; n < nWordCount - 15; n += 16)
2714 : {
2715 370 : __m128i xmm0 = _mm_loadu_si128(
2716 185 : reinterpret_cast<const __m128i *>(pSrcData + n));
2717 185 : __m128i xmm1 = _mm_loadu_si128(
2718 185 : reinterpret_cast<const __m128i *>(pSrcData + n + 8));
2719 185 : xmm0 = _mm_max_epi16(xmm0, xmm_zero);
2720 185 : xmm1 = _mm_max_epi16(xmm1, xmm_zero);
2721 185 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2722 185 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
2723 : xmm1);
2724 : }
2725 468 : for (; n < nWordCount; n++)
2726 : {
2727 376 : pDstData[n] =
2728 376 : pSrcData[n] < 0 ? 0 : static_cast<uint16_t>(pSrcData[n]);
2729 92 : }
2730 : }
2731 : else
2732 : {
2733 43 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2734 : nDstPixelStride, nWordCount);
2735 : }
2736 135 : }
2737 :
2738 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2739 :
2740 : template <>
2741 : CPL_NOINLINE void GDALCopyWordsT(const uint32_t *const CPL_RESTRICT pSrcData,
2742 : int nSrcPixelStride,
2743 : int32_t *const CPL_RESTRICT pDstData,
2744 : int nDstPixelStride, GPtrDiff_t nWordCount)
2745 : {
2746 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2747 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2748 : {
2749 : decltype(nWordCount) n = 0;
2750 : const __m128i xmm_MAX_INT = _mm_set1_epi32(INT_MAX);
2751 : for (; n < nWordCount - 8; n += 7)
2752 : {
2753 : __m128i xmm0 = _mm_loadu_si128(
2754 : reinterpret_cast<const __m128i *>(pSrcData + n));
2755 : __m128i xmm1 = _mm_loadu_si128(
2756 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
2757 : xmm0 = _mm_min_epu32(xmm0, xmm_MAX_INT);
2758 : xmm1 = _mm_min_epu32(xmm1, xmm_MAX_INT);
2759 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2760 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
2761 : xmm1);
2762 : }
2763 : for (; n < nWordCount; n++)
2764 : {
2765 : pDstData[n] = pSrcData[n] >= INT_MAX
2766 : ? INT_MAX
2767 : : static_cast<int32_t>(pSrcData[n]);
2768 : }
2769 : }
2770 : else
2771 : {
2772 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2773 : nDstPixelStride, nWordCount);
2774 : }
2775 : }
2776 :
2777 : template <>
2778 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
2779 : int nSrcPixelStride,
2780 : uint32_t *const CPL_RESTRICT pDstData,
2781 : int nDstPixelStride, GPtrDiff_t nWordCount)
2782 : {
2783 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2784 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2785 : {
2786 : decltype(nWordCount) n = 0;
2787 : const __m128i xmm_zero = _mm_setzero_si128();
2788 : for (; n < nWordCount - 7; n += 8)
2789 : {
2790 : __m128i xmm0 = _mm_loadu_si128(
2791 : reinterpret_cast<const __m128i *>(pSrcData + n));
2792 : __m128i xmm1 = _mm_loadu_si128(
2793 : reinterpret_cast<const __m128i *>(pSrcData + n + 4));
2794 : xmm0 = _mm_max_epi32(xmm0, xmm_zero);
2795 : xmm1 = _mm_max_epi32(xmm1, xmm_zero);
2796 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
2797 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
2798 : xmm1);
2799 : }
2800 : for (; n < nWordCount; n++)
2801 : {
2802 : pDstData[n] =
2803 : pSrcData[n] < 0 ? 0 : static_cast<uint32_t>(pSrcData[n]);
2804 : }
2805 : }
2806 : else
2807 : {
2808 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2809 : nDstPixelStride, nWordCount);
2810 : }
2811 : }
2812 :
2813 : #endif // defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
2814 :
2815 : template <>
2816 343 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2817 : int nSrcPixelStride,
2818 : float *const CPL_RESTRICT pDstData,
2819 : int nDstPixelStride, GPtrDiff_t nWordCount)
2820 : {
2821 343 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2822 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2823 : {
2824 337 : decltype(nWordCount) n = 0;
2825 337 : const __m128i xmm_zero = _mm_setzero_si128();
2826 337 : GByte *CPL_RESTRICT pabyDstDataPtr =
2827 : reinterpret_cast<GByte *>(pDstData);
2828 1508 : for (; n < nWordCount - 7; n += 8)
2829 : {
2830 1171 : __m128i xmm = _mm_loadu_si128(
2831 1171 : reinterpret_cast<const __m128i *>(pSrcData + n));
2832 1171 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
2833 1171 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
2834 1171 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2835 1171 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2836 1171 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2837 : xmm0_f);
2838 : _mm_storeu_ps(
2839 1171 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2840 : }
2841 1115 : for (; n < nWordCount; n++)
2842 : {
2843 778 : pDstData[n] = pSrcData[n];
2844 337 : }
2845 : }
2846 : else
2847 : {
2848 6 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2849 : nDstPixelStride, nWordCount);
2850 : }
2851 343 : }
2852 :
2853 : template <>
2854 1073480 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2855 : int nSrcPixelStride,
2856 : float *const CPL_RESTRICT pDstData,
2857 : int nDstPixelStride, GPtrDiff_t nWordCount)
2858 : {
2859 1073480 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2860 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2861 : {
2862 83580 : decltype(nWordCount) n = 0;
2863 83580 : GByte *CPL_RESTRICT pabyDstDataPtr =
2864 : reinterpret_cast<GByte *>(pDstData);
2865 565267 : for (; n < nWordCount - 7; n += 8)
2866 : {
2867 481687 : __m128i xmm = _mm_loadu_si128(
2868 481687 : reinterpret_cast<const __m128i *>(pSrcData + n));
2869 481687 : const auto sign = _mm_srai_epi16(xmm, 15);
2870 481687 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
2871 481687 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
2872 481687 : __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
2873 481687 : __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
2874 481687 : _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
2875 : xmm0_f);
2876 : _mm_storeu_ps(
2877 481687 : reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
2878 : }
2879 244181 : for (; n < nWordCount; n++)
2880 : {
2881 160601 : pDstData[n] = pSrcData[n];
2882 83580 : }
2883 : }
2884 : else
2885 : {
2886 989901 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2887 : nDstPixelStride, nWordCount);
2888 : }
2889 1073480 : }
2890 :
2891 : template <>
2892 405 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
2893 : int nSrcPixelStride,
2894 : double *const CPL_RESTRICT pDstData,
2895 : int nDstPixelStride, GPtrDiff_t nWordCount)
2896 : {
2897 405 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2898 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2899 : {
2900 293 : decltype(nWordCount) n = 0;
2901 293 : const __m128i xmm_zero = _mm_setzero_si128();
2902 293 : GByte *CPL_RESTRICT pabyDstDataPtr =
2903 : reinterpret_cast<GByte *>(pDstData);
2904 809 : for (; n < nWordCount - 7; n += 8)
2905 : {
2906 516 : __m128i xmm = _mm_loadu_si128(
2907 516 : reinterpret_cast<const __m128i *>(pSrcData + n));
2908 516 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
2909 516 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
2910 :
2911 516 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2912 516 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2913 516 : xmm0 = _mm_srli_si128(xmm0, 8);
2914 516 : xmm1 = _mm_srli_si128(xmm1, 8);
2915 516 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2916 516 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2917 :
2918 516 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2919 : xmm0_low_d);
2920 : _mm_storeu_pd(
2921 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2922 : xmm0_high_d);
2923 : _mm_storeu_pd(
2924 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2925 : xmm1_low_d);
2926 : _mm_storeu_pd(
2927 516 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2928 : xmm1_high_d);
2929 : }
2930 1034 : for (; n < nWordCount; n++)
2931 : {
2932 741 : pDstData[n] = pSrcData[n];
2933 293 : }
2934 : }
2935 : else
2936 : {
2937 112 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2938 : nDstPixelStride, nWordCount);
2939 : }
2940 405 : }
2941 :
2942 : template <>
2943 2760350 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
2944 : int nSrcPixelStride,
2945 : double *const CPL_RESTRICT pDstData,
2946 : int nDstPixelStride, GPtrDiff_t nWordCount)
2947 : {
2948 2760350 : if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
2949 : nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
2950 : {
2951 34660 : decltype(nWordCount) n = 0;
2952 34660 : GByte *CPL_RESTRICT pabyDstDataPtr =
2953 : reinterpret_cast<GByte *>(pDstData);
2954 401770 : for (; n < nWordCount - 7; n += 8)
2955 : {
2956 367110 : __m128i xmm = _mm_loadu_si128(
2957 367110 : reinterpret_cast<const __m128i *>(pSrcData + n));
2958 367110 : const auto sign = _mm_srai_epi16(xmm, 15);
2959 367110 : __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
2960 367110 : __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
2961 :
2962 367110 : __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
2963 367110 : __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
2964 367110 : xmm0 = _mm_srli_si128(xmm0, 8);
2965 367110 : xmm1 = _mm_srli_si128(xmm1, 8);
2966 367110 : __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
2967 367110 : __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
2968 :
2969 367110 : _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
2970 : xmm0_low_d);
2971 : _mm_storeu_pd(
2972 367110 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
2973 : xmm0_high_d);
2974 : _mm_storeu_pd(
2975 367110 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
2976 : xmm1_low_d);
2977 : _mm_storeu_pd(
2978 367110 : reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
2979 : xmm1_high_d);
2980 : }
2981 253693 : for (; n < nWordCount; n++)
2982 : {
2983 219033 : pDstData[n] = pSrcData[n];
2984 34660 : }
2985 : }
2986 : else
2987 : {
2988 2725690 : GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
2989 : nDstPixelStride, nWordCount);
2990 : }
2991 2760350 : }
2992 :
2993 : #endif // HAVE_SSE2
2994 :
2995 : template <>
2996 4420700 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
2997 : int nSrcPixelStride,
2998 : GByte *const CPL_RESTRICT pDstData,
2999 : int nDstPixelStride, GPtrDiff_t nWordCount)
3000 : {
3001 4420700 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3002 : nDstPixelStride, nWordCount);
3003 4420700 : }
3004 :
3005 : template <>
3006 38235 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3007 : int nSrcPixelStride,
3008 : GUInt16 *const CPL_RESTRICT pDstData,
3009 : int nDstPixelStride, GPtrDiff_t nWordCount)
3010 : {
3011 38235 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3012 : nDstPixelStride, nWordCount);
3013 38235 : }
3014 :
3015 : template <>
3016 54830 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3017 : int nSrcPixelStride,
3018 : double *const CPL_RESTRICT pDstData,
3019 : int nDstPixelStride, GPtrDiff_t nWordCount)
3020 : {
3021 54830 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3022 : nDstPixelStride, nWordCount);
3023 54830 : }
3024 :
3025 : template <>
3026 122131 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3027 : int nSrcPixelStride,
3028 : float *const CPL_RESTRICT pDstData,
3029 : int nDstPixelStride, GPtrDiff_t nWordCount)
3030 : {
3031 122131 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3032 : nDstPixelStride, nWordCount);
3033 122131 : }
3034 :
3035 : template <>
3036 396 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
3037 : int nSrcPixelStride,
3038 : float *const CPL_RESTRICT pDstData,
3039 : int nDstPixelStride, GPtrDiff_t nWordCount)
3040 : {
3041 396 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3042 : nDstPixelStride, nWordCount);
3043 396 : }
3044 :
3045 : template <>
3046 544 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
3047 : int nSrcPixelStride,
3048 : double *const CPL_RESTRICT pDstData,
3049 : int nDstPixelStride, GPtrDiff_t nWordCount)
3050 : {
3051 544 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3052 : nDstPixelStride, nWordCount);
3053 544 : }
3054 :
3055 : template <>
3056 318163 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3057 : int nSrcPixelStride,
3058 : GByte *const CPL_RESTRICT pDstData,
3059 : int nDstPixelStride, GPtrDiff_t nWordCount)
3060 : {
3061 318163 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3062 : nDstPixelStride, nWordCount);
3063 318163 : }
3064 :
3065 : template <>
3066 55 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3067 : int nSrcPixelStride,
3068 : GInt8 *const CPL_RESTRICT pDstData,
3069 : int nDstPixelStride, GPtrDiff_t nWordCount)
3070 : {
3071 55 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3072 : nDstPixelStride, nWordCount);
3073 55 : }
3074 :
3075 : template <>
3076 15775 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3077 : int nSrcPixelStride,
3078 : GInt16 *const CPL_RESTRICT pDstData,
3079 : int nDstPixelStride, GPtrDiff_t nWordCount)
3080 : {
3081 15775 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3082 : nDstPixelStride, nWordCount);
3083 15775 : }
3084 :
3085 : template <>
3086 61713 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3087 : int nSrcPixelStride,
3088 : GUInt16 *const CPL_RESTRICT pDstData,
3089 : int nDstPixelStride, GPtrDiff_t nWordCount)
3090 : {
3091 61713 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3092 : nDstPixelStride, nWordCount);
3093 61713 : }
3094 :
3095 : template <>
3096 43884 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3097 : int nSrcPixelStride,
3098 : GInt32 *const CPL_RESTRICT pDstData,
3099 : int nDstPixelStride, GPtrDiff_t nWordCount)
3100 : {
3101 43884 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3102 : nDstPixelStride, nWordCount);
3103 43884 : }
3104 :
3105 : template <>
3106 72 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3107 : int nSrcPixelStride,
3108 : GFloat16 *const CPL_RESTRICT pDstData,
3109 : int nDstPixelStride, GPtrDiff_t nWordCount)
3110 : {
3111 72 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3112 : nDstPixelStride, nWordCount);
3113 72 : }
3114 :
3115 : template <>
3116 61 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3117 : int nSrcPixelStride,
3118 : GFloat16 *const CPL_RESTRICT pDstData,
3119 : int nDstPixelStride, GPtrDiff_t nWordCount)
3120 : {
3121 61 : GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3122 : nDstPixelStride, nWordCount);
3123 61 : }
3124 :
3125 : /************************************************************************/
3126 : /* GDALCopyWordsComplexT() */
3127 : /************************************************************************/
3128 : /**
3129 : * Template function, used to copy data from pSrcData into buffer
3130 : * pDstData, with stride nSrcPixelStride in the source data and
3131 : * stride nDstPixelStride in the destination data. Deals with the
3132 : * complex case, where input is complex and output is complex.
3133 : *
3134 : * @param pSrcData the source data buffer
3135 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
3136 : * of interest.
3137 : * @param pDstData the destination buffer.
3138 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
3139 : * interest.
3140 : * @param nWordCount the total number of pixel words to copy
3141 : *
3142 : */
3143 : template <class Tin, class Tout>
3144 98631 : inline void GDALCopyWordsComplexT(const Tin *const CPL_RESTRICT pSrcData,
3145 : int nSrcPixelStride,
3146 : Tout *const CPL_RESTRICT pDstData,
3147 : int nDstPixelStride, GPtrDiff_t nWordCount)
3148 : {
3149 98631 : decltype(nWordCount) nDstOffset = 0;
3150 98631 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
3151 98631 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
3152 :
3153 5630497 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
3154 : {
3155 5531861 : const Tin *const pPixelIn =
3156 5531861 : reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
3157 5531861 : Tout *const pPixelOut =
3158 5531861 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
3159 :
3160 5531861 : GDALCopyWord(pPixelIn[0], pPixelOut[0]);
3161 5531861 : GDALCopyWord(pPixelIn[1], pPixelOut[1]);
3162 :
3163 5531861 : nDstOffset += nDstPixelStride;
3164 : }
3165 98631 : }
3166 :
3167 : /************************************************************************/
3168 : /* GDALCopyWordsComplexOutT() */
3169 : /************************************************************************/
3170 : /**
3171 : * Template function, used to copy data from pSrcData into buffer
3172 : * pDstData, with stride nSrcPixelStride in the source data and
3173 : * stride nDstPixelStride in the destination data. Deals with the
3174 : * case where the value is real coming in, but complex going out.
3175 : *
3176 : * @param pSrcData the source data buffer
3177 : * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
3178 : * of interest, in bytes.
3179 : * @param pDstData the destination buffer.
3180 : * @param nDstPixelStride the stride in the buffer pDstData for pixels of
3181 : * interest, in bytes.
3182 : * @param nWordCount the total number of pixel words to copy
3183 : *
3184 : */
3185 : template <class Tin, class Tout>
3186 4394 : inline void GDALCopyWordsComplexOutT(const Tin *const CPL_RESTRICT pSrcData,
3187 : int nSrcPixelStride,
3188 : Tout *const CPL_RESTRICT pDstData,
3189 : int nDstPixelStride, GPtrDiff_t nWordCount)
3190 : {
3191 4394 : decltype(nWordCount) nDstOffset = 0;
3192 :
3193 4394 : const Tout tOutZero = static_cast<Tout>(0);
3194 :
3195 4394 : const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
3196 4394 : char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
3197 :
3198 1188704 : for (decltype(nWordCount) n = 0; n < nWordCount; n++)
3199 : {
3200 1184310 : const Tin tValue =
3201 1184310 : *reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
3202 1184310 : Tout *const pPixelOut =
3203 1184310 : reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
3204 1184310 : GDALCopyWord(tValue, *pPixelOut);
3205 :
3206 1184310 : pPixelOut[1] = tOutZero;
3207 :
3208 1184310 : nDstOffset += nDstPixelStride;
3209 : }
3210 4394 : }
3211 :
3212 : /************************************************************************/
3213 : /* GDALCopyWordsFromT() */
3214 : /************************************************************************/
3215 : /**
3216 : * Template driver function. Given the input type T, call the appropriate
3217 : * GDALCopyWordsT function template for the desired output type. You should
3218 : * never call this function directly (call GDALCopyWords instead).
3219 : *
3220 : * @param pSrcData source data buffer
3221 : * @param nSrcPixelStride pixel stride in input buffer, in pixel words
3222 : * @param bInComplex input is complex
3223 : * @param pDstData destination data buffer
3224 : * @param eDstType destination data type
3225 : * @param nDstPixelStride pixel stride in output buffer, in pixel words
3226 : * @param nWordCount number of pixel words to be copied
3227 : */
3228 : template <class T>
3229 54346973 : inline void GDALCopyWordsFromT(const T *const CPL_RESTRICT pSrcData,
3230 : int nSrcPixelStride, bool bInComplex,
3231 : void *CPL_RESTRICT pDstData,
3232 : GDALDataType eDstType, int nDstPixelStride,
3233 : GPtrDiff_t nWordCount)
3234 : {
3235 54346973 : switch (eDstType)
3236 : {
3237 4783844 : case GDT_UInt8:
3238 4783844 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3239 : static_cast<unsigned char *>(pDstData),
3240 : nDstPixelStride, nWordCount);
3241 4783844 : break;
3242 753 : case GDT_Int8:
3243 753 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3244 : static_cast<signed char *>(pDstData),
3245 : nDstPixelStride, nWordCount);
3246 753 : break;
3247 140646 : case GDT_UInt16:
3248 140646 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3249 : static_cast<unsigned short *>(pDstData),
3250 : nDstPixelStride, nWordCount);
3251 140646 : break;
3252 4162591 : case GDT_Int16:
3253 4162591 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3254 : static_cast<short *>(pDstData), nDstPixelStride,
3255 : nWordCount);
3256 4162591 : break;
3257 22554 : case GDT_UInt32:
3258 22554 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3259 : static_cast<unsigned int *>(pDstData),
3260 : nDstPixelStride, nWordCount);
3261 22554 : break;
3262 26066731 : case GDT_Int32:
3263 26066731 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3264 : static_cast<int *>(pDstData), nDstPixelStride,
3265 : nWordCount);
3266 26066731 : break;
3267 1110 : case GDT_UInt64:
3268 1110 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3269 : static_cast<std::uint64_t *>(pDstData),
3270 : nDstPixelStride, nWordCount);
3271 1110 : break;
3272 5754 : case GDT_Int64:
3273 5754 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3274 : static_cast<std::int64_t *>(pDstData),
3275 : nDstPixelStride, nWordCount);
3276 5754 : break;
3277 997 : case GDT_Float16:
3278 997 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3279 : static_cast<GFloat16 *>(pDstData), nDstPixelStride,
3280 : nWordCount);
3281 997 : break;
3282 3836699 : case GDT_Float32:
3283 3836699 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3284 : static_cast<float *>(pDstData), nDstPixelStride,
3285 : nWordCount);
3286 3836699 : break;
3287 15222308 : case GDT_Float64:
3288 15222308 : GDALCopyWordsT(pSrcData, nSrcPixelStride,
3289 : static_cast<double *>(pDstData), nDstPixelStride,
3290 : nWordCount);
3291 15222308 : break;
3292 94424 : case GDT_CInt16:
3293 94424 : if (bInComplex)
3294 : {
3295 93170 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3296 : static_cast<short *>(pDstData),
3297 : nDstPixelStride, nWordCount);
3298 : }
3299 : else // input is not complex, so we need to promote to a complex
3300 : // buffer
3301 : {
3302 1254 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3303 : static_cast<short *>(pDstData),
3304 : nDstPixelStride, nWordCount);
3305 : }
3306 94424 : break;
3307 1349 : case GDT_CInt32:
3308 1349 : if (bInComplex)
3309 : {
3310 717 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3311 : static_cast<int *>(pDstData),
3312 : nDstPixelStride, nWordCount);
3313 : }
3314 : else // input is not complex, so we need to promote to a complex
3315 : // buffer
3316 : {
3317 632 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3318 : static_cast<int *>(pDstData),
3319 : nDstPixelStride, nWordCount);
3320 : }
3321 1349 : break;
3322 313 : case GDT_CFloat16:
3323 313 : if (bInComplex)
3324 : {
3325 48 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3326 : static_cast<GFloat16 *>(pDstData),
3327 : nDstPixelStride, nWordCount);
3328 : }
3329 : else // input is not complex, so we need to promote to a complex
3330 : // buffer
3331 : {
3332 265 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3333 : static_cast<GFloat16 *>(pDstData),
3334 : nDstPixelStride, nWordCount);
3335 : }
3336 313 : break;
3337 3791 : case GDT_CFloat32:
3338 3791 : if (bInComplex)
3339 : {
3340 2994 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3341 : static_cast<float *>(pDstData),
3342 : nDstPixelStride, nWordCount);
3343 : }
3344 : else // input is not complex, so we need to promote to a complex
3345 : // buffer
3346 : {
3347 797 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3348 : static_cast<float *>(pDstData),
3349 : nDstPixelStride, nWordCount);
3350 : }
3351 3791 : break;
3352 3148 : case GDT_CFloat64:
3353 3148 : if (bInComplex)
3354 : {
3355 1702 : GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
3356 : static_cast<double *>(pDstData),
3357 : nDstPixelStride, nWordCount);
3358 : }
3359 : else // input is not complex, so we need to promote to a complex
3360 : // buffer
3361 : {
3362 1446 : GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
3363 : static_cast<double *>(pDstData),
3364 : nDstPixelStride, nWordCount);
3365 : }
3366 3148 : break;
3367 0 : case GDT_Unknown:
3368 : case GDT_TypeCount:
3369 0 : CPLAssert(false);
3370 : }
3371 54346973 : }
3372 :
3373 : } // end anonymous namespace
3374 :
3375 : /************************************************************************/
3376 : /* GDALReplicateWord() */
3377 : /************************************************************************/
3378 :
3379 : template <class T>
3380 598683 : inline void GDALReplicateWordT(void *pDstData, int nDstPixelStride,
3381 : GPtrDiff_t nWordCount)
3382 : {
3383 598683 : const T valSet = *static_cast<const T *>(pDstData);
3384 598683 : if (nDstPixelStride == static_cast<int>(sizeof(T)))
3385 : {
3386 568932 : T *pDstPtr = static_cast<T *>(pDstData) + 1;
3387 31900103 : while (nWordCount >= 4)
3388 : {
3389 31331168 : nWordCount -= 4;
3390 31331168 : pDstPtr[0] = valSet;
3391 31331168 : pDstPtr[1] = valSet;
3392 31331168 : pDstPtr[2] = valSet;
3393 31331168 : pDstPtr[3] = valSet;
3394 31331168 : pDstPtr += 4;
3395 : }
3396 1470437 : while (nWordCount > 0)
3397 : {
3398 901505 : --nWordCount;
3399 901505 : *pDstPtr = valSet;
3400 901505 : pDstPtr++;
3401 : }
3402 : }
3403 : else
3404 : {
3405 29751 : GByte *pabyDstPtr = static_cast<GByte *>(pDstData) + nDstPixelStride;
3406 1040338 : while (nWordCount > 0)
3407 : {
3408 1010587 : --nWordCount;
3409 1010587 : *reinterpret_cast<T *>(pabyDstPtr) = valSet;
3410 1010587 : pabyDstPtr += nDstPixelStride;
3411 : }
3412 : }
3413 598683 : }
3414 :
// Fills a destination buffer with a single source value.
//
// The value at pSrcData (of type eSrcType) is first converted to eDstType and
// stored in the first destination word through GDALCopyWords64(); that
// converted word is then duplicated into the following nWordCount - 1 words,
// successive words being nDstPixelStride bytes apart.
//
// Within this file it is called from GDALCopyWords64() when the source pixel
// stride is 0 and nWordCount > 1.
3415 1050480 : static void GDALReplicateWord(const void *CPL_RESTRICT pSrcData,
3416 : GDALDataType eSrcType,
3417 : void *CPL_RESTRICT pDstData,
3418 : GDALDataType eDstType, int nDstPixelStride,
3419 : GPtrDiff_t nWordCount)
3420 : {
3421 : /* -----------------------------------------------------------------------
3422 : */
3423 : /* Special case when the source data is always the same value */
3424 : /* (for VRTSourcedRasterBand::IRasterIO and
3425 : * VRTDerivedRasterBand::IRasterIO*/
3426 : /* for example) */
3427 : /* -----------------------------------------------------------------------
3428 : */
3429 : // Let the general translation case do the necessary conversions
3430 : // on the first destination element.
3431 1050480 : GDALCopyWords64(pSrcData, eSrcType, 0, pDstData, eDstType, 0, 1);
3432 :
3433 : // Now copy the first element to the nWordCount - 1 following destination
3434 : // elements.
3435 1050480 : nWordCount--;
// pabyDstWord addresses the second destination word; it is used by the byte
// case below and by the CASE_DUPLICATE_COMPLEX expansions.
3436 1050480 : GByte *pabyDstWord = reinterpret_cast<GByte *>(pDstData) + nDstPixelStride;
3437 :
3438 1050480 : switch (eDstType)
3439 : {
3440 451704 : case GDT_UInt8:
3441 : case GDT_Int8:
3442 : {
// Packed bytes can be filled with a single memset(); otherwise write one
// byte per stride.
3443 451704 : if (nDstPixelStride == 1)
3444 : {
3445 380124 : if (nWordCount > 0)
3446 380124 : memset(pabyDstWord,
3447 380124 : *reinterpret_cast<const GByte *>(pDstData),
3448 : nWordCount);
3449 : }
3450 : else
3451 : {
3452 71580 : GByte valSet = *reinterpret_cast<const GByte *>(pDstData);
3453 54467500 : while (nWordCount > 0)
3454 : {
3455 54395900 : --nWordCount;
3456 54395900 : *pabyDstWord = valSet;
3457 54395900 : pabyDstWord += nDstPixelStride;
3458 : }
3459 : }
3460 451704 : break;
3461 : }
3462 :
// Non-complex types: delegate to GDALReplicateWordT<c_type>, which re-reads
// the first word from pDstData. Note: the macro is not #undef'd inside this
// function.
3463 : #define CASE_DUPLICATE_SIMPLE(enum_type, c_type) \
3464 : case enum_type: \
3465 : { \
3466 : GDALReplicateWordT<c_type>(pDstData, nDstPixelStride, nWordCount); \
3467 : break; \
3468 : }
3469 :
3470 34507 : CASE_DUPLICATE_SIMPLE(GDT_UInt16, GUInt16)
3471 202447 : CASE_DUPLICATE_SIMPLE(GDT_Int16, GInt16)
3472 56 : CASE_DUPLICATE_SIMPLE(GDT_UInt32, GUInt32)
3473 300555 : CASE_DUPLICATE_SIMPLE(GDT_Int32, GInt32)
3474 23 : CASE_DUPLICATE_SIMPLE(GDT_UInt64, std::uint64_t)
3475 1066 : CASE_DUPLICATE_SIMPLE(GDT_Int64, std::int64_t)
3476 0 : CASE_DUPLICATE_SIMPLE(GDT_Float16, GFloat16)
3477 52668 : CASE_DUPLICATE_SIMPLE(GDT_Float32, float)
3478 7361 : CASE_DUPLICATE_SIMPLE(GDT_Float64, double)
3479 :
// Complex types: each destination word is two consecutive c_type components,
// both copied from the first word. The expansion relies on the locals
// pDstData, pabyDstWord, nDstPixelStride and nWordCount of this function, and
// the macro is not #undef'd inside this function.
3480 : #define CASE_DUPLICATE_COMPLEX(enum_type, c_type) \
3481 : case enum_type: \
3482 : { \
3483 : c_type valSet1 = reinterpret_cast<const c_type *>(pDstData)[0]; \
3484 : c_type valSet2 = reinterpret_cast<const c_type *>(pDstData)[1]; \
3485 : while (nWordCount > 0) \
3486 : { \
3487 : --nWordCount; \
3488 : reinterpret_cast<c_type *>(pabyDstWord)[0] = valSet1; \
3489 : reinterpret_cast<c_type *>(pabyDstWord)[1] = valSet2; \
3490 : pabyDstWord += nDstPixelStride; \
3491 : } \
3492 : break; \
3493 : }
3494 :
3495 784 : CASE_DUPLICATE_COMPLEX(GDT_CInt16, GInt16)
3496 784 : CASE_DUPLICATE_COMPLEX(GDT_CInt32, GInt32)
3497 6 : CASE_DUPLICATE_COMPLEX(GDT_CFloat16, GFloat16)
3498 790 : CASE_DUPLICATE_COMPLEX(GDT_CFloat32, float)
3499 790 : CASE_DUPLICATE_COMPLEX(GDT_CFloat64, double)
3500 :
// These two values are not handled as data types: assert.
3501 0 : case GDT_Unknown:
3502 : case GDT_TypeCount:
3503 0 : CPLAssert(false);
3504 : }
3505 1050480 : }
3506 :
3507 : /************************************************************************/
3508 : /* GDALUnrolledCopy() */
3509 : /************************************************************************/
3510 :
// Copies nIters elements of type T from pSrc to pDest. Consecutive source
// elements are srcStride elements (not bytes) apart and consecutive
// destination elements are dstStride elements apart; both strides are
// compile-time template arguments.
//
// Two build variants are selected by the preprocessor:
//  - GCC-compatible compiler with AVX2 enabled: only the plain loop at the
//    bottom is compiled, with the "tree-vectorize" optimize attribute on the
//    function and "#pragma GCC unroll 4" applied to that loop.
//  - otherwise: a main loop manually unrolled 16 times handles groups of 16
//    elements, and the plain loop handles the remaining nIters % 16 elements.
3511 : template <class T, int srcStride, int dstStride>
3512 : #if defined(__GNUC__) && defined(__AVX2__)
3513 : __attribute__((optimize("tree-vectorize")))
3514 : #endif
3515 : static inline void
3516 3033265 : GDALUnrolledCopyGeneric(T *CPL_RESTRICT pDest, const T *CPL_RESTRICT pSrc,
3517 : GPtrDiff_t nIters)
3518 : {
3519 : #if !(defined(__GNUC__) && defined(__AVX2__))
3520 3033265 : if (nIters >= 16)
3521 : {
// Each pass copies 16 elements and then advances both pointers past them.
3522 133236907 : for (GPtrDiff_t i = nIters / 16; i != 0; i--)
3523 : {
3524 130324255 : pDest[0 * dstStride] = pSrc[0 * srcStride];
3525 130324255 : pDest[1 * dstStride] = pSrc[1 * srcStride];
3526 130324255 : pDest[2 * dstStride] = pSrc[2 * srcStride];
3527 130324255 : pDest[3 * dstStride] = pSrc[3 * srcStride];
3528 130324255 : pDest[4 * dstStride] = pSrc[4 * srcStride];
3529 130324255 : pDest[5 * dstStride] = pSrc[5 * srcStride];
3530 130324255 : pDest[6 * dstStride] = pSrc[6 * srcStride];
3531 130324255 : pDest[7 * dstStride] = pSrc[7 * srcStride];
3532 130324255 : pDest[8 * dstStride] = pSrc[8 * srcStride];
3533 130324255 : pDest[9 * dstStride] = pSrc[9 * srcStride];
3534 130324255 : pDest[10 * dstStride] = pSrc[10 * srcStride];
3535 130324255 : pDest[11 * dstStride] = pSrc[11 * srcStride];
3536 130324255 : pDest[12 * dstStride] = pSrc[12 * srcStride];
3537 130324255 : pDest[13 * dstStride] = pSrc[13 * srcStride];
3538 130324255 : pDest[14 * dstStride] = pSrc[14 * srcStride];
3539 130324255 : pDest[15 * dstStride] = pSrc[15 * srcStride];
3540 130324255 : pDest += 16 * dstStride;
3541 130324255 : pSrc += 16 * srcStride;
3542 : }
// Only the leftover (fewer than 16) elements remain for the loop below.
3543 2912698 : nIters = nIters % 16;
3544 : }
3545 : #else
3546 : #pragma GCC unroll 4
3547 : #endif
// Tail loop (or the whole copy in the AVX2 variant): the destination is
// indexed from pDest while the source pointer itself is advanced.
3548 5181291 : for (GPtrDiff_t i = 0; i < nIters; i++)
3549 : {
3550 2148037 : pDest[i * dstStride] = *pSrc;
3551 2148037 : pSrc += srcStride;
3552 : }
3553 3033265 : }
3554 :
3555 : template <class T, int srcStride, int dstStride>
3556 3033265 : static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
3557 : const T *CPL_RESTRICT pSrc,
3558 : GPtrDiff_t nIters)
3559 : {
3560 3033265 : GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
3561 3033265 : }
3562 :
3563 : #if defined(__AVX2__) && defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
3564 : (defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS))
3565 :
3566 : template <>
3567 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
3568 : const GByte *CPL_RESTRICT pSrc,
3569 : GPtrDiff_t nIters)
3570 : {
3571 : if (nIters > 16)
3572 : {
3573 : // The SSSE3 variant is slightly faster than what the gcc autovectorizer
3574 : // generates
3575 : GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
3576 : }
3577 : else
3578 : {
3579 : for (GPtrDiff_t i = 0; i < nIters; i++)
3580 : {
3581 : pDest[i] = *pSrc;
3582 : pSrc += 3;
3583 : }
3584 : }
3585 : }
3586 :
3587 : #elif defined(HAVE_SSE2) && !(defined(__GNUC__) && defined(__AVX2__))
3588 :
3589 : template <>
3590 354460 : void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
3591 : const GByte *CPL_RESTRICT pSrc,
3592 : GPtrDiff_t nIters)
3593 : {
3594 354460 : decltype(nIters) i = 0;
3595 354460 : if (nIters > 16)
3596 : {
3597 196203 : const __m128i xmm_mask = _mm_set1_epi16(0xff);
3598 : // If we were sure that there would always be 1 trailing byte, we could
3599 : // check against nIters - 15
3600 3012690 : for (; i < nIters - 16; i += 16)
3601 : {
3602 : __m128i xmm0 =
3603 2816480 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
3604 : __m128i xmm1 =
3605 5632970 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
3606 : // Set higher 8bit of each int16 packed word to 0
3607 2816480 : xmm0 = _mm_and_si128(xmm0, xmm_mask);
3608 2816480 : xmm1 = _mm_and_si128(xmm1, xmm_mask);
3609 : // Pack int16 to uint8 and merge back both vector
3610 2816480 : xmm0 = _mm_packus_epi16(xmm0, xmm1);
3611 :
3612 : // Store result
3613 2816480 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
3614 :
3615 2816480 : pSrc += 2 * 16;
3616 : }
3617 : }
3618 4646080 : for (; i < nIters; i++)
3619 : {
3620 4291620 : pDest[i] = *pSrc;
3621 4291620 : pSrc += 2;
3622 : }
3623 354460 : }
3624 :
// SSE2-only copy of nIters bytes from a source with a stride of 3 bytes to a
// packed destination (pDest[k] = pSrc[3 * k]).
//
// Each vector iteration loads 48 source bytes into three registers and
// gathers every third byte into one 16-byte result using byte shifts and
// single-byte masks:
//  - output bytes 0..5   come from xmm0 (source bytes 0, 3, ..., 15)
//  - output bytes 6..10  come from xmm1 (source bytes 18, 21, ..., 30)
//  - output bytes 11..15 come from xmm2 (source bytes 33, 36, ..., 45)
// Elements not handled by the vector loop are copied by the scalar loop at
// the end.
3625 1 : static void GDALUnrolledCopy_GByte_3_1_SSE2(GByte *CPL_RESTRICT pDest,
3626 : const GByte *CPL_RESTRICT pSrc,
3627 : GPtrDiff_t nIters)
3628 : {
3629 1 : decltype(nIters) i = 0;
// Mask selecting only byte 0 of a register; the other masks are obtained from
// it by byte shifts.
3630 1 : const __m128i xmm_mask_ori = _mm_set_epi32(0, 0, 0, 255);
3631 : // If we were sure that there would always be 2 trailing bytes, we could
3632 : // check against nIters - 15
3633 2 : for (; i < nIters - 16; i += 16)
3634 : {
3635 : __m128i xmm0 =
3636 1 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
3637 : __m128i xmm1 =
3638 1 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
3639 : __m128i xmm2 =
3640 1 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
3641 :
// xmm_mask0 starts at output byte 0, xmm_mask1 at output byte 6 and xmm_mask2
// at output byte 11.
3642 1 : auto xmm_mask0 = xmm_mask_ori;
3643 1 : auto xmm_mask1 = _mm_slli_si128(xmm_mask_ori, 6);
3644 1 : auto xmm_mask2 = _mm_slli_si128(xmm_mask_ori, 11);
3645 :
// Output byte 0 <- xmm0 byte 0; output byte 6 <- xmm1 byte 2.
3646 1 : auto xmm = _mm_and_si128(xmm0, xmm_mask0);
3647 1 : auto xmm_res1 = _mm_and_si128(_mm_slli_si128(xmm1, 4), xmm_mask1);
3648 :
// From here on, each step moves the masks one byte up and shifts xmm0 two
// bytes down, so that the next wanted byte of xmm0 lines up with the mask.
// Output byte 1 <- xmm0 byte 3; output byte 7 <- xmm1 byte 5.
3649 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3650 1 : xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
3651 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3652 1 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3653 2 : xmm_res1 = _mm_or_si128(
3654 : xmm_res1, _mm_and_si128(_mm_slli_si128(xmm1, 2), xmm_mask1));
3655 :
// Output byte 2 <- xmm0 byte 6; output byte 8 <- xmm1 byte 8.
3656 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3657 1 : xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
3658 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3659 2 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3660 1 : xmm_res1 = _mm_or_si128(xmm_res1, _mm_and_si128(xmm1, xmm_mask1));
3661 :
// Output byte 3 <- xmm0 byte 9; output byte 9 <- xmm1 byte 11.
3662 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3663 1 : xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
3664 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3665 1 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3666 2 : xmm_res1 = _mm_or_si128(
3667 : xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 2), xmm_mask1));
3668 :
// Output byte 4 <- xmm0 byte 12; output byte 10 <- xmm1 byte 14. The bytes
// gathered from xmm1 are then merged into the result.
3669 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3670 1 : xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
3671 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3672 1 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3673 3 : xmm_res1 = _mm_or_si128(
3674 : xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 4), xmm_mask1));
3675 1 : xmm = _mm_or_si128(xmm, xmm_res1);
3676 :
// Output byte 5 <- xmm0 byte 15.
3677 1 : xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
3678 1 : xmm0 = _mm_srli_si128(xmm0, 2);
3679 1 : xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
3680 :
// Output bytes 11..15 <- xmm2 bytes 1, 4, 7, 10 and 13: the left shift of
// xmm2 shrinks by 2 each time the mask moves one byte up.
3681 2 : xmm = _mm_or_si128(xmm,
3682 : _mm_and_si128(_mm_slli_si128(xmm2, 10), xmm_mask2));
3683 :
3684 1 : xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
3685 2 : xmm = _mm_or_si128(xmm,
3686 : _mm_and_si128(_mm_slli_si128(xmm2, 8), xmm_mask2));
3687 :
3688 1 : xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
3689 2 : xmm = _mm_or_si128(xmm,
3690 : _mm_and_si128(_mm_slli_si128(xmm2, 6), xmm_mask2));
3691 :
3692 1 : xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
3693 2 : xmm = _mm_or_si128(xmm,
3694 : _mm_and_si128(_mm_slli_si128(xmm2, 4), xmm_mask2));
3695 :
3696 1 : xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
3697 2 : xmm = _mm_or_si128(xmm,
3698 : _mm_and_si128(_mm_slli_si128(xmm2, 2), xmm_mask2));
3699 :
// Store the 16 gathered bytes and skip the 48 source bytes just consumed.
3700 1 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm);
3701 :
3702 1 : pSrc += 3 * 16;
3703 : }
// Scalar copy of the elements the vector loop did not handle.
3704 2 : for (; i < nIters; i++)
3705 : {
3706 1 : pDest[i] = *pSrc;
3707 1 : pSrc += 3;
3708 : }
3709 1 : }
3710 :
3711 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
3712 :
3713 : template <>
3714 192064 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
3715 : const GByte *CPL_RESTRICT pSrc,
3716 : GPtrDiff_t nIters)
3717 : {
3718 192064 : if (nIters > 16)
3719 : {
3720 185965 : if (CPLHaveRuntimeSSSE3())
3721 : {
3722 185964 : GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
3723 : }
3724 : else
3725 : {
3726 1 : GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
3727 : }
3728 : }
3729 : else
3730 : {
3731 20168 : for (GPtrDiff_t i = 0; i < nIters; i++)
3732 : {
3733 14069 : pDest[i] = *pSrc;
3734 14069 : pSrc += 3;
3735 : }
3736 : }
3737 192064 : }
3738 :
3739 : #else
3740 :
3741 : template <>
3742 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
3743 : const GByte *CPL_RESTRICT pSrc,
3744 : GPtrDiff_t nIters)
3745 : {
3746 : GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
3747 : }
3748 : #endif
3749 :
3750 : template <>
3751 106698 : void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
3752 : const GByte *CPL_RESTRICT pSrc,
3753 : GPtrDiff_t nIters)
3754 : {
3755 106698 : decltype(nIters) i = 0;
3756 106698 : if (nIters > 16)
3757 : {
3758 101405 : const __m128i xmm_mask = _mm_set1_epi32(0xff);
3759 : // If we were sure that there would always be 3 trailing bytes, we could
3760 : // check against nIters - 15
3761 11580500 : for (; i < nIters - 16; i += 16)
3762 : {
3763 : __m128i xmm0 =
3764 11479100 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
3765 : __m128i xmm1 =
3766 11479100 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
3767 : __m128i xmm2 =
3768 11479100 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
3769 : __m128i xmm3 =
3770 22958200 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 48));
3771 : // Set higher 24bit of each int32 packed word to 0
3772 11479100 : xmm0 = _mm_and_si128(xmm0, xmm_mask);
3773 11479100 : xmm1 = _mm_and_si128(xmm1, xmm_mask);
3774 11479100 : xmm2 = _mm_and_si128(xmm2, xmm_mask);
3775 11479100 : xmm3 = _mm_and_si128(xmm3, xmm_mask);
3776 : // Pack int32 to int16
3777 11479100 : xmm0 = _mm_packs_epi32(xmm0, xmm1);
3778 11479100 : xmm2 = _mm_packs_epi32(xmm2, xmm3);
3779 : // Pack int16 to uint8
3780 11479100 : xmm0 = _mm_packus_epi16(xmm0, xmm2);
3781 :
3782 : // Store result
3783 11479100 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
3784 :
3785 11479100 : pSrc += 4 * 16;
3786 : }
3787 : }
3788 1143150 : for (; i < nIters; i++)
3789 : {
3790 1036450 : pDest[i] = *pSrc;
3791 1036450 : pSrc += 4;
3792 : }
3793 106698 : }
3794 : #endif // HAVE_SSE2
3795 :
3796 : /************************************************************************/
3797 : /* GDALFastCopy() */
3798 : /************************************************************************/
3799 :
3800 : template <class T>
3801 39777600 : static inline void GDALFastCopy(T *CPL_RESTRICT pDest, int nDestStride,
3802 : const T *CPL_RESTRICT pSrc, int nSrcStride,
3803 : GPtrDiff_t nIters)
3804 : {
3805 39777600 : constexpr int sizeofT = static_cast<int>(sizeof(T));
3806 39777600 : if (nIters == 1)
3807 : {
3808 22297230 : *pDest = *pSrc;
3809 : }
3810 17480314 : else if (nDestStride == sizeofT)
3811 : {
3812 14373572 : if (nSrcStride == sizeofT)
3813 : {
3814 13513459 : memcpy(pDest, pSrc, nIters * sizeof(T));
3815 : }
3816 860053 : else if (nSrcStride == 2 * sizeofT)
3817 : {
3818 357675 : GDALUnrolledCopy<T, 2, 1>(pDest, pSrc, nIters);
3819 : }
3820 502378 : else if (nSrcStride == 3 * sizeofT)
3821 : {
3822 288642 : GDALUnrolledCopy<T, 3, 1>(pDest, pSrc, nIters);
3823 : }
3824 213736 : else if (nSrcStride == 4 * sizeofT)
3825 : {
3826 110680 : GDALUnrolledCopy<T, 4, 1>(pDest, pSrc, nIters);
3827 : }
3828 : else
3829 : {
3830 17219290 : while (nIters-- > 0)
3831 : {
3832 17116250 : *pDest = *pSrc;
3833 17116250 : pSrc += nSrcStride / sizeofT;
3834 17116250 : pDest++;
3835 : }
3836 : }
3837 : }
3838 3106802 : else if (nSrcStride == sizeofT)
3839 : {
3840 3093796 : if (nDestStride == 2 * sizeofT)
3841 : {
3842 150268 : GDALUnrolledCopy<T, 1, 2>(pDest, pSrc, nIters);
3843 : }
3844 2943525 : else if (nDestStride == 3 * sizeofT)
3845 : {
3846 2115801 : GDALUnrolledCopy<T, 1, 3>(pDest, pSrc, nIters);
3847 : }
3848 827733 : else if (nDestStride == 4 * sizeofT)
3849 : {
3850 663421 : GDALUnrolledCopy<T, 1, 4>(pDest, pSrc, nIters);
3851 : }
3852 : else
3853 : {
3854 17169660 : while (nIters-- > 0)
3855 : {
3856 17005410 : *pDest = *pSrc;
3857 17005410 : pSrc++;
3858 17005410 : pDest += nDestStride / sizeofT;
3859 : }
3860 : }
3861 : }
3862 : else
3863 : {
3864 1220108 : while (nIters-- > 0)
3865 : {
3866 1207102 : *pDest = *pSrc;
3867 1207102 : pSrc += nSrcStride / sizeofT;
3868 1207102 : pDest += nDestStride / sizeofT;
3869 : }
3870 : }
3871 39777600 : }
3872 :
3873 : /************************************************************************/
3874 : /* GDALFastCopyByte() */
3875 : /************************************************************************/
3876 :
3877 326250 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
3878 : int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
3879 : int nDstPixelStride, GPtrDiff_t nWordCount)
3880 : {
3881 326250 : GDALFastCopy(pDstData, nDstPixelStride, pSrcData, nSrcPixelStride,
3882 : nWordCount);
3883 326250 : }
3884 :
3885 : /************************************************************************/
3886 : /* GDALCopyWords() */
3887 : /************************************************************************/
3888 :
3889 : /**
3890 : * Copy pixel words from buffer to buffer.
3891 : *
3892 : * @see GDALCopyWords64()
3893 : */
3894 78067600 : void CPL_STDCALL GDALCopyWords(const void *CPL_RESTRICT pSrcData,
3895 : GDALDataType eSrcType, int nSrcPixelStride,
3896 : void *CPL_RESTRICT pDstData,
3897 : GDALDataType eDstType, int nDstPixelStride,
3898 : int nWordCount)
3899 : {
3900 78067600 : GDALCopyWords64(pSrcData, eSrcType, nSrcPixelStride, pDstData, eDstType,
3901 : nDstPixelStride, nWordCount);
3902 78067600 : }
3903 :
3904 : /************************************************************************/
3905 : /* GDALCopyWords64() */
3906 : /************************************************************************/
3907 :
3908 : /**
3909 : * Copy pixel words from buffer to buffer.
3910 : *
3911 : * This function is used to copy pixel word values from one memory buffer
3912 : * to another, with support for conversion between data types, and differing
3913 : * step factors. The data type conversion is done using the following
3914 : * rules:
3915 : * <ul>
3916 : * <li>Values assigned to a lower range integer type are clipped. For
3917 : * instance assigning GDT_Int16 values to a GDT_UInt8 buffer will cause values
3918 : * less than 0 to be set to 0, and values larger than 255 to be set to 255.
3919 : * </li>
3920 : * <li>
3921 : * Assignment from floating point to integer rounds to closest integer.
3922 : * +Infinity is mapped to the largest integer. -Infinity is mapped to the
3923 : * smallest integer. NaN is mapped to 0.
3924 : * </li>
3925 : * <li>
3926 : * Assignment from non-complex to complex will result in the imaginary part
3927 : * being set to zero on output.
3928 : * </li>
3929 : * <li> Assignment from complex to
3930 : * non-complex will result in the complex portion being lost and the real
3931 : * component being preserved (<i>not magnitude!</i>).
3932 : * </li>
3933 : * </ul>
3934 : *
3935 : * No assumptions are made about the source or destination words occurring
3936 : * on word boundaries. It is assumed that all values are in native machine
3937 : * byte order.
3938 : *
3939 : * @param pSrcData Pointer to source data to be converted.
3940 : * @param eSrcType the source data type (see GDALDataType enum)
3941 : * @param nSrcPixelStride Source pixel stride (i.e. distance between 2 words),
3942 : * in bytes
3943 : * @param pDstData Pointer to buffer where destination data should go
3944 : * @param eDstType the destination data type (see GDALDataType enum)
3945 : * @param nDstPixelStride Destination pixel stride (i.e. distance between 2
3946 : * words), in bytes
3947 : * @param nWordCount number of words to be copied
3948 : *
3949 : * @note
3950 : * When adding a new data type to GDAL, you must do the following to
3951 : * support it properly within the GDALCopyWords function:
3952 : * 1. Add the data type to the switch on eSrcType in GDALCopyWords.
3953 : * This should invoke the appropriate GDALCopyWordsFromT wrapper.
3954 : * 2. Add the data type to the switch on eDstType in GDALCopyWordsFromT.
3955 : * This should call the appropriate GDALCopyWordsT template.
3956 : * 3. If appropriate, overload the appropriate CopyWord template in the
3957 : * above namespace. This will ensure that any conversion issues are
3958 : * handled (cases like the float -> int32 case, where the min/max)
3959 : * values are subject to roundoff error.
3960 : */
3961 :
3962 108954000 : void CPL_STDCALL GDALCopyWords64(const void *CPL_RESTRICT pSrcData,
3963 : GDALDataType eSrcType, int nSrcPixelStride,
3964 : void *CPL_RESTRICT pDstData,
3965 : GDALDataType eDstType, int nDstPixelStride,
3966 : GPtrDiff_t nWordCount)
3967 :
3968 : {
3969 : // On platforms where alignment matters, be careful
3970 108954000 : const int nSrcDataTypeSize = GDALGetDataTypeSizeBytes(eSrcType);
3971 108954000 : const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(eDstType);
// A size of 0 is treated as an unsupported data type: report an error and
// leave the destination untouched.
3972 108954000 : if (CPL_UNLIKELY(nSrcDataTypeSize == 0 || nDstDataTypeSize == 0))
3973 : {
3974 2 : CPLError(CE_Failure, CPLE_NotSupported,
3975 : "GDALCopyWords64(): unsupported GDT_Unknown/GDT_TypeCount "
3976 : "argument");
3977 2 : return;
3978 : }
// Slow path for unaligned buffers: taken when a pointer or a stride is not a
// multiple of the size of its data type. The check is skipped when both the
// data types and the pixel strides are identical.
3979 108954000 : if (!(eSrcType == eDstType && nSrcPixelStride == nDstPixelStride) &&
3980 59163100 : ((reinterpret_cast<uintptr_t>(pSrcData) % nSrcDataTypeSize) != 0 ||
3981 59163100 : (reinterpret_cast<uintptr_t>(pDstData) % nDstDataTypeSize) != 0 ||
3982 59162700 : (nSrcPixelStride % nSrcDataTypeSize) != 0 ||
3983 59162600 : (nDstPixelStride % nDstDataTypeSize) != 0))
3984 : {
// Same type: no conversion needed, copy each word with memcpy().
3985 905 : if (eSrcType == eDstType)
3986 : {
3987 34800 : for (decltype(nWordCount) i = 0; i < nWordCount; i++)
3988 : {
3989 34000 : memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
3990 : static_cast<const GByte *>(pSrcData) +
3991 34000 : nSrcPixelStride * i,
3992 : nDstDataTypeSize);
3993 : }
3994 : }
3995 : else
3996 : {
// Different types: convert one word at a time through aligned temporary
// buffers. getAlignedPtr() returns the first address at or after ptr that is
// a multiple of align.
3997 210 : const auto getAlignedPtr = [](GByte *ptr, int align)
3998 : {
3999 : return ptr +
4000 210 : ((align - (reinterpret_cast<uintptr_t>(ptr) % align)) %
4001 210 : align);
4002 : };
4003 :
4004 : // The largest we need is for CFloat64 (16 bytes), so 32 bytes to
4005 : // be sure to get correctly aligned pointer.
4006 105 : constexpr size_t SIZEOF_CFLOAT64 = 2 * sizeof(double);
4007 : GByte abySrcBuffer[2 * SIZEOF_CFLOAT64];
4008 : GByte abyDstBuffer[2 * SIZEOF_CFLOAT64];
4009 : GByte *pabySrcBuffer =
4010 105 : getAlignedPtr(abySrcBuffer, nSrcDataTypeSize);
4011 : GByte *pabyDstBuffer =
4012 105 : getAlignedPtr(abyDstBuffer, nDstDataTypeSize);
// For each word: copy it into the aligned source buffer, convert it with a
// recursive single-word call (strides of 0, count of 1), then copy the
// converted word to its destination.
4013 3360 : for (decltype(nWordCount) i = 0; i < nWordCount; i++)
4014 : {
4015 3255 : memcpy(pabySrcBuffer,
4016 : static_cast<const GByte *>(pSrcData) +
4017 3255 : nSrcPixelStride * i,
4018 : nSrcDataTypeSize);
4019 3255 : GDALCopyWords64(pabySrcBuffer, eSrcType, 0, pabyDstBuffer,
4020 : eDstType, 0, 1);
4021 3255 : memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
4022 : pabyDstBuffer, nDstDataTypeSize);
4023 : }
4024 : }
4025 905 : return;
4026 : }
4027 :
4028 : // Deal with the case where we're replicating a single word into the
4029 : // provided buffer
4030 108953000 : if (nSrcPixelStride == 0 && nWordCount > 1)
4031 : {
4032 1050480 : GDALReplicateWord(pSrcData, eSrcType, pDstData, eDstType,
4033 : nDstPixelStride, nWordCount);
4034 1050480 : return;
4035 : }
4036 :
// Fast paths when no data type conversion is required.
4037 107902000 : if (eSrcType == eDstType)
4038 : {
// 1-byte types go through GDALFastCopy() on bytes.
4039 53817400 : if (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8)
4040 : {
4041 18000300 : GDALFastCopy(static_cast<GByte *>(pDstData), nDstPixelStride,
4042 : static_cast<const GByte *>(pSrcData), nSrcPixelStride,
4043 : nWordCount);
4044 18000300 : return;
4045 : }
4046 :
// Any 2-byte type with even strides is copied as 'short' words.
4047 35817100 : if (nSrcDataTypeSize == 2 && (nSrcPixelStride % 2) == 0 &&
4048 21451000 : (nDstPixelStride % 2) == 0)
4049 : {
4050 21451000 : GDALFastCopy(static_cast<short *>(pDstData), nDstPixelStride,
4051 : static_cast<const short *>(pSrcData), nSrcPixelStride,
4052 : nWordCount);
4053 21451000 : return;
4054 : }
4055 :
// Single word: memcpy() with a literal size (2, 4, 8 or 16 bytes), except in
// static-analysis builds where the size variable is used directly.
4056 14366100 : if (nWordCount == 1)
4057 : {
4058 : #if defined(CSA_BUILD) || defined(__COVERITY__)
4059 : // Avoid false positives...
4060 : memcpy(pDstData, pSrcData, nSrcDataTypeSize);
4061 : #else
4062 13908600 : if (nSrcDataTypeSize == 2)
4063 0 : memcpy(pDstData, pSrcData, 2);
4064 13908600 : else if (nSrcDataTypeSize == 4)
4065 13813500 : memcpy(pDstData, pSrcData, 4);
4066 95125 : else if (nSrcDataTypeSize == 8)
4067 78520 : memcpy(pDstData, pSrcData, 8);
4068 : else /* if( eSrcType == GDT_CFloat64 ) */
4069 16605 : memcpy(pDstData, pSrcData, 16);
4070 : #endif
4071 13908600 : return;
4072 : }
4073 :
4074 : // Let memcpy() handle the case where we're copying a packed buffer
4075 : // of pixels.
4076 457421 : if (nSrcPixelStride == nDstPixelStride)
4077 : {
4078 195607 : if (nSrcPixelStride == nSrcDataTypeSize)
4079 : {
4080 195539 : memcpy(pDstData, pSrcData, nWordCount * nSrcDataTypeSize);
4081 195539 : return;
4082 : }
4083 : }
// Same-type copies not handled above fall through to the general case below.
4084 : }
4085 :
4086 : // Handle the more general case -- deals with conversion of data types
4087 : // directly.
// Dispatch on the source type. The boolean argument passed to
// GDALCopyWordsFromT() is false for the non-complex source types and true for
// the GDT_C* ones, which are passed as pointers to their component type
// (short, int, GFloat16, float or double).
4088 54346900 : switch (eSrcType)
4089 : {
4090 15545100 : case GDT_UInt8:
4091 15545100 : GDALCopyWordsFromT<unsigned char>(
4092 : static_cast<const unsigned char *>(pSrcData), nSrcPixelStride,
4093 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4094 15545100 : break;
4095 1291 : case GDT_Int8:
4096 1291 : GDALCopyWordsFromT<signed char>(
4097 : static_cast<const signed char *>(pSrcData), nSrcPixelStride,
4098 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4099 1291 : break;
4100 54285 : case GDT_UInt16:
4101 54285 : GDALCopyWordsFromT<unsigned short>(
4102 : static_cast<const unsigned short *>(pSrcData), nSrcPixelStride,
4103 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4104 54285 : break;
4105 4353740 : case GDT_Int16:
4106 4353740 : GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
4107 : nSrcPixelStride, false, pDstData,
4108 : eDstType, nDstPixelStride, nWordCount);
4109 4353740 : break;
4110 7432 : case GDT_UInt32:
4111 7432 : GDALCopyWordsFromT<unsigned int>(
4112 : static_cast<const unsigned int *>(pSrcData), nSrcPixelStride,
4113 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4114 7432 : break;
4115 12255400 : case GDT_Int32:
4116 12255400 : GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
4117 : nSrcPixelStride, false, pDstData, eDstType,
4118 : nDstPixelStride, nWordCount);
4119 12255400 : break;
4120 1957 : case GDT_UInt64:
4121 1957 : GDALCopyWordsFromT<std::uint64_t>(
4122 : static_cast<const std::uint64_t *>(pSrcData), nSrcPixelStride,
4123 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4124 1957 : break;
4125 11578 : case GDT_Int64:
4126 11578 : GDALCopyWordsFromT<std::int64_t>(
4127 : static_cast<const std::int64_t *>(pSrcData), nSrcPixelStride,
4128 : false, pDstData, eDstType, nDstPixelStride, nWordCount);
4129 11578 : break;
4130 1371 : case GDT_Float16:
4131 1371 : GDALCopyWordsFromT<GFloat16>(
4132 : static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, false,
4133 : pDstData, eDstType, nDstPixelStride, nWordCount);
4134 1371 : break;
4135 657732 : case GDT_Float32:
4136 657732 : GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
4137 : nSrcPixelStride, false, pDstData,
4138 : eDstType, nDstPixelStride, nWordCount);
4139 657732 : break;
4140 20697400 : case GDT_Float64:
4141 20697400 : GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
4142 : nSrcPixelStride, false, pDstData,
4143 : eDstType, nDstPixelStride, nWordCount);
4144 20697400 : break;
4145 478485 : case GDT_CInt16:
4146 478485 : GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
4147 : nSrcPixelStride, true, pDstData, eDstType,
4148 : nDstPixelStride, nWordCount);
4149 478485 : break;
4150 868 : case GDT_CInt32:
4151 868 : GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
4152 : nSrcPixelStride, true, pDstData, eDstType,
4153 : nDstPixelStride, nWordCount);
4154 868 : break;
4155 508 : case GDT_CFloat16:
4156 508 : GDALCopyWordsFromT<GFloat16>(
4157 : static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, true,
4158 : pDstData, eDstType, nDstPixelStride, nWordCount);
4159 508 : break;
4160 2389 : case GDT_CFloat32:
4161 2389 : GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
4162 : nSrcPixelStride, true, pDstData, eDstType,
4163 : nDstPixelStride, nWordCount);
4164 2389 : break;
4165 277349 : case GDT_CFloat64:
4166 277349 : GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
4167 : nSrcPixelStride, true, pDstData,
4168 : eDstType, nDstPixelStride, nWordCount);
4169 277349 : break;
// Already rejected by the size check at the top of the function.
4170 0 : case GDT_Unknown:
4171 : case GDT_TypeCount:
4172 0 : CPLAssert(false);
4173 : }
4174 : }
4175 :
4176 : /************************************************************************/
4177 : /* GDALCopyBits() */
4178 : /************************************************************************/
4179 :
4180 : /**
4181 : * Bitwise word copying.
4182 : *
4183 : * A function for moving sets of partial bytes around. Loosely
4184 : * speaking this is a bitwise analog to GDALCopyWords().
4185 : *
4186 : * It copies nStepCount "words" where each word is nBitCount bits long.
4187 : * The nSrcStep and nDstStep are the number of bits from the start of one
4188 : * word to the next (same as nBitCount if they are packed). The nSrcOffset
4189 : * and nDstOffset are the offset into the source and destination buffers
4190 : * to start at, also measured in bits.
4191 : *
4192 : * All bit offsets are assumed to start from the high order bit in a byte
4193 : * (i.e. most significant bit first). Currently this function is not very
4194 : * optimized, but it may be improved for some common cases in the future
4195 : * as needed.
4196 : *
4197 : * @param pabySrcData the source data buffer.
4198 : * @param nSrcOffset the offset (in bits) in pabySrcData to the start of the
4199 : * first word to copy.
4200 : * @param nSrcStep the offset in bits from the start of one source word to the
4201 : * start of the next.
4202 : * @param pabyDstData the destination data buffer.
4203 : * @param nDstOffset the offset (in bits) in pabyDstData to the start of the
4204 : * first word to copy over.
4205 : * @param nDstStep the offset in bits from the start of one destination word to the
4206 : * start of the next.
4207 : * @param nBitCount the number of bits in a word to be copied.
4208 : * @param nStepCount the number of words to copy.
4209 : */
4210 :
4211 0 : void GDALCopyBits(const GByte *pabySrcData, int nSrcOffset, int nSrcStep,
4212 : GByte *pabyDstData, int nDstOffset, int nDstStep,
4213 : int nBitCount, int nStepCount)
4214 :
4215 : {
4216 0 : VALIDATE_POINTER0(pabySrcData, "GDALCopyBits");
4217 :
4218 0 : for (int iStep = 0; iStep < nStepCount; iStep++)
4219 : {
4220 0 : for (int iBit = 0; iBit < nBitCount; iBit++)
4221 : {
4222 0 : if (pabySrcData[nSrcOffset >> 3] & (0x80 >> (nSrcOffset & 7)))
4223 0 : pabyDstData[nDstOffset >> 3] |= (0x80 >> (nDstOffset & 7));
4224 : else
4225 0 : pabyDstData[nDstOffset >> 3] &= ~(0x80 >> (nDstOffset & 7));
4226 :
4227 0 : nSrcOffset++;
4228 0 : nDstOffset++;
4229 : }
4230 :
4231 0 : nSrcOffset += (nSrcStep - nBitCount);
4232 0 : nDstOffset += (nDstStep - nBitCount);
4233 : }
4234 : }
4235 :
4236 : /************************************************************************/
4237 : /* GDALBandGetBestOverviewLevel() */
4238 : /* */
4239 : /* Returns the best overview level to satisfy the query or -1 if none */
4240 : /* Also updates nXOff, nYOff, nXSize, nYSize and psExtraArg when */
4241 : /* returning a valid overview level */
4242 : /************************************************************************/
4243 :
4244 0 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand, int &nXOff, int &nYOff,
4245 : int &nXSize, int &nYSize, int nBufXSize,
4246 : int nBufYSize)
4247 : {
4248 0 : return GDALBandGetBestOverviewLevel2(poBand, nXOff, nYOff, nXSize, nYSize,
4249 0 : nBufXSize, nBufYSize, nullptr);
4250 : }
4251 :
4252 523998 : int GDALBandGetBestOverviewLevel2(GDALRasterBand *poBand, int &nXOff,
4253 : int &nYOff, int &nXSize, int &nYSize,
4254 : int nBufXSize, int nBufYSize,
4255 : GDALRasterIOExtraArg *psExtraArg)
4256 : {
4257 523998 : if (psExtraArg != nullptr && psExtraArg->nVersion > 1 &&
4258 523998 : psExtraArg->bUseOnlyThisScale)
4259 109 : return -1;
4260 : /* -------------------------------------------------------------------- */
4261 : /* Compute the desired downsampling factor. It is */
4262 : /* based on the least reduced axis, and represents the number */
4263 : /* of source pixels to one destination pixel. */
4264 : /* -------------------------------------------------------------------- */
4265 523889 : const double dfDesiredDownsamplingFactor =
4266 523889 : ((nXSize / static_cast<double>(nBufXSize)) <
4267 361551 : (nYSize / static_cast<double>(nBufYSize)) ||
4268 : nBufYSize == 1)
4269 752276 : ? nXSize / static_cast<double>(nBufXSize)
4270 133164 : : nYSize / static_cast<double>(nBufYSize);
4271 :
4272 : /* -------------------------------------------------------------------- */
4273 : /* Find the overview level that largest downsampling factor (most */
4274 : /* downsampled) that is still less than (or only a little more) */
4275 : /* downsampled than the request. */
4276 : /* -------------------------------------------------------------------- */
4277 523889 : const int nOverviewCount = poBand->GetOverviewCount();
4278 523889 : GDALRasterBand *poBestOverview = nullptr;
4279 523889 : double dfBestDownsamplingFactor = 0;
4280 523889 : int nBestOverviewLevel = -1;
4281 :
4282 : const char *pszOversampligThreshold =
4283 523889 : CPLGetConfigOption("GDAL_OVERVIEW_OVERSAMPLING_THRESHOLD", nullptr);
4284 :
4285 : // Note: keep this logic for overview selection in sync between
4286 : // gdalwarp_lib.cpp and rasterio.cpp
4287 : // Cf https://github.com/OSGeo/gdal/pull/9040#issuecomment-1898524693
4288 : const double dfOversamplingThreshold =
4289 1047770 : pszOversampligThreshold ? CPLAtof(pszOversampligThreshold)
4290 523880 : : psExtraArg && psExtraArg->eResampleAlg != GRIORA_NearestNeighbour
4291 1047760 : ? 1.0
4292 523889 : : 1.2;
4293 526585 : for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
4294 : {
4295 5612 : GDALRasterBand *poOverview = poBand->GetOverview(iOverview);
4296 11224 : if (poOverview == nullptr ||
4297 11223 : poOverview->GetXSize() > poBand->GetXSize() ||
4298 5611 : poOverview->GetYSize() > poBand->GetYSize())
4299 : {
4300 1 : continue;
4301 : }
4302 :
4303 : // Compute downsampling factor of this overview
4304 : const double dfDownsamplingFactor = std::min(
4305 5611 : poBand->GetXSize() / static_cast<double>(poOverview->GetXSize()),
4306 11222 : poBand->GetYSize() / static_cast<double>(poOverview->GetYSize()));
4307 :
4308 : // Is it nearly the requested factor and better (lower) than
4309 : // the current best factor?
4310 : // Use an epsilon because of numerical instability.
4311 5611 : constexpr double EPSILON = 1e-1;
4312 5719 : if (dfDownsamplingFactor >=
4313 5611 : dfDesiredDownsamplingFactor * dfOversamplingThreshold +
4314 5503 : EPSILON ||
4315 : dfDownsamplingFactor <= dfBestDownsamplingFactor)
4316 : {
4317 108 : continue;
4318 : }
4319 :
4320 : // Ignore AVERAGE_BIT2GRAYSCALE overviews for RasterIO purposes.
4321 5503 : const char *pszResampling = poOverview->GetMetadataItem("RESAMPLING");
4322 :
4323 5503 : if (pszResampling != nullptr &&
4324 71 : STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2"))
4325 16 : continue;
4326 :
4327 : // OK, this is our new best overview.
4328 5487 : poBestOverview = poOverview;
4329 5487 : nBestOverviewLevel = iOverview;
4330 5487 : dfBestDownsamplingFactor = dfDownsamplingFactor;
4331 :
4332 5487 : if (std::abs(dfDesiredDownsamplingFactor - dfDownsamplingFactor) <
4333 : EPSILON)
4334 : {
4335 2916 : break;
4336 : }
4337 : }
4338 :
4339 : /* -------------------------------------------------------------------- */
4340 : /* If we didn't find an overview that helps us, just return */
4341 : /* indicating failure and the full resolution image will be used. */
4342 : /* -------------------------------------------------------------------- */
4343 523889 : if (nBestOverviewLevel < 0)
4344 520900 : return -1;
4345 :
4346 : /* -------------------------------------------------------------------- */
4347 : /* Recompute the source window in terms of the selected */
4348 : /* overview. */
4349 : /* -------------------------------------------------------------------- */
4350 : const double dfXFactor =
4351 2989 : poBand->GetXSize() / static_cast<double>(poBestOverview->GetXSize());
4352 : const double dfYFactor =
4353 2989 : poBand->GetYSize() / static_cast<double>(poBestOverview->GetYSize());
4354 2989 : CPLDebug("GDAL", "Selecting overview %d x %d", poBestOverview->GetXSize(),
4355 : poBestOverview->GetYSize());
4356 :
4357 8967 : const int nOXOff = std::min(poBestOverview->GetXSize() - 1,
4358 2989 : static_cast<int>(nXOff / dfXFactor + 0.5));
4359 8967 : const int nOYOff = std::min(poBestOverview->GetYSize() - 1,
4360 2989 : static_cast<int>(nYOff / dfYFactor + 0.5));
4361 2989 : int nOXSize = std::max(1, static_cast<int>(nXSize / dfXFactor + 0.5));
4362 2989 : int nOYSize = std::max(1, static_cast<int>(nYSize / dfYFactor + 0.5));
4363 2989 : if (nOXOff + nOXSize > poBestOverview->GetXSize())
4364 0 : nOXSize = poBestOverview->GetXSize() - nOXOff;
4365 2989 : if (nOYOff + nOYSize > poBestOverview->GetYSize())
4366 2 : nOYSize = poBestOverview->GetYSize() - nOYOff;
4367 :
4368 2989 : if (psExtraArg)
4369 : {
4370 2989 : if (psExtraArg->bFloatingPointWindowValidity)
4371 : {
4372 115 : psExtraArg->dfXOff /= dfXFactor;
4373 115 : psExtraArg->dfXSize /= dfXFactor;
4374 115 : psExtraArg->dfYOff /= dfYFactor;
4375 115 : psExtraArg->dfYSize /= dfYFactor;
4376 : }
4377 2874 : else if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
4378 : {
4379 16 : psExtraArg->bFloatingPointWindowValidity = true;
4380 16 : psExtraArg->dfXOff = nXOff / dfXFactor;
4381 16 : psExtraArg->dfXSize = nXSize / dfXFactor;
4382 16 : psExtraArg->dfYOff = nYOff / dfYFactor;
4383 16 : psExtraArg->dfYSize = nYSize / dfYFactor;
4384 : }
4385 : }
4386 :
4387 2989 : nXOff = nOXOff;
4388 2989 : nYOff = nOYOff;
4389 2989 : nXSize = nOXSize;
4390 2989 : nYSize = nOYSize;
4391 :
4392 2989 : return nBestOverviewLevel;
4393 : }
4394 :
4395 : /************************************************************************/
4396 : /* OverviewRasterIO() */
4397 : /* */
4398 : /* Special work function to utilize available overviews to */
4399 : /* more efficiently satisfy downsampled requests. It will */
4400 : /* return CE_Failure if there are no appropriate overviews */
4401 : /* available but it doesn't emit any error messages. */
4402 : /************************************************************************/
4403 :
4404 : //! @cond Doxygen_Suppress
4405 2 : CPLErr GDALRasterBand::OverviewRasterIO(
4406 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4407 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4408 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
4409 :
4410 : {
4411 : GDALRasterIOExtraArg sExtraArg;
4412 2 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4413 :
4414 2 : const int nOverview = GDALBandGetBestOverviewLevel2(
4415 : this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, &sExtraArg);
4416 2 : if (nOverview < 0)
4417 1 : return CE_Failure;
4418 :
4419 : /* -------------------------------------------------------------------- */
4420 : /* Recast the call in terms of the new raster layer. */
4421 : /* -------------------------------------------------------------------- */
4422 1 : GDALRasterBand *poOverviewBand = GetOverview(nOverview);
4423 1 : if (poOverviewBand == nullptr)
4424 0 : return CE_Failure;
4425 :
4426 1 : return poOverviewBand->RasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
4427 : pData, nBufXSize, nBufYSize, eBufType,
4428 1 : nPixelSpace, nLineSpace, &sExtraArg);
4429 : }
4430 :
4431 : /************************************************************************/
4432 : /* TryOverviewRasterIO() */
4433 : /************************************************************************/
4434 :
4435 362417 : CPLErr GDALRasterBand::TryOverviewRasterIO(
4436 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4437 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4438 : GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg,
4439 : int *pbTried)
4440 : {
4441 362417 : int nXOffMod = nXOff;
4442 362417 : int nYOffMod = nYOff;
4443 362417 : int nXSizeMod = nXSize;
4444 362417 : int nYSizeMod = nYSize;
4445 : GDALRasterIOExtraArg sExtraArg;
4446 :
4447 362417 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4448 :
4449 362417 : int iOvrLevel = GDALBandGetBestOverviewLevel2(
4450 : this, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize, nBufYSize,
4451 : &sExtraArg);
4452 :
4453 362417 : if (iOvrLevel >= 0)
4454 : {
4455 50 : GDALRasterBand *poOverviewBand = GetOverview(iOvrLevel);
4456 50 : if (poOverviewBand)
4457 : {
4458 50 : *pbTried = TRUE;
4459 50 : return poOverviewBand->RasterIO(
4460 : eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData,
4461 : nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
4462 50 : &sExtraArg);
4463 : }
4464 : }
4465 :
4466 362367 : *pbTried = FALSE;
4467 362367 : return CE_None;
4468 : }
4469 :
4470 : /************************************************************************/
4471 : /* TryOverviewRasterIO() */
4472 : /************************************************************************/
4473 :
4474 158605 : CPLErr GDALDataset::TryOverviewRasterIO(
4475 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4476 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4477 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
4478 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg,
4479 : int *pbTried)
4480 : {
4481 158605 : int nXOffMod = nXOff;
4482 158605 : int nYOffMod = nYOff;
4483 158605 : int nXSizeMod = nXSize;
4484 158605 : int nYSizeMod = nYSize;
4485 : GDALRasterIOExtraArg sExtraArg;
4486 158605 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4487 :
4488 317210 : int iOvrLevel = GDALBandGetBestOverviewLevel2(
4489 158605 : papoBands[0], nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize,
4490 : nBufYSize, &sExtraArg);
4491 :
4492 158646 : if (iOvrLevel >= 0 && papoBands[0]->GetOverview(iOvrLevel) != nullptr &&
4493 41 : papoBands[0]->GetOverview(iOvrLevel)->GetDataset() != nullptr)
4494 : {
4495 41 : *pbTried = TRUE;
4496 41 : return papoBands[0]->GetOverview(iOvrLevel)->GetDataset()->RasterIO(
4497 : eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData, nBufXSize,
4498 : nBufYSize, eBufType, nBandCount, panBandMap, nPixelSpace,
4499 41 : nLineSpace, nBandSpace, &sExtraArg);
4500 : }
4501 : else
4502 : {
4503 158564 : *pbTried = FALSE;
4504 158564 : return CE_None;
4505 : }
4506 : }
4507 :
4508 : /************************************************************************/
4509 : /* GetBestOverviewLevel() */
4510 : /* */
4511 : /* Returns the best overview level to satisfy the query or -1 if none */
4512 : /* Also updates nXOff, nYOff, nXSize, nYSize when returning a valid */
4513 : /* overview level */
4514 : /************************************************************************/
4515 :
4516 4 : static int GDALDatasetGetBestOverviewLevel(GDALDataset *poDS, int &nXOff,
4517 : int &nYOff, int &nXSize, int &nYSize,
4518 : int nBufXSize, int nBufYSize,
4519 : int nBandCount,
4520 : const int *panBandMap,
4521 : GDALRasterIOExtraArg *psExtraArg)
4522 : {
4523 4 : int nOverviewCount = 0;
4524 4 : GDALRasterBand *poFirstBand = nullptr;
4525 :
4526 : /* -------------------------------------------------------------------- */
4527 : /* Check that all bands have the same number of overviews and */
4528 : /* that they have all the same size and block dimensions */
4529 : /* -------------------------------------------------------------------- */
4530 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
4531 : {
4532 8 : GDALRasterBand *poBand = poDS->GetRasterBand(panBandMap[iBand]);
4533 8 : if (poBand == nullptr)
4534 0 : return -1;
4535 8 : if (iBand == 0)
4536 : {
4537 4 : poFirstBand = poBand;
4538 4 : nOverviewCount = poBand->GetOverviewCount();
4539 : }
4540 4 : else if (nOverviewCount != poBand->GetOverviewCount())
4541 : {
4542 0 : CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
4543 : "mismatched overview count, use std method.");
4544 0 : return -1;
4545 : }
4546 : else
4547 : {
4548 4 : for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
4549 : {
4550 0 : GDALRasterBand *poOvrBand = poBand->GetOverview(iOverview);
4551 : GDALRasterBand *poOvrFirstBand =
4552 0 : poFirstBand->GetOverview(iOverview);
4553 0 : if (poOvrBand == nullptr || poOvrFirstBand == nullptr)
4554 0 : continue;
4555 :
4556 0 : if (poOvrFirstBand->GetXSize() != poOvrBand->GetXSize() ||
4557 0 : poOvrFirstBand->GetYSize() != poOvrBand->GetYSize())
4558 : {
4559 0 : CPLDebug("GDAL",
4560 : "GDALDataset::GetBestOverviewLevel() ... "
4561 : "mismatched overview sizes, use std method.");
4562 0 : return -1;
4563 : }
4564 0 : int nBlockXSizeFirst = 0;
4565 0 : int nBlockYSizeFirst = 0;
4566 0 : poOvrFirstBand->GetBlockSize(&nBlockXSizeFirst,
4567 : &nBlockYSizeFirst);
4568 :
4569 0 : int nBlockXSizeCurrent = 0;
4570 0 : int nBlockYSizeCurrent = 0;
4571 0 : poOvrBand->GetBlockSize(&nBlockXSizeCurrent,
4572 : &nBlockYSizeCurrent);
4573 :
4574 0 : if (nBlockXSizeFirst != nBlockXSizeCurrent ||
4575 0 : nBlockYSizeFirst != nBlockYSizeCurrent)
4576 : {
4577 0 : CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
4578 : "mismatched block sizes, use std method.");
4579 0 : return -1;
4580 : }
4581 : }
4582 : }
4583 : }
4584 4 : if (poFirstBand == nullptr)
4585 0 : return -1;
4586 :
4587 4 : return GDALBandGetBestOverviewLevel2(poFirstBand, nXOff, nYOff, nXSize,
4588 : nYSize, nBufXSize, nBufYSize,
4589 4 : psExtraArg);
4590 : }
4591 :
4592 : /************************************************************************/
4593 : /* BlockBasedRasterIO() */
4594 : /* */
4595 : /* This convenience function implements a dataset level */
4596 : /* RasterIO() interface based on calling down to fetch blocks, */
4597 : /* much like the GDALRasterBand::IRasterIO(), but it handles */
4598 : /* all bands at once, so that a format driver that handles a */
4599 : /* request for different bands of the same block efficiently */
4600 : /* (i.e. without re-reading interleaved data) will be served efficiently. */
4601 : /* */
4602 : /* This method is intended to be called by an overridden */
4603 : /* IRasterIO() method in the driver specific GDALDataset */
4604 : /* derived class. */
4605 : /* */
4606 : /* Default internal implementation of RasterIO() ... utilizes */
4607 : /* the Block access methods to satisfy the request. This would */
4608 : /* normally only be overridden by formats with overviews. */
4609 : /* */
4610 : /* To keep things relatively simple, this method does not */
4611 : /* currently take advantage of some special cases addressed in */
4612 : /* GDALRasterBand::IRasterIO(), so it is likely best to only */
4613 : /* call it when you know it will help. That is in cases where */
4614 : /* data is at 1:1 to the buffer, and you know the driver is */
4615 : /* implementing interleaved IO efficiently on a block by block */
4616 : /* basis. Overviews will be used when possible. */
4617 : /************************************************************************/
4618 :
// Dataset-level RasterIO implementation working block by block on all the
// requested bands at once.
//
// Outline of the code below:
//  - If the requested bands do not share one block size (or, for resampled
//    requests, one data type), the request is delegated to
//    BandBasedRasterIO().
//  - When the buffer has the same dimensions as the source window and the
//    window has integer coordinates, the window is cut into chunks that do
//    not cross block boundaries, and each chunk is requested from every band
//    through IRasterIO() before moving on to the next chunk.
//  - Writes into a window larger than the buffer, and resampled requests
//    with a non-nearest-neighbour algorithm, are delegated to
//    BandBasedRasterIO().
//  - Otherwise an overview level is selected when possible, and the buffer
//    is filled (or written from) one pixel at a time, locking the block that
//    contains the matching source pixel for every band.
//
// Returns CE_None on success, the first non-CE_None value returned by a band
// IRasterIO() call, or CE_Failure when the progress callback returns false
// or a block cannot be locked.
4619 64164 : CPLErr GDALDataset::BlockBasedRasterIO(
4620 : GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
4621 : void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
4622 : int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
4623 : GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
4624 :
4625 : {
4626 64164 : CPLAssert(nullptr != pData);
4627 :
4628 64164 : GByte **papabySrcBlock = nullptr;
4629 64164 : GDALRasterBlock *poBlock = nullptr;
4630 64164 : GDALRasterBlock **papoBlocks = nullptr;
4631 64164 : int nLBlockX = -1;
4632 64164 : int nLBlockY = -1;
4633 : int iBufYOff;
4634 : int iBufXOff;
4635 64164 : int nBlockXSize = 1;
4636 64164 : int nBlockYSize = 1;
4637 64164 : CPLErr eErr = CE_None;
4638 64164 : GDALDataType eDataType = GDT_UInt8;
4639 :
// True when no floating-point window is provided, or when it is exactly
// equal to the integer window.
4640 64164 : const bool bUseIntegerRequestCoords =
4641 64194 : (!psExtraArg->bFloatingPointWindowValidity ||
4642 30 : (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
4643 28 : nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
4644 :
4645 : /* -------------------------------------------------------------------- */
4646 : /* Ensure that all bands share a common block size and data type. */
4647 : /* -------------------------------------------------------------------- */
4648 304122 : for (int iBand = 0; iBand < nBandCount; iBand++)
4649 : {
4650 239958 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
4651 :
4652 239958 : if (iBand == 0)
4653 : {
4654 64164 : poBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
4655 64164 : eDataType = poBand->GetRasterDataType();
4656 : }
4657 : else
4658 : {
4659 175794 : int nThisBlockXSize = 0;
4660 175794 : int nThisBlockYSize = 0;
4661 175794 : poBand->GetBlockSize(&nThisBlockXSize, &nThisBlockYSize);
4662 175794 : if (nThisBlockXSize != nBlockXSize ||
4663 175794 : nThisBlockYSize != nBlockYSize)
4664 : {
4665 0 : CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
4666 : "mismatched block sizes, use std method.");
4667 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
4668 : pData, nBufXSize, nBufYSize, eBufType,
4669 : nBandCount, panBandMap, nPixelSpace,
4670 0 : nLineSpace, nBandSpace, psExtraArg);
4671 : }
4672 :
// A data type mismatch only matters when the request is resampled: the
// same-size path below goes through each band's own IRasterIO().
4673 175794 : if (eDataType != poBand->GetRasterDataType() &&
4674 0 : (nXSize != nBufXSize || nYSize != nBufYSize))
4675 : {
4676 0 : CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
4677 : "mismatched band data types, use std method.");
4678 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
4679 : pData, nBufXSize, nBufYSize, eBufType,
4680 : nBandCount, panBandMap, nPixelSpace,
4681 0 : nLineSpace, nBandSpace, psExtraArg);
4682 : }
4683 : }
4684 : }
4685 :
4686 : /* ==================================================================== */
4687 : /* In this special case at full resolution we step through in */
4688 : /* blocks, turning the request over to the per-band */
4689 : /* IRasterIO(), but ensuring that all bands of one block are */
4690 : /* called before proceeding to the next. */
4691 : /* ==================================================================== */
4692 :
4693 64164 : if (nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
4694 : {
// The per-chunk band requests receive this freshly initialized extra
// argument rather than the caller's psExtraArg.
4695 : GDALRasterIOExtraArg sDummyExtraArg;
4696 64160 : INIT_RASTERIO_EXTRA_ARG(sDummyExtraArg);
4697 :
4698 64160 : int nChunkYSize = 0;
4699 64160 : int nChunkXSize = 0;
4700 :
4701 210807 : for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff += nChunkYSize)
4702 : {
4703 147664 : const int nChunkYOff = iBufYOff + nYOff;
// The chunk stops at the end of the block row containing nChunkYOff, or at
// the bottom of the requested window, whichever comes first.
4704 147664 : nChunkYSize = nBlockYSize - (nChunkYOff % nBlockYSize);
4705 147664 : if (nChunkYOff + nChunkYSize > nYOff + nYSize)
4706 59196 : nChunkYSize = (nYOff + nYSize) - nChunkYOff;
4707 :
4708 818665 : for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff += nChunkXSize)
4709 : {
4710 672016 : const int nChunkXOff = iBufXOff + nXOff;
// Same clipping as above, along the X axis.
4711 672016 : nChunkXSize = nBlockXSize - (nChunkXOff % nBlockXSize);
4712 672016 : if (nChunkXOff + nChunkXSize > nXOff + nXSize)
4713 70395 : nChunkXSize = (nXOff + nXSize) - nChunkXOff;
4714 :
// Address in the user buffer of the top-left pixel of this chunk, for the
// first band.
4715 672016 : GByte *pabyChunkData =
4716 672016 : static_cast<GByte *>(pData) + iBufXOff * nPixelSpace +
4717 672016 : static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace;
4718 :
4719 3271840 : for (int iBand = 0; iBand < nBandCount; iBand++)
4720 : {
4721 2600840 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
4722 :
4723 5201690 : eErr = poBand->IRasterIO(
4724 : eRWFlag, nChunkXOff, nChunkYOff, nChunkXSize,
4725 : nChunkYSize,
4726 2600840 : pabyChunkData +
4727 2600840 : static_cast<GPtrDiff_t>(iBand) * nBandSpace,
4728 : nChunkXSize, nChunkYSize, eBufType, nPixelSpace,
4729 2600840 : nLineSpace, &sDummyExtraArg);
4730 2600840 : if (eErr != CE_None)
4731 1015 : return eErr;
4732 : }
4733 : }
4734 :
// Progress is reported once per row of chunks, as the fraction of buffer
// lines processed so far; a false return from the callback aborts the
// request with CE_Failure.
4735 165480 : if (psExtraArg->pfnProgress != nullptr &&
4736 18831 : !psExtraArg->pfnProgress(
4737 165480 : 1.0 * std::min(nBufYSize, iBufYOff + nChunkYSize) /
4738 : nBufYSize,
4739 : "", psExtraArg->pProgressData))
4740 : {
4741 2 : return CE_Failure;
4742 : }
4743 : }
4744 :
4745 63143 : return CE_None;
4746 : }
4747 :
4748 : /* Below code is not compatible with that case. It would need a complete */
4749 : /* separate code like done in GDALRasterBand::IRasterIO. */
4750 4 : if (eRWFlag == GF_Write && (nBufXSize < nXSize || nBufYSize < nYSize))
4751 : {
4752 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
4753 : nBufXSize, nBufYSize, eBufType, nBandCount,
4754 : panBandMap, nPixelSpace, nLineSpace,
4755 0 : nBandSpace, psExtraArg);
4756 : }
4757 :
4758 : /* We could have a smarter implementation, but that will do for now */
4759 4 : if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
4760 0 : (nBufXSize != nXSize || nBufYSize != nYSize))
4761 : {
4762 0 : return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
4763 : nBufXSize, nBufYSize, eBufType, nBandCount,
4764 : panBandMap, nPixelSpace, nLineSpace,
4765 0 : nBandSpace, psExtraArg);
4766 : }
4767 :
4768 : /* ==================================================================== */
4769 : /* Loop reading required source blocks to satisfy output */
4770 : /* request. This is the most general implementation. */
4771 : /* ==================================================================== */
4772 :
4773 4 : const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
4774 :
// Per-band bookkeeping: the block currently locked for each band
// (papoBlocks) and the pointer to its data (papabySrcBlock). Both arrays are
// released at CleanupAndReturn.
4775 : papabySrcBlock =
4776 4 : static_cast<GByte **>(CPLCalloc(sizeof(GByte *), nBandCount));
4777 : papoBlocks =
4778 4 : static_cast<GDALRasterBlock **>(CPLCalloc(sizeof(void *), nBandCount));
4779 :
4780 : /* -------------------------------------------------------------------- */
4781 : /* Select an overview level if appropriate. */
4782 : /* -------------------------------------------------------------------- */
4783 :
// The selection works on a copy of the extra arguments. When a level is
// found, nXOff/nYOff/nXSize/nYSize are passed by reference and come back
// expressed in overview pixels, and the block size of that overview level
// is used from here on.
4784 : GDALRasterIOExtraArg sExtraArg;
4785 4 : GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
4786 4 : const int nOverviewLevel = GDALDatasetGetBestOverviewLevel(
4787 : this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nBandCount,
4788 : panBandMap, &sExtraArg);
4789 4 : if (nOverviewLevel >= 0)
4790 : {
4791 2 : GetRasterBand(panBandMap[0])
4792 2 : ->GetOverview(nOverviewLevel)
4793 2 : ->GetBlockSize(&nBlockXSize, &nBlockYSize);
4794 : }
4795 :
// Prefer the floating-point window when one is available.
4796 4 : double dfXOff = nXOff;
4797 4 : double dfYOff = nYOff;
4798 4 : double dfXSize = nXSize;
4799 4 : double dfYSize = nYSize;
4800 4 : if (sExtraArg.bFloatingPointWindowValidity)
4801 : {
4802 2 : dfXOff = sExtraArg.dfXOff;
4803 2 : dfYOff = sExtraArg.dfYOff;
4804 2 : dfXSize = sExtraArg.dfXSize;
4805 2 : dfYSize = sExtraArg.dfYSize;
4806 : }
4807 :
4808 : /* -------------------------------------------------------------------- */
4809 : /* Compute stepping increment. */
4810 : /* -------------------------------------------------------------------- */
4811 4 : const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
4812 4 : const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
4813 :
4814 4 : constexpr double EPS = 1e-10;
4815 : /* -------------------------------------------------------------------- */
4816 : /* Loop over buffer computing source locations. */
4817 : /* -------------------------------------------------------------------- */
4818 36 : for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
4819 : {
4820 : GPtrDiff_t iSrcOffset;
4821 :
4822 : // Add small epsilon to avoid some numeric precision issues.
4823 32 : const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
// The source line is taken at the centre of the buffer line and clamped
// into [0, nRasterYSize - 1].
4824 32 : const int iSrcY = static_cast<int>(std::min(
4825 32 : std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
4826 :
4827 32 : GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
4828 : static_cast<GPtrDiff_t>(nLineSpace);
4829 :
4830 302 : for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff++)
4831 : {
4832 270 : const double dfSrcX = (iBufXOff + 0.5) * dfSrcXInc + dfXOff + EPS;
4833 270 : const int iSrcX = static_cast<int>(std::min(
4834 270 : std::max(0.0, dfSrcX), static_cast<double>(nRasterXSize - 1)));
4835 :
4836 : // FIXME: this code likely doesn't work if the dirty block gets
4837 : // flushed to disk before being completely written. In the meantime,
4838 : // bJustInitialize should probably be set to FALSE even if it is not
4839 : // ideal performance wise, and for lossy compression
4840 :
4841 : /* --------------------------------------------------------------------
4842 : */
4843 : /* Ensure we have the appropriate block loaded. */
4844 : /* --------------------------------------------------------------------
4845 : */
// New blocks are fetched when the source pixel lies outside block
// (nLBlockX, nLBlockY). Both indices start at -1, so the very first pixel
// always triggers a fetch.
4846 270 : if (iSrcX < nLBlockX * nBlockXSize ||
4847 270 : iSrcX - nBlockXSize >= nLBlockX * nBlockXSize ||
4848 266 : iSrcY < nLBlockY * nBlockYSize ||
4849 266 : iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
4850 : {
4851 4 : nLBlockX = iSrcX / nBlockXSize;
4852 4 : nLBlockY = iSrcY / nBlockYSize;
4853 :
// Only set for writes whose window contains the whole block.
// NOTE(review): presumably this lets GetLockedBlockRef() skip loading the
// existing block content - confirm against its implementation.
4854 4 : const bool bJustInitialize =
4855 0 : eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
4856 0 : nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
4857 4 : nXOff <= nLBlockX * nBlockXSize &&
4858 0 : nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
4859 : /*bool bMemZeroBuffer = FALSE;
4860 : if( eRWFlag == GF_Write && !bJustInitialize &&
4861 : nXOff <= nLBlockX * nBlockXSize &&
4862 : nYOff <= nLBlockY * nBlockYSize &&
4863 : (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
4864 : (nXOff + nXSize == GetRasterXSize() &&
4865 : (nLBlockX+1) * nBlockXSize > GetRasterXSize())) &&
4866 : (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
4867 : (nYOff + nYSize == GetRasterYSize() &&
4868 : (nLBlockY+1) * nBlockYSize > GetRasterYSize())) )
4869 : {
4870 : bJustInitialize = TRUE;
4871 : bMemZeroBuffer = TRUE;
4872 : }*/
4873 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
4874 : {
4875 8 : GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
4876 8 : if (nOverviewLevel >= 0)
4877 2 : poBand = poBand->GetOverview(nOverviewLevel);
4878 16 : poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY,
4879 8 : bJustInitialize);
4880 8 : if (poBlock == nullptr)
4881 : {
4882 0 : eErr = CE_Failure;
4883 0 : goto CleanupAndReturn;
4884 : }
4885 :
4886 8 : if (eRWFlag == GF_Write)
4887 0 : poBlock->MarkDirty();
4888 :
// Release the lock on the block previously held for this band before
// recording the new one.
4889 8 : if (papoBlocks[iBand] != nullptr)
4890 0 : papoBlocks[iBand]->DropLock();
4891 :
4892 8 : papoBlocks[iBand] = poBlock;
4893 :
4894 8 : papabySrcBlock[iBand] =
4895 8 : static_cast<GByte *>(poBlock->GetDataRef());
4896 : /*if( bMemZeroBuffer )
4897 : {
4898 : memset(papabySrcBlock[iBand], 0,
4899 : static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
4900 : * nBlockYSize);
4901 : }*/
4902 : }
4903 : }
4904 :
4905 : /* --------------------------------------------------------------------
4906 : */
4907 : /* Copy over this pixel of data. */
4908 : /* --------------------------------------------------------------------
4909 : */
// Byte offset of the source pixel inside the block: (column in block +
// row in block * nBlockXSize) pixels of nBandDataSize bytes each.
4910 270 : iSrcOffset = (static_cast<GPtrDiff_t>(iSrcX) -
4911 270 : static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
4912 270 : (static_cast<GPtrDiff_t>(iSrcY) -
4913 270 : static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
4914 270 : nBlockXSize) *
4915 270 : nBandDataSize;
4916 :
4917 980 : for (int iBand = 0; iBand < nBandCount; iBand++)
4918 : {
4919 710 : GByte *pabySrcBlock = papabySrcBlock[iBand];
4920 710 : GPtrDiff_t iBandBufOffset =
4921 710 : iBufOffset + static_cast<GPtrDiff_t>(iBand) *
4922 : static_cast<GPtrDiff_t>(nBandSpace);
4923 :
// Same type on both sides: plain byte copy of one pixel, in the direction
// given by eRWFlag.
4924 710 : if (eDataType == eBufType)
4925 : {
4926 710 : if (eRWFlag == GF_Read)
4927 710 : memcpy(static_cast<GByte *>(pData) + iBandBufOffset,
4928 710 : pabySrcBlock + iSrcOffset, nBandDataSize);
4929 : else
4930 0 : memcpy(pabySrcBlock + iSrcOffset,
4931 : static_cast<const GByte *>(pData) +
4932 0 : iBandBufOffset,
4933 : nBandDataSize);
4934 : }
4935 : else
4936 : {
4937 : /* type to type conversion ... ouch, this is expensive way
4938 : of handling single words */
4939 :
4940 0 : if (eRWFlag == GF_Read)
4941 0 : GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
4942 : static_cast<GByte *>(pData) +
4943 0 : iBandBufOffset,
4944 : eBufType, 0, 1);
4945 : else
4946 0 : GDALCopyWords64(static_cast<const GByte *>(pData) +
4947 0 : iBandBufOffset,
4948 0 : eBufType, 0, pabySrcBlock + iSrcOffset,
4949 : eDataType, 0, 1);
4950 : }
4951 : }
4952 :
// NOTE(review): nPixelSpace is narrowed to int before being added - looks
// like a pixel spacing that does not fit in an int would be truncated;
// confirm whether such spacings can reach this code.
4953 270 : iBufOffset += static_cast<int>(nPixelSpace);
4954 : }
4955 : }
4956 :
4957 : /* -------------------------------------------------------------------- */
4958 : /* CleanupAndReturn. */
4959 : /* -------------------------------------------------------------------- */
// Reached both on normal completion and through the goto above: drops the
// lock of every block still recorded, then frees the two per-band arrays.
4960 4 : CleanupAndReturn:
4961 4 : CPLFree(papabySrcBlock);
4962 4 : if (papoBlocks != nullptr)
4963 : {
4964 12 : for (int iBand = 0; iBand < nBandCount; iBand++)
4965 : {
4966 8 : if (papoBlocks[iBand] != nullptr)
4967 8 : papoBlocks[iBand]->DropLock();
4968 : }
4969 4 : CPLFree(papoBlocks);
4970 : }
4971 :
4972 4 : return eErr;
4973 : }
4974 :
4975 : //! @endcond
4976 :
4977 : /************************************************************************/
4978 : /* GDALCopyWholeRasterGetSwathSize() */
4979 : /************************************************************************/
4980 :
4981 3296 : static void GDALCopyWholeRasterGetSwathSize(GDALRasterBand *poSrcPrototypeBand,
4982 : GDALRasterBand *poDstPrototypeBand,
4983 : int nBandCount,
4984 : int bDstIsCompressed,
4985 : int bInterleave, int *pnSwathCols,
4986 : int *pnSwathLines)
4987 : {
4988 3296 : GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
4989 3296 : int nSrcBlockXSize = 0;
4990 3296 : int nSrcBlockYSize = 0;
4991 3296 : int nBlockXSize = 0;
4992 3296 : int nBlockYSize = 0;
4993 :
4994 3296 : int nXSize = poSrcPrototypeBand->GetXSize();
4995 3296 : int nYSize = poSrcPrototypeBand->GetYSize();
4996 :
4997 3296 : poSrcPrototypeBand->GetBlockSize(&nSrcBlockXSize, &nSrcBlockYSize);
4998 3296 : poDstPrototypeBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
4999 :
5000 3296 : const int nMaxBlockXSize = std::max(nBlockXSize, nSrcBlockXSize);
5001 3296 : const int nMaxBlockYSize = std::max(nBlockYSize, nSrcBlockYSize);
5002 :
5003 3296 : int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5004 3296 : if (bInterleave)
5005 556 : nPixelSize *= nBandCount;
5006 :
5007 : // aim for one row of blocks. Do not settle for less.
5008 3296 : int nSwathCols = nXSize;
5009 3296 : int nSwathLines = nMaxBlockYSize;
5010 :
5011 : const char *pszSrcCompression =
5012 3296 : poSrcPrototypeBand->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
5013 3296 : if (pszSrcCompression == nullptr)
5014 : {
5015 3270 : auto poSrcDS = poSrcPrototypeBand->GetDataset();
5016 3270 : if (poSrcDS)
5017 : pszSrcCompression =
5018 3264 : poSrcDS->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
5019 : }
5020 :
5021 : /* -------------------------------------------------------------------- */
5022 : /* What will our swath size be? */
5023 : /* -------------------------------------------------------------------- */
5024 : // When writing interleaved data in a compressed format, we want to be sure
5025 : // that each block will only be written once, so the swath size must not be
5026 : // greater than the block cache.
5027 3296 : const char *pszSwathSize = CPLGetConfigOption("GDAL_SWATH_SIZE", nullptr);
5028 : int nTargetSwathSize;
5029 3296 : if (pszSwathSize != nullptr)
5030 0 : nTargetSwathSize = static_cast<int>(
5031 0 : std::min(GIntBig(INT_MAX), CPLAtoGIntBig(pszSwathSize)));
5032 : else
5033 : {
5034 : // As a default, take one 1/4 of the cache size.
5035 3296 : nTargetSwathSize = static_cast<int>(
5036 3296 : std::min(GIntBig(INT_MAX), GDALGetCacheMax64() / 4));
5037 :
5038 : // but if the minimum idal swath buf size is less, then go for it to
5039 : // avoid unnecessarily abusing RAM usage.
5040 : // but try to use 10 MB at least.
5041 3296 : GIntBig nIdealSwathBufSize =
5042 3296 : static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize;
5043 3296 : int nMinTargetSwathSize = 10 * 1000 * 1000;
5044 :
5045 3296 : if ((poSrcPrototypeBand->GetSuggestedBlockAccessPattern() &
5046 3296 : GSBAP_LARGEST_CHUNK_POSSIBLE) != 0)
5047 : {
5048 1 : nMinTargetSwathSize = nTargetSwathSize;
5049 : }
5050 :
5051 3296 : if (nIdealSwathBufSize < nTargetSwathSize &&
5052 3286 : nIdealSwathBufSize < nMinTargetSwathSize)
5053 : {
5054 3283 : nIdealSwathBufSize = nMinTargetSwathSize;
5055 : }
5056 :
5057 3296 : if (pszSrcCompression != nullptr &&
5058 184 : EQUAL(pszSrcCompression, "JPEG2000") &&
5059 0 : (!bDstIsCompressed || ((nSrcBlockXSize % nBlockXSize) == 0 &&
5060 0 : (nSrcBlockYSize % nBlockYSize) == 0)))
5061 : {
5062 2 : nIdealSwathBufSize =
5063 4 : std::max(nIdealSwathBufSize, static_cast<GIntBig>(nSwathCols) *
5064 2 : nSrcBlockYSize * nPixelSize);
5065 : }
5066 3296 : if (nTargetSwathSize > nIdealSwathBufSize)
5067 3283 : nTargetSwathSize = static_cast<int>(
5068 3283 : std::min(GIntBig(INT_MAX), nIdealSwathBufSize));
5069 : }
5070 :
5071 3296 : if (nTargetSwathSize < 1000000)
5072 8 : nTargetSwathSize = 1000000;
5073 :
5074 : /* But let's check that */
5075 3517 : if (bDstIsCompressed && bInterleave &&
5076 221 : nTargetSwathSize > GDALGetCacheMax64())
5077 : {
5078 0 : CPLError(CE_Warning, CPLE_AppDefined,
5079 : "When translating into a compressed interleave format, "
5080 : "the block cache size (" CPL_FRMT_GIB ") "
5081 : "should be at least the size of the swath (%d) "
5082 : "(GDAL_SWATH_SIZE config. option)",
5083 : GDALGetCacheMax64(), nTargetSwathSize);
5084 : }
5085 :
5086 : #define IS_DIVIDER_OF(x, y) ((y) % (x) == 0)
5087 : #define ROUND_TO(x, y) (((x) / (y)) * (y))
5088 :
5089 : // if both input and output datasets are tiled, that the tile dimensions
5090 : // are "compatible", try to stick to a swath dimension that is a multiple
5091 : // of input and output block dimensions.
5092 3296 : if (nBlockXSize != nXSize && nSrcBlockXSize != nXSize &&
5093 43 : IS_DIVIDER_OF(nBlockXSize, nMaxBlockXSize) &&
5094 43 : IS_DIVIDER_OF(nSrcBlockXSize, nMaxBlockXSize) &&
5095 43 : IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
5096 43 : IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
5097 : {
5098 43 : if (static_cast<GIntBig>(nMaxBlockXSize) * nMaxBlockYSize *
5099 43 : nPixelSize <=
5100 43 : static_cast<GIntBig>(nTargetSwathSize))
5101 : {
5102 43 : nSwathCols = nTargetSwathSize / (nMaxBlockYSize * nPixelSize);
5103 43 : nSwathCols = ROUND_TO(nSwathCols, nMaxBlockXSize);
5104 43 : if (nSwathCols == 0)
5105 0 : nSwathCols = nMaxBlockXSize;
5106 43 : if (nSwathCols > nXSize)
5107 41 : nSwathCols = nXSize;
5108 43 : nSwathLines = nMaxBlockYSize;
5109 :
5110 43 : if (static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize >
5111 43 : static_cast<GIntBig>(nTargetSwathSize))
5112 : {
5113 0 : nSwathCols = nXSize;
5114 0 : nSwathLines = nBlockYSize;
5115 : }
5116 : }
5117 : }
5118 :
5119 3296 : const GIntBig nMemoryPerCol = static_cast<GIntBig>(nSwathCols) * nPixelSize;
5120 3296 : const GIntBig nSwathBufSize = nMemoryPerCol * nSwathLines;
5121 3296 : if (nSwathBufSize > static_cast<GIntBig>(nTargetSwathSize))
5122 : {
5123 1 : nSwathLines = static_cast<int>(nTargetSwathSize / nMemoryPerCol);
5124 1 : if (nSwathLines == 0)
5125 1 : nSwathLines = 1;
5126 :
5127 1 : CPLDebug(
5128 : "GDAL",
5129 : "GDALCopyWholeRasterGetSwathSize(): adjusting to %d line swath "
5130 : "since requirement (" CPL_FRMT_GIB " bytes) exceed target swath "
5131 : "size (%d bytes) (GDAL_SWATH_SIZE config. option)",
5132 1 : nSwathLines, nBlockYSize * nMemoryPerCol, nTargetSwathSize);
5133 : }
5134 : // If we are processing single scans, try to handle several at once.
5135 : // If we are handling swaths already, only grow the swath if a row
5136 : // of blocks is substantially less than our target buffer size.
5137 3295 : else if (nSwathLines == 1 ||
5138 2738 : nMemoryPerCol * nSwathLines <
5139 2738 : static_cast<GIntBig>(nTargetSwathSize) / 10)
5140 : {
5141 3267 : nSwathLines = std::min(
5142 : nYSize,
5143 3267 : std::max(1, static_cast<int>(nTargetSwathSize / nMemoryPerCol)));
5144 :
5145 : /* If possible try to align to source and target block height */
5146 3267 : if ((nSwathLines % nMaxBlockYSize) != 0 &&
5147 261 : nSwathLines > nMaxBlockYSize &&
5148 261 : IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
5149 232 : IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
5150 209 : nSwathLines = ROUND_TO(nSwathLines, nMaxBlockYSize);
5151 : }
5152 :
5153 3296 : if (pszSrcCompression != nullptr && EQUAL(pszSrcCompression, "JPEG2000") &&
5154 0 : (!bDstIsCompressed || (IS_DIVIDER_OF(nBlockXSize, nSrcBlockXSize) &&
5155 0 : IS_DIVIDER_OF(nBlockYSize, nSrcBlockYSize))))
5156 : {
5157 : // Typical use case: converting from Pleaiades that is 2048x2048 tiled.
5158 2 : if (nSwathLines < nSrcBlockYSize)
5159 : {
5160 0 : nSwathLines = nSrcBlockYSize;
5161 :
5162 : // Number of pixels that can be read/write simultaneously.
5163 0 : nSwathCols = nTargetSwathSize / (nSrcBlockXSize * nPixelSize);
5164 0 : nSwathCols = ROUND_TO(nSwathCols, nSrcBlockXSize);
5165 0 : if (nSwathCols == 0)
5166 0 : nSwathCols = nSrcBlockXSize;
5167 0 : if (nSwathCols > nXSize)
5168 0 : nSwathCols = nXSize;
5169 :
5170 0 : CPLDebug(
5171 : "GDAL",
5172 : "GDALCopyWholeRasterGetSwathSize(): because of compression and "
5173 : "too high block, "
5174 : "use partial width at one time");
5175 : }
5176 2 : else if ((nSwathLines % nSrcBlockYSize) != 0)
5177 : {
5178 : /* Round on a multiple of nSrcBlockYSize */
5179 0 : nSwathLines = ROUND_TO(nSwathLines, nSrcBlockYSize);
5180 0 : CPLDebug(
5181 : "GDAL",
5182 : "GDALCopyWholeRasterGetSwathSize(): because of compression, "
5183 : "round nSwathLines to block height : %d",
5184 : nSwathLines);
5185 : }
5186 : }
5187 3294 : else if (bDstIsCompressed)
5188 : {
5189 415 : if (nSwathLines < nBlockYSize)
5190 : {
5191 146 : nSwathLines = nBlockYSize;
5192 :
5193 : // Number of pixels that can be read/write simultaneously.
5194 146 : nSwathCols = nTargetSwathSize / (nSwathLines * nPixelSize);
5195 146 : nSwathCols = ROUND_TO(nSwathCols, nBlockXSize);
5196 146 : if (nSwathCols == 0)
5197 0 : nSwathCols = nBlockXSize;
5198 146 : if (nSwathCols > nXSize)
5199 146 : nSwathCols = nXSize;
5200 :
5201 146 : CPLDebug(
5202 : "GDAL",
5203 : "GDALCopyWholeRasterGetSwathSize(): because of compression and "
5204 : "too high block, "
5205 : "use partial width at one time");
5206 : }
5207 269 : else if ((nSwathLines % nBlockYSize) != 0)
5208 : {
5209 : // Round on a multiple of nBlockYSize.
5210 9 : nSwathLines = ROUND_TO(nSwathLines, nBlockYSize);
5211 9 : CPLDebug(
5212 : "GDAL",
5213 : "GDALCopyWholeRasterGetSwathSize(): because of compression, "
5214 : "round nSwathLines to block height : %d",
5215 : nSwathLines);
5216 : }
5217 : }
5218 :
5219 3296 : *pnSwathCols = nSwathCols;
5220 3296 : *pnSwathLines = nSwathLines;
5221 3296 : }
5222 :
5223 : /************************************************************************/
5224 : /* GDALDatasetCopyWholeRaster() */
5225 : /************************************************************************/
5226 :
5227 : /**
5228 : * \brief Copy all dataset raster data.
5229 : *
5230 : * This function copies the complete raster contents of one dataset to
5231 : * another similarly configured dataset. The source and destination
5232 : * dataset must have the same number of bands, and the same width
5233 : * and height. The bands do not have to have the same data type.
5234 : *
5235 : * This function is primarily intended to support implementation of
5236 : * driver specific CreateCopy() functions. It implements efficient copying,
5237 : * in particular "chunking" the copy in substantial blocks and, if appropriate,
5238 : * performing the transfer in a pixel interleaved fashion.
5239 : *
5240 : * Currently the only papszOptions value supported are :
5241 : * <ul>
5242 : * <li>"INTERLEAVE=PIXEL/BAND" to force pixel (resp. band) interleaved read and
5243 : * write access pattern (this does not modify the layout of the destination
5244 : * data)</li> <li>"COMPRESSED=YES" to force alignment on target dataset block
5245 : * sizes to achieve best compression.</li> <li>"SKIP_HOLES=YES" to skip chunks
5246 : * for which GDALGetDataCoverageStatus() returns GDAL_DATA_COVERAGE_STATUS_EMPTY
5247 : * (GDAL >= 2.2)</li>
5248 : * </ul>
5249 : * More options may be supported in the future.
5250 : *
5251 : * @param hSrcDS the source dataset
5252 : * @param hDstDS the destination dataset
5253 : * @param papszOptions transfer hints in "StringList" Name=Value format.
5254 : * @param pfnProgress progress reporting function.
5255 : * @param pProgressData callback data for progress function.
5256 : *
5257 : * @return CE_None on success, or CE_Failure on failure.
5258 : */
5259 :
5260 3268 : CPLErr CPL_STDCALL GDALDatasetCopyWholeRaster(GDALDatasetH hSrcDS,
5261 : GDALDatasetH hDstDS,
5262 : CSLConstList papszOptions,
5263 : GDALProgressFunc pfnProgress,
5264 : void *pProgressData)
5265 :
5266 : {
5267 3268 : VALIDATE_POINTER1(hSrcDS, "GDALDatasetCopyWholeRaster", CE_Failure);
5268 3268 : VALIDATE_POINTER1(hDstDS, "GDALDatasetCopyWholeRaster", CE_Failure);
5269 :
5270 3268 : GDALDataset *poSrcDS = GDALDataset::FromHandle(hSrcDS);
5271 3268 : GDALDataset *poDstDS = GDALDataset::FromHandle(hDstDS);
5272 :
5273 3268 : if (pfnProgress == nullptr)
5274 0 : pfnProgress = GDALDummyProgress;
5275 :
5276 : /* -------------------------------------------------------------------- */
5277 : /* Confirm the datasets match in size and band counts. */
5278 : /* -------------------------------------------------------------------- */
5279 3268 : const int nXSize = poDstDS->GetRasterXSize();
5280 3268 : const int nYSize = poDstDS->GetRasterYSize();
5281 3268 : const int nBandCount = poDstDS->GetRasterCount();
5282 :
5283 3268 : if (poSrcDS->GetRasterXSize() != nXSize ||
5284 6536 : poSrcDS->GetRasterYSize() != nYSize ||
5285 3268 : poSrcDS->GetRasterCount() != nBandCount)
5286 : {
5287 0 : CPLError(CE_Failure, CPLE_AppDefined,
5288 : "Input and output dataset sizes or band counts do not\n"
5289 : "match in GDALDatasetCopyWholeRaster()");
5290 0 : return CE_Failure;
5291 : }
5292 :
5293 : /* -------------------------------------------------------------------- */
5294 : /* Report preliminary (0) progress. */
5295 : /* -------------------------------------------------------------------- */
5296 3268 : if (!pfnProgress(0.0, nullptr, pProgressData))
5297 : {
5298 1 : CPLError(CE_Failure, CPLE_UserInterrupt,
5299 : "User terminated CreateCopy()");
5300 1 : return CE_Failure;
5301 : }
5302 :
5303 : /* -------------------------------------------------------------------- */
5304 : /* Get our prototype band, and assume the others are similarly */
5305 : /* configured. */
5306 : /* -------------------------------------------------------------------- */
5307 3267 : if (nBandCount == 0)
5308 0 : return CE_None;
5309 :
5310 3267 : GDALRasterBand *poSrcPrototypeBand = poSrcDS->GetRasterBand(1);
5311 3267 : GDALRasterBand *poDstPrototypeBand = poDstDS->GetRasterBand(1);
5312 3267 : GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
5313 :
5314 : /* -------------------------------------------------------------------- */
5315 : /* Do we want to try and do the operation in a pixel */
5316 : /* interleaved fashion? */
5317 : /* -------------------------------------------------------------------- */
5318 3267 : bool bInterleave = false;
5319 : const char *pszInterleave =
5320 3267 : poSrcDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
5321 3267 : if (pszInterleave != nullptr &&
5322 2874 : (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
5323 189 : bInterleave = true;
5324 :
5325 3267 : pszInterleave = poDstDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
5326 3267 : if (pszInterleave != nullptr &&
5327 2797 : (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
5328 503 : bInterleave = true;
5329 :
5330 3267 : pszInterleave = CSLFetchNameValue(papszOptions, "INTERLEAVE");
5331 3267 : if (pszInterleave != nullptr && EQUAL(pszInterleave, "PIXEL"))
5332 5 : bInterleave = true;
5333 3262 : else if (pszInterleave != nullptr && EQUAL(pszInterleave, "BAND"))
5334 13 : bInterleave = false;
5335 : // attributes is specific to the TileDB driver
5336 3249 : else if (pszInterleave != nullptr && EQUAL(pszInterleave, "ATTRIBUTES"))
5337 4 : bInterleave = true;
5338 3245 : else if (pszInterleave != nullptr)
5339 : {
5340 0 : CPLError(CE_Warning, CPLE_NotSupported,
5341 : "Unsupported value for option INTERLEAVE");
5342 : }
5343 :
5344 : // If the destination is compressed, we must try to write blocks just once,
5345 : // to save disk space (GTiff case for example), and to avoid data loss
5346 : // (JPEG compression for example).
5347 3267 : bool bDstIsCompressed = false;
5348 : const char *pszDstCompressed =
5349 3267 : CSLFetchNameValue(papszOptions, "COMPRESSED");
5350 3267 : if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
5351 389 : bDstIsCompressed = true;
5352 :
5353 : /* -------------------------------------------------------------------- */
5354 : /* What will our swath size be? */
5355 : /* -------------------------------------------------------------------- */
5356 :
5357 3267 : int nSwathCols = 0;
5358 3267 : int nSwathLines = 0;
5359 3267 : GDALCopyWholeRasterGetSwathSize(poSrcPrototypeBand, poDstPrototypeBand,
5360 : nBandCount, bDstIsCompressed, bInterleave,
5361 : &nSwathCols, &nSwathLines);
5362 :
5363 3267 : int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5364 3267 : if (bInterleave)
5365 556 : nPixelSize *= nBandCount;
5366 :
5367 3267 : void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
5368 3267 : if (pSwathBuf == nullptr)
5369 : {
5370 0 : return CE_Failure;
5371 : }
5372 :
5373 3267 : CPLDebug("GDAL",
5374 : "GDALDatasetCopyWholeRaster(): %d*%d swaths, bInterleave=%d",
5375 : nSwathCols, nSwathLines, static_cast<int>(bInterleave));
5376 :
5377 : // Advise the source raster that we are going to read it completely
5378 : // Note: this might already have been done by GDALCreateCopy() in the
5379 : // likely case this function is indirectly called by it
5380 3267 : poSrcDS->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nBandCount,
5381 3267 : nullptr, nullptr);
5382 :
5383 : /* ==================================================================== */
5384 : /* Band oriented (uninterleaved) case. */
5385 : /* ==================================================================== */
5386 3267 : CPLErr eErr = CE_None;
5387 : const bool bCheckHoles =
5388 3267 : CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
5389 :
5390 3267 : if (!bInterleave)
5391 : {
5392 : GDALRasterIOExtraArg sExtraArg;
5393 2711 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5394 2711 : CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress); // to make cppcheck happy
5395 :
5396 8133 : const GIntBig nTotalBlocks = static_cast<GIntBig>(nBandCount) *
5397 2711 : DIV_ROUND_UP(nYSize, nSwathLines) *
5398 2711 : DIV_ROUND_UP(nXSize, nSwathCols);
5399 2711 : GIntBig nBlocksDone = 0;
5400 :
5401 7838 : for (int iBand = 0; iBand < nBandCount && eErr == CE_None; iBand++)
5402 : {
5403 5127 : int nBand = iBand + 1;
5404 :
5405 10512 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5406 : {
5407 5385 : int nThisLines = nSwathLines;
5408 :
5409 5385 : if (iY + nThisLines > nYSize)
5410 363 : nThisLines = nYSize - iY;
5411 :
5412 10770 : for (int iX = 0; iX < nXSize && eErr == CE_None;
5413 5385 : iX += nSwathCols)
5414 : {
5415 5385 : int nThisCols = nSwathCols;
5416 :
5417 5385 : if (iX + nThisCols > nXSize)
5418 0 : nThisCols = nXSize - iX;
5419 :
5420 5385 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5421 5385 : if (bCheckHoles)
5422 : {
5423 : nStatus = poSrcDS->GetRasterBand(nBand)
5424 3722 : ->GetDataCoverageStatus(
5425 : iX, iY, nThisCols, nThisLines,
5426 : GDAL_DATA_COVERAGE_STATUS_DATA);
5427 : }
5428 5385 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5429 : {
5430 5381 : sExtraArg.pfnProgress = GDALScaledProgress;
5431 10762 : sExtraArg.pProgressData = GDALCreateScaledProgress(
5432 5381 : nBlocksDone / static_cast<double>(nTotalBlocks),
5433 5381 : (nBlocksDone + 0.5) /
5434 5381 : static_cast<double>(nTotalBlocks),
5435 : pfnProgress, pProgressData);
5436 5381 : if (sExtraArg.pProgressData == nullptr)
5437 1633 : sExtraArg.pfnProgress = nullptr;
5438 :
5439 5381 : eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
5440 : nThisLines, pSwathBuf,
5441 : nThisCols, nThisLines, eDT, 1,
5442 : &nBand, 0, 0, 0, &sExtraArg);
5443 :
5444 5381 : GDALDestroyScaledProgress(sExtraArg.pProgressData);
5445 :
5446 5381 : if (eErr == CE_None)
5447 5374 : eErr = poDstDS->RasterIO(
5448 : GF_Write, iX, iY, nThisCols, nThisLines,
5449 : pSwathBuf, nThisCols, nThisLines, eDT, 1,
5450 : &nBand, 0, 0, 0, nullptr);
5451 : }
5452 :
5453 5385 : nBlocksDone++;
5454 10728 : if (eErr == CE_None &&
5455 5343 : !pfnProgress(nBlocksDone /
5456 5343 : static_cast<double>(nTotalBlocks),
5457 : nullptr, pProgressData))
5458 : {
5459 2 : eErr = CE_Failure;
5460 2 : CPLError(CE_Failure, CPLE_UserInterrupt,
5461 : "User terminated CreateCopy()");
5462 : }
5463 : }
5464 : }
5465 : }
5466 : }
5467 :
5468 : /* ==================================================================== */
5469 : /* Pixel interleaved case. */
5470 : /* ==================================================================== */
5471 : else /* if( bInterleave ) */
5472 : {
5473 : GDALRasterIOExtraArg sExtraArg;
5474 556 : INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5475 556 : CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress); // to make cppcheck happy
5476 :
5477 556 : const GIntBig nTotalBlocks =
5478 556 : static_cast<GIntBig>(DIV_ROUND_UP(nYSize, nSwathLines)) *
5479 556 : DIV_ROUND_UP(nXSize, nSwathCols);
5480 556 : GIntBig nBlocksDone = 0;
5481 :
5482 1332 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5483 : {
5484 776 : int nThisLines = nSwathLines;
5485 :
5486 776 : if (iY + nThisLines > nYSize)
5487 195 : nThisLines = nYSize - iY;
5488 :
5489 1558 : for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
5490 : {
5491 782 : int nThisCols = nSwathCols;
5492 :
5493 782 : if (iX + nThisCols > nXSize)
5494 4 : nThisCols = nXSize - iX;
5495 :
5496 782 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5497 782 : if (bCheckHoles)
5498 : {
5499 549 : nStatus = 0;
5500 602 : for (int iBand = 0; iBand < nBandCount; iBand++)
5501 : {
5502 583 : nStatus |= poSrcDS->GetRasterBand(iBand + 1)
5503 583 : ->GetDataCoverageStatus(
5504 : iX, iY, nThisCols, nThisLines,
5505 : GDAL_DATA_COVERAGE_STATUS_DATA);
5506 583 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5507 530 : break;
5508 : }
5509 : }
5510 782 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5511 : {
5512 763 : sExtraArg.pfnProgress = GDALScaledProgress;
5513 1526 : sExtraArg.pProgressData = GDALCreateScaledProgress(
5514 763 : nBlocksDone / static_cast<double>(nTotalBlocks),
5515 763 : (nBlocksDone + 0.5) / static_cast<double>(nTotalBlocks),
5516 : pfnProgress, pProgressData);
5517 763 : if (sExtraArg.pProgressData == nullptr)
5518 348 : sExtraArg.pfnProgress = nullptr;
5519 :
5520 763 : eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
5521 : nThisLines, pSwathBuf, nThisCols,
5522 : nThisLines, eDT, nBandCount,
5523 : nullptr, 0, 0, 0, &sExtraArg);
5524 :
5525 763 : GDALDestroyScaledProgress(sExtraArg.pProgressData);
5526 :
5527 763 : if (eErr == CE_None)
5528 761 : eErr = poDstDS->RasterIO(
5529 : GF_Write, iX, iY, nThisCols, nThisLines, pSwathBuf,
5530 : nThisCols, nThisLines, eDT, nBandCount, nullptr, 0,
5531 : 0, 0, nullptr);
5532 : }
5533 :
5534 782 : nBlocksDone++;
5535 1559 : if (eErr == CE_None &&
5536 777 : !pfnProgress(nBlocksDone /
5537 777 : static_cast<double>(nTotalBlocks),
5538 : nullptr, pProgressData))
5539 : {
5540 0 : eErr = CE_Failure;
5541 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
5542 : "User terminated CreateCopy()");
5543 : }
5544 : }
5545 : }
5546 : }
5547 :
5548 : /* -------------------------------------------------------------------- */
5549 : /* Cleanup */
5550 : /* -------------------------------------------------------------------- */
5551 3267 : CPLFree(pSwathBuf);
5552 :
5553 3267 : return eErr;
5554 : }
5555 :
5556 : /************************************************************************/
5557 : /* GDALRasterBandCopyWholeRaster() */
5558 : /************************************************************************/
5559 :
5560 : /**
5561 : * \brief Copy a whole raster band
5562 : *
5563 : * This function copies the complete raster contents of one band to
5564 : * another similarly configured band. The source and destination
5565 : * bands must have the same width and height. The bands do not have
5566 : * to have the same data type.
5567 : *
5568 : * It implements efficient copying, in particular "chunking" the copy in
5569 : * substantial blocks.
5570 : *
5571 : * Currently the only papszOptions value supported are :
5572 : * <ul>
5573 : * <li>"COMPRESSED=YES" to force alignment on target dataset block sizes to
5574 : * achieve best compression.</li>
5575 : * <li>"SKIP_HOLES=YES" to skip chunks for which GDALGetDataCoverageStatus()
5576 : * returns GDAL_DATA_COVERAGE_STATUS_EMPTY (GDAL >= 2.2)</li>
5577 : * </ul>
5578 : *
5579 : * @param hSrcBand the source band
5580 : * @param hDstBand the destination band
5581 : * @param papszOptions transfer hints in "StringList" Name=Value format.
5582 : * @param pfnProgress progress reporting function.
5583 : * @param pProgressData callback data for progress function.
5584 : *
5585 : * @return CE_None on success, or CE_Failure on failure.
5586 : */
5587 :
5588 29 : CPLErr CPL_STDCALL GDALRasterBandCopyWholeRaster(
5589 : GDALRasterBandH hSrcBand, GDALRasterBandH hDstBand,
5590 : const char *const *const papszOptions, GDALProgressFunc pfnProgress,
5591 : void *pProgressData)
5592 :
5593 : {
5594 29 : VALIDATE_POINTER1(hSrcBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
5595 29 : VALIDATE_POINTER1(hDstBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
5596 :
5597 29 : GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
5598 29 : GDALRasterBand *poDstBand = GDALRasterBand::FromHandle(hDstBand);
5599 29 : CPLErr eErr = CE_None;
5600 :
5601 29 : if (pfnProgress == nullptr)
5602 2 : pfnProgress = GDALDummyProgress;
5603 :
5604 : /* -------------------------------------------------------------------- */
5605 : /* Confirm the datasets match in size and band counts. */
5606 : /* -------------------------------------------------------------------- */
5607 29 : int nXSize = poSrcBand->GetXSize();
5608 29 : int nYSize = poSrcBand->GetYSize();
5609 :
5610 29 : if (poDstBand->GetXSize() != nXSize || poDstBand->GetYSize() != nYSize)
5611 : {
5612 0 : CPLError(CE_Failure, CPLE_AppDefined,
5613 : "Input and output band sizes do not\n"
5614 : "match in GDALRasterBandCopyWholeRaster()");
5615 0 : return CE_Failure;
5616 : }
5617 :
5618 : /* -------------------------------------------------------------------- */
5619 : /* Report preliminary (0) progress. */
5620 : /* -------------------------------------------------------------------- */
5621 29 : if (!pfnProgress(0.0, nullptr, pProgressData))
5622 : {
5623 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
5624 : "User terminated CreateCopy()");
5625 0 : return CE_Failure;
5626 : }
5627 :
5628 29 : GDALDataType eDT = poDstBand->GetRasterDataType();
5629 :
5630 : // If the destination is compressed, we must try to write blocks just once,
5631 : // to save disk space (GTiff case for example), and to avoid data loss
5632 : // (JPEG compression for example).
5633 29 : bool bDstIsCompressed = false;
5634 : const char *pszDstCompressed =
5635 29 : CSLFetchNameValue(const_cast<char **>(papszOptions), "COMPRESSED");
5636 29 : if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
5637 26 : bDstIsCompressed = true;
5638 :
5639 : /* -------------------------------------------------------------------- */
5640 : /* What will our swath size be? */
5641 : /* -------------------------------------------------------------------- */
5642 :
5643 29 : int nSwathCols = 0;
5644 29 : int nSwathLines = 0;
5645 29 : GDALCopyWholeRasterGetSwathSize(poSrcBand, poDstBand, 1, bDstIsCompressed,
5646 : FALSE, &nSwathCols, &nSwathLines);
5647 :
5648 29 : const int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
5649 :
5650 29 : void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
5651 29 : if (pSwathBuf == nullptr)
5652 : {
5653 0 : return CE_Failure;
5654 : }
5655 :
5656 29 : CPLDebug("GDAL", "GDALRasterBandCopyWholeRaster(): %d*%d swaths",
5657 : nSwathCols, nSwathLines);
5658 :
5659 : const bool bCheckHoles =
5660 29 : CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
5661 :
5662 : // Advise the source raster that we are going to read it completely
5663 29 : poSrcBand->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nullptr);
5664 :
5665 : /* ==================================================================== */
5666 : /* Band oriented (uninterleaved) case. */
5667 : /* ==================================================================== */
5668 :
5669 72 : for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
5670 : {
5671 43 : int nThisLines = nSwathLines;
5672 :
5673 43 : if (iY + nThisLines > nYSize)
5674 8 : nThisLines = nYSize - iY;
5675 :
5676 86 : for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
5677 : {
5678 43 : int nThisCols = nSwathCols;
5679 :
5680 43 : if (iX + nThisCols > nXSize)
5681 0 : nThisCols = nXSize - iX;
5682 :
5683 43 : int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
5684 43 : if (bCheckHoles)
5685 : {
5686 0 : nStatus = poSrcBand->GetDataCoverageStatus(
5687 : iX, iY, nThisCols, nThisLines,
5688 : GDAL_DATA_COVERAGE_STATUS_DATA);
5689 : }
5690 43 : if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
5691 : {
5692 43 : eErr = poSrcBand->RasterIO(GF_Read, iX, iY, nThisCols,
5693 : nThisLines, pSwathBuf, nThisCols,
5694 : nThisLines, eDT, 0, 0, nullptr);
5695 :
5696 43 : if (eErr == CE_None)
5697 43 : eErr = poDstBand->RasterIO(GF_Write, iX, iY, nThisCols,
5698 : nThisLines, pSwathBuf, nThisCols,
5699 : nThisLines, eDT, 0, 0, nullptr);
5700 : }
5701 :
5702 86 : if (eErr == CE_None && !pfnProgress(double(iY + nThisLines) /
5703 43 : static_cast<double>(nYSize),
5704 : nullptr, pProgressData))
5705 : {
5706 0 : eErr = CE_Failure;
5707 0 : CPLError(CE_Failure, CPLE_UserInterrupt,
5708 : "User terminated CreateCopy()");
5709 : }
5710 : }
5711 : }
5712 :
5713 : /* -------------------------------------------------------------------- */
5714 : /* Cleanup */
5715 : /* -------------------------------------------------------------------- */
5716 29 : CPLFree(pSwathBuf);
5717 :
5718 29 : return eErr;
5719 : }
5720 :
5721 : /************************************************************************/
5722 : /* GDALCopyRasterIOExtraArg () */
5723 : /************************************************************************/
5724 :
5725 527273 : void GDALCopyRasterIOExtraArg(GDALRasterIOExtraArg *psDestArg,
5726 : GDALRasterIOExtraArg *psSrcArg)
5727 : {
5728 527273 : INIT_RASTERIO_EXTRA_ARG(*psDestArg);
5729 527273 : if (psSrcArg)
5730 : {
5731 527273 : psDestArg->eResampleAlg = psSrcArg->eResampleAlg;
5732 527273 : psDestArg->pfnProgress = psSrcArg->pfnProgress;
5733 527273 : psDestArg->pProgressData = psSrcArg->pProgressData;
5734 527273 : psDestArg->bFloatingPointWindowValidity =
5735 527273 : psSrcArg->bFloatingPointWindowValidity;
5736 527273 : if (psSrcArg->bFloatingPointWindowValidity)
5737 : {
5738 204391 : psDestArg->dfXOff = psSrcArg->dfXOff;
5739 204391 : psDestArg->dfYOff = psSrcArg->dfYOff;
5740 204391 : psDestArg->dfXSize = psSrcArg->dfXSize;
5741 204391 : psDestArg->dfYSize = psSrcArg->dfYSize;
5742 : }
5743 527273 : if (psSrcArg->nVersion >= 2)
5744 : {
5745 527273 : psDestArg->bUseOnlyThisScale = psSrcArg->bUseOnlyThisScale;
5746 : }
5747 : }
5748 527273 : }
5749 :
5750 : /************************************************************************/
5751 : /* HasOnlyNoData() */
5752 : /************************************************************************/
5753 :
// Generic nodata comparison: plain equality of the sample and the nodata
// value.
template <class T> static inline bool IsEqualToNoData(T value, T noDataValue)
{
    const bool bMatchesNoData = (value == noDataValue);
    return bMatchesNoData;
}
5758 :
5759 5509 : template <> bool IsEqualToNoData<GFloat16>(GFloat16 value, GFloat16 noDataValue)
5760 : {
5761 : using std::isnan;
5762 5509 : return isnan(noDataValue) ? isnan(value) : value == noDataValue;
5763 : }
5764 :
5765 250690 : template <> bool IsEqualToNoData<float>(float value, float noDataValue)
5766 : {
5767 250690 : return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
5768 : }
5769 :
5770 263861 : template <> bool IsEqualToNoData<double>(double value, double noDataValue)
5771 : {
5772 263861 : return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
5773 : }
5774 :
/**
 * Tell whether all the samples of a window of a buffer match the nodata
 * value, as decided by IsEqualToNoData().
 *
 * Samples are addressed as
 * pBuffer[(iY * nLineStride + iX) * nComponents + iComponent].
 *
 * @param pBuffer buffer holding the samples.
 * @param noDataValue value the samples are compared to.
 * @param nWidth number of pixels per line to examine.
 * @param nHeight number of lines to examine.
 * @param nLineStride distance, in pixels, between the start of two
 *                    consecutive lines.
 * @param nComponents number of samples per pixel.
 *
 * @return true if no examined sample differs from the nodata value, which
 *         includes the case of an empty window.
 */
template <class T>
static bool HasOnlyNoDataT(const T *pBuffer, T noDataValue, size_t nWidth,
                           size_t nHeight, size_t nLineStride,
                           size_t nComponents)
{
    // An empty window has no sample differing from nodata. Returning now also
    // prevents the (nWidth - 1) and (nHeight - 1) terms below from wrapping
    // around and indexing far outside of the buffer.
    if (nWidth == 0 || nHeight == 0)
        return true;

    // Fast test: check the 4 corners and the middle pixel.
    for (size_t iBand = 0; iBand < nComponents; iBand++)
    {
        if (!(IsEqualToNoData(pBuffer[iBand], noDataValue) &&
              IsEqualToNoData(pBuffer[(nWidth - 1) * nComponents + iBand],
                              noDataValue) &&
              IsEqualToNoData(
                  pBuffer[((nHeight - 1) / 2 * nLineStride + (nWidth - 1) / 2) *
                              nComponents +
                          iBand],
                  noDataValue) &&
              IsEqualToNoData(
                  pBuffer[(nHeight - 1) * nLineStride * nComponents + iBand],
                  noDataValue) &&
              IsEqualToNoData(
                  pBuffer[((nHeight - 1) * nLineStride + nWidth - 1) *
                              nComponents +
                          iBand],
                  noDataValue)))
        {
            return false;
        }
    }

    // Test all pixels.
    const size_t nSamplesPerLine = nWidth * nComponents;
    for (size_t iY = 0; iY < nHeight; iY++)
    {
        const T *pBufferLine = pBuffer + iY * nLineStride * nComponents;
        for (size_t iX = 0; iX < nSamplesPerLine; iX++)
        {
            if (!IsEqualToNoData(pBufferLine[iX], noDataValue))
            {
                return false;
            }
        }
    }
    return true;
}
5818 :
5819 : /************************************************************************/
5820 : /* GDALBufferHasOnlyNoData() */
5821 : /************************************************************************/
5822 :
5823 43435 : bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
5824 : size_t nWidth, size_t nHeight, size_t nLineStride,
5825 : size_t nComponents, int nBitsPerSample,
5826 : GDALBufferSampleFormat nSampleFormat)
5827 : {
5828 : // In the case where the nodata is 0, we can compare several bytes at
5829 : // once. Select the largest natural integer type for the architecture.
5830 43435 : if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
5831 : // Do not use this optimized code path for floating point numbers,
5832 : // as it can't detect negative zero.
5833 : nSampleFormat != GSF_FLOATING_POINT)
5834 : {
5835 27219 : const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
5836 27219 : const size_t nSize =
5837 27219 : static_cast<size_t>((static_cast<uint64_t>(nWidth) * nHeight *
5838 27219 : nComponents * nBitsPerSample +
5839 : 7) /
5840 : 8);
5841 : #ifdef HAVE_SSE2
5842 27219 : size_t n = nSize;
5843 : // Align to 16 bytes
5844 27282 : while ((reinterpret_cast<uintptr_t>(pabyBuffer) & 15) != 0 && n > 0)
5845 : {
5846 73 : --n;
5847 73 : if (*pabyBuffer)
5848 10 : return false;
5849 63 : pabyBuffer++;
5850 : }
5851 :
5852 27209 : const auto zero = _mm_setzero_si128();
5853 27209 : constexpr int UNROLLING = 4;
5854 2085580 : while (n >= UNROLLING * sizeof(zero))
5855 : {
5856 2070360 : const auto v0 = _mm_load_si128(reinterpret_cast<const __m128i *>(
5857 : pabyBuffer + 0 * sizeof(zero)));
5858 2070360 : const auto v1 = _mm_load_si128(reinterpret_cast<const __m128i *>(
5859 2070360 : pabyBuffer + 1 * sizeof(zero)));
5860 2070360 : const auto v2 = _mm_load_si128(reinterpret_cast<const __m128i *>(
5861 2070360 : pabyBuffer + 2 * sizeof(zero)));
5862 2070360 : const auto v3 = _mm_load_si128(reinterpret_cast<const __m128i *>(
5863 2070360 : pabyBuffer + 3 * sizeof(zero)));
5864 : const auto v =
5865 6211070 : _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
5866 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
5867 : if (!_mm_test_all_zeros(v, v))
5868 : #else
5869 4140720 : if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
5870 : #endif
5871 : {
5872 11982 : return false;
5873 : }
5874 2058380 : pabyBuffer += UNROLLING * sizeof(zero);
5875 2058380 : n -= UNROLLING * sizeof(zero);
5876 : }
5877 :
5878 233552 : while (n > 0)
5879 : {
5880 218425 : --n;
5881 218425 : if (*pabyBuffer)
5882 100 : return false;
5883 218325 : pabyBuffer++;
5884 : }
5885 : #else
5886 : #if SIZEOF_VOIDP >= 8 || defined(__x86_64__)
5887 : // We test __x86_64__ for x32 arch where SIZEOF_VOIDP == 4
5888 : typedef std::uint64_t WordType;
5889 : #else
5890 : typedef std::uint32_t WordType;
5891 : #endif
5892 :
5893 : const size_t nInitialIters =
5894 : std::min(sizeof(WordType) -
5895 : static_cast<size_t>(
5896 : reinterpret_cast<std::uintptr_t>(pabyBuffer) %
5897 : sizeof(WordType)),
5898 : nSize);
5899 : size_t i = 0;
5900 : for (; i < nInitialIters; i++)
5901 : {
5902 : if (pabyBuffer[i])
5903 : return false;
5904 : }
5905 : for (; i + sizeof(WordType) - 1 < nSize; i += sizeof(WordType))
5906 : {
5907 : if (*(reinterpret_cast<const WordType *>(pabyBuffer + i)))
5908 : return false;
5909 : }
5910 : for (; i < nSize; i++)
5911 : {
5912 : if (pabyBuffer[i])
5913 : return false;
5914 : }
5915 : #endif
5916 15127 : return true;
5917 : }
5918 :
5919 : #ifdef HAVE_SSE2
5920 16216 : else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
5921 708 : nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
5922 : {
5923 708 : const auto signMask = _mm_set1_epi32(0x7FFFFFFF);
5924 708 : const auto zero = _mm_setzero_si128();
5925 708 : const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
5926 708 : const size_t n = nWidth * nHeight * nComponents;
5927 :
5928 708 : size_t i = 0;
5929 708 : constexpr int UNROLLING = 4;
5930 708 : constexpr size_t VALUES_PER_ITER =
5931 : UNROLLING * sizeof(zero) / sizeof(float);
5932 24983 : for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
5933 : {
5934 24934 : const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
5935 : pabyBuffer + 0 * sizeof(zero)));
5936 24934 : const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
5937 24934 : pabyBuffer + 1 * sizeof(zero)));
5938 24934 : const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
5939 24934 : pabyBuffer + 2 * sizeof(zero)));
5940 24934 : const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
5941 24934 : pabyBuffer + 3 * sizeof(zero)));
5942 74802 : auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
5943 : // Clear the sign bit (makes -0.0 become +0.0)
5944 24934 : v = _mm_and_si128(v, signMask);
5945 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
5946 : if (!_mm_test_all_zeros(v, v))
5947 : #else
5948 49868 : if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
5949 : #endif
5950 : {
5951 659 : return false;
5952 : }
5953 24275 : pabyBuffer += UNROLLING * sizeof(zero);
5954 : }
5955 :
5956 304 : for (; i < n; i++)
5957 : {
5958 : uint32_t bits;
5959 272 : memcpy(&bits, pabyBuffer, sizeof(bits));
5960 272 : pabyBuffer += sizeof(bits);
5961 272 : if ((bits & 0x7FFFFFFF) != 0)
5962 17 : return false;
5963 : }
5964 :
5965 32 : return true;
5966 : }
5967 :
5968 15508 : else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
5969 3841 : nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
5970 : {
5971 3841 : const auto signMask = _mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL);
5972 3841 : const auto zero = _mm_setzero_si128();
5973 3841 : const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
5974 3841 : const size_t n = nWidth * nHeight * nComponents;
5975 :
5976 3841 : size_t i = 0;
5977 3841 : constexpr int UNROLLING = 4;
5978 3841 : constexpr size_t VALUES_PER_ITER =
5979 : UNROLLING * sizeof(zero) / sizeof(double);
5980 1664320 : for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
5981 : {
5982 1660710 : const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
5983 : pabyBuffer + 0 * sizeof(zero)));
5984 1660710 : const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
5985 1660710 : pabyBuffer + 1 * sizeof(zero)));
5986 1660710 : const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
5987 1660710 : pabyBuffer + 2 * sizeof(zero)));
5988 1660710 : const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
5989 1660710 : pabyBuffer + 3 * sizeof(zero)));
5990 4982130 : auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
5991 : // Clear the sign bit (makes -0.0 become +0.0)
5992 1660710 : v = _mm_and_si128(v, signMask);
5993 : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
5994 : if (!_mm_test_all_zeros(v, v))
5995 : #else
5996 3321420 : if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
5997 : #endif
5998 : {
5999 227 : return false;
6000 : }
6001 1660480 : pabyBuffer += UNROLLING * sizeof(zero);
6002 : }
6003 :
6004 3633 : for (; i < n; i++)
6005 : {
6006 : uint64_t bits;
6007 26 : memcpy(&bits, pabyBuffer, sizeof(bits));
6008 26 : pabyBuffer += sizeof(bits);
6009 26 : if ((bits & 0x7FFFFFFFFFFFFFFFULL) != 0)
6010 7 : return false;
6011 : }
6012 :
6013 3607 : return true;
6014 : }
6015 : #endif
6016 :
6017 11667 : if (nBitsPerSample == 8 && nSampleFormat == GSF_UNSIGNED_INT)
6018 : {
6019 22274 : return GDALIsValueInRange<uint8_t>(dfNoDataValue) &&
6020 11137 : HasOnlyNoDataT(static_cast<const uint8_t *>(pBuffer),
6021 11137 : static_cast<uint8_t>(dfNoDataValue), nWidth,
6022 11137 : nHeight, nLineStride, nComponents);
6023 : }
6024 530 : if (nBitsPerSample == 8 && nSampleFormat == GSF_SIGNED_INT)
6025 : {
6026 : // Use unsigned implementation by converting the nodatavalue to
6027 : // unsigned
6028 63 : return GDALIsValueInRange<int8_t>(dfNoDataValue) &&
6029 31 : HasOnlyNoDataT(
6030 : static_cast<const uint8_t *>(pBuffer),
6031 31 : static_cast<uint8_t>(static_cast<int8_t>(dfNoDataValue)),
6032 32 : nWidth, nHeight, nLineStride, nComponents);
6033 : }
6034 498 : if (nBitsPerSample == 16 && nSampleFormat == GSF_UNSIGNED_INT)
6035 : {
6036 23 : return GDALIsValueInRange<uint16_t>(dfNoDataValue) &&
6037 11 : HasOnlyNoDataT(static_cast<const uint16_t *>(pBuffer),
6038 11 : static_cast<uint16_t>(dfNoDataValue), nWidth,
6039 12 : nHeight, nLineStride, nComponents);
6040 : }
6041 486 : if (nBitsPerSample == 16 && nSampleFormat == GSF_SIGNED_INT)
6042 : {
6043 : // Use unsigned implementation by converting the nodatavalue to
6044 : // unsigned
6045 99 : return GDALIsValueInRange<int16_t>(dfNoDataValue) &&
6046 49 : HasOnlyNoDataT(
6047 : static_cast<const uint16_t *>(pBuffer),
6048 49 : static_cast<uint16_t>(static_cast<int16_t>(dfNoDataValue)),
6049 50 : nWidth, nHeight, nLineStride, nComponents);
6050 : }
6051 436 : if (nBitsPerSample == 32 && nSampleFormat == GSF_UNSIGNED_INT)
6052 : {
6053 73 : return GDALIsValueInRange<uint32_t>(dfNoDataValue) &&
6054 36 : HasOnlyNoDataT(static_cast<const uint32_t *>(pBuffer),
6055 : static_cast<uint32_t>(dfNoDataValue), nWidth,
6056 37 : nHeight, nLineStride, nComponents);
6057 : }
6058 399 : if (nBitsPerSample == 32 && nSampleFormat == GSF_SIGNED_INT)
6059 : {
6060 : // Use unsigned implementation by converting the nodatavalue to
6061 : // unsigned
6062 23 : return GDALIsValueInRange<int32_t>(dfNoDataValue) &&
6063 11 : HasOnlyNoDataT(
6064 : static_cast<const uint32_t *>(pBuffer),
6065 11 : static_cast<uint32_t>(static_cast<int32_t>(dfNoDataValue)),
6066 12 : nWidth, nHeight, nLineStride, nComponents);
6067 : }
6068 387 : if (nBitsPerSample == 64 && nSampleFormat == GSF_UNSIGNED_INT)
6069 : {
6070 56 : return GDALIsValueInRange<uint64_t>(dfNoDataValue) &&
6071 28 : HasOnlyNoDataT(static_cast<const uint64_t *>(pBuffer),
6072 : static_cast<uint64_t>(dfNoDataValue), nWidth,
6073 28 : nHeight, nLineStride, nComponents);
6074 : }
6075 359 : if (nBitsPerSample == 64 && nSampleFormat == GSF_SIGNED_INT)
6076 : {
6077 : // Use unsigned implementation by converting the nodatavalue to
6078 : // unsigned
6079 0 : return GDALIsValueInRange<int64_t>(dfNoDataValue) &&
6080 0 : HasOnlyNoDataT(
6081 : static_cast<const uint64_t *>(pBuffer),
6082 0 : static_cast<uint64_t>(static_cast<int64_t>(dfNoDataValue)),
6083 0 : nWidth, nHeight, nLineStride, nComponents);
6084 : }
6085 359 : if (nBitsPerSample == 16 && nSampleFormat == GSF_FLOATING_POINT)
6086 : {
6087 106 : return (std::isnan(dfNoDataValue) ||
6088 211 : GDALIsValueInRange<GFloat16>(dfNoDataValue)) &&
6089 105 : HasOnlyNoDataT(static_cast<const GFloat16 *>(pBuffer),
6090 : static_cast<GFloat16>(dfNoDataValue), nWidth,
6091 106 : nHeight, nLineStride, nComponents);
6092 : }
6093 253 : if (nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
6094 : {
6095 153 : return (std::isnan(dfNoDataValue) ||
6096 305 : GDALIsValueInRange<float>(dfNoDataValue)) &&
6097 152 : HasOnlyNoDataT(static_cast<const float *>(pBuffer),
6098 : static_cast<float>(dfNoDataValue), nWidth,
6099 153 : nHeight, nLineStride, nComponents);
6100 : }
6101 100 : if (nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
6102 : {
6103 100 : return HasOnlyNoDataT(static_cast<const double *>(pBuffer),
6104 : dfNoDataValue, nWidth, nHeight, nLineStride,
6105 100 : nComponents);
6106 : }
6107 0 : return false;
6108 : }
6109 :
6110 : #ifdef HAVE_SSE2
6111 :
6112 : /************************************************************************/
6113 : /* GDALDeinterleave3Byte() */
6114 : /************************************************************************/
6115 :
6116 : #if defined(__GNUC__) && !defined(__clang__)
6117 : __attribute__((optimize("no-tree-vectorize")))
6118 : #endif
6119 : static void
6120 361353 : GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
6121 : GByte *CPL_RESTRICT pabyDest0,
6122 : GByte *CPL_RESTRICT pabyDest1,
6123 : GByte *CPL_RESTRICT pabyDest2, size_t nIters)
6124 : #ifdef USE_NEON_OPTIMIZATIONS
6125 : {
6126 : return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6127 : nIters);
6128 : }
6129 : #else
6130 : {
6131 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
6132 361353 : if (CPLHaveRuntimeSSSE3())
6133 : {
6134 361351 : return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
6135 361351 : pabyDest2, nIters);
6136 : }
6137 : #endif
6138 :
6139 2 : size_t i = 0;
6140 2 : if (((reinterpret_cast<uintptr_t>(pabySrc) |
6141 2 : reinterpret_cast<uintptr_t>(pabyDest0) |
6142 2 : reinterpret_cast<uintptr_t>(pabyDest1) |
6143 2 : reinterpret_cast<uintptr_t>(pabyDest2)) %
6144 : sizeof(unsigned int)) == 0)
6145 : {
6146 : // Slightly better than GCC autovectorizer
6147 17 : for (size_t j = 0; i + 3 < nIters; i += 4, ++j)
6148 : {
6149 15 : unsigned int word0 =
6150 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i);
6151 15 : unsigned int word1 =
6152 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 4);
6153 15 : unsigned int word2 =
6154 15 : *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 8);
6155 15 : reinterpret_cast<unsigned int *>(pabyDest0)[j] =
6156 15 : (word0 & 0xff) | ((word0 >> 24) << 8) | (word1 & 0x00ff0000) |
6157 15 : ((word2 >> 8) << 24);
6158 15 : reinterpret_cast<unsigned int *>(pabyDest1)[j] =
6159 15 : ((word0 >> 8) & 0xff) | ((word1 & 0xff) << 8) |
6160 15 : (((word1 >> 24)) << 16) | ((word2 >> 16) << 24);
6161 15 : pabyDest2[j * 4] = static_cast<GByte>(word0 >> 16);
6162 15 : pabyDest2[j * 4 + 1] = static_cast<GByte>(word1 >> 8);
6163 15 : pabyDest2[j * 4 + 2] = static_cast<GByte>(word2);
6164 15 : pabyDest2[j * 4 + 3] = static_cast<GByte>(word2 >> 24);
6165 : }
6166 : }
6167 : #if defined(__clang__)
6168 : #pragma clang loop vectorize(disable)
6169 : #endif
6170 3 : for (; i < nIters; ++i)
6171 : {
6172 1 : pabyDest0[i] = pabySrc[3 * i + 0];
6173 1 : pabyDest1[i] = pabySrc[3 * i + 1];
6174 1 : pabyDest2[i] = pabySrc[3 * i + 2];
6175 : }
6176 : }
6177 : #endif
6178 :
6179 : /************************************************************************/
6180 : /* GDALDeinterleave4Byte() */
6181 : /************************************************************************/
6182 :
6183 : #if !defined(__GNUC__) || defined(__clang__)
6184 :
6185 : /************************************************************************/
6186 : /* deinterleave() */
6187 : /************************************************************************/
6188 :
/** Extracts one byte from each of the 16 32-bit lanes held in the four input
 * registers and returns them packed into a single register.
 *
 * @tparam SHIFT If true, the four input registers are first shifted right by
 *               8 bits per 32-bit lane, IN PLACE: the caller's registers are
 *               modified, so successive calls walk through the bytes of
 *               each lane.
 * @tparam MASK  If true, only the lowest byte of each (possibly shifted)
 *               lane is kept before packing. Can be false when the upper
 *               24 bits of each lane are already zero.
 * @return The low bytes of the 16 lanes, in lane order.
 */
template <bool SHIFT, bool MASK>
inline __m128i deinterleave(__m128i &xmm0_ori, __m128i &xmm1_ori,
                            __m128i &xmm2_ori, __m128i &xmm3_ori)
{
    if constexpr (SHIFT)
    {
        // Bring the next byte of each int32 lane into the lowest position.
        xmm0_ori = _mm_srli_epi32(xmm0_ori, 8);
        xmm1_ori = _mm_srli_epi32(xmm1_ori, 8);
        xmm2_ori = _mm_srli_epi32(xmm2_ori, 8);
        xmm3_ori = _mm_srli_epi32(xmm3_ori, 8);
    }

    // Work on copies so that masking does not alter the caller's registers.
    __m128i lanes0 = xmm0_ori;
    __m128i lanes1 = xmm1_ori;
    __m128i lanes2 = xmm2_ori;
    __m128i lanes3 = xmm3_ori;
    if constexpr (MASK)
    {
        // Set higher 24bit of each int32 packed word to 0
        const __m128i lowByteMask = _mm_set1_epi32(0xff);
        lanes0 = _mm_and_si128(lanes0, lowByteMask);
        lanes1 = _mm_and_si128(lanes1, lowByteMask);
        lanes2 = _mm_and_si128(lanes2, lowByteMask);
        lanes3 = _mm_and_si128(lanes3, lowByteMask);
    }

    // Pack int32 to int16
    const __m128i packedLow = _mm_packs_epi32(lanes0, lanes1);
    const __m128i packedHigh = _mm_packs_epi32(lanes2, lanes3);
    // Pack int16 to uint8
    return _mm_packus_epi16(packedLow, packedHigh);
}
6227 :
6228 : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
6229 : GByte *CPL_RESTRICT pabyDest0,
6230 : GByte *CPL_RESTRICT pabyDest1,
6231 : GByte *CPL_RESTRICT pabyDest2,
6232 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
6233 : #ifdef USE_NEON_OPTIMIZATIONS
6234 : {
6235 : return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6236 : pabyDest3, nIters);
6237 : }
6238 : #else
6239 : {
6240 : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
6241 : if (CPLHaveRuntimeSSSE3())
6242 : {
6243 : return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
6244 : pabyDest2, pabyDest3, nIters);
6245 : }
6246 : #endif
6247 :
6248 : // Not the optimal SSE2-only code, as gcc auto-vectorizer manages to
6249 : // do something slightly better.
6250 : size_t i = 0;
6251 : for (; i + 15 < nIters; i += 16)
6252 : {
6253 : __m128i xmm0_ori = _mm_loadu_si128(
6254 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
6255 : __m128i xmm1_ori = _mm_loadu_si128(
6256 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
6257 : __m128i xmm2_ori = _mm_loadu_si128(
6258 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
6259 : __m128i xmm3_ori = _mm_loadu_si128(
6260 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
6261 :
6262 : _mm_storeu_si128(
6263 : reinterpret_cast<__m128i *>(pabyDest0 + i),
6264 : deinterleave<false, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
6265 : _mm_storeu_si128(
6266 : reinterpret_cast<__m128i *>(pabyDest1 + i),
6267 : deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
6268 : _mm_storeu_si128(
6269 : reinterpret_cast<__m128i *>(pabyDest2 + i),
6270 : deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
6271 : _mm_storeu_si128(
6272 : reinterpret_cast<__m128i *>(pabyDest3 + i),
6273 : deinterleave<true, false>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
6274 : }
6275 :
6276 : #if defined(__clang__)
6277 : #pragma clang loop vectorize(disable)
6278 : #endif
6279 : for (; i < nIters; ++i)
6280 : {
6281 : pabyDest0[i] = pabySrc[4 * i + 0];
6282 : pabyDest1[i] = pabySrc[4 * i + 1];
6283 : pabyDest2[i] = pabySrc[4 * i + 2];
6284 : pabyDest3[i] = pabySrc[4 * i + 3];
6285 : }
6286 : }
6287 : #endif
6288 : #else
6289 : // GCC autovectorizer does an excellent job
6290 62363 : __attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
6291 : const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
6292 : GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2,
6293 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
6294 : {
6295 537151000 : for (size_t i = 0; i < nIters; ++i)
6296 : {
6297 537089000 : pabyDest0[i] = pabySrc[4 * i + 0];
6298 537089000 : pabyDest1[i] = pabySrc[4 * i + 1];
6299 537089000 : pabyDest2[i] = pabySrc[4 * i + 2];
6300 537089000 : pabyDest3[i] = pabySrc[4 * i + 3];
6301 : }
6302 62363 : }
6303 : #endif
6304 :
6305 : #else
6306 :
6307 : /************************************************************************/
6308 : /* GDALDeinterleave3Byte() */
6309 : /************************************************************************/
6310 :
6311 : // TODO: Enabling below could help on non-Intel architectures where GCC knows
6312 : // how to auto-vectorize
6313 : // #if defined(__GNUC__)
6314 : //__attribute__((optimize("tree-vectorize")))
6315 : // #endif
6316 : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
6317 : GByte *CPL_RESTRICT pabyDest0,
6318 : GByte *CPL_RESTRICT pabyDest1,
6319 : GByte *CPL_RESTRICT pabyDest2, size_t nIters)
6320 : {
6321 : for (size_t i = 0; i < nIters; ++i)
6322 : {
6323 : pabyDest0[i] = pabySrc[3 * i + 0];
6324 : pabyDest1[i] = pabySrc[3 * i + 1];
6325 : pabyDest2[i] = pabySrc[3 * i + 2];
6326 : }
6327 : }
6328 :
6329 : /************************************************************************/
6330 : /* GDALDeinterleave4Byte() */
6331 : /************************************************************************/
6332 :
6333 : // TODO: Enabling below could help on non-Intel architectures where gcc knows
6334 : // how to auto-vectorize
6335 : // #if defined(__GNUC__)
6336 : //__attribute__((optimize("tree-vectorize")))
6337 : // #endif
6338 : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
6339 : GByte *CPL_RESTRICT pabyDest0,
6340 : GByte *CPL_RESTRICT pabyDest1,
6341 : GByte *CPL_RESTRICT pabyDest2,
6342 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
6343 : {
6344 : for (size_t i = 0; i < nIters; ++i)
6345 : {
6346 : pabyDest0[i] = pabySrc[4 * i + 0];
6347 : pabyDest1[i] = pabySrc[4 * i + 1];
6348 : pabyDest2[i] = pabySrc[4 * i + 2];
6349 : pabyDest3[i] = pabySrc[4 * i + 3];
6350 : }
6351 : }
6352 :
6353 : #endif
6354 :
6355 : /************************************************************************/
6356 : /* GDALDeinterleave() */
6357 : /************************************************************************/
6358 :
6359 : /*! Copy values from a pixel-interleave buffer to multiple per-component
6360 : buffers.
6361 :
6362 : In pseudo-code
6363 : \verbatim
6364 : for(size_t i = 0; i < nIters; ++i)
6365 : for(int iComp = 0; iComp < nComponents; iComp++ )
6366 : ppDestBuffer[iComp][i] = pSourceBuffer[nComponents * i + iComp]
6367 : \endverbatim
6368 :
6369 : The implementation is optimized for a few cases, like de-interleaving
6370 : of 3 or 4-components Byte buffers.
6371 :
6372 : \since GDAL 3.6
6373 : */
6374 424066 : void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
6375 : int nComponents, void **ppDestBuffer,
6376 : GDALDataType eDestDT, size_t nIters)
6377 : {
6378 424066 : if (eSourceDT == eDestDT)
6379 : {
6380 424044 : if (eSourceDT == GDT_UInt8 || eSourceDT == GDT_Int8)
6381 : {
6382 423723 : if (nComponents == 3)
6383 : {
6384 361353 : const GByte *CPL_RESTRICT pabySrc =
6385 : static_cast<const GByte *>(pSourceBuffer);
6386 361353 : GByte *CPL_RESTRICT pabyDest0 =
6387 : static_cast<GByte *>(ppDestBuffer[0]);
6388 361353 : GByte *CPL_RESTRICT pabyDest1 =
6389 : static_cast<GByte *>(ppDestBuffer[1]);
6390 361353 : GByte *CPL_RESTRICT pabyDest2 =
6391 : static_cast<GByte *>(ppDestBuffer[2]);
6392 361353 : GDALDeinterleave3Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6393 : nIters);
6394 361353 : return;
6395 : }
6396 62370 : else if (nComponents == 4)
6397 : {
6398 62363 : const GByte *CPL_RESTRICT pabySrc =
6399 : static_cast<const GByte *>(pSourceBuffer);
6400 62363 : GByte *CPL_RESTRICT pabyDest0 =
6401 : static_cast<GByte *>(ppDestBuffer[0]);
6402 62363 : GByte *CPL_RESTRICT pabyDest1 =
6403 : static_cast<GByte *>(ppDestBuffer[1]);
6404 62363 : GByte *CPL_RESTRICT pabyDest2 =
6405 : static_cast<GByte *>(ppDestBuffer[2]);
6406 62363 : GByte *CPL_RESTRICT pabyDest3 =
6407 : static_cast<GByte *>(ppDestBuffer[3]);
6408 62363 : GDALDeinterleave4Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
6409 : pabyDest3, nIters);
6410 62363 : return;
6411 7 : }
6412 : }
6413 : #if ((defined(__GNUC__) && !defined(__clang__)) || \
6414 : defined(__INTEL_CLANG_COMPILER)) && \
6415 : defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME)
6416 642 : else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&
6417 321 : CPLHaveRuntimeSSSE3())
6418 : {
6419 321 : if (nComponents == 3)
6420 : {
6421 126 : const GUInt16 *CPL_RESTRICT panSrc =
6422 : static_cast<const GUInt16 *>(pSourceBuffer);
6423 126 : GUInt16 *CPL_RESTRICT panDest0 =
6424 : static_cast<GUInt16 *>(ppDestBuffer[0]);
6425 126 : GUInt16 *CPL_RESTRICT panDest1 =
6426 : static_cast<GUInt16 *>(ppDestBuffer[1]);
6427 126 : GUInt16 *CPL_RESTRICT panDest2 =
6428 : static_cast<GUInt16 *>(ppDestBuffer[2]);
6429 126 : GDALDeinterleave3UInt16_SSSE3(panSrc, panDest0, panDest1,
6430 : panDest2, nIters);
6431 126 : return;
6432 : }
6433 : #if !defined(__INTEL_CLANG_COMPILER)
6434 : // ICC autovectorizer doesn't do a good job, at least with icx
6435 : // 2022.1.0.20220316
6436 195 : else if (nComponents == 4)
6437 : {
6438 195 : const GUInt16 *CPL_RESTRICT panSrc =
6439 : static_cast<const GUInt16 *>(pSourceBuffer);
6440 195 : GUInt16 *CPL_RESTRICT panDest0 =
6441 : static_cast<GUInt16 *>(ppDestBuffer[0]);
6442 195 : GUInt16 *CPL_RESTRICT panDest1 =
6443 : static_cast<GUInt16 *>(ppDestBuffer[1]);
6444 195 : GUInt16 *CPL_RESTRICT panDest2 =
6445 : static_cast<GUInt16 *>(ppDestBuffer[2]);
6446 195 : GUInt16 *CPL_RESTRICT panDest3 =
6447 : static_cast<GUInt16 *>(ppDestBuffer[3]);
6448 195 : GDALDeinterleave4UInt16_SSSE3(panSrc, panDest0, panDest1,
6449 : panDest2, panDest3, nIters);
6450 195 : return;
6451 : }
6452 : #endif
6453 : }
6454 : #endif
6455 : }
6456 :
6457 29 : const int nSourceDTSize = GDALGetDataTypeSizeBytes(eSourceDT);
6458 29 : const int nDestDTSize = GDALGetDataTypeSizeBytes(eDestDT);
6459 108 : for (int iComp = 0; iComp < nComponents; iComp++)
6460 : {
6461 79 : GDALCopyWords64(static_cast<const GByte *>(pSourceBuffer) +
6462 79 : iComp * nSourceDTSize,
6463 : eSourceDT, nComponents * nSourceDTSize,
6464 79 : ppDestBuffer[iComp], eDestDT, nDestDTSize, nIters);
6465 : }
6466 : }
6467 :
6468 : /************************************************************************/
6469 : /* GDALTranspose2DSingleToSingle() */
6470 : /************************************************************************/
6471 : /**
6472 : * Transpose a 2D array of non-complex values, in a efficient (cache-oblivious) way.
6473 : *
6474 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6475 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6476 : * @param nSrcWidth Width of pSrc array.
6477 : * @param nSrcHeight Height of pSrc array.
6478 : */
6479 :
6480 : template <class DST, class SRC>
6481 160 : void GDALTranspose2DSingleToSingle(const SRC *CPL_RESTRICT pSrc,
6482 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6483 : size_t nSrcHeight)
6484 : {
6485 160 : constexpr size_t blocksize = 32;
6486 345 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6487 : {
6488 185 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6489 5016 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6490 : {
6491 : // transpose the block beginning at [i,j]
6492 4831 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6493 26185 : for (size_t k = i; k < max_k; ++k)
6494 : {
6495 669282 : for (size_t l = j; l < max_l; ++l)
6496 : {
6497 647928 : GDALCopyWord(pSrc[l + k * nSrcWidth],
6498 647928 : pDst[k + l * nSrcHeight]);
6499 : }
6500 : }
6501 : }
6502 : }
6503 160 : }
6504 :
6505 : /************************************************************************/
6506 : /* GDALTranspose2DComplexToComplex() */
6507 : /************************************************************************/
6508 : /**
6509 : * Transpose a 2D array of complex values into an array of complex values,
6510 : * in a efficient (cache-oblivious) way.
6511 : *
6512 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6513 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6514 : * @param nSrcWidth Width of pSrc array.
6515 : * @param nSrcHeight Height of pSrc array.
6516 : */
6517 : template <class DST, class SRC>
6518 25 : void GDALTranspose2DComplexToComplex(const SRC *CPL_RESTRICT pSrc,
6519 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6520 : size_t nSrcHeight)
6521 : {
6522 25 : constexpr size_t blocksize = 32;
6523 50 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6524 : {
6525 25 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6526 50 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6527 : {
6528 : // transpose the block beginning at [i,j]
6529 25 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6530 75 : for (size_t k = i; k < max_k; ++k)
6531 : {
6532 200 : for (size_t l = j; l < max_l; ++l)
6533 : {
6534 150 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
6535 150 : pDst[2 * (k + l * nSrcHeight) + 0]);
6536 150 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 1],
6537 150 : pDst[2 * (k + l * nSrcHeight) + 1]);
6538 : }
6539 : }
6540 : }
6541 : }
6542 25 : }
6543 :
6544 : /************************************************************************/
6545 : /* GDALTranspose2DComplexToSingle() */
6546 : /************************************************************************/
6547 : /**
6548 : * Transpose a 2D array of complex values into an array of non-complex values,
6549 : * in a efficient (cache-oblivious) way.
6550 : *
6551 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6552 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6553 : * @param nSrcWidth Width of pSrc array.
6554 : * @param nSrcHeight Height of pSrc array.
6555 : */
6556 : template <class DST, class SRC>
6557 55 : void GDALTranspose2DComplexToSingle(const SRC *CPL_RESTRICT pSrc,
6558 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6559 : size_t nSrcHeight)
6560 : {
6561 55 : constexpr size_t blocksize = 32;
6562 110 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6563 : {
6564 55 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6565 110 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6566 : {
6567 : // transpose the block beginning at [i,j]
6568 55 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6569 165 : for (size_t k = i; k < max_k; ++k)
6570 : {
6571 440 : for (size_t l = j; l < max_l; ++l)
6572 : {
6573 330 : GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
6574 330 : pDst[k + l * nSrcHeight]);
6575 : }
6576 : }
6577 : }
6578 : }
6579 55 : }
6580 :
6581 : /************************************************************************/
6582 : /* GDALTranspose2DSingleToComplex() */
6583 : /************************************************************************/
6584 : /**
6585 : * Transpose a 2D array of non-complex values into an array of complex values,
6586 : * in a efficient (cache-oblivious) way.
6587 : *
6588 : * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
6589 : * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
6590 : * @param nSrcWidth Width of pSrc array.
6591 : * @param nSrcHeight Height of pSrc array.
6592 : */
6593 : template <class DST, class SRC>
6594 55 : void GDALTranspose2DSingleToComplex(const SRC *CPL_RESTRICT pSrc,
6595 : DST *CPL_RESTRICT pDst, size_t nSrcWidth,
6596 : size_t nSrcHeight)
6597 : {
6598 55 : constexpr size_t blocksize = 32;
6599 110 : for (size_t i = 0; i < nSrcHeight; i += blocksize)
6600 : {
6601 55 : const size_t max_k = std::min(i + blocksize, nSrcHeight);
6602 110 : for (size_t j = 0; j < nSrcWidth; j += blocksize)
6603 : {
6604 : // transpose the block beginning at [i,j]
6605 55 : const size_t max_l = std::min(j + blocksize, nSrcWidth);
6606 165 : for (size_t k = i; k < max_k; ++k)
6607 : {
6608 440 : for (size_t l = j; l < max_l; ++l)
6609 : {
6610 330 : GDALCopyWord(pSrc[l + k * nSrcWidth],
6611 330 : pDst[2 * (k + l * nSrcHeight) + 0]);
6612 330 : pDst[2 * (k + l * nSrcHeight) + 1] = 0;
6613 : }
6614 : }
6615 : }
6616 : }
6617 55 : }
6618 :
6619 : /************************************************************************/
6620 : /* GDALTranspose2D() */
6621 : /************************************************************************/
6622 :
6623 : template <class DST, bool DST_IS_COMPLEX>
6624 295 : static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
6625 : size_t nSrcWidth, size_t nSrcHeight)
6626 : {
6627 : #define CALL_GDALTranspose2D_internal(SRC_TYPE) \
6628 : do \
6629 : { \
6630 : if constexpr (DST_IS_COMPLEX) \
6631 : { \
6632 : GDALTranspose2DSingleToComplex( \
6633 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6634 : nSrcHeight); \
6635 : } \
6636 : else \
6637 : { \
6638 : GDALTranspose2DSingleToSingle(static_cast<const SRC_TYPE *>(pSrc), \
6639 : pDst, nSrcWidth, nSrcHeight); \
6640 : } \
6641 : } while (0)
6642 :
6643 : #define CALL_GDALTranspose2DComplex_internal(SRC_TYPE) \
6644 : do \
6645 : { \
6646 : if constexpr (DST_IS_COMPLEX) \
6647 : { \
6648 : GDALTranspose2DComplexToComplex( \
6649 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6650 : nSrcHeight); \
6651 : } \
6652 : else \
6653 : { \
6654 : GDALTranspose2DComplexToSingle( \
6655 : static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth, \
6656 : nSrcHeight); \
6657 : } \
6658 : } while (0)
6659 :
6660 : // clang-format off
6661 295 : switch (eSrcType)
6662 : {
6663 16 : case GDT_UInt8: CALL_GDALTranspose2D_internal(uint8_t); break;
6664 15 : case GDT_Int8: CALL_GDALTranspose2D_internal(int8_t); break;
6665 33 : case GDT_UInt16: CALL_GDALTranspose2D_internal(uint16_t); break;
6666 20 : case GDT_Int16: CALL_GDALTranspose2D_internal(int16_t); break;
6667 24 : case GDT_UInt32: CALL_GDALTranspose2D_internal(uint32_t); break;
6668 16 : case GDT_Int32: CALL_GDALTranspose2D_internal(int32_t); break;
6669 16 : case GDT_UInt64: CALL_GDALTranspose2D_internal(uint64_t); break;
6670 16 : case GDT_Int64: CALL_GDALTranspose2D_internal(int64_t); break;
6671 16 : case GDT_Float16: CALL_GDALTranspose2D_internal(GFloat16); break;
6672 19 : case GDT_Float32: CALL_GDALTranspose2D_internal(float); break;
6673 24 : case GDT_Float64: CALL_GDALTranspose2D_internal(double); break;
6674 16 : case GDT_CInt16: CALL_GDALTranspose2DComplex_internal(int16_t); break;
6675 16 : case GDT_CInt32: CALL_GDALTranspose2DComplex_internal(int32_t); break;
6676 16 : case GDT_CFloat16: CALL_GDALTranspose2DComplex_internal(GFloat16); break;
6677 16 : case GDT_CFloat32: CALL_GDALTranspose2DComplex_internal(float); break;
6678 16 : case GDT_CFloat64: CALL_GDALTranspose2DComplex_internal(double); break;
6679 0 : case GDT_Unknown:
6680 : case GDT_TypeCount:
6681 0 : break;
6682 : }
6683 : // clang-format on
6684 :
6685 : #undef CALL_GDALTranspose2D_internal
6686 : #undef CALL_GDALTranspose2DComplex_internal
6687 295 : }
6688 :
6689 : /************************************************************************/
6690 : /* GDALInterleave2Byte() */
6691 : /************************************************************************/
6692 :
6693 : #if defined(HAVE_SSE2) && \
6694 : (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
6695 :
6696 : // ICC autovectorizer doesn't do a good job at generating good SSE code,
6697 : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
6698 : #if defined(__GNUC__)
6699 : __attribute__((noinline))
6700 : #endif
6701 : static void
6702 : GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
6703 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
6704 : {
6705 : size_t i = 0;
6706 : constexpr size_t VALS_PER_ITER = 16;
6707 : for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
6708 : {
6709 : __m128i xmm0 =
6710 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + i));
6711 : __m128i xmm1 = _mm_loadu_si128(
6712 : reinterpret_cast<__m128i const *>(pSrc + i + nIters));
6713 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + 2 * i),
6714 : _mm_unpacklo_epi8(xmm0, xmm1));
6715 : _mm_storeu_si128(
6716 : reinterpret_cast<__m128i *>(pDst + 2 * i + VALS_PER_ITER),
6717 : _mm_unpackhi_epi8(xmm0, xmm1));
6718 : }
6719 : #if defined(__clang__)
6720 : #pragma clang loop vectorize(disable)
6721 : #endif
6722 : for (; i < nIters; ++i)
6723 : {
6724 : pDst[2 * i + 0] = pSrc[i + 0 * nIters];
6725 : pDst[2 * i + 1] = pSrc[i + 1 * nIters];
6726 : }
6727 : }
6728 :
6729 : #else
6730 :
6731 : #if defined(__GNUC__) && !defined(__clang__)
6732 : __attribute__((optimize("tree-vectorize")))
6733 : #endif
6734 : #if defined(__GNUC__)
6735 : __attribute__((noinline))
6736 : #endif
6737 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6738 : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
6739 : #pragma clang diagnostic push
6740 : #pragma clang diagnostic ignored "-Wpass-failed"
6741 : #endif
6742 : static void
6743 9 : GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
6744 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
6745 : {
6746 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6747 : #pragma clang loop vectorize(enable)
6748 : #endif
6749 355429 : for (size_t i = 0; i < nIters; ++i)
6750 : {
6751 355420 : pDst[2 * i + 0] = pSrc[i + 0 * nIters];
6752 355420 : pDst[2 * i + 1] = pSrc[i + 1 * nIters];
6753 : }
6754 9 : }
6755 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6756 : #pragma clang diagnostic pop
6757 : #endif
6758 :
6759 : #endif
6760 :
6761 : /************************************************************************/
6762 : /* GDALInterleave4Byte() */
6763 : /************************************************************************/
6764 :
6765 : #if defined(HAVE_SSE2) && \
6766 : (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
6767 :
6768 : // ICC autovectorizer doesn't do a good job at generating good SSE code,
6769 : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
6770 : #if defined(__GNUC__)
6771 : __attribute__((noinline))
6772 : #endif
6773 : static void
6774 : GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
6775 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
6776 : {
6777 : size_t i = 0;
6778 : constexpr size_t VALS_PER_ITER = 16;
6779 : for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
6780 : {
6781 : __m128i xmm0 = _mm_loadu_si128(
6782 : reinterpret_cast<__m128i const *>(pSrc + i + 0 * nIters));
6783 : __m128i xmm1 = _mm_loadu_si128(
6784 : reinterpret_cast<__m128i const *>(pSrc + i + 1 * nIters));
6785 : __m128i xmm2 = _mm_loadu_si128(
6786 : reinterpret_cast<__m128i const *>(pSrc + i + 2 * nIters));
6787 : __m128i xmm3 = _mm_loadu_si128(
6788 : reinterpret_cast<__m128i const *>(pSrc + i + 3 * nIters));
6789 : auto tmp0 = _mm_unpacklo_epi8(
6790 : xmm0,
6791 : xmm1); // (xmm0_0, xmm1_0, xmm0_1, xmm1_1, xmm0_2, xmm1_2, ...)
6792 : auto tmp1 = _mm_unpackhi_epi8(
6793 : xmm0,
6794 : xmm1); // (xmm0_8, xmm1_8, xmm0_9, xmm1_9, xmm0_10, xmm1_10, ...)
6795 : auto tmp2 = _mm_unpacklo_epi8(
6796 : xmm2,
6797 : xmm3); // (xmm2_0, xmm3_0, xmm2_1, xmm3_1, xmm2_2, xmm3_2, ...)
6798 : auto tmp3 = _mm_unpackhi_epi8(
6799 : xmm2,
6800 : xmm3); // (xmm2_8, xmm3_8, xmm2_9, xmm3_9, xmm2_10, xmm3_10, ...)
6801 : auto tmp2_0 = _mm_unpacklo_epi16(
6802 : tmp0,
6803 : tmp2); // (xmm0_0, xmm1_0, xmm2_0, xmm3_0, xmm0_1, xmm1_1, xmm2_1, xmm3_1, ...)
6804 : auto tmp2_1 = _mm_unpackhi_epi16(tmp0, tmp2);
6805 : auto tmp2_2 = _mm_unpacklo_epi16(tmp1, tmp3);
6806 : auto tmp2_3 = _mm_unpackhi_epi16(tmp1, tmp3);
6807 : _mm_storeu_si128(
6808 : reinterpret_cast<__m128i *>(pDst + 4 * i + 0 * VALS_PER_ITER),
6809 : tmp2_0);
6810 : _mm_storeu_si128(
6811 : reinterpret_cast<__m128i *>(pDst + 4 * i + 1 * VALS_PER_ITER),
6812 : tmp2_1);
6813 : _mm_storeu_si128(
6814 : reinterpret_cast<__m128i *>(pDst + 4 * i + 2 * VALS_PER_ITER),
6815 : tmp2_2);
6816 : _mm_storeu_si128(
6817 : reinterpret_cast<__m128i *>(pDst + 4 * i + 3 * VALS_PER_ITER),
6818 : tmp2_3);
6819 : }
6820 : #if defined(__clang__)
6821 : #pragma clang loop vectorize(disable)
6822 : #endif
6823 : for (; i < nIters; ++i)
6824 : {
6825 : pDst[4 * i + 0] = pSrc[i + 0 * nIters];
6826 : pDst[4 * i + 1] = pSrc[i + 1 * nIters];
6827 : pDst[4 * i + 2] = pSrc[i + 2 * nIters];
6828 : pDst[4 * i + 3] = pSrc[i + 3 * nIters];
6829 : }
6830 : }
6831 :
6832 : #else
6833 :
6834 : #if defined(__GNUC__) && !defined(__clang__)
6835 : __attribute__((optimize("tree-vectorize")))
6836 : #endif
6837 : #if defined(__GNUC__)
6838 : __attribute__((noinline))
6839 : #endif
6840 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6841 : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
6842 : #pragma clang diagnostic push
6843 : #pragma clang diagnostic ignored "-Wpass-failed"
6844 : #endif
6845 : static void
6846 9 : GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
6847 : uint8_t *CPL_RESTRICT pDst, size_t nIters)
6848 : {
6849 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6850 : #pragma clang loop vectorize(enable)
6851 : #endif
6852 75443 : for (size_t i = 0; i < nIters; ++i)
6853 : {
6854 75434 : pDst[4 * i + 0] = pSrc[i + 0 * nIters];
6855 75434 : pDst[4 * i + 1] = pSrc[i + 1 * nIters];
6856 75434 : pDst[4 * i + 2] = pSrc[i + 2 * nIters];
6857 75434 : pDst[4 * i + 3] = pSrc[i + 3 * nIters];
6858 : }
6859 9 : }
6860 : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
6861 : #pragma clang diagnostic pop
6862 : #endif
6863 :
6864 : #endif
6865 :
6866 : /************************************************************************/
6867 : /* GDALTranspose2D() */
6868 : /************************************************************************/
6869 :
6870 : /**
6871 : * Transpose a 2D array in a efficient (cache-oblivious) way.
6872 : *
6873 : * @param pSrc Source array of width = nSrcWidth and height = nSrcHeight.
6874 : * @param eSrcType Data type of pSrc.
6875 : * @param pDst Destination transposed array of width = nSrcHeight and height = nSrcWidth.
6876 : * @param eDstType Data type of pDst.
6877 : * @param nSrcWidth Width of pSrc array.
6878 : * @param nSrcHeight Height of pSrc array.
6879 : * @since GDAL 3.11
6880 : */
6881 :
6882 346 : void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
6883 : GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
6884 : {
6885 346 : if (eSrcType == eDstType && (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8))
6886 : {
6887 51 : if (nSrcHeight == 2)
6888 : {
6889 9 : GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
6890 : static_cast<uint8_t *>(pDst), nSrcWidth);
6891 9 : return;
6892 : }
6893 42 : if (nSrcHeight == 4)
6894 : {
6895 9 : GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
6896 : static_cast<uint8_t *>(pDst), nSrcWidth);
6897 9 : return;
6898 : }
6899 : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
6900 : (defined(__x86_64) || defined(_M_X64)))
6901 33 : if (CPLHaveRuntimeSSSE3())
6902 : {
6903 33 : GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
6904 : static_cast<uint8_t *>(pDst), nSrcWidth,
6905 : nSrcHeight);
6906 33 : return;
6907 : }
6908 : #elif defined(USE_NEON_OPTIMIZATIONS)
6909 : {
6910 : GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
6911 : static_cast<uint8_t *>(pDst), nSrcWidth,
6912 : nSrcHeight);
6913 : return;
6914 : }
6915 : #endif
6916 : }
6917 :
6918 : #define CALL_GDALTranspose2D_internal(DST_TYPE, DST_IS_COMPLEX) \
6919 : GDALTranspose2D<DST_TYPE, DST_IS_COMPLEX>( \
6920 : pSrc, eSrcType, static_cast<DST_TYPE *>(pDst), nSrcWidth, nSrcHeight)
6921 :
6922 : // clang-format off
6923 295 : switch (eDstType)
6924 : {
6925 15 : case GDT_UInt8: CALL_GDALTranspose2D_internal(uint8_t, false); break;
6926 15 : case GDT_Int8: CALL_GDALTranspose2D_internal(int8_t, false); break;
6927 33 : case GDT_UInt16: CALL_GDALTranspose2D_internal(uint16_t, false); break;
6928 20 : case GDT_Int16: CALL_GDALTranspose2D_internal(int16_t, false); break;
6929 24 : case GDT_UInt32: CALL_GDALTranspose2D_internal(uint32_t, false); break;
6930 16 : case GDT_Int32: CALL_GDALTranspose2D_internal(int32_t, false); break;
6931 16 : case GDT_UInt64: CALL_GDALTranspose2D_internal(uint64_t, false); break;
6932 16 : case GDT_Int64: CALL_GDALTranspose2D_internal(int64_t, false); break;
6933 16 : case GDT_Float16: CALL_GDALTranspose2D_internal(GFloat16, false); break;
6934 19 : case GDT_Float32: CALL_GDALTranspose2D_internal(float, false); break;
6935 25 : case GDT_Float64: CALL_GDALTranspose2D_internal(double, false); break;
6936 16 : case GDT_CInt16: CALL_GDALTranspose2D_internal(int16_t, true); break;
6937 16 : case GDT_CInt32: CALL_GDALTranspose2D_internal(int32_t, true); break;
6938 16 : case GDT_CFloat16: CALL_GDALTranspose2D_internal(GFloat16, true); break;
6939 16 : case GDT_CFloat32: CALL_GDALTranspose2D_internal(float, true); break;
6940 16 : case GDT_CFloat64: CALL_GDALTranspose2D_internal(double, true); break;
6941 0 : case GDT_Unknown:
6942 : case GDT_TypeCount:
6943 0 : break;
6944 : }
6945 : // clang-format on
6946 :
6947 : #undef CALL_GDALTranspose2D_internal
6948 : }
6949 :
6950 : /************************************************************************/
6951 : /* ExtractBitAndConvertTo255() */
6952 : /************************************************************************/
6953 :
6954 : #if defined(__GNUC__) || defined(_MSC_VER)
6955 : // Signedness of char implementation dependent, so be explicit.
6956 : // Assumes 2-complement integer types and sign extension of right shifting
6957 : // GCC guarantees such:
6958 : // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
6959 124890 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
6960 : {
6961 124890 : return static_cast<GByte>(static_cast<signed char>(byVal << (7 - nBit)) >>
6962 124890 : 7);
6963 : }
6964 : #else
6965 : // Portable way
6966 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
6967 : {
6968 : return (byVal & (1 << nBit)) ? 255 : 0;
6969 : }
6970 : #endif
6971 :
6972 : /************************************************************************/
6973 : /* ExpandEightPackedBitsToByteAt255() */
6974 : /************************************************************************/
6975 :
6976 15569 : static inline void ExpandEightPackedBitsToByteAt255(GByte byVal,
6977 : GByte abyOutput[8])
6978 : {
6979 15569 : abyOutput[0] = ExtractBitAndConvertTo255(byVal, 7);
6980 15569 : abyOutput[1] = ExtractBitAndConvertTo255(byVal, 6);
6981 15569 : abyOutput[2] = ExtractBitAndConvertTo255(byVal, 5);
6982 15569 : abyOutput[3] = ExtractBitAndConvertTo255(byVal, 4);
6983 15569 : abyOutput[4] = ExtractBitAndConvertTo255(byVal, 3);
6984 15569 : abyOutput[5] = ExtractBitAndConvertTo255(byVal, 2);
6985 15569 : abyOutput[6] = ExtractBitAndConvertTo255(byVal, 1);
6986 15569 : abyOutput[7] = ExtractBitAndConvertTo255(byVal, 0);
6987 15569 : }
6988 :
6989 : /************************************************************************/
6990 : /* GDALExpandPackedBitsToByteAt0Or255() */
6991 : /************************************************************************/
6992 :
6993 : /** Expand packed-bits (ordered from most-significant bit to least one)
6994 : into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
6995 : at 1 to a byte at 255.
6996 :
6997 : The function does (in a possibly more optimized way) the following:
6998 : \code{.cpp}
6999 : for (size_t i = 0; i < nInputBits; ++i )
7000 : {
7001 : pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 255 : 0;
7002 : }
7003 : \endcode
7004 :
7005 : @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
7006 : @param pabyOutput Output array of nInputBits bytes.
7007 : @param nInputBits Number of valid bits in pabyInput.
7008 :
7009 : @since 3.11
7010 : */
7011 :
7012 45145 : void GDALExpandPackedBitsToByteAt0Or255(const GByte *CPL_RESTRICT pabyInput,
7013 : GByte *CPL_RESTRICT pabyOutput,
7014 : size_t nInputBits)
7015 : {
7016 45145 : const size_t nInputWholeBytes = nInputBits / 8;
7017 45145 : size_t iByte = 0;
7018 :
7019 : #ifdef HAVE_SSE2
7020 : // Mask to isolate each bit
7021 45145 : const __m128i bit_mask = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4,
7022 : 8, 16, 32, 64, -128);
7023 45145 : const __m128i zero = _mm_setzero_si128();
7024 45145 : const __m128i all_ones = _mm_set1_epi8(-1);
7025 : #ifdef __SSSE3__
7026 : const __m128i dispatch_two_bytes =
7027 : _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
7028 : #endif
7029 45145 : constexpr size_t SSE_REG_SIZE = sizeof(bit_mask);
7030 135654 : for (; iByte + SSE_REG_SIZE <= nInputWholeBytes; iByte += SSE_REG_SIZE)
7031 : {
7032 90509 : __m128i reg_ori = _mm_loadu_si128(
7033 90509 : reinterpret_cast<const __m128i *>(pabyInput + iByte));
7034 :
7035 90509 : constexpr int NUM_PROCESSED_BYTES_PER_REG = 2;
7036 814581 : for (size_t k = 0; k < SSE_REG_SIZE / NUM_PROCESSED_BYTES_PER_REG; ++k)
7037 : {
7038 : // Given reg_ori = (A, B, ... 14 other bytes ...),
7039 : // expand to (A, A, A, A, A, A, A, A, B, B, B, B, B, B, B, B)
7040 : #ifdef __SSSE3__
7041 : __m128i reg = _mm_shuffle_epi8(reg_ori, dispatch_two_bytes);
7042 : #else
7043 724072 : __m128i reg = _mm_unpacklo_epi8(reg_ori, reg_ori);
7044 724072 : reg = _mm_unpacklo_epi16(reg, reg);
7045 724072 : reg = _mm_unpacklo_epi32(reg, reg);
7046 : #endif
7047 :
7048 : // Test if bits of interest are set
7049 724072 : reg = _mm_and_si128(reg, bit_mask);
7050 :
7051 : // Now test if those bits are set, by comparing to zero. So the
7052 : // result will be that bytes where bits are set will be at 0, and
7053 : // ones where they are cleared will be at 0xFF. So the inverse of
7054 : // the end result we want!
7055 724072 : reg = _mm_cmpeq_epi8(reg, zero);
7056 :
7057 : // Invert the result
7058 724072 : reg = _mm_andnot_si128(reg, all_ones);
7059 :
7060 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyOutput), reg);
7061 :
7062 724072 : pabyOutput += SSE_REG_SIZE;
7063 :
7064 : // Right-shift of 2 bytes
7065 724072 : reg_ori = _mm_bsrli_si128(reg_ori, NUM_PROCESSED_BYTES_PER_REG);
7066 : }
7067 : }
7068 :
7069 : #endif // HAVE_SSE2
7070 :
7071 60714 : for (; iByte < nInputWholeBytes; ++iByte)
7072 : {
7073 15569 : ExpandEightPackedBitsToByteAt255(pabyInput[iByte], pabyOutput);
7074 15569 : pabyOutput += 8;
7075 : }
7076 45483 : for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
7077 : {
7078 338 : *pabyOutput = ExtractBitAndConvertTo255(pabyInput[iByte], 7 - iBit);
7079 338 : ++pabyOutput;
7080 : }
7081 45145 : }
7082 :
7083 : /************************************************************************/
7084 : /* ExpandEightPackedBitsToByteAt1() */
7085 : /************************************************************************/
7086 :
7087 136113 : static inline void ExpandEightPackedBitsToByteAt1(GByte byVal,
7088 : GByte abyOutput[8])
7089 : {
7090 136113 : abyOutput[0] = (byVal >> 7) & 0x1;
7091 136113 : abyOutput[1] = (byVal >> 6) & 0x1;
7092 136113 : abyOutput[2] = (byVal >> 5) & 0x1;
7093 136113 : abyOutput[3] = (byVal >> 4) & 0x1;
7094 136113 : abyOutput[4] = (byVal >> 3) & 0x1;
7095 136113 : abyOutput[5] = (byVal >> 2) & 0x1;
7096 136113 : abyOutput[6] = (byVal >> 1) & 0x1;
7097 136113 : abyOutput[7] = (byVal >> 0) & 0x1;
7098 136113 : }
7099 :
7100 : /************************************************************************/
7101 : /* GDALExpandPackedBitsToByteAt0Or1() */
7102 : /************************************************************************/
7103 :
7104 : /** Expand packed-bits (ordered from most-significant bit to least one)
7105 : into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
7106 : at 1 to a byte at 1.
7107 :
7108 : The function does (in a possibly more optimized way) the following:
7109 : \code{.cpp}
7110 : for (size_t i = 0; i < nInputBits; ++i )
7111 : {
7112 : pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
7113 : }
7114 : \endcode
7115 :
7116 : @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
7117 : @param pabyOutput Output array of nInputBits bytes.
7118 : @param nInputBits Number of valid bits in pabyInput.
7119 :
7120 : @since 3.11
7121 : */
7122 :
7123 7041 : void GDALExpandPackedBitsToByteAt0Or1(const GByte *CPL_RESTRICT pabyInput,
7124 : GByte *CPL_RESTRICT pabyOutput,
7125 : size_t nInputBits)
7126 : {
7127 7041 : const size_t nInputWholeBytes = nInputBits / 8;
7128 7041 : size_t iByte = 0;
7129 143154 : for (; iByte < nInputWholeBytes; ++iByte)
7130 : {
7131 136113 : ExpandEightPackedBitsToByteAt1(pabyInput[iByte], pabyOutput);
7132 136113 : pabyOutput += 8;
7133 : }
7134 18902 : for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
7135 : {
7136 11861 : *pabyOutput = (pabyInput[iByte] >> (7 - iBit)) & 0x1;
7137 11861 : ++pabyOutput;
7138 : }
7139 7041 : }
|