Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: GDAL Core
4 : * Purpose: Inline C++ templates
5 : * Author: Phil Vachon, <philippe at cowpig.ca>
6 : *
7 : ******************************************************************************
8 : * Copyright (c) 2009, Phil Vachon, <philippe at cowpig.ca>
9 : * Copyright (c) 2025, Even Rouault, <even.rouault at spatialys.com>
10 : *
11 : * SPDX-License-Identifier: MIT
12 : ****************************************************************************/
13 :
14 : #ifndef GDAL_PRIV_TEMPLATES_HPP_INCLUDED
15 : #define GDAL_PRIV_TEMPLATES_HPP_INCLUDED
16 :
17 : #include "cpl_port.h"
18 :
19 : #include <algorithm>
20 : #include <cmath>
21 : #include <cstdint>
22 : #include <limits>
23 : #include <type_traits>
24 :
25 : #include "cpl_float.h"
26 :
27 : // Needs SSE2
28 : #if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) || \
29 : defined(USE_NEON_OPTIMIZATIONS)
30 :
31 : #ifdef USE_NEON_OPTIMIZATIONS
32 : #include "include_sse2neon.h"
33 : #else
34 : #include <immintrin.h>
35 : #endif
36 :
37 50086345 : static inline void GDALCopyXMMToInt32(const __m128i xmm, void *pDest)
38 : {
39 50086345 : int n32 = _mm_cvtsi128_si32(xmm); // Extract lower 32 bit word
40 50086345 : memcpy(pDest, &n32, sizeof(n32));
41 50086345 : }
42 :
43 106174301 : static inline void GDALCopyXMMToInt64(const __m128i xmm, void *pDest)
44 : {
45 : _mm_storel_epi64(reinterpret_cast<__m128i *>(pDest), xmm);
46 106174301 : }
47 :
48 : #if __SSSE3__
49 : #include <tmmintrin.h>
50 : #endif
51 :
52 : #if defined(__SSE4_1__) || defined(__AVX__)
53 : #include <smmintrin.h>
54 : #endif
55 :
56 : #ifdef __F16C__
57 : #include <immintrin.h>
58 : #endif
59 :
60 : #endif
61 :
62 : /************************************************************************/
63 : /* GDALGetDataLimits() */
64 : /************************************************************************/
65 : /**
66 : * Compute the limits of values that can be placed in Tout in terms of
67 : * Tin. Usually used for output clamping, when the output data type's
68 : * limits are stable relative to the input type (i.e. no roundoff error).
69 : *
70 : * @param tMaxValue the returned maximum value
71 : * @param tMinValue the returned minimum value
72 : */
73 :
74 : template <class Tin, class Tout>
75 386471695 : inline void GDALGetDataLimits(Tin &tMaxValue, Tin &tMinValue)
76 : {
77 386471695 : tMaxValue = cpl::NumericLimits<Tin>::max();
78 386471695 : tMinValue = cpl::NumericLimits<Tin>::lowest();
79 :
80 : // Compute the actual minimum value of Tout in terms of Tin.
81 : if constexpr (cpl::NumericLimits<Tout>::is_signed &&
82 : cpl::NumericLimits<Tout>::is_integer)
83 : {
84 : // the minimum value is less than zero
85 : // cppcheck-suppress knownConditionTrueFalse
86 : if constexpr (cpl::NumericLimits<Tout>::digits <
87 : cpl::NumericLimits<Tin>::digits ||
88 : !cpl::NumericLimits<Tin>::is_integer)
89 : {
90 : // Tout is smaller than Tin, so we need to clamp values in input
91 : // to the range of Tout's min/max values
92 : if constexpr (cpl::NumericLimits<Tin>::is_signed)
93 : {
94 85289862 : tMinValue =
95 85289862 : static_cast<Tin>(cpl::NumericLimits<Tout>::lowest());
96 : }
97 85810539 : tMaxValue = static_cast<Tin>(cpl::NumericLimits<Tout>::max());
98 : }
99 : }
100 : else if constexpr (cpl::NumericLimits<Tout>::is_integer)
101 : {
102 : // the output is unsigned, so we just need to determine the max
103 : if constexpr (!std::is_same_v<Tin, Tout> &&
104 : cpl::NumericLimits<Tout>::digits <=
105 : cpl::NumericLimits<Tin>::digits)
106 : {
107 : // Tout is smaller than Tin, so we need to clamp the input values
108 : // to the range of Tout's max
109 145386713 : tMaxValue = static_cast<Tin>(cpl::NumericLimits<Tout>::max());
110 : }
111 147242760 : tMinValue = 0;
112 : }
113 386471695 : }
114 :
115 : /************************************************************************/
116 : /* GDALClampValue() */
117 : /************************************************************************/
118 : /**
119 : * Clamp values of type T to a specified range
120 : *
121 : * @param tValue the value
122 : * @param tMax the max value
123 : * @param tMin the min value
124 : */
125 : template <class T>
126 386472029 : inline T GDALClampValue(const T tValue, const T tMax, const T tMin)
127 : {
128 386472029 : return tValue > tMax ? tMax : tValue < tMin ? tMin : tValue;
129 : }
130 :
131 : /************************************************************************/
132 : /* GDALClampDoubleValue() */
133 : /************************************************************************/
134 : /**
135 : * Clamp double values to a specified range, this uses the same
136 : * argument ordering as std::clamp, returns TRUE if the value was clamped.
137 : *
138 : * @param tValue the value
139 : * @param tMin the min value
140 : * @param tMax the max value
141 : *
142 : */
143 : template <class T2, class T3>
144 269 : inline bool GDALClampDoubleValue(double &tValue, const T2 tMin, const T3 tMax)
145 : {
146 269 : const double tMin2{static_cast<double>(tMin)};
147 269 : const double tMax2{static_cast<double>(tMax)};
148 269 : if (tValue > tMax2 || tValue < tMin2)
149 : {
150 26 : tValue = tValue > tMax2 ? tMax2 : tValue < tMin2 ? tMin2 : tValue;
151 26 : return true;
152 : }
153 : else
154 : {
155 243 : return false;
156 : }
157 : }
158 :
159 : /************************************************************************/
160 : /* GDALIsValueInRange() */
161 : /************************************************************************/
162 : /**
163 : * Returns whether a value is in the type range.
164 : * NaN is considered not to be in type range.
165 : *
166 : * @param dfValue the value
167 : * @return whether the value is in the type range.
168 : */
169 155009 : template <class T> inline bool GDALIsValueInRange(double dfValue)
170 : {
171 309955 : return dfValue >= static_cast<double>(cpl::NumericLimits<T>::lowest()) &&
172 309955 : dfValue <= static_cast<double>(cpl::NumericLimits<T>::max());
173 : }
174 :
175 27 : template <> inline bool GDALIsValueInRange<double>(double dfValue)
176 : {
177 27 : return !CPLIsNan(dfValue);
178 : }
179 :
180 41942 : template <> inline bool GDALIsValueInRange<float>(double dfValue)
181 : {
182 83813 : return CPLIsInf(dfValue) ||
183 : (dfValue >=
184 41871 : -static_cast<double>(std::numeric_limits<float>::max()) &&
185 83808 : dfValue <= static_cast<double>(std::numeric_limits<float>::max()));
186 : }
187 :
188 107 : template <> inline bool GDALIsValueInRange<GFloat16>(double dfValue)
189 : {
190 321 : return CPLIsInf(dfValue) ||
191 214 : (dfValue >= -cpl::NumericLimits<GFloat16>::max() &&
192 214 : dfValue <= cpl::NumericLimits<GFloat16>::max());
193 : }
194 :
195 16901 : template <> inline bool GDALIsValueInRange<int64_t>(double dfValue)
196 : {
197 : // Values in the range [INT64_MAX - 1023, INT64_MAX - 1]
198 : // get converted to a double that once cast to int64_t is
199 : // INT64_MAX + 1, hence the < strict comparison.
200 : return dfValue >=
201 33801 : static_cast<double>(cpl::NumericLimits<int64_t>::lowest()) &&
202 33801 : dfValue < static_cast<double>(cpl::NumericLimits<int64_t>::max());
203 : }
204 :
205 8916 : template <> inline bool GDALIsValueInRange<uint64_t>(double dfValue)
206 : {
207 : // Values in the range [UINT64_MAX - 2047, UINT64_MAX - 1]
208 : // get converted to a double that once cast to uint64_t is
209 : // UINT64_MAX + 1, hence the < strict comparison.
210 17829 : return dfValue >= 0 &&
211 17829 : dfValue < static_cast<double>(cpl::NumericLimits<uint64_t>::max());
212 : }
213 :
214 : /************************************************************************/
215 : /* GDALIsValueExactAs() */
216 : /************************************************************************/
217 : /**
218 : * Returns whether a value can be exactly represented on type T.
219 : *
220 : * That is static_cast\<double\>(static_cast\<T\>(dfValue)) is legal and is
221 : * equal to dfValue.
222 : *
223 : * Note: for T=float or double, a NaN input leads to true
224 : *
225 : * @param dfValue the value
226 : * @return whether the value can be exactly represented on type T.
227 : */
228 7365 : template <class T> inline bool GDALIsValueExactAs(double dfValue)
229 : {
230 14678 : return GDALIsValueInRange<T>(dfValue) &&
231 14678 : static_cast<double>(static_cast<T>(dfValue)) == dfValue;
232 : }
233 :
234 139 : template <> inline bool GDALIsValueExactAs<float>(double dfValue)
235 : {
236 272 : return CPLIsNan(dfValue) ||
237 133 : (GDALIsValueInRange<float>(dfValue) &&
238 268 : static_cast<double>(static_cast<float>(dfValue)) == dfValue);
239 : }
240 :
241 0 : template <> inline bool GDALIsValueExactAs<GFloat16>(double dfValue)
242 : {
243 0 : return CPLIsNan(dfValue) ||
244 0 : (GDALIsValueInRange<GFloat16>(dfValue) &&
245 0 : static_cast<double>(static_cast<GFloat16>(dfValue)) == dfValue);
246 : }
247 :
248 5 : template <> inline bool GDALIsValueExactAs<double>(double)
249 : {
250 5 : return true;
251 : }
252 :
253 : /************************************************************************/
254 : /* GDALCopyWord() */
255 : /************************************************************************/
256 :
257 : // Integer input and output: clamp the input
258 :
259 : template <class Tin, class Tout> struct sGDALCopyWord
260 : {
261 213067224 : static inline void f(const Tin tValueIn, Tout &tValueOut)
262 : {
263 : Tin tMaxVal, tMinVal;
264 213067224 : GDALGetDataLimits<Tin, Tout>(tMaxVal, tMinVal);
265 213067224 : tValueOut =
266 213067224 : static_cast<Tout>(GDALClampValue(tValueIn, tMaxVal, tMinVal));
267 213067224 : }
268 : };
269 :
270 : // Integer input and floating point output: simply convert
271 :
272 : template <class Tin> struct sGDALCopyWord<Tin, GFloat16>
273 : {
274 270266 : static inline void f(const Tin tValueIn, GFloat16 &hfValueOut)
275 : {
276 270266 : hfValueOut = static_cast<GFloat16>(tValueIn);
277 270266 : }
278 : };
279 :
280 : template <class Tin> struct sGDALCopyWord<Tin, float>
281 : {
282 7590659 : static inline void f(const Tin tValueIn, float &fValueOut)
283 : {
284 7590659 : fValueOut = static_cast<float>(tValueIn);
285 7590659 : }
286 : };
287 :
288 : template <class Tin> struct sGDALCopyWord<Tin, double>
289 : {
290 84892766 : static inline void f(const Tin tValueIn, double &dfValueOut)
291 : {
292 84892766 : dfValueOut = static_cast<double>(tValueIn);
293 84892766 : }
294 : };
295 :
296 : // Floating point input and output, converting between identical types: simply copy
297 :
298 : template <> struct sGDALCopyWord<GFloat16, GFloat16>
299 : {
300 : static inline void f(const GFloat16 hfValueIn, GFloat16 &hfValueOut)
301 : {
302 : hfValueOut = hfValueIn;
303 : }
304 : };
305 :
306 : template <> struct sGDALCopyWord<float, float>
307 : {
308 : static inline void f(const float fValueIn, float &fValueOut)
309 : {
310 : fValueOut = fValueIn;
311 : }
312 : };
313 :
314 : template <> struct sGDALCopyWord<double, double>
315 : {
316 : static inline void f(const double dfValueIn, double &dfValueOut)
317 : {
318 : dfValueOut = dfValueIn;
319 : }
320 : };
321 :
322 : // Floating point input and output, converting to a larger type: use implicit conversion
323 :
324 : template <> struct sGDALCopyWord<GFloat16, float>
325 : {
326 5158 : static inline void f(const GFloat16 hfValueIn, float &dfValueOut)
327 : {
328 5158 : dfValueOut = hfValueIn;
329 5158 : }
330 : };
331 :
332 : template <> struct sGDALCopyWord<GFloat16, double>
333 : {
334 4174 : static inline void f(const GFloat16 hfValueIn, double &dfValueOut)
335 : {
336 4174 : dfValueOut = hfValueIn;
337 4174 : }
338 : };
339 :
340 : template <> struct sGDALCopyWord<float, double>
341 : {
342 634718 : static inline void f(const float fValueIn, double &dfValueOut)
343 : {
344 634718 : dfValueOut = static_cast<double>(fValueIn);
345 634718 : }
346 : };
347 :
348 : // Floating point input and out, converting to a smaller type: ensure overflow results in infinity
349 :
350 : template <> struct sGDALCopyWord<float, GFloat16>
351 : {
352 497 : static inline void f(const float fValueIn, GFloat16 &hfValueOut)
353 : {
354 : // Our custom implementation when std::float16_t is not
355 : // available ensures proper behavior.
356 : #if !defined(HAVE_STD_FLOAT16_T)
357 497 : if (fValueIn > cpl::NumericLimits<GFloat16>::max())
358 : {
359 27 : hfValueOut = cpl::NumericLimits<GFloat16>::infinity();
360 27 : return;
361 : }
362 470 : if (fValueIn < -cpl::NumericLimits<GFloat16>::max())
363 : {
364 15 : hfValueOut = -cpl::NumericLimits<GFloat16>::infinity();
365 15 : return;
366 : }
367 : #endif
368 455 : hfValueOut = static_cast<GFloat16>(fValueIn);
369 : }
370 : };
371 :
372 : template <> struct sGDALCopyWord<double, GFloat16>
373 : {
374 347 : static inline void f(const double dfValueIn, GFloat16 &hfValueOut)
375 : {
376 : // Our custom implementation when std::float16_t is not
377 : // available ensures proper behavior.
378 : #if !defined(HAVE_STD_FLOAT16_T)
379 347 : if (dfValueIn > cpl::NumericLimits<GFloat16>::max())
380 : {
381 14 : hfValueOut = cpl::NumericLimits<GFloat16>::infinity();
382 14 : return;
383 : }
384 333 : if (dfValueIn < -cpl::NumericLimits<GFloat16>::max())
385 : {
386 14 : hfValueOut = -cpl::NumericLimits<GFloat16>::infinity();
387 14 : return;
388 : }
389 : #endif
390 319 : hfValueOut = static_cast<GFloat16>(dfValueIn);
391 : }
392 : };
393 :
394 : template <> struct sGDALCopyWord<double, float>
395 : {
396 129198 : static inline void f(const double dfValueIn, float &fValueOut)
397 : {
398 : #if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)
399 : // We could just write fValueOut = static_cast<float>(dfValueIn);
400 : // but a sanitizer might complain with values above FLT_MAX
401 387594 : _mm_store_ss(&fValueOut,
402 : _mm_cvtsd_ss(_mm_undefined_ps(), _mm_load_sd(&dfValueIn)));
403 : #else
404 : if (dfValueIn > static_cast<double>(std::numeric_limits<float>::max()))
405 : {
406 : fValueOut = std::numeric_limits<float>::infinity();
407 : return;
408 : }
409 : if (dfValueIn < static_cast<double>(-std::numeric_limits<float>::max()))
410 : {
411 : fValueOut = -std::numeric_limits<float>::infinity();
412 : return;
413 : }
414 :
415 : fValueOut = static_cast<float>(dfValueIn);
416 : #endif
417 129198 : }
418 : };
419 :
420 : // Floating point input to a small unsigned integer type: nan becomes zero, otherwise round and clamp
421 :
422 : template <class Tout> struct sGDALCopyWord<GFloat16, Tout>
423 : {
424 5032 : static inline void f(const GFloat16 hfValueIn, Tout &tValueOut)
425 : {
426 5032 : if (CPLIsNan(hfValueIn))
427 : {
428 1 : tValueOut = 0;
429 1 : return;
430 : }
431 : GFloat16 hfMaxVal, hfMinVal;
432 5031 : GDALGetDataLimits<GFloat16, Tout>(hfMaxVal, hfMinVal);
433 5031 : tValueOut = static_cast<Tout>(
434 10062 : GDALClampValue(hfValueIn + GFloat16(0.5f), hfMaxVal, hfMinVal));
435 : }
436 : };
437 :
438 : template <class Tout> struct sGDALCopyWord<float, Tout>
439 : {
440 5213580 : static inline void f(const float fValueIn, Tout &tValueOut)
441 : {
442 5213580 : if (CPLIsNan(fValueIn))
443 : {
444 0 : tValueOut = 0;
445 0 : return;
446 : }
447 : float fMaxVal, fMinVal;
448 5213580 : GDALGetDataLimits<float, Tout>(fMaxVal, fMinVal);
449 5213580 : tValueOut = static_cast<Tout>(
450 5213580 : GDALClampValue(fValueIn + 0.5f, fMaxVal, fMinVal));
451 : }
452 : };
453 :
454 : template <class Tout> struct sGDALCopyWord<double, Tout>
455 : {
456 82988110 : static inline void f(const double dfValueIn, Tout &tValueOut)
457 : {
458 82988110 : if (CPLIsNan(dfValueIn))
459 : {
460 6 : tValueOut = 0;
461 6 : return;
462 : }
463 : double dfMaxVal, dfMinVal;
464 82988109 : GDALGetDataLimits<double, Tout>(dfMaxVal, dfMinVal);
465 82988109 : tValueOut = static_cast<Tout>(
466 82988109 : GDALClampValue(dfValueIn + 0.5, dfMaxVal, dfMinVal));
467 : }
468 : };
469 :
470 : // Floating point input to a large unsigned integer type: nan becomes zero, otherwise round and clamp.
471 : // Avoid roundoff while clamping.
472 :
473 : template <> struct sGDALCopyWord<GFloat16, std::uint64_t>
474 : {
475 624 : static inline void f(const GFloat16 hfValueIn, std::uint64_t &nValueOut)
476 : {
477 624 : if (!(hfValueIn > 0))
478 : {
479 4 : nValueOut = 0;
480 : }
481 620 : else if (CPLIsInf(hfValueIn))
482 : {
483 1 : nValueOut = cpl::NumericLimits<std::uint64_t>::max();
484 : }
485 : else
486 : {
487 619 : nValueOut = static_cast<std::uint64_t>(hfValueIn + GFloat16(0.5f));
488 : }
489 624 : }
490 : };
491 :
492 : template <> struct sGDALCopyWord<float, unsigned int>
493 : {
494 5039 : static inline void f(const float fValueIn, unsigned int &nValueOut)
495 : {
496 5039 : if (!(fValueIn > 0))
497 : {
498 149 : nValueOut = 0;
499 : }
500 4890 : else if (fValueIn >=
501 4890 : static_cast<float>(cpl::NumericLimits<unsigned int>::max()))
502 : {
503 134 : nValueOut = cpl::NumericLimits<unsigned int>::max();
504 : }
505 : else
506 : {
507 4756 : nValueOut = static_cast<unsigned int>(fValueIn + 0.5f);
508 : }
509 5039 : }
510 : };
511 :
512 : template <> struct sGDALCopyWord<float, std::uint64_t>
513 : {
514 624 : static inline void f(const float fValueIn, std::uint64_t &nValueOut)
515 : {
516 624 : if (!(fValueIn > 0))
517 : {
518 4 : nValueOut = 0;
519 : }
520 620 : else if (fValueIn >=
521 620 : static_cast<float>(cpl::NumericLimits<std::uint64_t>::max()))
522 : {
523 2 : nValueOut = cpl::NumericLimits<std::uint64_t>::max();
524 : }
525 : else
526 : {
527 618 : nValueOut = static_cast<std::uint64_t>(fValueIn + 0.5f);
528 : }
529 624 : }
530 : };
531 :
532 : template <> struct sGDALCopyWord<double, std::uint64_t>
533 : {
534 1064 : static inline void f(const double dfValueIn, std::uint64_t &nValueOut)
535 : {
536 1064 : if (!(dfValueIn > 0))
537 : {
538 166 : nValueOut = 0;
539 : }
540 898 : else if (dfValueIn >
541 898 : static_cast<double>(cpl::NumericLimits<uint64_t>::max()))
542 : {
543 4 : nValueOut = cpl::NumericLimits<uint64_t>::max();
544 : }
545 : else
546 : {
547 894 : nValueOut = static_cast<std::uint64_t>(dfValueIn + 0.5);
548 : }
549 1064 : }
550 : };
551 :
552 : // Floating point input to a very large unsigned integer type: nan becomes zero, otherwise round and clamp.
553 : // Avoid infinity while clamping when the maximum integer is too large for the floating-point type.
554 : // Avoid roundoff while clamping.
555 :
556 : template <> struct sGDALCopyWord<GFloat16, unsigned short>
557 : {
558 4706 : static inline void f(const GFloat16 hfValueIn, unsigned short &nValueOut)
559 : {
560 4706 : if (!(hfValueIn > 0))
561 : {
562 216 : nValueOut = 0;
563 : }
564 4490 : else if (CPLIsInf(hfValueIn))
565 : {
566 67 : nValueOut = cpl::NumericLimits<unsigned short>::max();
567 : }
568 : else
569 : {
570 4423 : nValueOut = static_cast<unsigned short>(hfValueIn + GFloat16(0.5f));
571 : }
572 4706 : }
573 : };
574 :
575 : template <> struct sGDALCopyWord<GFloat16, unsigned int>
576 : {
577 4639 : static inline void f(const GFloat16 hfValueIn, unsigned int &nValueOut)
578 : {
579 4639 : if (!(hfValueIn > 0))
580 : {
581 149 : nValueOut = 0;
582 : }
583 4490 : else if (CPLIsInf(hfValueIn))
584 : {
585 0 : nValueOut = cpl::NumericLimits<unsigned int>::max();
586 : }
587 : else
588 : {
589 4490 : nValueOut = static_cast<unsigned int>(hfValueIn + GFloat16(0.5f));
590 : }
591 4639 : }
592 : };
593 :
594 : // Floating point input to a small signed integer type: nan becomes zero, otherwise round and clamp.
595 : // Rounding for signed integers is different than for the unsigned integers above.
596 :
597 : template <> struct sGDALCopyWord<GFloat16, signed char>
598 : {
599 1088 : static inline void f(const GFloat16 hfValueIn, signed char &nValueOut)
600 : {
601 1088 : if (CPLIsNan(hfValueIn))
602 : {
603 0 : nValueOut = 0;
604 0 : return;
605 : }
606 : GFloat16 hfMaxVal, hfMinVal;
607 1088 : GDALGetDataLimits<GFloat16, signed char>(hfMaxVal, hfMinVal);
608 1088 : GFloat16 hfValue = hfValueIn >= GFloat16(0.0f)
609 619 : ? hfValueIn + GFloat16(0.5f)
610 1088 : : hfValueIn - GFloat16(0.5f);
611 1088 : nValueOut = static_cast<signed char>(
612 2176 : GDALClampValue(hfValue, hfMaxVal, hfMinVal));
613 : }
614 : };
615 :
616 : template <> struct sGDALCopyWord<float, signed char>
617 : {
618 412 : static inline void f(const float fValueIn, signed char &nValueOut)
619 : {
620 412 : if (CPLIsNan(fValueIn))
621 : {
622 0 : nValueOut = 0;
623 0 : return;
624 : }
625 : float fMaxVal, fMinVal;
626 412 : GDALGetDataLimits<float, signed char>(fMaxVal, fMinVal);
627 412 : float fValue = fValueIn >= 0.0f ? fValueIn + 0.5f : fValueIn - 0.5f;
628 412 : nValueOut =
629 412 : static_cast<signed char>(GDALClampValue(fValue, fMaxVal, fMinVal));
630 : }
631 : };
632 :
633 : template <> struct sGDALCopyWord<float, short>
634 : {
635 2939800 : static inline void f(const float fValueIn, short &nValueOut)
636 : {
637 2939800 : if (CPLIsNan(fValueIn))
638 : {
639 0 : nValueOut = 0;
640 0 : return;
641 : }
642 : float fMaxVal, fMinVal;
643 2939800 : GDALGetDataLimits<float, short>(fMaxVal, fMinVal);
644 2939800 : float fValue = fValueIn >= 0.0f ? fValueIn + 0.5f : fValueIn - 0.5f;
645 2939800 : nValueOut =
646 2939800 : static_cast<short>(GDALClampValue(fValue, fMaxVal, fMinVal));
647 : }
648 : };
649 :
650 : template <> struct sGDALCopyWord<double, signed char>
651 : {
652 1321 : static inline void f(const double dfValueIn, signed char &nValueOut)
653 : {
654 1321 : if (CPLIsNan(dfValueIn))
655 : {
656 0 : nValueOut = 0;
657 0 : return;
658 : }
659 : double dfMaxVal, dfMinVal;
660 1321 : GDALGetDataLimits<double, signed char>(dfMaxVal, dfMinVal);
661 1321 : double dfValue = dfValueIn > 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5;
662 1321 : nValueOut = static_cast<signed char>(
663 1321 : GDALClampValue(dfValue, dfMaxVal, dfMinVal));
664 : }
665 : };
666 :
667 : template <> struct sGDALCopyWord<double, short>
668 : {
669 5156930 : static inline void f(const double dfValueIn, short &nValueOut)
670 : {
671 5156930 : if (CPLIsNan(dfValueIn))
672 : {
673 2 : nValueOut = 0;
674 2 : return;
675 : }
676 : double dfMaxVal, dfMinVal;
677 5156930 : GDALGetDataLimits<double, short>(dfMaxVal, dfMinVal);
678 5156930 : double dfValue = dfValueIn > 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5;
679 5156930 : nValueOut =
680 5156930 : static_cast<short>(GDALClampValue(dfValue, dfMaxVal, dfMinVal));
681 : }
682 : };
683 :
684 : template <> struct sGDALCopyWord<double, int>
685 : {
686 77098200 : static inline void f(const double dfValueIn, int &nValueOut)
687 : {
688 77098200 : if (CPLIsNan(dfValueIn))
689 : {
690 0 : nValueOut = 0;
691 0 : return;
692 : }
693 : double dfMaxVal, dfMinVal;
694 77098200 : GDALGetDataLimits<double, int>(dfMaxVal, dfMinVal);
695 77098200 : double dfValue = dfValueIn >= 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5;
696 77098200 : nValueOut =
697 77098200 : static_cast<int>(GDALClampValue(dfValue, dfMaxVal, dfMinVal));
698 : }
699 : };
700 :
701 : // Floating point input to a large signed integer type: nan becomes zero, otherwise round and clamp.
702 : // Rounding for signed integers is different than for the unsigned integers above.
703 : // Avoid roundoff while clamping.
704 :
705 : template <> struct sGDALCopyWord<GFloat16, short>
706 : {
707 5289 : static inline void f(const GFloat16 hfValueIn, short &nValueOut)
708 : {
709 5289 : if (CPLIsNan(hfValueIn))
710 : {
711 0 : nValueOut = 0;
712 : }
713 5289 : else if (hfValueIn >=
714 5289 : static_cast<GFloat16>(cpl::NumericLimits<short>::max()))
715 : {
716 146 : nValueOut = cpl::NumericLimits<short>::max();
717 : }
718 5143 : else if (hfValueIn <=
719 5143 : static_cast<GFloat16>(cpl::NumericLimits<short>::lowest()))
720 : {
721 213 : nValueOut = cpl::NumericLimits<short>::lowest();
722 : }
723 : else
724 : {
725 9860 : nValueOut = static_cast<short>(hfValueIn > GFloat16(0.0f)
726 4930 : ? hfValueIn + GFloat16(0.5f)
727 5451 : : hfValueIn - GFloat16(0.5f));
728 : }
729 5289 : }
730 : };
731 :
732 : template <> struct sGDALCopyWord<float, int>
733 : {
734 10192 : static inline void f(const float fValueIn, int &nValueOut)
735 : {
736 10192 : if (CPLIsNan(fValueIn))
737 : {
738 0 : nValueOut = 0;
739 : }
740 10192 : else if (fValueIn >= static_cast<float>(cpl::NumericLimits<int>::max()))
741 : {
742 218 : nValueOut = cpl::NumericLimits<int>::max();
743 : }
744 9974 : else if (fValueIn <=
745 9974 : static_cast<float>(cpl::NumericLimits<int>::lowest()))
746 : {
747 90 : nValueOut = cpl::NumericLimits<int>::lowest();
748 : }
749 : else
750 : {
751 10103 : nValueOut = static_cast<int>(fValueIn > 0.0f ? fValueIn + 0.5f
752 219 : : fValueIn - 0.5f);
753 : }
754 10192 : }
755 : };
756 :
757 : template <> struct sGDALCopyWord<float, std::int64_t>
758 : {
759 1093 : static inline void f(const float fValueIn, std::int64_t &nValueOut)
760 : {
761 1093 : if (CPLIsNan(fValueIn))
762 : {
763 1 : nValueOut = 0;
764 : }
765 1092 : else if (fValueIn >=
766 1092 : static_cast<float>(cpl::NumericLimits<std::int64_t>::max()))
767 : {
768 2 : nValueOut = cpl::NumericLimits<std::int64_t>::max();
769 : }
770 1090 : else if (fValueIn <=
771 1090 : static_cast<float>(cpl::NumericLimits<std::int64_t>::lowest()))
772 : {
773 2 : nValueOut = cpl::NumericLimits<std::int64_t>::lowest();
774 : }
775 : else
776 : {
777 2176 : nValueOut = static_cast<std::int64_t>(
778 1088 : fValueIn > 0.0f ? fValueIn + 0.5f : fValueIn - 0.5f);
779 : }
780 1093 : }
781 : };
782 :
783 : template <> struct sGDALCopyWord<double, std::int64_t>
784 : {
785 1547 : static inline void f(const double dfValueIn, std::int64_t &nValueOut)
786 : {
787 1547 : if (CPLIsNan(dfValueIn))
788 : {
789 1 : nValueOut = 0;
790 : }
791 1546 : else if (dfValueIn >=
792 1546 : static_cast<double>(cpl::NumericLimits<std::int64_t>::max()))
793 : {
794 4 : nValueOut = cpl::NumericLimits<std::int64_t>::max();
795 : }
796 1542 : else if (dfValueIn <=
797 1542 : static_cast<double>(cpl::NumericLimits<std::int64_t>::min()))
798 : {
799 4 : nValueOut = cpl::NumericLimits<std::int64_t>::min();
800 : }
801 : else
802 : {
803 3076 : nValueOut = static_cast<std::int64_t>(
804 1538 : dfValueIn > 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5);
805 : }
806 1547 : }
807 : };
808 :
809 : // Floating point input to a very large signed integer type: nan becomes zero, otherwise round and clamp.
810 : // Rounding for signed integers is different than for the unsigned integers above.
811 : // Avoid infinity while clamping when the maximum integer is too large for the floating-point type.
812 : // Avoid roundoff while clamping.
813 :
814 : template <> struct sGDALCopyWord<GFloat16, int>
815 : {
816 5222 : static inline void f(const GFloat16 hfValueIn, int &nValueOut)
817 : {
818 5222 : if (CPLIsNan(hfValueIn))
819 : {
820 0 : nValueOut = 0;
821 : }
822 5222 : else if (CPLIsInf(hfValueIn))
823 : {
824 0 : nValueOut = hfValueIn > GFloat16(0.0f)
825 0 : ? cpl::NumericLimits<int>::max()
826 0 : : cpl::NumericLimits<int>::lowest();
827 : }
828 : else
829 : {
830 10444 : nValueOut = static_cast<int>(hfValueIn > GFloat16(0.0f)
831 5222 : ? hfValueIn + GFloat16(0.5f)
832 5889 : : hfValueIn - GFloat16(0.5f));
833 : }
834 5222 : }
835 : };
836 :
837 : template <> struct sGDALCopyWord<GFloat16, std::int64_t>
838 : {
839 1093 : static inline void f(const GFloat16 hfValueIn, std::int64_t &nValueOut)
840 : {
841 1093 : if (CPLIsNan(hfValueIn))
842 : {
843 1 : nValueOut = 0;
844 : }
845 1092 : else if (CPLIsInf(hfValueIn))
846 : {
847 4 : nValueOut = hfValueIn > GFloat16(0.0f)
848 2 : ? cpl::NumericLimits<std::int64_t>::max()
849 1 : : cpl::NumericLimits<std::int64_t>::lowest();
850 : }
851 : else
852 : {
853 1090 : nValueOut = static_cast<std::int64_t>(
854 1090 : hfValueIn > GFloat16(0.0f) ? hfValueIn + GFloat16(0.5f)
855 1560 : : hfValueIn - GFloat16(0.5f));
856 : }
857 1093 : }
858 : };
859 :
860 : /**
861 : * Copy a single word, optionally rounding if appropriate (i.e. going
862 : * from the float to the integer case). Note that this is the function
863 : * you should specialize if you're adding a new data type.
864 : *
865 : * @param tValueIn value of type Tin; the input value to be converted
866 : * @param tValueOut value of type Tout; the output value
867 : */
868 :
869 : template <class Tin, class Tout>
870 522849540 : inline void GDALCopyWord(const Tin tValueIn, Tout &tValueOut)
871 : {
872 : if constexpr (std::is_same<Tin, Tout>::value)
873 42808928 : tValueOut = tValueIn;
874 : else
875 480040612 : sGDALCopyWord<Tin, Tout>::f(tValueIn, tValueOut);
876 522849540 : }
877 :
878 : /************************************************************************/
879 : /* GDALCopy4Words() */
880 : /************************************************************************/
881 : /**
882 : * Copy 4 packed words to 4 packed words, optionally rounding if appropriate
883 : * (i.e. going from the float to the integer case).
884 : *
885 : * @param pValueIn pointer to 4 input values of type Tin.
886 : * @param pValueOut pointer to 4 output values of type Tout.
887 : */
888 :
889 : template <class Tin, class Tout>
890 604 : inline void GDALCopy4Words(const Tin *pValueIn, Tout *const pValueOut)
891 : {
892 604 : GDALCopyWord(pValueIn[0], pValueOut[0]);
893 604 : GDALCopyWord(pValueIn[1], pValueOut[1]);
894 604 : GDALCopyWord(pValueIn[2], pValueOut[2]);
895 604 : GDALCopyWord(pValueIn[3], pValueOut[3]);
896 604 : }
897 :
898 : /************************************************************************/
899 : /* GDALCopy8Words() */
900 : /************************************************************************/
901 : /**
902 : * Copy 8 packed words to 8 packed words, optionally rounding if appropriate
903 : * (i.e. going from the float to the integer case).
904 : *
905 : * @param pValueIn pointer to 8 input values of type Tin.
906 : * @param pValueOut pointer to 8 output values of type Tout.
907 : */
908 :
909 : template <class Tin, class Tout>
910 49517693 : inline void GDALCopy8Words(const Tin *pValueIn, Tout *const pValueOut)
911 : {
912 49517693 : GDALCopy4Words(pValueIn, pValueOut);
913 49517693 : GDALCopy4Words(pValueIn + 4, pValueOut + 4);
914 49517693 : }
915 :
916 : // Needs SSE2
917 : #if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) || \
918 : defined(USE_NEON_OPTIMIZATIONS)
919 :
920 : template <>
921 40401602 : inline void GDALCopy4Words(const float *pValueIn, GByte *const pValueOut)
922 : {
923 40401602 : __m128 xmm = _mm_loadu_ps(pValueIn);
924 :
925 : // The following clamping would be useless due to the final saturating
926 : // packing if we could guarantee the input range in [INT_MIN,INT_MAX]
927 40401602 : const __m128 p0d5 = _mm_set1_ps(0.5f);
928 40401602 : const __m128 xmm_max = _mm_set1_ps(255);
929 40401602 : xmm = _mm_add_ps(xmm, p0d5);
930 80803304 : xmm = _mm_min_ps(_mm_max_ps(xmm, p0d5), xmm_max);
931 :
932 40401602 : __m128i xmm_i = _mm_cvttps_epi32(xmm);
933 :
934 : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
935 : xmm_i = _mm_shuffle_epi8(
936 : xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
937 : #else
938 40401602 : xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
939 40401602 : xmm_i = _mm_packus_epi16(xmm_i, xmm_i); // Pack int16 to uint8
940 : #endif
941 40401602 : GDALCopyXMMToInt32(xmm_i, pValueOut);
942 40401602 : }
943 :
944 43491900 : static inline __m128 GDALIfThenElse(__m128 mask, __m128 thenVal, __m128 elseVal)
945 : {
946 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
947 : return _mm_blendv_ps(elseVal, thenVal, mask);
948 : #else
949 130476000 : return _mm_or_ps(_mm_and_ps(mask, thenVal), _mm_andnot_ps(mask, elseVal));
950 : #endif
951 : }
952 :
953 : template <>
954 190 : inline void GDALCopy4Words(const float *pValueIn, GInt8 *const pValueOut)
955 : {
956 190 : __m128 xmm = _mm_loadu_ps(pValueIn);
957 :
958 190 : const __m128 xmm_min = _mm_set1_ps(-128);
959 190 : const __m128 xmm_max = _mm_set1_ps(127);
960 380 : xmm = _mm_min_ps(_mm_max_ps(xmm, xmm_min), xmm_max);
961 :
962 190 : const __m128 p0d5 = _mm_set1_ps(0.5f);
963 190 : const __m128 m0d5 = _mm_set1_ps(-0.5f);
964 190 : const __m128 mask = _mm_cmpge_ps(xmm, p0d5);
965 : // f >= 0.5f ? f + 0.5f : f - 0.5f
966 380 : xmm = _mm_add_ps(xmm, GDALIfThenElse(mask, p0d5, m0d5));
967 :
968 190 : __m128i xmm_i = _mm_cvttps_epi32(xmm);
969 :
970 : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
971 : xmm_i = _mm_shuffle_epi8(
972 : xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
973 : #else
974 190 : xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
975 190 : xmm_i = _mm_packs_epi16(xmm_i, xmm_i); // Pack int16 to int8
976 : #endif
977 190 : GDALCopyXMMToInt32(xmm_i, pValueOut);
978 190 : }
979 :
980 : template <>
981 3363960 : inline void GDALCopy4Words(const float *pValueIn, GInt16 *const pValueOut)
982 : {
983 3363960 : __m128 xmm = _mm_loadu_ps(pValueIn);
984 :
985 3363960 : const __m128 xmm_min = _mm_set1_ps(-32768);
986 3363960 : const __m128 xmm_max = _mm_set1_ps(32767);
987 6727920 : xmm = _mm_min_ps(_mm_max_ps(xmm, xmm_min), xmm_max);
988 :
989 3363960 : const __m128 p0d5 = _mm_set1_ps(0.5f);
990 3363960 : const __m128 m0d5 = _mm_set1_ps(-0.5f);
991 3363960 : const __m128 mask = _mm_cmpge_ps(xmm, p0d5);
992 : // f >= 0.5f ? f + 0.5f : f - 0.5f
993 6727920 : xmm = _mm_add_ps(xmm, GDALIfThenElse(mask, p0d5, m0d5));
994 :
995 3363960 : __m128i xmm_i = _mm_cvttps_epi32(xmm);
996 :
997 3363960 : xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
998 3363960 : GDALCopyXMMToInt64(xmm_i, pValueOut);
999 3363960 : }
1000 :
1001 : template <>
1002 1 : inline void GDALCopy4Words(const float *pValueIn, GUInt16 *const pValueOut)
1003 : {
1004 1 : __m128 xmm = _mm_loadu_ps(pValueIn);
1005 :
1006 1 : const __m128 p0d5 = _mm_set1_ps(0.5f);
1007 1 : const __m128 xmm_max = _mm_set1_ps(65535);
1008 1 : xmm = _mm_add_ps(xmm, p0d5);
1009 2 : xmm = _mm_min_ps(_mm_max_ps(xmm, p0d5), xmm_max);
1010 :
1011 1 : __m128i xmm_i = _mm_cvttps_epi32(xmm);
1012 :
1013 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
1014 : xmm_i = _mm_packus_epi32(xmm_i, xmm_i); // Pack int32 to uint16
1015 : #else
1016 : // Translate to int16 range because _mm_packus_epi32 is SSE4.1 only
1017 2 : xmm_i = _mm_add_epi32(xmm_i, _mm_set1_epi32(-32768));
1018 1 : xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
1019 : // Translate back to uint16 range (actually -32768==32768 in int16)
1020 1 : xmm_i = _mm_add_epi16(xmm_i, _mm_set1_epi16(-32768));
1021 : #endif
1022 1 : GDALCopyXMMToInt64(xmm_i, pValueOut);
1023 1 : }
1024 :
1025 80340700 : static inline __m128i GDALIfThenElse(__m128i mask, __m128i thenVal,
1026 : __m128i elseVal)
1027 : {
1028 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
1029 : return _mm_blendv_epi8(elseVal, thenVal, mask);
1030 : #else
1031 160681000 : return _mm_or_si128(_mm_and_si128(mask, thenVal),
1032 80340700 : _mm_andnot_si128(mask, elseVal));
1033 : #endif
1034 : }
1035 :
1036 : template <>
1037 40127700 : inline void GDALCopy4Words(const float *pValueIn, GInt32 *const pValueOut)
1038 : {
1039 40127700 : __m128 xmm = _mm_loadu_ps(pValueIn);
1040 40127700 : const __m128 xmm_ori = xmm;
1041 :
1042 40127700 : const __m128 p0d5 = _mm_set1_ps(0.5f);
1043 40127700 : const __m128 m0d5 = _mm_set1_ps(-0.5f);
1044 40127700 : const __m128 mask = _mm_cmpge_ps(xmm, p0d5);
1045 : // f >= 0.5f ? f + 0.5f : f - 0.5f
1046 80255500 : xmm = _mm_add_ps(xmm, GDALIfThenElse(mask, p0d5, m0d5));
1047 :
1048 40127700 : __m128i xmm_i = _mm_cvttps_epi32(xmm);
1049 :
1050 40127700 : const __m128 xmm_min = _mm_set1_ps(-2147483648.0f);
1051 40127700 : const __m128 xmm_max = _mm_set1_ps(2147483648.0f);
1052 40127700 : const __m128i xmm_i_min = _mm_set1_epi32(INT_MIN);
1053 40127700 : const __m128i xmm_i_max = _mm_set1_epi32(INT_MAX);
1054 80255500 : xmm_i = GDALIfThenElse(_mm_castps_si128(_mm_cmpge_ps(xmm_ori, xmm_max)),
1055 : xmm_i_max, xmm_i);
1056 80255500 : xmm_i = GDALIfThenElse(_mm_castps_si128(_mm_cmple_ps(xmm_ori, xmm_min)),
1057 : xmm_i_min, xmm_i);
1058 :
1059 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut), xmm_i);
1060 40127700 : }
1061 :
1062 : // ARM64 has an efficient instruction for Float32 -> Float16
1063 : #if !(defined(HAVE__FLOAT16) && \
1064 : (defined(__aarch64__) && defined(_M_ARM64))) && \
1065 : !(defined(__AVX2__) && defined(__F16C__))
1066 :
1067 630 : inline __m128i GDALFourFloat32ToFloat16(__m128i xmm)
1068 : {
1069 : // Ported from https://github.com/simd-everywhere/simde/blob/51743e7920b6e867678cb50e9c62effe28f70b33/simde/simde-f16.h#L176
1070 : // to SSE2 in a branch-less way
1071 :
1072 : // clang-format off
1073 :
1074 : /* This code is CC0, based heavily on code by Fabian Giesen. */
1075 630 : const __m128i f32u_infinity = _mm_set1_epi32(255 << 23);
1076 630 : const __m128i f16u_max = _mm_set1_epi32((127 + 16) << 23);
1077 630 : const __m128i denorm_magic = _mm_set1_epi32(((127 - 15) + (23 - 10) + 1) << 23);
1078 :
1079 1260 : const auto sign = _mm_and_si128(xmm, _mm_set1_epi32(INT_MIN));
1080 630 : xmm = _mm_xor_si128(xmm, sign);
1081 11340 : xmm = GDALIfThenElse(
1082 : _mm_cmpgt_epi32(xmm, f16u_max),
1083 : /* result is Inf or NaN (all exponent bits set) */
1084 : GDALIfThenElse(
1085 : _mm_cmpgt_epi32(xmm, f32u_infinity),
1086 : /* NaN->qNaN and Inf->Inf */
1087 : _mm_set1_epi32(0x7e00),
1088 : _mm_set1_epi32(0x7c00)),
1089 : /* (De)normalized number or zero */
1090 : GDALIfThenElse(
1091 : _mm_cmplt_epi32(xmm, _mm_set1_epi32(113 << 23)),
1092 : /* use a magic value to align our 10 mantissa bits at the bottom of
1093 : * the float. as long as FP addition is round-to-nearest-even this
1094 : * just works. */
1095 : _mm_sub_epi32(
1096 : _mm_castps_si128(_mm_add_ps(_mm_castsi128_ps(xmm),
1097 : _mm_castsi128_ps(denorm_magic))),
1098 : /* and one integer subtract of the bias later,
1099 : * we have our final float! */
1100 : denorm_magic
1101 : ),
1102 : _mm_srli_epi32(
1103 : _mm_add_epi32(
1104 : /* update exponent, rounding bias part 1 */
1105 : // (unsigned)-0x37fff001 = ((unsigned)(15-127) << 23) + 0xfff
1106 : _mm_add_epi32(xmm, _mm_set1_epi32(-0x37fff001)),
1107 : /* rounding bias part 2, using mant_odd */
1108 : _mm_and_si128(_mm_srli_epi32(xmm, 13), _mm_set1_epi32(1))),
1109 : 13
1110 : )
1111 : )
1112 : );
1113 630 : xmm = _mm_or_si128(xmm, _mm_srli_epi32(sign, 16));
1114 :
1115 : // clang-format on
1116 630 : return xmm;
1117 : }
1118 :
1119 : template <>
1120 315 : inline void GDALCopy8Words(const float *pValueIn, GFloat16 *const pValueOut)
1121 : {
1122 : __m128i xmm_lo =
1123 630 : GDALFourFloat32ToFloat16(_mm_castps_si128(_mm_loadu_ps(pValueIn)));
1124 : __m128i xmm_hi =
1125 945 : GDALFourFloat32ToFloat16(_mm_castps_si128(_mm_loadu_ps(pValueIn + 4)));
1126 :
1127 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
1128 : auto xmm = _mm_packus_epi32(xmm_lo, xmm_hi); // Pack int32 to uint16
1129 : #else
1130 : // Translate to int16 range because _mm_packus_epi32 is SSE4.1 only
1131 630 : xmm_lo = _mm_add_epi32(xmm_lo, _mm_set1_epi32(-32768));
1132 630 : xmm_hi = _mm_add_epi32(xmm_hi, _mm_set1_epi32(-32768));
1133 315 : auto xmm = _mm_packs_epi32(xmm_lo, xmm_hi); // Pack int32 to int16
1134 : // Translate back to uint16 range (actually -32768==32768 in int16)
1135 630 : xmm = _mm_add_epi16(xmm, _mm_set1_epi16(-32768));
1136 : #endif
1137 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut), xmm);
1138 315 : }
1139 :
1140 : #endif
1141 :
1142 : template <>
1143 624430 : inline void GDALCopy4Words(const double *pValueIn, float *const pValueOut)
1144 : {
1145 624430 : const __m128d val01 = _mm_loadu_pd(pValueIn);
1146 1248860 : const __m128d val23 = _mm_loadu_pd(pValueIn + 2);
1147 624430 : const __m128 val01_s = _mm_cvtpd_ps(val01);
1148 624430 : const __m128 val23_s = _mm_cvtpd_ps(val23);
1149 624430 : const __m128 val = _mm_movelh_ps(val01_s, val23_s);
1150 : _mm_storeu_ps(pValueOut, val);
1151 624430 : }
1152 :
1153 : template <>
1154 2771880 : inline void GDALCopy4Words(const double *pValueIn, GByte *const pValueOut)
1155 : {
1156 2771880 : const __m128d p0d5 = _mm_set1_pd(0.5);
1157 2771880 : const __m128d xmm_max = _mm_set1_pd(255);
1158 :
1159 2771880 : __m128d val01 = _mm_loadu_pd(pValueIn);
1160 5543750 : __m128d val23 = _mm_loadu_pd(pValueIn + 2);
1161 2771880 : val01 = _mm_add_pd(val01, p0d5);
1162 5543750 : val01 = _mm_min_pd(_mm_max_pd(val01, p0d5), xmm_max);
1163 2771880 : val23 = _mm_add_pd(val23, p0d5);
1164 5543750 : val23 = _mm_min_pd(_mm_max_pd(val23, p0d5), xmm_max);
1165 :
1166 2771880 : const __m128i val01_u32 = _mm_cvttpd_epi32(val01);
1167 2771880 : const __m128i val23_u32 = _mm_cvttpd_epi32(val23);
1168 :
1169 : // Merge 4 int32 values into a single register
1170 8315630 : auto xmm_i = _mm_castpd_si128(_mm_shuffle_pd(
1171 : _mm_castsi128_pd(val01_u32), _mm_castsi128_pd(val23_u32), 0));
1172 :
1173 : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
1174 : xmm_i = _mm_shuffle_epi8(
1175 : xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
1176 : #else
1177 2771880 : xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
1178 2771880 : xmm_i = _mm_packus_epi16(xmm_i, xmm_i); // Pack int16 to uint8
1179 : #endif
1180 2771880 : GDALCopyXMMToInt32(xmm_i, pValueOut);
1181 2771880 : }
1182 :
1183 : template <>
1184 11744900 : inline void GDALCopy4Words(const float *pValueIn, double *const pValueOut)
1185 : {
1186 11744900 : const __m128 valIn = _mm_loadu_ps(pValueIn);
1187 11744900 : _mm_storeu_pd(pValueOut, _mm_cvtps_pd(valIn));
1188 23489800 : _mm_storeu_pd(pValueOut + 2, _mm_cvtps_pd(_mm_movehl_ps(valIn, valIn)));
1189 11744900 : }
1190 :
1191 : #ifdef __F16C__
1192 : template <>
1193 : inline void GDALCopy4Words(const GFloat16 *pValueIn, float *const pValueOut)
1194 : {
1195 : __m128i xmm = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pValueIn));
1196 : _mm_storeu_ps(pValueOut, _mm_cvtph_ps(xmm));
1197 : }
1198 :
1199 : template <>
1200 : inline void GDALCopy4Words(const float *pValueIn, GFloat16 *const pValueOut)
1201 : {
1202 : __m128 xmm = _mm_loadu_ps(pValueIn);
1203 : GDALCopyXMMToInt64(_mm_cvtps_ph(xmm, _MM_FROUND_TO_NEAREST_INT), pValueOut);
1204 : }
1205 :
1206 : template <>
1207 : inline void GDALCopy4Words(const GFloat16 *pValueIn, double *const pValueOut)
1208 : {
1209 : float tmp[4];
1210 : GDALCopy4Words(pValueIn, tmp);
1211 : GDALCopy4Words(tmp, pValueOut);
1212 : }
1213 :
1214 : template <>
1215 : inline void GDALCopy4Words(const double *pValueIn, GFloat16 *const pValueOut)
1216 : {
1217 : float tmp[4];
1218 : GDALCopy4Words(pValueIn, tmp);
1219 : GDALCopy4Words(tmp, pValueOut);
1220 : }
1221 :
1222 : // ARM64 has an efficient instruction for Float16 -> Float32/Float64
1223 : #elif !(defined(HAVE__FLOAT16) && (defined(__aarch64__) && defined(_M_ARM64)))
1224 :
1225 : // Convert 4 float16 values to 4 float 32 values
1226 : // xmm must contain 4 float16 values stored in 32 bit each (with upper 16 bits at zero)
1227 41668 : static inline __m128i GDALFourFloat16ToFloat32(__m128i xmm)
1228 : {
1229 : // Ported from https://github.com/simd-everywhere/simde/blob/51743e7920b6e867678cb50e9c62effe28f70b33/simde/simde-f16.h#L242C4-L242C68
1230 : // to SSE2 in a branch-less way
1231 :
1232 : /* This code is CC0, based heavily on code by Fabian Giesen. */
1233 : const auto denorm_magic =
1234 83336 : _mm_castsi128_ps(_mm_set1_epi32((128 - 15) << 23));
1235 : const auto shifted_exp =
1236 41668 : _mm_set1_epi32(0x7c00 << 13); /* exponent mask after shift */
1237 :
1238 : // Shift exponent and mantissa bits to their position in a float32
1239 125004 : auto f32u = _mm_slli_epi32(_mm_and_si128(xmm, _mm_set1_epi32(0x7fff)), 13);
1240 : // Extract the (shifted) exponent
1241 41668 : const auto exp = _mm_and_si128(shifted_exp, f32u);
1242 : // Adjust the exponent
1243 41668 : const auto exp_adjustment = _mm_set1_epi32((127 - 15) << 23);
1244 41668 : f32u = _mm_add_epi32(f32u, exp_adjustment);
1245 :
1246 41668 : const auto is_inf_nan = _mm_cmpeq_epi32(exp, shifted_exp); /* Inf/NaN? */
1247 : // When is_inf_nan is true: extra exponent adjustment
1248 41668 : const auto f32u_inf_nan = _mm_add_epi32(f32u, exp_adjustment);
1249 :
1250 : const auto is_denormal =
1251 83336 : _mm_cmpeq_epi32(exp, _mm_setzero_si128()); /* Zero/Denormal? */
1252 : // When is_denormal is true:
1253 83336 : auto f32u_denormal = _mm_add_epi32(f32u, _mm_set1_epi32(1 << 23));
1254 83336 : f32u_denormal = _mm_castps_si128(
1255 : _mm_sub_ps(_mm_castsi128_ps(f32u_denormal), denorm_magic));
1256 :
1257 41668 : f32u = GDALIfThenElse(is_inf_nan, f32u_inf_nan, f32u);
1258 41668 : f32u = GDALIfThenElse(is_denormal, f32u_denormal, f32u);
1259 :
1260 : // Re-apply sign bit
1261 125004 : f32u = _mm_or_si128(
1262 : f32u, _mm_slli_epi32(_mm_and_si128(xmm, _mm_set1_epi32(0x8000)), 16));
1263 41668 : return f32u;
1264 : }
1265 :
1266 : template <>
1267 388 : inline void GDALCopy8Words(const GFloat16 *pValueIn, float *const pValueOut)
1268 : {
1269 388 : __m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
1270 : const auto xmm_0 =
1271 776 : GDALFourFloat16ToFloat32(_mm_unpacklo_epi16(xmm, _mm_setzero_si128()));
1272 : const auto xmm_1 =
1273 776 : GDALFourFloat16ToFloat32(_mm_unpackhi_epi16(xmm, _mm_setzero_si128()));
1274 388 : _mm_storeu_ps(pValueOut + 0, _mm_castsi128_ps(xmm_0));
1275 388 : _mm_storeu_ps(pValueOut + 4, _mm_castsi128_ps(xmm_1));
1276 388 : }
1277 :
1278 : template <>
1279 20446 : inline void GDALCopy8Words(const GFloat16 *pValueIn, double *const pValueOut)
1280 : {
1281 20446 : __m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
1282 61338 : const auto xmm_0 = _mm_castsi128_ps(
1283 : GDALFourFloat16ToFloat32(_mm_unpacklo_epi16(xmm, _mm_setzero_si128())));
1284 61338 : const auto xmm_1 = _mm_castsi128_ps(
1285 : GDALFourFloat16ToFloat32(_mm_unpackhi_epi16(xmm, _mm_setzero_si128())));
1286 20446 : _mm_storeu_pd(pValueOut + 0, _mm_cvtps_pd(xmm_0));
1287 40892 : _mm_storeu_pd(pValueOut + 2, _mm_cvtps_pd(_mm_movehl_ps(xmm_0, xmm_0)));
1288 20446 : _mm_storeu_pd(pValueOut + 4, _mm_cvtps_pd(xmm_1));
1289 40892 : _mm_storeu_pd(pValueOut + 6, _mm_cvtps_pd(_mm_movehl_ps(xmm_1, xmm_1)));
1290 20446 : }
1291 :
1292 : #endif // __F16C__
1293 :
1294 : #ifdef __AVX2__
1295 :
1296 : #include <immintrin.h>
1297 :
1298 : template <>
1299 : inline void GDALCopy8Words(const double *pValueIn, float *const pValueOut)
1300 : {
1301 : const __m256d val0123 = _mm256_loadu_pd(pValueIn);
1302 : const __m256d val4567 = _mm256_loadu_pd(pValueIn + 4);
1303 : const __m256 val0123_s = _mm256_castps128_ps256(_mm256_cvtpd_ps(val0123));
1304 : const __m256 val4567_s = _mm256_castps128_ps256(_mm256_cvtpd_ps(val4567));
1305 : const __m256 val =
1306 : _mm256_permute2f128_ps(val0123_s, val4567_s, 0 | (2 << 4));
1307 : _mm256_storeu_ps(pValueOut, val);
1308 : }
1309 :
1310 : template <>
1311 : inline void GDALCopy8Words(const float *pValueIn, double *const pValueOut)
1312 : {
1313 : const __m256 valIn = _mm256_loadu_ps(pValueIn);
1314 : _mm256_storeu_pd(pValueOut, _mm256_cvtps_pd(_mm256_castps256_ps128(valIn)));
1315 : _mm256_storeu_pd(pValueOut + 4,
1316 : _mm256_cvtps_pd(_mm256_castps256_ps128(
1317 : _mm256_permute2f128_ps(valIn, valIn, 1))));
1318 : }
1319 :
1320 : #ifdef __F16C__
1321 :
1322 : template <>
1323 : inline void GDALCopy8Words(const GFloat16 *pValueIn, float *const pValueOut)
1324 : {
1325 : __m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
1326 : _mm256_storeu_ps(pValueOut, _mm256_cvtph_ps(xmm));
1327 : }
1328 :
1329 : template <>
1330 : inline void GDALCopy8Words(const float *pValueIn, GFloat16 *const pValueOut)
1331 : {
1332 : __m256 ymm = _mm256_loadu_ps(pValueIn);
1333 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut),
1334 : _mm256_cvtps_ph(ymm, _MM_FROUND_TO_NEAREST_INT));
1335 : }
1336 :
1337 : template <>
1338 : inline void GDALCopy8Words(const GFloat16 *pValueIn, double *const pValueOut)
1339 : {
1340 : __m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
1341 : const auto ymm = _mm256_cvtph_ps(xmm);
1342 : _mm256_storeu_pd(pValueOut, _mm256_cvtps_pd(_mm256_extractf128_ps(ymm, 0)));
1343 : _mm256_storeu_pd(pValueOut + 4,
1344 : _mm256_cvtps_pd(_mm256_extractf128_ps(ymm, 1)));
1345 : }
1346 :
1347 : template <>
1348 : inline void GDALCopy8Words(const double *pValueIn, GFloat16 *const pValueOut)
1349 : {
1350 : __m256d ymm0 = _mm256_loadu_pd(pValueIn);
1351 : __m256d ymm1 = _mm256_loadu_pd(pValueIn + 4);
1352 : __m256 ymm = _mm256_set_m128(_mm256_cvtpd_ps(ymm1), _mm256_cvtpd_ps(ymm0));
1353 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut),
1354 : _mm256_cvtps_ph(ymm, _MM_FROUND_TO_NEAREST_INT));
1355 : }
1356 :
1357 : #endif
1358 :
1359 : template <>
1360 : inline void GDALCopy8Words(const float *pValueIn, GByte *const pValueOut)
1361 : {
1362 : __m256 ymm = _mm256_loadu_ps(pValueIn);
1363 :
1364 : const __m256 p0d5 = _mm256_set1_ps(0.5f);
1365 : const __m256 ymm_max = _mm256_set1_ps(255);
1366 : ymm = _mm256_add_ps(ymm, p0d5);
1367 : ymm = _mm256_min_ps(_mm256_max_ps(ymm, p0d5), ymm_max);
1368 :
1369 : __m256i ymm_i = _mm256_cvttps_epi32(ymm);
1370 :
1371 : ymm_i = _mm256_packus_epi32(ymm_i, ymm_i); // Pack int32 to uint16
1372 : ymm_i = _mm256_permute4x64_epi64(ymm_i, 0 | (2 << 2)); // AVX2
1373 :
1374 : __m128i xmm_i = _mm256_castsi256_si128(ymm_i);
1375 : xmm_i = _mm_packus_epi16(xmm_i, xmm_i);
1376 : GDALCopyXMMToInt64(xmm_i, pValueOut);
1377 : }
1378 :
1379 : template <>
1380 : inline void GDALCopy8Words(const float *pValueIn, GUInt16 *const pValueOut)
1381 : {
1382 : __m256 ymm = _mm256_loadu_ps(pValueIn);
1383 :
1384 : const __m256 p0d5 = _mm256_set1_ps(0.5f);
1385 : const __m256 ymm_max = _mm256_set1_ps(65535);
1386 : ymm = _mm256_add_ps(ymm, p0d5);
1387 : ymm = _mm256_min_ps(_mm256_max_ps(ymm, p0d5), ymm_max);
1388 :
1389 : __m256i ymm_i = _mm256_cvttps_epi32(ymm);
1390 :
1391 : ymm_i = _mm256_packus_epi32(ymm_i, ymm_i); // Pack int32 to uint16
1392 : ymm_i = _mm256_permute4x64_epi64(ymm_i, 0 | (2 << 2)); // AVX2
1393 :
1394 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut),
1395 : _mm256_castsi256_si128(ymm_i));
1396 : }
1397 : #else
1398 : template <>
1399 7785681 : inline void GDALCopy8Words(const float *pValueIn, GUInt16 *const pValueOut)
1400 : {
1401 7785681 : __m128 xmm = _mm_loadu_ps(pValueIn);
1402 15571402 : __m128 xmm1 = _mm_loadu_ps(pValueIn + 4);
1403 :
1404 7785681 : const __m128 p0d5 = _mm_set1_ps(0.5f);
1405 7785681 : const __m128 xmm_max = _mm_set1_ps(65535);
1406 7785681 : xmm = _mm_add_ps(xmm, p0d5);
1407 7785681 : xmm1 = _mm_add_ps(xmm1, p0d5);
1408 15571402 : xmm = _mm_min_ps(_mm_max_ps(xmm, p0d5), xmm_max);
1409 15571402 : xmm1 = _mm_min_ps(_mm_max_ps(xmm1, p0d5), xmm_max);
1410 :
1411 7785681 : __m128i xmm_i = _mm_cvttps_epi32(xmm);
1412 7785681 : __m128i xmm1_i = _mm_cvttps_epi32(xmm1);
1413 :
1414 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
1415 : xmm_i = _mm_packus_epi32(xmm_i, xmm1_i); // Pack int32 to uint16
1416 : #else
1417 : // Translate to int16 range because _mm_packus_epi32 is SSE4.1 only
1418 15571402 : xmm_i = _mm_add_epi32(xmm_i, _mm_set1_epi32(-32768));
1419 15571402 : xmm1_i = _mm_add_epi32(xmm1_i, _mm_set1_epi32(-32768));
1420 7785681 : xmm_i = _mm_packs_epi32(xmm_i, xmm1_i); // Pack int32 to int16
1421 : // Translate back to uint16 range (actually -32768==32768 in int16)
1422 15571402 : xmm_i = _mm_add_epi16(xmm_i, _mm_set1_epi16(-32768));
1423 : #endif
1424 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut), xmm_i);
1425 7785681 : }
1426 : #endif
1427 :
1428 : // ARM64 has an efficient instruction for Float64 -> Float16
1429 : #if !(defined(HAVE__FLOAT16) && \
1430 : (defined(__aarch64__) && defined(_M_ARM64))) && \
1431 : !(defined(__AVX2__) && defined(__F16C__))
1432 : template <>
1433 235 : inline void GDALCopy8Words(const double *pValueIn, GFloat16 *const pValueOut)
1434 : {
1435 : float fVal[8];
1436 235 : GDALCopy8Words(pValueIn, fVal);
1437 235 : GDALCopy8Words(fVal, pValueOut);
1438 235 : }
1439 : #endif
1440 :
1441 : #endif // defined(__x86_64) || defined(_M_X64)
1442 :
1443 : #endif // GDAL_PRIV_TEMPLATES_HPP_INCLUDED
|