Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: GDAL Core
4 : * Purpose: Inline C++ templates
5 : * Author: Phil Vachon, <philippe at cowpig.ca>
6 : *
7 : ******************************************************************************
8 : * Copyright (c) 2009, Phil Vachon, <philippe at cowpig.ca>
9 : * Copyright (c) 2025, Even Rouault, <even.rouault at spatialys.com>
10 : *
11 : * SPDX-License-Identifier: MIT
12 : ****************************************************************************/
13 :
14 : #ifndef GDAL_PRIV_TEMPLATES_HPP_INCLUDED
15 : #define GDAL_PRIV_TEMPLATES_HPP_INCLUDED
16 :
17 : #include "cpl_port.h"
18 :
19 : #include <algorithm>
20 : #include <cmath>
21 : #include <cstdint>
22 : #include <limits>
23 : #include <type_traits>
24 :
25 : #include "cpl_float.h"
26 :
27 : // Needs SSE2
28 : #if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) || \
29 : defined(USE_NEON_OPTIMIZATIONS)
30 :
31 : #ifdef USE_NEON_OPTIMIZATIONS
32 : #include "include_sse2neon.h"
33 : #else
34 : #include <immintrin.h>
35 : #endif
36 :
37 40839045 : static inline void GDALCopyXMMToInt32(const __m128i xmm, void *pDest)
38 : {
39 40839045 : int n32 = _mm_cvtsi128_si32(xmm); // Extract lower 32 bit word
40 40839045 : memcpy(pDest, &n32, sizeof(n32));
41 40839045 : }
42 :
43 106951351 : static inline void GDALCopyXMMToInt64(const __m128i xmm, void *pDest)
44 : {
45 : _mm_storel_epi64(reinterpret_cast<__m128i *>(pDest), xmm);
46 106951351 : }
47 :
48 : #if __SSSE3__
49 : #include <tmmintrin.h>
50 : #endif
51 :
52 : #if defined(__SSE4_1__) || defined(__AVX__)
53 : #include <smmintrin.h>
54 : #endif
55 :
56 : #ifdef __F16C__
57 : #include <immintrin.h>
58 : #endif
59 :
60 : #endif
61 :
62 : /************************************************************************/
63 : /* GDALGetDataLimits() */
64 : /************************************************************************/
65 : /**
66 : * Compute the limits of values that can be placed in Tout in terms of
67 : * Tin. Usually used for output clamping, when the output data type's
68 : * limits are stable relative to the input type (i.e. no roundoff error).
69 : *
70 : * @param tMaxValue the returned maximum value
71 : * @param tMinValue the returned minimum value
72 : */
73 :
74 : template <class Tin, class Tout>
75 349576532 : inline void GDALGetDataLimits(Tin &tMaxValue, Tin &tMinValue)
76 : {
77 349576532 : tMaxValue = cpl::NumericLimits<Tin>::max();
78 349576532 : tMinValue = cpl::NumericLimits<Tin>::lowest();
79 :
80 : // Compute the actual minimum value of Tout in terms of Tin.
81 : if constexpr (cpl::NumericLimits<Tout>::is_signed &&
82 : cpl::NumericLimits<Tout>::is_integer)
83 : {
84 : // the minimum value is less than zero
85 : // cppcheck-suppress knownConditionTrueFalse
86 : if constexpr (cpl::NumericLimits<Tout>::digits <
87 : cpl::NumericLimits<Tin>::digits ||
88 : !cpl::NumericLimits<Tin>::is_integer)
89 : {
90 : // Tout is smaller than Tin, so we need to clamp values in input
91 : // to the range of Tout's min/max values
92 : if constexpr (cpl::NumericLimits<Tin>::is_signed)
93 : {
94 87460980 : tMinValue =
95 87460980 : static_cast<Tin>(cpl::NumericLimits<Tout>::lowest());
96 : }
97 87627909 : tMaxValue = static_cast<Tin>(cpl::NumericLimits<Tout>::max());
98 : }
99 : }
100 : else if constexpr (cpl::NumericLimits<Tout>::is_integer)
101 : {
102 : // the output is unsigned, so we just need to determine the max
103 : if constexpr (!std::is_same_v<Tin, Tout> &&
104 : cpl::NumericLimits<Tout>::digits <=
105 : cpl::NumericLimits<Tin>::digits)
106 : {
107 : // Tout is smaller than Tin, so we need to clamp the input values
108 : // to the range of Tout's max
109 102300464 : tMaxValue = static_cast<Tin>(cpl::NumericLimits<Tout>::max());
110 : }
111 105279596 : tMinValue = 0;
112 : }
113 349576532 : }
114 :
115 : /************************************************************************/
116 : /* GDALClampValue() */
117 : /************************************************************************/
118 : /**
119 : * Clamp values of type T to a specified range
120 : *
121 : * @param tValue the value
122 : * @param tMax the max value
123 : * @param tMin the min value
124 : */
125 : template <class T>
126 349387847 : inline T GDALClampValue(const T tValue, const T tMax, const T tMin)
127 : {
128 349387847 : return tValue > tMax ? tMax : tValue < tMin ? tMin : tValue;
129 : }
130 :
131 : /************************************************************************/
132 : /* GDALClampDoubleValue() */
133 : /************************************************************************/
134 : /**
135 : * Clamp double values to a specified range, this uses the same
136 : * argument ordering as std::clamp, returns TRUE if the value was clamped.
137 : *
138 : * @param tValue the value
139 : * @param tMin the min value
140 : * @param tMax the max value
141 : *
142 : */
143 : template <class T2, class T3>
144 309 : inline bool GDALClampDoubleValue(double &tValue, const T2 tMin, const T3 tMax)
145 : {
146 309 : const double tMin2{static_cast<double>(tMin)};
147 309 : const double tMax2{static_cast<double>(tMax)};
148 309 : if (tValue > tMax2 || tValue < tMin2)
149 : {
150 26 : tValue = tValue > tMax2 ? tMax2 : tValue < tMin2 ? tMin2 : tValue;
151 26 : return true;
152 : }
153 : else
154 : {
155 283 : return false;
156 : }
157 : }
158 :
159 : /************************************************************************/
160 : /* GDALClampValueToType() */
161 : /************************************************************************/
162 : /**
163 : * Clamp a value of type T to the limits of type TClamped.
164 : *
165 : * @param tValue the value
166 : */
167 188804 : template <class T, class TClamped> inline T GDALClampValueToType(T tValue)
168 : {
169 : T tMaxValue, tMinValue;
170 188804 : GDALGetDataLimits<T, TClamped>(tMaxValue, tMinValue);
171 188804 : return std::clamp(tValue, tMinValue, tMaxValue);
172 : }
173 :
174 : /************************************************************************/
175 : /* GDALIsValueInRange() */
176 : /************************************************************************/
177 : /**
178 : * Returns whether a value is in the type range.
179 : * NaN is considered not to be in type range.
180 : *
181 : * @param dfValue the value
182 : * @return whether the value is in the type range.
183 : */
184 161028 : template <class T> inline bool GDALIsValueInRange(double dfValue)
185 : {
186 322014 : return dfValue >= static_cast<double>(cpl::NumericLimits<T>::lowest()) &&
187 322014 : dfValue <= static_cast<double>(cpl::NumericLimits<T>::max());
188 : }
189 :
190 27 : template <> inline bool GDALIsValueInRange<double>(double dfValue)
191 : {
192 27 : return !CPLIsNan(dfValue);
193 : }
194 :
195 46960 : template <> inline bool GDALIsValueInRange<float>(double dfValue)
196 : {
197 93793 : return CPLIsInf(dfValue) ||
198 : (dfValue >=
199 46833 : -static_cast<double>(std::numeric_limits<float>::max()) &&
200 93785 : dfValue <= static_cast<double>(std::numeric_limits<float>::max()));
201 : }
202 :
203 120 : template <> inline bool GDALIsValueInRange<GFloat16>(double dfValue)
204 : {
205 360 : return CPLIsInf(dfValue) ||
206 240 : (dfValue >= -cpl::NumericLimits<GFloat16>::max() &&
207 240 : dfValue <= cpl::NumericLimits<GFloat16>::max());
208 : }
209 :
210 16925 : template <> inline bool GDALIsValueInRange<int64_t>(double dfValue)
211 : {
212 : // Values in the range [INT64_MAX - 1023, INT64_MAX - 1]
213 : // get converted to a double that once cast to int64_t is
214 : // INT64_MAX + 1, hence the < strict comparison.
215 : return dfValue >=
216 33849 : static_cast<double>(cpl::NumericLimits<int64_t>::lowest()) &&
217 33849 : dfValue < static_cast<double>(cpl::NumericLimits<int64_t>::max());
218 : }
219 :
220 8968 : template <> inline bool GDALIsValueInRange<uint64_t>(double dfValue)
221 : {
222 : // Values in the range [UINT64_MAX - 2047, UINT64_MAX - 1]
223 : // get converted to a double that once cast to uint64_t is
224 : // UINT64_MAX + 1, hence the < strict comparison.
225 17933 : return dfValue >= 0 &&
226 17933 : dfValue < static_cast<double>(cpl::NumericLimits<uint64_t>::max());
227 : }
228 :
229 : /************************************************************************/
230 : /* GDALIsValueExactAs() */
231 : /************************************************************************/
232 : /**
233 : * Returns whether a value can be exactly represented on type T.
234 : *
235 : * That is static_cast\<double\>(static_cast\<T\>(dfValue)) is legal and is
236 : * equal to dfValue.
237 : *
238 : * Note: for T=float or double, a NaN input leads to true
239 : *
240 : * @param dfValue the value
241 : * @return whether the value can be exactly represented on type T.
242 : */
243 12305 : template <class T> inline bool GDALIsValueExactAs(double dfValue)
244 : {
245 24555 : return GDALIsValueInRange<T>(dfValue) &&
246 24555 : static_cast<double>(static_cast<T>(dfValue)) == dfValue;
247 : }
248 :
249 177 : template <> inline bool GDALIsValueExactAs<float>(double dfValue)
250 : {
251 348 : return CPLIsNan(dfValue) ||
252 171 : (GDALIsValueInRange<float>(dfValue) &&
253 344 : static_cast<double>(static_cast<float>(dfValue)) == dfValue);
254 : }
255 :
256 7 : template <> inline bool GDALIsValueExactAs<GFloat16>(double dfValue)
257 : {
258 14 : return CPLIsNan(dfValue) ||
259 7 : (GDALIsValueInRange<GFloat16>(dfValue) &&
260 14 : static_cast<double>(static_cast<GFloat16>(dfValue)) == dfValue);
261 : }
262 :
263 21 : template <> inline bool GDALIsValueExactAs<double>(double)
264 : {
265 21 : return true;
266 : }
267 :
268 : /************************************************************************/
269 : /* GDALCopyWord() */
270 : /************************************************************************/
271 :
272 : // Integer input and output: clamp the input
273 :
274 : template <class Tin, class Tout> struct sGDALCopyWord
275 : {
276 173520001 : static inline void f(const Tin tValueIn, Tout &tValueOut)
277 : {
278 : Tin tMaxVal, tMinVal;
279 173520001 : GDALGetDataLimits<Tin, Tout>(tMaxVal, tMinVal);
280 173520001 : tValueOut =
281 173520001 : static_cast<Tout>(GDALClampValue(tValueIn, tMaxVal, tMinVal));
282 173520001 : }
283 : };
284 :
285 : // Integer input and floating point output: simply convert
286 :
287 : template <class Tin> struct sGDALCopyWord<Tin, GFloat16>
288 : {
289 270266 : static inline void f(const Tin tValueIn, GFloat16 &hfValueOut)
290 : {
291 270266 : hfValueOut = static_cast<GFloat16>(tValueIn);
292 270266 : }
293 : };
294 :
295 : template <class Tin> struct sGDALCopyWord<Tin, float>
296 : {
297 8027797 : static inline void f(const Tin tValueIn, float &fValueOut)
298 : {
299 8027797 : fValueOut = static_cast<float>(tValueIn);
300 8027797 : }
301 : };
302 :
303 : template <class Tin> struct sGDALCopyWord<Tin, double>
304 : {
305 87067547 : static inline void f(const Tin tValueIn, double &dfValueOut)
306 : {
307 87067547 : dfValueOut = static_cast<double>(tValueIn);
308 87067547 : }
309 : };
310 :
311 : // Floating point input and output, converting between identical types: simply copy
312 :
313 : template <> struct sGDALCopyWord<GFloat16, GFloat16>
314 : {
315 : static inline void f(const GFloat16 hfValueIn, GFloat16 &hfValueOut)
316 : {
317 : hfValueOut = hfValueIn;
318 : }
319 : };
320 :
321 : template <> struct sGDALCopyWord<float, float>
322 : {
323 : static inline void f(const float fValueIn, float &fValueOut)
324 : {
325 : fValueOut = fValueIn;
326 : }
327 : };
328 :
329 : template <> struct sGDALCopyWord<double, double>
330 : {
331 : static inline void f(const double dfValueIn, double &dfValueOut)
332 : {
333 : dfValueOut = dfValueIn;
334 : }
335 : };
336 :
337 : // Floating point input and output, converting to a larger type: use implicit conversion
338 :
339 : template <> struct sGDALCopyWord<GFloat16, float>
340 : {
341 5206 : static inline void f(const GFloat16 hfValueIn, float &dfValueOut)
342 : {
343 5206 : dfValueOut = hfValueIn;
344 5206 : }
345 : };
346 :
347 : template <> struct sGDALCopyWord<GFloat16, double>
348 : {
349 4174 : static inline void f(const GFloat16 hfValueIn, double &dfValueOut)
350 : {
351 4174 : dfValueOut = hfValueIn;
352 4174 : }
353 : };
354 :
355 : template <> struct sGDALCopyWord<float, double>
356 : {
357 638886 : static inline void f(const float fValueIn, double &dfValueOut)
358 : {
359 638886 : dfValueOut = static_cast<double>(fValueIn);
360 638886 : }
361 : };
362 :
363 : // Floating point input and out, converting to a smaller type: ensure overflow results in infinity
364 :
365 : template <> struct sGDALCopyWord<float, GFloat16>
366 : {
367 497 : static inline void f(const float fValueIn, GFloat16 &hfValueOut)
368 : {
369 : // Our custom implementation when std::float16_t is not
370 : // available ensures proper behavior.
371 : #if !defined(HAVE_STD_FLOAT16_T)
372 497 : if (fValueIn > cpl::NumericLimits<GFloat16>::max())
373 : {
374 27 : hfValueOut = cpl::NumericLimits<GFloat16>::infinity();
375 27 : return;
376 : }
377 470 : if (fValueIn < -cpl::NumericLimits<GFloat16>::max())
378 : {
379 15 : hfValueOut = -cpl::NumericLimits<GFloat16>::infinity();
380 15 : return;
381 : }
382 : #endif
383 455 : hfValueOut = static_cast<GFloat16>(fValueIn);
384 : }
385 : };
386 :
387 : template <> struct sGDALCopyWord<double, GFloat16>
388 : {
389 349 : static inline void f(const double dfValueIn, GFloat16 &hfValueOut)
390 : {
391 : // Our custom implementation when std::float16_t is not
392 : // available ensures proper behavior.
393 : #if !defined(HAVE_STD_FLOAT16_T)
394 349 : if (dfValueIn > cpl::NumericLimits<GFloat16>::max())
395 : {
396 14 : hfValueOut = cpl::NumericLimits<GFloat16>::infinity();
397 14 : return;
398 : }
399 335 : if (dfValueIn < -cpl::NumericLimits<GFloat16>::max())
400 : {
401 14 : hfValueOut = -cpl::NumericLimits<GFloat16>::infinity();
402 14 : return;
403 : }
404 : #endif
405 321 : hfValueOut = static_cast<GFloat16>(dfValueIn);
406 : }
407 : };
408 :
409 : template <> struct sGDALCopyWord<double, float>
410 : {
411 130418 : static inline void f(const double dfValueIn, float &fValueOut)
412 : {
413 : if constexpr (!std::numeric_limits<double>::is_iec559)
414 : {
415 : if (dfValueIn >
416 : static_cast<double>(std::numeric_limits<float>::max()))
417 : {
418 : fValueOut = std::numeric_limits<float>::infinity();
419 : return;
420 : }
421 : if (dfValueIn <
422 : static_cast<double>(-std::numeric_limits<float>::max()))
423 : {
424 : fValueOut = -std::numeric_limits<float>::infinity();
425 : return;
426 : }
427 : }
428 :
429 130418 : fValueOut = static_cast<float>(dfValueIn);
430 130418 : }
431 : };
432 :
433 : // Floating point input to a small unsigned integer type: nan becomes zero, otherwise round and clamp
434 :
435 : template <class Tout> struct sGDALCopyWord<GFloat16, Tout>
436 : {
437 5032 : static inline void f(const GFloat16 hfValueIn, Tout &tValueOut)
438 : {
439 5032 : if (CPLIsNan(hfValueIn))
440 : {
441 1 : tValueOut = 0;
442 1 : return;
443 : }
444 : GFloat16 hfMaxVal, hfMinVal;
445 5031 : GDALGetDataLimits<GFloat16, Tout>(hfMaxVal, hfMinVal);
446 5031 : tValueOut = static_cast<Tout>(
447 10062 : GDALClampValue(hfValueIn + GFloat16(0.5f), hfMaxVal, hfMinVal));
448 : }
449 : };
450 :
451 : template <class Tout> struct sGDALCopyWord<float, Tout>
452 : {
453 5476720 : static inline void f(const float fValueIn, Tout &tValueOut)
454 : {
455 5476720 : if (CPLIsNan(fValueIn))
456 : {
457 22 : tValueOut = 0;
458 22 : return;
459 : }
460 : float fMaxVal, fMinVal;
461 5476700 : GDALGetDataLimits<float, Tout>(fMaxVal, fMinVal);
462 5476700 : tValueOut = static_cast<Tout>(
463 5476700 : GDALClampValue(fValueIn + 0.5f, fMaxVal, fMinVal));
464 : }
465 : };
466 :
467 : template <class Tout> struct sGDALCopyWord<double, Tout>
468 : {
469 83013416 : static inline void f(const double dfValueIn, Tout &tValueOut)
470 : {
471 83013416 : if (CPLIsNan(dfValueIn))
472 : {
473 151 : tValueOut = 0;
474 151 : return;
475 : }
476 : double dfMaxVal, dfMinVal;
477 83013348 : GDALGetDataLimits<double, Tout>(dfMaxVal, dfMinVal);
478 83013348 : tValueOut = static_cast<Tout>(
479 83013348 : GDALClampValue(dfValueIn + 0.5, dfMaxVal, dfMinVal));
480 : }
481 : };
482 :
483 : // Floating point input to a large unsigned integer type: nan becomes zero, otherwise round and clamp.
484 : // Avoid roundoff while clamping.
485 :
486 : template <> struct sGDALCopyWord<GFloat16, std::uint64_t>
487 : {
488 624 : static inline void f(const GFloat16 hfValueIn, std::uint64_t &nValueOut)
489 : {
490 624 : if (!(hfValueIn > 0))
491 : {
492 4 : nValueOut = 0;
493 : }
494 620 : else if (CPLIsInf(hfValueIn))
495 : {
496 1 : nValueOut = cpl::NumericLimits<std::uint64_t>::max();
497 : }
498 : else
499 : {
500 619 : nValueOut = static_cast<std::uint64_t>(hfValueIn + GFloat16(0.5f));
501 : }
502 624 : }
503 : };
504 :
505 : template <> struct sGDALCopyWord<float, unsigned int>
506 : {
507 5173 : static inline void f(const float fValueIn, unsigned int &nValueOut)
508 : {
509 5173 : if (!(fValueIn > 0))
510 : {
511 283 : nValueOut = 0;
512 : }
513 4890 : else if (fValueIn >=
514 4890 : static_cast<float>(cpl::NumericLimits<unsigned int>::max()))
515 : {
516 134 : nValueOut = cpl::NumericLimits<unsigned int>::max();
517 : }
518 : else
519 : {
520 4756 : nValueOut = static_cast<unsigned int>(fValueIn + 0.5f);
521 : }
522 5173 : }
523 : };
524 :
525 : template <> struct sGDALCopyWord<float, std::uint64_t>
526 : {
527 758 : static inline void f(const float fValueIn, std::uint64_t &nValueOut)
528 : {
529 758 : if (!(fValueIn > 0))
530 : {
531 138 : nValueOut = 0;
532 : }
533 620 : else if (fValueIn >=
534 620 : static_cast<float>(cpl::NumericLimits<std::uint64_t>::max()))
535 : {
536 2 : nValueOut = cpl::NumericLimits<std::uint64_t>::max();
537 : }
538 : else
539 : {
540 618 : nValueOut = static_cast<std::uint64_t>(fValueIn + 0.5f);
541 : }
542 758 : }
543 : };
544 :
545 : template <> struct sGDALCopyWord<double, std::uint64_t>
546 : {
547 1378 : static inline void f(const double dfValueIn, std::uint64_t &nValueOut)
548 : {
549 1378 : if (!(dfValueIn > 0))
550 : {
551 426 : nValueOut = 0;
552 : }
553 952 : else if (dfValueIn >
554 952 : static_cast<double>(cpl::NumericLimits<uint64_t>::max()))
555 : {
556 4 : nValueOut = cpl::NumericLimits<uint64_t>::max();
557 : }
558 : else
559 : {
560 948 : nValueOut = static_cast<std::uint64_t>(dfValueIn + 0.5);
561 : }
562 1378 : }
563 : };
564 :
565 : // Floating point input to a very large unsigned integer type: nan becomes zero, otherwise round and clamp.
566 : // Avoid infinity while clamping when the maximum integer is too large for the floating-point type.
567 : // Avoid roundoff while clamping.
568 :
569 : template <> struct sGDALCopyWord<GFloat16, unsigned short>
570 : {
571 4706 : static inline void f(const GFloat16 hfValueIn, unsigned short &nValueOut)
572 : {
573 4706 : if (!(hfValueIn > 0))
574 : {
575 216 : nValueOut = 0;
576 : }
577 4490 : else if (CPLIsInf(hfValueIn))
578 : {
579 67 : nValueOut = cpl::NumericLimits<unsigned short>::max();
580 : }
581 : else
582 : {
583 4423 : nValueOut = static_cast<unsigned short>(hfValueIn + GFloat16(0.5f));
584 : }
585 4706 : }
586 : };
587 :
588 : template <> struct sGDALCopyWord<GFloat16, unsigned int>
589 : {
590 4639 : static inline void f(const GFloat16 hfValueIn, unsigned int &nValueOut)
591 : {
592 4639 : if (!(hfValueIn > 0))
593 : {
594 149 : nValueOut = 0;
595 : }
596 4490 : else if (CPLIsInf(hfValueIn))
597 : {
598 0 : nValueOut = cpl::NumericLimits<unsigned int>::max();
599 : }
600 : else
601 : {
602 4490 : nValueOut = static_cast<unsigned int>(hfValueIn + GFloat16(0.5f));
603 : }
604 4639 : }
605 : };
606 :
607 : // Floating point input to a small signed integer type: nan becomes zero, otherwise round and clamp.
608 : // Rounding for signed integers is different than for the unsigned integers above.
609 :
610 : template <> struct sGDALCopyWord<GFloat16, signed char>
611 : {
612 1088 : static inline void f(const GFloat16 hfValueIn, signed char &nValueOut)
613 : {
614 1088 : if (CPLIsNan(hfValueIn))
615 : {
616 0 : nValueOut = 0;
617 0 : return;
618 : }
619 : GFloat16 hfMaxVal, hfMinVal;
620 1088 : GDALGetDataLimits<GFloat16, signed char>(hfMaxVal, hfMinVal);
621 1088 : GFloat16 hfValue = hfValueIn >= GFloat16(0.0f)
622 619 : ? hfValueIn + GFloat16(0.5f)
623 1088 : : hfValueIn - GFloat16(0.5f);
624 1088 : nValueOut = static_cast<signed char>(
625 2176 : GDALClampValue(hfValue, hfMaxVal, hfMinVal));
626 : }
627 : };
628 :
629 : template <> struct sGDALCopyWord<float, signed char>
630 : {
631 454 : static inline void f(const float fValueIn, signed char &nValueOut)
632 : {
633 454 : if (CPLIsNan(fValueIn))
634 : {
635 11 : nValueOut = 0;
636 11 : return;
637 : }
638 : float fMaxVal, fMinVal;
639 443 : GDALGetDataLimits<float, signed char>(fMaxVal, fMinVal);
640 443 : float fValue = fValueIn >= 0.0f ? fValueIn + 0.5f : fValueIn - 0.5f;
641 443 : nValueOut =
642 443 : static_cast<signed char>(GDALClampValue(fValue, fMaxVal, fMinVal));
643 : }
644 : };
645 :
646 : template <> struct sGDALCopyWord<float, short>
647 : {
648 5102540 : static inline void f(const float fValueIn, short &nValueOut)
649 : {
650 5102540 : if (CPLIsNan(fValueIn))
651 : {
652 15 : nValueOut = 0;
653 15 : return;
654 : }
655 : float fMaxVal, fMinVal;
656 5102520 : GDALGetDataLimits<float, short>(fMaxVal, fMinVal);
657 5102520 : float fValue = fValueIn >= 0.0f ? fValueIn + 0.5f : fValueIn - 0.5f;
658 5102520 : nValueOut =
659 5102520 : static_cast<short>(GDALClampValue(fValue, fMaxVal, fMinVal));
660 : }
661 : };
662 :
663 : template <> struct sGDALCopyWord<double, signed char>
664 : {
665 2646 : static inline void f(const double dfValueIn, signed char &nValueOut)
666 : {
667 2646 : if (CPLIsNan(dfValueIn))
668 : {
669 67 : nValueOut = 0;
670 67 : return;
671 : }
672 : double dfMaxVal, dfMinVal;
673 2579 : GDALGetDataLimits<double, signed char>(dfMaxVal, dfMinVal);
674 2579 : double dfValue = dfValueIn > 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5;
675 2579 : nValueOut = static_cast<signed char>(
676 2579 : GDALClampValue(dfValue, dfMaxVal, dfMinVal));
677 : }
678 : };
679 :
680 : template <> struct sGDALCopyWord<double, short>
681 : {
682 5157590 : static inline void f(const double dfValueIn, short &nValueOut)
683 : {
684 5157590 : if (CPLIsNan(dfValueIn))
685 : {
686 73 : nValueOut = 0;
687 73 : return;
688 : }
689 : double dfMaxVal, dfMinVal;
690 5157520 : GDALGetDataLimits<double, short>(dfMaxVal, dfMinVal);
691 5157520 : double dfValue = dfValueIn > 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5;
692 5157520 : nValueOut =
693 5157520 : static_cast<short>(GDALClampValue(dfValue, dfMaxVal, dfMinVal));
694 : }
695 : };
696 :
697 : template <> struct sGDALCopyWord<double, int>
698 : {
699 77108600 : static inline void f(const double dfValueIn, int &nValueOut)
700 : {
701 77108600 : if (CPLIsNan(dfValueIn))
702 : {
703 71 : nValueOut = 0;
704 71 : return;
705 : }
706 : double dfMaxVal, dfMinVal;
707 77108500 : GDALGetDataLimits<double, int>(dfMaxVal, dfMinVal);
708 77108500 : double dfValue = dfValueIn >= 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5;
709 77108500 : nValueOut =
710 77108500 : static_cast<int>(GDALClampValue(dfValue, dfMaxVal, dfMinVal));
711 : }
712 : };
713 :
714 : // Floating point input to a large signed integer type: nan becomes zero, otherwise round and clamp.
715 : // Rounding for signed integers is different than for the unsigned integers above.
716 : // Avoid roundoff while clamping.
717 :
718 : template <> struct sGDALCopyWord<GFloat16, short>
719 : {
720 5289 : static inline void f(const GFloat16 hfValueIn, short &nValueOut)
721 : {
722 5289 : if (CPLIsNan(hfValueIn))
723 : {
724 0 : nValueOut = 0;
725 : }
726 5289 : else if (hfValueIn >=
727 5289 : static_cast<GFloat16>(cpl::NumericLimits<short>::max()))
728 : {
729 146 : nValueOut = cpl::NumericLimits<short>::max();
730 : }
731 5143 : else if (hfValueIn <=
732 5143 : static_cast<GFloat16>(cpl::NumericLimits<short>::lowest()))
733 : {
734 213 : nValueOut = cpl::NumericLimits<short>::lowest();
735 : }
736 : else
737 : {
738 9860 : nValueOut = static_cast<short>(hfValueIn > GFloat16(0.0f)
739 4930 : ? hfValueIn + GFloat16(0.5f)
740 5451 : : hfValueIn - GFloat16(0.5f));
741 : }
742 5289 : }
743 : };
744 :
745 : template <> struct sGDALCopyWord<float, int>
746 : {
747 10285 : static inline void f(const float fValueIn, int &nValueOut)
748 : {
749 10285 : if (CPLIsNan(fValueIn))
750 : {
751 15 : nValueOut = 0;
752 : }
753 10270 : else if (fValueIn >= static_cast<float>(cpl::NumericLimits<int>::max()))
754 : {
755 218 : nValueOut = cpl::NumericLimits<int>::max();
756 : }
757 10052 : else if (fValueIn <=
758 10052 : static_cast<float>(cpl::NumericLimits<int>::lowest()))
759 : {
760 90 : nValueOut = cpl::NumericLimits<int>::lowest();
761 : }
762 : else
763 : {
764 10196 : nValueOut = static_cast<int>(fValueIn > 0.0f ? fValueIn + 0.5f
765 234 : : fValueIn - 0.5f);
766 : }
767 10285 : }
768 : };
769 :
770 : template <> struct sGDALCopyWord<float, std::int64_t>
771 : {
772 1227 : static inline void f(const float fValueIn, std::int64_t &nValueOut)
773 : {
774 1227 : if (CPLIsNan(fValueIn))
775 : {
776 68 : nValueOut = 0;
777 : }
778 1159 : else if (fValueIn >=
779 1159 : static_cast<float>(cpl::NumericLimits<std::int64_t>::max()))
780 : {
781 2 : nValueOut = cpl::NumericLimits<std::int64_t>::max();
782 : }
783 1157 : else if (fValueIn <=
784 1157 : static_cast<float>(cpl::NumericLimits<std::int64_t>::lowest()))
785 : {
786 2 : nValueOut = cpl::NumericLimits<std::int64_t>::lowest();
787 : }
788 : else
789 : {
790 2310 : nValueOut = static_cast<std::int64_t>(
791 1155 : fValueIn > 0.0f ? fValueIn + 0.5f : fValueIn - 0.5f);
792 : }
793 1227 : }
794 : };
795 :
796 : template <> struct sGDALCopyWord<double, std::int64_t>
797 : {
798 1861 : static inline void f(const double dfValueIn, std::int64_t &nValueOut)
799 : {
800 1861 : if (CPLIsNan(dfValueIn))
801 : {
802 68 : nValueOut = 0;
803 : }
804 1793 : else if (dfValueIn >=
805 1793 : static_cast<double>(cpl::NumericLimits<std::int64_t>::max()))
806 : {
807 4 : nValueOut = cpl::NumericLimits<std::int64_t>::max();
808 : }
809 1789 : else if (dfValueIn <=
810 1789 : static_cast<double>(cpl::NumericLimits<std::int64_t>::min()))
811 : {
812 4 : nValueOut = cpl::NumericLimits<std::int64_t>::min();
813 : }
814 : else
815 : {
816 3570 : nValueOut = static_cast<std::int64_t>(
817 1785 : dfValueIn > 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5);
818 : }
819 1861 : }
820 : };
821 :
822 : // Floating point input to a very large signed integer type: nan becomes zero, otherwise round and clamp.
823 : // Rounding for signed integers is different than for the unsigned integers above.
824 : // Avoid infinity while clamping when the maximum integer is too large for the floating-point type.
825 : // Avoid roundoff while clamping.
826 :
827 : template <> struct sGDALCopyWord<GFloat16, int>
828 : {
829 5222 : static inline void f(const GFloat16 hfValueIn, int &nValueOut)
830 : {
831 5222 : if (CPLIsNan(hfValueIn))
832 : {
833 0 : nValueOut = 0;
834 : }
835 5222 : else if (CPLIsInf(hfValueIn))
836 : {
837 0 : nValueOut = hfValueIn > GFloat16(0.0f)
838 0 : ? cpl::NumericLimits<int>::max()
839 0 : : cpl::NumericLimits<int>::lowest();
840 : }
841 : else
842 : {
843 10444 : nValueOut = static_cast<int>(hfValueIn > GFloat16(0.0f)
844 5222 : ? hfValueIn + GFloat16(0.5f)
845 5889 : : hfValueIn - GFloat16(0.5f));
846 : }
847 5222 : }
848 : };
849 :
850 : template <> struct sGDALCopyWord<GFloat16, std::int64_t>
851 : {
852 1093 : static inline void f(const GFloat16 hfValueIn, std::int64_t &nValueOut)
853 : {
854 1093 : if (CPLIsNan(hfValueIn))
855 : {
856 1 : nValueOut = 0;
857 : }
858 1092 : else if (CPLIsInf(hfValueIn))
859 : {
860 4 : nValueOut = hfValueIn > GFloat16(0.0f)
861 2 : ? cpl::NumericLimits<std::int64_t>::max()
862 1 : : cpl::NumericLimits<std::int64_t>::lowest();
863 : }
864 : else
865 : {
866 1090 : nValueOut = static_cast<std::int64_t>(
867 1090 : hfValueIn > GFloat16(0.0f) ? hfValueIn + GFloat16(0.5f)
868 1560 : : hfValueIn - GFloat16(0.5f));
869 : }
870 1093 : }
871 : };
872 :
873 : /**
874 : * Copy a single word, optionally rounding if appropriate (i.e. going
875 : * from the float to the integer case). Note that this is the function
876 : * you should specialize if you're adding a new data type.
877 : *
878 : * @param tValueIn value of type Tin; the input value to be converted
879 : * @param tValueOut value of type Tout; the output value
880 : */
881 :
882 : template <class Tin, class Tout>
883 488399998 : inline void GDALCopyWord(const Tin tValueIn, Tout &tValueOut)
884 : {
885 : if constexpr (std::is_same<Tin, Tout>::value)
886 42824516 : tValueOut = tValueIn;
887 : else
888 445575482 : sGDALCopyWord<Tin, Tout>::f(tValueIn, tValueOut);
889 488399998 : }
890 :
891 : /************************************************************************/
892 : /* GDALCopy4Words() */
893 : /************************************************************************/
894 : /**
895 : * Copy 4 packed words to 4 packed words, optionally rounding if appropriate
896 : * (i.e. going from the float to the integer case).
897 : *
898 : * @param pValueIn pointer to 4 input values of type Tin.
899 : * @param pValueOut pointer to 4 output values of type Tout.
900 : */
901 :
902 : template <class Tin, class Tout>
903 632 : inline void GDALCopy4Words(const Tin *pValueIn, Tout *const pValueOut)
904 : {
905 632 : GDALCopyWord(pValueIn[0], pValueOut[0]);
906 632 : GDALCopyWord(pValueIn[1], pValueOut[1]);
907 632 : GDALCopyWord(pValueIn[2], pValueOut[2]);
908 632 : GDALCopyWord(pValueIn[3], pValueOut[3]);
909 632 : }
910 :
911 : /************************************************************************/
912 : /* GDALCopy8Words() */
913 : /************************************************************************/
914 : /**
915 : * Copy 8 packed words to 8 packed words, optionally rounding if appropriate
916 : * (i.e. going from the float to the integer case).
917 : *
918 : * @param pValueIn pointer to 8 input values of type Tin.
919 : * @param pValueOut pointer to 8 output values of type Tout.
920 : */
921 :
922 : template <class Tin, class Tout>
923 44897771 : inline void GDALCopy8Words(const Tin *pValueIn, Tout *const pValueOut)
924 : {
925 44897771 : GDALCopy4Words(pValueIn, pValueOut);
926 44897771 : GDALCopy4Words(pValueIn + 4, pValueOut + 4);
927 44897771 : }
928 :
929 : // Needs SSE2
930 : #if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) || \
931 : defined(USE_NEON_OPTIMIZATIONS)
932 :
933 : #ifdef USE_NEON_OPTIMIZATIONS
934 : inline __m128i _mm_cvttps_epi32_neon_no_post_correction(__m128 a)
935 : {
936 : float32x4_t f = vreinterpretq_f32_m128(a);
937 : int32x4_t cvt = vcvtq_s32_f32(f);
938 : return vreinterpretq_m128i_s32(cvt);
939 : }
940 : #endif
941 :
942 : template <>
943 31149802 : inline void GDALCopy4Words(const float *pValueIn, GByte *const pValueOut)
944 : {
945 31149802 : __m128 xmm = _mm_loadu_ps(pValueIn);
946 :
947 : // The following clamping would be useless due to the final saturating
948 : // packing if we could guarantee the input range in [INT_MIN,INT_MAX]
949 :
950 : // Clamp to [UINT8_MIN, UINT8_MAX]
951 31149802 : const __m128 p0d5 = _mm_set1_ps(0.5f);
952 31149802 : const __m128 xmm_max = _mm_set1_ps(255);
953 :
954 31149802 : xmm = _mm_add_ps(xmm, p0d5);
955 62299704 : xmm = _mm_min_ps(_mm_max_ps(xmm, p0d5), xmm_max);
956 :
957 : #ifdef USE_NEON_OPTIMIZATIONS
958 : // Optimization to avoid useless clamping
959 : __m128i xmm_i = _mm_cvttps_epi32_neon_no_post_correction(xmm);
960 : #else
961 31149802 : __m128i xmm_i = _mm_cvttps_epi32(xmm);
962 : #endif
963 :
964 : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
965 : xmm_i = _mm_shuffle_epi8(
966 : xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
967 : #else
968 31149802 : xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
969 31149802 : xmm_i = _mm_packus_epi16(xmm_i, xmm_i); // Pack int16 to uint8
970 : #endif
971 31149802 : GDALCopyXMMToInt32(xmm_i, pValueOut);
972 31149802 : }
973 :
974 43492800 : static inline __m128 AddDotZeroBias(__m128 xmm)
975 : {
976 43492800 : const __m128 zeroDotFive = _mm_set1_ps(0.5f);
977 43492800 : const __m128 negativeZero = _mm_set1_ps(-0.0f);
978 : // f >= 0 ? f + 0.5f : f - 0.5f
979 86985700 : const __m128 bias = _mm_or_ps(zeroDotFive, _mm_and_ps(xmm, negativeZero));
980 43492800 : xmm = _mm_add_ps(xmm, bias);
981 43492800 : return xmm;
982 : }
983 :
984 : template <>
985 218 : inline void GDALCopy4Words(const float *pValueIn, GInt8 *const pValueOut)
986 : {
987 218 : __m128 xmm = _mm_loadu_ps(pValueIn);
988 :
989 : #if !defined(USE_NEON_OPTIMIZATIONS)
990 : // Cast NaN to zero
991 436 : xmm = _mm_andnot_ps(_mm_cmpunord_ps(xmm, xmm), xmm);
992 : #endif
993 :
994 : // Clamp to [INT8_MIN, INT8_MAX]
995 218 : const __m128 xmm_min = _mm_set1_ps(-128);
996 218 : const __m128 xmm_max = _mm_set1_ps(127);
997 218 : xmm = _mm_min_ps(_mm_max_ps(xmm, xmm_min), xmm_max);
998 :
999 218 : xmm = AddDotZeroBias(xmm);
1000 :
1001 : #ifdef USE_NEON_OPTIMIZATIONS
1002 : // Optimization to avoid useless clamping
1003 : __m128i xmm_i = _mm_cvttps_epi32_neon_no_post_correction(xmm);
1004 : #else
1005 218 : __m128i xmm_i = _mm_cvttps_epi32(xmm);
1006 : #endif
1007 :
1008 : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
1009 : xmm_i = _mm_shuffle_epi8(
1010 : xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
1011 : #else
1012 218 : xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
1013 218 : xmm_i = _mm_packs_epi16(xmm_i, xmm_i); // Pack int16 to int8
1014 : #endif
1015 218 : GDALCopyXMMToInt32(xmm_i, pValueOut);
1016 218 : }
1017 :
1018 : template <>
1019 3364010 : inline void GDALCopy4Words(const float *pValueIn, GInt16 *const pValueOut)
1020 : {
1021 3364010 : __m128 xmm = _mm_loadu_ps(pValueIn);
1022 :
1023 : #if !defined(USE_NEON_OPTIMIZATIONS)
1024 : // Cast NaN to zero
1025 6728020 : xmm = _mm_andnot_ps(_mm_cmpunord_ps(xmm, xmm), xmm);
1026 : #endif
1027 :
1028 : // Clamp to [INT16_MIN, INT16_MAX]
1029 3364010 : const __m128 xmm_min = _mm_set1_ps(-32768);
1030 3364010 : const __m128 xmm_max = _mm_set1_ps(32767);
1031 3364010 : xmm = _mm_min_ps(_mm_max_ps(xmm, xmm_min), xmm_max);
1032 :
1033 3364010 : xmm = AddDotZeroBias(xmm);
1034 :
1035 : #ifdef USE_NEON_OPTIMIZATIONS
1036 : // Optimization to avoid useless clamping
1037 : __m128i xmm_i = _mm_cvttps_epi32_neon_no_post_correction(xmm);
1038 : #else
1039 3364010 : __m128i xmm_i = _mm_cvttps_epi32(xmm);
1040 : #endif
1041 :
1042 3364010 : xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
1043 3364010 : GDALCopyXMMToInt64(xmm_i, pValueOut);
1044 3364010 : }
1045 :
1046 13 : inline __m128i GDAL_mm_int32_to_uint16(__m128i xmm_i)
1047 : {
1048 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
1049 : xmm_i = _mm_packus_epi32(xmm_i, xmm_i); // Pack int32 to uint16
1050 : #else
1051 : // Translate to int16 range because _mm_packus_epi32 is SSE4.1 only
1052 26 : xmm_i = _mm_add_epi32(xmm_i, _mm_set1_epi32(-32768));
1053 13 : xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
1054 : // Translate back to uint16 range (actually -32768==32768 in int16)
1055 13 : xmm_i = _mm_add_epi16(xmm_i, _mm_set1_epi16(-32768));
1056 : #endif
1057 13 : return xmm_i;
1058 : }
1059 :
1060 8302141 : inline __m128i GDAL_mm_packus_epi32(__m128i xmm_lo, __m128i xmm_hi)
1061 : {
1062 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
1063 : auto xmm = _mm_packus_epi32(xmm_lo, xmm_hi); // Pack int32 to uint16
1064 : #else
1065 : // Translate to int16 range because _mm_packus_epi32 is SSE4.1 only
1066 16604302 : xmm_lo = _mm_add_epi32(xmm_lo, _mm_set1_epi32(-32768));
1067 16604302 : xmm_hi = _mm_add_epi32(xmm_hi, _mm_set1_epi32(-32768));
1068 8302141 : auto xmm = _mm_packs_epi32(xmm_lo, xmm_hi); // Pack int32 to int16
1069 : // Translate back to uint16 range (actually -32768==32768 in int16)
1070 8302141 : xmm = _mm_add_epi16(xmm, _mm_set1_epi16(-32768));
1071 : #endif
1072 8302141 : return xmm;
1073 : }
1074 :
1075 : template <>
1076 1 : inline void GDALCopy4Words(const float *pValueIn, GUInt16 *const pValueOut)
1077 : {
1078 1 : __m128 xmm = _mm_loadu_ps(pValueIn);
1079 :
1080 : // Clamp to [UINT16_MIN, UINT16_MAX]
1081 1 : const __m128 p0d5 = _mm_set1_ps(0.5f);
1082 1 : const __m128 xmm_max = _mm_set1_ps(65535);
1083 1 : xmm = _mm_add_ps(xmm, p0d5);
1084 2 : xmm = _mm_min_ps(_mm_max_ps(xmm, p0d5), xmm_max);
1085 :
1086 : #ifdef USE_NEON_OPTIMIZATIONS
1087 : // Optimization to avoid useless clamping
1088 : __m128i xmm_i = _mm_cvttps_epi32_neon_no_post_correction(xmm);
1089 : #else
1090 1 : __m128i xmm_i = _mm_cvttps_epi32(xmm);
1091 : #endif
1092 :
1093 1 : xmm_i = GDAL_mm_int32_to_uint16(xmm_i);
1094 1 : GDALCopyXMMToInt64(xmm_i, pValueOut);
1095 1 : }
1096 :
1097 40300600 : static inline __m128i GDALIfThenElse(__m128i mask, __m128i thenVal,
1098 : __m128i elseVal)
1099 : {
1100 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
1101 : return _mm_blendv_epi8(elseVal, thenVal, mask);
1102 : #else
1103 80601300 : return _mm_or_si128(_mm_and_si128(mask, thenVal),
1104 40300600 : _mm_andnot_si128(mask, elseVal));
1105 : #endif
1106 : }
1107 :
1108 : template <>
1109 40128600 : inline void GDALCopy4Words(const float *pValueIn, GInt32 *const pValueOut)
1110 : {
1111 40128600 : __m128 xmm = _mm_loadu_ps(pValueIn);
1112 :
1113 : #if !defined(USE_NEON_OPTIMIZATIONS)
1114 : // Cast NaN to zero
1115 40128600 : xmm = _mm_andnot_ps(_mm_cmpunord_ps(xmm, xmm), xmm);
1116 : #endif
1117 :
1118 40128600 : xmm = AddDotZeroBias(xmm);
1119 :
1120 : #ifdef USE_NEON_OPTIMIZATIONS
1121 : // Optimization to avoid useless clamping
1122 : __m128i xmm_i = _mm_cvttps_epi32_neon_no_post_correction(xmm);
1123 : #else
1124 40128600 : __m128i xmm_i = _mm_cvttps_epi32(xmm);
1125 :
1126 : // Clamp to <= INT32_MAX
1127 40128600 : const __m128 xmm_max = _mm_set1_ps(2147483648.0f);
1128 40128600 : const __m128i xmm_i_max = _mm_set1_epi32(INT_MAX);
1129 80257200 : xmm_i = GDALIfThenElse(_mm_castps_si128(_mm_cmpge_ps(xmm, xmm_max)),
1130 : xmm_i_max, xmm_i);
1131 : #endif
1132 :
1133 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut), xmm_i);
1134 40128600 : }
1135 :
1136 : static inline __m128 GDALIfThenElse(__m128 mask, __m128 thenVal, __m128 elseVal)
1137 : {
1138 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
1139 : return _mm_blendv_ps(elseVal, thenVal, mask);
1140 : #else
1141 : return _mm_or_ps(_mm_and_ps(mask, thenVal), _mm_andnot_ps(mask, elseVal));
1142 : #endif
1143 : }
1144 :
1145 : // ARM64 has an efficient instruction for Float32 -> Float16
1146 : #if !(defined(HAVE__FLOAT16) && \
1147 : (defined(__aarch64__) && defined(_M_ARM64))) && \
1148 : !(defined(__AVX2__) && defined(__F16C__))
1149 :
1150 630 : inline __m128i GDALFourFloat32ToFloat16(__m128i xmm)
1151 : {
1152 : // Ported from https://github.com/simd-everywhere/simde/blob/51743e7920b6e867678cb50e9c62effe28f70b33/simde/simde-f16.h#L176
1153 : // to SSE2 in a branch-less way
1154 :
1155 : // clang-format off
1156 :
1157 : /* This code is CC0, based heavily on code by Fabian Giesen. */
1158 630 : const __m128i f32u_infinity = _mm_set1_epi32(255 << 23);
1159 630 : const __m128i f16u_max = _mm_set1_epi32((127 + 16) << 23);
1160 630 : const __m128i denorm_magic = _mm_set1_epi32(((127 - 15) + (23 - 10) + 1) << 23);
1161 :
1162 1260 : const auto sign = _mm_and_si128(xmm, _mm_set1_epi32(INT_MIN));
1163 630 : xmm = _mm_xor_si128(xmm, sign);
1164 11340 : xmm = GDALIfThenElse(
1165 : _mm_cmpgt_epi32(xmm, f16u_max),
1166 : /* result is Inf or NaN (all exponent bits set) */
1167 : GDALIfThenElse(
1168 : _mm_cmpgt_epi32(xmm, f32u_infinity),
1169 : /* NaN->qNaN and Inf->Inf */
1170 : _mm_set1_epi32(0x7e00),
1171 : _mm_set1_epi32(0x7c00)),
1172 : /* (De)normalized number or zero */
1173 : GDALIfThenElse(
1174 : _mm_cmplt_epi32(xmm, _mm_set1_epi32(113 << 23)),
1175 : /* use a magic value to align our 10 mantissa bits at the bottom of
1176 : * the float. as long as FP addition is round-to-nearest-even this
1177 : * just works. */
1178 : _mm_sub_epi32(
1179 : _mm_castps_si128(_mm_add_ps(_mm_castsi128_ps(xmm),
1180 : _mm_castsi128_ps(denorm_magic))),
1181 : /* and one integer subtract of the bias later,
1182 : * we have our final float! */
1183 : denorm_magic
1184 : ),
1185 : _mm_srli_epi32(
1186 : _mm_add_epi32(
1187 : /* update exponent, rounding bias part 1 */
1188 : // (unsigned)-0x37fff001 = ((unsigned)(15-127) << 23) + 0xfff
1189 : _mm_add_epi32(xmm, _mm_set1_epi32(-0x37fff001)),
1190 : /* rounding bias part 2, using mant_odd */
1191 : _mm_and_si128(_mm_srli_epi32(xmm, 13), _mm_set1_epi32(1))),
1192 : 13
1193 : )
1194 : )
1195 : );
1196 630 : xmm = _mm_or_si128(xmm, _mm_srli_epi32(sign, 16));
1197 :
1198 : // clang-format on
1199 630 : return xmm;
1200 : }
1201 :
1202 : template <>
1203 315 : inline void GDALCopy8Words(const float *pValueIn, GFloat16 *const pValueOut)
1204 : {
1205 : __m128i xmm_lo =
1206 630 : GDALFourFloat32ToFloat16(_mm_castps_si128(_mm_loadu_ps(pValueIn)));
1207 : __m128i xmm_hi =
1208 945 : GDALFourFloat32ToFloat16(_mm_castps_si128(_mm_loadu_ps(pValueIn + 4)));
1209 :
1210 315 : auto xmm = GDAL_mm_packus_epi32(xmm_lo, xmm_hi); // Pack int32 to uint16
1211 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut), xmm);
1212 315 : }
1213 :
1214 : #endif
1215 :
1216 : template <>
1217 624550 : inline void GDALCopy4Words(const double *pValueIn, float *const pValueOut)
1218 : {
1219 624550 : const __m128d val01 = _mm_loadu_pd(pValueIn);
1220 1249100 : const __m128d val23 = _mm_loadu_pd(pValueIn + 2);
1221 624550 : const __m128 val01_s = _mm_cvtpd_ps(val01);
1222 624550 : const __m128 val23_s = _mm_cvtpd_ps(val23);
1223 624550 : const __m128 val = _mm_movelh_ps(val01_s, val23_s);
1224 : _mm_storeu_ps(pValueOut, val);
1225 624550 : }
1226 :
1227 : template <>
1228 2776310 : inline void GDALCopy4Words(const double *pValueIn, GByte *const pValueOut)
1229 : {
1230 2776310 : const __m128d p0d5 = _mm_set1_pd(0.5);
1231 2776310 : const __m128d xmm_max = _mm_set1_pd(255);
1232 :
1233 2776310 : __m128d val01 = _mm_loadu_pd(pValueIn);
1234 5552620 : __m128d val23 = _mm_loadu_pd(pValueIn + 2);
1235 2776310 : val01 = _mm_add_pd(val01, p0d5);
1236 5552620 : val01 = _mm_min_pd(_mm_max_pd(val01, p0d5), xmm_max);
1237 2776310 : val23 = _mm_add_pd(val23, p0d5);
1238 5552620 : val23 = _mm_min_pd(_mm_max_pd(val23, p0d5), xmm_max);
1239 :
1240 2776310 : const __m128i val01_u32 = _mm_cvttpd_epi32(val01);
1241 2776310 : const __m128i val23_u32 = _mm_cvttpd_epi32(val23);
1242 :
1243 : // Merge 4 int32 values into a single register
1244 8328920 : auto xmm_i = _mm_castpd_si128(_mm_shuffle_pd(
1245 : _mm_castsi128_pd(val01_u32), _mm_castsi128_pd(val23_u32), 0));
1246 :
1247 : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
1248 : xmm_i = _mm_shuffle_epi8(
1249 : xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
1250 : #else
1251 2776310 : xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
1252 2776310 : xmm_i = _mm_packus_epi16(xmm_i, xmm_i); // Pack int16 to uint8
1253 : #endif
1254 2776310 : GDALCopyXMMToInt32(xmm_i, pValueOut);
1255 2776310 : }
1256 :
1257 : template <>
1258 11751400 : inline void GDALCopy4Words(const float *pValueIn, double *const pValueOut)
1259 : {
1260 11751400 : const __m128 valIn = _mm_loadu_ps(pValueIn);
1261 11751400 : _mm_storeu_pd(pValueOut, _mm_cvtps_pd(valIn));
1262 23502900 : _mm_storeu_pd(pValueOut + 2, _mm_cvtps_pd(_mm_movehl_ps(valIn, valIn)));
1263 11751400 : }
1264 :
1265 : #ifdef __F16C__
1266 : template <>
1267 : inline void GDALCopy4Words(const GFloat16 *pValueIn, float *const pValueOut)
1268 : {
1269 : __m128i xmm = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pValueIn));
1270 : _mm_storeu_ps(pValueOut, _mm_cvtph_ps(xmm));
1271 : }
1272 :
1273 : template <>
1274 : inline void GDALCopy4Words(const float *pValueIn, GFloat16 *const pValueOut)
1275 : {
1276 : __m128 xmm = _mm_loadu_ps(pValueIn);
1277 : GDALCopyXMMToInt64(_mm_cvtps_ph(xmm, _MM_FROUND_TO_NEAREST_INT), pValueOut);
1278 : }
1279 :
1280 : template <>
1281 : inline void GDALCopy4Words(const GFloat16 *pValueIn, double *const pValueOut)
1282 : {
1283 : float tmp[4];
1284 : GDALCopy4Words(pValueIn, tmp);
1285 : GDALCopy4Words(tmp, pValueOut);
1286 : }
1287 :
1288 : template <>
1289 : inline void GDALCopy4Words(const double *pValueIn, GFloat16 *const pValueOut)
1290 : {
1291 : float tmp[4];
1292 : GDALCopy4Words(pValueIn, tmp);
1293 : GDALCopy4Words(tmp, pValueOut);
1294 : }
1295 :
1296 : // ARM64 has an efficient instruction for Float16 -> Float32/Float64
1297 : #elif !(defined(HAVE__FLOAT16) && (defined(__aarch64__) && defined(_M_ARM64)))
1298 :
1299 : // Convert 4 float16 values to 4 float 32 values
1300 : // xmm must contain 4 float16 values stored in 32 bit each (with upper 16 bits at zero)
1301 41668 : static inline __m128i GDALFourFloat16ToFloat32(__m128i xmm)
1302 : {
1303 : // Ported from https://github.com/simd-everywhere/simde/blob/51743e7920b6e867678cb50e9c62effe28f70b33/simde/simde-f16.h#L242C4-L242C68
1304 : // to SSE2 in a branch-less way
1305 :
1306 : /* This code is CC0, based heavily on code by Fabian Giesen. */
1307 : const auto denorm_magic =
1308 83336 : _mm_castsi128_ps(_mm_set1_epi32((128 - 15) << 23));
1309 : const auto shifted_exp =
1310 41668 : _mm_set1_epi32(0x7c00 << 13); /* exponent mask after shift */
1311 :
1312 : // Shift exponent and mantissa bits to their position in a float32
1313 125004 : auto f32u = _mm_slli_epi32(_mm_and_si128(xmm, _mm_set1_epi32(0x7fff)), 13);
1314 : // Extract the (shifted) exponent
1315 41668 : const auto exp = _mm_and_si128(shifted_exp, f32u);
1316 : // Adjust the exponent
1317 41668 : const auto exp_adjustment = _mm_set1_epi32((127 - 15) << 23);
1318 41668 : f32u = _mm_add_epi32(f32u, exp_adjustment);
1319 :
1320 41668 : const auto is_inf_nan = _mm_cmpeq_epi32(exp, shifted_exp); /* Inf/NaN? */
1321 : // When is_inf_nan is true: extra exponent adjustment
1322 41668 : const auto f32u_inf_nan = _mm_add_epi32(f32u, exp_adjustment);
1323 :
1324 : const auto is_denormal =
1325 83336 : _mm_cmpeq_epi32(exp, _mm_setzero_si128()); /* Zero/Denormal? */
1326 : // When is_denormal is true:
1327 83336 : auto f32u_denormal = _mm_add_epi32(f32u, _mm_set1_epi32(1 << 23));
1328 83336 : f32u_denormal = _mm_castps_si128(
1329 : _mm_sub_ps(_mm_castsi128_ps(f32u_denormal), denorm_magic));
1330 :
1331 41668 : f32u = GDALIfThenElse(is_inf_nan, f32u_inf_nan, f32u);
1332 41668 : f32u = GDALIfThenElse(is_denormal, f32u_denormal, f32u);
1333 :
1334 : // Re-apply sign bit
1335 125004 : f32u = _mm_or_si128(
1336 : f32u, _mm_slli_epi32(_mm_and_si128(xmm, _mm_set1_epi32(0x8000)), 16));
1337 41668 : return f32u;
1338 : }
1339 :
1340 : template <>
1341 388 : inline void GDALCopy8Words(const GFloat16 *pValueIn, float *const pValueOut)
1342 : {
1343 388 : __m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
1344 : const auto xmm_0 =
1345 776 : GDALFourFloat16ToFloat32(_mm_unpacklo_epi16(xmm, _mm_setzero_si128()));
1346 : const auto xmm_1 =
1347 776 : GDALFourFloat16ToFloat32(_mm_unpackhi_epi16(xmm, _mm_setzero_si128()));
1348 388 : _mm_storeu_ps(pValueOut + 0, _mm_castsi128_ps(xmm_0));
1349 388 : _mm_storeu_ps(pValueOut + 4, _mm_castsi128_ps(xmm_1));
1350 388 : }
1351 :
1352 : template <>
1353 20446 : inline void GDALCopy8Words(const GFloat16 *pValueIn, double *const pValueOut)
1354 : {
1355 20446 : __m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
1356 61338 : const auto xmm_0 = _mm_castsi128_ps(
1357 : GDALFourFloat16ToFloat32(_mm_unpacklo_epi16(xmm, _mm_setzero_si128())));
1358 61338 : const auto xmm_1 = _mm_castsi128_ps(
1359 : GDALFourFloat16ToFloat32(_mm_unpackhi_epi16(xmm, _mm_setzero_si128())));
1360 20446 : _mm_storeu_pd(pValueOut + 0, _mm_cvtps_pd(xmm_0));
1361 40892 : _mm_storeu_pd(pValueOut + 2, _mm_cvtps_pd(_mm_movehl_ps(xmm_0, xmm_0)));
1362 20446 : _mm_storeu_pd(pValueOut + 4, _mm_cvtps_pd(xmm_1));
1363 40892 : _mm_storeu_pd(pValueOut + 6, _mm_cvtps_pd(_mm_movehl_ps(xmm_1, xmm_1)));
1364 20446 : }
1365 :
1366 : #endif // __F16C__
1367 :
1368 : #ifdef __AVX2__
1369 :
1370 : #include <immintrin.h>
1371 :
1372 : template <>
1373 : inline void GDALCopy8Words(const double *pValueIn, float *const pValueOut)
1374 : {
1375 : const __m256d val0123 = _mm256_loadu_pd(pValueIn);
1376 : const __m256d val4567 = _mm256_loadu_pd(pValueIn + 4);
1377 : const __m256 val0123_s = _mm256_castps128_ps256(_mm256_cvtpd_ps(val0123));
1378 : const __m256 val4567_s = _mm256_castps128_ps256(_mm256_cvtpd_ps(val4567));
1379 : const __m256 val =
1380 : _mm256_permute2f128_ps(val0123_s, val4567_s, 0 | (2 << 4));
1381 : _mm256_storeu_ps(pValueOut, val);
1382 : }
1383 :
1384 : template <>
1385 : inline void GDALCopy8Words(const float *pValueIn, double *const pValueOut)
1386 : {
1387 : const __m256 valIn = _mm256_loadu_ps(pValueIn);
1388 : _mm256_storeu_pd(pValueOut, _mm256_cvtps_pd(_mm256_castps256_ps128(valIn)));
1389 : _mm256_storeu_pd(pValueOut + 4,
1390 : _mm256_cvtps_pd(_mm256_castps256_ps128(
1391 : _mm256_permute2f128_ps(valIn, valIn, 1))));
1392 : }
1393 :
1394 : #ifdef __F16C__
1395 :
1396 : template <>
1397 : inline void GDALCopy8Words(const GFloat16 *pValueIn, float *const pValueOut)
1398 : {
1399 : __m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
1400 : _mm256_storeu_ps(pValueOut, _mm256_cvtph_ps(xmm));
1401 : }
1402 :
1403 : template <>
1404 : inline void GDALCopy8Words(const float *pValueIn, GFloat16 *const pValueOut)
1405 : {
1406 : __m256 ymm = _mm256_loadu_ps(pValueIn);
1407 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut),
1408 : _mm256_cvtps_ph(ymm, _MM_FROUND_TO_NEAREST_INT));
1409 : }
1410 :
1411 : template <>
1412 : inline void GDALCopy8Words(const GFloat16 *pValueIn, double *const pValueOut)
1413 : {
1414 : __m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
1415 : const auto ymm = _mm256_cvtph_ps(xmm);
1416 : _mm256_storeu_pd(pValueOut, _mm256_cvtps_pd(_mm256_extractf128_ps(ymm, 0)));
1417 : _mm256_storeu_pd(pValueOut + 4,
1418 : _mm256_cvtps_pd(_mm256_extractf128_ps(ymm, 1)));
1419 : }
1420 :
1421 : template <>
1422 : inline void GDALCopy8Words(const double *pValueIn, GFloat16 *const pValueOut)
1423 : {
1424 : __m256d ymm0 = _mm256_loadu_pd(pValueIn);
1425 : __m256d ymm1 = _mm256_loadu_pd(pValueIn + 4);
1426 : __m256 ymm = _mm256_set_m128(_mm256_cvtpd_ps(ymm1), _mm256_cvtpd_ps(ymm0));
1427 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut),
1428 : _mm256_cvtps_ph(ymm, _MM_FROUND_TO_NEAREST_INT));
1429 : }
1430 :
1431 : #endif
1432 :
1433 : template <>
1434 : inline void GDALCopy8Words(const float *pValueIn, GByte *const pValueOut)
1435 : {
1436 : __m256 ymm = _mm256_loadu_ps(pValueIn);
1437 :
1438 : const __m256 p0d5 = _mm256_set1_ps(0.5f);
1439 : const __m256 ymm_max = _mm256_set1_ps(255);
1440 : ymm = _mm256_add_ps(ymm, p0d5);
1441 : ymm = _mm256_min_ps(_mm256_max_ps(ymm, p0d5), ymm_max);
1442 :
1443 : __m256i ymm_i = _mm256_cvttps_epi32(ymm);
1444 :
1445 : ymm_i = _mm256_packus_epi32(ymm_i, ymm_i); // Pack int32 to uint16
1446 : ymm_i = _mm256_permute4x64_epi64(ymm_i, 0 | (2 << 2)); // AVX2
1447 :
1448 : __m128i xmm_i = _mm256_castsi256_si128(ymm_i);
1449 : xmm_i = _mm_packus_epi16(xmm_i, xmm_i);
1450 : GDALCopyXMMToInt64(xmm_i, pValueOut);
1451 : }
1452 :
1453 : template <>
1454 : inline void GDALCopy8Words(const float *pValueIn, GUInt16 *const pValueOut)
1455 : {
1456 : __m256 ymm = _mm256_loadu_ps(pValueIn);
1457 :
1458 : const __m256 p0d5 = _mm256_set1_ps(0.5f);
1459 : const __m256 ymm_max = _mm256_set1_ps(65535);
1460 : ymm = _mm256_add_ps(ymm, p0d5);
1461 : ymm = _mm256_min_ps(_mm256_max_ps(ymm, p0d5), ymm_max);
1462 :
1463 : __m256i ymm_i = _mm256_cvttps_epi32(ymm);
1464 :
1465 : ymm_i = _mm256_packus_epi32(ymm_i, ymm_i); // Pack int32 to uint16
1466 : ymm_i = _mm256_permute4x64_epi64(ymm_i, 0 | (2 << 2)); // AVX2
1467 :
1468 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut),
1469 : _mm256_castsi256_si128(ymm_i));
1470 : }
1471 : #else
1472 : template <>
1473 7785701 : inline void GDALCopy8Words(const float *pValueIn, GUInt16 *const pValueOut)
1474 : {
1475 7785701 : __m128 xmm = _mm_loadu_ps(pValueIn);
1476 15571402 : __m128 xmm1 = _mm_loadu_ps(pValueIn + 4);
1477 :
1478 7785701 : const __m128 p0d5 = _mm_set1_ps(0.5f);
1479 7785701 : const __m128 xmm_max = _mm_set1_ps(65535);
1480 7785701 : xmm = _mm_add_ps(xmm, p0d5);
1481 7785701 : xmm1 = _mm_add_ps(xmm1, p0d5);
1482 15571402 : xmm = _mm_min_ps(_mm_max_ps(xmm, p0d5), xmm_max);
1483 15571402 : xmm1 = _mm_min_ps(_mm_max_ps(xmm1, p0d5), xmm_max);
1484 :
1485 : #ifdef USE_NEON_OPTIMIZATIONS
1486 : // Optimization to avoid useless clamping
1487 : __m128i xmm_i = _mm_cvttps_epi32_neon_no_post_correction(xmm);
1488 : __m128i xmm1_i = _mm_cvttps_epi32_neon_no_post_correction(xmm1);
1489 : #else
1490 7785701 : __m128i xmm_i = _mm_cvttps_epi32(xmm);
1491 7785701 : __m128i xmm1_i = _mm_cvttps_epi32(xmm1);
1492 : #endif
1493 :
1494 7785701 : xmm_i = GDAL_mm_packus_epi32(xmm_i, xmm1_i); // Pack int32 to uint16
1495 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut), xmm_i);
1496 7785701 : }
1497 : #endif
1498 :
1499 : // ARM64 has an efficient instruction for Float64 -> Float16
1500 : #if !(defined(HAVE__FLOAT16) && \
1501 : (defined(__aarch64__) && defined(_M_ARM64))) && \
1502 : !(defined(__AVX2__) && defined(__F16C__))
1503 : template <>
1504 235 : inline void GDALCopy8Words(const double *pValueIn, GFloat16 *const pValueOut)
1505 : {
1506 : float fVal[8];
1507 235 : GDALCopy8Words(pValueIn, fVal);
1508 235 : GDALCopy8Words(fVal, pValueOut);
1509 235 : }
1510 : #endif
1511 :
1512 : #endif // defined(__x86_64) || defined(_M_X64)
1513 :
1514 : #endif // GDAL_PRIV_TEMPLATES_HPP_INCLUDED
|