Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: GDAL Core
4 : * Purpose: Inline C++ templates
5 : * Author: Phil Vachon, <philippe at cowpig.ca>
6 : *
7 : ******************************************************************************
8 : * Copyright (c) 2009, Phil Vachon, <philippe at cowpig.ca>
9 : * Copyright (c) 2025, Even Rouault, <even.rouault at spatialys.com>
10 : *
11 : * SPDX-License-Identifier: MIT
12 : ****************************************************************************/
13 :
14 : #ifndef GDAL_PRIV_TEMPLATES_HPP_INCLUDED
15 : #define GDAL_PRIV_TEMPLATES_HPP_INCLUDED
16 :
17 : #include "cpl_port.h"
18 :
19 : #include <algorithm>
20 : #include <cmath>
21 : #include <cstdint>
22 : #include <limits>
23 : #include <type_traits>
24 :
25 : #include "cpl_float.h"
26 :
27 : // Needs SSE2
28 : #if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) || \
29 : defined(USE_NEON_OPTIMIZATIONS)
30 :
31 : #ifdef USE_NEON_OPTIMIZATIONS
32 : #include "include_sse2neon.h"
33 : #else
34 : #include <immintrin.h>
35 : #endif
36 :
37 40217145 : static inline void GDALCopyXMMToInt32(const __m128i xmm, void *pDest)
38 : {
39 40217145 : int n32 = _mm_cvtsi128_si32(xmm); // Extract lower 32 bit word
40 40217145 : memcpy(pDest, &n32, sizeof(n32));
41 40217145 : }
42 :
43 106492321 : static inline void GDALCopyXMMToInt64(const __m128i xmm, void *pDest)
44 : {
45 : _mm_storel_epi64(reinterpret_cast<__m128i *>(pDest), xmm);
46 106492321 : }
47 :
48 : #if __SSSE3__
49 : #include <tmmintrin.h>
50 : #endif
51 :
52 : #if defined(__SSE4_1__) || defined(__AVX__)
53 : #include <smmintrin.h>
54 : #endif
55 :
56 : #ifdef __F16C__
57 : #include <immintrin.h>
58 : #endif
59 :
60 : #endif
61 :
62 : /************************************************************************/
63 : /* GDALGetDataLimits() */
64 : /************************************************************************/
65 : /**
66 : * Compute the limits of values that can be placed in Tout in terms of
67 : * Tin. Usually used for output clamping, when the output data type's
68 : * limits are stable relative to the input type (i.e. no roundoff error).
69 : *
70 : * @param tMaxValue the returned maximum value
71 : * @param tMinValue the returned minimum value
72 : */
73 :
74 : template <class Tin, class Tout>
75 352246142 : inline void GDALGetDataLimits(Tin &tMaxValue, Tin &tMinValue)
76 : {
77 352246142 : tMaxValue = cpl::NumericLimits<Tin>::max();
78 352246142 : tMinValue = cpl::NumericLimits<Tin>::lowest();
79 :
80 : // Compute the actual minimum value of Tout in terms of Tin.
81 : if constexpr (cpl::NumericLimits<Tout>::is_signed &&
82 : cpl::NumericLimits<Tout>::is_integer)
83 : {
84 : // the minimum value is less than zero
85 : // cppcheck-suppress knownConditionTrueFalse
86 : if constexpr (cpl::NumericLimits<Tout>::digits <
87 : cpl::NumericLimits<Tin>::digits ||
88 : !cpl::NumericLimits<Tin>::is_integer)
89 : {
90 : // Tout is smaller than Tin, so we need to clamp values in input
91 : // to the range of Tout's min/max values
92 : if constexpr (cpl::NumericLimits<Tin>::is_signed)
93 : {
94 87464460 : tMinValue =
95 87464460 : static_cast<Tin>(cpl::NumericLimits<Tout>::lowest());
96 : }
97 87985123 : tMaxValue = static_cast<Tin>(cpl::NumericLimits<Tout>::max());
98 : }
99 : }
100 : else if constexpr (cpl::NumericLimits<Tout>::is_integer)
101 : {
102 : // the output is unsigned, so we just need to determine the max
103 : if constexpr (!std::is_same_v<Tin, Tout> &&
104 : cpl::NumericLimits<Tout>::digits <=
105 : cpl::NumericLimits<Tin>::digits)
106 : {
107 : // Tout is smaller than Tin, so we need to clamp the input values
108 : // to the range of Tout's max
109 104644102 : tMaxValue = static_cast<Tin>(cpl::NumericLimits<Tout>::max());
110 : }
111 107597980 : tMinValue = 0;
112 : }
113 352246142 : }
114 :
115 : /************************************************************************/
116 : /* GDALClampValue() */
117 : /************************************************************************/
118 : /**
119 : * Clamp values of type T to a specified range
120 : *
121 : * @param tValue the value
122 : * @param tMax the max value
123 : * @param tMin the min value
124 : */
125 : template <class T>
126 352047047 : inline T GDALClampValue(const T tValue, const T tMax, const T tMin)
127 : {
128 352047047 : return tValue > tMax ? tMax : tValue < tMin ? tMin : tValue;
129 : }
130 :
131 : /************************************************************************/
132 : /* GDALClampDoubleValue() */
133 : /************************************************************************/
134 : /**
135 : * Clamp double values to a specified range, this uses the same
136 : * argument ordering as std::clamp, returns TRUE if the value was clamped.
137 : *
138 : * @param tValue the value
139 : * @param tMin the min value
140 : * @param tMax the max value
141 : *
142 : */
143 : template <class T2, class T3>
144 309 : inline bool GDALClampDoubleValue(double &tValue, const T2 tMin, const T3 tMax)
145 : {
146 309 : const double tMin2{static_cast<double>(tMin)};
147 309 : const double tMax2{static_cast<double>(tMax)};
148 309 : if (tValue > tMax2 || tValue < tMin2)
149 : {
150 26 : tValue = tValue > tMax2 ? tMax2 : tValue < tMin2 ? tMin2 : tValue;
151 26 : return true;
152 : }
153 : else
154 : {
155 283 : return false;
156 : }
157 : }
158 :
159 : /************************************************************************/
160 : /* GDALClampValueToType() */
161 : /************************************************************************/
162 : /**
163 : * Clamp a value of type T to the limits of type TClamped.
164 : *
165 : * @param tValue the value
166 : */
167 198828 : template <class T, class TClamped> inline T GDALClampValueToType(T tValue)
168 : {
169 : T tMaxValue, tMinValue;
170 198828 : GDALGetDataLimits<T, TClamped>(tMaxValue, tMinValue);
171 198828 : return std::clamp(tValue, tMinValue, tMaxValue);
172 : }
173 :
174 : /************************************************************************/
175 : /* GDALIsValueInRange() */
176 : /************************************************************************/
177 : /**
178 : * Returns whether a value is in the type range.
179 : * NaN is considered not to be in type range.
180 : *
181 : * @param dfValue the value
182 : * @return whether the value is in the type range.
183 : */
184 156599 : template <class T> inline bool GDALIsValueInRange(double dfValue)
185 : {
186 313157 : return dfValue >= static_cast<double>(cpl::NumericLimits<T>::lowest()) &&
187 313157 : dfValue <= static_cast<double>(cpl::NumericLimits<T>::max());
188 : }
189 :
190 27 : template <> inline bool GDALIsValueInRange<double>(double dfValue)
191 : {
192 27 : return !CPLIsNan(dfValue);
193 : }
194 :
195 46944 : template <> inline bool GDALIsValueInRange<float>(double dfValue)
196 : {
197 93761 : return CPLIsInf(dfValue) ||
198 : (dfValue >=
199 46817 : -static_cast<double>(std::numeric_limits<float>::max()) &&
200 93753 : dfValue <= static_cast<double>(std::numeric_limits<float>::max()));
201 : }
202 :
203 120 : template <> inline bool GDALIsValueInRange<GFloat16>(double dfValue)
204 : {
205 360 : return CPLIsInf(dfValue) ||
206 240 : (dfValue >= -cpl::NumericLimits<GFloat16>::max() &&
207 240 : dfValue <= cpl::NumericLimits<GFloat16>::max());
208 : }
209 :
210 16925 : template <> inline bool GDALIsValueInRange<int64_t>(double dfValue)
211 : {
212 : // Values in the range [INT64_MAX - 1023, INT64_MAX - 1]
213 : // get converted to a double that once cast to int64_t is
214 : // INT64_MAX + 1, hence the < strict comparison.
215 : return dfValue >=
216 33849 : static_cast<double>(cpl::NumericLimits<int64_t>::lowest()) &&
217 33849 : dfValue < static_cast<double>(cpl::NumericLimits<int64_t>::max());
218 : }
219 :
220 8968 : template <> inline bool GDALIsValueInRange<uint64_t>(double dfValue)
221 : {
222 : // Values in the range [UINT64_MAX - 2047, UINT64_MAX - 1]
223 : // get converted to a double that once cast to uint64_t is
224 : // UINT64_MAX + 1, hence the < strict comparison.
225 17933 : return dfValue >= 0 &&
226 17933 : dfValue < static_cast<double>(cpl::NumericLimits<uint64_t>::max());
227 : }
228 :
229 : /************************************************************************/
230 : /* GDALIsValueExactAs() */
231 : /************************************************************************/
232 : /**
233 : * Returns whether a value can be exactly represented on type T.
234 : *
235 : * That is static_cast\<double\>(static_cast\<T\>(dfValue)) is legal and is
236 : * equal to dfValue.
237 : *
238 : * Note: for T=float or double, a NaN input leads to true
239 : *
240 : * @param dfValue the value
241 : * @return whether the value can be exactly represented on type T.
242 : */
243 7849 : template <class T> inline bool GDALIsValueExactAs(double dfValue)
244 : {
245 15644 : return GDALIsValueInRange<T>(dfValue) &&
246 15644 : static_cast<double>(static_cast<T>(dfValue)) == dfValue;
247 : }
248 :
249 174 : template <> inline bool GDALIsValueExactAs<float>(double dfValue)
250 : {
251 342 : return CPLIsNan(dfValue) ||
252 168 : (GDALIsValueInRange<float>(dfValue) &&
253 338 : static_cast<double>(static_cast<float>(dfValue)) == dfValue);
254 : }
255 :
256 7 : template <> inline bool GDALIsValueExactAs<GFloat16>(double dfValue)
257 : {
258 14 : return CPLIsNan(dfValue) ||
259 7 : (GDALIsValueInRange<GFloat16>(dfValue) &&
260 14 : static_cast<double>(static_cast<GFloat16>(dfValue)) == dfValue);
261 : }
262 :
263 21 : template <> inline bool GDALIsValueExactAs<double>(double)
264 : {
265 21 : return true;
266 : }
267 :
268 : /************************************************************************/
269 : /* GDALCopyWord() */
270 : /************************************************************************/
271 :
272 : // Integer input and output: clamp the input
273 :
274 : template <class Tin, class Tout> struct sGDALCopyWord
275 : {
276 176202192 : static inline void f(const Tin tValueIn, Tout &tValueOut)
277 : {
278 : Tin tMaxVal, tMinVal;
279 176202192 : GDALGetDataLimits<Tin, Tout>(tMaxVal, tMinVal);
280 176202192 : tValueOut =
281 176202192 : static_cast<Tout>(GDALClampValue(tValueIn, tMaxVal, tMinVal));
282 176202192 : }
283 : };
284 :
285 : // Integer input and floating point output: simply convert
286 :
287 : template <class Tin> struct sGDALCopyWord<Tin, GFloat16>
288 : {
289 270266 : static inline void f(const Tin tValueIn, GFloat16 &hfValueOut)
290 : {
291 270266 : hfValueOut = static_cast<GFloat16>(tValueIn);
292 270266 : }
293 : };
294 :
295 : template <class Tin> struct sGDALCopyWord<Tin, float>
296 : {
297 8027797 : static inline void f(const Tin tValueIn, float &fValueOut)
298 : {
299 8027797 : fValueOut = static_cast<float>(tValueIn);
300 8027797 : }
301 : };
302 :
303 : template <class Tin> struct sGDALCopyWord<Tin, double>
304 : {
305 87057931 : static inline void f(const Tin tValueIn, double &dfValueOut)
306 : {
307 87057931 : dfValueOut = static_cast<double>(tValueIn);
308 87057931 : }
309 : };
310 :
311 : // Floating point input and output, converting between identical types: simply copy
312 :
313 : template <> struct sGDALCopyWord<GFloat16, GFloat16>
314 : {
315 : static inline void f(const GFloat16 hfValueIn, GFloat16 &hfValueOut)
316 : {
317 : hfValueOut = hfValueIn;
318 : }
319 : };
320 :
321 : template <> struct sGDALCopyWord<float, float>
322 : {
323 : static inline void f(const float fValueIn, float &fValueOut)
324 : {
325 : fValueOut = fValueIn;
326 : }
327 : };
328 :
329 : template <> struct sGDALCopyWord<double, double>
330 : {
331 : static inline void f(const double dfValueIn, double &dfValueOut)
332 : {
333 : dfValueOut = dfValueIn;
334 : }
335 : };
336 :
337 : // Floating point input and output, converting to a larger type: use implicit conversion
338 :
339 : template <> struct sGDALCopyWord<GFloat16, float>
340 : {
341 5206 : static inline void f(const GFloat16 hfValueIn, float &dfValueOut)
342 : {
343 5206 : dfValueOut = hfValueIn;
344 5206 : }
345 : };
346 :
347 : template <> struct sGDALCopyWord<GFloat16, double>
348 : {
349 4174 : static inline void f(const GFloat16 hfValueIn, double &dfValueOut)
350 : {
351 4174 : dfValueOut = hfValueIn;
352 4174 : }
353 : };
354 :
355 : template <> struct sGDALCopyWord<float, double>
356 : {
357 636723 : static inline void f(const float fValueIn, double &dfValueOut)
358 : {
359 636723 : dfValueOut = static_cast<double>(fValueIn);
360 636723 : }
361 : };
362 :
363 : // Floating point input and out, converting to a smaller type: ensure overflow results in infinity
364 :
365 : template <> struct sGDALCopyWord<float, GFloat16>
366 : {
367 497 : static inline void f(const float fValueIn, GFloat16 &hfValueOut)
368 : {
369 : // Our custom implementation when std::float16_t is not
370 : // available ensures proper behavior.
371 : #if !defined(HAVE_STD_FLOAT16_T)
372 497 : if (fValueIn > cpl::NumericLimits<GFloat16>::max())
373 : {
374 27 : hfValueOut = cpl::NumericLimits<GFloat16>::infinity();
375 27 : return;
376 : }
377 470 : if (fValueIn < -cpl::NumericLimits<GFloat16>::max())
378 : {
379 15 : hfValueOut = -cpl::NumericLimits<GFloat16>::infinity();
380 15 : return;
381 : }
382 : #endif
383 455 : hfValueOut = static_cast<GFloat16>(fValueIn);
384 : }
385 : };
386 :
387 : template <> struct sGDALCopyWord<double, GFloat16>
388 : {
389 349 : static inline void f(const double dfValueIn, GFloat16 &hfValueOut)
390 : {
391 : // Our custom implementation when std::float16_t is not
392 : // available ensures proper behavior.
393 : #if !defined(HAVE_STD_FLOAT16_T)
394 349 : if (dfValueIn > cpl::NumericLimits<GFloat16>::max())
395 : {
396 14 : hfValueOut = cpl::NumericLimits<GFloat16>::infinity();
397 14 : return;
398 : }
399 335 : if (dfValueIn < -cpl::NumericLimits<GFloat16>::max())
400 : {
401 14 : hfValueOut = -cpl::NumericLimits<GFloat16>::infinity();
402 14 : return;
403 : }
404 : #endif
405 321 : hfValueOut = static_cast<GFloat16>(dfValueIn);
406 : }
407 : };
408 :
409 : template <> struct sGDALCopyWord<double, float>
410 : {
411 130340 : static inline void f(const double dfValueIn, float &fValueOut)
412 : {
413 : #if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)
414 : // We could just write fValueOut = static_cast<float>(dfValueIn);
415 : // but a sanitizer might complain with values above FLT_MAX
416 391020 : _mm_store_ss(&fValueOut,
417 : _mm_cvtsd_ss(_mm_undefined_ps(), _mm_load_sd(&dfValueIn)));
418 : #else
419 : if (dfValueIn > static_cast<double>(std::numeric_limits<float>::max()))
420 : {
421 : fValueOut = std::numeric_limits<float>::infinity();
422 : return;
423 : }
424 : if (dfValueIn < static_cast<double>(-std::numeric_limits<float>::max()))
425 : {
426 : fValueOut = -std::numeric_limits<float>::infinity();
427 : return;
428 : }
429 :
430 : fValueOut = static_cast<float>(dfValueIn);
431 : #endif
432 130340 : }
433 : };
434 :
435 : // Floating point input to a small unsigned integer type: nan becomes zero, otherwise round and clamp
436 :
437 : template <class Tout> struct sGDALCopyWord<GFloat16, Tout>
438 : {
439 5032 : static inline void f(const GFloat16 hfValueIn, Tout &tValueOut)
440 : {
441 5032 : if (CPLIsNan(hfValueIn))
442 : {
443 1 : tValueOut = 0;
444 1 : return;
445 : }
446 : GFloat16 hfMaxVal, hfMinVal;
447 5031 : GDALGetDataLimits<GFloat16, Tout>(hfMaxVal, hfMinVal);
448 5031 : tValueOut = static_cast<Tout>(
449 10062 : GDALClampValue(hfValueIn + GFloat16(0.5f), hfMaxVal, hfMinVal));
450 : }
451 : };
452 :
453 : template <class Tout> struct sGDALCopyWord<float, Tout>
454 : {
455 5471730 : static inline void f(const float fValueIn, Tout &tValueOut)
456 : {
457 5471730 : if (CPLIsNan(fValueIn))
458 : {
459 0 : tValueOut = 0;
460 0 : return;
461 : }
462 : float fMaxVal, fMinVal;
463 5471730 : GDALGetDataLimits<float, Tout>(fMaxVal, fMinVal);
464 5471730 : tValueOut = static_cast<Tout>(
465 5471730 : GDALClampValue(fValueIn + 0.5f, fMaxVal, fMinVal));
466 : }
467 : };
468 :
469 : template <class Tout> struct sGDALCopyWord<double, Tout>
470 : {
471 82995982 : static inline void f(const double dfValueIn, Tout &tValueOut)
472 : {
473 82995982 : if (CPLIsNan(dfValueIn))
474 : {
475 6 : tValueOut = 0;
476 6 : return;
477 : }
478 : double dfMaxVal, dfMinVal;
479 82995981 : GDALGetDataLimits<double, Tout>(dfMaxVal, dfMinVal);
480 82995981 : tValueOut = static_cast<Tout>(
481 82995981 : GDALClampValue(dfValueIn + 0.5, dfMaxVal, dfMinVal));
482 : }
483 : };
484 :
485 : // Floating point input to a large unsigned integer type: nan becomes zero, otherwise round and clamp.
486 : // Avoid roundoff while clamping.
487 :
488 : template <> struct sGDALCopyWord<GFloat16, std::uint64_t>
489 : {
490 624 : static inline void f(const GFloat16 hfValueIn, std::uint64_t &nValueOut)
491 : {
492 624 : if (!(hfValueIn > 0))
493 : {
494 4 : nValueOut = 0;
495 : }
496 620 : else if (CPLIsInf(hfValueIn))
497 : {
498 1 : nValueOut = cpl::NumericLimits<std::uint64_t>::max();
499 : }
500 : else
501 : {
502 619 : nValueOut = static_cast<std::uint64_t>(hfValueIn + GFloat16(0.5f));
503 : }
504 624 : }
505 : };
506 :
507 : template <> struct sGDALCopyWord<float, unsigned int>
508 : {
509 5039 : static inline void f(const float fValueIn, unsigned int &nValueOut)
510 : {
511 5039 : if (!(fValueIn > 0))
512 : {
513 149 : nValueOut = 0;
514 : }
515 4890 : else if (fValueIn >=
516 4890 : static_cast<float>(cpl::NumericLimits<unsigned int>::max()))
517 : {
518 134 : nValueOut = cpl::NumericLimits<unsigned int>::max();
519 : }
520 : else
521 : {
522 4756 : nValueOut = static_cast<unsigned int>(fValueIn + 0.5f);
523 : }
524 5039 : }
525 : };
526 :
527 : template <> struct sGDALCopyWord<float, std::uint64_t>
528 : {
529 624 : static inline void f(const float fValueIn, std::uint64_t &nValueOut)
530 : {
531 624 : if (!(fValueIn > 0))
532 : {
533 4 : nValueOut = 0;
534 : }
535 620 : else if (fValueIn >=
536 620 : static_cast<float>(cpl::NumericLimits<std::uint64_t>::max()))
537 : {
538 2 : nValueOut = cpl::NumericLimits<std::uint64_t>::max();
539 : }
540 : else
541 : {
542 618 : nValueOut = static_cast<std::uint64_t>(fValueIn + 0.5f);
543 : }
544 624 : }
545 : };
546 :
547 : template <> struct sGDALCopyWord<double, std::uint64_t>
548 : {
549 1244 : static inline void f(const double dfValueIn, std::uint64_t &nValueOut)
550 : {
551 1244 : if (!(dfValueIn > 0))
552 : {
553 292 : nValueOut = 0;
554 : }
555 952 : else if (dfValueIn >
556 952 : static_cast<double>(cpl::NumericLimits<uint64_t>::max()))
557 : {
558 4 : nValueOut = cpl::NumericLimits<uint64_t>::max();
559 : }
560 : else
561 : {
562 948 : nValueOut = static_cast<std::uint64_t>(dfValueIn + 0.5);
563 : }
564 1244 : }
565 : };
566 :
567 : // Floating point input to a very large unsigned integer type: nan becomes zero, otherwise round and clamp.
568 : // Avoid infinity while clamping when the maximum integer is too large for the floating-point type.
569 : // Avoid roundoff while clamping.
570 :
571 : template <> struct sGDALCopyWord<GFloat16, unsigned short>
572 : {
573 4706 : static inline void f(const GFloat16 hfValueIn, unsigned short &nValueOut)
574 : {
575 4706 : if (!(hfValueIn > 0))
576 : {
577 216 : nValueOut = 0;
578 : }
579 4490 : else if (CPLIsInf(hfValueIn))
580 : {
581 67 : nValueOut = cpl::NumericLimits<unsigned short>::max();
582 : }
583 : else
584 : {
585 4423 : nValueOut = static_cast<unsigned short>(hfValueIn + GFloat16(0.5f));
586 : }
587 4706 : }
588 : };
589 :
590 : template <> struct sGDALCopyWord<GFloat16, unsigned int>
591 : {
592 4639 : static inline void f(const GFloat16 hfValueIn, unsigned int &nValueOut)
593 : {
594 4639 : if (!(hfValueIn > 0))
595 : {
596 149 : nValueOut = 0;
597 : }
598 4490 : else if (CPLIsInf(hfValueIn))
599 : {
600 0 : nValueOut = cpl::NumericLimits<unsigned int>::max();
601 : }
602 : else
603 : {
604 4490 : nValueOut = static_cast<unsigned int>(hfValueIn + GFloat16(0.5f));
605 : }
606 4639 : }
607 : };
608 :
609 : // Floating point input to a small signed integer type: nan becomes zero, otherwise round and clamp.
610 : // Rounding for signed integers is different than for the unsigned integers above.
611 :
612 : template <> struct sGDALCopyWord<GFloat16, signed char>
613 : {
614 1088 : static inline void f(const GFloat16 hfValueIn, signed char &nValueOut)
615 : {
616 1088 : if (CPLIsNan(hfValueIn))
617 : {
618 0 : nValueOut = 0;
619 0 : return;
620 : }
621 : GFloat16 hfMaxVal, hfMinVal;
622 1088 : GDALGetDataLimits<GFloat16, signed char>(hfMaxVal, hfMinVal);
623 1088 : GFloat16 hfValue = hfValueIn >= GFloat16(0.0f)
624 619 : ? hfValueIn + GFloat16(0.5f)
625 1088 : : hfValueIn - GFloat16(0.5f);
626 1088 : nValueOut = static_cast<signed char>(
627 2176 : GDALClampValue(hfValue, hfMaxVal, hfMinVal));
628 : }
629 : };
630 :
631 : template <> struct sGDALCopyWord<float, signed char>
632 : {
633 432 : static inline void f(const float fValueIn, signed char &nValueOut)
634 : {
635 432 : if (CPLIsNan(fValueIn))
636 : {
637 0 : nValueOut = 0;
638 0 : return;
639 : }
640 : float fMaxVal, fMinVal;
641 432 : GDALGetDataLimits<float, signed char>(fMaxVal, fMinVal);
642 432 : float fValue = fValueIn >= 0.0f ? fValueIn + 0.5f : fValueIn - 0.5f;
643 432 : nValueOut =
644 432 : static_cast<signed char>(GDALClampValue(fValue, fMaxVal, fMinVal));
645 : }
646 : };
647 :
648 : template <> struct sGDALCopyWord<float, short>
649 : {
650 5102500 : static inline void f(const float fValueIn, short &nValueOut)
651 : {
652 5102500 : if (CPLIsNan(fValueIn))
653 : {
654 0 : nValueOut = 0;
655 0 : return;
656 : }
657 : float fMaxVal, fMinVal;
658 5102500 : GDALGetDataLimits<float, short>(fMaxVal, fMinVal);
659 5102500 : float fValue = fValueIn >= 0.0f ? fValueIn + 0.5f : fValueIn - 0.5f;
660 5102500 : nValueOut =
661 5102500 : static_cast<short>(GDALClampValue(fValue, fMaxVal, fMinVal));
662 : }
663 : };
664 :
665 : template <> struct sGDALCopyWord<double, signed char>
666 : {
667 2512 : static inline void f(const double dfValueIn, signed char &nValueOut)
668 : {
669 2512 : if (CPLIsNan(dfValueIn))
670 : {
671 0 : nValueOut = 0;
672 0 : return;
673 : }
674 : double dfMaxVal, dfMinVal;
675 2512 : GDALGetDataLimits<double, signed char>(dfMaxVal, dfMinVal);
676 2512 : double dfValue = dfValueIn > 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5;
677 2512 : nValueOut = static_cast<signed char>(
678 2512 : GDALClampValue(dfValue, dfMaxVal, dfMinVal));
679 : }
680 : };
681 :
682 : template <> struct sGDALCopyWord<double, short>
683 : {
684 5157450 : static inline void f(const double dfValueIn, short &nValueOut)
685 : {
686 5157450 : if (CPLIsNan(dfValueIn))
687 : {
688 2 : nValueOut = 0;
689 2 : return;
690 : }
691 : double dfMaxVal, dfMinVal;
692 5157450 : GDALGetDataLimits<double, short>(dfMaxVal, dfMinVal);
693 5157450 : double dfValue = dfValueIn > 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5;
694 5157450 : nValueOut =
695 5157450 : static_cast<short>(GDALClampValue(dfValue, dfMaxVal, dfMinVal));
696 : }
697 : };
698 :
699 : template <> struct sGDALCopyWord<double, int>
700 : {
701 77108400 : static inline void f(const double dfValueIn, int &nValueOut)
702 : {
703 77108400 : if (CPLIsNan(dfValueIn))
704 : {
705 0 : nValueOut = 0;
706 0 : return;
707 : }
708 : double dfMaxVal, dfMinVal;
709 77108400 : GDALGetDataLimits<double, int>(dfMaxVal, dfMinVal);
710 77108400 : double dfValue = dfValueIn >= 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5;
711 77108400 : nValueOut =
712 77108400 : static_cast<int>(GDALClampValue(dfValue, dfMaxVal, dfMinVal));
713 : }
714 : };
715 :
716 : // Floating point input to a large signed integer type: nan becomes zero, otherwise round and clamp.
717 : // Rounding for signed integers is different than for the unsigned integers above.
718 : // Avoid roundoff while clamping.
719 :
720 : template <> struct sGDALCopyWord<GFloat16, short>
721 : {
722 5289 : static inline void f(const GFloat16 hfValueIn, short &nValueOut)
723 : {
724 5289 : if (CPLIsNan(hfValueIn))
725 : {
726 0 : nValueOut = 0;
727 : }
728 5289 : else if (hfValueIn >=
729 5289 : static_cast<GFloat16>(cpl::NumericLimits<short>::max()))
730 : {
731 146 : nValueOut = cpl::NumericLimits<short>::max();
732 : }
733 5143 : else if (hfValueIn <=
734 5143 : static_cast<GFloat16>(cpl::NumericLimits<short>::lowest()))
735 : {
736 213 : nValueOut = cpl::NumericLimits<short>::lowest();
737 : }
738 : else
739 : {
740 9860 : nValueOut = static_cast<short>(hfValueIn > GFloat16(0.0f)
741 4930 : ? hfValueIn + GFloat16(0.5f)
742 5451 : : hfValueIn - GFloat16(0.5f));
743 : }
744 5289 : }
745 : };
746 :
747 : template <> struct sGDALCopyWord<float, int>
748 : {
749 10255 : static inline void f(const float fValueIn, int &nValueOut)
750 : {
751 10255 : if (CPLIsNan(fValueIn))
752 : {
753 0 : nValueOut = 0;
754 : }
755 10255 : else if (fValueIn >= static_cast<float>(cpl::NumericLimits<int>::max()))
756 : {
757 218 : nValueOut = cpl::NumericLimits<int>::max();
758 : }
759 10037 : else if (fValueIn <=
760 10037 : static_cast<float>(cpl::NumericLimits<int>::lowest()))
761 : {
762 90 : nValueOut = cpl::NumericLimits<int>::lowest();
763 : }
764 : else
765 : {
766 10166 : nValueOut = static_cast<int>(fValueIn > 0.0f ? fValueIn + 0.5f
767 219 : : fValueIn - 0.5f);
768 : }
769 10255 : }
770 : };
771 :
772 : template <> struct sGDALCopyWord<float, std::int64_t>
773 : {
774 1093 : static inline void f(const float fValueIn, std::int64_t &nValueOut)
775 : {
776 1093 : if (CPLIsNan(fValueIn))
777 : {
778 1 : nValueOut = 0;
779 : }
780 1092 : else if (fValueIn >=
781 1092 : static_cast<float>(cpl::NumericLimits<std::int64_t>::max()))
782 : {
783 2 : nValueOut = cpl::NumericLimits<std::int64_t>::max();
784 : }
785 1090 : else if (fValueIn <=
786 1090 : static_cast<float>(cpl::NumericLimits<std::int64_t>::lowest()))
787 : {
788 2 : nValueOut = cpl::NumericLimits<std::int64_t>::lowest();
789 : }
790 : else
791 : {
792 2176 : nValueOut = static_cast<std::int64_t>(
793 1088 : fValueIn > 0.0f ? fValueIn + 0.5f : fValueIn - 0.5f);
794 : }
795 1093 : }
796 : };
797 :
798 : template <> struct sGDALCopyWord<double, std::int64_t>
799 : {
800 1727 : static inline void f(const double dfValueIn, std::int64_t &nValueOut)
801 : {
802 1727 : if (CPLIsNan(dfValueIn))
803 : {
804 1 : nValueOut = 0;
805 : }
806 1726 : else if (dfValueIn >=
807 1726 : static_cast<double>(cpl::NumericLimits<std::int64_t>::max()))
808 : {
809 4 : nValueOut = cpl::NumericLimits<std::int64_t>::max();
810 : }
811 1722 : else if (dfValueIn <=
812 1722 : static_cast<double>(cpl::NumericLimits<std::int64_t>::min()))
813 : {
814 4 : nValueOut = cpl::NumericLimits<std::int64_t>::min();
815 : }
816 : else
817 : {
818 3436 : nValueOut = static_cast<std::int64_t>(
819 1718 : dfValueIn > 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5);
820 : }
821 1727 : }
822 : };
823 :
824 : // Floating point input to a very large signed integer type: nan becomes zero, otherwise round and clamp.
825 : // Rounding for signed integers is different than for the unsigned integers above.
826 : // Avoid infinity while clamping when the maximum integer is too large for the floating-point type.
827 : // Avoid roundoff while clamping.
828 :
829 : template <> struct sGDALCopyWord<GFloat16, int>
830 : {
831 5222 : static inline void f(const GFloat16 hfValueIn, int &nValueOut)
832 : {
833 5222 : if (CPLIsNan(hfValueIn))
834 : {
835 0 : nValueOut = 0;
836 : }
837 5222 : else if (CPLIsInf(hfValueIn))
838 : {
839 0 : nValueOut = hfValueIn > GFloat16(0.0f)
840 0 : ? cpl::NumericLimits<int>::max()
841 0 : : cpl::NumericLimits<int>::lowest();
842 : }
843 : else
844 : {
845 10444 : nValueOut = static_cast<int>(hfValueIn > GFloat16(0.0f)
846 5222 : ? hfValueIn + GFloat16(0.5f)
847 5889 : : hfValueIn - GFloat16(0.5f));
848 : }
849 5222 : }
850 : };
851 :
852 : template <> struct sGDALCopyWord<GFloat16, std::int64_t>
853 : {
854 1093 : static inline void f(const GFloat16 hfValueIn, std::int64_t &nValueOut)
855 : {
856 1093 : if (CPLIsNan(hfValueIn))
857 : {
858 1 : nValueOut = 0;
859 : }
860 1092 : else if (CPLIsInf(hfValueIn))
861 : {
862 4 : nValueOut = hfValueIn > GFloat16(0.0f)
863 2 : ? cpl::NumericLimits<std::int64_t>::max()
864 1 : : cpl::NumericLimits<std::int64_t>::lowest();
865 : }
866 : else
867 : {
868 1090 : nValueOut = static_cast<std::int64_t>(
869 1090 : hfValueIn > GFloat16(0.0f) ? hfValueIn + GFloat16(0.5f)
870 1560 : : hfValueIn - GFloat16(0.5f));
871 : }
872 1093 : }
873 : };
874 :
875 : /**
876 : * Copy a single word, optionally rounding if appropriate (i.e. going
877 : * from the float to the integer case). Note that this is the function
878 : * you should specialize if you're adding a new data type.
879 : *
880 : * @param tValueIn value of type Tin; the input value to be converted
881 : * @param tValueOut value of type Tout; the output value
882 : */
883 :
884 : template <class Tin, class Tout>
885 491044548 : inline void GDALCopyWord(const Tin tValueIn, Tout &tValueOut)
886 : {
887 : if constexpr (std::is_same<Tin, Tout>::value)
888 42822392 : tValueOut = tValueIn;
889 : else
890 448222156 : sGDALCopyWord<Tin, Tout>::f(tValueIn, tValueOut);
891 491044548 : }
892 :
893 : /************************************************************************/
894 : /* GDALCopy4Words() */
895 : /************************************************************************/
896 : /**
897 : * Copy 4 packed words to 4 packed words, optionally rounding if appropriate
898 : * (i.e. going from the float to the integer case).
899 : *
900 : * @param pValueIn pointer to 4 input values of type Tin.
901 : * @param pValueOut pointer to 4 output values of type Tout.
902 : */
903 :
904 : template <class Tin, class Tout>
905 604 : inline void GDALCopy4Words(const Tin *pValueIn, Tout *const pValueOut)
906 : {
907 604 : GDALCopyWord(pValueIn[0], pValueOut[0]);
908 604 : GDALCopyWord(pValueIn[1], pValueOut[1]);
909 604 : GDALCopyWord(pValueIn[2], pValueOut[2]);
910 604 : GDALCopyWord(pValueIn[3], pValueOut[3]);
911 604 : }
912 :
913 : /************************************************************************/
914 : /* GDALCopy8Words() */
915 : /************************************************************************/
916 : /**
917 : * Copy 8 packed words to 8 packed words, optionally rounding if appropriate
918 : * (i.e. going from the float to the integer case).
919 : *
920 : * @param pValueIn pointer to 8 input values of type Tin.
921 : * @param pValueOut pointer to 8 output values of type Tout.
922 : */
923 :
924 : template <class Tin, class Tout>
925 44584203 : inline void GDALCopy8Words(const Tin *pValueIn, Tout *const pValueOut)
926 : {
927 44584203 : GDALCopy4Words(pValueIn, pValueOut);
928 44584203 : GDALCopy4Words(pValueIn + 4, pValueOut + 4);
929 44584203 : }
930 :
931 : // Needs SSE2
932 : #if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) || \
933 : defined(USE_NEON_OPTIMIZATIONS)
934 :
935 : template <>
936 30528102 : inline void GDALCopy4Words(const float *pValueIn, GByte *const pValueOut)
937 : {
938 30528102 : __m128 xmm = _mm_loadu_ps(pValueIn);
939 :
940 : // The following clamping would be useless due to the final saturating
941 : // packing if we could guarantee the input range in [INT_MIN,INT_MAX]
942 30528102 : const __m128 p0d5 = _mm_set1_ps(0.5f);
943 30528102 : const __m128 xmm_max = _mm_set1_ps(255);
944 30528102 : xmm = _mm_add_ps(xmm, p0d5);
945 61056104 : xmm = _mm_min_ps(_mm_max_ps(xmm, p0d5), xmm_max);
946 :
947 30528102 : __m128i xmm_i = _mm_cvttps_epi32(xmm);
948 :
949 : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
950 : xmm_i = _mm_shuffle_epi8(
951 : xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
952 : #else
953 30528102 : xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
954 30528102 : xmm_i = _mm_packus_epi16(xmm_i, xmm_i); // Pack int16 to uint8
955 : #endif
956 30528102 : GDALCopyXMMToInt32(xmm_i, pValueOut);
957 30528102 : }
958 :
959 43492800 : static inline __m128 GDALIfThenElse(__m128 mask, __m128 thenVal, __m128 elseVal)
960 : {
961 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
962 : return _mm_blendv_ps(elseVal, thenVal, mask);
963 : #else
964 130478000 : return _mm_or_ps(_mm_and_ps(mask, thenVal), _mm_andnot_ps(mask, elseVal));
965 : #endif
966 : }
967 :
968 : template <>
969 190 : inline void GDALCopy4Words(const float *pValueIn, GInt8 *const pValueOut)
970 : {
971 190 : __m128 xmm = _mm_loadu_ps(pValueIn);
972 :
973 190 : const __m128 xmm_min = _mm_set1_ps(-128);
974 190 : const __m128 xmm_max = _mm_set1_ps(127);
975 380 : xmm = _mm_min_ps(_mm_max_ps(xmm, xmm_min), xmm_max);
976 :
977 190 : const __m128 p0d5 = _mm_set1_ps(0.5f);
978 190 : const __m128 m0d5 = _mm_set1_ps(-0.5f);
979 190 : const __m128 mask = _mm_cmpge_ps(xmm, p0d5);
980 : // f >= 0.5f ? f + 0.5f : f - 0.5f
981 380 : xmm = _mm_add_ps(xmm, GDALIfThenElse(mask, p0d5, m0d5));
982 :
983 190 : __m128i xmm_i = _mm_cvttps_epi32(xmm);
984 :
985 : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
986 : xmm_i = _mm_shuffle_epi8(
987 : xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
988 : #else
989 190 : xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
990 190 : xmm_i = _mm_packs_epi16(xmm_i, xmm_i); // Pack int16 to int8
991 : #endif
992 190 : GDALCopyXMMToInt32(xmm_i, pValueOut);
993 190 : }
994 :
995 : template <>
996 3363980 : inline void GDALCopy4Words(const float *pValueIn, GInt16 *const pValueOut)
997 : {
998 3363980 : __m128 xmm = _mm_loadu_ps(pValueIn);
999 :
1000 3363980 : const __m128 xmm_min = _mm_set1_ps(-32768);
1001 3363980 : const __m128 xmm_max = _mm_set1_ps(32767);
1002 6727960 : xmm = _mm_min_ps(_mm_max_ps(xmm, xmm_min), xmm_max);
1003 :
1004 3363980 : const __m128 p0d5 = _mm_set1_ps(0.5f);
1005 3363980 : const __m128 m0d5 = _mm_set1_ps(-0.5f);
1006 3363980 : const __m128 mask = _mm_cmpge_ps(xmm, p0d5);
1007 : // f >= 0.5f ? f + 0.5f : f - 0.5f
1008 6727960 : xmm = _mm_add_ps(xmm, GDALIfThenElse(mask, p0d5, m0d5));
1009 :
1010 3363980 : __m128i xmm_i = _mm_cvttps_epi32(xmm);
1011 :
1012 3363980 : xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
1013 3363980 : GDALCopyXMMToInt64(xmm_i, pValueOut);
1014 3363980 : }
1015 :
1016 : template <>
1017 1 : inline void GDALCopy4Words(const float *pValueIn, GUInt16 *const pValueOut)
1018 : {
1019 1 : __m128 xmm = _mm_loadu_ps(pValueIn);
1020 :
1021 1 : const __m128 p0d5 = _mm_set1_ps(0.5f);
1022 1 : const __m128 xmm_max = _mm_set1_ps(65535);
1023 1 : xmm = _mm_add_ps(xmm, p0d5);
1024 2 : xmm = _mm_min_ps(_mm_max_ps(xmm, p0d5), xmm_max);
1025 :
1026 1 : __m128i xmm_i = _mm_cvttps_epi32(xmm);
1027 :
1028 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
1029 : xmm_i = _mm_packus_epi32(xmm_i, xmm_i); // Pack int32 to uint16
1030 : #else
1031 : // Translate to int16 range because _mm_packus_epi32 is SSE4.1 only
1032 2 : xmm_i = _mm_add_epi32(xmm_i, _mm_set1_epi32(-32768));
1033 1 : xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
1034 : // Translate back to uint16 range (actually -32768==32768 in int16)
1035 1 : xmm_i = _mm_add_epi16(xmm_i, _mm_set1_epi16(-32768));
1036 : #endif
1037 1 : GDALCopyXMMToInt64(xmm_i, pValueOut);
1038 1 : }
1039 :
1040 80342400 : static inline __m128i GDALIfThenElse(__m128i mask, __m128i thenVal,
1041 : __m128i elseVal)
1042 : {
1043 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
1044 : return _mm_blendv_epi8(elseVal, thenVal, mask);
1045 : #else
1046 160685000 : return _mm_or_si128(_mm_and_si128(mask, thenVal),
1047 80342400 : _mm_andnot_si128(mask, elseVal));
1048 : #endif
1049 : }
1050 :
1051 : template <>
1052 40128600 : inline void GDALCopy4Words(const float *pValueIn, GInt32 *const pValueOut)
1053 : {
1054 40128600 : __m128 xmm = _mm_loadu_ps(pValueIn);
1055 40128600 : const __m128 xmm_ori = xmm;
1056 :
1057 40128600 : const __m128 p0d5 = _mm_set1_ps(0.5f);
1058 40128600 : const __m128 m0d5 = _mm_set1_ps(-0.5f);
1059 40128600 : const __m128 mask = _mm_cmpge_ps(xmm, p0d5);
1060 : // f >= 0.5f ? f + 0.5f : f - 0.5f
1061 80257200 : xmm = _mm_add_ps(xmm, GDALIfThenElse(mask, p0d5, m0d5));
1062 :
1063 40128600 : __m128i xmm_i = _mm_cvttps_epi32(xmm);
1064 :
1065 40128600 : const __m128 xmm_min = _mm_set1_ps(-2147483648.0f);
1066 40128600 : const __m128 xmm_max = _mm_set1_ps(2147483648.0f);
1067 40128600 : const __m128i xmm_i_min = _mm_set1_epi32(INT_MIN);
1068 40128600 : const __m128i xmm_i_max = _mm_set1_epi32(INT_MAX);
1069 80257200 : xmm_i = GDALIfThenElse(_mm_castps_si128(_mm_cmpge_ps(xmm_ori, xmm_max)),
1070 : xmm_i_max, xmm_i);
1071 80257200 : xmm_i = GDALIfThenElse(_mm_castps_si128(_mm_cmple_ps(xmm_ori, xmm_min)),
1072 : xmm_i_min, xmm_i);
1073 :
1074 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut), xmm_i);
1075 40128600 : }
1076 :
1077 : // ARM64 has an efficient instruction for Float32 -> Float16
1078 : #if !(defined(HAVE__FLOAT16) && \
1079 : (defined(__aarch64__) && defined(_M_ARM64))) && \
1080 : !(defined(__AVX2__) && defined(__F16C__))
1081 :
1082 630 : inline __m128i GDALFourFloat32ToFloat16(__m128i xmm)
1083 : {
1084 : // Ported from https://github.com/simd-everywhere/simde/blob/51743e7920b6e867678cb50e9c62effe28f70b33/simde/simde-f16.h#L176
1085 : // to SSE2 in a branch-less way
1086 :
1087 : // clang-format off
1088 :
1089 : /* This code is CC0, based heavily on code by Fabian Giesen. */
1090 630 : const __m128i f32u_infinity = _mm_set1_epi32(255 << 23);
1091 630 : const __m128i f16u_max = _mm_set1_epi32((127 + 16) << 23);
1092 630 : const __m128i denorm_magic = _mm_set1_epi32(((127 - 15) + (23 - 10) + 1) << 23);
1093 :
1094 1260 : const auto sign = _mm_and_si128(xmm, _mm_set1_epi32(INT_MIN));
1095 630 : xmm = _mm_xor_si128(xmm, sign);
1096 11340 : xmm = GDALIfThenElse(
1097 : _mm_cmpgt_epi32(xmm, f16u_max),
1098 : /* result is Inf or NaN (all exponent bits set) */
1099 : GDALIfThenElse(
1100 : _mm_cmpgt_epi32(xmm, f32u_infinity),
1101 : /* NaN->qNaN and Inf->Inf */
1102 : _mm_set1_epi32(0x7e00),
1103 : _mm_set1_epi32(0x7c00)),
1104 : /* (De)normalized number or zero */
1105 : GDALIfThenElse(
1106 : _mm_cmplt_epi32(xmm, _mm_set1_epi32(113 << 23)),
1107 : /* use a magic value to align our 10 mantissa bits at the bottom of
1108 : * the float. as long as FP addition is round-to-nearest-even this
1109 : * just works. */
1110 : _mm_sub_epi32(
1111 : _mm_castps_si128(_mm_add_ps(_mm_castsi128_ps(xmm),
1112 : _mm_castsi128_ps(denorm_magic))),
1113 : /* and one integer subtract of the bias later,
1114 : * we have our final float! */
1115 : denorm_magic
1116 : ),
1117 : _mm_srli_epi32(
1118 : _mm_add_epi32(
1119 : /* update exponent, rounding bias part 1 */
1120 : // (unsigned)-0x37fff001 = ((unsigned)(15-127) << 23) + 0xfff
1121 : _mm_add_epi32(xmm, _mm_set1_epi32(-0x37fff001)),
1122 : /* rounding bias part 2, using mant_odd */
1123 : _mm_and_si128(_mm_srli_epi32(xmm, 13), _mm_set1_epi32(1))),
1124 : 13
1125 : )
1126 : )
1127 : );
1128 630 : xmm = _mm_or_si128(xmm, _mm_srli_epi32(sign, 16));
1129 :
1130 : // clang-format on
1131 630 : return xmm;
1132 : }
1133 :
1134 : template <>
1135 315 : inline void GDALCopy8Words(const float *pValueIn, GFloat16 *const pValueOut)
1136 : {
1137 : __m128i xmm_lo =
1138 630 : GDALFourFloat32ToFloat16(_mm_castps_si128(_mm_loadu_ps(pValueIn)));
1139 : __m128i xmm_hi =
1140 945 : GDALFourFloat32ToFloat16(_mm_castps_si128(_mm_loadu_ps(pValueIn + 4)));
1141 :
1142 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
1143 : auto xmm = _mm_packus_epi32(xmm_lo, xmm_hi); // Pack int32 to uint16
1144 : #else
1145 : // Translate to int16 range because _mm_packus_epi32 is SSE4.1 only
1146 630 : xmm_lo = _mm_add_epi32(xmm_lo, _mm_set1_epi32(-32768));
1147 630 : xmm_hi = _mm_add_epi32(xmm_hi, _mm_set1_epi32(-32768));
1148 315 : auto xmm = _mm_packs_epi32(xmm_lo, xmm_hi); // Pack int32 to int16
1149 : // Translate back to uint16 range (actually -32768==32768 in int16)
1150 630 : xmm = _mm_add_epi16(xmm, _mm_set1_epi16(-32768));
1151 : #endif
1152 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut), xmm);
1153 315 : }
1154 :
1155 : #endif
1156 :
1157 : template <>
1158 624470 : inline void GDALCopy4Words(const double *pValueIn, float *const pValueOut)
1159 : {
1160 624470 : const __m128d val01 = _mm_loadu_pd(pValueIn);
1161 1248940 : const __m128d val23 = _mm_loadu_pd(pValueIn + 2);
1162 624470 : const __m128 val01_s = _mm_cvtpd_ps(val01);
1163 624470 : const __m128 val23_s = _mm_cvtpd_ps(val23);
1164 624470 : const __m128 val = _mm_movelh_ps(val01_s, val23_s);
1165 : _mm_storeu_ps(pValueOut, val);
1166 624470 : }
1167 :
1168 : template <>
1169 2776280 : inline void GDALCopy4Words(const double *pValueIn, GByte *const pValueOut)
1170 : {
1171 2776280 : const __m128d p0d5 = _mm_set1_pd(0.5);
1172 2776280 : const __m128d xmm_max = _mm_set1_pd(255);
1173 :
1174 2776280 : __m128d val01 = _mm_loadu_pd(pValueIn);
1175 5552560 : __m128d val23 = _mm_loadu_pd(pValueIn + 2);
1176 2776280 : val01 = _mm_add_pd(val01, p0d5);
1177 5552560 : val01 = _mm_min_pd(_mm_max_pd(val01, p0d5), xmm_max);
1178 2776280 : val23 = _mm_add_pd(val23, p0d5);
1179 5552560 : val23 = _mm_min_pd(_mm_max_pd(val23, p0d5), xmm_max);
1180 :
1181 2776280 : const __m128i val01_u32 = _mm_cvttpd_epi32(val01);
1182 2776280 : const __m128i val23_u32 = _mm_cvttpd_epi32(val23);
1183 :
1184 : // Merge 4 int32 values into a single register
1185 8328840 : auto xmm_i = _mm_castpd_si128(_mm_shuffle_pd(
1186 : _mm_castsi128_pd(val01_u32), _mm_castsi128_pd(val23_u32), 0));
1187 :
1188 : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
1189 : xmm_i = _mm_shuffle_epi8(
1190 : xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
1191 : #else
1192 2776280 : xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
1193 2776280 : xmm_i = _mm_packus_epi16(xmm_i, xmm_i); // Pack int16 to uint8
1194 : #endif
1195 2776280 : GDALCopyXMMToInt32(xmm_i, pValueOut);
1196 2776280 : }
1197 :
1198 : template <>
1199 11746300 : inline void GDALCopy4Words(const float *pValueIn, double *const pValueOut)
1200 : {
1201 11746300 : const __m128 valIn = _mm_loadu_ps(pValueIn);
1202 11746300 : _mm_storeu_pd(pValueOut, _mm_cvtps_pd(valIn));
1203 23492500 : _mm_storeu_pd(pValueOut + 2, _mm_cvtps_pd(_mm_movehl_ps(valIn, valIn)));
1204 11746300 : }
1205 :
1206 : #ifdef __F16C__
1207 : template <>
1208 : inline void GDALCopy4Words(const GFloat16 *pValueIn, float *const pValueOut)
1209 : {
1210 : __m128i xmm = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pValueIn));
1211 : _mm_storeu_ps(pValueOut, _mm_cvtph_ps(xmm));
1212 : }
1213 :
1214 : template <>
1215 : inline void GDALCopy4Words(const float *pValueIn, GFloat16 *const pValueOut)
1216 : {
1217 : __m128 xmm = _mm_loadu_ps(pValueIn);
1218 : GDALCopyXMMToInt64(_mm_cvtps_ph(xmm, _MM_FROUND_TO_NEAREST_INT), pValueOut);
1219 : }
1220 :
1221 : template <>
1222 : inline void GDALCopy4Words(const GFloat16 *pValueIn, double *const pValueOut)
1223 : {
1224 : float tmp[4];
1225 : GDALCopy4Words(pValueIn, tmp);
1226 : GDALCopy4Words(tmp, pValueOut);
1227 : }
1228 :
1229 : template <>
1230 : inline void GDALCopy4Words(const double *pValueIn, GFloat16 *const pValueOut)
1231 : {
1232 : float tmp[4];
1233 : GDALCopy4Words(pValueIn, tmp);
1234 : GDALCopy4Words(tmp, pValueOut);
1235 : }
1236 :
1237 : // ARM64 has an efficient instruction for Float16 -> Float32/Float64
1238 : #elif !(defined(HAVE__FLOAT16) && (defined(__aarch64__) && defined(_M_ARM64)))
1239 :
1240 : // Convert 4 float16 values to 4 float 32 values
1241 : // xmm must contain 4 float16 values stored in 32 bit each (with upper 16 bits at zero)
1242 41668 : static inline __m128i GDALFourFloat16ToFloat32(__m128i xmm)
1243 : {
1244 : // Ported from https://github.com/simd-everywhere/simde/blob/51743e7920b6e867678cb50e9c62effe28f70b33/simde/simde-f16.h#L242C4-L242C68
1245 : // to SSE2 in a branch-less way
1246 :
1247 : /* This code is CC0, based heavily on code by Fabian Giesen. */
1248 : const auto denorm_magic =
1249 83336 : _mm_castsi128_ps(_mm_set1_epi32((128 - 15) << 23));
1250 : const auto shifted_exp =
1251 41668 : _mm_set1_epi32(0x7c00 << 13); /* exponent mask after shift */
1252 :
1253 : // Shift exponent and mantissa bits to their position in a float32
1254 125004 : auto f32u = _mm_slli_epi32(_mm_and_si128(xmm, _mm_set1_epi32(0x7fff)), 13);
1255 : // Extract the (shifted) exponent
1256 41668 : const auto exp = _mm_and_si128(shifted_exp, f32u);
1257 : // Adjust the exponent
1258 41668 : const auto exp_adjustment = _mm_set1_epi32((127 - 15) << 23);
1259 41668 : f32u = _mm_add_epi32(f32u, exp_adjustment);
1260 :
1261 41668 : const auto is_inf_nan = _mm_cmpeq_epi32(exp, shifted_exp); /* Inf/NaN? */
1262 : // When is_inf_nan is true: extra exponent adjustment
1263 41668 : const auto f32u_inf_nan = _mm_add_epi32(f32u, exp_adjustment);
1264 :
1265 : const auto is_denormal =
1266 83336 : _mm_cmpeq_epi32(exp, _mm_setzero_si128()); /* Zero/Denormal? */
1267 : // When is_denormal is true:
1268 83336 : auto f32u_denormal = _mm_add_epi32(f32u, _mm_set1_epi32(1 << 23));
1269 83336 : f32u_denormal = _mm_castps_si128(
1270 : _mm_sub_ps(_mm_castsi128_ps(f32u_denormal), denorm_magic));
1271 :
1272 41668 : f32u = GDALIfThenElse(is_inf_nan, f32u_inf_nan, f32u);
1273 41668 : f32u = GDALIfThenElse(is_denormal, f32u_denormal, f32u);
1274 :
1275 : // Re-apply sign bit
1276 125004 : f32u = _mm_or_si128(
1277 : f32u, _mm_slli_epi32(_mm_and_si128(xmm, _mm_set1_epi32(0x8000)), 16));
1278 41668 : return f32u;
1279 : }
1280 :
1281 : template <>
1282 388 : inline void GDALCopy8Words(const GFloat16 *pValueIn, float *const pValueOut)
1283 : {
1284 388 : __m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
1285 : const auto xmm_0 =
1286 776 : GDALFourFloat16ToFloat32(_mm_unpacklo_epi16(xmm, _mm_setzero_si128()));
1287 : const auto xmm_1 =
1288 776 : GDALFourFloat16ToFloat32(_mm_unpackhi_epi16(xmm, _mm_setzero_si128()));
1289 388 : _mm_storeu_ps(pValueOut + 0, _mm_castsi128_ps(xmm_0));
1290 388 : _mm_storeu_ps(pValueOut + 4, _mm_castsi128_ps(xmm_1));
1291 388 : }
1292 :
1293 : template <>
1294 20446 : inline void GDALCopy8Words(const GFloat16 *pValueIn, double *const pValueOut)
1295 : {
1296 20446 : __m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
1297 61338 : const auto xmm_0 = _mm_castsi128_ps(
1298 : GDALFourFloat16ToFloat32(_mm_unpacklo_epi16(xmm, _mm_setzero_si128())));
1299 61338 : const auto xmm_1 = _mm_castsi128_ps(
1300 : GDALFourFloat16ToFloat32(_mm_unpackhi_epi16(xmm, _mm_setzero_si128())));
1301 20446 : _mm_storeu_pd(pValueOut + 0, _mm_cvtps_pd(xmm_0));
1302 40892 : _mm_storeu_pd(pValueOut + 2, _mm_cvtps_pd(_mm_movehl_ps(xmm_0, xmm_0)));
1303 20446 : _mm_storeu_pd(pValueOut + 4, _mm_cvtps_pd(xmm_1));
1304 40892 : _mm_storeu_pd(pValueOut + 6, _mm_cvtps_pd(_mm_movehl_ps(xmm_1, xmm_1)));
1305 20446 : }
1306 :
1307 : #endif // __F16C__
1308 :
1309 : #ifdef __AVX2__
1310 :
1311 : #include <immintrin.h>
1312 :
1313 : template <>
1314 : inline void GDALCopy8Words(const double *pValueIn, float *const pValueOut)
1315 : {
1316 : const __m256d val0123 = _mm256_loadu_pd(pValueIn);
1317 : const __m256d val4567 = _mm256_loadu_pd(pValueIn + 4);
1318 : const __m256 val0123_s = _mm256_castps128_ps256(_mm256_cvtpd_ps(val0123));
1319 : const __m256 val4567_s = _mm256_castps128_ps256(_mm256_cvtpd_ps(val4567));
1320 : const __m256 val =
1321 : _mm256_permute2f128_ps(val0123_s, val4567_s, 0 | (2 << 4));
1322 : _mm256_storeu_ps(pValueOut, val);
1323 : }
1324 :
1325 : template <>
1326 : inline void GDALCopy8Words(const float *pValueIn, double *const pValueOut)
1327 : {
1328 : const __m256 valIn = _mm256_loadu_ps(pValueIn);
1329 : _mm256_storeu_pd(pValueOut, _mm256_cvtps_pd(_mm256_castps256_ps128(valIn)));
1330 : _mm256_storeu_pd(pValueOut + 4,
1331 : _mm256_cvtps_pd(_mm256_castps256_ps128(
1332 : _mm256_permute2f128_ps(valIn, valIn, 1))));
1333 : }
1334 :
1335 : #ifdef __F16C__
1336 :
1337 : template <>
1338 : inline void GDALCopy8Words(const GFloat16 *pValueIn, float *const pValueOut)
1339 : {
1340 : __m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
1341 : _mm256_storeu_ps(pValueOut, _mm256_cvtph_ps(xmm));
1342 : }
1343 :
1344 : template <>
1345 : inline void GDALCopy8Words(const float *pValueIn, GFloat16 *const pValueOut)
1346 : {
1347 : __m256 ymm = _mm256_loadu_ps(pValueIn);
1348 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut),
1349 : _mm256_cvtps_ph(ymm, _MM_FROUND_TO_NEAREST_INT));
1350 : }
1351 :
1352 : template <>
1353 : inline void GDALCopy8Words(const GFloat16 *pValueIn, double *const pValueOut)
1354 : {
1355 : __m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
1356 : const auto ymm = _mm256_cvtph_ps(xmm);
1357 : _mm256_storeu_pd(pValueOut, _mm256_cvtps_pd(_mm256_extractf128_ps(ymm, 0)));
1358 : _mm256_storeu_pd(pValueOut + 4,
1359 : _mm256_cvtps_pd(_mm256_extractf128_ps(ymm, 1)));
1360 : }
1361 :
1362 : template <>
1363 : inline void GDALCopy8Words(const double *pValueIn, GFloat16 *const pValueOut)
1364 : {
1365 : __m256d ymm0 = _mm256_loadu_pd(pValueIn);
1366 : __m256d ymm1 = _mm256_loadu_pd(pValueIn + 4);
1367 : __m256 ymm = _mm256_set_m128(_mm256_cvtpd_ps(ymm1), _mm256_cvtpd_ps(ymm0));
1368 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut),
1369 : _mm256_cvtps_ph(ymm, _MM_FROUND_TO_NEAREST_INT));
1370 : }
1371 :
1372 : #endif
1373 :
1374 : template <>
1375 : inline void GDALCopy8Words(const float *pValueIn, GByte *const pValueOut)
1376 : {
1377 : __m256 ymm = _mm256_loadu_ps(pValueIn);
1378 :
1379 : const __m256 p0d5 = _mm256_set1_ps(0.5f);
1380 : const __m256 ymm_max = _mm256_set1_ps(255);
1381 : ymm = _mm256_add_ps(ymm, p0d5);
1382 : ymm = _mm256_min_ps(_mm256_max_ps(ymm, p0d5), ymm_max);
1383 :
1384 : __m256i ymm_i = _mm256_cvttps_epi32(ymm);
1385 :
1386 : ymm_i = _mm256_packus_epi32(ymm_i, ymm_i); // Pack int32 to uint16
1387 : ymm_i = _mm256_permute4x64_epi64(ymm_i, 0 | (2 << 2)); // AVX2
1388 :
1389 : __m128i xmm_i = _mm256_castsi256_si128(ymm_i);
1390 : xmm_i = _mm_packus_epi16(xmm_i, xmm_i);
1391 : GDALCopyXMMToInt64(xmm_i, pValueOut);
1392 : }
1393 :
1394 : template <>
1395 : inline void GDALCopy8Words(const float *pValueIn, GUInt16 *const pValueOut)
1396 : {
1397 : __m256 ymm = _mm256_loadu_ps(pValueIn);
1398 :
1399 : const __m256 p0d5 = _mm256_set1_ps(0.5f);
1400 : const __m256 ymm_max = _mm256_set1_ps(65535);
1401 : ymm = _mm256_add_ps(ymm, p0d5);
1402 : ymm = _mm256_min_ps(_mm256_max_ps(ymm, p0d5), ymm_max);
1403 :
1404 : __m256i ymm_i = _mm256_cvttps_epi32(ymm);
1405 :
1406 : ymm_i = _mm256_packus_epi32(ymm_i, ymm_i); // Pack int32 to uint16
1407 : ymm_i = _mm256_permute4x64_epi64(ymm_i, 0 | (2 << 2)); // AVX2
1408 :
1409 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut),
1410 : _mm256_castsi256_si128(ymm_i));
1411 : }
1412 : #else
1413 : template <>
1414 7785681 : inline void GDALCopy8Words(const float *pValueIn, GUInt16 *const pValueOut)
1415 : {
1416 7785681 : __m128 xmm = _mm_loadu_ps(pValueIn);
1417 15571402 : __m128 xmm1 = _mm_loadu_ps(pValueIn + 4);
1418 :
1419 7785681 : const __m128 p0d5 = _mm_set1_ps(0.5f);
1420 7785681 : const __m128 xmm_max = _mm_set1_ps(65535);
1421 7785681 : xmm = _mm_add_ps(xmm, p0d5);
1422 7785681 : xmm1 = _mm_add_ps(xmm1, p0d5);
1423 15571402 : xmm = _mm_min_ps(_mm_max_ps(xmm, p0d5), xmm_max);
1424 15571402 : xmm1 = _mm_min_ps(_mm_max_ps(xmm1, p0d5), xmm_max);
1425 :
1426 7785681 : __m128i xmm_i = _mm_cvttps_epi32(xmm);
1427 7785681 : __m128i xmm1_i = _mm_cvttps_epi32(xmm1);
1428 :
1429 : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
1430 : xmm_i = _mm_packus_epi32(xmm_i, xmm1_i); // Pack int32 to uint16
1431 : #else
1432 : // Translate to int16 range because _mm_packus_epi32 is SSE4.1 only
1433 15571402 : xmm_i = _mm_add_epi32(xmm_i, _mm_set1_epi32(-32768));
1434 15571402 : xmm1_i = _mm_add_epi32(xmm1_i, _mm_set1_epi32(-32768));
1435 7785681 : xmm_i = _mm_packs_epi32(xmm_i, xmm1_i); // Pack int32 to int16
1436 : // Translate back to uint16 range (actually -32768==32768 in int16)
1437 15571402 : xmm_i = _mm_add_epi16(xmm_i, _mm_set1_epi16(-32768));
1438 : #endif
1439 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut), xmm_i);
1440 7785681 : }
1441 : #endif
1442 :
1443 : // ARM64 has an efficient instruction for Float64 -> Float16
1444 : #if !(defined(HAVE__FLOAT16) && \
1445 : (defined(__aarch64__) && defined(_M_ARM64))) && \
1446 : !(defined(__AVX2__) && defined(__F16C__))
1447 : template <>
1448 235 : inline void GDALCopy8Words(const double *pValueIn, GFloat16 *const pValueOut)
1449 : {
1450 : float fVal[8];
1451 235 : GDALCopy8Words(pValueIn, fVal);
1452 235 : GDALCopy8Words(fVal, pValueOut);
1453 235 : }
1454 : #endif
1455 :
1456 : #endif // defined(__x86_64) || defined(_M_X64)
1457 :
1458 : #endif // GDAL_PRIV_TEMPLATES_HPP_INCLUDED
|