LCOV - code coverage report
Current view: top level - gcore - gdal_priv_templates.hpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 449 468 95.9 %
Date: 2026-04-22 14:22:58 Functions: 393 547 71.8 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  GDAL Core
       4             :  * Purpose:  Inline C++ templates
       5             :  * Author:   Phil Vachon, <philippe at cowpig.ca>
       6             :  *
       7             :  ******************************************************************************
       8             :  * Copyright (c) 2009, Phil Vachon, <philippe at cowpig.ca>
       9             :  * Copyright (c) 2025, Even Rouault, <even.rouault at spatialys.com>
      10             :  *
      11             :  * SPDX-License-Identifier: MIT
      12             :  ****************************************************************************/
      13             : 
      14             : #ifndef GDAL_PRIV_TEMPLATES_HPP_INCLUDED
      15             : #define GDAL_PRIV_TEMPLATES_HPP_INCLUDED
      16             : 
      17             : #include "cpl_port.h"
      18             : 
      19             : #include <algorithm>
      20             : #include <cmath>
      21             : #include <cstdint>
      22             : #include <limits>
      23             : #include <type_traits>
      24             : 
      25             : #include "cpl_float.h"
      26             : 
      27             : // Needs SSE2
      28             : #if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) ||               \
      29             :     defined(USE_NEON_OPTIMIZATIONS)
      30             : 
      31             : #ifdef USE_NEON_OPTIMIZATIONS
      32             : #include "include_sse2neon.h"
      33             : #else
      34             : #include <immintrin.h>
      35             : #endif
      36             : 
      37    40217145 : static inline void GDALCopyXMMToInt32(const __m128i xmm, void *pDest)
      38             : {
      39    40217145 :     int n32 = _mm_cvtsi128_si32(xmm);  // Extract lower 32 bit word
      40    40217145 :     memcpy(pDest, &n32, sizeof(n32));
      41    40217145 : }
      42             : 
      43   106492321 : static inline void GDALCopyXMMToInt64(const __m128i xmm, void *pDest)
      44             : {
      45             :     _mm_storel_epi64(reinterpret_cast<__m128i *>(pDest), xmm);
      46   106492321 : }
      47             : 
      48             : #if __SSSE3__
      49             : #include <tmmintrin.h>
      50             : #endif
      51             : 
      52             : #if defined(__SSE4_1__) || defined(__AVX__)
      53             : #include <smmintrin.h>
      54             : #endif
      55             : 
      56             : #ifdef __F16C__
      57             : #include <immintrin.h>
      58             : #endif
      59             : 
      60             : #endif
      61             : 
      62             : /************************************************************************/
      63             : /*                         GDALGetDataLimits()                          */
      64             : /************************************************************************/
      65             : /**
      66             :  * Compute the limits of values that can be placed in Tout in terms of
      67             :  * Tin. Usually used for output clamping, when the output data type's
      68             :  * limits are stable relative to the input type (i.e. no roundoff error).
      69             :  *
      70             :  * @param tMaxValue the returned maximum value
      71             :  * @param tMinValue the returned minimum value
      72             :  */
      73             : 
      74             : template <class Tin, class Tout>
      75   352246142 : inline void GDALGetDataLimits(Tin &tMaxValue, Tin &tMinValue)
      76             : {
      77   352246142 :     tMaxValue = cpl::NumericLimits<Tin>::max();
      78   352246142 :     tMinValue = cpl::NumericLimits<Tin>::lowest();
      79             : 
      80             :     // Compute the actual minimum value of Tout in terms of Tin.
      81             :     if constexpr (cpl::NumericLimits<Tout>::is_signed &&
      82             :                   cpl::NumericLimits<Tout>::is_integer)
      83             :     {
      84             :         // the minimum value is less than zero
      85             :         // cppcheck-suppress knownConditionTrueFalse
      86             :         if constexpr (cpl::NumericLimits<Tout>::digits <
      87             :                           cpl::NumericLimits<Tin>::digits ||
      88             :                       !cpl::NumericLimits<Tin>::is_integer)
      89             :         {
      90             :             // Tout is smaller than Tin, so we need to clamp values in input
      91             :             // to the range of Tout's min/max values
      92             :             if constexpr (cpl::NumericLimits<Tin>::is_signed)
      93             :             {
      94    87464460 :                 tMinValue =
      95    87464460 :                     static_cast<Tin>(cpl::NumericLimits<Tout>::lowest());
      96             :             }
      97    87985123 :             tMaxValue = static_cast<Tin>(cpl::NumericLimits<Tout>::max());
      98             :         }
      99             :     }
     100             :     else if constexpr (cpl::NumericLimits<Tout>::is_integer)
     101             :     {
     102             :         // the output is unsigned, so we just need to determine the max
     103             :         if constexpr (!std::is_same_v<Tin, Tout> &&
     104             :                       cpl::NumericLimits<Tout>::digits <=
     105             :                           cpl::NumericLimits<Tin>::digits)
     106             :         {
     107             :             // Tout is smaller than Tin, so we need to clamp the input values
     108             :             // to the range of Tout's max
     109   104644102 :             tMaxValue = static_cast<Tin>(cpl::NumericLimits<Tout>::max());
     110             :         }
     111   107597980 :         tMinValue = 0;
     112             :     }
     113   352246142 : }
     114             : 
     115             : /************************************************************************/
     116             : /*                           GDALClampValue()                           */
     117             : /************************************************************************/
     118             : /**
     119             :  * Clamp values of type T to a specified range
     120             :  *
     121             :  * @param tValue the value
     122             :  * @param tMax the max value
     123             :  * @param tMin the min value
     124             :  */
     125             : template <class T>
     126   352047047 : inline T GDALClampValue(const T tValue, const T tMax, const T tMin)
     127             : {
     128   352047047 :     return tValue > tMax ? tMax : tValue < tMin ? tMin : tValue;
     129             : }
     130             : 
     131             : /************************************************************************/
     132             : /*                        GDALClampDoubleValue()                        */
     133             : /************************************************************************/
     134             : /**
     135             :  * Clamp double values to a specified range, this uses the same
     136             :  * argument ordering as std::clamp, returns TRUE if the value was clamped.
     137             :  *
     138             :  * @param tValue the value
     139             :  * @param tMin the min value
     140             :  * @param tMax the max value
     141             :  *
     142             :  */
     143             : template <class T2, class T3>
     144         309 : inline bool GDALClampDoubleValue(double &tValue, const T2 tMin, const T3 tMax)
     145             : {
     146         309 :     const double tMin2{static_cast<double>(tMin)};
     147         309 :     const double tMax2{static_cast<double>(tMax)};
     148         309 :     if (tValue > tMax2 || tValue < tMin2)
     149             :     {
     150          26 :         tValue = tValue > tMax2 ? tMax2 : tValue < tMin2 ? tMin2 : tValue;
     151          26 :         return true;
     152             :     }
     153             :     else
     154             :     {
     155         283 :         return false;
     156             :     }
     157             : }
     158             : 
     159             : /************************************************************************/
     160             : /*                        GDALClampValueToType()                        */
     161             : /************************************************************************/
     162             : /**
     163             :  * Clamp a value of type T to the limits of type TClamped.
     164             :  *
     165             :  * @param tValue the value
     166             :  */
     167      198828 : template <class T, class TClamped> inline T GDALClampValueToType(T tValue)
     168             : {
     169             :     T tMaxValue, tMinValue;
     170      198828 :     GDALGetDataLimits<T, TClamped>(tMaxValue, tMinValue);
     171      198828 :     return std::clamp(tValue, tMinValue, tMaxValue);
     172             : }
     173             : 
     174             : /************************************************************************/
     175             : /*                         GDALIsValueInRange()                         */
     176             : /************************************************************************/
     177             : /**
     178             :  * Returns whether a value is in the type range.
     179             :  * NaN is considered not to be in type range.
     180             :  *
     181             :  * @param dfValue the value
     182             :  * @return whether the value is in the type range.
     183             :  */
     184      156599 : template <class T> inline bool GDALIsValueInRange(double dfValue)
     185             : {
     186      313157 :     return dfValue >= static_cast<double>(cpl::NumericLimits<T>::lowest()) &&
     187      313157 :            dfValue <= static_cast<double>(cpl::NumericLimits<T>::max());
     188             : }
     189             : 
     190          27 : template <> inline bool GDALIsValueInRange<double>(double dfValue)
     191             : {
     192          27 :     return !CPLIsNan(dfValue);
     193             : }
     194             : 
     195       46944 : template <> inline bool GDALIsValueInRange<float>(double dfValue)
     196             : {
     197       93761 :     return CPLIsInf(dfValue) ||
     198             :            (dfValue >=
     199       46817 :                 -static_cast<double>(std::numeric_limits<float>::max()) &&
     200       93753 :             dfValue <= static_cast<double>(std::numeric_limits<float>::max()));
     201             : }
     202             : 
     203         120 : template <> inline bool GDALIsValueInRange<GFloat16>(double dfValue)
     204             : {
     205         360 :     return CPLIsInf(dfValue) ||
     206         240 :            (dfValue >= -cpl::NumericLimits<GFloat16>::max() &&
     207         240 :             dfValue <= cpl::NumericLimits<GFloat16>::max());
     208             : }
     209             : 
     210       16925 : template <> inline bool GDALIsValueInRange<int64_t>(double dfValue)
     211             : {
     212             :     // Values in the range [INT64_MAX - 1023, INT64_MAX - 1]
     213             :     // get converted to a double that once cast to int64_t is
     214             :     // INT64_MAX + 1, hence the < strict comparison.
     215             :     return dfValue >=
     216       33849 :                static_cast<double>(cpl::NumericLimits<int64_t>::lowest()) &&
     217       33849 :            dfValue < static_cast<double>(cpl::NumericLimits<int64_t>::max());
     218             : }
     219             : 
     220        8968 : template <> inline bool GDALIsValueInRange<uint64_t>(double dfValue)
     221             : {
     222             :     // Values in the range [UINT64_MAX - 2047, UINT64_MAX - 1]
     223             :     // get converted to a double that once cast to uint64_t is
     224             :     // UINT64_MAX + 1, hence the < strict comparison.
     225       17933 :     return dfValue >= 0 &&
     226       17933 :            dfValue < static_cast<double>(cpl::NumericLimits<uint64_t>::max());
     227             : }
     228             : 
     229             : /************************************************************************/
     230             : /*                         GDALIsValueExactAs()                         */
     231             : /************************************************************************/
     232             : /**
     233             :  * Returns whether a value can be exactly represented on type T.
     234             :  *
     235             :  * That is static_cast\<double\>(static_cast\<T\>(dfValue)) is legal and is
     236             :  * equal to dfValue.
     237             :  *
     238             :  * Note: for T=float or double, a NaN input leads to true
     239             :  *
     240             :  * @param dfValue the value
     241             :  * @return whether the value can be exactly represented on type T.
     242             :  */
     243        7849 : template <class T> inline bool GDALIsValueExactAs(double dfValue)
     244             : {
     245       15644 :     return GDALIsValueInRange<T>(dfValue) &&
     246       15644 :            static_cast<double>(static_cast<T>(dfValue)) == dfValue;
     247             : }
     248             : 
     249         174 : template <> inline bool GDALIsValueExactAs<float>(double dfValue)
     250             : {
     251         342 :     return CPLIsNan(dfValue) ||
     252         168 :            (GDALIsValueInRange<float>(dfValue) &&
     253         338 :             static_cast<double>(static_cast<float>(dfValue)) == dfValue);
     254             : }
     255             : 
     256           7 : template <> inline bool GDALIsValueExactAs<GFloat16>(double dfValue)
     257             : {
     258          14 :     return CPLIsNan(dfValue) ||
     259           7 :            (GDALIsValueInRange<GFloat16>(dfValue) &&
     260          14 :             static_cast<double>(static_cast<GFloat16>(dfValue)) == dfValue);
     261             : }
     262             : 
     263          21 : template <> inline bool GDALIsValueExactAs<double>(double)
     264             : {
     265          21 :     return true;
     266             : }
     267             : 
     268             : /************************************************************************/
     269             : /*                            GDALCopyWord()                            */
     270             : /************************************************************************/
     271             : 
     272             : // Integer input and output: clamp the input
     273             : 
     274             : template <class Tin, class Tout> struct sGDALCopyWord
     275             : {
     276   176202192 :     static inline void f(const Tin tValueIn, Tout &tValueOut)
     277             :     {
     278             :         Tin tMaxVal, tMinVal;
     279   176202192 :         GDALGetDataLimits<Tin, Tout>(tMaxVal, tMinVal);
     280   176202192 :         tValueOut =
     281   176202192 :             static_cast<Tout>(GDALClampValue(tValueIn, tMaxVal, tMinVal));
     282   176202192 :     }
     283             : };
     284             : 
     285             : // Integer input and floating point output: simply convert
     286             : 
     287             : template <class Tin> struct sGDALCopyWord<Tin, GFloat16>
     288             : {
     289      270266 :     static inline void f(const Tin tValueIn, GFloat16 &hfValueOut)
     290             :     {
     291      270266 :         hfValueOut = static_cast<GFloat16>(tValueIn);
     292      270266 :     }
     293             : };
     294             : 
     295             : template <class Tin> struct sGDALCopyWord<Tin, float>
     296             : {
     297     8027797 :     static inline void f(const Tin tValueIn, float &fValueOut)
     298             :     {
     299     8027797 :         fValueOut = static_cast<float>(tValueIn);
     300     8027797 :     }
     301             : };
     302             : 
     303             : template <class Tin> struct sGDALCopyWord<Tin, double>
     304             : {
     305    87057931 :     static inline void f(const Tin tValueIn, double &dfValueOut)
     306             :     {
     307    87057931 :         dfValueOut = static_cast<double>(tValueIn);
     308    87057931 :     }
     309             : };
     310             : 
     311             : // Floating point input and output, converting between identical types: simply copy
     312             : 
     313             : template <> struct sGDALCopyWord<GFloat16, GFloat16>
     314             : {
     315             :     static inline void f(const GFloat16 hfValueIn, GFloat16 &hfValueOut)
     316             :     {
     317             :         hfValueOut = hfValueIn;
     318             :     }
     319             : };
     320             : 
     321             : template <> struct sGDALCopyWord<float, float>
     322             : {
     323             :     static inline void f(const float fValueIn, float &fValueOut)
     324             :     {
     325             :         fValueOut = fValueIn;
     326             :     }
     327             : };
     328             : 
     329             : template <> struct sGDALCopyWord<double, double>
     330             : {
     331             :     static inline void f(const double dfValueIn, double &dfValueOut)
     332             :     {
     333             :         dfValueOut = dfValueIn;
     334             :     }
     335             : };
     336             : 
     337             : // Floating point input and output, converting to a larger type: use implicit conversion
     338             : 
     339             : template <> struct sGDALCopyWord<GFloat16, float>
     340             : {
     341        5206 :     static inline void f(const GFloat16 hfValueIn, float &dfValueOut)
     342             :     {
     343        5206 :         dfValueOut = hfValueIn;
     344        5206 :     }
     345             : };
     346             : 
     347             : template <> struct sGDALCopyWord<GFloat16, double>
     348             : {
     349        4174 :     static inline void f(const GFloat16 hfValueIn, double &dfValueOut)
     350             :     {
     351        4174 :         dfValueOut = hfValueIn;
     352        4174 :     }
     353             : };
     354             : 
     355             : template <> struct sGDALCopyWord<float, double>
     356             : {
     357      636723 :     static inline void f(const float fValueIn, double &dfValueOut)
     358             :     {
     359      636723 :         dfValueOut = static_cast<double>(fValueIn);
     360      636723 :     }
     361             : };
     362             : 
     363             : // Floating point input and out, converting to a smaller type: ensure overflow results in infinity
     364             : 
     365             : template <> struct sGDALCopyWord<float, GFloat16>
     366             : {
     367         497 :     static inline void f(const float fValueIn, GFloat16 &hfValueOut)
     368             :     {
     369             :         // Our custom implementation when std::float16_t is not
     370             :         // available ensures proper behavior.
     371             : #if !defined(HAVE_STD_FLOAT16_T)
     372         497 :         if (fValueIn > cpl::NumericLimits<GFloat16>::max())
     373             :         {
     374          27 :             hfValueOut = cpl::NumericLimits<GFloat16>::infinity();
     375          27 :             return;
     376             :         }
     377         470 :         if (fValueIn < -cpl::NumericLimits<GFloat16>::max())
     378             :         {
     379          15 :             hfValueOut = -cpl::NumericLimits<GFloat16>::infinity();
     380          15 :             return;
     381             :         }
     382             : #endif
     383         455 :         hfValueOut = static_cast<GFloat16>(fValueIn);
     384             :     }
     385             : };
     386             : 
     387             : template <> struct sGDALCopyWord<double, GFloat16>
     388             : {
     389         349 :     static inline void f(const double dfValueIn, GFloat16 &hfValueOut)
     390             :     {
     391             :         // Our custom implementation when std::float16_t is not
     392             :         // available ensures proper behavior.
     393             : #if !defined(HAVE_STD_FLOAT16_T)
     394         349 :         if (dfValueIn > cpl::NumericLimits<GFloat16>::max())
     395             :         {
     396          14 :             hfValueOut = cpl::NumericLimits<GFloat16>::infinity();
     397          14 :             return;
     398             :         }
     399         335 :         if (dfValueIn < -cpl::NumericLimits<GFloat16>::max())
     400             :         {
     401          14 :             hfValueOut = -cpl::NumericLimits<GFloat16>::infinity();
     402          14 :             return;
     403             :         }
     404             : #endif
     405         321 :         hfValueOut = static_cast<GFloat16>(dfValueIn);
     406             :     }
     407             : };
     408             : 
     409             : template <> struct sGDALCopyWord<double, float>
     410             : {
     411      130340 :     static inline void f(const double dfValueIn, float &fValueOut)
     412             :     {
     413             : #if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)
     414             :         // We could just write fValueOut = static_cast<float>(dfValueIn);
     415             :         // but a sanitizer might complain with values above FLT_MAX
     416      391020 :         _mm_store_ss(&fValueOut,
     417             :                      _mm_cvtsd_ss(_mm_undefined_ps(), _mm_load_sd(&dfValueIn)));
     418             : #else
     419             :         if (dfValueIn > static_cast<double>(std::numeric_limits<float>::max()))
     420             :         {
     421             :             fValueOut = std::numeric_limits<float>::infinity();
     422             :             return;
     423             :         }
     424             :         if (dfValueIn < static_cast<double>(-std::numeric_limits<float>::max()))
     425             :         {
     426             :             fValueOut = -std::numeric_limits<float>::infinity();
     427             :             return;
     428             :         }
     429             : 
     430             :         fValueOut = static_cast<float>(dfValueIn);
     431             : #endif
     432      130340 :     }
     433             : };
     434             : 
     435             : // Floating point input to a small unsigned integer type: nan becomes zero, otherwise round and clamp
     436             : 
     437             : template <class Tout> struct sGDALCopyWord<GFloat16, Tout>
     438             : {
     439        5032 :     static inline void f(const GFloat16 hfValueIn, Tout &tValueOut)
     440             :     {
     441        5032 :         if (CPLIsNan(hfValueIn))
     442             :         {
     443           1 :             tValueOut = 0;
     444           1 :             return;
     445             :         }
     446             :         GFloat16 hfMaxVal, hfMinVal;
     447        5031 :         GDALGetDataLimits<GFloat16, Tout>(hfMaxVal, hfMinVal);
     448        5031 :         tValueOut = static_cast<Tout>(
     449       10062 :             GDALClampValue(hfValueIn + GFloat16(0.5f), hfMaxVal, hfMinVal));
     450             :     }
     451             : };
     452             : 
     453             : template <class Tout> struct sGDALCopyWord<float, Tout>
     454             : {
     455     5471730 :     static inline void f(const float fValueIn, Tout &tValueOut)
     456             :     {
     457     5471730 :         if (CPLIsNan(fValueIn))
     458             :         {
     459           0 :             tValueOut = 0;
     460           0 :             return;
     461             :         }
     462             :         float fMaxVal, fMinVal;
     463     5471730 :         GDALGetDataLimits<float, Tout>(fMaxVal, fMinVal);
     464     5471730 :         tValueOut = static_cast<Tout>(
     465     5471730 :             GDALClampValue(fValueIn + 0.5f, fMaxVal, fMinVal));
     466             :     }
     467             : };
     468             : 
     469             : template <class Tout> struct sGDALCopyWord<double, Tout>
     470             : {
     471    82995982 :     static inline void f(const double dfValueIn, Tout &tValueOut)
     472             :     {
     473    82995982 :         if (CPLIsNan(dfValueIn))
     474             :         {
     475           6 :             tValueOut = 0;
     476           6 :             return;
     477             :         }
     478             :         double dfMaxVal, dfMinVal;
     479    82995981 :         GDALGetDataLimits<double, Tout>(dfMaxVal, dfMinVal);
     480    82995981 :         tValueOut = static_cast<Tout>(
     481    82995981 :             GDALClampValue(dfValueIn + 0.5, dfMaxVal, dfMinVal));
     482             :     }
     483             : };
     484             : 
     485             : // Floating point input to a large unsigned integer type: nan becomes zero, otherwise round and clamp.
     486             : // Avoid roundoff while clamping.
     487             : 
     488             : template <> struct sGDALCopyWord<GFloat16, std::uint64_t>
     489             : {
     490         624 :     static inline void f(const GFloat16 hfValueIn, std::uint64_t &nValueOut)
     491             :     {
     492         624 :         if (!(hfValueIn > 0))
     493             :         {
     494           4 :             nValueOut = 0;
     495             :         }
     496         620 :         else if (CPLIsInf(hfValueIn))
     497             :         {
     498           1 :             nValueOut = cpl::NumericLimits<std::uint64_t>::max();
     499             :         }
     500             :         else
     501             :         {
     502         619 :             nValueOut = static_cast<std::uint64_t>(hfValueIn + GFloat16(0.5f));
     503             :         }
     504         624 :     }
     505             : };
     506             : 
     507             : template <> struct sGDALCopyWord<float, unsigned int>
     508             : {
     509        5039 :     static inline void f(const float fValueIn, unsigned int &nValueOut)
     510             :     {
     511        5039 :         if (!(fValueIn > 0))
     512             :         {
     513         149 :             nValueOut = 0;
     514             :         }
     515        4890 :         else if (fValueIn >=
     516        4890 :                  static_cast<float>(cpl::NumericLimits<unsigned int>::max()))
     517             :         {
     518         134 :             nValueOut = cpl::NumericLimits<unsigned int>::max();
     519             :         }
     520             :         else
     521             :         {
     522        4756 :             nValueOut = static_cast<unsigned int>(fValueIn + 0.5f);
     523             :         }
     524        5039 :     }
     525             : };
     526             : 
     527             : template <> struct sGDALCopyWord<float, std::uint64_t>
     528             : {
     529         624 :     static inline void f(const float fValueIn, std::uint64_t &nValueOut)
     530             :     {
     531         624 :         if (!(fValueIn > 0))
     532             :         {
     533           4 :             nValueOut = 0;
     534             :         }
     535         620 :         else if (fValueIn >=
     536         620 :                  static_cast<float>(cpl::NumericLimits<std::uint64_t>::max()))
     537             :         {
     538           2 :             nValueOut = cpl::NumericLimits<std::uint64_t>::max();
     539             :         }
     540             :         else
     541             :         {
     542         618 :             nValueOut = static_cast<std::uint64_t>(fValueIn + 0.5f);
     543             :         }
     544         624 :     }
     545             : };
     546             : 
     547             : template <> struct sGDALCopyWord<double, std::uint64_t>
     548             : {
     549        1244 :     static inline void f(const double dfValueIn, std::uint64_t &nValueOut)
     550             :     {
     551        1244 :         if (!(dfValueIn > 0))
     552             :         {
     553         292 :             nValueOut = 0;
     554             :         }
     555         952 :         else if (dfValueIn >
     556         952 :                  static_cast<double>(cpl::NumericLimits<uint64_t>::max()))
     557             :         {
     558           4 :             nValueOut = cpl::NumericLimits<uint64_t>::max();
     559             :         }
     560             :         else
     561             :         {
     562         948 :             nValueOut = static_cast<std::uint64_t>(dfValueIn + 0.5);
     563             :         }
     564        1244 :     }
     565             : };
     566             : 
     567             : // Floating point input to a very large unsigned integer type: nan becomes zero, otherwise round and clamp.
     568             : // Avoid infinity while clamping when the maximum integer is too large for the floating-point type.
     569             : // Avoid roundoff while clamping.
     570             : 
     571             : template <> struct sGDALCopyWord<GFloat16, unsigned short>
     572             : {
     573        4706 :     static inline void f(const GFloat16 hfValueIn, unsigned short &nValueOut)
     574             :     {
     575        4706 :         if (!(hfValueIn > 0))
     576             :         {
     577         216 :             nValueOut = 0;
     578             :         }
     579        4490 :         else if (CPLIsInf(hfValueIn))
     580             :         {
     581          67 :             nValueOut = cpl::NumericLimits<unsigned short>::max();
     582             :         }
     583             :         else
     584             :         {
     585        4423 :             nValueOut = static_cast<unsigned short>(hfValueIn + GFloat16(0.5f));
     586             :         }
     587        4706 :     }
     588             : };
     589             : 
     590             : template <> struct sGDALCopyWord<GFloat16, unsigned int>
     591             : {
     592        4639 :     static inline void f(const GFloat16 hfValueIn, unsigned int &nValueOut)
     593             :     {
     594        4639 :         if (!(hfValueIn > 0))
     595             :         {
     596         149 :             nValueOut = 0;
     597             :         }
     598        4490 :         else if (CPLIsInf(hfValueIn))
     599             :         {
     600           0 :             nValueOut = cpl::NumericLimits<unsigned int>::max();
     601             :         }
     602             :         else
     603             :         {
     604        4490 :             nValueOut = static_cast<unsigned int>(hfValueIn + GFloat16(0.5f));
     605             :         }
     606        4639 :     }
     607             : };
     608             : 
     609             : // Floating point input to a small signed integer type: nan becomes zero, otherwise round and clamp.
     610             : // Rounding for signed integers is different than for the unsigned integers above.
     611             : 
     612             : template <> struct sGDALCopyWord<GFloat16, signed char>
     613             : {
     614        1088 :     static inline void f(const GFloat16 hfValueIn, signed char &nValueOut)
     615             :     {
     616        1088 :         if (CPLIsNan(hfValueIn))
     617             :         {
     618           0 :             nValueOut = 0;
     619           0 :             return;
     620             :         }
     621             :         GFloat16 hfMaxVal, hfMinVal;
     622        1088 :         GDALGetDataLimits<GFloat16, signed char>(hfMaxVal, hfMinVal);
     623        1088 :         GFloat16 hfValue = hfValueIn >= GFloat16(0.0f)
     624         619 :                                ? hfValueIn + GFloat16(0.5f)
     625        1088 :                                : hfValueIn - GFloat16(0.5f);
     626        1088 :         nValueOut = static_cast<signed char>(
     627        2176 :             GDALClampValue(hfValue, hfMaxVal, hfMinVal));
     628             :     }
     629             : };
     630             : 
     631             : template <> struct sGDALCopyWord<float, signed char>
     632             : {
     633         432 :     static inline void f(const float fValueIn, signed char &nValueOut)
     634             :     {
     635         432 :         if (CPLIsNan(fValueIn))
     636             :         {
     637           0 :             nValueOut = 0;
     638           0 :             return;
     639             :         }
     640             :         float fMaxVal, fMinVal;
     641         432 :         GDALGetDataLimits<float, signed char>(fMaxVal, fMinVal);
     642         432 :         float fValue = fValueIn >= 0.0f ? fValueIn + 0.5f : fValueIn - 0.5f;
     643         432 :         nValueOut =
     644         432 :             static_cast<signed char>(GDALClampValue(fValue, fMaxVal, fMinVal));
     645             :     }
     646             : };
     647             : 
     648             : template <> struct sGDALCopyWord<float, short>
     649             : {
     650     5102500 :     static inline void f(const float fValueIn, short &nValueOut)
     651             :     {
     652     5102500 :         if (CPLIsNan(fValueIn))
     653             :         {
     654           0 :             nValueOut = 0;
     655           0 :             return;
     656             :         }
     657             :         float fMaxVal, fMinVal;
     658     5102500 :         GDALGetDataLimits<float, short>(fMaxVal, fMinVal);
     659     5102500 :         float fValue = fValueIn >= 0.0f ? fValueIn + 0.5f : fValueIn - 0.5f;
     660     5102500 :         nValueOut =
     661     5102500 :             static_cast<short>(GDALClampValue(fValue, fMaxVal, fMinVal));
     662             :     }
     663             : };
     664             : 
     665             : template <> struct sGDALCopyWord<double, signed char>
     666             : {
     667        2512 :     static inline void f(const double dfValueIn, signed char &nValueOut)
     668             :     {
     669        2512 :         if (CPLIsNan(dfValueIn))
     670             :         {
     671           0 :             nValueOut = 0;
     672           0 :             return;
     673             :         }
     674             :         double dfMaxVal, dfMinVal;
     675        2512 :         GDALGetDataLimits<double, signed char>(dfMaxVal, dfMinVal);
     676        2512 :         double dfValue = dfValueIn > 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5;
     677        2512 :         nValueOut = static_cast<signed char>(
     678        2512 :             GDALClampValue(dfValue, dfMaxVal, dfMinVal));
     679             :     }
     680             : };
     681             : 
     682             : template <> struct sGDALCopyWord<double, short>
     683             : {
     684     5157450 :     static inline void f(const double dfValueIn, short &nValueOut)
     685             :     {
     686     5157450 :         if (CPLIsNan(dfValueIn))
     687             :         {
     688           2 :             nValueOut = 0;
     689           2 :             return;
     690             :         }
     691             :         double dfMaxVal, dfMinVal;
     692     5157450 :         GDALGetDataLimits<double, short>(dfMaxVal, dfMinVal);
     693     5157450 :         double dfValue = dfValueIn > 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5;
     694     5157450 :         nValueOut =
     695     5157450 :             static_cast<short>(GDALClampValue(dfValue, dfMaxVal, dfMinVal));
     696             :     }
     697             : };
     698             : 
     699             : template <> struct sGDALCopyWord<double, int>
     700             : {
     701    77108400 :     static inline void f(const double dfValueIn, int &nValueOut)
     702             :     {
     703    77108400 :         if (CPLIsNan(dfValueIn))
     704             :         {
     705           0 :             nValueOut = 0;
     706           0 :             return;
     707             :         }
     708             :         double dfMaxVal, dfMinVal;
     709    77108400 :         GDALGetDataLimits<double, int>(dfMaxVal, dfMinVal);
     710    77108400 :         double dfValue = dfValueIn >= 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5;
     711    77108400 :         nValueOut =
     712    77108400 :             static_cast<int>(GDALClampValue(dfValue, dfMaxVal, dfMinVal));
     713             :     }
     714             : };
     715             : 
     716             : // Floating point input to a large signed integer type: nan becomes zero, otherwise round and clamp.
     717             : // Rounding for signed integers is different than for the unsigned integers above.
     718             : // Avoid roundoff while clamping.
     719             : 
     720             : template <> struct sGDALCopyWord<GFloat16, short>
     721             : {
     722        5289 :     static inline void f(const GFloat16 hfValueIn, short &nValueOut)
     723             :     {
     724        5289 :         if (CPLIsNan(hfValueIn))
     725             :         {
     726           0 :             nValueOut = 0;
     727             :         }
     728        5289 :         else if (hfValueIn >=
     729        5289 :                  static_cast<GFloat16>(cpl::NumericLimits<short>::max()))
     730             :         {
     731         146 :             nValueOut = cpl::NumericLimits<short>::max();
     732             :         }
     733        5143 :         else if (hfValueIn <=
     734        5143 :                  static_cast<GFloat16>(cpl::NumericLimits<short>::lowest()))
     735             :         {
     736         213 :             nValueOut = cpl::NumericLimits<short>::lowest();
     737             :         }
     738             :         else
     739             :         {
     740        9860 :             nValueOut = static_cast<short>(hfValueIn > GFloat16(0.0f)
     741        4930 :                                                ? hfValueIn + GFloat16(0.5f)
     742        5451 :                                                : hfValueIn - GFloat16(0.5f));
     743             :         }
     744        5289 :     }
     745             : };
     746             : 
     747             : template <> struct sGDALCopyWord<float, int>
     748             : {
     749       10255 :     static inline void f(const float fValueIn, int &nValueOut)
     750             :     {
     751       10255 :         if (CPLIsNan(fValueIn))
     752             :         {
     753           0 :             nValueOut = 0;
     754             :         }
     755       10255 :         else if (fValueIn >= static_cast<float>(cpl::NumericLimits<int>::max()))
     756             :         {
     757         218 :             nValueOut = cpl::NumericLimits<int>::max();
     758             :         }
     759       10037 :         else if (fValueIn <=
     760       10037 :                  static_cast<float>(cpl::NumericLimits<int>::lowest()))
     761             :         {
     762          90 :             nValueOut = cpl::NumericLimits<int>::lowest();
     763             :         }
     764             :         else
     765             :         {
     766       10166 :             nValueOut = static_cast<int>(fValueIn > 0.0f ? fValueIn + 0.5f
     767         219 :                                                          : fValueIn - 0.5f);
     768             :         }
     769       10255 :     }
     770             : };
     771             : 
     772             : template <> struct sGDALCopyWord<float, std::int64_t>
     773             : {
     774        1093 :     static inline void f(const float fValueIn, std::int64_t &nValueOut)
     775             :     {
     776        1093 :         if (CPLIsNan(fValueIn))
     777             :         {
     778           1 :             nValueOut = 0;
     779             :         }
     780        1092 :         else if (fValueIn >=
     781        1092 :                  static_cast<float>(cpl::NumericLimits<std::int64_t>::max()))
     782             :         {
     783           2 :             nValueOut = cpl::NumericLimits<std::int64_t>::max();
     784             :         }
     785        1090 :         else if (fValueIn <=
     786        1090 :                  static_cast<float>(cpl::NumericLimits<std::int64_t>::lowest()))
     787             :         {
     788           2 :             nValueOut = cpl::NumericLimits<std::int64_t>::lowest();
     789             :         }
     790             :         else
     791             :         {
     792        2176 :             nValueOut = static_cast<std::int64_t>(
     793        1088 :                 fValueIn > 0.0f ? fValueIn + 0.5f : fValueIn - 0.5f);
     794             :         }
     795        1093 :     }
     796             : };
     797             : 
     798             : template <> struct sGDALCopyWord<double, std::int64_t>
     799             : {
     800        1727 :     static inline void f(const double dfValueIn, std::int64_t &nValueOut)
     801             :     {
     802        1727 :         if (CPLIsNan(dfValueIn))
     803             :         {
     804           1 :             nValueOut = 0;
     805             :         }
     806        1726 :         else if (dfValueIn >=
     807        1726 :                  static_cast<double>(cpl::NumericLimits<std::int64_t>::max()))
     808             :         {
     809           4 :             nValueOut = cpl::NumericLimits<std::int64_t>::max();
     810             :         }
     811        1722 :         else if (dfValueIn <=
     812        1722 :                  static_cast<double>(cpl::NumericLimits<std::int64_t>::min()))
     813             :         {
     814           4 :             nValueOut = cpl::NumericLimits<std::int64_t>::min();
     815             :         }
     816             :         else
     817             :         {
     818        3436 :             nValueOut = static_cast<std::int64_t>(
     819        1718 :                 dfValueIn > 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5);
     820             :         }
     821        1727 :     }
     822             : };
     823             : 
     824             : // Floating point input to a very large signed integer type: nan becomes zero, otherwise round and clamp.
     825             : // Rounding for signed integers is different than for the unsigned integers above.
     826             : // Avoid infinity while clamping when the maximum integer is too large for the floating-point type.
     827             : // Avoid roundoff while clamping.
     828             : 
     829             : template <> struct sGDALCopyWord<GFloat16, int>
     830             : {
     831        5222 :     static inline void f(const GFloat16 hfValueIn, int &nValueOut)
     832             :     {
     833        5222 :         if (CPLIsNan(hfValueIn))
     834             :         {
     835           0 :             nValueOut = 0;
     836             :         }
     837        5222 :         else if (CPLIsInf(hfValueIn))
     838             :         {
     839           0 :             nValueOut = hfValueIn > GFloat16(0.0f)
     840           0 :                             ? cpl::NumericLimits<int>::max()
     841           0 :                             : cpl::NumericLimits<int>::lowest();
     842             :         }
     843             :         else
     844             :         {
     845       10444 :             nValueOut = static_cast<int>(hfValueIn > GFloat16(0.0f)
     846        5222 :                                              ? hfValueIn + GFloat16(0.5f)
     847        5889 :                                              : hfValueIn - GFloat16(0.5f));
     848             :         }
     849        5222 :     }
     850             : };
     851             : 
     852             : template <> struct sGDALCopyWord<GFloat16, std::int64_t>
     853             : {
     854        1093 :     static inline void f(const GFloat16 hfValueIn, std::int64_t &nValueOut)
     855             :     {
     856        1093 :         if (CPLIsNan(hfValueIn))
     857             :         {
     858           1 :             nValueOut = 0;
     859             :         }
     860        1092 :         else if (CPLIsInf(hfValueIn))
     861             :         {
     862           4 :             nValueOut = hfValueIn > GFloat16(0.0f)
     863           2 :                             ? cpl::NumericLimits<std::int64_t>::max()
     864           1 :                             : cpl::NumericLimits<std::int64_t>::lowest();
     865             :         }
     866             :         else
     867             :         {
     868        1090 :             nValueOut = static_cast<std::int64_t>(
     869        1090 :                 hfValueIn > GFloat16(0.0f) ? hfValueIn + GFloat16(0.5f)
     870        1560 :                                            : hfValueIn - GFloat16(0.5f));
     871             :         }
     872        1093 :     }
     873             : };
     874             : 
     875             : /**
     876             :  * Copy a single word, optionally rounding if appropriate (i.e. going
     877             :  * from the float to the integer case). Note that this is the function
     878             :  * you should specialize if you're adding a new data type.
     879             :  *
     880             :  * @param tValueIn value of type Tin; the input value to be converted
     881             :  * @param tValueOut value of type Tout; the output value
     882             :  */
     883             : 
     884             : template <class Tin, class Tout>
     885   491044548 : inline void GDALCopyWord(const Tin tValueIn, Tout &tValueOut)
     886             : {
     887             :     if constexpr (std::is_same<Tin, Tout>::value)
     888    42822392 :         tValueOut = tValueIn;
     889             :     else
     890   448222156 :         sGDALCopyWord<Tin, Tout>::f(tValueIn, tValueOut);
     891   491044548 : }
     892             : 
     893             : /************************************************************************/
     894             : /*                           GDALCopy4Words()                           */
     895             : /************************************************************************/
     896             : /**
     897             :  * Copy 4 packed words to 4 packed words, optionally rounding if appropriate
     898             :  * (i.e. going from the float to the integer case).
     899             :  *
     900             :  * @param pValueIn pointer to 4 input values of type Tin.
     901             :  * @param pValueOut pointer to 4 output values of type Tout.
     902             :  */
     903             : 
     904             : template <class Tin, class Tout>
     905         604 : inline void GDALCopy4Words(const Tin *pValueIn, Tout *const pValueOut)
     906             : {
     907         604 :     GDALCopyWord(pValueIn[0], pValueOut[0]);
     908         604 :     GDALCopyWord(pValueIn[1], pValueOut[1]);
     909         604 :     GDALCopyWord(pValueIn[2], pValueOut[2]);
     910         604 :     GDALCopyWord(pValueIn[3], pValueOut[3]);
     911         604 : }
     912             : 
     913             : /************************************************************************/
     914             : /*                           GDALCopy8Words()                           */
     915             : /************************************************************************/
     916             : /**
     917             :  * Copy 8 packed words to 8 packed words, optionally rounding if appropriate
     918             :  * (i.e. going from the float to the integer case).
     919             :  *
     920             :  * @param pValueIn pointer to 8 input values of type Tin.
     921             :  * @param pValueOut pointer to 8 output values of type Tout.
     922             :  */
     923             : 
     924             : template <class Tin, class Tout>
     925    44584203 : inline void GDALCopy8Words(const Tin *pValueIn, Tout *const pValueOut)
     926             : {
     927    44584203 :     GDALCopy4Words(pValueIn, pValueOut);
     928    44584203 :     GDALCopy4Words(pValueIn + 4, pValueOut + 4);
     929    44584203 : }
     930             : 
     931             : // Needs SSE2
     932             : #if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) ||               \
     933             :     defined(USE_NEON_OPTIMIZATIONS)
     934             : 
     935             : template <>
     936    30528102 : inline void GDALCopy4Words(const float *pValueIn, GByte *const pValueOut)
     937             : {
     938    30528102 :     __m128 xmm = _mm_loadu_ps(pValueIn);
     939             : 
     940             :     // The following clamping would be useless due to the final saturating
     941             :     // packing if we could guarantee the input range in [INT_MIN,INT_MAX]
     942    30528102 :     const __m128 p0d5 = _mm_set1_ps(0.5f);
     943    30528102 :     const __m128 xmm_max = _mm_set1_ps(255);
     944    30528102 :     xmm = _mm_add_ps(xmm, p0d5);
     945    61056104 :     xmm = _mm_min_ps(_mm_max_ps(xmm, p0d5), xmm_max);
     946             : 
     947    30528102 :     __m128i xmm_i = _mm_cvttps_epi32(xmm);
     948             : 
     949             : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
     950             :     xmm_i = _mm_shuffle_epi8(
     951             :         xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
     952             : #else
     953    30528102 :     xmm_i = _mm_packs_epi32(xmm_i, xmm_i);   // Pack int32 to int16
     954    30528102 :     xmm_i = _mm_packus_epi16(xmm_i, xmm_i);  // Pack int16 to uint8
     955             : #endif
     956    30528102 :     GDALCopyXMMToInt32(xmm_i, pValueOut);
     957    30528102 : }
     958             : 
     959    43492800 : static inline __m128 GDALIfThenElse(__m128 mask, __m128 thenVal, __m128 elseVal)
     960             : {
     961             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     962             :     return _mm_blendv_ps(elseVal, thenVal, mask);
     963             : #else
     964   130478000 :     return _mm_or_ps(_mm_and_ps(mask, thenVal), _mm_andnot_ps(mask, elseVal));
     965             : #endif
     966             : }
     967             : 
     968             : template <>
     969         190 : inline void GDALCopy4Words(const float *pValueIn, GInt8 *const pValueOut)
     970             : {
     971         190 :     __m128 xmm = _mm_loadu_ps(pValueIn);
     972             : 
     973         190 :     const __m128 xmm_min = _mm_set1_ps(-128);
     974         190 :     const __m128 xmm_max = _mm_set1_ps(127);
     975         380 :     xmm = _mm_min_ps(_mm_max_ps(xmm, xmm_min), xmm_max);
     976             : 
     977         190 :     const __m128 p0d5 = _mm_set1_ps(0.5f);
     978         190 :     const __m128 m0d5 = _mm_set1_ps(-0.5f);
     979         190 :     const __m128 mask = _mm_cmpge_ps(xmm, p0d5);
     980             :     // f >= 0.5f ? f + 0.5f : f - 0.5f
     981         380 :     xmm = _mm_add_ps(xmm, GDALIfThenElse(mask, p0d5, m0d5));
     982             : 
     983         190 :     __m128i xmm_i = _mm_cvttps_epi32(xmm);
     984             : 
     985             : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
     986             :     xmm_i = _mm_shuffle_epi8(
     987             :         xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
     988             : #else
     989         190 :     xmm_i = _mm_packs_epi32(xmm_i, xmm_i);  // Pack int32 to int16
     990         190 :     xmm_i = _mm_packs_epi16(xmm_i, xmm_i);  // Pack int16 to int8
     991             : #endif
     992         190 :     GDALCopyXMMToInt32(xmm_i, pValueOut);
     993         190 : }
     994             : 
     995             : template <>
     996     3363980 : inline void GDALCopy4Words(const float *pValueIn, GInt16 *const pValueOut)
     997             : {
     998     3363980 :     __m128 xmm = _mm_loadu_ps(pValueIn);
     999             : 
    1000     3363980 :     const __m128 xmm_min = _mm_set1_ps(-32768);
    1001     3363980 :     const __m128 xmm_max = _mm_set1_ps(32767);
    1002     6727960 :     xmm = _mm_min_ps(_mm_max_ps(xmm, xmm_min), xmm_max);
    1003             : 
    1004     3363980 :     const __m128 p0d5 = _mm_set1_ps(0.5f);
    1005     3363980 :     const __m128 m0d5 = _mm_set1_ps(-0.5f);
    1006     3363980 :     const __m128 mask = _mm_cmpge_ps(xmm, p0d5);
    1007             :     // f >= 0.5f ? f + 0.5f : f - 0.5f
    1008     6727960 :     xmm = _mm_add_ps(xmm, GDALIfThenElse(mask, p0d5, m0d5));
    1009             : 
    1010     3363980 :     __m128i xmm_i = _mm_cvttps_epi32(xmm);
    1011             : 
    1012     3363980 :     xmm_i = _mm_packs_epi32(xmm_i, xmm_i);  // Pack int32 to int16
    1013     3363980 :     GDALCopyXMMToInt64(xmm_i, pValueOut);
    1014     3363980 : }
    1015             : 
    1016             : template <>
    1017           1 : inline void GDALCopy4Words(const float *pValueIn, GUInt16 *const pValueOut)
    1018             : {
    1019           1 :     __m128 xmm = _mm_loadu_ps(pValueIn);
    1020             : 
    1021           1 :     const __m128 p0d5 = _mm_set1_ps(0.5f);
    1022           1 :     const __m128 xmm_max = _mm_set1_ps(65535);
    1023           1 :     xmm = _mm_add_ps(xmm, p0d5);
    1024           2 :     xmm = _mm_min_ps(_mm_max_ps(xmm, p0d5), xmm_max);
    1025             : 
    1026           1 :     __m128i xmm_i = _mm_cvttps_epi32(xmm);
    1027             : 
    1028             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    1029             :     xmm_i = _mm_packus_epi32(xmm_i, xmm_i);  // Pack int32 to uint16
    1030             : #else
    1031             :     // Translate to int16 range because _mm_packus_epi32 is SSE4.1 only
    1032           2 :     xmm_i = _mm_add_epi32(xmm_i, _mm_set1_epi32(-32768));
    1033           1 :     xmm_i = _mm_packs_epi32(xmm_i, xmm_i);  // Pack int32 to int16
    1034             :     // Translate back to uint16 range (actually -32768==32768 in int16)
    1035           1 :     xmm_i = _mm_add_epi16(xmm_i, _mm_set1_epi16(-32768));
    1036             : #endif
    1037           1 :     GDALCopyXMMToInt64(xmm_i, pValueOut);
    1038           1 : }
    1039             : 
    1040    80342400 : static inline __m128i GDALIfThenElse(__m128i mask, __m128i thenVal,
    1041             :                                      __m128i elseVal)
    1042             : {
    1043             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    1044             :     return _mm_blendv_epi8(elseVal, thenVal, mask);
    1045             : #else
    1046   160685000 :     return _mm_or_si128(_mm_and_si128(mask, thenVal),
    1047    80342400 :                         _mm_andnot_si128(mask, elseVal));
    1048             : #endif
    1049             : }
    1050             : 
    1051             : template <>
    1052    40128600 : inline void GDALCopy4Words(const float *pValueIn, GInt32 *const pValueOut)
    1053             : {
    1054    40128600 :     __m128 xmm = _mm_loadu_ps(pValueIn);
    1055    40128600 :     const __m128 xmm_ori = xmm;
    1056             : 
    1057    40128600 :     const __m128 p0d5 = _mm_set1_ps(0.5f);
    1058    40128600 :     const __m128 m0d5 = _mm_set1_ps(-0.5f);
    1059    40128600 :     const __m128 mask = _mm_cmpge_ps(xmm, p0d5);
    1060             :     // f >= 0.5f ? f + 0.5f : f - 0.5f
    1061    80257200 :     xmm = _mm_add_ps(xmm, GDALIfThenElse(mask, p0d5, m0d5));
    1062             : 
    1063    40128600 :     __m128i xmm_i = _mm_cvttps_epi32(xmm);
    1064             : 
    1065    40128600 :     const __m128 xmm_min = _mm_set1_ps(-2147483648.0f);
    1066    40128600 :     const __m128 xmm_max = _mm_set1_ps(2147483648.0f);
    1067    40128600 :     const __m128i xmm_i_min = _mm_set1_epi32(INT_MIN);
    1068    40128600 :     const __m128i xmm_i_max = _mm_set1_epi32(INT_MAX);
    1069    80257200 :     xmm_i = GDALIfThenElse(_mm_castps_si128(_mm_cmpge_ps(xmm_ori, xmm_max)),
    1070             :                            xmm_i_max, xmm_i);
    1071    80257200 :     xmm_i = GDALIfThenElse(_mm_castps_si128(_mm_cmple_ps(xmm_ori, xmm_min)),
    1072             :                            xmm_i_min, xmm_i);
    1073             : 
    1074             :     _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut), xmm_i);
    1075    40128600 : }
    1076             : 
    1077             : // ARM64 has an efficient instruction for Float32 -> Float16
    1078             : #if !(defined(HAVE__FLOAT16) &&                                                \
    1079             :       (defined(__aarch64__) && defined(_M_ARM64))) &&                          \
    1080             :     !(defined(__AVX2__) && defined(__F16C__))
    1081             : 
    1082         630 : inline __m128i GDALFourFloat32ToFloat16(__m128i xmm)
    1083             : {
    1084             :     // Ported from https://github.com/simd-everywhere/simde/blob/51743e7920b6e867678cb50e9c62effe28f70b33/simde/simde-f16.h#L176
    1085             :     // to SSE2 in a branch-less way
    1086             : 
    1087             :     // clang-format off
    1088             : 
    1089             :     /* This code is CC0, based heavily on code by Fabian Giesen. */
    1090         630 :     const __m128i f32u_infinity = _mm_set1_epi32(255 << 23);
    1091         630 :     const __m128i f16u_max = _mm_set1_epi32((127 + 16) << 23);
    1092         630 :     const __m128i denorm_magic = _mm_set1_epi32(((127 - 15) + (23 - 10) + 1) << 23);
    1093             : 
    1094        1260 :     const auto sign = _mm_and_si128(xmm, _mm_set1_epi32(INT_MIN));
    1095         630 :     xmm = _mm_xor_si128(xmm, sign);
    1096       11340 :     xmm = GDALIfThenElse(
    1097             :         _mm_cmpgt_epi32(xmm, f16u_max),
    1098             :         /* result is Inf or NaN (all exponent bits set) */
    1099             :         GDALIfThenElse(
    1100             :             _mm_cmpgt_epi32(xmm, f32u_infinity),
    1101             :                 /* NaN->qNaN and Inf->Inf */
    1102             :                _mm_set1_epi32(0x7e00),
    1103             :                _mm_set1_epi32(0x7c00)),
    1104             :         /* (De)normalized number or zero */
    1105             :         GDALIfThenElse(
    1106             :             _mm_cmplt_epi32(xmm, _mm_set1_epi32(113 << 23)),
    1107             :              /* use a magic value to align our 10 mantissa bits at the bottom of
    1108             :               * the float. as long as FP addition is round-to-nearest-even this
    1109             :               * just works. */
    1110             :             _mm_sub_epi32(
    1111             :                _mm_castps_si128(_mm_add_ps(_mm_castsi128_ps(xmm),
    1112             :                                            _mm_castsi128_ps(denorm_magic))),
    1113             :                /* and one integer subtract of the bias later,
    1114             :                 * we have our final float! */
    1115             :                denorm_magic
    1116             :             ),
    1117             :             _mm_srli_epi32(
    1118             :                 _mm_add_epi32(
    1119             :                    /* update exponent, rounding bias part 1 */
    1120             :                    // (unsigned)-0x37fff001 = ((unsigned)(15-127) << 23) + 0xfff
    1121             :                   _mm_add_epi32(xmm, _mm_set1_epi32(-0x37fff001)),
    1122             :                    /* rounding bias part 2, using mant_odd */
    1123             :                   _mm_and_si128(_mm_srli_epi32(xmm, 13), _mm_set1_epi32(1))),
    1124             :                 13
    1125             :             )
    1126             :         )
    1127             :     );
    1128         630 :     xmm = _mm_or_si128(xmm, _mm_srli_epi32(sign, 16));
    1129             : 
    1130             :     // clang-format on
    1131         630 :     return xmm;
    1132             : }
    1133             : 
    1134             : template <>
    1135         315 : inline void GDALCopy8Words(const float *pValueIn, GFloat16 *const pValueOut)
    1136             : {
    1137             :     __m128i xmm_lo =
    1138         630 :         GDALFourFloat32ToFloat16(_mm_castps_si128(_mm_loadu_ps(pValueIn)));
    1139             :     __m128i xmm_hi =
    1140         945 :         GDALFourFloat32ToFloat16(_mm_castps_si128(_mm_loadu_ps(pValueIn + 4)));
    1141             : 
    1142             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    1143             :     auto xmm = _mm_packus_epi32(xmm_lo, xmm_hi);  // Pack int32 to uint16
    1144             : #else
    1145             :     // Translate to int16 range because _mm_packus_epi32 is SSE4.1 only
    1146         630 :     xmm_lo = _mm_add_epi32(xmm_lo, _mm_set1_epi32(-32768));
    1147         630 :     xmm_hi = _mm_add_epi32(xmm_hi, _mm_set1_epi32(-32768));
    1148         315 :     auto xmm = _mm_packs_epi32(xmm_lo, xmm_hi);  // Pack int32 to int16
    1149             :     // Translate back to uint16 range (actually -32768==32768 in int16)
    1150         630 :     xmm = _mm_add_epi16(xmm, _mm_set1_epi16(-32768));
    1151             : #endif
    1152             :     _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut), xmm);
    1153         315 : }
    1154             : 
    1155             : #endif
    1156             : 
    1157             : template <>
    1158      624470 : inline void GDALCopy4Words(const double *pValueIn, float *const pValueOut)
    1159             : {
    1160      624470 :     const __m128d val01 = _mm_loadu_pd(pValueIn);
    1161     1248940 :     const __m128d val23 = _mm_loadu_pd(pValueIn + 2);
    1162      624470 :     const __m128 val01_s = _mm_cvtpd_ps(val01);
    1163      624470 :     const __m128 val23_s = _mm_cvtpd_ps(val23);
    1164      624470 :     const __m128 val = _mm_movelh_ps(val01_s, val23_s);
    1165             :     _mm_storeu_ps(pValueOut, val);
    1166      624470 : }
    1167             : 
    1168             : template <>
    1169     2776280 : inline void GDALCopy4Words(const double *pValueIn, GByte *const pValueOut)
    1170             : {
    1171     2776280 :     const __m128d p0d5 = _mm_set1_pd(0.5);
    1172     2776280 :     const __m128d xmm_max = _mm_set1_pd(255);
    1173             : 
    1174     2776280 :     __m128d val01 = _mm_loadu_pd(pValueIn);
    1175     5552560 :     __m128d val23 = _mm_loadu_pd(pValueIn + 2);
    1176     2776280 :     val01 = _mm_add_pd(val01, p0d5);
    1177     5552560 :     val01 = _mm_min_pd(_mm_max_pd(val01, p0d5), xmm_max);
    1178     2776280 :     val23 = _mm_add_pd(val23, p0d5);
    1179     5552560 :     val23 = _mm_min_pd(_mm_max_pd(val23, p0d5), xmm_max);
    1180             : 
    1181     2776280 :     const __m128i val01_u32 = _mm_cvttpd_epi32(val01);
    1182     2776280 :     const __m128i val23_u32 = _mm_cvttpd_epi32(val23);
    1183             : 
    1184             :     // Merge 4 int32 values into a single register
    1185     8328840 :     auto xmm_i = _mm_castpd_si128(_mm_shuffle_pd(
    1186             :         _mm_castsi128_pd(val01_u32), _mm_castsi128_pd(val23_u32), 0));
    1187             : 
    1188             : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
    1189             :     xmm_i = _mm_shuffle_epi8(
    1190             :         xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
    1191             : #else
    1192     2776280 :     xmm_i = _mm_packs_epi32(xmm_i, xmm_i);   // Pack int32 to int16
    1193     2776280 :     xmm_i = _mm_packus_epi16(xmm_i, xmm_i);  // Pack int16 to uint8
    1194             : #endif
    1195     2776280 :     GDALCopyXMMToInt32(xmm_i, pValueOut);
    1196     2776280 : }
    1197             : 
    1198             : template <>
    1199    11746300 : inline void GDALCopy4Words(const float *pValueIn, double *const pValueOut)
    1200             : {
    1201    11746300 :     const __m128 valIn = _mm_loadu_ps(pValueIn);
    1202    11746300 :     _mm_storeu_pd(pValueOut, _mm_cvtps_pd(valIn));
    1203    23492500 :     _mm_storeu_pd(pValueOut + 2, _mm_cvtps_pd(_mm_movehl_ps(valIn, valIn)));
    1204    11746300 : }
    1205             : 
    1206             : #ifdef __F16C__
    1207             : template <>
    1208             : inline void GDALCopy4Words(const GFloat16 *pValueIn, float *const pValueOut)
    1209             : {
    1210             :     __m128i xmm = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pValueIn));
    1211             :     _mm_storeu_ps(pValueOut, _mm_cvtph_ps(xmm));
    1212             : }
    1213             : 
    1214             : template <>
    1215             : inline void GDALCopy4Words(const float *pValueIn, GFloat16 *const pValueOut)
    1216             : {
    1217             :     __m128 xmm = _mm_loadu_ps(pValueIn);
    1218             :     GDALCopyXMMToInt64(_mm_cvtps_ph(xmm, _MM_FROUND_TO_NEAREST_INT), pValueOut);
    1219             : }
    1220             : 
    1221             : template <>
    1222             : inline void GDALCopy4Words(const GFloat16 *pValueIn, double *const pValueOut)
    1223             : {
    1224             :     float tmp[4];
    1225             :     GDALCopy4Words(pValueIn, tmp);
    1226             :     GDALCopy4Words(tmp, pValueOut);
    1227             : }
    1228             : 
    1229             : template <>
    1230             : inline void GDALCopy4Words(const double *pValueIn, GFloat16 *const pValueOut)
    1231             : {
    1232             :     float tmp[4];
    1233             :     GDALCopy4Words(pValueIn, tmp);
    1234             :     GDALCopy4Words(tmp, pValueOut);
    1235             : }
    1236             : 
    1237             : // ARM64 has an efficient instruction for Float16 -> Float32/Float64
    1238             : #elif !(defined(HAVE__FLOAT16) && (defined(__aarch64__) && defined(_M_ARM64)))
    1239             : 
    1240             : // Convert 4 float16 values to 4 float 32 values
    1241             : // xmm must contain 4 float16 values stored in 32 bit each (with upper 16 bits at zero)
    1242       41668 : static inline __m128i GDALFourFloat16ToFloat32(__m128i xmm)
    1243             : {
    1244             :     // Ported from https://github.com/simd-everywhere/simde/blob/51743e7920b6e867678cb50e9c62effe28f70b33/simde/simde-f16.h#L242C4-L242C68
    1245             :     // to SSE2 in a branch-less way
    1246             : 
    1247             :     /* This code is CC0, based heavily on code by Fabian Giesen. */
    1248             :     const auto denorm_magic =
    1249       83336 :         _mm_castsi128_ps(_mm_set1_epi32((128 - 15) << 23));
    1250             :     const auto shifted_exp =
    1251       41668 :         _mm_set1_epi32(0x7c00 << 13); /* exponent mask after shift */
    1252             : 
    1253             :     // Shift exponent and mantissa bits to their position in a float32
    1254      125004 :     auto f32u = _mm_slli_epi32(_mm_and_si128(xmm, _mm_set1_epi32(0x7fff)), 13);
    1255             :     // Extract the (shifted) exponent
    1256       41668 :     const auto exp = _mm_and_si128(shifted_exp, f32u);
    1257             :     // Adjust the exponent
    1258       41668 :     const auto exp_adjustment = _mm_set1_epi32((127 - 15) << 23);
    1259       41668 :     f32u = _mm_add_epi32(f32u, exp_adjustment);
    1260             : 
    1261       41668 :     const auto is_inf_nan = _mm_cmpeq_epi32(exp, shifted_exp); /* Inf/NaN? */
    1262             :     // When is_inf_nan is true: extra exponent adjustment
    1263       41668 :     const auto f32u_inf_nan = _mm_add_epi32(f32u, exp_adjustment);
    1264             : 
    1265             :     const auto is_denormal =
    1266       83336 :         _mm_cmpeq_epi32(exp, _mm_setzero_si128()); /* Zero/Denormal? */
    1267             :     // When is_denormal is true:
    1268       83336 :     auto f32u_denormal = _mm_add_epi32(f32u, _mm_set1_epi32(1 << 23));
    1269       83336 :     f32u_denormal = _mm_castps_si128(
    1270             :         _mm_sub_ps(_mm_castsi128_ps(f32u_denormal), denorm_magic));
    1271             : 
    1272       41668 :     f32u = GDALIfThenElse(is_inf_nan, f32u_inf_nan, f32u);
    1273       41668 :     f32u = GDALIfThenElse(is_denormal, f32u_denormal, f32u);
    1274             : 
    1275             :     // Re-apply sign bit
    1276      125004 :     f32u = _mm_or_si128(
    1277             :         f32u, _mm_slli_epi32(_mm_and_si128(xmm, _mm_set1_epi32(0x8000)), 16));
    1278       41668 :     return f32u;
    1279             : }
    1280             : 
    1281             : template <>
    1282         388 : inline void GDALCopy8Words(const GFloat16 *pValueIn, float *const pValueOut)
    1283             : {
    1284         388 :     __m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
    1285             :     const auto xmm_0 =
    1286         776 :         GDALFourFloat16ToFloat32(_mm_unpacklo_epi16(xmm, _mm_setzero_si128()));
    1287             :     const auto xmm_1 =
    1288         776 :         GDALFourFloat16ToFloat32(_mm_unpackhi_epi16(xmm, _mm_setzero_si128()));
    1289         388 :     _mm_storeu_ps(pValueOut + 0, _mm_castsi128_ps(xmm_0));
    1290         388 :     _mm_storeu_ps(pValueOut + 4, _mm_castsi128_ps(xmm_1));
    1291         388 : }
    1292             : 
    1293             : template <>
    1294       20446 : inline void GDALCopy8Words(const GFloat16 *pValueIn, double *const pValueOut)
    1295             : {
    1296       20446 :     __m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
    1297       61338 :     const auto xmm_0 = _mm_castsi128_ps(
    1298             :         GDALFourFloat16ToFloat32(_mm_unpacklo_epi16(xmm, _mm_setzero_si128())));
    1299       61338 :     const auto xmm_1 = _mm_castsi128_ps(
    1300             :         GDALFourFloat16ToFloat32(_mm_unpackhi_epi16(xmm, _mm_setzero_si128())));
    1301       20446 :     _mm_storeu_pd(pValueOut + 0, _mm_cvtps_pd(xmm_0));
    1302       40892 :     _mm_storeu_pd(pValueOut + 2, _mm_cvtps_pd(_mm_movehl_ps(xmm_0, xmm_0)));
    1303       20446 :     _mm_storeu_pd(pValueOut + 4, _mm_cvtps_pd(xmm_1));
    1304       40892 :     _mm_storeu_pd(pValueOut + 6, _mm_cvtps_pd(_mm_movehl_ps(xmm_1, xmm_1)));
    1305       20446 : }
    1306             : 
    1307             : #endif  // __F16C__
    1308             : 
    1309             : #ifdef __AVX2__
    1310             : 
    1311             : #include <immintrin.h>
    1312             : 
    1313             : template <>
    1314             : inline void GDALCopy8Words(const double *pValueIn, float *const pValueOut)
    1315             : {
    1316             :     const __m256d val0123 = _mm256_loadu_pd(pValueIn);
    1317             :     const __m256d val4567 = _mm256_loadu_pd(pValueIn + 4);
    1318             :     const __m256 val0123_s = _mm256_castps128_ps256(_mm256_cvtpd_ps(val0123));
    1319             :     const __m256 val4567_s = _mm256_castps128_ps256(_mm256_cvtpd_ps(val4567));
    1320             :     const __m256 val =
    1321             :         _mm256_permute2f128_ps(val0123_s, val4567_s, 0 | (2 << 4));
    1322             :     _mm256_storeu_ps(pValueOut, val);
    1323             : }
    1324             : 
    1325             : template <>
    1326             : inline void GDALCopy8Words(const float *pValueIn, double *const pValueOut)
    1327             : {
    1328             :     const __m256 valIn = _mm256_loadu_ps(pValueIn);
    1329             :     _mm256_storeu_pd(pValueOut, _mm256_cvtps_pd(_mm256_castps256_ps128(valIn)));
    1330             :     _mm256_storeu_pd(pValueOut + 4,
    1331             :                      _mm256_cvtps_pd(_mm256_castps256_ps128(
    1332             :                          _mm256_permute2f128_ps(valIn, valIn, 1))));
    1333             : }
    1334             : 
    1335             : #ifdef __F16C__
    1336             : 
    1337             : template <>
    1338             : inline void GDALCopy8Words(const GFloat16 *pValueIn, float *const pValueOut)
    1339             : {
    1340             :     __m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
    1341             :     _mm256_storeu_ps(pValueOut, _mm256_cvtph_ps(xmm));
    1342             : }
    1343             : 
    1344             : template <>
    1345             : inline void GDALCopy8Words(const float *pValueIn, GFloat16 *const pValueOut)
    1346             : {
    1347             :     __m256 ymm = _mm256_loadu_ps(pValueIn);
    1348             :     _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut),
    1349             :                      _mm256_cvtps_ph(ymm, _MM_FROUND_TO_NEAREST_INT));
    1350             : }
    1351             : 
    1352             : template <>
    1353             : inline void GDALCopy8Words(const GFloat16 *pValueIn, double *const pValueOut)
    1354             : {
    1355             :     __m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
    1356             :     const auto ymm = _mm256_cvtph_ps(xmm);
    1357             :     _mm256_storeu_pd(pValueOut, _mm256_cvtps_pd(_mm256_extractf128_ps(ymm, 0)));
    1358             :     _mm256_storeu_pd(pValueOut + 4,
    1359             :                      _mm256_cvtps_pd(_mm256_extractf128_ps(ymm, 1)));
    1360             : }
    1361             : 
    1362             : template <>
    1363             : inline void GDALCopy8Words(const double *pValueIn, GFloat16 *const pValueOut)
    1364             : {
    1365             :     __m256d ymm0 = _mm256_loadu_pd(pValueIn);
    1366             :     __m256d ymm1 = _mm256_loadu_pd(pValueIn + 4);
    1367             :     __m256 ymm = _mm256_set_m128(_mm256_cvtpd_ps(ymm1), _mm256_cvtpd_ps(ymm0));
    1368             :     _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut),
    1369             :                      _mm256_cvtps_ph(ymm, _MM_FROUND_TO_NEAREST_INT));
    1370             : }
    1371             : 
    1372             : #endif
    1373             : 
    1374             : template <>
    1375             : inline void GDALCopy8Words(const float *pValueIn, GByte *const pValueOut)
    1376             : {
    1377             :     __m256 ymm = _mm256_loadu_ps(pValueIn);
    1378             : 
    1379             :     const __m256 p0d5 = _mm256_set1_ps(0.5f);
    1380             :     const __m256 ymm_max = _mm256_set1_ps(255);
    1381             :     ymm = _mm256_add_ps(ymm, p0d5);
    1382             :     ymm = _mm256_min_ps(_mm256_max_ps(ymm, p0d5), ymm_max);
    1383             : 
    1384             :     __m256i ymm_i = _mm256_cvttps_epi32(ymm);
    1385             : 
    1386             :     ymm_i = _mm256_packus_epi32(ymm_i, ymm_i);  // Pack int32 to uint16
    1387             :     ymm_i = _mm256_permute4x64_epi64(ymm_i, 0 | (2 << 2));  // AVX2
    1388             : 
    1389             :     __m128i xmm_i = _mm256_castsi256_si128(ymm_i);
    1390             :     xmm_i = _mm_packus_epi16(xmm_i, xmm_i);
    1391             :     GDALCopyXMMToInt64(xmm_i, pValueOut);
    1392             : }
    1393             : 
    1394             : template <>
    1395             : inline void GDALCopy8Words(const float *pValueIn, GUInt16 *const pValueOut)
    1396             : {
    1397             :     __m256 ymm = _mm256_loadu_ps(pValueIn);
    1398             : 
    1399             :     const __m256 p0d5 = _mm256_set1_ps(0.5f);
    1400             :     const __m256 ymm_max = _mm256_set1_ps(65535);
    1401             :     ymm = _mm256_add_ps(ymm, p0d5);
    1402             :     ymm = _mm256_min_ps(_mm256_max_ps(ymm, p0d5), ymm_max);
    1403             : 
    1404             :     __m256i ymm_i = _mm256_cvttps_epi32(ymm);
    1405             : 
    1406             :     ymm_i = _mm256_packus_epi32(ymm_i, ymm_i);  // Pack int32 to uint16
    1407             :     ymm_i = _mm256_permute4x64_epi64(ymm_i, 0 | (2 << 2));  // AVX2
    1408             : 
    1409             :     _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut),
    1410             :                      _mm256_castsi256_si128(ymm_i));
    1411             : }
    1412             : #else
    1413             : template <>
    1414     7785681 : inline void GDALCopy8Words(const float *pValueIn, GUInt16 *const pValueOut)
    1415             : {
    1416     7785681 :     __m128 xmm = _mm_loadu_ps(pValueIn);
    1417    15571402 :     __m128 xmm1 = _mm_loadu_ps(pValueIn + 4);
    1418             : 
    1419     7785681 :     const __m128 p0d5 = _mm_set1_ps(0.5f);
    1420     7785681 :     const __m128 xmm_max = _mm_set1_ps(65535);
    1421     7785681 :     xmm = _mm_add_ps(xmm, p0d5);
    1422     7785681 :     xmm1 = _mm_add_ps(xmm1, p0d5);
    1423    15571402 :     xmm = _mm_min_ps(_mm_max_ps(xmm, p0d5), xmm_max);
    1424    15571402 :     xmm1 = _mm_min_ps(_mm_max_ps(xmm1, p0d5), xmm_max);
    1425             : 
    1426     7785681 :     __m128i xmm_i = _mm_cvttps_epi32(xmm);
    1427     7785681 :     __m128i xmm1_i = _mm_cvttps_epi32(xmm1);
    1428             : 
    1429             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    1430             :     xmm_i = _mm_packus_epi32(xmm_i, xmm1_i);  // Pack int32 to uint16
    1431             : #else
    1432             :     // Translate to int16 range because _mm_packus_epi32 is SSE4.1 only
    1433    15571402 :     xmm_i = _mm_add_epi32(xmm_i, _mm_set1_epi32(-32768));
    1434    15571402 :     xmm1_i = _mm_add_epi32(xmm1_i, _mm_set1_epi32(-32768));
    1435     7785681 :     xmm_i = _mm_packs_epi32(xmm_i, xmm1_i);  // Pack int32 to int16
    1436             :     // Translate back to uint16 range (actually -32768==32768 in int16)
    1437    15571402 :     xmm_i = _mm_add_epi16(xmm_i, _mm_set1_epi16(-32768));
    1438             : #endif
    1439             :     _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut), xmm_i);
    1440     7785681 : }
    1441             : #endif
    1442             : 
    1443             : // ARM64 has an efficient instruction for Float64 -> Float16
    1444             : #if !(defined(HAVE__FLOAT16) &&                                                \
    1445             :       (defined(__aarch64__) && defined(_M_ARM64))) &&                          \
    1446             :     !(defined(__AVX2__) && defined(__F16C__))
    1447             : template <>
    1448         235 : inline void GDALCopy8Words(const double *pValueIn, GFloat16 *const pValueOut)
    1449             : {
    1450             :     float fVal[8];
    1451         235 :     GDALCopy8Words(pValueIn, fVal);
    1452         235 :     GDALCopy8Words(fVal, pValueOut);
    1453         235 : }
    1454             : #endif
    1455             : 
    1456             : #endif  //  defined(__x86_64) || defined(_M_X64)
    1457             : 
    1458             : #endif  // GDAL_PRIV_TEMPLATES_HPP_INCLUDED

Generated by: LCOV version 1.14