LCOV - code coverage report
Current view: top level - gcore - gdal_priv_templates.hpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 457 465 98.3 %
Date: 2026-06-23 23:27:26 Functions: 395 549 71.9 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  GDAL Core
       4             :  * Purpose:  Inline C++ templates
       5             :  * Author:   Phil Vachon, <philippe at cowpig.ca>
       6             :  *
       7             :  ******************************************************************************
       8             :  * Copyright (c) 2009, Phil Vachon, <philippe at cowpig.ca>
       9             :  * Copyright (c) 2025, Even Rouault, <even.rouault at spatialys.com>
      10             :  *
      11             :  * SPDX-License-Identifier: MIT
      12             :  ****************************************************************************/
      13             : 
      14             : #ifndef GDAL_PRIV_TEMPLATES_HPP_INCLUDED
      15             : #define GDAL_PRIV_TEMPLATES_HPP_INCLUDED
      16             : 
      17             : #include "cpl_port.h"
      18             : 
      19             : #include <algorithm>
      20             : #include <cmath>
      21             : #include <cstdint>
      22             : #include <limits>
      23             : #include <type_traits>
      24             : 
      25             : #include "cpl_float.h"
      26             : 
      27             : // Needs SSE2
      28             : #if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) ||               \
      29             :     defined(USE_NEON_OPTIMIZATIONS)
      30             : 
      31             : #ifdef USE_NEON_OPTIMIZATIONS
      32             : #include "include_sse2neon.h"
      33             : #else
      34             : #include <immintrin.h>
      35             : #endif
      36             : 
      37    40839045 : static inline void GDALCopyXMMToInt32(const __m128i xmm, void *pDest)
      38             : {
      39    40839045 :     int n32 = _mm_cvtsi128_si32(xmm);  // Extract lower 32 bit word
      40    40839045 :     memcpy(pDest, &n32, sizeof(n32));
      41    40839045 : }
      42             : 
      43   106951351 : static inline void GDALCopyXMMToInt64(const __m128i xmm, void *pDest)
      44             : {
      45             :     _mm_storel_epi64(reinterpret_cast<__m128i *>(pDest), xmm);
      46   106951351 : }
      47             : 
      48             : #if __SSSE3__
      49             : #include <tmmintrin.h>
      50             : #endif
      51             : 
      52             : #if defined(__SSE4_1__) || defined(__AVX__)
      53             : #include <smmintrin.h>
      54             : #endif
      55             : 
      56             : #ifdef __F16C__
      57             : #include <immintrin.h>
      58             : #endif
      59             : 
      60             : #endif
      61             : 
      62             : /************************************************************************/
      63             : /*                         GDALGetDataLimits()                          */
      64             : /************************************************************************/
      65             : /**
      66             :  * Compute the limits of values that can be placed in Tout in terms of
      67             :  * Tin. Usually used for output clamping, when the output data type's
      68             :  * limits are stable relative to the input type (i.e. no roundoff error).
      69             :  *
      70             :  * @param tMaxValue the returned maximum value
      71             :  * @param tMinValue the returned minimum value
      72             :  */
      73             : 
      74             : template <class Tin, class Tout>
      75   349576532 : inline void GDALGetDataLimits(Tin &tMaxValue, Tin &tMinValue)
      76             : {
      77   349576532 :     tMaxValue = cpl::NumericLimits<Tin>::max();
      78   349576532 :     tMinValue = cpl::NumericLimits<Tin>::lowest();
      79             : 
      80             :     // Compute the actual minimum value of Tout in terms of Tin.
      81             :     if constexpr (cpl::NumericLimits<Tout>::is_signed &&
      82             :                   cpl::NumericLimits<Tout>::is_integer)
      83             :     {
      84             :         // the minimum value is less than zero
      85             :         // cppcheck-suppress knownConditionTrueFalse
      86             :         if constexpr (cpl::NumericLimits<Tout>::digits <
      87             :                           cpl::NumericLimits<Tin>::digits ||
      88             :                       !cpl::NumericLimits<Tin>::is_integer)
      89             :         {
      90             :             // Tout is smaller than Tin, so we need to clamp values in input
      91             :             // to the range of Tout's min/max values
      92             :             if constexpr (cpl::NumericLimits<Tin>::is_signed)
      93             :             {
      94    87460980 :                 tMinValue =
      95    87460980 :                     static_cast<Tin>(cpl::NumericLimits<Tout>::lowest());
      96             :             }
      97    87627909 :             tMaxValue = static_cast<Tin>(cpl::NumericLimits<Tout>::max());
      98             :         }
      99             :     }
     100             :     else if constexpr (cpl::NumericLimits<Tout>::is_integer)
     101             :     {
     102             :         // the output is unsigned, so we just need to determine the max
     103             :         if constexpr (!std::is_same_v<Tin, Tout> &&
     104             :                       cpl::NumericLimits<Tout>::digits <=
     105             :                           cpl::NumericLimits<Tin>::digits)
     106             :         {
     107             :             // Tout is smaller than Tin, so we need to clamp the input values
     108             :             // to the range of Tout's max
     109   102300464 :             tMaxValue = static_cast<Tin>(cpl::NumericLimits<Tout>::max());
     110             :         }
     111   105279596 :         tMinValue = 0;
     112             :     }
     113   349576532 : }
     114             : 
     115             : /************************************************************************/
     116             : /*                           GDALClampValue()                           */
     117             : /************************************************************************/
     118             : /**
     119             :  * Clamp values of type T to a specified range
     120             :  *
     121             :  * @param tValue the value
     122             :  * @param tMax the max value
     123             :  * @param tMin the min value
     124             :  */
     125             : template <class T>
     126   349387847 : inline T GDALClampValue(const T tValue, const T tMax, const T tMin)
     127             : {
     128   349387847 :     return tValue > tMax ? tMax : tValue < tMin ? tMin : tValue;
     129             : }
     130             : 
     131             : /************************************************************************/
     132             : /*                        GDALClampDoubleValue()                        */
     133             : /************************************************************************/
     134             : /**
     135             :  * Clamp double values to a specified range, this uses the same
     136             :  * argument ordering as std::clamp, returns TRUE if the value was clamped.
     137             :  *
     138             :  * @param tValue the value
     139             :  * @param tMin the min value
     140             :  * @param tMax the max value
     141             :  *
     142             :  */
     143             : template <class T2, class T3>
     144         309 : inline bool GDALClampDoubleValue(double &tValue, const T2 tMin, const T3 tMax)
     145             : {
     146         309 :     const double tMin2{static_cast<double>(tMin)};
     147         309 :     const double tMax2{static_cast<double>(tMax)};
     148         309 :     if (tValue > tMax2 || tValue < tMin2)
     149             :     {
     150          26 :         tValue = tValue > tMax2 ? tMax2 : tValue < tMin2 ? tMin2 : tValue;
     151          26 :         return true;
     152             :     }
     153             :     else
     154             :     {
     155         283 :         return false;
     156             :     }
     157             : }
     158             : 
     159             : /************************************************************************/
     160             : /*                        GDALClampValueToType()                        */
     161             : /************************************************************************/
     162             : /**
     163             :  * Clamp a value of type T to the limits of type TClamped.
     164             :  *
     165             :  * @param tValue the value
     166             :  */
     167      188804 : template <class T, class TClamped> inline T GDALClampValueToType(T tValue)
     168             : {
     169             :     T tMaxValue, tMinValue;
     170      188804 :     GDALGetDataLimits<T, TClamped>(tMaxValue, tMinValue);
     171      188804 :     return std::clamp(tValue, tMinValue, tMaxValue);
     172             : }
     173             : 
     174             : /************************************************************************/
     175             : /*                         GDALIsValueInRange()                         */
     176             : /************************************************************************/
     177             : /**
     178             :  * Returns whether a value is in the type range.
     179             :  * NaN is considered not to be in type range.
     180             :  *
     181             :  * @param dfValue the value
     182             :  * @return whether the value is in the type range.
     183             :  */
     184      161028 : template <class T> inline bool GDALIsValueInRange(double dfValue)
     185             : {
     186      322014 :     return dfValue >= static_cast<double>(cpl::NumericLimits<T>::lowest()) &&
     187      322014 :            dfValue <= static_cast<double>(cpl::NumericLimits<T>::max());
     188             : }
     189             : 
     190          27 : template <> inline bool GDALIsValueInRange<double>(double dfValue)
     191             : {
     192          27 :     return !CPLIsNan(dfValue);
     193             : }
     194             : 
     195       46960 : template <> inline bool GDALIsValueInRange<float>(double dfValue)
     196             : {
     197       93793 :     return CPLIsInf(dfValue) ||
     198             :            (dfValue >=
     199       46833 :                 -static_cast<double>(std::numeric_limits<float>::max()) &&
     200       93785 :             dfValue <= static_cast<double>(std::numeric_limits<float>::max()));
     201             : }
     202             : 
     203         120 : template <> inline bool GDALIsValueInRange<GFloat16>(double dfValue)
     204             : {
     205         360 :     return CPLIsInf(dfValue) ||
     206         240 :            (dfValue >= -cpl::NumericLimits<GFloat16>::max() &&
     207         240 :             dfValue <= cpl::NumericLimits<GFloat16>::max());
     208             : }
     209             : 
     210       16925 : template <> inline bool GDALIsValueInRange<int64_t>(double dfValue)
     211             : {
     212             :     // Values in the range [INT64_MAX - 1023, INT64_MAX - 1]
     213             :     // get converted to a double that once cast to int64_t is
     214             :     // INT64_MAX + 1, hence the < strict comparison.
     215             :     return dfValue >=
     216       33849 :                static_cast<double>(cpl::NumericLimits<int64_t>::lowest()) &&
     217       33849 :            dfValue < static_cast<double>(cpl::NumericLimits<int64_t>::max());
     218             : }
     219             : 
     220        8968 : template <> inline bool GDALIsValueInRange<uint64_t>(double dfValue)
     221             : {
     222             :     // Values in the range [UINT64_MAX - 2047, UINT64_MAX - 1]
     223             :     // get converted to a double that once cast to uint64_t is
     224             :     // UINT64_MAX + 1, hence the < strict comparison.
     225       17933 :     return dfValue >= 0 &&
     226       17933 :            dfValue < static_cast<double>(cpl::NumericLimits<uint64_t>::max());
     227             : }
     228             : 
     229             : /************************************************************************/
     230             : /*                         GDALIsValueExactAs()                         */
     231             : /************************************************************************/
     232             : /**
     233             :  * Returns whether a value can be exactly represented on type T.
     234             :  *
     235             :  * That is static_cast\<double\>(static_cast\<T\>(dfValue)) is legal and is
     236             :  * equal to dfValue.
     237             :  *
     238             :  * Note: for T=float or double, a NaN input leads to true
     239             :  *
     240             :  * @param dfValue the value
     241             :  * @return whether the value can be exactly represented on type T.
     242             :  */
     243       12305 : template <class T> inline bool GDALIsValueExactAs(double dfValue)
     244             : {
     245       24555 :     return GDALIsValueInRange<T>(dfValue) &&
     246       24555 :            static_cast<double>(static_cast<T>(dfValue)) == dfValue;
     247             : }
     248             : 
     249         177 : template <> inline bool GDALIsValueExactAs<float>(double dfValue)
     250             : {
     251         348 :     return CPLIsNan(dfValue) ||
     252         171 :            (GDALIsValueInRange<float>(dfValue) &&
     253         344 :             static_cast<double>(static_cast<float>(dfValue)) == dfValue);
     254             : }
     255             : 
     256           7 : template <> inline bool GDALIsValueExactAs<GFloat16>(double dfValue)
     257             : {
     258          14 :     return CPLIsNan(dfValue) ||
     259           7 :            (GDALIsValueInRange<GFloat16>(dfValue) &&
     260          14 :             static_cast<double>(static_cast<GFloat16>(dfValue)) == dfValue);
     261             : }
     262             : 
     263          21 : template <> inline bool GDALIsValueExactAs<double>(double)
     264             : {
     265          21 :     return true;
     266             : }
     267             : 
     268             : /************************************************************************/
     269             : /*                            GDALCopyWord()                            */
     270             : /************************************************************************/
     271             : 
     272             : // Integer input and output: clamp the input
     273             : 
     274             : template <class Tin, class Tout> struct sGDALCopyWord
     275             : {
     276   173520001 :     static inline void f(const Tin tValueIn, Tout &tValueOut)
     277             :     {
     278             :         Tin tMaxVal, tMinVal;
     279   173520001 :         GDALGetDataLimits<Tin, Tout>(tMaxVal, tMinVal);
     280   173520001 :         tValueOut =
     281   173520001 :             static_cast<Tout>(GDALClampValue(tValueIn, tMaxVal, tMinVal));
     282   173520001 :     }
     283             : };
     284             : 
     285             : // Integer input and floating point output: simply convert
     286             : 
     287             : template <class Tin> struct sGDALCopyWord<Tin, GFloat16>
     288             : {
     289      270266 :     static inline void f(const Tin tValueIn, GFloat16 &hfValueOut)
     290             :     {
     291      270266 :         hfValueOut = static_cast<GFloat16>(tValueIn);
     292      270266 :     }
     293             : };
     294             : 
     295             : template <class Tin> struct sGDALCopyWord<Tin, float>
     296             : {
     297     8027797 :     static inline void f(const Tin tValueIn, float &fValueOut)
     298             :     {
     299     8027797 :         fValueOut = static_cast<float>(tValueIn);
     300     8027797 :     }
     301             : };
     302             : 
     303             : template <class Tin> struct sGDALCopyWord<Tin, double>
     304             : {
     305    87067547 :     static inline void f(const Tin tValueIn, double &dfValueOut)
     306             :     {
     307    87067547 :         dfValueOut = static_cast<double>(tValueIn);
     308    87067547 :     }
     309             : };
     310             : 
     311             : // Floating point input and output, converting between identical types: simply copy
     312             : 
     313             : template <> struct sGDALCopyWord<GFloat16, GFloat16>
     314             : {
     315             :     static inline void f(const GFloat16 hfValueIn, GFloat16 &hfValueOut)
     316             :     {
     317             :         hfValueOut = hfValueIn;
     318             :     }
     319             : };
     320             : 
     321             : template <> struct sGDALCopyWord<float, float>
     322             : {
     323             :     static inline void f(const float fValueIn, float &fValueOut)
     324             :     {
     325             :         fValueOut = fValueIn;
     326             :     }
     327             : };
     328             : 
     329             : template <> struct sGDALCopyWord<double, double>
     330             : {
     331             :     static inline void f(const double dfValueIn, double &dfValueOut)
     332             :     {
     333             :         dfValueOut = dfValueIn;
     334             :     }
     335             : };
     336             : 
     337             : // Floating point input and output, converting to a larger type: use implicit conversion
     338             : 
     339             : template <> struct sGDALCopyWord<GFloat16, float>
     340             : {
     341        5206 :     static inline void f(const GFloat16 hfValueIn, float &dfValueOut)
     342             :     {
     343        5206 :         dfValueOut = hfValueIn;
     344        5206 :     }
     345             : };
     346             : 
     347             : template <> struct sGDALCopyWord<GFloat16, double>
     348             : {
     349        4174 :     static inline void f(const GFloat16 hfValueIn, double &dfValueOut)
     350             :     {
     351        4174 :         dfValueOut = hfValueIn;
     352        4174 :     }
     353             : };
     354             : 
     355             : template <> struct sGDALCopyWord<float, double>
     356             : {
     357      638886 :     static inline void f(const float fValueIn, double &dfValueOut)
     358             :     {
     359      638886 :         dfValueOut = static_cast<double>(fValueIn);
     360      638886 :     }
     361             : };
     362             : 
     363             : // Floating point input and out, converting to a smaller type: ensure overflow results in infinity
     364             : 
     365             : template <> struct sGDALCopyWord<float, GFloat16>
     366             : {
     367         497 :     static inline void f(const float fValueIn, GFloat16 &hfValueOut)
     368             :     {
     369             :         // Our custom implementation when std::float16_t is not
     370             :         // available ensures proper behavior.
     371             : #if !defined(HAVE_STD_FLOAT16_T)
     372         497 :         if (fValueIn > cpl::NumericLimits<GFloat16>::max())
     373             :         {
     374          27 :             hfValueOut = cpl::NumericLimits<GFloat16>::infinity();
     375          27 :             return;
     376             :         }
     377         470 :         if (fValueIn < -cpl::NumericLimits<GFloat16>::max())
     378             :         {
     379          15 :             hfValueOut = -cpl::NumericLimits<GFloat16>::infinity();
     380          15 :             return;
     381             :         }
     382             : #endif
     383         455 :         hfValueOut = static_cast<GFloat16>(fValueIn);
     384             :     }
     385             : };
     386             : 
     387             : template <> struct sGDALCopyWord<double, GFloat16>
     388             : {
     389         349 :     static inline void f(const double dfValueIn, GFloat16 &hfValueOut)
     390             :     {
     391             :         // Our custom implementation when std::float16_t is not
     392             :         // available ensures proper behavior.
     393             : #if !defined(HAVE_STD_FLOAT16_T)
     394         349 :         if (dfValueIn > cpl::NumericLimits<GFloat16>::max())
     395             :         {
     396          14 :             hfValueOut = cpl::NumericLimits<GFloat16>::infinity();
     397          14 :             return;
     398             :         }
     399         335 :         if (dfValueIn < -cpl::NumericLimits<GFloat16>::max())
     400             :         {
     401          14 :             hfValueOut = -cpl::NumericLimits<GFloat16>::infinity();
     402          14 :             return;
     403             :         }
     404             : #endif
     405         321 :         hfValueOut = static_cast<GFloat16>(dfValueIn);
     406             :     }
     407             : };
     408             : 
     409             : template <> struct sGDALCopyWord<double, float>
     410             : {
     411      130418 :     static inline void f(const double dfValueIn, float &fValueOut)
     412             :     {
     413             :         if constexpr (!std::numeric_limits<double>::is_iec559)
     414             :         {
     415             :             if (dfValueIn >
     416             :                 static_cast<double>(std::numeric_limits<float>::max()))
     417             :             {
     418             :                 fValueOut = std::numeric_limits<float>::infinity();
     419             :                 return;
     420             :             }
     421             :             if (dfValueIn <
     422             :                 static_cast<double>(-std::numeric_limits<float>::max()))
     423             :             {
     424             :                 fValueOut = -std::numeric_limits<float>::infinity();
     425             :                 return;
     426             :             }
     427             :         }
     428             : 
     429      130418 :         fValueOut = static_cast<float>(dfValueIn);
     430      130418 :     }
     431             : };
     432             : 
     433             : // Floating point input to a small unsigned integer type: nan becomes zero, otherwise round and clamp
     434             : 
     435             : template <class Tout> struct sGDALCopyWord<GFloat16, Tout>
     436             : {
     437        5032 :     static inline void f(const GFloat16 hfValueIn, Tout &tValueOut)
     438             :     {
     439        5032 :         if (CPLIsNan(hfValueIn))
     440             :         {
     441           1 :             tValueOut = 0;
     442           1 :             return;
     443             :         }
     444             :         GFloat16 hfMaxVal, hfMinVal;
     445        5031 :         GDALGetDataLimits<GFloat16, Tout>(hfMaxVal, hfMinVal);
     446        5031 :         tValueOut = static_cast<Tout>(
     447       10062 :             GDALClampValue(hfValueIn + GFloat16(0.5f), hfMaxVal, hfMinVal));
     448             :     }
     449             : };
     450             : 
     451             : template <class Tout> struct sGDALCopyWord<float, Tout>
     452             : {
     453     5476720 :     static inline void f(const float fValueIn, Tout &tValueOut)
     454             :     {
     455     5476720 :         if (CPLIsNan(fValueIn))
     456             :         {
     457          22 :             tValueOut = 0;
     458          22 :             return;
     459             :         }
     460             :         float fMaxVal, fMinVal;
     461     5476700 :         GDALGetDataLimits<float, Tout>(fMaxVal, fMinVal);
     462     5476700 :         tValueOut = static_cast<Tout>(
     463     5476700 :             GDALClampValue(fValueIn + 0.5f, fMaxVal, fMinVal));
     464             :     }
     465             : };
     466             : 
     467             : template <class Tout> struct sGDALCopyWord<double, Tout>
     468             : {
     469    83013416 :     static inline void f(const double dfValueIn, Tout &tValueOut)
     470             :     {
     471    83013416 :         if (CPLIsNan(dfValueIn))
     472             :         {
     473         151 :             tValueOut = 0;
     474         151 :             return;
     475             :         }
     476             :         double dfMaxVal, dfMinVal;
     477    83013348 :         GDALGetDataLimits<double, Tout>(dfMaxVal, dfMinVal);
     478    83013348 :         tValueOut = static_cast<Tout>(
     479    83013348 :             GDALClampValue(dfValueIn + 0.5, dfMaxVal, dfMinVal));
     480             :     }
     481             : };
     482             : 
     483             : // Floating point input to a large unsigned integer type: nan becomes zero, otherwise round and clamp.
     484             : // Avoid roundoff while clamping.
     485             : 
     486             : template <> struct sGDALCopyWord<GFloat16, std::uint64_t>
     487             : {
     488         624 :     static inline void f(const GFloat16 hfValueIn, std::uint64_t &nValueOut)
     489             :     {
     490         624 :         if (!(hfValueIn > 0))
     491             :         {
     492           4 :             nValueOut = 0;
     493             :         }
     494         620 :         else if (CPLIsInf(hfValueIn))
     495             :         {
     496           1 :             nValueOut = cpl::NumericLimits<std::uint64_t>::max();
     497             :         }
     498             :         else
     499             :         {
     500         619 :             nValueOut = static_cast<std::uint64_t>(hfValueIn + GFloat16(0.5f));
     501             :         }
     502         624 :     }
     503             : };
     504             : 
     505             : template <> struct sGDALCopyWord<float, unsigned int>
     506             : {
     507        5173 :     static inline void f(const float fValueIn, unsigned int &nValueOut)
     508             :     {
     509        5173 :         if (!(fValueIn > 0))
     510             :         {
     511         283 :             nValueOut = 0;
     512             :         }
     513        4890 :         else if (fValueIn >=
     514        4890 :                  static_cast<float>(cpl::NumericLimits<unsigned int>::max()))
     515             :         {
     516         134 :             nValueOut = cpl::NumericLimits<unsigned int>::max();
     517             :         }
     518             :         else
     519             :         {
     520        4756 :             nValueOut = static_cast<unsigned int>(fValueIn + 0.5f);
     521             :         }
     522        5173 :     }
     523             : };
     524             : 
     525             : template <> struct sGDALCopyWord<float, std::uint64_t>
     526             : {
     527         758 :     static inline void f(const float fValueIn, std::uint64_t &nValueOut)
     528             :     {
     529         758 :         if (!(fValueIn > 0))
     530             :         {
     531         138 :             nValueOut = 0;
     532             :         }
     533         620 :         else if (fValueIn >=
     534         620 :                  static_cast<float>(cpl::NumericLimits<std::uint64_t>::max()))
     535             :         {
     536           2 :             nValueOut = cpl::NumericLimits<std::uint64_t>::max();
     537             :         }
     538             :         else
     539             :         {
     540         618 :             nValueOut = static_cast<std::uint64_t>(fValueIn + 0.5f);
     541             :         }
     542         758 :     }
     543             : };
     544             : 
     545             : template <> struct sGDALCopyWord<double, std::uint64_t>
     546             : {
     547        1378 :     static inline void f(const double dfValueIn, std::uint64_t &nValueOut)
     548             :     {
     549        1378 :         if (!(dfValueIn > 0))
     550             :         {
     551         426 :             nValueOut = 0;
     552             :         }
     553         952 :         else if (dfValueIn >
     554         952 :                  static_cast<double>(cpl::NumericLimits<uint64_t>::max()))
     555             :         {
     556           4 :             nValueOut = cpl::NumericLimits<uint64_t>::max();
     557             :         }
     558             :         else
     559             :         {
     560         948 :             nValueOut = static_cast<std::uint64_t>(dfValueIn + 0.5);
     561             :         }
     562        1378 :     }
     563             : };
     564             : 
     565             : // Floating point input to a very large unsigned integer type: nan becomes zero, otherwise round and clamp.
     566             : // Avoid infinity while clamping when the maximum integer is too large for the floating-point type.
     567             : // Avoid roundoff while clamping.
     568             : 
     569             : template <> struct sGDALCopyWord<GFloat16, unsigned short>
     570             : {
     571        4706 :     static inline void f(const GFloat16 hfValueIn, unsigned short &nValueOut)
     572             :     {
     573        4706 :         if (!(hfValueIn > 0))
     574             :         {
     575         216 :             nValueOut = 0;
     576             :         }
     577        4490 :         else if (CPLIsInf(hfValueIn))
     578             :         {
     579          67 :             nValueOut = cpl::NumericLimits<unsigned short>::max();
     580             :         }
     581             :         else
     582             :         {
     583        4423 :             nValueOut = static_cast<unsigned short>(hfValueIn + GFloat16(0.5f));
     584             :         }
     585        4706 :     }
     586             : };
     587             : 
     588             : template <> struct sGDALCopyWord<GFloat16, unsigned int>
     589             : {
     590        4639 :     static inline void f(const GFloat16 hfValueIn, unsigned int &nValueOut)
     591             :     {
     592        4639 :         if (!(hfValueIn > 0))
     593             :         {
     594         149 :             nValueOut = 0;
     595             :         }
     596        4490 :         else if (CPLIsInf(hfValueIn))
     597             :         {
     598           0 :             nValueOut = cpl::NumericLimits<unsigned int>::max();
     599             :         }
     600             :         else
     601             :         {
     602        4490 :             nValueOut = static_cast<unsigned int>(hfValueIn + GFloat16(0.5f));
     603             :         }
     604        4639 :     }
     605             : };
     606             : 
     607             : // Floating point input to a small signed integer type: nan becomes zero, otherwise round and clamp.
     608             : // Rounding for signed integers is different than for the unsigned integers above.
     609             : 
     610             : template <> struct sGDALCopyWord<GFloat16, signed char>
     611             : {
     612        1088 :     static inline void f(const GFloat16 hfValueIn, signed char &nValueOut)
     613             :     {
     614        1088 :         if (CPLIsNan(hfValueIn))
     615             :         {
     616           0 :             nValueOut = 0;
     617           0 :             return;
     618             :         }
     619             :         GFloat16 hfMaxVal, hfMinVal;
     620        1088 :         GDALGetDataLimits<GFloat16, signed char>(hfMaxVal, hfMinVal);
     621        1088 :         GFloat16 hfValue = hfValueIn >= GFloat16(0.0f)
     622         619 :                                ? hfValueIn + GFloat16(0.5f)
     623        1088 :                                : hfValueIn - GFloat16(0.5f);
     624        1088 :         nValueOut = static_cast<signed char>(
     625        2176 :             GDALClampValue(hfValue, hfMaxVal, hfMinVal));
     626             :     }
     627             : };
     628             : 
     629             : template <> struct sGDALCopyWord<float, signed char>
     630             : {
     631         454 :     static inline void f(const float fValueIn, signed char &nValueOut)
     632             :     {
     633         454 :         if (CPLIsNan(fValueIn))
     634             :         {
     635          11 :             nValueOut = 0;
     636          11 :             return;
     637             :         }
     638             :         float fMaxVal, fMinVal;
     639         443 :         GDALGetDataLimits<float, signed char>(fMaxVal, fMinVal);
     640         443 :         float fValue = fValueIn >= 0.0f ? fValueIn + 0.5f : fValueIn - 0.5f;
     641         443 :         nValueOut =
     642         443 :             static_cast<signed char>(GDALClampValue(fValue, fMaxVal, fMinVal));
     643             :     }
     644             : };
     645             : 
     646             : template <> struct sGDALCopyWord<float, short>
     647             : {
     648     5102540 :     static inline void f(const float fValueIn, short &nValueOut)
     649             :     {
     650     5102540 :         if (CPLIsNan(fValueIn))
     651             :         {
     652          15 :             nValueOut = 0;
     653          15 :             return;
     654             :         }
     655             :         float fMaxVal, fMinVal;
     656     5102520 :         GDALGetDataLimits<float, short>(fMaxVal, fMinVal);
     657     5102520 :         float fValue = fValueIn >= 0.0f ? fValueIn + 0.5f : fValueIn - 0.5f;
     658     5102520 :         nValueOut =
     659     5102520 :             static_cast<short>(GDALClampValue(fValue, fMaxVal, fMinVal));
     660             :     }
     661             : };
     662             : 
     663             : template <> struct sGDALCopyWord<double, signed char>
     664             : {
     665        2646 :     static inline void f(const double dfValueIn, signed char &nValueOut)
     666             :     {
     667        2646 :         if (CPLIsNan(dfValueIn))
     668             :         {
     669          67 :             nValueOut = 0;
     670          67 :             return;
     671             :         }
     672             :         double dfMaxVal, dfMinVal;
     673        2579 :         GDALGetDataLimits<double, signed char>(dfMaxVal, dfMinVal);
     674        2579 :         double dfValue = dfValueIn > 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5;
     675        2579 :         nValueOut = static_cast<signed char>(
     676        2579 :             GDALClampValue(dfValue, dfMaxVal, dfMinVal));
     677             :     }
     678             : };
     679             : 
     680             : template <> struct sGDALCopyWord<double, short>
     681             : {
     682     5157590 :     static inline void f(const double dfValueIn, short &nValueOut)
     683             :     {
     684     5157590 :         if (CPLIsNan(dfValueIn))
     685             :         {
     686          73 :             nValueOut = 0;
     687          73 :             return;
     688             :         }
     689             :         double dfMaxVal, dfMinVal;
     690     5157520 :         GDALGetDataLimits<double, short>(dfMaxVal, dfMinVal);
     691     5157520 :         double dfValue = dfValueIn > 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5;
     692     5157520 :         nValueOut =
     693     5157520 :             static_cast<short>(GDALClampValue(dfValue, dfMaxVal, dfMinVal));
     694             :     }
     695             : };
     696             : 
     697             : template <> struct sGDALCopyWord<double, int>
     698             : {
     699    77108600 :     static inline void f(const double dfValueIn, int &nValueOut)
     700             :     {
     701    77108600 :         if (CPLIsNan(dfValueIn))
     702             :         {
     703          71 :             nValueOut = 0;
     704          71 :             return;
     705             :         }
     706             :         double dfMaxVal, dfMinVal;
     707    77108500 :         GDALGetDataLimits<double, int>(dfMaxVal, dfMinVal);
     708    77108500 :         double dfValue = dfValueIn >= 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5;
     709    77108500 :         nValueOut =
     710    77108500 :             static_cast<int>(GDALClampValue(dfValue, dfMaxVal, dfMinVal));
     711             :     }
     712             : };
     713             : 
     714             : // Floating point input to a large signed integer type: nan becomes zero, otherwise round and clamp.
     715             : // Rounding for signed integers is different than for the unsigned integers above.
     716             : // Avoid roundoff while clamping.
     717             : 
     718             : template <> struct sGDALCopyWord<GFloat16, short>
     719             : {
     720        5289 :     static inline void f(const GFloat16 hfValueIn, short &nValueOut)
     721             :     {
     722        5289 :         if (CPLIsNan(hfValueIn))
     723             :         {
     724           0 :             nValueOut = 0;
     725             :         }
     726        5289 :         else if (hfValueIn >=
     727        5289 :                  static_cast<GFloat16>(cpl::NumericLimits<short>::max()))
     728             :         {
     729         146 :             nValueOut = cpl::NumericLimits<short>::max();
     730             :         }
     731        5143 :         else if (hfValueIn <=
     732        5143 :                  static_cast<GFloat16>(cpl::NumericLimits<short>::lowest()))
     733             :         {
     734         213 :             nValueOut = cpl::NumericLimits<short>::lowest();
     735             :         }
     736             :         else
     737             :         {
     738        9860 :             nValueOut = static_cast<short>(hfValueIn > GFloat16(0.0f)
     739        4930 :                                                ? hfValueIn + GFloat16(0.5f)
     740        5451 :                                                : hfValueIn - GFloat16(0.5f));
     741             :         }
     742        5289 :     }
     743             : };
     744             : 
     745             : template <> struct sGDALCopyWord<float, int>
     746             : {
     747       10285 :     static inline void f(const float fValueIn, int &nValueOut)
     748             :     {
     749       10285 :         if (CPLIsNan(fValueIn))
     750             :         {
     751          15 :             nValueOut = 0;
     752             :         }
     753       10270 :         else if (fValueIn >= static_cast<float>(cpl::NumericLimits<int>::max()))
     754             :         {
     755         218 :             nValueOut = cpl::NumericLimits<int>::max();
     756             :         }
     757       10052 :         else if (fValueIn <=
     758       10052 :                  static_cast<float>(cpl::NumericLimits<int>::lowest()))
     759             :         {
     760          90 :             nValueOut = cpl::NumericLimits<int>::lowest();
     761             :         }
     762             :         else
     763             :         {
     764       10196 :             nValueOut = static_cast<int>(fValueIn > 0.0f ? fValueIn + 0.5f
     765         234 :                                                          : fValueIn - 0.5f);
     766             :         }
     767       10285 :     }
     768             : };
     769             : 
     770             : template <> struct sGDALCopyWord<float, std::int64_t>
     771             : {
     772        1227 :     static inline void f(const float fValueIn, std::int64_t &nValueOut)
     773             :     {
     774        1227 :         if (CPLIsNan(fValueIn))
     775             :         {
     776          68 :             nValueOut = 0;
     777             :         }
     778        1159 :         else if (fValueIn >=
     779        1159 :                  static_cast<float>(cpl::NumericLimits<std::int64_t>::max()))
     780             :         {
     781           2 :             nValueOut = cpl::NumericLimits<std::int64_t>::max();
     782             :         }
     783        1157 :         else if (fValueIn <=
     784        1157 :                  static_cast<float>(cpl::NumericLimits<std::int64_t>::lowest()))
     785             :         {
     786           2 :             nValueOut = cpl::NumericLimits<std::int64_t>::lowest();
     787             :         }
     788             :         else
     789             :         {
     790        2310 :             nValueOut = static_cast<std::int64_t>(
     791        1155 :                 fValueIn > 0.0f ? fValueIn + 0.5f : fValueIn - 0.5f);
     792             :         }
     793        1227 :     }
     794             : };
     795             : 
     796             : template <> struct sGDALCopyWord<double, std::int64_t>
     797             : {
     798        1861 :     static inline void f(const double dfValueIn, std::int64_t &nValueOut)
     799             :     {
     800        1861 :         if (CPLIsNan(dfValueIn))
     801             :         {
     802          68 :             nValueOut = 0;
     803             :         }
     804        1793 :         else if (dfValueIn >=
     805        1793 :                  static_cast<double>(cpl::NumericLimits<std::int64_t>::max()))
     806             :         {
     807           4 :             nValueOut = cpl::NumericLimits<std::int64_t>::max();
     808             :         }
     809        1789 :         else if (dfValueIn <=
     810        1789 :                  static_cast<double>(cpl::NumericLimits<std::int64_t>::min()))
     811             :         {
     812           4 :             nValueOut = cpl::NumericLimits<std::int64_t>::min();
     813             :         }
     814             :         else
     815             :         {
     816        3570 :             nValueOut = static_cast<std::int64_t>(
     817        1785 :                 dfValueIn > 0.0 ? dfValueIn + 0.5 : dfValueIn - 0.5);
     818             :         }
     819        1861 :     }
     820             : };
     821             : 
     822             : // Floating point input to a very large signed integer type: nan becomes zero, otherwise round and clamp.
     823             : // Rounding for signed integers is different than for the unsigned integers above.
     824             : // Avoid infinity while clamping when the maximum integer is too large for the floating-point type.
     825             : // Avoid roundoff while clamping.
     826             : 
     827             : template <> struct sGDALCopyWord<GFloat16, int>
     828             : {
     829        5222 :     static inline void f(const GFloat16 hfValueIn, int &nValueOut)
     830             :     {
     831        5222 :         if (CPLIsNan(hfValueIn))
     832             :         {
     833           0 :             nValueOut = 0;
     834             :         }
     835        5222 :         else if (CPLIsInf(hfValueIn))
     836             :         {
     837           0 :             nValueOut = hfValueIn > GFloat16(0.0f)
     838           0 :                             ? cpl::NumericLimits<int>::max()
     839           0 :                             : cpl::NumericLimits<int>::lowest();
     840             :         }
     841             :         else
     842             :         {
     843       10444 :             nValueOut = static_cast<int>(hfValueIn > GFloat16(0.0f)
     844        5222 :                                              ? hfValueIn + GFloat16(0.5f)
     845        5889 :                                              : hfValueIn - GFloat16(0.5f));
     846             :         }
     847        5222 :     }
     848             : };
     849             : 
     850             : template <> struct sGDALCopyWord<GFloat16, std::int64_t>
     851             : {
     852        1093 :     static inline void f(const GFloat16 hfValueIn, std::int64_t &nValueOut)
     853             :     {
     854        1093 :         if (CPLIsNan(hfValueIn))
     855             :         {
     856           1 :             nValueOut = 0;
     857             :         }
     858        1092 :         else if (CPLIsInf(hfValueIn))
     859             :         {
     860           4 :             nValueOut = hfValueIn > GFloat16(0.0f)
     861           2 :                             ? cpl::NumericLimits<std::int64_t>::max()
     862           1 :                             : cpl::NumericLimits<std::int64_t>::lowest();
     863             :         }
     864             :         else
     865             :         {
     866        1090 :             nValueOut = static_cast<std::int64_t>(
     867        1090 :                 hfValueIn > GFloat16(0.0f) ? hfValueIn + GFloat16(0.5f)
     868        1560 :                                            : hfValueIn - GFloat16(0.5f));
     869             :         }
     870        1093 :     }
     871             : };
     872             : 
     873             : /**
     874             :  * Copy a single word, optionally rounding if appropriate (i.e. going
     875             :  * from the float to the integer case). Note that this is the function
     876             :  * you should specialize if you're adding a new data type.
     877             :  *
     878             :  * @param tValueIn value of type Tin; the input value to be converted
     879             :  * @param tValueOut value of type Tout; the output value
     880             :  */
     881             : 
     882             : template <class Tin, class Tout>
     883   488399998 : inline void GDALCopyWord(const Tin tValueIn, Tout &tValueOut)
     884             : {
     885             :     if constexpr (std::is_same<Tin, Tout>::value)
     886    42824516 :         tValueOut = tValueIn;
     887             :     else
     888   445575482 :         sGDALCopyWord<Tin, Tout>::f(tValueIn, tValueOut);
     889   488399998 : }
     890             : 
     891             : /************************************************************************/
     892             : /*                           GDALCopy4Words()                           */
     893             : /************************************************************************/
     894             : /**
     895             :  * Copy 4 packed words to 4 packed words, optionally rounding if appropriate
     896             :  * (i.e. going from the float to the integer case).
     897             :  *
     898             :  * @param pValueIn pointer to 4 input values of type Tin.
     899             :  * @param pValueOut pointer to 4 output values of type Tout.
     900             :  */
     901             : 
     902             : template <class Tin, class Tout>
     903         632 : inline void GDALCopy4Words(const Tin *pValueIn, Tout *const pValueOut)
     904             : {
     905         632 :     GDALCopyWord(pValueIn[0], pValueOut[0]);
     906         632 :     GDALCopyWord(pValueIn[1], pValueOut[1]);
     907         632 :     GDALCopyWord(pValueIn[2], pValueOut[2]);
     908         632 :     GDALCopyWord(pValueIn[3], pValueOut[3]);
     909         632 : }
     910             : 
     911             : /************************************************************************/
     912             : /*                           GDALCopy8Words()                           */
     913             : /************************************************************************/
     914             : /**
     915             :  * Copy 8 packed words to 8 packed words, optionally rounding if appropriate
     916             :  * (i.e. going from the float to the integer case).
     917             :  *
     918             :  * @param pValueIn pointer to 8 input values of type Tin.
     919             :  * @param pValueOut pointer to 8 output values of type Tout.
     920             :  */
     921             : 
     922             : template <class Tin, class Tout>
     923    44897771 : inline void GDALCopy8Words(const Tin *pValueIn, Tout *const pValueOut)
     924             : {
     925    44897771 :     GDALCopy4Words(pValueIn, pValueOut);
     926    44897771 :     GDALCopy4Words(pValueIn + 4, pValueOut + 4);
     927    44897771 : }
     928             : 
     929             : // Needs SSE2
     930             : #if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) ||               \
     931             :     defined(USE_NEON_OPTIMIZATIONS)
     932             : 
     933             : #ifdef USE_NEON_OPTIMIZATIONS
     934             : inline __m128i _mm_cvttps_epi32_neon_no_post_correction(__m128 a)
     935             : {
     936             :     float32x4_t f = vreinterpretq_f32_m128(a);
     937             :     int32x4_t cvt = vcvtq_s32_f32(f);
     938             :     return vreinterpretq_m128i_s32(cvt);
     939             : }
     940             : #endif
     941             : 
     942             : template <>
     943    31149802 : inline void GDALCopy4Words(const float *pValueIn, GByte *const pValueOut)
     944             : {
     945    31149802 :     __m128 xmm = _mm_loadu_ps(pValueIn);
     946             : 
     947             :     // The following clamping would be useless due to the final saturating
     948             :     // packing if we could guarantee the input range in [INT_MIN,INT_MAX]
     949             : 
     950             :     // Clamp to [UINT8_MIN, UINT8_MAX]
     951    31149802 :     const __m128 p0d5 = _mm_set1_ps(0.5f);
     952    31149802 :     const __m128 xmm_max = _mm_set1_ps(255);
     953             : 
     954    31149802 :     xmm = _mm_add_ps(xmm, p0d5);
     955    62299704 :     xmm = _mm_min_ps(_mm_max_ps(xmm, p0d5), xmm_max);
     956             : 
     957             : #ifdef USE_NEON_OPTIMIZATIONS
     958             :     // Optimization to avoid useless clamping
     959             :     __m128i xmm_i = _mm_cvttps_epi32_neon_no_post_correction(xmm);
     960             : #else
     961    31149802 :     __m128i xmm_i = _mm_cvttps_epi32(xmm);
     962             : #endif
     963             : 
     964             : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
     965             :     xmm_i = _mm_shuffle_epi8(
     966             :         xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
     967             : #else
     968    31149802 :     xmm_i = _mm_packs_epi32(xmm_i, xmm_i);   // Pack int32 to int16
     969    31149802 :     xmm_i = _mm_packus_epi16(xmm_i, xmm_i);  // Pack int16 to uint8
     970             : #endif
     971    31149802 :     GDALCopyXMMToInt32(xmm_i, pValueOut);
     972    31149802 : }
     973             : 
     974    43492800 : static inline __m128 AddDotZeroBias(__m128 xmm)
     975             : {
     976    43492800 :     const __m128 zeroDotFive = _mm_set1_ps(0.5f);
     977    43492800 :     const __m128 negativeZero = _mm_set1_ps(-0.0f);
     978             :     // f >= 0 ? f + 0.5f : f - 0.5f
     979    86985700 :     const __m128 bias = _mm_or_ps(zeroDotFive, _mm_and_ps(xmm, negativeZero));
     980    43492800 :     xmm = _mm_add_ps(xmm, bias);
     981    43492800 :     return xmm;
     982             : }
     983             : 
     984             : template <>
     985         218 : inline void GDALCopy4Words(const float *pValueIn, GInt8 *const pValueOut)
     986             : {
     987         218 :     __m128 xmm = _mm_loadu_ps(pValueIn);
     988             : 
     989             : #if !defined(USE_NEON_OPTIMIZATIONS)
     990             :     // Cast NaN to zero
     991         436 :     xmm = _mm_andnot_ps(_mm_cmpunord_ps(xmm, xmm), xmm);
     992             : #endif
     993             : 
     994             :     // Clamp to [INT8_MIN, INT8_MAX]
     995         218 :     const __m128 xmm_min = _mm_set1_ps(-128);
     996         218 :     const __m128 xmm_max = _mm_set1_ps(127);
     997         218 :     xmm = _mm_min_ps(_mm_max_ps(xmm, xmm_min), xmm_max);
     998             : 
     999         218 :     xmm = AddDotZeroBias(xmm);
    1000             : 
    1001             : #ifdef USE_NEON_OPTIMIZATIONS
    1002             :     // Optimization to avoid useless clamping
    1003             :     __m128i xmm_i = _mm_cvttps_epi32_neon_no_post_correction(xmm);
    1004             : #else
    1005         218 :     __m128i xmm_i = _mm_cvttps_epi32(xmm);
    1006             : #endif
    1007             : 
    1008             : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
    1009             :     xmm_i = _mm_shuffle_epi8(
    1010             :         xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
    1011             : #else
    1012         218 :     xmm_i = _mm_packs_epi32(xmm_i, xmm_i);  // Pack int32 to int16
    1013         218 :     xmm_i = _mm_packs_epi16(xmm_i, xmm_i);  // Pack int16 to int8
    1014             : #endif
    1015         218 :     GDALCopyXMMToInt32(xmm_i, pValueOut);
    1016         218 : }
    1017             : 
    1018             : template <>
    1019     3364010 : inline void GDALCopy4Words(const float *pValueIn, GInt16 *const pValueOut)
    1020             : {
    1021     3364010 :     __m128 xmm = _mm_loadu_ps(pValueIn);
    1022             : 
    1023             : #if !defined(USE_NEON_OPTIMIZATIONS)
    1024             :     // Cast NaN to zero
    1025     6728020 :     xmm = _mm_andnot_ps(_mm_cmpunord_ps(xmm, xmm), xmm);
    1026             : #endif
    1027             : 
    1028             :     // Clamp to [INT16_MIN, INT16_MAX]
    1029     3364010 :     const __m128 xmm_min = _mm_set1_ps(-32768);
    1030     3364010 :     const __m128 xmm_max = _mm_set1_ps(32767);
    1031     3364010 :     xmm = _mm_min_ps(_mm_max_ps(xmm, xmm_min), xmm_max);
    1032             : 
    1033     3364010 :     xmm = AddDotZeroBias(xmm);
    1034             : 
    1035             : #ifdef USE_NEON_OPTIMIZATIONS
    1036             :     // Optimization to avoid useless clamping
    1037             :     __m128i xmm_i = _mm_cvttps_epi32_neon_no_post_correction(xmm);
    1038             : #else
    1039     3364010 :     __m128i xmm_i = _mm_cvttps_epi32(xmm);
    1040             : #endif
    1041             : 
    1042     3364010 :     xmm_i = _mm_packs_epi32(xmm_i, xmm_i);  // Pack int32 to int16
    1043     3364010 :     GDALCopyXMMToInt64(xmm_i, pValueOut);
    1044     3364010 : }
    1045             : 
    1046          13 : inline __m128i GDAL_mm_int32_to_uint16(__m128i xmm_i)
    1047             : {
    1048             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    1049             :     xmm_i = _mm_packus_epi32(xmm_i, xmm_i);  // Pack int32 to uint16
    1050             : #else
    1051             :     // Translate to int16 range because _mm_packus_epi32 is SSE4.1 only
    1052          26 :     xmm_i = _mm_add_epi32(xmm_i, _mm_set1_epi32(-32768));
    1053          13 :     xmm_i = _mm_packs_epi32(xmm_i, xmm_i);  // Pack int32 to int16
    1054             :     // Translate back to uint16 range (actually -32768==32768 in int16)
    1055          13 :     xmm_i = _mm_add_epi16(xmm_i, _mm_set1_epi16(-32768));
    1056             : #endif
    1057          13 :     return xmm_i;
    1058             : }
    1059             : 
    1060     8302141 : inline __m128i GDAL_mm_packus_epi32(__m128i xmm_lo, __m128i xmm_hi)
    1061             : {
    1062             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    1063             :     auto xmm = _mm_packus_epi32(xmm_lo, xmm_hi);  // Pack int32 to uint16
    1064             : #else
    1065             :     // Translate to int16 range because _mm_packus_epi32 is SSE4.1 only
    1066    16604302 :     xmm_lo = _mm_add_epi32(xmm_lo, _mm_set1_epi32(-32768));
    1067    16604302 :     xmm_hi = _mm_add_epi32(xmm_hi, _mm_set1_epi32(-32768));
    1068     8302141 :     auto xmm = _mm_packs_epi32(xmm_lo, xmm_hi);  // Pack int32 to int16
    1069             :     // Translate back to uint16 range (actually -32768==32768 in int16)
    1070     8302141 :     xmm = _mm_add_epi16(xmm, _mm_set1_epi16(-32768));
    1071             : #endif
    1072     8302141 :     return xmm;
    1073             : }
    1074             : 
    1075             : template <>
    1076           1 : inline void GDALCopy4Words(const float *pValueIn, GUInt16 *const pValueOut)
    1077             : {
    1078           1 :     __m128 xmm = _mm_loadu_ps(pValueIn);
    1079             : 
    1080             :     // Clamp to [UINT16_MIN, UINT16_MAX]
    1081           1 :     const __m128 p0d5 = _mm_set1_ps(0.5f);
    1082           1 :     const __m128 xmm_max = _mm_set1_ps(65535);
    1083           1 :     xmm = _mm_add_ps(xmm, p0d5);
    1084           2 :     xmm = _mm_min_ps(_mm_max_ps(xmm, p0d5), xmm_max);
    1085             : 
    1086             : #ifdef USE_NEON_OPTIMIZATIONS
    1087             :     // Optimization to avoid useless clamping
    1088             :     __m128i xmm_i = _mm_cvttps_epi32_neon_no_post_correction(xmm);
    1089             : #else
    1090           1 :     __m128i xmm_i = _mm_cvttps_epi32(xmm);
    1091             : #endif
    1092             : 
    1093           1 :     xmm_i = GDAL_mm_int32_to_uint16(xmm_i);
    1094           1 :     GDALCopyXMMToInt64(xmm_i, pValueOut);
    1095           1 : }
    1096             : 
    1097    40300600 : static inline __m128i GDALIfThenElse(__m128i mask, __m128i thenVal,
    1098             :                                      __m128i elseVal)
    1099             : {
    1100             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    1101             :     return _mm_blendv_epi8(elseVal, thenVal, mask);
    1102             : #else
    1103    80601300 :     return _mm_or_si128(_mm_and_si128(mask, thenVal),
    1104    40300600 :                         _mm_andnot_si128(mask, elseVal));
    1105             : #endif
    1106             : }
    1107             : 
    1108             : template <>
    1109    40128600 : inline void GDALCopy4Words(const float *pValueIn, GInt32 *const pValueOut)
    1110             : {
    1111    40128600 :     __m128 xmm = _mm_loadu_ps(pValueIn);
    1112             : 
    1113             : #if !defined(USE_NEON_OPTIMIZATIONS)
    1114             :     // Cast NaN to zero
    1115    40128600 :     xmm = _mm_andnot_ps(_mm_cmpunord_ps(xmm, xmm), xmm);
    1116             : #endif
    1117             : 
    1118    40128600 :     xmm = AddDotZeroBias(xmm);
    1119             : 
    1120             : #ifdef USE_NEON_OPTIMIZATIONS
    1121             :     // Optimization to avoid useless clamping
    1122             :     __m128i xmm_i = _mm_cvttps_epi32_neon_no_post_correction(xmm);
    1123             : #else
    1124    40128600 :     __m128i xmm_i = _mm_cvttps_epi32(xmm);
    1125             : 
    1126             :     // Clamp to <= INT32_MAX
    1127    40128600 :     const __m128 xmm_max = _mm_set1_ps(2147483648.0f);
    1128    40128600 :     const __m128i xmm_i_max = _mm_set1_epi32(INT_MAX);
    1129    80257200 :     xmm_i = GDALIfThenElse(_mm_castps_si128(_mm_cmpge_ps(xmm, xmm_max)),
    1130             :                            xmm_i_max, xmm_i);
    1131             : #endif
    1132             : 
    1133             :     _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut), xmm_i);
    1134    40128600 : }
    1135             : 
    1136             : static inline __m128 GDALIfThenElse(__m128 mask, __m128 thenVal, __m128 elseVal)
    1137             : {
    1138             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    1139             :     return _mm_blendv_ps(elseVal, thenVal, mask);
    1140             : #else
    1141             :     return _mm_or_ps(_mm_and_ps(mask, thenVal), _mm_andnot_ps(mask, elseVal));
    1142             : #endif
    1143             : }
    1144             : 
    1145             : // ARM64 has an efficient instruction for Float32 -> Float16
    1146             : #if !(defined(HAVE__FLOAT16) &&                                                \
    1147             :       (defined(__aarch64__) && defined(_M_ARM64))) &&                          \
    1148             :     !(defined(__AVX2__) && defined(__F16C__))
    1149             : 
    1150         630 : inline __m128i GDALFourFloat32ToFloat16(__m128i xmm)
    1151             : {
    1152             :     // Ported from https://github.com/simd-everywhere/simde/blob/51743e7920b6e867678cb50e9c62effe28f70b33/simde/simde-f16.h#L176
    1153             :     // to SSE2 in a branch-less way
    1154             : 
    1155             :     // clang-format off
    1156             : 
    1157             :     /* This code is CC0, based heavily on code by Fabian Giesen. */
    1158         630 :     const __m128i f32u_infinity = _mm_set1_epi32(255 << 23);
    1159         630 :     const __m128i f16u_max = _mm_set1_epi32((127 + 16) << 23);
    1160         630 :     const __m128i denorm_magic = _mm_set1_epi32(((127 - 15) + (23 - 10) + 1) << 23);
    1161             : 
    1162        1260 :     const auto sign = _mm_and_si128(xmm, _mm_set1_epi32(INT_MIN));
    1163         630 :     xmm = _mm_xor_si128(xmm, sign);
    1164       11340 :     xmm = GDALIfThenElse(
    1165             :         _mm_cmpgt_epi32(xmm, f16u_max),
    1166             :         /* result is Inf or NaN (all exponent bits set) */
    1167             :         GDALIfThenElse(
    1168             :             _mm_cmpgt_epi32(xmm, f32u_infinity),
    1169             :                 /* NaN->qNaN and Inf->Inf */
    1170             :                _mm_set1_epi32(0x7e00),
    1171             :                _mm_set1_epi32(0x7c00)),
    1172             :         /* (De)normalized number or zero */
    1173             :         GDALIfThenElse(
    1174             :             _mm_cmplt_epi32(xmm, _mm_set1_epi32(113 << 23)),
    1175             :              /* use a magic value to align our 10 mantissa bits at the bottom of
    1176             :               * the float. as long as FP addition is round-to-nearest-even this
    1177             :               * just works. */
    1178             :             _mm_sub_epi32(
    1179             :                _mm_castps_si128(_mm_add_ps(_mm_castsi128_ps(xmm),
    1180             :                                            _mm_castsi128_ps(denorm_magic))),
    1181             :                /* and one integer subtract of the bias later,
    1182             :                 * we have our final float! */
    1183             :                denorm_magic
    1184             :             ),
    1185             :             _mm_srli_epi32(
    1186             :                 _mm_add_epi32(
    1187             :                    /* update exponent, rounding bias part 1 */
    1188             :                    // (unsigned)-0x37fff001 = ((unsigned)(15-127) << 23) + 0xfff
    1189             :                   _mm_add_epi32(xmm, _mm_set1_epi32(-0x37fff001)),
    1190             :                    /* rounding bias part 2, using mant_odd */
    1191             :                   _mm_and_si128(_mm_srli_epi32(xmm, 13), _mm_set1_epi32(1))),
    1192             :                 13
    1193             :             )
    1194             :         )
    1195             :     );
    1196         630 :     xmm = _mm_or_si128(xmm, _mm_srli_epi32(sign, 16));
    1197             : 
    1198             :     // clang-format on
    1199         630 :     return xmm;
    1200             : }
    1201             : 
    1202             : template <>
    1203         315 : inline void GDALCopy8Words(const float *pValueIn, GFloat16 *const pValueOut)
    1204             : {
    1205             :     __m128i xmm_lo =
    1206         630 :         GDALFourFloat32ToFloat16(_mm_castps_si128(_mm_loadu_ps(pValueIn)));
    1207             :     __m128i xmm_hi =
    1208         945 :         GDALFourFloat32ToFloat16(_mm_castps_si128(_mm_loadu_ps(pValueIn + 4)));
    1209             : 
    1210         315 :     auto xmm = GDAL_mm_packus_epi32(xmm_lo, xmm_hi);  // Pack int32 to uint16
    1211             :     _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut), xmm);
    1212         315 : }
    1213             : 
    1214             : #endif
    1215             : 
    1216             : template <>
    1217      624550 : inline void GDALCopy4Words(const double *pValueIn, float *const pValueOut)
    1218             : {
    1219      624550 :     const __m128d val01 = _mm_loadu_pd(pValueIn);
    1220     1249100 :     const __m128d val23 = _mm_loadu_pd(pValueIn + 2);
    1221      624550 :     const __m128 val01_s = _mm_cvtpd_ps(val01);
    1222      624550 :     const __m128 val23_s = _mm_cvtpd_ps(val23);
    1223      624550 :     const __m128 val = _mm_movelh_ps(val01_s, val23_s);
    1224             :     _mm_storeu_ps(pValueOut, val);
    1225      624550 : }
    1226             : 
    1227             : template <>
    1228     2776310 : inline void GDALCopy4Words(const double *pValueIn, GByte *const pValueOut)
    1229             : {
    1230     2776310 :     const __m128d p0d5 = _mm_set1_pd(0.5);
    1231     2776310 :     const __m128d xmm_max = _mm_set1_pd(255);
    1232             : 
    1233     2776310 :     __m128d val01 = _mm_loadu_pd(pValueIn);
    1234     5552620 :     __m128d val23 = _mm_loadu_pd(pValueIn + 2);
    1235     2776310 :     val01 = _mm_add_pd(val01, p0d5);
    1236     5552620 :     val01 = _mm_min_pd(_mm_max_pd(val01, p0d5), xmm_max);
    1237     2776310 :     val23 = _mm_add_pd(val23, p0d5);
    1238     5552620 :     val23 = _mm_min_pd(_mm_max_pd(val23, p0d5), xmm_max);
    1239             : 
    1240     2776310 :     const __m128i val01_u32 = _mm_cvttpd_epi32(val01);
    1241     2776310 :     const __m128i val23_u32 = _mm_cvttpd_epi32(val23);
    1242             : 
    1243             :     // Merge 4 int32 values into a single register
    1244     8328920 :     auto xmm_i = _mm_castpd_si128(_mm_shuffle_pd(
    1245             :         _mm_castsi128_pd(val01_u32), _mm_castsi128_pd(val23_u32), 0));
    1246             : 
    1247             : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
    1248             :     xmm_i = _mm_shuffle_epi8(
    1249             :         xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
    1250             : #else
    1251     2776310 :     xmm_i = _mm_packs_epi32(xmm_i, xmm_i);   // Pack int32 to int16
    1252     2776310 :     xmm_i = _mm_packus_epi16(xmm_i, xmm_i);  // Pack int16 to uint8
    1253             : #endif
    1254     2776310 :     GDALCopyXMMToInt32(xmm_i, pValueOut);
    1255     2776310 : }
    1256             : 
    1257             : template <>
    1258    11751400 : inline void GDALCopy4Words(const float *pValueIn, double *const pValueOut)
    1259             : {
    1260    11751400 :     const __m128 valIn = _mm_loadu_ps(pValueIn);
    1261    11751400 :     _mm_storeu_pd(pValueOut, _mm_cvtps_pd(valIn));
    1262    23502900 :     _mm_storeu_pd(pValueOut + 2, _mm_cvtps_pd(_mm_movehl_ps(valIn, valIn)));
    1263    11751400 : }
    1264             : 
    1265             : #ifdef __F16C__
    1266             : template <>
    1267             : inline void GDALCopy4Words(const GFloat16 *pValueIn, float *const pValueOut)
    1268             : {
    1269             :     __m128i xmm = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pValueIn));
    1270             :     _mm_storeu_ps(pValueOut, _mm_cvtph_ps(xmm));
    1271             : }
    1272             : 
    1273             : template <>
    1274             : inline void GDALCopy4Words(const float *pValueIn, GFloat16 *const pValueOut)
    1275             : {
    1276             :     __m128 xmm = _mm_loadu_ps(pValueIn);
    1277             :     GDALCopyXMMToInt64(_mm_cvtps_ph(xmm, _MM_FROUND_TO_NEAREST_INT), pValueOut);
    1278             : }
    1279             : 
    1280             : template <>
    1281             : inline void GDALCopy4Words(const GFloat16 *pValueIn, double *const pValueOut)
    1282             : {
    1283             :     float tmp[4];
    1284             :     GDALCopy4Words(pValueIn, tmp);
    1285             :     GDALCopy4Words(tmp, pValueOut);
    1286             : }
    1287             : 
    1288             : template <>
    1289             : inline void GDALCopy4Words(const double *pValueIn, GFloat16 *const pValueOut)
    1290             : {
    1291             :     float tmp[4];
    1292             :     GDALCopy4Words(pValueIn, tmp);
    1293             :     GDALCopy4Words(tmp, pValueOut);
    1294             : }
    1295             : 
    1296             : // ARM64 has an efficient instruction for Float16 -> Float32/Float64
    1297             : #elif !(defined(HAVE__FLOAT16) && (defined(__aarch64__) && defined(_M_ARM64)))
    1298             : 
    1299             : // Convert 4 float16 values to 4 float 32 values
    1300             : // xmm must contain 4 float16 values stored in 32 bit each (with upper 16 bits at zero)
    1301       41668 : static inline __m128i GDALFourFloat16ToFloat32(__m128i xmm)
    1302             : {
    1303             :     // Ported from https://github.com/simd-everywhere/simde/blob/51743e7920b6e867678cb50e9c62effe28f70b33/simde/simde-f16.h#L242C4-L242C68
    1304             :     // to SSE2 in a branch-less way
    1305             : 
    1306             :     /* This code is CC0, based heavily on code by Fabian Giesen. */
    1307             :     const auto denorm_magic =
    1308       83336 :         _mm_castsi128_ps(_mm_set1_epi32((128 - 15) << 23));
    1309             :     const auto shifted_exp =
    1310       41668 :         _mm_set1_epi32(0x7c00 << 13); /* exponent mask after shift */
    1311             : 
    1312             :     // Shift exponent and mantissa bits to their position in a float32
    1313      125004 :     auto f32u = _mm_slli_epi32(_mm_and_si128(xmm, _mm_set1_epi32(0x7fff)), 13);
    1314             :     // Extract the (shifted) exponent
    1315       41668 :     const auto exp = _mm_and_si128(shifted_exp, f32u);
    1316             :     // Adjust the exponent
    1317       41668 :     const auto exp_adjustment = _mm_set1_epi32((127 - 15) << 23);
    1318       41668 :     f32u = _mm_add_epi32(f32u, exp_adjustment);
    1319             : 
    1320       41668 :     const auto is_inf_nan = _mm_cmpeq_epi32(exp, shifted_exp); /* Inf/NaN? */
    1321             :     // When is_inf_nan is true: extra exponent adjustment
    1322       41668 :     const auto f32u_inf_nan = _mm_add_epi32(f32u, exp_adjustment);
    1323             : 
    1324             :     const auto is_denormal =
    1325       83336 :         _mm_cmpeq_epi32(exp, _mm_setzero_si128()); /* Zero/Denormal? */
    1326             :     // When is_denormal is true:
    1327       83336 :     auto f32u_denormal = _mm_add_epi32(f32u, _mm_set1_epi32(1 << 23));
    1328       83336 :     f32u_denormal = _mm_castps_si128(
    1329             :         _mm_sub_ps(_mm_castsi128_ps(f32u_denormal), denorm_magic));
    1330             : 
    1331       41668 :     f32u = GDALIfThenElse(is_inf_nan, f32u_inf_nan, f32u);
    1332       41668 :     f32u = GDALIfThenElse(is_denormal, f32u_denormal, f32u);
    1333             : 
    1334             :     // Re-apply sign bit
    1335      125004 :     f32u = _mm_or_si128(
    1336             :         f32u, _mm_slli_epi32(_mm_and_si128(xmm, _mm_set1_epi32(0x8000)), 16));
    1337       41668 :     return f32u;
    1338             : }
    1339             : 
    1340             : template <>
    1341         388 : inline void GDALCopy8Words(const GFloat16 *pValueIn, float *const pValueOut)
    1342             : {
    1343         388 :     __m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
    1344             :     const auto xmm_0 =
    1345         776 :         GDALFourFloat16ToFloat32(_mm_unpacklo_epi16(xmm, _mm_setzero_si128()));
    1346             :     const auto xmm_1 =
    1347         776 :         GDALFourFloat16ToFloat32(_mm_unpackhi_epi16(xmm, _mm_setzero_si128()));
    1348         388 :     _mm_storeu_ps(pValueOut + 0, _mm_castsi128_ps(xmm_0));
    1349         388 :     _mm_storeu_ps(pValueOut + 4, _mm_castsi128_ps(xmm_1));
    1350         388 : }
    1351             : 
    1352             : template <>
    1353       20446 : inline void GDALCopy8Words(const GFloat16 *pValueIn, double *const pValueOut)
    1354             : {
    1355       20446 :     __m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
    1356       61338 :     const auto xmm_0 = _mm_castsi128_ps(
    1357             :         GDALFourFloat16ToFloat32(_mm_unpacklo_epi16(xmm, _mm_setzero_si128())));
    1358       61338 :     const auto xmm_1 = _mm_castsi128_ps(
    1359             :         GDALFourFloat16ToFloat32(_mm_unpackhi_epi16(xmm, _mm_setzero_si128())));
    1360       20446 :     _mm_storeu_pd(pValueOut + 0, _mm_cvtps_pd(xmm_0));
    1361       40892 :     _mm_storeu_pd(pValueOut + 2, _mm_cvtps_pd(_mm_movehl_ps(xmm_0, xmm_0)));
    1362       20446 :     _mm_storeu_pd(pValueOut + 4, _mm_cvtps_pd(xmm_1));
    1363       40892 :     _mm_storeu_pd(pValueOut + 6, _mm_cvtps_pd(_mm_movehl_ps(xmm_1, xmm_1)));
    1364       20446 : }
    1365             : 
    1366             : #endif  // __F16C__
    1367             : 
    1368             : #ifdef __AVX2__
    1369             : 
    1370             : #include <immintrin.h>
    1371             : 
    1372             : template <>
    1373             : inline void GDALCopy8Words(const double *pValueIn, float *const pValueOut)
    1374             : {
    1375             :     const __m256d val0123 = _mm256_loadu_pd(pValueIn);
    1376             :     const __m256d val4567 = _mm256_loadu_pd(pValueIn + 4);
    1377             :     const __m256 val0123_s = _mm256_castps128_ps256(_mm256_cvtpd_ps(val0123));
    1378             :     const __m256 val4567_s = _mm256_castps128_ps256(_mm256_cvtpd_ps(val4567));
    1379             :     const __m256 val =
    1380             :         _mm256_permute2f128_ps(val0123_s, val4567_s, 0 | (2 << 4));
    1381             :     _mm256_storeu_ps(pValueOut, val);
    1382             : }
    1383             : 
    1384             : template <>
    1385             : inline void GDALCopy8Words(const float *pValueIn, double *const pValueOut)
    1386             : {
    1387             :     const __m256 valIn = _mm256_loadu_ps(pValueIn);
    1388             :     _mm256_storeu_pd(pValueOut, _mm256_cvtps_pd(_mm256_castps256_ps128(valIn)));
    1389             :     _mm256_storeu_pd(pValueOut + 4,
    1390             :                      _mm256_cvtps_pd(_mm256_castps256_ps128(
    1391             :                          _mm256_permute2f128_ps(valIn, valIn, 1))));
    1392             : }
    1393             : 
    1394             : #ifdef __F16C__
    1395             : 
    1396             : template <>
    1397             : inline void GDALCopy8Words(const GFloat16 *pValueIn, float *const pValueOut)
    1398             : {
    1399             :     __m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
    1400             :     _mm256_storeu_ps(pValueOut, _mm256_cvtph_ps(xmm));
    1401             : }
    1402             : 
    1403             : template <>
    1404             : inline void GDALCopy8Words(const float *pValueIn, GFloat16 *const pValueOut)
    1405             : {
    1406             :     __m256 ymm = _mm256_loadu_ps(pValueIn);
    1407             :     _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut),
    1408             :                      _mm256_cvtps_ph(ymm, _MM_FROUND_TO_NEAREST_INT));
    1409             : }
    1410             : 
    1411             : template <>
    1412             : inline void GDALCopy8Words(const GFloat16 *pValueIn, double *const pValueOut)
    1413             : {
    1414             :     __m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
    1415             :     const auto ymm = _mm256_cvtph_ps(xmm);
    1416             :     _mm256_storeu_pd(pValueOut, _mm256_cvtps_pd(_mm256_extractf128_ps(ymm, 0)));
    1417             :     _mm256_storeu_pd(pValueOut + 4,
    1418             :                      _mm256_cvtps_pd(_mm256_extractf128_ps(ymm, 1)));
    1419             : }
    1420             : 
    1421             : template <>
    1422             : inline void GDALCopy8Words(const double *pValueIn, GFloat16 *const pValueOut)
    1423             : {
    1424             :     __m256d ymm0 = _mm256_loadu_pd(pValueIn);
    1425             :     __m256d ymm1 = _mm256_loadu_pd(pValueIn + 4);
    1426             :     __m256 ymm = _mm256_set_m128(_mm256_cvtpd_ps(ymm1), _mm256_cvtpd_ps(ymm0));
    1427             :     _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut),
    1428             :                      _mm256_cvtps_ph(ymm, _MM_FROUND_TO_NEAREST_INT));
    1429             : }
    1430             : 
    1431             : #endif
    1432             : 
    1433             : template <>
    1434             : inline void GDALCopy8Words(const float *pValueIn, GByte *const pValueOut)
    1435             : {
    1436             :     __m256 ymm = _mm256_loadu_ps(pValueIn);
    1437             : 
    1438             :     const __m256 p0d5 = _mm256_set1_ps(0.5f);
    1439             :     const __m256 ymm_max = _mm256_set1_ps(255);
    1440             :     ymm = _mm256_add_ps(ymm, p0d5);
    1441             :     ymm = _mm256_min_ps(_mm256_max_ps(ymm, p0d5), ymm_max);
    1442             : 
    1443             :     __m256i ymm_i = _mm256_cvttps_epi32(ymm);
    1444             : 
    1445             :     ymm_i = _mm256_packus_epi32(ymm_i, ymm_i);  // Pack int32 to uint16
    1446             :     ymm_i = _mm256_permute4x64_epi64(ymm_i, 0 | (2 << 2));  // AVX2
    1447             : 
    1448             :     __m128i xmm_i = _mm256_castsi256_si128(ymm_i);
    1449             :     xmm_i = _mm_packus_epi16(xmm_i, xmm_i);
    1450             :     GDALCopyXMMToInt64(xmm_i, pValueOut);
    1451             : }
    1452             : 
    1453             : template <>
    1454             : inline void GDALCopy8Words(const float *pValueIn, GUInt16 *const pValueOut)
    1455             : {
    1456             :     __m256 ymm = _mm256_loadu_ps(pValueIn);
    1457             : 
    1458             :     const __m256 p0d5 = _mm256_set1_ps(0.5f);
    1459             :     const __m256 ymm_max = _mm256_set1_ps(65535);
    1460             :     ymm = _mm256_add_ps(ymm, p0d5);
    1461             :     ymm = _mm256_min_ps(_mm256_max_ps(ymm, p0d5), ymm_max);
    1462             : 
    1463             :     __m256i ymm_i = _mm256_cvttps_epi32(ymm);
    1464             : 
    1465             :     ymm_i = _mm256_packus_epi32(ymm_i, ymm_i);  // Pack int32 to uint16
    1466             :     ymm_i = _mm256_permute4x64_epi64(ymm_i, 0 | (2 << 2));  // AVX2
    1467             : 
    1468             :     _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut),
    1469             :                      _mm256_castsi256_si128(ymm_i));
    1470             : }
    1471             : #else
    1472             : template <>
    1473     7785701 : inline void GDALCopy8Words(const float *pValueIn, GUInt16 *const pValueOut)
    1474             : {
    1475     7785701 :     __m128 xmm = _mm_loadu_ps(pValueIn);
    1476    15571402 :     __m128 xmm1 = _mm_loadu_ps(pValueIn + 4);
    1477             : 
    1478     7785701 :     const __m128 p0d5 = _mm_set1_ps(0.5f);
    1479     7785701 :     const __m128 xmm_max = _mm_set1_ps(65535);
    1480     7785701 :     xmm = _mm_add_ps(xmm, p0d5);
    1481     7785701 :     xmm1 = _mm_add_ps(xmm1, p0d5);
    1482    15571402 :     xmm = _mm_min_ps(_mm_max_ps(xmm, p0d5), xmm_max);
    1483    15571402 :     xmm1 = _mm_min_ps(_mm_max_ps(xmm1, p0d5), xmm_max);
    1484             : 
    1485             : #ifdef USE_NEON_OPTIMIZATIONS
    1486             :     // Optimization to avoid useless clamping
    1487             :     __m128i xmm_i = _mm_cvttps_epi32_neon_no_post_correction(xmm);
    1488             :     __m128i xmm1_i = _mm_cvttps_epi32_neon_no_post_correction(xmm1);
    1489             : #else
    1490     7785701 :     __m128i xmm_i = _mm_cvttps_epi32(xmm);
    1491     7785701 :     __m128i xmm1_i = _mm_cvttps_epi32(xmm1);
    1492             : #endif
    1493             : 
    1494     7785701 :     xmm_i = GDAL_mm_packus_epi32(xmm_i, xmm1_i);  // Pack int32 to uint16
    1495             :     _mm_storeu_si128(reinterpret_cast<__m128i *>(pValueOut), xmm_i);
    1496     7785701 : }
    1497             : #endif
    1498             : 
    1499             : // ARM64 has an efficient instruction for Float64 -> Float16
    1500             : #if !(defined(HAVE__FLOAT16) &&                                                \
    1501             :       (defined(__aarch64__) && defined(_M_ARM64))) &&                          \
    1502             :     !(defined(__AVX2__) && defined(__F16C__))
    1503             : template <>
    1504         235 : inline void GDALCopy8Words(const double *pValueIn, GFloat16 *const pValueOut)
    1505             : {
    1506             :     float fVal[8];
    1507         235 :     GDALCopy8Words(pValueIn, fVal);
    1508         235 :     GDALCopy8Words(fVal, pValueOut);
    1509         235 : }
    1510             : #endif
    1511             : 
    1512             : #endif  //  defined(__x86_64) || defined(_M_X64)
    1513             : 
    1514             : #endif  // GDAL_PRIV_TEMPLATES_HPP_INCLUDED

Generated by: LCOV version 1.14