LCOV - gdal_filtered.info - gcore/gdalsse

LCOV - code coverage report

Current view:	top level - gcore - gdalsse_priv.h (source / functions)		Hit	Total	Coverage
Test:	gdal_filtered.info	Lines:	691	695	99.4 %
Date:	2025-08-01 10:10:57	Functions:	139	139	100.0 %

          Line data    Source code

       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  GDAL
       4             :  * Purpose:  SSE2 helper
       5             :  * Author:   Even Rouault <even dot rouault at spatialys dot com>
       6             :  *
       7             :  ******************************************************************************
       8             :  * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
       9             :  *
      10             :  * SPDX-License-Identifier: MIT
      11             :  ****************************************************************************/
      12             : 
      13             : #ifndef GDALSSE_PRIV_H_INCLUDED
      14             : #define GDALSSE_PRIV_H_INCLUDED
      15             : 
      16             : #ifndef DOXYGEN_SKIP
      17             : 
      18             : #include "cpl_port.h"
      19             : 
      20             : /* We restrict to 64bit processors because they are guaranteed to have SSE2 */
      21             : /* Could possibly be used too on 32bit, but we would need to check at runtime */
      22             : #if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) &&             \
      23             :     !defined(USE_SSE2_EMULATION)
      24             : 
      25             : #include <string.h>
      26             : 
      27             : #ifdef USE_NEON_OPTIMIZATIONS
      28             : #include "include_sse2neon.h"
      29             : #else
      30             : /* Requires SSE2 */
      31             : #include <emmintrin.h>
      32             : 
      33             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
      34             : #include <smmintrin.h>
      35             : #endif
      36             : #endif
      37             : 
      38             : #include "gdal_priv_templates.hpp"
      39             : 
      40      535484 : static inline __m128i GDALCopyInt16ToXMM(const void *ptr)
      41             : {
      42             :     unsigned short s;
      43      535484 :     memcpy(&s, ptr, 2);
      44     1070970 :     return _mm_cvtsi32_si128(s);
      45             : }
      46             : 
      47   379216205 : static inline __m128i GDALCopyInt32ToXMM(const void *ptr)
      48             : {
      49             :     GInt32 i;
      50   379216205 :     memcpy(&i, ptr, 4);
      51   758432420 :     return _mm_cvtsi32_si128(i);
      52             : }
      53             : 
      54      265088 : static inline __m128i GDALCopyInt64ToXMM(const void *ptr)
      55             : {
      56             : #if defined(__i386__) || defined(_M_IX86)
      57             :     return _mm_loadl_epi64(static_cast<const __m128i *>(ptr));
      58             : #else
      59             :     GInt64 i;
      60      265088 :     memcpy(&i, ptr, 8);
      61      530176 :     return _mm_cvtsi64_si128(i);
      62             : #endif
      63             : }
      64             : 
      65             : #ifndef GDALCopyXMMToInt16_defined
      66             : #define GDALCopyXMMToInt16_defined
      67             : 
      68             : static inline void GDALCopyXMMToInt16(const __m128i xmm, void *pDest)
      69             : {
      70             :     GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
      71             :     memcpy(pDest, &i, 2);
      72             : }
      73             : #endif
      74             : 
      75             : class XMMReg4Int;
      76             : 
      77             : class XMMReg4Double;
      78             : 
      79             : class XMMReg4Float
      80             : {
      81             :   public:
      82             :     __m128 xmm;
      83             : 
      84    25183300 :     XMMReg4Float()
      85             : #if !defined(_MSC_VER)
      86    25183300 :         : xmm(_mm_undefined_ps())
      87             : #endif
      88             :     {
      89    25183300 :     }
      90             : 
      91       13685 :     XMMReg4Float(const XMMReg4Float &other) : xmm(other.xmm)
      92             :     {
      93       13685 :     }
      94             : 
      95         211 :     static inline XMMReg4Float Zero()
      96             :     {
      97         211 :         XMMReg4Float reg;
      98         211 :         reg.Zeroize();
      99         211 :         return reg;
     100             :     }
     101             : 
     102      264523 :     static inline XMMReg4Float Set1(float f)
     103             :     {
     104      264523 :         XMMReg4Float reg;
     105      264523 :         reg.xmm = _mm_set1_ps(f);
     106      264523 :         return reg;
     107             :     }
     108             : 
     109       86824 :     static inline XMMReg4Float Load4Val(const float *ptr)
     110             :     {
     111       86824 :         XMMReg4Float reg;
     112       86824 :         reg.nsLoad4Val(ptr);
     113       86824 :         return reg;
     114             :     }
     115             : 
     116     1572860 :     static inline XMMReg4Float Load4Val(const unsigned char *ptr)
     117             :     {
     118     1572860 :         XMMReg4Float reg;
     119     1572860 :         reg.nsLoad4Val(ptr);
     120     1572860 :         return reg;
     121             :     }
     122             : 
     123             :     static inline XMMReg4Float Load4Val(const short *ptr)
     124             :     {
     125             :         XMMReg4Float reg;
     126             :         reg.nsLoad4Val(ptr);
     127             :         return reg;
     128             :     }
     129             : 
     130             :     static inline XMMReg4Float Load4Val(const unsigned short *ptr)
     131             :     {
     132             :         XMMReg4Float reg;
     133             :         reg.nsLoad4Val(ptr);
     134             :         return reg;
     135             :     }
     136             : 
     137             :     static inline XMMReg4Float Load4Val(const int *ptr)
     138             :     {
     139             :         XMMReg4Float reg;
     140             :         reg.nsLoad4Val(ptr);
     141             :         return reg;
     142             :     }
     143             : 
     144     1572860 :     static inline XMMReg4Float Equals(const XMMReg4Float &expr1,
     145             :                                       const XMMReg4Float &expr2)
     146             :     {
     147     1572860 :         XMMReg4Float reg;
     148     1572860 :         reg.xmm = _mm_cmpeq_ps(expr1.xmm, expr2.xmm);
     149     1572860 :         return reg;
     150             :     }
     151             : 
     152             :     static inline XMMReg4Float NotEquals(const XMMReg4Float &expr1,
     153             :                                          const XMMReg4Float &expr2)
     154             :     {
     155             :         XMMReg4Float reg;
     156             :         reg.xmm = _mm_cmpneq_ps(expr1.xmm, expr2.xmm);
     157             :         return reg;
     158             :     }
     159             : 
     160      524288 :     static inline XMMReg4Float Lesser(const XMMReg4Float &expr1,
     161             :                                       const XMMReg4Float &expr2)
     162             :     {
     163      524288 :         XMMReg4Float reg;
     164      524288 :         reg.xmm = _mm_cmplt_ps(expr1.xmm, expr2.xmm);
     165      524288 :         return reg;
     166             :     }
     167             : 
     168             :     static inline XMMReg4Float Greater(const XMMReg4Float &expr1,
     169             :                                        const XMMReg4Float &expr2)
     170             :     {
     171             :         XMMReg4Float reg;
     172             :         reg.xmm = _mm_cmpgt_ps(expr1.xmm, expr2.xmm);
     173             :         return reg;
     174             :     }
     175             : 
     176             :     static inline XMMReg4Float And(const XMMReg4Float &expr1,
     177             :                                    const XMMReg4Float &expr2)
     178             :     {
     179             :         XMMReg4Float reg;
     180             :         reg.xmm = _mm_and_ps(expr1.xmm, expr2.xmm);
     181             :         return reg;
     182             :     }
     183             : 
     184     2097150 :     static inline XMMReg4Float Ternary(const XMMReg4Float &cond,
     185             :                                        const XMMReg4Float &true_expr,
     186             :                                        const XMMReg4Float &false_expr)
     187             :     {
     188     2097150 :         XMMReg4Float reg;
     189     4194300 :         reg.xmm = _mm_or_ps(_mm_and_ps(cond.xmm, true_expr.xmm),
     190     2097150 :                             _mm_andnot_ps(cond.xmm, false_expr.xmm));
     191     2097150 :         return reg;
     192             :     }
     193             : 
     194     1048580 :     static inline XMMReg4Float Min(const XMMReg4Float &expr1,
     195             :                                    const XMMReg4Float &expr2)
     196             :     {
     197     1048580 :         XMMReg4Float reg;
     198     1048580 :         reg.xmm = _mm_min_ps(expr1.xmm, expr2.xmm);
     199     1048580 :         return reg;
     200             :     }
     201             : 
     202     1619860 :     static inline XMMReg4Float Max(const XMMReg4Float &expr1,
     203             :                                    const XMMReg4Float &expr2)
     204             :     {
     205     1619860 :         XMMReg4Float reg;
     206     1619860 :         reg.xmm = _mm_max_ps(expr1.xmm, expr2.xmm);
     207     1619860 :         return reg;
     208             :     }
     209             : 
     210       87976 :     inline void nsLoad4Val(const float *ptr)
     211             :     {
     212       87976 :         xmm = _mm_loadu_ps(ptr);
     213       87976 :     }
     214             : 
     215         288 :     static inline void Load16Val(const float *ptr, XMMReg4Float &r0,
     216             :                                  XMMReg4Float &r1, XMMReg4Float &r2,
     217             :                                  XMMReg4Float &r3)
     218             :     {
     219         288 :         r0.nsLoad4Val(ptr);
     220         288 :         r1.nsLoad4Val(ptr + 4);
     221         288 :         r2.nsLoad4Val(ptr + 8);
     222         288 :         r3.nsLoad4Val(ptr + 12);
     223         288 :     }
     224             : 
     225        1088 :     inline void nsLoad4Val(const int *ptr)
     226             :     {
     227             :         const __m128i xmm_i =
     228        1088 :             _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
     229        1088 :         xmm = _mm_cvtepi32_ps(xmm_i);
     230        1088 :     }
     231             : 
     232         272 :     static inline void Load16Val(const int *ptr, XMMReg4Float &r0,
     233             :                                  XMMReg4Float &r1, XMMReg4Float &r2,
     234             :                                  XMMReg4Float &r3)
     235             :     {
     236         272 :         r0.nsLoad4Val(ptr);
     237         272 :         r1.nsLoad4Val(ptr + 4);
     238         272 :         r2.nsLoad4Val(ptr + 8);
     239         272 :         r3.nsLoad4Val(ptr + 12);
     240         272 :     }
     241             : 
     242     2098320 :     static inline __m128i cvtepu8_epi32(__m128i x)
     243             :     {
     244             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     245             :         return _mm_cvtepu8_epi32(x);
     246             : #else
     247     6294960 :         return _mm_unpacklo_epi16(_mm_unpacklo_epi8(x, _mm_setzero_si128()),
     248     2098320 :                                   _mm_setzero_si128());
     249             : #endif
     250             :     }
     251             : 
     252     1572860 :     inline void nsLoad4Val(const unsigned char *ptr)
     253             :     {
     254     1572860 :         const __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
     255     1572860 :         xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
     256     1572860 :     }
     257             : 
     258      262144 :     static inline void Load8Val(const unsigned char *ptr, XMMReg4Float &r0,
     259             :                                 XMMReg4Float &r1)
     260             :     {
     261      262144 :         const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
     262      262144 :         r0.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
     263      262144 :         r1.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 4)));
     264      262144 :     }
     265             : 
     266         292 :     static inline void Load16Val(const unsigned char *ptr, XMMReg4Float &r0,
     267             :                                  XMMReg4Float &r1, XMMReg4Float &r2,
     268             :                                  XMMReg4Float &r3)
     269             :     {
     270             :         const __m128i xmm_i =
     271         292 :             _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
     272         292 :         r0.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
     273         292 :         r1.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 4)));
     274         292 :         r2.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 8)));
     275         292 :         r3.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 12)));
     276         292 :     }
     277             : 
     278        1088 :     static inline __m128i cvtepi16_epi32(__m128i x)
     279             :     {
     280             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     281             :         return _mm_cvtepi16_epi32(x);
     282             : #else
     283             :         /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
     284        1088 :         return _mm_srai_epi32(
     285             :             /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
     286        1088 :             _mm_unpacklo_epi16(x, x), 16);
     287             : #endif
     288             :     }
     289             : 
     290             :     inline void nsLoad4Val(const short *ptr)
     291             :     {
     292             :         const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
     293             :         xmm = _mm_cvtepi32_ps(cvtepi16_epi32(xmm_i));
     294             :     }
     295             : 
     296         544 :     static inline void Load8Val(const short *ptr, XMMReg4Float &r0,
     297             :                                 XMMReg4Float &r1)
     298             :     {
     299             :         const __m128i xmm_i =
     300         544 :             _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
     301         544 :         r0.xmm = _mm_cvtepi32_ps(cvtepi16_epi32(xmm_i));
     302         544 :         r1.xmm = _mm_cvtepi32_ps(cvtepi16_epi32(_mm_srli_si128(xmm_i, 8)));
     303         544 :     }
     304             : 
     305         272 :     static inline void Load16Val(const short *ptr, XMMReg4Float &r0,
     306             :                                  XMMReg4Float &r1, XMMReg4Float &r2,
     307             :                                  XMMReg4Float &r3)
     308             :     {
     309         272 :         Load8Val(ptr, r0, r1);
     310         272 :         Load8Val(ptr + 8, r2, r3);
     311         272 :     }
     312             : 
     313        1088 :     static inline __m128i cvtepu16_epi32(__m128i x)
     314             :     {
     315             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     316             :         return _mm_cvtepu16_epi32(x);
     317             : #else
     318        1088 :         return _mm_unpacklo_epi16(
     319        1088 :             x, _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
     320             : #endif
     321             :     }
     322             : 
     323             :     inline void nsLoad4Val(const unsigned short *ptr)
     324             :     {
     325             :         const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
     326             :         xmm = _mm_cvtepi32_ps(cvtepu16_epi32(xmm_i));
     327             :     }
     328             : 
     329         544 :     static inline void Load8Val(const unsigned short *ptr, XMMReg4Float &r0,
     330             :                                 XMMReg4Float &r1)
     331             :     {
     332             :         const __m128i xmm_i =
     333         544 :             _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
     334         544 :         r0.xmm = _mm_cvtepi32_ps(cvtepu16_epi32(xmm_i));
     335         544 :         r1.xmm = _mm_cvtepi32_ps(cvtepu16_epi32(_mm_srli_si128(xmm_i, 8)));
     336         544 :     }
     337             : 
     338         272 :     static inline void Load16Val(const unsigned short *ptr, XMMReg4Float &r0,
     339             :                                  XMMReg4Float &r1, XMMReg4Float &r2,
     340             :                                  XMMReg4Float &r3)
     341             :     {
     342         272 :         Load8Val(ptr, r0, r1);
     343         272 :         Load8Val(ptr + 8, r2, r3);
     344         272 :     }
     345             : 
     346         211 :     inline void Zeroize()
     347             :     {
     348         211 :         xmm = _mm_setzero_ps();
     349         211 :     }
     350             : 
     351     1095570 :     inline XMMReg4Float &operator=(const XMMReg4Float &other)
     352             :     {
     353     1095570 :         xmm = other.xmm;
     354     1095570 :         return *this;
     355             :     }
     356             : 
     357       59849 :     inline XMMReg4Float &operator+=(const XMMReg4Float &other)
     358             :     {
     359       59849 :         xmm = _mm_add_ps(xmm, other.xmm);
     360       59849 :         return *this;
     361             :     }
     362             : 
     363       10853 :     inline XMMReg4Float &operator-=(const XMMReg4Float &other)
     364             :     {
     365       10853 :         xmm = _mm_sub_ps(xmm, other.xmm);
     366       10853 :         return *this;
     367             :     }
     368             : 
     369             :     inline XMMReg4Float &operator*=(const XMMReg4Float &other)
     370             :     {
     371             :         xmm = _mm_mul_ps(xmm, other.xmm);
     372             :         return *this;
     373             :     }
     374             : 
     375     3192930 :     inline XMMReg4Float operator+(const XMMReg4Float &other) const
     376             :     {
     377     3192930 :         XMMReg4Float ret;
     378     3192930 :         ret.xmm = _mm_add_ps(xmm, other.xmm);
     379     3192930 :         return ret;
     380             :     }
     381             : 
     382     4762000 :     inline XMMReg4Float operator-(const XMMReg4Float &other) const
     383             :     {
     384     4762000 :         XMMReg4Float ret;
     385     4762000 :         ret.xmm = _mm_sub_ps(xmm, other.xmm);
     386     4762000 :         return ret;
     387             :     }
     388             : 
     389     5242880 :     inline XMMReg4Float operator*(const XMMReg4Float &other) const
     390             :     {
     391     5242880 :         XMMReg4Float ret;
     392     5242880 :         ret.xmm = _mm_mul_ps(xmm, other.xmm);
     393     5242880 :         return ret;
     394             :     }
     395             : 
     396      524288 :     inline XMMReg4Float operator/(const XMMReg4Float &other) const
     397             :     {
     398      524288 :         XMMReg4Float ret;
     399      524288 :         ret.xmm = _mm_div_ps(xmm, other.xmm);
     400      524288 :         return ret;
     401             :     }
     402             : 
     403      524288 :     inline XMMReg4Float inverse() const
     404             :     {
     405      524288 :         XMMReg4Float ret;
     406     1048580 :         ret.xmm = _mm_div_ps(_mm_set1_ps(1.0f), xmm);
     407      524288 :         return ret;
     408             :     }
     409             : 
     410             :     inline XMMReg4Int truncate_to_int() const;
     411             : 
     412             :     inline XMMReg4Double cast_to_double() const;
     413             : 
     414       49823 :     inline void Store4Val(float *ptr) const
     415             :     {
     416       49823 :         _mm_storeu_ps(ptr, xmm);
     417       49823 :     }
     418             : 
     419             :     inline void Store4ValAligned(float *ptr) const
     420             :     {
     421             :         _mm_store_ps(ptr, xmm);
     422             :     }
     423             : 
     424             :     inline operator float() const
     425             :     {
     426             :         return _mm_cvtss_f32(xmm);
     427             :     }
     428             : };
     429             : 
     430             : class XMMReg4Int
     431             : {
     432             :   public:
     433             :     __m128i xmm;
     434             : 
     435     3055100 :     XMMReg4Int()
     436             : #if !defined(_MSC_VER)
     437     3055100 :         : xmm(_mm_undefined_si128())
     438             : #endif
     439             :     {
     440     3055100 :     }
     441             : 
     442       36138 :     XMMReg4Int(const XMMReg4Int &other) : xmm(other.xmm)
     443             :     {
     444       36138 :     }
     445             : 
     446             :     static inline XMMReg4Int Zero()
     447             :     {
     448             :         XMMReg4Int reg;
     449             :         reg.xmm = _mm_setzero_si128();
     450             :         return reg;
     451             :     }
     452             : 
     453             :     static inline XMMReg4Int Set1(int i)
     454             :     {
     455             :         XMMReg4Int reg;
     456             :         reg.xmm = _mm_set1_epi32(i);
     457             :         return reg;
     458             :     }
     459             : 
     460      289104 :     static inline XMMReg4Int Load4Val(const int *ptr)
     461             :     {
     462      289104 :         XMMReg4Int reg;
     463      289104 :         reg.nsLoad4Val(ptr);
     464      289104 :         return reg;
     465             :     }
     466             : 
     467      289104 :     inline void nsLoad4Val(const int *ptr)
     468             :     {
     469      289104 :         xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
     470      289104 :     }
     471             : 
     472             :     static inline XMMReg4Int Equals(const XMMReg4Int &expr1,
     473             :                                     const XMMReg4Int &expr2)
     474             :     {
     475             :         XMMReg4Int reg;
     476             :         reg.xmm = _mm_cmpeq_epi32(expr1.xmm, expr2.xmm);
     477             :         return reg;
     478             :     }
     479             : 
     480             :     static inline XMMReg4Int Ternary(const XMMReg4Int &cond,
     481             :                                      const XMMReg4Int &true_expr,
     482             :                                      const XMMReg4Int &false_expr)
     483             :     {
     484             :         XMMReg4Int reg;
     485             :         reg.xmm = _mm_or_si128(_mm_and_si128(cond.xmm, true_expr.xmm),
     486             :                                _mm_andnot_si128(cond.xmm, false_expr.xmm));
     487             :         return reg;
     488             :     }
     489             : 
     490      180690 :     inline XMMReg4Int &operator+=(const XMMReg4Int &other)
     491             :     {
     492      180690 :         xmm = _mm_add_epi32(xmm, other.xmm);
     493      180690 :         return *this;
     494             :     }
     495             : 
     496       36138 :     inline XMMReg4Int &operator-=(const XMMReg4Int &other)
     497             :     {
     498       36138 :         xmm = _mm_sub_epi32(xmm, other.xmm);
     499       36138 :         return *this;
     500             :     }
     501             : 
     502             :     inline XMMReg4Int operator+(const XMMReg4Int &other) const
     503             :     {
     504             :         XMMReg4Int ret;
     505             :         ret.xmm = _mm_add_epi32(xmm, other.xmm);
     506             :         return ret;
     507             :     }
     508             : 
     509      144552 :     inline XMMReg4Int operator-(const XMMReg4Int &other) const
     510             :     {
     511      144552 :         XMMReg4Int ret;
     512      144552 :         ret.xmm = _mm_sub_epi32(xmm, other.xmm);
     513      144552 :         return ret;
     514             :     }
     515             : 
     516             :     XMMReg4Double cast_to_double() const;
     517             : 
     518      524288 :     XMMReg4Float to_float() const
     519             :     {
     520      524288 :         XMMReg4Float ret;
     521      524288 :         ret.xmm = _mm_cvtepi32_ps(xmm);
     522      524288 :         return ret;
     523             :     }
     524             : };
     525             : 
     526     2621440 : inline XMMReg4Int XMMReg4Float::truncate_to_int() const
     527             : {
     528     2621440 :     XMMReg4Int ret;
     529     2621440 :     ret.xmm = _mm_cvttps_epi32(xmm);
     530     2621440 :     return ret;
     531             : }
     532             : 
     533             : class XMMReg8Byte
     534             : {
     535             :   public:
     536             :     __m128i xmm;
     537             : 
     538     5619710 :     XMMReg8Byte()
     539             : #if !defined(_MSC_VER)
     540     5619710 :         : xmm(_mm_undefined_si128())
     541             : #endif
     542             :     {
     543     5619710 :     }
     544             : 
     545             :     XMMReg8Byte(const XMMReg8Byte &other) : xmm(other.xmm)
     546             :     {
     547             :     }
     548             : 
     549      262144 :     static inline XMMReg8Byte Zero()
     550             :     {
     551      262144 :         XMMReg8Byte reg;
     552      262144 :         reg.xmm = _mm_setzero_si128();
     553      262144 :         return reg;
     554             :     }
     555             : 
     556      262144 :     static inline XMMReg8Byte Set1(char i)
     557             :     {
     558      262144 :         XMMReg8Byte reg;
     559      262144 :         reg.xmm = _mm_set1_epi8(i);
     560      262144 :         return reg;
     561             :     }
     562             : 
     563     1310720 :     static inline XMMReg8Byte Equals(const XMMReg8Byte &expr1,
     564             :                                      const XMMReg8Byte &expr2)
     565             :     {
     566     1310720 :         XMMReg8Byte reg;
     567     1310720 :         reg.xmm = _mm_cmpeq_epi8(expr1.xmm, expr2.xmm);
     568     1310720 :         return reg;
     569             :     }
     570             : 
     571      475136 :     static inline XMMReg8Byte Or(const XMMReg8Byte &expr1,
     572             :                                  const XMMReg8Byte &expr2)
     573             :     {
     574      475136 :         XMMReg8Byte reg;
     575      475136 :         reg.xmm = _mm_or_si128(expr1.xmm, expr2.xmm);
     576      475136 :         return reg;
     577             :     }
     578             : 
     579     1212420 :     static inline XMMReg8Byte Ternary(const XMMReg8Byte &cond,
     580             :                                       const XMMReg8Byte &true_expr,
     581             :                                       const XMMReg8Byte &false_expr)
     582             :     {
     583     1212420 :         XMMReg8Byte reg;
     584     2424830 :         reg.xmm = _mm_or_si128(_mm_and_si128(cond.xmm, true_expr.xmm),
     585     1212420 :                                _mm_andnot_si128(cond.xmm, false_expr.xmm));
     586     1212420 :         return reg;
     587             :     }
     588             : 
     589      524288 :     inline XMMReg8Byte operator+(const XMMReg8Byte &other) const
     590             :     {
     591      524288 :         XMMReg8Byte ret;
     592      524288 :         ret.xmm = _mm_add_epi8(xmm, other.xmm);
     593      524288 :         return ret;
     594             :     }
     595             : 
     596      262144 :     inline XMMReg8Byte operator-(const XMMReg8Byte &other) const
     597             :     {
     598      262144 :         XMMReg8Byte ret;
     599      262144 :         ret.xmm = _mm_sub_epi8(xmm, other.xmm);
     600      262144 :         return ret;
     601             :     }
     602             : 
     603     1310720 :     static inline XMMReg8Byte Pack(const XMMReg4Int &r0, const XMMReg4Int &r1)
     604             :     {
     605     1310720 :         XMMReg8Byte reg;
     606     1310720 :         reg.xmm = _mm_packs_epi32(r0.xmm, r1.xmm);
     607     1310720 :         reg.xmm = _mm_packus_epi16(reg.xmm, reg.xmm);
     608     1310720 :         return reg;
     609             :     }
     610             : 
     611      360448 :     inline void Store8Val(unsigned char *ptr) const
     612             :     {
     613      360448 :         GDALCopyXMMToInt64(xmm, reinterpret_cast<GInt64 *>(ptr));
     614      360448 :     }
     615             : };
     616             : 
     617             : class XMMReg2Double
     618             : {
     619             :   public:
     620             :     __m128d xmm;
     621             : 
     622  4191270084 :     XMMReg2Double()
     623             : #if !defined(_MSC_VER)
     624  4191270084 :         : xmm(_mm_undefined_pd())
     625             : #endif
     626             :     {
     627  4191270084 :     }
     628             : 
     629             :     XMMReg2Double(double val) : xmm(_mm_load_sd(&val))
     630             :     {
     631             :     }
     632             : 
     633     2666440 :     XMMReg2Double(const XMMReg2Double &other) : xmm(other.xmm)
     634             :     {
     635     2666440 :     }
     636             : 
     637       16804 :     static inline XMMReg2Double Set1(double d)
     638             :     {
     639       16804 :         XMMReg2Double reg;
     640       16804 :         reg.xmm = _mm_set1_pd(d);
     641       16804 :         return reg;
     642             :     }
     643             : 
     644     1845030 :     static inline XMMReg2Double Zero()
     645             :     {
     646     1845030 :         XMMReg2Double reg;
     647     1845030 :         reg.Zeroize();
     648     1845030 :         return reg;
     649             :     }
     650             : 
     651             :     static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
     652             :     {
     653             :         XMMReg2Double reg;
     654             :         reg.nsLoad1ValHighAndLow(ptr);
     655             :         return reg;
     656             :     }
     657             : 
     658      145983 :     static inline XMMReg2Double Load2Val(const double *ptr)
     659             :     {
     660      145983 :         XMMReg2Double reg;
     661      145983 :         reg.nsLoad2Val(ptr);
     662      145983 :         return reg;
     663             :     }
     664             : 
     665         768 :     static inline XMMReg2Double Load2Val(const float *ptr)
     666             :     {
     667         768 :         XMMReg2Double reg;
     668         768 :         reg.nsLoad2Val(ptr);
     669         768 :         return reg;
     670             :     }
     671             : 
     672    23062900 :     static inline XMMReg2Double Load2ValAligned(const double *ptr)
     673             :     {
     674    23062900 :         XMMReg2Double reg;
     675    23062900 :         reg.nsLoad2ValAligned(ptr);
     676    23062900 :         return reg;
     677             :     }
     678             : 
     679      535484 :     static inline XMMReg2Double Load2Val(const unsigned char *ptr)
     680             :     {
     681      535484 :         XMMReg2Double reg;
     682      535484 :         reg.nsLoad2Val(ptr);
     683      535484 :         return reg;
     684             :     }
     685             : 
     686       18492 :     static inline XMMReg2Double Load2Val(const short *ptr)
     687             :     {
     688       18492 :         XMMReg2Double reg;
     689       18492 :         reg.nsLoad2Val(ptr);
     690       18492 :         return reg;
     691             :     }
     692             : 
     693       29184 :     static inline XMMReg2Double Load2Val(const unsigned short *ptr)
     694             :     {
     695       29184 :         XMMReg2Double reg;
     696       29184 :         reg.nsLoad2Val(ptr);
     697       29184 :         return reg;
     698             :     }
     699             : 
     700             :     static inline XMMReg2Double Load2Val(const int *ptr)
     701             :     {
     702             :         XMMReg2Double reg;
     703             :         reg.nsLoad2Val(ptr);
     704             :         return reg;
     705             :     }
     706             : 
     707           2 :     static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
     708             :                                        const XMMReg2Double &expr2)
     709             :     {
     710           2 :         XMMReg2Double reg;
     711           2 :         reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
     712           2 :         return reg;
     713             :     }
     714             : 
     715     2637332 :     static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
     716             :                                           const XMMReg2Double &expr2)
     717             :     {
     718     2637332 :         XMMReg2Double reg;
     719     2659662 :         reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
     720     2660962 :         return reg;
     721             :     }
     722             : 
     723           6 :     static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
     724             :                                         const XMMReg2Double &expr2)
     725             :     {
     726           6 :         XMMReg2Double reg;
     727           6 :         reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
     728           6 :         return reg;
     729             :     }
     730             : 
     731     2656300 :     static inline XMMReg2Double And(const XMMReg2Double &expr1,
     732             :                                     const XMMReg2Double &expr2)
     733             :     {
     734     2656300 :         XMMReg2Double reg;
     735     2654990 :         reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
     736     2668280 :         return reg;
     737             :     }
     738             : 
     739           2 :     static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
     740             :                                         const XMMReg2Double &true_expr,
     741             :                                         const XMMReg2Double &false_expr)
     742             :     {
     743           2 :         XMMReg2Double reg;
     744           4 :         reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm),
     745           2 :                             _mm_andnot_pd(cond.xmm, false_expr.xmm));
     746           2 :         return reg;
     747             :     }
     748             : 
     749     8293602 :     static inline XMMReg2Double Min(const XMMReg2Double &expr1,
     750             :                                     const XMMReg2Double &expr2)
     751             :     {
     752     8293602 :         XMMReg2Double reg;
     753     8284162 :         reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
     754     8301452 :         return reg;
     755             :     }
     756             : 
     757   111036001 :     inline void nsLoad1ValHighAndLow(const double *ptr)
     758             :     {
     759   111036001 :         xmm = _mm_load1_pd(ptr);
     760   111036001 :     }
     761             : 
     762   493389017 :     inline void nsLoad2Val(const double *ptr)
     763             :     {
     764   493389017 :         xmm = _mm_loadu_pd(ptr);
     765   493389017 :     }
     766             : 
     767   209999000 :     inline void nsLoad2ValAligned(const double *ptr)
     768             :     {
     769   209999000 :         xmm = _mm_load_pd(ptr);
     770   209999000 :     }
     771             : 
     772         768 :     inline void nsLoad2Val(const float *ptr)
     773             :     {
     774        1536 :         xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
     775         768 :     }
     776             : 
     777        2176 :     inline void nsLoad2Val(const int *ptr)
     778             :     {
     779        2176 :         xmm = _mm_cvtepi32_pd(GDALCopyInt64ToXMM(ptr));
     780        2176 :     }
     781             : 
     782      535484 :     inline void nsLoad2Val(const unsigned char *ptr)
     783             :     {
     784      535484 :         __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
     785             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     786             :         xmm_i = _mm_cvtepu8_epi32(xmm_i);
     787             : #else
     788     1070970 :         xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
     789     1070970 :         xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
     790             : #endif
     791      535484 :         xmm = _mm_cvtepi32_pd(xmm_i);
     792      535484 :     }
     793             : 
     794     2048342 :     inline void nsLoad2Val(const short *ptr)
     795             :     {
     796     2048342 :         __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
     797             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     798             :         xmm_i = _mm_cvtepi16_epi32(xmm_i);
     799             : #else
     800     2048342 :         xmm_i = _mm_unpacklo_epi16(
     801             :             xmm_i, xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
     802     2048342 :         xmm_i = _mm_srai_epi32(
     803             :             xmm_i, 16); /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
     804             : #endif
     805     2048342 :         xmm = _mm_cvtepi32_pd(xmm_i);
     806     2048342 :     }
     807             : 
     808    44819102 :     inline void nsLoad2Val(const unsigned short *ptr)
     809             :     {
     810    44819102 :         __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
     811             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     812             :         xmm_i = _mm_cvtepu16_epi32(xmm_i);
     813             : #else
     814    89737804 :         xmm_i = _mm_unpacklo_epi16(
     815             :             xmm_i,
     816             :             _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
     817             : #endif
     818    45131302 :         xmm = _mm_cvtepi32_pd(xmm_i);
     819    45131302 :     }
     820             : 
     821   330468001 :     static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
     822             :                                 XMMReg2Double &high)
     823             :     {
     824   330468001 :         __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
     825             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     826             :         xmm_i = _mm_cvtepu8_epi32(xmm_i);
     827             : #else
     828   663819002 :         xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
     829   665357002 :         xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
     830             : #endif
     831   333107001 :         low.xmm = _mm_cvtepi32_pd(xmm_i);
     832   333348001 :         high.xmm =
     833   333107001 :             _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3, 2, 3, 2)));
     834   333348001 :     }
     835             : 
     836             :     static inline void Load4Val(const short *ptr, XMMReg2Double &low,
     837             :                                 XMMReg2Double &high)
     838             :     {
     839             :         low.nsLoad2Val(ptr);
     840             :         high.nsLoad2Val(ptr + 2);
     841             :     }
     842             : 
     843             :     static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
     844             :                                 XMMReg2Double &high)
     845             :     {
     846             :         low.nsLoad2Val(ptr);
     847             :         high.nsLoad2Val(ptr + 2);
     848             :     }
     849             : 
     850             :     static inline void Load4Val(const double *ptr, XMMReg2Double &low,
     851             :                                 XMMReg2Double &high)
     852             :     {
     853             :         low.nsLoad2Val(ptr);
     854             :         high.nsLoad2Val(ptr + 2);
     855             :     }
     856             : 
     857       38785 :     static inline void Load4Val(const float *ptr, XMMReg2Double &low,
     858             :                                 XMMReg2Double &high)
     859             :     {
     860       38785 :         __m128 temp1 = _mm_loadu_ps(ptr);
     861       38785 :         __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 2, 3, 2));
     862       38785 :         low.xmm = _mm_cvtps_pd(temp1);
     863       38785 :         high.xmm = _mm_cvtps_pd(temp2);
     864       38785 :     }
     865             : 
     866   459267000 :     inline void Zeroize()
     867             :     {
     868   459267000 :         xmm = _mm_setzero_pd();
     869   459267000 :     }
     870             : 
     871  1427350037 :     inline XMMReg2Double &operator=(const XMMReg2Double &other)
     872             :     {
     873  1427350037 :         xmm = other.xmm;
     874  1427350037 :         return *this;
     875             :     }
     876             : 
     877  1054430003 :     inline XMMReg2Double &operator+=(const XMMReg2Double &other)
     878             :     {
     879  1054430003 :         xmm = _mm_add_pd(xmm, other.xmm);
     880  1054430003 :         return *this;
     881             :     }
     882             : 
     883    25340502 :     inline XMMReg2Double &operator*=(const XMMReg2Double &other)
     884             :     {
     885    25340502 :         xmm = _mm_mul_pd(xmm, other.xmm);
     886    25340502 :         return *this;
     887             :     }
     888             : 
     889   222926009 :     inline XMMReg2Double operator+(const XMMReg2Double &other) const
     890             :     {
     891   222926009 :         XMMReg2Double ret;
     892   223156009 :         ret.xmm = _mm_add_pd(xmm, other.xmm);
     893   223156009 :         return ret;
     894             :     }
     895             : 
     896           2 :     inline XMMReg2Double operator-(const XMMReg2Double &other) const
     897             :     {
     898           2 :         XMMReg2Double ret;
     899           2 :         ret.xmm = _mm_sub_pd(xmm, other.xmm);
     900           2 :         return ret;
     901             :     }
     902             : 
     903  1102050002 :     inline XMMReg2Double operator*(const XMMReg2Double &other) const
     904             :     {
     905  1102050002 :         XMMReg2Double ret;
     906  1107260002 :         ret.xmm = _mm_mul_pd(xmm, other.xmm);
     907  1107260002 :         return ret;
     908             :     }
     909             : 
     910     2646262 :     inline XMMReg2Double operator/(const XMMReg2Double &other) const
     911             :     {
     912     2646262 :         XMMReg2Double ret;
     913     2661922 :         ret.xmm = _mm_div_pd(xmm, other.xmm);
     914     2661922 :         return ret;
     915             :     }
     916             : 
     917   224335001 :     inline double GetHorizSum() const
     918             :     {
     919             :         __m128d xmm2;
     920   224335001 :         xmm2 = _mm_shuffle_pd(
     921             :             xmm, xmm,
     922             :             _MM_SHUFFLE2(0, 1)); /* transfer high word into low word of xmm2 */
     923   673214003 :         return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
     924             :     }
     925             : 
     926        6944 :     inline void Store2Val(double *ptr) const
     927             :     {
     928        6944 :         _mm_storeu_pd(ptr, xmm);
     929        6944 :     }
     930             : 
     931             :     inline void Store2ValAligned(double *ptr) const
     932             :     {
     933             :         _mm_store_pd(ptr, xmm);
     934             :     }
     935             : 
     936    96484002 :     inline void Store2Val(float *ptr) const
     937             :     {
     938   193146004 :         __m128i xmm_i = _mm_castps_si128(_mm_cvtpd_ps(xmm));
     939    96661702 :         GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
     940    96604302 :     }
     941             : 
     942             :     inline void Store2Val(unsigned char *ptr) const
     943             :     {
     944             :         __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
     945             :             xmm,
     946             :             _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
     947             :         tmp = _mm_packs_epi32(tmp, tmp);
     948             :         tmp = _mm_packus_epi16(tmp, tmp);
     949             :         GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16 *>(ptr));
     950             :     }
     951             : 
     952     5027932 :     inline void Store2Val(unsigned short *ptr) const
     953             :     {
     954     5027932 :         __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
     955     5027932 :             xmm,
     956             :             _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
     957             :         // X X X X 0 B 0 A --> X X X X A A B A
     958     5007602 :         tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
     959     5070112 :         GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
     960     5014862 :     }
     961             : 
     962           8 :     inline void StoreMask(unsigned char *ptr) const
     963             :     {
     964           8 :         _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr),
     965           8 :                          _mm_castpd_si128(xmm));
     966           8 :     }
     967             : 
     968             :     inline operator double() const
     969             :     {
     970             :         return _mm_cvtsd_f64(xmm);
     971             :     }
     972             : };
     973             : 
     974             : #else
     975             : 
     976             : #ifndef NO_WARN_USE_SSE2_EMULATION
     977             : #warning "Software emulation of SSE2 !"
     978             : #endif
     979             : 
     980             : class XMMReg2Double
     981             : {
     982             :   public:
     983             :     double low;
     984             :     double high;
     985             : 
     986             :     // cppcheck-suppress uninitMemberVar
     987             :     XMMReg2Double() = default;
     988             : 
     989             :     explicit XMMReg2Double(double val)
     990             :     {
     991             :         low = val;
     992             :         high = 0.0;
     993             :     }
     994             : 
     995             :     XMMReg2Double(const XMMReg2Double &other) : low(other.low), high(other.high)
     996             :     {
     997             :     }
     998             : 
     999             :     static inline XMMReg2Double Zero()
    1000             :     {
    1001             :         XMMReg2Double reg;
    1002             :         reg.Zeroize();
    1003             :         return reg;
    1004             :     }
    1005             : 
    1006             :     static inline XMMReg2Double Set1(double d)
    1007             :     {
    1008             :         XMMReg2Double reg;
    1009             :         reg.low = d;
    1010             :         reg.high = d;
    1011             :         return reg;
    1012             :     }
    1013             : 
    1014             :     static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    1015             :     {
    1016             :         XMMReg2Double reg;
    1017             :         reg.nsLoad1ValHighAndLow(ptr);
    1018             :         return reg;
    1019             :     }
    1020             : 
    1021           2 :     static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
    1022             :                                        const XMMReg2Double &expr2)
    1023             :     {
    1024             :         XMMReg2Double reg;
    1025             : 
    1026           2 :         if (expr1.low == expr2.low)
    1027           2 :             memset(&(reg.low), 0xFF, sizeof(double));
    1028             :         else
    1029           0 :             reg.low = 0;
    1030             : 
    1031           2 :         if (expr1.high == expr2.high)
    1032           2 :             memset(&(reg.high), 0xFF, sizeof(double));
    1033             :         else
    1034           0 :             reg.high = 0;
    1035             : 
    1036           2 :         return reg;
    1037             :     }
    1038             : 
    1039           2 :     static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
    1040             :                                           const XMMReg2Double &expr2)
    1041             :     {
    1042             :         XMMReg2Double reg;
    1043             : 
    1044           2 :         if (expr1.low != expr2.low)
    1045           0 :             memset(&(reg.low), 0xFF, sizeof(double));
    1046             :         else
    1047           2 :             reg.low = 0;
    1048             : 
    1049           2 :         if (expr1.high != expr2.high)
    1050           0 :             memset(&(reg.high), 0xFF, sizeof(double));
    1051             :         else
    1052           2 :             reg.high = 0;
    1053             : 
    1054           2 :         return reg;
    1055             :     }
    1056             : 
    1057           6 :     static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
    1058             :                                         const XMMReg2Double &expr2)
    1059             :     {
    1060             :         XMMReg2Double reg;
    1061             : 
    1062           6 :         if (expr1.low > expr2.low)
    1063           2 :             memset(&(reg.low), 0xFF, sizeof(double));
    1064             :         else
    1065           4 :             reg.low = 0;
    1066             : 
    1067           6 :         if (expr1.high > expr2.high)
    1068           2 :             memset(&(reg.high), 0xFF, sizeof(double));
    1069             :         else
    1070           4 :             reg.high = 0;
    1071             : 
    1072           6 :         return reg;
    1073             :     }
    1074             : 
    1075             :     static inline XMMReg2Double And(const XMMReg2Double &expr1,
    1076             :                                     const XMMReg2Double &expr2)
    1077             :     {
    1078             :         XMMReg2Double reg;
    1079             :         int low1[2], high1[2];
    1080             :         int low2[2], high2[2];
    1081             :         memcpy(low1, &expr1.low, sizeof(double));
    1082             :         memcpy(high1, &expr1.high, sizeof(double));
    1083             :         memcpy(low2, &expr2.low, sizeof(double));
    1084             :         memcpy(high2, &expr2.high, sizeof(double));
    1085             :         low1[0] &= low2[0];
    1086             :         low1[1] &= low2[1];
    1087             :         high1[0] &= high2[0];
    1088             :         high1[1] &= high2[1];
    1089             :         memcpy(&reg.low, low1, sizeof(double));
    1090             :         memcpy(&reg.high, high1, sizeof(double));
    1091             :         return reg;
    1092             :     }
    1093             : 
    1094           2 :     static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
    1095             :                                         const XMMReg2Double &true_expr,
    1096             :                                         const XMMReg2Double &false_expr)
    1097             :     {
    1098             :         XMMReg2Double reg;
    1099           2 :         if (cond.low != 0)
    1100           1 :             reg.low = true_expr.low;
    1101             :         else
    1102           1 :             reg.low = false_expr.low;
    1103           2 :         if (cond.high != 0)
    1104           1 :             reg.high = true_expr.high;
    1105             :         else
    1106           1 :             reg.high = false_expr.high;
    1107           2 :         return reg;
    1108             :     }
    1109             : 
    1110           2 :     static inline XMMReg2Double Min(const XMMReg2Double &expr1,
    1111             :                                     const XMMReg2Double &expr2)
    1112             :     {
    1113             :         XMMReg2Double reg;
    1114           2 :         reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
    1115           2 :         reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
    1116           2 :         return reg;
    1117             :     }
    1118             : 
    1119           1 :     static inline XMMReg2Double Load2Val(const double *ptr)
    1120             :     {
    1121             :         XMMReg2Double reg;
    1122           1 :         reg.nsLoad2Val(ptr);
    1123           1 :         return reg;
    1124             :     }
    1125             : 
    1126             :     static inline XMMReg2Double Load2ValAligned(const double *ptr)
    1127             :     {
    1128             :         XMMReg2Double reg;
    1129             :         reg.nsLoad2ValAligned(ptr);
    1130             :         return reg;
    1131             :     }
    1132             : 
    1133             :     static inline XMMReg2Double Load2Val(const float *ptr)
    1134             :     {
    1135             :         XMMReg2Double reg;
    1136             :         reg.nsLoad2Val(ptr);
    1137             :         return reg;
    1138             :     }
    1139             : 
    1140             :     static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    1141             :     {
    1142             :         XMMReg2Double reg;
    1143             :         reg.nsLoad2Val(ptr);
    1144             :         return reg;
    1145             :     }
    1146             : 
    1147             :     static inline XMMReg2Double Load2Val(const short *ptr)
    1148             :     {
    1149             :         XMMReg2Double reg;
    1150             :         reg.nsLoad2Val(ptr);
    1151             :         return reg;
    1152             :     }
    1153             : 
    1154             :     static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    1155             :     {
    1156             :         XMMReg2Double reg;
    1157             :         reg.nsLoad2Val(ptr);
    1158             :         return reg;
    1159             :     }
    1160             : 
    1161             :     static inline XMMReg2Double Load2Val(const int *ptr)
    1162             :     {
    1163             :         XMMReg2Double reg;
    1164             :         reg.nsLoad2Val(ptr);
    1165             :         return reg;
    1166             :     }
    1167             : 
    1168           1 :     inline void nsLoad1ValHighAndLow(const double *ptr)
    1169             :     {
    1170           1 :         low = ptr[0];
    1171           1 :         high = ptr[0];
    1172           1 :     }
    1173             : 
    1174          17 :     inline void nsLoad2Val(const double *ptr)
    1175             :     {
    1176          17 :         low = ptr[0];
    1177          17 :         high = ptr[1];
    1178          17 :     }
    1179             : 
    1180             :     inline void nsLoad2ValAligned(const double *ptr)
    1181             :     {
    1182             :         low = ptr[0];
    1183             :         high = ptr[1];
    1184             :     }
    1185             : 
    1186           2 :     inline void nsLoad2Val(const float *ptr)
    1187             :     {
    1188           2 :         low = ptr[0];
    1189           2 :         high = ptr[1];
    1190           2 :     }
    1191             : 
    1192             :     inline void nsLoad2Val(const unsigned char *ptr)
    1193             :     {
    1194             :         low = ptr[0];
    1195             :         high = ptr[1];
    1196             :     }
    1197             : 
    1198           2 :     inline void nsLoad2Val(const short *ptr)
    1199             :     {
    1200           2 :         low = ptr[0];
    1201           2 :         high = ptr[1];
    1202           2 :     }
    1203             : 
    1204           2 :     inline void nsLoad2Val(const unsigned short *ptr)
    1205             :     {
    1206           2 :         low = ptr[0];
    1207           2 :         high = ptr[1];
    1208           2 :     }
    1209             : 
    1210             :     inline void nsLoad2Val(const int *ptr)
    1211             :     {
    1212             :         low = ptr[0];
    1213             :         high = ptr[1];
    1214             :     }
    1215             : 
    1216           1 :     static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
    1217             :                                 XMMReg2Double &high)
    1218             :     {
    1219           1 :         low.low = ptr[0];
    1220           1 :         low.high = ptr[1];
    1221           1 :         high.low = ptr[2];
    1222           1 :         high.high = ptr[3];
    1223           1 :     }
    1224             : 
    1225             :     static inline void Load4Val(const short *ptr, XMMReg2Double &low,
    1226             :                                 XMMReg2Double &high)
    1227             :     {
    1228             :         low.nsLoad2Val(ptr);
    1229             :         high.nsLoad2Val(ptr + 2);
    1230             :     }
    1231             : 
    1232             :     static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
    1233             :                                 XMMReg2Double &high)
    1234             :     {
    1235             :         low.nsLoad2Val(ptr);
    1236             :         high.nsLoad2Val(ptr + 2);
    1237             :     }
    1238             : 
    1239             :     static inline void Load4Val(const double *ptr, XMMReg2Double &low,
    1240             :                                 XMMReg2Double &high)
    1241             :     {
    1242             :         low.nsLoad2Val(ptr);
    1243             :         high.nsLoad2Val(ptr + 2);
    1244             :     }
    1245             : 
    1246           1 :     static inline void Load4Val(const float *ptr, XMMReg2Double &low,
    1247             :                                 XMMReg2Double &high)
    1248             :     {
    1249           1 :         low.nsLoad2Val(ptr);
    1250           1 :         high.nsLoad2Val(ptr + 2);
    1251           1 :     }
    1252             : 
    1253             :     inline void Zeroize()
    1254             :     {
    1255             :         low = 0.0;
    1256             :         high = 0.0;
    1257             :     }
    1258             : 
    1259          37 :     inline XMMReg2Double &operator=(const XMMReg2Double &other)
    1260             :     {
    1261          37 :         low = other.low;
    1262          37 :         high = other.high;
    1263          37 :         return *this;
    1264             :     }
    1265             : 
    1266           3 :     inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    1267             :     {
    1268           3 :         low += other.low;
    1269           3 :         high += other.high;
    1270           3 :         return *this;
    1271             :     }
    1272             : 
    1273           2 :     inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    1274             :     {
    1275           2 :         low *= other.low;
    1276           2 :         high *= other.high;
    1277           2 :         return *this;
    1278             :     }
    1279             : 
    1280           9 :     inline XMMReg2Double operator+(const XMMReg2Double &other) const
    1281             :     {
    1282             :         XMMReg2Double ret;
    1283           9 :         ret.low = low + other.low;
    1284           9 :         ret.high = high + other.high;
    1285           9 :         return ret;
    1286             :     }
    1287             : 
    1288           2 :     inline XMMReg2Double operator-(const XMMReg2Double &other) const
    1289             :     {
    1290             :         XMMReg2Double ret;
    1291           2 :         ret.low = low - other.low;
    1292           2 :         ret.high = high - other.high;
    1293           2 :         return ret;
    1294             :     }
    1295             : 
    1296           2 :     inline XMMReg2Double operator*(const XMMReg2Double &other) const
    1297             :     {
    1298             :         XMMReg2Double ret;
    1299           2 :         ret.low = low * other.low;
    1300           2 :         ret.high = high * other.high;
    1301           2 :         return ret;
    1302             :     }
    1303             : 
    1304           2 :     inline XMMReg2Double operator/(const XMMReg2Double &other) const
    1305             :     {
    1306             :         XMMReg2Double ret;
    1307           2 :         ret.low = low / other.low;
    1308           2 :         ret.high = high / other.high;
    1309           2 :         return ret;
    1310             :     }
    1311             : 
    1312           1 :     inline double GetHorizSum() const
    1313             :     {
    1314           1 :         return low + high;
    1315             :     }
    1316             : 
    1317          32 :     inline void Store2Val(double *ptr) const
    1318             :     {
    1319          32 :         ptr[0] = low;
    1320          32 :         ptr[1] = high;
    1321          32 :     }
    1322             : 
    1323             :     inline void Store2ValAligned(double *ptr) const
    1324             :     {
    1325             :         ptr[0] = low;
    1326             :         ptr[1] = high;
    1327             :     }
    1328             : 
    1329           2 :     inline void Store2Val(float *ptr) const
    1330             :     {
    1331           2 :         ptr[0] = static_cast<float>(low);
    1332           2 :         ptr[1] = static_cast<float>(high);
    1333           2 :     }
    1334             : 
    1335           2 :     void Store2Val(unsigned char *ptr) const
    1336             :     {
    1337           2 :         ptr[0] = static_cast<unsigned char>(low + 0.5);
    1338           2 :         ptr[1] = static_cast<unsigned char>(high + 0.5);
    1339           2 :     }
    1340             : 
    1341           2 :     void Store2Val(unsigned short *ptr) const
    1342             :     {
    1343           2 :         ptr[0] = static_cast<GUInt16>(low + 0.5);
    1344           2 :         ptr[1] = static_cast<GUInt16>(high + 0.5);
    1345           2 :     }
    1346             : 
    1347           8 :     inline void StoreMask(unsigned char *ptr) const
    1348             :     {
    1349           8 :         memcpy(ptr, &low, 8);
    1350           8 :         memcpy(ptr + 8, &high, 8);
    1351           8 :     }
    1352             : 
    1353             :     inline operator double() const
    1354             :     {
    1355             :         return low;
    1356             :     }
    1357             : };
    1358             : 
    1359             : #endif /*  defined(__x86_64) || defined(_M_X64) */
    1360             : 
    1361             : #if defined(__AVX__) && !defined(USE_SSE2_EMULATION)
    1362             : 
    1363             : #include <immintrin.h>
    1364             : 
    1365             : class XMMReg4Double
    1366             : {
    1367             :   public:
    1368             :     __m256d ymm;
    1369             : 
    1370             :     XMMReg4Double() : ymm(_mm256_setzero_pd())
    1371             :     {
    1372             :     }
    1373             : 
    1374             :     XMMReg4Double(const XMMReg4Double &other) : ymm(other.ymm)
    1375             :     {
    1376             :     }
    1377             : 
    1378             :     static inline XMMReg4Double Zero()
    1379             :     {
    1380             :         XMMReg4Double reg;
    1381             :         reg.Zeroize();
    1382             :         return reg;
    1383             :     }
    1384             : 
    1385             :     static inline XMMReg4Double Set1(double d)
    1386             :     {
    1387             :         XMMReg4Double reg;
    1388             :         reg.ymm = _mm256_set1_pd(d);
    1389             :         return reg;
    1390             :     }
    1391             : 
    1392             :     inline void Zeroize()
    1393             :     {
    1394             :         ymm = _mm256_setzero_pd();
    1395             :     }
    1396             : 
    1397             :     static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    1398             :     {
    1399             :         XMMReg4Double reg;
    1400             :         reg.nsLoad1ValHighAndLow(ptr);
    1401             :         return reg;
    1402             :     }
    1403             : 
    1404             :     inline void nsLoad1ValHighAndLow(const double *ptr)
    1405             :     {
    1406             :         ymm = _mm256_set1_pd(*ptr);
    1407             :     }
    1408             : 
    1409             :     static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    1410             :     {
    1411             :         XMMReg4Double reg;
    1412             :         reg.nsLoad4Val(ptr);
    1413             :         return reg;
    1414             :     }
    1415             : 
    1416             :     inline void nsLoad4Val(const unsigned char *ptr)
    1417             :     {
    1418             :         __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
    1419             :         xmm_i = _mm_cvtepu8_epi32(xmm_i);
    1420             :         ymm = _mm256_cvtepi32_pd(xmm_i);
    1421             :     }
    1422             : 
    1423             :     static inline void Load8Val(const unsigned char *ptr, XMMReg4Double &low,
    1424             :                                 XMMReg4Double &high)
    1425             :     {
    1426             :         const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
    1427             :         const __m128i xmm_i_low = _mm_cvtepu8_epi32(xmm_i);
    1428             :         low.ymm = _mm256_cvtepi32_pd(xmm_i_low);
    1429             :         const __m128i xmm_i_high = _mm_cvtepu8_epi32(_mm_srli_si128(xmm_i, 4));
    1430             :         high.ymm = _mm256_cvtepi32_pd(xmm_i_high);
    1431             :     }
    1432             : 
    1433             :     static inline XMMReg4Double Load4Val(const short *ptr)
    1434             :     {
    1435             :         XMMReg4Double reg;
    1436             :         reg.nsLoad4Val(ptr);
    1437             :         return reg;
    1438             :     }
    1439             : 
    1440             :     inline void nsLoad4Val(const short *ptr)
    1441             :     {
    1442             :         __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
    1443             :         xmm_i = _mm_cvtepi16_epi32(xmm_i);
    1444             :         ymm = _mm256_cvtepi32_pd(xmm_i);
    1445             :     }
    1446             : 
    1447             :     static inline void Load8Val(const short *ptr, XMMReg4Double &low,
    1448             :                                 XMMReg4Double &high)
    1449             :     {
    1450             :         low.nsLoad4Val(ptr);
    1451             :         high.nsLoad4Val(ptr + 4);
    1452             :     }
    1453             : 
    1454             :     static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    1455             :     {
    1456             :         XMMReg4Double reg;
    1457             :         reg.nsLoad4Val(ptr);
    1458             :         return reg;
    1459             :     }
    1460             : 
    1461             :     inline void nsLoad4Val(const unsigned short *ptr)
    1462             :     {
    1463             :         __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
    1464             :         xmm_i = _mm_cvtepu16_epi32(xmm_i);
    1465             :         ymm = _mm256_cvtepi32_pd(
    1466             :             xmm_i);  // ok to use signed conversion since we are in the ushort
    1467             :                      // range, so cannot be interpreted as negative int32
    1468             :     }
    1469             : 
    1470             :     static inline void Load8Val(const unsigned short *ptr, XMMReg4Double &low,
    1471             :                                 XMMReg4Double &high)
    1472             :     {
    1473             :         low.nsLoad4Val(ptr);
    1474             :         high.nsLoad4Val(ptr + 4);
    1475             :     }
    1476             : 
    1477             :     static inline XMMReg4Double Load4Val(const double *ptr)
    1478             :     {
    1479             :         XMMReg4Double reg;
    1480             :         reg.nsLoad4Val(ptr);
    1481             :         return reg;
    1482             :     }
    1483             : 
    1484             :     inline void nsLoad4Val(const double *ptr)
    1485             :     {
    1486             :         ymm = _mm256_loadu_pd(ptr);
    1487             :     }
    1488             : 
    1489             :     static inline void Load8Val(const double *ptr, XMMReg4Double &low,
    1490             :                                 XMMReg4Double &high)
    1491             :     {
    1492             :         low.nsLoad4Val(ptr);
    1493             :         high.nsLoad4Val(ptr + 4);
    1494             :     }
    1495             : 
    1496             :     static inline XMMReg4Double Load4ValAligned(const double *ptr)
    1497             :     {
    1498             :         XMMReg4Double reg;
    1499             :         reg.nsLoad4ValAligned(ptr);
    1500             :         return reg;
    1501             :     }
    1502             : 
    1503             :     inline void nsLoad4ValAligned(const double *ptr)
    1504             :     {
    1505             :         ymm = _mm256_load_pd(ptr);
    1506             :     }
    1507             : 
    1508             :     static inline XMMReg4Double Load4Val(const float *ptr)
    1509             :     {
    1510             :         XMMReg4Double reg;
    1511             :         reg.nsLoad4Val(ptr);
    1512             :         return reg;
    1513             :     }
    1514             : 
    1515             :     inline void nsLoad4Val(const float *ptr)
    1516             :     {
    1517             :         ymm = _mm256_cvtps_pd(_mm_loadu_ps(ptr));
    1518             :     }
    1519             : 
    1520             :     static inline void Load8Val(const float *ptr, XMMReg4Double &low,
    1521             :                                 XMMReg4Double &high)
    1522             :     {
    1523             :         low.nsLoad4Val(ptr);
    1524             :         high.nsLoad4Val(ptr + 4);
    1525             :     }
    1526             : 
    1527             :     static inline XMMReg4Double Load4Val(const int *ptr)
    1528             :     {
    1529             :         XMMReg4Double reg;
    1530             :         reg.nsLoad4Val(ptr);
    1531             :         return reg;
    1532             :     }
    1533             : 
    1534             :     inline void nsLoad4Val(const int *ptr)
    1535             :     {
    1536             :         ymm = _mm256_cvtepi32_pd(
    1537             :             _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr)));
    1538             :     }
    1539             : 
    1540             :     static inline void Load8Val(const int *ptr, XMMReg4Double &low,
    1541             :                                 XMMReg4Double &high)
    1542             :     {
    1543             :         low.nsLoad4Val(ptr);
    1544             :         high.nsLoad4Val(ptr + 4);
    1545             :     }
    1546             : 
    1547             :     static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
    1548             :                                        const XMMReg4Double &expr2)
    1549             :     {
    1550             :         XMMReg4Double reg;
    1551             :         reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
    1552             :         return reg;
    1553             :     }
    1554             : 
    1555             :     static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
    1556             :                                           const XMMReg4Double &expr2)
    1557             :     {
    1558             :         XMMReg4Double reg;
    1559             :         reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
    1560             :         return reg;
    1561             :     }
    1562             : 
    1563             :     static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
    1564             :                                         const XMMReg4Double &expr2)
    1565             :     {
    1566             :         XMMReg4Double reg;
    1567             :         reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
    1568             :         return reg;
    1569             :     }
    1570             : 
    1571             :     static inline XMMReg4Double And(const XMMReg4Double &expr1,
    1572             :                                     const XMMReg4Double &expr2)
    1573             :     {
    1574             :         XMMReg4Double reg;
    1575             :         reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
    1576             :         return reg;
    1577             :     }
    1578             : 
    1579             :     static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
    1580             :                                         const XMMReg4Double &true_expr,
    1581             :                                         const XMMReg4Double &false_expr)
    1582             :     {
    1583             :         XMMReg4Double reg;
    1584             :         reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm),
    1585             :                                _mm256_andnot_pd(cond.ymm, false_expr.ymm));
    1586             :         return reg;
    1587             :     }
    1588             : 
    1589             :     static inline XMMReg4Double Min(const XMMReg4Double &expr1,
    1590             :                                     const XMMReg4Double &expr2)
    1591             :     {
    1592             :         XMMReg4Double reg;
    1593             :         reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
    1594             :         return reg;
    1595             :     }
    1596             : 
    1597             :     inline XMMReg4Double &operator=(const XMMReg4Double &other)
    1598             :     {
    1599             :         ymm = other.ymm;
    1600             :         return *this;
    1601             :     }
    1602             : 
    1603             :     inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    1604             :     {
    1605             :         ymm = _mm256_add_pd(ymm, other.ymm);
    1606             :         return *this;
    1607             :     }
    1608             : 
    1609             :     inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    1610             :     {
    1611             :         ymm = _mm256_mul_pd(ymm, other.ymm);
    1612             :         return *this;
    1613             :     }
    1614             : 
    1615             :     inline XMMReg4Double operator+(const XMMReg4Double &other) const
    1616             :     {
    1617             :         XMMReg4Double ret;
    1618             :         ret.ymm = _mm256_add_pd(ymm, other.ymm);
    1619             :         return ret;
    1620             :     }
    1621             : 
    1622             :     inline XMMReg4Double operator-(const XMMReg4Double &other) const
    1623             :     {
    1624             :         XMMReg4Double ret;
    1625             :         ret.ymm = _mm256_sub_pd(ymm, other.ymm);
    1626             :         return ret;
    1627             :     }
    1628             : 
    1629             :     inline XMMReg4Double operator*(const XMMReg4Double &other) const
    1630             :     {
    1631             :         XMMReg4Double ret;
    1632             :         ret.ymm = _mm256_mul_pd(ymm, other.ymm);
    1633             :         return ret;
    1634             :     }
    1635             : 
    1636             :     inline XMMReg4Double operator/(const XMMReg4Double &other) const
    1637             :     {
    1638             :         XMMReg4Double ret;
    1639             :         ret.ymm = _mm256_div_pd(ymm, other.ymm);
    1640             :         return ret;
    1641             :     }
    1642             : 
    1643             :     void AddToLow(const XMMReg2Double &other)
    1644             :     {
    1645             :         __m256d ymm2 = _mm256_setzero_pd();
    1646             :         ymm2 = _mm256_insertf128_pd(ymm2, other.xmm, 0);
    1647             :         ymm = _mm256_add_pd(ymm, ymm2);
    1648             :     }
    1649             : 
    1650             :     inline double GetHorizSum() const
    1651             :     {
    1652             :         __m256d ymm_tmp1, ymm_tmp2;
    1653             :         ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
    1654             :         ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
    1655             :         ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
    1656             :         return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
    1657             :     }
    1658             : 
    1659             :     inline XMMReg4Double approx_inv_sqrt(const XMMReg4Double &one,
    1660             :                                          const XMMReg4Double &half) const
    1661             :     {
    1662             :         __m256d reg = ymm;
    1663             :         __m256d reg_half = _mm256_mul_pd(reg, half.ymm);
    1664             :         // Compute rough approximation of 1 / sqrt(b) with _mm_rsqrt_ps
    1665             :         reg = _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(reg)));
    1666             :         // And perform one step of Newton-Raphson approximation to improve it
    1667             :         // approx_inv_sqrt_x = approx_inv_sqrt_x*(1.5 -
    1668             :         //                            0.5*x*approx_inv_sqrt_x*approx_inv_sqrt_x);
    1669             :         const __m256d one_and_a_half = _mm256_add_pd(one.ymm, half.ymm);
    1670             :         reg = _mm256_mul_pd(
    1671             :             reg,
    1672             :             _mm256_sub_pd(one_and_a_half,
    1673             :                           _mm256_mul_pd(reg_half, _mm256_mul_pd(reg, reg))));
    1674             :         XMMReg4Double ret;
    1675             :         ret.ymm = reg;
    1676             :         return ret;
    1677             :     }
    1678             : 
    1679             :     inline XMMReg4Float cast_to_float() const
    1680             :     {
    1681             :         XMMReg4Float ret;
    1682             :         ret.xmm = _mm256_cvtpd_ps(ymm);
    1683             :         return ret;
    1684             :     }
    1685             : 
    1686             :     inline void Store4Val(unsigned char *ptr) const
    1687             :     {
    1688             :         __m128i xmm_i =
    1689             :             _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
    1690             :         // xmm_i = _mm_packs_epi32(xmm_i, xmm_i);   // Pack int32 to int16
    1691             :         // xmm_i = _mm_packus_epi16(xmm_i, xmm_i);  // Pack int16 to uint8
    1692             :         xmm_i =
    1693             :             _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) |
    1694             :                                                       (12 << 24)));  //  SSSE3
    1695             :         GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32 *>(ptr));
    1696             :     }
    1697             : 
    1698             :     inline void Store4Val(unsigned short *ptr) const
    1699             :     {
    1700             :         __m128i xmm_i =
    1701             :             _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
    1702             :         xmm_i = _mm_packus_epi32(xmm_i, xmm_i);  // Pack uint32 to uint16
    1703             :         GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    1704             :     }
    1705             : 
    1706             :     inline void Store4Val(float *ptr) const
    1707             :     {
    1708             :         _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
    1709             :     }
    1710             : 
    1711             :     inline void Store4Val(double *ptr) const
    1712             :     {
    1713             :         _mm256_storeu_pd(ptr, ymm);
    1714             :     }
    1715             : 
    1716             :     inline void StoreMask(unsigned char *ptr) const
    1717             :     {
    1718             :         _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
    1719             :                             _mm256_castpd_si256(ymm));
    1720             :     }
    1721             : };
    1722             : 
    1723             : inline XMMReg4Double XMMReg4Float::cast_to_double() const
    1724             : {
    1725             :     XMMReg4Double ret;
    1726             :     ret.ymm = _mm256_cvtps_pd(xmm);
    1727             :     return ret;
    1728             : }
    1729             : 
    1730             : inline XMMReg4Double XMMReg4Int::cast_to_double() const
    1731             : {
    1732             :     XMMReg4Double ret;
    1733             :     ret.ymm = _mm256_cvtepi32_pd(xmm);
    1734             :     return ret;
    1735             : }
    1736             : 
    1737             : #else
    1738             : 
    1739             : class XMMReg4Double
    1740             : {
    1741             :   public:
    1742             :     XMMReg2Double low, high;
    1743             : 
    1744  1552980054 :     XMMReg4Double() : low(XMMReg2Double()), high(XMMReg2Double())
    1745             :     {
    1746  1553180054 :     }
    1747             : 
    1748     1340080 :     XMMReg4Double(const XMMReg4Double &other) : low(other.low), high(other.high)
    1749             :     {
    1750     1343900 :     }
    1751             : 
    1752   230388000 :     static inline XMMReg4Double Zero()
    1753             :     {
    1754   230388000 :         XMMReg4Double reg;
    1755   230279000 :         reg.low.Zeroize();
    1756   230295000 :         reg.high.Zeroize();
    1757   230297000 :         return reg;
    1758             :     }
    1759             : 
    1760        8402 :     static inline XMMReg4Double Set1(double d)
    1761             :     {
    1762        8402 :         XMMReg4Double reg;
    1763        8402 :         reg.low = XMMReg2Double::Set1(d);
    1764        8402 :         reg.high = XMMReg2Double::Set1(d);
    1765        8402 :         return reg;
    1766             :     }
    1767             : 
    1768   111147002 :     static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    1769             :     {
    1770   111147002 :         XMMReg4Double reg;
    1771   111121002 :         reg.low.nsLoad1ValHighAndLow(ptr);
    1772   111306002 :         reg.high = reg.low;
    1773   111211002 :         return reg;
    1774             :     }
    1775             : 
    1776   331459002 :     static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    1777             :     {
    1778   331459002 :         XMMReg4Double reg;
    1779   330887002 :         XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
    1780   332451002 :         return reg;
    1781             :     }
    1782             : 
    1783         624 :     static inline void Load8Val(const unsigned char *ptr, XMMReg4Double &low,
    1784             :                                 XMMReg4Double &high)
    1785             :     {
    1786         624 :         low = Load4Val(ptr);
    1787         624 :         high = Load4Val(ptr + 4);
    1788         624 :     }
    1789             : 
    1790     1014922 :     static inline XMMReg4Double Load4Val(const short *ptr)
    1791             :     {
    1792     1014922 :         XMMReg4Double reg;
    1793     1014922 :         reg.low.nsLoad2Val(ptr);
    1794     1014922 :         reg.high.nsLoad2Val(ptr + 2);
    1795     1014922 :         return reg;
    1796             :     }
    1797             : 
    1798         544 :     static inline void Load8Val(const short *ptr, XMMReg4Double &low,
    1799             :                                 XMMReg4Double &high)
    1800             :     {
    1801         544 :         low = Load4Val(ptr);
    1802         544 :         high = Load4Val(ptr + 4);
    1803         544 :     }
    1804             : 
    1805    22700602 :     static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    1806             :     {
    1807    22700602 :         XMMReg4Double reg;
    1808    22579202 :         reg.low.nsLoad2Val(ptr);
    1809    22753902 :         reg.high.nsLoad2Val(ptr + 2);
    1810    22802202 :         return reg;
    1811             :     }
    1812             : 
    1813         544 :     static inline void Load8Val(const unsigned short *ptr, XMMReg4Double &low,
    1814             :                                 XMMReg4Double &high)
    1815             :     {
    1816         544 :         low = Load4Val(ptr);
    1817         544 :         high = Load4Val(ptr + 4);
    1818         544 :     }
    1819             : 
    1820        1088 :     static inline XMMReg4Double Load4Val(const int *ptr)
    1821             :     {
    1822        1088 :         XMMReg4Double reg;
    1823        1088 :         reg.low.nsLoad2Val(ptr);
    1824        1088 :         reg.high.nsLoad2Val(ptr + 2);
    1825        1088 :         return reg;
    1826             :     }
    1827             : 
    1828         544 :     static inline void Load8Val(const int *ptr, XMMReg4Double &low,
    1829             :                                 XMMReg4Double &high)
    1830             :     {
    1831         544 :         low = Load4Val(ptr);
    1832         544 :         high = Load4Val(ptr + 4);
    1833         544 :     }
    1834             : 
    1835   249962016 :     static inline XMMReg4Double Load4Val(const double *ptr)
    1836             :     {
    1837   249962016 :         XMMReg4Double reg;
    1838   250051016 :         reg.low.nsLoad2Val(ptr);
    1839   250234016 :         reg.high.nsLoad2Val(ptr + 2);
    1840   251183016 :         return reg;
    1841             :     }
    1842             : 
    1843         544 :     static inline void Load8Val(const double *ptr, XMMReg4Double &low,
    1844             :                                 XMMReg4Double &high)
    1845             :     {
    1846         544 :         low = Load4Val(ptr);
    1847         544 :         high = Load4Val(ptr + 4);
    1848         544 :     }
    1849             : 
    1850    93806700 :     static inline XMMReg4Double Load4ValAligned(const double *ptr)
    1851             :     {
    1852    93806700 :         XMMReg4Double reg;
    1853    93795700 :         reg.low.nsLoad2ValAligned(ptr);
    1854    93772800 :         reg.high.nsLoad2ValAligned(ptr + 2);
    1855    93826200 :         return reg;
    1856             :     }
    1857             : 
    1858       38786 :     static inline XMMReg4Double Load4Val(const float *ptr)
    1859             :     {
    1860       38786 :         XMMReg4Double reg;
    1861       38786 :         XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
    1862       38786 :         return reg;
    1863             :     }
    1864             : 
    1865         576 :     static inline void Load8Val(const float *ptr, XMMReg4Double &low,
    1866             :                                 XMMReg4Double &high)
    1867             :     {
    1868         576 :         low = Load4Val(ptr);
    1869         576 :         high = Load4Val(ptr + 4);
    1870         576 :     }
    1871             : 
    1872           2 :     static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
    1873             :                                        const XMMReg4Double &expr2)
    1874             :     {
    1875           2 :         XMMReg4Double reg;
    1876           2 :         reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
    1877           2 :         reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
    1878           2 :         return reg;
    1879             :     }
    1880             : 
    1881     1334402 :     static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
    1882             :                                           const XMMReg4Double &expr2)
    1883             :     {
    1884     1334402 :         XMMReg4Double reg;
    1885     1330402 :         reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
    1886     1336082 :         reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
    1887     1336162 :         return reg;
    1888             :     }
    1889             : 
    1890           6 :     static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
    1891             :                                         const XMMReg4Double &expr2)
    1892             :     {
    1893           6 :         XMMReg4Double reg;
    1894           6 :         reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
    1895           6 :         reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
    1896           6 :         return reg;
    1897             :     }
    1898             : 
    1899     1333640 :     static inline XMMReg4Double And(const XMMReg4Double &expr1,
    1900             :                                     const XMMReg4Double &expr2)
    1901             :     {
    1902     1333640 :         XMMReg4Double reg;
    1903     1336990 :         reg.low = XMMReg2Double::And(expr1.low, expr2.low);
    1904     1336000 :         reg.high = XMMReg2Double::And(expr1.high, expr2.high);
    1905     1337350 :         return reg;
    1906             :     }
    1907             : 
    1908           2 :     static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
    1909             :                                         const XMMReg4Double &true_expr,
    1910             :                                         const XMMReg4Double &false_expr)
    1911             :     {
    1912           2 :         XMMReg4Double reg;
    1913             :         reg.low =
    1914           2 :             XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
    1915             :         reg.high =
    1916           2 :             XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
    1917           2 :         return reg;
    1918             :     }
    1919             : 
    1920     4205962 :     static inline XMMReg4Double Min(const XMMReg4Double &expr1,
    1921             :                                     const XMMReg4Double &expr2)
    1922             :     {
    1923     4205962 :         XMMReg4Double reg;
    1924     4220002 :         reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
    1925     4222402 :         reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
    1926     4213942 :         return reg;
    1927             :     }
    1928             : 
    1929   120682008 :     inline XMMReg4Double &operator=(const XMMReg4Double &other)
    1930             :     {
    1931   120682008 :         low = other.low;
    1932   120664008 :         high = other.high;
    1933   120678008 :         return *this;
    1934             :     }
    1935             : 
    1936   525873002 :     inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    1937             :     {
    1938   525873002 :         low += other.low;
    1939   527432002 :         high += other.high;
    1940   530339002 :         return *this;
    1941             :     }
    1942             : 
    1943    12675302 :     inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    1944             :     {
    1945    12675302 :         low *= other.low;
    1946    12681302 :         high *= other.high;
    1947    12683802 :         return *this;
    1948             :     }
    1949             : 
    1950      189631 :     inline XMMReg4Double operator+(const XMMReg4Double &other) const
    1951             :     {
    1952      189631 :         XMMReg4Double ret;
    1953      189631 :         ret.low = low + other.low;
    1954      189631 :         ret.high = high + other.high;
    1955      189631 :         return ret;
    1956             :     }
    1957             : 
    1958           2 :     inline XMMReg4Double operator-(const XMMReg4Double &other) const
    1959             :     {
    1960           2 :         XMMReg4Double ret;
    1961           2 :         ret.low = low - other.low;
    1962           2 :         ret.high = high - other.high;
    1963           2 :         return ret;
    1964             :     }
    1965             : 
    1966   558378002 :     inline XMMReg4Double operator*(const XMMReg4Double &other) const
    1967             :     {
    1968   558378002 :         XMMReg4Double ret;
    1969   557522002 :         ret.low = low * other.low;
    1970   554012002 :         ret.high = high * other.high;
    1971   556359002 :         return ret;
    1972             :     }
    1973             : 
    1974     1334652 :     inline XMMReg4Double operator/(const XMMReg4Double &other) const
    1975             :     {
    1976     1334652 :         XMMReg4Double ret;
    1977     1337382 :         ret.low = low / other.low;
    1978     1335712 :         ret.high = high / other.high;
    1979     1336712 :         return ret;
    1980             :     }
    1981             : 
    1982      583930 :     void AddToLow(const XMMReg2Double &other)
    1983             :     {
    1984      583930 :         low += other;
    1985      583930 :     }
    1986             : 
    1987   222638002 :     inline double GetHorizSum() const
    1988             :     {
    1989   222638002 :         return (low + high).GetHorizSum();
    1990             :     }
    1991             : 
    1992             : #if !defined(USE_SSE2_EMULATION)
    1993       46991 :     inline XMMReg4Double approx_inv_sqrt(const XMMReg4Double &one,
    1994             :                                          const XMMReg4Double &half) const
    1995             :     {
    1996       46991 :         __m128d reg0 = low.xmm;
    1997       46991 :         __m128d reg1 = high.xmm;
    1998       46991 :         __m128d reg0_half = _mm_mul_pd(reg0, half.low.xmm);
    1999       93982 :         __m128d reg1_half = _mm_mul_pd(reg1, half.low.xmm);
    2000             :         // Compute rough approximation of 1 / sqrt(b) with _mm_rsqrt_ps
    2001      140973 :         reg0 = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(reg0)));
    2002       93982 :         reg1 = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(reg1)));
    2003             :         // And perform one step of Newton-Raphson approximation to improve it
    2004             :         // approx_inv_sqrt_x = approx_inv_sqrt_x*(1.5 -
    2005             :         //                            0.5*x*approx_inv_sqrt_x*approx_inv_sqrt_x);
    2006       93982 :         const __m128d one_and_a_half = _mm_add_pd(one.low.xmm, half.low.xmm);
    2007      187964 :         reg0 = _mm_mul_pd(
    2008             :             reg0, _mm_sub_pd(one_and_a_half,
    2009             :                              _mm_mul_pd(reg0_half, _mm_mul_pd(reg0, reg0))));
    2010      140973 :         reg1 = _mm_mul_pd(
    2011             :             reg1, _mm_sub_pd(one_and_a_half,
    2012             :                              _mm_mul_pd(reg1_half, _mm_mul_pd(reg1, reg1))));
    2013       46991 :         XMMReg4Double ret;
    2014       46991 :         ret.low.xmm = reg0;
    2015       46991 :         ret.high.xmm = reg1;
    2016       46991 :         return ret;
    2017             :     }
    2018             : 
    2019       46991 :     inline XMMReg4Float cast_to_float() const
    2020             :     {
    2021       46991 :         XMMReg4Float ret;
    2022      187964 :         ret.xmm = _mm_castsi128_ps(
    2023       46991 :             _mm_unpacklo_epi64(_mm_castps_si128(_mm_cvtpd_ps(low.xmm)),
    2024       46991 :                                _mm_castps_si128(_mm_cvtpd_ps(high.xmm))));
    2025       46991 :         return ret;
    2026             :     }
    2027             : #endif
    2028             : 
    2029     1676592 :     inline void Store4Val(unsigned char *ptr) const
    2030             :     {
    2031             : #ifdef USE_SSE2_EMULATION
    2032           1 :         low.Store2Val(ptr);
    2033           1 :         high.Store2Val(ptr + 2);
    2034             : #else
    2035     3353852 :         __m128i tmpLow = _mm_cvttpd_epi32(_mm_add_pd(
    2036     1676591 :             low.xmm,
    2037             :             _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
    2038     3354992 :         __m128i tmpHigh = _mm_cvttpd_epi32(_mm_add_pd(
    2039     1677261 :             high.xmm,
    2040             :             _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
    2041     5033333 :         auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow),
    2042             :                                                    _mm_castsi128_ps(tmpHigh),
    2043             :                                                    _MM_SHUFFLE(1, 0, 1, 0)));
    2044     1677481 :         tmp = _mm_packs_epi32(tmp, tmp);
    2045     1677921 :         tmp = _mm_packus_epi16(tmp, tmp);
    2046     1677921 :         GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
    2047             : #endif
    2048     1677092 :     }
    2049             : 
    2050     2558312 :     inline void Store4Val(unsigned short *ptr) const
    2051             :     {
    2052             : #if 1
    2053     2558312 :         low.Store2Val(ptr);
    2054     2566222 :         high.Store2Val(ptr + 2);
    2055             : #else
    2056             :         __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
    2057             :         __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
    2058             :         xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
    2059             : #if __SSE4_1__
    2060             :         xmm0 = _mm_packus_epi32(xmm0, xmm0);  // Pack uint32 to uint16
    2061             : #else
    2062             :         xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(-32768));
    2063             :         xmm0 = _mm_packs_epi32(xmm0, xmm0);
    2064             :         xmm0 = _mm_sub_epi16(xmm0, _mm_set1_epi16(-32768));
    2065             : #endif
    2066             :         GDALCopyXMMToInt64(xmm0, (GInt64 *)ptr);
    2067             : #endif
    2068     2566452 :     }
    2069             : 
    2070    48369802 :     inline void Store4Val(float *ptr) const
    2071             :     {
    2072    48369802 :         low.Store2Val(ptr);
    2073    48390202 :         high.Store2Val(ptr + 2);
    2074    48392502 :     }
    2075             : 
    2076        3488 :     inline void Store4Val(double *ptr) const
    2077             :     {
    2078        3488 :         low.Store2Val(ptr);
    2079        3488 :         high.Store2Val(ptr + 2);
    2080        3488 :     }
    2081             : 
    2082           8 :     inline void StoreMask(unsigned char *ptr) const
    2083             :     {
    2084           8 :         low.StoreMask(ptr);
    2085           8 :         high.StoreMask(ptr + 16);
    2086           8 :     }
    2087             : };
    2088             : 
    2089             : #if !defined(USE_SSE2_EMULATION)
    2090       21706 : inline XMMReg4Double XMMReg4Float::cast_to_double() const
    2091             : {
    2092       21706 :     XMMReg4Double ret;
    2093       21706 :     ret.low.xmm = _mm_cvtps_pd(xmm);
    2094       43412 :     ret.high.xmm = _mm_cvtps_pd(
    2095       21706 :         _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(xmm), 8)));
    2096       21706 :     return ret;
    2097             : }
    2098             : 
    2099       72276 : inline XMMReg4Double XMMReg4Int::cast_to_double() const
    2100             : {
    2101       72276 :     XMMReg4Double ret;
    2102       72276 :     ret.low.xmm = _mm_cvtepi32_pd(xmm);
    2103       72276 :     ret.high.xmm = _mm_cvtepi32_pd(_mm_srli_si128(xmm, 8));
    2104       72276 :     return ret;
    2105             : }
    2106             : #endif
    2107             : 
    2108             : #endif
    2109             : 
    2110             : #endif /* #ifndef DOXYGEN_SKIP */
    2111             : 
    2112             : #endif /* GDALSSE_PRIV_H_INCLUDED */

Generated by: LCOV version 1.14