LCOV - code coverage report
Current view: top level - gcore - rasterio_ssse3.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 194 194 100.0 %
Date: 2025-01-18 12:42:00 Functions: 14 14 100.0 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  GDAL Core
       4             :  * Purpose:  SSSE3 specializations
       5             :  * Author:   Even Rouault <even dot rouault at spatialys dot com>
       6             :  *
       7             :  ******************************************************************************
       8             :  * Copyright (c) 2016, Even Rouault <even dot rouault at spatialys dot com>
       9             :  *
      10             :  * SPDX-License-Identifier: MIT
      11             :  ****************************************************************************/
      12             : 
      13             : #include "cpl_port.h"
      14             : 
      15             : #include <algorithm>
      16             : 
      17             : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                                    \
      18             :      (defined(__x86_64) || defined(_M_X64))) ||                                \
      19             :     defined(USE_NEON_OPTIMIZATIONS)
      20             : 
      21             : #include "rasterio_ssse3.h"
      22             : 
      23             : #ifdef USE_NEON_OPTIMIZATIONS
      24             : #include "include_sse2neon.h"
      25             : #else
      26             : #include <tmmintrin.h>
      27             : #endif
      28             : 
      29             : #include "gdal_priv_templates.hpp"
      30             : 
      31      179430 : void GDALUnrolledCopy_GByte_3_1_SSSE3(GByte *CPL_RESTRICT pDest,
      32             :                                       const GByte *CPL_RESTRICT pSrc,
      33             :                                       GPtrDiff_t nIters)
      34             : {
      35             :     decltype(nIters) i;
      36      179430 :     const __m128i xmm_shuffle0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
      37             :                                               -1, -1, 15, 12, 9, 6, 3, 0);
      38      179430 :     const __m128i xmm_shuffle1 = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11, 8, 5,
      39             :                                               2, -1, -1, -1, -1, -1, -1);
      40      179430 :     const __m128i xmm_shuffle2 = _mm_set_epi8(13, 10, 7, 4, 1, -1, -1, -1, -1,
      41             :                                               -1, -1, -1, -1, -1, -1, -1);
      42             :     // If we were sure that there would always be 2 trailing bytes, we could
      43             :     // check against nIters - 15
      44    30307300 :     for (i = 0; i < nIters - 16; i += 16)
      45             :     {
      46             :         __m128i xmm0 =
      47    30127900 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
      48             :         __m128i xmm1 =
      49    30127900 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
      50             :         __m128i xmm2 =
      51    60255700 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
      52             : 
      53             :         // From LSB to MSB:
      54             :         // 0,x,x,1,x,x,2,x,x,3,x,x,4,x,x,5 --> 0,1,2,3,4,5,0,0,0,0,0,0,0,0,0,0
      55    30127900 :         xmm0 = _mm_shuffle_epi8(xmm0, xmm_shuffle0);
      56             :         // x,x,6,x,x,7,x,x,8,x,x,9,x,x,10,x --> 0,0,0,0,0,0,6,7,8,9,10,0,0,0,0,0
      57    30127900 :         xmm1 = _mm_shuffle_epi8(xmm1, xmm_shuffle1);
      58             :         // x,11,x,x,12,x,x,13,x,x,14,x,x,15,x,x -->
      59             :         // 0,0,0,0,0,0,0,0,0,0,0,11,12,13,14,15
      60    30127900 :         xmm2 = _mm_shuffle_epi8(xmm2, xmm_shuffle2);
      61    30127900 :         xmm0 = _mm_or_si128(xmm0, xmm1);
      62    30127900 :         xmm0 = _mm_or_si128(xmm0, xmm2);
      63             : 
      64    30127900 :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
      65             : 
      66    30127900 :         pSrc += 3 * 16;
      67             :     }
      68     2785980 :     for (; i < nIters; i++)
      69             :     {
      70     2606550 :         pDest[i] = *pSrc;
      71     2606550 :         pSrc += 3;
      72             :     }
      73      179430 : }
      74             : 
      75             : /************************************************************************/
      76             : /*                  GDALDeinterleave3Byte_SSSE3()                       */
      77             : /************************************************************************/
      78             : 
      79             : #if defined(__GNUC__) && !defined(__clang__)
      80             : // GCC autovectorizer does an excellent job
      81       69804 : __attribute__((optimize("tree-vectorize"))) void GDALDeinterleave3Byte_SSSE3(
      82             :     const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
      83             :     GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2, size_t nIters)
      84             : {
      85   221989000 :     for (size_t i = 0; i < nIters; ++i)
      86             :     {
      87   221919000 :         pabyDest0[i] = pabySrc[3 * i + 0];
      88   221919000 :         pabyDest1[i] = pabySrc[3 * i + 1];
      89   221919000 :         pabyDest2[i] = pabySrc[3 * i + 2];
      90             :     }
      91       69804 : }
      92             : #else
      93             : void GDALDeinterleave3Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
      94             :                                  GByte *CPL_RESTRICT pabyDest0,
      95             :                                  GByte *CPL_RESTRICT pabyDest1,
      96             :                                  GByte *CPL_RESTRICT pabyDest2, size_t nIters)
      97             : {
      98             :     size_t i = 0;
      99             :     for (; i + 15 < nIters; i += 16)
     100             :     {
     101             :         __m128i xmm0 = _mm_loadu_si128(
     102             :             reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 0));
     103             :         __m128i xmm1 = _mm_loadu_si128(
     104             :             reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 16));
     105             :         __m128i xmm2 = _mm_loadu_si128(
     106             :             reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 32));
     107             :         auto xmm0_new =
     108             :             _mm_shuffle_epi8(xmm0, _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10,
     109             :                                                 7, 4, 1, 9, 6, 3, 0));
     110             :         auto xmm1_new = _mm_shuffle_epi8(
     111             :             _mm_alignr_epi8(xmm1, xmm0, 12),
     112             :             _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, 7, 4, 1, 9, 6, 3, 0));
     113             :         auto xmm2_new = _mm_shuffle_epi8(
     114             :             _mm_alignr_epi8(xmm2, xmm1, 8),
     115             :             _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, 7, 4, 1, 9, 6, 3, 0));
     116             :         auto xmm3_new =
     117             :             _mm_shuffle_epi8(xmm2, _mm_set_epi8(-1, -1, -1, -1, 15, 12, 9, 6,
     118             :                                                 14, 11, 8, 5, 13, 10, 7, 4));
     119             : 
     120             :         __m128i xmm01lo =
     121             :             _mm_unpacklo_epi32(xmm0_new, xmm1_new);  // W0 W4 W1 W5
     122             :         __m128i xmm01hi = _mm_unpackhi_epi32(xmm0_new, xmm1_new);  // W2 W6 -  -
     123             :         __m128i xmm23lo =
     124             :             _mm_unpacklo_epi32(xmm2_new, xmm3_new);  // W8 WC W9 WD
     125             :         __m128i xmm23hi = _mm_unpackhi_epi32(xmm2_new, xmm3_new);  // WA WE -  -
     126             :         xmm0_new = _mm_unpacklo_epi64(xmm01lo, xmm23lo);  // W0 W4 W8 WC
     127             :         xmm1_new = _mm_unpackhi_epi64(xmm01lo, xmm23lo);  // W1 W5 W9 WD
     128             :         xmm2_new = _mm_unpacklo_epi64(xmm01hi, xmm23hi);  // W2 W6 WA WE
     129             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0_new);
     130             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1_new);
     131             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest2 + i), xmm2_new);
     132             :     }
     133             : #if defined(__clang__)
     134             : #pragma clang loop vectorize(disable)
     135             : #endif
     136             :     for (; i < nIters; ++i)
     137             :     {
     138             :         pabyDest0[i] = pabySrc[3 * i + 0];
     139             :         pabyDest1[i] = pabySrc[3 * i + 1];
     140             :         pabyDest2[i] = pabySrc[3 * i + 2];
     141             :     }
     142             : }
     143             : #endif
     144             : 
     145             : /************************************************************************/
     146             : /*                     GDALTranspose4x4Int32()                          */
     147             : /************************************************************************/
     148             : 
     149             : // Consider that the input registers form 4x4 words of size 4 bytes each,
     150             : // Return the transposition of this 4x4 matrix
     151             : // Considering that in0 = (in00, in01, in02, in03)
     152             : // Considering that in1 = (in10, in11, in12, in13)
     153             : // Considering that in2 = (in20, in21, in22, in23)
     154             : // Considering that in3 = (in30, in31, in32, in33)
     155             : // Return          out0 = (in00, in10, in20, in30)
     156             : // Return          out1 = (in01, in11, in21, in31)
     157             : // Return          out2 = (in02, in12, in22, in32)
     158             : // Return          out3 = (in03, in13, in23, in33)
     159       33056 : inline void GDALTranspose4x4Int32(__m128i in0, __m128i in1, __m128i in2,
     160             :                                   __m128i in3, __m128i &out0, __m128i &out1,
     161             :                                   __m128i &out2, __m128i &out3)
     162             : {
     163       33056 :     __m128i tmp0 = _mm_unpacklo_epi32(in0, in1);  // (in00, in10, in01, in11)
     164       33056 :     __m128i tmp1 = _mm_unpackhi_epi32(in0, in1);  // (in02, in12, in03, in13)
     165       33056 :     __m128i tmp2 = _mm_unpacklo_epi32(in2, in3);  // (in20, in30, in21, in31)
     166       33056 :     __m128i tmp3 = _mm_unpackhi_epi32(in2, in3);  // (in22, in32, in23, in33)
     167             : 
     168       33056 :     out0 = _mm_unpacklo_epi64(tmp0, tmp2);  // (in00, in10, in20, in30)
     169       33056 :     out1 = _mm_unpackhi_epi64(tmp0, tmp2);  // (in01, in11, in21, in31)
     170       33056 :     out2 = _mm_unpacklo_epi64(tmp1, tmp3);  // (in02, in12, in22, in32)
     171       33056 :     out3 = _mm_unpackhi_epi64(tmp1, tmp3);  // (in03, in13, in23, in33)
     172       33056 : }
     173             : 
     174             : /************************************************************************/
     175             : /*                  GDALDeinterleave4Byte_SSSE3()                       */
     176             : /************************************************************************/
     177             : 
     178             : #if !defined(__GNUC__) || defined(__clang__)
     179             : void GDALDeinterleave4Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
     180             :                                  GByte *CPL_RESTRICT pabyDest0,
     181             :                                  GByte *CPL_RESTRICT pabyDest1,
     182             :                                  GByte *CPL_RESTRICT pabyDest2,
     183             :                                  GByte *CPL_RESTRICT pabyDest3, size_t nIters)
     184             : {
     185             :     const __m128i shuffle_mask =
     186             :         _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
     187             :     size_t i = 0;
     188             :     for (; i + 15 < nIters; i += 16)
     189             :     {
     190             :         __m128i xmm0 = _mm_loadu_si128(
     191             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
     192             :         __m128i xmm1 = _mm_loadu_si128(
     193             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
     194             :         __m128i xmm2 = _mm_loadu_si128(
     195             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
     196             :         __m128i xmm3 = _mm_loadu_si128(
     197             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
     198             :         xmm0 = _mm_shuffle_epi8(xmm0, shuffle_mask);  // W0 W1 W2 W3
     199             :         xmm1 = _mm_shuffle_epi8(xmm1, shuffle_mask);  // W4 W5 W6 W7
     200             :         xmm2 = _mm_shuffle_epi8(xmm2, shuffle_mask);  // W8 W9 WA WB
     201             :         xmm3 = _mm_shuffle_epi8(xmm3, shuffle_mask);  // WC WD WE WF
     202             : 
     203             :         GDALTranspose4x4Int32(xmm0, xmm1, xmm2, xmm3, xmm0, xmm1, xmm2, xmm3);
     204             : 
     205             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0);
     206             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1);
     207             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest2 + i), xmm2);
     208             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest3 + i), xmm3);
     209             :     }
     210             : #if defined(__clang__)
     211             : #pragma clang loop vectorize(disable)
     212             : #endif
     213             :     for (; i < nIters; ++i)
     214             :     {
     215             :         pabyDest0[i] = pabySrc[4 * i + 0];
     216             :         pabyDest1[i] = pabySrc[4 * i + 1];
     217             :         pabyDest2[i] = pabySrc[4 * i + 2];
     218             :         pabyDest3[i] = pabySrc[4 * i + 3];
     219             :     }
     220             : }
     221             : #endif
     222             : 
     223             : /************************************************************************/
     224             : /*                  GDALDeinterleave3UInt16_SSSE3()                     */
     225             : /************************************************************************/
     226             : 
     227             : #if (defined(__GNUC__) && !defined(__clang__)) ||                              \
     228             :     defined(__INTEL_CLANG_COMPILER)
     229             : #if !defined(__INTEL_CLANG_COMPILER)
     230             : // GCC autovectorizer does an excellent job
     231             : __attribute__((optimize("tree-vectorize")))
     232             : #endif
     233         239 : void GDALDeinterleave3UInt16_SSSE3(const GUInt16* CPL_RESTRICT panSrc,
     234             :                                   GUInt16* CPL_RESTRICT panDest0,
     235             :                                   GUInt16* CPL_RESTRICT panDest1,
     236             :                                   GUInt16* CPL_RESTRICT panDest2,
     237             :                                   size_t nIters)
     238             : {
     239     9256750 :     for (size_t i = 0; i < nIters; ++i)
     240             :     {
     241     9256510 :         panDest0[i] = panSrc[3 * i + 0];
     242     9256510 :         panDest1[i] = panSrc[3 * i + 1];
     243     9256510 :         panDest2[i] = panSrc[3 * i + 2];
     244             :     }
     245         239 : }
     246             : #endif
     247             : 
     248             : /************************************************************************/
     249             : /*                  GDALDeinterleave4UInt16_SSSE3()                     */
     250             : /************************************************************************/
     251             : 
     252             : #if (defined(__GNUC__) && !defined(__clang__)) ||                              \
     253             :     defined(__INTEL_CLANG_COMPILER)
     254             : #if !defined(__INTEL_CLANG_COMPILER)
     255             : // GCC autovectorizer does an excellent job
     256             : __attribute__((optimize("tree-vectorize")))
     257             : #endif
     258         494 : void GDALDeinterleave4UInt16_SSSE3(const GUInt16* CPL_RESTRICT panSrc,
     259             :                                   GUInt16* CPL_RESTRICT panDest0,
     260             :                                   GUInt16* CPL_RESTRICT panDest1,
     261             :                                   GUInt16* CPL_RESTRICT panDest2,
     262             :                                   GUInt16* CPL_RESTRICT panDest3,
     263             :                                   size_t nIters)
     264             : {
     265      472352 :     for (size_t i = 0; i < nIters; ++i)
     266             :     {
     267      471858 :         panDest0[i] = panSrc[4 * i + 0];
     268      471858 :         panDest1[i] = panSrc[4 * i + 1];
     269      471858 :         panDest2[i] = panSrc[4 * i + 2];
     270      471858 :         panDest3[i] = panSrc[4 * i + 3];
     271             :     }
     272         494 : }
     273             : #endif
     274             : 
     275             : /************************************************************************/
     276             : /*                               loadu()                                */
     277             : /************************************************************************/
     278             : 
     279       66117 : inline __m128i loadu(const uint8_t *pSrc, size_t i, size_t srcStride)
     280             : {
     281       66117 :     return _mm_loadu_si128(
     282      132234 :         reinterpret_cast<const __m128i *>(pSrc + i * srcStride));
     283             : }
     284             : 
     285             : /************************************************************************/
     286             : /*                               storeu()                               */
     287             : /************************************************************************/
     288             : 
     289       66117 : inline void storeu(uint8_t *pDst, size_t i, size_t dstStride, __m128i reg)
     290             : {
     291       66117 :     _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + i * dstStride), reg);
     292       66117 : }
     293             : 
     294             : /************************************************************************/
     295             : /*                      GDALInterleave3Byte_SSSE3()                     */
     296             : /************************************************************************/
     297             : 
     298             : #if (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
     299             : 
     300             : inline __m128i GDAL_mm_or_3_si128(__m128i r0, __m128i r1, __m128i r2)
     301             : {
     302             :     return _mm_or_si128(_mm_or_si128(r0, r1), r2);
     303             : }
     304             : 
     305             : // ICC autovectorizer doesn't do a good job at generating good SSE code,
     306             : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
     307             : #if defined(__GNUC__)
     308             : __attribute__((noinline))
     309             : #endif
     310             : static void
     311             : GDALInterleave3Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
     312             :                           uint8_t *CPL_RESTRICT pDst, size_t nIters)
     313             : {
     314             :     size_t i = 0;
     315             :     constexpr size_t VALS_PER_ITER = 16;
     316             : 
     317             :     if (nIters >= VALS_PER_ITER)
     318             :     {
     319             :         // clang-format off
     320             :         constexpr char X = -1;
     321             :         // How to dispatch 16 values of row=0 onto 3x16 bytes
     322             :         const __m128i xmm_shuffle00 = _mm_setr_epi8(0, X, X,
     323             :                                                     1, X, X,
     324             :                                                     2, X, X,
     325             :                                                     3, X, X,
     326             :                                                     4, X, X,
     327             :                                                     5);
     328             :         const __m128i xmm_shuffle01 = _mm_setr_epi8(   X, X,
     329             :                                                     6, X, X,
     330             :                                                     7, X, X,
     331             :                                                     8, X, X,
     332             :                                                     9, X, X,
     333             :                                                     10,X);
     334             :         const __m128i xmm_shuffle02 = _mm_setr_epi8(       X,
     335             :                                                     11, X, X,
     336             :                                                     12, X, X,
     337             :                                                     13, X, X,
     338             :                                                     14, X, X,
     339             :                                                     15, X, X);
     340             : 
     341             :         // How to dispatch 16 values of row=1 onto 3x16 bytes
     342             :         const __m128i xmm_shuffle10 = _mm_setr_epi8(X, 0, X,
     343             :                                                     X, 1, X,
     344             :                                                     X, 2, X,
     345             :                                                     X, 3, X,
     346             :                                                     X, 4, X,
     347             :                                                     X);
     348             :         const __m128i xmm_shuffle11 = _mm_setr_epi8(   5, X,
     349             :                                                     X, 6, X,
     350             :                                                     X, 7, X,
     351             :                                                     X, 8, X,
     352             :                                                     X, 9, X,
     353             :                                                     X,10);
     354             :         const __m128i xmm_shuffle12 = _mm_setr_epi8(       X,
     355             :                                                     X, 11, X,
     356             :                                                     X, 12, X,
     357             :                                                     X, 13, X,
     358             :                                                     X, 14, X,
     359             :                                                     X, 15, X);
     360             : 
     361             :         // How to dispatch 16 values of row=2 onto 3x16 bytes
     362             :         const __m128i xmm_shuffle20 = _mm_setr_epi8(X, X, 0,
     363             :                                                     X, X, 1,
     364             :                                                     X, X, 2,
     365             :                                                     X, X, 3,
     366             :                                                     X, X, 4,
     367             :                                                     X);
     368             :         const __m128i xmm_shuffle21 = _mm_setr_epi8(   X, 5,
     369             :                                                     X, X, 6,
     370             :                                                     X, X, 7,
     371             :                                                     X, X, 8,
     372             :                                                     X, X, 9,
     373             :                                                     X, X);
     374             :         const __m128i xmm_shuffle22 = _mm_setr_epi8(      10,
     375             :                                                     X, X, 11,
     376             :                                                     X, X, 12,
     377             :                                                     X, X, 13,
     378             :                                                     X, X, 14,
     379             :                                                     X, X, 15);
     380             :         // clang-format on
     381             : 
     382             :         for (; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
     383             :         {
     384             : #define LOAD(x) __m128i xmm##x = loadu(pSrc + i, x, nIters)
     385             :             LOAD(0);
     386             :             LOAD(1);
     387             :             LOAD(2);
     388             : 
     389             : #define SHUFFLE(x, y) _mm_shuffle_epi8(xmm##y, xmm_shuffle##y##x)
     390             : #define COMBINE_3(x)                                                           \
     391             :     GDAL_mm_or_3_si128(SHUFFLE(x, 0), SHUFFLE(x, 1), SHUFFLE(x, 2))
     392             : 
     393             : #define STORE(x)                                                               \
     394             :     storeu(pDst, 3 * (i / VALS_PER_ITER) + x, VALS_PER_ITER, COMBINE_3(x))
     395             :             STORE(0);
     396             :             STORE(1);
     397             :             STORE(2);
     398             : #undef LOAD
     399             : #undef COMBINE_3
     400             : #undef SHUFFLE
     401             : #undef STORE
     402             :         }
     403             :     }
     404             : 
     405             :     for (; i < nIters; ++i)
     406             :     {
     407             : #define INTERLEAVE(x) pDst[3 * i + x] = pSrc[i + x * nIters]
     408             :         INTERLEAVE(0);
     409             :         INTERLEAVE(1);
     410             :         INTERLEAVE(2);
     411             : #undef INTERLEAVE
     412             :     }
     413             : }
     414             : 
     415             : #else
     416             : 
     417             : #if defined(__GNUC__) && !defined(__clang__)
     418             : __attribute__((optimize("tree-vectorize")))
     419             : #endif
     420             : #if defined(__GNUC__)
     421             : __attribute__((noinline))
     422             : #endif
     423             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
     424             : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
     425             : #pragma clang diagnostic push
     426             : #pragma clang diagnostic ignored "-Wpass-failed"
     427             : #endif
     428             : static void
     429           4 : GDALInterleave3Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
     430             :                           uint8_t *CPL_RESTRICT pDst, size_t nIters)
     431             : {
     432             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
     433             : #pragma clang loop vectorize(enable)
     434             : #endif
     435          46 :     for (size_t i = 0; i < nIters; ++i)
     436             :     {
     437          42 :         pDst[3 * i + 0] = pSrc[i + 0 * nIters];
     438          42 :         pDst[3 * i + 1] = pSrc[i + 1 * nIters];
     439          42 :         pDst[3 * i + 2] = pSrc[i + 2 * nIters];
     440             :     }
     441           4 : }
     442             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
     443             : #pragma clang diagnostic pop
     444             : #endif
     445             : 
     446             : #endif
     447             : 
     448             : /************************************************************************/
     449             : /*                      GDALInterleave5Byte_SSSE3()                     */
     450             : /************************************************************************/
     451             : 
     452           5 : inline __m128i GDAL_mm_or_5_si128(__m128i r0, __m128i r1, __m128i r2,
     453             :                                   __m128i r3, __m128i r4)
     454             : {
     455          15 :     return _mm_or_si128(
     456           5 :         _mm_or_si128(_mm_or_si128(r0, r1), _mm_or_si128(r2, r3)), r4);
     457             : }
     458             : 
     459           2 : void GDALInterleave5Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
     460             :                                uint8_t *CPL_RESTRICT pDst, size_t nIters)
     461             : {  // Interleaves 5 planar rows of nIters bytes (row r starts at pSrc + r * nIters) into 5-byte tuples: pDst[5 * i + r] = pSrc[i + r * nIters]
     462           2 :     size_t i = 0;  // index of the next value (in every row) still to be interleaved
     463           2 :     constexpr size_t VALS_PER_ITER = 16;  // values consumed from each row per vectorized iteration (one __m128i)
     464             : 
     465           2 :     if (nIters >= VALS_PER_ITER)  // the shuffle masks are only worth building if at least one full group of 16 exists
     466             :     {
     467             :         // clang-format off
     468           1 :         constexpr char X = -1;  // mask byte with the high bit set: _mm_shuffle_epi8 writes 0 to that output byte
     469             :         // How to dispatch 16 values of row=0 onto 5x16 bytes: xmm_shuffleRK builds output register K (of 5) from row R; value k of row 0 lands on output byte 5 * k
     470           1 :         const __m128i xmm_shuffle00 = _mm_setr_epi8(0, X, X, X, X,
     471             :                                                     1, X, X, X, X,
     472             :                                                     2, X, X, X, X,
     473             :                                                     3);
     474           1 :         const __m128i xmm_shuffle01 = _mm_setr_epi8(   X, X, X, X,
     475             :                                                     4, X, X, X, X,
     476             :                                                     5, X, X, X, X,
     477             :                                                     6, X);
     478           1 :         const __m128i xmm_shuffle02 = _mm_setr_epi8(      X, X, X,
     479             :                                                     7, X, X, X, X,
     480             :                                                     8, X, X, X, X,
     481             :                                                     9, X, X);
     482           1 :         const __m128i xmm_shuffle03 = _mm_setr_epi8(          X, X,
     483             :                                                     10, X, X, X, X,
     484             :                                                     11, X, X, X, X,
     485             :                                                     12, X, X, X);
     486           1 :         const __m128i xmm_shuffle04 = _mm_setr_epi8(             X,
     487             :                                                     13, X, X, X, X,
     488             :                                                     14, X, X, X, X,
     489             :                                                     15, X, X, X, X);
     490             : 
     491             :         // How to dispatch 16 values of row=1 onto 5x16 bytes (value k lands on output byte 5 * k + 1)
     492           1 :         const __m128i xmm_shuffle10 = _mm_setr_epi8(X, 0, X, X, X,
     493             :                                                     X, 1, X, X, X,
     494             :                                                     X, 2, X, X, X,
     495             :                                                     X);
     496           1 :         const __m128i xmm_shuffle11 = _mm_setr_epi8(   3, X, X, X,
     497             :                                                     X, 4, X, X, X,
     498             :                                                     X, 5, X, X, X,
     499             :                                                     X, 6);
     500           1 :         const __m128i xmm_shuffle12 = _mm_setr_epi8(      X, X, X,
     501             :                                                     X, 7, X, X, X,
     502             :                                                     X, 8, X, X, X,
     503             :                                                     X, 9, X);
     504           1 :         const __m128i xmm_shuffle13 = _mm_setr_epi8(          X, X,
     505             :                                                     X, 10, X, X, X,
     506             :                                                     X, 11, X, X, X,
     507             :                                                     X, 12, X, X);
     508           1 :         const __m128i xmm_shuffle14 = _mm_setr_epi8(             X,
     509             :                                                     X, 13, X, X, X,
     510             :                                                     X, 14, X, X, X,
     511             :                                                     X, 15, X, X, X);
     512             : 
     513             :         // How to dispatch 16 values of row=2 onto 5x16 bytes (value k lands on output byte 5 * k + 2)
     514           1 :         const __m128i xmm_shuffle20 = _mm_setr_epi8(X, X, 0, X, X,
     515             :                                                     X, X, 1, X, X,
     516             :                                                     X, X, 2, X, X,
     517             :                                                     X);
     518           1 :         const __m128i xmm_shuffle21 = _mm_setr_epi8(   X, 3, X, X,
     519             :                                                     X, X, 4, X, X,
     520             :                                                     X, X, 5, X, X,
     521             :                                                     X, X);
     522           1 :         const __m128i xmm_shuffle22 = _mm_setr_epi8(      6, X, X,
     523             :                                                     X, X, 7, X, X,
     524             :                                                     X, X, 8, X, X,
     525             :                                                     X, X, 9);
     526           1 :         const __m128i xmm_shuffle23 = _mm_setr_epi8(          X, X,
     527             :                                                     X, X, 10, X, X,
     528             :                                                     X, X, 11, X, X,
     529             :                                                     X, X, 12, X);
     530           1 :         const __m128i xmm_shuffle24 = _mm_setr_epi8(             X,
     531             :                                                     X, X, 13, X, X,
     532             :                                                     X, X, 14, X, X,
     533             :                                                     X, X, 15, X, X);
     534             : 
     535             :         // How to dispatch 16 values of row=3 onto 5x16 bytes (value k lands on output byte 5 * k + 3)
     536           1 :         const __m128i xmm_shuffle30 = _mm_setr_epi8(X, X, X, 0, X,
     537             :                                                     X, X, X, 1, X,
     538             :                                                     X, X, X, 2, X,
     539             :                                                     X);
     540           1 :         const __m128i xmm_shuffle31 = _mm_setr_epi8(   X, X, 3, X,
     541             :                                                     X, X, X, 4, X,
     542             :                                                     X, X, X, 5, X,
     543             :                                                     X, X);
     544           1 :         const __m128i xmm_shuffle32 = _mm_setr_epi8(      X, 6, X,
     545             :                                                     X, X, X, 7, X,
     546             :                                                     X, X, X, 8, X,
     547             :                                                     X, X, X);
     548           1 :         const __m128i xmm_shuffle33 = _mm_setr_epi8(          9, X,
     549             :                                                     X, X, X, 10, X,
     550             :                                                     X, X, X, 11, X,
     551             :                                                     X, X, X, 12);
     552           1 :         const __m128i xmm_shuffle34 = _mm_setr_epi8(             X,
     553             :                                                     X, X, X, 13, X,
     554             :                                                     X, X, X, 14, X,
     555             :                                                     X, X, X, 15, X);
     556             : 
     557             :         // How to dispatch 16 values of row=4 onto 5x16 bytes (value k lands on output byte 5 * k + 4)
     558           1 :         const __m128i xmm_shuffle40 = _mm_setr_epi8(X, X, X, X, 0,
     559             :                                                     X, X, X, X, 1,
     560             :                                                     X, X, X, X, 2,
     561             :                                                     X);
     562           1 :         const __m128i xmm_shuffle41 = _mm_setr_epi8(   X, X, X, 3,
     563             :                                                     X, X, X, X, 4,
     564             :                                                     X, X, X, X, 5,
     565             :                                                     X, X);
     566           1 :         const __m128i xmm_shuffle42 = _mm_setr_epi8(      X, X, 6,
     567             :                                                     X, X, X, X, 7,
     568             :                                                     X, X, X, X, 8,
     569             :                                                     X, X, X);
     570           1 :         const __m128i xmm_shuffle43 = _mm_setr_epi8(         X,  9,
     571             :                                                     X, X, X, X, 10,
     572             :                                                     X, X, X, X, 11,
     573             :                                                     X, X, X, X);
     574           1 :         const __m128i xmm_shuffle44 = _mm_setr_epi8(            12,
     575             :                                                     X, X, X, X, 13,
     576             :                                                     X, X, X, X, 14,
     577             :                                                     X, X, X, X, 15);
     578             :         // clang-format on
     579             : 
     580           2 :         for (; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)  // one pass = 16 values from each of the 5 rows -> 80 output bytes
     581             :         {
     582             : #define LOAD(x) __m128i xmm##x = loadu(pSrc + i, x, nIters)
     583           1 :             LOAD(0);  // NOTE(review): loadu/storeu are defined outside this chunk; presumably loadu(p, x, stride) reads 16 bytes at p + x * stride - confirm
     584           1 :             LOAD(1);
     585           1 :             LOAD(2);
     586           1 :             LOAD(3);
     587           1 :             LOAD(4);
     588             : 
     589             : #define SHUFFLE(x, y) _mm_shuffle_epi8(xmm##y, xmm_shuffle##y##x)
     590             : #define COMBINE_5(x)                                                           \
     591             :     GDAL_mm_or_5_si128(SHUFFLE(x, 0), SHUFFLE(x, 1), SHUFFLE(x, 2),            \
     592             :                        SHUFFLE(x, 3), SHUFFLE(x, 4))
     593             : 
     594             : #define STORE(x)                                                               \
     595             :     storeu(pDst, 5 * (i / VALS_PER_ITER) + x, VALS_PER_ITER, COMBINE_5(x))
     596           5 :             STORE(0);  // output register 0 of this pass; the 5 masks for one register select disjoint bytes, so combining them (GDAL_mm_or_5_si128, presumably a 5-way OR) fills all 16 bytes
     597           5 :             STORE(1);
     598           5 :             STORE(2);
     599           5 :             STORE(3);
     600           5 :             STORE(4);
     601             : #undef LOAD
     602             : #undef COMBINE_5
     603             : #undef SHUFFLE
     604             : #undef STORE
     605             :         }
     606             :     }
     607             : 
     608          20 :     for (; i < nIters; ++i)  // scalar tail: the last nIters % 16 values, or everything when nIters < 16
     609             :     {
     610             : #define INTERLEAVE(x) pDst[5 * i + x] = pSrc[i + x * nIters]
     611          18 :         INTERLEAVE(0);
     612          18 :         INTERLEAVE(1);
     613          18 :         INTERLEAVE(2);
     614          18 :         INTERLEAVE(3);
     615          18 :         INTERLEAVE(4);
     616             : #undef INTERLEAVE
     617             :     }
     618           2 : }
     619             : 
     620             : /************************************************************************/
     621             : /*                      GDALTranspose2D_Byte_SSSE3()                    */
     622             : /************************************************************************/
     623             : 
     624             : // Given r = (b00, b01, b02, b03,
     625             : //            b10, b11, b12, b13,
     626             : //            b20, b21, b22, b23,
     627             : //            b30, b31, b32, b33)
     628             : // Return    (b00, b10, b20, b30,
     629             : //            b01, b11, b21, b31,
     630             : //            b02, b12, b22, b32,
     631             : //            b03, b13, b23, b33)
     632       66112 : inline void GDALReorderForTranspose4x4(__m128i &r)
     633             : {  // Permutes the 16 bytes of r in place, viewing them as a row-major 4x4 byte matrix and replacing it with its transpose
     634             :     const __m128i shuffle_mask =
     635       66112 :         _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);  // arguments run from byte 15 down to byte 0, so output byte n takes input byte 4 * (n % 4) + n / 4
     636             : 
     637       66112 :     r = _mm_shuffle_epi8(r, shuffle_mask);  // no mask byte has its high bit set, so this is a pure permutation (no byte is zeroed)
     638       66112 : }
     639             : 
     640             : // Transpose the 16x16 byte values contained in the 16 SSE registers (in place: r00..r15 hold rows on input and are overwritten with the result)
     641        4132 : inline void GDALTranspose16x16ByteBlock_SSSE3(
     642             :     __m128i &r00, __m128i &r01, __m128i &r02, __m128i &r03, __m128i &r04,
     643             :     __m128i &r05, __m128i &r06, __m128i &r07, __m128i &r08, __m128i &r09,
     644             :     __m128i &r10, __m128i &r11, __m128i &r12, __m128i &r13, __m128i &r14,
     645             :     __m128i &r15)
     646             : {
     647             :     __m128i tmp00, tmp01, tmp02, tmp03;  // scratch registers tmpGC: filled from row group G (input rows 4G..4G+3) in stage 1
     648             :     __m128i tmp10, tmp11, tmp12, tmp13;
     649             :     __m128i tmp20, tmp21, tmp22, tmp23;
     650             :     __m128i tmp30, tmp31, tmp32, tmp33;
     651             : 
     652        4132 :     GDALTranspose4x4Int32(r00, r01, r02, r03, tmp00, tmp01, tmp02, tmp03);  // stage 1: per group of 4 rows; NOTE(review): helper defined outside this chunk, presumably a 4x4 transpose of 32-bit lanes - confirm
     653        4132 :     GDALTranspose4x4Int32(r04, r05, r06, r07, tmp10, tmp11, tmp12, tmp13);
     654        4132 :     GDALTranspose4x4Int32(r08, r09, r10, r11, tmp20, tmp21, tmp22, tmp23);
     655        4132 :     GDALTranspose4x4Int32(r12, r13, r14, r15, tmp30, tmp31, tmp32, tmp33);
     656             : 
     657        4132 :     GDALReorderForTranspose4x4(tmp00);  // stage 2: transpose the 4x4 byte tile held in each scratch register
     658        4132 :     GDALReorderForTranspose4x4(tmp01);
     659        4132 :     GDALReorderForTranspose4x4(tmp02);
     660        4132 :     GDALReorderForTranspose4x4(tmp03);
     661        4132 :     GDALReorderForTranspose4x4(tmp10);
     662        4132 :     GDALReorderForTranspose4x4(tmp11);
     663        4132 :     GDALReorderForTranspose4x4(tmp12);
     664        4132 :     GDALReorderForTranspose4x4(tmp13);
     665        4132 :     GDALReorderForTranspose4x4(tmp20);
     666        4132 :     GDALReorderForTranspose4x4(tmp21);
     667        4132 :     GDALReorderForTranspose4x4(tmp22);
     668        4132 :     GDALReorderForTranspose4x4(tmp23);
     669        4132 :     GDALReorderForTranspose4x4(tmp30);
     670        4132 :     GDALReorderForTranspose4x4(tmp31);
     671        4132 :     GDALReorderForTranspose4x4(tmp32);
     672        4132 :     GDALReorderForTranspose4x4(tmp33);
     673             : 
     674        4132 :     GDALTranspose4x4Int32(tmp00, tmp10, tmp20, tmp30, r00, r01, r02, r03);  // stage 3: regroup across the 4 row groups, writing the result back into r00..r15
     675        4132 :     GDALTranspose4x4Int32(tmp01, tmp11, tmp21, tmp31, r04, r05, r06, r07);
     676        4132 :     GDALTranspose4x4Int32(tmp02, tmp12, tmp22, tmp32, r08, r09, r10, r11);
     677        4132 :     GDALTranspose4x4Int32(tmp03, tmp13, tmp23, tmp33, r12, r13, r14, r15);
     678        4132 : }
     679             : 
     680        4132 : inline void GDALTranspose2D16x16Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
     681             :                                            uint8_t *CPL_RESTRICT pDst,
     682             :                                            size_t srcStride, size_t dstStride)
     683             : {  // Transposes one 16x16 byte tile: loads 16 registers from pSrc, transposes them in registers, stores 16 registers to pDst
     684             : #define LOAD(x) __m128i r##x = loadu(pSrc, x, srcStride)
     685        4132 :     LOAD(0);  // NOTE(review): loadu/storeu are defined outside this chunk; presumably row x is 16 bytes at pSrc + x * srcStride - confirm
     686        4132 :     LOAD(1);
     687        4132 :     LOAD(2);
     688        4132 :     LOAD(3);
     689        4132 :     LOAD(4);
     690        4132 :     LOAD(5);
     691        4132 :     LOAD(6);
     692        4132 :     LOAD(7);
     693        4132 :     LOAD(8);
     694        4132 :     LOAD(9);
     695        4132 :     LOAD(10);
     696        4132 :     LOAD(11);
     697        4132 :     LOAD(12);
     698        4132 :     LOAD(13);
     699        4132 :     LOAD(14);
     700        4132 :     LOAD(15);
     701             : #undef LOAD
     702             : 
     703        4132 :     GDALTranspose16x16ByteBlock_SSSE3(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9,  // in-place transpose of the tile held in r0..r15
     704             :                                       r10, r11, r12, r13, r14, r15);
     705             : 
     706             : #define STORE(x) storeu(pDst, x, dstStride, r##x)
     707        4132 :     STORE(0);  // presumably writes r0 as 16 bytes at pDst + 0 * dstStride (see the loadu/storeu note at the first LOAD)
     708        4132 :     STORE(1);
     709        4132 :     STORE(2);
     710        4132 :     STORE(3);
     711        4132 :     STORE(4);
     712        4132 :     STORE(5);
     713        4132 :     STORE(6);
     714        4132 :     STORE(7);
     715        4132 :     STORE(8);
     716        4132 :     STORE(9);
     717        4132 :     STORE(10);
     718        4132 :     STORE(11);
     719        4132 :     STORE(12);
     720        4132 :     STORE(13);
     721        4132 :     STORE(14);
     722        4132 :     STORE(15);
     723             : #undef STORE
     724        4132 : }
     725             : 
     726          19 : void GDALTranspose2D_Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
     727             :                                 uint8_t *CPL_RESTRICT pDst, size_t nSrcWidth,
     728             :                                 size_t nSrcHeight)
     729             : {  // Transposes a row-major nSrcHeight x nSrcWidth byte matrix: pDst[k + l * nSrcHeight] = pSrc[l + k * nSrcWidth]
     730          19 :     if (nSrcHeight == 3)  // 3 rows: handled by the dedicated 3-row interleave kernel
     731             :     {
     732           4 :         GDALInterleave3Byte_SSSE3(pSrc, pDst, nSrcWidth);
     733             :     }
     734          15 :     else if (nSrcHeight == 5)  // 5 rows: handled by the dedicated 5-row interleave kernel
     735             :     {
     736           2 :         GDALInterleave5Byte_SSSE3(pSrc, pDst, nSrcWidth);
     737             :     }
     738             :     else
     739             :     {
     740          13 :         constexpr size_t blocksize = 16;  // tile edge; matches the 16x16 SIMD kernel used below
     741         109 :         for (size_t i = 0; i < nSrcHeight; i += blocksize)  // i: first source row of the current tile
     742             :         {
     743          96 :             const size_t max_k = std::min(i + blocksize, nSrcHeight);  // one past the last source row of this tile
     744        4280 :             for (size_t j = 0; j < nSrcWidth; j += blocksize)  // j: first source column of the current tile
     745             :             {
     746             :                 // transpose the block beginning at [i,j]
     747        4184 :                 const size_t max_l = std::min(j + blocksize, nSrcWidth);  // one past the last source column of this tile
     748        4184 :                 if (max_k - i == blocksize && max_l - j == blocksize)  // full 16x16 tile: use the SIMD kernel
     749             :                 {
     750        4132 :                     GDALTranspose2D16x16Byte_SSSE3(&pSrc[j + i * nSrcWidth],
     751        4132 :                                                    &pDst[i + j * nSrcHeight],
     752             :                                                    nSrcWidth, nSrcHeight);
     753             :                 }
     754             :                 else
     755             :                 {
     756         614 :                     for (size_t k = i; k < max_k; ++k)  // partial tile on the right or bottom edge: element-by-element copy
     757             :                     {
     758        5005 :                         for (size_t l = j; l < max_l; ++l)
     759             :                         {
     760        4443 :                             GDALCopyWord(pSrc[l + k * nSrcWidth],
     761        4443 :                                          pDst[k + l * nSrcHeight]);
     762             :                         }
     763             :                     }
     764             :                 }
     765             :             }
     766             :         }
     767             :     }
     768          19 : }
     769             : 
     770             : #endif  // (HAVE_SSSE3_AT_COMPILE_TIME && x86_64) || USE_NEON_OPTIMIZATIONS

Generated by: LCOV version 1.14