LCOV - code coverage report
Current view: top level - gcore - rasterio_ssse3.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 38 38 100.0 %
Date: 2024-11-21 22:18:42 Functions: 4 4 100.0 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  GDAL Core
       4             :  * Purpose:  SSSE3 specializations
       5             :  * Author:   Even Rouault <even dot rouault at spatialys dot com>
       6             :  *
       7             :  ******************************************************************************
       8             :  * Copyright (c) 2016, Even Rouault <even dot rouault at spatialys dot com>
       9             :  *
      10             :  * SPDX-License-Identifier: MIT
      11             :  ****************************************************************************/
      12             : 
      13             : #include "cpl_port.h"
      14             : 
      15             : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                                    \
      16             :      (defined(__x86_64) || defined(_M_X64))) ||                                \
      17             :     defined(USE_NEON_OPTIMIZATIONS)
      18             : 
      19             : #include "rasterio_ssse3.h"
      20             : 
      21             : #ifdef USE_NEON_OPTIMIZATIONS
      22             : #include "include_sse2neon.h"
      23             : #else
      24             : #include <tmmintrin.h>
      25             : #endif
      26             : 
      27             : #include "gdal_priv_templates.hpp"
      28             : 
      29      179427 : void GDALUnrolledCopy_GByte_3_1_SSSE3(GByte *CPL_RESTRICT pDest,
      30             :                                       const GByte *CPL_RESTRICT pSrc,
      31             :                                       GPtrDiff_t nIters)
      32             : {
      33             :     decltype(nIters) i;
      34      179427 :     const __m128i xmm_shuffle0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
      35             :                                               -1, -1, 15, 12, 9, 6, 3, 0);
      36      179427 :     const __m128i xmm_shuffle1 = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11, 8, 5,
      37             :                                               2, -1, -1, -1, -1, -1, -1);
      38      179427 :     const __m128i xmm_shuffle2 = _mm_set_epi8(13, 10, 7, 4, 1, -1, -1, -1, -1,
      39             :                                               -1, -1, -1, -1, -1, -1, -1);
      40             :     // If we were sure that there would always be 2 trailing bytes, we could
      41             :     // check against nIters - 15
      42    30307300 :     for (i = 0; i < nIters - 16; i += 16)
      43             :     {
      44             :         __m128i xmm0 =
      45    30127900 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
      46             :         __m128i xmm1 =
      47    30127900 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
      48             :         __m128i xmm2 =
      49    60255700 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
      50             : 
      51             :         // From LSB to MSB:
      52             :         // 0,x,x,1,x,x,2,x,x,3,x,x,4,x,x,5 --> 0,1,2,3,4,5,0,0,0,0,0,0,0,0,0
      53    30127900 :         xmm0 = _mm_shuffle_epi8(xmm0, xmm_shuffle0);
      54             :         // x,x,6,x,x,7,x,x,8,x,x,9,x,x,10,x --> 0,0,0,0,0,0,6,7,8,9,10,0,0,0,0,0
      55    30127900 :         xmm1 = _mm_shuffle_epi8(xmm1, xmm_shuffle1);
      56             :         // x,11,x,x,12,x,x,13,x,x,14,x,x,15,x,x -->
      57             :         // 0,0,0,0,0,0,0,0,0,0,0,11,12,13,14,15
      58    30127900 :         xmm2 = _mm_shuffle_epi8(xmm2, xmm_shuffle2);
      59    30127900 :         xmm0 = _mm_or_si128(xmm0, xmm1);
      60    30127900 :         xmm0 = _mm_or_si128(xmm0, xmm2);
      61             : 
      62    30127900 :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
      63             : 
      64    30127900 :         pSrc += 3 * 16;
      65             :     }
      66     2785930 :     for (; i < nIters; i++)
      67             :     {
      68     2606510 :         pDest[i] = *pSrc;
      69     2606510 :         pSrc += 3;
      70             :     }
      71      179427 : }
      72             : 
      73             : /************************************************************************/
      74             : /*                  GDALDeinterleave3Byte_SSSE3()                       */
      75             : /************************************************************************/
      76             : 
      77             : #if defined(__GNUC__) && !defined(__clang__)
      78             : // GCC autovectorizer does an excellent job
      79       69765 : __attribute__((optimize("tree-vectorize"))) void GDALDeinterleave3Byte_SSSE3(
      80             :     const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
      81             :     GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2, size_t nIters)
      82             : {
      83   220124000 :     for (size_t i = 0; i < nIters; ++i)
      84             :     {
      85   220055000 :         pabyDest0[i] = pabySrc[3 * i + 0];
      86   220055000 :         pabyDest1[i] = pabySrc[3 * i + 1];
      87   220055000 :         pabyDest2[i] = pabySrc[3 * i + 2];
      88             :     }
      89       69765 : }
      90             : #else
      91             : void GDALDeinterleave3Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
      92             :                                  GByte *CPL_RESTRICT pabyDest0,
      93             :                                  GByte *CPL_RESTRICT pabyDest1,
      94             :                                  GByte *CPL_RESTRICT pabyDest2, size_t nIters)
      95             : {
      96             :     size_t i = 0;
      97             :     for (; i + 15 < nIters; i += 16)
      98             :     {
      99             :         __m128i xmm0 = _mm_loadu_si128(
     100             :             reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 0));
     101             :         __m128i xmm1 = _mm_loadu_si128(
     102             :             reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 16));
     103             :         __m128i xmm2 = _mm_loadu_si128(
     104             :             reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 32));
     105             :         auto xmm0_new =
     106             :             _mm_shuffle_epi8(xmm0, _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10,
     107             :                                                 7, 4, 1, 9, 6, 3, 0));
     108             :         auto xmm1_new = _mm_shuffle_epi8(
     109             :             _mm_alignr_epi8(xmm1, xmm0, 12),
     110             :             _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, 7, 4, 1, 9, 6, 3, 0));
     111             :         auto xmm2_new = _mm_shuffle_epi8(
     112             :             _mm_alignr_epi8(xmm2, xmm1, 8),
     113             :             _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, 7, 4, 1, 9, 6, 3, 0));
     114             :         auto xmm3_new =
     115             :             _mm_shuffle_epi8(xmm2, _mm_set_epi8(-1, -1, -1, -1, 15, 12, 9, 6,
     116             :                                                 14, 11, 8, 5, 13, 10, 7, 4));
     117             : 
     118             :         __m128i xmm01lo =
     119             :             _mm_unpacklo_epi32(xmm0_new, xmm1_new);  // W0 W4 W1 W5
     120             :         __m128i xmm01hi = _mm_unpackhi_epi32(xmm0_new, xmm1_new);  // W2 W6 -  -
     121             :         __m128i xmm23lo =
     122             :             _mm_unpacklo_epi32(xmm2_new, xmm3_new);  // W8 WC W9 WD
     123             :         __m128i xmm23hi = _mm_unpackhi_epi32(xmm2_new, xmm3_new);  // WA WE -  -
     124             :         xmm0_new = _mm_unpacklo_epi64(xmm01lo, xmm23lo);  // W0 W4 W8 WC
     125             :         xmm1_new = _mm_unpackhi_epi64(xmm01lo, xmm23lo);  // W1 W5 W9 WD
     126             :         xmm2_new = _mm_unpacklo_epi64(xmm01hi, xmm23hi);  // W2 W6 WA WE
     127             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0_new);
     128             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1_new);
     129             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest2 + i), xmm2_new);
     130             :     }
     131             : #if defined(__clang__)
     132             : #pragma clang loop vectorize(disable)
     133             : #endif
     134             :     for (; i < nIters; ++i)
     135             :     {
     136             :         pabyDest0[i] = pabySrc[3 * i + 0];
     137             :         pabyDest1[i] = pabySrc[3 * i + 1];
     138             :         pabyDest2[i] = pabySrc[3 * i + 2];
     139             :     }
     140             : }
     141             : #endif
     142             : 
     143             : /************************************************************************/
     144             : /*                  GDALDeinterleave4Byte_SSSE3()                       */
     145             : /************************************************************************/
     146             : 
     147             : #if !defined(__GNUC__) || defined(__clang__)
     148             : void GDALDeinterleave4Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
     149             :                                  GByte *CPL_RESTRICT pabyDest0,
     150             :                                  GByte *CPL_RESTRICT pabyDest1,
     151             :                                  GByte *CPL_RESTRICT pabyDest2,
     152             :                                  GByte *CPL_RESTRICT pabyDest3, size_t nIters)
     153             : {
     154             :     const __m128i shuffle_mask =
     155             :         _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
     156             :     size_t i = 0;
     157             :     for (; i + 15 < nIters; i += 16)
     158             :     {
     159             :         __m128i xmm0 = _mm_loadu_si128(
     160             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
     161             :         __m128i xmm1 = _mm_loadu_si128(
     162             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
     163             :         __m128i xmm2 = _mm_loadu_si128(
     164             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
     165             :         __m128i xmm3 = _mm_loadu_si128(
     166             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
     167             :         xmm0 = _mm_shuffle_epi8(xmm0, shuffle_mask);  // W0 W1 W2 W3
     168             :         xmm1 = _mm_shuffle_epi8(xmm1, shuffle_mask);  // W4 W5 W6 W7
     169             :         xmm2 = _mm_shuffle_epi8(xmm2, shuffle_mask);  // W8 W9 WA WB
     170             :         xmm3 = _mm_shuffle_epi8(xmm3, shuffle_mask);  // WC WD WE WF
     171             : 
     172             :         __m128i xmm01lo = _mm_unpacklo_epi32(xmm0, xmm1);  // W0 W4 W1 W5
     173             :         __m128i xmm01hi = _mm_unpackhi_epi32(xmm0, xmm1);  // W2 W6 W3 W7
     174             :         __m128i xmm23lo = _mm_unpacklo_epi32(xmm2, xmm3);  // W8 WC W9 WD
     175             :         __m128i xmm23hi = _mm_unpackhi_epi32(xmm2, xmm3);  // WA WE WB WF
     176             :         xmm0 = _mm_unpacklo_epi64(xmm01lo, xmm23lo);       // W0 W4 W8 WC
     177             :         xmm1 = _mm_unpackhi_epi64(xmm01lo, xmm23lo);       // W1 W5 W9 WD
     178             :         xmm2 = _mm_unpacklo_epi64(xmm01hi, xmm23hi);       // W2 W6 WA WE
     179             :         xmm3 = _mm_unpackhi_epi64(xmm01hi, xmm23hi);       // W3 W7 WB WF
     180             : 
     181             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0);
     182             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1);
     183             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest2 + i), xmm2);
     184             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest3 + i), xmm3);
     185             :     }
     186             : #if defined(__clang__)
     187             : #pragma clang loop vectorize(disable)
     188             : #endif
     189             :     for (; i < nIters; ++i)
     190             :     {
     191             :         pabyDest0[i] = pabySrc[4 * i + 0];
     192             :         pabyDest1[i] = pabySrc[4 * i + 1];
     193             :         pabyDest2[i] = pabySrc[4 * i + 2];
     194             :         pabyDest3[i] = pabySrc[4 * i + 3];
     195             :     }
     196             : }
     197             : #endif
     198             : 
     199             : /************************************************************************/
     200             : /*                  GDALDeinterleave3UInt16_SSSE3()                     */
     201             : /************************************************************************/
     202             : 
     203             : #if (defined(__GNUC__) && !defined(__clang__)) ||                              \
     204             :     defined(__INTEL_CLANG_COMPILER)
     205             : #if !defined(__INTEL_CLANG_COMPILER)
     206             : // GCC autovectorizer does an excellent job
     207             : __attribute__((optimize("tree-vectorize")))
     208             : #endif
     209         239 : void GDALDeinterleave3UInt16_SSSE3(const GUInt16* CPL_RESTRICT panSrc,
     210             :                                   GUInt16* CPL_RESTRICT panDest0,
     211             :                                   GUInt16* CPL_RESTRICT panDest1,
     212             :                                   GUInt16* CPL_RESTRICT panDest2,
     213             :                                   size_t nIters)
     214             : {
     215     9256750 :     for (size_t i = 0; i < nIters; ++i)
     216             :     {
     217     9256510 :         panDest0[i] = panSrc[3 * i + 0];
     218     9256510 :         panDest1[i] = panSrc[3 * i + 1];
     219     9256510 :         panDest2[i] = panSrc[3 * i + 2];
     220             :     }
     221         239 : }
     222             : #endif
     223             : 
     224             : /************************************************************************/
     225             : /*                  GDALDeinterleave4UInt16_SSSE3()                     */
     226             : /************************************************************************/
     227             : 
     228             : #if (defined(__GNUC__) && !defined(__clang__)) ||                              \
     229             :     defined(__INTEL_CLANG_COMPILER)
     230             : #if !defined(__INTEL_CLANG_COMPILER)
     231             : // GCC autovectorizer does an excellent job
     232             : __attribute__((optimize("tree-vectorize")))
     233             : #endif
     234         494 : void GDALDeinterleave4UInt16_SSSE3(const GUInt16* CPL_RESTRICT panSrc,
     235             :                                   GUInt16* CPL_RESTRICT panDest0,
     236             :                                   GUInt16* CPL_RESTRICT panDest1,
     237             :                                   GUInt16* CPL_RESTRICT panDest2,
     238             :                                   GUInt16* CPL_RESTRICT panDest3,
     239             :                                   size_t nIters)
     240             : {
     241      472352 :     for (size_t i = 0; i < nIters; ++i)
     242             :     {
     243      471858 :         panDest0[i] = panSrc[4 * i + 0];
     244      471858 :         panDest1[i] = panSrc[4 * i + 1];
     245      471858 :         panDest2[i] = panSrc[4 * i + 2];
     246      471858 :         panDest3[i] = panSrc[4 * i + 3];
     247             :     }
     248         494 : }
     249             : #endif
     250             : 
     251             : #endif  // HAVE_SSSE3_AT_COMPILE_TIME

Generated by: LCOV version 1.14