LCOV - code coverage report
Current view: top level - gcore - rasterio_ssse3.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 38 38 100.0 %
Date: 2024-05-03 15:49:35 Functions: 4 4 100.0 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  GDAL Core
       4             :  * Purpose:  SSSE3 specializations
       5             :  * Author:   Even Rouault <even dot rouault at spatialys dot com>
       6             :  *
       7             :  ******************************************************************************
       8             :  * Copyright (c) 2016, Even Rouault <even dot rouault at spatialys dot com>
       9             :  *
      10             :  * Permission is hereby granted, free of charge, to any person obtaining a
      11             :  * copy of this software and associated documentation files (the "Software"),
      12             :  * to deal in the Software without restriction, including without limitation
      13             :  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      14             :  * and/or sell copies of the Software, and to permit persons to whom the
      15             :  * Software is furnished to do so, subject to the following conditions:
      16             :  *
      17             :  * The above copyright notice and this permission notice shall be included
      18             :  * in all copies or substantial portions of the Software.
      19             :  *
      20             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
      21             :  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      22             :  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
      23             :  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
      24             :  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
      25             :  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
      26             :  * DEALINGS IN THE SOFTWARE.
      27             :  ****************************************************************************/
      28             : 
      29             : #include "cpl_port.h"
      30             : 
      31             : #if defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                                     \
      32             :     (defined(__x86_64) || defined(_M_X64))
      33             : 
      34             : #include "rasterio_ssse3.h"
      35             : 
      36             : #include <tmmintrin.h>
      37             : #include "gdal_priv_templates.hpp"
      38             : 
      39      179190 : void GDALUnrolledCopy_GByte_3_1_SSSE3(GByte *CPL_RESTRICT pDest,
      40             :                                       const GByte *CPL_RESTRICT pSrc,
      41             :                                       GPtrDiff_t nIters)
      42             : {
      43             :     decltype(nIters) i;
      44      179190 :     const __m128i xmm_shuffle0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
      45             :                                               -1, -1, 15, 12, 9, 6, 3, 0);
      46      179190 :     const __m128i xmm_shuffle1 = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11, 8, 5,
      47             :                                               2, -1, -1, -1, -1, -1, -1);
      48      179190 :     const __m128i xmm_shuffle2 = _mm_set_epi8(13, 10, 7, 4, 1, -1, -1, -1, -1,
      49             :                                               -1, -1, -1, -1, -1, -1, -1);
      50             :     // If we were sure that there would always be 2 trailing bytes, we could
      51             :     // check against nIters - 15
      52    30220800 :     for (i = 0; i < nIters - 16; i += 16)
      53             :     {
      54             :         __m128i xmm0 =
      55    30041600 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
      56             :         __m128i xmm1 =
      57    30041600 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
      58             :         __m128i xmm2 =
      59    60083300 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
      60             : 
      61             :         // From LSB to MSB:
      62             :         // 0,x,x,1,x,x,2,x,x,3,x,x,4,x,x,5 --> 0,1,2,3,4,5,0,0,0,0,0,0,0,0,0,0
      63    30041600 :         xmm0 = _mm_shuffle_epi8(xmm0, xmm_shuffle0);
      64             :         // x,x,6,x,x,7,x,x,8,x,x,9,x,x,10,x --> 0,0,0,0,0,0,6,7,8,9,10,0,0,0,0,0
      65    30041800 :         xmm1 = _mm_shuffle_epi8(xmm1, xmm_shuffle1);
      66             :         // x,11,x,x,12,x,x,13,x,x,14,x,x,15,x,x -->
      67             :         // 0,0,0,0,0,0,0,0,0,0,0,11,12,13,14,15
      68    30041600 :         xmm2 = _mm_shuffle_epi8(xmm2, xmm_shuffle2);
      69    30041600 :         xmm0 = _mm_or_si128(xmm0, xmm1);
      70    30041600 :         xmm0 = _mm_or_si128(xmm0, xmm2);
      71             : 
      72    30041600 :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
      73             : 
      74    30041600 :         pSrc += 3 * 16;
      75             :     }
      76     2781910 :     for (; i < nIters; i++)
      77             :     {
      78     2602710 :         pDest[i] = *pSrc;
      79     2602710 :         pSrc += 3;
      80             :     }
      81      179195 : }
      82             : 
      83             : /************************************************************************/
      84             : /*                  GDALDeinterleave3Byte_SSSE3()                       */
      85             : /************************************************************************/
      86             : 
      87             : #if defined(__GNUC__) && !defined(__clang__)
      88             : // GCC autovectorizer does an excellent job
      89       58261 : __attribute__((optimize("tree-vectorize"))) void GDALDeinterleave3Byte_SSSE3(
      90             :     const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
      91             :     GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2, size_t nIters)
      92             : {
      93   196353000 :     for (size_t i = 0; i < nIters; ++i)
      94             :     {
      95   196294000 :         pabyDest0[i] = pabySrc[3 * i + 0];
      96   196294000 :         pabyDest1[i] = pabySrc[3 * i + 1];
      97   196294000 :         pabyDest2[i] = pabySrc[3 * i + 2];
      98             :     }
      99       58261 : }
     100             : #else
     101             : void GDALDeinterleave3Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
     102             :                                  GByte *CPL_RESTRICT pabyDest0,
     103             :                                  GByte *CPL_RESTRICT pabyDest1,
     104             :                                  GByte *CPL_RESTRICT pabyDest2, size_t nIters)
     105             : {
     106             :     size_t i = 0;
     107             :     for (; i + 15 < nIters; i += 16)
     108             :     {
     109             :         __m128i xmm0 = _mm_loadu_si128(
     110             :             reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 0));
     111             :         __m128i xmm1 = _mm_loadu_si128(
     112             :             reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 16));
     113             :         __m128i xmm2 = _mm_loadu_si128(
     114             :             reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 32));
     115             :         auto xmm0_new =
     116             :             _mm_shuffle_epi8(xmm0, _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10,
     117             :                                                 7, 4, 1, 9, 6, 3, 0));
     118             :         auto xmm1_new = _mm_shuffle_epi8(
     119             :             _mm_alignr_epi8(xmm1, xmm0, 12),
     120             :             _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, 7, 4, 1, 9, 6, 3, 0));
     121             :         auto xmm2_new = _mm_shuffle_epi8(
     122             :             _mm_alignr_epi8(xmm2, xmm1, 8),
     123             :             _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, 7, 4, 1, 9, 6, 3, 0));
     124             :         auto xmm3_new =
     125             :             _mm_shuffle_epi8(xmm2, _mm_set_epi8(-1, -1, -1, -1, 15, 12, 9, 6,
     126             :                                                 14, 11, 8, 5, 13, 10, 7, 4));
     127             : 
     128             :         __m128i xmm01lo =
     129             :             _mm_unpacklo_epi32(xmm0_new, xmm1_new);  // W0 W4 W1 W5
     130             :         __m128i xmm01hi = _mm_unpackhi_epi32(xmm0_new, xmm1_new);  // W2 W6 -  -
     131             :         __m128i xmm23lo =
     132             :             _mm_unpacklo_epi32(xmm2_new, xmm3_new);  // W8 WC W9 WD
     133             :         __m128i xmm23hi = _mm_unpackhi_epi32(xmm2_new, xmm3_new);  // WA WE -  -
     134             :         xmm0_new = _mm_unpacklo_epi64(xmm01lo, xmm23lo);  // W0 W4 W8 WC
     135             :         xmm1_new = _mm_unpackhi_epi64(xmm01lo, xmm23lo);  // W1 W5 W9 WD
     136             :         xmm2_new = _mm_unpacklo_epi64(xmm01hi, xmm23hi);  // W2 W6 WA WE
     137             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0_new);
     138             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1_new);
     139             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest2 + i), xmm2_new);
     140             :     }
     141             : #if defined(__clang__)
     142             : #pragma clang loop vectorize(disable)
     143             : #endif
     144             :     for (; i < nIters; ++i)
     145             :     {
     146             :         pabyDest0[i] = pabySrc[3 * i + 0];
     147             :         pabyDest1[i] = pabySrc[3 * i + 1];
     148             :         pabyDest2[i] = pabySrc[3 * i + 2];
     149             :     }
     150             : }
     151             : #endif
     152             : 
     153             : /************************************************************************/
     154             : /*                  GDALDeinterleave4Byte_SSSE3()                       */
     155             : /************************************************************************/
     156             : 
     157             : #if !defined(__GNUC__) || defined(__clang__)
     158             : void GDALDeinterleave4Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
     159             :                                  GByte *CPL_RESTRICT pabyDest0,
     160             :                                  GByte *CPL_RESTRICT pabyDest1,
     161             :                                  GByte *CPL_RESTRICT pabyDest2,
     162             :                                  GByte *CPL_RESTRICT pabyDest3, size_t nIters)
     163             : {
     164             :     const __m128i shuffle_mask =
     165             :         _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
     166             :     size_t i = 0;
     167             :     for (; i + 15 < nIters; i += 16)
     168             :     {
     169             :         __m128i xmm0 = _mm_loadu_si128(
     170             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
     171             :         __m128i xmm1 = _mm_loadu_si128(
     172             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
     173             :         __m128i xmm2 = _mm_loadu_si128(
     174             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
     175             :         __m128i xmm3 = _mm_loadu_si128(
     176             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
     177             :         xmm0 = _mm_shuffle_epi8(xmm0, shuffle_mask);  // W0 W1 W2 W3
     178             :         xmm1 = _mm_shuffle_epi8(xmm1, shuffle_mask);  // W4 W5 W6 W7
     179             :         xmm2 = _mm_shuffle_epi8(xmm2, shuffle_mask);  // W8 W9 WA WB
     180             :         xmm3 = _mm_shuffle_epi8(xmm3, shuffle_mask);  // WC WD WE WF
     181             : 
     182             :         __m128i xmm01lo = _mm_unpacklo_epi32(xmm0, xmm1);  // W0 W4 W1 W5
     183             :         __m128i xmm01hi = _mm_unpackhi_epi32(xmm0, xmm1);  // W2 W6 W3 W7
     184             :         __m128i xmm23lo = _mm_unpacklo_epi32(xmm2, xmm3);  // W8 WC W9 WD
     185             :         __m128i xmm23hi = _mm_unpackhi_epi32(xmm2, xmm3);  // WA WE WB WF
     186             :         xmm0 = _mm_unpacklo_epi64(xmm01lo, xmm23lo);       // W0 W4 W8 WC
     187             :         xmm1 = _mm_unpackhi_epi64(xmm01lo, xmm23lo);       // W1 W5 W9 WD
     188             :         xmm2 = _mm_unpacklo_epi64(xmm01hi, xmm23hi);       // W2 W6 WA WE
     189             :         xmm3 = _mm_unpackhi_epi64(xmm01hi, xmm23hi);       // W3 W7 WB WF
     190             : 
     191             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0);
     192             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1);
     193             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest2 + i), xmm2);
     194             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest3 + i), xmm3);
     195             :     }
     196             : #if defined(__clang__)
     197             : #pragma clang loop vectorize(disable)
     198             : #endif
     199             :     for (; i < nIters; ++i)
     200             :     {
     201             :         pabyDest0[i] = pabySrc[4 * i + 0];
     202             :         pabyDest1[i] = pabySrc[4 * i + 1];
     203             :         pabyDest2[i] = pabySrc[4 * i + 2];
     204             :         pabyDest3[i] = pabySrc[4 * i + 3];
     205             :     }
     206             : }
     207             : #endif
     208             : 
     209             : /************************************************************************/
     210             : /*                  GDALDeinterleave3UInt16_SSSE3()                     */
     211             : /************************************************************************/
     212             : 
     213             : #if (defined(__GNUC__) && !defined(__clang__)) ||                              \
     214             :     defined(__INTEL_CLANG_COMPILER)
     215             : #if !defined(__INTEL_CLANG_COMPILER)
     216             : // GCC autovectorizer does an excellent job
     217             : __attribute__((optimize("tree-vectorize")))
     218             : #endif
     219         237 : void GDALDeinterleave3UInt16_SSSE3(const GUInt16* CPL_RESTRICT panSrc,
     220             :                                   GUInt16* CPL_RESTRICT panDest0,
     221             :                                   GUInt16* CPL_RESTRICT panDest1,
     222             :                                   GUInt16* CPL_RESTRICT panDest2,
     223             :                                   size_t nIters)
     224             : {
     225     9256740 :     for (size_t i = 0; i < nIters; ++i)
     226             :     {
     227     9256510 :         panDest0[i] = panSrc[3 * i + 0];
     228     9256510 :         panDest1[i] = panSrc[3 * i + 1];
     229     9256510 :         panDest2[i] = panSrc[3 * i + 2];
     230             :     }
     231         237 : }
     232             : #endif
     233             : 
     234             : /************************************************************************/
     235             : /*                  GDALDeinterleave4UInt16_SSSE3()                     */
     236             : /************************************************************************/
     237             : 
     238             : #if (defined(__GNUC__) && !defined(__clang__)) ||                              \
     239             :     defined(__INTEL_CLANG_COMPILER)
     240             : #if !defined(__INTEL_CLANG_COMPILER)
     241             : // GCC autovectorizer does an excellent job
     242             : __attribute__((optimize("tree-vectorize")))
     243             : #endif
     244         494 : void GDALDeinterleave4UInt16_SSSE3(const GUInt16* CPL_RESTRICT panSrc,
     245             :                                   GUInt16* CPL_RESTRICT panDest0,
     246             :                                   GUInt16* CPL_RESTRICT panDest1,
     247             :                                   GUInt16* CPL_RESTRICT panDest2,
     248             :                                   GUInt16* CPL_RESTRICT panDest3,
     249             :                                   size_t nIters)
     250             : {
     251      472352 :     for (size_t i = 0; i < nIters; ++i)
     252             :     {
     253      471858 :         panDest0[i] = panSrc[4 * i + 0];
     254      471858 :         panDest1[i] = panSrc[4 * i + 1];
     255      471858 :         panDest2[i] = panSrc[4 * i + 2];
     256      471858 :         panDest3[i] = panSrc[4 * i + 3];
     257             :     }
     258         494 : }
     259             : #endif
     260             : 
     261             : #endif  // HAVE_SSSE3_AT_COMPILE_TIME

Generated by: LCOV version 1.14