LCOV - code coverage report
Current view: top level - frmts/png - filter_sse2_intrinsics.c (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 156 156 100.0 %
Date: 2024-05-02 22:57:13 Functions: 12 12 100.0 %

          Line data    Source code
       1             : /* filter_sse2_intrinsics.c - SSE2 optimized filter functions
       2             :  *
       3             :  * Copyright (c) 2018 Cosmin Truta
       4             :  * Copyright (c) 2016-2017 Glenn Randers-Pehrson
       5             :  * Written by Mike Klein and Matt Sarett
       6             :  * Derived from arm/filter_neon_intrinsics.c
       7             :  *
       8             :  * This code is released under the libpng license.
       9             :  * For conditions of distribution and use, see the disclaimer
      10             :  * and license in png.h
      11             :  */
      12             : 
      13             : #ifndef png_debug
      14             : #define png_debug(x, y)
      15             : #endif
      16             : 
      17             : #ifndef PNG_INTEL_SSE_IMPLEMENTATION
      18             : #if defined(__SSE4_1__) || defined(__AVX__)
      19             : /* We are not actually using AVX, but checking for AVX is the best
      20             :    way we can detect SSE4.1 and SSSE3 on MSVC.
      21             : */
      22             : #define PNG_INTEL_SSE_IMPLEMENTATION 3
      23             : #elif defined(__SSSE3__)
      24             : #define PNG_INTEL_SSE_IMPLEMENTATION 2
      25             : #elif defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) ||             \
      26             :     (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
      27             : #define PNG_INTEL_SSE_IMPLEMENTATION 1
      28             : #else
      29             : #define PNG_INTEL_SSE_IMPLEMENTATION 0
      30             : #endif
      31             : #endif
      32             : 
      33             : #include <immintrin.h>
      34             : 
      35             : /* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
      36             :  * They're positioned like this:
      37             :  *    prev:  c b
      38             :  *    row:   a d
      39             :  * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
      40             :  * whichever of a, b, or c is closest to p=a+b-c.
      41             :  */
      42             : 
      43    15304500 : static __m128i load4(const void *p)
      44             : {
      45             :     int tmp;
      46    15304500 :     memcpy(&tmp, p, sizeof(tmp));
      47    30608900 :     return _mm_cvtsi32_si128(tmp);
      48             : }
      49             : 
      50     7416590 : static void store4(void *p, __m128i v)
      51             : {
      52     7416590 :     int tmp = _mm_cvtsi128_si32(v);
      53     7416590 :     memcpy(p, &tmp, sizeof(int));
      54     7416590 : }
      55             : 
      56       19859 : static __m128i load3(const void *p)
      57             : {
      58       19859 :     png_uint_32 tmp = 0;
      59       19859 :     memcpy(&tmp, p, 3);
      60       39718 :     return _mm_cvtsi32_si128(tmp);
      61             : }
      62             : 
      63     1203150 : static void store3(void *p, __m128i v)
      64             : {
      65     1203150 :     int tmp = _mm_cvtsi128_si32(v);
      66     1203150 :     memcpy(p, &tmp, 3);
      67     1203150 : }
      68             : 
      69        3921 : static void gdal_png_read_filter_row_sub3_sse2(png_row_infop row_info,
      70             :                                                const GByte *input, GByte *row)
      71             : {
      72             :     /* The Sub filter predicts each pixel as the previous pixel, a.
      73             :      * There is no pixel to the left of the first pixel.  It's encoded directly.
      74             :      * That works with our main loop if we just say that left pixel was zero.
      75             :      */
      76             :     size_t rb;
      77             : 
      78        3921 :     __m128i a, d = _mm_setzero_si128();
      79             : 
      80             :     png_debug(1, "in png_read_filter_row_sub3_sse2");
      81             : 
      82        3921 :     rb = row_info->rowbytes;
      83      328129 :     while (rb >= 4)
      84             :     {
      85      324208 :         a = d;
      86      324208 :         d = load4(input);
      87      324208 :         d = _mm_add_epi8(d, a);
      88      324208 :         store3(row, d);
      89             : 
      90      324208 :         input += 3;
      91      324208 :         row += 3;
      92      324208 :         rb -= 3;
      93             :     }
      94        3921 :     if (rb > 0)
      95             :     {
      96        3921 :         a = d;
      97        3921 :         d = load3(input);
      98        3921 :         d = _mm_add_epi8(d, a);
      99        3921 :         store3(row, d);
     100             : 
     101             :         //input += 3;
     102             :         //row += 3;
     103             :         //rb -= 3;
     104             :     }
     105        3921 : }
     106             : 
     107        5988 : static void gdal_png_read_filter_row_sub4_sse2(png_row_infop row_info,
     108             :                                                const GByte *input, GByte *row)
     109             : {
     110             :     /* The Sub filter predicts each pixel as the previous pixel, a.
     111             :      * There is no pixel to the left of the first pixel.  It's encoded directly.
     112             :      * That works with our main loop if we just say that left pixel was zero.
     113             :      */
     114             :     size_t rb;
     115             : 
     116        5988 :     __m128i a, d = _mm_setzero_si128();
     117             : 
     118             :     png_debug(1, "in png_read_filter_row_sub4_sse2");
     119             : 
     120        5988 :     rb = row_info->rowbytes + 4;
     121     1593010 :     while (rb > 4)
     122             :     {
     123     1587020 :         a = d;
     124     1587020 :         d = load4(input);
     125     1587020 :         d = _mm_add_epi8(d, a);
     126     1587020 :         store4(row, d);
     127             : 
     128     1587020 :         input += 4;
     129     1587020 :         row += 4;
     130     1587020 :         rb -= 4;
     131             :     }
     132        5988 : }
     133             : 
     134        3800 : static void gdal_png_read_filter_row_avg3_sse2(png_row_infop row_info,
     135             :                                                const GByte *input, GByte *row,
     136             :                                                const GByte *prev)
     137             : {
     138             :     /* The Avg filter predicts each pixel as the (truncated) average of a and b.
     139             :      * There's no pixel to the left of the first pixel.  Luckily, it's
     140             :      * predicted to be half of the pixel above it.  So again, this works
     141             :      * perfectly with our loop if we make sure a starts at zero.
     142             :      */
     143             : 
     144             :     size_t rb;
     145             : 
     146        3800 :     const __m128i zero = _mm_setzero_si128();
     147             : 
     148             :     __m128i b;
     149        3800 :     __m128i a, d = zero;
     150             : 
     151             :     png_debug(1, "in png_read_filter_row_avg3_sse2");
     152        3800 :     rb = row_info->rowbytes;
     153      452700 :     while (rb >= 4)
     154             :     {
     155             :         __m128i avg;
     156      448900 :         b = load4(prev);
     157      448900 :         a = d;
     158      448900 :         d = load4(input);
     159             : 
     160             :         /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
     161             :          */
     162      448900 :         avg = _mm_avg_epu8(a, b);
     163             :         /* ...but we can fix it up by subtracting off 1 if it rounded up. */
     164     1795600 :         avg = _mm_sub_epi8(
     165             :             avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
     166      448900 :         d = _mm_add_epi8(d, avg);
     167      448900 :         store3(row, d);
     168             : 
     169      448900 :         input += 3;
     170      448900 :         prev += 3;
     171      448900 :         row += 3;
     172      448900 :         rb -= 3;
     173             :     }
     174        3800 :     if (rb > 0)
     175             :     {
     176             :         __m128i avg;
     177        3800 :         b = load3(prev);
     178        3800 :         a = d;
     179        3800 :         d = load3(input);
     180             : 
     181             :         /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
     182             :          */
     183        3800 :         avg = _mm_avg_epu8(a, b);
     184             :         /* ...but we can fix it up by subtracting off 1 if it rounded up. */
     185       15200 :         avg = _mm_sub_epi8(
     186             :             avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
     187             : 
     188        3800 :         d = _mm_add_epi8(d, avg);
     189        3800 :         store3(row, d);
     190             : 
     191             :         // input += 3;
     192             :         // prev += 3;
     193             :         // row += 3;
     194             :         // rb -= 3;
     195             :     }
     196        3800 : }
     197             : 
     198        3689 : static void gdal_png_read_filter_row_avg4_sse2(png_row_infop row_info,
     199             :                                                const GByte *input, GByte *row,
     200             :                                                const GByte *prev)
     201             : {
     202             :     /* The Avg filter predicts each pixel as the (truncated) average of a and b.
     203             :      * There's no pixel to the left of the first pixel.  Luckily, it's
     204             :      * predicted to be half of the pixel above it.  So again, this works
     205             :      * perfectly with our loop if we make sure a starts at zero.
     206             :      */
     207             :     size_t rb;
     208        3689 :     const __m128i zero = _mm_setzero_si128();
     209             :     __m128i b;
     210        3689 :     __m128i a, d = zero;
     211             : 
     212             :     png_debug(1, "in png_read_filter_row_avg4_sse2");
     213             : 
     214        3689 :     rb = row_info->rowbytes + 4;
     215     1029410 :     while (rb > 4)
     216             :     {
     217             :         __m128i avg;
     218     1025720 :         b = load4(prev);
     219     1025720 :         a = d;
     220     1025720 :         d = load4(input);
     221             : 
     222             :         /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
     223             :          */
     224     1025720 :         avg = _mm_avg_epu8(a, b);
     225             :         /* ...but we can fix it up by subtracting off 1 if it rounded up. */
     226     4102890 :         avg = _mm_sub_epi8(
     227             :             avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
     228             : 
     229     1025720 :         d = _mm_add_epi8(d, avg);
     230     1025720 :         store4(row, d);
     231             : 
     232     1025720 :         input += 4;
     233     1025720 :         prev += 4;
     234     1025720 :         row += 4;
     235     1025720 :         rb -= 4;
     236             :     }
     237        3689 : }
     238             : 
     239             : /* Returns |x| for 16-bit lanes. */
     240    15678500 : static __m128i abs_i16(__m128i x)
     241             : {
     242             : #if PNG_INTEL_SSE_IMPLEMENTATION >= 2
     243             :     return _mm_abs_epi16(x);
     244             : #else
     245             :     /* Read this all as, return x<0 ? -x : x.
     246             :      * To negate two's complement, you flip all the bits then add 1.
     247             :      */
     248    31357000 :     __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());
     249             : 
     250             :     /* Flip negative lanes. */
     251    15678500 :     x = _mm_xor_si128(x, is_negative);
     252             : 
     253             :     /* +1 to negative lanes, else +0. */
     254    15678500 :     x = _mm_sub_epi16(x, is_negative);
     255    15678500 :     return x;
     256             : #endif
     257             : }
     258             : 
     259             : /* Bytewise c ? t : e. */
     260    10452300 : static __m128i if_then_else(__m128i c, __m128i t, __m128i e)
     261             : {
     262             : #if PNG_INTEL_SSE_IMPLEMENTATION >= 3
     263             :     return _mm_blendv_epi8(e, t, c);
     264             : #else
     265    31357000 :     return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));
     266             : #endif
     267             : }
     268             : 
     269        4169 : static void gdal_png_read_filter_row_paeth3_sse2(png_row_infop row_info,
     270             :                                                  const GByte *input, GByte *row,
     271             :                                                  const GByte *prev)
     272             : {
     273             :     /* Paeth tries to predict pixel d using the pixel to the left of it, a,
     274             :      * and two pixels from the previous row, b and c:
     275             :      *   prev: c b
     276             :      *   row:  a d
     277             :      * The Paeth function predicts d to be whichever of a, b, or c is nearest to
     278             :      * p=a+b-c.
     279             :      *
     280             :      * The first pixel has no left context, and so uses an Up filter, p = b.
     281             :      * This works naturally with our main loop's p = a+b-c if we force a and c
     282             :      * to zero.
     283             :      * Here we zero b and d, which become c and a respectively at the start of
     284             :      * the loop.
     285             :      */
     286             :     size_t rb;
     287        4169 :     const __m128i zero = _mm_setzero_si128();
     288        4169 :     __m128i c, b = zero, a, d = zero;
     289             : 
     290             :     png_debug(1, "in png_read_filter_row_paeth3_sse2");
     291             : 
     292        4169 :     rb = row_info->rowbytes;
     293      422317 :     while (rb >= 4)
     294             :     {
     295             :         /* It's easiest to do this math (particularly, deal with pc) with 16-bit
     296             :          * intermediates.
     297             :          */
     298             :         __m128i pa, pb, pc, smallest, nearest;
     299      418148 :         c = b;
     300      418148 :         b = _mm_unpacklo_epi8(load4(prev), zero);
     301      418148 :         a = d;
     302      836296 :         d = _mm_unpacklo_epi8(load4(input), zero);
     303             : 
     304             :         /* (p-a) == (a+b-c - a) == (b-c) */
     305             : 
     306      418148 :         pa = _mm_sub_epi16(b, c);
     307             : 
     308             :         /* (p-b) == (a+b-c - b) == (a-c) */
     309      418148 :         pb = _mm_sub_epi16(a, c);
     310             : 
     311             :         /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
     312      418148 :         pc = _mm_add_epi16(pa, pb);
     313             : 
     314      418148 :         pa = abs_i16(pa); /* |p-a| */
     315      418148 :         pb = abs_i16(pb); /* |p-b| */
     316      418148 :         pc = abs_i16(pc); /* |p-c| */
     317             : 
     318      836296 :         smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
     319             : 
     320             :         /* Paeth breaks ties favoring a over b over c. */
     321             :         nearest =
     322      836296 :             if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
     323             :                          if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
     324             : 
     325             :         /* Note `_epi8`: we need addition to wrap modulo 255. */
     326      418148 :         d = _mm_add_epi8(d, nearest);
     327      418148 :         store3(row, _mm_packus_epi16(d, d));
     328             : 
     329      418148 :         input += 3;
     330      418148 :         prev += 3;
     331      418148 :         row += 3;
     332      418148 :         rb -= 3;
     333             :     }
     334        4169 :     if (rb > 0)
     335             :     {
     336             :         /* It's easiest to do this math (particularly, deal with pc) with 16-bit
     337             :          * intermediates.
     338             :          */
     339             :         __m128i pa, pb, pc, smallest, nearest;
     340        4169 :         c = b;
     341        4169 :         b = _mm_unpacklo_epi8(load3(prev), zero);
     342        4169 :         a = d;
     343        8338 :         d = _mm_unpacklo_epi8(load3(input), zero);
     344             : 
     345             :         /* (p-a) == (a+b-c - a) == (b-c) */
     346        4169 :         pa = _mm_sub_epi16(b, c);
     347             : 
     348             :         /* (p-b) == (a+b-c - b) == (a-c) */
     349        4169 :         pb = _mm_sub_epi16(a, c);
     350             : 
     351             :         /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
     352        4169 :         pc = _mm_add_epi16(pa, pb);
     353             : 
     354        4169 :         pa = abs_i16(pa); /* |p-a| */
     355        4169 :         pb = abs_i16(pb); /* |p-b| */
     356        4169 :         pc = abs_i16(pc); /* |p-c| */
     357             : 
     358        8338 :         smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
     359             : 
     360             :         /* Paeth breaks ties favoring a over b over c. */
     361             :         nearest =
     362        8338 :             if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
     363             :                          if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
     364             : 
     365             :         /* Note `_epi8`: we need addition to wrap modulo 255. */
     366        4169 :         d = _mm_add_epi8(d, nearest);
     367        4169 :         store3(row, _mm_packus_epi16(d, d));
     368             : 
     369             :         // input += 3;
     370             :         // prev += 3;
     371             :         // row += 3;
     372             :         // rb -= 3;
     373             :     }
     374        4169 : }
     375             : 
     376       19018 : static void gdal_png_read_filter_row_paeth4_sse2(png_row_infop row_info,
     377             :                                                  const GByte *input, GByte *row,
     378             :                                                  const GByte *prev)
     379             : {
     380             :     /* Paeth tries to predict pixel d using the pixel to the left of it, a,
     381             :      * and two pixels from the previous row, b and c:
     382             :      *   prev: c b
     383             :      *   row:  a d
     384             :      * The Paeth function predicts d to be whichever of a, b, or c is nearest to
     385             :      * p=a+b-c.
     386             :      *
     387             :      * The first pixel has no left context, and so uses an Up filter, p = b.
     388             :      * This works naturally with our main loop's p = a+b-c if we force a and c
     389             :      * to zero.
     390             :      * Here we zero b and d, which become c and a respectively at the start of
     391             :      * the loop.
     392             :      */
     393             :     size_t rb;
     394       19018 :     const __m128i zero = _mm_setzero_si128();
     395             :     __m128i pa, pb, pc, smallest, nearest;
     396       19018 :     __m128i c, b = zero, a, d = zero;
     397             : 
     398             :     png_debug(1, "in png_read_filter_row_paeth4_sse2");
     399             : 
     400       19018 :     rb = row_info->rowbytes + 4;
     401     4822870 :     while (rb > 4)
     402             :     {
     403             :         /* It's easiest to do this math (particularly, deal with pc) with 16-bit
     404             :          * intermediates.
     405             :          */
     406     4803850 :         c = b;
     407     4803850 :         b = _mm_unpacklo_epi8(load4(prev), zero);
     408     4803850 :         a = d;
     409     9607700 :         d = _mm_unpacklo_epi8(load4(input), zero);
     410             : 
     411             :         /* (p-a) == (a+b-c - a) == (b-c) */
     412     4803850 :         pa = _mm_sub_epi16(b, c);
     413             : 
     414             :         /* (p-b) == (a+b-c - b) == (a-c) */
     415     4803850 :         pb = _mm_sub_epi16(a, c);
     416             : 
     417             :         /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
     418     4803850 :         pc = _mm_add_epi16(pa, pb);
     419             : 
     420     4803850 :         pa = abs_i16(pa); /* |p-a| */
     421     4803850 :         pb = abs_i16(pb); /* |p-b| */
     422     4803850 :         pc = abs_i16(pc); /* |p-c| */
     423             : 
     424     9607700 :         smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
     425             : 
     426             :         /* Paeth breaks ties favoring a over b over c. */
     427             :         nearest =
     428     9607700 :             if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
     429             :                          if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
     430             : 
     431             :         /* Note `_epi8`: we need addition to wrap modulo 255. */
     432     4803850 :         d = _mm_add_epi8(d, nearest);
     433     4803850 :         store4(row, _mm_packus_epi16(d, d));
     434             : 
     435     4803850 :         input += 4;
     436     4803850 :         prev += 4;
     437     4803850 :         row += 4;
     438     4803850 :         rb -= 4;
     439             :     }
     440       19018 : }

Generated by: LCOV version 1.14