LCOV - code coverage report
Current view: top level - frmts/png - filter_sse2_intrinsics.c (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 156 156 100.0 %
Date: 2025-01-18 12:42:00 Functions: 12 12 100.0 %

          Line data    Source code
       1             : /* filter_sse2_intrinsics.c - SSE2 optimized filter functions
       2             :  *
       3             :  * Copyright (c) 2018 Cosmin Truta
       4             :  * Copyright (c) 2016-2017 Glenn Randers-Pehrson
       5             :  * Written by Mike Klein and Matt Sarett
       6             :  * Derived from arm/filter_neon_intrinsics.c
       7             :  *
       8             :  * This code is released under the libpng license.
       9             :  * For conditions of distribution and use, see the disclaimer
      10             :  * and license in png.h
      11             :  */
      12             : 
      13             : #ifndef png_debug
      14             : #define png_debug(x, y)
      15             : #endif
      16             : 
      17             : #ifndef PNG_INTEL_SSE_IMPLEMENTATION
      18             : #if defined(USE_NEON_OPTIMIZATIONS)
      19             : #define PNG_INTEL_SSE_IMPLEMENTATION 3
      20             : #elif defined(__SSE4_1__) || defined(__AVX__)
      21             : /* We are not actually using AVX, but checking for AVX is the best
      22             :    way we can detect SSE4.1 and SSSE3 on MSVC.
      23             : */
      24             : #define PNG_INTEL_SSE_IMPLEMENTATION 3
      25             : #elif defined(__SSSE3__)
      26             : #define PNG_INTEL_SSE_IMPLEMENTATION 2
      27             : #elif defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) ||             \
      28             :     (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
      29             : #define PNG_INTEL_SSE_IMPLEMENTATION 1
      30             : #else
      31             : #define PNG_INTEL_SSE_IMPLEMENTATION 0
      32             : #endif
      33             : #endif
      34             : 
      35             : #if defined(USE_NEON_OPTIMIZATIONS)
      36             : #include "include_sse2neon.h"
      37             : #else
      38             : #include <immintrin.h>
      39             : #endif
      40             : 
      41             : /* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
      42             :  * They're positioned like this:
      43             :  *    prev:  c b
      44             :  *    row:   a d
      45             :  * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
      46             :  * whichever of a, b, or c is closest to p=a+b-c.
      47             :  */
      48             : 
      49    30162300 : static __m128i load4(const void *p)
      50             : {
      51             :     int tmp;
      52    30162300 :     memcpy(&tmp, p, sizeof(tmp));
      53    60324700 :     return _mm_cvtsi32_si128(tmp);
      54             : }
      55             : 
      56    14621100 : static void store4(void *p, __m128i v)
      57             : {
      58    14621100 :     int tmp = _mm_cvtsi128_si32(v);
      59    14621100 :     memcpy(p, &tmp, sizeof(int));
      60    14621100 : }
      61             : 
      62       21899 : static __m128i load3(const void *p)
      63             : {
      64       21899 :     png_uint_32 tmp = 0;
      65       21899 :     memcpy(&tmp, p, 3);
      66       43798 :     return _mm_cvtsi32_si128(tmp);
      67             : }
      68             : 
      69     1464270 : static void store3(void *p, __m128i v)
      70             : {
      71     1464270 :     int tmp = _mm_cvtsi128_si32(v);
      72     1464270 :     memcpy(p, &tmp, 3);
      73     1464270 : }
      74             : 
      75        3921 : static void gdal_png_read_filter_row_sub3_sse2(png_row_infop row_info,
      76             :                                                const GByte *input, GByte *row)
      77             : {
      78             :     /* The Sub filter predicts each pixel as the previous pixel, a.
      79             :      * There is no pixel to the left of the first pixel.  It's encoded directly.
      80             :      * That works with our main loop if we just say that left pixel was zero.
      81             :      */
      82             :     size_t rb;
      83             : 
      84        3921 :     __m128i a, d = _mm_setzero_si128();
      85             : 
      86             :     png_debug(1, "in png_read_filter_row_sub3_sse2");
      87             : 
      88        3921 :     rb = row_info->rowbytes;
      89      328129 :     while (rb >= 4)
      90             :     {
      91      324208 :         a = d;
      92      324208 :         d = load4(input);
      93      324208 :         d = _mm_add_epi8(d, a);
      94      324208 :         store3(row, d);
      95             : 
      96      324208 :         input += 3;
      97      324208 :         row += 3;
      98      324208 :         rb -= 3;
      99             :     }
     100        3921 :     if (rb > 0)
     101             :     {
     102        3921 :         a = d;
     103        3921 :         d = load3(input);
     104        3921 :         d = _mm_add_epi8(d, a);
     105        3921 :         store3(row, d);
     106             : 
     107             :         //input += 3;
     108             :         //row += 3;
     109             :         //rb -= 3;
     110             :     }
     111        3921 : }
     112             : 
     113        6233 : static void gdal_png_read_filter_row_sub4_sse2(png_row_infop row_info,
     114             :                                                const GByte *input, GByte *row)
     115             : {
     116             :     /* The Sub filter predicts each pixel as the previous pixel, a.
     117             :      * There is no pixel to the left of the first pixel.  It's encoded directly.
     118             :      * That works with our main loop if we just say that left pixel was zero.
     119             :      */
     120             :     size_t rb;
     121             : 
     122        6233 :     __m128i a, d = _mm_setzero_si128();
     123             : 
     124             :     png_debug(1, "in png_read_filter_row_sub4_sse2");
     125             : 
     126        6233 :     rb = row_info->rowbytes + 4;
     127     1664710 :     while (rb > 4)
     128             :     {
     129     1658470 :         a = d;
     130     1658470 :         d = load4(input);
     131     1658470 :         d = _mm_add_epi8(d, a);
     132     1658470 :         store4(row, d);
     133             : 
     134     1658470 :         input += 4;
     135     1658470 :         row += 4;
     136     1658470 :         rb -= 4;
     137             :     }
     138        6233 : }
     139             : 
     140        3800 : static void gdal_png_read_filter_row_avg3_sse2(png_row_infop row_info,
     141             :                                                const GByte *input, GByte *row,
     142             :                                                const GByte *prev)
     143             : {
     144             :     /* The Avg filter predicts each pixel as the (truncated) average of a and b.
     145             :      * There's no pixel to the left of the first pixel.  Luckily, it's
     146             :      * predicted to be half of the pixel above it.  So again, this works
     147             :      * perfectly with our loop if we make sure a starts at zero.
     148             :      */
     149             : 
     150             :     size_t rb;
     151             : 
     152        3800 :     const __m128i zero = _mm_setzero_si128();
     153             : 
     154             :     __m128i b;
     155        3800 :     __m128i a, d = zero;
     156             : 
     157             :     png_debug(1, "in png_read_filter_row_avg3_sse2");
     158        3800 :     rb = row_info->rowbytes;
     159      452700 :     while (rb >= 4)
     160             :     {
     161             :         __m128i avg;
     162      448900 :         b = load4(prev);
     163      448900 :         a = d;
     164      448900 :         d = load4(input);
     165             : 
     166             :         /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
     167             :          */
     168      448900 :         avg = _mm_avg_epu8(a, b);
     169             :         /* ...but we can fix it up by subtracting off 1 if it rounded up. */
     170     1795600 :         avg = _mm_sub_epi8(
     171             :             avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
     172      448900 :         d = _mm_add_epi8(d, avg);
     173      448900 :         store3(row, d);
     174             : 
     175      448900 :         input += 3;
     176      448900 :         prev += 3;
     177      448900 :         row += 3;
     178      448900 :         rb -= 3;
     179             :     }
     180        3800 :     if (rb > 0)
     181             :     {
     182             :         __m128i avg;
     183        3800 :         b = load3(prev);
     184        3800 :         a = d;
     185        3800 :         d = load3(input);
     186             : 
     187             :         /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
     188             :          */
     189        3800 :         avg = _mm_avg_epu8(a, b);
     190             :         /* ...but we can fix it up by subtracting off 1 if it rounded up. */
     191       15200 :         avg = _mm_sub_epi8(
     192             :             avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
     193             : 
     194        3800 :         d = _mm_add_epi8(d, avg);
     195        3800 :         store3(row, d);
     196             : 
     197             :         // input += 3;
     198             :         // prev += 3;
     199             :         // row += 3;
     200             :         // rb -= 3;
     201             :     }
     202        3800 : }
     203             : 
     204        3902 : static void gdal_png_read_filter_row_avg4_sse2(png_row_infop row_info,
     205             :                                                const GByte *input, GByte *row,
     206             :                                                const GByte *prev)
     207             : {
     208             :     /* The Avg filter predicts each pixel as the (truncated) average of a and b.
     209             :      * There's no pixel to the left of the first pixel.  Luckily, it's
     210             :      * predicted to be half of the pixel above it.  So again, this works
     211             :      * perfectly with our loop if we make sure a starts at zero.
     212             :      */
     213             :     size_t rb;
     214        3902 :     const __m128i zero = _mm_setzero_si128();
     215             :     __m128i b;
     216        3902 :     __m128i a, d = zero;
     217             : 
     218             :     png_debug(1, "in png_read_filter_row_avg4_sse2");
     219             : 
     220        3902 :     rb = row_info->rowbytes + 4;
     221     1085090 :     while (rb > 4)
     222             :     {
     223             :         __m128i avg;
     224     1081190 :         b = load4(prev);
     225     1081190 :         a = d;
     226     1081190 :         d = load4(input);
     227             : 
     228             :         /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
     229             :          */
     230     1081190 :         avg = _mm_avg_epu8(a, b);
     231             :         /* ...but we can fix it up by subtracting off 1 if it rounded up. */
     232     4324740 :         avg = _mm_sub_epi8(
     233             :             avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
     234             : 
     235     1081190 :         d = _mm_add_epi8(d, avg);
     236     1081190 :         store4(row, d);
     237             : 
     238     1081190 :         input += 4;
     239     1081190 :         prev += 4;
     240     1081190 :         row += 4;
     241     1081190 :         rb -= 4;
     242             :     }
     243        3902 : }
     244             : 
     245             : /* Returns |x| for 16-bit lanes. */
     246    37694800 : static __m128i abs_i16(__m128i x)
     247             : {
     248             : #if PNG_INTEL_SSE_IMPLEMENTATION >= 2
     249             :     return _mm_abs_epi16(x);
     250             : #else
     251             :     /* Read this all as, return x<0 ? -x : x.
     252             :      * To negate two's complement, you flip all the bits then add 1.
     253             :      */
     254    75389600 :     __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());
     255             : 
     256             :     /* Flip negative lanes. */
     257    37694800 :     x = _mm_xor_si128(x, is_negative);
     258             : 
     259             :     /* +1 to negative lanes, else +0. */
     260    37694800 :     x = _mm_sub_epi16(x, is_negative);
     261    37694800 :     return x;
     262             : #endif
     263             : }
     264             : 
     265             : /* Bytewise c ? t : e. */
     266    25129900 : static __m128i if_then_else(__m128i c, __m128i t, __m128i e)
     267             : {
     268             : #if PNG_INTEL_SSE_IMPLEMENTATION >= 3
     269             :     return _mm_blendv_epi8(e, t, c);
     270             : #else
     271    75389600 :     return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));
     272             : #endif
     273             : }
     274             : 
     275        5189 : static void gdal_png_read_filter_row_paeth3_sse2(png_row_infop row_info,
     276             :                                                  const GByte *input, GByte *row,
     277             :                                                  const GByte *prev)
     278             : {
     279             :     /* Paeth tries to predict pixel d using the pixel to the left of it, a,
     280             :      * and two pixels from the previous row, b and c:
     281             :      *   prev: c b
     282             :      *   row:  a d
     283             :      * The Paeth function predicts d to be whichever of a, b, or c is nearest to
     284             :      * p=a+b-c.
     285             :      *
     286             :      * The first pixel has no left context, and so uses an Up filter, p = b.
     287             :      * This works naturally with our main loop's p = a+b-c if we force a and c
     288             :      * to zero.
     289             :      * Here we zero b and d, which become c and a respectively at the start of
     290             :      * the loop.
     291             :      */
     292             :     size_t rb;
     293        5189 :     const __m128i zero = _mm_setzero_si128();
     294        5189 :     __m128i c, b = zero, a, d = zero;
     295             : 
     296             :     png_debug(1, "in png_read_filter_row_paeth3_sse2");
     297             : 
     298        5189 :     rb = row_info->rowbytes;
     299      683437 :     while (rb >= 4)
     300             :     {
     301             :         /* It's easiest to do this math (particularly, deal with pc) with 16-bit
     302             :          * intermediates.
     303             :          */
     304             :         __m128i pa, pb, pc, smallest, nearest;
     305      678248 :         c = b;
     306      678248 :         b = _mm_unpacklo_epi8(load4(prev), zero);
     307      678248 :         a = d;
     308     1356500 :         d = _mm_unpacklo_epi8(load4(input), zero);
     309             : 
     310             :         /* (p-a) == (a+b-c - a) == (b-c) */
     311             : 
     312      678248 :         pa = _mm_sub_epi16(b, c);
     313             : 
     314             :         /* (p-b) == (a+b-c - b) == (a-c) */
     315      678248 :         pb = _mm_sub_epi16(a, c);
     316             : 
     317             :         /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
     318      678248 :         pc = _mm_add_epi16(pa, pb);
     319             : 
     320      678248 :         pa = abs_i16(pa); /* |p-a| */
     321      678248 :         pb = abs_i16(pb); /* |p-b| */
     322      678248 :         pc = abs_i16(pc); /* |p-c| */
     323             : 
     324     1356500 :         smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
     325             : 
     326             :         /* Paeth breaks ties favoring a over b over c. */
     327             :         nearest =
     328     1356500 :             if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
     329             :                          if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
     330             : 
     331             :         /* Note `_epi8`: we need addition to wrap modulo 255. */
     332      678248 :         d = _mm_add_epi8(d, nearest);
     333      678248 :         store3(row, _mm_packus_epi16(d, d));
     334             : 
     335      678248 :         input += 3;
     336      678248 :         prev += 3;
     337      678248 :         row += 3;
     338      678248 :         rb -= 3;
     339             :     }
     340        5189 :     if (rb > 0)
     341             :     {
     342             :         /* It's easiest to do this math (particularly, deal with pc) with 16-bit
     343             :          * intermediates.
     344             :          */
     345             :         __m128i pa, pb, pc, smallest, nearest;
     346        5189 :         c = b;
     347        5189 :         b = _mm_unpacklo_epi8(load3(prev), zero);
     348        5189 :         a = d;
     349       10378 :         d = _mm_unpacklo_epi8(load3(input), zero);
     350             : 
     351             :         /* (p-a) == (a+b-c - a) == (b-c) */
     352        5189 :         pa = _mm_sub_epi16(b, c);
     353             : 
     354             :         /* (p-b) == (a+b-c - b) == (a-c) */
     355        5189 :         pb = _mm_sub_epi16(a, c);
     356             : 
     357             :         /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
     358        5189 :         pc = _mm_add_epi16(pa, pb);
     359             : 
     360        5189 :         pa = abs_i16(pa); /* |p-a| */
     361        5189 :         pb = abs_i16(pb); /* |p-b| */
     362        5189 :         pc = abs_i16(pc); /* |p-c| */
     363             : 
     364       10378 :         smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
     365             : 
     366             :         /* Paeth breaks ties favoring a over b over c. */
     367             :         nearest =
     368       10378 :             if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
     369             :                          if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
     370             : 
     371             :         /* Note `_epi8`: we need addition to wrap modulo 255. */
     372        5189 :         d = _mm_add_epi8(d, nearest);
     373        5189 :         store3(row, _mm_packus_epi16(d, d));
     374             : 
     375             :         // input += 3;
     376             :         // prev += 3;
     377             :         // row += 3;
     378             :         // rb -= 3;
     379             :     }
     380        5189 : }
     381             : 
     382       46637 : static void gdal_png_read_filter_row_paeth4_sse2(png_row_infop row_info,
     383             :                                                  const GByte *input, GByte *row,
     384             :                                                  const GByte *prev)
     385             : {
     386             :     /* Paeth tries to predict pixel d using the pixel to the left of it, a,
     387             :      * and two pixels from the previous row, b and c:
     388             :      *   prev: c b
     389             :      *   row:  a d
     390             :      * The Paeth function predicts d to be whichever of a, b, or c is nearest to
     391             :      * p=a+b-c.
     392             :      *
     393             :      * The first pixel has no left context, and so uses an Up filter, p = b.
     394             :      * This works naturally with our main loop's p = a+b-c if we force a and c
     395             :      * to zero.
     396             :      * Here we zero b and d, which become c and a respectively at the start of
     397             :      * the loop.
     398             :      */
     399             :     size_t rb;
     400       46637 :     const __m128i zero = _mm_setzero_si128();
     401             :     __m128i pa, pb, pc, smallest, nearest;
     402       46637 :     __m128i c, b = zero, a, d = zero;
     403             : 
     404             :     png_debug(1, "in png_read_filter_row_paeth4_sse2");
     405             : 
     406       46637 :     rb = row_info->rowbytes + 4;
     407    11928100 :     while (rb > 4)
     408             :     {
     409             :         /* It's easiest to do this math (particularly, deal with pc) with 16-bit
     410             :          * intermediates.
     411             :          */
     412    11881500 :         c = b;
     413    11881500 :         b = _mm_unpacklo_epi8(load4(prev), zero);
     414    11881500 :         a = d;
     415    23763000 :         d = _mm_unpacklo_epi8(load4(input), zero);
     416             : 
     417             :         /* (p-a) == (a+b-c - a) == (b-c) */
     418    11881500 :         pa = _mm_sub_epi16(b, c);
     419             : 
     420             :         /* (p-b) == (a+b-c - b) == (a-c) */
     421    11881500 :         pb = _mm_sub_epi16(a, c);
     422             : 
     423             :         /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
     424    11881500 :         pc = _mm_add_epi16(pa, pb);
     425             : 
     426    11881500 :         pa = abs_i16(pa); /* |p-a| */
     427    11881500 :         pb = abs_i16(pb); /* |p-b| */
     428    11881500 :         pc = abs_i16(pc); /* |p-c| */
     429             : 
     430    23763000 :         smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
     431             : 
     432             :         /* Paeth breaks ties favoring a over b over c. */
     433             :         nearest =
     434    23763000 :             if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
     435             :                          if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
     436             : 
     437             :         /* Note `_epi8`: we need addition to wrap modulo 255. */
     438    11881500 :         d = _mm_add_epi8(d, nearest);
     439    11881500 :         store4(row, _mm_packus_epi16(d, d));
     440             : 
     441    11881500 :         input += 4;
     442    11881500 :         prev += 4;
     443    11881500 :         row += 4;
     444    11881500 :         rb -= 4;
     445             :     }
     446       46637 : }

Generated by: LCOV version 1.14