LCOV - code coverage report
Current view: top level - frmts/png - filter_sse2_intrinsics.c (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 156 156 100.0 %
Date: 2025-05-31 00:00:17 Functions: 12 12 100.0 %

          Line data    Source code
       1             : /* filter_sse2_intrinsics.c - SSE2 optimized filter functions
       2             :  *
       3             :  * Copyright (c) 2018 Cosmin Truta
       4             :  * Copyright (c) 2016-2017 Glenn Randers-Pehrson
       5             :  * Written by Mike Klein and Matt Sarett
       6             :  * Derived from arm/filter_neon_intrinsics.c
       7             :  *
       8             :  * This code is released under the libpng license.
       9             :  * For conditions of distribution and use, see the disclaimer
      10             :  * and license in png.h
      11             :  */
      12             : 
      13             : #ifndef png_debug
      14             : #define png_debug(x, y)
      15             : #endif
      16             : 
      17             : #ifndef PNG_INTEL_SSE_IMPLEMENTATION
      18             : #if defined(USE_NEON_OPTIMIZATIONS)
      19             : #define PNG_INTEL_SSE_IMPLEMENTATION 3
      20             : #elif defined(__SSE4_1__) || defined(__AVX__)
      21             : /* We are not actually using AVX, but checking for AVX is the best
      22             :    way we can detect SSE4.1 and SSSE3 on MSVC.
      23             : */
      24             : #define PNG_INTEL_SSE_IMPLEMENTATION 3
      25             : #elif defined(__SSSE3__)
      26             : #define PNG_INTEL_SSE_IMPLEMENTATION 2
      27             : #elif defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) ||             \
      28             :     (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
      29             : #define PNG_INTEL_SSE_IMPLEMENTATION 1
      30             : #else
      31             : #define PNG_INTEL_SSE_IMPLEMENTATION 0
      32             : #endif
      33             : #endif
      34             : 
      35             : #if defined(USE_NEON_OPTIMIZATIONS)
      36             : #include "include_sse2neon.h"
      37             : #else
      38             : #include <immintrin.h>
      39             : #endif
      40             : 
      41             : /* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
      42             :  * They're positioned like this:
      43             :  *    prev:  c b
      44             :  *    row:   a d
      45             :  * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
      46             :  * whichever of a, b, or c is closest to p=a+b-c.
      47             :  */
      48             : 
      49    64914300 : static __m128i load4(const void *p)
      50             : {
      51             :     int tmp;
      52    64914300 :     memcpy(&tmp, p, sizeof(tmp));
      53   129829000 :     return _mm_cvtsi32_si128(tmp);
      54             : }
      55             : 
      56    15314900 : static void store4(void *p, __m128i v)
      57             : {
      58    15314900 :     int tmp = _mm_cvtsi128_si32(v);
      59    15314900 :     memcpy(p, &tmp, sizeof(int));
      60    15314900 : }
      61             : 
      62      163404 : static __m128i load3(const void *p)
      63             : {
      64      163404 :     png_uint_32 tmp = 0;
      65      163404 :     memcpy(&tmp, p, 3);
      66      326808 :     return _mm_cvtsi32_si128(tmp);
      67             : }
      68             : 
      69    18768100 : static void store3(void *p, __m128i v)
      70             : {
      71    18768100 :     int tmp = _mm_cvtsi128_si32(v);
      72    18768100 :     memcpy(p, &tmp, 3);
      73    18768100 : }
      74             : 
      75        4389 : static void gdal_png_read_filter_row_sub3_sse2(png_row_infop row_info,
      76             :                                                const GByte *input, GByte *row)
      77             : {
      78             :     /* The Sub filter predicts each pixel as the previous pixel, a.
      79             :      * There is no pixel to the left of the first pixel.  It's encoded directly.
      80             :      * That works with our main loop if we just say that left pixel was zero.
      81             :      */
      82             :     size_t rb;
      83             : 
      84        4389 :     __m128i a, d = _mm_setzero_si128();
      85             : 
      86             :     png_debug(1, "in png_read_filter_row_sub3_sse2");
      87             : 
      88        4389 :     rb = row_info->rowbytes;
      89      447937 :     while (rb >= 4)
      90             :     {
      91      443548 :         a = d;
      92      443548 :         d = load4(input);
      93      443548 :         d = _mm_add_epi8(d, a);
      94      443548 :         store3(row, d);
      95             : 
      96      443548 :         input += 3;
      97      443548 :         row += 3;
      98      443548 :         rb -= 3;
      99             :     }
     100        4389 :     if (rb > 0)
     101             :     {
     102        4389 :         a = d;
     103        4389 :         d = load3(input);
     104        4389 :         d = _mm_add_epi8(d, a);
     105        4389 :         store3(row, d);
     106             : 
     107             :         //input += 3;
     108             :         //row += 3;
     109             :         //rb -= 3;
     110             :     }
     111        4389 : }
     112             : 
     113        6721 : static void gdal_png_read_filter_row_sub4_sse2(png_row_infop row_info,
     114             :                                                const GByte *input, GByte *row)
     115             : {
     116             :     /* The Sub filter predicts each pixel as the previous pixel, a.
     117             :      * There is no pixel to the left of the first pixel.  It's encoded directly.
     118             :      * That works with our main loop if we just say that left pixel was zero.
     119             :      */
     120             :     size_t rb;
     121             : 
     122        6721 :     __m128i a, d = _mm_setzero_si128();
     123             : 
     124             :     png_debug(1, "in png_read_filter_row_sub4_sse2");
     125             : 
     126        6721 :     rb = row_info->rowbytes + 4;
     127     1790120 :     while (rb > 4)
     128             :     {
     129     1783400 :         a = d;
     130     1783400 :         d = load4(input);
     131     1783400 :         d = _mm_add_epi8(d, a);
     132     1783400 :         store4(row, d);
     133             : 
     134     1783400 :         input += 4;
     135     1783400 :         row += 4;
     136     1783400 :         rb -= 4;
     137             :     }
     138        6721 : }
     139             : 
     140        5341 : static void gdal_png_read_filter_row_avg3_sse2(png_row_infop row_info,
     141             :                                                const GByte *input, GByte *row,
     142             :                                                const GByte *prev)
     143             : {
     144             :     /* The Avg filter predicts each pixel as the (truncated) average of a and b.
     145             :      * There's no pixel to the left of the first pixel.  Luckily, it's
     146             :      * predicted to be half of the pixel above it.  So again, this works
     147             :      * perfectly with our loop if we make sure a starts at zero.
     148             :      */
     149             : 
     150             :     size_t rb;
     151             : 
     152        5341 :     const __m128i zero = _mm_setzero_si128();
     153             : 
     154             :     __m128i b;
     155        5341 :     __m128i a, d = zero;
     156             : 
     157             :     png_debug(1, "in png_read_filter_row_avg3_sse2");
     158        5341 :     rb = row_info->rowbytes;
     159      847466 :     while (rb >= 4)
     160             :     {
     161             :         __m128i avg;
     162      842125 :         b = load4(prev);
     163      842125 :         a = d;
     164      842125 :         d = load4(input);
     165             : 
     166             :         /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
     167             :          */
     168      842125 :         avg = _mm_avg_epu8(a, b);
     169             :         /* ...but we can fix it up by subtracting off 1 if it rounded up. */
     170     3368500 :         avg = _mm_sub_epi8(
     171             :             avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
     172      842125 :         d = _mm_add_epi8(d, avg);
     173      842125 :         store3(row, d);
     174             : 
     175      842125 :         input += 3;
     176      842125 :         prev += 3;
     177      842125 :         row += 3;
     178      842125 :         rb -= 3;
     179             :     }
     180        5341 :     if (rb > 0)
     181             :     {
     182             :         __m128i avg;
     183        5341 :         b = load3(prev);
     184        5341 :         a = d;
     185        5341 :         d = load3(input);
     186             : 
     187             :         /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
     188             :          */
     189        5341 :         avg = _mm_avg_epu8(a, b);
     190             :         /* ...but we can fix it up by subtracting off 1 if it rounded up. */
     191       21364 :         avg = _mm_sub_epi8(
     192             :             avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
     193             : 
     194        5341 :         d = _mm_add_epi8(d, avg);
     195        5341 :         store3(row, d);
     196             : 
     197             :         // input += 3;
     198             :         // prev += 3;
     199             :         // row += 3;
     200             :         // rb -= 3;
     201             :     }
     202        5341 : }
     203             : 
     204        4581 : static void gdal_png_read_filter_row_avg4_sse2(png_row_infop row_info,
     205             :                                                const GByte *input, GByte *row,
     206             :                                                const GByte *prev)
     207             : {
     208             :     /* The Avg filter predicts each pixel as the (truncated) average of a and b.
     209             :      * There's no pixel to the left of the first pixel.  Luckily, it's
     210             :      * predicted to be half of the pixel above it.  So again, this works
     211             :      * perfectly with our loop if we make sure a starts at zero.
     212             :      */
     213             :     size_t rb;
     214        4581 :     const __m128i zero = _mm_setzero_si128();
     215             :     __m128i b;
     216        4581 :     __m128i a, d = zero;
     217             : 
     218             :     png_debug(1, "in png_read_filter_row_avg4_sse2");
     219             : 
     220        4581 :     rb = row_info->rowbytes + 4;
     221     1259590 :     while (rb > 4)
     222             :     {
     223             :         __m128i avg;
     224     1255010 :         b = load4(prev);
     225     1255010 :         a = d;
     226     1255010 :         d = load4(input);
     227             : 
     228             :         /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
     229             :          */
     230     1255010 :         avg = _mm_avg_epu8(a, b);
     231             :         /* ...but we can fix it up by subtracting off 1 if it rounded up. */
     232     5020040 :         avg = _mm_sub_epi8(
     233             :             avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
     234             : 
     235     1255010 :         d = _mm_add_epi8(d, avg);
     236     1255010 :         store4(row, d);
     237             : 
     238     1255010 :         input += 4;
     239     1255010 :         prev += 4;
     240     1255010 :         row += 4;
     241     1255010 :         rb -= 4;
     242             :     }
     243        4581 : }
     244             : 
     245             : /* Returns |x| for 16-bit lanes. */
     246    86633400 : static __m128i abs_i16(__m128i x)
     247             : {
     248             : #if PNG_INTEL_SSE_IMPLEMENTATION >= 2
     249             :     return _mm_abs_epi16(x);
     250             : #else
     251             :     /* Read this all as, return x<0 ? -x : x.
     252             :      * To negate two's complement, you flip all the bits then add 1.
     253             :      */
     254   173267000 :     __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());
     255             : 
     256             :     /* Flip negative lanes. */
     257    86633400 :     x = _mm_xor_si128(x, is_negative);
     258             : 
     259             :     /* +1 to negative lanes, else +0. */
     260    86633400 :     x = _mm_sub_epi16(x, is_negative);
     261    86633400 :     return x;
     262             : #endif
     263             : }
     264             : 
     265             : /* Bytewise c ? t : e. */
     266    57872500 : static __m128i if_then_else(__m128i c, __m128i t, __m128i e)
     267             : {
     268             : #if PNG_INTEL_SSE_IMPLEMENTATION >= 3
     269             :     return _mm_blendv_epi8(e, t, c);
     270             : #else
     271   175544000 :     return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));
     272             : #endif
     273             : }
     274             : 
     275       74087 : static void gdal_png_read_filter_row_paeth3_sse2(png_row_infop row_info,
     276             :                                                  const GByte *input, GByte *row,
     277             :                                                  const GByte *prev)
     278             : {
     279             :     /* Paeth tries to predict pixel d using the pixel to the left of it, a,
     280             :      * and two pixels from the previous row, b and c:
     281             :      *   prev: c b
     282             :      *   row:  a d
     283             :      * The Paeth function predicts d to be whichever of a, b, or c is nearest to
     284             :      * p=a+b-c.
     285             :      *
     286             :      * The first pixel has no left context, and so uses an Up filter, p = b.
     287             :      * This works naturally with our main loop's p = a+b-c if we force a and c
     288             :      * to zero.
     289             :      * Here we zero b and d, which become c and a respectively at the start of
     290             :      * the loop.
     291             :      */
     292             :     size_t rb;
     293       74087 :     const __m128i zero = _mm_setzero_si128();
     294       74087 :     __m128i c, b = zero, a, d = zero;
     295             : 
     296             :     png_debug(1, "in png_read_filter_row_paeth3_sse2");
     297             : 
     298       74087 :     rb = row_info->rowbytes;
     299    17507900 :     while (rb >= 4)
     300             :     {
     301             :         /* It's easiest to do this math (particularly, deal with pc) with 16-bit
     302             :          * intermediates.
     303             :          */
     304             :         __m128i pa, pb, pc, smallest, nearest;
     305    17433700 :         c = b;
     306    17433700 :         b = _mm_unpacklo_epi8(load4(prev), zero);
     307    17611300 :         a = d;
     308    35240900 :         d = _mm_unpacklo_epi8(load4(input), zero);
     309             : 
     310             :         /* (p-a) == (a+b-c - a) == (b-c) */
     311             : 
     312    17629600 :         pa = _mm_sub_epi16(b, c);
     313             : 
     314             :         /* (p-b) == (a+b-c - b) == (a-c) */
     315    17629600 :         pb = _mm_sub_epi16(a, c);
     316             : 
     317             :         /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
     318    17629600 :         pc = _mm_add_epi16(pa, pb);
     319             : 
     320    17629600 :         pa = abs_i16(pa); /* |p-a| */
     321    17377400 :         pb = abs_i16(pb); /* |p-b| */
     322    17268800 :         pc = abs_i16(pc); /* |p-c| */
     323             : 
     324    34756900 :         smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
     325             : 
     326             :         /* Paeth breaks ties favoring a over b over c. */
     327             :         nearest =
     328    34538800 :             if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
     329             :                          if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
     330             : 
     331             :         /* Note `_epi8`: we need addition to wrap modulo 255. */
     332    17283600 :         d = _mm_add_epi8(d, nearest);
     333    17537900 :         store3(row, _mm_packus_epi16(d, d));
     334             : 
     335    17433800 :         input += 3;
     336    17433800 :         prev += 3;
     337    17433800 :         row += 3;
     338    17433800 :         rb -= 3;
     339             :     }
     340       74179 :     if (rb > 0)
     341             :     {
     342             :         /* It's easiest to do this math (particularly, deal with pc) with 16-bit
     343             :          * intermediates.
     344             :          */
     345             :         __m128i pa, pb, pc, smallest, nearest;
     346       74172 :         c = b;
     347       74172 :         b = _mm_unpacklo_epi8(load3(prev), zero);
     348       74182 :         a = d;
     349      148367 :         d = _mm_unpacklo_epi8(load3(input), zero);
     350             : 
     351             :         /* (p-a) == (a+b-c - a) == (b-c) */
     352       74185 :         pa = _mm_sub_epi16(b, c);
     353             : 
     354             :         /* (p-b) == (a+b-c - b) == (a-c) */
     355       74185 :         pb = _mm_sub_epi16(a, c);
     356             : 
     357             :         /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
     358       74185 :         pc = _mm_add_epi16(pa, pb);
     359             : 
     360       74185 :         pa = abs_i16(pa); /* |p-a| */
     361       74181 :         pb = abs_i16(pb); /* |p-b| */
     362       74175 :         pc = abs_i16(pc); /* |p-c| */
     363             : 
     364      148360 :         smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
     365             : 
     366             :         /* Paeth breaks ties favoring a over b over c. */
     367             :         nearest =
     368      148357 :             if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
     369             :                          if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
     370             : 
     371             :         /* Note `_epi8`: we need addition to wrap modulo 255. */
     372       74187 :         d = _mm_add_epi8(d, nearest);
     373       74187 :         store3(row, _mm_packus_epi16(d, d));
     374             : 
     375             :         // input += 3;
     376             :         // prev += 3;
     377             :         // row += 3;
     378             :         // rb -= 3;
     379             :     }
     380       74186 : }
     381             : 
     382       48180 : static void gdal_png_read_filter_row_paeth4_sse2(png_row_infop row_info,
     383             :                                                  const GByte *input, GByte *row,
     384             :                                                  const GByte *prev)
     385             : {
     386             :     /* Paeth tries to predict pixel d using the pixel to the left of it, a,
     387             :      * and two pixels from the previous row, b and c:
     388             :      *   prev: c b
     389             :      *   row:  a d
     390             :      * The Paeth function predicts d to be whichever of a, b, or c is nearest to
     391             :      * p=a+b-c.
     392             :      *
     393             :      * The first pixel has no left context, and so uses an Up filter, p = b.
     394             :      * This works naturally with our main loop's p = a+b-c if we force a and c
     395             :      * to zero.
     396             :      * Here we zero b and d, which become c and a respectively at the start of
     397             :      * the loop.
     398             :      */
     399             :     size_t rb;
     400       48180 :     const __m128i zero = _mm_setzero_si128();
     401             :     __m128i pa, pb, pc, smallest, nearest;
     402       48180 :     __m128i c, b = zero, a, d = zero;
     403             : 
     404             :     png_debug(1, "in png_read_filter_row_paeth4_sse2");
     405             : 
     406       48180 :     rb = row_info->rowbytes + 4;
     407    12324700 :     while (rb > 4)
     408             :     {
     409             :         /* It's easiest to do this math (particularly, deal with pc) with 16-bit
     410             :          * intermediates.
     411             :          */
     412    12276500 :         c = b;
     413    12276500 :         b = _mm_unpacklo_epi8(load4(prev), zero);
     414    12276500 :         a = d;
     415    24553000 :         d = _mm_unpacklo_epi8(load4(input), zero);
     416             : 
     417             :         /* (p-a) == (a+b-c - a) == (b-c) */
     418    12276500 :         pa = _mm_sub_epi16(b, c);
     419             : 
     420             :         /* (p-b) == (a+b-c - b) == (a-c) */
     421    12276500 :         pb = _mm_sub_epi16(a, c);
     422             : 
     423             :         /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
     424    12276500 :         pc = _mm_add_epi16(pa, pb);
     425             : 
     426    12276500 :         pa = abs_i16(pa); /* |p-a| */
     427    12276500 :         pb = abs_i16(pb); /* |p-b| */
     428    12276500 :         pc = abs_i16(pc); /* |p-c| */
     429             : 
     430    24553000 :         smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
     431             : 
     432             :         /* Paeth breaks ties favoring a over b over c. */
     433             :         nearest =
     434    24553000 :             if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
     435             :                          if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
     436             : 
     437             :         /* Note `_epi8`: we need addition to wrap modulo 255. */
     438    12276500 :         d = _mm_add_epi8(d, nearest);
     439    12276500 :         store4(row, _mm_packus_epi16(d, d));
     440             : 
     441    12276500 :         input += 4;
     442    12276500 :         prev += 4;
     443    12276500 :         row += 4;
     444    12276500 :         rb -= 4;
     445             :     }
     446       48180 : }

Generated by: LCOV version 1.14