Line data Source code
1 : /* filter_sse2_intrinsics.c - SSE2 optimized filter functions
2 : *
3 : * Copyright (c) 2018 Cosmin Truta
4 : * Copyright (c) 2016-2017 Glenn Randers-Pehrson
5 : * Written by Mike Klein and Matt Sarett
6 : * Derived from arm/filter_neon_intrinsics.c
7 : *
8 : * This code is released under the libpng license.
9 : * For conditions of distribution and use, see the disclaimer
10 : * and license in png.h
11 : */
12 :
13 : #ifndef png_debug
14 : #define png_debug(x, y)
15 : #endif
16 :
17 : #ifndef PNG_INTEL_SSE_IMPLEMENTATION
18 : #if defined(USE_NEON_OPTIMIZATIONS)
19 : #define PNG_INTEL_SSE_IMPLEMENTATION 3
20 : #elif defined(__SSE4_1__) || defined(__AVX__)
21 : /* We are not actually using AVX, but checking for AVX is the best
22 : way we can detect SSE4.1 and SSSE3 on MSVC.
23 : */
24 : #define PNG_INTEL_SSE_IMPLEMENTATION 3
25 : #elif defined(__SSSE3__)
26 : #define PNG_INTEL_SSE_IMPLEMENTATION 2
27 : #elif defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || \
28 : (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
29 : #define PNG_INTEL_SSE_IMPLEMENTATION 1
30 : #else
31 : #define PNG_INTEL_SSE_IMPLEMENTATION 0
32 : #endif
33 : #endif
34 :
35 : #if defined(USE_NEON_OPTIMIZATIONS)
36 : #include "include_sse2neon.h"
37 : #else
38 : #include <immintrin.h>
39 : #endif
40 :
41 : /* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
42 : * They're positioned like this:
43 : * prev: c b
44 : * row: a d
45 : * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
46 : * whichever of a, b, or c is closest to p=a+b-c.
47 : */
48 :
49 157049000 : static __m128i load4(const void *p)
50 : {
51 : int tmp;
52 157049000 : memcpy(&tmp, p, sizeof(tmp));
53 314098000 : return _mm_cvtsi32_si128(tmp);
54 : }
55 :
56 15830200 : static void store4(void *p, __m128i v)
57 : {
58 15830200 : int tmp = _mm_cvtsi128_si32(v);
59 15830200 : memcpy(p, &tmp, sizeof(int));
60 15830200 : }
61 :
62 526397 : static __m128i load3(const void *p)
63 : {
64 526397 : png_uint_32 tmp = 0;
65 526397 : memcpy(&tmp, p, 3);
66 1052790 : return _mm_cvtsi32_si128(tmp);
67 : }
68 :
69 64690100 : static void store3(void *p, __m128i v)
70 : {
71 64690100 : int tmp = _mm_cvtsi128_si32(v);
72 64690100 : memcpy(p, &tmp, 3);
73 64690100 : }
74 :
75 4957 : static void gdal_png_read_filter_row_sub3_sse2(png_row_infop row_info,
76 : const GByte *input, GByte *row)
77 : {
78 : /* The Sub filter predicts each pixel as the previous pixel, a.
79 : * There is no pixel to the left of the first pixel. It's encoded directly.
80 : * That works with our main loop if we just say that left pixel was zero.
81 : */
82 : size_t rb;
83 :
84 4957 : __m128i a, d = _mm_setzero_si128();
85 :
86 : png_debug(1, "in png_read_filter_row_sub3_sse2");
87 :
88 4957 : rb = row_info->rowbytes;
89 593345 : while (rb >= 4)
90 : {
91 588388 : a = d;
92 588388 : d = load4(input);
93 588388 : d = _mm_add_epi8(d, a);
94 588388 : store3(row, d);
95 :
96 588388 : input += 3;
97 588388 : row += 3;
98 588388 : rb -= 3;
99 : }
100 4957 : if (rb > 0)
101 : {
102 4957 : a = d;
103 4957 : d = load3(input);
104 4957 : d = _mm_add_epi8(d, a);
105 4957 : store3(row, d);
106 :
107 : //input += 3;
108 : //row += 3;
109 : //rb -= 3;
110 : }
111 4957 : }
112 :
113 6320 : static void gdal_png_read_filter_row_sub4_sse2(png_row_infop row_info,
114 : const GByte *input, GByte *row)
115 : {
116 : /* The Sub filter predicts each pixel as the previous pixel, a.
117 : * There is no pixel to the left of the first pixel. It's encoded directly.
118 : * That works with our main loop if we just say that left pixel was zero.
119 : */
120 : size_t rb;
121 :
122 6320 : __m128i a, d = _mm_setzero_si128();
123 :
124 : png_debug(1, "in png_read_filter_row_sub4_sse2");
125 :
126 6320 : rb = row_info->rowbytes + 4;
127 1687070 : while (rb > 4)
128 : {
129 1680750 : a = d;
130 1680750 : d = load4(input);
131 1680750 : d = _mm_add_epi8(d, a);
132 1680750 : store4(row, d);
133 :
134 1680750 : input += 4;
135 1680750 : row += 4;
136 1680750 : rb -= 4;
137 : }
138 6320 : }
139 :
140 54362 : static void gdal_png_read_filter_row_avg3_sse2(png_row_infop row_info,
141 : const GByte *input, GByte *row,
142 : const GByte *prev)
143 : {
144 : /* The Avg filter predicts each pixel as the (truncated) average of a and b.
145 : * There's no pixel to the left of the first pixel. Luckily, it's
146 : * predicted to be half of the pixel above it. So again, this works
147 : * perfectly with our loop if we make sure a starts at zero.
148 : */
149 :
150 : size_t rb;
151 :
152 54362 : const __m128i zero = _mm_setzero_si128();
153 :
154 : __m128i b;
155 54362 : __m128i a, d = zero;
156 :
157 : png_debug(1, "in png_read_filter_row_avg3_sse2");
158 54362 : rb = row_info->rowbytes;
159 13371000 : while (rb >= 4)
160 : {
161 : __m128i avg;
162 13316700 : b = load4(prev);
163 13270200 : a = d;
164 13270200 : d = load4(input);
165 :
166 : /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
167 : */
168 13298200 : avg = _mm_avg_epu8(a, b);
169 : /* ...but we can fix it up by subtracting off 1 if it rounded up. */
170 53192800 : avg = _mm_sub_epi8(
171 : avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
172 13298200 : d = _mm_add_epi8(d, avg);
173 13298200 : store3(row, d);
174 :
175 13316700 : input += 3;
176 13316700 : prev += 3;
177 13316700 : row += 3;
178 13316700 : rb -= 3;
179 : }
180 54362 : if (rb > 0)
181 : {
182 : __m128i avg;
183 54362 : b = load3(prev);
184 54363 : a = d;
185 54363 : d = load3(input);
186 :
187 : /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
188 : */
189 54362 : avg = _mm_avg_epu8(a, b);
190 : /* ...but we can fix it up by subtracting off 1 if it rounded up. */
191 217448 : avg = _mm_sub_epi8(
192 : avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
193 :
194 54362 : d = _mm_add_epi8(d, avg);
195 54362 : store3(row, d);
196 :
197 : // input += 3;
198 : // prev += 3;
199 : // row += 3;
200 : // rb -= 3;
201 : }
202 54362 : }
203 :
204 6597 : static void gdal_png_read_filter_row_avg4_sse2(png_row_infop row_info,
205 : const GByte *input, GByte *row,
206 : const GByte *prev)
207 : {
208 : /* The Avg filter predicts each pixel as the (truncated) average of a and b.
209 : * There's no pixel to the left of the first pixel. Luckily, it's
210 : * predicted to be half of the pixel above it. So again, this works
211 : * perfectly with our loop if we make sure a starts at zero.
212 : */
213 : size_t rb;
214 6597 : const __m128i zero = _mm_setzero_si128();
215 : __m128i b;
216 6597 : __m128i a, d = zero;
217 :
218 : png_debug(1, "in png_read_filter_row_avg4_sse2");
219 :
220 6597 : rb = row_info->rowbytes + 4;
221 1777700 : while (rb > 4)
222 : {
223 : __m128i avg;
224 1771110 : b = load4(prev);
225 1771110 : a = d;
226 1771110 : d = load4(input);
227 :
228 : /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
229 : */
230 1771110 : avg = _mm_avg_epu8(a, b);
231 : /* ...but we can fix it up by subtracting off 1 if it rounded up. */
232 7084420 : avg = _mm_sub_epi8(
233 : avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
234 :
235 1771110 : d = _mm_add_epi8(d, avg);
236 1771110 : store4(row, d);
237 :
238 1771110 : input += 4;
239 1771110 : prev += 4;
240 1771110 : row += 4;
241 1771110 : rb -= 4;
242 : }
243 6597 : }
244 :
245 : /* Returns |x| for 16-bit lanes. */
246 187894000 : static __m128i abs_i16(__m128i x)
247 : {
248 : #if PNG_INTEL_SSE_IMPLEMENTATION >= 2
249 : return _mm_abs_epi16(x);
250 : #else
251 : /* Read this all as, return x<0 ? -x : x.
252 : * To negate two's complement, you flip all the bits then add 1.
253 : */
254 375788000 : __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());
255 :
256 : /* Flip negative lanes. */
257 187894000 : x = _mm_xor_si128(x, is_negative);
258 :
259 : /* +1 to negative lanes, else +0. */
260 187894000 : x = _mm_sub_epi16(x, is_negative);
261 187894000 : return x;
262 : #endif
263 : }
264 :
265 : /* Bytewise c ? t : e. */
266 126197000 : static __m128i if_then_else(__m128i c, __m128i t, __m128i e)
267 : {
268 : #if PNG_INTEL_SSE_IMPLEMENTATION >= 3
269 : return _mm_blendv_epi8(e, t, c);
270 : #else
271 380933000 : return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));
272 : #endif
273 : }
274 :
275 206306 : static void gdal_png_read_filter_row_paeth3_sse2(png_row_infop row_info,
276 : const GByte *input, GByte *row,
277 : const GByte *prev)
278 : {
279 : /* Paeth tries to predict pixel d using the pixel to the left of it, a,
280 : * and two pixels from the previous row, b and c:
281 : * prev: c b
282 : * row: a d
283 : * The Paeth function predicts d to be whichever of a, b, or c is nearest to
284 : * p=a+b-c.
285 : *
286 : * The first pixel has no left context, and so uses an Up filter, p = b.
287 : * This works naturally with our main loop's p = a+b-c if we force a and c
288 : * to zero.
289 : * Here we zero b and d, which become c and a respectively at the start of
290 : * the loop.
291 : */
292 : size_t rb;
293 206306 : const __m128i zero = _mm_setzero_si128();
294 206306 : __m128i c, b = zero, a, d = zero;
295 :
296 : png_debug(1, "in png_read_filter_row_paeth3_sse2");
297 :
298 206306 : rb = row_info->rowbytes;
299 51368100 : while (rb >= 4)
300 : {
301 : /* It's easiest to do this math (particularly, deal with pc) with 16-bit
302 : * intermediates.
303 : */
304 : __m128i pa, pb, pc, smallest, nearest;
305 51161700 : c = b;
306 51161700 : b = _mm_unpacklo_epi8(load4(prev), zero);
307 51202100 : a = d;
308 102595000 : d = _mm_unpacklo_epi8(load4(input), zero);
309 :
310 : /* (p-a) == (a+b-c - a) == (b-c) */
311 :
312 51393200 : pa = _mm_sub_epi16(b, c);
313 :
314 : /* (p-b) == (a+b-c - b) == (a-c) */
315 51393200 : pb = _mm_sub_epi16(a, c);
316 :
317 : /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
318 51393200 : pc = _mm_add_epi16(pa, pb);
319 :
320 51393200 : pa = abs_i16(pa); /* |p-a| */
321 51070400 : pb = abs_i16(pb); /* |p-b| */
322 50888500 : pc = abs_i16(pc); /* |p-c| */
323 :
324 102353000 : smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
325 :
326 : /* Paeth breaks ties favoring a over b over c. */
327 : nearest =
328 102208000 : if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
329 : if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
330 :
331 : /* Note `_epi8`: we need addition to wrap modulo 255. */
332 51160300 : d = _mm_add_epi8(d, nearest);
333 51262600 : store3(row, _mm_packus_epi16(d, d));
334 :
335 51161700 : input += 3;
336 51161700 : prev += 3;
337 51161700 : row += 3;
338 51161700 : rb -= 3;
339 : }
340 206376 : if (rb > 0)
341 : {
342 : /* It's easiest to do this math (particularly, deal with pc) with 16-bit
343 : * intermediates.
344 : */
345 : __m128i pa, pb, pc, smallest, nearest;
346 206373 : c = b;
347 206373 : b = _mm_unpacklo_epi8(load3(prev), zero);
348 206380 : a = d;
349 412764 : d = _mm_unpacklo_epi8(load3(input), zero);
350 :
351 : /* (p-a) == (a+b-c - a) == (b-c) */
352 206384 : pa = _mm_sub_epi16(b, c);
353 :
354 : /* (p-b) == (a+b-c - b) == (a-c) */
355 206384 : pb = _mm_sub_epi16(a, c);
356 :
357 : /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
358 206384 : pc = _mm_add_epi16(pa, pb);
359 :
360 206384 : pa = abs_i16(pa); /* |p-a| */
361 206377 : pb = abs_i16(pb); /* |p-b| */
362 206372 : pc = abs_i16(pc); /* |p-c| */
363 :
364 412746 : smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
365 :
366 : /* Paeth breaks ties favoring a over b over c. */
367 : nearest =
368 412748 : if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
369 : if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
370 :
371 : /* Note `_epi8`: we need addition to wrap modulo 255. */
372 206377 : d = _mm_add_epi8(d, nearest);
373 206381 : store3(row, _mm_packus_epi16(d, d));
374 :
375 : // input += 3;
376 : // prev += 3;
377 : // row += 3;
378 : // rb -= 3;
379 : }
380 206376 : }
381 :
382 48578 : static void gdal_png_read_filter_row_paeth4_sse2(png_row_infop row_info,
383 : const GByte *input, GByte *row,
384 : const GByte *prev)
385 : {
386 : /* Paeth tries to predict pixel d using the pixel to the left of it, a,
387 : * and two pixels from the previous row, b and c:
388 : * prev: c b
389 : * row: a d
390 : * The Paeth function predicts d to be whichever of a, b, or c is nearest to
391 : * p=a+b-c.
392 : *
393 : * The first pixel has no left context, and so uses an Up filter, p = b.
394 : * This works naturally with our main loop's p = a+b-c if we force a and c
395 : * to zero.
396 : * Here we zero b and d, which become c and a respectively at the start of
397 : * the loop.
398 : */
399 : size_t rb;
400 48578 : const __m128i zero = _mm_setzero_si128();
401 : __m128i pa, pb, pc, smallest, nearest;
402 48578 : __m128i c, b = zero, a, d = zero;
403 :
404 : png_debug(1, "in png_read_filter_row_paeth4_sse2");
405 :
406 48578 : rb = row_info->rowbytes + 4;
407 12427000 : while (rb > 4)
408 : {
409 : /* It's easiest to do this math (particularly, deal with pc) with 16-bit
410 : * intermediates.
411 : */
412 12378400 : c = b;
413 12378400 : b = _mm_unpacklo_epi8(load4(prev), zero);
414 12378400 : a = d;
415 24756800 : d = _mm_unpacklo_epi8(load4(input), zero);
416 :
417 : /* (p-a) == (a+b-c - a) == (b-c) */
418 12378400 : pa = _mm_sub_epi16(b, c);
419 :
420 : /* (p-b) == (a+b-c - b) == (a-c) */
421 12378400 : pb = _mm_sub_epi16(a, c);
422 :
423 : /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
424 12378400 : pc = _mm_add_epi16(pa, pb);
425 :
426 12378400 : pa = abs_i16(pa); /* |p-a| */
427 12378400 : pb = abs_i16(pb); /* |p-b| */
428 12378400 : pc = abs_i16(pc); /* |p-c| */
429 :
430 24756800 : smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
431 :
432 : /* Paeth breaks ties favoring a over b over c. */
433 : nearest =
434 24756800 : if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
435 : if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
436 :
437 : /* Note `_epi8`: we need addition to wrap modulo 255. */
438 12378400 : d = _mm_add_epi8(d, nearest);
439 12378400 : store4(row, _mm_packus_epi16(d, d));
440 :
441 12378400 : input += 4;
442 12378400 : prev += 4;
443 12378400 : row += 4;
444 12378400 : rb -= 4;
445 : }
446 48578 : }
|