Line data Source code
1 : /* filter_sse2_intrinsics.c - SSE2 optimized filter functions
2 : *
3 : * Copyright (c) 2018 Cosmin Truta
4 : * Copyright (c) 2016-2017 Glenn Randers-Pehrson
5 : * Written by Mike Klein and Matt Sarett
6 : * Derived from arm/filter_neon_intrinsics.c
7 : *
8 : * This code is released under the libpng license.
9 : * For conditions of distribution and use, see the disclaimer
10 : * and license in png.h
11 : */
12 :
13 : #ifndef png_debug
14 : #define png_debug(x, y)
15 : #endif
16 :
17 : #ifndef PNG_INTEL_SSE_IMPLEMENTATION
18 : #if defined(USE_NEON_OPTIMIZATIONS)
19 : #define PNG_INTEL_SSE_IMPLEMENTATION 3
20 : #elif defined(__SSE4_1__) || defined(__AVX__)
21 : /* We are not actually using AVX, but checking for AVX is the best
22 : way we can detect SSE4.1 and SSSE3 on MSVC.
23 : */
24 : #define PNG_INTEL_SSE_IMPLEMENTATION 3
25 : #elif defined(__SSSE3__)
26 : #define PNG_INTEL_SSE_IMPLEMENTATION 2
27 : #elif defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || \
28 : (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
29 : #define PNG_INTEL_SSE_IMPLEMENTATION 1
30 : #else
31 : #define PNG_INTEL_SSE_IMPLEMENTATION 0
32 : #endif
33 : #endif
34 :
35 : #if defined(USE_NEON_OPTIMIZATIONS)
36 : #include "include_sse2neon.h"
37 : #else
38 : #include <immintrin.h>
39 : #endif
40 :
41 : /* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
42 : * They're positioned like this:
43 : * prev: c b
44 : * row: a d
45 : * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
46 : * whichever of a, b, or c is closest to p=a+b-c.
47 : */
48 :
49 30162300 : static __m128i load4(const void *p)
50 : {
51 : int tmp;
52 30162300 : memcpy(&tmp, p, sizeof(tmp));
53 60324700 : return _mm_cvtsi32_si128(tmp);
54 : }
55 :
56 14621100 : static void store4(void *p, __m128i v)
57 : {
58 14621100 : int tmp = _mm_cvtsi128_si32(v);
59 14621100 : memcpy(p, &tmp, sizeof(int));
60 14621100 : }
61 :
62 21899 : static __m128i load3(const void *p)
63 : {
64 21899 : png_uint_32 tmp = 0;
65 21899 : memcpy(&tmp, p, 3);
66 43798 : return _mm_cvtsi32_si128(tmp);
67 : }
68 :
69 1464270 : static void store3(void *p, __m128i v)
70 : {
71 1464270 : int tmp = _mm_cvtsi128_si32(v);
72 1464270 : memcpy(p, &tmp, 3);
73 1464270 : }
74 :
75 3921 : static void gdal_png_read_filter_row_sub3_sse2(png_row_infop row_info,
76 : const GByte *input, GByte *row)
77 : {
78 : /* The Sub filter predicts each pixel as the previous pixel, a.
79 : * There is no pixel to the left of the first pixel. It's encoded directly.
80 : * That works with our main loop if we just say that left pixel was zero.
81 : */
82 : size_t rb;
83 :
84 3921 : __m128i a, d = _mm_setzero_si128();
85 :
86 : png_debug(1, "in png_read_filter_row_sub3_sse2");
87 :
88 3921 : rb = row_info->rowbytes;
89 328129 : while (rb >= 4)
90 : {
91 324208 : a = d;
92 324208 : d = load4(input);
93 324208 : d = _mm_add_epi8(d, a);
94 324208 : store3(row, d);
95 :
96 324208 : input += 3;
97 324208 : row += 3;
98 324208 : rb -= 3;
99 : }
100 3921 : if (rb > 0)
101 : {
102 3921 : a = d;
103 3921 : d = load3(input);
104 3921 : d = _mm_add_epi8(d, a);
105 3921 : store3(row, d);
106 :
107 : //input += 3;
108 : //row += 3;
109 : //rb -= 3;
110 : }
111 3921 : }
112 :
113 6233 : static void gdal_png_read_filter_row_sub4_sse2(png_row_infop row_info,
114 : const GByte *input, GByte *row)
115 : {
116 : /* The Sub filter predicts each pixel as the previous pixel, a.
117 : * There is no pixel to the left of the first pixel. It's encoded directly.
118 : * That works with our main loop if we just say that left pixel was zero.
119 : */
120 : size_t rb;
121 :
122 6233 : __m128i a, d = _mm_setzero_si128();
123 :
124 : png_debug(1, "in png_read_filter_row_sub4_sse2");
125 :
126 6233 : rb = row_info->rowbytes + 4;
127 1664710 : while (rb > 4)
128 : {
129 1658470 : a = d;
130 1658470 : d = load4(input);
131 1658470 : d = _mm_add_epi8(d, a);
132 1658470 : store4(row, d);
133 :
134 1658470 : input += 4;
135 1658470 : row += 4;
136 1658470 : rb -= 4;
137 : }
138 6233 : }
139 :
140 3800 : static void gdal_png_read_filter_row_avg3_sse2(png_row_infop row_info,
141 : const GByte *input, GByte *row,
142 : const GByte *prev)
143 : {
144 : /* The Avg filter predicts each pixel as the (truncated) average of a and b.
145 : * There's no pixel to the left of the first pixel. Luckily, it's
146 : * predicted to be half of the pixel above it. So again, this works
147 : * perfectly with our loop if we make sure a starts at zero.
148 : */
149 :
150 : size_t rb;
151 :
152 3800 : const __m128i zero = _mm_setzero_si128();
153 :
154 : __m128i b;
155 3800 : __m128i a, d = zero;
156 :
157 : png_debug(1, "in png_read_filter_row_avg3_sse2");
158 3800 : rb = row_info->rowbytes;
159 452700 : while (rb >= 4)
160 : {
161 : __m128i avg;
162 448900 : b = load4(prev);
163 448900 : a = d;
164 448900 : d = load4(input);
165 :
166 : /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
167 : */
168 448900 : avg = _mm_avg_epu8(a, b);
169 : /* ...but we can fix it up by subtracting off 1 if it rounded up. */
170 1795600 : avg = _mm_sub_epi8(
171 : avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
172 448900 : d = _mm_add_epi8(d, avg);
173 448900 : store3(row, d);
174 :
175 448900 : input += 3;
176 448900 : prev += 3;
177 448900 : row += 3;
178 448900 : rb -= 3;
179 : }
180 3800 : if (rb > 0)
181 : {
182 : __m128i avg;
183 3800 : b = load3(prev);
184 3800 : a = d;
185 3800 : d = load3(input);
186 :
187 : /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
188 : */
189 3800 : avg = _mm_avg_epu8(a, b);
190 : /* ...but we can fix it up by subtracting off 1 if it rounded up. */
191 15200 : avg = _mm_sub_epi8(
192 : avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
193 :
194 3800 : d = _mm_add_epi8(d, avg);
195 3800 : store3(row, d);
196 :
197 : // input += 3;
198 : // prev += 3;
199 : // row += 3;
200 : // rb -= 3;
201 : }
202 3800 : }
203 :
204 3902 : static void gdal_png_read_filter_row_avg4_sse2(png_row_infop row_info,
205 : const GByte *input, GByte *row,
206 : const GByte *prev)
207 : {
208 : /* The Avg filter predicts each pixel as the (truncated) average of a and b.
209 : * There's no pixel to the left of the first pixel. Luckily, it's
210 : * predicted to be half of the pixel above it. So again, this works
211 : * perfectly with our loop if we make sure a starts at zero.
212 : */
213 : size_t rb;
214 3902 : const __m128i zero = _mm_setzero_si128();
215 : __m128i b;
216 3902 : __m128i a, d = zero;
217 :
218 : png_debug(1, "in png_read_filter_row_avg4_sse2");
219 :
220 3902 : rb = row_info->rowbytes + 4;
221 1085090 : while (rb > 4)
222 : {
223 : __m128i avg;
224 1081190 : b = load4(prev);
225 1081190 : a = d;
226 1081190 : d = load4(input);
227 :
228 : /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
229 : */
230 1081190 : avg = _mm_avg_epu8(a, b);
231 : /* ...but we can fix it up by subtracting off 1 if it rounded up. */
232 4324740 : avg = _mm_sub_epi8(
233 : avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
234 :
235 1081190 : d = _mm_add_epi8(d, avg);
236 1081190 : store4(row, d);
237 :
238 1081190 : input += 4;
239 1081190 : prev += 4;
240 1081190 : row += 4;
241 1081190 : rb -= 4;
242 : }
243 3902 : }
244 :
245 : /* Returns |x| for 16-bit lanes. */
246 37694800 : static __m128i abs_i16(__m128i x)
247 : {
248 : #if PNG_INTEL_SSE_IMPLEMENTATION >= 2
249 : return _mm_abs_epi16(x);
250 : #else
251 : /* Read this all as, return x<0 ? -x : x.
252 : * To negate two's complement, you flip all the bits then add 1.
253 : */
254 75389600 : __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());
255 :
256 : /* Flip negative lanes. */
257 37694800 : x = _mm_xor_si128(x, is_negative);
258 :
259 : /* +1 to negative lanes, else +0. */
260 37694800 : x = _mm_sub_epi16(x, is_negative);
261 37694800 : return x;
262 : #endif
263 : }
264 :
265 : /* Bytewise c ? t : e. */
266 25129900 : static __m128i if_then_else(__m128i c, __m128i t, __m128i e)
267 : {
268 : #if PNG_INTEL_SSE_IMPLEMENTATION >= 3
269 : return _mm_blendv_epi8(e, t, c);
270 : #else
271 75389600 : return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));
272 : #endif
273 : }
274 :
275 5189 : static void gdal_png_read_filter_row_paeth3_sse2(png_row_infop row_info,
276 : const GByte *input, GByte *row,
277 : const GByte *prev)
278 : {
279 : /* Paeth tries to predict pixel d using the pixel to the left of it, a,
280 : * and two pixels from the previous row, b and c:
281 : * prev: c b
282 : * row: a d
283 : * The Paeth function predicts d to be whichever of a, b, or c is nearest to
284 : * p=a+b-c.
285 : *
286 : * The first pixel has no left context, and so uses an Up filter, p = b.
287 : * This works naturally with our main loop's p = a+b-c if we force a and c
288 : * to zero.
289 : * Here we zero b and d, which become c and a respectively at the start of
290 : * the loop.
291 : */
292 : size_t rb;
293 5189 : const __m128i zero = _mm_setzero_si128();
294 5189 : __m128i c, b = zero, a, d = zero;
295 :
296 : png_debug(1, "in png_read_filter_row_paeth3_sse2");
297 :
298 5189 : rb = row_info->rowbytes;
299 683437 : while (rb >= 4)
300 : {
301 : /* It's easiest to do this math (particularly, deal with pc) with 16-bit
302 : * intermediates.
303 : */
304 : __m128i pa, pb, pc, smallest, nearest;
305 678248 : c = b;
306 678248 : b = _mm_unpacklo_epi8(load4(prev), zero);
307 678248 : a = d;
308 1356500 : d = _mm_unpacklo_epi8(load4(input), zero);
309 :
310 : /* (p-a) == (a+b-c - a) == (b-c) */
311 :
312 678248 : pa = _mm_sub_epi16(b, c);
313 :
314 : /* (p-b) == (a+b-c - b) == (a-c) */
315 678248 : pb = _mm_sub_epi16(a, c);
316 :
317 : /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
318 678248 : pc = _mm_add_epi16(pa, pb);
319 :
320 678248 : pa = abs_i16(pa); /* |p-a| */
321 678248 : pb = abs_i16(pb); /* |p-b| */
322 678248 : pc = abs_i16(pc); /* |p-c| */
323 :
324 1356500 : smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
325 :
326 : /* Paeth breaks ties favoring a over b over c. */
327 : nearest =
328 1356500 : if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
329 : if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
330 :
331 : /* Note `_epi8`: we need addition to wrap modulo 255. */
332 678248 : d = _mm_add_epi8(d, nearest);
333 678248 : store3(row, _mm_packus_epi16(d, d));
334 :
335 678248 : input += 3;
336 678248 : prev += 3;
337 678248 : row += 3;
338 678248 : rb -= 3;
339 : }
340 5189 : if (rb > 0)
341 : {
342 : /* It's easiest to do this math (particularly, deal with pc) with 16-bit
343 : * intermediates.
344 : */
345 : __m128i pa, pb, pc, smallest, nearest;
346 5189 : c = b;
347 5189 : b = _mm_unpacklo_epi8(load3(prev), zero);
348 5189 : a = d;
349 10378 : d = _mm_unpacklo_epi8(load3(input), zero);
350 :
351 : /* (p-a) == (a+b-c - a) == (b-c) */
352 5189 : pa = _mm_sub_epi16(b, c);
353 :
354 : /* (p-b) == (a+b-c - b) == (a-c) */
355 5189 : pb = _mm_sub_epi16(a, c);
356 :
357 : /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
358 5189 : pc = _mm_add_epi16(pa, pb);
359 :
360 5189 : pa = abs_i16(pa); /* |p-a| */
361 5189 : pb = abs_i16(pb); /* |p-b| */
362 5189 : pc = abs_i16(pc); /* |p-c| */
363 :
364 10378 : smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
365 :
366 : /* Paeth breaks ties favoring a over b over c. */
367 : nearest =
368 10378 : if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
369 : if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
370 :
371 : /* Note `_epi8`: we need addition to wrap modulo 255. */
372 5189 : d = _mm_add_epi8(d, nearest);
373 5189 : store3(row, _mm_packus_epi16(d, d));
374 :
375 : // input += 3;
376 : // prev += 3;
377 : // row += 3;
378 : // rb -= 3;
379 : }
380 5189 : }
381 :
382 46637 : static void gdal_png_read_filter_row_paeth4_sse2(png_row_infop row_info,
383 : const GByte *input, GByte *row,
384 : const GByte *prev)
385 : {
386 : /* Paeth tries to predict pixel d using the pixel to the left of it, a,
387 : * and two pixels from the previous row, b and c:
388 : * prev: c b
389 : * row: a d
390 : * The Paeth function predicts d to be whichever of a, b, or c is nearest to
391 : * p=a+b-c.
392 : *
393 : * The first pixel has no left context, and so uses an Up filter, p = b.
394 : * This works naturally with our main loop's p = a+b-c if we force a and c
395 : * to zero.
396 : * Here we zero b and d, which become c and a respectively at the start of
397 : * the loop.
398 : */
399 : size_t rb;
400 46637 : const __m128i zero = _mm_setzero_si128();
401 : __m128i pa, pb, pc, smallest, nearest;
402 46637 : __m128i c, b = zero, a, d = zero;
403 :
404 : png_debug(1, "in png_read_filter_row_paeth4_sse2");
405 :
406 46637 : rb = row_info->rowbytes + 4;
407 11928100 : while (rb > 4)
408 : {
409 : /* It's easiest to do this math (particularly, deal with pc) with 16-bit
410 : * intermediates.
411 : */
412 11881500 : c = b;
413 11881500 : b = _mm_unpacklo_epi8(load4(prev), zero);
414 11881500 : a = d;
415 23763000 : d = _mm_unpacklo_epi8(load4(input), zero);
416 :
417 : /* (p-a) == (a+b-c - a) == (b-c) */
418 11881500 : pa = _mm_sub_epi16(b, c);
419 :
420 : /* (p-b) == (a+b-c - b) == (a-c) */
421 11881500 : pb = _mm_sub_epi16(a, c);
422 :
423 : /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
424 11881500 : pc = _mm_add_epi16(pa, pb);
425 :
426 11881500 : pa = abs_i16(pa); /* |p-a| */
427 11881500 : pb = abs_i16(pb); /* |p-b| */
428 11881500 : pc = abs_i16(pc); /* |p-c| */
429 :
430 23763000 : smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
431 :
432 : /* Paeth breaks ties favoring a over b over c. */
433 : nearest =
434 23763000 : if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
435 : if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
436 :
437 : /* Note `_epi8`: we need addition to wrap modulo 255. */
438 11881500 : d = _mm_add_epi8(d, nearest);
439 11881500 : store4(row, _mm_packus_epi16(d, d));
440 :
441 11881500 : input += 4;
442 11881500 : prev += 4;
443 11881500 : row += 4;
444 11881500 : rb -= 4;
445 : }
446 46637 : }
|