Line data Source code
1 : /* filter_sse2_intrinsics.c - SSE2 optimized filter functions
2 : *
3 : * Copyright (c) 2018 Cosmin Truta
4 : * Copyright (c) 2016-2017 Glenn Randers-Pehrson
5 : * Written by Mike Klein and Matt Sarett
6 : * Derived from arm/filter_neon_intrinsics.c
7 : *
8 : * This code is released under the libpng license.
9 : * For conditions of distribution and use, see the disclaimer
10 : * and license in png.h
11 : */
12 :
13 : #ifndef png_debug
14 : #define png_debug(x, y)
15 : #endif
16 :
17 : #ifndef PNG_INTEL_SSE_IMPLEMENTATION
18 : #if defined(__SSE4_1__) || defined(__AVX__)
19 : /* We are not actually using AVX, but checking for AVX is the best
20 : way we can detect SSE4.1 and SSSE3 on MSVC.
21 : */
22 : #define PNG_INTEL_SSE_IMPLEMENTATION 3
23 : #elif defined(__SSSE3__)
24 : #define PNG_INTEL_SSE_IMPLEMENTATION 2
25 : #elif defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || \
26 : (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
27 : #define PNG_INTEL_SSE_IMPLEMENTATION 1
28 : #else
29 : #define PNG_INTEL_SSE_IMPLEMENTATION 0
30 : #endif
31 : #endif
32 :
33 : #include <immintrin.h>
34 :
35 : /* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
36 : * They're positioned like this:
37 : * prev: c b
38 : * row: a d
39 : * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
40 : * whichever of a, b, or c is closest to p=a+b-c.
41 : */
42 :
43 30162300 : static __m128i load4(const void *p)
44 : {
45 : int tmp;
46 30162300 : memcpy(&tmp, p, sizeof(tmp));
47 60324700 : return _mm_cvtsi32_si128(tmp);
48 : }
49 :
50 14621100 : static void store4(void *p, __m128i v)
51 : {
52 14621100 : int tmp = _mm_cvtsi128_si32(v);
53 14621100 : memcpy(p, &tmp, sizeof(int));
54 14621100 : }
55 :
56 21899 : static __m128i load3(const void *p)
57 : {
58 21899 : png_uint_32 tmp = 0;
59 21899 : memcpy(&tmp, p, 3);
60 43798 : return _mm_cvtsi32_si128(tmp);
61 : }
62 :
63 1464270 : static void store3(void *p, __m128i v)
64 : {
65 1464270 : int tmp = _mm_cvtsi128_si32(v);
66 1464270 : memcpy(p, &tmp, 3);
67 1464270 : }
68 :
69 3921 : static void gdal_png_read_filter_row_sub3_sse2(png_row_infop row_info,
70 : const GByte *input, GByte *row)
71 : {
72 : /* The Sub filter predicts each pixel as the previous pixel, a.
73 : * There is no pixel to the left of the first pixel. It's encoded directly.
74 : * That works with our main loop if we just say that left pixel was zero.
75 : */
76 : size_t rb;
77 :
78 3921 : __m128i a, d = _mm_setzero_si128();
79 :
80 : png_debug(1, "in png_read_filter_row_sub3_sse2");
81 :
82 3921 : rb = row_info->rowbytes;
83 328129 : while (rb >= 4)
84 : {
85 324208 : a = d;
86 324208 : d = load4(input);
87 324208 : d = _mm_add_epi8(d, a);
88 324208 : store3(row, d);
89 :
90 324208 : input += 3;
91 324208 : row += 3;
92 324208 : rb -= 3;
93 : }
94 3921 : if (rb > 0)
95 : {
96 3921 : a = d;
97 3921 : d = load3(input);
98 3921 : d = _mm_add_epi8(d, a);
99 3921 : store3(row, d);
100 :
101 : //input += 3;
102 : //row += 3;
103 : //rb -= 3;
104 : }
105 3921 : }
106 :
107 6233 : static void gdal_png_read_filter_row_sub4_sse2(png_row_infop row_info,
108 : const GByte *input, GByte *row)
109 : {
110 : /* The Sub filter predicts each pixel as the previous pixel, a.
111 : * There is no pixel to the left of the first pixel. It's encoded directly.
112 : * That works with our main loop if we just say that left pixel was zero.
113 : */
114 : size_t rb;
115 :
116 6233 : __m128i a, d = _mm_setzero_si128();
117 :
118 : png_debug(1, "in png_read_filter_row_sub4_sse2");
119 :
120 6233 : rb = row_info->rowbytes + 4;
121 1664710 : while (rb > 4)
122 : {
123 1658470 : a = d;
124 1658470 : d = load4(input);
125 1658470 : d = _mm_add_epi8(d, a);
126 1658470 : store4(row, d);
127 :
128 1658470 : input += 4;
129 1658470 : row += 4;
130 1658470 : rb -= 4;
131 : }
132 6233 : }
133 :
134 3800 : static void gdal_png_read_filter_row_avg3_sse2(png_row_infop row_info,
135 : const GByte *input, GByte *row,
136 : const GByte *prev)
137 : {
138 : /* The Avg filter predicts each pixel as the (truncated) average of a and b.
139 : * There's no pixel to the left of the first pixel. Luckily, it's
140 : * predicted to be half of the pixel above it. So again, this works
141 : * perfectly with our loop if we make sure a starts at zero.
142 : */
143 :
144 : size_t rb;
145 :
146 3800 : const __m128i zero = _mm_setzero_si128();
147 :
148 : __m128i b;
149 3800 : __m128i a, d = zero;
150 :
151 : png_debug(1, "in png_read_filter_row_avg3_sse2");
152 3800 : rb = row_info->rowbytes;
153 452700 : while (rb >= 4)
154 : {
155 : __m128i avg;
156 448900 : b = load4(prev);
157 448900 : a = d;
158 448900 : d = load4(input);
159 :
160 : /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
161 : */
162 448900 : avg = _mm_avg_epu8(a, b);
163 : /* ...but we can fix it up by subtracting off 1 if it rounded up. */
164 1795600 : avg = _mm_sub_epi8(
165 : avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
166 448900 : d = _mm_add_epi8(d, avg);
167 448900 : store3(row, d);
168 :
169 448900 : input += 3;
170 448900 : prev += 3;
171 448900 : row += 3;
172 448900 : rb -= 3;
173 : }
174 3800 : if (rb > 0)
175 : {
176 : __m128i avg;
177 3800 : b = load3(prev);
178 3800 : a = d;
179 3800 : d = load3(input);
180 :
181 : /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
182 : */
183 3800 : avg = _mm_avg_epu8(a, b);
184 : /* ...but we can fix it up by subtracting off 1 if it rounded up. */
185 15200 : avg = _mm_sub_epi8(
186 : avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
187 :
188 3800 : d = _mm_add_epi8(d, avg);
189 3800 : store3(row, d);
190 :
191 : // input += 3;
192 : // prev += 3;
193 : // row += 3;
194 : // rb -= 3;
195 : }
196 3800 : }
197 :
198 3902 : static void gdal_png_read_filter_row_avg4_sse2(png_row_infop row_info,
199 : const GByte *input, GByte *row,
200 : const GByte *prev)
201 : {
202 : /* The Avg filter predicts each pixel as the (truncated) average of a and b.
203 : * There's no pixel to the left of the first pixel. Luckily, it's
204 : * predicted to be half of the pixel above it. So again, this works
205 : * perfectly with our loop if we make sure a starts at zero.
206 : */
207 : size_t rb;
208 3902 : const __m128i zero = _mm_setzero_si128();
209 : __m128i b;
210 3902 : __m128i a, d = zero;
211 :
212 : png_debug(1, "in png_read_filter_row_avg4_sse2");
213 :
214 3902 : rb = row_info->rowbytes + 4;
215 1085090 : while (rb > 4)
216 : {
217 : __m128i avg;
218 1081190 : b = load4(prev);
219 1081190 : a = d;
220 1081190 : d = load4(input);
221 :
222 : /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
223 : */
224 1081190 : avg = _mm_avg_epu8(a, b);
225 : /* ...but we can fix it up by subtracting off 1 if it rounded up. */
226 4324740 : avg = _mm_sub_epi8(
227 : avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
228 :
229 1081190 : d = _mm_add_epi8(d, avg);
230 1081190 : store4(row, d);
231 :
232 1081190 : input += 4;
233 1081190 : prev += 4;
234 1081190 : row += 4;
235 1081190 : rb -= 4;
236 : }
237 3902 : }
238 :
239 : /* Returns |x| for 16-bit lanes. */
240 37694800 : static __m128i abs_i16(__m128i x)
241 : {
242 : #if PNG_INTEL_SSE_IMPLEMENTATION >= 2
243 : return _mm_abs_epi16(x);
244 : #else
245 : /* Read this all as, return x<0 ? -x : x.
246 : * To negate two's complement, you flip all the bits then add 1.
247 : */
248 75389600 : __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());
249 :
250 : /* Flip negative lanes. */
251 37694800 : x = _mm_xor_si128(x, is_negative);
252 :
253 : /* +1 to negative lanes, else +0. */
254 37694800 : x = _mm_sub_epi16(x, is_negative);
255 37694800 : return x;
256 : #endif
257 : }
258 :
259 : /* Bytewise c ? t : e. */
260 25129900 : static __m128i if_then_else(__m128i c, __m128i t, __m128i e)
261 : {
262 : #if PNG_INTEL_SSE_IMPLEMENTATION >= 3
263 : return _mm_blendv_epi8(e, t, c);
264 : #else
265 75389600 : return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));
266 : #endif
267 : }
268 :
269 5189 : static void gdal_png_read_filter_row_paeth3_sse2(png_row_infop row_info,
270 : const GByte *input, GByte *row,
271 : const GByte *prev)
272 : {
273 : /* Paeth tries to predict pixel d using the pixel to the left of it, a,
274 : * and two pixels from the previous row, b and c:
275 : * prev: c b
276 : * row: a d
277 : * The Paeth function predicts d to be whichever of a, b, or c is nearest to
278 : * p=a+b-c.
279 : *
280 : * The first pixel has no left context, and so uses an Up filter, p = b.
281 : * This works naturally with our main loop's p = a+b-c if we force a and c
282 : * to zero.
283 : * Here we zero b and d, which become c and a respectively at the start of
284 : * the loop.
285 : */
286 : size_t rb;
287 5189 : const __m128i zero = _mm_setzero_si128();
288 5189 : __m128i c, b = zero, a, d = zero;
289 :
290 : png_debug(1, "in png_read_filter_row_paeth3_sse2");
291 :
292 5189 : rb = row_info->rowbytes;
293 683437 : while (rb >= 4)
294 : {
295 : /* It's easiest to do this math (particularly, deal with pc) with 16-bit
296 : * intermediates.
297 : */
298 : __m128i pa, pb, pc, smallest, nearest;
299 678248 : c = b;
300 678248 : b = _mm_unpacklo_epi8(load4(prev), zero);
301 678248 : a = d;
302 1356500 : d = _mm_unpacklo_epi8(load4(input), zero);
303 :
304 : /* (p-a) == (a+b-c - a) == (b-c) */
305 :
306 678248 : pa = _mm_sub_epi16(b, c);
307 :
308 : /* (p-b) == (a+b-c - b) == (a-c) */
309 678248 : pb = _mm_sub_epi16(a, c);
310 :
311 : /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
312 678248 : pc = _mm_add_epi16(pa, pb);
313 :
314 678248 : pa = abs_i16(pa); /* |p-a| */
315 678248 : pb = abs_i16(pb); /* |p-b| */
316 678248 : pc = abs_i16(pc); /* |p-c| */
317 :
318 1356500 : smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
319 :
320 : /* Paeth breaks ties favoring a over b over c. */
321 : nearest =
322 1356500 : if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
323 : if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
324 :
325 : /* Note `_epi8`: we need addition to wrap modulo 255. */
326 678248 : d = _mm_add_epi8(d, nearest);
327 678248 : store3(row, _mm_packus_epi16(d, d));
328 :
329 678248 : input += 3;
330 678248 : prev += 3;
331 678248 : row += 3;
332 678248 : rb -= 3;
333 : }
334 5189 : if (rb > 0)
335 : {
336 : /* It's easiest to do this math (particularly, deal with pc) with 16-bit
337 : * intermediates.
338 : */
339 : __m128i pa, pb, pc, smallest, nearest;
340 5189 : c = b;
341 5189 : b = _mm_unpacklo_epi8(load3(prev), zero);
342 5189 : a = d;
343 10378 : d = _mm_unpacklo_epi8(load3(input), zero);
344 :
345 : /* (p-a) == (a+b-c - a) == (b-c) */
346 5189 : pa = _mm_sub_epi16(b, c);
347 :
348 : /* (p-b) == (a+b-c - b) == (a-c) */
349 5189 : pb = _mm_sub_epi16(a, c);
350 :
351 : /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
352 5189 : pc = _mm_add_epi16(pa, pb);
353 :
354 5189 : pa = abs_i16(pa); /* |p-a| */
355 5189 : pb = abs_i16(pb); /* |p-b| */
356 5189 : pc = abs_i16(pc); /* |p-c| */
357 :
358 10378 : smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
359 :
360 : /* Paeth breaks ties favoring a over b over c. */
361 : nearest =
362 10378 : if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
363 : if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
364 :
365 : /* Note `_epi8`: we need addition to wrap modulo 255. */
366 5189 : d = _mm_add_epi8(d, nearest);
367 5189 : store3(row, _mm_packus_epi16(d, d));
368 :
369 : // input += 3;
370 : // prev += 3;
371 : // row += 3;
372 : // rb -= 3;
373 : }
374 5189 : }
375 :
376 46637 : static void gdal_png_read_filter_row_paeth4_sse2(png_row_infop row_info,
377 : const GByte *input, GByte *row,
378 : const GByte *prev)
379 : {
380 : /* Paeth tries to predict pixel d using the pixel to the left of it, a,
381 : * and two pixels from the previous row, b and c:
382 : * prev: c b
383 : * row: a d
384 : * The Paeth function predicts d to be whichever of a, b, or c is nearest to
385 : * p=a+b-c.
386 : *
387 : * The first pixel has no left context, and so uses an Up filter, p = b.
388 : * This works naturally with our main loop's p = a+b-c if we force a and c
389 : * to zero.
390 : * Here we zero b and d, which become c and a respectively at the start of
391 : * the loop.
392 : */
393 : size_t rb;
394 46637 : const __m128i zero = _mm_setzero_si128();
395 : __m128i pa, pb, pc, smallest, nearest;
396 46637 : __m128i c, b = zero, a, d = zero;
397 :
398 : png_debug(1, "in png_read_filter_row_paeth4_sse2");
399 :
400 46637 : rb = row_info->rowbytes + 4;
401 11928100 : while (rb > 4)
402 : {
403 : /* It's easiest to do this math (particularly, deal with pc) with 16-bit
404 : * intermediates.
405 : */
406 11881500 : c = b;
407 11881500 : b = _mm_unpacklo_epi8(load4(prev), zero);
408 11881500 : a = d;
409 23763000 : d = _mm_unpacklo_epi8(load4(input), zero);
410 :
411 : /* (p-a) == (a+b-c - a) == (b-c) */
412 11881500 : pa = _mm_sub_epi16(b, c);
413 :
414 : /* (p-b) == (a+b-c - b) == (a-c) */
415 11881500 : pb = _mm_sub_epi16(a, c);
416 :
417 : /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
418 11881500 : pc = _mm_add_epi16(pa, pb);
419 :
420 11881500 : pa = abs_i16(pa); /* |p-a| */
421 11881500 : pb = abs_i16(pb); /* |p-b| */
422 11881500 : pc = abs_i16(pc); /* |p-c| */
423 :
424 23763000 : smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
425 :
426 : /* Paeth breaks ties favoring a over b over c. */
427 : nearest =
428 23763000 : if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
429 : if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
430 :
431 : /* Note `_epi8`: we need addition to wrap modulo 255. */
432 11881500 : d = _mm_add_epi8(d, nearest);
433 11881500 : store4(row, _mm_packus_epi16(d, d));
434 :
435 11881500 : input += 4;
436 11881500 : prev += 4;
437 11881500 : row += 4;
438 11881500 : rb -= 4;
439 : }
440 46637 : }
|