Line data Source code
1 : /* filter_sse2_intrinsics.c - SSE2 optimized filter functions
2 : *
3 : * Copyright (c) 2018 Cosmin Truta
4 : * Copyright (c) 2016-2017 Glenn Randers-Pehrson
5 : * Written by Mike Klein and Matt Sarett
6 : * Derived from arm/filter_neon_intrinsics.c
7 : *
8 : * This code is released under the libpng license.
9 : * For conditions of distribution and use, see the disclaimer
10 : * and license in png.h
11 : */
12 :
13 : #ifndef png_debug
14 : #define png_debug(x, y)
15 : #endif
16 :
17 : #ifndef PNG_INTEL_SSE_IMPLEMENTATION
18 : #if defined(USE_NEON_OPTIMIZATIONS)
19 : #define PNG_INTEL_SSE_IMPLEMENTATION 3
20 : #elif defined(__SSE4_1__) || defined(__AVX__)
21 : /* We are not actually using AVX, but checking for AVX is the best
22 : way we can detect SSE4.1 and SSSE3 on MSVC.
23 : */
24 : #define PNG_INTEL_SSE_IMPLEMENTATION 3
25 : #elif defined(__SSSE3__)
26 : #define PNG_INTEL_SSE_IMPLEMENTATION 2
27 : #elif defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || \
28 : (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
29 : #define PNG_INTEL_SSE_IMPLEMENTATION 1
30 : #else
31 : #define PNG_INTEL_SSE_IMPLEMENTATION 0
32 : #endif
33 : #endif
34 :
35 : #if defined(USE_NEON_OPTIMIZATIONS)
36 : #include "include_sse2neon.h"
37 : #else
38 : #include <immintrin.h>
39 : #endif
40 :
41 : /* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
42 : * They're positioned like this:
43 : * prev: c b
44 : * row: a d
45 : * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
46 : * whichever of a, b, or c is closest to p=a+b-c.
47 : */
48 :
49 64914300 : static __m128i load4(const void *p)
50 : {
51 : int tmp;
52 64914300 : memcpy(&tmp, p, sizeof(tmp));
53 129829000 : return _mm_cvtsi32_si128(tmp);
54 : }
55 :
56 15314900 : static void store4(void *p, __m128i v)
57 : {
58 15314900 : int tmp = _mm_cvtsi128_si32(v);
59 15314900 : memcpy(p, &tmp, sizeof(int));
60 15314900 : }
61 :
62 163404 : static __m128i load3(const void *p)
63 : {
64 163404 : png_uint_32 tmp = 0;
65 163404 : memcpy(&tmp, p, 3);
66 326808 : return _mm_cvtsi32_si128(tmp);
67 : }
68 :
69 18768100 : static void store3(void *p, __m128i v)
70 : {
71 18768100 : int tmp = _mm_cvtsi128_si32(v);
72 18768100 : memcpy(p, &tmp, 3);
73 18768100 : }
74 :
75 4389 : static void gdal_png_read_filter_row_sub3_sse2(png_row_infop row_info,
76 : const GByte *input, GByte *row)
77 : {
78 : /* The Sub filter predicts each pixel as the previous pixel, a.
79 : * There is no pixel to the left of the first pixel. It's encoded directly.
80 : * That works with our main loop if we just say that left pixel was zero.
81 : */
82 : size_t rb;
83 :
84 4389 : __m128i a, d = _mm_setzero_si128();
85 :
86 : png_debug(1, "in png_read_filter_row_sub3_sse2");
87 :
88 4389 : rb = row_info->rowbytes;
89 447937 : while (rb >= 4)
90 : {
91 443548 : a = d;
92 443548 : d = load4(input);
93 443548 : d = _mm_add_epi8(d, a);
94 443548 : store3(row, d);
95 :
96 443548 : input += 3;
97 443548 : row += 3;
98 443548 : rb -= 3;
99 : }
100 4389 : if (rb > 0)
101 : {
102 4389 : a = d;
103 4389 : d = load3(input);
104 4389 : d = _mm_add_epi8(d, a);
105 4389 : store3(row, d);
106 :
107 : //input += 3;
108 : //row += 3;
109 : //rb -= 3;
110 : }
111 4389 : }
112 :
113 6721 : static void gdal_png_read_filter_row_sub4_sse2(png_row_infop row_info,
114 : const GByte *input, GByte *row)
115 : {
116 : /* The Sub filter predicts each pixel as the previous pixel, a.
117 : * There is no pixel to the left of the first pixel. It's encoded directly.
118 : * That works with our main loop if we just say that left pixel was zero.
119 : */
120 : size_t rb;
121 :
122 6721 : __m128i a, d = _mm_setzero_si128();
123 :
124 : png_debug(1, "in png_read_filter_row_sub4_sse2");
125 :
126 6721 : rb = row_info->rowbytes + 4;
127 1790120 : while (rb > 4)
128 : {
129 1783400 : a = d;
130 1783400 : d = load4(input);
131 1783400 : d = _mm_add_epi8(d, a);
132 1783400 : store4(row, d);
133 :
134 1783400 : input += 4;
135 1783400 : row += 4;
136 1783400 : rb -= 4;
137 : }
138 6721 : }
139 :
140 5341 : static void gdal_png_read_filter_row_avg3_sse2(png_row_infop row_info,
141 : const GByte *input, GByte *row,
142 : const GByte *prev)
143 : {
144 : /* The Avg filter predicts each pixel as the (truncated) average of a and b.
145 : * There's no pixel to the left of the first pixel. Luckily, it's
146 : * predicted to be half of the pixel above it. So again, this works
147 : * perfectly with our loop if we make sure a starts at zero.
148 : */
149 :
150 : size_t rb;
151 :
152 5341 : const __m128i zero = _mm_setzero_si128();
153 :
154 : __m128i b;
155 5341 : __m128i a, d = zero;
156 :
157 : png_debug(1, "in png_read_filter_row_avg3_sse2");
158 5341 : rb = row_info->rowbytes;
159 847466 : while (rb >= 4)
160 : {
161 : __m128i avg;
162 842125 : b = load4(prev);
163 842125 : a = d;
164 842125 : d = load4(input);
165 :
166 : /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
167 : */
168 842125 : avg = _mm_avg_epu8(a, b);
169 : /* ...but we can fix it up by subtracting off 1 if it rounded up. */
170 3368500 : avg = _mm_sub_epi8(
171 : avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
172 842125 : d = _mm_add_epi8(d, avg);
173 842125 : store3(row, d);
174 :
175 842125 : input += 3;
176 842125 : prev += 3;
177 842125 : row += 3;
178 842125 : rb -= 3;
179 : }
180 5341 : if (rb > 0)
181 : {
182 : __m128i avg;
183 5341 : b = load3(prev);
184 5341 : a = d;
185 5341 : d = load3(input);
186 :
187 : /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
188 : */
189 5341 : avg = _mm_avg_epu8(a, b);
190 : /* ...but we can fix it up by subtracting off 1 if it rounded up. */
191 21364 : avg = _mm_sub_epi8(
192 : avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
193 :
194 5341 : d = _mm_add_epi8(d, avg);
195 5341 : store3(row, d);
196 :
197 : // input += 3;
198 : // prev += 3;
199 : // row += 3;
200 : // rb -= 3;
201 : }
202 5341 : }
203 :
204 4581 : static void gdal_png_read_filter_row_avg4_sse2(png_row_infop row_info,
205 : const GByte *input, GByte *row,
206 : const GByte *prev)
207 : {
208 : /* The Avg filter predicts each pixel as the (truncated) average of a and b.
209 : * There's no pixel to the left of the first pixel. Luckily, it's
210 : * predicted to be half of the pixel above it. So again, this works
211 : * perfectly with our loop if we make sure a starts at zero.
212 : */
213 : size_t rb;
214 4581 : const __m128i zero = _mm_setzero_si128();
215 : __m128i b;
216 4581 : __m128i a, d = zero;
217 :
218 : png_debug(1, "in png_read_filter_row_avg4_sse2");
219 :
220 4581 : rb = row_info->rowbytes + 4;
221 1259590 : while (rb > 4)
222 : {
223 : __m128i avg;
224 1255010 : b = load4(prev);
225 1255010 : a = d;
226 1255010 : d = load4(input);
227 :
228 : /* PNG requires a truncating average, so we can't just use _mm_avg_epu8
229 : */
230 1255010 : avg = _mm_avg_epu8(a, b);
231 : /* ...but we can fix it up by subtracting off 1 if it rounded up. */
232 5020040 : avg = _mm_sub_epi8(
233 : avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));
234 :
235 1255010 : d = _mm_add_epi8(d, avg);
236 1255010 : store4(row, d);
237 :
238 1255010 : input += 4;
239 1255010 : prev += 4;
240 1255010 : row += 4;
241 1255010 : rb -= 4;
242 : }
243 4581 : }
244 :
245 : /* Returns |x| for 16-bit lanes. */
246 86633400 : static __m128i abs_i16(__m128i x)
247 : {
248 : #if PNG_INTEL_SSE_IMPLEMENTATION >= 2
249 : return _mm_abs_epi16(x);
250 : #else
251 : /* Read this all as, return x<0 ? -x : x.
252 : * To negate two's complement, you flip all the bits then add 1.
253 : */
254 173267000 : __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());
255 :
256 : /* Flip negative lanes. */
257 86633400 : x = _mm_xor_si128(x, is_negative);
258 :
259 : /* +1 to negative lanes, else +0. */
260 86633400 : x = _mm_sub_epi16(x, is_negative);
261 86633400 : return x;
262 : #endif
263 : }
264 :
265 : /* Bytewise c ? t : e. */
266 57872500 : static __m128i if_then_else(__m128i c, __m128i t, __m128i e)
267 : {
268 : #if PNG_INTEL_SSE_IMPLEMENTATION >= 3
269 : return _mm_blendv_epi8(e, t, c);
270 : #else
271 175544000 : return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));
272 : #endif
273 : }
274 :
275 74087 : static void gdal_png_read_filter_row_paeth3_sse2(png_row_infop row_info,
276 : const GByte *input, GByte *row,
277 : const GByte *prev)
278 : {
279 : /* Paeth tries to predict pixel d using the pixel to the left of it, a,
280 : * and two pixels from the previous row, b and c:
281 : * prev: c b
282 : * row: a d
283 : * The Paeth function predicts d to be whichever of a, b, or c is nearest to
284 : * p=a+b-c.
285 : *
286 : * The first pixel has no left context, and so uses an Up filter, p = b.
287 : * This works naturally with our main loop's p = a+b-c if we force a and c
288 : * to zero.
289 : * Here we zero b and d, which become c and a respectively at the start of
290 : * the loop.
291 : */
292 : size_t rb;
293 74087 : const __m128i zero = _mm_setzero_si128();
294 74087 : __m128i c, b = zero, a, d = zero;
295 :
296 : png_debug(1, "in png_read_filter_row_paeth3_sse2");
297 :
298 74087 : rb = row_info->rowbytes;
299 17507900 : while (rb >= 4)
300 : {
301 : /* It's easiest to do this math (particularly, deal with pc) with 16-bit
302 : * intermediates.
303 : */
304 : __m128i pa, pb, pc, smallest, nearest;
305 17433700 : c = b;
306 17433700 : b = _mm_unpacklo_epi8(load4(prev), zero);
307 17611300 : a = d;
308 35240900 : d = _mm_unpacklo_epi8(load4(input), zero);
309 :
310 : /* (p-a) == (a+b-c - a) == (b-c) */
311 :
312 17629600 : pa = _mm_sub_epi16(b, c);
313 :
314 : /* (p-b) == (a+b-c - b) == (a-c) */
315 17629600 : pb = _mm_sub_epi16(a, c);
316 :
317 : /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
318 17629600 : pc = _mm_add_epi16(pa, pb);
319 :
320 17629600 : pa = abs_i16(pa); /* |p-a| */
321 17377400 : pb = abs_i16(pb); /* |p-b| */
322 17268800 : pc = abs_i16(pc); /* |p-c| */
323 :
324 34756900 : smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
325 :
326 : /* Paeth breaks ties favoring a over b over c. */
327 : nearest =
328 34538800 : if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
329 : if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
330 :
331 : /* Note `_epi8`: we need addition to wrap modulo 255. */
332 17283600 : d = _mm_add_epi8(d, nearest);
333 17537900 : store3(row, _mm_packus_epi16(d, d));
334 :
335 17433800 : input += 3;
336 17433800 : prev += 3;
337 17433800 : row += 3;
338 17433800 : rb -= 3;
339 : }
340 74179 : if (rb > 0)
341 : {
342 : /* It's easiest to do this math (particularly, deal with pc) with 16-bit
343 : * intermediates.
344 : */
345 : __m128i pa, pb, pc, smallest, nearest;
346 74172 : c = b;
347 74172 : b = _mm_unpacklo_epi8(load3(prev), zero);
348 74182 : a = d;
349 148367 : d = _mm_unpacklo_epi8(load3(input), zero);
350 :
351 : /* (p-a) == (a+b-c - a) == (b-c) */
352 74185 : pa = _mm_sub_epi16(b, c);
353 :
354 : /* (p-b) == (a+b-c - b) == (a-c) */
355 74185 : pb = _mm_sub_epi16(a, c);
356 :
357 : /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
358 74185 : pc = _mm_add_epi16(pa, pb);
359 :
360 74185 : pa = abs_i16(pa); /* |p-a| */
361 74181 : pb = abs_i16(pb); /* |p-b| */
362 74175 : pc = abs_i16(pc); /* |p-c| */
363 :
364 148360 : smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
365 :
366 : /* Paeth breaks ties favoring a over b over c. */
367 : nearest =
368 148357 : if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
369 : if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
370 :
371 : /* Note `_epi8`: we need addition to wrap modulo 255. */
372 74187 : d = _mm_add_epi8(d, nearest);
373 74187 : store3(row, _mm_packus_epi16(d, d));
374 :
375 : // input += 3;
376 : // prev += 3;
377 : // row += 3;
378 : // rb -= 3;
379 : }
380 74186 : }
381 :
382 48180 : static void gdal_png_read_filter_row_paeth4_sse2(png_row_infop row_info,
383 : const GByte *input, GByte *row,
384 : const GByte *prev)
385 : {
386 : /* Paeth tries to predict pixel d using the pixel to the left of it, a,
387 : * and two pixels from the previous row, b and c:
388 : * prev: c b
389 : * row: a d
390 : * The Paeth function predicts d to be whichever of a, b, or c is nearest to
391 : * p=a+b-c.
392 : *
393 : * The first pixel has no left context, and so uses an Up filter, p = b.
394 : * This works naturally with our main loop's p = a+b-c if we force a and c
395 : * to zero.
396 : * Here we zero b and d, which become c and a respectively at the start of
397 : * the loop.
398 : */
399 : size_t rb;
400 48180 : const __m128i zero = _mm_setzero_si128();
401 : __m128i pa, pb, pc, smallest, nearest;
402 48180 : __m128i c, b = zero, a, d = zero;
403 :
404 : png_debug(1, "in png_read_filter_row_paeth4_sse2");
405 :
406 48180 : rb = row_info->rowbytes + 4;
407 12324700 : while (rb > 4)
408 : {
409 : /* It's easiest to do this math (particularly, deal with pc) with 16-bit
410 : * intermediates.
411 : */
412 12276500 : c = b;
413 12276500 : b = _mm_unpacklo_epi8(load4(prev), zero);
414 12276500 : a = d;
415 24553000 : d = _mm_unpacklo_epi8(load4(input), zero);
416 :
417 : /* (p-a) == (a+b-c - a) == (b-c) */
418 12276500 : pa = _mm_sub_epi16(b, c);
419 :
420 : /* (p-b) == (a+b-c - b) == (a-c) */
421 12276500 : pb = _mm_sub_epi16(a, c);
422 :
423 : /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
424 12276500 : pc = _mm_add_epi16(pa, pb);
425 :
426 12276500 : pa = abs_i16(pa); /* |p-a| */
427 12276500 : pb = abs_i16(pb); /* |p-b| */
428 12276500 : pc = abs_i16(pc); /* |p-c| */
429 :
430 24553000 : smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
431 :
432 : /* Paeth breaks ties favoring a over b over c. */
433 : nearest =
434 24553000 : if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
435 : if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));
436 :
437 : /* Note `_epi8`: we need addition to wrap modulo 255. */
438 12276500 : d = _mm_add_epi8(d, nearest);
439 12276500 : store4(row, _mm_packus_epi16(d, d));
440 :
441 12276500 : input += 4;
442 12276500 : prev += 4;
443 12276500 : row += 4;
444 12276500 : rb -= 4;
445 : }
446 48180 : }
|