Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: GDAL Core
4 : * Purpose: SSSE3 specializations
5 : * Author: Even Rouault <even dot rouault at spatialys dot com>
6 : *
7 : ******************************************************************************
8 : * Copyright (c) 2016, Even Rouault <even dot rouault at spatialys dot com>
9 : *
10 : * SPDX-License-Identifier: MIT
11 : ****************************************************************************/
12 :
13 : #include "cpl_port.h"
14 :
15 : #include <algorithm>
16 :
17 : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
18 : (defined(__x86_64) || defined(_M_X64))) || \
19 : defined(USE_NEON_OPTIMIZATIONS)
20 :
21 : #include "rasterio_ssse3.h"
22 :
23 : #ifdef USE_NEON_OPTIMIZATIONS
24 : #include "include_sse2neon.h"
25 : #else
26 : #include <tmmintrin.h>
27 : #endif
28 :
29 : #include "gdal_priv_templates.hpp"
30 :
// Copy one byte out of every 3, i.e. extract the first band of a
// 3-band byte-interleaved buffer: pDest[i] = pSrc[3 * i].
// pSrc must contain at least 3 * nIters bytes. The loop bound below is
// deliberately conservative (nIters - 16 rather than nIters - 15) so the
// three 16-byte loads never read past the 3 * nIters input bytes.
void GDALUnrolledCopy_GByte_3_1_SSSE3(GByte *CPL_RESTRICT pDest,
                                      const GByte *CPL_RESTRICT pSrc,
                                      GPtrDiff_t nIters)
{
    decltype(nIters) i;
    // Each mask gathers the bytes at indices that are multiples of 3
    // from one 16-byte source register; a -1 index produces a 0 byte.
    const __m128i xmm_shuffle0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
                                              -1, -1, 15, 12, 9, 6, 3, 0);
    const __m128i xmm_shuffle1 = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11, 8, 5,
                                              2, -1, -1, -1, -1, -1, -1);
    const __m128i xmm_shuffle2 = _mm_set_epi8(13, 10, 7, 4, 1, -1, -1, -1, -1,
                                              -1, -1, -1, -1, -1, -1, -1);
    // If we were sure that there would always be 2 trailing bytes, we could
    // check against nIters - 15
    for (i = 0; i < nIters - 16; i += 16)
    {
        __m128i xmm0 =
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
        __m128i xmm1 =
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
        __m128i xmm2 =
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));

        // From LSB to MSB:
        // 0,x,x,1,x,x,2,x,x,3,x,x,4,x,x,5 -->
        // 0,1,2,3,4,5,0,0,0,0,0,0,0,0,0,0
        xmm0 = _mm_shuffle_epi8(xmm0, xmm_shuffle0);
        // x,x,6,x,x,7,x,x,8,x,x,9,x,x,10,x --> 0,0,0,0,0,0,6,7,8,9,10,0,0,0,0,0
        xmm1 = _mm_shuffle_epi8(xmm1, xmm_shuffle1);
        // x,11,x,x,12,x,x,13,x,x,14,x,x,15,x,x -->
        // 0,0,0,0,0,0,0,0,0,0,0,11,12,13,14,15
        xmm2 = _mm_shuffle_epi8(xmm2, xmm_shuffle2);
        // OR the three disjoint partial results into the 16 output bytes.
        xmm0 = _mm_or_si128(xmm0, xmm1);
        xmm0 = _mm_or_si128(xmm0, xmm2);

        _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);

        pSrc += 3 * 16;
    }
    // Scalar tail for the remaining (at most 16) elements.
    for (; i < nIters; i++)
    {
        pDest[i] = *pSrc;
        pSrc += 3;
    }
}
74 :
75 : /************************************************************************/
76 : /* GDALDeinterleave3Byte_SSSE3() */
77 : /************************************************************************/
78 :
#if defined(__GNUC__) && !defined(__clang__)
// GCC autovectorizer does an excellent job
// Deinterleave a pixel-interleaved 3-byte buffer (e.g. RGBRGB...) into
// 3 separate band buffers of nIters bytes each.
__attribute__((optimize("tree-vectorize"))) void GDALDeinterleave3Byte_SSSE3(
    const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
    GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2, size_t nIters)
{
    for (size_t i = 0; i < nIters; ++i)
    {
        pabyDest0[i] = pabySrc[3 * i + 0];
        pabyDest1[i] = pabySrc[3 * i + 1];
        pabyDest2[i] = pabySrc[3 * i + 2];
    }
}
#else
// Deinterleave a pixel-interleaved 3-byte buffer (e.g. RGBRGB...) into
// 3 separate band buffers of nIters bytes each, using explicit SSSE3
// intrinsics.
void GDALDeinterleave3Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
                                 GByte *CPL_RESTRICT pabyDest0,
                                 GByte *CPL_RESTRICT pabyDest1,
                                 GByte *CPL_RESTRICT pabyDest2, size_t nIters)
{
    size_t i = 0;
    // SIMD path: 16 pixels (48 source bytes) per iteration.
    for (; i + 15 < nIters; i += 16)
    {
        __m128i xmm0 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 0));
        __m128i xmm1 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 16));
        __m128i xmm2 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 32));
        // The shuffle gathers bytes 0,3,6,9 / 1,4,7,10 / 2,5,8,11 so each
        // 32-bit word Wn holds 4 consecutive values of one band.
        auto xmm0_new =
            _mm_shuffle_epi8(xmm0, _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10,
                                                7, 4, 1, 9, 6, 3, 0));
        // _mm_alignr_epi8 re-aligns the 48-byte stream (offsets 12 and 24)
        // so the same shuffle mask can be reused on the next thirds.
        auto xmm1_new = _mm_shuffle_epi8(
            _mm_alignr_epi8(xmm1, xmm0, 12),
            _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, 7, 4, 1, 9, 6, 3, 0));
        auto xmm2_new = _mm_shuffle_epi8(
            _mm_alignr_epi8(xmm2, xmm1, 8),
            _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, 7, 4, 1, 9, 6, 3, 0));
        auto xmm3_new =
            _mm_shuffle_epi8(xmm2, _mm_set_epi8(-1, -1, -1, -1, 15, 12, 9, 6,
                                                14, 11, 8, 5, 13, 10, 7, 4));

        // Word-level transpose so each destination register holds one band.
        __m128i xmm01lo =
            _mm_unpacklo_epi32(xmm0_new, xmm1_new); // W0 W4 W1 W5
        __m128i xmm01hi = _mm_unpackhi_epi32(xmm0_new, xmm1_new); // W2 W6 - -
        __m128i xmm23lo =
            _mm_unpacklo_epi32(xmm2_new, xmm3_new); // W8 WC W9 WD
        __m128i xmm23hi = _mm_unpackhi_epi32(xmm2_new, xmm3_new); // WA WE - -
        xmm0_new = _mm_unpacklo_epi64(xmm01lo, xmm23lo); // W0 W4 W8 WC
        xmm1_new = _mm_unpackhi_epi64(xmm01lo, xmm23lo); // W1 W5 W9 WD
        xmm2_new = _mm_unpacklo_epi64(xmm01hi, xmm23hi); // W2 W6 WA WE
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0_new);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1_new);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest2 + i), xmm2_new);
    }
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
    // Scalar tail for the remaining pixels.
    for (; i < nIters; ++i)
    {
        pabyDest0[i] = pabySrc[3 * i + 0];
        pabyDest1[i] = pabySrc[3 * i + 1];
        pabyDest2[i] = pabySrc[3 * i + 2];
    }
}
#endif
144 :
145 : /************************************************************************/
146 : /* GDALTranspose4x4Int32() */
147 : /************************************************************************/
148 :
// Transpose a 4x4 matrix of 32-bit words held in 4 SSE registers.
// With in0 = (in00, in01, in02, in03), in1 = (in10, in11, in12, in13),
//      in2 = (in20, in21, in22, in23), in3 = (in30, in31, in32, in33),
// produces out0 = (in00, in10, in20, in30), out1 = (in01, in11, in21, in31),
//          out2 = (in02, in12, in22, in32), out3 = (in03, in13, in23, in33).
inline void GDALTranspose4x4Int32(__m128i in0, __m128i in1, __m128i in2,
                                  __m128i in3, __m128i &out0, __m128i &out1,
                                  __m128i &out2, __m128i &out3)
{
    // Step 1: interleave row pairs at 32-bit granularity.
    const __m128i lo01 = _mm_unpacklo_epi32(in0, in1); // (in00, in10, in01, in11)
    const __m128i hi01 = _mm_unpackhi_epi32(in0, in1); // (in02, in12, in03, in13)
    const __m128i lo23 = _mm_unpacklo_epi32(in2, in3); // (in20, in30, in21, in31)
    const __m128i hi23 = _mm_unpackhi_epi32(in2, in3); // (in22, in32, in23, in33)

    // Step 2: recombine 64-bit halves to complete the transposition.
    out0 = _mm_unpacklo_epi64(lo01, lo23);
    out1 = _mm_unpackhi_epi64(lo01, lo23);
    out2 = _mm_unpacklo_epi64(hi01, hi23);
    out3 = _mm_unpackhi_epi64(hi01, hi23);
}
173 :
174 : /************************************************************************/
175 : /* GDALDeinterleave4Byte_SSSE3() */
176 : /************************************************************************/
177 :
#if !defined(__GNUC__) || defined(__clang__)
// Deinterleave a pixel-interleaved 4-byte buffer (e.g. RGBARGBA...) into
// 4 separate band buffers of nIters bytes each.
void GDALDeinterleave4Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
                                 GByte *CPL_RESTRICT pabyDest0,
                                 GByte *CPL_RESTRICT pabyDest1,
                                 GByte *CPL_RESTRICT pabyDest2,
                                 GByte *CPL_RESTRICT pabyDest3, size_t nIters)
{
    // Gathers bytes 0,4,8,12 / 1,5,9,13 / 2,6,10,14 / 3,7,11,15 so each
    // 32-bit word of the shuffled register holds 4 values of one band.
    const __m128i shuffle_mask =
        _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
    size_t i = 0;
    // SIMD path: 16 pixels (64 source bytes) per iteration.
    for (; i + 15 < nIters; i += 16)
    {
        __m128i xmm0 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
        __m128i xmm1 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
        __m128i xmm2 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
        __m128i xmm3 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
        xmm0 = _mm_shuffle_epi8(xmm0, shuffle_mask); // W0 W1 W2 W3
        xmm1 = _mm_shuffle_epi8(xmm1, shuffle_mask); // W4 W5 W6 W7
        xmm2 = _mm_shuffle_epi8(xmm2, shuffle_mask); // W8 W9 WA WB
        xmm3 = _mm_shuffle_epi8(xmm3, shuffle_mask); // WC WD WE WF

        // Word-level 4x4 transpose puts one band per register.
        GDALTranspose4x4Int32(xmm0, xmm1, xmm2, xmm3, xmm0, xmm1, xmm2, xmm3);

        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest2 + i), xmm2);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest3 + i), xmm3);
    }
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
    // Scalar tail for the remaining pixels.
    for (; i < nIters; ++i)
    {
        pabyDest0[i] = pabySrc[4 * i + 0];
        pabyDest1[i] = pabySrc[4 * i + 1];
        pabyDest2[i] = pabySrc[4 * i + 2];
        pabyDest3[i] = pabySrc[4 * i + 3];
    }
}
#endif
222 :
223 : /************************************************************************/
224 : /* GDALDeinterleave3UInt16_SSSE3() */
225 : /************************************************************************/
226 :
#if (defined(__GNUC__) && !defined(__clang__)) || \
    defined(__INTEL_CLANG_COMPILER)
#if !defined(__INTEL_CLANG_COMPILER)
// GCC autovectorizer does an excellent job
__attribute__((optimize("tree-vectorize")))
#endif
// Deinterleave a pixel-interleaved buffer of 3 UInt16 per pixel into
// 3 separate band buffers of nIters elements each. The compiler's
// autovectorizer is responsible for generating the SIMD code.
void GDALDeinterleave3UInt16_SSSE3(const GUInt16 *CPL_RESTRICT panSrc,
                                   GUInt16 *CPL_RESTRICT panDest0,
                                   GUInt16 *CPL_RESTRICT panDest1,
                                   GUInt16 *CPL_RESTRICT panDest2,
                                   size_t nIters)
{
    for (size_t i = 0; i < nIters; ++i)
    {
        panDest0[i] = panSrc[3 * i + 0];
        panDest1[i] = panSrc[3 * i + 1];
        panDest2[i] = panSrc[3 * i + 2];
    }
}
#endif
247 :
248 : /************************************************************************/
249 : /* GDALDeinterleave4UInt16_SSSE3() */
250 : /************************************************************************/
251 :
#if (defined(__GNUC__) && !defined(__clang__)) || \
    defined(__INTEL_CLANG_COMPILER)
#if !defined(__INTEL_CLANG_COMPILER)
// GCC autovectorizer does an excellent job
__attribute__((optimize("tree-vectorize")))
#endif
// Deinterleave a pixel-interleaved buffer of 4 UInt16 per pixel into
// 4 separate band buffers of nIters elements each. The compiler's
// autovectorizer is responsible for generating the SIMD code.
void GDALDeinterleave4UInt16_SSSE3(const GUInt16 *CPL_RESTRICT panSrc,
                                   GUInt16 *CPL_RESTRICT panDest0,
                                   GUInt16 *CPL_RESTRICT panDest1,
                                   GUInt16 *CPL_RESTRICT panDest2,
                                   GUInt16 *CPL_RESTRICT panDest3,
                                   size_t nIters)
{
    for (size_t i = 0; i < nIters; ++i)
    {
        panDest0[i] = panSrc[4 * i + 0];
        panDest1[i] = panSrc[4 * i + 1];
        panDest2[i] = panSrc[4 * i + 2];
        panDest3[i] = panSrc[4 * i + 3];
    }
}
#endif
274 :
275 : /************************************************************************/
276 : /* loadu() */
277 : /************************************************************************/
278 :
// Unaligned 16-byte load of chunk `i` from a buffer whose chunks are
// `srcStride` bytes apart.
inline __m128i loadu(const uint8_t *pSrc, size_t i, size_t srcStride)
{
    const uint8_t *const pFrom = pSrc + i * srcStride;
    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(pFrom));
}
284 :
285 : /************************************************************************/
286 : /* storeu() */
287 : /************************************************************************/
288 :
// Unaligned 16-byte store of `reg` into chunk `i` of a buffer whose
// chunks are `dstStride` bytes apart.
inline void storeu(uint8_t *pDst, size_t i, size_t dstStride, __m128i reg)
{
    uint8_t *const pTo = pDst + i * dstStride;
    _mm_storeu_si128(reinterpret_cast<__m128i *>(pTo), reg);
}
293 :
294 : /************************************************************************/
295 : /* GDALInterleave3Byte_SSSE3() */
296 : /************************************************************************/
297 :
298 : #if (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
299 :
// Bitwise OR of three 128-bit registers.
inline __m128i GDAL_mm_or_3_si128(__m128i r0, __m128i r1, __m128i r2)
{
    const __m128i r01 = _mm_or_si128(r0, r1);
    return _mm_or_si128(r01, r2);
}
304 :
// ICC autovectorizer doesn't do a good job at generating good SSE code,
// at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
#if defined(__GNUC__)
__attribute__((noinline))
#endif
// Interleave 3 planar rows of nIters bytes each (laid out consecutively
// at pSrc, pSrc + nIters and pSrc + 2 * nIters) into a pixel-interleaved
// buffer: pDst[3 * i + b] = pSrc[i + b * nIters].
static void GDALInterleave3Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
                                      uint8_t *CPL_RESTRICT pDst, size_t nIters)
{
    size_t i = 0;
    constexpr size_t VALS_PER_ITER = 16;

    if (nIters >= VALS_PER_ITER)
    {
        // clang-format off
        constexpr char X = -1;
        // How to dispatch 16 values of row=0 onto 3x16 bytes
        const __m128i xmm_shuffle00 = _mm_setr_epi8(0, X, X,
                                                    1, X, X,
                                                    2, X, X,
                                                    3, X, X,
                                                    4, X, X,
                                                    5);
        const __m128i xmm_shuffle01 = _mm_setr_epi8( X, X,
                                                    6, X, X,
                                                    7, X, X,
                                                    8, X, X,
                                                    9, X, X,
                                                    10,X);
        const __m128i xmm_shuffle02 = _mm_setr_epi8( X,
                                                    11, X, X,
                                                    12, X, X,
                                                    13, X, X,
                                                    14, X, X,
                                                    15, X, X);

        // How to dispatch 16 values of row=1 onto 3x16 bytes
        const __m128i xmm_shuffle10 = _mm_setr_epi8(X, 0, X,
                                                    X, 1, X,
                                                    X, 2, X,
                                                    X, 3, X,
                                                    X, 4, X,
                                                    X);
        const __m128i xmm_shuffle11 = _mm_setr_epi8( 5, X,
                                                    X, 6, X,
                                                    X, 7, X,
                                                    X, 8, X,
                                                    X, 9, X,
                                                    X,10);
        const __m128i xmm_shuffle12 = _mm_setr_epi8( X,
                                                    X, 11, X,
                                                    X, 12, X,
                                                    X, 13, X,
                                                    X, 14, X,
                                                    X, 15, X);

        // How to dispatch 16 values of row=2 onto 3x16 bytes
        const __m128i xmm_shuffle20 = _mm_setr_epi8(X, X, 0,
                                                    X, X, 1,
                                                    X, X, 2,
                                                    X, X, 3,
                                                    X, X, 4,
                                                    X);
        const __m128i xmm_shuffle21 = _mm_setr_epi8( X, 5,
                                                    X, X, 6,
                                                    X, X, 7,
                                                    X, X, 8,
                                                    X, X, 9,
                                                    X, X);
        const __m128i xmm_shuffle22 = _mm_setr_epi8( 10,
                                                    X, X, 11,
                                                    X, X, 12,
                                                    X, X, 13,
                                                    X, X, 14,
                                                    X, X, 15);
        // clang-format on

        for (; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
        {
            // Load 16 consecutive values of each of the 3 source rows.
#define LOAD(x) __m128i xmm##x = loadu(pSrc + i, x, nIters)
            LOAD(0);
            LOAD(1);
            LOAD(2);

            // Scatter each row's values into their slots of one 16-byte
            // output chunk (zeros elsewhere), then OR the 3 contributions.
#define SHUFFLE(x, y) _mm_shuffle_epi8(xmm##y, xmm_shuffle##y##x)
#define COMBINE_3(x) \
    GDAL_mm_or_3_si128(SHUFFLE(x, 0), SHUFFLE(x, 1), SHUFFLE(x, 2))

#define STORE(x) \
    storeu(pDst, 3 * (i / VALS_PER_ITER) + x, VALS_PER_ITER, COMBINE_3(x))
            STORE(0);
            STORE(1);
            STORE(2);
#undef LOAD
#undef COMBINE_3
#undef SHUFFLE
#undef STORE
        }
    }

    // Scalar tail for the remaining values.
    for (; i < nIters; ++i)
    {
#define INTERLEAVE(x) pDst[3 * i + x] = pSrc[i + x * nIters]
        INTERLEAVE(0);
        INTERLEAVE(1);
        INTERLEAVE(2);
#undef INTERLEAVE
    }
}
413 :
414 : #else
415 :
#if defined(__GNUC__) && !defined(__clang__)
__attribute__((optimize("tree-vectorize")))
#endif
#if defined(__GNUC__)
__attribute__((noinline))
#endif
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
// clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"
#endif
// Interleave 3 planar rows of nIters bytes each (laid out consecutively
// at pSrc, pSrc + nIters and pSrc + 2 * nIters) into a pixel-interleaved
// buffer: pDst[3 * i + b] = pSrc[i + b * nIters]. Relies on the
// compiler's autovectorizer.
static void GDALInterleave3Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
                                      uint8_t *CPL_RESTRICT pDst, size_t nIters)
{
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
#pragma clang loop vectorize(enable)
#endif
    for (size_t i = 0; i < nIters; ++i)
    {
        pDst[3 * i + 0] = pSrc[i + 0 * nIters];
        pDst[3 * i + 1] = pSrc[i + 1 * nIters];
        pDst[3 * i + 2] = pSrc[i + 2 * nIters];
    }
}
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
#pragma clang diagnostic pop
#endif
443 :
444 : #endif
445 :
446 : /************************************************************************/
447 : /* GDALInterleave5Byte_SSSE3() */
448 : /************************************************************************/
449 :
450 5 : inline __m128i GDAL_mm_or_5_si128(__m128i r0, __m128i r1, __m128i r2,
451 : __m128i r3, __m128i r4)
452 : {
453 15 : return _mm_or_si128(
454 5 : _mm_or_si128(_mm_or_si128(r0, r1), _mm_or_si128(r2, r3)), r4);
455 : }
456 :
// Interleave 5 planar rows of nIters bytes each (laid out consecutively
// at pSrc + b * nIters, for b in [0, 5)) into a pixel-interleaved
// buffer: pDst[5 * i + b] = pSrc[i + b * nIters].
static void GDALInterleave5Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
                                      uint8_t *CPL_RESTRICT pDst, size_t nIters)
{
    size_t i = 0;
    constexpr size_t VALS_PER_ITER = 16;

    if (nIters >= VALS_PER_ITER)
    {
        // clang-format off
        constexpr char X = -1;
        // How to dispatch 16 values of row=0 onto 5x16 bytes
        const __m128i xmm_shuffle00 = _mm_setr_epi8(0, X, X, X, X,
                                                    1, X, X, X, X,
                                                    2, X, X, X, X,
                                                    3);
        const __m128i xmm_shuffle01 = _mm_setr_epi8( X, X, X, X,
                                                    4, X, X, X, X,
                                                    5, X, X, X, X,
                                                    6, X);
        const __m128i xmm_shuffle02 = _mm_setr_epi8( X, X, X,
                                                    7, X, X, X, X,
                                                    8, X, X, X, X,
                                                    9, X, X);
        const __m128i xmm_shuffle03 = _mm_setr_epi8( X, X,
                                                    10, X, X, X, X,
                                                    11, X, X, X, X,
                                                    12, X, X, X);
        const __m128i xmm_shuffle04 = _mm_setr_epi8( X,
                                                    13, X, X, X, X,
                                                    14, X, X, X, X,
                                                    15, X, X, X, X);

        // How to dispatch 16 values of row=1 onto 5x16 bytes
        const __m128i xmm_shuffle10 = _mm_setr_epi8(X, 0, X, X, X,
                                                    X, 1, X, X, X,
                                                    X, 2, X, X, X,
                                                    X);
        const __m128i xmm_shuffle11 = _mm_setr_epi8( 3, X, X, X,
                                                    X, 4, X, X, X,
                                                    X, 5, X, X, X,
                                                    X, 6);
        const __m128i xmm_shuffle12 = _mm_setr_epi8( X, X, X,
                                                    X, 7, X, X, X,
                                                    X, 8, X, X, X,
                                                    X, 9, X);
        const __m128i xmm_shuffle13 = _mm_setr_epi8( X, X,
                                                    X, 10, X, X, X,
                                                    X, 11, X, X, X,
                                                    X, 12, X, X);
        const __m128i xmm_shuffle14 = _mm_setr_epi8( X,
                                                    X, 13, X, X, X,
                                                    X, 14, X, X, X,
                                                    X, 15, X, X, X);

        // How to dispatch 16 values of row=2 onto 5x16 bytes
        const __m128i xmm_shuffle20 = _mm_setr_epi8(X, X, 0, X, X,
                                                    X, X, 1, X, X,
                                                    X, X, 2, X, X,
                                                    X);
        const __m128i xmm_shuffle21 = _mm_setr_epi8( X, 3, X, X,
                                                    X, X, 4, X, X,
                                                    X, X, 5, X, X,
                                                    X, X);
        const __m128i xmm_shuffle22 = _mm_setr_epi8( 6, X, X,
                                                    X, X, 7, X, X,
                                                    X, X, 8, X, X,
                                                    X, X, 9);
        const __m128i xmm_shuffle23 = _mm_setr_epi8( X, X,
                                                    X, X, 10, X, X,
                                                    X, X, 11, X, X,
                                                    X, X, 12, X);
        const __m128i xmm_shuffle24 = _mm_setr_epi8( X,
                                                    X, X, 13, X, X,
                                                    X, X, 14, X, X,
                                                    X, X, 15, X, X);

        // How to dispatch 16 values of row=3 onto 5x16 bytes
        const __m128i xmm_shuffle30 = _mm_setr_epi8(X, X, X, 0, X,
                                                    X, X, X, 1, X,
                                                    X, X, X, 2, X,
                                                    X);
        const __m128i xmm_shuffle31 = _mm_setr_epi8( X, X, 3, X,
                                                    X, X, X, 4, X,
                                                    X, X, X, 5, X,
                                                    X, X);
        const __m128i xmm_shuffle32 = _mm_setr_epi8( X, 6, X,
                                                    X, X, X, 7, X,
                                                    X, X, X, 8, X,
                                                    X, X, X);
        const __m128i xmm_shuffle33 = _mm_setr_epi8( 9, X,
                                                    X, X, X, 10, X,
                                                    X, X, X, 11, X,
                                                    X, X, X, 12);
        const __m128i xmm_shuffle34 = _mm_setr_epi8( X,
                                                    X, X, X, 13, X,
                                                    X, X, X, 14, X,
                                                    X, X, X, 15, X);

        // How to dispatch 16 values of row=4 onto 5x16 bytes
        const __m128i xmm_shuffle40 = _mm_setr_epi8(X, X, X, X, 0,
                                                    X, X, X, X, 1,
                                                    X, X, X, X, 2,
                                                    X);
        const __m128i xmm_shuffle41 = _mm_setr_epi8( X, X, X, 3,
                                                    X, X, X, X, 4,
                                                    X, X, X, X, 5,
                                                    X, X);
        const __m128i xmm_shuffle42 = _mm_setr_epi8( X, X, 6,
                                                    X, X, X, X, 7,
                                                    X, X, X, X, 8,
                                                    X, X, X);
        const __m128i xmm_shuffle43 = _mm_setr_epi8( X, 9,
                                                    X, X, X, X, 10,
                                                    X, X, X, X, 11,
                                                    X, X, X, X);
        const __m128i xmm_shuffle44 = _mm_setr_epi8( 12,
                                                    X, X, X, X, 13,
                                                    X, X, X, X, 14,
                                                    X, X, X, X, 15);
        // clang-format on

        for (; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
        {
            // Load 16 consecutive values of each of the 5 source rows.
#define LOAD(x) __m128i xmm##x = loadu(pSrc + i, x, nIters)
            LOAD(0);
            LOAD(1);
            LOAD(2);
            LOAD(3);
            LOAD(4);

            // Scatter each row's values into their slots of one 16-byte
            // output chunk (zeros elsewhere), then OR the 5 contributions.
#define SHUFFLE(x, y) _mm_shuffle_epi8(xmm##y, xmm_shuffle##y##x)
#define COMBINE_5(x) \
    GDAL_mm_or_5_si128(SHUFFLE(x, 0), SHUFFLE(x, 1), SHUFFLE(x, 2), \
                       SHUFFLE(x, 3), SHUFFLE(x, 4))

#define STORE(x) \
    storeu(pDst, 5 * (i / VALS_PER_ITER) + x, VALS_PER_ITER, COMBINE_5(x))
            STORE(0);
            STORE(1);
            STORE(2);
            STORE(3);
            STORE(4);
#undef LOAD
#undef COMBINE_5
#undef SHUFFLE
#undef STORE
        }
    }

    // Scalar tail for the remaining values.
    for (; i < nIters; ++i)
    {
#define INTERLEAVE(x) pDst[5 * i + x] = pSrc[i + x * nIters]
        INTERLEAVE(0);
        INTERLEAVE(1);
        INTERLEAVE(2);
        INTERLEAVE(3);
        INTERLEAVE(4);
#undef INTERLEAVE
    }
}
617 :
618 : /************************************************************************/
619 : /* GDALTranspose2D_Byte_SSSE3() */
620 : /************************************************************************/
621 :
622 : // Given r = (b00, b01, b02, b03,
623 : // b10, b11, b12, b13,
624 : // b20, b21, b22, b23,
625 : // b30, b31, b32, b33)
626 : // Return (b00, b10, b20, b30,
627 : // b01, b11, b21, b31,
628 : // b02, b12, b22, b32,
629 : // b03, b13, b23, b33)
630 66112 : inline void GDALReorderForTranspose4x4(__m128i &r)
631 : {
632 : const __m128i shuffle_mask =
633 66112 : _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
634 :
635 66112 : r = _mm_shuffle_epi8(r, shuffle_mask);
636 66112 : }
637 :
// Transpose the 16x16 byte values contained in the 16 SSE registers.
// The block is treated as a 4x4 grid of 4x4-byte sub-blocks: a word-level
// transpose of each group of 4 rows, a byte-level transpose inside each
// register, then a final word-level transpose across the groups.
inline void GDALTranspose16x16ByteBlock_SSSE3(
    __m128i &r00, __m128i &r01, __m128i &r02, __m128i &r03, __m128i &r04,
    __m128i &r05, __m128i &r06, __m128i &r07, __m128i &r08, __m128i &r09,
    __m128i &r10, __m128i &r11, __m128i &r12, __m128i &r13, __m128i &r14,
    __m128i &r15)
{
    __m128i tmp00, tmp01, tmp02, tmp03;
    __m128i tmp10, tmp11, tmp12, tmp13;
    __m128i tmp20, tmp21, tmp22, tmp23;
    __m128i tmp30, tmp31, tmp32, tmp33;

    // Step 1: 32-bit word transpose of each group of 4 rows.
    GDALTranspose4x4Int32(r00, r01, r02, r03, tmp00, tmp01, tmp02, tmp03);
    GDALTranspose4x4Int32(r04, r05, r06, r07, tmp10, tmp11, tmp12, tmp13);
    GDALTranspose4x4Int32(r08, r09, r10, r11, tmp20, tmp21, tmp22, tmp23);
    GDALTranspose4x4Int32(r12, r13, r14, r15, tmp30, tmp31, tmp32, tmp33);

    // Step 2: 4x4 byte transpose inside each register.
    GDALReorderForTranspose4x4(tmp00);
    GDALReorderForTranspose4x4(tmp01);
    GDALReorderForTranspose4x4(tmp02);
    GDALReorderForTranspose4x4(tmp03);
    GDALReorderForTranspose4x4(tmp10);
    GDALReorderForTranspose4x4(tmp11);
    GDALReorderForTranspose4x4(tmp12);
    GDALReorderForTranspose4x4(tmp13);
    GDALReorderForTranspose4x4(tmp20);
    GDALReorderForTranspose4x4(tmp21);
    GDALReorderForTranspose4x4(tmp22);
    GDALReorderForTranspose4x4(tmp23);
    GDALReorderForTranspose4x4(tmp30);
    GDALReorderForTranspose4x4(tmp31);
    GDALReorderForTranspose4x4(tmp32);
    GDALReorderForTranspose4x4(tmp33);

    // Step 3: final 32-bit word transpose across the groups.
    GDALTranspose4x4Int32(tmp00, tmp10, tmp20, tmp30, r00, r01, r02, r03);
    GDALTranspose4x4Int32(tmp01, tmp11, tmp21, tmp31, r04, r05, r06, r07);
    GDALTranspose4x4Int32(tmp02, tmp12, tmp22, tmp32, r08, r09, r10, r11);
    GDALTranspose4x4Int32(tmp03, tmp13, tmp23, tmp33, r12, r13, r14, r15);
}
677 :
// Transpose a full 16x16 byte tile: reads 16 rows of 16 bytes starting
// at pSrc (rows srcStride bytes apart) and writes the transposed tile to
// pDst (rows dstStride bytes apart).
inline void GDALTranspose2D16x16Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
                                           uint8_t *CPL_RESTRICT pDst,
                                           size_t srcStride, size_t dstStride)
{
    // Load the 16 source rows into 16 SSE registers.
#define LOAD(x) __m128i r##x = loadu(pSrc, x, srcStride)
    LOAD(0);
    LOAD(1);
    LOAD(2);
    LOAD(3);
    LOAD(4);
    LOAD(5);
    LOAD(6);
    LOAD(7);
    LOAD(8);
    LOAD(9);
    LOAD(10);
    LOAD(11);
    LOAD(12);
    LOAD(13);
    LOAD(14);
    LOAD(15);
#undef LOAD

    // In-register 16x16 byte transpose.
    GDALTranspose16x16ByteBlock_SSSE3(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9,
                                      r10, r11, r12, r13, r14, r15);

    // Store the 16 transposed rows.
#define STORE(x) storeu(pDst, x, dstStride, r##x)
    STORE(0);
    STORE(1);
    STORE(2);
    STORE(3);
    STORE(4);
    STORE(5);
    STORE(6);
    STORE(7);
    STORE(8);
    STORE(9);
    STORE(10);
    STORE(11);
    STORE(12);
    STORE(13);
    STORE(14);
    STORE(15);
#undef STORE
}
723 :
// Transpose a nSrcHeight x nSrcWidth row-major byte matrix at pSrc into
// a nSrcWidth x nSrcHeight row-major matrix at pDst:
// pDst[i + j * nSrcHeight] = pSrc[j + i * nSrcWidth].
void GDALTranspose2D_Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
                                uint8_t *CPL_RESTRICT pDst, size_t nSrcWidth,
                                size_t nSrcHeight)
{
    if (nSrcHeight == 3)
    {
        // Transposing a 3-row matrix amounts to interleaving 3 planar rows.
        GDALInterleave3Byte_SSSE3(pSrc, pDst, nSrcWidth);
    }
    else if (nSrcHeight == 5)
    {
        // Likewise for a 5-row matrix.
        GDALInterleave5Byte_SSSE3(pSrc, pDst, nSrcWidth);
    }
    else
    {
        // Generic case: full 16x16 tiles go through the SIMD transpose;
        // partial tiles on the right/bottom edges use a scalar fallback.
        constexpr size_t blocksize = 16;
        for (size_t i = 0; i < nSrcHeight; i += blocksize)
        {
            const size_t max_k = std::min(i + blocksize, nSrcHeight);
            for (size_t j = 0; j < nSrcWidth; j += blocksize)
            {
                // transpose the block beginning at [i,j]
                const size_t max_l = std::min(j + blocksize, nSrcWidth);
                if (max_k - i == blocksize && max_l - j == blocksize)
                {
                    GDALTranspose2D16x16Byte_SSSE3(&pSrc[j + i * nSrcWidth],
                                                   &pDst[i + j * nSrcHeight],
                                                   nSrcWidth, nSrcHeight);
                }
                else
                {
                    for (size_t k = i; k < max_k; ++k)
                    {
                        for (size_t l = j; l < max_l; ++l)
                        {
                            GDALCopyWord(pSrc[l + k * nSrcWidth],
                                         pDst[k + l * nSrcHeight]);
                        }
                    }
                }
            }
        }
    }
}
767 :
768 : #endif // HAVE_SSSE3_AT_COMPILE_TIME
|