Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: GDAL Core
4 : * Purpose: SSSE3 specializations
5 : * Author: Even Rouault <even dot rouault at spatialys dot com>
6 : *
7 : ******************************************************************************
8 : * Copyright (c) 2016, Even Rouault <even dot rouault at spatialys dot com>
9 : *
10 : * SPDX-License-Identifier: MIT
11 : ****************************************************************************/
12 :
13 : #include "cpl_port.h"
14 :
15 : #include <algorithm>
16 :
17 : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
18 : (defined(__x86_64) || defined(_M_X64))) || \
19 : defined(USE_NEON_OPTIMIZATIONS)
20 :
21 : #include "rasterio_ssse3.h"
22 :
23 : #ifdef USE_NEON_OPTIMIZATIONS
24 : #include "include_sse2neon.h"
25 : #else
26 : #include <tmmintrin.h>
27 : #endif
28 :
29 : #include "gdal_priv_templates.hpp"
30 :
// Copy one byte out of every 3 from pSrc to pDest, i.e.
// pDest[i] = pSrc[3 * i] for i in [0, nIters[ (extraction of one band of a
// 3-band byte-interleaved buffer).
void GDALUnrolledCopy_GByte_3_1_SSSE3(GByte *CPL_RESTRICT pDest,
                                      const GByte *CPL_RESTRICT pSrc,
                                      GPtrDiff_t nIters)
{
    // Declared before the loops since the scalar tail reuses its final value.
    decltype(nIters) i;
    // PSHUFB control masks: a -1 entry zeroes the destination byte, a
    // non-negative entry selects that source byte. Note that _mm_set_epi8()
    // takes its arguments from most-significant to least-significant byte.
    const __m128i xmm_shuffle0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
                                              -1, -1, 15, 12, 9, 6, 3, 0);
    const __m128i xmm_shuffle1 = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11, 8, 5,
                                              2, -1, -1, -1, -1, -1, -1);
    const __m128i xmm_shuffle2 = _mm_set_epi8(13, 10, 7, 4, 1, -1, -1, -1, -1,
                                              -1, -1, -1, -1, -1, -1, -1);
    // If we were sure that there would always be 2 trailing bytes, we could
    // check against nIters - 15
    // Each iteration consumes 48 source bytes and produces 16 output bytes.
    // The strict "i < nIters - 16" bound guarantees at least 17 pixels
    // (i.e. 51 source bytes) remain, so the 3 16-byte loads below never read
    // past the end of the source buffer.
    for (i = 0; i < nIters - 16; i += 16)
    {
        __m128i xmm0 =
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
        __m128i xmm1 =
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
        __m128i xmm2 =
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));

        // From LSB to MSB:
        // 0,x,x,1,x,x,2,x,x,3,x,x,4,x,x,5 --> 0,1,2,3,4,5,0,0,0,0,0,0,0,0,0,0
        xmm0 = _mm_shuffle_epi8(xmm0, xmm_shuffle0);
        // x,x,6,x,x,7,x,x,8,x,x,9,x,x,10,x --> 0,0,0,0,0,0,6,7,8,9,10,0,0,0,0,0
        xmm1 = _mm_shuffle_epi8(xmm1, xmm_shuffle1);
        // x,11,x,x,12,x,x,13,x,x,14,x,x,15,x,x -->
        // 0,0,0,0,0,0,0,0,0,0,0,11,12,13,14,15
        xmm2 = _mm_shuffle_epi8(xmm2, xmm_shuffle2);
        // The three partial results have disjoint non-zero bytes: OR merges
        // them into the 16 output bytes.
        xmm0 = _mm_or_si128(xmm0, xmm1);
        xmm0 = _mm_or_si128(xmm0, xmm2);

        _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);

        pSrc += 3 * 16;
    }
    // Scalar tail: up to 16 remaining values.
    for (; i < nIters; i++)
    {
        pDest[i] = *pSrc;
        pSrc += 3;
    }
}
74 :
75 : /************************************************************************/
76 : /* GDALDeinterleave3Byte_SSSE3() */
77 : /************************************************************************/
78 :
#if defined(__GNUC__) && !defined(__clang__)
// GCC autovectorizer does an excellent job
// De-interleave a 3-band byte-interleaved buffer into 3 per-band buffers:
// pabyDestK[i] = pabySrc[3 * i + K] for i in [0, nIters[.
__attribute__((optimize("tree-vectorize"))) void GDALDeinterleave3Byte_SSSE3(
    const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
    GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2, size_t nIters)
{
    for (size_t i = 0; i < nIters; ++i)
    {
        pabyDest0[i] = pabySrc[3 * i + 0];
        pabyDest1[i] = pabySrc[3 * i + 1];
        pabyDest2[i] = pabySrc[3 * i + 2];
    }
}
#else
// Manual SSSE3 implementation of the same de-interleaving for compilers that
// do not take (or cannot be trusted to take) the autovectorized path above.
void GDALDeinterleave3Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
                                 GByte *CPL_RESTRICT pabyDest0,
                                 GByte *CPL_RESTRICT pabyDest1,
                                 GByte *CPL_RESTRICT pabyDest2, size_t nIters)
{
    size_t i = 0;
    // Process 16 pixels (48 source bytes) per iteration.
    for (; i + 15 < nIters; i += 16)
    {
        __m128i xmm0 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 0));
        __m128i xmm1 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 16));
        __m128i xmm2 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 32));
        // Each shuffle below packs 4 consecutive pixels into three 32-bit
        // words (one word per component) plus a zero word. xmm0 provides
        // source bytes 0..11; _mm_alignr_epi8(xmm1, xmm0, 12) yields bytes
        // 12..27 (bytes 12..23 used); _mm_alignr_epi8(xmm2, xmm1, 8) yields
        // bytes 24..39 (bytes 24..35 used); the last shuffle reads bytes
        // 36..47 directly out of xmm2.
        auto xmm0_new =
            _mm_shuffle_epi8(xmm0, _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10,
                                                7, 4, 1, 9, 6, 3, 0));
        auto xmm1_new = _mm_shuffle_epi8(
            _mm_alignr_epi8(xmm1, xmm0, 12),
            _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, 7, 4, 1, 9, 6, 3, 0));
        auto xmm2_new = _mm_shuffle_epi8(
            _mm_alignr_epi8(xmm2, xmm1, 8),
            _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, 7, 4, 1, 9, 6, 3, 0));
        auto xmm3_new =
            _mm_shuffle_epi8(xmm2, _mm_set_epi8(-1, -1, -1, -1, 15, 12, 9, 6,
                                                14, 11, 8, 5, 13, 10, 7, 4));

        // Word-transpose: Wn below names the n-th meaningful 32-bit word
        // (W0,W1,W2 = components 0,1,2 of pixels 0-3 in xmm0_new; W4,W5,W6
        // the same for pixels 4-7 in xmm1_new; etc.), so that each final
        // register gathers one component of all 16 pixels.
        __m128i xmm01lo =
            _mm_unpacklo_epi32(xmm0_new, xmm1_new);  // W0 W4 W1 W5
        __m128i xmm01hi = _mm_unpackhi_epi32(xmm0_new, xmm1_new);  // W2 W6 - -
        __m128i xmm23lo =
            _mm_unpacklo_epi32(xmm2_new, xmm3_new);  // W8 WC W9 WD
        __m128i xmm23hi = _mm_unpackhi_epi32(xmm2_new, xmm3_new);  // WA WE - -
        xmm0_new = _mm_unpacklo_epi64(xmm01lo, xmm23lo);  // W0 W4 W8 WC
        xmm1_new = _mm_unpackhi_epi64(xmm01lo, xmm23lo);  // W1 W5 W9 WD
        xmm2_new = _mm_unpacklo_epi64(xmm01hi, xmm23hi);  // W2 W6 WA WE
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0_new);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1_new);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest2 + i), xmm2_new);
    }
// Scalar tail for the last (nIters % 16) pixels; keep clang from trying to
// re-vectorize it.
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
    for (; i < nIters; ++i)
    {
        pabyDest0[i] = pabySrc[3 * i + 0];
        pabyDest1[i] = pabySrc[3 * i + 1];
        pabyDest2[i] = pabySrc[3 * i + 2];
    }
}
#endif
144 :
145 : /************************************************************************/
146 : /* GDALTranspose4x4Int32() */
147 : /************************************************************************/
148 :
149 : // Consider that the input registers for 4x4 words of size 4 bytes each,
150 : // Return the transposition of this 4x4 matrix
151 : // Considering that in0 = (in00, in01, in02, in03)
152 : // Considering that in1 = (in10, in11, in12, in13)
153 : // Considering that in2 = (in20, in21, in22, in23)
154 : // Considering that in3 = (in30, in31, in32, in33)
155 : // Return out0 = (in00, in10, in20, in30)
156 : // Return out1 = (in01, in11, in21, in31)
157 : // Return out2 = (in02, in12, in22, in32)
158 : // Return out3 = (in03, in13, in23, in33)
inline void GDALTranspose4x4Int32(__m128i in0, __m128i in1, __m128i in2,
                                  __m128i in3, __m128i &out0, __m128i &out1,
                                  __m128i &out2, __m128i &out3)
{
    // Stage 1: interleave the 32-bit lanes of row pairs (0,1) and (2,3).
    const __m128i lo01 = _mm_unpacklo_epi32(in0, in1);  // (in00, in10, in01, in11)
    const __m128i hi01 = _mm_unpackhi_epi32(in0, in1);  // (in02, in12, in03, in13)
    const __m128i lo23 = _mm_unpacklo_epi32(in2, in3);  // (in20, in30, in21, in31)
    const __m128i hi23 = _mm_unpackhi_epi32(in2, in3);  // (in22, in32, in23, in33)

    // Stage 2: interleave 64-bit halves to complete the 4x4 transposition.
    out0 = _mm_unpacklo_epi64(lo01, lo23);  // (in00, in10, in20, in30)
    out1 = _mm_unpackhi_epi64(lo01, lo23);  // (in01, in11, in21, in31)
    out2 = _mm_unpacklo_epi64(hi01, hi23);  // (in02, in12, in22, in32)
    out3 = _mm_unpackhi_epi64(hi01, hi23);  // (in03, in13, in23, in33)
}
173 :
174 : /************************************************************************/
175 : /* GDALDeinterleave4Byte_SSSE3() */
176 : /************************************************************************/
177 :
#if !defined(__GNUC__) || defined(__clang__)
// De-interleave a 4-band byte-interleaved buffer into 4 per-band buffers:
// pabyDestK[i] = pabySrc[4 * i + K] for i in [0, nIters[.
void GDALDeinterleave4Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
                                 GByte *CPL_RESTRICT pabyDest0,
                                 GByte *CPL_RESTRICT pabyDest1,
                                 GByte *CPL_RESTRICT pabyDest2,
                                 GByte *CPL_RESTRICT pabyDest3, size_t nIters)
{
    // Within a register holding 4 pixels (16 bytes), pack the 4 bytes of
    // each component into its own 32-bit word.
    const __m128i shuffle_mask =
        _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
    size_t i = 0;
    // Process 16 pixels (64 source bytes) per iteration.
    for (; i + 15 < nIters; i += 16)
    {
        __m128i xmm0 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
        __m128i xmm1 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
        __m128i xmm2 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
        __m128i xmm3 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
        // Wn = 32-bit word holding component (n mod 4) of pixel group (n / 4).
        xmm0 = _mm_shuffle_epi8(xmm0, shuffle_mask); // W0 W1 W2 W3
        xmm1 = _mm_shuffle_epi8(xmm1, shuffle_mask); // W4 W5 W6 W7
        xmm2 = _mm_shuffle_epi8(xmm2, shuffle_mask); // W8 W9 WA WB
        xmm3 = _mm_shuffle_epi8(xmm3, shuffle_mask); // WC WD WE WF

        // Word-transpose so that each register gathers one component of all
        // 16 pixels.
        GDALTranspose4x4Int32(xmm0, xmm1, xmm2, xmm3, xmm0, xmm1, xmm2, xmm3);

        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest2 + i), xmm2);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest3 + i), xmm3);
    }
// Scalar tail for the last (nIters % 16) pixels; keep clang from trying to
// re-vectorize it.
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
    for (; i < nIters; ++i)
    {
        pabyDest0[i] = pabySrc[4 * i + 0];
        pabyDest1[i] = pabySrc[4 * i + 1];
        pabyDest2[i] = pabySrc[4 * i + 2];
        pabyDest3[i] = pabySrc[4 * i + 3];
    }
}
#endif
222 :
223 : /************************************************************************/
224 : /* GDALDeinterleave3UInt16_SSSE3() */
225 : /************************************************************************/
226 :
#if (defined(__GNUC__) && !defined(__clang__)) || \
    defined(__INTEL_CLANG_COMPILER)
#if !defined(__INTEL_CLANG_COMPILER)
// GCC autovectorizer does an excellent job
__attribute__((optimize("tree-vectorize")))
#endif
// De-interleave a 3-band UInt16-interleaved buffer into 3 per-band buffers:
// panDestK[i] = panSrc[3 * i + K] for i in [0, nIters[.
// NOTE(review): the naive loop shape is what the autovectorizer keys on —
// do not restructure it.
void GDALDeinterleave3UInt16_SSSE3(const GUInt16* CPL_RESTRICT panSrc,
                                   GUInt16* CPL_RESTRICT panDest0,
                                   GUInt16* CPL_RESTRICT panDest1,
                                   GUInt16* CPL_RESTRICT panDest2,
                                   size_t nIters)
{
    for (size_t i = 0; i < nIters; ++i)
    {
        panDest0[i] = panSrc[3 * i + 0];
        panDest1[i] = panSrc[3 * i + 1];
        panDest2[i] = panSrc[3 * i + 2];
    }
}
#endif
247 :
248 : /************************************************************************/
249 : /* GDALDeinterleave4UInt16_SSSE3() */
250 : /************************************************************************/
251 :
#if (defined(__GNUC__) && !defined(__clang__)) || \
    defined(__INTEL_CLANG_COMPILER)
#if !defined(__INTEL_CLANG_COMPILER)
// GCC autovectorizer does an excellent job
__attribute__((optimize("tree-vectorize")))
#endif
// De-interleave a 4-band UInt16-interleaved buffer into 4 per-band buffers:
// panDestK[i] = panSrc[4 * i + K] for i in [0, nIters[.
// NOTE(review): the naive loop shape is what the autovectorizer keys on —
// do not restructure it.
void GDALDeinterleave4UInt16_SSSE3(const GUInt16* CPL_RESTRICT panSrc,
                                   GUInt16* CPL_RESTRICT panDest0,
                                   GUInt16* CPL_RESTRICT panDest1,
                                   GUInt16* CPL_RESTRICT panDest2,
                                   GUInt16* CPL_RESTRICT panDest3,
                                   size_t nIters)
{
    for (size_t i = 0; i < nIters; ++i)
    {
        panDest0[i] = panSrc[4 * i + 0];
        panDest1[i] = panSrc[4 * i + 1];
        panDest2[i] = panSrc[4 * i + 2];
        panDest3[i] = panSrc[4 * i + 3];
    }
}
#endif
274 :
275 : /************************************************************************/
276 : /* loadu() */
277 : /************************************************************************/
278 :
/// Unaligned load of the 16 bytes at row `i` of a byte matrix whose rows are
/// `srcStride` bytes apart.
inline __m128i loadu(const uint8_t *pSrc, size_t i, size_t srcStride)
{
    const uint8_t *const pRow = pSrc + i * srcStride;
    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(pRow));
}
284 :
285 : /************************************************************************/
286 : /* storeu() */
287 : /************************************************************************/
288 :
/// Unaligned store of the 16 bytes of `reg` at row `i` of a byte matrix whose
/// rows are `dstStride` bytes apart.
inline void storeu(uint8_t *pDst, size_t i, size_t dstStride, __m128i reg)
{
    uint8_t *const pRow = pDst + i * dstStride;
    _mm_storeu_si128(reinterpret_cast<__m128i *>(pRow), reg);
}
293 :
294 : /************************************************************************/
295 : /* GDALInterleave3Byte_SSSE3() */
296 : /************************************************************************/
297 :
298 : #if (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
299 :
/// Bitwise OR of three 128-bit registers.
inline __m128i GDAL_mm_or_3_si128(__m128i r0, __m128i r1, __m128i r2)
{
    const __m128i r01 = _mm_or_si128(r0, r1);
    return _mm_or_si128(r01, r2);
}
304 :
// ICC autovectorizer doesn't do a good job at generating good SSE code,
// at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
#if defined(__GNUC__)
__attribute__((noinline))
#endif
// Interleave 3 planar rows of nIters bytes each into a byte-interleaved
// destination: pDst[3 * i + k] = pSrc[i + k * nIters] for k in [0,2].
static void
GDALInterleave3Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
                          uint8_t *CPL_RESTRICT pDst, size_t nIters)
{
    size_t i = 0;
    constexpr size_t VALS_PER_ITER = 16;

    if (nIters >= VALS_PER_ITER)
    {
        // clang-format off
        constexpr char X = -1;
        // Mask xmm_shuffleRS scatters (with PSHUFB) the 16 bytes of source
        // row R into the S-th 16-byte chunk of the 48 interleaved output
        // bytes. A X (-1) entry produces 0, so for each output chunk the 3
        // shuffled registers have disjoint non-zero bytes and can simply be
        // OR'ed together.
        // How to dispatch 16 values of row=0 onto 3x16 bytes
        const __m128i xmm_shuffle00 = _mm_setr_epi8(0, X, X,
                                                    1, X, X,
                                                    2, X, X,
                                                    3, X, X,
                                                    4, X, X,
                                                    5);
        const __m128i xmm_shuffle01 = _mm_setr_epi8(   X, X,
                                                    6, X, X,
                                                    7, X, X,
                                                    8, X, X,
                                                    9, X, X,
                                                    10,X);
        const __m128i xmm_shuffle02 = _mm_setr_epi8(       X,
                                                    11, X, X,
                                                    12, X, X,
                                                    13, X, X,
                                                    14, X, X,
                                                    15, X, X);

        // How to dispatch 16 values of row=1 onto 3x16 bytes
        const __m128i xmm_shuffle10 = _mm_setr_epi8(X, 0, X,
                                                    X, 1, X,
                                                    X, 2, X,
                                                    X, 3, X,
                                                    X, 4, X,
                                                    X);
        const __m128i xmm_shuffle11 = _mm_setr_epi8(    5, X,
                                                    X,  6, X,
                                                    X,  7, X,
                                                    X,  8, X,
                                                    X,  9, X,
                                                    X, 10);
        const __m128i xmm_shuffle12 = _mm_setr_epi8(       X,
                                                    X, 11, X,
                                                    X, 12, X,
                                                    X, 13, X,
                                                    X, 14, X,
                                                    X, 15, X);

        // How to dispatch 16 values of row=2 onto 3x16 bytes
        const __m128i xmm_shuffle20 = _mm_setr_epi8(X, X, 0,
                                                    X, X, 1,
                                                    X, X, 2,
                                                    X, X, 3,
                                                    X, X, 4,
                                                    X);
        const __m128i xmm_shuffle21 = _mm_setr_epi8(    X, 5,
                                                    X,  X, 6,
                                                    X,  X, 7,
                                                    X,  X, 8,
                                                    X,  X, 9,
                                                    X,  X);
        const __m128i xmm_shuffle22 = _mm_setr_epi8(      10,
                                                    X, X, 11,
                                                    X, X, 12,
                                                    X, X, 13,
                                                    X, X, 14,
                                                    X, X, 15);
        // clang-format on

        for (; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
        {
// Load 16 bytes of each of the 3 source rows (row stride is nIters).
#define LOAD(x) __m128i xmm##x = loadu(pSrc + i, x, nIters)
            LOAD(0);
            LOAD(1);
            LOAD(2);

#define SHUFFLE(x, y) _mm_shuffle_epi8(xmm##y, xmm_shuffle##y##x)
// OR together the contributions of the 3 rows to output chunk x.
#define COMBINE_3(x) \
    GDAL_mm_or_3_si128(SHUFFLE(x, 0), SHUFFLE(x, 1), SHUFFLE(x, 2))

// Store output chunk x at byte offset 3 * i + 16 * x of pDst.
#define STORE(x) \
    storeu(pDst, 3 * (i / VALS_PER_ITER) + x, VALS_PER_ITER, COMBINE_3(x))
            STORE(0);
            STORE(1);
            STORE(2);
#undef LOAD
#undef COMBINE_3
#undef SHUFFLE
#undef STORE
        }
    }

    // Scalar tail for the remaining (nIters % 16) values.
    for (; i < nIters; ++i)
    {
#define INTERLEAVE(x) pDst[3 * i + x] = pSrc[i + x * nIters]
        INTERLEAVE(0);
        INTERLEAVE(1);
        INTERLEAVE(2);
#undef INTERLEAVE
    }
}
414 :
415 : #else
416 :
#if defined(__GNUC__) && !defined(__clang__)
__attribute__((optimize("tree-vectorize")))
#endif
#if defined(__GNUC__)
__attribute__((noinline))
#endif
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
// clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"
#endif
// Interleave 3 planar rows of nIters bytes each into a byte-interleaved
// destination: pDst[3 * i + k] = pSrc[i + k * nIters] for k in [0,2].
// The naive loop shape is deliberate: GCC/clang autovectorize it.
static void
GDALInterleave3Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
                          uint8_t *CPL_RESTRICT pDst, size_t nIters)
{
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
#pragma clang loop vectorize(enable)
#endif
    for (size_t i = 0; i < nIters; ++i)
    {
        pDst[3 * i + 0] = pSrc[i + 0 * nIters];
        pDst[3 * i + 1] = pSrc[i + 1 * nIters];
        pDst[3 * i + 2] = pSrc[i + 2 * nIters];
    }
}
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
#pragma clang diagnostic pop
#endif
445 :
446 : #endif
447 :
448 : /************************************************************************/
449 : /* GDALInterleave5Byte_SSSE3() */
450 : /************************************************************************/
451 :
/// Bitwise OR of five 128-bit registers, computed as a shallow reduction tree
/// rather than a linear chain.
inline __m128i GDAL_mm_or_5_si128(__m128i r0, __m128i r1, __m128i r2,
                                  __m128i r3, __m128i r4)
{
    const __m128i r01 = _mm_or_si128(r0, r1);
    const __m128i r23 = _mm_or_si128(r2, r3);
    return _mm_or_si128(_mm_or_si128(r01, r23), r4);
}
458 :
// Interleave 5 planar rows of nIters bytes each into a byte-interleaved
// destination: pDst[5 * i + k] = pSrc[i + k * nIters] for k in [0,4].
void GDALInterleave5Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
                               uint8_t *CPL_RESTRICT pDst, size_t nIters)
{
    size_t i = 0;
    constexpr size_t VALS_PER_ITER = 16;

    if (nIters >= VALS_PER_ITER)
    {
        // clang-format off
        constexpr char X = -1;
        // Mask xmm_shuffleRS scatters (with PSHUFB) the 16 bytes of source
        // row R into the S-th 16-byte chunk of the 80 interleaved output
        // bytes. A X (-1) entry produces 0, so for each output chunk the 5
        // shuffled registers have disjoint non-zero bytes and can simply be
        // OR'ed together.
        // How to dispatch 16 values of row=0 onto 5x16 bytes
        const __m128i xmm_shuffle00 = _mm_setr_epi8(0, X, X, X, X,
                                                    1, X, X, X, X,
                                                    2, X, X, X, X,
                                                    3);
        const __m128i xmm_shuffle01 = _mm_setr_epi8(   X, X, X, X,
                                                    4, X, X, X, X,
                                                    5, X, X, X, X,
                                                    6, X);
        const __m128i xmm_shuffle02 = _mm_setr_epi8(      X, X, X,
                                                    7, X, X, X, X,
                                                    8, X, X, X, X,
                                                    9, X, X);
        const __m128i xmm_shuffle03 = _mm_setr_epi8(          X, X,
                                                    10, X, X, X, X,
                                                    11, X, X, X, X,
                                                    12, X, X, X);
        const __m128i xmm_shuffle04 = _mm_setr_epi8(             X,
                                                    13, X, X, X, X,
                                                    14, X, X, X, X,
                                                    15, X, X, X, X);

        // How to dispatch 16 values of row=1 onto 5x16 bytes
        const __m128i xmm_shuffle10 = _mm_setr_epi8(X, 0, X, X, X,
                                                    X, 1, X, X, X,
                                                    X, 2, X, X, X,
                                                    X);
        const __m128i xmm_shuffle11 = _mm_setr_epi8(   3, X, X, X,
                                                    X, 4, X, X, X,
                                                    X, 5, X, X, X,
                                                    X, 6);
        const __m128i xmm_shuffle12 = _mm_setr_epi8(      X, X, X,
                                                    X, 7, X, X, X,
                                                    X, 8, X, X, X,
                                                    X, 9, X);
        const __m128i xmm_shuffle13 = _mm_setr_epi8(          X, X,
                                                    X, 10, X, X, X,
                                                    X, 11, X, X, X,
                                                    X, 12, X, X);
        const __m128i xmm_shuffle14 = _mm_setr_epi8(             X,
                                                    X, 13, X, X, X,
                                                    X, 14, X, X, X,
                                                    X, 15, X, X, X);

        // How to dispatch 16 values of row=2 onto 5x16 bytes
        const __m128i xmm_shuffle20 = _mm_setr_epi8(X, X, 0, X, X,
                                                    X, X, 1, X, X,
                                                    X, X, 2, X, X,
                                                    X);
        const __m128i xmm_shuffle21 = _mm_setr_epi8(   X, 3, X, X,
                                                    X, X, 4, X, X,
                                                    X, X, 5, X, X,
                                                    X, X);
        const __m128i xmm_shuffle22 = _mm_setr_epi8(      6, X, X,
                                                    X, X, 7, X, X,
                                                    X, X, 8, X, X,
                                                    X, X, 9);
        const __m128i xmm_shuffle23 = _mm_setr_epi8(          X, X,
                                                    X, X, 10, X, X,
                                                    X, X, 11, X, X,
                                                    X, X, 12, X);
        const __m128i xmm_shuffle24 = _mm_setr_epi8(             X,
                                                    X, X, 13, X, X,
                                                    X, X, 14, X, X,
                                                    X, X, 15, X, X);

        // How to dispatch 16 values of row=3 onto 5x16 bytes
        const __m128i xmm_shuffle30 = _mm_setr_epi8(X, X, X, 0, X,
                                                    X, X, X, 1, X,
                                                    X, X, X, 2, X,
                                                    X);
        const __m128i xmm_shuffle31 = _mm_setr_epi8(   X, X, 3, X,
                                                    X, X, X, 4, X,
                                                    X, X, X, 5, X,
                                                    X, X);
        const __m128i xmm_shuffle32 = _mm_setr_epi8(      X, 6, X,
                                                    X, X, X, 7, X,
                                                    X, X, X, 8, X,
                                                    X, X, X);
        const __m128i xmm_shuffle33 = _mm_setr_epi8(          9, X,
                                                    X, X, X, 10, X,
                                                    X, X, X, 11, X,
                                                    X, X, X, 12);
        const __m128i xmm_shuffle34 = _mm_setr_epi8(             X,
                                                    X, X, X, 13, X,
                                                    X, X, X, 14, X,
                                                    X, X, X, 15, X);

        // How to dispatch 16 values of row=4 onto 5x16 bytes
        const __m128i xmm_shuffle40 = _mm_setr_epi8(X, X, X, X, 0,
                                                    X, X, X, X, 1,
                                                    X, X, X, X, 2,
                                                    X);
        const __m128i xmm_shuffle41 = _mm_setr_epi8(   X, X, X, 3,
                                                    X, X, X, X, 4,
                                                    X, X, X, X, 5,
                                                    X, X);
        const __m128i xmm_shuffle42 = _mm_setr_epi8(      X, X, 6,
                                                    X, X, X, X, 7,
                                                    X, X, X, X, 8,
                                                    X, X, X);
        const __m128i xmm_shuffle43 = _mm_setr_epi8(          X, 9,
                                                    X, X, X, X, 10,
                                                    X, X, X, X, 11,
                                                    X, X, X, X);
        const __m128i xmm_shuffle44 = _mm_setr_epi8(            12,
                                                    X, X, X, X, 13,
                                                    X, X, X, X, 14,
                                                    X, X, X, X, 15);
        // clang-format on

        for (; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
        {
// Load 16 bytes of each of the 5 source rows (row stride is nIters).
#define LOAD(x) __m128i xmm##x = loadu(pSrc + i, x, nIters)
            LOAD(0);
            LOAD(1);
            LOAD(2);
            LOAD(3);
            LOAD(4);

#define SHUFFLE(x, y) _mm_shuffle_epi8(xmm##y, xmm_shuffle##y##x)
// OR together the contributions of the 5 rows to output chunk x.
#define COMBINE_5(x) \
    GDAL_mm_or_5_si128(SHUFFLE(x, 0), SHUFFLE(x, 1), SHUFFLE(x, 2), \
                       SHUFFLE(x, 3), SHUFFLE(x, 4))

// Store output chunk x at byte offset 5 * i + 16 * x of pDst.
#define STORE(x) \
    storeu(pDst, 5 * (i / VALS_PER_ITER) + x, VALS_PER_ITER, COMBINE_5(x))
            STORE(0);
            STORE(1);
            STORE(2);
            STORE(3);
            STORE(4);
#undef LOAD
#undef COMBINE_5
#undef SHUFFLE
#undef STORE
        }
    }

    // Scalar tail for the remaining (nIters % 16) values.
    for (; i < nIters; ++i)
    {
#define INTERLEAVE(x) pDst[5 * i + x] = pSrc[i + x * nIters]
        INTERLEAVE(0);
        INTERLEAVE(1);
        INTERLEAVE(2);
        INTERLEAVE(3);
        INTERLEAVE(4);
#undef INTERLEAVE
    }
}
619 :
620 : /************************************************************************/
621 : /* GDALTranspose2D_Byte_SSSE3() */
622 : /************************************************************************/
623 :
// Given r = (b00, b01, b02, b03,
//            b10, b11, b12, b13,
//            b20, b21, b22, b23,
//            b30, b31, b32, b33)
// Return (b00, b10, b20, b30,
//         b01, b11, b21, b31,
//         b02, b12, b22, b32,
//         b03, b13, b23, b33)
inline void GDALReorderForTranspose4x4(__m128i &r)
{
    // In-register transpose of a 4x4 byte matrix: byte j of 32-bit word i
    // moves to byte i of word j. (_mm_set_epi8 takes its arguments from
    // most-significant to least-significant byte.)
    const __m128i shuffle_mask =
        _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);

    r = _mm_shuffle_epi8(r, shuffle_mask);
}
639 :
// Transpose the 16x16 byte values contained in the 16 SSE registers
inline void GDALTranspose16x16ByteBlock_SSSE3(
    __m128i &r00, __m128i &r01, __m128i &r02, __m128i &r03, __m128i &r04,
    __m128i &r05, __m128i &r06, __m128i &r07, __m128i &r08, __m128i &r09,
    __m128i &r10, __m128i &r11, __m128i &r12, __m128i &r13, __m128i &r14,
    __m128i &r15)
{
    // The 16x16 byte matrix is viewed as a 4x4 grid of 4x4-byte tiles.
    // Three stages:
    //   1) word-level transpose of each 4-register group (moves tiles
    //      within a column of the grid),
    //   2) byte-level transpose inside every 4x4 tile (PSHUFB),
    //   3) word-level transpose across the groups to put every tile at its
    //      final transposed position.
    __m128i tmp00, tmp01, tmp02, tmp03;
    __m128i tmp10, tmp11, tmp12, tmp13;
    __m128i tmp20, tmp21, tmp22, tmp23;
    __m128i tmp30, tmp31, tmp32, tmp33;

    GDALTranspose4x4Int32(r00, r01, r02, r03, tmp00, tmp01, tmp02, tmp03);
    GDALTranspose4x4Int32(r04, r05, r06, r07, tmp10, tmp11, tmp12, tmp13);
    GDALTranspose4x4Int32(r08, r09, r10, r11, tmp20, tmp21, tmp22, tmp23);
    GDALTranspose4x4Int32(r12, r13, r14, r15, tmp30, tmp31, tmp32, tmp33);

    GDALReorderForTranspose4x4(tmp00);
    GDALReorderForTranspose4x4(tmp01);
    GDALReorderForTranspose4x4(tmp02);
    GDALReorderForTranspose4x4(tmp03);
    GDALReorderForTranspose4x4(tmp10);
    GDALReorderForTranspose4x4(tmp11);
    GDALReorderForTranspose4x4(tmp12);
    GDALReorderForTranspose4x4(tmp13);
    GDALReorderForTranspose4x4(tmp20);
    GDALReorderForTranspose4x4(tmp21);
    GDALReorderForTranspose4x4(tmp22);
    GDALReorderForTranspose4x4(tmp23);
    GDALReorderForTranspose4x4(tmp30);
    GDALReorderForTranspose4x4(tmp31);
    GDALReorderForTranspose4x4(tmp32);
    GDALReorderForTranspose4x4(tmp33);

    GDALTranspose4x4Int32(tmp00, tmp10, tmp20, tmp30, r00, r01, r02, r03);
    GDALTranspose4x4Int32(tmp01, tmp11, tmp21, tmp31, r04, r05, r06, r07);
    GDALTranspose4x4Int32(tmp02, tmp12, tmp22, tmp32, r08, r09, r10, r11);
    GDALTranspose4x4Int32(tmp03, tmp13, tmp23, tmp33, r12, r13, r14, r15);
}
679 :
// Transpose a full 16x16 byte block: read 16 rows of 16 bytes at pSrc (rows
// srcStride bytes apart), write the transposed 16 rows at pDst (rows
// dstStride bytes apart).
inline void GDALTranspose2D16x16Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
                                           uint8_t *CPL_RESTRICT pDst,
                                           size_t srcStride, size_t dstStride)
{
// Load the 16 source rows into 16 SSE registers.
#define LOAD(x) __m128i r##x = loadu(pSrc, x, srcStride)
    LOAD(0);
    LOAD(1);
    LOAD(2);
    LOAD(3);
    LOAD(4);
    LOAD(5);
    LOAD(6);
    LOAD(7);
    LOAD(8);
    LOAD(9);
    LOAD(10);
    LOAD(11);
    LOAD(12);
    LOAD(13);
    LOAD(14);
    LOAD(15);
#undef LOAD

    // In-register 16x16 byte transposition.
    GDALTranspose16x16ByteBlock_SSSE3(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9,
                                      r10, r11, r12, r13, r14, r15);

// Store the 16 transposed rows.
#define STORE(x) storeu(pDst, x, dstStride, r##x)
    STORE(0);
    STORE(1);
    STORE(2);
    STORE(3);
    STORE(4);
    STORE(5);
    STORE(6);
    STORE(7);
    STORE(8);
    STORE(9);
    STORE(10);
    STORE(11);
    STORE(12);
    STORE(13);
    STORE(14);
    STORE(15);
#undef STORE
}
725 :
// Transpose the nSrcHeight x nSrcWidth byte matrix pSrc into pDst (which thus
// has nSrcWidth rows of nSrcHeight values):
// pDst[k + l * nSrcHeight] = pSrc[l + k * nSrcWidth].
void GDALTranspose2D_Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
                                uint8_t *CPL_RESTRICT pDst, size_t nSrcWidth,
                                size_t nSrcHeight)
{
    if (nSrcHeight == 3)
    {
        // Transposing a 3-row matrix is exactly interleaving 3 planar rows.
        GDALInterleave3Byte_SSSE3(pSrc, pDst, nSrcWidth);
    }
    else if (nSrcHeight == 5)
    {
        // Likewise for 5 rows.
        GDALInterleave5Byte_SSSE3(pSrc, pDst, nSrcWidth);
    }
    else
    {
        // Generic case: tile the matrix in 16x16 blocks.
        constexpr size_t blocksize = 16;
        for (size_t i = 0; i < nSrcHeight; i += blocksize)
        {
            const size_t max_k = std::min(i + blocksize, nSrcHeight);
            for (size_t j = 0; j < nSrcWidth; j += blocksize)
            {
                // transpose the block beginning at [i,j]
                const size_t max_l = std::min(j + blocksize, nSrcWidth);
                if (max_k - i == blocksize && max_l - j == blocksize)
                {
                    // Full 16x16 tile: SIMD path.
                    GDALTranspose2D16x16Byte_SSSE3(&pSrc[j + i * nSrcWidth],
                                                   &pDst[i + j * nSrcHeight],
                                                   nSrcWidth, nSrcHeight);
                }
                else
                {
                    // Partial tile at the right/bottom edge: scalar fallback.
                    for (size_t k = i; k < max_k; ++k)
                    {
                        for (size_t l = j; l < max_l; ++l)
                        {
                            GDALCopyWord(pSrc[l + k * nSrcWidth],
                                         pDst[k + l * nSrcHeight]);
                        }
                    }
                }
            }
        }
    }
}
769 :
770 : #endif // HAVE_SSSE3_AT_COMPILE_TIME
|