Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: GDAL Core
4 : * Purpose: SSSE3 specializations
5 : * Author: Even Rouault <even dot rouault at spatialys dot com>
6 : *
7 : ******************************************************************************
8 : * Copyright (c) 2016, Even Rouault <even dot rouault at spatialys dot com>
9 : *
10 : * SPDX-License-Identifier: MIT
11 : ****************************************************************************/
12 :
13 : #include "cpl_port.h"
14 :
15 : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
16 : (defined(__x86_64) || defined(_M_X64))) || \
17 : defined(USE_NEON_OPTIMIZATIONS)
18 :
19 : #include "rasterio_ssse3.h"
20 :
21 : #ifdef USE_NEON_OPTIMIZATIONS
22 : #include "include_sse2neon.h"
23 : #else
24 : #include <tmmintrin.h>
25 : #endif
26 :
27 : #include "gdal_priv_templates.hpp"
28 :
29 179427 : void GDALUnrolledCopy_GByte_3_1_SSSE3(GByte *CPL_RESTRICT pDest,
30 : const GByte *CPL_RESTRICT pSrc,
31 : GPtrDiff_t nIters)
32 : {
33 : decltype(nIters) i;
34 179427 : const __m128i xmm_shuffle0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
35 : -1, -1, 15, 12, 9, 6, 3, 0);
36 179427 : const __m128i xmm_shuffle1 = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11, 8, 5,
37 : 2, -1, -1, -1, -1, -1, -1);
38 179427 : const __m128i xmm_shuffle2 = _mm_set_epi8(13, 10, 7, 4, 1, -1, -1, -1, -1,
39 : -1, -1, -1, -1, -1, -1, -1);
40 : // If we were sure that there would always be 2 trailing bytes, we could
41 : // check against nIters - 15
42 30307300 : for (i = 0; i < nIters - 16; i += 16)
43 : {
44 : __m128i xmm0 =
45 30127900 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
46 : __m128i xmm1 =
47 30127900 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
48 : __m128i xmm2 =
49 60255700 : _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
50 :
51 : // From LSB to MSB:
52 : // 0,x,x,1,x,x,2,x,x,3,x,x,4,x,x,5 --> 0,1,2,3,4,5,0,0,0,0,0,0,0,0,0
53 30127900 : xmm0 = _mm_shuffle_epi8(xmm0, xmm_shuffle0);
54 : // x,x,6,x,x,7,x,x,8,x,x,9,x,x,10,x --> 0,0,0,0,0,0,6,7,8,9,10,0,0,0,0,0
55 30127900 : xmm1 = _mm_shuffle_epi8(xmm1, xmm_shuffle1);
56 : // x,11,x,x,12,x,x,13,x,x,14,x,x,15,x,x -->
57 : // 0,0,0,0,0,0,0,0,0,0,0,11,12,13,14,15
58 30127900 : xmm2 = _mm_shuffle_epi8(xmm2, xmm_shuffle2);
59 30127900 : xmm0 = _mm_or_si128(xmm0, xmm1);
60 30127900 : xmm0 = _mm_or_si128(xmm0, xmm2);
61 :
62 30127900 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
63 :
64 30127900 : pSrc += 3 * 16;
65 : }
66 2785930 : for (; i < nIters; i++)
67 : {
68 2606510 : pDest[i] = *pSrc;
69 2606510 : pSrc += 3;
70 : }
71 179427 : }
72 :
73 : /************************************************************************/
74 : /* GDALDeinterleave3Byte_SSSE3() */
75 : /************************************************************************/
76 :
77 : #if defined(__GNUC__) && !defined(__clang__)
78 : // GCC autovectorizer does an excellent job
// Deinterleave a pixel-interleaved 3-byte buffer (e.g. RGB) into three
// separate per-band buffers of nIters bytes each.
// Intentionally kept as a plain scalar loop: with the
// optimize("tree-vectorize") attribute, GCC autovectorizes this exact
// form into efficient SIMD code, so no hand-written intrinsics are needed.
__attribute__((optimize("tree-vectorize"))) void GDALDeinterleave3Byte_SSSE3(
    const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
    GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2, size_t nIters)
{
    for (size_t i = 0; i < nIters; ++i)
    {
        pabyDest0[i] = pabySrc[3 * i + 0];  // first band
        pabyDest1[i] = pabySrc[3 * i + 1];  // second band
        pabyDest2[i] = pabySrc[3 * i + 2];  // third band
    }
}
90 : #else
91 : void GDALDeinterleave3Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
92 : GByte *CPL_RESTRICT pabyDest0,
93 : GByte *CPL_RESTRICT pabyDest1,
94 : GByte *CPL_RESTRICT pabyDest2, size_t nIters)
95 : {
96 : size_t i = 0;
97 : for (; i + 15 < nIters; i += 16)
98 : {
99 : __m128i xmm0 = _mm_loadu_si128(
100 : reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 0));
101 : __m128i xmm1 = _mm_loadu_si128(
102 : reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 16));
103 : __m128i xmm2 = _mm_loadu_si128(
104 : reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 32));
105 : auto xmm0_new =
106 : _mm_shuffle_epi8(xmm0, _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10,
107 : 7, 4, 1, 9, 6, 3, 0));
108 : auto xmm1_new = _mm_shuffle_epi8(
109 : _mm_alignr_epi8(xmm1, xmm0, 12),
110 : _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, 7, 4, 1, 9, 6, 3, 0));
111 : auto xmm2_new = _mm_shuffle_epi8(
112 : _mm_alignr_epi8(xmm2, xmm1, 8),
113 : _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, 7, 4, 1, 9, 6, 3, 0));
114 : auto xmm3_new =
115 : _mm_shuffle_epi8(xmm2, _mm_set_epi8(-1, -1, -1, -1, 15, 12, 9, 6,
116 : 14, 11, 8, 5, 13, 10, 7, 4));
117 :
118 : __m128i xmm01lo =
119 : _mm_unpacklo_epi32(xmm0_new, xmm1_new); // W0 W4 W1 W5
120 : __m128i xmm01hi = _mm_unpackhi_epi32(xmm0_new, xmm1_new); // W2 W6 - -
121 : __m128i xmm23lo =
122 : _mm_unpacklo_epi32(xmm2_new, xmm3_new); // W8 WC W9 WD
123 : __m128i xmm23hi = _mm_unpackhi_epi32(xmm2_new, xmm3_new); // WA WE - -
124 : xmm0_new = _mm_unpacklo_epi64(xmm01lo, xmm23lo); // W0 W4 W8 WC
125 : xmm1_new = _mm_unpackhi_epi64(xmm01lo, xmm23lo); // W1 W5 W9 WD
126 : xmm2_new = _mm_unpacklo_epi64(xmm01hi, xmm23hi); // W2 W6 WA WE
127 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0_new);
128 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1_new);
129 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest2 + i), xmm2_new);
130 : }
131 : #if defined(__clang__)
132 : #pragma clang loop vectorize(disable)
133 : #endif
134 : for (; i < nIters; ++i)
135 : {
136 : pabyDest0[i] = pabySrc[3 * i + 0];
137 : pabyDest1[i] = pabySrc[3 * i + 1];
138 : pabyDest2[i] = pabySrc[3 * i + 2];
139 : }
140 : }
141 : #endif
142 :
143 : /************************************************************************/
144 : /* GDALDeinterleave4Byte_SSSE3() */
145 : /************************************************************************/
146 :
147 : #if !defined(__GNUC__) || defined(__clang__)
148 : void GDALDeinterleave4Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
149 : GByte *CPL_RESTRICT pabyDest0,
150 : GByte *CPL_RESTRICT pabyDest1,
151 : GByte *CPL_RESTRICT pabyDest2,
152 : GByte *CPL_RESTRICT pabyDest3, size_t nIters)
153 : {
154 : const __m128i shuffle_mask =
155 : _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
156 : size_t i = 0;
157 : for (; i + 15 < nIters; i += 16)
158 : {
159 : __m128i xmm0 = _mm_loadu_si128(
160 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
161 : __m128i xmm1 = _mm_loadu_si128(
162 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
163 : __m128i xmm2 = _mm_loadu_si128(
164 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
165 : __m128i xmm3 = _mm_loadu_si128(
166 : reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
167 : xmm0 = _mm_shuffle_epi8(xmm0, shuffle_mask); // W0 W1 W2 W3
168 : xmm1 = _mm_shuffle_epi8(xmm1, shuffle_mask); // W4 W5 W6 W7
169 : xmm2 = _mm_shuffle_epi8(xmm2, shuffle_mask); // W8 W9 WA WB
170 : xmm3 = _mm_shuffle_epi8(xmm3, shuffle_mask); // WC WD WE WF
171 :
172 : __m128i xmm01lo = _mm_unpacklo_epi32(xmm0, xmm1); // W0 W4 W1 W5
173 : __m128i xmm01hi = _mm_unpackhi_epi32(xmm0, xmm1); // W2 W6 W3 W7
174 : __m128i xmm23lo = _mm_unpacklo_epi32(xmm2, xmm3); // W8 WC W9 WD
175 : __m128i xmm23hi = _mm_unpackhi_epi32(xmm2, xmm3); // WA WE WB WF
176 : xmm0 = _mm_unpacklo_epi64(xmm01lo, xmm23lo); // W0 W4 W8 WC
177 : xmm1 = _mm_unpackhi_epi64(xmm01lo, xmm23lo); // W1 W5 W9 WD
178 : xmm2 = _mm_unpacklo_epi64(xmm01hi, xmm23hi); // W2 W6 WA WE
179 : xmm3 = _mm_unpackhi_epi64(xmm01hi, xmm23hi); // W3 W7 WB WF
180 :
181 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0);
182 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1);
183 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest2 + i), xmm2);
184 : _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest3 + i), xmm3);
185 : }
186 : #if defined(__clang__)
187 : #pragma clang loop vectorize(disable)
188 : #endif
189 : for (; i < nIters; ++i)
190 : {
191 : pabyDest0[i] = pabySrc[4 * i + 0];
192 : pabyDest1[i] = pabySrc[4 * i + 1];
193 : pabyDest2[i] = pabySrc[4 * i + 2];
194 : pabyDest3[i] = pabySrc[4 * i + 3];
195 : }
196 : }
197 : #endif
198 :
199 : /************************************************************************/
200 : /* GDALDeinterleave3UInt16_SSSE3() */
201 : /************************************************************************/
202 :
203 : #if (defined(__GNUC__) && !defined(__clang__)) || \
204 : defined(__INTEL_CLANG_COMPILER)
205 : #if !defined(__INTEL_CLANG_COMPILER)
206 : // GCC autovectorizer does an excellent job
207 : __attribute__((optimize("tree-vectorize")))
208 : #endif
// Deinterleave a pixel-interleaved 3-component UInt16 buffer into three
// separate per-band buffers of nIters elements each.
// Intentionally a plain scalar loop: the optimize("tree-vectorize")
// attribute applied above (for GCC) makes the compiler autovectorize this
// exact form, so no hand-written intrinsics are needed.
void GDALDeinterleave3UInt16_SSSE3(const GUInt16* CPL_RESTRICT panSrc,
                                   GUInt16* CPL_RESTRICT panDest0,
                                   GUInt16* CPL_RESTRICT panDest1,
                                   GUInt16* CPL_RESTRICT panDest2,
                                   size_t nIters)
{
    for (size_t i = 0; i < nIters; ++i)
    {
        panDest0[i] = panSrc[3 * i + 0];  // first band
        panDest1[i] = panSrc[3 * i + 1];  // second band
        panDest2[i] = panSrc[3 * i + 2];  // third band
    }
}
222 : #endif
223 :
224 : /************************************************************************/
225 : /* GDALDeinterleave4UInt16_SSSE3() */
226 : /************************************************************************/
227 :
228 : #if (defined(__GNUC__) && !defined(__clang__)) || \
229 : defined(__INTEL_CLANG_COMPILER)
230 : #if !defined(__INTEL_CLANG_COMPILER)
231 : // GCC autovectorizer does an excellent job
232 : __attribute__((optimize("tree-vectorize")))
233 : #endif
// Deinterleave a pixel-interleaved 4-component UInt16 buffer into four
// separate per-band buffers of nIters elements each.
// Intentionally a plain scalar loop: the optimize("tree-vectorize")
// attribute applied above (for GCC) makes the compiler autovectorize this
// exact form, so no hand-written intrinsics are needed.
void GDALDeinterleave4UInt16_SSSE3(const GUInt16* CPL_RESTRICT panSrc,
                                   GUInt16* CPL_RESTRICT panDest0,
                                   GUInt16* CPL_RESTRICT panDest1,
                                   GUInt16* CPL_RESTRICT panDest2,
                                   GUInt16* CPL_RESTRICT panDest3,
                                   size_t nIters)
{
    for (size_t i = 0; i < nIters; ++i)
    {
        panDest0[i] = panSrc[4 * i + 0];  // first band
        panDest1[i] = panSrc[4 * i + 1];  // second band
        panDest2[i] = panSrc[4 * i + 2];  // third band
        panDest3[i] = panSrc[4 * i + 3];  // fourth band
    }
}
249 : #endif
250 :
251 : #endif // HAVE_SSSE3_AT_COMPILE_TIME
|