Line data Source code
1 : /******************************************************************************
2 : *
3 : * Project: GDAL Core
4 : * Purpose: SSSE3 specializations
5 : * Author: Even Rouault <even dot rouault at spatialys dot com>
6 : *
7 : ******************************************************************************
8 : * Copyright (c) 2016, Even Rouault <even dot rouault at spatialys dot com>
9 : *
10 : * Permission is hereby granted, free of charge, to any person obtaining a
11 : * copy of this software and associated documentation files (the "Software"),
12 : * to deal in the Software without restriction, including without limitation
13 : * the rights to use, copy, modify, merge, publish, distribute, sublicense,
14 : * and/or sell copies of the Software, and to permit persons to whom the
15 : * Software is furnished to do so, subject to the following conditions:
16 : *
17 : * The above copyright notice and this permission notice shall be included
18 : * in all copies or substantial portions of the Software.
19 : *
20 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21 : * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 : * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 : * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 : * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 : * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 : * DEALINGS IN THE SOFTWARE.
27 : ****************************************************************************/
28 :
29 : #include "cpl_port.h"
30 :
31 : #if defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
32 : (defined(__x86_64) || defined(_M_X64))
33 :
34 : #include "rasterio_ssse3.h"
35 :
36 : #include <tmmintrin.h>
37 : #include "gdal_priv_templates.hpp"
38 :
// Specialized strided copy extracting one byte out of every three:
// pDest[i] = pSrc[3 * i] for i in [0, nIters), i.e. pulling the first band
// out of a 3-band byte-interleaved buffer. Processes 16 output bytes
// (48 source bytes) per SIMD iteration, with a scalar tail loop.
void GDALUnrolledCopy_GByte_3_1_SSSE3(GByte *CPL_RESTRICT pDest,
                                      const GByte *CPL_RESTRICT pSrc,
                                      GPtrDiff_t nIters)
{
    decltype(nIters) i;
    // PSHUFB control masks: a -1 byte zeroes the destination lane, a
    // non-negative value selects that source byte.
    // Gathers source bytes 0,3,6,9,12,15 of the 1st load into lanes 0..5.
    const __m128i xmm_shuffle0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
                                              -1, -1, 15, 12, 9, 6, 3, 0);
    // Gathers source bytes 2,5,8,11,14 of the 2nd load into lanes 6..10.
    const __m128i xmm_shuffle1 = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11, 8, 5,
                                              2, -1, -1, -1, -1, -1, -1);
    // Gathers source bytes 1,4,7,10,13 of the 3rd load into lanes 11..15.
    const __m128i xmm_shuffle2 = _mm_set_epi8(13, 10, 7, 4, 1, -1, -1, -1, -1,
                                              -1, -1, -1, -1, -1, -1, -1);
    // If we were sure that there would always be 2 trailing bytes, we could
    // check against nIters - 15: the third 16-byte load reads 2 bytes past
    // the last source byte actually needed (offset 45), so the stricter
    // bound keeps the loads inside the buffer.
    for (i = 0; i < nIters - 16; i += 16)
    {
        __m128i xmm0 =
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
        __m128i xmm1 =
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
        __m128i xmm2 =
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));

        // From LSB to MSB:
        // 0,x,x,1,x,x,2,x,x,3,x,x,4,x,x,5 --> 0,1,2,3,4,5,0,0,0,0,0,0,0,0,0
        xmm0 = _mm_shuffle_epi8(xmm0, xmm_shuffle0);
        // x,x,6,x,x,7,x,x,8,x,x,9,x,x,10,x --> 0,0,0,0,0,0,6,7,8,9,10,0,0,0,0,0
        xmm1 = _mm_shuffle_epi8(xmm1, xmm_shuffle1);
        // x,11,x,x,12,x,x,13,x,x,14,x,x,15,x,x -->
        // 0,0,0,0,0,0,0,0,0,0,0,11,12,13,14,15
        xmm2 = _mm_shuffle_epi8(xmm2, xmm_shuffle2);
        // Merge the three partial results: each output lane is non-zero in
        // exactly one of them.
        xmm0 = _mm_or_si128(xmm0, xmm1);
        xmm0 = _mm_or_si128(xmm0, xmm2);

        _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);

        pSrc += 3 * 16;
    }
    // Scalar tail: finishes the remaining elements (always at least one,
    // given the strict bound above).
    for (; i < nIters; i++)
    {
        pDest[i] = *pSrc;
        pSrc += 3;
    }
}
82 :
83 : /************************************************************************/
84 : /* GDALDeinterleave3Byte_SSSE3() */
85 : /************************************************************************/
86 :
#if defined(__GNUC__) && !defined(__clang__)
// GCC autovectorizer does an excellent job
// Deinterleave a 3-component (e.g. RGB) pixel-interleaved byte buffer into
// three per-band buffers: pabyDestK[i] = pabySrc[3*i + K]. nIters is the
// number of pixels. The plain loop is intentional: with the attribute below
// GCC generates good SIMD code for it.
__attribute__((optimize("tree-vectorize"))) void GDALDeinterleave3Byte_SSSE3(
    const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
    GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2, size_t nIters)
{
    for (size_t i = 0; i < nIters; ++i)
    {
        pabyDest0[i] = pabySrc[3 * i + 0];
        pabyDest1[i] = pabySrc[3 * i + 1];
        pabyDest2[i] = pabySrc[3 * i + 2];
    }
}
#else
// Manual SSSE3 version of the above: deinterleaves 16 pixels (48 source
// bytes) per iteration, with a scalar tail loop.
void GDALDeinterleave3Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
                                 GByte *CPL_RESTRICT pabyDest0,
                                 GByte *CPL_RESTRICT pabyDest1,
                                 GByte *CPL_RESTRICT pabyDest2, size_t nIters)
{
    size_t i = 0;
    for (; i + 15 < nIters; i += 16)
    {
        // Load 48 bytes = 16 three-byte pixels.
        __m128i xmm0 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 0));
        __m128i xmm1 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 16));
        __m128i xmm2 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 32));
        // PSHUFB each 12-byte group of 4 pixels so that the four band-0
        // bytes land in dword 0, band-1 bytes in dword 1 and band-2 bytes
        // in dword 2 (dword 3 is zeroed by the -1 lanes).
        auto xmm0_new =
            _mm_shuffle_epi8(xmm0, _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10,
                                                7, 4, 1, 9, 6, 3, 0));
        // _mm_alignr_epi8(hi, lo, N) shifts the 256-bit pair right by N
        // bytes, presenting pixels 4..7 (resp. 8..11) as a contiguous
        // 16-byte vector for the same shuffle.
        auto xmm1_new = _mm_shuffle_epi8(
            _mm_alignr_epi8(xmm1, xmm0, 12),
            _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, 7, 4, 1, 9, 6, 3, 0));
        auto xmm2_new = _mm_shuffle_epi8(
            _mm_alignr_epi8(xmm2, xmm1, 8),
            _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, 7, 4, 1, 9, 6, 3, 0));
        // Pixels 12..15 sit in the high 12 bytes of xmm2: shuffle directly.
        auto xmm3_new =
            _mm_shuffle_epi8(xmm2, _mm_set_epi8(-1, -1, -1, -1, 15, 12, 9, 6,
                                                14, 11, 8, 5, 13, 10, 7, 4));

        // Dword transpose: WN denotes dword N across the four shuffled
        // groups; gather matching dwords so each register ends up holding
        // 16 consecutive bytes of a single band.
        __m128i xmm01lo =
            _mm_unpacklo_epi32(xmm0_new, xmm1_new);  // W0 W4 W1 W5
        __m128i xmm01hi = _mm_unpackhi_epi32(xmm0_new, xmm1_new);  // W2 W6 - -
        __m128i xmm23lo =
            _mm_unpacklo_epi32(xmm2_new, xmm3_new);  // W8 WC W9 WD
        __m128i xmm23hi = _mm_unpackhi_epi32(xmm2_new, xmm3_new);  // WA WE - -
        xmm0_new = _mm_unpacklo_epi64(xmm01lo, xmm23lo);  // W0 W4 W8 WC
        xmm1_new = _mm_unpackhi_epi64(xmm01lo, xmm23lo);  // W1 W5 W9 WD
        xmm2_new = _mm_unpacklo_epi64(xmm01hi, xmm23hi);  // W2 W6 WA WE
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0_new);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1_new);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest2 + i), xmm2_new);
    }
// Tail loop: keep it scalar; vectorizing fewer than 16 iterations is not
// worth it.
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
    for (; i < nIters; ++i)
    {
        pabyDest0[i] = pabySrc[3 * i + 0];
        pabyDest1[i] = pabySrc[3 * i + 1];
        pabyDest2[i] = pabySrc[3 * i + 2];
    }
}
#endif
152 :
153 : /************************************************************************/
154 : /* GDALDeinterleave4Byte_SSSE3() */
155 : /************************************************************************/
156 :
#if !defined(__GNUC__) || defined(__clang__)
// Deinterleave a 4-component (e.g. RGBA) pixel-interleaved byte buffer into
// four per-band buffers: pabyDestK[i] = pabySrc[4*i + K]. nIters is the
// number of pixels. Manual SSSE3 version used where the GCC autovectorized
// plain-loop variant is not compiled; processes 16 pixels (64 source bytes)
// per iteration, with a scalar tail loop.
void GDALDeinterleave4Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
                                 GByte *CPL_RESTRICT pabyDest0,
                                 GByte *CPL_RESTRICT pabyDest1,
                                 GByte *CPL_RESTRICT pabyDest2,
                                 GByte *CPL_RESTRICT pabyDest3, size_t nIters)
{
    // PSHUFB mask packing each 16-byte group of 4 pixels band-wise:
    // band-0 bytes (0,4,8,12) into dword 0, band-1 into dword 1, etc.
    const __m128i shuffle_mask =
        _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
    size_t i = 0;
    for (; i + 15 < nIters; i += 16)
    {
        // Load 64 bytes = 16 four-byte pixels.
        __m128i xmm0 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
        __m128i xmm1 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
        __m128i xmm2 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
        __m128i xmm3 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
        // WN below denotes dword N (band N) of each shuffled group.
        xmm0 = _mm_shuffle_epi8(xmm0, shuffle_mask);  // W0 W1 W2 W3
        xmm1 = _mm_shuffle_epi8(xmm1, shuffle_mask);  // W4 W5 W6 W7
        xmm2 = _mm_shuffle_epi8(xmm2, shuffle_mask);  // W8 W9 WA WB
        xmm3 = _mm_shuffle_epi8(xmm3, shuffle_mask);  // WC WD WE WF

        // Dword transpose: gather matching dwords of the four groups so
        // each register ends up holding 16 consecutive bytes of one band.
        __m128i xmm01lo = _mm_unpacklo_epi32(xmm0, xmm1);  // W0 W4 W1 W5
        __m128i xmm01hi = _mm_unpackhi_epi32(xmm0, xmm1);  // W2 W6 W3 W7
        __m128i xmm23lo = _mm_unpacklo_epi32(xmm2, xmm3);  // W8 WC W9 WD
        __m128i xmm23hi = _mm_unpackhi_epi32(xmm2, xmm3);  // WA WE WB WF
        xmm0 = _mm_unpacklo_epi64(xmm01lo, xmm23lo);  // W0 W4 W8 WC
        xmm1 = _mm_unpackhi_epi64(xmm01lo, xmm23lo);  // W1 W5 W9 WD
        xmm2 = _mm_unpacklo_epi64(xmm01hi, xmm23hi);  // W2 W6 WA WE
        xmm3 = _mm_unpackhi_epi64(xmm01hi, xmm23hi);  // W3 W7 WB WF

        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest2 + i), xmm2);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest3 + i), xmm3);
    }
// Tail loop: keep it scalar; vectorizing fewer than 16 iterations is not
// worth it.
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
    for (; i < nIters; ++i)
    {
        pabyDest0[i] = pabySrc[4 * i + 0];
        pabyDest1[i] = pabySrc[4 * i + 1];
        pabyDest2[i] = pabySrc[4 * i + 2];
        pabyDest3[i] = pabySrc[4 * i + 3];
    }
}
#endif
208 :
209 : /************************************************************************/
210 : /* GDALDeinterleave3UInt16_SSSE3() */
211 : /************************************************************************/
212 :
213 : #if (defined(__GNUC__) && !defined(__clang__)) || \
214 : defined(__INTEL_CLANG_COMPILER)
215 : #if !defined(__INTEL_CLANG_COMPILER)
216 : // GCC autovectorizer does an excellent job
217 : __attribute__((optimize("tree-vectorize")))
218 : #endif
219 237 : void GDALDeinterleave3UInt16_SSSE3(const GUInt16* CPL_RESTRICT panSrc,
220 : GUInt16* CPL_RESTRICT panDest0,
221 : GUInt16* CPL_RESTRICT panDest1,
222 : GUInt16* CPL_RESTRICT panDest2,
223 : size_t nIters)
224 : {
225 9256740 : for (size_t i = 0; i < nIters; ++i)
226 : {
227 9256510 : panDest0[i] = panSrc[3 * i + 0];
228 9256510 : panDest1[i] = panSrc[3 * i + 1];
229 9256510 : panDest2[i] = panSrc[3 * i + 2];
230 : }
231 237 : }
232 : #endif
233 :
234 : /************************************************************************/
235 : /* GDALDeinterleave4UInt16_SSSE3() */
236 : /************************************************************************/
237 :
238 : #if (defined(__GNUC__) && !defined(__clang__)) || \
239 : defined(__INTEL_CLANG_COMPILER)
240 : #if !defined(__INTEL_CLANG_COMPILER)
241 : // GCC autovectorizer does an excellent job
242 : __attribute__((optimize("tree-vectorize")))
243 : #endif
244 494 : void GDALDeinterleave4UInt16_SSSE3(const GUInt16* CPL_RESTRICT panSrc,
245 : GUInt16* CPL_RESTRICT panDest0,
246 : GUInt16* CPL_RESTRICT panDest1,
247 : GUInt16* CPL_RESTRICT panDest2,
248 : GUInt16* CPL_RESTRICT panDest3,
249 : size_t nIters)
250 : {
251 472352 : for (size_t i = 0; i < nIters; ++i)
252 : {
253 471858 : panDest0[i] = panSrc[4 * i + 0];
254 471858 : panDest1[i] = panSrc[4 * i + 1];
255 471858 : panDest2[i] = panSrc[4 * i + 2];
256 471858 : panDest3[i] = panSrc[4 * i + 3];
257 : }
258 494 : }
259 : #endif
260 :
261 : #endif // HAVE_SSSE3_AT_COMPILE_TIME
|