Line data Source code
1 : /* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
2 : * Copyright (C) 2013, 2015, 2021 Mark Adler
3 : * Version 1.4 31 May 2021 Mark Adler
4 : */
5 :
6 : // Even Rouault, 2026: Slightly modified to avoid pthread and replacing by
7 : // C++11 static thread-safe initialization
8 :
9 : /*
10 : This software is provided 'as-is', without any express or implied
11 : warranty. In no event will the author be held liable for any damages
12 : arising from the use of this software.
13 :
14 : Permission is granted to anyone to use this software for any purpose,
15 : including commercial applications, and to alter it and redistribute it
16 : freely, subject to the following restrictions:
17 :
18 : 1. The origin of this software must not be misrepresented; you must not
19 : claim that you wrote the original software. If you use this software
20 : in a product, an acknowledgment in the product documentation would be
21 : appreciated but is not required.
22 : 2. Altered source versions must be plainly marked as such, and must not be
23 : misrepresented as being the original software.
24 : 3. This notice may not be removed or altered from any source distribution.
25 :
26 : Mark Adler
27 : madler@alumni.caltech.edu
28 : */
29 :
30 : /* Use hardware CRC instruction on Intel SSE 4.2 processors. This computes a
31 : CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc. A software
32 : version is provided as a fall-back, as well as for speed comparisons. */
33 :
34 : /* Version history:
35 : 1.0 10 Feb 2013 First version
36 : 1.1 1 Aug 2013 Correct comments on why three crc instructions in parallel
37 : 1.2 1 Nov 2015 Add const qualifier to avoid compiler warning
38 : Load entire input into memory (test code)
39 : Argument gives number of times to repeat (test code)
40 : Argument < 0 forces software implementation (test code)
41 : 1.3 31 Dec 2015 Check for Intel architecture using compiler macro
42 : Support big-endian processors in software calculation
43 : Add header for external use
44 : 1.4 31 May 2021 Correct register constraints on assembly instructions
45 : */
46 :
47 : #include "cpl_port.h"
48 :
49 : #include "crc32c.h"
50 :
51 : crc_func crc32c;
52 :
53 : /* CRC-32C (iSCSI) polynomial in reversed bit order. */
54 : #define POLY 0x82f63b78
55 :
56 : uint32_t crc32c_sw_little(uint32_t crc, void const *buf, size_t len);
57 : uint32_t crc32c_sw_big(uint32_t crc, void const *buf, size_t len);
58 : #ifdef __x86_64__
59 :
60 : /* Hardware CRC-32C for Intel and compatible processors. */
61 :
62 : /* Multiply a matrix times a vector over the Galois field of two elements,
63 : GF(2). Each element is a bit in an unsigned integer. mat must have at
64 : least as many entries as the power of two for most significant one bit in
65 : vec. */
66 2912 : static inline uint32_t gf2_matrix_times(uint32_t *mat, uint32_t vec) {
67 2912 : uint32_t sum = 0;
68 64135 : while (vec) {
69 61223 : if (vec & 1)
70 18172 : sum ^= *mat;
71 61223 : vec >>= 1;
72 61223 : mat++;
73 : }
74 2912 : return sum;
75 : }
76 :
77 : /* Multiply a matrix by itself over GF(2). Both mat and square must have 32
78 : rows. */
79 27 : static inline void gf2_matrix_square(uint32_t *square, uint32_t *mat) {
80 891 : for (unsigned n = 0; n < 32; n++)
81 864 : square[n] = gf2_matrix_times(mat, mat[n]);
82 27 : }
83 :
84 : /* Construct an operator to apply len zeros to a crc. len must be a power of
85 : two. If len is not a power of two, then the result is the same as for the
86 : largest power of two less than len. The result for len == 0 is the same as
87 : for len == 1. A version of this routine could be easily written for any
88 : len, but that is not needed for this application. */
89 2 : static void crc32c_zeros_op(uint32_t *even, size_t len) {
90 : uint32_t odd[32]; /* odd-power-of-two zeros operator */
91 :
92 : /* put operator for one zero bit in odd */
93 2 : odd[0] = POLY; /* CRC-32C polynomial */
94 2 : uint32_t row = 1;
95 64 : for (unsigned n = 1; n < 32; n++) {
96 62 : odd[n] = row;
97 62 : row <<= 1;
98 : }
99 :
100 : /* put operator for two zero bits in even */
101 2 : gf2_matrix_square(even, odd);
102 :
103 : /* put operator for four zero bits in odd */
104 2 : gf2_matrix_square(odd, even);
105 :
106 : /* first square will put the operator for one zero byte (eight zero bits),
107 : in even -- next square puts operator for two zero bytes in odd, and so
108 : on, until len has been rotated down to zero */
109 10 : do {
110 12 : gf2_matrix_square(even, odd);
111 12 : len >>= 1;
112 12 : if (len == 0)
113 1 : return;
114 11 : gf2_matrix_square(odd, even);
115 11 : len >>= 1;
116 11 : } while (len);
117 :
118 : /* answer ended up in odd -- copy to even */
119 33 : for (unsigned n = 0; n < 32; n++)
120 32 : even[n] = odd[n];
121 : }
122 :
123 : /* Take a length and build four lookup tables for applying the zeros operator
124 : for that length, byte-by-byte on the operand. */
125 2 : static void crc32c_zeros(uint32_t zeros[][256], size_t len) {
126 : uint32_t op[32];
127 :
128 2 : crc32c_zeros_op(op, len);
129 514 : for (unsigned n = 0; n < 256; n++) {
130 512 : zeros[0][n] = gf2_matrix_times(op, n);
131 512 : zeros[1][n] = gf2_matrix_times(op, n << 8);
132 512 : zeros[2][n] = gf2_matrix_times(op, n << 16);
133 512 : zeros[3][n] = gf2_matrix_times(op, n << 24);
134 : }
135 2 : }
136 :
137 : /* Apply the zeros operator table to crc. */
138 0 : static inline uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc) {
139 0 : return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^
140 0 : zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24];
141 : }
142 :
143 : /* Block sizes for three-way parallel crc computation. LONG and SHORT must
144 : both be powers of two. The associated string constants must be set
145 : accordingly, for use in constructing the assembler instructions. */
146 : #define LONG 8192
147 : #define LONGx1 "8192"
148 : #define LONGx2 "16384"
149 : #define SHORT 256
150 : #define SHORTx1 "256"
151 : #define SHORTx2 "512"
152 :
153 : /* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */
154 : static uint32_t crc32c_long[4][256];
155 : static uint32_t crc32c_short[4][256];
156 :
157 : /* Initialize tables for shifting crcs. */
158 1 : static void crc32c_init_hw(void) {
159 1 : crc32c_zeros(crc32c_long, LONG);
160 1 : crc32c_zeros(crc32c_short, SHORT);
161 1 : }
162 :
163 : /* Compute CRC-32C using the Intel hardware instruction. */
164 168 : static uint32_t crc32c_hw(uint32_t crc, void const *buf, size_t len) {
165 : /* populate shift tables the first time through */
166 1 : static bool bInit = [](bool ret)
167 : {
168 1 : crc32c_init_hw();
169 1 : return ret;
170 168 : }(true);
171 : (void)bInit;
172 :
173 : /* pre-process the crc */
174 168 : crc = ~crc;
175 168 : uint64_t crc0 = crc; /* 64-bits for crc32q instruction */
176 :
177 : /* compute the crc for up to seven leading bytes to bring the data pointer
178 : to an eight-byte boundary */
179 168 : unsigned char const *next = static_cast<unsigned char const*>(buf);
180 168 : while (len && (reinterpret_cast<uintptr_t>(next) & 7) != 0) {
181 : __asm__("crc32b\t" "(%1), %0"
182 : : "+r"(crc0)
183 0 : : "r"(next), "m"(*next));
184 0 : next++;
185 0 : len--;
186 : }
187 :
188 : /* compute the crc on sets of LONG*3 bytes, executing three independent crc
189 : instructions, each on LONG bytes -- this is optimized for the Nehalem,
190 : Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
191 : throughput of one crc per cycle, but a latency of three cycles */
192 168 : while (len >= LONG*3) {
193 0 : uint64_t crc1 = 0;
194 0 : uint64_t crc2 = 0;
195 0 : unsigned char const * const end = next + LONG;
196 0 : do {
197 : __asm__("crc32q\t" "(%3), %0\n\t"
198 : "crc32q\t" LONGx1 "(%3), %1\n\t"
199 : "crc32q\t" LONGx2 "(%3), %2"
200 : : "+r"(crc0), "+r"(crc1), "+r"(crc2)
201 0 : : "r"(next), "m"(*next));
202 0 : next += 8;
203 0 : } while (next < end);
204 0 : crc0 = crc32c_shift(crc32c_long, static_cast<uint32_t>(crc0)) ^ crc1;
205 0 : crc0 = crc32c_shift(crc32c_long, static_cast<uint32_t>(crc0)) ^ crc2;
206 0 : next += LONG*2;
207 0 : len -= LONG*3;
208 : }
209 :
210 : /* do the same thing, but now on SHORT*3 blocks for the remaining data less
211 : than a LONG*3 block */
212 168 : while (len >= SHORT*3) {
213 0 : uint64_t crc1 = 0;
214 0 : uint64_t crc2 = 0;
215 0 : unsigned char const * const end = next + SHORT;
216 0 : do {
217 : __asm__("crc32q\t" "(%3), %0\n\t"
218 : "crc32q\t" SHORTx1 "(%3), %1\n\t"
219 : "crc32q\t" SHORTx2 "(%3), %2"
220 : : "+r"(crc0), "+r"(crc1), "+r"(crc2)
221 0 : : "r"(next), "m"(*next));
222 0 : next += 8;
223 0 : } while (next < end);
224 0 : crc0 = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc0)) ^ crc1;
225 0 : crc0 = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc0)) ^ crc2;
226 0 : next += SHORT*2;
227 0 : len -= SHORT*3;
228 : }
229 :
230 : /* compute the crc on the remaining eight-byte units less than a SHORT*3
231 : block */
232 : {
233 168 : unsigned char const * const end = next + (len - (len & 7));
234 534 : while (next < end) {
235 : __asm__("crc32q\t" "(%1), %0"
236 : : "+r"(crc0)
237 366 : : "r"(next), "m"(*next));
238 366 : next += 8;
239 : }
240 168 : len &= 7;
241 : }
242 :
243 : /* compute the crc for up to seven trailing bytes */
244 168 : while (len) {
245 : __asm__("crc32b\t" "(%1), %0"
246 : : "+r"(crc0)
247 0 : : "r"(next), "m"(*next));
248 0 : next++;
249 0 : len--;
250 : }
251 :
252 : /* return a post-processed crc */
253 168 : return ~static_cast<uint32_t>(crc0);
254 : }
255 :
256 : /* Check for SSE 4.2. SSE 4.2 was first supported in Nehalem processors
257 : introduced in November, 2008. This does not check for the existence of the
258 : cpuid instruction itself, which was introduced on the 486SL in 1992, so this
259 : will fail on earlier x86 processors. cpuid works on all Pentium and later
260 : processors. */
261 : #define SSE42(have) \
262 : do { \
263 : uint32_t eax, ecx; \
264 : eax = 1; \
265 : __asm__("cpuid" \
266 : : "=c"(ecx) \
267 : : "a"(eax) \
268 : : "%ebx", "%edx"); \
269 : (have) = (ecx >> 20) & 1; \
270 : } while (0)
271 :
272 : /* Compute a CRC-32C. If the crc32 instruction is available, use the hardware
273 : version. Otherwise, use the software version. */
274 168 : void crc32c_init(void) {
275 :
276 1 : static const bool bInit = [](bool ret){
277 : int sse42;
278 :
279 1 : SSE42(sse42);
280 1 : if (sse42) {
281 1 : crc32c = crc32c_hw;
282 : } else {
283 0 : crc32c = crc32c_sw;
284 : }
285 1 : return ret;
286 168 : }(true);
287 : (void)bInit;
288 168 : }
289 :
290 : #elif defined(__aarch64__) && (defined(__linux__) || defined(__APPLE__))
291 : #if defined(__linux__) && defined(HAVE_SYS_AUX_H)
292 : #include <sys/auxv.h>
293 : #elif defined(__APPLE__)
294 : #include <sys/sysctl.h>
295 : #endif
296 :
297 : #if defined(HWCAP_CRC32)
298 : static inline uint32_t crc32cx(uint32_t crc, const uint64_t data)
299 : {
300 : asm(".arch_extension crc\n"
301 : "crc32cx %w0, %w0, %x1" : "+r" (crc) : "r" (data));
302 : return crc;
303 : }
304 :
305 : static inline uint32_t crc32cb(uint32_t crc, const uint8_t data)
306 : {
307 : asm(".arch_extension crc\n"
308 : "crc32cb %w0, %w0, %w1" : "+r" (crc) : "r" (data));
309 : return crc;
310 : }
311 :
312 : static uint32_t crc32c_hw(uint32_t crc, void const *buf, size_t len) {
313 : crc = ~crc;
314 : unsigned char const *next = buf;
315 :
316 : while (((uintptr_t)next & 7) && len > 0) {
317 : crc = crc32cb(crc, *(uint8_t *)next);
318 : next++;
319 : len--;
320 : }
321 :
322 : while (len >= 64) {
323 : uint64_t *next8 = (uint64_t *)next;
324 : crc = crc32cx(crc, next8[0]);
325 : crc = crc32cx(crc, next8[1]);
326 : crc = crc32cx(crc, next8[2]);
327 : crc = crc32cx(crc, next8[3]);
328 : crc = crc32cx(crc, next8[4]);
329 : crc = crc32cx(crc, next8[5]);
330 : crc = crc32cx(crc, next8[6]);
331 : crc = crc32cx(crc, next8[7]);
332 : next += 64;
333 : len -= 64;
334 : }
335 :
336 : while (len >= 8) {
337 : crc = crc32cx(crc, *(uint64_t *)next);
338 : next += 8;
339 : len -= 8;
340 : }
341 :
342 : while (len > 0) {
343 : crc = crc32cb(crc, *(uint8_t *)next);
344 : next++;
345 : len--;
346 : }
347 :
348 : return ~crc;
349 : }
350 :
351 : void crc32c_init(void) {
352 : #if defined(__linux__)
353 : uint64_t auxv = getauxval(AT_HWCAP);
354 :
355 : crc32c = crc32c_sw;
356 : if (auxv & HWCAP_CRC32)
357 : crc32c = crc32c_hw;
358 : #elif defined(__APPLE__)
359 : int armv8_crc32;
360 : size_t size = sizeof(armv8_crc32);
361 :
362 : if (sysctlbyname("hw.optional.armv8_crc32", &armv8_crc32, &size, NULL, 0) == 0 &&
363 : armv8_crc32 == 1)
364 : crc32c = crc32c_hw;
365 : #endif
366 : }
367 : #else /* no hw crc32 on arm64 system supported? old compiler/libc/kernel? */
368 : void crc32c_init(void) {
369 : crc32c = crc32c_sw;
370 : }
371 : #endif
372 : #else /* !__x86_64__i && !__aarch64__ */
373 : void crc32c_init(void) {
374 : crc32c = crc32c_sw;
375 : }
376 :
377 : #endif
378 :
379 : #if CPL_IS_LSB
380 :
381 : /* Construct table for software CRC-32C little-endian calculation. */
382 :
383 : static uint32_t crc32c_table_little[8][256];
384 0 : static void crc32c_init_sw_little(void) {
385 0 : for (unsigned n = 0; n < 256; n++) {
386 0 : uint32_t crc = n;
387 0 : crc = (crc & 1) ? (crc >> 1) ^ POLY : crc >> 1;
388 0 : crc = (crc & 1) ? (crc >> 1) ^ POLY : crc >> 1;
389 0 : crc = (crc & 1) ? (crc >> 1) ^ POLY : crc >> 1;
390 0 : crc = (crc & 1) ? (crc >> 1) ^ POLY : crc >> 1;
391 0 : crc = (crc & 1) ? (crc >> 1) ^ POLY : crc >> 1;
392 0 : crc = (crc & 1) ? (crc >> 1) ^ POLY : crc >> 1;
393 0 : crc = (crc & 1) ? (crc >> 1) ^ POLY : crc >> 1;
394 0 : crc = (crc & 1) ? (crc >> 1) ^ POLY : crc >> 1;
395 0 : crc32c_table_little[0][n] = crc;
396 : }
397 0 : for (unsigned n = 0; n < 256; n++) {
398 0 : uint32_t crc = crc32c_table_little[0][n];
399 0 : for (unsigned k = 1; k < 8; k++) {
400 0 : crc = crc32c_table_little[0][crc & 0xff] ^ (crc >> 8);
401 0 : crc32c_table_little[k][n] = crc;
402 : }
403 : }
404 0 : }
405 :
406 : /* Compute a CRC-32C in software assuming a little-endian architecture,
407 : constructing the required table if that hasn't already been done. */
408 0 : uint32_t crc32c_sw_little(uint32_t crc, void const *buf, size_t len) {
409 0 : unsigned char const *next = static_cast<unsigned char const*>(buf);
410 :
411 0 : static bool bInit = [](bool ret)
412 : {
413 0 : crc32c_init_sw_little();
414 0 : return ret;
415 0 : }(true);
416 : (void)bInit;
417 :
418 0 : crc = ~crc;
419 0 : while (len && (reinterpret_cast<uintptr_t>(next) & 7) != 0) {
420 0 : crc = crc32c_table_little[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
421 0 : len--;
422 : }
423 0 : if (len >= 8) {
424 0 : uint64_t crcw = crc;
425 0 : do {
426 0 : crcw ^= *reinterpret_cast<uint64_t const *>(next);
427 0 : crcw = crc32c_table_little[7][crcw & 0xff] ^
428 0 : crc32c_table_little[6][(crcw >> 8) & 0xff] ^
429 0 : crc32c_table_little[5][(crcw >> 16) & 0xff] ^
430 0 : crc32c_table_little[4][(crcw >> 24) & 0xff] ^
431 0 : crc32c_table_little[3][(crcw >> 32) & 0xff] ^
432 0 : crc32c_table_little[2][(crcw >> 40) & 0xff] ^
433 0 : crc32c_table_little[1][(crcw >> 48) & 0xff] ^
434 0 : crc32c_table_little[0][crcw >> 56];
435 0 : next += 8;
436 0 : len -= 8;
437 0 : } while (len >= 8);
438 0 : crc = static_cast<uint32_t>(crcw);
439 : }
440 0 : while (len) {
441 0 : crc = crc32c_table_little[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
442 0 : len--;
443 : }
444 0 : return ~crc;
445 : }
446 :
447 : #else
448 :
449 : /* Swap the bytes in a uint64_t. (Only for big-endian.) */
450 : #if defined(__has_builtin) || (defined(__GNUC__) && \
451 : (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)))
452 : # define swap __builtin_bswap64
453 : #else
454 : static inline uint64_t swap(uint64_t x) {
455 : x = ((x << 8) & 0xff00ff00ff00ff00) | ((x >> 8) & 0xff00ff00ff00ff);
456 : x = ((x << 16) & 0xffff0000ffff0000) | ((x >> 16) & 0xffff0000ffff);
457 : return (x << 32) | (x >> 32);
458 : }
459 : #endif
460 :
461 : /* Construct tables for software CRC-32C big-endian calculation. */
462 :
463 : static uint32_t crc32c_table_big_byte[256];
464 : static uint64_t crc32c_table_big[8][256];
465 : static void crc32c_init_sw_big(void) {
466 : for (unsigned n = 0; n < 256; n++) {
467 : uint32_t crc = n;
468 : crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
469 : crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
470 : crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
471 : crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
472 : crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
473 : crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
474 : crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
475 : crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
476 : crc32c_table_big_byte[n] = crc;
477 : }
478 : for (unsigned n = 0; n < 256; n++) {
479 : uint32_t crc = crc32c_table_big_byte[n];
480 : crc32c_table_big[0][n] = swap(crc);
481 : for (unsigned k = 1; k < 8; k++) {
482 : crc = crc32c_table_big_byte[crc & 0xff] ^ (crc >> 8);
483 : crc32c_table_big[k][n] = swap(crc);
484 : }
485 : }
486 : }
487 :
488 : /* Compute a CRC-32C in software assuming a big-endian architecture,
489 : constructing the required tables if that hasn't already been done. */
490 : uint32_t crc32c_sw_big(uint32_t crc, void const *buf, size_t len) {
491 : unsigned char const *next = reinterpret_cast<unsigned char const*>(buf);
492 :
493 : static bool bInit = []()
494 : {
495 : crc32c_init_sw_big();
496 : return true;
497 : }();
498 : (void)bInit;
499 :
500 : crc = ~crc;
501 : while (len && (reinterpret_cast<uintptr_t>(next) & 7) != 0) {
502 : crc = crc32c_table_big_byte[(crc ^ *next++) & 0xff] ^ (crc >> 8);
503 : len--;
504 : }
505 : if (len >= 8) {
506 : uint64_t crcw = swap(crc);
507 : do {
508 : crcw ^= *reinterpret_cast<uint64_t const *>(next);
509 : crcw = crc32c_table_big[0][crcw & 0xff] ^
510 : crc32c_table_big[1][(crcw >> 8) & 0xff] ^
511 : crc32c_table_big[2][(crcw >> 16) & 0xff] ^
512 : crc32c_table_big[3][(crcw >> 24) & 0xff] ^
513 : crc32c_table_big[4][(crcw >> 32) & 0xff] ^
514 : crc32c_table_big[5][(crcw >> 40) & 0xff] ^
515 : crc32c_table_big[6][(crcw >> 48) & 0xff] ^
516 : crc32c_table_big[7][(crcw >> 56)];
517 : next += 8;
518 : len -= 8;
519 : } while (len >= 8);
520 : crc = swap(crcw);
521 : }
522 : while (len) {
523 : crc = crc32c_table_big_byte[(crc ^ *next++) & 0xff] ^ (crc >> 8);
524 : len--;
525 : }
526 : return ~crc;
527 : }
528 :
529 : #endif
530 :
531 : /* Table-driven software CRC-32C. This is about 15 times slower than using the
532 : hardware instructions. Determine the endianness of the processor and proceed
533 : accordingly. Ideally the endianness will be determined at compile time, in
534 : which case the unused functions and tables for the other endianness will be
535 : removed by the optimizer. If not, then the proper routines and tables will
536 : be used, even if the endianness is changed mid-stream. (Yes, there are
537 : processors that permit that -- go figure.) */
538 0 : uint32_t crc32c_sw(uint32_t crc, void const *buf, size_t len) {
539 : #if CPL_IS_LSB
540 0 : return crc32c_sw_little(crc, buf, len);
541 : #else
542 : return crc32c_sw_big(crc, buf, len);
543 : #endif
544 : }
|