Line data Source code
1 : // libdivide.h - Optimized integer division
2 : // https://libdivide.com
3 : //
4 : // Copyright (C) 2010 - 2022 ridiculous_fish, <libdivide@ridiculousfish.com>
5 : // Copyright (C) 2016 - 2022 Kim Walisch, <kim.walisch@gmail.com>
6 : //
7 : // libdivide is dual-licensed under the Boost or zlib licenses.
8 : // You may use libdivide under the terms of either of these.
9 : // See LICENSE.txt for more details.
10 :
11 : #ifndef LIBDIVIDE_H
12 : #define LIBDIVIDE_H
13 :
14 : // *** Version numbers are auto generated - do not edit ***
15 : #define LIBDIVIDE_VERSION "5.2.0"
16 : #define LIBDIVIDE_VERSION_MAJOR 5
17 : #define LIBDIVIDE_VERSION_MINOR 2
18 : #define LIBDIVIDE_VERSION_PATCH 0
19 :
20 : #include <stdint.h>
21 :
22 : #if !defined(__AVR__) && __STDC_HOSTED__ != 0
23 : #include <stdio.h>
24 : #include <stdlib.h>
25 : #endif
26 :
27 : #if defined(_MSC_VER) && (defined(__cplusplus) && (__cplusplus >= 202002L)) || \
28 : (defined(_MSVC_LANG) && (_MSVC_LANG >= 202002L))
29 : #include <limits.h>
30 : #include <type_traits>
31 : #define LIBDIVIDE_VC_CXX20
32 : #endif
33 :
34 : #if defined(LIBDIVIDE_SSE2)
35 : #include <emmintrin.h>
36 : #endif
37 :
38 : #if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512)
39 : #include <immintrin.h>
40 : #endif
41 :
42 : #if defined(LIBDIVIDE_NEON)
43 : #include <arm_neon.h>
44 : #endif
45 :
46 : // Clang-cl prior to Visual Studio 2022 doesn't include __umulh/__mulh intrinsics
47 : #if defined(_MSC_VER) && (!defined(__clang__) || _MSC_VER > 1930) && \
48 : (defined(_M_X64) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC))
49 : #define LIBDIVIDE_MULH_INTRINSICS
50 : #endif
51 :
52 : #if defined(_MSC_VER)
53 : #if defined(LIBDIVIDE_MULH_INTRINSICS) || !defined(__clang__)
54 : #include <intrin.h>
55 : #endif
56 : #ifndef __clang__
57 : #pragma warning(push)
58 : // 4146: unary minus operator applied to unsigned type, result still unsigned
59 : #pragma warning(disable : 4146)
60 :
61 : // 4204: nonstandard extension used : non-constant aggregate initializer
62 : #pragma warning(disable : 4204)
63 : #endif
64 : #define LIBDIVIDE_VC
65 : #endif
66 :
67 : #if !defined(__has_builtin)
68 : #define __has_builtin(x) 0
69 : #endif
70 :
71 : #if defined(__SIZEOF_INT128__)
72 : #define HAS_INT128_T
73 : // clang-cl on Windows does not yet support 128-bit division
74 : #if !(defined(__clang__) && defined(LIBDIVIDE_VC))
75 : #define HAS_INT128_DIV
76 : #endif
77 : #endif
78 :
79 : #if defined(__x86_64__) || defined(_M_X64)
80 : #define LIBDIVIDE_X86_64
81 : #endif
82 :
83 : #if defined(__i386__)
84 : #define LIBDIVIDE_i386
85 : #endif
86 :
87 : #if defined(__GNUC__) || defined(__clang__)
88 : #define LIBDIVIDE_GCC_STYLE_ASM
89 : #endif
90 :
91 : #if defined(__cplusplus) || defined(LIBDIVIDE_VC)
92 : #define LIBDIVIDE_FUNCTION __FUNCTION__
93 : #else
94 : #define LIBDIVIDE_FUNCTION __func__
95 : #endif
96 :
97 : // Set up forced inlining if possible.
98 : // We need both the attribute and keyword to avoid "might not be inlineable" warnings.
99 : #ifdef __has_attribute
100 : #if __has_attribute(always_inline)
101 : #define LIBDIVIDE_INLINE __attribute__((always_inline)) inline
102 : #endif
103 : #endif
104 : #ifndef LIBDIVIDE_INLINE
105 : #ifdef _MSC_VER
106 : #define LIBDIVIDE_INLINE __forceinline
107 : #else
108 : #define LIBDIVIDE_INLINE inline
109 : #endif
110 : #endif
111 :
112 : #if defined(__AVR__) || __STDC_HOSTED__ == 0
113 : #define LIBDIVIDE_ERROR(msg)
114 : #else
115 : #define LIBDIVIDE_ERROR(msg) \
116 : do { \
117 : fprintf(stderr, "libdivide.h:%d: %s(): Error: %s\n", __LINE__, LIBDIVIDE_FUNCTION, msg); \
118 : abort(); \
119 : } while (0)
120 : #endif
121 :
122 : #if defined(LIBDIVIDE_ASSERTIONS_ON) && !defined(__AVR__) && __STDC_HOSTED__ != 0
123 : #define LIBDIVIDE_ASSERT(x) \
124 : do { \
125 : if (!(x)) { \
126 : fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", __LINE__, \
127 : LIBDIVIDE_FUNCTION, #x); \
128 : abort(); \
129 : } \
130 : } while (0)
131 : #else
132 : #define LIBDIVIDE_ASSERT(x)
133 : #endif
134 :
135 : #ifdef __cplusplus
136 :
137 : // For constexpr zero initialization, c++11 might handle things ok,
138 : // but just limit to at least c++14 to ensure we don't break anyone's code:
139 :
140 : // Use https://en.cppreference.com/w/cpp/feature_test#cpp_constexpr
141 : #if defined(__cpp_constexpr) && (__cpp_constexpr >= 201304L)
142 : #define LIBDIVIDE_CONSTEXPR constexpr LIBDIVIDE_INLINE
143 :
144 : // Supposedly, MSVC might not implement feature test macros right:
145 : // https://stackoverflow.com/questions/49316752/feature-test-macros-not-working-properly-in-visual-c
146 : // so check that _MSVC_LANG corresponds to at least c++14, and _MSC_VER corresponds to at least VS
147 : // 2017 15.0 (for extended constexpr support:
148 : // https://learn.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=msvc-170)
149 : #elif (defined(_MSC_VER) && _MSC_VER >= 1910) && (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L)
150 : #define LIBDIVIDE_CONSTEXPR constexpr LIBDIVIDE_INLINE
151 :
152 : #else
153 : #define LIBDIVIDE_CONSTEXPR LIBDIVIDE_INLINE
154 : #endif
155 :
156 : namespace libdivide {
157 : #endif
158 :
159 : #if defined(_MSC_VER) && !defined(__clang__)
#if defined(LIBDIVIDE_VC_CXX20)
// MSVC shim emulating GCC's __builtin_clz (count leading zeros, 32-bit).
// With C++20 the shim is constexpr: when constant-evaluated, intrinsics are
// unusable, so it scans from the most significant bit down instead.
// NOTE(review): on the intrinsic paths the result for x == 0 follows the
// underlying intrinsic (_BitScanReverse requires a nonzero input), matching
// the GCC builtin's undefined-at-zero contract.
static LIBDIVIDE_CONSTEXPR int __builtin_clz(unsigned x) {
    if (std::is_constant_evaluated()) {
        // Return the index (counted from the MSB) of the first set bit.
        for (int i = 0; i < sizeof(x) * CHAR_BIT; ++i) {
            if (x >> (sizeof(x) * CHAR_BIT - 1 - i)) return i;
        }
        return sizeof(x) * CHAR_BIT;
    }
#else
// Pre-C++20: runtime-only shim for GCC's __builtin_clz.
static LIBDIVIDE_INLINE int __builtin_clz(unsigned x) {
#endif
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
    return (int)_CountLeadingZeros(x);
#elif defined(__AVX2__) || defined(__LZCNT__)
    return (int)_lzcnt_u32(x);
#else
    // BSR yields the bit index of the highest set bit; XOR with 31
    // converts that index into a leading-zero count.
    unsigned long r;
    _BitScanReverse(&r, x);
    return (int)(r ^ 31);
#endif
}
181 :
#if defined(LIBDIVIDE_VC_CXX20)
// MSVC shim emulating GCC's __builtin_clzll (count leading zeros, 64-bit).
// Constexpr under C++20; see the 32-bit shim above for the same structure.
static LIBDIVIDE_CONSTEXPR int __builtin_clzll(unsigned long long x) {
    if (std::is_constant_evaluated()) {
        // Return the index (counted from the MSB) of the first set bit.
        for (int i = 0; i < sizeof(x) * CHAR_BIT; ++i) {
            if (x >> (sizeof(x) * CHAR_BIT - 1 - i)) return i;
        }
        return sizeof(x) * CHAR_BIT;
    }
#else
// Pre-C++20: runtime-only shim for GCC's __builtin_clzll.
static LIBDIVIDE_INLINE int __builtin_clzll(unsigned long long x) {
#endif
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
    return (int)_CountLeadingZeros64(x);
#elif defined(_WIN64)
#if defined(__AVX2__) || defined(__LZCNT__)
    return (int)_lzcnt_u64(x);
#else
    // BSR64 yields the index of the highest set bit; XOR with 63
    // converts it into a leading-zero count.
    unsigned long r;
    _BitScanReverse64(&r, x);
    return (int)(r ^ 63);
#endif
#else
    // 32-bit Windows: combine two 32-bit counts. If the high half is
    // nonzero its count is the answer; otherwise add 32 to the low count.
    int l = __builtin_clz((unsigned)x) + 32;
    int h = __builtin_clz((unsigned)(x >> 32));
    return !!((unsigned)(x >> 32)) ? h : l;
#endif
}
209 : #endif // defined(_MSC_VER) && !defined(__clang__)
210 :
211 : // pack divider structs to prevent compilers from padding.
212 : // This reduces memory usage by up to 43% when using a large
213 : // array of libdivide dividers and improves performance
214 : // by up to 10% because of reduced memory bandwidth.
215 : #pragma pack(push, 1)
216 :
// Unsigned 16-bit divider (branching strategy): magic multiplier plus the
// packed shift/flag byte 'more' (layout documented below these structs).
struct libdivide_u16_t {
    uint16_t magic;
    uint8_t more;
};
221 :
// Signed 16-bit divider (branching strategy).
struct libdivide_s16_t {
    int16_t magic;
    uint8_t more;
};
226 :
// Unsigned 32-bit divider (branching strategy).
struct libdivide_u32_t {
    uint32_t magic;
    uint8_t more;
};
231 :
// Signed 32-bit divider (branching strategy).
struct libdivide_s32_t {
    int32_t magic;
    uint8_t more;
};
236 :
// Unsigned 64-bit divider (branching strategy).
struct libdivide_u64_t {
    uint64_t magic;
    uint8_t more;
};
241 :
// Signed 64-bit divider (branching strategy).
struct libdivide_s64_t {
    int64_t magic;
    uint8_t more;
};
246 :
// Unsigned 16-bit divider, branchfree strategy: 'more' holds only the
// shift amount (no flag bits).
struct libdivide_u16_branchfree_t {
    uint16_t magic;
    uint8_t more;
};
251 :
// Signed 16-bit divider, branchfree strategy.
struct libdivide_s16_branchfree_t {
    int16_t magic;
    uint8_t more;
};
256 :
// Unsigned 32-bit divider, branchfree strategy.
struct libdivide_u32_branchfree_t {
    uint32_t magic;
    uint8_t more;
};
261 :
// Signed 32-bit divider, branchfree strategy.
struct libdivide_s32_branchfree_t {
    int32_t magic;
    uint8_t more;
};
266 :
// Unsigned 64-bit divider, branchfree strategy.
struct libdivide_u64_branchfree_t {
    uint64_t magic;
    uint8_t more;
};
271 :
// Signed 64-bit divider, branchfree strategy.
struct libdivide_s64_branchfree_t {
    int64_t magic;
    uint8_t more;
};
276 :
277 : #pragma pack(pop)
278 :
279 : // Explanation of the "more" field:
280 : //
281 : // * Bits 0-5 is the shift value (for shift path or mult path).
282 : // * Bit 6 is the add indicator for mult path.
283 : // * Bit 7 is set if the divisor is negative. We use bit 7 as the negative
284 : // divisor indicator so that we can efficiently use sign extension to
285 : // create a bitmask with all bits set to 1 (if the divisor is negative)
286 : // or 0 (if the divisor is positive).
287 : //
288 : // u32: [0-4] shift value
289 : // [5] ignored
290 : // [6] add indicator
291 : // magic number of 0 indicates shift path
292 : //
293 : // s32: [0-4] shift value
294 : // [5] ignored
295 : // [6] add indicator
296 : // [7] indicates negative divisor
297 : // magic number of 0 indicates shift path
298 : //
299 : // u64: [0-5] shift value
300 : // [6] add indicator
301 : // magic number of 0 indicates shift path
302 : //
303 : // s64: [0-5] shift value
304 : // [6] add indicator
305 : // [7] indicates negative divisor
306 : // magic number of 0 indicates shift path
307 : //
// In s32 and s64 branchfull modes, the magic number is negated according to
// whether the divisor is negated. In branchfree strategy, it is not negated.
310 :
// Masks/flags for decoding the packed "more" byte (layout described above).
enum {
    // Low bits of "more" hold the shift amount for the given word size.
    LIBDIVIDE_16_SHIFT_MASK = 0x1F,
    LIBDIVIDE_32_SHIFT_MASK = 0x1F,
    LIBDIVIDE_64_SHIFT_MASK = 0x3F,
    // Bit 6: the add fixup is required in the multiply path.
    LIBDIVIDE_ADD_MARKER = 0x40,
    // Bit 7: divisor is negative (signed types only).
    LIBDIVIDE_NEGATIVE_DIVISOR = 0x80
};
318 :
// Divider generators: compute the magic multiplier and packed "more" byte
// for a runtime divisor d (branching strategy).
static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d);
static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d);
static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d);
static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d);
static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d);
static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d);

// Divider generators for the branchfree strategy.
static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d);
static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d);
static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d);
static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d);
static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d);
static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d);

// Division: numer / d for the divisor encoded in *denom. The *_do_raw
// variants take the unpacked (magic, more) pair so they can be used with
// compile-time constants without taking the address of a temporary.
static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(
    int16_t numer, int16_t magic, uint8_t more);
static LIBDIVIDE_INLINE int16_t libdivide_s16_do(
    int16_t numer, const struct libdivide_s16_t *denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(
    uint16_t numer, uint16_t magic, uint8_t more);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(
    uint16_t numer, const struct libdivide_u16_t *denom);
static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(
    int32_t numer, int32_t magic, uint8_t more);
static LIBDIVIDE_INLINE int32_t libdivide_s32_do(
    int32_t numer, const struct libdivide_s32_t *denom);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(
    uint32_t numer, uint32_t magic, uint8_t more);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(
    uint32_t numer, const struct libdivide_u32_t *denom);
static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(
    int64_t numer, int64_t magic, uint8_t more);
static LIBDIVIDE_INLINE int64_t libdivide_s64_do(
    int64_t numer, const struct libdivide_s64_t *denom);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(
    uint64_t numer, uint64_t magic, uint8_t more);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(
    uint64_t numer, const struct libdivide_u64_t *denom);

// Division for dividers built with the branchfree generators.
static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(
    int16_t numer, const struct libdivide_s16_branchfree_t *denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(
    uint16_t numer, const struct libdivide_u16_branchfree_t *denom);
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(
    int32_t numer, const struct libdivide_s32_branchfree_t *denom);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do(
    uint32_t numer, const struct libdivide_u32_branchfree_t *denom);
static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(
    int64_t numer, const struct libdivide_s64_branchfree_t *denom);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do(
    uint64_t numer, const struct libdivide_u64_branchfree_t *denom);

// Recovery: reconstruct the original divisor d from a divider struct.
static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom);
static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom);
static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom);

// Recovery for branchfree dividers.
static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(
    const struct libdivide_s16_branchfree_t *denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(
    const struct libdivide_u16_branchfree_t *denom);
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(
    const struct libdivide_s32_branchfree_t *denom);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(
    const struct libdivide_u32_branchfree_t *denom);
static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover(
    const struct libdivide_s64_branchfree_t *denom);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover(
    const struct libdivide_u64_branchfree_t *denom);
390 :
391 : //////// Internal Utility Functions
392 :
393 : static LIBDIVIDE_INLINE uint16_t libdivide_mullhi_u16(uint16_t x, uint16_t y) {
394 : uint32_t xl = x, yl = y;
395 : uint32_t rl = xl * yl;
396 : return (uint16_t)(rl >> 16);
397 : }
398 :
399 : static LIBDIVIDE_INLINE int16_t libdivide_mullhi_s16(int16_t x, int16_t y) {
400 : int32_t xl = x, yl = y;
401 : int32_t rl = xl * yl;
402 : // needs to be arithmetic shift
403 : return (int16_t)(rl >> 16);
404 : }
405 :
406 : static LIBDIVIDE_INLINE uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) {
407 2785824 : uint64_t xl = x, yl = y;
408 2785824 : uint64_t rl = xl * yl;
409 2785824 : return (uint32_t)(rl >> 32);
410 : }
411 :
412 : static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) {
413 : int64_t xl = x, yl = y;
414 : int64_t rl = xl * yl;
415 : // needs to be arithmetic shift
416 : return (int32_t)(rl >> 32);
417 : }
418 :
// Returns the upper 64 bits of the 64x64 -> 128 bit unsigned product x * y.
// Uses the MSVC __umulh intrinsic or __uint128_t where available, otherwise
// falls back to schoolbook multiplication on 32-bit digits.
static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
#if defined(LIBDIVIDE_MULH_INTRINSICS)
    return __umulh(x, y);
#elif defined(HAS_INT128_T)
    __uint128_t xl = x, yl = y;
    __uint128_t rl = xl * yl;
    return (uint64_t)(rl >> 64);
#else
    // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
    uint32_t mask = 0xFFFFFFFF;
    uint32_t x0 = (uint32_t)(x & mask);
    uint32_t x1 = (uint32_t)(x >> 32);
    uint32_t y0 = (uint32_t)(y & mask);
    uint32_t y1 = (uint32_t)(y >> 32);
    uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0);
    uint64_t x0y1 = x0 * (uint64_t)y1;
    uint64_t x1y0 = x1 * (uint64_t)y0;
    uint64_t x1y1 = x1 * (uint64_t)y1;
    // Sum the middle partial products, carrying into the high word.
    uint64_t temp = x1y0 + x0y0_hi;
    uint64_t temp_lo = temp & mask;
    uint64_t temp_hi = temp >> 32;

    return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32);
#endif
}
444 :
// Returns the upper 64 bits of the 64x64 -> 128 bit signed product x * y.
// Uses the MSVC __mulh intrinsic or __int128_t where available, otherwise
// falls back to 32-bit digit multiplication with signed high digits so
// sign extension happens naturally.
static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) {
#if defined(LIBDIVIDE_MULH_INTRINSICS)
    return __mulh(x, y);
#elif defined(HAS_INT128_T)
    __int128_t xl = x, yl = y;
    __int128_t rl = xl * yl;
    return (int64_t)(rl >> 64);
#else
    // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
    uint32_t mask = 0xFFFFFFFF;
    uint32_t x0 = (uint32_t)(x & mask);
    uint32_t y0 = (uint32_t)(y & mask);
    // High digits stay signed to carry the sign of x and y.
    int32_t x1 = (int32_t)(x >> 32);
    int32_t y1 = (int32_t)(y >> 32);
    uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0);
    int64_t t = x1 * (int64_t)y0 + x0y0_hi;
    int64_t w1 = x0 * (int64_t)y1 + (t & mask);

    return x1 * (int64_t)y1 + (t >> 32) + (w1 >> 32);
#endif
}
466 :
// Counts the leading zero bits of the 16-bit value val. The portable
// fallback returns 16 for val == 0; the builtin paths inherit the
// builtin's contract (GCC's __builtin_clz is undefined at 0).
static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) {
#if defined(__AVR__)
    // Fast way to count leading zeros
    // On the AVR 8-bit architecture __builtin_clz() works on a int16_t.
    return __builtin_clz(val);
#elif defined(__GNUC__) || __has_builtin(__builtin_clz) || defined(_MSC_VER)
    // Fast way to count leading zeros
    // The builtin counts over 32 bits; subtract the 16 extra high bits.
    return (int16_t)(__builtin_clz(val) - 16);
#else
    // Portable fallback: narrow down a nibble at a time, then bit by bit.
    if (val == 0) return 16;
    int16_t result = 4;
    uint16_t hi = 0xFU << 12;
    while ((val & hi) == 0) {
        hi >>= 4;
        result += 4;
    }
    while (val & hi) {
        result -= 1;
        hi <<= 1;
    }
    return result;
#endif
}
490 :
// Counts the leading zero bits of the 32-bit value val. The portable
// fallback returns 32 for val == 0; the builtin paths inherit the
// builtin's contract (GCC's __builtin_clz is undefined at 0).
static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) {
#if defined(__AVR__)
    // Fast way to count leading zeros
    // NOTE(review): AVR uses __builtin_clzl here, presumably because
    // 'long' is 32 bits on that target — confirm against avr-gcc docs.
    return __builtin_clzl(val);
#elif defined(__GNUC__) || __has_builtin(__builtin_clz) || defined(_MSC_VER)
    // Fast way to count leading zeros
    return __builtin_clz(val);
#else
    // Portable fallback: narrow down a byte at a time, then bit by bit.
    if (val == 0) return 32;
    int32_t result = 8;
    uint32_t hi = 0xFFU << 24;
    while ((val & hi) == 0) {
        hi >>= 8;
        result += 8;
    }
    while (val & hi) {
        result -= 1;
        hi <<= 1;
    }
    return result;
#endif
}
513 :
// Counts the leading zero bits of the 64-bit value val.
static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) {
#if defined(__GNUC__) || __has_builtin(__builtin_clzll) || defined(_MSC_VER)
    // Fast way to count leading zeros
    return __builtin_clzll(val);
#else
    // Portable fallback: count in the high 32-bit half if it is nonzero;
    // otherwise count in the low half and add 32.
    uint32_t hi = val >> 32;
    uint32_t lo = val & 0xFFFFFFFF;
    if (hi != 0) return libdivide_count_leading_zeros32(hi);
    return 32 + libdivide_count_leading_zeros32(lo);
#endif
}
525 :
526 : // libdivide_32_div_16_to_16: divides a 32-bit uint {u1, u0} by a 16-bit
527 : // uint {v}. The result must fit in 16 bits.
528 : // Returns the quotient directly and the remainder in *r
529 : static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16(
530 : uint16_t u1, uint16_t u0, uint16_t v, uint16_t *r) {
531 3 : uint32_t n = ((uint32_t)u1 << 16) | u0;
532 3 : uint16_t result = (uint16_t)(n / v);
533 3 : *r = (uint16_t)(n - result * (uint32_t)v);
534 3 : return result;
535 : }
536 :
// libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit
// uint {v}. The result must fit in 32 bits.
// Returns the quotient directly and the remainder in *r
static LIBDIVIDE_INLINE uint32_t libdivide_64_div_32_to_32(
    uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
#if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && defined(LIBDIVIDE_GCC_STYLE_ASM)
    // x86 divl performs the full 64/32 division in one instruction:
    // quotient lands in eax, remainder in edx. NOTE(review): like the
    // instruction itself, this path has no overflow/zero check — the CPU
    // raises #DE if the quotient does not fit in 32 bits.
    uint32_t result;
    __asm__("divl %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1));
    return result;
#else
    // Portable path: widen to 64 bits and use plain C division.
    uint64_t n = ((uint64_t)u1 << 32) | u0;
    uint32_t result = (uint32_t)(n / v);
    *r = (uint32_t)(n - result * (uint64_t)v);
    return result;
#endif
}
553 :
// libdivide_128_div_64_to_64: divides a 128-bit uint {numhi, numlo} by a 64-bit uint {den}. The
// result must fit in 64 bits. Returns the quotient directly and the remainder in *r
// On the portable path, an overflowing quotient (or den == 0) returns the
// all-ones sentinel ~0ull in both quotient and remainder.
static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(
    uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
    // N.B. resist the temptation to use __uint128_t here.
    // In LLVM compiler-rt, it performs a 128/128 -> 128 division which is many times slower than
    // necessary. In gcc it's better but still slower than the divlu implementation, perhaps because
    // it's not LIBDIVIDE_INLINEd.
#if defined(LIBDIVIDE_X86_64) && defined(LIBDIVIDE_GCC_STYLE_ASM)
    // x86-64 div performs the full 128/64 division in one instruction:
    // quotient in rax, remainder in rdx.
    uint64_t result;
    __asm__("div %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi));
    return result;
#else
    // Portable path: classic two-digit long division (cf. divlu).
    // We work in base 2**32.
    // A uint32 holds a single digit. A uint64 holds two digits.
    // Our numerator is conceptually [num3, num2, num1, num0].
    // Our denominator is [den1, den0].
    const uint64_t b = ((uint64_t)1 << 32);

    // The high and low digits of our computed quotient.
    uint32_t q1;
    uint32_t q0;

    // The normalization shift factor.
    int shift;

    // The high and low digits of our denominator (after normalizing).
    // Also the low 2 digits of our numerator (after normalizing).
    uint32_t den1;
    uint32_t den0;
    uint32_t num1;
    uint32_t num0;

    // A partial remainder.
    uint64_t rem;

    // The estimated quotient, and its corresponding remainder (unrelated to true remainder).
    uint64_t qhat;
    uint64_t rhat;

    // Variables used to correct the estimated quotient.
    uint64_t c1;
    uint64_t c2;

    // Check for overflow and divide by 0.
    if (numhi >= den) {
        if (r) *r = ~0ull;
        return ~0ull;
    }

    // Determine the normalization factor. We multiply den by this, so that its leading digit is at
    // least half b. In binary this means just shifting left by the number of leading zeros, so that
    // there's a 1 in the MSB.
    // We also shift numer by the same amount. This cannot overflow because numhi < den.
    // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting
    // by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is
    // 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it.
    shift = libdivide_count_leading_zeros64(den);
    den <<= shift;
    numhi <<= shift;
    numhi |= (numlo >> (-shift & 63)) & (uint64_t)(-(int64_t)shift >> 63);
    numlo <<= shift;

    // Extract the low digits of the numerator and both digits of the denominator.
    num1 = (uint32_t)(numlo >> 32);
    num0 = (uint32_t)(numlo & 0xFFFFFFFFu);
    den1 = (uint32_t)(den >> 32);
    den0 = (uint32_t)(den & 0xFFFFFFFFu);

    // We wish to compute q1 = [n3 n2 n1] / [d1 d0].
    // Estimate q1 as [n3 n2] / [d1], and then correct it.
    // Note while qhat may be 2 digits, q1 is always 1 digit.
    // The estimate can be at most 2 too large; the correction below
    // subtracts 1 or 2 accordingly.
    qhat = numhi / den1;
    rhat = numhi % den1;
    c1 = qhat * den0;
    c2 = rhat * b + num1;
    if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
    q1 = (uint32_t)qhat;

    // Compute the true (partial) remainder.
    rem = numhi * b + num1 - q1 * den;

    // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
    // Estimate q0 as [rem1 rem0] / [d1] and correct it.
    qhat = rem / den1;
    rhat = rem % den1;
    c1 = qhat * den0;
    c2 = rhat * b + num0;
    if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
    q0 = (uint32_t)qhat;

    // Return remainder if requested.
    // Undo the normalization shift before handing the remainder back.
    if (r) *r = (rem * b + num0 - q0 * den) >> shift;
    return ((uint64_t)q1 << 32) | q0;
#endif
}
650 :
651 : #if !(defined(HAS_INT128_T) && \
652 : defined(HAS_INT128_DIV))
653 :
654 : // Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0)
655 : static LIBDIVIDE_INLINE void libdivide_u128_shift(
656 : uint64_t *u1, uint64_t *u0, int32_t signed_shift) {
657 : if (signed_shift > 0) {
658 : uint32_t shift = signed_shift;
659 : *u1 <<= shift;
660 : *u1 |= *u0 >> (64 - shift);
661 : *u0 <<= shift;
662 : } else if (signed_shift < 0) {
663 : uint32_t shift = -signed_shift;
664 : *u0 >>= shift;
665 : *u0 |= *u1 << (64 - shift);
666 : *u1 >>= shift;
667 : }
668 : }
669 :
670 : #endif
671 :
// Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder.
// {u_hi, u_lo} / {v_hi, v_lo}; the quotient is assumed to fit in 64 bits.
// The remainder is returned through {*r_hi, *r_lo}.
static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64(
    uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) {
#if defined(HAS_INT128_T) && defined(HAS_INT128_DIV)
    __uint128_t ufull = u_hi;
    __uint128_t vfull = v_hi;
    ufull = (ufull << 64) | u_lo;
    vfull = (vfull << 64) | v_lo;
    uint64_t res = (uint64_t)(ufull / vfull);
    __uint128_t remainder = ufull - (vfull * res);
    *r_lo = (uint64_t)remainder;
    *r_hi = (uint64_t)(remainder >> 64);
    return res;
#else
    // Adapted from "Unsigned Doubleword Division" in Hacker's Delight
    // We want to compute u / v
    typedef struct {
        uint64_t hi;
        uint64_t lo;
    } u128_t;
    u128_t u = {u_hi, u_lo};
    u128_t v = {v_hi, v_lo};

    if (v.hi == 0) {
        // divisor v is a 64 bit value, so we just need one 128/64 division
        // Note that we are simpler than Hacker's Delight here, because we know
        // the quotient fits in 64 bits whereas Hacker's Delight demands a full
        // 128 bit quotient
        *r_hi = 0;
        return libdivide_128_div_64_to_64(u.hi, u.lo, v.lo, r_lo);
    }
    // Here v >= 2**64
    // We know that v.hi != 0, so count leading zeros is OK
    // We have 0 <= n <= 63
    uint32_t n = libdivide_count_leading_zeros64(v.hi);

    // Normalize the divisor so its MSB is 1
    u128_t v1t = v;
    libdivide_u128_shift(&v1t.hi, &v1t.lo, n);
    uint64_t v1 = v1t.hi;  // i.e. v1 = v1t >> 64

    // To ensure no overflow
    u128_t u1 = u;
    libdivide_u128_shift(&u1.hi, &u1.lo, -1);

    // Get quotient from divide unsigned insn.
    uint64_t rem_ignored;
    uint64_t q1 = libdivide_128_div_64_to_64(u1.hi, u1.lo, v1, &rem_ignored);

    // Undo normalization and division of u by 2.
    u128_t q0 = {0, q1};
    libdivide_u128_shift(&q0.hi, &q0.lo, n);
    libdivide_u128_shift(&q0.hi, &q0.lo, -63);

    // Make q0 correct or too small by 1
    // Equivalent to `if (q0 != 0) q0 = q0 - 1;`
    if (q0.hi != 0 || q0.lo != 0) {
        q0.hi -= (q0.lo == 0);  // borrow
        q0.lo -= 1;
    }

    // Now q0 is correct.
    // Compute q0 * v as q0v
    // = (q0.hi << 64 + q0.lo) * (v.hi << 64 + v.lo)
    // = (q0.hi * v.hi << 128) + (q0.hi * v.lo << 64) +
    // (q0.lo * v.hi << 64) + q0.lo * v.lo)
    // Each term is 128 bit
    // High half of full product (upper 128 bits!) are dropped
    u128_t q0v = {0, 0};
    q0v.hi = q0.hi * v.lo + q0.lo * v.hi + libdivide_mullhi_u64(q0.lo, v.lo);
    q0v.lo = q0.lo * v.lo;

    // Compute u - q0v as u_q0v
    // This is the remainder
    u128_t u_q0v = u;
    u_q0v.hi -= q0v.hi + (u.lo < q0v.lo);  // second term is borrow
    u_q0v.lo -= q0v.lo;

    // Check if u_q0v >= v
    // This checks if our remainder is larger than the divisor
    if ((u_q0v.hi > v.hi) || (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) {
        // Increment q0
        q0.lo += 1;
        q0.hi += (q0.lo == 0);  // carry

        // Subtract v from remainder
        u_q0v.hi -= v.hi + (u_q0v.lo < v.lo);
        u_q0v.lo -= v.lo;
    }

    *r_hi = u_q0v.hi;
    *r_lo = u_q0v.lo;

    LIBDIVIDE_ASSERT(q0.hi == 0);
    return q0.lo;
#endif
}
769 :
770 : ////////// UINT16
771 :
// Shared u16 divider generator. 'branchfree' selects the branchfree
// variant, which changes the power-of-two shift encoding and always takes
// the general (add-marker) path. Calls LIBDIVIDE_ERROR (abort on hosted
// builds) when d == 0.
static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
    uint16_t d, int branchfree) {
    if (d == 0) {
        LIBDIVIDE_ERROR("divider must be != 0");
    }

    struct libdivide_u16_t result;
    uint8_t floor_log_2_d = (uint8_t)(15 - libdivide_count_leading_zeros16(d));

    // Power of 2
    if ((d & (d - 1)) == 0) {
        // We need to subtract 1 from the shift value in case of an unsigned
        // branchfree divider because there is a hardcoded right shift by 1
        // in its division algorithm. Because of this we also need to add back
        // 1 in its recovery algorithm.
        result.magic = 0;
        result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
    } else {
        uint8_t more;
        uint16_t rem, proposed_m;
        // proposed_m = floor(2^(16 + floor_log_2_d) / d), rem its remainder.
        proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem);

        LIBDIVIDE_ASSERT(rem > 0 && rem < d);
        const uint16_t e = d - rem;

        // This power works if e < 2**floor_log_2_d.
        if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) {
            // This power works
            more = floor_log_2_d;
        } else {
            // We have to use the general 17-bit algorithm. We need to compute
            // (2**power) / d. However, we already have (2**(power-1))/d and
            // its remainder. By doubling both, and then correcting the
            // remainder, we can compute the larger division.
            // don't care about overflow here - in fact, we expect it
            proposed_m += proposed_m;
            const uint16_t twice_rem = rem + rem;
            if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
        }
        result.magic = 1 + proposed_m;
        result.more = more;
        // result.more's shift should in general be ceil_log_2_d. But if we
        // used the smaller power, we subtract one from the shift because we're
        // using the smaller power. If we're using the larger power, we
        // subtract one from the shift because it's taken care of by the add
        // indicator. So floor_log_2_d happens to be correct in both cases.
    }
    return result;
}
822 :
823 : static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d) {
824 6 : return libdivide_internal_u16_gen(d, 0);
825 : }
826 :
827 : static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
828 : if (d == 1) {
829 : LIBDIVIDE_ERROR("branchfree divider must be != 1");
830 : }
831 : struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1);
832 : struct libdivide_u16_branchfree_t ret = {
833 : tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK)};
834 : return ret;
835 : }
836 :
837 : // The original libdivide_u16_do takes a const pointer. However, this cannot be used
838 : // with a compile time constant libdivide_u16_t: it will generate a warning about
839 : // taking the address of a temporary. Hence this overload.
840 : static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) {
841 : if (!magic) {
842 : return numer >> more;
843 : } else {
844 : uint16_t q = libdivide_mullhi_u16(numer, magic);
845 : if (more & LIBDIVIDE_ADD_MARKER) {
846 : uint16_t t = ((numer - q) >> 1) + q;
847 : return t >> (more & LIBDIVIDE_16_SHIFT_MASK);
848 : } else {
849 : // All upper bits are 0,
850 : // don't need to mask them off.
851 : return q >> more;
852 : }
853 : }
854 : }
855 :
856 : static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t *denom) {
857 : return libdivide_u16_do_raw(numer, denom->magic, denom->more);
858 : }
859 :
860 : static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(
861 : uint16_t numer, const struct libdivide_u16_branchfree_t *denom) {
862 : uint16_t q = libdivide_mullhi_u16(numer, denom->magic);
863 : uint16_t t = ((numer - q) >> 1) + q;
864 : return t >> denom->more;
865 : }
866 :
// Recovers the original divisor d from a (branchfull) u16 divider.
static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;

    if (!denom->magic) {
        // Power-of-2 divisor: encoded purely as a shift.
        return (uint16_t)1 << shift;
    } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
        // We compute q = n/d = n*m / 2^(16 + shift)
        // Therefore we have d = 2^(16 + shift) / m
        // We need to ceil it.
        // We know d is not a power of 2, so m is not a power of 2,
        // so we can just add 1 to the floor
        uint16_t hi_dividend = (uint16_t)1 << shift;
        uint16_t rem_ignored;
        return 1 + libdivide_32_div_16_to_16(hi_dividend, 0, denom->magic, &rem_ignored);
    } else {
        // Here we wish to compute d = 2^(16+shift+1)/(m+2^16).
        // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now
        // Also note that shift may be as high as 15, so shift + 1 will
        // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and
        // then double the quotient and remainder.
        uint32_t half_n = (uint32_t)1 << (16 + shift);
        uint32_t d = ((uint32_t)1 << 16) | denom->magic;
        // Note that the quotient is guaranteed <= 16 bits, but the remainder
        // may need 17!
        uint16_t half_q = (uint16_t)(half_n / d);
        uint32_t rem = half_n % d;
        // We computed 2^(16+shift)/(m+2^16)
        // Need to double it, and then add 1 to the quotient if doubling the
        // remainder would increase the quotient.
        // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits
        uint16_t full_q = half_q + half_q + ((rem << 1) >= d);

        // We rounded down in gen (hence +1)
        return full_q + 1;
    }
}
904 :
// Recovers the original divisor d from a branchfree u16 divider.
static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(const struct libdivide_u16_branchfree_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;

    if (!denom->magic) {
        // gen subtracted 1 from the shift for the branchfree power-of-2
        // case, so add it back here.
        return (uint16_t)1 << (shift + 1);
    } else {
        // Here we wish to compute d = 2^(16+shift+1)/(m+2^16).
        // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now
        // Also note that shift may be as high as 15, so shift + 1 will
        // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and
        // then double the quotient and remainder.
        uint32_t half_n = (uint32_t)1 << (16 + shift);
        uint32_t d = ((uint32_t)1 << 16) | denom->magic;
        // Note that the quotient is guaranteed <= 16 bits, but the remainder
        // may need 17!
        uint16_t half_q = (uint16_t)(half_n / d);
        uint32_t rem = half_n % d;
        // We computed 2^(16+shift)/(m+2^16)
        // Need to double it, and then add 1 to the quotient if doubling the
        // remainder would increase the quotient.
        // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits
        uint16_t full_q = half_q + half_q + ((rem << 1) >= d);

        // We rounded down in gen (hence +1)
        return full_q + 1;
    }
}
933 :
934 : ////////// UINT32
935 :
CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW
// Generates a libdivide_u32_t for divisor d. When branchfree is non-zero,
// the constants target the branchfree algorithm (which has a hardcoded
// right shift by 1 in its division step).
static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_internal_u32_gen(
    uint32_t d, int branchfree) {
    if (d == 0) {
        LIBDIVIDE_ERROR("divider must be != 0");
    }

    struct libdivide_u32_t result;
    uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(d);

    // Power of 2
    if ((d & (d - 1)) == 0) {
        // We need to subtract 1 from the shift value in case of an unsigned
        // branchfree divider because there is a hardcoded right shift by 1
        // in its division algorithm. Because of this we also need to add back
        // 1 in its recovery algorithm.
        result.magic = 0;
        result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
    } else {
        uint8_t more;
        uint32_t rem, proposed_m;
        // (2**(32 + floor_log_2_d)) / d; quotient fits in 32 bits, rem < d.
        proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << floor_log_2_d, 0, d, &rem);

        LIBDIVIDE_ASSERT(rem > 0 && rem < d);
        const uint32_t e = d - rem;

        // This power works if e < 2**floor_log_2_d.
        if (!branchfree && (e < ((uint32_t)1 << floor_log_2_d))) {
            // This power works
            more = (uint8_t)floor_log_2_d;
        } else {
            // We have to use the general 33-bit algorithm. We need to compute
            // (2**power) / d. However, we already have (2**(power-1))/d and
            // its remainder. By doubling both, and then correcting the
            // remainder, we can compute the larger division.
            // don't care about overflow here - in fact, we expect it
            proposed_m += proposed_m;
            const uint32_t twice_rem = rem + rem;
            // twice_rem < rem detects wraparound of the doubled remainder.
            if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
        }
        result.magic = 1 + proposed_m;
        result.more = more;
        // result.more's shift should in general be ceil_log_2_d. But if we
        // used the smaller power, we subtract one from the shift because we're
        // using the smaller power. If we're using the larger power, we
        // subtract one from the shift because it's taken care of by the add
        // indicator. So floor_log_2_d happens to be correct in both cases.
    }
    return result;
}
987 :
988 : static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d) {
989 106124 : return libdivide_internal_u32_gen(d, 0);
990 : }
991 :
992 : static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) {
993 : if (d == 1) {
994 : LIBDIVIDE_ERROR("branchfree divider must be != 1");
995 : }
996 : struct libdivide_u32_t tmp = libdivide_internal_u32_gen(d, 1);
997 : struct libdivide_u32_branchfree_t ret = {
998 : tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)};
999 : return ret;
1000 : }
1001 :
1002 : static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(uint32_t numer, uint32_t magic, uint8_t more) {
1003 5093324 : if (!magic) {
1004 2307500 : return numer >> more;
1005 : } else {
1006 2785824 : uint32_t q = libdivide_mullhi_u32(numer, magic);
1007 2785824 : if (more & LIBDIVIDE_ADD_MARKER) {
1008 459312 : uint32_t t = ((numer - q) >> 1) + q;
1009 459312 : return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
1010 : } else {
1011 : // All upper bits are 0,
1012 : // don't need to mask them off.
1013 2326514 : return q >> more;
1014 : }
1015 : }
1016 : }
1017 :
1018 : static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
1019 5093324 : return libdivide_u32_do_raw(numer, denom->magic, denom->more);
1020 : }
1021 :
1022 : static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do(
1023 : uint32_t numer, const struct libdivide_u32_branchfree_t *denom) {
1024 : uint32_t q = libdivide_mullhi_u32(numer, denom->magic);
1025 : uint32_t t = ((numer - q) >> 1) + q;
1026 : return t >> denom->more;
1027 : }
1028 :
// Recovers the original divisor d from a (branchfull) u32 divider.
static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;

    if (!denom->magic) {
        // Power-of-2 divisor: encoded purely as a shift.
        return (uint32_t)1 << shift;
    } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
        // We compute q = n/d = n*m / 2^(32 + shift)
        // Therefore we have d = 2^(32 + shift) / m
        // We need to ceil it.
        // We know d is not a power of 2, so m is not a power of 2,
        // so we can just add 1 to the floor
        uint32_t hi_dividend = (uint32_t)1 << shift;
        uint32_t rem_ignored;
        return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored);
    } else {
        // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).
        // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now
        // Also note that shift may be as high as 31, so shift + 1 will
        // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and
        // then double the quotient and remainder.
        uint64_t half_n = (uint64_t)1 << (32 + shift);
        uint64_t d = ((uint64_t)1 << 32) | denom->magic;
        // Note that the quotient is guaranteed <= 32 bits, but the remainder
        // may need 33!
        uint32_t half_q = (uint32_t)(half_n / d);
        uint64_t rem = half_n % d;
        // We computed 2^(32+shift)/(m+2^32)
        // Need to double it, and then add 1 to the quotient if doubling the
        // remainder would increase the quotient.
        // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits
        uint32_t full_q = half_q + half_q + ((rem << 1) >= d);

        // We rounded down in gen (hence +1)
        return full_q + 1;
    }
}
1066 :
// Recovers the original divisor d from a branchfree u32 divider.
static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;

    if (!denom->magic) {
        // gen subtracted 1 from the shift for the branchfree power-of-2
        // case, so add it back here.
        return (uint32_t)1 << (shift + 1);
    } else {
        // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).
        // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now
        // Also note that shift may be as high as 31, so shift + 1 will
        // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and
        // then double the quotient and remainder.
        uint64_t half_n = (uint64_t)1 << (32 + shift);
        uint64_t d = ((uint64_t)1 << 32) | denom->magic;
        // Note that the quotient is guaranteed <= 32 bits, but the remainder
        // may need 33!
        uint32_t half_q = (uint32_t)(half_n / d);
        uint64_t rem = half_n % d;
        // We computed 2^(32+shift)/(m+2^32)
        // Need to double it, and then add 1 to the quotient if doubling the
        // remainder would increase the quotient.
        // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits
        uint32_t full_q = half_q + half_q + ((rem << 1) >= d);

        // We rounded down in gen (hence +1)
        return full_q + 1;
    }
}
1095 :
1096 : ////////// UINT64
1097 :
// Generates a libdivide_u64_t for divisor d. When branchfree is non-zero,
// the constants target the branchfree algorithm (which has a hardcoded
// right shift by 1 in its division step).
static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_internal_u64_gen(
    uint64_t d, int branchfree) {
    if (d == 0) {
        LIBDIVIDE_ERROR("divider must be != 0");
    }

    struct libdivide_u64_t result;
    uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(d);

    // Power of 2
    if ((d & (d - 1)) == 0) {
        // We need to subtract 1 from the shift value in case of an unsigned
        // branchfree divider because there is a hardcoded right shift by 1
        // in its division algorithm. Because of this we also need to add back
        // 1 in its recovery algorithm.
        result.magic = 0;
        result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
    } else {
        uint64_t proposed_m, rem;
        uint8_t more;
        // (1 << (64 + floor_log_2_d)) / d
        proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << floor_log_2_d, 0, d, &rem);

        LIBDIVIDE_ASSERT(rem > 0 && rem < d);
        const uint64_t e = d - rem;

        // This power works if e < 2**floor_log_2_d.
        if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) {
            // This power works
            more = (uint8_t)floor_log_2_d;
        } else {
            // We have to use the general 65-bit algorithm. We need to compute
            // (2**power) / d. However, we already have (2**(power-1))/d and
            // its remainder. By doubling both, and then correcting the
            // remainder, we can compute the larger division.
            // don't care about overflow here - in fact, we expect it
            proposed_m += proposed_m;
            const uint64_t twice_rem = rem + rem;
            // twice_rem < rem detects wraparound of the doubled remainder.
            if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
        }
        result.magic = 1 + proposed_m;
        result.more = more;
        // result.more's shift should in general be ceil_log_2_d. But if we
        // used the smaller power, we subtract one from the shift because we're
        // using the smaller power. If we're using the larger power, we
        // subtract one from the shift because it's taken care of by the add
        // indicator. So floor_log_2_d happens to be correct in both cases,
        // which is why we do it outside of the if statement.
    }
    return result;
}
1150 :
1151 : static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d) {
1152 : return libdivide_internal_u64_gen(d, 0);
1153 : }
1154 :
1155 : static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) {
1156 : if (d == 1) {
1157 : LIBDIVIDE_ERROR("branchfree divider must be != 1");
1158 : }
1159 : struct libdivide_u64_t tmp = libdivide_internal_u64_gen(d, 1);
1160 : struct libdivide_u64_branchfree_t ret = {
1161 : tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)};
1162 : return ret;
1163 : }
1164 :
1165 : static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(uint64_t numer, uint64_t magic, uint8_t more) {
1166 : if (!magic) {
1167 : return numer >> more;
1168 : } else {
1169 : uint64_t q = libdivide_mullhi_u64(numer, magic);
1170 : if (more & LIBDIVIDE_ADD_MARKER) {
1171 : uint64_t t = ((numer - q) >> 1) + q;
1172 : return t >> (more & LIBDIVIDE_64_SHIFT_MASK);
1173 : } else {
1174 : // All upper bits are 0,
1175 : // don't need to mask them off.
1176 : return q >> more;
1177 : }
1178 : }
1179 : }
1180 :
1181 : static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
1182 : return libdivide_u64_do_raw(numer, denom->magic, denom->more);
1183 : }
1184 :
1185 : static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do(
1186 : uint64_t numer, const struct libdivide_u64_branchfree_t *denom) {
1187 : uint64_t q = libdivide_mullhi_u64(numer, denom->magic);
1188 : uint64_t t = ((numer - q) >> 1) + q;
1189 : return t >> denom->more;
1190 : }
1191 :
// Recovers the original divisor d from a (branchfull) u64 divider.
static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;

    if (!denom->magic) {
        // Power-of-2 divisor: encoded purely as a shift.
        return (uint64_t)1 << shift;
    } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
        // We compute q = n/d = n*m / 2^(64 + shift)
        // Therefore we have d = 2^(64 + shift) / m
        // We need to ceil it.
        // We know d is not a power of 2, so m is not a power of 2,
        // so we can just add 1 to the floor
        uint64_t hi_dividend = (uint64_t)1 << shift;
        uint64_t rem_ignored;
        return 1 + libdivide_128_div_64_to_64(hi_dividend, 0, denom->magic, &rem_ignored);
    } else {
        // Here we wish to compute d = 2^(64+shift+1)/(m+2^64).
        // Notice (m + 2^64) is a 65 bit number. This gets hairy. See
        // libdivide_u32_recover for more on what we do here.
        // TODO: do something better than 128 bit math

        // Full n is a (potentially) 129 bit value
        // half_n is a 128 bit value
        // Compute the hi half of half_n. Low half is 0.
        uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0;
        // d is a 65 bit value. The high bit is always set to 1.
        const uint64_t d_hi = 1, d_lo = denom->magic;
        // Note that the quotient is guaranteed <= 64 bits,
        // but the remainder may need 65!
        uint64_t r_hi, r_lo;
        uint64_t half_q =
            libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo);
        // We computed 2^(64+shift)/(m+2^64)
        // Double the remainder ('dr') and check if that is larger than d
        // Note that d is a 65 bit value, so r_hi is at most 1 and so
        // r_hi + r_hi cannot overflow
        uint64_t dr_lo = r_lo + r_lo;
        uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo);  // last term is carry
        int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo);
        uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0);
        // We rounded down in gen (hence +1)
        return full_q + 1;
    }
}
1235 :
// Recovers the original divisor d from a branchfree u64 divider.
static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;

    if (!denom->magic) {
        // gen subtracted 1 from the shift for the branchfree power-of-2
        // case, so add it back here.
        return (uint64_t)1 << (shift + 1);
    } else {
        // Here we wish to compute d = 2^(64+shift+1)/(m+2^64).
        // Notice (m + 2^64) is a 65 bit number. This gets hairy. See
        // libdivide_u32_recover for more on what we do here.
        // TODO: do something better than 128 bit math

        // Full n is a (potentially) 129 bit value
        // half_n is a 128 bit value
        // Compute the hi half of half_n. Low half is 0.
        uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0;
        // d is a 65 bit value. The high bit is always set to 1.
        const uint64_t d_hi = 1, d_lo = denom->magic;
        // Note that the quotient is guaranteed <= 64 bits,
        // but the remainder may need 65!
        uint64_t r_hi, r_lo;
        uint64_t half_q =
            libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo);
        // We computed 2^(64+shift)/(m+2^64)
        // Double the remainder ('dr') and check if that is larger than d
        // Note that d is a 65 bit value, so r_hi is at most 1 and so
        // r_hi + r_hi cannot overflow
        uint64_t dr_lo = r_lo + r_lo;
        uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo);  // last term is carry
        int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo);
        uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0);
        // We rounded down in gen (hence +1)
        return full_q + 1;
    }
}
1270 :
1271 : ////////// SINT16
1272 :
// Generates a libdivide_s16_t for divisor d. When branchfree is non-zero the
// constants target the branchfree algorithm (magic is never negated there).
static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen(
    int16_t d, int branchfree) {
    if (d == 0) {
        LIBDIVIDE_ERROR("divider must be != 0");
    }

    struct libdivide_s16_t result;

    // If d is a power of 2, or negative a power of 2, we have to use a shift.
    // This is especially important because the magic algorithm fails for -1.
    // To check if d is a power of 2 or its inverse, it suffices to check
    // whether its absolute value has exactly one bit set. This works even for
    // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
    // and is a power of 2.
    uint16_t ud = (uint16_t)d;
    uint16_t absD = (d < 0) ? -ud : ud;
    uint16_t floor_log_2_d = 15 - libdivide_count_leading_zeros16(absD);
    // check if exactly one bit is set,
    // don't care if absD is 0 since that's divide by zero
    if ((absD & (absD - 1)) == 0) {
        // Branchfree and normal paths are exactly the same
        result.magic = 0;
        result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
    } else {
        LIBDIVIDE_ASSERT(floor_log_2_d >= 1);

        uint8_t more;
        // the dividend here is 2**(floor_log_2_d + 15), so the low 16 bit word
        // is 0 and the high word is 2**(floor_log_2_d - 1)
        uint16_t rem, proposed_m;
        proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << (floor_log_2_d - 1), 0, absD, &rem);
        const uint16_t e = absD - rem;

        // We are going to start with a power of floor_log_2_d - 1.
        // This works if e < 2**floor_log_2_d.
        if (!branchfree && e < ((uint16_t)1 << floor_log_2_d)) {
            // This power works
            more = (uint8_t)(floor_log_2_d - 1);
        } else {
            // We need to go one higher. This should not make proposed_m
            // overflow, but it will make it negative when interpreted as an
            // int16_t.
            proposed_m += proposed_m;
            const uint16_t twice_rem = rem + rem;
            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
        }

        proposed_m += 1;
        int16_t magic = (int16_t)proposed_m;

        // Mark if we are negative. Note we only negate the magic number in the
        // branchfull case.
        if (d < 0) {
            more |= LIBDIVIDE_NEGATIVE_DIVISOR;
            if (!branchfree) {
                magic = -magic;
            }
        }

        result.more = more;
        result.magic = magic;
    }
    return result;
}
1338 :
1339 : static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d) {
1340 : return libdivide_internal_s16_gen(d, 0);
1341 : }
1342 :
1343 : static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d) {
1344 : struct libdivide_s16_t tmp = libdivide_internal_s16_gen(d, 1);
1345 : struct libdivide_s16_branchfree_t result = {tmp.magic, tmp.more};
1346 : return result;
1347 : }
1348 :
1349 : // The original libdivide_s16_do takes a const pointer. However, this cannot be used
1350 : // with a compile time constant libdivide_s16_t: it will generate a warning about
1351 : // taking the address of a temporary. Hence this overload.
// Divides numer by the divider described by (magic, more).
static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more) {
    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;

    if (!magic) {
        // Power-of-2 (or negated power-of-2) divisor.
        // sign is all-ones if the divisor was negative, else 0.
        uint16_t sign = (int8_t)more >> 7;
        // Round toward zero: add (2**shift - 1) to negative numerators.
        uint16_t mask = ((uint16_t)1 << shift) - 1;
        uint16_t uq = numer + ((numer >> 15) & mask);
        int16_t q = (int16_t)uq;
        q >>= shift;
        // Conditionally negate: (q ^ sign) - sign == -q when sign is all-ones.
        q = (q ^ sign) - sign;
        return q;
    } else {
        uint16_t uq = (uint16_t)libdivide_mullhi_s16(numer, magic);
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift and then sign extend
            int16_t sign = (int8_t)more >> 7;
            // q += (more < 0 ? -numer : numer)
            // cast required to avoid UB
            uq += ((uint16_t)numer ^ sign) - sign;
        }
        int16_t q = (int16_t)uq;
        q >>= shift;
        // Correct the truncation direction for negative quotients.
        q += (q < 0);
        return q;
    }
}
1378 :
1379 : static LIBDIVIDE_INLINE int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) {
1380 : return libdivide_s16_do_raw(numer, denom->magic, denom->more);
1381 : }
1382 :
// Branchfree signed 16-bit division.
static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(int16_t numer, const struct libdivide_s16_branchfree_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    // must be arithmetic shift and then sign extend
    // sign is all-ones if the divisor was negative, else 0.
    int16_t sign = (int8_t)more >> 7;
    int16_t magic = denom->magic;
    int16_t q = libdivide_mullhi_s16(numer, magic);
    q += numer;

    // If q is non-negative, we have nothing to do
    // If q is negative, we want to add either (2**shift)-1 if d is a power of
    // 2, or (2**shift) if it is not a power of 2
    uint16_t is_power_of_2 = (magic == 0);
    uint16_t q_sign = (uint16_t)(q >> 15);
    q += q_sign & (((uint16_t)1 << shift) - is_power_of_2);

    // Now arithmetic right shift
    q >>= shift;
    // Negate if needed ((q ^ sign) - sign == -q when sign is all-ones)
    q = (q ^ sign) - sign;

    return q;
}
1406 :
// Recovers the original (possibly negative) divisor d from an s16 divider.
static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    if (!denom->magic) {
        // Power-of-2 divisor: magnitude is the shift, sign from the flag.
        uint16_t absD = (uint16_t)1 << shift;
        if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
            absD = -absD;
        }
        return (int16_t)absD;
    } else {
        // Unsigned math is much easier
        // We negate the magic number only in the branchfull case, and we don't
        // know which case we're in. However we have enough information to
        // determine the correct sign of the magic number. The divisor was
        // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set,
        // the magic number's sign is opposite that of the divisor.
        // We want to compute the positive magic number.
        int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
        int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;

        // Handle the power of 2 case (including branchfree)
        if (denom->magic == 0) {
            int16_t result = (uint16_t)1 << shift;
            return negative_divisor ? -result : result;
        }

        uint16_t d = (uint16_t)(magic_was_negated ? -denom->magic : denom->magic);
        uint32_t n = (uint32_t)1 << (16 + shift);  // this shift cannot exceed 30
        uint16_t q = (uint16_t)(n / d);
        int16_t result = (int16_t)q;
        // gen rounded the magic number down, so round the divisor back up.
        result += 1;
        return negative_divisor ? -result : result;
    }
}
1441 :
1442 : static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(const struct libdivide_s16_branchfree_t *denom) {
1443 : const struct libdivide_s16_t den = {denom->magic, denom->more};
1444 : return libdivide_s16_recover(&den);
1445 : }
1446 :
1447 : ////////// SINT32
1448 :
// Generates a libdivide_s32_t for divisor d. When branchfree is non-zero the
// constants target the branchfree algorithm (magic is never negated there).
static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_internal_s32_gen(
    int32_t d, int branchfree) {
    if (d == 0) {
        LIBDIVIDE_ERROR("divider must be != 0");
    }

    struct libdivide_s32_t result;

    // If d is a power of 2, or negative a power of 2, we have to use a shift.
    // This is especially important because the magic algorithm fails for -1.
    // To check if d is a power of 2 or its inverse, it suffices to check
    // whether its absolute value has exactly one bit set. This works even for
    // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
    // and is a power of 2.
    uint32_t ud = (uint32_t)d;
    uint32_t absD = (d < 0) ? -ud : ud;
    uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(absD);
    // check if exactly one bit is set,
    // don't care if absD is 0 since that's divide by zero
    if ((absD & (absD - 1)) == 0) {
        // Branchfree and normal paths are exactly the same
        result.magic = 0;
        result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
    } else {
        LIBDIVIDE_ASSERT(floor_log_2_d >= 1);

        uint8_t more;
        // the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word
        // is 0 and the high word is 2**(floor_log_2_d - 1)
        uint32_t rem, proposed_m;
        proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << (floor_log_2_d - 1), 0, absD, &rem);
        const uint32_t e = absD - rem;

        // We are going to start with a power of floor_log_2_d - 1.
        // This works if e < 2**floor_log_2_d.
        if (!branchfree && e < ((uint32_t)1 << floor_log_2_d)) {
            // This power works
            more = (uint8_t)(floor_log_2_d - 1);
        } else {
            // We need to go one higher. This should not make proposed_m
            // overflow, but it will make it negative when interpreted as an
            // int32_t.
            proposed_m += proposed_m;
            const uint32_t twice_rem = rem + rem;
            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
        }

        proposed_m += 1;
        int32_t magic = (int32_t)proposed_m;

        // Mark if we are negative. Note we only negate the magic number in the
        // branchfull case.
        if (d < 0) {
            more |= LIBDIVIDE_NEGATIVE_DIVISOR;
            if (!branchfree) {
                magic = -magic;
            }
        }

        result.more = more;
        result.magic = magic;
    }
    return result;
}
1514 :
1515 : static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d) {
1516 : return libdivide_internal_s32_gen(d, 0);
1517 : }
1518 :
1519 : static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) {
1520 : struct libdivide_s32_t tmp = libdivide_internal_s32_gen(d, 1);
1521 : struct libdivide_s32_branchfree_t result = {tmp.magic, tmp.more};
1522 : return result;
1523 : }
1524 :
// Divides numer by the divider described by (magic, more).
static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) {
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;

    if (!magic) {
        // Power-of-2 (or negated power-of-2) divisor.
        // sign is all-ones if the divisor was negative, else 0.
        uint32_t sign = (int8_t)more >> 7;
        // Round toward zero: add (2**shift - 1) to negative numerators.
        uint32_t mask = ((uint32_t)1 << shift) - 1;
        uint32_t uq = numer + ((numer >> 31) & mask);
        int32_t q = (int32_t)uq;
        q >>= shift;
        // Conditionally negate: (q ^ sign) - sign == -q when sign is all-ones.
        q = (q ^ sign) - sign;
        return q;
    } else {
        uint32_t uq = (uint32_t)libdivide_mullhi_s32(numer, magic);
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift and then sign extend
            int32_t sign = (int8_t)more >> 7;
            // q += (more < 0 ? -numer : numer)
            // cast required to avoid UB
            uq += ((uint32_t)numer ^ sign) - sign;
        }
        int32_t q = (int32_t)uq;
        q >>= shift;
        // Correct the truncation direction for negative quotients.
        q += (q < 0);
        return q;
    }
}
1551 :
1552 : static LIBDIVIDE_INLINE int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
1553 : return libdivide_s32_do_raw(numer, denom->magic, denom->more);
1554 : }
1555 :
// Branchfree signed 32-bit division.
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    // must be arithmetic shift and then sign extend
    // sign is all-ones if the divisor was negative, else 0.
    int32_t sign = (int8_t)more >> 7;
    int32_t magic = denom->magic;
    int32_t q = libdivide_mullhi_s32(numer, magic);
    q += numer;

    // If q is non-negative, we have nothing to do
    // If q is negative, we want to add either (2**shift)-1 if d is a power of
    // 2, or (2**shift) if it is not a power of 2
    uint32_t is_power_of_2 = (magic == 0);
    uint32_t q_sign = (uint32_t)(q >> 31);
    q += q_sign & (((uint32_t)1 << shift) - is_power_of_2);

    // Now arithmetic right shift
    q >>= shift;
    // Negate if needed ((q ^ sign) - sign == -q when sign is all-ones)
    q = (q ^ sign) - sign;

    return q;
}
1579 :
// Recovers the original (possibly negative) divisor d from an s32 divider.
static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    if (!denom->magic) {
        // Power-of-2 divisor: magnitude is the shift, sign from the flag.
        uint32_t absD = (uint32_t)1 << shift;
        if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
            absD = -absD;
        }
        return (int32_t)absD;
    } else {
        // Unsigned math is much easier
        // We negate the magic number only in the branchfull case, and we don't
        // know which case we're in. However we have enough information to
        // determine the correct sign of the magic number. The divisor was
        // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set,
        // the magic number's sign is opposite that of the divisor.
        // We want to compute the positive magic number.
        int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
        int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;

        // Handle the power of 2 case (including branchfree)
        if (denom->magic == 0) {
            int32_t result = (uint32_t)1 << shift;
            return negative_divisor ? -result : result;
        }

        uint32_t d = (uint32_t)(magic_was_negated ? -denom->magic : denom->magic);
        uint64_t n = (uint64_t)1 << (32 + shift);  // this shift cannot exceed 30
        uint32_t q = (uint32_t)(n / d);
        int32_t result = (int32_t)q;
        // gen rounded the magic number down, so round the divisor back up.
        result += 1;
        return negative_divisor ? -result : result;
    }
}
1614 :
1615 : static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom) {
1616 : const struct libdivide_s32_t den = {denom->magic, denom->more};
1617 : return libdivide_s32_recover(&den);
1618 : }
1619 :
1620 : ////////// SINT64
1621 :
// Generate the magic/shift encoding for signed 64-bit division by d.
// When branchfree is non-zero, the branchfree variant is produced (magic kept
// positive; ADD_MARKER always set for non-power-of-2 divisors).
static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_internal_s64_gen(
    int64_t d, int branchfree) {
    if (d == 0) {
        LIBDIVIDE_ERROR("divider must be != 0");
    }

    struct libdivide_s64_t result;

    // If d is a power of 2, or negative a power of 2, we have to use a shift.
    // This is especially important because the magic algorithm fails for -1.
    // To check if d is a power of 2 or its inverse, it suffices to check
    // whether its absolute value has exactly one bit set. This works even for
    // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
    // and is a power of 2.
    uint64_t ud = (uint64_t)d;
    uint64_t absD = (d < 0) ? -ud : ud;
    uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(absD);
    // check if exactly one bit is set,
    // don't care if absD is 0 since that's divide by zero
    if ((absD & (absD - 1)) == 0) {
        // Branchfree and non-branchfree cases are the same
        result.magic = 0;
        result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
    } else {
        // the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word
        // is 0 and the high word is 2**(floor_log_2_d - 1)
        uint8_t more;
        uint64_t rem, proposed_m;
        proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << (floor_log_2_d - 1), 0, absD, &rem);
        const uint64_t e = absD - rem;

        // We are going to start with a shift of floor_log_2_d - 1.
        // This works if e < 2**floor_log_2_d.
        if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) {
            // This power works
            more = (uint8_t)(floor_log_2_d - 1);
        } else {
            // We need to go one higher. This should not make proposed_m
            // overflow, but it will make it negative when interpreted as an
            // int64_t.
            proposed_m += proposed_m;
            const uint64_t twice_rem = rem + rem;
            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
            // note that we only set the LIBDIVIDE_NEGATIVE_DIVISOR bit if we
            // also set ADD_MARKER this is an annoying optimization that
            // enables algorithm #4 to avoid the mask. However we always set it
            // in the branchfree case
            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
        }
        proposed_m += 1;
        int64_t magic = (int64_t)proposed_m;

        // Mark if we are negative
        if (d < 0) {
            more |= LIBDIVIDE_NEGATIVE_DIVISOR;
            if (!branchfree) {
                magic = -magic;
            }
        }

        result.more = more;
        result.magic = magic;
    }
    return result;
}
1687 :
// Generate a (branchful) signed 64-bit divider for d.
static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d) {
    return libdivide_internal_s64_gen(d, 0);
}
1691 :
1692 : static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) {
1693 : struct libdivide_s64_t tmp = libdivide_internal_s64_gen(d, 1);
1694 : struct libdivide_s64_branchfree_t ret = {tmp.magic, tmp.more};
1695 : return ret;
1696 : }
1697 :
// Divide numer by the divisor encoded as (magic, more).
static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) {
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;

    if (!magic) { // shift path
        // Round toward zero: add (2**shift - 1) to negative numerators
        // before the arithmetic shift.
        uint64_t mask = ((uint64_t)1 << shift) - 1;
        uint64_t uq = numer + ((numer >> 63) & mask);
        int64_t q = (int64_t)uq;
        q >>= shift;
        // must be arithmetic shift and then sign-extend
        int64_t sign = (int8_t)more >> 7;
        q = (q ^ sign) - sign;
        return q;
    } else {
        uint64_t uq = (uint64_t)libdivide_mullhi_s64(numer, magic);
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift and then sign extend
            int64_t sign = (int8_t)more >> 7;
            // q += (more < 0 ? -numer : numer)
            // cast required to avoid UB
            uq += ((uint64_t)numer ^ sign) - sign;
        }
        int64_t q = (int64_t)uq;
        q >>= shift;
        q += (q < 0);  // round remainder toward zero
        return q;
    }
}
1725 :
// Divide numer by the divider *denom using the branchful algorithm.
static LIBDIVIDE_INLINE int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
    return libdivide_s64_do_raw(numer, denom->magic, denom->more);
}
1729 :
// Branchfree signed 64-bit division: the same instruction sequence runs
// regardless of the divisor's properties.
static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    // must be arithmetic shift and then sign extend
    int64_t sign = (int8_t)more >> 7;
    int64_t magic = denom->magic;
    int64_t q = libdivide_mullhi_s64(numer, magic);
    q += numer;

    // If q is non-negative, we have nothing to do.
    // If q is negative, we want to add either (2**shift)-1 if d is a power of
    // 2, or (2**shift) if it is not a power of 2.
    uint64_t is_power_of_2 = (magic == 0);
    uint64_t q_sign = (uint64_t)(q >> 63);
    q += q_sign & (((uint64_t)1 << shift) - is_power_of_2);

    // Arithmetic right shift
    q >>= shift;
    // Negate if needed
    q = (q ^ sign) - sign;

    return q;
}
1753 :
1754 : static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) {
1755 : uint8_t more = denom->more;
1756 : uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
1757 : if (denom->magic == 0) { // shift path
1758 : uint64_t absD = (uint64_t)1 << shift;
1759 : if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
1760 : absD = -absD;
1761 : }
1762 : return (int64_t)absD;
1763 : } else {
1764 : // Unsigned math is much easier
1765 : int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
1766 : int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;
1767 :
1768 : uint64_t d = (uint64_t)(magic_was_negated ? -denom->magic : denom->magic);
1769 : uint64_t n_hi = (uint64_t)1 << shift, n_lo = 0;
1770 : uint64_t rem_ignored;
1771 : uint64_t q = libdivide_128_div_64_to_64(n_hi, n_lo, d, &rem_ignored);
1772 : int64_t result = (int64_t)(q + 1);
1773 : if (negative_divisor) {
1774 : result = -result;
1775 : }
1776 : return result;
1777 : }
1778 : }
1779 :
1780 : static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom) {
1781 : const struct libdivide_s64_t den = {denom->magic, denom->more};
1782 : return libdivide_s64_recover(&den);
1783 : }
1784 :
1785 : // Simplest possible vector type division: treat the vector type as an array
1786 : // of underlying native type.
1787 : //
1788 : // Use a union to read a vector via pointer-to-integer, without violating strict
1789 : // aliasing.
// IntT - scalar element type (e.g. uint16_t)
// VecT - vector type being divided (e.g. uint16x8_t, __m512i)
// Algo - scalar algorithm suffix; libdivide_##Algo##_do is applied per lane
// Expands to a full function body; expects `numers` and `denom` in scope.
#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \
    const size_t count = sizeof(VecT) / sizeof(IntT); \
    union type_pun_vec { \
        VecT vec; \
        IntT arr[sizeof(VecT) / sizeof(IntT)]; \
    }; \
    union type_pun_vec result; \
    union type_pun_vec input; \
    input.vec = numers; \
    for (size_t loop = 0; loop < count; ++loop) { \
        result.arr[loop] = libdivide_##Algo##_do(input.arr[loop], denom); \
    } \
    return result.vec;
1803 :
1804 : #if defined(LIBDIVIDE_NEON)
1805 :
1806 : static LIBDIVIDE_INLINE uint16x8_t libdivide_u16_do_vec128(
1807 : uint16x8_t numers, const struct libdivide_u16_t *denom);
1808 : static LIBDIVIDE_INLINE int16x8_t libdivide_s16_do_vec128(
1809 : int16x8_t numers, const struct libdivide_s16_t *denom);
1810 : static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_do_vec128(
1811 : uint32x4_t numers, const struct libdivide_u32_t *denom);
1812 : static LIBDIVIDE_INLINE int32x4_t libdivide_s32_do_vec128(
1813 : int32x4_t numers, const struct libdivide_s32_t *denom);
1814 : static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_do_vec128(
1815 : uint64x2_t numers, const struct libdivide_u64_t *denom);
1816 : static LIBDIVIDE_INLINE int64x2_t libdivide_s64_do_vec128(
1817 : int64x2_t numers, const struct libdivide_s64_t *denom);
1818 :
1819 : static LIBDIVIDE_INLINE uint16x8_t libdivide_u16_branchfree_do_vec128(
1820 : uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom);
1821 : static LIBDIVIDE_INLINE int16x8_t libdivide_s16_branchfree_do_vec128(
1822 : int16x8_t numers, const struct libdivide_s16_branchfree_t *denom);
1823 : static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_branchfree_do_vec128(
1824 : uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom);
1825 : static LIBDIVIDE_INLINE int32x4_t libdivide_s32_branchfree_do_vec128(
1826 : int32x4_t numers, const struct libdivide_s32_branchfree_t *denom);
1827 : static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_branchfree_do_vec128(
1828 : uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom);
1829 : static LIBDIVIDE_INLINE int64x2_t libdivide_s64_branchfree_do_vec128(
1830 : int64x2_t numers, const struct libdivide_s64_branchfree_t *denom);
1831 :
1832 : //////// Internal Utility Functions
1833 :
// Logical right shift by a runtime amount.
// NEON implements right shift as a left shift by a negative value.
static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_neon_srl(uint32x4_t v, uint8_t amt) {
    int32_t shift_amount = (int32_t)(amt);
    return vshlq_u32(v, vdupq_n_s32(-shift_amount));
}
1840 :
// Logical right shift of 64-bit lanes by a runtime amount (negative left shift).
static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_neon_srl(uint64x2_t v, uint8_t amt) {
    int64_t shift_amount = (int64_t)(amt);
    return vshlq_u64(v, vdupq_n_s64(-shift_amount));
}
1845 :
// Arithmetic right shift by a runtime amount (negative left shift).
static LIBDIVIDE_INLINE int32x4_t libdivide_s32_neon_sra(int32x4_t v, uint8_t amt) {
    int32_t shift_amount = (int32_t)(amt);
    return vshlq_s32(v, vdupq_n_s32(-shift_amount));
}
1851 :
// Arithmetic right shift of 64-bit lanes by a runtime amount (negative left shift).
static LIBDIVIDE_INLINE int64x2_t libdivide_s64_neon_sra(int64x2_t v, uint8_t amt) {
    int64_t shift_amount = (int64_t)(amt);
    return vshlq_s64(v, vdupq_n_s64(-shift_amount));
}
1856 :
// Broadcast each 64-bit lane's sign bit across the lane: 0 if non-negative, -1 if negative.
static LIBDIVIDE_INLINE int64x2_t libdivide_s64_signbits(int64x2_t v) { return vshrq_n_s64(v, 63); }
1858 :
// High 32 bits of the unsigned 32x32->64 product of each lane of a with b.
static LIBDIVIDE_INLINE uint32x4_t libdivide_mullhi_u32_vec128(uint32x4_t a, uint32_t b) {
    // The widening multiplies leave the high words in the odd 32-bit
    // positions: [_, x0, _, x1] and [_, x2, _, x3]. De-interleave the odd
    // words to produce [x0, x1, x2, x3].
    uint32x4_t lo_pairs = vreinterpretq_u32_u64(vmull_n_u32(vget_low_u32(a), b));
    uint32x4_t hi_pairs = vreinterpretq_u32_u64(vmull_high_n_u32(a, b));
    return vuzp2q_u32(lo_pairs, hi_pairs);
}
1865 :
// High 32 bits of the signed 32x32->64 product of each lane of a with b.
static LIBDIVIDE_INLINE int32x4_t libdivide_mullhi_s32_vec128(int32x4_t a, int32_t b) {
    // Same de-interleave scheme as the unsigned variant, with signed
    // widening multiplies: odd 32-bit words hold the high halves.
    int32x4_t lo_pairs = vreinterpretq_s32_s64(vmull_n_s32(vget_low_s32(a), b));
    int32x4_t hi_pairs = vreinterpretq_s32_s64(vmull_high_n_s32(a, b));
    return vuzp2q_s32(lo_pairs, hi_pairs);
}
1871 :
// High 64 bits of the unsigned 64x64->128 product of each lane of x with sy,
// built from four 32x32->64 partial products.
static LIBDIVIDE_INLINE uint64x2_t libdivide_mullhi_u64_vec128(uint64x2_t x, uint64_t sy) {
    // full 128 bits product is:
    // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64)
    // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64.

    // Get low and high words. x0 contains low 32 bits, x1 is high 32 bits.
    uint64x2_t y = vdupq_n_u64(sy);
    uint32x2_t x0 = vmovn_u64(x);
    uint32x2_t y0 = vmovn_u64(y);
    uint32x2_t x1 = vshrn_n_u64(x, 32);
    uint32x2_t y1 = vshrn_n_u64(y, 32);

    // Compute x0*y0.
    uint64x2_t x0y0 = vmull_u32(x0, y0);
    uint64x2_t x0y0_hi = vshrq_n_u64(x0y0, 32);

    // Compute other intermediate products.
    uint64x2_t temp = vmlal_u32(x0y0_hi, x1, y0); // temp = x0y0_hi + x1*y0;
    // We want to split temp into its low 32 bits and high 32 bits, both
    // in the low half of 64 bit registers.
    // Use shifts to avoid needing a reg for the mask.
    uint64x2_t temp_lo = vshrq_n_u64(vshlq_n_u64(temp, 32), 32); // temp_lo = temp & 0xFFFFFFFF;
    uint64x2_t temp_hi = vshrq_n_u64(temp, 32); // temp_hi = temp >> 32;

    temp_lo = vmlal_u32(temp_lo, x0, y1); // temp_lo += x0*y1
    temp_lo = vshrq_n_u64(temp_lo, 32); // temp_lo >>= 32
    temp_hi = vmlal_u32(temp_hi, x1, y1); // temp_hi += x1*y1
    uint64x2_t result = vaddq_u64(temp_hi, temp_lo);
    return result;
}
1902 :
// Signed high-half 64x64 multiply derived from the unsigned version. For
// two's complement: mulhi_s(x, y) = mulhi_u(x, y) - (x < 0 ? y : 0) - (y < 0 ? x : 0).
static LIBDIVIDE_INLINE int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) {
    int64x2_t p = vreinterpretq_s64_u64(
        libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), (uint64_t)(sy)));
    int64x2_t y = vdupq_n_s64(sy);
    // signbits yields 0 or all-ones per lane, so the AND selects y (resp. x)
    // exactly where the correction applies.
    int64x2_t t1 = vandq_s64(libdivide_s64_signbits(x), y);
    int64x2_t t2 = vandq_s64(libdivide_s64_signbits(y), x);
    p = vsubq_s64(p, t1);
    p = vsubq_s64(p, t2);
    return p;
}
1913 :
1914 : ////////// UINT16
1915 :
// Lane-wise u16 division: no dedicated NEON path, falls back to scalar per lane.
uint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom){
    SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16)}
1918 :
// Lane-wise branchfree u16 division via the scalar algorithm.
uint16x8_t libdivide_u16_branchfree_do_vec128(
    uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom){
    SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree)}
1922 :
1923 : ////////// UINT32
1924 :
// Vector version of the branchful unsigned 32-bit divide.
uint32x4_t libdivide_u32_do_vec128(uint32x4_t numers, const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        // Power-of-2 divisor: plain logical shift.
        return libdivide_u32_neon_srl(numers, more);
    } else {
        uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic);
        if (more & LIBDIVIDE_ADD_MARKER) {
            // uint32_t t = ((numer - q) >> 1) + q;
            // return t >> denom->shift;
            // Note we can use halving-subtract to avoid the shift.
            uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
            uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q);
            return libdivide_u32_neon_srl(t, shift);
        } else {
            return libdivide_u32_neon_srl(q, more);
        }
    }
}
1943 :
// Vector version of the branchfree unsigned 32-bit divide:
// q = mulhi(numers, magic); t = ((numers - q) >> 1) + q; return t >> more.
uint32x4_t libdivide_u32_branchfree_do_vec128(
    uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom) {
    uint32x4_t hi = libdivide_mullhi_u32_vec128(numers, denom->magic);
    // The halving subtract folds (numers - hi) >> 1 into a single op.
    uint32x4_t avg = vaddq_u32(vhsubq_u32(numers, hi), hi);
    return libdivide_u32_neon_srl(avg, denom->more);
}
1950 :
1951 : ////////// UINT64
1952 :
// Vector version of the branchful unsigned 64-bit divide.
uint64x2_t libdivide_u64_do_vec128(uint64x2_t numers, const struct libdivide_u64_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        // Power-of-2 divisor: plain logical shift.
        return libdivide_u64_neon_srl(numers, more);
    } else {
        uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic);
        if (more & LIBDIVIDE_ADD_MARKER) {
            // uint64_t t = ((numer - q) >> 1) + q;
            // return t >> denom->shift;
            // No 64-bit halving subtracts in NEON :(
            uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
            uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q);
            return libdivide_u64_neon_srl(t, shift);
        } else {
            return libdivide_u64_neon_srl(q, more);
        }
    }
}
1971 :
// Vector version of the branchfree unsigned 64-bit divide:
// q = mulhi(numers, magic); t = ((numers - q) >> 1) + q; return t >> more.
uint64x2_t libdivide_u64_branchfree_do_vec128(
    uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom) {
    uint64x2_t hi = libdivide_mullhi_u64_vec128(numers, denom->magic);
    uint64x2_t avg = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, hi), 1), hi);
    return libdivide_u64_neon_srl(avg, denom->more);
}
1978 :
1979 : ////////// SINT16
1980 :
// Lane-wise s16 division: no dedicated NEON path, falls back to scalar per lane.
int16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom){
    SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16)}
1983 :
// Lane-wise branchfree s16 division via the scalar algorithm.
int16x8_t libdivide_s16_branchfree_do_vec128(
    int16x8_t numers, const struct libdivide_s16_branchfree_t *denom){
    SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree)}
1987 :
1988 : ////////// SINT32
1989 :
// Vector version of the branchful signed 32-bit divide; mirrors the scalar
// libdivide_s32_do logic lane by lane.
int32x4_t libdivide_s32_do_vec128(int32x4_t numers, const struct libdivide_s32_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
        uint32_t mask = ((uint32_t)1 << shift) - 1;
        int32x4_t roundToZeroTweak = vdupq_n_s32((int)mask);
        // q = numer + ((numer >> 31) & roundToZeroTweak);
        int32x4_t q = vaddq_s32(numers, vandq_s32(vshrq_n_s32(numers, 31), roundToZeroTweak));
        q = libdivide_s32_neon_sra(q, shift);
        // sign is 0 or -1 depending on LIBDIVIDE_NEGATIVE_DIVISOR.
        int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);
        // q = (q ^ sign) - sign;
        q = vsubq_s32(veorq_s32(q, sign), sign);
        return q;
    } else {
        int32x4_t q = libdivide_mullhi_s32_vec128(numers, denom->magic);
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift
            int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            q = vaddq_s32(q, vsubq_s32(veorq_s32(numers, sign), sign));
        }
        // q >>= shift
        q = libdivide_s32_neon_sra(q, more & LIBDIVIDE_32_SHIFT_MASK);
        q = vaddq_s32(
            q, vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(q), 31))); // q += (q < 0)
        return q;
    }
}
2018 :
// Vector version of the branchfree signed 32-bit divide; mirrors the scalar
// libdivide_s32_branchfree_do logic lane by lane.
int32x4_t libdivide_s32_branchfree_do_vec128(
    int32x4_t numers, const struct libdivide_s32_branchfree_t *denom) {
    int32_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    // must be arithmetic shift
    int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);
    int32x4_t q = libdivide_mullhi_s32_vec128(numers, magic);
    q = vaddq_s32(q, numers); // q += numers

    // If q is non-negative, we have nothing to do
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2
    uint32_t is_power_of_2 = (magic == 0);
    int32x4_t q_sign = vshrq_n_s32(q, 31); // q_sign = q >> 31
    int32x4_t mask = vdupq_n_s32(((uint32_t)1 << shift) - is_power_of_2);
    q = vaddq_s32(q, vandq_s32(q_sign, mask)); // q = q + (q_sign & mask)
    q = libdivide_s32_neon_sra(q, shift);      // q >>= shift
    q = vsubq_s32(veorq_s32(q, sign), sign);   // q = (q ^ sign) - sign
    return q;
}
2040 :
2041 : ////////// SINT64
2042 :
// Vector version of the branchful signed 64-bit divide; mirrors the scalar
// libdivide_s64_do_raw logic lane by lane.
int64x2_t libdivide_s64_do_vec128(int64x2_t numers, const struct libdivide_s64_t *denom) {
    uint8_t more = denom->more;
    int64_t magic = denom->magic;
    if (magic == 0) { // shift path
        uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
        uint64_t mask = ((uint64_t)1 << shift) - 1;
        int64x2_t roundToZeroTweak = vdupq_n_s64(mask); // TODO: no need to sign extend
        // q = numer + ((numer >> 63) & roundToZeroTweak);
        int64x2_t q =
            vaddq_s64(numers, vandq_s64(libdivide_s64_signbits(numers), roundToZeroTweak));
        q = libdivide_s64_neon_sra(q, shift);
        // q = (q ^ sign) - sign;
        int64x2_t sign = vreinterpretq_s64_s8(vdupq_n_s8((int8_t)more >> 7));
        q = vsubq_s64(veorq_s64(q, sign), sign);
        return q;
    } else {
        int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic);
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift
            int64x2_t sign = vdupq_n_s64((int8_t)more >> 7); // TODO: no need to widen
            // q += ((numer ^ sign) - sign);
            q = vaddq_s64(q, vsubq_s64(veorq_s64(numers, sign), sign));
        }
        // q >>= denom->mult_path.shift
        q = libdivide_s64_neon_sra(q, more & LIBDIVIDE_64_SHIFT_MASK);
        q = vaddq_s64(
            q, vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(q), 63))); // q += (q < 0)
        return q;
    }
}
2073 :
// Vector version of the branchfree signed 64-bit divide; mirrors the scalar
// libdivide_s64_branchfree_do logic lane by lane.
int64x2_t libdivide_s64_branchfree_do_vec128(
    int64x2_t numers, const struct libdivide_s64_branchfree_t *denom) {
    int64_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    // must be arithmetic shift
    int64x2_t sign = vdupq_n_s64((int8_t)more >> 7); // TODO: avoid sign extend

    // libdivide_mullhi_s64(numers, magic);
    int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic);
    q = vaddq_s64(q, numers); // q += numers

    // If q is non-negative, we have nothing to do.
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2.
    uint32_t is_power_of_2 = (magic == 0);
    int64x2_t q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63
    int64x2_t mask = vdupq_n_s64(((uint64_t)1 << shift) - is_power_of_2);
    q = vaddq_s64(q, vandq_s64(q_sign, mask)); // q = q + (q_sign & mask)
    q = libdivide_s64_neon_sra(q, shift);      // q >>= shift
    q = vsubq_s64(veorq_s64(q, sign), sign);   // q = (q ^ sign) - sign
    return q;
}
2097 :
2098 : #endif
2099 :
2100 : #if defined(LIBDIVIDE_AVX512)
2101 :
2102 : static LIBDIVIDE_INLINE __m512i libdivide_u16_do_vec512(
2103 : __m512i numers, const struct libdivide_u16_t *denom);
2104 : static LIBDIVIDE_INLINE __m512i libdivide_s16_do_vec512(
2105 : __m512i numers, const struct libdivide_s16_t *denom);
2106 : static LIBDIVIDE_INLINE __m512i libdivide_u32_do_vec512(
2107 : __m512i numers, const struct libdivide_u32_t *denom);
2108 : static LIBDIVIDE_INLINE __m512i libdivide_s32_do_vec512(
2109 : __m512i numers, const struct libdivide_s32_t *denom);
2110 : static LIBDIVIDE_INLINE __m512i libdivide_u64_do_vec512(
2111 : __m512i numers, const struct libdivide_u64_t *denom);
2112 : static LIBDIVIDE_INLINE __m512i libdivide_s64_do_vec512(
2113 : __m512i numers, const struct libdivide_s64_t *denom);
2114 :
2115 : static LIBDIVIDE_INLINE __m512i libdivide_u16_branchfree_do_vec512(
2116 : __m512i numers, const struct libdivide_u16_branchfree_t *denom);
2117 : static LIBDIVIDE_INLINE __m512i libdivide_s16_branchfree_do_vec512(
2118 : __m512i numers, const struct libdivide_s16_branchfree_t *denom);
2119 : static LIBDIVIDE_INLINE __m512i libdivide_u32_branchfree_do_vec512(
2120 : __m512i numers, const struct libdivide_u32_branchfree_t *denom);
2121 : static LIBDIVIDE_INLINE __m512i libdivide_s32_branchfree_do_vec512(
2122 : __m512i numers, const struct libdivide_s32_branchfree_t *denom);
2123 : static LIBDIVIDE_INLINE __m512i libdivide_u64_branchfree_do_vec512(
2124 : __m512i numers, const struct libdivide_u64_branchfree_t *denom);
2125 : static LIBDIVIDE_INLINE __m512i libdivide_s64_branchfree_do_vec512(
2126 : __m512i numers, const struct libdivide_s64_branchfree_t *denom);
2127 :
2128 : //////// Internal Utility Functions
2129 :
2130 : static LIBDIVIDE_INLINE __m512i libdivide_s64_signbits_vec512(__m512i v) {
2131 : ;
2132 : return _mm512_srai_epi64(v, 63);
2133 : }
2134 :
// Arithmetic right shift of each 64-bit lane by a runtime amount.
// AVX-512 has a native 64-bit arithmetic shift, unlike SSE2/AVX2.
static LIBDIVIDE_INLINE __m512i libdivide_s64_shift_right_vec512(__m512i v, int amt) {
    return _mm512_srai_epi64(v, amt);
}
2138 :
2139 : // Here, b is assumed to contain one 32-bit value repeated.
2140 : static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u32_vec512(__m512i a, __m512i b) {
2141 : __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epu32(a, b), 32);
2142 : __m512i a1X3X = _mm512_srli_epi64(a, 32);
2143 : __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
2144 : __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epu32(a1X3X, b), mask);
2145 : return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3);
2146 : }
2147 :
2148 : // b is one 32-bit value repeated.
2149 : static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s32_vec512(__m512i a, __m512i b) {
2150 : __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epi32(a, b), 32);
2151 : __m512i a1X3X = _mm512_srli_epi64(a, 32);
2152 : __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
2153 : __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epi32(a1X3X, b), mask);
2154 : return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3);
2155 : }
2156 :
// Here, y is assumed to contain one 64-bit value repeated.
// High 64 bits of the unsigned 64x64->128 product per lane, built from four
// 32x32->64 partial products.
static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y) {
    // see m128i variant for comments.
    __m512i x0y0 = _mm512_mul_epu32(x, y);
    __m512i x0y0_hi = _mm512_srli_epi64(x0y0, 32);

    __m512i x1 = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1));
    __m512i y1 = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1));

    __m512i x0y1 = _mm512_mul_epu32(x, y1);
    __m512i x1y0 = _mm512_mul_epu32(x1, y);
    __m512i x1y1 = _mm512_mul_epu32(x1, y1);

    __m512i mask = _mm512_set1_epi64(0xFFFFFFFF);
    __m512i temp = _mm512_add_epi64(x1y0, x0y0_hi);
    __m512i temp_lo = _mm512_and_si512(temp, mask);
    __m512i temp_hi = _mm512_srli_epi64(temp, 32);

    temp_lo = _mm512_srli_epi64(_mm512_add_epi64(temp_lo, x0y1), 32);
    temp_hi = _mm512_add_epi64(x1y1, temp_hi);
    return _mm512_add_epi64(temp_lo, temp_hi);
}
2179 :
// y is one 64-bit value repeated.
// Signed high-half multiply derived from the unsigned version. For two's
// complement: mulhi_s(x, y) = mulhi_u(x, y) - (x < 0 ? y : 0) - (y < 0 ? x : 0).
static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y) {
    __m512i p = libdivide_mullhi_u64_vec512(x, y);
    // signbits yields 0 or all-ones per lane, so the AND selects the
    // correction term exactly where it applies.
    __m512i t1 = _mm512_and_si512(libdivide_s64_signbits_vec512(x), y);
    __m512i t2 = _mm512_and_si512(libdivide_s64_signbits_vec512(y), x);
    p = _mm512_sub_epi64(p, t1);
    p = _mm512_sub_epi64(p, t2);
    return p;
}
2189 :
2190 : ////////// UINT16
2191 :
// Lane-wise u16 division: no dedicated AVX-512 path, falls back to scalar per lane.
__m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom){
    SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16)}
2194 :
// Lane-wise branchfree u16 division via the scalar algorithm.
__m512i libdivide_u16_branchfree_do_vec512(
    __m512i numers, const struct libdivide_u16_branchfree_t *denom){
    SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree)}
2198 :
2199 : ////////// UINT32
2200 :
// Vector version of the branchful unsigned 32-bit divide.
__m512i libdivide_u32_do_vec512(__m512i numers, const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        // Power-of-2 divisor: plain logical shift.
        return _mm512_srli_epi32(numers, more);
    } else {
        __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // uint32_t t = ((numer - q) >> 1) + q;
            // return t >> denom->shift;
            uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
            __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q);
            return _mm512_srli_epi32(t, shift);
        } else {
            return _mm512_srli_epi32(q, more);
        }
    }
}
2218 :
2219 : __m512i libdivide_u32_branchfree_do_vec512(
2220 : __m512i numers, const struct libdivide_u32_branchfree_t *denom) {
2221 : __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic));
2222 : __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q);
2223 : return _mm512_srli_epi32(t, denom->more);
2224 : }
2225 :
2226 : ////////// UINT64
2227 :
// Vector version of the branchful unsigned 64-bit divide.
__m512i libdivide_u64_do_vec512(__m512i numers, const struct libdivide_u64_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        // Power-of-2 divisor: plain logical shift.
        return _mm512_srli_epi64(numers, more);
    } else {
        __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // uint64_t t = ((numer - q) >> 1) + q;
            // return t >> denom->shift;
            uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
            __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q);
            return _mm512_srli_epi64(t, shift);
        } else {
            return _mm512_srli_epi64(q, more);
        }
    }
}
2245 :
2246 : __m512i libdivide_u64_branchfree_do_vec512(
2247 : __m512i numers, const struct libdivide_u64_branchfree_t *denom) {
2248 : __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic));
2249 : __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q);
2250 : return _mm512_srli_epi64(t, denom->more);
2251 : }
2252 :
2253 : ////////// SINT16
2254 :
// Lane-wise s16 division: no dedicated AVX-512 path, falls back to scalar per lane.
__m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom){
    SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16)}
2257 :
// Lane-wise branchfree s16 division via the scalar algorithm.
__m512i libdivide_s16_branchfree_do_vec512(
    __m512i numers, const struct libdivide_s16_branchfree_t *denom){
    SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree)}
2261 :
2262 : ////////// SINT32
2263 :
// Vector version of the branchful signed 32-bit divide; mirrors the scalar
// libdivide_s32_do logic lane by lane.
__m512i libdivide_s32_do_vec512(__m512i numers, const struct libdivide_s32_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
        uint32_t mask = ((uint32_t)1 << shift) - 1;
        __m512i roundToZeroTweak = _mm512_set1_epi32(mask);
        // q = numer + ((numer >> 31) & roundToZeroTweak);
        __m512i q = _mm512_add_epi32(
            numers, _mm512_and_si512(_mm512_srai_epi32(numers, 31), roundToZeroTweak));
        q = _mm512_srai_epi32(q, shift);
        // sign is 0 or -1 depending on LIBDIVIDE_NEGATIVE_DIVISOR.
        __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
        // q = (q ^ sign) - sign;
        q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign);
        return q;
    } else {
        __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift
            __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            q = _mm512_add_epi32(q, _mm512_sub_epi32(_mm512_xor_si512(numers, sign), sign));
        }
        // q >>= shift
        q = _mm512_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
        q = _mm512_add_epi32(q, _mm512_srli_epi32(q, 31)); // q += (q < 0)
        return q;
    }
}
2292 :
// Vector version of the branchfree signed 32-bit divide; mirrors the scalar
// libdivide_s32_branchfree_do logic lane by lane.
__m512i libdivide_s32_branchfree_do_vec512(
    __m512i numers, const struct libdivide_s32_branchfree_t *denom) {
    int32_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    // must be arithmetic shift
    __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
    __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(magic));
    q = _mm512_add_epi32(q, numers); // q += numers

    // If q is non-negative, we have nothing to do
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2
    uint32_t is_power_of_2 = (magic == 0);
    __m512i q_sign = _mm512_srai_epi32(q, 31); // q_sign = q >> 31
    __m512i mask = _mm512_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);
    q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask)
    q = _mm512_srai_epi32(q, shift);                         // q >>= shift
    q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign);   // q = (q ^ sign) - sign
    return q;
}
2314 :
2315 : ////////// SINT64
2316 :
// Vector version of the branchful signed 64-bit divide; mirrors the scalar
// libdivide_s64_do_raw logic lane by lane.
__m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *denom) {
    uint8_t more = denom->more;
    int64_t magic = denom->magic;
    if (magic == 0) { // shift path
        uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
        uint64_t mask = ((uint64_t)1 << shift) - 1;
        __m512i roundToZeroTweak = _mm512_set1_epi64(mask);
        // q = numer + ((numer >> 63) & roundToZeroTweak);
        __m512i q = _mm512_add_epi64(
            numers, _mm512_and_si512(libdivide_s64_signbits_vec512(numers), roundToZeroTweak));
        q = libdivide_s64_shift_right_vec512(q, shift);
        // Broadcasting (int8_t)more >> 7 (0 or -1) into 32-bit lanes also
        // yields all-zero / all-one 64-bit lanes, so epi64 math is safe.
        __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
        // q = (q ^ sign) - sign;
        q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign);
        return q;
    } else {
        __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift
            __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            q = _mm512_add_epi64(q, _mm512_sub_epi64(_mm512_xor_si512(numers, sign), sign));
        }
        // q >>= denom->mult_path.shift
        q = libdivide_s64_shift_right_vec512(q, more & LIBDIVIDE_64_SHIFT_MASK);
        q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63)); // q += (q < 0)
        return q;
    }
}
2346 :
// Branchfree division of 8 packed int64s by *denom: a single straight-line
// instruction sequence covering both power-of-2 and general divisors.
__m512i libdivide_s64_branchfree_do_vec512(
    __m512i numers, const struct libdivide_s64_branchfree_t *denom) {
    int64_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    // must be arithmetic shift
    // set1_epi32 of -1/0 also fills each 64-bit lane with all-ones/all-zeros.
    __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);

    // libdivide_mullhi_s64(numers, magic);
    __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic));
    q = _mm512_add_epi64(q, numers); // q += numers

    // If q is non-negative, we have nothing to do.
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2.
    uint32_t is_power_of_2 = (magic == 0);
    __m512i q_sign = libdivide_s64_signbits_vec512(q); // q_sign = q >> 63
    __m512i mask = _mm512_set1_epi64(((uint64_t)1 << shift) - is_power_of_2);
    q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask)
    q = libdivide_s64_shift_right_vec512(q, shift); // q >>= shift
    q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign
    return q;
}
2370 :
2371 : #endif
2372 :
2373 : #if defined(LIBDIVIDE_AVX2)
2374 :
// Forward declarations of the AVX2 (256-bit) division routines: one per
// element width/signedness, in branching ("do") and branchfree variants.
static LIBDIVIDE_INLINE __m256i libdivide_u16_do_vec256(
    __m256i numers, const struct libdivide_u16_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_s16_do_vec256(
    __m256i numers, const struct libdivide_s16_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_u32_do_vec256(
    __m256i numers, const struct libdivide_u32_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_s32_do_vec256(
    __m256i numers, const struct libdivide_s32_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_u64_do_vec256(
    __m256i numers, const struct libdivide_u64_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_s64_do_vec256(
    __m256i numers, const struct libdivide_s64_t *denom);

static LIBDIVIDE_INLINE __m256i libdivide_u16_branchfree_do_vec256(
    __m256i numers, const struct libdivide_u16_branchfree_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_s16_branchfree_do_vec256(
    __m256i numers, const struct libdivide_s16_branchfree_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_u32_branchfree_do_vec256(
    __m256i numers, const struct libdivide_u32_branchfree_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_s32_branchfree_do_vec256(
    __m256i numers, const struct libdivide_s32_branchfree_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_u64_branchfree_do_vec256(
    __m256i numers, const struct libdivide_u64_branchfree_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_s64_branchfree_do_vec256(
    __m256i numers, const struct libdivide_s64_branchfree_t *denom);
2400 :
2401 : //////// Internal Utility Functions
2402 :
2403 : // Implementation of _mm256_srai_epi64(v, 63) (from AVX512).
2404 : static LIBDIVIDE_INLINE __m256i libdivide_s64_signbits_vec256(__m256i v) {
2405 : __m256i hiBitsDuped = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
2406 : __m256i signBits = _mm256_srai_epi32(hiBitsDuped, 31);
2407 : return signBits;
2408 : }
2409 :
2410 : // Implementation of _mm256_srai_epi64 (from AVX512).
2411 : static LIBDIVIDE_INLINE __m256i libdivide_s64_shift_right_vec256(__m256i v, int amt) {
2412 : const int b = 64 - amt;
2413 : __m256i m = _mm256_set1_epi64x((uint64_t)1 << (b - 1));
2414 : __m256i x = _mm256_srli_epi64(v, amt);
2415 : __m256i result = _mm256_sub_epi64(_mm256_xor_si256(x, m), m);
2416 : return result;
2417 : }
2418 :
2419 : // Here, b is assumed to contain one 32-bit value repeated.
2420 : static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u32_vec256(__m256i a, __m256i b) {
2421 : __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epu32(a, b), 32);
2422 : __m256i a1X3X = _mm256_srli_epi64(a, 32);
2423 : __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
2424 : __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epu32(a1X3X, b), mask);
2425 : return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3);
2426 : }
2427 :
// b is one 32-bit value repeated.
// Returns the high 32 bits of each signed 32x32->64 product; same lane
// scheme as the unsigned variant, using the signed mul_epi32.
static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s32_vec256(__m256i a, __m256i b) {
    // Even lanes: product high halves end up in the even 32-bit positions.
    __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epi32(a, b), 32);
    __m256i a1X3X = _mm256_srli_epi64(a, 32);
    // Mask keeps only the odd 32-bit lanes (the high halves of odd products).
    __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
    __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epi32(a1X3X, b), mask);
    return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3);
}
2436 :
// Here, y is assumed to contain one 64-bit value repeated.
// Returns the high 64 bits of each unsigned 64x64->128 product, built from
// four 32x32->64 partial products.
static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y) {
    // see m128i variant for comments.
    __m256i x0y0 = _mm256_mul_epu32(x, y);
    __m256i x0y0_hi = _mm256_srli_epi64(x0y0, 32);

    // Bring the odd 32-bit lanes (x1, y1) down into the even positions.
    __m256i x1 = _mm256_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1));
    __m256i y1 = _mm256_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1));

    __m256i x0y1 = _mm256_mul_epu32(x, y1);
    __m256i x1y0 = _mm256_mul_epu32(x1, y);
    __m256i x1y1 = _mm256_mul_epu32(x1, y1);

    // Sum the cross terms with carry propagation through a 32-bit split.
    __m256i mask = _mm256_set1_epi64x(0xFFFFFFFF);
    __m256i temp = _mm256_add_epi64(x1y0, x0y0_hi);
    __m256i temp_lo = _mm256_and_si256(temp, mask);
    __m256i temp_hi = _mm256_srli_epi64(temp, 32);

    temp_lo = _mm256_srli_epi64(_mm256_add_epi64(temp_lo, x0y1), 32);
    temp_hi = _mm256_add_epi64(x1y1, temp_hi);
    return _mm256_add_epi64(temp_lo, temp_hi);
}
2459 :
2460 : // y is one 64-bit value repeated.
2461 : static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y) {
2462 : __m256i p = libdivide_mullhi_u64_vec256(x, y);
2463 : __m256i t1 = _mm256_and_si256(libdivide_s64_signbits_vec256(x), y);
2464 : __m256i t2 = _mm256_and_si256(libdivide_s64_signbits_vec256(y), x);
2465 : p = _mm256_sub_epi64(p, t1);
2466 : p = _mm256_sub_epi64(p, t2);
2467 : return p;
2468 : }
2469 :
2470 : ////////// UINT16
2471 :
// Divides 16 packed uint16s by *denom using its precomputed magic/shift.
// magic == 0 means the divisor is handled by a plain shift.
__m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        return _mm256_srli_epi16(numers, more);
    } else {
        __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // t = ((numer - q) >> 1) + q, then t >> shift
            __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);
            return _mm256_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
        } else {
            return _mm256_srli_epi16(q, more);
        }
    }
}
2486 :
2487 : __m256i libdivide_u16_branchfree_do_vec256(
2488 : __m256i numers, const struct libdivide_u16_branchfree_t *denom) {
2489 : __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
2490 : __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);
2491 : return _mm256_srli_epi16(t, denom->more);
2492 : }
2493 :
2494 : ////////// UINT32
2495 :
// Divides 8 packed uint32s by *denom using its precomputed magic/shift.
// magic == 0 means the divisor is handled by a plain shift.
__m256i libdivide_u32_do_vec256(__m256i numers, const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        return _mm256_srli_epi32(numers, more);
    } else {
        __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // uint32_t t = ((numer - q) >> 1) + q;
            // return t >> denom->shift;
            uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
            __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q);
            return _mm256_srli_epi32(t, shift);
        } else {
            return _mm256_srli_epi32(q, more);
        }
    }
}
2513 :
2514 : __m256i libdivide_u32_branchfree_do_vec256(
2515 : __m256i numers, const struct libdivide_u32_branchfree_t *denom) {
2516 : __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic));
2517 : __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q);
2518 : return _mm256_srli_epi32(t, denom->more);
2519 : }
2520 :
2521 : ////////// UINT64
2522 :
// Divides 4 packed uint64s by *denom using its precomputed magic/shift.
// magic == 0 means the divisor is handled by a plain shift.
__m256i libdivide_u64_do_vec256(__m256i numers, const struct libdivide_u64_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        return _mm256_srli_epi64(numers, more);
    } else {
        __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // uint32_t t = ((numer - q) >> 1) + q;
            // return t >> denom->shift;
            uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
            __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q);
            return _mm256_srli_epi64(t, shift);
        } else {
            return _mm256_srli_epi64(q, more);
        }
    }
}
2540 :
2541 : __m256i libdivide_u64_branchfree_do_vec256(
2542 : __m256i numers, const struct libdivide_u64_branchfree_t *denom) {
2543 : __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic));
2544 : __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q);
2545 : return _mm256_srli_epi64(t, denom->more);
2546 : }
2547 :
2548 : ////////// SINT16
2549 :
// Divides 16 packed int16s by *denom. magic == 0 selects the power-of-2
// shift path with round-toward-zero correction; otherwise multiply by magic.
__m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
        uint16_t mask = ((uint16_t)1 << shift) - 1;
        __m256i roundToZeroTweak = _mm256_set1_epi16(mask);
        // q = numer + ((numer >> 15) & roundToZeroTweak);
        __m256i q = _mm256_add_epi16(
            numers, _mm256_and_si256(_mm256_srai_epi16(numers, 15), roundToZeroTweak));
        q = _mm256_srai_epi16(q, shift);
        // Broadcast the negative-divisor flag (top bit of `more`).
        __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
        // q = (q ^ sign) - sign;
        q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign);
        return q;
    } else {
        __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift
            __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            q = _mm256_add_epi16(q, _mm256_sub_epi16(_mm256_xor_si256(numers, sign), sign));
        }
        // q >>= shift
        q = _mm256_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
        q = _mm256_add_epi16(q, _mm256_srli_epi16(q, 15)); // q += (q < 0)
        return q;
    }
}
2578 :
// Branchfree division of 16 packed int16s: one straight-line sequence for
// both power-of-2 and general divisors.
__m256i libdivide_s16_branchfree_do_vec256(
    __m256i numers, const struct libdivide_s16_branchfree_t *denom) {
    int16_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    // must be arithmetic shift
    // Broadcast the negative-divisor flag (top bit of `more`).
    __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
    __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(magic));
    q = _mm256_add_epi16(q, numers); // q += numers

    // If q is non-negative, we have nothing to do
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2
    uint16_t is_power_of_2 = (magic == 0);
    __m256i q_sign = _mm256_srai_epi16(q, 15); // q_sign = q >> 15
    __m256i mask = _mm256_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
    q = _mm256_add_epi16(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask)
    q = _mm256_srai_epi16(q, shift); // q >>= shift
    q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign
    return q;
}
2600 :
2601 : ////////// SINT32
2602 :
// Divides 8 packed int32s by *denom. magic == 0 selects the power-of-2
// shift path with round-toward-zero correction; otherwise multiply by magic.
__m256i libdivide_s32_do_vec256(__m256i numers, const struct libdivide_s32_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
        uint32_t mask = ((uint32_t)1 << shift) - 1;
        __m256i roundToZeroTweak = _mm256_set1_epi32(mask);
        // q = numer + ((numer >> 31) & roundToZeroTweak);
        __m256i q = _mm256_add_epi32(
            numers, _mm256_and_si256(_mm256_srai_epi32(numers, 31), roundToZeroTweak));
        q = _mm256_srai_epi32(q, shift);
        // Broadcast the negative-divisor flag (top bit of `more`).
        __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
        // q = (q ^ sign) - sign;
        q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign);
        return q;
    } else {
        __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift
            __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            q = _mm256_add_epi32(q, _mm256_sub_epi32(_mm256_xor_si256(numers, sign), sign));
        }
        // q >>= shift
        q = _mm256_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
        q = _mm256_add_epi32(q, _mm256_srli_epi32(q, 31)); // q += (q < 0)
        return q;
    }
}
2631 :
// Branchfree division of 8 packed int32s: one straight-line sequence for
// both power-of-2 and general divisors.
__m256i libdivide_s32_branchfree_do_vec256(
    __m256i numers, const struct libdivide_s32_branchfree_t *denom) {
    int32_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    // must be arithmetic shift
    // Broadcast the negative-divisor flag (top bit of `more`).
    __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
    __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(magic));
    q = _mm256_add_epi32(q, numers); // q += numers

    // If q is non-negative, we have nothing to do
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2
    uint32_t is_power_of_2 = (magic == 0);
    __m256i q_sign = _mm256_srai_epi32(q, 31); // q_sign = q >> 31
    __m256i mask = _mm256_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);
    q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask)
    q = _mm256_srai_epi32(q, shift); // q >>= shift
    q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign
    return q;
}
2653 :
2654 : ////////// SINT64
2655 :
// Divides 4 packed int64s by *denom. magic == 0 selects the power-of-2
// shift path with round-toward-zero correction; otherwise multiply by magic.
__m256i libdivide_s64_do_vec256(__m256i numers, const struct libdivide_s64_t *denom) {
    uint8_t more = denom->more;
    int64_t magic = denom->magic;
    if (magic == 0) {  // shift path
        uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
        uint64_t mask = ((uint64_t)1 << shift) - 1;
        __m256i roundToZeroTweak = _mm256_set1_epi64x(mask);
        // q = numer + ((numer >> 63) & roundToZeroTweak);
        __m256i q = _mm256_add_epi64(
            numers, _mm256_and_si256(libdivide_s64_signbits_vec256(numers), roundToZeroTweak));
        q = libdivide_s64_shift_right_vec256(q, shift);
        // set1_epi32 of -1/0 also fills each 64-bit lane with all-ones/all-zeros.
        __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
        // q = (q ^ sign) - sign;
        q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign);
        return q;
    } else {
        __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift
            __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            q = _mm256_add_epi64(q, _mm256_sub_epi64(_mm256_xor_si256(numers, sign), sign));
        }
        // q >>= denom->mult_path.shift
        q = libdivide_s64_shift_right_vec256(q, more & LIBDIVIDE_64_SHIFT_MASK);
        q = _mm256_add_epi64(q, _mm256_srli_epi64(q, 63)); // q += (q < 0)
        return q;
    }
}
2685 :
// Branchfree division of 4 packed int64s: one straight-line sequence for
// both power-of-2 and general divisors.
__m256i libdivide_s64_branchfree_do_vec256(
    __m256i numers, const struct libdivide_s64_branchfree_t *denom) {
    int64_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    // must be arithmetic shift
    // set1_epi32 of -1/0 also fills each 64-bit lane with all-ones/all-zeros.
    __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);

    // libdivide_mullhi_s64(numers, magic);
    __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic));
    q = _mm256_add_epi64(q, numers); // q += numers

    // If q is non-negative, we have nothing to do.
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2.
    uint32_t is_power_of_2 = (magic == 0);
    __m256i q_sign = libdivide_s64_signbits_vec256(q); // q_sign = q >> 63
    __m256i mask = _mm256_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2);
    q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask)
    q = libdivide_s64_shift_right_vec256(q, shift); // q >>= shift
    q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign
    return q;
}
2709 :
2710 : #endif
2711 :
2712 : #if defined(LIBDIVIDE_SSE2)
2713 :
// Forward declarations of the SSE2 (128-bit) division routines: one per
// element width/signedness, in branching ("do") and branchfree variants.
static LIBDIVIDE_INLINE __m128i libdivide_u16_do_vec128(
    __m128i numers, const struct libdivide_u16_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_s16_do_vec128(
    __m128i numers, const struct libdivide_s16_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_u32_do_vec128(
    __m128i numers, const struct libdivide_u32_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_s32_do_vec128(
    __m128i numers, const struct libdivide_s32_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_u64_do_vec128(
    __m128i numers, const struct libdivide_u64_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_s64_do_vec128(
    __m128i numers, const struct libdivide_s64_t *denom);

static LIBDIVIDE_INLINE __m128i libdivide_u16_branchfree_do_vec128(
    __m128i numers, const struct libdivide_u16_branchfree_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_s16_branchfree_do_vec128(
    __m128i numers, const struct libdivide_s16_branchfree_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_u32_branchfree_do_vec128(
    __m128i numers, const struct libdivide_u32_branchfree_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_s32_branchfree_do_vec128(
    __m128i numers, const struct libdivide_s32_branchfree_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_u64_branchfree_do_vec128(
    __m128i numers, const struct libdivide_u64_branchfree_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_s64_branchfree_do_vec128(
    __m128i numers, const struct libdivide_s64_branchfree_t *denom);
2739 :
2740 : //////// Internal Utility Functions
2741 :
2742 : // Implementation of _mm_srai_epi64(v, 63) (from AVX512).
2743 : static LIBDIVIDE_INLINE __m128i libdivide_s64_signbits_vec128(__m128i v) {
2744 : __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
2745 : __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31);
2746 : return signBits;
2747 : }
2748 :
2749 : // Implementation of _mm_srai_epi64 (from AVX512).
2750 : static LIBDIVIDE_INLINE __m128i libdivide_s64_shift_right_vec128(__m128i v, int amt) {
2751 : const int b = 64 - amt;
2752 : __m128i m = _mm_set1_epi64x((uint64_t)1 << (b - 1));
2753 : __m128i x = _mm_srli_epi64(v, amt);
2754 : __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m);
2755 : return result;
2756 : }
2757 :
// Here, b is assumed to contain one 32-bit value repeated.
// Returns the high 32 bits of each unsigned 32x32->64 product.
static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u32_vec128(__m128i a, __m128i b) {
    // mul_epu32 multiplies the even (0,2) 32-bit lanes; shifting the 64-bit
    // products right by 32 leaves their high halves in the even lanes.
    __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32);
    __m128i a1X3X = _mm_srli_epi64(a, 32);
    // Mask keeps only the odd 32-bit lanes (the high halves of odd products).
    __m128i mask = _mm_set_epi32(-1, 0, -1, 0);
    __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), mask);
    return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3);
}
2766 :
// SSE2 does not have a signed multiplication instruction, but we can convert
// unsigned to signed pretty efficiently. Again, b is just a 32 bit value
// repeated four times.
static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s32_vec128(__m128i a, __m128i b) {
    __m128i p = libdivide_mullhi_u32_vec128(a, b);
    // t1 = (a >> 31) & y, arithmetic shift
    // Where a (resp. b) is negative, subtract the other operand from the
    // unsigned high half to get the signed high half.
    __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b);
    __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a);
    p = _mm_sub_epi32(p, t1);
    p = _mm_sub_epi32(p, t2);
    return p;
}
2779 :
// Here, y is assumed to contain one 64-bit value repeated.
// Returns the high 64 bits of each unsigned 64x64->128 product.
static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y) {
    // full 128 bits product is:
    // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64)
    // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64.

    // Compute x0*y0.
    // Note x1, y1 are ignored by mul_epu32.
    __m128i x0y0 = _mm_mul_epu32(x, y);
    __m128i x0y0_hi = _mm_srli_epi64(x0y0, 32);

    // Get x1, y1 in the low bits.
    // We could shuffle or right shift. Shuffles are preferred as they preserve
    // the source register for the next computation.
    __m128i x1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1));
    __m128i y1 = _mm_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1));

    // No need to mask off top 32 bits for mul_epu32.
    __m128i x0y1 = _mm_mul_epu32(x, y1);
    __m128i x1y0 = _mm_mul_epu32(x1, y);
    __m128i x1y1 = _mm_mul_epu32(x1, y1);

    // Mask here selects low bits only.
    // Sum the cross terms, splitting at 32 bits so carries propagate correctly.
    __m128i mask = _mm_set1_epi64x(0xFFFFFFFF);
    __m128i temp = _mm_add_epi64(x1y0, x0y0_hi);
    __m128i temp_lo = _mm_and_si128(temp, mask);
    __m128i temp_hi = _mm_srli_epi64(temp, 32);

    temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32);
    temp_hi = _mm_add_epi64(x1y1, temp_hi);
    return _mm_add_epi64(temp_lo, temp_hi);
}
2812 :
2813 : // y is one 64-bit value repeated.
2814 : static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y) {
2815 : __m128i p = libdivide_mullhi_u64_vec128(x, y);
2816 : __m128i t1 = _mm_and_si128(libdivide_s64_signbits_vec128(x), y);
2817 : __m128i t2 = _mm_and_si128(libdivide_s64_signbits_vec128(y), x);
2818 : p = _mm_sub_epi64(p, t1);
2819 : p = _mm_sub_epi64(p, t2);
2820 : return p;
2821 : }
2822 :
////////// UINT16
2824 :
// Divides 8 packed uint16s by *denom using its precomputed magic/shift.
// magic == 0 means the divisor is handled by a plain shift.
__m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        return _mm_srli_epi16(numers, more);
    } else {
        __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // t = ((numer - q) >> 1) + q, then t >> shift
            __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
            return _mm_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
        } else {
            return _mm_srli_epi16(q, more);
        }
    }
}
2839 :
2840 : __m128i libdivide_u16_branchfree_do_vec128(
2841 : __m128i numers, const struct libdivide_u16_branchfree_t *denom) {
2842 : __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
2843 : __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
2844 : return _mm_srli_epi16(t, denom->more);
2845 : }
2846 :
2847 : ////////// UINT32
2848 :
// Divides 4 packed uint32s by *denom using its precomputed magic/shift.
// magic == 0 means the divisor is handled by a plain shift.
__m128i libdivide_u32_do_vec128(__m128i numers, const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        return _mm_srli_epi32(numers, more);
    } else {
        __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // uint32_t t = ((numer - q) >> 1) + q;
            // return t >> denom->shift;
            uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
            __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
            return _mm_srli_epi32(t, shift);
        } else {
            return _mm_srli_epi32(q, more);
        }
    }
}
2866 :
2867 : __m128i libdivide_u32_branchfree_do_vec128(
2868 : __m128i numers, const struct libdivide_u32_branchfree_t *denom) {
2869 : __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic));
2870 : __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
2871 : return _mm_srli_epi32(t, denom->more);
2872 : }
2873 :
2874 : ////////// UINT64
2875 :
// Divides 2 packed uint64s by *denom using its precomputed magic/shift.
// magic == 0 means the divisor is handled by a plain shift.
__m128i libdivide_u64_do_vec128(__m128i numers, const struct libdivide_u64_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        return _mm_srli_epi64(numers, more);
    } else {
        __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // uint32_t t = ((numer - q) >> 1) + q;
            // return t >> denom->shift;
            uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
            __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
            return _mm_srli_epi64(t, shift);
        } else {
            return _mm_srli_epi64(q, more);
        }
    }
}
2893 :
2894 : __m128i libdivide_u64_branchfree_do_vec128(
2895 : __m128i numers, const struct libdivide_u64_branchfree_t *denom) {
2896 : __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic));
2897 : __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
2898 : return _mm_srli_epi64(t, denom->more);
2899 : }
2900 :
2901 : ////////// SINT16
2902 :
// Divides 8 packed int16s by *denom. magic == 0 selects the power-of-2
// shift path with round-toward-zero correction; otherwise multiply by magic.
__m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
        uint16_t mask = ((uint16_t)1 << shift) - 1;
        __m128i roundToZeroTweak = _mm_set1_epi16(mask);
        // q = numer + ((numer >> 15) & roundToZeroTweak);
        __m128i q =
            _mm_add_epi16(numers, _mm_and_si128(_mm_srai_epi16(numers, 15), roundToZeroTweak));
        q = _mm_srai_epi16(q, shift);
        // Broadcast the negative-divisor flag (top bit of `more`).
        __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
        // q = (q ^ sign) - sign;
        q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign);
        return q;
    } else {
        __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift
            __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            q = _mm_add_epi16(q, _mm_sub_epi16(_mm_xor_si128(numers, sign), sign));
        }
        // q >>= shift
        q = _mm_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
        q = _mm_add_epi16(q, _mm_srli_epi16(q, 15)); // q += (q < 0)
        return q;
    }
}
2931 :
// Branchfree division of 8 packed int16s: one straight-line sequence for
// both power-of-2 and general divisors.
__m128i libdivide_s16_branchfree_do_vec128(
    __m128i numers, const struct libdivide_s16_branchfree_t *denom) {
    int16_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    // must be arithmetic shift
    // Broadcast the negative-divisor flag (top bit of `more`).
    __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
    __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(magic));
    q = _mm_add_epi16(q, numers); // q += numers

    // If q is non-negative, we have nothing to do
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2
    uint16_t is_power_of_2 = (magic == 0);
    __m128i q_sign = _mm_srai_epi16(q, 15); // q_sign = q >> 15
    __m128i mask = _mm_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
    q = _mm_add_epi16(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask)
    q = _mm_srai_epi16(q, shift); // q >>= shift
    q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign
    return q;
}
2953 :
2954 : ////////// SINT32
2955 :
// Divides 4 packed int32s by *denom. magic == 0 selects the power-of-2
// shift path with round-toward-zero correction; otherwise multiply by magic.
__m128i libdivide_s32_do_vec128(__m128i numers, const struct libdivide_s32_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
        uint32_t mask = ((uint32_t)1 << shift) - 1;
        __m128i roundToZeroTweak = _mm_set1_epi32(mask);
        // q = numer + ((numer >> 31) & roundToZeroTweak);
        __m128i q =
            _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak));
        q = _mm_srai_epi32(q, shift);
        // Broadcast the negative-divisor flag (top bit of `more`).
        __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
        // q = (q ^ sign) - sign;
        q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign);
        return q;
    } else {
        __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift
            __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign));
        }
        // q >>= shift
        q = _mm_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
        q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0)
        return q;
    }
}
2984 :
// Branchfree division of 4 packed int32s: one straight-line sequence for
// both power-of-2 and general divisors.
__m128i libdivide_s32_branchfree_do_vec128(
    __m128i numers, const struct libdivide_s32_branchfree_t *denom) {
    int32_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    // must be arithmetic shift
    // Broadcast the negative-divisor flag (top bit of `more`).
    __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
    __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(magic));
    q = _mm_add_epi32(q, numers); // q += numers

    // If q is non-negative, we have nothing to do
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2
    uint32_t is_power_of_2 = (magic == 0);
    __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31
    __m128i mask = _mm_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);
    q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask)
    q = _mm_srai_epi32(q, shift); // q >>= shift
    q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign
    return q;
}
3006 :
3007 : ////////// SINT64
3008 :
// Divides 2 packed int64s by *denom. magic == 0 selects the power-of-2
// shift path with round-toward-zero correction; otherwise multiply by magic.
__m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *denom) {
    uint8_t more = denom->more;
    int64_t magic = denom->magic;
    if (magic == 0) {  // shift path
        uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
        uint64_t mask = ((uint64_t)1 << shift) - 1;
        __m128i roundToZeroTweak = _mm_set1_epi64x(mask);
        // q = numer + ((numer >> 63) & roundToZeroTweak);
        __m128i q = _mm_add_epi64(
            numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak));
        q = libdivide_s64_shift_right_vec128(q, shift);
        // set1_epi32 of -1/0 also fills each 64-bit lane with all-ones/all-zeros.
        __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
        // q = (q ^ sign) - sign;
        q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign);
        return q;
    } else {
        __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift
            __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign));
        }
        // q >>= denom->mult_path.shift
        q = libdivide_s64_shift_right_vec128(q, more & LIBDIVIDE_64_SHIFT_MASK);
        q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0)
        return q;
    }
}
3038 :
3039 : __m128i libdivide_s64_branchfree_do_vec128(
3040 : __m128i numers, const struct libdivide_s64_branchfree_t *denom) {
3041 : int64_t magic = denom->magic;
3042 : uint8_t more = denom->more;
3043 : uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
3044 : // must be arithmetic shift
3045 : __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
3046 :
3047 : // libdivide_mullhi_s64(numers, magic);
3048 : __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic));
3049 : q = _mm_add_epi64(q, numers); // q += numers
3050 :
3051 : // If q is non-negative, we have nothing to do.
3052 : // If q is negative, we want to add either (2**shift)-1 if d is
3053 : // a power of 2, or (2**shift) if it is not a power of 2.
3054 : uint32_t is_power_of_2 = (magic == 0);
3055 : __m128i q_sign = libdivide_s64_signbits_vec128(q); // q_sign = q >> 63
3056 : __m128i mask = _mm_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2);
3057 : q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask)
3058 : q = libdivide_s64_shift_right_vec128(q, shift); // q >>= shift
3059 : q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign
3060 : return q;
3061 : }
3062 :
3063 : #endif
3064 :
3065 : ////////// C++ stuff
3066 :
3067 : #ifdef __cplusplus
3068 :
// Selects the code-generation strategy used by 'class divider' below.
// BRANCHFULL is faster on average; BRANCHFREE trades a small constant cost
// for data-independent control flow.
enum Branching {
    BRANCHFULL,  // use branching algorithms
    BRANCHFREE   // use branchfree algorithms
};
3073 :
namespace detail {
// Tag used together with the integer width to select a dispatcher (and, on
// NEON, a vector type) via partial template specialization.
enum Signedness {
    SIGNED,
    UNSIGNED,
};

#if defined(LIBDIVIDE_NEON)
// Helper to deduce NEON vector type for integral type.
// Primary template is empty: only the 16/32/64-bit specializations below are
// valid, so unsupported widths fail at compile time.
template <int _WIDTH, Signedness _SIGN>
struct NeonVec {};

template <>
struct NeonVec<16, UNSIGNED> {
    typedef uint16x8_t type;
};

template <>
struct NeonVec<16, SIGNED> {
    typedef int16x8_t type;
};

template <>
struct NeonVec<32, UNSIGNED> {
    typedef uint32x4_t type;
};

template <>
struct NeonVec<32, SIGNED> {
    typedef int32x4_t type;
};

template <>
struct NeonVec<64, UNSIGNED> {
    typedef uint64x2_t type;
};

template <>
struct NeonVec<64, SIGNED> {
    typedef int64x2_t type;
};

// Maps an integer type T to the 128-bit NEON vector type holding lanes of T.
template <typename T>
struct NeonVecFor {
    // See 'class divider' for an explanation of these template parameters.
    typedef typename NeonVec<sizeof(T) * 8, (((T)0 >> 0) > (T)(-1) ? SIGNED : UNSIGNED)>::type type;
};

// Generates a divide() overload for the NEON vector type matching INT_TYPE.
#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE)                    \
    LIBDIVIDE_INLINE typename NeonVecFor<INT_TYPE>::type divide( \
        typename NeonVecFor<INT_TYPE>::type n) const {           \
        return libdivide_##ALGO##_do_vec128(n, &denom);          \
    }
#else
#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE)
#endif

// Generates a divide() overload for __m128i (expands to nothing when SSE2
// support is not enabled; likewise for the AVX2/AVX512 macros below).
#if defined(LIBDIVIDE_SSE2)
#define LIBDIVIDE_DIVIDE_SSE2(ALGO)                     \
    LIBDIVIDE_INLINE __m128i divide(__m128i n) const {  \
        return libdivide_##ALGO##_do_vec128(n, &denom); \
    }
#else
#define LIBDIVIDE_DIVIDE_SSE2(ALGO)
#endif

#if defined(LIBDIVIDE_AVX2)
#define LIBDIVIDE_DIVIDE_AVX2(ALGO)                     \
    LIBDIVIDE_INLINE __m256i divide(__m256i n) const {  \
        return libdivide_##ALGO##_do_vec256(n, &denom); \
    }
#else
#define LIBDIVIDE_DIVIDE_AVX2(ALGO)
#endif

#if defined(LIBDIVIDE_AVX512)
#define LIBDIVIDE_DIVIDE_AVX512(ALGO)                   \
    LIBDIVIDE_INLINE __m512i divide(__m512i n) const {  \
        return libdivide_##ALGO##_do_vec512(n, &denom); \
    }
#else
#define LIBDIVIDE_DIVIDE_AVX512(ALGO)
#endif

// The DISPATCHER_GEN() macro generates C++ methods (for the given integer
// and algorithm types) that redirect to libdivide's C API.
#define DISPATCHER_GEN(T, ALGO)                                                       \
    libdivide_##ALGO##_t denom;                                                       \
    LIBDIVIDE_INLINE dispatcher() {}                                                  \
    explicit LIBDIVIDE_CONSTEXPR dispatcher(decltype(nullptr)) : denom{} {}           \
    LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {}            \
    LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \
    LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \
    LIBDIVIDE_DIVIDE_NEON(ALGO, T)                                                    \
    LIBDIVIDE_DIVIDE_SSE2(ALGO)                                                       \
    LIBDIVIDE_DIVIDE_AVX2(ALGO)                                                       \
    LIBDIVIDE_DIVIDE_AVX512(ALGO)

// The dispatcher selects a specific division algorithm for a given
// width, signedness, and ALGO using partial template specialization.
// Primary template is empty so unsupported combinations fail to compile.
template <int _WIDTH, Signedness _SIGN, Branching _ALGO>
struct dispatcher {};

template <>
struct dispatcher<16, SIGNED, BRANCHFULL> {
    DISPATCHER_GEN(int16_t, s16)
};
template <>
struct dispatcher<16, SIGNED, BRANCHFREE> {
    DISPATCHER_GEN(int16_t, s16_branchfree)
};
template <>
struct dispatcher<16, UNSIGNED, BRANCHFULL> {
    DISPATCHER_GEN(uint16_t, u16)
};
template <>
struct dispatcher<16, UNSIGNED, BRANCHFREE> {
    DISPATCHER_GEN(uint16_t, u16_branchfree)
};
template <>
struct dispatcher<32, SIGNED, BRANCHFULL> {
    DISPATCHER_GEN(int32_t, s32)
};
template <>
struct dispatcher<32, SIGNED, BRANCHFREE> {
    DISPATCHER_GEN(int32_t, s32_branchfree)
};
template <>
struct dispatcher<32, UNSIGNED, BRANCHFULL> {
    DISPATCHER_GEN(uint32_t, u32)
};
template <>
struct dispatcher<32, UNSIGNED, BRANCHFREE> {
    DISPATCHER_GEN(uint32_t, u32_branchfree)
};
template <>
struct dispatcher<64, SIGNED, BRANCHFULL> {
    DISPATCHER_GEN(int64_t, s64)
};
template <>
struct dispatcher<64, SIGNED, BRANCHFREE> {
    DISPATCHER_GEN(int64_t, s64_branchfree)
};
template <>
struct dispatcher<64, UNSIGNED, BRANCHFULL> {
    DISPATCHER_GEN(uint64_t, u64)
};
template <>
struct dispatcher<64, UNSIGNED, BRANCHFREE> {
    DISPATCHER_GEN(uint64_t, u64_branchfree)
};
}  // namespace detail
3225 :
#if defined(LIBDIVIDE_NEON)
// Allow NeonVecFor outside of detail namespace.
// Public alias: maps an integer type T to the 128-bit NEON vector type with
// lanes of T (e.g. uint32_t -> uint32x4_t).
template <typename T>
struct NeonVecFor {
    typedef typename detail::NeonVecFor<T>::type type;
};
#endif
3233 :
3234 : // This is the main divider class for use by the user (C++ API).
3235 : // The actual division algorithm is selected using the dispatcher struct
3236 : // based on the integer width and algorithm template parameters.
template <typename T, Branching ALGO = BRANCHFULL>
class divider {
   private:
    // Dispatch based on the size and signedness.
    // We avoid using type_traits as it's not available in AVR.
    // Detect signedness by checking if T(-1) is less than T(0).
    // Also throw in a shift by 0, which prevents floating point types from being passed.
    typedef detail::dispatcher<sizeof(T) * 8,
        (((T)0 >> 0) > (T)(-1) ? detail::SIGNED : detail::UNSIGNED), ALGO>
        dispatcher_t;

   public:
    // We leave the default constructor empty so that creating
    // an array of dividers and then initializing them
    // later doesn't slow us down.
    divider() {}

    // constexpr zero-initialization to allow for use w/ static constinit
    explicit LIBDIVIDE_CONSTEXPR divider(decltype(nullptr)) : div(nullptr) {}

    // Constructor that takes the divisor as a parameter
    LIBDIVIDE_INLINE divider(T d) : div(d) {}

    // Divides n by the divisor
    LIBDIVIDE_INLINE T divide(T n) const { return div.divide(n); }

    // Recovers the divisor, returns the value that was
    // used to initialize this divider object.
    T recover() const { return div.recover(); }

    // Two dividers are equal iff their precomputed state (magic, more)
    // matches; dividers built from the same divisor compare equal.
    bool operator==(const divider<T, ALGO> &other) const {
        return div.denom.magic == other.div.denom.magic && div.denom.more == other.div.denom.more;
    }

    bool operator!=(const divider<T, ALGO> &other) const { return !(*this == other); }

    // Vector variants treat the input as packed integer values with the same type as the divider
    // (e.g. s32, u32, s64, u64) and divides each of them by the divider, returning the packed
    // quotients.
#if defined(LIBDIVIDE_SSE2)
    LIBDIVIDE_INLINE __m128i divide(__m128i n) const { return div.divide(n); }
#endif
#if defined(LIBDIVIDE_AVX2)
    LIBDIVIDE_INLINE __m256i divide(__m256i n) const { return div.divide(n); }
#endif
#if defined(LIBDIVIDE_AVX512)
    LIBDIVIDE_INLINE __m512i divide(__m512i n) const { return div.divide(n); }
#endif
#if defined(LIBDIVIDE_NEON)
    LIBDIVIDE_INLINE typename NeonVecFor<T>::type divide(typename NeonVecFor<T>::type n) const {
        return div.divide(n);
    }
#endif

   private:
    // Storage for the actual divisor
    dispatcher_t div;
};
3295 :
3296 : // Overload of operator / for scalar division
3297 : template <typename T, Branching ALGO>
3298 : LIBDIVIDE_INLINE T operator/(T n, const divider<T, ALGO> &div) {
3299 5093324 : return div.divide(n);
3300 : }
3301 :
3302 : // Overload of operator /= for scalar division
3303 : template <typename T, Branching ALGO>
3304 : LIBDIVIDE_INLINE T &operator/=(T &n, const divider<T, ALGO> &div) {
3305 : n = div.divide(n);
3306 : return n;
3307 : }
3308 :
3309 : // Overloads for vector types.
3310 : #if defined(LIBDIVIDE_SSE2)
3311 : template <typename T, Branching ALGO>
3312 : LIBDIVIDE_INLINE __m128i operator/(__m128i n, const divider<T, ALGO> &div) {
3313 : return div.divide(n);
3314 : }
3315 :
3316 : template <typename T, Branching ALGO>
3317 : LIBDIVIDE_INLINE __m128i operator/=(__m128i &n, const divider<T, ALGO> &div) {
3318 11302 : n = div.divide(n);
3319 11302 : return n;
3320 : }
3321 : #endif
3322 : #if defined(LIBDIVIDE_AVX2)
3323 : template <typename T, Branching ALGO>
3324 : LIBDIVIDE_INLINE __m256i operator/(__m256i n, const divider<T, ALGO> &div) {
3325 : return div.divide(n);
3326 : }
3327 :
3328 : template <typename T, Branching ALGO>
3329 : LIBDIVIDE_INLINE __m256i operator/=(__m256i &n, const divider<T, ALGO> &div) {
3330 : n = div.divide(n);
3331 : return n;
3332 : }
3333 : #endif
3334 : #if defined(LIBDIVIDE_AVX512)
3335 : template <typename T, Branching ALGO>
3336 : LIBDIVIDE_INLINE __m512i operator/(__m512i n, const divider<T, ALGO> &div) {
3337 : return div.divide(n);
3338 : }
3339 :
3340 : template <typename T, Branching ALGO>
3341 : LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider<T, ALGO> &div) {
3342 : n = div.divide(n);
3343 : return n;
3344 : }
3345 : #endif
3346 :
3347 : #if defined(LIBDIVIDE_NEON)
3348 : template <typename T, Branching ALGO>
3349 : LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(
3350 : typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) {
3351 : return div.divide(n);
3352 : }
3353 :
3354 : template <typename T, Branching ALGO>
3355 : LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(
3356 : typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) {
3357 : n = div.divide(n);
3358 : return n;
3359 : }
3360 : #endif
3361 :
// Alias templates require C++11 (or MSVC 2015+, which implements them even
// when __cplusplus is not advertised accordingly).
#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900)
// libdivide::branchfree_divider<T>
// Convenience alias for a divider using the branchfree algorithms.
template <typename T>
using branchfree_divider = divider<T, BRANCHFREE>;
#endif
3367 :
3368 : } // namespace libdivide
3369 :
3370 : #endif // __cplusplus
3371 :
3372 : #if defined(_MSC_VER) && !defined(__clang__)
3373 : #pragma warning(pop)
3374 : #endif
3375 :
3376 : #endif // LIBDIVIDE_H
|