// libdivide.h - Optimized integer division
// https://libdivide.com
//
// Copyright (C) 2010 - 2022 ridiculous_fish, <libdivide@ridiculousfish.com>
// Copyright (C) 2016 - 2022 Kim Walisch, <kim.walisch@gmail.com>
//
// libdivide is dual-licensed under the Boost or zlib licenses.
// You may use libdivide under the terms of either of these.
// See LICENSE.txt for more details.

#ifndef LIBDIVIDE_H
#define LIBDIVIDE_H

// *** Version numbers are auto generated - do not edit ***
#define LIBDIVIDE_VERSION "5.2.0"
#define LIBDIVIDE_VERSION_MAJOR 5
#define LIBDIVIDE_VERSION_MINOR 2
#define LIBDIVIDE_VERSION_PATCH 0

#include <stdint.h>

#if !defined(__AVR__) && __STDC_HOSTED__ != 0
#include <stdio.h>
#include <stdlib.h>
#endif

#if defined(_MSC_VER) && (defined(__cplusplus) && (__cplusplus >= 202002L)) || \
    (defined(_MSVC_LANG) && (_MSVC_LANG >= 202002L))
#include <limits.h>
#include <type_traits>
#define LIBDIVIDE_VC_CXX20
#endif

#if defined(LIBDIVIDE_SSE2)
#include <emmintrin.h>
#endif

#if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512)
#include <immintrin.h>
#endif

#if defined(LIBDIVIDE_NEON)
#include <arm_neon.h>
#endif

// Clang-cl prior to Visual Studio 2022 doesn't include __umulh/__mulh intrinsics
#if defined(_MSC_VER) && (!defined(__clang__) || _MSC_VER > 1930) && \
    (defined(_M_X64) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC))
#define LIBDIVIDE_MULH_INTRINSICS
#endif

#if defined(_MSC_VER)
#if defined(LIBDIVIDE_MULH_INTRINSICS) || !defined(__clang__)
#include <intrin.h>
#endif
#ifndef __clang__
#pragma warning(push)
// 4146: unary minus operator applied to unsigned type, result still unsigned
#pragma warning(disable : 4146)

// 4204: nonstandard extension used : non-constant aggregate initializer
#pragma warning(disable : 4204)
#endif
#define LIBDIVIDE_VC
#endif

#if !defined(__has_builtin)
#define __has_builtin(x) 0
#endif

#if defined(__SIZEOF_INT128__)
#define HAS_INT128_T
// clang-cl on Windows does not yet support 128-bit division
#if !(defined(__clang__) && defined(LIBDIVIDE_VC))
#define HAS_INT128_DIV
#endif
#endif

#if defined(__x86_64__) || defined(_M_X64)
#define LIBDIVIDE_X86_64
#endif

#if defined(__i386__)
#define LIBDIVIDE_i386
#endif

#if defined(__GNUC__) || defined(__clang__)
#define LIBDIVIDE_GCC_STYLE_ASM
#endif

#if defined(__cplusplus) || defined(LIBDIVIDE_VC)
#define LIBDIVIDE_FUNCTION __FUNCTION__
#else
#define LIBDIVIDE_FUNCTION __func__
#endif

// Set up forced inlining if possible.
// We need both the attribute and keyword to avoid "might not be inlineable" warnings.
#ifdef __has_attribute
#if __has_attribute(always_inline)
#define LIBDIVIDE_INLINE __attribute__((always_inline)) inline
#endif
#endif
#ifndef LIBDIVIDE_INLINE
#ifdef _MSC_VER
#define LIBDIVIDE_INLINE __forceinline
#else
#define LIBDIVIDE_INLINE inline
#endif
#endif

#if defined(__AVR__) || __STDC_HOSTED__ == 0
#define LIBDIVIDE_ERROR(msg)
#else
#define LIBDIVIDE_ERROR(msg)                                                                      \
    do {                                                                                          \
        fprintf(stderr, "libdivide.h:%d: %s(): Error: %s\n", __LINE__, LIBDIVIDE_FUNCTION, msg); \
        abort();                                                                                  \
    } while (0)
#endif

#if defined(LIBDIVIDE_ASSERTIONS_ON) && !defined(__AVR__) && __STDC_HOSTED__ != 0
#define LIBDIVIDE_ASSERT(x)                                                           \
    do {                                                                              \
        if (!(x)) {                                                                   \
            fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", __LINE__, \
                LIBDIVIDE_FUNCTION, #x);                                              \
            abort();                                                                  \
        }                                                                             \
    } while (0)
#else
#define LIBDIVIDE_ASSERT(x)
#endif

#ifdef __cplusplus

// For constexpr zero initialization, c++11 might handle things ok,
// but just limit to at least c++14 to ensure we don't break anyone's code:

// Use https://en.cppreference.com/w/cpp/feature_test#cpp_constexpr
#if defined(__cpp_constexpr) && (__cpp_constexpr >= 201304L)
#define LIBDIVIDE_CONSTEXPR constexpr LIBDIVIDE_INLINE

// Supposedly, MSVC might not implement feature test macros right:
// https://stackoverflow.com/questions/49316752/feature-test-macros-not-working-properly-in-visual-c
// so check that _MSVC_LANG corresponds to at least c++14, and _MSC_VER corresponds to at least VS
// 2017 15.0 (for extended constexpr support:
// https://learn.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=msvc-170)
#elif (defined(_MSC_VER) && _MSC_VER >= 1910) && (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L)
#define LIBDIVIDE_CONSTEXPR constexpr LIBDIVIDE_INLINE

#else
#define LIBDIVIDE_CONSTEXPR LIBDIVIDE_INLINE
#endif

namespace libdivide {
#endif

#if defined(_MSC_VER) && !defined(__clang__)
#if defined(LIBDIVIDE_VC_CXX20)
static LIBDIVIDE_CONSTEXPR int __builtin_clz(unsigned x) {
    if (std::is_constant_evaluated()) {
        for (int i = 0; i < sizeof(x) * CHAR_BIT; ++i) {
            if (x >> (sizeof(x) * CHAR_BIT - 1 - i)) return i;
        }
        return sizeof(x) * CHAR_BIT;
    }
#else
static LIBDIVIDE_INLINE int __builtin_clz(unsigned x) {
#endif
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
    return (int)_CountLeadingZeros(x);
#elif defined(__AVX2__) || defined(__LZCNT__)
    return (int)_lzcnt_u32(x);
#else
    unsigned long r;
    _BitScanReverse(&r, x);
    return (int)(r ^ 31);
#endif
}

#if defined(LIBDIVIDE_VC_CXX20)
static LIBDIVIDE_CONSTEXPR int __builtin_clzll(unsigned long long x) {
    if (std::is_constant_evaluated()) {
        for (int i = 0; i < sizeof(x) * CHAR_BIT; ++i) {
            if (x >> (sizeof(x) * CHAR_BIT - 1 - i)) return i;
        }
        return sizeof(x) * CHAR_BIT;
    }
#else
static LIBDIVIDE_INLINE int __builtin_clzll(unsigned long long x) {
#endif
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
    return (int)_CountLeadingZeros64(x);
#elif defined(_WIN64)
#if defined(__AVX2__) || defined(__LZCNT__)
    return (int)_lzcnt_u64(x);
#else
    unsigned long r;
    _BitScanReverse64(&r, x);
    return (int)(r ^ 63);
#endif
#else
    int l = __builtin_clz((unsigned)x) + 32;
    int h = __builtin_clz((unsigned)(x >> 32));
    return !!((unsigned)(x >> 32)) ? h : l;
#endif
}
#endif  // defined(_MSC_VER) && !defined(__clang__)

// pack divider structs to prevent compilers from padding.
// This reduces memory usage by up to 43% when using a large
// array of libdivide dividers and improves performance
// by up to 10% because of reduced memory bandwidth.
#pragma pack(push, 1)

struct libdivide_u16_t {
    uint16_t magic;
    uint8_t more;
};

struct libdivide_s16_t {
    int16_t magic;
    uint8_t more;
};

struct libdivide_u32_t {
    uint32_t magic;
    uint8_t more;
};

struct libdivide_s32_t {
    int32_t magic;
    uint8_t more;
};

struct libdivide_u64_t {
    uint64_t magic;
    uint8_t more;
};

struct libdivide_s64_t {
    int64_t magic;
    uint8_t more;
};

struct libdivide_u16_branchfree_t {
    uint16_t magic;
    uint8_t more;
};

struct libdivide_s16_branchfree_t {
    int16_t magic;
    uint8_t more;
};

struct libdivide_u32_branchfree_t {
    uint32_t magic;
    uint8_t more;
};

struct libdivide_s32_branchfree_t {
    int32_t magic;
    uint8_t more;
};

struct libdivide_u64_branchfree_t {
    uint64_t magic;
    uint8_t more;
};

struct libdivide_s64_branchfree_t {
    int64_t magic;
    uint8_t more;
};

#pragma pack(pop)

// Explanation of the "more" field:
//
// * Bits 0-5 are the shift value (for shift path or mult path).
// * Bit 6 is the add indicator for mult path.
// * Bit 7 is set if the divisor is negative. We use bit 7 as the negative
//   divisor indicator so that we can efficiently use sign extension to
//   create a bitmask with all bits set to 1 (if the divisor is negative)
//   or 0 (if the divisor is positive).
//
// u32: [0-4] shift value
//      [5] ignored
//      [6] add indicator
//      magic number of 0 indicates shift path
//
// s32: [0-4] shift value
//      [5] ignored
//      [6] add indicator
//      [7] indicates negative divisor
//      magic number of 0 indicates shift path
//
// u64: [0-5] shift value
//      [6] add indicator
//      magic number of 0 indicates shift path
//
// s64: [0-5] shift value
//      [6] add indicator
//      [7] indicates negative divisor
//      magic number of 0 indicates shift path
//
// In the s32 and s64 branchfull (branching) strategy, the magic number is
// negated according to whether the divisor is negated. In the branchfree
// strategy, it is not negated.
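
// Worked example (illustrative, derived by hand from libdivide_internal_u32_gen
// below; re-check against the code before relying on it): for d = 7,
// floor_log_2_d is 2 and the "add" path is required, giving magic = 0x24924925
// and more = 0x42 (shift 2 with LIBDIVIDE_ADD_MARKER set). The do path then
// computes q = mullhi(n, magic); t = ((n - q) >> 1) + q; t >> 2, which equals
// floor(n * (2^32 + magic) / 2^35) == n / 7 for all 32-bit n.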

enum {
    LIBDIVIDE_16_SHIFT_MASK = 0x1F,
    LIBDIVIDE_32_SHIFT_MASK = 0x1F,
    LIBDIVIDE_64_SHIFT_MASK = 0x3F,
    LIBDIVIDE_ADD_MARKER = 0x40,
    LIBDIVIDE_NEGATIVE_DIVISOR = 0x80
};

static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d);
static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d);
static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d);
static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d);
static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d);
static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d);

static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d);
static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d);
static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d);
static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d);
static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d);
static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d);

static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(
    int16_t numer, int16_t magic, uint8_t more);
static LIBDIVIDE_INLINE int16_t libdivide_s16_do(
    int16_t numer, const struct libdivide_s16_t *denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(
    uint16_t numer, uint16_t magic, uint8_t more);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(
    uint16_t numer, const struct libdivide_u16_t *denom);
static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(
    int32_t numer, int32_t magic, uint8_t more);
static LIBDIVIDE_INLINE int32_t libdivide_s32_do(
    int32_t numer, const struct libdivide_s32_t *denom);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(
    uint32_t numer, uint32_t magic, uint8_t more);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(
    uint32_t numer, const struct libdivide_u32_t *denom);
static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(
    int64_t numer, int64_t magic, uint8_t more);
static LIBDIVIDE_INLINE int64_t libdivide_s64_do(
    int64_t numer, const struct libdivide_s64_t *denom);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(
    uint64_t numer, uint64_t magic, uint8_t more);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(
    uint64_t numer, const struct libdivide_u64_t *denom);

static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(
    int16_t numer, const struct libdivide_s16_branchfree_t *denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(
    uint16_t numer, const struct libdivide_u16_branchfree_t *denom);
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(
    int32_t numer, const struct libdivide_s32_branchfree_t *denom);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do(
    uint32_t numer, const struct libdivide_u32_branchfree_t *denom);
static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(
    int64_t numer, const struct libdivide_s64_branchfree_t *denom);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do(
    uint64_t numer, const struct libdivide_u64_branchfree_t *denom);

static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom);
static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom);
static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom);

static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(
    const struct libdivide_s16_branchfree_t *denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(
    const struct libdivide_u16_branchfree_t *denom);
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(
    const struct libdivide_s32_branchfree_t *denom);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(
    const struct libdivide_u32_branchfree_t *denom);
static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover(
    const struct libdivide_s64_branchfree_t *denom);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover(
    const struct libdivide_u64_branchfree_t *denom);

//////// Internal Utility Functions

static LIBDIVIDE_INLINE uint16_t libdivide_mullhi_u16(uint16_t x, uint16_t y) {
    uint32_t xl = x, yl = y;
    uint32_t rl = xl * yl;
    return (uint16_t)(rl >> 16);
}

static LIBDIVIDE_INLINE int16_t libdivide_mullhi_s16(int16_t x, int16_t y) {
    int32_t xl = x, yl = y;
    int32_t rl = xl * yl;
    // needs to be arithmetic shift
    return (int16_t)(rl >> 16);
}

static LIBDIVIDE_INLINE uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) {
    uint64_t xl = x, yl = y;
    uint64_t rl = xl * yl;
    return (uint32_t)(rl >> 32);
}

static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) {
    int64_t xl = x, yl = y;
    int64_t rl = xl * yl;
    // needs to be arithmetic shift
    return (int32_t)(rl >> 32);
}

static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
#if defined(LIBDIVIDE_MULH_INTRINSICS)
    return __umulh(x, y);
#elif defined(HAS_INT128_T)
    __uint128_t xl = x, yl = y;
    __uint128_t rl = xl * yl;
    return (uint64_t)(rl >> 64);
#else
    // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
    uint32_t mask = 0xFFFFFFFF;
    uint32_t x0 = (uint32_t)(x & mask);
    uint32_t x1 = (uint32_t)(x >> 32);
    uint32_t y0 = (uint32_t)(y & mask);
    uint32_t y1 = (uint32_t)(y >> 32);
    uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0);
    uint64_t x0y1 = x0 * (uint64_t)y1;
    uint64_t x1y0 = x1 * (uint64_t)y0;
    uint64_t x1y1 = x1 * (uint64_t)y1;
    uint64_t temp = x1y0 + x0y0_hi;
    uint64_t temp_lo = temp & mask;
    uint64_t temp_hi = temp >> 32;

    return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32);
#endif
}
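
// Sanity check (illustrative): libdivide_mullhi_u64(1ull << 32, 1ull << 32) == 1,
// since the full 128-bit product is 2^64 and only its upper 64 bits are returned.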

static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) {
#if defined(LIBDIVIDE_MULH_INTRINSICS)
    return __mulh(x, y);
#elif defined(HAS_INT128_T)
    __int128_t xl = x, yl = y;
    __int128_t rl = xl * yl;
    return (int64_t)(rl >> 64);
#else
    // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
    uint32_t mask = 0xFFFFFFFF;
    uint32_t x0 = (uint32_t)(x & mask);
    uint32_t y0 = (uint32_t)(y & mask);
    int32_t x1 = (int32_t)(x >> 32);
    int32_t y1 = (int32_t)(y >> 32);
    uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0);
    int64_t t = x1 * (int64_t)y0 + x0y0_hi;
    int64_t w1 = x0 * (int64_t)y1 + (t & mask);

    return x1 * (int64_t)y1 + (t >> 32) + (w1 >> 32);
#endif
}

static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) {
#if defined(__AVR__)
    // Fast way to count leading zeros
    // On the AVR 8-bit architecture __builtin_clz() works on an int16_t.
    return __builtin_clz(val);
#elif defined(__GNUC__) || __has_builtin(__builtin_clz) || defined(_MSC_VER)
    // Fast way to count leading zeros
    return (int16_t)(__builtin_clz(val) - 16);
#else
    if (val == 0) return 16;
    int16_t result = 4;
    uint16_t hi = 0xFU << 12;
    while ((val & hi) == 0) {
        hi >>= 4;
        result += 4;
    }
    while (val & hi) {
        result -= 1;
        hi <<= 1;
    }
    return result;
#endif
}

static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) {
#if defined(__AVR__)
    // Fast way to count leading zeros
    return __builtin_clzl(val);
#elif defined(__GNUC__) || __has_builtin(__builtin_clz) || defined(_MSC_VER)
    // Fast way to count leading zeros
    return __builtin_clz(val);
#else
    if (val == 0) return 32;
    int32_t result = 8;
    uint32_t hi = 0xFFU << 24;
    while ((val & hi) == 0) {
        hi >>= 8;
        result += 8;
    }
    while (val & hi) {
        result -= 1;
        hi <<= 1;
    }
    return result;
#endif
}

static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) {
#if defined(__GNUC__) || __has_builtin(__builtin_clzll) || defined(_MSC_VER)
    // Fast way to count leading zeros
    return __builtin_clzll(val);
#else
    uint32_t hi = val >> 32;
    uint32_t lo = val & 0xFFFFFFFF;
    if (hi != 0) return libdivide_count_leading_zeros32(hi);
    return 32 + libdivide_count_leading_zeros32(lo);
#endif
}

// libdivide_32_div_16_to_16: divides a 32-bit uint {u1, u0} by a 16-bit
// uint {v}. The result must fit in 16 bits.
// Returns the quotient directly and the remainder in *r
static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16(
    uint16_t u1, uint16_t u0, uint16_t v, uint16_t *r) {
    uint32_t n = ((uint32_t)u1 << 16) | u0;
    uint16_t result = (uint16_t)(n / v);
    *r = (uint16_t)(n - result * (uint32_t)v);
    return result;
}
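
// Usage sketch (illustrative): dividing the 32-bit value 0x10000 (u1 = 1, u0 = 0)
// by 3 yields quotient 21845 (0x5555) and remainder 1:
//   uint16_t r, q = libdivide_32_div_16_to_16(1, 0, 3, &r);  // q == 21845, r == 1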

// libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit
// uint {v}. The result must fit in 32 bits.
// Returns the quotient directly and the remainder in *r
static LIBDIVIDE_INLINE uint32_t libdivide_64_div_32_to_32(
    uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
#if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && defined(LIBDIVIDE_GCC_STYLE_ASM)
    uint32_t result;
    __asm__("divl %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1));
    return result;
#else
    uint64_t n = ((uint64_t)u1 << 32) | u0;
    uint32_t result = (uint32_t)(n / v);
    *r = (uint32_t)(n - result * (uint64_t)v);
    return result;
#endif
}

// libdivide_128_div_64_to_64: divides a 128-bit uint {numhi, numlo} by a 64-bit uint {den}. The
// result must fit in 64 bits. Returns the quotient directly and the remainder in *r
static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(
    uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
    // N.B. resist the temptation to use __uint128_t here.
    // In LLVM compiler-rt, it performs a 128/128 -> 128 division which is many times slower than
    // necessary. In gcc it's better but still slower than the divlu implementation, perhaps because
    // it's not LIBDIVIDE_INLINEd.
#if defined(LIBDIVIDE_X86_64) && defined(LIBDIVIDE_GCC_STYLE_ASM)
    uint64_t result;
    __asm__("div %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi));
    return result;
#else
    // We work in base 2**32.
    // A uint32 holds a single digit. A uint64 holds two digits.
    // Our numerator is conceptually [num3, num2, num1, num0].
    // Our denominator is [den1, den0].
    const uint64_t b = ((uint64_t)1 << 32);

    // The high and low digits of our computed quotient.
    uint32_t q1;
    uint32_t q0;

    // The normalization shift factor.
    int shift;

    // The high and low digits of our denominator (after normalizing).
    // Also the low 2 digits of our numerator (after normalizing).
    uint32_t den1;
    uint32_t den0;
    uint32_t num1;
    uint32_t num0;

    // A partial remainder.
    uint64_t rem;

    // The estimated quotient, and its corresponding remainder (unrelated to true remainder).
    uint64_t qhat;
    uint64_t rhat;

    // Variables used to correct the estimated quotient.
    uint64_t c1;
    uint64_t c2;

    // Check for overflow and divide by 0.
    if (numhi >= den) {
        if (r) *r = ~0ull;
        return ~0ull;
    }

    // Determine the normalization factor. We multiply den by this, so that its leading digit is at
    // least half b. In binary this means just shifting left by the number of leading zeros, so that
    // there's a 1 in the MSB.
    // We also shift numer by the same amount. This cannot overflow because numhi < den.
    // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting
    // by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is
    // 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it.
    shift = libdivide_count_leading_zeros64(den);
    den <<= shift;
    numhi <<= shift;
    numhi |= (numlo >> (-shift & 63)) & (uint64_t)(-(int64_t)shift >> 63);
    numlo <<= shift;

    // Extract the low digits of the numerator and both digits of the denominator.
    num1 = (uint32_t)(numlo >> 32);
    num0 = (uint32_t)(numlo & 0xFFFFFFFFu);
    den1 = (uint32_t)(den >> 32);
    den0 = (uint32_t)(den & 0xFFFFFFFFu);

    // We wish to compute q1 = [n3 n2 n1] / [d1 d0].
    // Estimate q1 as [n3 n2] / [d1], and then correct it.
    // Note while qhat may be 2 digits, q1 is always 1 digit.
    qhat = numhi / den1;
    rhat = numhi % den1;
    c1 = qhat * den0;
    c2 = rhat * b + num1;
    if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
    q1 = (uint32_t)qhat;

    // Compute the true (partial) remainder.
    rem = numhi * b + num1 - q1 * den;

    // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
    // Estimate q0 as [rem1 rem0] / [d1] and correct it.
    qhat = rem / den1;
    rhat = rem % den1;
    c1 = qhat * den0;
    c2 = rhat * b + num0;
    if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
    q0 = (uint32_t)qhat;

    // Return remainder if requested.
    if (r) *r = (rem * b + num0 - q0 * den) >> shift;
    return ((uint64_t)q1 << 32) | q0;
#endif
}
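
// Usage sketch (illustrative): dividing 2^64 (numhi = 1, numlo = 0) by 3 yields
// quotient 0x5555555555555555 and remainder 1:
//   uint64_t r, q = libdivide_128_div_64_to_64(1, 0, 3, &r);  // r == 1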

#if !(defined(HAS_INT128_T) && defined(HAS_INT128_DIV))

// Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0)
static LIBDIVIDE_INLINE void libdivide_u128_shift(
    uint64_t *u1, uint64_t *u0, int32_t signed_shift) {
    if (signed_shift > 0) {
        uint32_t shift = signed_shift;
        *u1 <<= shift;
        *u1 |= *u0 >> (64 - shift);
        *u0 <<= shift;
    } else if (signed_shift < 0) {
        uint32_t shift = -signed_shift;
        *u0 >>= shift;
        *u0 |= *u1 << (64 - shift);
        *u1 >>= shift;
    }
}

#endif

// Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder.
static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64(
    uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) {
#if defined(HAS_INT128_T) && defined(HAS_INT128_DIV)
    __uint128_t ufull = u_hi;
    __uint128_t vfull = v_hi;
    ufull = (ufull << 64) | u_lo;
    vfull = (vfull << 64) | v_lo;
    uint64_t res = (uint64_t)(ufull / vfull);
    __uint128_t remainder = ufull - (vfull * res);
    *r_lo = (uint64_t)remainder;
    *r_hi = (uint64_t)(remainder >> 64);
    return res;
#else
    // Adapted from "Unsigned Doubleword Division" in Hacker's Delight
    // We want to compute u / v
    typedef struct {
        uint64_t hi;
        uint64_t lo;
    } u128_t;
    u128_t u = {u_hi, u_lo};
    u128_t v = {v_hi, v_lo};

    if (v.hi == 0) {
        // divisor v is a 64 bit value, so we just need one 128/64 division
        // Note that we are simpler than Hacker's Delight here, because we know
        // the quotient fits in 64 bits whereas Hacker's Delight demands a full
        // 128 bit quotient
        *r_hi = 0;
        return libdivide_128_div_64_to_64(u.hi, u.lo, v.lo, r_lo);
    }
    // Here v >= 2**64
    // We know that v.hi != 0, so count leading zeros is OK
    // We have 0 <= n <= 63
    uint32_t n = libdivide_count_leading_zeros64(v.hi);

    // Normalize the divisor so its MSB is 1
    u128_t v1t = v;
    libdivide_u128_shift(&v1t.hi, &v1t.lo, n);
    uint64_t v1 = v1t.hi;  // i.e. v1 = v1t >> 64

    // To ensure no overflow
    u128_t u1 = u;
    libdivide_u128_shift(&u1.hi, &u1.lo, -1);

    // Get quotient from divide unsigned insn.
    uint64_t rem_ignored;
    uint64_t q1 = libdivide_128_div_64_to_64(u1.hi, u1.lo, v1, &rem_ignored);

    // Undo normalization and division of u by 2.
    u128_t q0 = {0, q1};
    libdivide_u128_shift(&q0.hi, &q0.lo, n);
    libdivide_u128_shift(&q0.hi, &q0.lo, -63);

    // Make q0 correct or too small by 1
    // Equivalent to `if (q0 != 0) q0 = q0 - 1;`
    if (q0.hi != 0 || q0.lo != 0) {
        q0.hi -= (q0.lo == 0);  // borrow
        q0.lo -= 1;
    }

    // Now q0 is correct.
    // Compute q0 * v as q0v
    // = (q0.hi << 64 + q0.lo) * (v.hi << 64 + v.lo)
    // = (q0.hi * v.hi << 128) + (q0.hi * v.lo << 64) +
    //   (q0.lo * v.hi << 64) + q0.lo * v.lo)
    // Each term is 128 bit
    // The high half of the full product (the upper 128 bits!) is dropped
    u128_t q0v = {0, 0};
    q0v.hi = q0.hi * v.lo + q0.lo * v.hi + libdivide_mullhi_u64(q0.lo, v.lo);
    q0v.lo = q0.lo * v.lo;

    // Compute u - q0v as u_q0v
    // This is the remainder
    u128_t u_q0v = u;
    u_q0v.hi -= q0v.hi + (u.lo < q0v.lo);  // second term is borrow
    u_q0v.lo -= q0v.lo;

    // Check if u_q0v >= v
    // This checks if our remainder is larger than the divisor
    if ((u_q0v.hi > v.hi) || (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) {
        // Increment q0
        q0.lo += 1;
        q0.hi += (q0.lo == 0);  // carry

        // Subtract v from remainder
        u_q0v.hi -= v.hi + (u_q0v.lo < v.lo);
        u_q0v.lo -= v.lo;
    }

    *r_hi = u_q0v.hi;
    *r_lo = u_q0v.lo;

    LIBDIVIDE_ASSERT(q0.hi == 0);
    return q0.lo;
#endif
}

////////// UINT16

static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
    uint16_t d, int branchfree) {
    if (d == 0) {
        LIBDIVIDE_ERROR("divider must be != 0");
    }

    struct libdivide_u16_t result;
    uint8_t floor_log_2_d = (uint8_t)(15 - libdivide_count_leading_zeros16(d));

    // Power of 2
    if ((d & (d - 1)) == 0) {
        // We need to subtract 1 from the shift value in case of an unsigned
        // branchfree divider because there is a hardcoded right shift by 1
        // in its division algorithm. Because of this we also need to add back
        // 1 in its recovery algorithm.
        result.magic = 0;
        result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
    } else {
        uint8_t more;
        uint16_t rem, proposed_m;
        proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem);

        LIBDIVIDE_ASSERT(rem > 0 && rem < d);
        const uint16_t e = d - rem;

        // This power works if e < 2**floor_log_2_d.
        if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) {
            // This power works
            more = floor_log_2_d;
        } else {
            // We have to use the general 17-bit algorithm. We need to compute
            // (2**power) / d. However, we already have (2**(power-1))/d and
            // its remainder. By doubling both, and then correcting the
            // remainder, we can compute the larger division.
            // don't care about overflow here - in fact, we expect it
            proposed_m += proposed_m;
            const uint16_t twice_rem = rem + rem;
            if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
        }
        result.magic = 1 + proposed_m;
        result.more = more;
        // result.more's shift should in general be ceil_log_2_d. But if we
        // used the smaller power, we subtract one from the shift because we're
        // using the smaller power. If we're using the larger power, we
        // subtract one from the shift because it's taken care of by the add
        // indicator. So floor_log_2_d happens to be correct in both cases.
    }
    return result;
}

static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d) {
    return libdivide_internal_u16_gen(d, 0);
}

static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
    if (d == 1) {
        LIBDIVIDE_ERROR("branchfree divider must be != 1");
    }
    struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1);
    struct libdivide_u16_branchfree_t ret = {
        tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK)};
    return ret;
}

// The original libdivide_u16_do takes a const pointer. However, this cannot be used
// with a compile time constant libdivide_u16_t: it will generate a warning about
// taking the address of a temporary. Hence this overload.
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) {
    if (!magic) {
        return numer >> more;
    } else {
        uint16_t q = libdivide_mullhi_u16(numer, magic);
        if (more & LIBDIVIDE_ADD_MARKER) {
            uint16_t t = ((numer - q) >> 1) + q;
            return t >> (more & LIBDIVIDE_16_SHIFT_MASK);
        } else {
            // All upper bits are 0,
            // don't need to mask them off.
            return q >> more;
        }
    }
}

static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t *denom) {
    return libdivide_u16_do_raw(numer, denom->magic, denom->more);
}
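
// Usage sketch (illustrative): generate the divider once, then reuse it for
// many divisions by the same divisor.
//   struct libdivide_u16_t d7 = libdivide_u16_gen(7);
//   uint16_t q = libdivide_u16_do(100, &d7);  // q == 14, same as 100 / 7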

static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(
    uint16_t numer, const struct libdivide_u16_branchfree_t *denom) {
    uint16_t q = libdivide_mullhi_u16(numer, denom->magic);
    uint16_t t = ((numer - q) >> 1) + q;
    return t >> denom->more;
}

static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;

    if (!denom->magic) {
        return (uint16_t)1 << shift;
    } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
        // We compute q = n/d = n*m / 2^(16 + shift)
        // Therefore we have d = 2^(16 + shift) / m
        // We need to ceil it.
        // We know d is not a power of 2, so m is not a power of 2,
        // so we can just add 1 to the floor
        uint16_t hi_dividend = (uint16_t)1 << shift;
        uint16_t rem_ignored;
        return 1 + libdivide_32_div_16_to_16(hi_dividend, 0, denom->magic, &rem_ignored);
    } else {
        // Here we wish to compute d = 2^(16+shift+1)/(m+2^16).
        // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now
        // Also note that shift may be as high as 15, so shift + 1 will
        // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and
        // then double the quotient and remainder.
        uint32_t half_n = (uint32_t)1 << (16 + shift);
        uint32_t d = ((uint32_t)1 << 16) | denom->magic;
        // Note that the quotient is guaranteed <= 16 bits, but the remainder
        // may need 17!
        uint16_t half_q = (uint16_t)(half_n / d);
        uint32_t rem = half_n % d;
        // We computed 2^(16+shift)/(m+2^16)
        // Need to double it, and then add 1 to the quotient if doubling the
        // remainder would increase the quotient.
        // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits
        uint16_t full_q = half_q + half_q + ((rem << 1) >= d);

        // We rounded down in gen (hence +1)
        return full_q + 1;
    }
}
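
// Usage sketch (illustrative): recover returns the divisor a divider was
// generated from, which is handy when only the {magic, more} pair was stored.
//   struct libdivide_u16_t d = libdivide_u16_gen(1000);
//   uint16_t orig = libdivide_u16_recover(&d);  // orig == 1000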

static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(const struct libdivide_u16_branchfree_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;

    if (!denom->magic) {
        return (uint16_t)1 << (shift + 1);
    } else {
        // Here we wish to compute d = 2^(16+shift+1)/(m+2^16).
        // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now
        // Also note that shift may be as high as 15, so shift + 1 will
        // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and
        // then double the quotient and remainder.
        uint32_t half_n = (uint32_t)1 << (16 + shift);
        uint32_t d = ((uint32_t)1 << 16) | denom->magic;
        // Note that the quotient is guaranteed <= 16 bits, but the remainder
        // may need 17!
        uint16_t half_q = (uint16_t)(half_n / d);
        uint32_t rem = half_n % d;
        // We computed 2^(16+shift)/(m+2^16)
        // Need to double it, and then add 1 to the quotient if doubling the
        // remainder would increase the quotient.
        // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits
        uint16_t full_q = half_q + half_q + ((rem << 1) >= d);

        // We rounded down in gen (hence +1)
        return full_q + 1;
    }
}

////////// UINT32

static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_internal_u32_gen(
    uint32_t d, int branchfree) {
    if (d == 0) {
        LIBDIVIDE_ERROR("divider must be != 0");
    }

    struct libdivide_u32_t result;
    uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(d);

    // Power of 2
    if ((d & (d - 1)) == 0) {
        // We need to subtract 1 from the shift value in case of an unsigned
        // branchfree divider because there is a hardcoded right shift by 1
        // in its division algorithm. Because of this we also need to add back
        // 1 in its recovery algorithm.
        result.magic = 0;
        result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
    } else {
        uint8_t more;
        uint32_t rem, proposed_m;
        proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << floor_log_2_d, 0, d, &rem);

        LIBDIVIDE_ASSERT(rem > 0 && rem < d);
        const uint32_t e = d - rem;

        // This power works if e < 2**floor_log_2_d.
        if (!branchfree && (e < ((uint32_t)1 << floor_log_2_d))) {
            // This power works
            more = (uint8_t)floor_log_2_d;
        } else {
            // We have to use the general 33-bit algorithm. We need to compute
            // (2**power) / d. However, we already have (2**(power-1))/d and
            // its remainder. By doubling both, and then correcting the
            // remainder, we can compute the larger division.
            // don't care about overflow here - in fact, we expect it
            proposed_m += proposed_m;
            const uint32_t twice_rem = rem + rem;
            if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
        }
        result.magic = 1 + proposed_m;
        result.more = more;
        // result.more's shift should in general be ceil_log_2_d. But if we
        // used the smaller power, we subtract one from the shift because we're
        // using the smaller power. If we're using the larger power, we
        // subtract one from the shift because it's taken care of by the add
        // indicator. So floor_log_2_d happens to be correct in both cases.
    }
    return result;
}

static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d) {
    return libdivide_internal_u32_gen(d, 0);
}

static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) {
    if (d == 1) {
        LIBDIVIDE_ERROR("branchfree divider must be != 1");
    }
    struct libdivide_u32_t tmp = libdivide_internal_u32_gen(d, 1);
    struct libdivide_u32_branchfree_t ret = {
        tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)};
    return ret;
}

static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(uint32_t numer, uint32_t magic, uint8_t more) {
    if (!magic) {
        return numer >> more;
    } else {
        uint32_t q = libdivide_mullhi_u32(numer, magic);
        if (more & LIBDIVIDE_ADD_MARKER) {
            uint32_t t = ((numer - q) >> 1) + q;
            return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
        } else {
            // All upper bits are 0,
            // don't need to mask them off.
            return q >> more;
        }
    }
}

static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
    return libdivide_u32_do_raw(numer, denom->magic, denom->more);
}

static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do(
    uint32_t numer, const struct libdivide_u32_branchfree_t *denom) {
    uint32_t q = libdivide_mullhi_u32(numer, denom->magic);
    uint32_t t = ((numer - q) >> 1) + q;
    return t >> denom->more;
}
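
// Usage sketch (illustrative): the branchfree variant always takes the
// multiply-and-adjust path, which helps in hot loops and vectorized code where
// the same divider is applied to many numerators.
//   struct libdivide_u32_branchfree_t d7 = libdivide_u32_branchfree_gen(7);
//   uint32_t q = libdivide_u32_branchfree_do(100, &d7);  // q == 14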

static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;

    if (!denom->magic) {
        return (uint32_t)1 << shift;
    } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
        // We compute q = n/d = n*m / 2^(32 + shift)
        // Therefore we have d = 2^(32 + shift) / m
        // We need to ceil it.
        // We know d is not a power of 2, so m is not a power of 2,
        // so we can just add 1 to the floor
        uint32_t hi_dividend = (uint32_t)1 << shift;
        uint32_t rem_ignored;
        return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored);
    } else {
        // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).
        // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now
        // Also note that shift may be as high as 31, so shift + 1 will
        // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and
        // then double the quotient and remainder.
        uint64_t half_n = (uint64_t)1 << (32 + shift);
        uint64_t d = ((uint64_t)1 << 32) | denom->magic;
        // Note that the quotient is guaranteed <= 32 bits, but the remainder
        // may need 33!
        uint32_t half_q = (uint32_t)(half_n / d);
        uint64_t rem = half_n % d;
        // We computed 2^(32+shift)/(m+2^32)
        // Need to double it, and then add 1 to the quotient if doubling the
        // remainder would increase the quotient.
        // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits
        uint32_t full_q = half_q + half_q + ((rem << 1) >= d);

        // We rounded down in gen (hence +1)
        return full_q + 1;
    }
}

static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;

    if (!denom->magic) {
        return (uint32_t)1 << (shift + 1);
    } else {
        // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).
        // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now
        // Also note that shift may be as high as 31, so shift + 1 will
        // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and
        // then double the quotient and remainder.
        uint64_t half_n = (uint64_t)1 << (32 + shift);
        uint64_t d = ((uint64_t)1 << 32) | denom->magic;
        // Note that the quotient is guaranteed <= 32 bits, but the remainder
        // may need 33!
        uint32_t half_q = (uint32_t)(half_n / d);
        uint64_t rem = half_n % d;
        // We computed 2^(32+shift)/(m+2^32)
        // Need to double it, and then add 1 to the quotient if doubling the
        // remainder would increase the quotient.
        // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits
        uint32_t full_q = half_q + half_q + ((rem << 1) >= d);

        // We rounded down in gen (hence +1)
        return full_q + 1;
    }
}

////////// UINT64

static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_internal_u64_gen(
    uint64_t d, int branchfree) {
    if (d == 0) {
        LIBDIVIDE_ERROR("divider must be != 0");
    }

    struct libdivide_u64_t result;
    uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(d);

    // Power of 2
    if ((d & (d - 1)) == 0) {
        // We need to subtract 1 from the shift value in case of an unsigned
        // branchfree divider because there is a hardcoded right shift by 1
        // in its division algorithm. Because of this we also need to add back
        // 1 in its recovery algorithm.
        result.magic = 0;
        result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
    } else {
        uint64_t proposed_m, rem;
        uint8_t more;
        // (1 << (64 + floor_log_2_d)) / d
        proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << floor_log_2_d, 0, d, &rem);

        LIBDIVIDE_ASSERT(rem > 0 && rem < d);
        const uint64_t e = d - rem;

        // This power works if e < 2**floor_log_2_d.
        if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) {
            // This power works
            more = (uint8_t)floor_log_2_d;
        } else {
            // We have to use the general 65-bit algorithm. We need to compute
            // (2**power) / d. However, we already have (2**(power-1))/d and
            // its remainder. By doubling both, and then correcting the
            // remainder, we can compute the larger division.
            // don't care about overflow here - in fact, we expect it
            proposed_m += proposed_m;
            const uint64_t twice_rem = rem + rem;
            if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
        }
        result.magic = 1 + proposed_m;
        result.more = more;
        // result.more's shift should in general be ceil_log_2_d. But if we
        // used the smaller power, we subtract one from the shift because we're
        // using the smaller power. If we're using the larger power, we
        // subtract one from the shift because it's taken care of by the add
        // indicator. So floor_log_2_d happens to be correct in both cases,
        // which is why we do it outside of the if statement.
    }
    return result;
}

static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d) {
    return libdivide_internal_u64_gen(d, 0);
}

static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) {
    if (d == 1) {
        LIBDIVIDE_ERROR("branchfree divider must be != 1");
    }
    struct libdivide_u64_t tmp = libdivide_internal_u64_gen(d, 1);
    struct libdivide_u64_branchfree_t ret = {
        tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)};
    return ret;
}

static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(uint64_t numer, uint64_t magic, uint8_t more) {
    if (!magic) {
        return numer >> more;
    } else {
        uint64_t q = libdivide_mullhi_u64(numer, magic);
        if (more & LIBDIVIDE_ADD_MARKER) {
            uint64_t t = ((numer - q) >> 1) + q;
            return t >> (more & LIBDIVIDE_64_SHIFT_MASK);
        } else {
            // All upper bits are 0,
            // don't need to mask them off.
            return q >> more;
        }
    }
}

static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
    return libdivide_u64_do_raw(numer, denom->magic, denom->more);
}

static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do(
    uint64_t numer, const struct libdivide_u64_branchfree_t *denom) {
    uint64_t q = libdivide_mullhi_u64(numer, denom->magic);
    uint64_t t = ((numer - q) >> 1) + q;
    return t >> denom->more;
}

static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;

    if (!denom->magic) {
        return (uint64_t)1 << shift;
    } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
        // We compute q = n/d = n*m / 2^(64 + shift)
        // Therefore we have d = 2^(64 + shift) / m
        // We need to ceil it.
        // We know d is not a power of 2, so m is not a power of 2,
        // so we can just add 1 to the floor
        uint64_t hi_dividend = (uint64_t)1 << shift;
        uint64_t rem_ignored;
        return 1 + libdivide_128_div_64_to_64(hi_dividend, 0, denom->magic, &rem_ignored);
    } else {
        // Here we wish to compute d = 2^(64+shift+1)/(m+2^64).
        // Notice (m + 2^64) is a 65 bit number. This gets hairy. See
        // libdivide_u32_recover for more on what we do here.
        // TODO: do something better than 128 bit math

        // Full n is a (potentially) 129 bit value
        // half_n is a 128 bit value
        // Compute the hi half of half_n. Low half is 0.
        uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0;
        // d is a 65 bit value. The high bit is always set to 1.
        const uint64_t d_hi = 1, d_lo = denom->magic;
        // Note that the quotient is guaranteed <= 64 bits,
        // but the remainder may need 65!
        uint64_t r_hi, r_lo;
        uint64_t half_q =
            libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo);
        // We computed 2^(64+shift)/(m+2^64)
        // Double the remainder ('dr') and check if that is larger than d
        // Note that d is a 65 bit value, so r1 is small and so r1 + r1
        // cannot overflow
        uint64_t dr_lo = r_lo + r_lo;
        uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo);  // last term is carry
        int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo);
        uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0);
        return full_q + 1;
    }
}

static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;

    if (!denom->magic) {
        return (uint64_t)1 << (shift + 1);
    } else {
        // Here we wish to compute d = 2^(64+shift+1)/(m+2^64).
        // Notice (m + 2^64) is a 65 bit number. This gets hairy. See
        // libdivide_u32_recover for more on what we do here.
        // TODO: do something better than 128 bit math

        // Full n is a (potentially) 129 bit value
        // half_n is a 128 bit value
        // Compute the hi half of half_n. Low half is 0.
        uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0;
        // d is a 65 bit value. The high bit is always set to 1.
        const uint64_t d_hi = 1, d_lo = denom->magic;
        // Note that the quotient is guaranteed <= 64 bits,
        // but the remainder may need 65!
        uint64_t r_hi, r_lo;
        uint64_t half_q =
            libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo);
        // We computed 2^(64+shift)/(m+2^64)
        // Double the remainder ('dr') and check if that is larger than d
        // Note that d is a 65 bit value, so r1 is small and so r1 + r1
        // cannot overflow
        uint64_t dr_lo = r_lo + r_lo;
        uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo);  // last term is carry
        int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo);
        uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0);
        return full_q + 1;
    }
}

////////// SINT16

static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen(
    int16_t d, int branchfree) {
    if (d == 0) {
        LIBDIVIDE_ERROR("divider must be != 0");
    }

    struct libdivide_s16_t result;

    // If d is a power of 2, or the negative of a power of 2, we have to use a shift.
    // This is especially important because the magic algorithm fails for -1.
    // To check if d is a power of 2 or its inverse, it suffices to check
    // whether its absolute value has exactly one bit set. This works even for
    // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
    // and is a power of 2.
    uint16_t ud = (uint16_t)d;
    uint16_t absD = (d < 0) ? -ud : ud;
    uint16_t floor_log_2_d = 15 - libdivide_count_leading_zeros16(absD);
    // check if exactly one bit is set,
    // don't care if absD is 0 since that's divide by zero
    if ((absD & (absD - 1)) == 0) {
        // Branchfree and normal paths are exactly the same
        result.magic = 0;
        result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
    } else {
        LIBDIVIDE_ASSERT(floor_log_2_d >= 1);

        uint8_t more;
        // the dividend here is 2**(floor_log_2_d + 15), so the low 16 bit word
        // is 0 and the high word is 2**(floor_log_2_d - 1)
        uint16_t rem, proposed_m;
        proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << (floor_log_2_d - 1), 0, absD, &rem);
        const uint16_t e = absD - rem;

        // We are going to start with a power of floor_log_2_d - 1.
        // This works if e < 2**floor_log_2_d.
        if (!branchfree && e < ((uint16_t)1 << floor_log_2_d)) {
            // This power works
            more = (uint8_t)(floor_log_2_d - 1);
        } else {
            // We need to go one higher. This should not make proposed_m
            // overflow, but it will make it negative when interpreted as an
            // int16_t.
            proposed_m += proposed_m;
            const uint16_t twice_rem = rem + rem;
            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
        }

        proposed_m += 1;
        int16_t magic = (int16_t)proposed_m;

        // Mark if we are negative. Note we only negate the magic number in the
        // branchfull case.
        if (d < 0) {
            more |= LIBDIVIDE_NEGATIVE_DIVISOR;
            if (!branchfree) {
                magic = -magic;
            }
        }

        result.more = more;
        result.magic = magic;
    }
    return result;
}

static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d) {
    return libdivide_internal_s16_gen(d, 0);
}

static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d) {
    struct libdivide_s16_t tmp = libdivide_internal_s16_gen(d, 1);
    struct libdivide_s16_branchfree_t result = {tmp.magic, tmp.more};
    return result;
}

// The original libdivide_s16_do takes a const pointer. However, this cannot be used
// with a compile time constant libdivide_s16_t: it will generate a warning about
// taking the address of a temporary. Hence this overload.
static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more) {
    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;

    if (!magic) {
        uint16_t sign = (int8_t)more >> 7;
        uint16_t mask = ((uint16_t)1 << shift) - 1;
        uint16_t uq = numer + ((numer >> 15) & mask);
        int16_t q = (int16_t)uq;
        q >>= shift;
        q = (q ^ sign) - sign;
        return q;
    } else {
        uint16_t uq = (uint16_t)libdivide_mullhi_s16(numer, magic);
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift and then sign extend
            int16_t sign = (int8_t)more >> 7;
            // q += (more < 0 ? -numer : numer)
            // cast required to avoid UB
            uq += ((uint16_t)numer ^ sign) - sign;
        }
        int16_t q = (int16_t)uq;
        q >>= shift;
        q += (q < 0);
        return q;
    }
}

static LIBDIVIDE_INLINE int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) {
    return libdivide_s16_do_raw(numer, denom->magic, denom->more);
}

static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(int16_t numer, const struct libdivide_s16_branchfree_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    // must be arithmetic shift and then sign extend
    int16_t sign = (int8_t)more >> 7;
    int16_t magic = denom->magic;
    int16_t q = libdivide_mullhi_s16(numer, magic);
    q += numer;

    // If q is non-negative, we have nothing to do
    // If q is negative, we want to add either (2**shift)-1 if d is a power of
    // 2, or (2**shift) if it is not a power of 2
    uint16_t is_power_of_2 = (magic == 0);
    uint16_t q_sign = (uint16_t)(q >> 15);
    q += q_sign & (((uint16_t)1 << shift) - is_power_of_2);

    // Now arithmetic right shift
    q >>= shift;
    // Negate if needed
    q = (q ^ sign) - sign;

    return q;
}
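
// Worked example (illustrative) of the rounding adjustment above: for d = 2
// (power of 2: magic == 0, shift == 1) and numer == -7, q starts at -7; the
// sign mask adds (1 << 1) - 1 == 1 giving -6, and -6 >> 1 == -3, which matches
// C's truncating -7 / 2.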
1405 :
1406 : static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom) {
1407 : uint8_t more = denom->more;
1408 : uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
1409 : if (!denom->magic) {
1410 : uint16_t absD = (uint16_t)1 << shift;
1411 : if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
1412 : absD = -absD;
1413 : }
1414 : return (int16_t)absD;
1415 : } else {
1416 : // Unsigned math is much easier
1417 : // We negate the magic number only in the branchfull case, and we don't
1418 : // know which case we're in. However we have enough information to
1419 : // determine the correct sign of the magic number. The divisor was
1420 : // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set,
1421 : // the magic number's sign is opposite that of the divisor.
1422 : // We want to compute the positive magic number.
1423 : int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
1424 : int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;
1425 :
1426 : // Handle the power of 2 case (including branchfree)
1427 : if (denom->magic == 0) {
1428 : int16_t result = (uint16_t)1 << shift;
1429 : return negative_divisor ? -result : result;
1430 : }
1431 :
1432 : uint16_t d = (uint16_t)(magic_was_negated ? -denom->magic : denom->magic);
1433 : uint32_t n = (uint32_t)1 << (16 + shift); // this shift cannot exceed 30
1434 : uint16_t q = (uint16_t)(n / d);
1435 : int16_t result = (int16_t)q;
1436 : result += 1;
1437 : return negative_divisor ? -result : result;
1438 : }
1439 : }
1440 :
1441 : static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(const struct libdivide_s16_branchfree_t *denom) {
1442 : const struct libdivide_s16_t den = {denom->magic, denom->more};
1443 : return libdivide_s16_recover(&den);
1444 : }
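
// Usage sketch (editorial, assuming a hosted environment): generate once,
// divide many times, and recover the original divisor if needed.
//
//   struct libdivide_s16_t d = libdivide_s16_gen(-7);
//   int16_t q = libdivide_s16_do(100, &d);    // q == -14
//   int16_t back = libdivide_s16_recover(&d); // back == -7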
1445 :
1446 : ////////// SINT32
1447 :
1448 : static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_internal_s32_gen(
1449 : int32_t d, int branchfree) {
1450 : if (d == 0) {
1451 : LIBDIVIDE_ERROR("divider must be != 0");
1452 : }
1453 :
1454 : struct libdivide_s32_t result;
1455 :
1456 : // If d is a power of 2, or the negative of a power of 2, we have to use a shift.
1457 : // This is especially important because the magic algorithm fails for -1.
1458 : // To check if d is a power of 2 or its negation, it suffices to check
1459 : // whether its absolute value has exactly one bit set. This works even for
1460 : // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
1461 : // and is a power of 2.
1462 : uint32_t ud = (uint32_t)d;
1463 : uint32_t absD = (d < 0) ? -ud : ud;
1464 : uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(absD);
1465 : // check if exactly one bit is set,
1466 : // don't care if absD is 0 since that's divide by zero
1467 : if ((absD & (absD - 1)) == 0) {
1468 : // Branchfree and normal paths are exactly the same
1469 : result.magic = 0;
1470 : result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
1471 : } else {
1472 : LIBDIVIDE_ASSERT(floor_log_2_d >= 1);
1473 :
1474 : uint8_t more;
1475 : // the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word
1476 : // is 0 and the high word is 2**(floor_log_2_d - 1)
1477 : uint32_t rem, proposed_m;
1478 : proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << (floor_log_2_d - 1), 0, absD, &rem);
1479 : const uint32_t e = absD - rem;
1480 :
1481 : // We are going to start with a power of floor_log_2_d - 1.
1482 : // This works if e < 2**floor_log_2_d.
1483 : if (!branchfree && e < ((uint32_t)1 << floor_log_2_d)) {
1484 : // This power works
1485 : more = (uint8_t)(floor_log_2_d - 1);
1486 : } else {
1487 : // We need to go one higher. This should not make proposed_m
1488 : // overflow, but it will make it negative when interpreted as an
1489 : // int32_t.
1490 : proposed_m += proposed_m;
1491 : const uint32_t twice_rem = rem + rem;
1492 : if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
1493 : more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
1494 : }
1495 :
1496 : proposed_m += 1;
1497 : int32_t magic = (int32_t)proposed_m;
1498 :
1499 : // Mark if we are negative. Note we only negate the magic number in the
1500 : // branchfull case.
1501 : if (d < 0) {
1502 : more |= LIBDIVIDE_NEGATIVE_DIVISOR;
1503 : if (!branchfree) {
1504 : magic = -magic;
1505 : }
1506 : }
1507 :
1508 : result.more = more;
1509 : result.magic = magic;
1510 : }
1511 : return result;
1512 : }
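
// Worked example (editorial): for d = 7, absD = 7 and floor_log_2_d = 2.
// proposed_m = floor(2**33 / 7) = 1227133513 with rem = 1, so e = 6, which
// is not < 2**2; we go one higher: proposed_m doubles to 2454267026
// (twice_rem = 2 < 7, so no extra increment), then +1 gives 2454267027
// (0x92492493). As an int32_t that is magic = -1840700269, and
// more = 2 | LIBDIVIDE_ADD_MARKER.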
1513 :
1514 : static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d) {
1515 : return libdivide_internal_s32_gen(d, 0);
1516 : }
1517 :
1518 : static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) {
1519 : struct libdivide_s32_t tmp = libdivide_internal_s32_gen(d, 1);
1520 : struct libdivide_s32_branchfree_t result = {tmp.magic, tmp.more};
1521 : return result;
1522 : }
1523 :
1524 : static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) {
1525 : uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
1526 :
1527 : if (!magic) {
1528 : uint32_t sign = (int8_t)more >> 7;
1529 : uint32_t mask = ((uint32_t)1 << shift) - 1;
1530 : uint32_t uq = numer + ((numer >> 31) & mask);
1531 : int32_t q = (int32_t)uq;
1532 : q >>= shift;
1533 : q = (q ^ sign) - sign;
1534 : return q;
1535 : } else {
1536 : uint32_t uq = (uint32_t)libdivide_mullhi_s32(numer, magic);
1537 : if (more & LIBDIVIDE_ADD_MARKER) {
1538 : // must be arithmetic shift and then sign extend
1539 : int32_t sign = (int8_t)more >> 7;
1540 : // q += (more < 0 ? -numer : numer)
1541 : // cast required to avoid UB
1542 : uq += ((uint32_t)numer ^ sign) - sign;
1543 : }
1544 : int32_t q = (int32_t)uq;
1545 : q >>= shift;
1546 : q += (q < 0);
1547 : return q;
1548 : }
1549 : }
1550 :
1551 : static LIBDIVIDE_INLINE int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
1552 : return libdivide_s32_do_raw(numer, denom->magic, denom->more);
1553 : }
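
// Worked example (editorial), continuing d = 7 from the gen example above
// (magic = -1840700269, more = 2 | LIBDIVIDE_ADD_MARKER), with numer = 100:
//   mullhi_s32(100, magic) = floor(100 * -1840700269 / 2**32) = -43
//   ADD_MARKER: uq += numer        -> -43 + 100 = 57
//   q = 57 >> 2 = 14, q += (q < 0) -> 14, and indeed 100 / 7 == 14.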
1554 :
1555 : static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) {
1556 : uint8_t more = denom->more;
1557 : uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
1558 : // must be arithmetic shift and then sign extend
1559 : int32_t sign = (int8_t)more >> 7;
1560 : int32_t magic = denom->magic;
1561 : int32_t q = libdivide_mullhi_s32(numer, magic);
1562 : q += numer;
1563 :
1564 : // If q is non-negative, we have nothing to do
1565 : // If q is negative, we want to add either (2**shift)-1 if d is a power of
1566 : // 2, or (2**shift) if it is not a power of 2
1567 : uint32_t is_power_of_2 = (magic == 0);
1568 : uint32_t q_sign = (uint32_t)(q >> 31);
1569 : q += q_sign & (((uint32_t)1 << shift) - is_power_of_2);
1570 :
1571 : // Now arithmetic right shift
1572 : q >>= shift;
1573 : // Negate if needed
1574 : q = (q ^ sign) - sign;
1575 :
1576 : return q;
1577 : }
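
// Worked example (editorial) for the branchfree path, again with d = 7
// (magic = -1840700269, shift = 2) and numer = -100:
//   q = mullhi_s32(-100, magic) = 42, then q += numer -> -58
//   7 is not a power of 2, so add (1 << 2) under the sign mask: q = -54
//   -54 >> 2 = -14 (arithmetic shift), sign = 0, giving -14 == -100 / 7.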
1578 :
1579 : static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) {
1580 : uint8_t more = denom->more;
1581 : uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
1582 : if (!denom->magic) {
1583 : uint32_t absD = (uint32_t)1 << shift;
1584 : if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
1585 : absD = -absD;
1586 : }
1587 : return (int32_t)absD;
1588 : } else {
1589 : // Unsigned math is much easier
1590 : // We negate the magic number only in the branchfull case, and we don't
1591 : // know which case we're in. However we have enough information to
1592 : // determine the correct sign of the magic number. The divisor was
1593 : // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set,
1594 : // the magic number's sign is opposite that of the divisor.
1595 : // We want to compute the positive magic number.
1596 : int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
1597 : int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;
1598 :
1599 : // Handle the power of 2 case (including branchfree)
1600 : if (denom->magic == 0) {
1601 : int32_t result = (uint32_t)1 << shift;
1602 : return negative_divisor ? -result : result;
1603 : }
1604 :
1605 : uint32_t d = (uint32_t)(magic_was_negated ? -denom->magic : denom->magic);
1606 : uint64_t n = (uint64_t)1 << (32 + shift); // shift is at most 30, so 32 + shift <= 62 fits in a uint64_t
1607 : uint32_t q = (uint32_t)(n / d);
1608 : int32_t result = (int32_t)q;
1609 : result += 1;
1610 : return negative_divisor ? -result : result;
1611 : }
1612 : }
1613 :
1614 : static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom) {
1615 : const struct libdivide_s32_t den = {denom->magic, denom->more};
1616 : return libdivide_s32_recover(&den);
1617 : }
1618 :
1619 : ////////// SINT64
1620 :
1621 : static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_internal_s64_gen(
1622 : int64_t d, int branchfree) {
1623 : if (d == 0) {
1624 : LIBDIVIDE_ERROR("divider must be != 0");
1625 : }
1626 :
1627 : struct libdivide_s64_t result;
1628 :
1629 : // If d is a power of 2, or the negative of a power of 2, we have to use a shift.
1630 : // This is especially important because the magic algorithm fails for -1.
1631 : // To check if d is a power of 2 or its negation, it suffices to check
1632 : // whether its absolute value has exactly one bit set. This works even for
1633 : // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
1634 : // and is a power of 2.
1635 : uint64_t ud = (uint64_t)d;
1636 : uint64_t absD = (d < 0) ? -ud : ud;
1637 : uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(absD);
1638 : // check if exactly one bit is set,
1639 : // don't care if absD is 0 since that's divide by zero
1640 : if ((absD & (absD - 1)) == 0) {
1641 : // Branchfree and non-branchfree cases are the same
1642 : result.magic = 0;
1643 : result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
1644 : } else {
1645 : // the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word
1646 : // is 0 and the high word is 2**(floor_log_2_d - 1)
1647 : uint8_t more;
1648 : uint64_t rem, proposed_m;
1649 : proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << (floor_log_2_d - 1), 0, absD, &rem);
1650 : const uint64_t e = absD - rem;
1651 :
1652 : // We are going to start with a power of floor_log_2_d - 1.
1653 : // This works if e < 2**floor_log_2_d.
1654 : if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) {
1655 : // This power works
1656 : more = (uint8_t)(floor_log_2_d - 1);
1657 : } else {
1658 : // We need to go one higher. This should not make proposed_m
1659 : // overflow, but it will make it negative when interpreted as an
1660 : // int64_t.
1661 : proposed_m += proposed_m;
1662 : const uint64_t twice_rem = rem + rem;
1663 : if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
1664 : // Note that we only set the LIBDIVIDE_NEGATIVE_DIVISOR bit if we
1665 : // also set ADD_MARKER; this is an annoying optimization that
1666 : // enables algorithm #4 to avoid the mask. However, we always set it
1667 : // in the branchfree case.
1668 : more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
1669 : }
1670 : proposed_m += 1;
1671 : int64_t magic = (int64_t)proposed_m;
1672 :
1673 : // Mark if we are negative
1674 : if (d < 0) {
1675 : more |= LIBDIVIDE_NEGATIVE_DIVISOR;
1676 : if (!branchfree) {
1677 : magic = -magic;
1678 : }
1679 : }
1680 :
1681 : result.more = more;
1682 : result.magic = magic;
1683 : }
1684 : return result;
1685 : }
1686 :
1687 : static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d) {
1688 : return libdivide_internal_s64_gen(d, 0);
1689 : }
1690 :
1691 : static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) {
1692 : struct libdivide_s64_t tmp = libdivide_internal_s64_gen(d, 1);
1693 : struct libdivide_s64_branchfree_t ret = {tmp.magic, tmp.more};
1694 : return ret;
1695 : }
1696 :
1697 : static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) {
1698 : uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
1699 :
1700 : if (!magic) { // shift path
1701 : uint64_t mask = ((uint64_t)1 << shift) - 1;
1702 : uint64_t uq = numer + ((numer >> 63) & mask);
1703 : int64_t q = (int64_t)uq;
1704 : q >>= shift;
1705 : // must be arithmetic shift and then sign-extend
1706 : int64_t sign = (int8_t)more >> 7;
1707 : q = (q ^ sign) - sign;
1708 : return q;
1709 : } else {
1710 : uint64_t uq = (uint64_t)libdivide_mullhi_s64(numer, magic);
1711 : if (more & LIBDIVIDE_ADD_MARKER) {
1712 : // must be arithmetic shift and then sign extend
1713 : int64_t sign = (int8_t)more >> 7;
1714 : // q += (more < 0 ? -numer : numer)
1715 : // cast required to avoid UB
1716 : uq += ((uint64_t)numer ^ sign) - sign;
1717 : }
1718 : int64_t q = (int64_t)uq;
1719 : q >>= shift;
1720 : q += (q < 0);
1721 : return q;
1722 : }
1723 : }
1724 :
1725 : static LIBDIVIDE_INLINE int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
1726 : return libdivide_s64_do_raw(numer, denom->magic, denom->more);
1727 : }
1728 :
1729 : static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) {
1730 : uint8_t more = denom->more;
1731 : uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
1732 : // must be arithmetic shift and then sign extend
1733 : int64_t sign = (int8_t)more >> 7;
1734 : int64_t magic = denom->magic;
1735 : int64_t q = libdivide_mullhi_s64(numer, magic);
1736 : q += numer;
1737 :
1738 : // If q is non-negative, we have nothing to do.
1739 : // If q is negative, we want to add either (2**shift)-1 if d is a power of
1740 : // 2, or (2**shift) if it is not a power of 2.
1741 : uint64_t is_power_of_2 = (magic == 0);
1742 : uint64_t q_sign = (uint64_t)(q >> 63);
1743 : q += q_sign & (((uint64_t)1 << shift) - is_power_of_2);
1744 :
1745 : // Arithmetic right shift
1746 : q >>= shift;
1747 : // Negate if needed
1748 : q = (q ^ sign) - sign;
1749 :
1750 : return q;
1751 : }
1752 :
1753 : static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) {
1754 : uint8_t more = denom->more;
1755 : uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
1756 : if (denom->magic == 0) { // shift path
1757 : uint64_t absD = (uint64_t)1 << shift;
1758 : if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
1759 : absD = -absD;
1760 : }
1761 : return (int64_t)absD;
1762 : } else {
1763 : // Unsigned math is much easier
1764 : int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
1765 : int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;
1766 :
1767 : uint64_t d = (uint64_t)(magic_was_negated ? -denom->magic : denom->magic);
1768 : uint64_t n_hi = (uint64_t)1 << shift, n_lo = 0;
1769 : uint64_t rem_ignored;
1770 : uint64_t q = libdivide_128_div_64_to_64(n_hi, n_lo, d, &rem_ignored);
1771 : int64_t result = (int64_t)(q + 1);
1772 : if (negative_divisor) {
1773 : result = -result;
1774 : }
1775 : return result;
1776 : }
1777 : }
1778 :
1779 : static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom) {
1780 : const struct libdivide_s64_t den = {denom->magic, denom->more};
1781 : return libdivide_s64_recover(&den);
1782 : }
1783 :
1784 : // Simplest possible vector type division: treat the vector type as an array
1785 : // of the underlying native type.
1786 : //
1787 : // Use a union to read a vector via pointer-to-integer, without violating strict
1788 : // aliasing.
1789 : #define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \
1790 : const size_t count = sizeof(VecT) / sizeof(IntT); \
1791 : union type_pun_vec { \
1792 : VecT vec; \
1793 : IntT arr[sizeof(VecT) / sizeof(IntT)]; \
1794 : }; \
1795 : union type_pun_vec result; \
1796 : union type_pun_vec input; \
1797 : input.vec = numers; \
1798 : for (size_t loop = 0; loop < count; ++loop) { \
1799 : result.arr[loop] = libdivide_##Algo##_do(input.arr[loop], denom); \
1800 : } \
1801 : return result.vec;
1802 :
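// For instance (editorial sketch), a NEON uint16x8_t body built from this
// macro is equivalent to the following scalar loop:
//
//   union type_pun_vec { uint16x8_t vec; uint16_t arr[8]; } input, result;
//   input.vec = numers;
//   for (size_t i = 0; i < 8; ++i)
//       result.arr[i] = libdivide_u16_do(input.arr[i], denom);
//   return result.vec;
//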
1803 : #if defined(LIBDIVIDE_NEON)
1804 :
1805 : static LIBDIVIDE_INLINE uint16x8_t libdivide_u16_do_vec128(
1806 : uint16x8_t numers, const struct libdivide_u16_t *denom);
1807 : static LIBDIVIDE_INLINE int16x8_t libdivide_s16_do_vec128(
1808 : int16x8_t numers, const struct libdivide_s16_t *denom);
1809 : static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_do_vec128(
1810 : uint32x4_t numers, const struct libdivide_u32_t *denom);
1811 : static LIBDIVIDE_INLINE int32x4_t libdivide_s32_do_vec128(
1812 : int32x4_t numers, const struct libdivide_s32_t *denom);
1813 : static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_do_vec128(
1814 : uint64x2_t numers, const struct libdivide_u64_t *denom);
1815 : static LIBDIVIDE_INLINE int64x2_t libdivide_s64_do_vec128(
1816 : int64x2_t numers, const struct libdivide_s64_t *denom);
1817 :
1818 : static LIBDIVIDE_INLINE uint16x8_t libdivide_u16_branchfree_do_vec128(
1819 : uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom);
1820 : static LIBDIVIDE_INLINE int16x8_t libdivide_s16_branchfree_do_vec128(
1821 : int16x8_t numers, const struct libdivide_s16_branchfree_t *denom);
1822 : static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_branchfree_do_vec128(
1823 : uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom);
1824 : static LIBDIVIDE_INLINE int32x4_t libdivide_s32_branchfree_do_vec128(
1825 : int32x4_t numers, const struct libdivide_s32_branchfree_t *denom);
1826 : static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_branchfree_do_vec128(
1827 : uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom);
1828 : static LIBDIVIDE_INLINE int64x2_t libdivide_s64_branchfree_do_vec128(
1829 : int64x2_t numers, const struct libdivide_s64_branchfree_t *denom);
1830 :
1831 : //////// Internal Utility Functions
1832 :
1833 : // Logical right shift by runtime value.
1834 : // NEON implements right shifts as left shifts by negative amounts.
1835 : static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_neon_srl(uint32x4_t v, uint8_t amt) {
1836 : int32_t wamt = (int32_t)(amt);
1837 : return vshlq_u32(v, vdupq_n_s32(-wamt));
1838 : }
1839 :
1840 : static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_neon_srl(uint64x2_t v, uint8_t amt) {
1841 : int64_t wamt = (int64_t)(amt);
1842 : return vshlq_u64(v, vdupq_n_s64(-wamt));
1843 : }
1844 :
1845 : // Arithmetic right shift by runtime value.
1846 : static LIBDIVIDE_INLINE int32x4_t libdivide_s32_neon_sra(int32x4_t v, uint8_t amt) {
1847 : int32_t wamt = (int32_t)(amt);
1848 : return vshlq_s32(v, vdupq_n_s32(-wamt));
1849 : }
1850 :
1851 : static LIBDIVIDE_INLINE int64x2_t libdivide_s64_neon_sra(int64x2_t v, uint8_t amt) {
1852 : int64_t wamt = (int64_t)(amt);
1853 : return vshlq_s64(v, vdupq_n_s64(-wamt));
1854 : }
1855 :
1856 : static LIBDIVIDE_INLINE int64x2_t libdivide_s64_signbits(int64x2_t v) { return vshrq_n_s64(v, 63); }
1857 :
1858 : static LIBDIVIDE_INLINE uint32x4_t libdivide_mullhi_u32_vec128(uint32x4_t a, uint32_t b) {
1859 : // Desired result is [x0, x1, x2, x3], the high halves of each 32x32 product.
1860 : uint32x4_t w1 = vreinterpretq_u32_u64(vmull_n_u32(vget_low_u32(a), b)); // [_, x0, _, x1]
1861 : uint32x4_t w2 = vreinterpretq_u32_u64(vmull_high_n_u32(a, b)); // [_, x2, _, x3]
1862 : return vuzp2q_u32(w1, w2); // [x0, x1, x2, x3]
1863 : }
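
// Per-lane scalar equivalent of the widening multiply + vuzp2 above
// (editorial note): result[i] = (uint32_t)(((uint64_t)a[i] * b) >> 32).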
1864 :
1865 : static LIBDIVIDE_INLINE int32x4_t libdivide_mullhi_s32_vec128(int32x4_t a, int32_t b) {
1866 : int32x4_t w1 = vreinterpretq_s32_s64(vmull_n_s32(vget_low_s32(a), b)); // [_, x0, _, x1]
1867 : int32x4_t w2 = vreinterpretq_s32_s64(vmull_high_n_s32(a, b)); // [_, x2, _, x3]
1868 : return vuzp2q_s32(w1, w2); // [x0, x1, x2, x3]
1869 : }
1870 :
1871 : static LIBDIVIDE_INLINE uint64x2_t libdivide_mullhi_u64_vec128(uint64x2_t x, uint64_t sy) {
1872 : // The full 128 bit product is:
1873 : // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64)
1874 : // Note x0, y0, x1, y1 are all conceptually uint32; products are 32x32->64.
1875 :
1876 : // Get low and high words. x0 contains low 32 bits, x1 is high 32 bits.
1877 : uint64x2_t y = vdupq_n_u64(sy);
1878 : uint32x2_t x0 = vmovn_u64(x);
1879 : uint32x2_t y0 = vmovn_u64(y);
1880 : uint32x2_t x1 = vshrn_n_u64(x, 32);
1881 : uint32x2_t y1 = vshrn_n_u64(y, 32);
1882 :
1883 : // Compute x0*y0.
1884 : uint64x2_t x0y0 = vmull_u32(x0, y0);
1885 : uint64x2_t x0y0_hi = vshrq_n_u64(x0y0, 32);
1886 :
1887 : // Compute other intermediate products.
1888 : uint64x2_t temp = vmlal_u32(x0y0_hi, x1, y0); // temp = x0y0_hi + x1*y0;
1889 : // We want to split temp into its low 32 bits and high 32 bits, both
1890 : // in the low half of 64 bit registers.
1891 : // Use shifts to avoid needing a reg for the mask.
1892 : uint64x2_t temp_lo = vshrq_n_u64(vshlq_n_u64(temp, 32), 32); // temp_lo = temp & 0xFFFFFFFF;
1893 : uint64x2_t temp_hi = vshrq_n_u64(temp, 32); // temp_hi = temp >> 32;
1894 :
1895 : temp_lo = vmlal_u32(temp_lo, x0, y1); // temp_lo += x0*y1
1896 : temp_lo = vshrq_n_u64(temp_lo, 32); // temp_lo >>= 32
1897 : temp_hi = vmlal_u32(temp_hi, x1, y1); // temp_hi += x1*y1
1898 : uint64x2_t result = vaddq_u64(temp_hi, temp_lo);
1899 : return result;
1900 : }
1901 :
1902 : static LIBDIVIDE_INLINE int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) {
1903 : int64x2_t p = vreinterpretq_s64_u64(
1904 : libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), (uint64_t)(sy)));
1905 : int64x2_t y = vdupq_n_s64(sy);
1906 : int64x2_t t1 = vandq_s64(libdivide_s64_signbits(x), y);
1907 : int64x2_t t2 = vandq_s64(libdivide_s64_signbits(y), x);
1908 : p = vsubq_s64(p, t1);
1909 : p = vsubq_s64(p, t2);
1910 : return p;
1911 : }
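
// Editorial note: the signed high half follows from the unsigned one via
// the identity below. Reading an n-bit signed x as unsigned adds 2**n when
// x < 0, so
//   mulh_s(x, y) = mulh_u(x, y) - (x < 0 ? y : 0) - (y < 0 ? x : 0)
// which is exactly the two sign-mask subtractions performed above.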
1912 :
1913 : ////////// UINT16
1914 :
1915 : uint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom){
1916 : SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16)}
1917 :
1918 : uint16x8_t libdivide_u16_branchfree_do_vec128(
1919 : uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom){
1920 : SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree)}
1921 :
1922 : ////////// UINT32
1923 :
1924 : uint32x4_t libdivide_u32_do_vec128(uint32x4_t numers, const struct libdivide_u32_t *denom) {
1925 : uint8_t more = denom->more;
1926 : if (!denom->magic) {
1927 : return libdivide_u32_neon_srl(numers, more);
1928 : } else {
1929 : uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic);
1930 : if (more & LIBDIVIDE_ADD_MARKER) {
1931 : // uint32_t t = ((numer - q) >> 1) + q;
1932 : // return t >> denom->shift;
1933 : // Note we can use halving-subtract to avoid the shift.
1934 : uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
1935 : uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q);
1936 : return libdivide_u32_neon_srl(t, shift);
1937 : } else {
1938 : return libdivide_u32_neon_srl(q, more);
1939 : }
1940 : }
1941 : }
1942 :
1943 : uint32x4_t libdivide_u32_branchfree_do_vec128(
1944 : uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom) {
1945 : uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic);
1946 : uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q);
1947 : return libdivide_u32_neon_srl(t, denom->more);
1948 : }
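
// Editorial note: ((numer - q) >> 1) + q equals floor((numer + q) / 2) but
// cannot overflow 32 bits, since q = mulhi(numer, magic) <= numer; NEON's
// vhsubq halving-subtract fuses the subtraction and the shift by one.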
1949 :
1950 : ////////// UINT64
1951 :
1952 : uint64x2_t libdivide_u64_do_vec128(uint64x2_t numers, const struct libdivide_u64_t *denom) {
1953 : uint8_t more = denom->more;
1954 : if (!denom->magic) {
1955 : return libdivide_u64_neon_srl(numers, more);
1956 : } else {
1957 : uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic);
1958 : if (more & LIBDIVIDE_ADD_MARKER) {
1959 : // uint64_t t = ((numer - q) >> 1) + q;
1960 : // return t >> denom->shift;
1961 : // No 64-bit halving subtracts in NEON :(
1962 : uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
1963 : uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q);
1964 : return libdivide_u64_neon_srl(t, shift);
1965 : } else {
1966 : return libdivide_u64_neon_srl(q, more);
1967 : }
1968 : }
1969 : }
1970 :
1971 : uint64x2_t libdivide_u64_branchfree_do_vec128(
1972 : uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom) {
1973 : uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic);
1974 : uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q);
1975 : return libdivide_u64_neon_srl(t, denom->more);
1976 : }
1977 :
1978 : ////////// SINT16
1979 :
1980 : int16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom){
1981 : SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16)}
1982 :
1983 : int16x8_t libdivide_s16_branchfree_do_vec128(
1984 : int16x8_t numers, const struct libdivide_s16_branchfree_t *denom){
1985 : SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree)}
1986 :
1987 : ////////// SINT32
1988 :
1989 : int32x4_t libdivide_s32_do_vec128(int32x4_t numers, const struct libdivide_s32_t *denom) {
1990 : uint8_t more = denom->more;
1991 : if (!denom->magic) {
1992 : uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
1993 : uint32_t mask = ((uint32_t)1 << shift) - 1;
1994 : int32x4_t roundToZeroTweak = vdupq_n_s32((int)mask);
1995 : // q = numer + ((numer >> 31) & roundToZeroTweak);
1996 : int32x4_t q = vaddq_s32(numers, vandq_s32(vshrq_n_s32(numers, 31), roundToZeroTweak));
1997 : q = libdivide_s32_neon_sra(q, shift);
1998 : int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);
1999 : // q = (q ^ sign) - sign;
2000 : q = vsubq_s32(veorq_s32(q, sign), sign);
2001 : return q;
2002 : } else {
2003 : int32x4_t q = libdivide_mullhi_s32_vec128(numers, denom->magic);
2004 : if (more & LIBDIVIDE_ADD_MARKER) {
2005 : // must be arithmetic shift
2006 : int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);
2007 : // q += ((numer ^ sign) - sign);
2008 : q = vaddq_s32(q, vsubq_s32(veorq_s32(numers, sign), sign));
2009 : }
2010 : // q >>= shift
2011 : q = libdivide_s32_neon_sra(q, more & LIBDIVIDE_32_SHIFT_MASK);
2012 : q = vaddq_s32(
2013 : q, vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(q), 31))); // q += (q < 0)
2014 : return q;
2015 : }
2016 : }
2017 :
2018 : int32x4_t libdivide_s32_branchfree_do_vec128(
2019 : int32x4_t numers, const struct libdivide_s32_branchfree_t *denom) {
2020 : int32_t magic = denom->magic;
2021 : uint8_t more = denom->more;
2022 : uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
2023 : // must be arithmetic shift
2024 : int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);
2025 : int32x4_t q = libdivide_mullhi_s32_vec128(numers, magic);
2026 : q = vaddq_s32(q, numers); // q += numers
2027 :
2028 : // If q is non-negative, we have nothing to do
2029 : // If q is negative, we want to add either (2**shift)-1 if d is
2030 : // a power of 2, or (2**shift) if it is not a power of 2
2031 : uint32_t is_power_of_2 = (magic == 0);
2032 : int32x4_t q_sign = vshrq_n_s32(q, 31); // q_sign = q >> 31
2033 : int32x4_t mask = vdupq_n_s32(((uint32_t)1 << shift) - is_power_of_2);
2034 : q = vaddq_s32(q, vandq_s32(q_sign, mask)); // q = q + (q_sign & mask)
2035 : q = libdivide_s32_neon_sra(q, shift); // q >>= shift
2036 : q = vsubq_s32(veorq_s32(q, sign), sign); // q = (q ^ sign) - sign
2037 : return q;
2038 : }
2039 :
2040 : ////////// SINT64
2041 :
2042 : int64x2_t libdivide_s64_do_vec128(int64x2_t numers, const struct libdivide_s64_t *denom) {
2043 : uint8_t more = denom->more;
2044 : int64_t magic = denom->magic;
2045 : if (magic == 0) { // shift path
2046 : uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
2047 : uint64_t mask = ((uint64_t)1 << shift) - 1;
2048 : int64x2_t roundToZeroTweak = vdupq_n_s64(mask); // TODO: no need to sign extend
2049 : // q = numer + ((numer >> 63) & roundToZeroTweak);
2050 : int64x2_t q =
2051 : vaddq_s64(numers, vandq_s64(libdivide_s64_signbits(numers), roundToZeroTweak));
2052 : q = libdivide_s64_neon_sra(q, shift);
2053 : // q = (q ^ sign) - sign;
2054 : int64x2_t sign = vreinterpretq_s64_s8(vdupq_n_s8((int8_t)more >> 7));
2055 : q = vsubq_s64(veorq_s64(q, sign), sign);
2056 : return q;
2057 : } else {
2058 : int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic);
2059 : if (more & LIBDIVIDE_ADD_MARKER) {
2060 : // must be arithmetic shift
2061 : int64x2_t sign = vdupq_n_s64((int8_t)more >> 7); // TODO: no need to widen
2062 : // q += ((numer ^ sign) - sign);
2063 : q = vaddq_s64(q, vsubq_s64(veorq_s64(numers, sign), sign));
2064 : }
2065 : // q >>= shift
2066 : q = libdivide_s64_neon_sra(q, more & LIBDIVIDE_64_SHIFT_MASK);
2067 : q = vaddq_s64(
2068 : q, vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(q), 63))); // q += (q < 0)
2069 : return q;
2070 : }
2071 : }
2072 :
2073 : int64x2_t libdivide_s64_branchfree_do_vec128(
2074 : int64x2_t numers, const struct libdivide_s64_branchfree_t *denom) {
2075 : int64_t magic = denom->magic;
2076 : uint8_t more = denom->more;
2077 : uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
2078 : // must be arithmetic shift
2079 : int64x2_t sign = vdupq_n_s64((int8_t)more >> 7); // TODO: avoid sign extend
2080 :
2081 : // libdivide_mullhi_s64(numers, magic);
2082 : int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic);
2083 : q = vaddq_s64(q, numers); // q += numers
2084 :
2085 : // If q is non-negative, we have nothing to do.
2086 : // If q is negative, we want to add either (2**shift)-1 if d is
2087 : // a power of 2, or (2**shift) if it is not a power of 2.
2088 : uint32_t is_power_of_2 = (magic == 0);
2089 : int64x2_t q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63
2090 : int64x2_t mask = vdupq_n_s64(((uint64_t)1 << shift) - is_power_of_2);
2091 : q = vaddq_s64(q, vandq_s64(q_sign, mask)); // q = q + (q_sign & mask)
2092 : q = libdivide_s64_neon_sra(q, shift); // q >>= shift
2093 : q = vsubq_s64(veorq_s64(q, sign), sign); // q = (q ^ sign) - sign
2094 : return q;
2095 : }
2096 :
2097 : #endif
2098 :
2099 : #if defined(LIBDIVIDE_AVX512)
2100 :
2101 : static LIBDIVIDE_INLINE __m512i libdivide_u16_do_vec512(
2102 : __m512i numers, const struct libdivide_u16_t *denom);
2103 : static LIBDIVIDE_INLINE __m512i libdivide_s16_do_vec512(
2104 : __m512i numers, const struct libdivide_s16_t *denom);
2105 : static LIBDIVIDE_INLINE __m512i libdivide_u32_do_vec512(
2106 : __m512i numers, const struct libdivide_u32_t *denom);
2107 : static LIBDIVIDE_INLINE __m512i libdivide_s32_do_vec512(
2108 : __m512i numers, const struct libdivide_s32_t *denom);
2109 : static LIBDIVIDE_INLINE __m512i libdivide_u64_do_vec512(
2110 : __m512i numers, const struct libdivide_u64_t *denom);
2111 : static LIBDIVIDE_INLINE __m512i libdivide_s64_do_vec512(
2112 : __m512i numers, const struct libdivide_s64_t *denom);
2113 :
2114 : static LIBDIVIDE_INLINE __m512i libdivide_u16_branchfree_do_vec512(
2115 : __m512i numers, const struct libdivide_u16_branchfree_t *denom);
2116 : static LIBDIVIDE_INLINE __m512i libdivide_s16_branchfree_do_vec512(
2117 : __m512i numers, const struct libdivide_s16_branchfree_t *denom);
2118 : static LIBDIVIDE_INLINE __m512i libdivide_u32_branchfree_do_vec512(
2119 : __m512i numers, const struct libdivide_u32_branchfree_t *denom);
2120 : static LIBDIVIDE_INLINE __m512i libdivide_s32_branchfree_do_vec512(
2121 : __m512i numers, const struct libdivide_s32_branchfree_t *denom);
2122 : static LIBDIVIDE_INLINE __m512i libdivide_u64_branchfree_do_vec512(
2123 : __m512i numers, const struct libdivide_u64_branchfree_t *denom);
2124 : static LIBDIVIDE_INLINE __m512i libdivide_s64_branchfree_do_vec512(
2125 : __m512i numers, const struct libdivide_s64_branchfree_t *denom);
2126 :
2127 : //////// Internal Utility Functions
2128 :
2129 : static LIBDIVIDE_INLINE __m512i libdivide_s64_signbits_vec512(__m512i v) {
2131 : return _mm512_srai_epi64(v, 63);
2132 : }
2133 :
2134 : static LIBDIVIDE_INLINE __m512i libdivide_s64_shift_right_vec512(__m512i v, int amt) {
2135 : return _mm512_srai_epi64(v, amt);
2136 : }
2137 :
2138 : // Here, b is assumed to contain one 32-bit value repeated.
2139 : static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u32_vec512(__m512i a, __m512i b) {
2140 : __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epu32(a, b), 32);
2141 : __m512i a1X3X = _mm512_srli_epi64(a, 32);
2142 : __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
2143 : __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epu32(a1X3X, b), mask);
2144 : return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3);
2145 : }
2146 :
2147 : // b is one 32-bit value repeated.
2148 : static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s32_vec512(__m512i a, __m512i b) {
2149 : __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epi32(a, b), 32);
2150 : __m512i a1X3X = _mm512_srli_epi64(a, 32);
2151 : __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
2152 : __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epi32(a1X3X, b), mask);
2153 : return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3);
2154 : }
2155 :
2156 : // Here, y is assumed to contain one 64-bit value repeated.
2157 : static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y) {
2158 : // see m128i variant for comments.
2159 : __m512i x0y0 = _mm512_mul_epu32(x, y);
2160 : __m512i x0y0_hi = _mm512_srli_epi64(x0y0, 32);
2161 :
2162 : __m512i x1 = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1));
2163 : __m512i y1 = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1));
2164 :
2165 : __m512i x0y1 = _mm512_mul_epu32(x, y1);
2166 : __m512i x1y0 = _mm512_mul_epu32(x1, y);
2167 : __m512i x1y1 = _mm512_mul_epu32(x1, y1);
2168 :
2169 : __m512i mask = _mm512_set1_epi64(0xFFFFFFFF);
2170 : __m512i temp = _mm512_add_epi64(x1y0, x0y0_hi);
2171 : __m512i temp_lo = _mm512_and_si512(temp, mask);
2172 : __m512i temp_hi = _mm512_srli_epi64(temp, 32);
2173 :
2174 : temp_lo = _mm512_srli_epi64(_mm512_add_epi64(temp_lo, x0y1), 32);
2175 : temp_hi = _mm512_add_epi64(x1y1, temp_hi);
2176 : return _mm512_add_epi64(temp_lo, temp_hi);
2177 : }
2178 :
2179 : // y is one 64-bit value repeated.
2180 : static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y) {
2181 : __m512i p = libdivide_mullhi_u64_vec512(x, y);
2182 : __m512i t1 = _mm512_and_si512(libdivide_s64_signbits_vec512(x), y);
2183 : __m512i t2 = _mm512_and_si512(libdivide_s64_signbits_vec512(y), x);
2184 : p = _mm512_sub_epi64(p, t1);
2185 : p = _mm512_sub_epi64(p, t2);
2186 : return p;
2187 : }
2188 :
2189 : ////////// UINT16
2190 :
2191 : __m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom){
2192 : SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16)}
2193 :
2194 : __m512i libdivide_u16_branchfree_do_vec512(
2195 : __m512i numers, const struct libdivide_u16_branchfree_t *denom){
2196 : SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree)}
2197 :
2198 : ////////// UINT32
2199 :
2200 : __m512i libdivide_u32_do_vec512(__m512i numers, const struct libdivide_u32_t *denom) {
2201 : uint8_t more = denom->more;
2202 : if (!denom->magic) {
2203 : return _mm512_srli_epi32(numers, more);
2204 : } else {
2205 : __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic));
2206 : if (more & LIBDIVIDE_ADD_MARKER) {
2207 : // uint32_t t = ((numer - q) >> 1) + q;
2208 : // return t >> denom->shift;
2209 : uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
2210 : __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q);
2211 : return _mm512_srli_epi32(t, shift);
2212 : } else {
2213 : return _mm512_srli_epi32(q, more);
2214 : }
2215 : }
2216 : }
2217 :
2218 : __m512i libdivide_u32_branchfree_do_vec512(
2219 : __m512i numers, const struct libdivide_u32_branchfree_t *denom) {
2220 : __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic));
2221 : __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q);
2222 : return _mm512_srli_epi32(t, denom->more);
2223 : }
2224 :
2225 : ////////// UINT64
2226 :
2227 : __m512i libdivide_u64_do_vec512(__m512i numers, const struct libdivide_u64_t *denom) {
2228 : uint8_t more = denom->more;
2229 : if (!denom->magic) {
2230 : return _mm512_srli_epi64(numers, more);
2231 : } else {
2232 : __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic));
2233 : if (more & LIBDIVIDE_ADD_MARKER) {
2234 : // uint64_t t = ((numer - q) >> 1) + q;
2235 : // return t >> denom->shift;
2236 : uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
2237 : __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q);
2238 : return _mm512_srli_epi64(t, shift);
2239 : } else {
2240 : return _mm512_srli_epi64(q, more);
2241 : }
2242 : }
2243 : }
2244 :
2245 : __m512i libdivide_u64_branchfree_do_vec512(
2246 : __m512i numers, const struct libdivide_u64_branchfree_t *denom) {
2247 : __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic));
2248 : __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q);
2249 : return _mm512_srli_epi64(t, denom->more);
2250 : }
2251 :
2252 : ////////// SINT16
2253 :
2254 : __m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom){
2255 : SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16)}
2256 :
2257 : __m512i libdivide_s16_branchfree_do_vec512(
2258 : __m512i numers, const struct libdivide_s16_branchfree_t *denom){
2259 : SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree)}
2260 :
2261 : ////////// SINT32
2262 :
2263 : __m512i libdivide_s32_do_vec512(__m512i numers, const struct libdivide_s32_t *denom) {
2264 : uint8_t more = denom->more;
2265 : if (!denom->magic) {
2266 : uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
2267 : uint32_t mask = ((uint32_t)1 << shift) - 1;
2268 : __m512i roundToZeroTweak = _mm512_set1_epi32(mask);
2269 : // q = numer + ((numer >> 31) & roundToZeroTweak);
2270 : __m512i q = _mm512_add_epi32(
2271 : numers, _mm512_and_si512(_mm512_srai_epi32(numers, 31), roundToZeroTweak));
2272 : q = _mm512_srai_epi32(q, shift);
2273 : __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
2274 : // q = (q ^ sign) - sign;
2275 : q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign);
2276 : return q;
2277 : } else {
2278 : __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(denom->magic));
2279 : if (more & LIBDIVIDE_ADD_MARKER) {
2280 : // must be arithmetic shift
2281 : __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
2282 : // q += ((numer ^ sign) - sign);
2283 : q = _mm512_add_epi32(q, _mm512_sub_epi32(_mm512_xor_si512(numers, sign), sign));
2284 : }
2285 : // q >>= shift
2286 : q = _mm512_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
2287 : q = _mm512_add_epi32(q, _mm512_srli_epi32(q, 31)); // q += (q < 0)
2288 : return q;
2289 : }
2290 : }
2291 :
2292 : __m512i libdivide_s32_branchfree_do_vec512(
2293 : __m512i numers, const struct libdivide_s32_branchfree_t *denom) {
2294 : int32_t magic = denom->magic;
2295 : uint8_t more = denom->more;
2296 : uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
2297 : // must be arithmetic shift
2298 : __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
2299 : __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(magic));
2300 : q = _mm512_add_epi32(q, numers); // q += numers
2301 :
2302 : // If q is non-negative, we have nothing to do
2303 : // If q is negative, we want to add either (2**shift)-1 if d is
2304 : // a power of 2, or (2**shift) if it is not a power of 2
2305 : uint32_t is_power_of_2 = (magic == 0);
2306 : __m512i q_sign = _mm512_srai_epi32(q, 31); // q_sign = q >> 31
2307 : __m512i mask = _mm512_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);
2308 : q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask)
2309 : q = _mm512_srai_epi32(q, shift); // q >>= shift
2310 : q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign
2311 : return q;
2312 : }
2313 :
2314 : ////////// SINT64
2315 :
2316 : __m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *denom) {
2317 : uint8_t more = denom->more;
2318 : int64_t magic = denom->magic;
2319 : if (magic == 0) { // shift path
2320 : uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
2321 : uint64_t mask = ((uint64_t)1 << shift) - 1;
2322 : __m512i roundToZeroTweak = _mm512_set1_epi64(mask);
2323 : // q = numer + ((numer >> 63) & roundToZeroTweak);
2324 : __m512i q = _mm512_add_epi64(
2325 : numers, _mm512_and_si512(libdivide_s64_signbits_vec512(numers), roundToZeroTweak));
2326 : q = libdivide_s64_shift_right_vec512(q, shift);
2327 : __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
2328 : // q = (q ^ sign) - sign;
2329 : q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign);
2330 : return q;
2331 : } else {
2332 : __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic));
2333 : if (more & LIBDIVIDE_ADD_MARKER) {
2334 : // must be arithmetic shift
2335 : __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
2336 : // q += ((numer ^ sign) - sign);
2337 : q = _mm512_add_epi64(q, _mm512_sub_epi64(_mm512_xor_si512(numers, sign), sign));
2338 : }
2339 : // q >>= shift
2340 : q = libdivide_s64_shift_right_vec512(q, more & LIBDIVIDE_64_SHIFT_MASK);
2341 : q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63)); // q += (q < 0)
2342 : return q;
2343 : }
2344 : }
2345 :
2346 : __m512i libdivide_s64_branchfree_do_vec512(
2347 : __m512i numers, const struct libdivide_s64_branchfree_t *denom) {
2348 : int64_t magic = denom->magic;
2349 : uint8_t more = denom->more;
2350 : uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
2351 : // must be arithmetic shift
2352 : __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
2353 :
2354 : // libdivide_mullhi_s64(numers, magic);
2355 : __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic));
2356 : q = _mm512_add_epi64(q, numers); // q += numers
2357 :
2358 : // If q is non-negative, we have nothing to do.
2359 : // If q is negative, we want to add either (2**shift)-1 if d is
2360 : // a power of 2, or (2**shift) if it is not a power of 2.
2361 : uint32_t is_power_of_2 = (magic == 0);
2362 : __m512i q_sign = libdivide_s64_signbits_vec512(q); // q_sign = q >> 63
2363 : __m512i mask = _mm512_set1_epi64(((uint64_t)1 << shift) - is_power_of_2);
2364 : q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask)
2365 : q = libdivide_s64_shift_right_vec512(q, shift); // q >>= shift
2366 : q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign
2367 : return q;
2368 : }
2369 :
2370 : #endif
2371 :
2372 : #if defined(LIBDIVIDE_AVX2)
2373 :
2374 : static LIBDIVIDE_INLINE __m256i libdivide_u16_do_vec256(
2375 : __m256i numers, const struct libdivide_u16_t *denom);
2376 : static LIBDIVIDE_INLINE __m256i libdivide_s16_do_vec256(
2377 : __m256i numers, const struct libdivide_s16_t *denom);
2378 : static LIBDIVIDE_INLINE __m256i libdivide_u32_do_vec256(
2379 : __m256i numers, const struct libdivide_u32_t *denom);
2380 : static LIBDIVIDE_INLINE __m256i libdivide_s32_do_vec256(
2381 : __m256i numers, const struct libdivide_s32_t *denom);
2382 : static LIBDIVIDE_INLINE __m256i libdivide_u64_do_vec256(
2383 : __m256i numers, const struct libdivide_u64_t *denom);
2384 : static LIBDIVIDE_INLINE __m256i libdivide_s64_do_vec256(
2385 : __m256i numers, const struct libdivide_s64_t *denom);
2386 :
2387 : static LIBDIVIDE_INLINE __m256i libdivide_u16_branchfree_do_vec256(
2388 : __m256i numers, const struct libdivide_u16_branchfree_t *denom);
2389 : static LIBDIVIDE_INLINE __m256i libdivide_s16_branchfree_do_vec256(
2390 : __m256i numers, const struct libdivide_s16_branchfree_t *denom);
2391 : static LIBDIVIDE_INLINE __m256i libdivide_u32_branchfree_do_vec256(
2392 : __m256i numers, const struct libdivide_u32_branchfree_t *denom);
2393 : static LIBDIVIDE_INLINE __m256i libdivide_s32_branchfree_do_vec256(
2394 : __m256i numers, const struct libdivide_s32_branchfree_t *denom);
2395 : static LIBDIVIDE_INLINE __m256i libdivide_u64_branchfree_do_vec256(
2396 : __m256i numers, const struct libdivide_u64_branchfree_t *denom);
2397 : static LIBDIVIDE_INLINE __m256i libdivide_s64_branchfree_do_vec256(
2398 : __m256i numers, const struct libdivide_s64_branchfree_t *denom);
2399 :
2400 : //////// Internal Utility Functions
2401 :
2402 : // Implementation of _mm256_srai_epi64(v, 63) (from AVX512).
2403 : static LIBDIVIDE_INLINE __m256i libdivide_s64_signbits_vec256(__m256i v) {
2404 : __m256i hiBitsDuped = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
2405 : __m256i signBits = _mm256_srai_epi32(hiBitsDuped, 31);
2406 : return signBits;
2407 : }
2408 :
2409 : // Implementation of _mm256_srai_epi64 (from AVX512).
2410 : static LIBDIVIDE_INLINE __m256i libdivide_s64_shift_right_vec256(__m256i v, int amt) {
2411 : const int b = 64 - amt;
2412 : __m256i m = _mm256_set1_epi64x((uint64_t)1 << (b - 1));
2413 : __m256i x = _mm256_srli_epi64(v, amt);
2414 : __m256i result = _mm256_sub_epi64(_mm256_xor_si256(x, m), m);
2415 : return result;
2416 : }
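
// Editorial note: after the logical shift, each lane's sign bit sits at
// position b - 1 = 63 - amt. With m = 1 << (b - 1), (x ^ m) - m leaves
// lanes with that bit clear unchanged and subtracts 2**b from lanes with
// it set, reproducing sign extension. E.g. amt = 2 on a lane holding -8:
// srli gives 0x3FFFFFFFFFFFFFFE, and xor/sub with m = 1 << 61 yields -2.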
2417 :
2418 : // Here, b is assumed to contain one 32-bit value repeated.
2419 : static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u32_vec256(__m256i a, __m256i b) {
2420 : __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epu32(a, b), 32);
2421 : __m256i a1X3X = _mm256_srli_epi64(a, 32);
2422 : __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
2423 : __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epu32(a1X3X, b), mask);
2424 : return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3);
2425 : }
2426 :
2427 : // b is one 32-bit value repeated.
2428 : static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s32_vec256(__m256i a, __m256i b) {
2429 : __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epi32(a, b), 32);
2430 : __m256i a1X3X = _mm256_srli_epi64(a, 32);
2431 : __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
2432 : __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epi32(a1X3X, b), mask);
2433 : return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3);
2434 : }
2435 :
2436 : // Here, y is assumed to contain one 64-bit value repeated.
2437 : static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y) {
2438 : // see m128i variant for comments.
2439 : __m256i x0y0 = _mm256_mul_epu32(x, y);
2440 : __m256i x0y0_hi = _mm256_srli_epi64(x0y0, 32);
2441 :
2442 : __m256i x1 = _mm256_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1));
2443 : __m256i y1 = _mm256_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1));
2444 :
2445 : __m256i x0y1 = _mm256_mul_epu32(x, y1);
2446 : __m256i x1y0 = _mm256_mul_epu32(x1, y);
2447 : __m256i x1y1 = _mm256_mul_epu32(x1, y1);
2448 :
2449 : __m256i mask = _mm256_set1_epi64x(0xFFFFFFFF);
2450 : __m256i temp = _mm256_add_epi64(x1y0, x0y0_hi);
2451 : __m256i temp_lo = _mm256_and_si256(temp, mask);
2452 : __m256i temp_hi = _mm256_srli_epi64(temp, 32);
2453 :
2454 : temp_lo = _mm256_srli_epi64(_mm256_add_epi64(temp_lo, x0y1), 32);
2455 : temp_hi = _mm256_add_epi64(x1y1, temp_hi);
2456 : return _mm256_add_epi64(temp_lo, temp_hi);
2457 : }
2458 :
2459 : // y is one 64-bit value repeated.
2460 : static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y) {
2461 : __m256i p = libdivide_mullhi_u64_vec256(x, y);
2462 : __m256i t1 = _mm256_and_si256(libdivide_s64_signbits_vec256(x), y);
2463 : __m256i t2 = _mm256_and_si256(libdivide_s64_signbits_vec256(y), x);
2464 : p = _mm256_sub_epi64(p, t1);
2465 : p = _mm256_sub_epi64(p, t2);
2466 : return p;
2467 : }
2468 :
2469 : ////////// UINT16
2470 :
2471 : __m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) {
2472 : uint8_t more = denom->more;
2473 : if (!denom->magic) {
2474 : return _mm256_srli_epi16(numers, more);
2475 : } else {
2476 : __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
2477 : if (more & LIBDIVIDE_ADD_MARKER) {
2478 : __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);
2479 : return _mm256_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
2480 : } else {
2481 : return _mm256_srli_epi16(q, more);
2482 : }
2483 : }
2484 : }
2485 :
2486 : __m256i libdivide_u16_branchfree_do_vec256(
2487 : __m256i numers, const struct libdivide_u16_branchfree_t *denom) {
2488 : __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
2489 : __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);
2490 : return _mm256_srli_epi16(t, denom->more);
2491 : }
2492 :
2493 : ////////// UINT32
2494 :
2495 : __m256i libdivide_u32_do_vec256(__m256i numers, const struct libdivide_u32_t *denom) {
2496 : uint8_t more = denom->more;
2497 : if (!denom->magic) {
2498 : return _mm256_srli_epi32(numers, more);
2499 : } else {
2500 : __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic));
2501 : if (more & LIBDIVIDE_ADD_MARKER) {
2502 : // uint32_t t = ((numer - q) >> 1) + q;
2503 : // return t >> denom->shift;
2504 : uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
2505 : __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q);
2506 : return _mm256_srli_epi32(t, shift);
2507 : } else {
2508 : return _mm256_srli_epi32(q, more);
2509 : }
2510 : }
2511 : }
2512 :
2513 : __m256i libdivide_u32_branchfree_do_vec256(
2514 : __m256i numers, const struct libdivide_u32_branchfree_t *denom) {
2515 : __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic));
2516 : __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q);
2517 : return _mm256_srli_epi32(t, denom->more);
2518 : }
2519 :
2520 : ////////// UINT64
2521 :
2522 : __m256i libdivide_u64_do_vec256(__m256i numers, const struct libdivide_u64_t *denom) {
2523 : uint8_t more = denom->more;
2524 : if (!denom->magic) {
2525 : return _mm256_srli_epi64(numers, more);
2526 : } else {
2527 : __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic));
2528 : if (more & LIBDIVIDE_ADD_MARKER) {
2529 : // uint64_t t = ((numer - q) >> 1) + q;
2530 : // return t >> denom->shift;
2531 : uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
2532 : __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q);
2533 : return _mm256_srli_epi64(t, shift);
2534 : } else {
2535 : return _mm256_srli_epi64(q, more);
2536 : }
2537 : }
2538 : }
2539 :
2540 : __m256i libdivide_u64_branchfree_do_vec256(
2541 : __m256i numers, const struct libdivide_u64_branchfree_t *denom) {
2542 : __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic));
2543 : __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q);
2544 : return _mm256_srli_epi64(t, denom->more);
2545 : }
2546 :
2547 : ////////// SINT16
2548 :
2549 : __m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) {
2550 : uint8_t more = denom->more;
2551 : if (!denom->magic) {
2552 : uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
2553 : uint16_t mask = ((uint16_t)1 << shift) - 1;
2554 : __m256i roundToZeroTweak = _mm256_set1_epi16(mask);
2555 : // q = numer + ((numer >> 15) & roundToZeroTweak);
2556 : __m256i q = _mm256_add_epi16(
2557 : numers, _mm256_and_si256(_mm256_srai_epi16(numers, 15), roundToZeroTweak));
2558 : q = _mm256_srai_epi16(q, shift);
2559 : __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
2560 : // q = (q ^ sign) - sign;
2561 : q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign);
2562 : return q;
2563 : } else {
2564 : __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(denom->magic));
2565 : if (more & LIBDIVIDE_ADD_MARKER) {
2566 : // must be arithmetic shift
2567 : __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
2568 : // q += ((numer ^ sign) - sign);
2569 : q = _mm256_add_epi16(q, _mm256_sub_epi16(_mm256_xor_si256(numers, sign), sign));
2570 : }
2571 : // q >>= shift
2572 : q = _mm256_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
2573 : q = _mm256_add_epi16(q, _mm256_srli_epi16(q, 15)); // q += (q < 0)
2574 : return q;
2575 : }
2576 : }
2577 :
2578 : __m256i libdivide_s16_branchfree_do_vec256(
2579 : __m256i numers, const struct libdivide_s16_branchfree_t *denom) {
2580 : int16_t magic = denom->magic;
2581 : uint8_t more = denom->more;
2582 : uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
2583 : // must be arithmetic shift
2584 : __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
2585 : __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(magic));
2586 : q = _mm256_add_epi16(q, numers); // q += numers
2587 :
2588 : // If q is non-negative, we have nothing to do
2589 : // If q is negative, we want to add either (2**shift)-1 if d is
2590 : // a power of 2, or (2**shift) if it is not a power of 2
2591 : uint16_t is_power_of_2 = (magic == 0);
2592 : __m256i q_sign = _mm256_srai_epi16(q, 15); // q_sign = q >> 15
2593 : __m256i mask = _mm256_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
2594 : q = _mm256_add_epi16(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask)
2595 : q = _mm256_srai_epi16(q, shift); // q >>= shift
2596 : q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign
2597 : return q;
2598 : }
2599 :
2600 : ////////// SINT32
2601 :
2602 : __m256i libdivide_s32_do_vec256(__m256i numers, const struct libdivide_s32_t *denom) {
2603 : uint8_t more = denom->more;
2604 : if (!denom->magic) {
2605 : uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
2606 : uint32_t mask = ((uint32_t)1 << shift) - 1;
2607 : __m256i roundToZeroTweak = _mm256_set1_epi32(mask);
2608 : // q = numer + ((numer >> 31) & roundToZeroTweak);
2609 : __m256i q = _mm256_add_epi32(
2610 : numers, _mm256_and_si256(_mm256_srai_epi32(numers, 31), roundToZeroTweak));
2611 : q = _mm256_srai_epi32(q, shift);
2612 : __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
2613 : // q = (q ^ sign) - sign;
2614 : q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign);
2615 : return q;
2616 : } else {
2617 : __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(denom->magic));
2618 : if (more & LIBDIVIDE_ADD_MARKER) {
2619 : // must be arithmetic shift
2620 : __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
2621 : // q += ((numer ^ sign) - sign);
2622 : q = _mm256_add_epi32(q, _mm256_sub_epi32(_mm256_xor_si256(numers, sign), sign));
2623 : }
2624 : // q >>= shift
2625 : q = _mm256_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
2626 : q = _mm256_add_epi32(q, _mm256_srli_epi32(q, 31)); // q += (q < 0)
2627 : return q;
2628 : }
2629 : }
2630 :
2631 : __m256i libdivide_s32_branchfree_do_vec256(
2632 : __m256i numers, const struct libdivide_s32_branchfree_t *denom) {
2633 : int32_t magic = denom->magic;
2634 : uint8_t more = denom->more;
2635 : uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
2636 : // must be arithmetic shift
2637 : __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
2638 : __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(magic));
2639 : q = _mm256_add_epi32(q, numers); // q += numers
2640 :
2641 : // If q is non-negative, we have nothing to do
2642 : // If q is negative, we want to add either (2**shift)-1 if d is
2643 : // a power of 2, or (2**shift) if it is not a power of 2
2644 : uint32_t is_power_of_2 = (magic == 0);
2645 : __m256i q_sign = _mm256_srai_epi32(q, 31); // q_sign = q >> 31
2646 : __m256i mask = _mm256_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);
2647 : q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask)
2648 : q = _mm256_srai_epi32(q, shift); // q >>= shift
2649 : q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign
2650 : return q;
2651 : }
2652 :
2653 : ////////// SINT64
2654 :
2655 : __m256i libdivide_s64_do_vec256(__m256i numers, const struct libdivide_s64_t *denom) {
2656 : uint8_t more = denom->more;
2657 : int64_t magic = denom->magic;
2658 : if (magic == 0) { // shift path
2659 : uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
2660 : uint64_t mask = ((uint64_t)1 << shift) - 1;
2661 : __m256i roundToZeroTweak = _mm256_set1_epi64x(mask);
2662 : // q = numer + ((numer >> 63) & roundToZeroTweak);
2663 : __m256i q = _mm256_add_epi64(
2664 : numers, _mm256_and_si256(libdivide_s64_signbits_vec256(numers), roundToZeroTweak));
2665 : q = libdivide_s64_shift_right_vec256(q, shift);
2666 : __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
2667 : // q = (q ^ sign) - sign;
2668 : q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign);
2669 : return q;
2670 : } else {
2671 : __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic));
2672 : if (more & LIBDIVIDE_ADD_MARKER) {
2673 : // must be arithmetic shift
2674 : __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
2675 : // q += ((numer ^ sign) - sign);
2676 : q = _mm256_add_epi64(q, _mm256_sub_epi64(_mm256_xor_si256(numers, sign), sign));
2677 : }
2678 : // q >>= shift
2679 : q = libdivide_s64_shift_right_vec256(q, more & LIBDIVIDE_64_SHIFT_MASK);
2680 : q = _mm256_add_epi64(q, _mm256_srli_epi64(q, 63)); // q += (q < 0)
2681 : return q;
2682 : }
2683 : }
2684 :
2685 : __m256i libdivide_s64_branchfree_do_vec256(
2686 : __m256i numers, const struct libdivide_s64_branchfree_t *denom) {
2687 : int64_t magic = denom->magic;
2688 : uint8_t more = denom->more;
2689 : uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
2690 : // must be arithmetic shift
2691 : __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
2692 :
2693 : // libdivide_mullhi_s64(numers, magic);
2694 : __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic));
2695 : q = _mm256_add_epi64(q, numers); // q += numers
2696 :
2697 : // If q is non-negative, we have nothing to do.
2698 : // If q is negative, we want to add either (2**shift)-1 if d is
2699 : // a power of 2, or (2**shift) if it is not a power of 2.
2700 : uint32_t is_power_of_2 = (magic == 0);
2701 : __m256i q_sign = libdivide_s64_signbits_vec256(q); // q_sign = q >> 63
2702 : __m256i mask = _mm256_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2);
2703 : q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask)
2704 : q = libdivide_s64_shift_right_vec256(q, shift); // q >>= shift
2705 : q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign
2706 : return q;
2707 : }
2708 :
2709 : #endif
2710 :
2711 : #if defined(LIBDIVIDE_SSE2)
2712 :
2713 : static LIBDIVIDE_INLINE __m128i libdivide_u16_do_vec128(
2714 : __m128i numers, const struct libdivide_u16_t *denom);
2715 : static LIBDIVIDE_INLINE __m128i libdivide_s16_do_vec128(
2716 : __m128i numers, const struct libdivide_s16_t *denom);
2717 : static LIBDIVIDE_INLINE __m128i libdivide_u32_do_vec128(
2718 : __m128i numers, const struct libdivide_u32_t *denom);
2719 : static LIBDIVIDE_INLINE __m128i libdivide_s32_do_vec128(
2720 : __m128i numers, const struct libdivide_s32_t *denom);
2721 : static LIBDIVIDE_INLINE __m128i libdivide_u64_do_vec128(
2722 : __m128i numers, const struct libdivide_u64_t *denom);
2723 : static LIBDIVIDE_INLINE __m128i libdivide_s64_do_vec128(
2724 : __m128i numers, const struct libdivide_s64_t *denom);
2725 :
2726 : static LIBDIVIDE_INLINE __m128i libdivide_u16_branchfree_do_vec128(
2727 : __m128i numers, const struct libdivide_u16_branchfree_t *denom);
2728 : static LIBDIVIDE_INLINE __m128i libdivide_s16_branchfree_do_vec128(
2729 : __m128i numers, const struct libdivide_s16_branchfree_t *denom);
2730 : static LIBDIVIDE_INLINE __m128i libdivide_u32_branchfree_do_vec128(
2731 : __m128i numers, const struct libdivide_u32_branchfree_t *denom);
2732 : static LIBDIVIDE_INLINE __m128i libdivide_s32_branchfree_do_vec128(
2733 : __m128i numers, const struct libdivide_s32_branchfree_t *denom);
2734 : static LIBDIVIDE_INLINE __m128i libdivide_u64_branchfree_do_vec128(
2735 : __m128i numers, const struct libdivide_u64_branchfree_t *denom);
2736 : static LIBDIVIDE_INLINE __m128i libdivide_s64_branchfree_do_vec128(
2737 : __m128i numers, const struct libdivide_s64_branchfree_t *denom);
2738 :
2739 : //////// Internal Utility Functions
2740 :
2741 : // Implementation of _mm_srai_epi64(v, 63) (from AVX512).
2742 : static LIBDIVIDE_INLINE __m128i libdivide_s64_signbits_vec128(__m128i v) {
2743 : __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
2744 : __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31);
2745 : return signBits;
2746 : }
2747 :
2748 : // Implementation of _mm_srai_epi64 (from AVX512).
2749 : static LIBDIVIDE_INLINE __m128i libdivide_s64_shift_right_vec128(__m128i v, int amt) {
2750 : const int b = 64 - amt;
2751 : __m128i m = _mm_set1_epi64x((uint64_t)1 << (b - 1));
2752 : __m128i x = _mm_srli_epi64(v, amt);
2753 : __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m);
2754 : return result;
2755 : }
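// Editorial note: the xor/sub pair above is the classic trick for building an
// arithmetic shift out of a logical one. After the logical shift, the original
// sign bit sits at position 63 - amt; m has exactly that bit set, so when the
// bit is 1, (x ^ m) clears it and the subtraction of m borrows through all the
// bits above it, sign-extending the result. A scalar sketch (hypothetical
// helper name):
//
//   static inline int64_t example_sar64(int64_t v, int amt) {
//       uint64_t m = (uint64_t)1 << (63 - amt); // sign bit position after shifting
//       uint64_t x = (uint64_t)v >> amt;        // logical shift
//       return (int64_t)((x ^ m) - m);          // sign-extend
//   }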
2756 :
2757 : // Here, b is assumed to contain one 32-bit value repeated.
2758 : static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u32_vec128(__m128i a, __m128i b) {
2759 2436 : __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32);
2760 1218 : __m128i a1X3X = _mm_srli_epi64(a, 32);
2761 1218 : __m128i mask = _mm_set_epi32(-1, 0, -1, 0);
2762 2436 : __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), mask);
2763 1218 : return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3);
2764 : }
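// Editorial note: per 32-bit lane, the routine above computes the high half of
// the unsigned 32x32 product. _mm_mul_epu32 only multiplies the even lanes
// (0 and 2), so the odd lanes (1 and 3) are shifted down first and the two
// half-results are recombined through the mask. Scalar model of one lane
// (hypothetical helper name):
//
//   static inline uint32_t example_mullhi_u32(uint32_t a, uint32_t b) {
//       return (uint32_t)(((uint64_t)a * b) >> 32);
//   }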
2765 :
2766 : // SSE2 has no signed 32-bit high-multiply instruction, but we can derive the
2767 : // signed high half from the unsigned one pretty efficiently. Again, b is just
2768 : // a 32-bit value repeated four times.
2769 : static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s32_vec128(__m128i a, __m128i b) {
2770 : __m128i p = libdivide_mullhi_u32_vec128(a, b);
2771 : // t1 = (a >> 31) & y, arithmetic shift
2772 : __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b);
2773 : __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a);
2774 : p = _mm_sub_epi32(p, t1);
2775 : p = _mm_sub_epi32(p, t2);
2776 : return p;
2777 : }
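// Editorial note: the two corrections implement the identity
// smulhi(a, b) = umulhi(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0):
// reading a negative lane as unsigned overstates it by 2^32, and that excess
// contributes exactly the other operand to the high word. Scalar sketch
// (hypothetical helper name):
//
//   static inline int32_t example_mullhi_s32(int32_t a, int32_t b) {
//       uint32_t p  = (uint32_t)(((uint64_t)(uint32_t)a * (uint32_t)b) >> 32);
//       uint32_t t1 = (a < 0) ? (uint32_t)b : 0;
//       uint32_t t2 = (b < 0) ? (uint32_t)a : 0;
//       return (int32_t)(p - t1 - t2);
//   }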
2778 :
2779 : // Here, y is assumed to contain one 64-bit value repeated.
2780 : static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y) {
2781 : // The full 128-bit product is:
2782 : // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64)
2783 : // Note x0, y0, x1, y1 are all conceptually uint32; each partial product is 32x32->64.
2784 :
2785 : // Compute x0*y0.
2786 : // Note x1, y1 are ignored by mul_epu32.
2787 : __m128i x0y0 = _mm_mul_epu32(x, y);
2788 : __m128i x0y0_hi = _mm_srli_epi64(x0y0, 32);
2789 :
2790 : // Get x1, y1 in the low bits.
2791 : // We could shuffle or right shift. Shuffles are preferred as they preserve
2792 : // the source register for the next computation.
2793 : __m128i x1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1));
2794 : __m128i y1 = _mm_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1));
2795 :
2796 : // No need to mask off top 32 bits for mul_epu32.
2797 : __m128i x0y1 = _mm_mul_epu32(x, y1);
2798 : __m128i x1y0 = _mm_mul_epu32(x1, y);
2799 : __m128i x1y1 = _mm_mul_epu32(x1, y1);
2800 :
2801 : // Mask here selects low bits only.
2802 : __m128i mask = _mm_set1_epi64x(0xFFFFFFFF);
2803 : __m128i temp = _mm_add_epi64(x1y0, x0y0_hi);
2804 : __m128i temp_lo = _mm_and_si128(temp, mask);
2805 : __m128i temp_hi = _mm_srli_epi64(temp, 32);
2806 :
2807 : temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32);
2808 : temp_hi = _mm_add_epi64(x1y1, temp_hi);
2809 : return _mm_add_epi64(temp_lo, temp_hi);
2810 : }
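// Editorial note: a scalar restatement of the schoolbook decomposition above,
// with the same temp_lo/temp_hi carry handling (hypothetical helper name):
//
//   static inline uint64_t example_mullhi_u64(uint64_t x, uint64_t y) {
//       uint64_t x0 = (uint32_t)x, x1 = x >> 32;
//       uint64_t y0 = (uint32_t)y, y1 = y >> 32;
//       uint64_t x0y0_hi = (x0 * y0) >> 32;
//       uint64_t temp    = x1 * y0 + x0y0_hi;  // cannot overflow 64 bits
//       uint64_t temp_lo = (uint32_t)temp;
//       uint64_t temp_hi = temp >> 32;
//       return x1 * y1 + temp_hi + ((temp_lo + x0 * y1) >> 32);
//   }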
2811 :
2812 : // y is one 64-bit value repeated.
2813 : static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y) {
2814 : __m128i p = libdivide_mullhi_u64_vec128(x, y);
2815 : __m128i t1 = _mm_and_si128(libdivide_s64_signbits_vec128(x), y);
2816 : __m128i t2 = _mm_and_si128(libdivide_s64_signbits_vec128(y), x);
2817 : p = _mm_sub_epi64(p, t1);
2818 : p = _mm_sub_epi64(p, t2);
2819 : return p;
2820 : }
2821 :
2822 : ////////// UINT16
2823 :
2824 : __m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) {
2825 10030 : uint8_t more = denom->more;
2826 10030 : if (!denom->magic) {
2827 36 : return _mm_srli_epi16(numers, more);
2828 : } else {
2829 20024 : __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
2830 10012 : if (more & LIBDIVIDE_ADD_MARKER) {
2831 12 : __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
2832 12 : return _mm_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
2833 : } else {
2834 20012 : return _mm_srli_epi16(q, more);
2835 : }
2836 : }
2837 : }
2838 :
2839 : __m128i libdivide_u16_branchfree_do_vec128(
2840 : __m128i numers, const struct libdivide_u16_branchfree_t *denom) {
2841 : __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
2842 : __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
2843 : return _mm_srli_epi16(t, denom->more);
2844 : }
2845 :
2846 : ////////// UINT32
2847 :
2848 : __m128i libdivide_u32_do_vec128(__m128i numers, const struct libdivide_u32_t *denom) {
2849 1272 : uint8_t more = denom->more;
2850 1272 : if (!denom->magic) {
2851 108 : return _mm_srli_epi32(numers, more);
2852 : } else {
2853 2436 : __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic));
2854 1218 : if (more & LIBDIVIDE_ADD_MARKER) {
2855 : // uint32_t t = ((numer - q) >> 1) + q;
2856 : // return t >> denom->shift;
2857 0 : uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
2858 0 : __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
2859 0 : return _mm_srli_epi32(t, shift);
2860 : } else {
2861 2436 : return _mm_srli_epi32(q, more);
2862 : }
2863 : }
2864 : }
2865 :
2866 : __m128i libdivide_u32_branchfree_do_vec128(
2867 : __m128i numers, const struct libdivide_u32_branchfree_t *denom) {
2868 : __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic));
2869 : __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
2870 : return _mm_srli_epi32(t, denom->more);
2871 : }
2872 :
2873 : ////////// UINT64
2874 :
2875 : __m128i libdivide_u64_do_vec128(__m128i numers, const struct libdivide_u64_t *denom) {
2876 : uint8_t more = denom->more;
2877 : if (!denom->magic) {
2878 : return _mm_srli_epi64(numers, more);
2879 : } else {
2880 : __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic));
2881 : if (more & LIBDIVIDE_ADD_MARKER) {
2882 : // uint32_t t = ((numer - q) >> 1) + q;
2883 : // return t >> denom->shift;
2884 : uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
2885 : __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
2886 : return _mm_srli_epi64(t, shift);
2887 : } else {
2888 : return _mm_srli_epi64(q, more);
2889 : }
2890 : }
2891 : }
2892 :
2893 : __m128i libdivide_u64_branchfree_do_vec128(
2894 : __m128i numers, const struct libdivide_u64_branchfree_t *denom) {
2895 : __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic));
2896 : __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
2897 : return _mm_srli_epi64(t, denom->more);
2898 : }
2899 :
2900 : ////////// SINT16
2901 :
2902 : __m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) {
2903 : uint8_t more = denom->more;
2904 : if (!denom->magic) {
2905 : uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
2906 : uint16_t mask = ((uint16_t)1 << shift) - 1;
2907 : __m128i roundToZeroTweak = _mm_set1_epi16(mask);
2908 : // q = numer + ((numer >> 15) & roundToZeroTweak);
2909 : __m128i q =
2910 : _mm_add_epi16(numers, _mm_and_si128(_mm_srai_epi16(numers, 15), roundToZeroTweak));
2911 : q = _mm_srai_epi16(q, shift);
2912 : __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
2913 : // q = (q ^ sign) - sign;
2914 : q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign);
2915 : return q;
2916 : } else {
2917 : __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(denom->magic));
2918 : if (more & LIBDIVIDE_ADD_MARKER) {
2919 : // must be arithmetic shift
2920 : __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
2921 : // q += ((numer ^ sign) - sign);
2922 : q = _mm_add_epi16(q, _mm_sub_epi16(_mm_xor_si128(numers, sign), sign));
2923 : }
2924 : // q >>= shift
2925 : q = _mm_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
2926 : q = _mm_add_epi16(q, _mm_srli_epi16(q, 15)); // q += (q < 0)
2927 : return q;
2928 : }
2929 : }
2930 :
2931 : __m128i libdivide_s16_branchfree_do_vec128(
2932 : __m128i numers, const struct libdivide_s16_branchfree_t *denom) {
2933 : int16_t magic = denom->magic;
2934 : uint8_t more = denom->more;
2935 : uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
2936 : // must be arithmetic shift
2937 : __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
2938 : __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(magic));
2939 : q = _mm_add_epi16(q, numers); // q += numers
2940 :
2941 : // If q is non-negative, we have nothing to do
2942 : // If q is negative, we want to add either (2**shift)-1 if d is
2943 : // a power of 2, or (2**shift) if it is not a power of 2
2944 : uint16_t is_power_of_2 = (magic == 0);
2945 : __m128i q_sign = _mm_srai_epi16(q, 15); // q_sign = q >> 15
2946 : __m128i mask = _mm_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
2947 : q = _mm_add_epi16(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask)
2948 : q = _mm_srai_epi16(q, shift); // q >>= shift
2949 : q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign
2950 : return q;
2951 : }
2952 :
2953 : ////////// SINT32
2954 :
2955 : __m128i libdivide_s32_do_vec128(__m128i numers, const struct libdivide_s32_t *denom) {
2956 : uint8_t more = denom->more;
2957 : if (!denom->magic) {
2958 : uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
2959 : uint32_t mask = ((uint32_t)1 << shift) - 1;
2960 : __m128i roundToZeroTweak = _mm_set1_epi32(mask);
2961 : // q = numer + ((numer >> 31) & roundToZeroTweak);
2962 : __m128i q =
2963 : _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak));
2964 : q = _mm_srai_epi32(q, shift);
2965 : __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
2966 : // q = (q ^ sign) - sign;
2967 : q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign);
2968 : return q;
2969 : } else {
2970 : __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(denom->magic));
2971 : if (more & LIBDIVIDE_ADD_MARKER) {
2972 : // must be arithmetic shift
2973 : __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
2974 : // q += ((numer ^ sign) - sign);
2975 : q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign));
2976 : }
2977 : // q >>= shift
2978 : q = _mm_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
2979 : q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0)
2980 : return q;
2981 : }
2982 : }
2983 :
2984 : __m128i libdivide_s32_branchfree_do_vec128(
2985 : __m128i numers, const struct libdivide_s32_branchfree_t *denom) {
2986 : int32_t magic = denom->magic;
2987 : uint8_t more = denom->more;
2988 : uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
2989 : // must be arithmetic shift
2990 : __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
2991 : __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(magic));
2992 : q = _mm_add_epi32(q, numers); // q += numers
2993 :
2994 : // If q is non-negative, we have nothing to do
2995 : // If q is negative, we want to add either (2**shift)-1 if d is
2996 : // a power of 2, or (2**shift) if it is not a power of 2
2997 : uint32_t is_power_of_2 = (magic == 0);
2998 : __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31
2999 : __m128i mask = _mm_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);
3000 : q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask)
3001 : q = _mm_srai_epi32(q, shift); // q >>= shift
3002 : q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign
3003 : return q;
3004 : }
3005 :
3006 : ////////// SINT64
3007 :
3008 : __m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *denom) {
3009 : uint8_t more = denom->more;
3010 : int64_t magic = denom->magic;
3011 : if (magic == 0) { // shift path
3012 : uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
3013 : uint64_t mask = ((uint64_t)1 << shift) - 1;
3014 : __m128i roundToZeroTweak = _mm_set1_epi64x(mask);
3015 : // q = numer + ((numer >> 63) & roundToZeroTweak);
3016 : __m128i q = _mm_add_epi64(
3017 : numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak));
3018 : q = libdivide_s64_shift_right_vec128(q, shift);
3019 : __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
3020 : // q = (q ^ sign) - sign;
3021 : q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign);
3022 : return q;
3023 : } else {
3024 : __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic));
3025 : if (more & LIBDIVIDE_ADD_MARKER) {
3026 : // must be arithmetic shift
3027 : __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
3028 : // q += ((numer ^ sign) - sign);
3029 : q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign));
3030 : }
3031 : // q >>= shift
3032 : q = libdivide_s64_shift_right_vec128(q, more & LIBDIVIDE_64_SHIFT_MASK);
3033 : q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0)
3034 : return q;
3035 : }
3036 : }
3037 :
3038 : __m128i libdivide_s64_branchfree_do_vec128(
3039 : __m128i numers, const struct libdivide_s64_branchfree_t *denom) {
3040 : int64_t magic = denom->magic;
3041 : uint8_t more = denom->more;
3042 : uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
3043 : // must be arithmetic shift
3044 : __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
3045 :
3046 : // libdivide_mullhi_s64(numers, magic);
3047 : __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic));
3048 : q = _mm_add_epi64(q, numers); // q += numers
3049 :
3050 : // If q is non-negative, we have nothing to do.
3051 : // If q is negative, we want to add either (2**shift)-1 if d is
3052 : // a power of 2, or (2**shift) if it is not a power of 2.
3053 : uint32_t is_power_of_2 = (magic == 0);
3054 : __m128i q_sign = libdivide_s64_signbits_vec128(q); // q_sign = q >> 63
3055 : __m128i mask = _mm_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2);
3056 : q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask)
3057 : q = libdivide_s64_shift_right_vec128(q, shift); // q >>= shift
3058 : q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign
3059 : return q;
3060 : }
3061 :
3062 : #endif
3063 :
3064 : ////////// C++ stuff
3065 :
3066 : #ifdef __cplusplus
3067 :
3068 : enum Branching {
3069 : BRANCHFULL, // use branching algorithms
3070 : BRANCHFREE // use branchfree algorithms
3071 : };
3072 :
3073 : namespace detail {
3074 : enum Signedness {
3075 : SIGNED,
3076 : UNSIGNED,
3077 : };
3078 :
3079 : #if defined(LIBDIVIDE_NEON)
3080 : // Helper to deduce the NEON vector type for an integral type.
3081 : template <int _WIDTH, Signedness _SIGN>
3082 : struct NeonVec {};
3083 :
3084 : template <>
3085 : struct NeonVec<16, UNSIGNED> {
3086 : typedef uint16x8_t type;
3087 : };
3088 :
3089 : template <>
3090 : struct NeonVec<16, SIGNED> {
3091 : typedef int16x8_t type;
3092 : };
3093 :
3094 : template <>
3095 : struct NeonVec<32, UNSIGNED> {
3096 : typedef uint32x4_t type;
3097 : };
3098 :
3099 : template <>
3100 : struct NeonVec<32, SIGNED> {
3101 : typedef int32x4_t type;
3102 : };
3103 :
3104 : template <>
3105 : struct NeonVec<64, UNSIGNED> {
3106 : typedef uint64x2_t type;
3107 : };
3108 :
3109 : template <>
3110 : struct NeonVec<64, SIGNED> {
3111 : typedef int64x2_t type;
3112 : };
3113 :
3114 : template <typename T>
3115 : struct NeonVecFor {
3116 : // See 'class divider' for an explanation of these template parameters.
3117 : typedef typename NeonVec<sizeof(T) * 8, (((T)0 >> 0) > (T)(-1) ? SIGNED : UNSIGNED)>::type type;
3118 : };
3119 :
3120 : #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \
3121 : LIBDIVIDE_INLINE typename NeonVecFor<INT_TYPE>::type divide( \
3122 : typename NeonVecFor<INT_TYPE>::type n) const { \
3123 : return libdivide_##ALGO##_do_vec128(n, &denom); \
3124 : }
3125 : #else
3126 : #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE)
3127 : #endif
3128 :
3129 : #if defined(LIBDIVIDE_SSE2)
3130 : #define LIBDIVIDE_DIVIDE_SSE2(ALGO) \
3131 : LIBDIVIDE_INLINE __m128i divide(__m128i n) const { \
3132 : return libdivide_##ALGO##_do_vec128(n, &denom); \
3133 : }
3134 : #else
3135 : #define LIBDIVIDE_DIVIDE_SSE2(ALGO)
3136 : #endif
3137 :
3138 : #if defined(LIBDIVIDE_AVX2)
3139 : #define LIBDIVIDE_DIVIDE_AVX2(ALGO) \
3140 : LIBDIVIDE_INLINE __m256i divide(__m256i n) const { \
3141 : return libdivide_##ALGO##_do_vec256(n, &denom); \
3142 : }
3143 : #else
3144 : #define LIBDIVIDE_DIVIDE_AVX2(ALGO)
3145 : #endif
3146 :
3147 : #if defined(LIBDIVIDE_AVX512)
3148 : #define LIBDIVIDE_DIVIDE_AVX512(ALGO) \
3149 : LIBDIVIDE_INLINE __m512i divide(__m512i n) const { \
3150 : return libdivide_##ALGO##_do_vec512(n, &denom); \
3151 : }
3152 : #else
3153 : #define LIBDIVIDE_DIVIDE_AVX512(ALGO)
3154 : #endif
3155 :
3156 : // The DISPATCHER_GEN() macro generates C++ methods (for the given integer
3157 : // and algorithm types) that redirect to libdivide's C API.
3158 : #define DISPATCHER_GEN(T, ALGO) \
3159 : libdivide_##ALGO##_t denom; \
3160 : LIBDIVIDE_INLINE dispatcher() {} \
3161 : explicit LIBDIVIDE_CONSTEXPR dispatcher(decltype(nullptr)) : denom{} {} \
3162 : LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \
3163 : LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \
3164 : LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \
3165 : LIBDIVIDE_DIVIDE_NEON(ALGO, T) \
3166 : LIBDIVIDE_DIVIDE_SSE2(ALGO) \
3167 : LIBDIVIDE_DIVIDE_AVX2(ALGO) \
3168 : LIBDIVIDE_DIVIDE_AVX512(ALGO)
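// For illustration, DISPATCHER_GEN(uint32_t, u32) expands (scalar parts only)
// to roughly:
//
//   libdivide_u32_t denom;
//   dispatcher() {}
//   explicit LIBDIVIDE_CONSTEXPR dispatcher(decltype(nullptr)) : denom{} {}
//   dispatcher(uint32_t d) : denom(libdivide_u32_gen(d)) {}
//   uint32_t divide(uint32_t n) const { return libdivide_u32_do(n, &denom); }
//   uint32_t recover() const { return libdivide_u32_recover(&denom); }
//
// plus one vector divide() overload per enabled SIMD target.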
3169 :
3170 : // The dispatcher selects a specific division algorithm for a given
3171 : // width, signedness, and ALGO using partial template specialization.
3172 : template <int _WIDTH, Signedness _SIGN, Branching _ALGO>
3173 : struct dispatcher {};
3174 :
3175 : template <>
3176 : struct dispatcher<16, SIGNED, BRANCHFULL> {
3177 : DISPATCHER_GEN(int16_t, s16)
3178 : };
3179 : template <>
3180 : struct dispatcher<16, SIGNED, BRANCHFREE> {
3181 : DISPATCHER_GEN(int16_t, s16_branchfree)
3182 : };
3183 : template <>
3184 : struct dispatcher<16, UNSIGNED, BRANCHFULL> {
3185 10038 : DISPATCHER_GEN(uint16_t, u16)
3186 : };
3187 : template <>
3188 : struct dispatcher<16, UNSIGNED, BRANCHFREE> {
3189 : DISPATCHER_GEN(uint16_t, u16_branchfree)
3190 : };
3191 : template <>
3192 : struct dispatcher<32, SIGNED, BRANCHFULL> {
3193 : DISPATCHER_GEN(int32_t, s32)
3194 : };
3195 : template <>
3196 : struct dispatcher<32, SIGNED, BRANCHFREE> {
3197 : DISPATCHER_GEN(int32_t, s32_branchfree)
3198 : };
3199 : template <>
3200 : struct dispatcher<32, UNSIGNED, BRANCHFULL> {
3201 1285 : DISPATCHER_GEN(uint32_t, u32)
3202 : };
3203 : template <>
3204 : struct dispatcher<32, UNSIGNED, BRANCHFREE> {
3205 : DISPATCHER_GEN(uint32_t, u32_branchfree)
3206 : };
3207 : template <>
3208 : struct dispatcher<64, SIGNED, BRANCHFULL> {
3209 : DISPATCHER_GEN(int64_t, s64)
3210 : };
3211 : template <>
3212 : struct dispatcher<64, SIGNED, BRANCHFREE> {
3213 : DISPATCHER_GEN(int64_t, s64_branchfree)
3214 : };
3215 : template <>
3216 : struct dispatcher<64, UNSIGNED, BRANCHFULL> {
3217 : DISPATCHER_GEN(uint64_t, u64)
3218 : };
3219 : template <>
3220 : struct dispatcher<64, UNSIGNED, BRANCHFREE> {
3221 : DISPATCHER_GEN(uint64_t, u64_branchfree)
3222 : };
3223 : } // namespace detail
3224 :
3225 : #if defined(LIBDIVIDE_NEON)
3226 : // Expose NeonVecFor outside of the detail namespace.
3227 : template <typename T>
3228 : struct NeonVecFor {
3229 : typedef typename detail::NeonVecFor<T>::type type;
3230 : };
3231 : #endif
3232 :
3233 : // This is the main divider class for use by the user (C++ API).
3234 : // The actual division algorithm is selected using the dispatcher struct
3235 : // based on the integer width and algorithm template parameters.
3236 : template <typename T, Branching ALGO = BRANCHFULL>
3237 : class divider {
3238 : private:
3239 : // Dispatch based on the size and signedness.
3240 : // We avoid using type_traits as it's not available in AVR.
3241 : // Detect signedness by checking if T(-1) is less than T(0).
3242 : // Also throw in a shift by 0, which prevents floating point types from being passed.
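    // For example: with T = int, (T)0 > (T)(-1) holds, selecting SIGNED; with
    // T = unsigned, (T)(-1) is UINT_MAX so the comparison fails, selecting
    // UNSIGNED. The (T)0 >> 0 expression is ill-formed for floating point
    // types, so they are rejected at compile time.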
3243 : typedef detail::dispatcher<sizeof(T) * 8,
3244 : (((T)0 >> 0) > (T)(-1) ? detail::SIGNED : detail::UNSIGNED), ALGO>
3245 : dispatcher_t;
3246 :
3247 : public:
3248 : // We leave the default constructor empty so that creating
3249 : // an array of dividers and then initializing them
3250 : // later doesn't slow us down.
3251 : divider() {}
3252 :
3253 : // constexpr zero-initialization to allow for use w/ static constinit
3254 : explicit LIBDIVIDE_CONSTEXPR divider(decltype(nullptr)) : div(nullptr) {}
3255 :
3256 : // Constructor that takes the divisor as a parameter
3257 42 : LIBDIVIDE_INLINE divider(T d) : div(d) {}
3258 :
3259 : // Divides n by the divisor
3260 : LIBDIVIDE_INLINE T divide(T n) const { return div.divide(n); }
3261 :
3262 : // Recovers the divisor: returns the value that was
3263 : // used to initialize this divider object.
3264 : T recover() const { return div.recover(); }
3265 :
3266 : bool operator==(const divider<T, ALGO> &other) const {
3267 : return div.denom.magic == other.div.denom.magic && div.denom.more == other.div.denom.more;
3268 : }
3269 :
3270 : bool operator!=(const divider<T, ALGO> &other) const { return !(*this == other); }
3271 :
3272 : // Vector variants treat the input as packed integer values with the same type as the divider
3273 : // (e.g. s32, u32, s64, u64) and divide each of them by the divider, returning the packed
3274 : // quotients.
3275 : #if defined(LIBDIVIDE_SSE2)
3276 11302 : LIBDIVIDE_INLINE __m128i divide(__m128i n) const { return div.divide(n); }
3277 : #endif
3278 : #if defined(LIBDIVIDE_AVX2)
3279 : LIBDIVIDE_INLINE __m256i divide(__m256i n) const { return div.divide(n); }
3280 : #endif
3281 : #if defined(LIBDIVIDE_AVX512)
3282 : LIBDIVIDE_INLINE __m512i divide(__m512i n) const { return div.divide(n); }
3283 : #endif
3284 : #if defined(LIBDIVIDE_NEON)
3285 : LIBDIVIDE_INLINE typename NeonVecFor<T>::type divide(typename NeonVecFor<T>::type n) const {
3286 : return div.divide(n);
3287 : }
3288 : #endif
3289 :
3290 : private:
3291 : // Storage for the actual divisor
3292 : dispatcher_t div;
3293 : };
3294 :
3295 : // Overload of operator / for scalar division
3296 : template <typename T, Branching ALGO>
3297 : LIBDIVIDE_INLINE T operator/(T n, const divider<T, ALGO> &div) {
3298 : return div.divide(n);
3299 : }
3300 :
3301 : // Overload of operator /= for scalar division
3302 : template <typename T, Branching ALGO>
3303 : LIBDIVIDE_INLINE T &operator/=(T &n, const divider<T, ALGO> &div) {
3304 : n = div.divide(n);
3305 : return n;
3306 : }
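// Editorial usage sketch (names are illustrative, not part of the header):
// construct a divider once, then let the overloads above replace repeated
// hardware divisions in the loop body.
//
//   void example_scale(int64_t *data, int count, int64_t d) {
//       libdivide::divider<int64_t> fast_d(d);  // magic number computed once
//       for (int i = 0; i < count; ++i) {
//           data[i] /= fast_d;                  // uses operator/= above
//       }
//   }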
3307 :
3308 : // Overloads for vector types.
3309 : #if defined(LIBDIVIDE_SSE2)
3310 : template <typename T, Branching ALGO>
3311 : LIBDIVIDE_INLINE __m128i operator/(__m128i n, const divider<T, ALGO> &div) {
3312 : return div.divide(n);
3313 : }
3314 :
3315 : template <typename T, Branching ALGO>
3316 : LIBDIVIDE_INLINE __m128i operator/=(__m128i &n, const divider<T, ALGO> &div) {
3317 11302 : n = div.divide(n);
3318 11302 : return n;
3319 : }
3320 : #endif
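// Editorial usage sketch (illustrative): with LIBDIVIDE_SSE2 defined, the same
// divider object also divides four packed 32-bit lanes per call:
//
//   void example_scale_vec128(uint32_t *data, int count, uint32_t d) {
//       libdivide::divider<uint32_t> fast_d(d);
//       for (int i = 0; i + 4 <= count; i += 4) {
//           __m128i v = _mm_loadu_si128((const __m128i *)&data[i]);
//           v = v / fast_d;  // dispatches to libdivide_u32_do_vec128
//           _mm_storeu_si128((__m128i *)&data[i], v);
//       }
//       // (scalar tail for the remaining count % 4 elements omitted)
//   }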
3321 : #if defined(LIBDIVIDE_AVX2)
3322 : template <typename T, Branching ALGO>
3323 : LIBDIVIDE_INLINE __m256i operator/(__m256i n, const divider<T, ALGO> &div) {
3324 : return div.divide(n);
3325 : }
3326 :
3327 : template <typename T, Branching ALGO>
3328 : LIBDIVIDE_INLINE __m256i operator/=(__m256i &n, const divider<T, ALGO> &div) {
3329 : n = div.divide(n);
3330 : return n;
3331 : }
3332 : #endif
3333 : #if defined(LIBDIVIDE_AVX512)
3334 : template <typename T, Branching ALGO>
3335 : LIBDIVIDE_INLINE __m512i operator/(__m512i n, const divider<T, ALGO> &div) {
3336 : return div.divide(n);
3337 : }
3338 :
3339 : template <typename T, Branching ALGO>
3340 : LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider<T, ALGO> &div) {
3341 : n = div.divide(n);
3342 : return n;
3343 : }
3344 : #endif
3345 :
3346 : #if defined(LIBDIVIDE_NEON)
3347 : template <typename T, Branching ALGO>
3348 : LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(
3349 : typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) {
3350 : return div.divide(n);
3351 : }
3352 :
3353 : template <typename T, Branching ALGO>
3354 : LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(
3355 : typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) {
3356 : n = div.divide(n);
3357 : return n;
3358 : }
3359 : #endif
3360 :
3361 : #if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900)
3362 : // libdivide::branchfree_divider<T>
3363 : template <typename T>
3364 : using branchfree_divider = divider<T, BRANCHFREE>;
3365 : #endif
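// Editorial sketch: the alias only shortens the spelling; behavior is that of
// divider<T, BRANCHFREE>:
//
//   libdivide::branchfree_divider<uint64_t> d(7);
//   uint64_t q = 1000 / d;  // q == 142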
3366 :
3367 : } // namespace libdivide
3368 :
3369 : #endif // __cplusplus
3370 :
3371 : #if defined(_MSC_VER) && !defined(__clang__)
3372 : #pragma warning(pop)
3373 : #endif
3374 :
3375 : #endif // LIBDIVIDE_H