LCOV - code coverage report
Current view: top level - third_party/libdivide - libdivide.h (source / functions)
Test: gdal_filtered.info
Date: 2025-06-19 12:30:01
                 Hit   Total   Coverage
Lines:            63      72     87.5 %
Functions:         0       0        -

          Line data    Source code
       1             : // libdivide.h - Optimized integer division
       2             : // https://libdivide.com
       3             : //
       4             : // Copyright (C) 2010 - 2022 ridiculous_fish, <libdivide@ridiculousfish.com>
       5             : // Copyright (C) 2016 - 2022 Kim Walisch, <kim.walisch@gmail.com>
       6             : //
       7             : // libdivide is dual-licensed under the Boost or zlib licenses.
       8             : // You may use libdivide under the terms of either of these.
       9             : // See LICENSE.txt for more details.
      10             : 
      11             : #ifndef LIBDIVIDE_H
      12             : #define LIBDIVIDE_H
      13             : 
      14             : // *** Version numbers are auto generated - do not edit ***
      15             : #define LIBDIVIDE_VERSION "5.2.0"
      16             : #define LIBDIVIDE_VERSION_MAJOR 5
      17             : #define LIBDIVIDE_VERSION_MINOR 2
      18             : #define LIBDIVIDE_VERSION_PATCH 0
      19             : 
      20             : #include <stdint.h>
      21             : 
      22             : #if !defined(__AVR__) && __STDC_HOSTED__ != 0
      23             : #include <stdio.h>
      24             : #include <stdlib.h>
      25             : #endif
      26             : 
      27             : #if defined(_MSC_VER) && (defined(__cplusplus) && (__cplusplus >= 202002L)) || \
      28             :     (defined(_MSVC_LANG) && (_MSVC_LANG >= 202002L))
      29             : #include <limits.h>
      30             : #include <type_traits>
      31             : #define LIBDIVIDE_VC_CXX20
      32             : #endif
      33             : 
      34             : #if defined(LIBDIVIDE_SSE2)
      35             : #include <emmintrin.h>
      36             : #endif
      37             : 
      38             : #if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512)
      39             : #include <immintrin.h>
      40             : #endif
      41             : 
      42             : #if defined(LIBDIVIDE_NEON)
      43             : #include <arm_neon.h>
      44             : #endif
      45             : 
      46             : // Clang-cl prior to Visual Studio 2022 doesn't include __umulh/__mulh intrinsics
      47             : #if defined(_MSC_VER) && (!defined(__clang__) || _MSC_VER > 1930) && \
      48             :     (defined(_M_X64) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC))
      49             : #define LIBDIVIDE_MULH_INTRINSICS
      50             : #endif
      51             : 
      52             : #if defined(_MSC_VER)
      53             : #if defined(LIBDIVIDE_MULH_INTRINSICS) || !defined(__clang__)
      54             : #include <intrin.h>
      55             : #endif
      56             : #ifndef __clang__
      57             : #pragma warning(push)
      58             : // 4146: unary minus operator applied to unsigned type, result still unsigned
      59             : #pragma warning(disable : 4146)
      60             : 
      61             : // 4204: nonstandard extension used : non-constant aggregate initializer
      62             : #pragma warning(disable : 4204)
      63             : #endif
      64             : #define LIBDIVIDE_VC
      65             : #endif
      66             : 
      67             : #if !defined(__has_builtin)
      68             : #define __has_builtin(x) 0
      69             : #endif
      70             : 
      71             : #if defined(__SIZEOF_INT128__)
      72             : #define HAS_INT128_T
      73             : // clang-cl on Windows does not yet support 128-bit division
      74             : #if !(defined(__clang__) && defined(LIBDIVIDE_VC))
      75             : #define HAS_INT128_DIV
      76             : #endif
      77             : #endif
      78             : 
      79             : #if defined(__x86_64__) || defined(_M_X64)
      80             : #define LIBDIVIDE_X86_64
      81             : #endif
      82             : 
      83             : #if defined(__i386__)
      84             : #define LIBDIVIDE_i386
      85             : #endif
      86             : 
      87             : #if defined(__GNUC__) || defined(__clang__)
      88             : #define LIBDIVIDE_GCC_STYLE_ASM
      89             : #endif
      90             : 
      91             : #if defined(__cplusplus) || defined(LIBDIVIDE_VC)
      92             : #define LIBDIVIDE_FUNCTION __FUNCTION__
      93             : #else
      94             : #define LIBDIVIDE_FUNCTION __func__
      95             : #endif
      96             : 
      97             : // Set up forced inlining if possible.
      98             : // We need both the attribute and keyword to avoid "might not be inlineable" warnings.
      99             : #ifdef __has_attribute
     100             : #if __has_attribute(always_inline)
     101             : #define LIBDIVIDE_INLINE __attribute__((always_inline)) inline
     102             : #endif
     103             : #endif
     104             : #ifndef LIBDIVIDE_INLINE
     105             : #ifdef _MSC_VER
     106             : #define LIBDIVIDE_INLINE __forceinline
     107             : #else
     108             : #define LIBDIVIDE_INLINE inline
     109             : #endif
     110             : #endif
     111             : 
     112             : #if defined(__AVR__) || __STDC_HOSTED__ == 0
     113             : #define LIBDIVIDE_ERROR(msg)
     114             : #else
     115             : #define LIBDIVIDE_ERROR(msg)                                                                     \
     116             :     do {                                                                                         \
     117             :         fprintf(stderr, "libdivide.h:%d: %s(): Error: %s\n", __LINE__, LIBDIVIDE_FUNCTION, msg); \
     118             :         abort();                                                                                 \
     119             :     } while (0)
     120             : #endif
     121             : 
     122             : #if defined(LIBDIVIDE_ASSERTIONS_ON) && !defined(__AVR__) && __STDC_HOSTED__ != 0
     123             : #define LIBDIVIDE_ASSERT(x)                                                           \
     124             :     do {                                                                              \
     125             :         if (!(x)) {                                                                   \
     126             :             fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", __LINE__, \
     127             :                 LIBDIVIDE_FUNCTION, #x);                                              \
     128             :             abort();                                                                  \
     129             :         }                                                                             \
     130             :     } while (0)
     131             : #else
     132             : #define LIBDIVIDE_ASSERT(x)
     133             : #endif
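
As the two blocks above show, LIBDIVIDE_ERROR aborts with a message on hosted, non-AVR targets, while LIBDIVIDE_ASSERT compiles to nothing unless LIBDIVIDE_ASSERTIONS_ON is defined. A minimal sketch (ours, not part of the header) of opting in from a consuming translation unit; the define must precede the include:

    // Hypothetical consumer TU: enable libdivide's internal assertions.
    // Without this define, LIBDIVIDE_ASSERT(x) expands to nothing and
    // has zero runtime cost.
    #define LIBDIVIDE_ASSERTIONS_ON
    #include "libdivide.h"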
     134             : 
     135             : #ifdef __cplusplus
     136             : 
     137             : // For constexpr zero initialization, c++11 might handle things ok,
     138             : // but just limit to at least c++14 to ensure we don't break anyone's code:
     139             : 
     140             : // Use https://en.cppreference.com/w/cpp/feature_test#cpp_constexpr
     141             : #if defined(__cpp_constexpr) && (__cpp_constexpr >= 201304L)
     142             : #define LIBDIVIDE_CONSTEXPR constexpr LIBDIVIDE_INLINE
     143             : 
     144             : // Supposedly, MSVC might not implement feature test macros right:
     145             : // https://stackoverflow.com/questions/49316752/feature-test-macros-not-working-properly-in-visual-c
     146             : // so check that _MSVC_LANG corresponds to at least c++14, and _MSC_VER corresponds to at least VS
     147             : // 2017 15.0 (for extended constexpr support:
     148             : // https://learn.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=msvc-170)
     149             : #elif (defined(_MSC_VER) && _MSC_VER >= 1910) && (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L)
     150             : #define LIBDIVIDE_CONSTEXPR constexpr LIBDIVIDE_INLINE
     151             : 
     152             : #else
     153             : #define LIBDIVIDE_CONSTEXPR LIBDIVIDE_INLINE
     154             : #endif
     155             : 
     156             : namespace libdivide {
     157             : #endif
     158             : 
     159             : #if defined(_MSC_VER) && !defined(__clang__)
     160             : #if defined(LIBDIVIDE_VC_CXX20)
     161             : static LIBDIVIDE_CONSTEXPR int __builtin_clz(unsigned x) {
     162             :     if (std::is_constant_evaluated()) {
     163             :         for (int i = 0; i < sizeof(x) * CHAR_BIT; ++i) {
     164             :             if (x >> (sizeof(x) * CHAR_BIT - 1 - i)) return i;
     165             :         }
     166             :         return sizeof(x) * CHAR_BIT;
     167             :     }
     168             : #else
     169             : static LIBDIVIDE_INLINE int __builtin_clz(unsigned x) {
     170             : #endif
     171             : #if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
     172             :     return (int)_CountLeadingZeros(x);
     173             : #elif defined(__AVX2__) || defined(__LZCNT__)
     174             :     return (int)_lzcnt_u32(x);
     175             : #else
     176             :     unsigned long r;
     177             :     _BitScanReverse(&r, x);
     178             :     return (int)(r ^ 31);
     179             : #endif
     180             : }
     181             : 
     182             : #if defined(LIBDIVIDE_VC_CXX20)
     183             : static LIBDIVIDE_CONSTEXPR int __builtin_clzll(unsigned long long x) {
     184             :     if (std::is_constant_evaluated()) {
     185             :         for (int i = 0; i < sizeof(x) * CHAR_BIT; ++i) {
     186             :             if (x >> (sizeof(x) * CHAR_BIT - 1 - i)) return i;
     187             :         }
     188             :         return sizeof(x) * CHAR_BIT;
     189             :     }
     190             : #else
     191             : static LIBDIVIDE_INLINE int __builtin_clzll(unsigned long long x) {
     192             : #endif
     193             : #if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
     194             :     return (int)_CountLeadingZeros64(x);
     195             : #elif defined(_WIN64)
     196             : #if defined(__AVX2__) || defined(__LZCNT__)
     197             :     return (int)_lzcnt_u64(x);
     198             : #else
     199             :     unsigned long r;
     200             :     _BitScanReverse64(&r, x);
     201             :     return (int)(r ^ 63);
     202             : #endif
     203             : #else
     204             :     int l = __builtin_clz((unsigned)x) + 32;
     205             :     int h = __builtin_clz((unsigned)(x >> 32));
     206             :     return !!((unsigned)(x >> 32)) ? h : l;
     207             : #endif
     208             : }
     209             : #endif // defined(_MSC_VER) && !defined(__clang__)
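
These shims give MSVC builds the GCC semantics: the count of leading zero bits of a nonzero 32- or 64-bit value. A few spot values (ours, easy to re-derive) for orientation:

    // Sanity sketch (ours): the GCC semantics the shims above reproduce.
    static void clz_spot_checks(void) {
        LIBDIVIDE_ASSERT(__builtin_clz(1u) == 31);          // bit 0 set, 31 zeros above
        LIBDIVIDE_ASSERT(__builtin_clz(0x80000000u) == 0);  // MSB already set
        LIBDIVIDE_ASSERT(__builtin_clzll(1ull) == 63);
    }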
     210             : 
      211             : // Pack divider structs to prevent compilers from padding.
     212             : // This reduces memory usage by up to 43% when using a large
     213             : // array of libdivide dividers and improves performance
     214             : // by up to 10% because of reduced memory bandwidth.
     215             : #pragma pack(push, 1)
     216             : 
     217             : struct libdivide_u16_t {
     218             :     uint16_t magic;
     219             :     uint8_t more;
     220             : };
     221             : 
     222             : struct libdivide_s16_t {
     223             :     int16_t magic;
     224             :     uint8_t more;
     225             : };
     226             : 
     227             : struct libdivide_u32_t {
     228             :     uint32_t magic;
     229             :     uint8_t more;
     230             : };
     231             : 
     232             : struct libdivide_s32_t {
     233             :     int32_t magic;
     234             :     uint8_t more;
     235             : };
     236             : 
     237             : struct libdivide_u64_t {
     238             :     uint64_t magic;
     239             :     uint8_t more;
     240             : };
     241             : 
     242             : struct libdivide_s64_t {
     243             :     int64_t magic;
     244             :     uint8_t more;
     245             : };
     246             : 
     247             : struct libdivide_u16_branchfree_t {
     248             :     uint16_t magic;
     249             :     uint8_t more;
     250             : };
     251             : 
     252             : struct libdivide_s16_branchfree_t {
     253             :     int16_t magic;
     254             :     uint8_t more;
     255             : };
     256             : 
     257             : struct libdivide_u32_branchfree_t {
     258             :     uint32_t magic;
     259             :     uint8_t more;
     260             : };
     261             : 
     262             : struct libdivide_s32_branchfree_t {
     263             :     int32_t magic;
     264             :     uint8_t more;
     265             : };
     266             : 
     267             : struct libdivide_u64_branchfree_t {
     268             :     uint64_t magic;
     269             :     uint8_t more;
     270             : };
     271             : 
     272             : struct libdivide_s64_branchfree_t {
     273             :     int64_t magic;
     274             :     uint8_t more;
     275             : };
     276             : 
     277             : #pragma pack(pop)
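
The packing comment above can be made concrete: with pack(1) each divider occupies exactly sizeof(magic) + 1 bytes, e.g. 9 instead of the 16 an aligned uint64_t + uint8_t layout would take (hence the up-to-43% figure). A C11 sketch (ours) to verify:

    #include <assert.h>  // static_assert (C11)
    static_assert(sizeof(struct libdivide_u16_t) == 3, "2 + 1, no padding");
    static_assert(sizeof(struct libdivide_u32_t) == 5, "4 + 1, no padding");
    static_assert(sizeof(struct libdivide_u64_t) == 9, "8 + 1, no padding");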
     278             : 
     279             : // Explanation of the "more" field:
     280             : //
      281             : // * Bits 0-5 are the shift value (for shift path or mult path).
     282             : // * Bit 6 is the add indicator for mult path.
     283             : // * Bit 7 is set if the divisor is negative. We use bit 7 as the negative
     284             : //   divisor indicator so that we can efficiently use sign extension to
     285             : //   create a bitmask with all bits set to 1 (if the divisor is negative)
     286             : //   or 0 (if the divisor is positive).
     287             : //
     288             : // u32: [0-4] shift value
     289             : //      [5] ignored
     290             : //      [6] add indicator
     291             : //      magic number of 0 indicates shift path
     292             : //
     293             : // s32: [0-4] shift value
     294             : //      [5] ignored
     295             : //      [6] add indicator
     296             : //      [7] indicates negative divisor
     297             : //      magic number of 0 indicates shift path
     298             : //
     299             : // u64: [0-5] shift value
     300             : //      [6] add indicator
     301             : //      magic number of 0 indicates shift path
     302             : //
     303             : // s64: [0-5] shift value
     304             : //      [6] add indicator
     305             : //      [7] indicates negative divisor
     306             : //      magic number of 0 indicates shift path
     307             : //
      308             : // In the s32 and s64 branching (default) modes, the magic number is negated
      309             : // according to whether the divisor is negated. In branchfree mode, it is not negated.
     310             : 
     311             : enum {
     312             :     LIBDIVIDE_16_SHIFT_MASK = 0x1F,
     313             :     LIBDIVIDE_32_SHIFT_MASK = 0x1F,
     314             :     LIBDIVIDE_64_SHIFT_MASK = 0x3F,
     315             :     LIBDIVIDE_ADD_MARKER = 0x40,
     316             :     LIBDIVIDE_NEGATIVE_DIVISOR = 0x80
     317             : };
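
Unpacking a "more" byte with these masks is a pair of bit tests plus a masked shift; a hedged sketch (print_more is our helper, not a library entry point):

    #include <stdio.h>
    // Hypothetical helper: dump the fields packed into a 32/64-bit
    // divider's "more" byte, per the layout documented above.
    static void print_more(uint8_t more, int is_64bit) {
        unsigned shift = more & (is_64bit ? LIBDIVIDE_64_SHIFT_MASK : LIBDIVIDE_32_SHIFT_MASK);
        printf("shift=%u add=%d neg=%d\n", shift,
            (more & LIBDIVIDE_ADD_MARKER) != 0,
            (more & LIBDIVIDE_NEGATIVE_DIVISOR) != 0);
    }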
     318             : 
     319             : static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d);
     320             : static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d);
     321             : static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d);
     322             : static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d);
     323             : static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d);
     324             : static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d);
     325             : 
     326             : static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d);
     327             : static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d);
     328             : static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d);
     329             : static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d);
     330             : static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d);
     331             : static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d);
     332             : 
     333             : static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(
     334             :     int16_t numer, int16_t magic, uint8_t more);
     335             : static LIBDIVIDE_INLINE int16_t libdivide_s16_do(
     336             :     int16_t numer, const struct libdivide_s16_t *denom);
     337             : static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(
     338             :     uint16_t numer, uint16_t magic, uint8_t more);
     339             : static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(
     340             :     uint16_t numer, const struct libdivide_u16_t *denom);
     341             : static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(
     342             :     int32_t numer, int32_t magic, uint8_t more);
     343             : static LIBDIVIDE_INLINE int32_t libdivide_s32_do(
     344             :     int32_t numer, const struct libdivide_s32_t *denom);
     345             : static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(
     346             :     uint32_t numer, uint32_t magic, uint8_t more);
     347             : static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(
     348             :     uint32_t numer, const struct libdivide_u32_t *denom);
     349             : static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(
     350             :     int64_t numer, int64_t magic, uint8_t more);
     351             : static LIBDIVIDE_INLINE int64_t libdivide_s64_do(
     352             :     int64_t numer, const struct libdivide_s64_t *denom);
     353             : static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(
     354             :     uint64_t numer, uint64_t magic, uint8_t more);
     355             : static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(
     356             :     uint64_t numer, const struct libdivide_u64_t *denom);
     357             : 
     358             : static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(
     359             :     int16_t numer, const struct libdivide_s16_branchfree_t *denom);
     360             : static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(
     361             :     uint16_t numer, const struct libdivide_u16_branchfree_t *denom);
     362             : static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(
     363             :     int32_t numer, const struct libdivide_s32_branchfree_t *denom);
     364             : static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do(
     365             :     uint32_t numer, const struct libdivide_u32_branchfree_t *denom);
     366             : static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(
     367             :     int64_t numer, const struct libdivide_s64_branchfree_t *denom);
     368             : static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do(
     369             :     uint64_t numer, const struct libdivide_u64_branchfree_t *denom);
     370             : 
     371             : static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom);
     372             : static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom);
     373             : static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom);
     374             : static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom);
     375             : static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom);
     376             : static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom);
     377             : 
     378             : static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(
     379             :     const struct libdivide_s16_branchfree_t *denom);
     380             : static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(
     381             :     const struct libdivide_u16_branchfree_t *denom);
     382             : static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(
     383             :     const struct libdivide_s32_branchfree_t *denom);
     384             : static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(
     385             :     const struct libdivide_u32_branchfree_t *denom);
     386             : static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover(
     387             :     const struct libdivide_s64_branchfree_t *denom);
     388             : static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover(
     389             :     const struct libdivide_u64_branchfree_t *denom);
     390             : 
     391             : //////// Internal Utility Functions
     392             : 
     393             : static LIBDIVIDE_INLINE uint16_t libdivide_mullhi_u16(uint16_t x, uint16_t y) {
     394             :     uint32_t xl = x, yl = y;
     395             :     uint32_t rl = xl * yl;
     396             :     return (uint16_t)(rl >> 16);
     397             : }
     398             : 
     399             : static LIBDIVIDE_INLINE int16_t libdivide_mullhi_s16(int16_t x, int16_t y) {
     400             :     int32_t xl = x, yl = y;
     401             :     int32_t rl = xl * yl;
     402             :     // needs to be arithmetic shift
     403             :     return (int16_t)(rl >> 16);
     404             : }
     405             : 
     406             : static LIBDIVIDE_INLINE uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) {
     407             :     uint64_t xl = x, yl = y;
     408             :     uint64_t rl = xl * yl;
     409             :     return (uint32_t)(rl >> 32);
     410             : }
     411             : 
     412             : static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) {
     413             :     int64_t xl = x, yl = y;
     414             :     int64_t rl = xl * yl;
     415             :     // needs to be arithmetic shift
     416             :     return (int32_t)(rl >> 32);
     417             : }
     418             : 
     419             : static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
     420             : #if defined(LIBDIVIDE_MULH_INTRINSICS)
     421             :     return __umulh(x, y);
     422             : #elif defined(HAS_INT128_T)
     423             :     __uint128_t xl = x, yl = y;
     424             :     __uint128_t rl = xl * yl;
     425             :     return (uint64_t)(rl >> 64);
     426             : #else
     427             :     // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
     428             :     uint32_t mask = 0xFFFFFFFF;
     429             :     uint32_t x0 = (uint32_t)(x & mask);
     430             :     uint32_t x1 = (uint32_t)(x >> 32);
     431             :     uint32_t y0 = (uint32_t)(y & mask);
     432             :     uint32_t y1 = (uint32_t)(y >> 32);
     433             :     uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0);
     434             :     uint64_t x0y1 = x0 * (uint64_t)y1;
     435             :     uint64_t x1y0 = x1 * (uint64_t)y0;
     436             :     uint64_t x1y1 = x1 * (uint64_t)y1;
     437             :     uint64_t temp = x1y0 + x0y0_hi;
     438             :     uint64_t temp_lo = temp & mask;
     439             :     uint64_t temp_hi = temp >> 32;
     440             : 
     441             :     return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32);
     442             : #endif
     443             : }
     444             : 
     445             : static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) {
     446             : #if defined(LIBDIVIDE_MULH_INTRINSICS)
     447             :     return __mulh(x, y);
     448             : #elif defined(HAS_INT128_T)
     449             :     __int128_t xl = x, yl = y;
     450             :     __int128_t rl = xl * yl;
     451             :     return (int64_t)(rl >> 64);
     452             : #else
     453             :     // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
     454             :     uint32_t mask = 0xFFFFFFFF;
     455             :     uint32_t x0 = (uint32_t)(x & mask);
     456             :     uint32_t y0 = (uint32_t)(y & mask);
     457             :     int32_t x1 = (int32_t)(x >> 32);
     458             :     int32_t y1 = (int32_t)(y >> 32);
     459             :     uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0);
     460             :     int64_t t = x1 * (int64_t)y0 + x0y0_hi;
     461             :     int64_t w1 = x0 * (int64_t)y1 + (t & mask);
     462             : 
     463             :     return x1 * (int64_t)y1 + (t >> 32) + (w1 >> 32);
     464             : #endif
     465             : }
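
Where a native 128-bit type exists, the schoolbook fallback above can be cross-checked against it; a small test sketch (ours), guarded the same way the header guards its own int128 use:

    #if defined(HAS_INT128_T)
    // Check (ours): the 32-bit-halves path must agree with a native
    // 128-bit multiply on the high 64 bits of the product.
    static int check_mullhi_u64(uint64_t x, uint64_t y) {
        uint64_t want = (uint64_t)(((__uint128_t)x * y) >> 64);
        return libdivide_mullhi_u64(x, y) == want;
    }
    #endif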
     466             : 
     467             : static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) {
     468             : #if defined(__AVR__)
     469             :     // Fast way to count leading zeros
      470             :     // On the AVR 8-bit architecture __builtin_clz() works on an int16_t.
     471             :     return __builtin_clz(val);
     472             : #elif defined(__GNUC__) || __has_builtin(__builtin_clz) || defined(_MSC_VER)
     473             :     // Fast way to count leading zeros
     474           8 :     return (int16_t)(__builtin_clz(val) - 16);
     475             : #else
     476             :     if (val == 0) return 16;
     477             :     int16_t result = 4;
     478             :     uint16_t hi = 0xFU << 12;
     479             :     while ((val & hi) == 0) {
     480             :         hi >>= 4;
     481             :         result += 4;
     482             :     }
     483             :     while (val & hi) {
     484             :         result -= 1;
     485             :         hi <<= 1;
     486             :     }
     487             :     return result;
     488             : #endif
     489             : }
     490             : 
     491             : static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) {
     492             : #if defined(__AVR__)
     493             :     // Fast way to count leading zeros
     494             :     return __builtin_clzl(val);
     495             : #elif defined(__GNUC__) || __has_builtin(__builtin_clz) || defined(_MSC_VER)
     496             :     // Fast way to count leading zeros
     497          13 :     return __builtin_clz(val);
     498             : #else
     499             :     if (val == 0) return 32;
     500             :     int32_t result = 8;
     501             :     uint32_t hi = 0xFFU << 24;
     502             :     while ((val & hi) == 0) {
     503             :         hi >>= 8;
     504             :         result += 8;
     505             :     }
     506             :     while (val & hi) {
     507             :         result -= 1;
     508             :         hi <<= 1;
     509             :     }
     510             :     return result;
     511             : #endif
     512             : }
     513             : 
     514             : static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) {
     515             : #if defined(__GNUC__) || __has_builtin(__builtin_clzll) || defined(_MSC_VER)
     516             :     // Fast way to count leading zeros
     517             :     return __builtin_clzll(val);
     518             : #else
     519             :     uint32_t hi = val >> 32;
     520             :     uint32_t lo = val & 0xFFFFFFFF;
     521             :     if (hi != 0) return libdivide_count_leading_zeros32(hi);
     522             :     return 32 + libdivide_count_leading_zeros32(lo);
     523             : #endif
     524             : }
     525             : 
     526             : // libdivide_32_div_16_to_16: divides a 32-bit uint {u1, u0} by a 16-bit
     527             : // uint {v}. The result must fit in 16 bits.
     528             : // Returns the quotient directly and the remainder in *r
     529             : static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16(
     530             :     uint16_t u1, uint16_t u0, uint16_t v, uint16_t *r) {
     531           4 :     uint32_t n = ((uint32_t)u1 << 16) | u0;
     532           4 :     uint16_t result = (uint16_t)(n / v);
     533           4 :     *r = (uint16_t)(n - result * (uint32_t)v);
     534           4 :     return result;
     535             : }
     536             : 
     537             : // libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit
     538             : // uint {v}. The result must fit in 32 bits.
     539             : // Returns the quotient directly and the remainder in *r
     540             : static LIBDIVIDE_INLINE uint32_t libdivide_64_div_32_to_32(
     541             :     uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
     542             : #if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && defined(LIBDIVIDE_GCC_STYLE_ASM)
     543             :     uint32_t result;
     544           5 :     __asm__("divl %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1));
     545           5 :     return result;
     546             : #else
     547             :     uint64_t n = ((uint64_t)u1 << 32) | u0;
     548             :     uint32_t result = (uint32_t)(n / v);
     549             :     *r = (uint32_t)(n - result * (uint64_t)v);
     550             :     return result;
     551             : #endif
     552             : }
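
The two-word numerator is u1*2^32 + u0, and since the quotient must fit in 32 bits, callers keep u1 < v. A worked call (ours): dividing 2^32 by 5.

    // Example (ours): n = ((uint64_t)1 << 32) | 0 = 4294967296;
    // 4294967296 / 5 gives quotient 858993459 (fits in 32 bits), remainder 1.
    static void example_64_div_32(void) {
        uint32_t r;
        uint32_t q = libdivide_64_div_32_to_32(1, 0, 5, &r);
        LIBDIVIDE_ASSERT(q == 858993459u && r == 1u);
    }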
     553             : 
     554             : // libdivide_128_div_64_to_64: divides a 128-bit uint {numhi, numlo} by a 64-bit uint {den}. The
     555             : // result must fit in 64 bits. Returns the quotient directly and the remainder in *r
     556             : static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(
     557             :     uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
     558             :     // N.B. resist the temptation to use __uint128_t here.
     559             :     // In LLVM compiler-rt, it performs a 128/128 -> 128 division which is many times slower than
     560             :     // necessary. In gcc it's better but still slower than the divlu implementation, perhaps because
      561             :     // it's not inlined.
     562             : #if defined(LIBDIVIDE_X86_64) && defined(LIBDIVIDE_GCC_STYLE_ASM)
     563             :     uint64_t result;
     564             :     __asm__("div %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi));
     565             :     return result;
     566             : #else
     567             :     // We work in base 2**32.
     568             :     // A uint32 holds a single digit. A uint64 holds two digits.
     569             :     // Our numerator is conceptually [num3, num2, num1, num0].
     570             :     // Our denominator is [den1, den0].
     571             :     const uint64_t b = ((uint64_t)1 << 32);
     572             : 
     573             :     // The high and low digits of our computed quotient.
     574             :     uint32_t q1;
     575             :     uint32_t q0;
     576             : 
     577             :     // The normalization shift factor.
     578             :     int shift;
     579             : 
     580             :     // The high and low digits of our denominator (after normalizing).
     581             :     // Also the low 2 digits of our numerator (after normalizing).
     582             :     uint32_t den1;
     583             :     uint32_t den0;
     584             :     uint32_t num1;
     585             :     uint32_t num0;
     586             : 
     587             :     // A partial remainder.
     588             :     uint64_t rem;
     589             : 
     590             :     // The estimated quotient, and its corresponding remainder (unrelated to true remainder).
     591             :     uint64_t qhat;
     592             :     uint64_t rhat;
     593             : 
     594             :     // Variables used to correct the estimated quotient.
     595             :     uint64_t c1;
     596             :     uint64_t c2;
     597             : 
     598             :     // Check for overflow and divide by 0.
     599             :     if (numhi >= den) {
     600             :         if (r) *r = ~0ull;
     601             :         return ~0ull;
     602             :     }
     603             : 
     604             :     // Determine the normalization factor. We multiply den by this, so that its leading digit is at
     605             :     // least half b. In binary this means just shifting left by the number of leading zeros, so that
     606             :     // there's a 1 in the MSB.
     607             :     // We also shift numer by the same amount. This cannot overflow because numhi < den.
     608             :     // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting
     609             :     // by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is
     610             :     // 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it.
     611             :     shift = libdivide_count_leading_zeros64(den);
     612             :     den <<= shift;
     613             :     numhi <<= shift;
     614             :     numhi |= (numlo >> (-shift & 63)) & (uint64_t)(-(int64_t)shift >> 63);
     615             :     numlo <<= shift;
     616             : 
     617             :     // Extract the low digits of the numerator and both digits of the denominator.
     618             :     num1 = (uint32_t)(numlo >> 32);
     619             :     num0 = (uint32_t)(numlo & 0xFFFFFFFFu);
     620             :     den1 = (uint32_t)(den >> 32);
     621             :     den0 = (uint32_t)(den & 0xFFFFFFFFu);
     622             : 
     623             :     // We wish to compute q1 = [n3 n2 n1] / [d1 d0].
     624             :     // Estimate q1 as [n3 n2] / [d1], and then correct it.
     625             :     // Note while qhat may be 2 digits, q1 is always 1 digit.
     626             :     qhat = numhi / den1;
     627             :     rhat = numhi % den1;
     628             :     c1 = qhat * den0;
     629             :     c2 = rhat * b + num1;
     630             :     if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
     631             :     q1 = (uint32_t)qhat;
     632             : 
     633             :     // Compute the true (partial) remainder.
     634             :     rem = numhi * b + num1 - q1 * den;
     635             : 
     636             :     // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
     637             :     // Estimate q0 as [rem1 rem0] / [d1] and correct it.
     638             :     qhat = rem / den1;
     639             :     rhat = rem % den1;
     640             :     c1 = qhat * den0;
     641             :     c2 = rhat * b + num0;
     642             :     if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
     643             :     q0 = (uint32_t)qhat;
     644             : 
     645             :     // Return remainder if requested.
     646             :     if (r) *r = (rem * b + num0 - q0 * den) >> shift;
     647             :     return ((uint64_t)q1 << 32) | q0;
     648             : #endif
     649             : }
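
Where the compiler offers working 128-bit division, the long-division path can be validated against it; a hedged sketch (ours), skipping the documented overflow case (numhi >= den returns ~0ull):

    #if defined(HAS_INT128_T) && defined(HAS_INT128_DIV)
    // Check (ours): quotient and remainder must match native 128/64
    // division whenever the quotient fits in 64 bits.
    static int check_128_div_64(uint64_t hi, uint64_t lo, uint64_t den) {
        if (den == 0 || hi >= den) return 1;  // overflow path, skip
        __uint128_t n = ((__uint128_t)hi << 64) | lo;
        uint64_t r;
        uint64_t q = libdivide_128_div_64_to_64(hi, lo, den, &r);
        return q == (uint64_t)(n / den) && r == (uint64_t)(n % den);
    }
    #endif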
     650             : 
     651             : #if !(defined(HAS_INT128_T) && \
     652             :       defined(HAS_INT128_DIV))
     653             : 
     654             : // Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0)
     655             : static LIBDIVIDE_INLINE void libdivide_u128_shift(
     656             :     uint64_t *u1, uint64_t *u0, int32_t signed_shift) {
     657             :     if (signed_shift > 0) {
     658             :         uint32_t shift = signed_shift;
     659             :         *u1 <<= shift;
     660             :         *u1 |= *u0 >> (64 - shift);
     661             :         *u0 <<= shift;
     662             :     } else if (signed_shift < 0) {
     663             :         uint32_t shift = -signed_shift;
     664             :         *u0 >>= shift;
     665             :         *u0 |= *u1 << (64 - shift);
     666             :         *u1 >>= shift;
     667             :     }
     668             : }
     669             : 
     670             : #endif
     671             : 
     672             : // Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder.
     673             : static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64(
     674             :     uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) {
     675             : #if defined(HAS_INT128_T) && defined(HAS_INT128_DIV)
     676             :     __uint128_t ufull = u_hi;
     677             :     __uint128_t vfull = v_hi;
     678             :     ufull = (ufull << 64) | u_lo;
     679             :     vfull = (vfull << 64) | v_lo;
     680             :     uint64_t res = (uint64_t)(ufull / vfull);
     681             :     __uint128_t remainder = ufull - (vfull * res);
     682             :     *r_lo = (uint64_t)remainder;
     683             :     *r_hi = (uint64_t)(remainder >> 64);
     684             :     return res;
     685             : #else
     686             :     // Adapted from "Unsigned Doubleword Division" in Hacker's Delight
     687             :     // We want to compute u / v
     688             :     typedef struct {
     689             :         uint64_t hi;
     690             :         uint64_t lo;
     691             :     } u128_t;
     692             :     u128_t u = {u_hi, u_lo};
     693             :     u128_t v = {v_hi, v_lo};
     694             : 
     695             :     if (v.hi == 0) {
     696             :         // divisor v is a 64 bit value, so we just need one 128/64 division
     697             :         // Note that we are simpler than Hacker's Delight here, because we know
     698             :         // the quotient fits in 64 bits whereas Hacker's Delight demands a full
     699             :         // 128 bit quotient
     700             :         *r_hi = 0;
     701             :         return libdivide_128_div_64_to_64(u.hi, u.lo, v.lo, r_lo);
     702             :     }
     703             :     // Here v >= 2**64
     704             :     // We know that v.hi != 0, so count leading zeros is OK
     705             :     // We have 0 <= n <= 63
     706             :     uint32_t n = libdivide_count_leading_zeros64(v.hi);
     707             : 
     708             :     // Normalize the divisor so its MSB is 1
     709             :     u128_t v1t = v;
     710             :     libdivide_u128_shift(&v1t.hi, &v1t.lo, n);
     711             :     uint64_t v1 = v1t.hi;  // i.e. v1 = v1t >> 64
     712             : 
     713             :     // To ensure no overflow
     714             :     u128_t u1 = u;
     715             :     libdivide_u128_shift(&u1.hi, &u1.lo, -1);
     716             : 
     717             :     // Get quotient from divide unsigned insn.
     718             :     uint64_t rem_ignored;
     719             :     uint64_t q1 = libdivide_128_div_64_to_64(u1.hi, u1.lo, v1, &rem_ignored);
     720             : 
     721             :     // Undo normalization and division of u by 2.
     722             :     u128_t q0 = {0, q1};
     723             :     libdivide_u128_shift(&q0.hi, &q0.lo, n);
     724             :     libdivide_u128_shift(&q0.hi, &q0.lo, -63);
     725             : 
     726             :     // Make q0 correct or too small by 1
     727             :     // Equivalent to `if (q0 != 0) q0 = q0 - 1;`
     728             :     if (q0.hi != 0 || q0.lo != 0) {
     729             :         q0.hi -= (q0.lo == 0);  // borrow
     730             :         q0.lo -= 1;
     731             :     }
     732             : 
     733             :     // Now q0 is correct.
     734             :     // Compute q0 * v as q0v
     735             :     // = (q0.hi << 64 + q0.lo) * (v.hi << 64 + v.lo)
     736             :     // = (q0.hi * v.hi << 128) + (q0.hi * v.lo << 64) +
     737             :     //   (q0.lo * v.hi <<  64) + q0.lo * v.lo)
     738             :     // Each term is 128 bit
     739             :     // High half of full product (upper 128 bits!) are dropped
     740             :     u128_t q0v = {0, 0};
     741             :     q0v.hi = q0.hi * v.lo + q0.lo * v.hi + libdivide_mullhi_u64(q0.lo, v.lo);
     742             :     q0v.lo = q0.lo * v.lo;
     743             : 
     744             :     // Compute u - q0v as u_q0v
     745             :     // This is the remainder
     746             :     u128_t u_q0v = u;
     747             :     u_q0v.hi -= q0v.hi + (u.lo < q0v.lo);  // second term is borrow
     748             :     u_q0v.lo -= q0v.lo;
     749             : 
     750             :     // Check if u_q0v >= v
     751             :     // This checks if our remainder is larger than the divisor
     752             :     if ((u_q0v.hi > v.hi) || (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) {
     753             :         // Increment q0
     754             :         q0.lo += 1;
     755             :         q0.hi += (q0.lo == 0);  // carry
     756             : 
     757             :         // Subtract v from remainder
     758             :         u_q0v.hi -= v.hi + (u_q0v.lo < v.lo);
     759             :         u_q0v.lo -= v.lo;
     760             :     }
     761             : 
     762             :     *r_hi = u_q0v.hi;
     763             :     *r_lo = u_q0v.lo;
     764             : 
     765             :     LIBDIVIDE_ASSERT(q0.hi == 0);
     766             :     return q0.lo;
     767             : #endif
     768             : }
     769             : 
     770             : ////////// UINT16
     771             : 
     772             : static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
     773             :     uint16_t d, int branchfree) {
     774           8 :     if (d == 0) {
     775           0 :         LIBDIVIDE_ERROR("divider must be != 0");
     776             :     }
     777             : 
     778             :     struct libdivide_u16_t result;
     779           8 :     uint8_t floor_log_2_d = (uint8_t)(15 - libdivide_count_leading_zeros16(d));
     780             : 
     781             :     // Power of 2
     782           8 :     if ((d & (d - 1)) == 0) {
     783             :         // We need to subtract 1 from the shift value in case of an unsigned
     784             :         // branchfree divider because there is a hardcoded right shift by 1
     785             :         // in its division algorithm. Because of this we also need to add back
     786             :         // 1 in its recovery algorithm.
     787           4 :         result.magic = 0;
     788           4 :         result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
     789             :     } else {
     790             :         uint8_t more;
     791             :         uint16_t rem, proposed_m;
     792           4 :         proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem);
     793             : 
     794             :         LIBDIVIDE_ASSERT(rem > 0 && rem < d);
     795           4 :         const uint16_t e = d - rem;
     796             : 
     797             :         // This power works if e < 2**floor_log_2_d.
     798           4 :         if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) {
     799             :             // This power works
     800           3 :             more = floor_log_2_d;
     801             :         } else {
     802             :             // We have to use the general 17-bit algorithm.  We need to compute
     803             :             // (2**power) / d. However, we already have (2**(power-1))/d and
     804             :             // its remainder.  By doubling both, and then correcting the
     805             :             // remainder, we can compute the larger division.
     806             :             // don't care about overflow here - in fact, we expect it
     807           1 :             proposed_m += proposed_m;
     808           1 :             const uint16_t twice_rem = rem + rem;
     809           1 :             if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
     810           1 :             more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
     811             :         }
     812           4 :         result.magic = 1 + proposed_m;
     813           4 :         result.more = more;
     814             :         // result.more's shift should in general be ceil_log_2_d. But if we
     815             :         // used the smaller power, we subtract one from the shift because we're
     816             :         // using the smaller power. If we're using the larger power, we
     817             :         // subtract one from the shift because it's taken care of by the add
     818             :         // indicator. So floor_log_2_d happens to be correct in both cases.
     819             :     }
     820           8 :     return result;
     821             : }
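
A worked trace (our numbers) makes the two branches concrete. For d = 7: floor_log_2_d = 2; the 32/16 division computes 2^18 / 7 = 37449 rem 1, so e = 7 - 1 = 6, which is not below 2^2 = 4; the general 17-bit path therefore doubles proposed_m to 74898 (9362 after 16-bit truncation, the overflow we expect), twice_rem = 2 triggers no correction, and the result is magic = 9363 with more = 2 | LIBDIVIDE_ADD_MARKER = 0x42.

    // Check (ours) of the d = 7 trace above:
    static void check_u16_gen_7(void) {
        struct libdivide_u16_t d7 = libdivide_internal_u16_gen(7, 0);
        LIBDIVIDE_ASSERT(d7.magic == 9363 && d7.more == 0x42);
    }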
     822             : 
     823             : static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d) {
     824           8 :     return libdivide_internal_u16_gen(d, 0);
     825             : }
     826             : 
     827             : static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
     828             :     if (d == 1) {
     829             :         LIBDIVIDE_ERROR("branchfree divider must be != 1");
     830             :     }
     831             :     struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1);
     832             :     struct libdivide_u16_branchfree_t ret = {
     833             :         tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK)};
     834             :     return ret;
     835             : }
     836             : 
     837             : // The original libdivide_u16_do takes a const pointer. However, this cannot be used
      838             : // with a compile-time constant libdivide_u16_t: it will generate a warning about
     839             : // taking the address of a temporary. Hence this overload.
     840             : static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) {
     841             :     if (!magic) {
     842             :         return numer >> more;
     843             :     } else {
     844             :         uint16_t q = libdivide_mullhi_u16(numer, magic);
     845             :         if (more & LIBDIVIDE_ADD_MARKER) {
     846             :             uint16_t t = ((numer - q) >> 1) + q;
     847             :             return t >> (more & LIBDIVIDE_16_SHIFT_MASK);
     848             :         } else {
     849             :             // All upper bits are 0,
     850             :             // don't need to mask them off.
     851             :             return q >> more;
     852             :         }
     853             :     }
     854             : }
     855             : 
     856             : static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t *denom) {
     857             :     return libdivide_u16_do_raw(numer, denom->magic, denom->more);
     858             : }
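
The usual pattern is to generate the divider once, outside the hot loop, and then apply it repeatedly; a sketch (ours):

    #include <stddef.h>  // size_t
    // Sketch: replace each hardware division in a loop by the
    // precomputed multiply-and-shift divider.
    static void divide_all_u16(uint16_t *data, size_t n, uint16_t d) {
        struct libdivide_u16_t div = libdivide_u16_gen(d);
        for (size_t i = 0; i < n; i++) {
            data[i] = libdivide_u16_do(data[i], &div);  // == data[i] / d
        }
    }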
     859             : 
     860             : static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(
     861             :     uint16_t numer, const struct libdivide_u16_branchfree_t *denom) {
     862             :     uint16_t q = libdivide_mullhi_u16(numer, denom->magic);
     863             :     uint16_t t = ((numer - q) >> 1) + q;
     864             :     return t >> denom->more;
     865             : }
     866             : 
     867             : static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) {
     868             :     uint8_t more = denom->more;
     869             :     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
     870             : 
     871             :     if (!denom->magic) {
     872             :         return (uint16_t)1 << shift;
     873             :     } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
     874             :         // We compute q = n/d = n*m / 2^(16 + shift)
     875             :         // Therefore we have d = 2^(16 + shift) / m
     876             :         // We need to ceil it.
     877             :         // We know d is not a power of 2, so m is not a power of 2,
     878             :         // so we can just add 1 to the floor
     879             :         uint16_t hi_dividend = (uint16_t)1 << shift;
     880             :         uint16_t rem_ignored;
     881             :         return 1 + libdivide_32_div_16_to_16(hi_dividend, 0, denom->magic, &rem_ignored);
     882             :     } else {
     883             :         // Here we wish to compute d = 2^(16+shift+1)/(m+2^16).
     884             :         // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now
     885             :         // Also note that shift may be as high as 15, so shift + 1 will
     886             :         // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and
     887             :         // then double the quotient and remainder.
     888             :         uint32_t half_n = (uint32_t)1 << (16 + shift);
     889             :         uint32_t d = ((uint32_t)1 << 16) | denom->magic;
     890             :         // Note that the quotient is guaranteed <= 16 bits, but the remainder
     891             :         // may need 17!
     892             :         uint16_t half_q = (uint16_t)(half_n / d);
     893             :         uint32_t rem = half_n % d;
     894             :         // We computed 2^(16+shift)/(m+2^16)
      895             :         // Need to double it, and then add 1 to the quotient if doubling the
     896             :         // remainder would increase the quotient.
     897             :         // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits
     898             :         uint16_t full_q = half_q + half_q + ((rem << 1) >= d);
     899             : 
     900             :         // We rounded down in gen (hence +1)
     901             :         return full_q + 1;
     902             :     }
     903             : }
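
Since recover reconstructs d from the magic/more pair, gen followed by recover is the identity for any nonzero d; a property-check sketch (ours):

    // Round-trip (ours): recover(gen(d)) == d for every d != 0.
    static int check_u16_roundtrip(uint16_t d) {
        struct libdivide_u16_t div = libdivide_u16_gen(d);
        return libdivide_u16_recover(&div) == d;
    }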
     904             : 
     905             : static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(const struct libdivide_u16_branchfree_t *denom) {
     906             :     uint8_t more = denom->more;
     907             :     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
     908             : 
     909             :     if (!denom->magic) {
     910             :         return (uint16_t)1 << (shift + 1);
     911             :     } else {
     912             :         // Here we wish to compute d = 2^(16+shift+1)/(m+2^16).
     913             :         // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now
     914             :         // Also note that shift may be as high as 15, so shift + 1 will
     915             :         // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and
     916             :         // then double the quotient and remainder.
     917             :         uint32_t half_n = (uint32_t)1 << (16 + shift);
     918             :         uint32_t d = ((uint32_t)1 << 16) | denom->magic;
     919             :         // Note that the quotient is guaranteed <= 16 bits, but the remainder
     920             :         // may need 17!
     921             :         uint16_t half_q = (uint16_t)(half_n / d);
     922             :         uint32_t rem = half_n % d;
     923             :         // We computed 2^(16+shift)/(m+2^16)
      924             :         // Need to double it, and then add 1 to the quotient if doubling the
     925             :         // remainder would increase the quotient.
      926             :         // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits
     927             :         uint16_t full_q = half_q + half_q + ((rem << 1) >= d);
     928             : 
     929             :         // We rounded down in gen (hence +1)
     930             :         return full_q + 1;
     931             :     }
     932             : }
     933             : 
     934             : ////////// UINT32
     935             : 
     936             : static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_internal_u32_gen(
     937             :     uint32_t d, int branchfree) {
     938          13 :     if (d == 0) {
     939           0 :         LIBDIVIDE_ERROR("divider must be != 0");
     940             :     }
     941             : 
     942             :     struct libdivide_u32_t result;
     943          13 :     uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(d);
     944             : 
     945             :     // Power of 2
     946          13 :     if ((d & (d - 1)) == 0) {
     947             :         // We need to subtract 1 from the shift value in case of an unsigned
     948             :         // branchfree divider because there is a hardcoded right shift by 1
     949             :         // in its division algorithm. Because of this we also need to add back
     950             :         // 1 in its recovery algorithm.
     951           8 :         result.magic = 0;
     952           8 :         result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
     953             :     } else {
     954             :         uint8_t more;
     955             :         uint32_t rem, proposed_m;
     956           5 :         proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << floor_log_2_d, 0, d, &rem);
     957             : 
     958             :         LIBDIVIDE_ASSERT(rem > 0 && rem < d);
     959           5 :         const uint32_t e = d - rem;
     960             : 
     961             :         // This power works if e < 2**floor_log_2_d.
     962           5 :         if (!branchfree && (e < ((uint32_t)1 << floor_log_2_d))) {
     963             :             // This power works
     964           5 :             more = (uint8_t)floor_log_2_d;
     965             :         } else {
     966             :             // We have to use the general 33-bit algorithm.  We need to compute
     967             :             // (2**power) / d. However, we already have (2**(power-1))/d and
     968             :             // its remainder.  By doubling both, and then correcting the
     969             :             // remainder, we can compute the larger division.
     970             :             // don't care about overflow here - in fact, we expect it
     971           0 :             proposed_m += proposed_m;
     972           0 :             const uint32_t twice_rem = rem + rem;
     973           0 :             if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
     974           0 :             more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
     975             :         }
     976           5 :         result.magic = 1 + proposed_m;
     977           5 :         result.more = more;
     978             :         // result.more's shift should in general be ceil_log_2_d. But if we
     979             :         // used the smaller power, we subtract one from the shift because we're
     980             :         // using the smaller power. If we're using the larger power, we
     981             :         // subtract one from the shift because it's taken care of by the add
     982             :         // indicator. So floor_log_2_d happens to be correct in both cases.
     983             :     }
     984          13 :     return result;
     985             : }
     986             : 
     987             : static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d) {
     988          13 :     return libdivide_internal_u32_gen(d, 0);
     989             : }
     990             : 
     991             : static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) {
     992             :     if (d == 1) {
     993             :         LIBDIVIDE_ERROR("branchfree divider must be != 1");
     994             :     }
     995             :     struct libdivide_u32_t tmp = libdivide_internal_u32_gen(d, 1);
     996             :     struct libdivide_u32_branchfree_t ret = {
     997             :         tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)};
     998             :     return ret;
     999             : }
    1000             : 
    1001             : static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(uint32_t numer, uint32_t magic, uint8_t more) {
    1002             :     if (!magic) {
    1003             :         return numer >> more;
    1004             :     } else {
    1005             :         uint32_t q = libdivide_mullhi_u32(numer, magic);
    1006             :         if (more & LIBDIVIDE_ADD_MARKER) {
    1007             :             uint32_t t = ((numer - q) >> 1) + q;
    1008             :             return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
    1009             :         } else {
    1010             :             // All upper bits are 0,
    1011             :             // don't need to mask them off.
    1012             :             return q >> more;
    1013             :         }
    1014             :     }
    1015             : }
    1016             : 
    1017             : static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
    1018             :     return libdivide_u32_do_raw(numer, denom->magic, denom->more);
    1019             : }
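// Editor's sketch (not part of libdivide, guarded out): typical u32 usage.
// gen() runs once per divisor; do() then replaces each hardware divide, and
// recover() extracts the original divisor from the struct.
#if 0
static void example_u32_round_trip(void) {
    struct libdivide_u32_t den = libdivide_u32_gen(7);
    uint32_t q = libdivide_u32_do(100, &den);    // q == 100 / 7 == 14
    uint32_t d = libdivide_u32_recover(&den);    // d == 7
    (void)q;
    (void)d;
}
#endif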
    1020             : 
    1021             : static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do(
    1022             :     uint32_t numer, const struct libdivide_u32_branchfree_t *denom) {
    1023             :     uint32_t q = libdivide_mullhi_u32(numer, denom->magic);
    1024             :     uint32_t t = ((numer - q) >> 1) + q;
    1025             :     return t >> denom->more;
    1026             : }
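// Editor's note: the expression ((numer - q) >> 1) + q above is the
// overflow-safe form of (numer + q) / 2. Since q = mullhi(numer, magic) is
// never larger than numer, numer - q cannot wrap, while numer + q could
// exceed 32 bits. A scalar sketch of the identity:
#if 0
static uint32_t example_overflow_safe_average(uint32_t numer, uint32_t q) {
    // Requires numer >= q; equals floor((numer + q) / 2) without overflow.
    return ((numer - q) >> 1) + q;
}
#endif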
    1027             : 
    1028             : static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) {
    1029             :     uint8_t more = denom->more;
    1030             :     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    1031             : 
    1032             :     if (!denom->magic) {
    1033             :         return (uint32_t)1 << shift;
    1034             :     } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
    1035             :         // We compute q = n/d = n*m / 2^(32 + shift)
    1036             :         // Therefore we have d = 2^(32 + shift) / m
    1037             :         // We need to ceil it.
    1038             :         // We know d is not a power of 2, so m is not a power of 2,
    1039             :         // so we can just add 1 to the floor
    1040             :         uint32_t hi_dividend = (uint32_t)1 << shift;
    1041             :         uint32_t rem_ignored;
    1042             :         return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored);
    1043             :     } else {
    1044             :         // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).
    1045             :         // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now
    1046             :         // Also note that shift may be as high as 31, so shift + 1 will
    1047             :         // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and
    1048             :         // then double the quotient and remainder.
    1049             :         uint64_t half_n = (uint64_t)1 << (32 + shift);
    1050             :         uint64_t d = ((uint64_t)1 << 32) | denom->magic;
    1051             :         // Note that the quotient is guaranteed <= 32 bits, but the remainder
    1052             :         // may need 33!
    1053             :         uint32_t half_q = (uint32_t)(half_n / d);
    1054             :         uint64_t rem = half_n % d;
    1055             :         // We computed 2^(32+shift)/(m+2^32)
     1056             :         // Need to double it, and then add 1 to the quotient if doubling the
    1057             :         // remainder would increase the quotient.
    1058             :         // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits
    1059             :         uint32_t full_q = half_q + half_q + ((rem << 1) >= d);
    1060             : 
    1061             :         // We rounded down in gen (hence +1)
    1062             :         return full_q + 1;
    1063             :     }
    1064             : }
    1065             : 
    1066             : static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom) {
    1067             :     uint8_t more = denom->more;
    1068             :     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    1069             : 
    1070             :     if (!denom->magic) {
    1071             :         return (uint32_t)1 << (shift + 1);
    1072             :     } else {
    1073             :         // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).
    1074             :         // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now
    1075             :         // Also note that shift may be as high as 31, so shift + 1 will
    1076             :         // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and
    1077             :         // then double the quotient and remainder.
    1078             :         uint64_t half_n = (uint64_t)1 << (32 + shift);
    1079             :         uint64_t d = ((uint64_t)1 << 32) | denom->magic;
    1080             :         // Note that the quotient is guaranteed <= 32 bits, but the remainder
    1081             :         // may need 33!
    1082             :         uint32_t half_q = (uint32_t)(half_n / d);
    1083             :         uint64_t rem = half_n % d;
    1084             :         // We computed 2^(32+shift)/(m+2^32)
     1085             :         // Need to double it, and then add 1 to the quotient if doubling the
    1086             :         // remainder would increase the quotient.
    1087             :         // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits
    1088             :         uint32_t full_q = half_q + half_q + ((rem << 1) >= d);
    1089             : 
    1090             :         // We rounded down in gen (hence +1)
    1091             :         return full_q + 1;
    1092             :     }
    1093             : }
    1094             : 
    1095             : ////////// UINT64
    1096             : 
    1097             : static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_internal_u64_gen(
    1098             :     uint64_t d, int branchfree) {
    1099             :     if (d == 0) {
    1100             :         LIBDIVIDE_ERROR("divider must be != 0");
    1101             :     }
    1102             : 
    1103             :     struct libdivide_u64_t result;
    1104             :     uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(d);
    1105             : 
    1106             :     // Power of 2
    1107             :     if ((d & (d - 1)) == 0) {
    1108             :         // We need to subtract 1 from the shift value in case of an unsigned
    1109             :         // branchfree divider because there is a hardcoded right shift by 1
    1110             :         // in its division algorithm. Because of this we also need to add back
    1111             :         // 1 in its recovery algorithm.
    1112             :         result.magic = 0;
    1113             :         result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
    1114             :     } else {
    1115             :         uint64_t proposed_m, rem;
    1116             :         uint8_t more;
    1117             :         // (1 << (64 + floor_log_2_d)) / d
    1118             :         proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << floor_log_2_d, 0, d, &rem);
    1119             : 
    1120             :         LIBDIVIDE_ASSERT(rem > 0 && rem < d);
    1121             :         const uint64_t e = d - rem;
    1122             : 
    1123             :         // This power works if e < 2**floor_log_2_d.
    1124             :         if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) {
    1125             :             // This power works
    1126             :             more = (uint8_t)floor_log_2_d;
    1127             :         } else {
    1128             :             // We have to use the general 65-bit algorithm.  We need to compute
    1129             :             // (2**power) / d. However, we already have (2**(power-1))/d and
    1130             :             // its remainder. By doubling both, and then correcting the
    1131             :             // remainder, we can compute the larger division.
    1132             :             // don't care about overflow here - in fact, we expect it
    1133             :             proposed_m += proposed_m;
    1134             :             const uint64_t twice_rem = rem + rem;
    1135             :             if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
    1136             :             more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
    1137             :         }
    1138             :         result.magic = 1 + proposed_m;
    1139             :         result.more = more;
    1140             :         // result.more's shift should in general be ceil_log_2_d. But if we
    1141             :         // used the smaller power, we subtract one from the shift because we're
    1142             :         // using the smaller power. If we're using the larger power, we
    1143             :         // subtract one from the shift because it's taken care of by the add
    1144             :         // indicator. So floor_log_2_d happens to be correct in both cases,
    1145             :         // which is why we do it outside of the if statement.
    1146             :     }
    1147             :     return result;
    1148             : }
    1149             : 
    1150             : static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d) {
    1151             :     return libdivide_internal_u64_gen(d, 0);
    1152             : }
    1153             : 
    1154             : static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) {
    1155             :     if (d == 1) {
    1156             :         LIBDIVIDE_ERROR("branchfree divider must be != 1");
    1157             :     }
    1158             :     struct libdivide_u64_t tmp = libdivide_internal_u64_gen(d, 1);
    1159             :     struct libdivide_u64_branchfree_t ret = {
    1160             :         tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)};
    1161             :     return ret;
    1162             : }
    1163             : 
    1164             : static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(uint64_t numer, uint64_t magic, uint8_t more) {
     1165             :     if (!magic) {
    1166             :         return numer >> more;
    1167             :     } else {
    1168             :         uint64_t q = libdivide_mullhi_u64(numer, magic);
    1169             :         if (more & LIBDIVIDE_ADD_MARKER) {
    1170             :             uint64_t t = ((numer - q) >> 1) + q;
    1171             :             return t >> (more & LIBDIVIDE_64_SHIFT_MASK);
    1172             :         } else {
    1173             :             // All upper bits are 0,
    1174             :             // don't need to mask them off.
    1175             :             return q >> more;
    1176             :         }
    1177             :     }
    1178             : }
    1179             : 
    1180             : static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
    1181             :     return libdivide_u64_do_raw(numer, denom->magic, denom->more);
    1182             : }
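// Editor's sketch (not part of libdivide, guarded out): u64 usage mirrors
// u32; only the magic number is 64 bits wide and the high multiply is a
// 64x64->128 operation.
#if 0
static void example_u64_round_trip(void) {
    struct libdivide_u64_t den = libdivide_u64_gen(1000000007ull);
    uint64_t q = libdivide_u64_do(2000000014ull, &den);  // q == 2
    (void)q;
}
#endif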
    1183             : 
    1184             : static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do(
    1185             :     uint64_t numer, const struct libdivide_u64_branchfree_t *denom) {
    1186             :     uint64_t q = libdivide_mullhi_u64(numer, denom->magic);
    1187             :     uint64_t t = ((numer - q) >> 1) + q;
    1188             :     return t >> denom->more;
    1189             : }
    1190             : 
    1191             : static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) {
    1192             :     uint8_t more = denom->more;
    1193             :     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    1194             : 
    1195             :     if (!denom->magic) {
    1196             :         return (uint64_t)1 << shift;
    1197             :     } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
    1198             :         // We compute q = n/d = n*m / 2^(64 + shift)
    1199             :         // Therefore we have d = 2^(64 + shift) / m
    1200             :         // We need to ceil it.
    1201             :         // We know d is not a power of 2, so m is not a power of 2,
    1202             :         // so we can just add 1 to the floor
    1203             :         uint64_t hi_dividend = (uint64_t)1 << shift;
    1204             :         uint64_t rem_ignored;
    1205             :         return 1 + libdivide_128_div_64_to_64(hi_dividend, 0, denom->magic, &rem_ignored);
    1206             :     } else {
    1207             :         // Here we wish to compute d = 2^(64+shift+1)/(m+2^64).
    1208             :         // Notice (m + 2^64) is a 65 bit number. This gets hairy. See
    1209             :         // libdivide_u32_recover for more on what we do here.
    1210             :         // TODO: do something better than 128 bit math
    1211             : 
    1212             :         // Full n is a (potentially) 129 bit value
    1213             :         // half_n is a 128 bit value
    1214             :         // Compute the hi half of half_n. Low half is 0.
    1215             :         uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0;
    1216             :         // d is a 65 bit value. The high bit is always set to 1.
    1217             :         const uint64_t d_hi = 1, d_lo = denom->magic;
    1218             :         // Note that the quotient is guaranteed <= 64 bits,
    1219             :         // but the remainder may need 65!
    1220             :         uint64_t r_hi, r_lo;
    1221             :         uint64_t half_q =
    1222             :             libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo);
    1223             :         // We computed 2^(64+shift)/(m+2^64)
    1224             :         // Double the remainder ('dr') and check if that is larger than d
     1225             :         // Note that d is a 65 bit value, so r_hi is at most 1 and r_hi + r_hi
     1226             :         // cannot overflow
    1227             :         uint64_t dr_lo = r_lo + r_lo;
    1228             :         uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo);  // last term is carry
    1229             :         int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo);
    1230             :         uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0);
    1231             :         return full_q + 1;
    1232             :     }
    1233             : }
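// Editor's note: the (dr_hi, dr_lo) pair above doubles a 128-bit value held
// in two 64-bit halves; the carry out of the low half is detected with the
// unsigned wrap test (dr_lo < r_lo). A standalone sketch of the idiom:
#if 0
static void example_double_u128(
    uint64_t hi, uint64_t lo, uint64_t *out_hi, uint64_t *out_lo) {
    uint64_t dlo = lo + lo;          // low half doubled (may wrap)
    *out_hi = hi + hi + (dlo < lo);  // a wrap of the low half is the carry
    *out_lo = dlo;
}
#endif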
    1234             : 
    1235             : static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom) {
    1236             :     uint8_t more = denom->more;
    1237             :     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    1238             : 
    1239             :     if (!denom->magic) {
    1240             :         return (uint64_t)1 << (shift + 1);
    1241             :     } else {
    1242             :         // Here we wish to compute d = 2^(64+shift+1)/(m+2^64).
    1243             :         // Notice (m + 2^64) is a 65 bit number. This gets hairy. See
    1244             :         // libdivide_u32_recover for more on what we do here.
    1245             :         // TODO: do something better than 128 bit math
    1246             : 
    1247             :         // Full n is a (potentially) 129 bit value
    1248             :         // half_n is a 128 bit value
    1249             :         // Compute the hi half of half_n. Low half is 0.
    1250             :         uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0;
    1251             :         // d is a 65 bit value. The high bit is always set to 1.
    1252             :         const uint64_t d_hi = 1, d_lo = denom->magic;
    1253             :         // Note that the quotient is guaranteed <= 64 bits,
    1254             :         // but the remainder may need 65!
    1255             :         uint64_t r_hi, r_lo;
    1256             :         uint64_t half_q =
    1257             :             libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo);
    1258             :         // We computed 2^(64+shift)/(m+2^64)
    1259             :         // Double the remainder ('dr') and check if that is larger than d
     1260             :         // Note that d is a 65 bit value, so r_hi is at most 1 and r_hi + r_hi
     1261             :         // cannot overflow
    1262             :         uint64_t dr_lo = r_lo + r_lo;
    1263             :         uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo);  // last term is carry
    1264             :         int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo);
    1265             :         uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0);
    1266             :         return full_q + 1;
    1267             :     }
    1268             : }
    1269             : 
    1270             : ////////// SINT16
    1271             : 
    1272             : static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen(
    1273             :     int16_t d, int branchfree) {
    1274             :     if (d == 0) {
    1275             :         LIBDIVIDE_ERROR("divider must be != 0");
    1276             :     }
    1277             : 
    1278             :     struct libdivide_s16_t result;
    1279             : 
    1280             :     // If d is a power of 2, or negative a power of 2, we have to use a shift.
    1281             :     // This is especially important because the magic algorithm fails for -1.
    1282             :     // To check if d is a power of 2 or its inverse, it suffices to check
    1283             :     // whether its absolute value has exactly one bit set. This works even for
    1284             :     // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
    1285             :     // and is a power of 2.
    1286             :     uint16_t ud = (uint16_t)d;
    1287             :     uint16_t absD = (d < 0) ? -ud : ud;
    1288             :     uint16_t floor_log_2_d = 15 - libdivide_count_leading_zeros16(absD);
    1289             :     // check if exactly one bit is set,
    1290             :     // don't care if absD is 0 since that's divide by zero
    1291             :     if ((absD & (absD - 1)) == 0) {
    1292             :         // Branchfree and normal paths are exactly the same
    1293             :         result.magic = 0;
    1294             :         result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
    1295             :     } else {
    1296             :         LIBDIVIDE_ASSERT(floor_log_2_d >= 1);
    1297             : 
    1298             :         uint8_t more;
     1299             :         // the dividend here is 2**(floor_log_2_d + 15), so the low 16 bit word
     1300             :         // is 0 and the high word is 2**(floor_log_2_d - 1)
    1301             :         uint16_t rem, proposed_m;
    1302             :         proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << (floor_log_2_d - 1), 0, absD, &rem);
    1303             :         const uint16_t e = absD - rem;
    1304             : 
    1305             :         // We are going to start with a power of floor_log_2_d - 1.
     1306             :         // This works if e < 2**floor_log_2_d.
    1307             :         if (!branchfree && e < ((uint16_t)1 << floor_log_2_d)) {
    1308             :             // This power works
    1309             :             more = (uint8_t)(floor_log_2_d - 1);
    1310             :         } else {
    1311             :             // We need to go one higher. This should not make proposed_m
    1312             :             // overflow, but it will make it negative when interpreted as an
    1313             :             // int16_t.
    1314             :             proposed_m += proposed_m;
    1315             :             const uint16_t twice_rem = rem + rem;
    1316             :             if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
    1317             :             more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
    1318             :         }
    1319             : 
    1320             :         proposed_m += 1;
    1321             :         int16_t magic = (int16_t)proposed_m;
    1322             : 
    1323             :         // Mark if we are negative. Note we only negate the magic number in the
    1324             :         // branchfull case.
    1325             :         if (d < 0) {
    1326             :             more |= LIBDIVIDE_NEGATIVE_DIVISOR;
    1327             :             if (!branchfree) {
    1328             :                 magic = -magic;
    1329             :             }
    1330             :         }
    1331             : 
    1332             :         result.more = more;
    1333             :         result.magic = magic;
    1334             :     }
    1335             :     return result;
    1336             : }
    1337             : 
    1338             : static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d) {
    1339             :     return libdivide_internal_s16_gen(d, 0);
    1340             : }
    1341             : 
    1342             : static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d) {
    1343             :     struct libdivide_s16_t tmp = libdivide_internal_s16_gen(d, 1);
    1344             :     struct libdivide_s16_branchfree_t result = {tmp.magic, tmp.more};
    1345             :     return result;
    1346             : }
    1347             : 
    1348             : // The original libdivide_s16_do takes a const pointer. However, this cannot be used
    1349             : // with a compile time constant libdivide_s16_t: it will generate a warning about
    1350             : // taking the address of a temporary. Hence this overload.
    1351             : static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more) {
    1352             :     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    1353             : 
    1354             :     if (!magic) {
    1355             :         uint16_t sign = (int8_t)more >> 7;
    1356             :         uint16_t mask = ((uint16_t)1 << shift) - 1;
    1357             :         uint16_t uq = numer + ((numer >> 15) & mask);
    1358             :         int16_t q = (int16_t)uq;
    1359             :         q >>= shift;
    1360             :         q = (q ^ sign) - sign;
    1361             :         return q;
    1362             :     } else {
    1363             :         uint16_t uq = (uint16_t)libdivide_mullhi_s16(numer, magic);
    1364             :         if (more & LIBDIVIDE_ADD_MARKER) {
    1365             :             // must be arithmetic shift and then sign extend
    1366             :             int16_t sign = (int8_t)more >> 7;
    1367             :             // q += (more < 0 ? -numer : numer)
    1368             :             // cast required to avoid UB
    1369             :             uq += ((uint16_t)numer ^ sign) - sign;
    1370             :         }
    1371             :         int16_t q = (int16_t)uq;
    1372             :         q >>= shift;
    1373             :         q += (q < 0);
    1374             :         return q;
    1375             :     }
    1376             : }
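// Editor's sketch (not part of libdivide, guarded out): because do_raw takes
// the magic/more pair by value, it can be used where taking the address of a
// temporary or compile-time-constant divider would warn.
#if 0
static int16_t example_s16_do_raw(int16_t numer) {
    const struct libdivide_s16_t den = libdivide_s16_gen(-7);
    return libdivide_s16_do_raw(numer, den.magic, den.more);  // numer / -7
}
#endif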
    1377             : 
    1378             : static LIBDIVIDE_INLINE int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) {
    1379             :     return libdivide_s16_do_raw(numer, denom->magic, denom->more);
    1380             : }
    1381             : 
    1382             : static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(int16_t numer, const struct libdivide_s16_branchfree_t *denom) {
    1383             :     uint8_t more = denom->more;
    1384             :     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    1385             :     // must be arithmetic shift and then sign extend
    1386             :     int16_t sign = (int8_t)more >> 7;
    1387             :     int16_t magic = denom->magic;
    1388             :     int16_t q = libdivide_mullhi_s16(numer, magic);
    1389             :     q += numer;
    1390             : 
    1391             :     // If q is non-negative, we have nothing to do
    1392             :     // If q is negative, we want to add either (2**shift)-1 if d is a power of
    1393             :     // 2, or (2**shift) if it is not a power of 2
    1394             :     uint16_t is_power_of_2 = (magic == 0);
    1395             :     uint16_t q_sign = (uint16_t)(q >> 15);
    1396             :     q += q_sign & (((uint16_t)1 << shift) - is_power_of_2);
    1397             : 
    1398             :     // Now arithmetic right shift
    1399             :     q >>= shift;
    1400             :     // Negate if needed
    1401             :     q = (q ^ sign) - sign;
    1402             : 
    1403             :     return q;
    1404             : }
    1405             : 
    1406             : static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom) {
    1407             :     uint8_t more = denom->more;
    1408             :     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    1409             :     if (!denom->magic) {
    1410             :         uint16_t absD = (uint16_t)1 << shift;
    1411             :         if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
    1412             :             absD = -absD;
    1413             :         }
    1414             :         return (int16_t)absD;
    1415             :     } else {
    1416             :         // Unsigned math is much easier
    1417             :         // We negate the magic number only in the branchfull case, and we don't
    1418             :         // know which case we're in. However we have enough information to
    1419             :         // determine the correct sign of the magic number. The divisor was
    1420             :         // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set,
    1421             :         // the magic number's sign is opposite that of the divisor.
    1422             :         // We want to compute the positive magic number.
    1423             :         int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
    1424             :         int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;
    1425             : 
    1426             :         // Handle the power of 2 case (including branchfree)
    1427             :         if (denom->magic == 0) {
    1428             :             int16_t result = (uint16_t)1 << shift;
    1429             :             return negative_divisor ? -result : result;
    1430             :         }
    1431             : 
    1432             :         uint16_t d = (uint16_t)(magic_was_negated ? -denom->magic : denom->magic);
    1433             :         uint32_t n = (uint32_t)1 << (16 + shift);  // this shift cannot exceed 30
    1434             :         uint16_t q = (uint16_t)(n / d);
    1435             :         int16_t result = (int16_t)q;
    1436             :         result += 1;
    1437             :         return negative_divisor ? -result : result;
    1438             :     }
    1439             : }
    1440             : 
    1441             : static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(const struct libdivide_s16_branchfree_t *denom) {
    1442             :     const struct libdivide_s16_t den = {denom->magic, denom->more};
    1443             :     return libdivide_s16_recover(&den);
    1444             : }
    1445             : 
    1446             : ////////// SINT32
    1447             : 
    1448             : static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_internal_s32_gen(
    1449             :     int32_t d, int branchfree) {
    1450             :     if (d == 0) {
    1451             :         LIBDIVIDE_ERROR("divider must be != 0");
    1452             :     }
    1453             : 
    1454             :     struct libdivide_s32_t result;
    1455             : 
    1456             :     // If d is a power of 2, or negative a power of 2, we have to use a shift.
    1457             :     // This is especially important because the magic algorithm fails for -1.
    1458             :     // To check if d is a power of 2 or its inverse, it suffices to check
    1459             :     // whether its absolute value has exactly one bit set. This works even for
    1460             :     // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
    1461             :     // and is a power of 2.
    1462             :     uint32_t ud = (uint32_t)d;
    1463             :     uint32_t absD = (d < 0) ? -ud : ud;
    1464             :     uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(absD);
    1465             :     // check if exactly one bit is set,
    1466             :     // don't care if absD is 0 since that's divide by zero
    1467             :     if ((absD & (absD - 1)) == 0) {
    1468             :         // Branchfree and normal paths are exactly the same
    1469             :         result.magic = 0;
    1470             :         result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
    1471             :     } else {
    1472             :         LIBDIVIDE_ASSERT(floor_log_2_d >= 1);
    1473             : 
    1474             :         uint8_t more;
    1475             :         // the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word
     1476             :         // is 0 and the high word is 2**(floor_log_2_d - 1)
    1477             :         uint32_t rem, proposed_m;
    1478             :         proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << (floor_log_2_d - 1), 0, absD, &rem);
    1479             :         const uint32_t e = absD - rem;
    1480             : 
    1481             :         // We are going to start with a power of floor_log_2_d - 1.
     1482             :         // This works if e < 2**floor_log_2_d.
    1483             :         if (!branchfree && e < ((uint32_t)1 << floor_log_2_d)) {
    1484             :             // This power works
    1485             :             more = (uint8_t)(floor_log_2_d - 1);
    1486             :         } else {
    1487             :             // We need to go one higher. This should not make proposed_m
    1488             :             // overflow, but it will make it negative when interpreted as an
    1489             :             // int32_t.
    1490             :             proposed_m += proposed_m;
    1491             :             const uint32_t twice_rem = rem + rem;
    1492             :             if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
    1493             :             more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
    1494             :         }
    1495             : 
    1496             :         proposed_m += 1;
    1497             :         int32_t magic = (int32_t)proposed_m;
    1498             : 
    1499             :         // Mark if we are negative. Note we only negate the magic number in the
    1500             :         // branchfull case.
    1501             :         if (d < 0) {
    1502             :             more |= LIBDIVIDE_NEGATIVE_DIVISOR;
    1503             :             if (!branchfree) {
    1504             :                 magic = -magic;
    1505             :             }
    1506             :         }
    1507             : 
    1508             :         result.more = more;
    1509             :         result.magic = magic;
    1510             :     }
    1511             :     return result;
    1512             : }
    1513             : 
    1514             : static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d) {
    1515             :     return libdivide_internal_s32_gen(d, 0);
    1516             : }
    1517             : 
    1518             : static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) {
    1519             :     struct libdivide_s32_t tmp = libdivide_internal_s32_gen(d, 1);
    1520             :     struct libdivide_s32_branchfree_t result = {tmp.magic, tmp.more};
    1521             :     return result;
    1522             : }
    1523             : 
    1524             : static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) {
    1525             :     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    1526             : 
    1527             :     if (!magic) {
    1528             :         uint32_t sign = (int8_t)more >> 7;
    1529             :         uint32_t mask = ((uint32_t)1 << shift) - 1;
    1530             :         uint32_t uq = numer + ((numer >> 31) & mask);
    1531             :         int32_t q = (int32_t)uq;
    1532             :         q >>= shift;
    1533             :         q = (q ^ sign) - sign;
    1534             :         return q;
    1535             :     } else {
    1536             :         uint32_t uq = (uint32_t)libdivide_mullhi_s32(numer, magic);
    1537             :         if (more & LIBDIVIDE_ADD_MARKER) {
    1538             :             // must be arithmetic shift and then sign extend
    1539             :             int32_t sign = (int8_t)more >> 7;
    1540             :             // q += (more < 0 ? -numer : numer)
    1541             :             // cast required to avoid UB
    1542             :             uq += ((uint32_t)numer ^ sign) - sign;
    1543             :         }
    1544             :         int32_t q = (int32_t)uq;
    1545             :         q >>= shift;
    1546             :         q += (q < 0);
    1547             :         return q;
    1548             :     }
    1549             : }
    1550             : 
    1551             : static LIBDIVIDE_INLINE int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
    1552             :     return libdivide_s32_do_raw(numer, denom->magic, denom->more);
    1553             : }
    1554             : 
    1555             : static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) {
    1556             :     uint8_t more = denom->more;
    1557             :     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    1558             :     // must be arithmetic shift and then sign extend
    1559             :     int32_t sign = (int8_t)more >> 7;
    1560             :     int32_t magic = denom->magic;
    1561             :     int32_t q = libdivide_mullhi_s32(numer, magic);
    1562             :     q += numer;
    1563             : 
    1564             :     // If q is non-negative, we have nothing to do
    1565             :     // If q is negative, we want to add either (2**shift)-1 if d is a power of
    1566             :     // 2, or (2**shift) if it is not a power of 2
    1567             :     uint32_t is_power_of_2 = (magic == 0);
    1568             :     uint32_t q_sign = (uint32_t)(q >> 31);
    1569             :     q += q_sign & (((uint32_t)1 << shift) - is_power_of_2);
    1570             : 
    1571             :     // Now arithmetic right shift
    1572             :     q >>= shift;
    1573             :     // Negate if needed
    1574             :     q = (q ^ sign) - sign;
    1575             : 
    1576             :     return q;
    1577             : }
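// Editor's sketch (not part of libdivide, guarded out): the signed branchfree
// path carries the divisor's sign in the top bit of 'more', so negative
// divisors work transparently.
#if 0
static void example_s32_branchfree(void) {
    struct libdivide_s32_branchfree_t bf = libdivide_s32_branchfree_gen(-5);
    int32_t q = libdivide_s32_branchfree_do(100, &bf);  // q == 100 / -5 == -20
    (void)q;
}
#endif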
    1578             : 
    1579             : static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) {
    1580             :     uint8_t more = denom->more;
    1581             :     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    1582             :     if (!denom->magic) {
    1583             :         uint32_t absD = (uint32_t)1 << shift;
    1584             :         if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
    1585             :             absD = -absD;
    1586             :         }
    1587             :         return (int32_t)absD;
    1588             :     } else {
    1589             :         // Unsigned math is much easier
    1590             :         // We negate the magic number only in the branchfull case, and we don't
    1591             :         // know which case we're in. However we have enough information to
    1592             :         // determine the correct sign of the magic number. The divisor was
    1593             :         // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set,
    1594             :         // the magic number's sign is opposite that of the divisor.
    1595             :         // We want to compute the positive magic number.
    1596             :         int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
    1597             :         int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;
    1598             : 
    1599             :         // Handle the power of 2 case (including branchfree)
    1600             :         if (denom->magic == 0) {
    1601             :             int32_t result = (uint32_t)1 << shift;
    1602             :             return negative_divisor ? -result : result;
    1603             :         }
    1604             : 
    1605             :         uint32_t d = (uint32_t)(magic_was_negated ? -denom->magic : denom->magic);
    1606             :         uint64_t n = (uint64_t)1 << (32 + shift);  // this shift cannot exceed 30
    1607             :         uint32_t q = (uint32_t)(n / d);
    1608             :         int32_t result = (int32_t)q;
    1609             :         result += 1;
    1610             :         return negative_divisor ? -result : result;
    1611             :     }
    1612             : }
    1613             : 
    1614             : static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom) {
    1615             :     const struct libdivide_s32_t den = {denom->magic, denom->more};
    1616             :     return libdivide_s32_recover(&den);
    1617             : }
    1618             : 
    1619             : ////////// SINT64
    1620             : 
    1621             : static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_internal_s64_gen(
    1622             :     int64_t d, int branchfree) {
    1623             :     if (d == 0) {
    1624             :         LIBDIVIDE_ERROR("divider must be != 0");
    1625             :     }
    1626             : 
    1627             :     struct libdivide_s64_t result;
    1628             : 
    1629             :     // If d is a power of 2, or negative a power of 2, we have to use a shift.
    1630             :     // This is especially important because the magic algorithm fails for -1.
    1631             :     // To check if d is a power of 2 or its inverse, it suffices to check
    1632             :     // whether its absolute value has exactly one bit set.  This works even for
    1633             :     // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
    1634             :     // and is a power of 2.
    1635             :     uint64_t ud = (uint64_t)d;
    1636             :     uint64_t absD = (d < 0) ? -ud : ud;
    1637             :     uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(absD);
    1638             :     // check if exactly one bit is set,
    1639             :     // don't care if absD is 0 since that's divide by zero
    1640             :     if ((absD & (absD - 1)) == 0) {
    1641             :         // Branchfree and non-branchfree cases are the same
    1642             :         result.magic = 0;
    1643             :         result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
    1644             :     } else {
    1645             :         // the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word
     1646             :         // is 0 and the high word is 2**(floor_log_2_d - 1)
    1647             :         uint8_t more;
    1648             :         uint64_t rem, proposed_m;
    1649             :         proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << (floor_log_2_d - 1), 0, absD, &rem);
    1650             :         const uint64_t e = absD - rem;
    1651             : 
    1652             :         // We are going to start with a power of floor_log_2_d - 1.
     1653             :         // This works if e < 2**floor_log_2_d.
    1654             :         if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) {
    1655             :             // This power works
    1656             :             more = (uint8_t)(floor_log_2_d - 1);
    1657             :         } else {
    1658             :             // We need to go one higher. This should not make proposed_m
    1659             :             // overflow, but it will make it negative when interpreted as an
     1660             :             // int64_t.
    1661             :             proposed_m += proposed_m;
    1662             :             const uint64_t twice_rem = rem + rem;
    1663             :             if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
     1664             :             // note that we only set the LIBDIVIDE_NEGATIVE_DIVISOR bit if we
     1665             :             // also set ADD_MARKER; this is an annoying optimization that
     1666             :             // enables algorithm #4 to avoid the mask. However, we always set
     1667             :             // it in the branchfree case
    1668             :             more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
    1669             :         }
    1670             :         proposed_m += 1;
    1671             :         int64_t magic = (int64_t)proposed_m;
    1672             : 
    1673             :         // Mark if we are negative
    1674             :         if (d < 0) {
    1675             :             more |= LIBDIVIDE_NEGATIVE_DIVISOR;
    1676             :             if (!branchfree) {
    1677             :                 magic = -magic;
    1678             :             }
    1679             :         }
    1680             : 
    1681             :         result.more = more;
    1682             :         result.magic = magic;
    1683             :     }
    1684             :     return result;
    1685             : }
    1686             : 
    1687             : static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d) {
    1688             :     return libdivide_internal_s64_gen(d, 0);
    1689             : }
    1690             : 
    1691             : static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) {
    1692             :     struct libdivide_s64_t tmp = libdivide_internal_s64_gen(d, 1);
    1693             :     struct libdivide_s64_branchfree_t ret = {tmp.magic, tmp.more};
    1694             :     return ret;
    1695             : }
    1696             : 
    1697             : static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) {
    1698             :     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    1699             : 
    1700             :     if (!magic) {  // shift path
    1701             :         uint64_t mask = ((uint64_t)1 << shift) - 1;
    1702             :         uint64_t uq = numer + ((numer >> 63) & mask);
    1703             :         int64_t q = (int64_t)uq;
    1704             :         q >>= shift;
    1705             :         // must be arithmetic shift and then sign-extend
    1706             :         int64_t sign = (int8_t)more >> 7;
    1707             :         q = (q ^ sign) - sign;
    1708             :         return q;
    1709             :     } else {
    1710             :         uint64_t uq = (uint64_t)libdivide_mullhi_s64(numer, magic);
    1711             :         if (more & LIBDIVIDE_ADD_MARKER) {
    1712             :             // must be arithmetic shift and then sign extend
    1713             :             int64_t sign = (int8_t)more >> 7;
    1714             :             // q += (more < 0 ? -numer : numer)
    1715             :             // cast required to avoid UB
    1716             :             uq += ((uint64_t)numer ^ sign) - sign;
    1717             :         }
    1718             :         int64_t q = (int64_t)uq;
    1719             :         q >>= shift;
    1720             :         q += (q < 0);
    1721             :         return q;
    1722             :     }
    1723             : }
    1724             : 
    1725             : static LIBDIVIDE_INLINE int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
    1726             :     return libdivide_s64_do_raw(numer, denom->magic, denom->more);
    1727             : }
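// Editor's note: the q += (q < 0) adjustment in the magic path above makes
// the quotient truncate toward zero, matching C's signed division. A sketch:
#if 0
static void example_s64_truncation(void) {
    struct libdivide_s64_t den = libdivide_s64_gen(3);
    int64_t q = libdivide_s64_do(-7, &den);  // q == -2, same as -7 / 3 in C
    (void)q;
}
#endif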
    1728             : 
    1729             : static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) {
    1730             :     uint8_t more = denom->more;
    1731             :     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    1732             :     // must be arithmetic shift and then sign extend
    1733             :     int64_t sign = (int8_t)more >> 7;
    1734             :     int64_t magic = denom->magic;
    1735             :     int64_t q = libdivide_mullhi_s64(numer, magic);
    1736             :     q += numer;
    1737             : 
    1738             :     // If q is non-negative, we have nothing to do.
    1739             :     // If q is negative, we want to add either (2**shift)-1 if d is a power of
    1740             :     // 2, or (2**shift) if it is not a power of 2.
    1741             :     uint64_t is_power_of_2 = (magic == 0);
    1742             :     uint64_t q_sign = (uint64_t)(q >> 63);
    1743             :     q += q_sign & (((uint64_t)1 << shift) - is_power_of_2);
    1744             : 
    1745             :     // Arithmetic right shift
    1746             :     q >>= shift;
    1747             :     // Negate if needed
    1748             :     q = (q ^ sign) - sign;
    1749             : 
    1750             :     return q;
    1751             : }
    1752             : 
    1753             : static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) {
    1754             :     uint8_t more = denom->more;
    1755             :     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    1756             :     if (denom->magic == 0) {  // shift path
    1757             :         uint64_t absD = (uint64_t)1 << shift;
    1758             :         if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
    1759             :             absD = -absD;
    1760             :         }
    1761             :         return (int64_t)absD;
    1762             :     } else {
    1763             :         // Unsigned math is much easier
    1764             :         int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
    1765             :         int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;
    1766             : 
    1767             :         uint64_t d = (uint64_t)(magic_was_negated ? -denom->magic : denom->magic);
    1768             :         uint64_t n_hi = (uint64_t)1 << shift, n_lo = 0;
    1769             :         uint64_t rem_ignored;
    1770             :         uint64_t q = libdivide_128_div_64_to_64(n_hi, n_lo, d, &rem_ignored);
    1771             :         int64_t result = (int64_t)(q + 1);
    1772             :         if (negative_divisor) {
    1773             :             result = -result;
    1774             :         }
    1775             :         return result;
    1776             :     }
    1777             : }
    1778             : 
    1779             : static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom) {
    1780             :     const struct libdivide_s64_t den = {denom->magic, denom->more};
    1781             :     return libdivide_s64_recover(&den);
    1782             : }
    1783             : 
    1784             : // Simplest possible vector type division: treat the vector type as an array
    1785             : // of underlying native type.
    1786             : //
     1787             : // Use a union to reinterpret the vector as an array of integers, without
     1788             : // violating strict aliasing.
    1789             : #define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo)                          \
    1790             :     const size_t count = sizeof(VecT) / sizeof(IntT);                     \
    1791             :     union type_pun_vec {                                                  \
    1792             :         VecT vec;                                                         \
    1793             :         IntT arr[sizeof(VecT) / sizeof(IntT)];                            \
    1794             :     };                                                                    \
    1795             :     union type_pun_vec result;                                            \
    1796             :     union type_pun_vec input;                                             \
    1797             :     input.vec = numers;                                                   \
    1798             :     for (size_t loop = 0; loop < count; ++loop) {                         \
    1799             :         result.arr[loop] = libdivide_##Algo##_do(input.arr[loop], denom); \
    1800             :     }                                                                     \
    1801             :     return result.vec;
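// Editor's sketch (not part of libdivide, guarded out): how a scalar fallback
// might instantiate the macro. The vector type and function name here are
// placeholders, not one of libdivide's actual SIMD entry points.
#if 0
static LIBDIVIDE_INLINE some_vec128_t example_u16_do_vec128(
    some_vec128_t numers, const struct libdivide_u16_t *denom) {
    SIMPLE_VECTOR_DIVISION(uint16_t, some_vec128_t, u16)
}
#endif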
    1802             : 
    1803             : #if defined(LIBDIVIDE_NEON)
    1804             : 
    1805             : static LIBDIVIDE_INLINE uint16x8_t libdivide_u16_do_vec128(
    1806             :     uint16x8_t numers, const struct libdivide_u16_t *denom);
    1807             : static LIBDIVIDE_INLINE int16x8_t libdivide_s16_do_vec128(
    1808             :     int16x8_t numers, const struct libdivide_s16_t *denom);
    1809             : static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_do_vec128(
    1810             :     uint32x4_t numers, const struct libdivide_u32_t *denom);
    1811             : static LIBDIVIDE_INLINE int32x4_t libdivide_s32_do_vec128(
    1812             :     int32x4_t numers, const struct libdivide_s32_t *denom);
    1813             : static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_do_vec128(
    1814             :     uint64x2_t numers, const struct libdivide_u64_t *denom);
    1815             : static LIBDIVIDE_INLINE int64x2_t libdivide_s64_do_vec128(
    1816             :     int64x2_t numers, const struct libdivide_s64_t *denom);
    1817             : 
    1818             : static LIBDIVIDE_INLINE uint16x8_t libdivide_u16_branchfree_do_vec128(
    1819             :     uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom);
    1820             : static LIBDIVIDE_INLINE int16x8_t libdivide_s16_branchfree_do_vec128(
    1821             :     int16x8_t numers, const struct libdivide_s16_branchfree_t *denom);
    1822             : static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_branchfree_do_vec128(
    1823             :     uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom);
    1824             : static LIBDIVIDE_INLINE int32x4_t libdivide_s32_branchfree_do_vec128(
    1825             :     int32x4_t numers, const struct libdivide_s32_branchfree_t *denom);
    1826             : static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_branchfree_do_vec128(
    1827             :     uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom);
    1828             : static LIBDIVIDE_INLINE int64x2_t libdivide_s64_branchfree_do_vec128(
    1829             :     int64x2_t numers, const struct libdivide_s64_branchfree_t *denom);
    1830             : 
    1831             : //////// Internal Utility Functions
    1832             : 
    1833             : // Logical right shift by runtime value.
     1834             : // NEON implements right shift as left shifts by negative amounts.
    1835             : static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_neon_srl(uint32x4_t v, uint8_t amt) {
    1836             :     int32_t wamt = (int32_t)(amt);
    1837             :     return vshlq_u32(v, vdupq_n_s32(-wamt));
    1838             : }
    1839             : 
    1840             : static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_neon_srl(uint64x2_t v, uint8_t amt) {
    1841             :     int64_t wamt = (int64_t)(amt);
    1842             :     return vshlq_u64(v, vdupq_n_s64(-wamt));
    1843             : }
    1844             : 
    1845             : // Arithmetic right shift by runtime value.
    1846             : static LIBDIVIDE_INLINE int32x4_t libdivide_s32_neon_sra(int32x4_t v, uint8_t amt) {
    1847             :     int32_t wamt = (int32_t)(amt);
    1848             :     return vshlq_s32(v, vdupq_n_s32(-wamt));
    1849             : }
    1850             : 
    1851             : static LIBDIVIDE_INLINE int64x2_t libdivide_s64_neon_sra(int64x2_t v, uint8_t amt) {
    1852             :     int64_t wamt = (int64_t)(amt);
    1853             :     return vshlq_s64(v, vdupq_n_s64(-wamt));
    1854             : }
    1855             : 
    1856             : static LIBDIVIDE_INLINE int64x2_t libdivide_s64_signbits(int64x2_t v) { return vshrq_n_s64(v, 63); }
    1857             : 
    1858             : static LIBDIVIDE_INLINE uint32x4_t libdivide_mullhi_u32_vec128(uint32x4_t a, uint32_t b) {
    1859             :     // Desired result is [x0, x1, x2, x3]
    1860             :     uint32x4_t w1 = vreinterpretq_u32_u64(vmull_n_u32(vget_low_u32(a), b));  // [_, x0, _, x1]
    1861             :     uint32x4_t w2 = vreinterpretq_u32_u64(vmull_high_n_u32(a, b));          // [_, x2, _, x3]
    1862             :     return vuzp2q_u32(w1, w2);                                               // [x0, x1, x2, x3]
    1863             : }
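Per lane this is just the high half of a 32x32->64 multiply: vmull_n_u32 produces the full 64-bit products and vuzp2q_u32 gathers their upper 32-bit halves. A scalar reference for one lane (illustrative):

    // Scalar reference: high 32 bits of the full product of two uint32 values.
    static inline uint32_t mullhi_u32_scalar(uint32_t a, uint32_t b) {
        return (uint32_t)(((uint64_t)a * b) >> 32);
    }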
    1864             : 
    1865             : static LIBDIVIDE_INLINE int32x4_t libdivide_mullhi_s32_vec128(int32x4_t a, int32_t b) {
    1866             :     int32x4_t w1 = vreinterpretq_s32_s64(vmull_n_s32(vget_low_s32(a), b));  // [_, x0, _, x1]
    1867             :     int32x4_t w2 = vreinterpretq_s32_s64(vmull_high_n_s32(a, b));          // [_, x2, _, x3]
    1868             :     return vuzp2q_s32(w1, w2);                                              // [x0, x1, x2, x3]
    1869             : }
    1870             : 
    1871             : static LIBDIVIDE_INLINE uint64x2_t libdivide_mullhi_u64_vec128(uint64x2_t x, uint64_t sy) {
    1872             :     // The full 128-bit product is:
    1873             :     // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64)
    1874             :     // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64.
    1875             : 
    1876             :     // Get low and high words. x0 contains low 32 bits, x1 is high 32 bits.
    1877             :     uint64x2_t y = vdupq_n_u64(sy);
    1878             :     uint32x2_t x0 = vmovn_u64(x);
    1879             :     uint32x2_t y0 = vmovn_u64(y);
    1880             :     uint32x2_t x1 = vshrn_n_u64(x, 32);
    1881             :     uint32x2_t y1 = vshrn_n_u64(y, 32);
    1882             : 
    1883             :     // Compute x0*y0.
    1884             :     uint64x2_t x0y0 = vmull_u32(x0, y0);
    1885             :     uint64x2_t x0y0_hi = vshrq_n_u64(x0y0, 32);
    1886             : 
    1887             :     // Compute other intermediate products.
    1888             :     uint64x2_t temp = vmlal_u32(x0y0_hi, x1, y0);  // temp = x0y0_hi + x1*y0;
    1889             :     // We want to split temp into its low 32 bits and high 32 bits, both
    1890             :     // in the low half of 64 bit registers.
    1891             :     // Use shifts to avoid needing a reg for the mask.
    1892             :     uint64x2_t temp_lo = vshrq_n_u64(vshlq_n_u64(temp, 32), 32);  // temp_lo = temp & 0xFFFFFFFF;
    1893             :     uint64x2_t temp_hi = vshrq_n_u64(temp, 32);                   // temp_hi = temp >> 32;
    1894             : 
    1895             :     temp_lo = vmlal_u32(temp_lo, x0, y1);  // temp_lo += x0*y1
    1896             :     temp_lo = vshrq_n_u64(temp_lo, 32);    // temp_lo >>= 32
    1897             :     temp_hi = vmlal_u32(temp_hi, x1, y1);  // temp_hi += x1*y1
    1898             :     uint64x2_t result = vaddq_u64(temp_hi, temp_lo);
    1899             :     return result;
    1900             : }
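The same computation in scalar form, mirroring the vector steps one-to-one, is useful for checking the lane math (a sketch; libdivide's scalar path uses native 64x64 high multiplies where available):

    // Scalar schoolbook 64x64 -> high-64 multiply, step for step as above.
    static inline uint64_t mullhi_u64_scalar(uint64_t x, uint64_t y) {
        uint64_t x0 = (uint32_t)x, x1 = x >> 32;
        uint64_t y0 = (uint32_t)y, y1 = y >> 32;
        uint64_t x0y0_hi = (x0 * y0) >> 32;
        uint64_t temp = x1 * y0 + x0y0_hi;    // cannot overflow 64 bits
        uint64_t temp_lo = (uint32_t)temp;
        uint64_t temp_hi = temp >> 32;
        temp_lo = (temp_lo + x0 * y1) >> 32;  // carry out of the low word
        return x1 * y1 + temp_hi + temp_lo;
    }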
    1901             : 
    1902             : static LIBDIVIDE_INLINE int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) {
    1903             :     int64x2_t p = vreinterpretq_s64_u64(
    1904             :         libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), (uint64_t)(sy)));
    1905             :     int64x2_t y = vdupq_n_s64(sy);
    1906             :     int64x2_t t1 = vandq_s64(libdivide_s64_signbits(x), y);
    1907             :     int64x2_t t2 = vandq_s64(libdivide_s64_signbits(y), x);
    1908             :     p = vsubq_s64(p, t1);
    1909             :     p = vsubq_s64(p, t2);
    1910             :     return p;
    1911             : }
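The two masked subtractions apply the standard identity for deriving the signed high multiply from the unsigned one: mulh_s(x, y) = mulh_u(x, y) - (x < 0 ? y : 0) - (y < 0 ? x : 0). A scalar form (illustrative, reusing the scalar helper sketched above):

    // Scalar reference: signed high multiply built from the unsigned one.
    static inline int64_t mullhi_s64_scalar(int64_t x, int64_t y) {
        uint64_t p = mullhi_u64_scalar((uint64_t)x, (uint64_t)y);
        if (x < 0) p -= (uint64_t)y;  // vector code: p -= signbits(x) & y
        if (y < 0) p -= (uint64_t)x;  // vector code: p -= signbits(y) & x
        return (int64_t)p;
    }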
    1912             : 
    1913             : ////////// UINT16
    1914             : 
    1915             : uint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom){
    1916             :     SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16)}
    1917             : 
    1918             : uint16x8_t libdivide_u16_branchfree_do_vec128(
    1919             :     uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom){
    1920             :     SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree)}
    1921             : 
    1922             : ////////// UINT32
    1923             : 
    1924             : uint32x4_t libdivide_u32_do_vec128(uint32x4_t numers, const struct libdivide_u32_t *denom) {
    1925             :     uint8_t more = denom->more;
    1926             :     if (!denom->magic) {
    1927             :         return libdivide_u32_neon_srl(numers, more);
    1928             :     } else {
    1929             :         uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic);
    1930             :         if (more & LIBDIVIDE_ADD_MARKER) {
    1931             :             // uint32_t t = ((numer - q) >> 1) + q;
    1932             :             // return t >> denom->shift;
    1933             :             // Note we can use halving-subtract to avoid the shift.
    1934             :             uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    1935             :             uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q);
    1936             :             return libdivide_u32_neon_srl(t, shift);
    1937             :         } else {
    1938             :             return libdivide_u32_neon_srl(q, more);
    1939             :         }
    1940             :     }
    1941             : }
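The ADD_MARKER path computes t = ((numer - q) >> 1) + q, which equals (numer + q) >> 1 but cannot overflow, since q <= numer; vhsubq_u32 fuses the subtract and the halving into one instruction. Scalar model of one lane (illustrative):

    // Scalar model of the ADD_MARKER path for one uint32 lane.
    static inline uint32_t u32_add_marker_lane(uint32_t numer, uint32_t q, uint8_t shift) {
        uint32_t t = ((numer - q) >> 1) + q;  // == (numer + q) >> 1, overflow-free
        return t >> shift;
    }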
    1942             : 
    1943             : uint32x4_t libdivide_u32_branchfree_do_vec128(
    1944             :     uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom) {
    1945             :     uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic);
    1946             :     uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q);
    1947             :     return libdivide_u32_neon_srl(t, denom->more);
    1948             : }
    1949             : 
    1950             : ////////// UINT64
    1951             : 
    1952             : uint64x2_t libdivide_u64_do_vec128(uint64x2_t numers, const struct libdivide_u64_t *denom) {
    1953             :     uint8_t more = denom->more;
    1954             :     if (!denom->magic) {
    1955             :         return libdivide_u64_neon_srl(numers, more);
    1956             :     } else {
    1957             :         uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic);
    1958             :         if (more & LIBDIVIDE_ADD_MARKER) {
    1959             :             // uint32_t t = ((numer - q) >> 1) + q;
    1960             :             // return t >> denom->shift;
    1961             :             // No 64-bit halving subtracts in NEON :(
    1962             :             uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    1963             :             uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q);
    1964             :             return libdivide_u64_neon_srl(t, shift);
    1965             :         } else {
    1966             :             return libdivide_u64_neon_srl(q, more);
    1967             :         }
    1968             :     }
    1969             : }
    1970             : 
    1971             : uint64x2_t libdivide_u64_branchfree_do_vec128(
    1972             :     uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom) {
    1973             :     uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic);
    1974             :     uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q);
    1975             :     return libdivide_u64_neon_srl(t, denom->more);
    1976             : }
    1977             : 
    1978             : ////////// SINT16
    1979             : 
    1980             : int16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom){
    1981             :     SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16)}
    1982             : 
    1983             : int16x8_t libdivide_s16_branchfree_do_vec128(
    1984             :     int16x8_t numers, const struct libdivide_s16_branchfree_t *denom){
    1985             :     SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree)}
    1986             : 
    1987             : ////////// SINT32
    1988             : 
    1989             : int32x4_t libdivide_s32_do_vec128(int32x4_t numers, const struct libdivide_s32_t *denom) {
    1990             :     uint8_t more = denom->more;
    1991             :     if (!denom->magic) {
    1992             :         uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    1993             :         uint32_t mask = ((uint32_t)1 << shift) - 1;
    1994             :         int32x4_t roundToZeroTweak = vdupq_n_s32((int)mask);
    1995             :         // q = numer + ((numer >> 31) & roundToZeroTweak);
    1996             :         int32x4_t q = vaddq_s32(numers, vandq_s32(vshrq_n_s32(numers, 31), roundToZeroTweak));
    1997             :         q = libdivide_s32_neon_sra(q, shift);
    1998             :         int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);
    1999             :         // q = (q ^ sign) - sign;
    2000             :         q = vsubq_s32(veorq_s32(q, sign), sign);
    2001             :         return q;
    2002             :     } else {
    2003             :         int32x4_t q = libdivide_mullhi_s32_vec128(numers, denom->magic);
    2004             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2005             :             // must be arithmetic shift
    2006             :             int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);
    2007             :             // q += ((numer ^ sign) - sign);
    2008             :             q = vaddq_s32(q, vsubq_s32(veorq_s32(numers, sign), sign));
    2009             :         }
    2010             :         // q >>= shift
    2011             :         q = libdivide_s32_neon_sra(q, more & LIBDIVIDE_32_SHIFT_MASK);
    2012             :         q = vaddq_s32(
    2013             :             q, vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(q), 31)));  // q += (q < 0)
    2014             :         return q;
    2015             :     }
    2016             : }
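In the shift path, a plain arithmetic shift would round toward negative infinity, so (2**shift)-1 is first added to negative numerators to get truncation toward zero; the final (q ^ sign) - sign conditionally negates when the divisor was negative (sign is 0 or -1). Scalar model of one lane (illustrative):

    // Scalar model of the signed power-of-2 path for one int32 lane.
    static inline int32_t s32_shift_path_lane(int32_t numer, uint8_t shift, int32_t sign) {
        uint32_t mask = ((uint32_t)1 << shift) - 1;
        int32_t q = numer + ((numer >> 31) & (int32_t)mask);  // round toward zero
        q >>= shift;                                          // arithmetic shift
        return (q ^ sign) - sign;                             // negate iff sign == -1
    }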
    2017             : 
    2018             : int32x4_t libdivide_s32_branchfree_do_vec128(
    2019             :     int32x4_t numers, const struct libdivide_s32_branchfree_t *denom) {
    2020             :     int32_t magic = denom->magic;
    2021             :     uint8_t more = denom->more;
    2022             :     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    2023             :     // must be arithmetic shift
    2024             :     int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);
    2025             :     int32x4_t q = libdivide_mullhi_s32_vec128(numers, magic);
    2026             :     q = vaddq_s32(q, numers);  // q += numers
    2027             : 
    2028             :     // If q is non-negative, we have nothing to do
    2029             :     // If q is negative, we want to add either (2**shift)-1 if d is
    2030             :     // a power of 2, or (2**shift) if it is not a power of 2
    2031             :     uint32_t is_power_of_2 = (magic == 0);
    2032             :     int32x4_t q_sign = vshrq_n_s32(q, 31);  // q_sign = q >> 31
    2033             :     int32x4_t mask = vdupq_n_s32(((uint32_t)1 << shift) - is_power_of_2);
    2034             :     q = vaddq_s32(q, vandq_s32(q_sign, mask));  // q = q + (q_sign & mask)
    2035             :     q = libdivide_s32_neon_sra(q, shift);       // q >>= shift
    2036             :     q = vsubq_s32(veorq_s32(q, sign), sign);    // q = (q ^ sign) - sign
    2037             :     return q;
    2038             : }
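The branchfree form replaces the ADD_MARKER test with unconditional arithmetic: when q is negative it adds either (2**shift)-1 (divisor a power of two, magic == 0) or 2**shift, selected by a mask instead of a branch. Scalar model of one lane (illustrative):

    // Scalar model of the branchfree signed division for one int32 lane.
    static inline int32_t s32_branchfree_lane(
        int32_t numer, int32_t magic, uint8_t shift, int32_t sign) {
        int32_t q = (int32_t)(((int64_t)numer * magic) >> 32);  // high multiply
        q += numer;
        uint32_t is_power_of_2 = (magic == 0);
        int32_t q_sign = q >> 31;  // 0 or -1
        q += q_sign & (int32_t)(((uint32_t)1 << shift) - is_power_of_2);
        q >>= shift;
        return (q ^ sign) - sign;
    }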
    2039             : 
    2040             : ////////// SINT64
    2041             : 
    2042             : int64x2_t libdivide_s64_do_vec128(int64x2_t numers, const struct libdivide_s64_t *denom) {
    2043             :     uint8_t more = denom->more;
    2044             :     int64_t magic = denom->magic;
    2045             :     if (magic == 0) {  // shift path
    2046             :         uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    2047             :         uint64_t mask = ((uint64_t)1 << shift) - 1;
    2048             :         int64x2_t roundToZeroTweak = vdupq_n_s64(mask);  // TODO: no need to sign extend
    2049             :         // q = numer + ((numer >> 63) & roundToZeroTweak);
    2050             :         int64x2_t q =
    2051             :             vaddq_s64(numers, vandq_s64(libdivide_s64_signbits(numers), roundToZeroTweak));
    2052             :         q = libdivide_s64_neon_sra(q, shift);
    2053             :         // q = (q ^ sign) - sign;
    2054             :         int64x2_t sign = vreinterpretq_s64_s8(vdupq_n_s8((int8_t)more >> 7));
    2055             :         q = vsubq_s64(veorq_s64(q, sign), sign);
    2056             :         return q;
    2057             :     } else {
    2058             :         int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic);
    2059             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2060             :             // must be arithmetic shift
    2061             :             int64x2_t sign = vdupq_n_s64((int8_t)more >> 7);  // TODO: no need to widen
    2062             :             // q += ((numer ^ sign) - sign);
    2063             :             q = vaddq_s64(q, vsubq_s64(veorq_s64(numers, sign), sign));
    2064             :         }
    2065             :         // q >>= denom->mult_path.shift
    2066             :         q = libdivide_s64_neon_sra(q, more & LIBDIVIDE_64_SHIFT_MASK);
    2067             :         q = vaddq_s64(
    2068             :             q, vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(q), 63)));  // q += (q < 0)
    2069             :         return q;
    2070             :     }
    2071             : }
    2072             : 
    2073             : int64x2_t libdivide_s64_branchfree_do_vec128(
    2074             :     int64x2_t numers, const struct libdivide_s64_branchfree_t *denom) {
    2075             :     int64_t magic = denom->magic;
    2076             :     uint8_t more = denom->more;
    2077             :     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    2078             :     // must be arithmetic shift
    2079             :     int64x2_t sign = vdupq_n_s64((int8_t)more >> 7);  // TODO: avoid sign extend
    2080             : 
    2081             :     // libdivide_mullhi_s64(numers, magic);
    2082             :     int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic);
    2083             :     q = vaddq_s64(q, numers);  // q += numers
    2084             : 
    2085             :     // If q is non-negative, we have nothing to do.
    2086             :     // If q is negative, we want to add either (2**shift)-1 if d is
    2087             :     // a power of 2, or (2**shift) if it is not a power of 2.
    2088             :     uint32_t is_power_of_2 = (magic == 0);
    2089             :     int64x2_t q_sign = libdivide_s64_signbits(q);  // q_sign = q >> 63
    2090             :     int64x2_t mask = vdupq_n_s64(((uint64_t)1 << shift) - is_power_of_2);
    2091             :     q = vaddq_s64(q, vandq_s64(q_sign, mask));  // q = q + (q_sign & mask)
    2092             :     q = libdivide_s64_neon_sra(q, shift);       // q >>= shift
    2093             :     q = vsubq_s64(veorq_s64(q, sign), sign);    // q = (q ^ sign) - sign
    2094             :     return q;
    2095             : }
    2096             : 
    2097             : #endif
    2098             : 
    2099             : #if defined(LIBDIVIDE_AVX512)
    2100             : 
    2101             : static LIBDIVIDE_INLINE __m512i libdivide_u16_do_vec512(
    2102             :     __m512i numers, const struct libdivide_u16_t *denom);
    2103             : static LIBDIVIDE_INLINE __m512i libdivide_s16_do_vec512(
    2104             :     __m512i numers, const struct libdivide_s16_t *denom);
    2105             : static LIBDIVIDE_INLINE __m512i libdivide_u32_do_vec512(
    2106             :     __m512i numers, const struct libdivide_u32_t *denom);
    2107             : static LIBDIVIDE_INLINE __m512i libdivide_s32_do_vec512(
    2108             :     __m512i numers, const struct libdivide_s32_t *denom);
    2109             : static LIBDIVIDE_INLINE __m512i libdivide_u64_do_vec512(
    2110             :     __m512i numers, const struct libdivide_u64_t *denom);
    2111             : static LIBDIVIDE_INLINE __m512i libdivide_s64_do_vec512(
    2112             :     __m512i numers, const struct libdivide_s64_t *denom);
    2113             : 
    2114             : static LIBDIVIDE_INLINE __m512i libdivide_u16_branchfree_do_vec512(
    2115             :     __m512i numers, const struct libdivide_u16_branchfree_t *denom);
    2116             : static LIBDIVIDE_INLINE __m512i libdivide_s16_branchfree_do_vec512(
    2117             :     __m512i numers, const struct libdivide_s16_branchfree_t *denom);
    2118             : static LIBDIVIDE_INLINE __m512i libdivide_u32_branchfree_do_vec512(
    2119             :     __m512i numers, const struct libdivide_u32_branchfree_t *denom);
    2120             : static LIBDIVIDE_INLINE __m512i libdivide_s32_branchfree_do_vec512(
    2121             :     __m512i numers, const struct libdivide_s32_branchfree_t *denom);
    2122             : static LIBDIVIDE_INLINE __m512i libdivide_u64_branchfree_do_vec512(
    2123             :     __m512i numers, const struct libdivide_u64_branchfree_t *denom);
    2124             : static LIBDIVIDE_INLINE __m512i libdivide_s64_branchfree_do_vec512(
    2125             :     __m512i numers, const struct libdivide_s64_branchfree_t *denom);
    2126             : 
    2127             : //////// Internal Utility Functions
    2128             : 
    2129             : static LIBDIVIDE_INLINE __m512i libdivide_s64_signbits_vec512(__m512i v) {
    2131             :     return _mm512_srai_epi64(v, 63);
    2132             : }
    2133             : 
    2134             : static LIBDIVIDE_INLINE __m512i libdivide_s64_shift_right_vec512(__m512i v, int amt) {
    2135             :     return _mm512_srai_epi64(v, amt);
    2136             : }
    2137             : 
    2138             : // Here, b is assumed to contain one 32-bit value repeated.
    2139             : static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u32_vec512(__m512i a, __m512i b) {
    2140             :     __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epu32(a, b), 32);
    2141             :     __m512i a1X3X = _mm512_srli_epi64(a, 32);
    2142             :     __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
    2143             :     __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epu32(a1X3X, b), mask);
    2144             :     return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3);
    2145             : }
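_mm512_mul_epu32 only multiplies the even 32-bit lanes, so the high halves are assembled in two passes and OR'd together. Lane layout for one 128-bit slice (my sketch; 32-bit slots listed low to high, Z = zeroed slot):

    // _mm512_mul_epu32(a, b)        -> [lo(a0*b), hi(a0*b), lo(a2*b), hi(a2*b)]
    // >> 32 per 64-bit element      -> [hi(a0*b), Z,        hi(a2*b), Z       ]
    // _mm512_mul_epu32(a >> 32, b)  -> [lo(a1*b), hi(a1*b), lo(a3*b), hi(a3*b)]
    // AND with odd-slot mask        -> [Z,        hi(a1*b), Z,        hi(a3*b)]
    // OR of the two                 -> [hi(a0*b), hi(a1*b), hi(a2*b), hi(a3*b)]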
    2146             : 
    2147             : // b is one 32-bit value repeated.
    2148             : static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s32_vec512(__m512i a, __m512i b) {
    2149             :     __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epi32(a, b), 32);
    2150             :     __m512i a1X3X = _mm512_srli_epi64(a, 32);
    2151             :     __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
    2152             :     __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epi32(a1X3X, b), mask);
    2153             :     return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3);
    2154             : }
    2155             : 
    2156             : // Here, y is assumed to contain one 64-bit value repeated.
    2157             : static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y) {
    2158             :     // see m128i variant for comments.
    2159             :     __m512i x0y0 = _mm512_mul_epu32(x, y);
    2160             :     __m512i x0y0_hi = _mm512_srli_epi64(x0y0, 32);
    2161             : 
    2162             :     __m512i x1 = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1));
    2163             :     __m512i y1 = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1));
    2164             : 
    2165             :     __m512i x0y1 = _mm512_mul_epu32(x, y1);
    2166             :     __m512i x1y0 = _mm512_mul_epu32(x1, y);
    2167             :     __m512i x1y1 = _mm512_mul_epu32(x1, y1);
    2168             : 
    2169             :     __m512i mask = _mm512_set1_epi64(0xFFFFFFFF);
    2170             :     __m512i temp = _mm512_add_epi64(x1y0, x0y0_hi);
    2171             :     __m512i temp_lo = _mm512_and_si512(temp, mask);
    2172             :     __m512i temp_hi = _mm512_srli_epi64(temp, 32);
    2173             : 
    2174             :     temp_lo = _mm512_srli_epi64(_mm512_add_epi64(temp_lo, x0y1), 32);
    2175             :     temp_hi = _mm512_add_epi64(x1y1, temp_hi);
    2176             :     return _mm512_add_epi64(temp_lo, temp_hi);
    2177             : }
    2178             : 
    2179             : // y is one 64-bit value repeated.
    2180             : static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y) {
    2181             :     __m512i p = libdivide_mullhi_u64_vec512(x, y);
    2182             :     __m512i t1 = _mm512_and_si512(libdivide_s64_signbits_vec512(x), y);
    2183             :     __m512i t2 = _mm512_and_si512(libdivide_s64_signbits_vec512(y), x);
    2184             :     p = _mm512_sub_epi64(p, t1);
    2185             :     p = _mm512_sub_epi64(p, t2);
    2186             :     return p;
    2187             : }
    2188             : 
    2189             : ////////// UINT16
    2190             : 
    2191             : __m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom){
    2192             :     SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16)}
    2193             : 
    2194             : __m512i libdivide_u16_branchfree_do_vec512(
    2195             :     __m512i numers, const struct libdivide_u16_branchfree_t *denom){
    2196             :     SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree)}
    2197             : 
    2198             : ////////// UINT32
    2199             : 
    2200             : __m512i libdivide_u32_do_vec512(__m512i numers, const struct libdivide_u32_t *denom) {
    2201             :     uint8_t more = denom->more;
    2202             :     if (!denom->magic) {
    2203             :         return _mm512_srli_epi32(numers, more);
    2204             :     } else {
    2205             :         __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic));
    2206             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2207             :             // uint32_t t = ((numer - q) >> 1) + q;
    2208             :             // return t >> denom->shift;
    2209             :             uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    2210             :             __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q);
    2211             :             return _mm512_srli_epi32(t, shift);
    2212             :         } else {
    2213             :             return _mm512_srli_epi32(q, more);
    2214             :         }
    2215             :     }
    2216             : }
    2217             : 
    2218             : __m512i libdivide_u32_branchfree_do_vec512(
    2219             :     __m512i numers, const struct libdivide_u32_branchfree_t *denom) {
    2220             :     __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic));
    2221             :     __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q);
    2222             :     return _mm512_srli_epi32(t, denom->more);
    2223             : }
    2224             : 
    2225             : ////////// UINT64
    2226             : 
    2227             : __m512i libdivide_u64_do_vec512(__m512i numers, const struct libdivide_u64_t *denom) {
    2228             :     uint8_t more = denom->more;
    2229             :     if (!denom->magic) {
    2230             :         return _mm512_srli_epi64(numers, more);
    2231             :     } else {
    2232             :         __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic));
    2233             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2234             :             // uint32_t t = ((numer - q) >> 1) + q;
    2235             :             // return t >> denom->shift;
    2236             :             uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    2237             :             __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q);
    2238             :             return _mm512_srli_epi64(t, shift);
    2239             :         } else {
    2240             :             return _mm512_srli_epi64(q, more);
    2241             :         }
    2242             :     }
    2243             : }
    2244             : 
    2245             : __m512i libdivide_u64_branchfree_do_vec512(
    2246             :     __m512i numers, const struct libdivide_u64_branchfree_t *denom) {
    2247             :     __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic));
    2248             :     __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q);
    2249             :     return _mm512_srli_epi64(t, denom->more);
    2250             : }
    2251             : 
    2252             : ////////// SINT16
    2253             : 
    2254             : __m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom){
    2255             :     SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16)}
    2256             : 
    2257             : __m512i libdivide_s16_branchfree_do_vec512(
    2258             :     __m512i numers, const struct libdivide_s16_branchfree_t *denom){
    2259             :     SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree)}
    2260             : 
    2261             : ////////// SINT32
    2262             : 
    2263             : __m512i libdivide_s32_do_vec512(__m512i numers, const struct libdivide_s32_t *denom) {
    2264             :     uint8_t more = denom->more;
    2265             :     if (!denom->magic) {
    2266             :         uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    2267             :         uint32_t mask = ((uint32_t)1 << shift) - 1;
    2268             :         __m512i roundToZeroTweak = _mm512_set1_epi32(mask);
    2269             :         // q = numer + ((numer >> 31) & roundToZeroTweak);
    2270             :         __m512i q = _mm512_add_epi32(
    2271             :             numers, _mm512_and_si512(_mm512_srai_epi32(numers, 31), roundToZeroTweak));
    2272             :         q = _mm512_srai_epi32(q, shift);
    2273             :         __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
    2274             :         // q = (q ^ sign) - sign;
    2275             :         q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign);
    2276             :         return q;
    2277             :     } else {
    2278             :         __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(denom->magic));
    2279             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2280             :             // must be arithmetic shift
    2281             :             __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
    2282             :             // q += ((numer ^ sign) - sign);
    2283             :             q = _mm512_add_epi32(q, _mm512_sub_epi32(_mm512_xor_si512(numers, sign), sign));
    2284             :         }
    2285             :         // q >>= shift
    2286             :         q = _mm512_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
    2287             :         q = _mm512_add_epi32(q, _mm512_srli_epi32(q, 31));  // q += (q < 0)
    2288             :         return q;
    2289             :     }
    2290             : }
    2291             : 
    2292             : __m512i libdivide_s32_branchfree_do_vec512(
    2293             :     __m512i numers, const struct libdivide_s32_branchfree_t *denom) {
    2294             :     int32_t magic = denom->magic;
    2295             :     uint8_t more = denom->more;
    2296             :     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    2297             :     // must be arithmetic shift
    2298             :     __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
    2299             :     __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(magic));
    2300             :     q = _mm512_add_epi32(q, numers);  // q += numers
    2301             : 
    2302             :     // If q is non-negative, we have nothing to do
    2303             :     // If q is negative, we want to add either (2**shift)-1 if d is
    2304             :     // a power of 2, or (2**shift) if it is not a power of 2
    2305             :     uint32_t is_power_of_2 = (magic == 0);
    2306             :     __m512i q_sign = _mm512_srai_epi32(q, 31);  // q_sign = q >> 31
    2307             :     __m512i mask = _mm512_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);
    2308             :     q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask));  // q = q + (q_sign & mask)
    2309             :     q = _mm512_srai_epi32(q, shift);                          // q >>= shift
    2310             :     q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign);    // q = (q ^ sign) - sign
    2311             :     return q;
    2312             : }
    2313             : 
    2314             : ////////// SINT64
    2315             : 
    2316             : __m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *denom) {
    2317             :     uint8_t more = denom->more;
    2318             :     int64_t magic = denom->magic;
    2319             :     if (magic == 0) {  // shift path
    2320             :         uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    2321             :         uint64_t mask = ((uint64_t)1 << shift) - 1;
    2322             :         __m512i roundToZeroTweak = _mm512_set1_epi64(mask);
    2323             :         // q = numer + ((numer >> 63) & roundToZeroTweak);
    2324             :         __m512i q = _mm512_add_epi64(
    2325             :             numers, _mm512_and_si512(libdivide_s64_signbits_vec512(numers), roundToZeroTweak));
    2326             :         q = libdivide_s64_shift_right_vec512(q, shift);
    2327             :         __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
    2328             :         // q = (q ^ sign) - sign;
    2329             :         q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign);
    2330             :         return q;
    2331             :     } else {
    2332             :         __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic));
    2333             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2334             :             // must be arithmetic shift
    2335             :             __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
    2336             :             // q += ((numer ^ sign) - sign);
    2337             :             q = _mm512_add_epi64(q, _mm512_sub_epi64(_mm512_xor_si512(numers, sign), sign));
    2338             :         }
    2339             :         // q >>= denom->mult_path.shift
    2340             :         q = libdivide_s64_shift_right_vec512(q, more & LIBDIVIDE_64_SHIFT_MASK);
    2341             :         q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63));  // q += (q < 0)
    2342             :         return q;
    2343             :     }
    2344             : }
    2345             : 
    2346             : __m512i libdivide_s64_branchfree_do_vec512(
    2347             :     __m512i numers, const struct libdivide_s64_branchfree_t *denom) {
    2348             :     int64_t magic = denom->magic;
    2349             :     uint8_t more = denom->more;
    2350             :     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    2351             :     // must be arithmetic shift
    2352             :     __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
    2353             : 
    2354             :     // libdivide_mullhi_s64(numers, magic);
    2355             :     __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic));
    2356             :     q = _mm512_add_epi64(q, numers);  // q += numers
    2357             : 
    2358             :     // If q is non-negative, we have nothing to do.
    2359             :     // If q is negative, we want to add either (2**shift)-1 if d is
    2360             :     // a power of 2, or (2**shift) if it is not a power of 2.
    2361             :     uint32_t is_power_of_2 = (magic == 0);
    2362             :     __m512i q_sign = libdivide_s64_signbits_vec512(q);  // q_sign = q >> 63
    2363             :     __m512i mask = _mm512_set1_epi64(((uint64_t)1 << shift) - is_power_of_2);
    2364             :     q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask));  // q = q + (q_sign & mask)
    2365             :     q = libdivide_s64_shift_right_vec512(q, shift);           // q >>= shift
    2366             :     q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign);    // q = (q ^ sign) - sign
    2367             :     return q;
    2368             : }
    2369             : 
    2370             : #endif
    2371             : 
    2372             : #if defined(LIBDIVIDE_AVX2)
    2373             : 
    2374             : static LIBDIVIDE_INLINE __m256i libdivide_u16_do_vec256(
    2375             :     __m256i numers, const struct libdivide_u16_t *denom);
    2376             : static LIBDIVIDE_INLINE __m256i libdivide_s16_do_vec256(
    2377             :     __m256i numers, const struct libdivide_s16_t *denom);
    2378             : static LIBDIVIDE_INLINE __m256i libdivide_u32_do_vec256(
    2379             :     __m256i numers, const struct libdivide_u32_t *denom);
    2380             : static LIBDIVIDE_INLINE __m256i libdivide_s32_do_vec256(
    2381             :     __m256i numers, const struct libdivide_s32_t *denom);
    2382             : static LIBDIVIDE_INLINE __m256i libdivide_u64_do_vec256(
    2383             :     __m256i numers, const struct libdivide_u64_t *denom);
    2384             : static LIBDIVIDE_INLINE __m256i libdivide_s64_do_vec256(
    2385             :     __m256i numers, const struct libdivide_s64_t *denom);
    2386             : 
    2387             : static LIBDIVIDE_INLINE __m256i libdivide_u16_branchfree_do_vec256(
    2388             :     __m256i numers, const struct libdivide_u16_branchfree_t *denom);
    2389             : static LIBDIVIDE_INLINE __m256i libdivide_s16_branchfree_do_vec256(
    2390             :     __m256i numers, const struct libdivide_s16_branchfree_t *denom);
    2391             : static LIBDIVIDE_INLINE __m256i libdivide_u32_branchfree_do_vec256(
    2392             :     __m256i numers, const struct libdivide_u32_branchfree_t *denom);
    2393             : static LIBDIVIDE_INLINE __m256i libdivide_s32_branchfree_do_vec256(
    2394             :     __m256i numers, const struct libdivide_s32_branchfree_t *denom);
    2395             : static LIBDIVIDE_INLINE __m256i libdivide_u64_branchfree_do_vec256(
    2396             :     __m256i numers, const struct libdivide_u64_branchfree_t *denom);
    2397             : static LIBDIVIDE_INLINE __m256i libdivide_s64_branchfree_do_vec256(
    2398             :     __m256i numers, const struct libdivide_s64_branchfree_t *denom);
    2399             : 
    2400             : //////// Internal Utility Functions
    2401             : 
    2402             : // Implementation of _mm256_srai_epi64(v, 63) (from AVX512).
    2403             : static LIBDIVIDE_INLINE __m256i libdivide_s64_signbits_vec256(__m256i v) {
    2404             :     __m256i hiBitsDuped = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
    2405             :     __m256i signBits = _mm256_srai_epi32(hiBitsDuped, 31);
    2406             :     return signBits;
    2407             : }
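AVX2 has no 64-bit arithmetic shift, so the sign mask is synthesized: duplicate each lane's high 32 bits into both halves, then arithmetic-shift those by 31, giving all-zeros or all-ones per 64-bit lane. Scalar model (illustrative):

    // Scalar model: spread the sign bit of a 64-bit lane across all 64 bits.
    static inline int64_t s64_signbits_lane(int64_t v) {
        uint32_t hi = (uint32_t)((uint64_t)v >> 32);  // high half carries the sign
        uint32_t s = (uint32_t)((int32_t)hi >> 31);   // 0 or 0xFFFFFFFF
        return (int64_t)(((uint64_t)s << 32) | s);    // duplicated into both halves
    }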
    2408             : 
    2409             : // Implementation of _mm256_srai_epi64 (from AVX512).
    2410             : static LIBDIVIDE_INLINE __m256i libdivide_s64_shift_right_vec256(__m256i v, int amt) {
    2411             :     const int b = 64 - amt;
    2412             :     __m256i m = _mm256_set1_epi64x((uint64_t)1 << (b - 1));
    2413             :     __m256i x = _mm256_srli_epi64(v, amt);
    2414             :     __m256i result = _mm256_sub_epi64(_mm256_xor_si256(x, m), m);
    2415             :     return result;
    2416             : }
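The emulated arithmetic shift performs a logical shift and then sign-extends the (64 - amt)-bit result with the classic (x ^ m) - m trick, where m = 1 << (63 - amt) is the position the sign bit lands on after the shift. Scalar model (illustrative):

    // Scalar model of the emulated 64-bit arithmetic right shift.
    static inline int64_t s64_sra_emulated(int64_t v, int amt) {
        uint64_t m = (uint64_t)1 << (63 - amt);  // shifted-down sign-bit position
        uint64_t x = (uint64_t)v >> amt;         // logical shift
        return (int64_t)((x ^ m) - m);           // xor/sub sign-extends the result
    }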
    2417             : 
    2418             : // Here, b is assumed to contain one 32-bit value repeated.
    2419             : static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u32_vec256(__m256i a, __m256i b) {
    2420             :     __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epu32(a, b), 32);
    2421             :     __m256i a1X3X = _mm256_srli_epi64(a, 32);
    2422             :     __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
    2423             :     __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epu32(a1X3X, b), mask);
    2424             :     return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3);
    2425             : }
    2426             : 
    2427             : // b is one 32-bit value repeated.
    2428             : static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s32_vec256(__m256i a, __m256i b) {
    2429             :     __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epi32(a, b), 32);
    2430             :     __m256i a1X3X = _mm256_srli_epi64(a, 32);
    2431             :     __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
    2432             :     __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epi32(a1X3X, b), mask);
    2433             :     return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3);
    2434             : }
    2435             : 
    2436             : // Here, y is assumed to contain one 64-bit value repeated.
    2437             : static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y) {
    2438             :     // see m128i variant for comments.
    2439             :     __m256i x0y0 = _mm256_mul_epu32(x, y);
    2440             :     __m256i x0y0_hi = _mm256_srli_epi64(x0y0, 32);
    2441             : 
    2442             :     __m256i x1 = _mm256_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1));
    2443             :     __m256i y1 = _mm256_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1));
    2444             : 
    2445             :     __m256i x0y1 = _mm256_mul_epu32(x, y1);
    2446             :     __m256i x1y0 = _mm256_mul_epu32(x1, y);
    2447             :     __m256i x1y1 = _mm256_mul_epu32(x1, y1);
    2448             : 
    2449             :     __m256i mask = _mm256_set1_epi64x(0xFFFFFFFF);
    2450             :     __m256i temp = _mm256_add_epi64(x1y0, x0y0_hi);
    2451             :     __m256i temp_lo = _mm256_and_si256(temp, mask);
    2452             :     __m256i temp_hi = _mm256_srli_epi64(temp, 32);
    2453             : 
    2454             :     temp_lo = _mm256_srli_epi64(_mm256_add_epi64(temp_lo, x0y1), 32);
    2455             :     temp_hi = _mm256_add_epi64(x1y1, temp_hi);
    2456             :     return _mm256_add_epi64(temp_lo, temp_hi);
    2457             : }
    2458             : 
    2459             : // y is one 64-bit value repeated.
    2460             : static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y) {
    2461             :     __m256i p = libdivide_mullhi_u64_vec256(x, y);
    2462             :     __m256i t1 = _mm256_and_si256(libdivide_s64_signbits_vec256(x), y);
    2463             :     __m256i t2 = _mm256_and_si256(libdivide_s64_signbits_vec256(y), x);
    2464             :     p = _mm256_sub_epi64(p, t1);
    2465             :     p = _mm256_sub_epi64(p, t2);
    2466             :     return p;
    2467             : }
    2468             : 
    2469             : ////////// UINT16
    2470             : 
    2471             : __m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) {
    2472             :     uint8_t more = denom->more;
    2473             :     if (!denom->magic) {
    2474             :         return _mm256_srli_epi16(numers, more);
    2475             :     } else {
    2476             :         __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
    2477             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2478             :             __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);
    2479             :             return _mm256_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
    2480             :         } else {
    2481             :             return _mm256_srli_epi16(q, more);
    2482             :         }
    2483             :     }
    2484             : }
    2485             : 
    2486             : __m256i libdivide_u16_branchfree_do_vec256(
    2487             :     __m256i numers, const struct libdivide_u16_branchfree_t *denom) {
    2488             :     __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
    2489             :     __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);
    2490             :     return _mm256_srli_epi16(t, denom->more);
    2491             : }
    2492             : 
    2493             : ////////// UINT32
    2494             : 
    2495             : __m256i libdivide_u32_do_vec256(__m256i numers, const struct libdivide_u32_t *denom) {
    2496             :     uint8_t more = denom->more;
    2497             :     if (!denom->magic) {
    2498             :         return _mm256_srli_epi32(numers, more);
    2499             :     } else {
    2500             :         __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic));
    2501             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2502             :             // uint32_t t = ((numer - q) >> 1) + q;
    2503             :             // return t >> denom->shift;
    2504             :             uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    2505             :             __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q);
    2506             :             return _mm256_srli_epi32(t, shift);
    2507             :         } else {
    2508             :             return _mm256_srli_epi32(q, more);
    2509             :         }
    2510             :     }
    2511             : }
    2512             : 
    2513             : __m256i libdivide_u32_branchfree_do_vec256(
    2514             :     __m256i numers, const struct libdivide_u32_branchfree_t *denom) {
    2515             :     __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic));
    2516             :     __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q);
    2517             :     return _mm256_srli_epi32(t, denom->more);
    2518             : }
    2519             : 
    2520             : ////////// UINT64
    2521             : 
    2522             : __m256i libdivide_u64_do_vec256(__m256i numers, const struct libdivide_u64_t *denom) {
    2523             :     uint8_t more = denom->more;
    2524             :     if (!denom->magic) {
    2525             :         return _mm256_srli_epi64(numers, more);
    2526             :     } else {
    2527             :         __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic));
    2528             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2529             :             // uint32_t t = ((numer - q) >> 1) + q;
    2530             :             // return t >> denom->shift;
    2531             :             uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    2532             :             __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q);
    2533             :             return _mm256_srli_epi64(t, shift);
    2534             :         } else {
    2535             :             return _mm256_srli_epi64(q, more);
    2536             :         }
    2537             :     }
    2538             : }
    2539             : 
    2540             : __m256i libdivide_u64_branchfree_do_vec256(
    2541             :     __m256i numers, const struct libdivide_u64_branchfree_t *denom) {
    2542             :     __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic));
    2543             :     __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q);
    2544             :     return _mm256_srli_epi64(t, denom->more);
    2545             : }
    2546             : 
    2547             : ////////// SINT16
    2548             : 
    2549             : __m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) {
    2550             :     uint8_t more = denom->more;
    2551             :     if (!denom->magic) {
    2552             :         uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    2553             :         uint16_t mask = ((uint16_t)1 << shift) - 1;
    2554             :         __m256i roundToZeroTweak = _mm256_set1_epi16(mask);
    2555             :         // q = numer + ((numer >> 15) & roundToZeroTweak);
    2556             :         __m256i q = _mm256_add_epi16(
    2557             :             numers, _mm256_and_si256(_mm256_srai_epi16(numers, 15), roundToZeroTweak));
    2558             :         q = _mm256_srai_epi16(q, shift);
    2559             :         __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
    2560             :         // q = (q ^ sign) - sign;
    2561             :         q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign);
    2562             :         return q;
    2563             :     } else {
    2564             :         __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(denom->magic));
    2565             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2566             :             // must be arithmetic shift
    2567             :             __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
    2568             :             // q += ((numer ^ sign) - sign);
    2569             :             q = _mm256_add_epi16(q, _mm256_sub_epi16(_mm256_xor_si256(numers, sign), sign));
    2570             :         }
    2571             :         // q >>= shift
    2572             :         q = _mm256_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
    2573             :         q = _mm256_add_epi16(q, _mm256_srli_epi16(q, 15));  // q += (q < 0)
    2574             :         return q;
    2575             :     }
    2576             : }
    2577             : 
    2578             : __m256i libdivide_s16_branchfree_do_vec256(
    2579             :     __m256i numers, const struct libdivide_s16_branchfree_t *denom) {
    2580             :     int16_t magic = denom->magic;
    2581             :     uint8_t more = denom->more;
    2582             :     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    2583             :     // must be arithmetic shift
    2584             :     __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
    2585             :     __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(magic));
    2586             :     q = _mm256_add_epi16(q, numers);  // q += numers
    2587             : 
    2588             :     // If q is non-negative, we have nothing to do
    2589             :     // If q is negative, we want to add either (2**shift)-1 if d is
    2590             :     // a power of 2, or (2**shift) if it is not a power of 2
    2591             :     uint16_t is_power_of_2 = (magic == 0);
    2592             :     __m256i q_sign = _mm256_srai_epi16(q, 15);  // q_sign = q >> 15
    2593             :     __m256i mask = _mm256_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
    2594             :     q = _mm256_add_epi16(q, _mm256_and_si256(q_sign, mask));  // q = q + (q_sign & mask)
    2595             :     q = _mm256_srai_epi16(q, shift);                          // q >>= shift
    2596             :     q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign);    // q = (q ^ sign) - sign
    2597             :     return q;
    2598             : }
    2599             : 
    2600             : ////////// SINT32
    2601             : 
    2602             : __m256i libdivide_s32_do_vec256(__m256i numers, const struct libdivide_s32_t *denom) {
    2603             :     uint8_t more = denom->more;
    2604             :     if (!denom->magic) {
    2605             :         uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    2606             :         uint32_t mask = ((uint32_t)1 << shift) - 1;
    2607             :         __m256i roundToZeroTweak = _mm256_set1_epi32(mask);
    2608             :         // q = numer + ((numer >> 31) & roundToZeroTweak);
    2609             :         __m256i q = _mm256_add_epi32(
    2610             :             numers, _mm256_and_si256(_mm256_srai_epi32(numers, 31), roundToZeroTweak));
    2611             :         q = _mm256_srai_epi32(q, shift);
    2612             :         __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
    2613             :         // q = (q ^ sign) - sign;
    2614             :         q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign);
    2615             :         return q;
    2616             :     } else {
    2617             :         __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(denom->magic));
    2618             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2619             :             // must be arithmetic shift
    2620             :             __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
    2621             :             // q += ((numer ^ sign) - sign);
    2622             :             q = _mm256_add_epi32(q, _mm256_sub_epi32(_mm256_xor_si256(numers, sign), sign));
    2623             :         }
    2624             :         // q >>= shift
    2625             :         q = _mm256_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
    2626             :         q = _mm256_add_epi32(q, _mm256_srli_epi32(q, 31));  // q += (q < 0)
    2627             :         return q;
    2628             :     }
    2629             : }
    2630             : 
    2631             : __m256i libdivide_s32_branchfree_do_vec256(
    2632             :     __m256i numers, const struct libdivide_s32_branchfree_t *denom) {
    2633             :     int32_t magic = denom->magic;
    2634             :     uint8_t more = denom->more;
    2635             :     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    2636             :     // must be arithmetic shift
    2637             :     __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
    2638             :     __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(magic));
    2639             :     q = _mm256_add_epi32(q, numers);  // q += numers
    2640             : 
    2641             :     // If q is non-negative, we have nothing to do
    2642             :     // If q is negative, we want to add either (2**shift)-1 if d is
    2643             :     // a power of 2, or (2**shift) if it is not a power of 2
    2644             :     uint32_t is_power_of_2 = (magic == 0);
    2645             :     __m256i q_sign = _mm256_srai_epi32(q, 31);  // q_sign = q >> 31
    2646             :     __m256i mask = _mm256_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);
    2647             :     q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask));  // q = q + (q_sign & mask)
    2648             :     q = _mm256_srai_epi32(q, shift);                          // q >>= shift
    2649             :     q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign);    // q = (q ^ sign) - sign
    2650             :     return q;
    2651             : }
    2652             : 
    2653             : ////////// SINT64
    2654             : 
    2655             : __m256i libdivide_s64_do_vec256(__m256i numers, const struct libdivide_s64_t *denom) {
    2656             :     uint8_t more = denom->more;
    2657             :     int64_t magic = denom->magic;
    2658             :     if (magic == 0) {  // shift path
    2659             :         uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    2660             :         uint64_t mask = ((uint64_t)1 << shift) - 1;
    2661             :         __m256i roundToZeroTweak = _mm256_set1_epi64x(mask);
    2662             :         // q = numer + ((numer >> 63) & roundToZeroTweak);
    2663             :         __m256i q = _mm256_add_epi64(
    2664             :             numers, _mm256_and_si256(libdivide_s64_signbits_vec256(numers), roundToZeroTweak));
    2665             :         q = libdivide_s64_shift_right_vec256(q, shift);
    2666             :         __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
    2667             :         // q = (q ^ sign) - sign;
    2668             :         q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign);
    2669             :         return q;
    2670             :     } else {
    2671             :         __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic));
    2672             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2673             :             // must be arithmetic shift
    2674             :             __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
    2675             :             // q += ((numer ^ sign) - sign);
    2676             :             q = _mm256_add_epi64(q, _mm256_sub_epi64(_mm256_xor_si256(numers, sign), sign));
    2677             :         }
    2678             :         // q >>= denom->mult_path.shift
    2679             :         q = libdivide_s64_shift_right_vec256(q, more & LIBDIVIDE_64_SHIFT_MASK);
    2680             :         q = _mm256_add_epi64(q, _mm256_srli_epi64(q, 63));  // q += (q < 0)
    2681             :         return q;
    2682             :     }
    2683             : }
    2684             : 
    2685             : __m256i libdivide_s64_branchfree_do_vec256(
    2686             :     __m256i numers, const struct libdivide_s64_branchfree_t *denom) {
    2687             :     int64_t magic = denom->magic;
    2688             :     uint8_t more = denom->more;
    2689             :     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    2690             :     // must be arithmetic shift
    2691             :     __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
    2692             : 
    2693             :     // libdivide_mullhi_s64(numers, magic);
    2694             :     __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic));
    2695             :     q = _mm256_add_epi64(q, numers);  // q += numers
    2696             : 
    2697             :     // If q is non-negative, we have nothing to do.
    2698             :     // If q is negative, we want to add either (2**shift)-1 if d is
    2699             :     // a power of 2, or (2**shift) if it is not a power of 2.
    2700             :     uint32_t is_power_of_2 = (magic == 0);
    2701             :     __m256i q_sign = libdivide_s64_signbits_vec256(q);  // q_sign = q >> 63
    2702             :     __m256i mask = _mm256_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2);
    2703             :     q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask));  // q = q + (q_sign & mask)
    2704             :     q = libdivide_s64_shift_right_vec256(q, shift);           // q >>= shift
    2705             :     q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign);    // q = (q ^ sign) - sign
    2706             :     return q;
    2707             : }
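
// A scalar sketch of the branchfree fix-up above (hypothetical helper, not
// part of the library): negative intermediate quotients are biased before the
// shift so the result truncates toward zero, using (2**shift) - 1 for
// power-of-2 divisors and (2**shift) otherwise; the mask replaces any
// data-dependent branch.
static inline int64_t libdivide_branchfree_fixup_sketch(
    int64_t q, uint32_t shift, int is_power_of_2) {
    int64_t q_sign = q >> 63;  // all-ones when q < 0, else zero
    int64_t mask = (int64_t)(((uint64_t)1 << shift) - (uint64_t)is_power_of_2);
    q += q_sign & mask;        // bias is applied to negative q only
    return q >> shift;
}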
    2708             : 
    2709             : #endif
    2710             : 
    2711             : #if defined(LIBDIVIDE_SSE2)
    2712             : 
    2713             : static LIBDIVIDE_INLINE __m128i libdivide_u16_do_vec128(
    2714             :     __m128i numers, const struct libdivide_u16_t *denom);
    2715             : static LIBDIVIDE_INLINE __m128i libdivide_s16_do_vec128(
    2716             :     __m128i numers, const struct libdivide_s16_t *denom);
    2717             : static LIBDIVIDE_INLINE __m128i libdivide_u32_do_vec128(
    2718             :     __m128i numers, const struct libdivide_u32_t *denom);
    2719             : static LIBDIVIDE_INLINE __m128i libdivide_s32_do_vec128(
    2720             :     __m128i numers, const struct libdivide_s32_t *denom);
    2721             : static LIBDIVIDE_INLINE __m128i libdivide_u64_do_vec128(
    2722             :     __m128i numers, const struct libdivide_u64_t *denom);
    2723             : static LIBDIVIDE_INLINE __m128i libdivide_s64_do_vec128(
    2724             :     __m128i numers, const struct libdivide_s64_t *denom);
    2725             : 
    2726             : static LIBDIVIDE_INLINE __m128i libdivide_u16_branchfree_do_vec128(
    2727             :     __m128i numers, const struct libdivide_u16_branchfree_t *denom);
    2728             : static LIBDIVIDE_INLINE __m128i libdivide_s16_branchfree_do_vec128(
    2729             :     __m128i numers, const struct libdivide_s16_branchfree_t *denom);
    2730             : static LIBDIVIDE_INLINE __m128i libdivide_u32_branchfree_do_vec128(
    2731             :     __m128i numers, const struct libdivide_u32_branchfree_t *denom);
    2732             : static LIBDIVIDE_INLINE __m128i libdivide_s32_branchfree_do_vec128(
    2733             :     __m128i numers, const struct libdivide_s32_branchfree_t *denom);
    2734             : static LIBDIVIDE_INLINE __m128i libdivide_u64_branchfree_do_vec128(
    2735             :     __m128i numers, const struct libdivide_u64_branchfree_t *denom);
    2736             : static LIBDIVIDE_INLINE __m128i libdivide_s64_branchfree_do_vec128(
    2737             :     __m128i numers, const struct libdivide_s64_branchfree_t *denom);
    2738             : 
    2739             : //////// Internal Utility Functions
    2740             : 
    2741             : // Implementation of _mm_srai_epi64(v, 63) (from AVX512).
    2742             : static LIBDIVIDE_INLINE __m128i libdivide_s64_signbits_vec128(__m128i v) {
    2743             :     __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
    2744             :     __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31);
    2745             :     return signBits;
    2746             : }
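
// Note: the shuffle above copies each lane's high 32 bits into both 32-bit
// halves of that lane, so the 32-bit arithmetic shift yields a full 64-bit
// sign mask per lane; the scalar equivalent is simply v >> 63, i.e. 0 or -1.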
    2747             : 
    2748             : // Implementation of _mm_srai_epi64 (from AVX512).
    2749             : static LIBDIVIDE_INLINE __m128i libdivide_s64_shift_right_vec128(__m128i v, int amt) {
    2750             :     const int b = 64 - amt;
    2751             :     __m128i m = _mm_set1_epi64x((uint64_t)1 << (b - 1));
    2752             :     __m128i x = _mm_srli_epi64(v, amt);
    2753             :     __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m);
    2754             :     return result;
    2755             : }
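
// A scalar sketch of the sign-extension trick above (hypothetical helper,
// illustrative only): after the logical shift the sign bit sits at position
// 63 - amt, so XOR-ing with a mask at that bit and then subtracting the same
// mask propagates the sign through the vacated upper bits.
static inline int64_t libdivide_srai64_sketch(int64_t v, int amt) {
    uint64_t m = (uint64_t)1 << (63 - amt);  // shifted sign-bit position
    uint64_t x = (uint64_t)v >> amt;         // logical shift
    return (int64_t)((x ^ m) - m);           // sign-extend the vacated bits
}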
    2756             : 
    2757             : // Here, b is assumed to contain one 32-bit value repeated.
    2758             : static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u32_vec128(__m128i a, __m128i b) {
    2759        2436 :     __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32);
    2760        1218 :     __m128i a1X3X = _mm_srli_epi64(a, 32);
    2761        1218 :     __m128i mask = _mm_set_epi32(-1, 0, -1, 0);
    2762        2436 :     __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), mask);
    2763        1218 :     return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3);
    2764             : }
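
// A scalar reference for the routine above (hypothetical helper, for clarity
// only): each lane receives the high 32 bits of a 32x32->64 product. Two
// mul_epu32 calls are needed because that instruction multiplies only the
// even 32-bit lanes of its operands.
static inline uint32_t libdivide_mullhi_u32_sketch(uint32_t a, uint32_t b) {
    return (uint32_t)(((uint64_t)a * b) >> 32);
}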
    2765             : 
    2766             : // SSE2 has no signed 32-bit high-multiply instruction, but the unsigned high
    2767             : // product can be corrected to the signed one cheaply. Again, b is one
    2768             : // 32-bit value repeated four times.
    2769             : static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s32_vec128(__m128i a, __m128i b) {
    2770             :     __m128i p = libdivide_mullhi_u32_vec128(a, b);
    2771             :     // t1 = (a >> 31) & y, arithmetic shift
    2772             :     __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b);
    2773             :     __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a);
    2774             :     p = _mm_sub_epi32(p, t1);
    2775             :     p = _mm_sub_epi32(p, t2);
    2776             :     return p;
    2777             : }
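
// Why the two subtractions above produce the signed high product (a sketch
// using a hypothetical helper): reading a signed value as unsigned adds 2^32
// exactly when it is negative, so modulo 2^32
// mulhi_s(a, b) == mulhi_u(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0).
static inline int32_t libdivide_mullhi_s32_sketch(int32_t a, int32_t b) {
    uint32_t p = (uint32_t)(((uint64_t)(uint32_t)a * (uint32_t)b) >> 32);
    p -= (uint32_t)(a >> 31) & (uint32_t)b;  // subtract b when a < 0
    p -= (uint32_t)(b >> 31) & (uint32_t)a;  // subtract a when b < 0
    return (int32_t)p;
}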
    2778             : 
    2779             : // Here, y is assumed to contain one 64-bit value repeated.
    2780             : static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y) {
    2781             :     // full 128 bits product is:
    2782             :     // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64)
    2783             :     // Note x0, y0, x1, y1 are all conceptually uint32; products are 32x32->64.
    2784             : 
    2785             :     // Compute x0*y0.
    2786             :     // Note x1, y1 are ignored by mul_epu32.
    2787             :     __m128i x0y0 = _mm_mul_epu32(x, y);
    2788             :     __m128i x0y0_hi = _mm_srli_epi64(x0y0, 32);
    2789             : 
    2790             :     // Get x1, y1 in the low bits.
    2791             :     // We could shuffle or right shift. Shuffles are preferred as they preserve
    2792             :     // the source register for the next computation.
    2793             :     __m128i x1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1));
    2794             :     __m128i y1 = _mm_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1));
    2795             : 
    2796             :     // No need to mask off top 32 bits for mul_epu32.
    2797             :     __m128i x0y1 = _mm_mul_epu32(x, y1);
    2798             :     __m128i x1y0 = _mm_mul_epu32(x1, y);
    2799             :     __m128i x1y1 = _mm_mul_epu32(x1, y1);
    2800             : 
    2801             :     // Mask here selects low bits only.
    2802             :     __m128i mask = _mm_set1_epi64x(0xFFFFFFFF);
    2803             :     __m128i temp = _mm_add_epi64(x1y0, x0y0_hi);
    2804             :     __m128i temp_lo = _mm_and_si128(temp, mask);
    2805             :     __m128i temp_hi = _mm_srli_epi64(temp, 32);
    2806             : 
    2807             :     temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32);
    2808             :     temp_hi = _mm_add_epi64(x1y1, temp_hi);
    2809             :     return _mm_add_epi64(temp_lo, temp_hi);
    2810             : }
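
// A scalar sketch of the same schoolbook decomposition (hypothetical helper
// mirroring the lane arithmetic above): split both operands into 32-bit
// halves, form the four partial products, and propagate the middle carries.
static inline uint64_t libdivide_mullhi_u64_sketch(uint64_t x, uint64_t y) {
    uint64_t x0 = (uint32_t)x, x1 = x >> 32;
    uint64_t y0 = (uint32_t)y, y1 = y >> 32;
    uint64_t x0y0_hi = (x0 * y0) >> 32;
    uint64_t t = x1 * y0 + x0y0_hi;           // cannot overflow 64 bits
    uint64_t t_lo = (uint32_t)t, t_hi = t >> 32;
    t_lo = (t_lo + x0 * y1) >> 32;            // carry out of the middle column
    return x1 * y1 + t_hi + t_lo;
}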
    2811             : 
    2812             : // y is one 64-bit value repeated.
    2813             : static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y) {
    2814             :     __m128i p = libdivide_mullhi_u64_vec128(x, y);
    2815             :     __m128i t1 = _mm_and_si128(libdivide_s64_signbits_vec128(x), y);
    2816             :     __m128i t2 = _mm_and_si128(libdivide_s64_signbits_vec128(y), x);
    2817             :     p = _mm_sub_epi64(p, t1);
    2818             :     p = _mm_sub_epi64(p, t2);
    2819             :     return p;
    2820             : }
    2821             : 
    2822             : ////////// UINT16
    2823             : 
    2824             : __m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) {
    2825       10030 :     uint8_t more = denom->more;
    2826       10030 :     if (!denom->magic) {
    2827          36 :         return _mm_srli_epi16(numers, more);
    2828             :     } else {
    2829       20024 :         __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
    2830       10012 :         if (more & LIBDIVIDE_ADD_MARKER) {
    2831          12 :             __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
    2832          12 :             return _mm_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
    2833             :         } else {
    2834       20012 :             return _mm_srli_epi16(q, more);
    2835             :         }
    2836             :     }
    2837             : }
    2838             : 
    2839             : __m128i libdivide_u16_branchfree_do_vec128(
    2840             :     __m128i numers, const struct libdivide_u16_branchfree_t *denom) {
    2841             :     __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
    2842             :     __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
    2843             :     return _mm_srli_epi16(t, denom->more);
    2844             : }
    2845             : 
    2846             : ////////// UINT32
    2847             : 
    2848             : __m128i libdivide_u32_do_vec128(__m128i numers, const struct libdivide_u32_t *denom) {
    2849        1272 :     uint8_t more = denom->more;
    2850        1272 :     if (!denom->magic) {
    2851         108 :         return _mm_srli_epi32(numers, more);
    2852             :     } else {
    2853        2436 :         __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic));
    2854        1218 :         if (more & LIBDIVIDE_ADD_MARKER) {
    2855             :             // uint32_t t = ((numer - q) >> 1) + q;
    2856             :             // return t >> denom->shift;
    2857           0 :             uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    2858           0 :             __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
    2859           0 :             return _mm_srli_epi32(t, shift);
    2860             :         } else {
    2861        2436 :             return _mm_srli_epi32(q, more);
    2862             :         }
    2863             :     }
    2864             : }
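
// Why ((numer - q) >> 1) + q in the ADD_MARKER path above: the plain sum
// numer + q can overflow the element width, but q <= numer always holds, so
// ((numer - q) >> 1) + q == (numer + q) >> 1 computes the average exactly
// without a wider type. A sketch of the identity (hypothetical helper):
static inline uint32_t libdivide_avg_sketch(uint32_t numer, uint32_t q) {
    return ((numer - q) >> 1) + q;  // == (numer + q) / 2 when numer >= q
}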
    2865             : 
    2866             : __m128i libdivide_u32_branchfree_do_vec128(
    2867             :     __m128i numers, const struct libdivide_u32_branchfree_t *denom) {
    2868             :     __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic));
    2869             :     __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
    2870             :     return _mm_srli_epi32(t, denom->more);
    2871             : }
    2872             : 
    2873             : ////////// UINT64
    2874             : 
    2875             : __m128i libdivide_u64_do_vec128(__m128i numers, const struct libdivide_u64_t *denom) {
    2876             :     uint8_t more = denom->more;
    2877             :     if (!denom->magic) {
    2878             :         return _mm_srli_epi64(numers, more);
    2879             :     } else {
    2880             :         __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic));
    2881             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2882             :             // uint64_t t = ((numer - q) >> 1) + q;
    2883             :             // return t >> denom->shift;
    2884             :             uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    2885             :             __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
    2886             :             return _mm_srli_epi64(t, shift);
    2887             :         } else {
    2888             :             return _mm_srli_epi64(q, more);
    2889             :         }
    2890             :     }
    2891             : }
    2892             : 
    2893             : __m128i libdivide_u64_branchfree_do_vec128(
    2894             :     __m128i numers, const struct libdivide_u64_branchfree_t *denom) {
    2895             :     __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic));
    2896             :     __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
    2897             :     return _mm_srli_epi64(t, denom->more);
    2898             : }
    2899             : 
    2900             : ////////// SINT16
    2901             : 
    2902             : __m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) {
    2903             :     uint8_t more = denom->more;
    2904             :     if (!denom->magic) {
    2905             :         uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    2906             :         uint16_t mask = ((uint16_t)1 << shift) - 1;
    2907             :         __m128i roundToZeroTweak = _mm_set1_epi16(mask);
    2908             :         // q = numer + ((numer >> 15) & roundToZeroTweak);
    2909             :         __m128i q =
    2910             :             _mm_add_epi16(numers, _mm_and_si128(_mm_srai_epi16(numers, 15), roundToZeroTweak));
    2911             :         q = _mm_srai_epi16(q, shift);
    2912             :         __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
    2913             :         // q = (q ^ sign) - sign;
    2914             :         q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign);
    2915             :         return q;
    2916             :     } else {
    2917             :         __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(denom->magic));
    2918             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2919             :             // must be arithmetic shift
    2920             :             __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
    2921             :             // q += ((numer ^ sign) - sign);
    2922             :             q = _mm_add_epi16(q, _mm_sub_epi16(_mm_xor_si128(numers, sign), sign));
    2923             :         }
    2924             :         // q >>= shift
    2925             :         q = _mm_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
    2926             :         q = _mm_add_epi16(q, _mm_srli_epi16(q, 15));  // q += (q < 0)
    2927             :         return q;
    2928             :     }
    2929             : }
    2930             : 
    2931             : __m128i libdivide_s16_branchfree_do_vec128(
    2932             :     __m128i numers, const struct libdivide_s16_branchfree_t *denom) {
    2933             :     int16_t magic = denom->magic;
    2934             :     uint8_t more = denom->more;
    2935             :     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    2936             :     // must be arithmetic shift
    2937             :     __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
    2938             :     __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(magic));
    2939             :     q = _mm_add_epi16(q, numers);  // q += numers
    2940             : 
    2941             :     // If q is non-negative, we have nothing to do
    2942             :     // If q is negative, we want to add either (2**shift)-1 if d is
    2943             :     // a power of 2, or (2**shift) if it is not a power of 2
    2944             :     uint16_t is_power_of_2 = (magic == 0);
    2945             :     __m128i q_sign = _mm_srai_epi16(q, 15);  // q_sign = q >> 15
    2946             :     __m128i mask = _mm_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
    2947             :     q = _mm_add_epi16(q, _mm_and_si128(q_sign, mask));  // q = q + (q_sign & mask)
    2948             :     q = _mm_srai_epi16(q, shift);                       // q >>= shift
    2949             :     q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign);    // q = (q ^ sign) - sign
    2950             :     return q;
    2951             : }
    2952             : 
    2953             : ////////// SINT32
    2954             : 
    2955             : __m128i libdivide_s32_do_vec128(__m128i numers, const struct libdivide_s32_t *denom) {
    2956             :     uint8_t more = denom->more;
    2957             :     if (!denom->magic) {
    2958             :         uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    2959             :         uint32_t mask = ((uint32_t)1 << shift) - 1;
    2960             :         __m128i roundToZeroTweak = _mm_set1_epi32(mask);
    2961             :         // q = numer + ((numer >> 31) & roundToZeroTweak);
    2962             :         __m128i q =
    2963             :             _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak));
    2964             :         q = _mm_srai_epi32(q, shift);
    2965             :         __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
    2966             :         // q = (q ^ sign) - sign;
    2967             :         q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign);
    2968             :         return q;
    2969             :     } else {
    2970             :         __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(denom->magic));
    2971             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2972             :             // must be arithmetic shift
    2973             :             __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
    2974             :             // q += ((numer ^ sign) - sign);
    2975             :             q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign));
    2976             :         }
    2977             :         // q >>= shift
    2978             :         q = _mm_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
    2979             :         q = _mm_add_epi32(q, _mm_srli_epi32(q, 31));  // q += (q < 0)
    2980             :         return q;
    2981             :     }
    2982             : }
    2983             : 
    2984             : __m128i libdivide_s32_branchfree_do_vec128(
    2985             :     __m128i numers, const struct libdivide_s32_branchfree_t *denom) {
    2986             :     int32_t magic = denom->magic;
    2987             :     uint8_t more = denom->more;
    2988             :     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    2989             :     // must be arithmetic shift
    2990             :     __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
    2991             :     __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(magic));
    2992             :     q = _mm_add_epi32(q, numers);  // q += numers
    2993             : 
    2994             :     // If q is non-negative, we have nothing to do
    2995             :     // If q is negative, we want to add either (2**shift)-1 if d is
    2996             :     // a power of 2, or (2**shift) if it is not a power of 2
    2997             :     uint32_t is_power_of_2 = (magic == 0);
    2998             :     __m128i q_sign = _mm_srai_epi32(q, 31);  // q_sign = q >> 31
    2999             :     __m128i mask = _mm_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);
    3000             :     q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask));  // q = q + (q_sign & mask)
    3001             :     q = _mm_srai_epi32(q, shift);                       // q >>= shift
    3002             :     q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign);    // q = (q ^ sign) - sign
    3003             :     return q;
    3004             : }
    3005             : 
    3006             : ////////// SINT64
    3007             : 
    3008             : __m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *denom) {
    3009             :     uint8_t more = denom->more;
    3010             :     int64_t magic = denom->magic;
    3011             :     if (magic == 0) {  // shift path
    3012             :         uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    3013             :         uint64_t mask = ((uint64_t)1 << shift) - 1;
    3014             :         __m128i roundToZeroTweak = _mm_set1_epi64x(mask);
    3015             :         // q = numer + ((numer >> 63) & roundToZeroTweak);
    3016             :         __m128i q = _mm_add_epi64(
    3017             :             numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak));
    3018             :         q = libdivide_s64_shift_right_vec128(q, shift);
    3019             :         __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
    3020             :         // q = (q ^ sign) - sign;
    3021             :         q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign);
    3022             :         return q;
    3023             :     } else {
    3024             :         __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic));
    3025             :         if (more & LIBDIVIDE_ADD_MARKER) {
    3026             :             // must be arithmetic shift
    3027             :             __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
    3028             :             // q += ((numer ^ sign) - sign);
    3029             :             q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign));
    3030             :         }
    3031             :         // q >>= shift
    3032             :         q = libdivide_s64_shift_right_vec128(q, more & LIBDIVIDE_64_SHIFT_MASK);
    3033             :         q = _mm_add_epi64(q, _mm_srli_epi64(q, 63));  // q += (q < 0)
    3034             :         return q;
    3035             :     }
    3036             : }
    3037             : 
    3038             : __m128i libdivide_s64_branchfree_do_vec128(
    3039             :     __m128i numers, const struct libdivide_s64_branchfree_t *denom) {
    3040             :     int64_t magic = denom->magic;
    3041             :     uint8_t more = denom->more;
    3042             :     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    3043             :     // must be arithmetic shift
    3044             :     __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
    3045             : 
    3046             :     // libdivide_mullhi_s64(numers, magic);
    3047             :     __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic));
    3048             :     q = _mm_add_epi64(q, numers);  // q += numers
    3049             : 
    3050             :     // If q is non-negative, we have nothing to do.
    3051             :     // If q is negative, we want to add either (2**shift)-1 if d is
    3052             :     // a power of 2, or (2**shift) if it is not a power of 2.
    3053             :     uint32_t is_power_of_2 = (magic == 0);
    3054             :     __m128i q_sign = libdivide_s64_signbits_vec128(q);  // q_sign = q >> 63
    3055             :     __m128i mask = _mm_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2);
    3056             :     q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask));  // q = q + (q_sign & mask)
    3057             :     q = libdivide_s64_shift_right_vec128(q, shift);     // q >>= shift
    3058             :     q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign);    // q = (q ^ sign) - sign
    3059             :     return q;
    3060             : }
    3061             : 
    3062             : #endif
    3063             : 
    3064             : ////////// C++ stuff
    3065             : 
    3066             : #ifdef __cplusplus
    3067             : 
    3068             : enum Branching {
    3069             :     BRANCHFULL,  // use branching algorithms
    3070             :     BRANCHFREE   // use branchfree algorithms
    3071             : };
    3072             : 
    3073             : namespace detail {
    3074             : enum Signedness {
    3075             :     SIGNED,
    3076             :     UNSIGNED,
    3077             : };
    3078             : 
    3079             : #if defined(LIBDIVIDE_NEON)
    3080             : // Helper to deduce the NEON vector type for an integral type.
    3081             : template <int _WIDTH, Signedness _SIGN>
    3082             : struct NeonVec {};
    3083             : 
    3084             : template <>
    3085             : struct NeonVec<16, UNSIGNED> {
    3086             :     typedef uint16x8_t type;
    3087             : };
    3088             : 
    3089             : template <>
    3090             : struct NeonVec<16, SIGNED> {
    3091             :     typedef int16x8_t type;
    3092             : };
    3093             : 
    3094             : template <>
    3095             : struct NeonVec<32, UNSIGNED> {
    3096             :     typedef uint32x4_t type;
    3097             : };
    3098             : 
    3099             : template <>
    3100             : struct NeonVec<32, SIGNED> {
    3101             :     typedef int32x4_t type;
    3102             : };
    3103             : 
    3104             : template <>
    3105             : struct NeonVec<64, UNSIGNED> {
    3106             :     typedef uint64x2_t type;
    3107             : };
    3108             : 
    3109             : template <>
    3110             : struct NeonVec<64, SIGNED> {
    3111             :     typedef int64x2_t type;
    3112             : };
    3113             : 
    3114             : template <typename T>
    3115             : struct NeonVecFor {
    3116             :     // See 'class divider' for an explanation of these template parameters.
    3117             :     typedef typename NeonVec<sizeof(T) * 8, (((T)0 >> 0) > (T)(-1) ? SIGNED : UNSIGNED)>::type type;
    3118             : };
    3119             : 
    3120             : #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE)                    \
    3121             :     LIBDIVIDE_INLINE typename NeonVecFor<INT_TYPE>::type divide( \
    3122             :         typename NeonVecFor<INT_TYPE>::type n) const {           \
    3123             :         return libdivide_##ALGO##_do_vec128(n, &denom);          \
    3124             :     }
    3125             : #else
    3126             : #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE)
    3127             : #endif
    3128             : 
    3129             : #if defined(LIBDIVIDE_SSE2)
    3130             : #define LIBDIVIDE_DIVIDE_SSE2(ALGO)                     \
    3131             :     LIBDIVIDE_INLINE __m128i divide(__m128i n) const {  \
    3132             :         return libdivide_##ALGO##_do_vec128(n, &denom); \
    3133             :     }
    3134             : #else
    3135             : #define LIBDIVIDE_DIVIDE_SSE2(ALGO)
    3136             : #endif
    3137             : 
    3138             : #if defined(LIBDIVIDE_AVX2)
    3139             : #define LIBDIVIDE_DIVIDE_AVX2(ALGO)                     \
    3140             :     LIBDIVIDE_INLINE __m256i divide(__m256i n) const {  \
    3141             :         return libdivide_##ALGO##_do_vec256(n, &denom); \
    3142             :     }
    3143             : #else
    3144             : #define LIBDIVIDE_DIVIDE_AVX2(ALGO)
    3145             : #endif
    3146             : 
    3147             : #if defined(LIBDIVIDE_AVX512)
    3148             : #define LIBDIVIDE_DIVIDE_AVX512(ALGO)                   \
    3149             :     LIBDIVIDE_INLINE __m512i divide(__m512i n) const {  \
    3150             :         return libdivide_##ALGO##_do_vec512(n, &denom); \
    3151             :     }
    3152             : #else
    3153             : #define LIBDIVIDE_DIVIDE_AVX512(ALGO)
    3154             : #endif
    3155             : 
    3156             : // The DISPATCHER_GEN() macro generates C++ methods (for the given integer
    3157             : // and algorithm types) that redirect to libdivide's C API.
    3158             : #define DISPATCHER_GEN(T, ALGO)                                                       \
    3159             :     libdivide_##ALGO##_t denom;                                                       \
    3160             :     LIBDIVIDE_INLINE dispatcher() {}                                                  \
    3161             :     explicit LIBDIVIDE_CONSTEXPR dispatcher(decltype(nullptr)) : denom{} {}           \
    3162             :     LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {}            \
    3163             :     LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \
    3164             :     LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \
    3165             :     LIBDIVIDE_DIVIDE_NEON(ALGO, T)                                                    \
    3166             :     LIBDIVIDE_DIVIDE_SSE2(ALGO)                                                       \
    3167             :     LIBDIVIDE_DIVIDE_AVX2(ALGO)                                                       \
    3168             :     LIBDIVIDE_DIVIDE_AVX512(ALGO)
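
// For example, DISPATCHER_GEN(uint32_t, u32) expands roughly to the members
// below (SIMD overloads elided; shown only to illustrate the macro):
//
//     libdivide_u32_t denom;
//     LIBDIVIDE_INLINE dispatcher() {}
//     explicit LIBDIVIDE_CONSTEXPR dispatcher(decltype(nullptr)) : denom{} {}
//     LIBDIVIDE_INLINE dispatcher(uint32_t d) : denom(libdivide_u32_gen(d)) {}
//     LIBDIVIDE_INLINE uint32_t divide(uint32_t n) const { return libdivide_u32_do(n, &denom); }
//     LIBDIVIDE_INLINE uint32_t recover() const { return libdivide_u32_recover(&denom); }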
    3169             : 
    3170             : // The dispatcher selects a specific division algorithm for a given
    3171             : // width, signedness, and ALGO using partial template specialization.
    3172             : template <int _WIDTH, Signedness _SIGN, Branching _ALGO>
    3173             : struct dispatcher {};
    3174             : 
    3175             : template <>
    3176             : struct dispatcher<16, SIGNED, BRANCHFULL> {
    3177             :     DISPATCHER_GEN(int16_t, s16)
    3178             : };
    3179             : template <>
    3180             : struct dispatcher<16, SIGNED, BRANCHFREE> {
    3181             :     DISPATCHER_GEN(int16_t, s16_branchfree)
    3182             : };
    3183             : template <>
    3184             : struct dispatcher<16, UNSIGNED, BRANCHFULL> {
    3185       10038 :     DISPATCHER_GEN(uint16_t, u16)
    3186             : };
    3187             : template <>
    3188             : struct dispatcher<16, UNSIGNED, BRANCHFREE> {
    3189             :     DISPATCHER_GEN(uint16_t, u16_branchfree)
    3190             : };
    3191             : template <>
    3192             : struct dispatcher<32, SIGNED, BRANCHFULL> {
    3193             :     DISPATCHER_GEN(int32_t, s32)
    3194             : };
    3195             : template <>
    3196             : struct dispatcher<32, SIGNED, BRANCHFREE> {
    3197             :     DISPATCHER_GEN(int32_t, s32_branchfree)
    3198             : };
    3199             : template <>
    3200             : struct dispatcher<32, UNSIGNED, BRANCHFULL> {
    3201        1285 :     DISPATCHER_GEN(uint32_t, u32)
    3202             : };
    3203             : template <>
    3204             : struct dispatcher<32, UNSIGNED, BRANCHFREE> {
    3205             :     DISPATCHER_GEN(uint32_t, u32_branchfree)
    3206             : };
    3207             : template <>
    3208             : struct dispatcher<64, SIGNED, BRANCHFULL> {
    3209             :     DISPATCHER_GEN(int64_t, s64)
    3210             : };
    3211             : template <>
    3212             : struct dispatcher<64, SIGNED, BRANCHFREE> {
    3213             :     DISPATCHER_GEN(int64_t, s64_branchfree)
    3214             : };
    3215             : template <>
    3216             : struct dispatcher<64, UNSIGNED, BRANCHFULL> {
    3217             :     DISPATCHER_GEN(uint64_t, u64)
    3218             : };
    3219             : template <>
    3220             : struct dispatcher<64, UNSIGNED, BRANCHFREE> {
    3221             :     DISPATCHER_GEN(uint64_t, u64_branchfree)
    3222             : };
    3223             : }  // namespace detail
    3224             : 
    3225             : #if defined(LIBDIVIDE_NEON)
    3226             : // Allow NeonVecFor outside of detail namespace.
    3227             : template <typename T>
    3228             : struct NeonVecFor {
    3229             :     typedef typename detail::NeonVecFor<T>::type type;
    3230             : };
    3231             : #endif
    3232             : 
    3233             : // This is the main divider class for use by the user (C++ API).
    3234             : // The actual division algorithm is selected using the dispatcher struct
    3235             : // based on the integer width, signedness, and ALGO template parameters.
    3236             : template <typename T, Branching ALGO = BRANCHFULL>
    3237             : class divider {
    3238             :    private:
    3239             :     // Dispatch based on the size and signedness.
    3240             :     // We avoid using type_traits as it's not available in AVR.
    3241             :     // Detect signedness by checking if T(-1) is less than T(0).
    3242             :     // Also throw in a shift by 0, which prevents floating point types from being passed.
    3243             :     typedef detail::dispatcher<sizeof(T) * 8,
    3244             :         (((T)0 >> 0) > (T)(-1) ? detail::SIGNED : detail::UNSIGNED), ALGO>
    3245             :         dispatcher_t;
    3246             : 
    3247             :    public:
    3248             :     // We leave the default constructor empty so that creating
    3249             :     // an array of dividers and then initializing them
    3250             :     // later doesn't slow us down.
    3251             :     divider() {}
    3252             : 
    3253             :     // constexpr zero-initialization to allow for use with static constinit
    3254             :     explicit LIBDIVIDE_CONSTEXPR divider(decltype(nullptr)) : div(nullptr) {}
    3255             : 
    3256             :     // Constructor that takes the divisor as a parameter
    3257          42 :     LIBDIVIDE_INLINE divider(T d) : div(d) {}
    3258             : 
    3259             :     // Divides n by the divisor
    3260             :     LIBDIVIDE_INLINE T divide(T n) const { return div.divide(n); }
    3261             : 
    3262             :     // Recovers the divisor: returns the value that was
    3263             :     // used to initialize this divider object.
    3264             :     T recover() const { return div.recover(); }
    3265             : 
    3266             :     bool operator==(const divider<T, ALGO> &other) const {
    3267             :         return div.denom.magic == other.div.denom.magic && div.denom.more == other.div.denom.more;
    3268             :     }
    3269             : 
    3270             :     bool operator!=(const divider<T, ALGO> &other) const { return !(*this == other); }
    3271             : 
    3272             :     // Vector variants treat the input as packed integer values with the same type as the
    3273             :     // divider (e.g. s32, u32, s64, u64), divide each of them by the divider, and return
    3274             :     // the packed quotients.
    3275             : #if defined(LIBDIVIDE_SSE2)
    3276       11302 :     LIBDIVIDE_INLINE __m128i divide(__m128i n) const { return div.divide(n); }
    3277             : #endif
    3278             : #if defined(LIBDIVIDE_AVX2)
    3279             :     LIBDIVIDE_INLINE __m256i divide(__m256i n) const { return div.divide(n); }
    3280             : #endif
    3281             : #if defined(LIBDIVIDE_AVX512)
    3282             :     LIBDIVIDE_INLINE __m512i divide(__m512i n) const { return div.divide(n); }
    3283             : #endif
    3284             : #if defined(LIBDIVIDE_NEON)
    3285             :     LIBDIVIDE_INLINE typename NeonVecFor<T>::type divide(typename NeonVecFor<T>::type n) const {
    3286             :         return div.divide(n);
    3287             :     }
    3288             : #endif
    3289             : 
    3290             :    private:
    3291             :     // Storage for the actual divisor
    3292             :     dispatcher_t div;
    3293             : };
    3294             : 
    3295             : // Overload of operator / for scalar division
    3296             : template <typename T, Branching ALGO>
    3297             : LIBDIVIDE_INLINE T operator/(T n, const divider<T, ALGO> &div) {
    3298             :     return div.divide(n);
    3299             : }
    3300             : 
    3301             : // Overload of operator /= for scalar division
    3302             : template <typename T, Branching ALGO>
    3303             : LIBDIVIDE_INLINE T &operator/=(T &n, const divider<T, ALGO> &div) {
    3304             :     n = div.divide(n);
    3305             :     return n;
    3306             : }
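
// A minimal usage sketch for the scalar overloads above (illustrative only;
// 'd', 'in', 'out', and 'count' are hypothetical):
//
//     libdivide::divider<uint32_t> fast_d(d);  // precompute the magic numbers once
//     for (size_t i = 0; i < count; ++i)
//         out[i] = in[i] / fast_d;             // dispatched through operator/ above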
    3307             : 
    3308             : // Overloads for vector types.
    3309             : #if defined(LIBDIVIDE_SSE2)
    3310             : template <typename T, Branching ALGO>
    3311             : LIBDIVIDE_INLINE __m128i operator/(__m128i n, const divider<T, ALGO> &div) {
    3312             :     return div.divide(n);
    3313             : }
    3314             : 
    3315             : template <typename T, Branching ALGO>
    3316             : LIBDIVIDE_INLINE __m128i operator/=(__m128i &n, const divider<T, ALGO> &div) {
    3317       11302 :     n = div.divide(n);
    3318       11302 :     return n;
    3319             : }
    3320             : #endif
    3321             : #if defined(LIBDIVIDE_AVX2)
    3322             : template <typename T, Branching ALGO>
    3323             : LIBDIVIDE_INLINE __m256i operator/(__m256i n, const divider<T, ALGO> &div) {
    3324             :     return div.divide(n);
    3325             : }
    3326             : 
    3327             : template <typename T, Branching ALGO>
    3328             : LIBDIVIDE_INLINE __m256i operator/=(__m256i &n, const divider<T, ALGO> &div) {
    3329             :     n = div.divide(n);
    3330             :     return n;
    3331             : }
    3332             : #endif
    3333             : #if defined(LIBDIVIDE_AVX512)
    3334             : template <typename T, Branching ALGO>
    3335             : LIBDIVIDE_INLINE __m512i operator/(__m512i n, const divider<T, ALGO> &div) {
    3336             :     return div.divide(n);
    3337             : }
    3338             : 
    3339             : template <typename T, Branching ALGO>
    3340             : LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider<T, ALGO> &div) {
    3341             :     n = div.divide(n);
    3342             :     return n;
    3343             : }
    3344             : #endif
    3345             : 
    3346             : #if defined(LIBDIVIDE_NEON)
    3347             : template <typename T, Branching ALGO>
    3348             : LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(
    3349             :     typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) {
    3350             :     return div.divide(n);
    3351             : }
    3352             : 
    3353             : template <typename T, Branching ALGO>
    3354             : LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(
    3355             :     typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) {
    3356             :     n = div.divide(n);
    3357             :     return n;
    3358             : }
    3359             : #endif
    3360             : 
    3361             : #if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900)
    3362             : // libdivide::branchfree_divider<T>
    3363             : template <typename T>
    3364             : using branchfree_divider = divider<T, BRANCHFREE>;
    3365             : #endif
    3366             : 
    3367             : }  // namespace libdivide
    3368             : 
    3369             : #endif  // __cplusplus
    3370             : 
    3371             : #if defined(_MSC_VER) && !defined(__clang__)
    3372             : #pragma warning(pop)
    3373             : #endif
    3374             : 
    3375             : #endif  // LIBDIVIDE_H

Generated by: LCOV version 1.14