LCOV - code coverage report
Current view: top level - third_party/libdivide - libdivide.h (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 80 85 94.1 %
Date: 2026-03-05 10:33:42 Functions: 0 0 -

          Line data    Source code
       1             : // libdivide.h - Optimized integer division
       2             : // https://libdivide.com
       3             : //
       4             : // Copyright (C) 2010 - 2022 ridiculous_fish, <libdivide@ridiculousfish.com>
       5             : // Copyright (C) 2016 - 2022 Kim Walisch, <kim.walisch@gmail.com>
       6             : //
       7             : // libdivide is dual-licensed under the Boost or zlib licenses.
       8             : // You may use libdivide under the terms of either of these.
       9             : // See LICENSE.txt for more details.
      10             : 
      11             : #ifndef LIBDIVIDE_H
      12             : #define LIBDIVIDE_H
      13             : 
      14             : // *** Version numbers are auto generated - do not edit ***
      15             : #define LIBDIVIDE_VERSION "5.2.0"
      16             : #define LIBDIVIDE_VERSION_MAJOR 5
      17             : #define LIBDIVIDE_VERSION_MINOR 2
      18             : #define LIBDIVIDE_VERSION_PATCH 0
      19             : 
      20             : #include <stdint.h>
      21             : 
      22             : #if !defined(__AVR__) && __STDC_HOSTED__ != 0
      23             : #include <stdio.h>
      24             : #include <stdlib.h>
      25             : #endif
      26             : 
      27             : #if defined(_MSC_VER) && (defined(__cplusplus) && (__cplusplus >= 202002L)) || \
      28             :     (defined(_MSVC_LANG) && (_MSVC_LANG >= 202002L))
      29             : #include <limits.h>
      30             : #include <type_traits>
      31             : #define LIBDIVIDE_VC_CXX20
      32             : #endif
      33             : 
      34             : #if defined(LIBDIVIDE_SSE2)
      35             : #include <emmintrin.h>
      36             : #endif
      37             : 
      38             : #if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512)
      39             : #include <immintrin.h>
      40             : #endif
      41             : 
      42             : #if defined(LIBDIVIDE_NEON)
      43             : #include <arm_neon.h>
      44             : #endif
      45             : 
      46             : // Clang-cl prior to Visual Studio 2022 doesn't include __umulh/__mulh intrinsics
      47             : #if defined(_MSC_VER) && (!defined(__clang__) || _MSC_VER > 1930) && \
      48             :     (defined(_M_X64) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC))
      49             : #define LIBDIVIDE_MULH_INTRINSICS
      50             : #endif
      51             : 
      52             : #if defined(_MSC_VER)
      53             : #if defined(LIBDIVIDE_MULH_INTRINSICS) || !defined(__clang__)
      54             : #include <intrin.h>
      55             : #endif
      56             : #ifndef __clang__
      57             : #pragma warning(push)
      58             : // 4146: unary minus operator applied to unsigned type, result still unsigned
      59             : #pragma warning(disable : 4146)
      60             : 
      61             : // 4204: nonstandard extension used : non-constant aggregate initializer
      62             : #pragma warning(disable : 4204)
      63             : #endif
      64             : #define LIBDIVIDE_VC
      65             : #endif
      66             : 
      67             : #if !defined(__has_builtin)
      68             : #define __has_builtin(x) 0
      69             : #endif
      70             : 
      71             : #if defined(__SIZEOF_INT128__)
      72             : #define HAS_INT128_T
      73             : // clang-cl on Windows does not yet support 128-bit division
      74             : #if !(defined(__clang__) && defined(LIBDIVIDE_VC))
      75             : #define HAS_INT128_DIV
      76             : #endif
      77             : #endif
      78             : 
      79             : #if defined(__x86_64__) || defined(_M_X64)
      80             : #define LIBDIVIDE_X86_64
      81             : #endif
      82             : 
      83             : #if defined(__i386__)
      84             : #define LIBDIVIDE_i386
      85             : #endif
      86             : 
      87             : #if defined(__GNUC__) || defined(__clang__)
      88             : #define LIBDIVIDE_GCC_STYLE_ASM
      89             : #endif
      90             : 
      91             : #if defined(__cplusplus) || defined(LIBDIVIDE_VC)
      92             : #define LIBDIVIDE_FUNCTION __FUNCTION__
      93             : #else
      94             : #define LIBDIVIDE_FUNCTION __func__
      95             : #endif
      96             : 
      97             : // Set up forced inlining if possible.
      98             : // We need both the attribute and keyword to avoid "might not be inlineable" warnings.
      99             : #ifdef __has_attribute
     100             : #if __has_attribute(always_inline)
     101             : #define LIBDIVIDE_INLINE __attribute__((always_inline)) inline
     102             : #endif
     103             : #endif
     104             : #ifndef LIBDIVIDE_INLINE
     105             : #ifdef _MSC_VER
     106             : #define LIBDIVIDE_INLINE __forceinline
     107             : #else
     108             : #define LIBDIVIDE_INLINE inline
     109             : #endif
     110             : #endif
     111             : 
     112             : #if defined(__AVR__) || __STDC_HOSTED__ == 0
     113             : #define LIBDIVIDE_ERROR(msg)
     114             : #else
     115             : #define LIBDIVIDE_ERROR(msg)                                                                     \
     116             :     do {                                                                                         \
     117             :         fprintf(stderr, "libdivide.h:%d: %s(): Error: %s\n", __LINE__, LIBDIVIDE_FUNCTION, msg); \
     118             :         abort();                                                                                 \
     119             :     } while (0)
     120             : #endif
     121             : 
     122             : #if defined(LIBDIVIDE_ASSERTIONS_ON) && !defined(__AVR__) && __STDC_HOSTED__ != 0
     123             : #define LIBDIVIDE_ASSERT(x)                                                           \
     124             :     do {                                                                              \
     125             :         if (!(x)) {                                                                   \
     126             :             fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", __LINE__, \
     127             :                 LIBDIVIDE_FUNCTION, #x);                                              \
     128             :             abort();                                                                  \
     129             :         }                                                                             \
     130             :     } while (0)
     131             : #else
     132             : #define LIBDIVIDE_ASSERT(x)
     133             : #endif
     134             : 
     135             : #ifdef __cplusplus
     136             : 
     137             : // For constexpr zero initialization, c++11 might handle things ok,
     138             : // but just limit to at least c++14 to ensure we don't break anyone's code:
     139             : 
     140             : // Use https://en.cppreference.com/w/cpp/feature_test#cpp_constexpr
     141             : #if defined(__cpp_constexpr) && (__cpp_constexpr >= 201304L)
     142             : #define LIBDIVIDE_CONSTEXPR constexpr LIBDIVIDE_INLINE
     143             : 
     144             : // Supposedly, MSVC might not implement feature test macros right:
     145             : // https://stackoverflow.com/questions/49316752/feature-test-macros-not-working-properly-in-visual-c
     146             : // so check that _MSVC_LANG corresponds to at least c++14, and _MSC_VER corresponds to at least VS
     147             : // 2017 15.0 (for extended constexpr support:
     148             : // https://learn.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=msvc-170)
     149             : #elif (defined(_MSC_VER) && _MSC_VER >= 1910) && (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L)
     150             : #define LIBDIVIDE_CONSTEXPR constexpr LIBDIVIDE_INLINE
     151             : 
     152             : #else
     153             : #define LIBDIVIDE_CONSTEXPR LIBDIVIDE_INLINE
     154             : #endif
     155             : 
     156             : namespace libdivide {
     157             : #endif
     158             : 
     159             : #if defined(_MSC_VER) && !defined(__clang__)
     160             : #if defined(LIBDIVIDE_VC_CXX20)
     161             : static LIBDIVIDE_CONSTEXPR int __builtin_clz(unsigned x) {
     162             :     if (std::is_constant_evaluated()) {
     163             :         for (int i = 0; i < sizeof(x) * CHAR_BIT; ++i) {
     164             :             if (x >> (sizeof(x) * CHAR_BIT - 1 - i)) return i;
     165             :         }
     166             :         return sizeof(x) * CHAR_BIT;
     167             :     }
     168             : #else
     169             : static LIBDIVIDE_INLINE int __builtin_clz(unsigned x) {
     170             : #endif
     171             : #if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
     172             :     return (int)_CountLeadingZeros(x);
     173             : #elif defined(__AVX2__) || defined(__LZCNT__)
     174             :     return (int)_lzcnt_u32(x);
     175             : #else
     176             :     unsigned long r;
     177             :     _BitScanReverse(&r, x);
     178             :     return (int)(r ^ 31);
     179             : #endif
     180             : }
     181             : 
     182             : #if defined(LIBDIVIDE_VC_CXX20)
     183             : static LIBDIVIDE_CONSTEXPR int __builtin_clzll(unsigned long long x) {
     184             :     if (std::is_constant_evaluated()) {
     185             :         for (int i = 0; i < sizeof(x) * CHAR_BIT; ++i) {
     186             :             if (x >> (sizeof(x) * CHAR_BIT - 1 - i)) return i;
     187             :         }
     188             :         return sizeof(x) * CHAR_BIT;
     189             :     }
     190             : #else
     191             : static LIBDIVIDE_INLINE int __builtin_clzll(unsigned long long x) {
     192             : #endif
     193             : #if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
     194             :     return (int)_CountLeadingZeros64(x);
     195             : #elif defined(_WIN64)
     196             : #if defined(__AVX2__) || defined(__LZCNT__)
     197             :     return (int)_lzcnt_u64(x);
     198             : #else
     199             :     unsigned long r;
     200             :     _BitScanReverse64(&r, x);
     201             :     return (int)(r ^ 63);
     202             : #endif
     203             : #else
     204             :     int l = __builtin_clz((unsigned)x) + 32;
     205             :     int h = __builtin_clz((unsigned)(x >> 32));
     206             :     return !!((unsigned)(x >> 32)) ? h : l;
     207             : #endif
     208             : }
     209             : #endif // defined(_MSC_VER) && !defined(__clang__)
     210             : 
     211             : // pack divider structs to prevent compilers from padding.
     212             : // This reduces memory usage by up to 43% when using a large
     213             : // array of libdivide dividers and improves performance
     214             : // by up to 10% because of reduced memory bandwidth.
     215             : #pragma pack(push, 1)
     216             : 
     217             : struct libdivide_u16_t {
     218             :     uint16_t magic;
     219             :     uint8_t more;
     220             : };
     221             : 
     222             : struct libdivide_s16_t {
     223             :     int16_t magic;
     224             :     uint8_t more;
     225             : };
     226             : 
     227             : struct libdivide_u32_t {
     228             :     uint32_t magic;
     229             :     uint8_t more;
     230             : };
     231             : 
     232             : struct libdivide_s32_t {
     233             :     int32_t magic;
     234             :     uint8_t more;
     235             : };
     236             : 
     237             : struct libdivide_u64_t {
     238             :     uint64_t magic;
     239             :     uint8_t more;
     240             : };
     241             : 
     242             : struct libdivide_s64_t {
     243             :     int64_t magic;
     244             :     uint8_t more;
     245             : };
     246             : 
     247             : struct libdivide_u16_branchfree_t {
     248             :     uint16_t magic;
     249             :     uint8_t more;
     250             : };
     251             : 
     252             : struct libdivide_s16_branchfree_t {
     253             :     int16_t magic;
     254             :     uint8_t more;
     255             : };
     256             : 
     257             : struct libdivide_u32_branchfree_t {
     258             :     uint32_t magic;
     259             :     uint8_t more;
     260             : };
     261             : 
     262             : struct libdivide_s32_branchfree_t {
     263             :     int32_t magic;
     264             :     uint8_t more;
     265             : };
     266             : 
     267             : struct libdivide_u64_branchfree_t {
     268             :     uint64_t magic;
     269             :     uint8_t more;
     270             : };
     271             : 
     272             : struct libdivide_s64_branchfree_t {
     273             :     int64_t magic;
     274             :     uint8_t more;
     275             : };
     276             : 
     277             : #pragma pack(pop)
     278             : 
     279             : // Explanation of the "more" field:
     280             : //
     281             : // * Bits 0-5 is the shift value (for shift path or mult path).
     282             : // * Bit 6 is the add indicator for mult path.
     283             : // * Bit 7 is set if the divisor is negative. We use bit 7 as the negative
     284             : //   divisor indicator so that we can efficiently use sign extension to
     285             : //   create a bitmask with all bits set to 1 (if the divisor is negative)
     286             : //   or 0 (if the divisor is positive).
     287             : //
     288             : // u32: [0-4] shift value
     289             : //      [5] ignored
     290             : //      [6] add indicator
     291             : //      magic number of 0 indicates shift path
     292             : //
     293             : // s32: [0-4] shift value
     294             : //      [5] ignored
     295             : //      [6] add indicator
     296             : //      [7] indicates negative divisor
     297             : //      magic number of 0 indicates shift path
     298             : //
     299             : // u64: [0-5] shift value
     300             : //      [6] add indicator
     301             : //      magic number of 0 indicates shift path
     302             : //
     303             : // s64: [0-5] shift value
     304             : //      [6] add indicator
     305             : //      [7] indicates negative divisor
     306             : //      magic number of 0 indicates shift path
     307             : //
     308             : // In s32 and s64 branchfree modes, the magic number is negated according to
     309             : // whether the divisor is negated. In branchfree strategy, it is not negated.
     310             : 
     311             : enum {
     312             :     LIBDIVIDE_16_SHIFT_MASK = 0x1F,
     313             :     LIBDIVIDE_32_SHIFT_MASK = 0x1F,
     314             :     LIBDIVIDE_64_SHIFT_MASK = 0x3F,
     315             :     LIBDIVIDE_ADD_MARKER = 0x40,
     316             :     LIBDIVIDE_NEGATIVE_DIVISOR = 0x80
     317             : };
     318             : 
     319             : static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d);
     320             : static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d);
     321             : static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d);
     322             : static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d);
     323             : static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d);
     324             : static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d);
     325             : 
     326             : static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d);
     327             : static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d);
     328             : static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d);
     329             : static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d);
     330             : static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d);
     331             : static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d);
     332             : 
     333             : static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(
     334             :     int16_t numer, int16_t magic, uint8_t more);
     335             : static LIBDIVIDE_INLINE int16_t libdivide_s16_do(
     336             :     int16_t numer, const struct libdivide_s16_t *denom);
     337             : static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(
     338             :     uint16_t numer, uint16_t magic, uint8_t more);
     339             : static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(
     340             :     uint16_t numer, const struct libdivide_u16_t *denom);
     341             : static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(
     342             :     int32_t numer, int32_t magic, uint8_t more);
     343             : static LIBDIVIDE_INLINE int32_t libdivide_s32_do(
     344             :     int32_t numer, const struct libdivide_s32_t *denom);
     345             : static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(
     346             :     uint32_t numer, uint32_t magic, uint8_t more);
     347             : static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(
     348             :     uint32_t numer, const struct libdivide_u32_t *denom);
     349             : static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(
     350             :     int64_t numer, int64_t magic, uint8_t more);
     351             : static LIBDIVIDE_INLINE int64_t libdivide_s64_do(
     352             :     int64_t numer, const struct libdivide_s64_t *denom);
     353             : static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(
     354             :     uint64_t numer, uint64_t magic, uint8_t more);
     355             : static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(
     356             :     uint64_t numer, const struct libdivide_u64_t *denom);
     357             : 
     358             : static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(
     359             :     int16_t numer, const struct libdivide_s16_branchfree_t *denom);
     360             : static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(
     361             :     uint16_t numer, const struct libdivide_u16_branchfree_t *denom);
     362             : static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(
     363             :     int32_t numer, const struct libdivide_s32_branchfree_t *denom);
     364             : static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do(
     365             :     uint32_t numer, const struct libdivide_u32_branchfree_t *denom);
     366             : static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(
     367             :     int64_t numer, const struct libdivide_s64_branchfree_t *denom);
     368             : static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do(
     369             :     uint64_t numer, const struct libdivide_u64_branchfree_t *denom);
     370             : 
     371             : static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom);
     372             : static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom);
     373             : static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom);
     374             : static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom);
     375             : static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom);
     376             : static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom);
     377             : 
     378             : static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(
     379             :     const struct libdivide_s16_branchfree_t *denom);
     380             : static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(
     381             :     const struct libdivide_u16_branchfree_t *denom);
     382             : static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(
     383             :     const struct libdivide_s32_branchfree_t *denom);
     384             : static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(
     385             :     const struct libdivide_u32_branchfree_t *denom);
     386             : static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover(
     387             :     const struct libdivide_s64_branchfree_t *denom);
     388             : static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover(
     389             :     const struct libdivide_u64_branchfree_t *denom);
     390             : 
     391             : //////// Internal Utility Functions
     392             : 
     393             : static LIBDIVIDE_INLINE uint16_t libdivide_mullhi_u16(uint16_t x, uint16_t y) {
     394             :     uint32_t xl = x, yl = y;
     395             :     uint32_t rl = xl * yl;
     396             :     return (uint16_t)(rl >> 16);
     397             : }
     398             : 
     399             : static LIBDIVIDE_INLINE int16_t libdivide_mullhi_s16(int16_t x, int16_t y) {
     400             :     int32_t xl = x, yl = y;
     401             :     int32_t rl = xl * yl;
     402             :     // needs to be arithmetic shift
     403             :     return (int16_t)(rl >> 16);
     404             : }
     405             : 
     406             : static LIBDIVIDE_INLINE uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) {
     407     2785824 :     uint64_t xl = x, yl = y;
     408     2785824 :     uint64_t rl = xl * yl;
     409     2785824 :     return (uint32_t)(rl >> 32);
     410             : }
     411             : 
     412             : static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) {
     413             :     int64_t xl = x, yl = y;
     414             :     int64_t rl = xl * yl;
     415             :     // needs to be arithmetic shift
     416             :     return (int32_t)(rl >> 32);
     417             : }
     418             : 
     419             : static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
     420             : #if defined(LIBDIVIDE_MULH_INTRINSICS)
     421             :     return __umulh(x, y);
     422             : #elif defined(HAS_INT128_T)
     423             :     __uint128_t xl = x, yl = y;
     424             :     __uint128_t rl = xl * yl;
     425             :     return (uint64_t)(rl >> 64);
     426             : #else
     427             :     // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
     428             :     uint32_t mask = 0xFFFFFFFF;
     429             :     uint32_t x0 = (uint32_t)(x & mask);
     430             :     uint32_t x1 = (uint32_t)(x >> 32);
     431             :     uint32_t y0 = (uint32_t)(y & mask);
     432             :     uint32_t y1 = (uint32_t)(y >> 32);
     433             :     uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0);
     434             :     uint64_t x0y1 = x0 * (uint64_t)y1;
     435             :     uint64_t x1y0 = x1 * (uint64_t)y0;
     436             :     uint64_t x1y1 = x1 * (uint64_t)y1;
     437             :     uint64_t temp = x1y0 + x0y0_hi;
     438             :     uint64_t temp_lo = temp & mask;
     439             :     uint64_t temp_hi = temp >> 32;
     440             : 
     441             :     return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32);
     442             : #endif
     443             : }
     444             : 
     445             : static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) {
     446             : #if defined(LIBDIVIDE_MULH_INTRINSICS)
     447             :     return __mulh(x, y);
     448             : #elif defined(HAS_INT128_T)
     449             :     __int128_t xl = x, yl = y;
     450             :     __int128_t rl = xl * yl;
     451             :     return (int64_t)(rl >> 64);
     452             : #else
     453             :     // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
     454             :     uint32_t mask = 0xFFFFFFFF;
     455             :     uint32_t x0 = (uint32_t)(x & mask);
     456             :     uint32_t y0 = (uint32_t)(y & mask);
     457             :     int32_t x1 = (int32_t)(x >> 32);
     458             :     int32_t y1 = (int32_t)(y >> 32);
     459             :     uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0);
     460             :     int64_t t = x1 * (int64_t)y0 + x0y0_hi;
     461             :     int64_t w1 = x0 * (int64_t)y1 + (t & mask);
     462             : 
     463             :     return x1 * (int64_t)y1 + (t >> 32) + (w1 >> 32);
     464             : #endif
     465             : }
     466             : 
     467             : static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) {
     468             : #if defined(__AVR__)
     469             :     // Fast way to count leading zeros
     470             :     // On the AVR 8-bit architecture __builtin_clz() works on a int16_t.
     471             :     return __builtin_clz(val);
     472             : #elif defined(__GNUC__) || __has_builtin(__builtin_clz) || defined(_MSC_VER)
     473             :     // Fast way to count leading zeros
     474           6 :     return (int16_t)(__builtin_clz(val) - 16);
     475             : #else
     476             :     if (val == 0) return 16;
     477             :     int16_t result = 4;
     478             :     uint16_t hi = 0xFU << 12;
     479             :     while ((val & hi) == 0) {
     480             :         hi >>= 4;
     481             :         result += 4;
     482             :     }
     483             :     while (val & hi) {
     484             :         result -= 1;
     485             :         hi <<= 1;
     486             :     }
     487             :     return result;
     488             : #endif
     489             : }
     490             : 
     491             : static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) {
     492             : #if defined(__AVR__)
     493             :     // Fast way to count leading zeros
     494             :     return __builtin_clzl(val);
     495             : #elif defined(__GNUC__) || __has_builtin(__builtin_clz) || defined(_MSC_VER)
     496             :     // Fast way to count leading zeros
     497      106124 :     return __builtin_clz(val);
     498             : #else
     499             :     if (val == 0) return 32;
     500             :     int32_t result = 8;
     501             :     uint32_t hi = 0xFFU << 24;
     502             :     while ((val & hi) == 0) {
     503             :         hi >>= 8;
     504             :         result += 8;
     505             :     }
     506             :     while (val & hi) {
     507             :         result -= 1;
     508             :         hi <<= 1;
     509             :     }
     510             :     return result;
     511             : #endif
     512             : }
     513             : 
     514             : static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) {
     515             : #if defined(__GNUC__) || __has_builtin(__builtin_clzll) || defined(_MSC_VER)
     516             :     // Fast way to count leading zeros
     517             :     return __builtin_clzll(val);
     518             : #else
     519             :     uint32_t hi = val >> 32;
     520             :     uint32_t lo = val & 0xFFFFFFFF;
     521             :     if (hi != 0) return libdivide_count_leading_zeros32(hi);
     522             :     return 32 + libdivide_count_leading_zeros32(lo);
     523             : #endif
     524             : }
     525             : 
     526             : // libdivide_32_div_16_to_16: divides a 32-bit uint {u1, u0} by a 16-bit
     527             : // uint {v}. The result must fit in 16 bits.
     528             : // Returns the quotient directly and the remainder in *r
     529             : static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16(
     530             :     uint16_t u1, uint16_t u0, uint16_t v, uint16_t *r) {
     531           3 :     uint32_t n = ((uint32_t)u1 << 16) | u0;
     532           3 :     uint16_t result = (uint16_t)(n / v);
     533           3 :     *r = (uint16_t)(n - result * (uint32_t)v);
     534           3 :     return result;
     535             : }
     536             : 
     537             : // libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit
     538             : // uint {v}. The result must fit in 32 bits.
     539             : // Returns the quotient directly and the remainder in *r
     540             : static LIBDIVIDE_INLINE uint32_t libdivide_64_div_32_to_32(
     541             :     uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
     542             : #if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && defined(LIBDIVIDE_GCC_STYLE_ASM)
     543             :     uint32_t result;
     544       58043 :     __asm__("divl %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1));
     545       58043 :     return result;
     546             : #else
     547             :     uint64_t n = ((uint64_t)u1 << 32) | u0;
     548             :     uint32_t result = (uint32_t)(n / v);
     549             :     *r = (uint32_t)(n - result * (uint64_t)v);
     550             :     return result;
     551             : #endif
     552             : }
     553             : 
     554             : // libdivide_128_div_64_to_64: divides a 128-bit uint {numhi, numlo} by a 64-bit uint {den}. The
     555             : // result must fit in 64 bits. Returns the quotient directly and the remainder in *r
     556             : static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(
     557             :     uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
     558             :     // N.B. resist the temptation to use __uint128_t here.
     559             :     // In LLVM compiler-rt, it performs a 128/128 -> 128 division which is many times slower than
     560             :     // necessary. In gcc it's better but still slower than the divlu implementation, perhaps because
     561             :     // it's not LIBDIVIDE_INLINEd.
     562             : #if defined(LIBDIVIDE_X86_64) && defined(LIBDIVIDE_GCC_STYLE_ASM)
     563             :     uint64_t result;
     564             :     __asm__("div %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi));
     565             :     return result;
     566             : #else
     567             :     // We work in base 2**32.
     568             :     // A uint32 holds a single digit. A uint64 holds two digits.
     569             :     // Our numerator is conceptually [num3, num2, num1, num0].
     570             :     // Our denominator is [den1, den0].
     571             :     const uint64_t b = ((uint64_t)1 << 32);
     572             : 
     573             :     // The high and low digits of our computed quotient.
     574             :     uint32_t q1;
     575             :     uint32_t q0;
     576             : 
     577             :     // The normalization shift factor.
     578             :     int shift;
     579             : 
     580             :     // The high and low digits of our denominator (after normalizing).
     581             :     // Also the low 2 digits of our numerator (after normalizing).
     582             :     uint32_t den1;
     583             :     uint32_t den0;
     584             :     uint32_t num1;
     585             :     uint32_t num0;
     586             : 
     587             :     // A partial remainder.
     588             :     uint64_t rem;
     589             : 
     590             :     // The estimated quotient, and its corresponding remainder (unrelated to true remainder).
     591             :     uint64_t qhat;
     592             :     uint64_t rhat;
     593             : 
     594             :     // Variables used to correct the estimated quotient.
     595             :     uint64_t c1;
     596             :     uint64_t c2;
     597             : 
     598             :     // Check for overflow and divide by 0.
     599             :     if (numhi >= den) {
     600             :         if (r) *r = ~0ull;
     601             :         return ~0ull;
     602             :     }
     603             : 
     604             :     // Determine the normalization factor. We multiply den by this, so that its leading digit is at
     605             :     // least half b. In binary this means just shifting left by the number of leading zeros, so that
     606             :     // there's a 1 in the MSB.
     607             :     // We also shift numer by the same amount. This cannot overflow because numhi < den.
     608             :     // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting
     609             :     // by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is
     610             :     // 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it.
     611             :     shift = libdivide_count_leading_zeros64(den);
     612             :     den <<= shift;
     613             :     numhi <<= shift;
     614             :     numhi |= (numlo >> (-shift & 63)) & (uint64_t)(-(int64_t)shift >> 63);
     615             :     numlo <<= shift;
     616             : 
     617             :     // Extract the low digits of the numerator and both digits of the denominator.
     618             :     num1 = (uint32_t)(numlo >> 32);
     619             :     num0 = (uint32_t)(numlo & 0xFFFFFFFFu);
     620             :     den1 = (uint32_t)(den >> 32);
     621             :     den0 = (uint32_t)(den & 0xFFFFFFFFu);
     622             : 
     623             :     // We wish to compute q1 = [n3 n2 n1] / [d1 d0].
     624             :     // Estimate q1 as [n3 n2] / [d1], and then correct it.
     625             :     // Note while qhat may be 2 digits, q1 is always 1 digit.
     626             :     qhat = numhi / den1;
     627             :     rhat = numhi % den1;
     628             :     c1 = qhat * den0;
     629             :     c2 = rhat * b + num1;
     630             :     if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
     631             :     q1 = (uint32_t)qhat;
     632             : 
     633             :     // Compute the true (partial) remainder.
     634             :     rem = numhi * b + num1 - q1 * den;
     635             : 
     636             :     // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
     637             :     // Estimate q0 as [rem1 rem0] / [d1] and correct it.
     638             :     qhat = rem / den1;
     639             :     rhat = rem % den1;
     640             :     c1 = qhat * den0;
     641             :     c2 = rhat * b + num0;
     642             :     if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
     643             :     q0 = (uint32_t)qhat;
     644             : 
     645             :     // Return remainder if requested.
     646             :     if (r) *r = (rem * b + num0 - q0 * den) >> shift;
     647             :     return ((uint64_t)q1 << 32) | q0;
     648             : #endif
     649             : }
     650             : 
     651             : #if !(defined(HAS_INT128_T) && \
     652             :       defined(HAS_INT128_DIV))
     653             : 
     654             : // Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0)
     655             : static LIBDIVIDE_INLINE void libdivide_u128_shift(
     656             :     uint64_t *u1, uint64_t *u0, int32_t signed_shift) {
     657             :     if (signed_shift > 0) {
     658             :         uint32_t shift = signed_shift;
     659             :         *u1 <<= shift;
     660             :         *u1 |= *u0 >> (64 - shift);
     661             :         *u0 <<= shift;
     662             :     } else if (signed_shift < 0) {
     663             :         uint32_t shift = -signed_shift;
     664             :         *u0 >>= shift;
     665             :         *u0 |= *u1 << (64 - shift);
     666             :         *u1 >>= shift;
     667             :     }
     668             : }
     669             : 
     670             : #endif
     671             : 
     672             : // Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder.
     673             : static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64(
     674             :     uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) {
     675             : #if defined(HAS_INT128_T) && defined(HAS_INT128_DIV)
     676             :     __uint128_t ufull = u_hi;
     677             :     __uint128_t vfull = v_hi;
     678             :     ufull = (ufull << 64) | u_lo;
     679             :     vfull = (vfull << 64) | v_lo;
     680             :     uint64_t res = (uint64_t)(ufull / vfull);
     681             :     __uint128_t remainder = ufull - (vfull * res);
     682             :     *r_lo = (uint64_t)remainder;
     683             :     *r_hi = (uint64_t)(remainder >> 64);
     684             :     return res;
     685             : #else
     686             :     // Adapted from "Unsigned Doubleword Division" in Hacker's Delight
     687             :     // We want to compute u / v
     688             :     typedef struct {
     689             :         uint64_t hi;
     690             :         uint64_t lo;
     691             :     } u128_t;
     692             :     u128_t u = {u_hi, u_lo};
     693             :     u128_t v = {v_hi, v_lo};
     694             : 
     695             :     if (v.hi == 0) {
     696             :         // divisor v is a 64 bit value, so we just need one 128/64 division
     697             :         // Note that we are simpler than Hacker's Delight here, because we know
     698             :         // the quotient fits in 64 bits whereas Hacker's Delight demands a full
     699             :         // 128 bit quotient
     700             :         *r_hi = 0;
     701             :         return libdivide_128_div_64_to_64(u.hi, u.lo, v.lo, r_lo);
     702             :     }
     703             :     // Here v >= 2**64
     704             :     // We know that v.hi != 0, so count leading zeros is OK
     705             :     // We have 0 <= n <= 63
     706             :     uint32_t n = libdivide_count_leading_zeros64(v.hi);
     707             : 
     708             :     // Normalize the divisor so its MSB is 1
     709             :     u128_t v1t = v;
     710             :     libdivide_u128_shift(&v1t.hi, &v1t.lo, n);
     711             :     uint64_t v1 = v1t.hi;  // i.e. v1 = v1t >> 64
     712             : 
     713             :     // To ensure no overflow
     714             :     u128_t u1 = u;
     715             :     libdivide_u128_shift(&u1.hi, &u1.lo, -1);
     716             : 
     717             :     // Get quotient from divide unsigned insn.
     718             :     uint64_t rem_ignored;
     719             :     uint64_t q1 = libdivide_128_div_64_to_64(u1.hi, u1.lo, v1, &rem_ignored);
     720             : 
     721             :     // Undo normalization and division of u by 2.
     722             :     u128_t q0 = {0, q1};
     723             :     libdivide_u128_shift(&q0.hi, &q0.lo, n);
     724             :     libdivide_u128_shift(&q0.hi, &q0.lo, -63);
     725             : 
     726             :     // Make q0 correct or too small by 1
     727             :     // Equivalent to `if (q0 != 0) q0 = q0 - 1;`
     728             :     if (q0.hi != 0 || q0.lo != 0) {
     729             :         q0.hi -= (q0.lo == 0);  // borrow
     730             :         q0.lo -= 1;
     731             :     }
     732             : 
     733             :     // Now q0 is correct.
     734             :     // Compute q0 * v as q0v
     735             :     // = (q0.hi << 64 + q0.lo) * (v.hi << 64 + v.lo)
     736             :     // = (q0.hi * v.hi << 128) + (q0.hi * v.lo << 64) +
     737             :     //   (q0.lo * v.hi <<  64) + q0.lo * v.lo)
     738             :     // Each term is 128 bit
     739             :     // High half of full product (upper 128 bits!) are dropped
     740             :     u128_t q0v = {0, 0};
     741             :     q0v.hi = q0.hi * v.lo + q0.lo * v.hi + libdivide_mullhi_u64(q0.lo, v.lo);
     742             :     q0v.lo = q0.lo * v.lo;
     743             : 
     744             :     // Compute u - q0v as u_q0v
     745             :     // This is the remainder
     746             :     u128_t u_q0v = u;
     747             :     u_q0v.hi -= q0v.hi + (u.lo < q0v.lo);  // second term is borrow
     748             :     u_q0v.lo -= q0v.lo;
     749             : 
     750             :     // Check if u_q0v >= v
     751             :     // This checks if our remainder is larger than the divisor
     752             :     if ((u_q0v.hi > v.hi) || (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) {
     753             :         // Increment q0
     754             :         q0.lo += 1;
     755             :         q0.hi += (q0.lo == 0);  // carry
     756             : 
     757             :         // Subtract v from remainder
     758             :         u_q0v.hi -= v.hi + (u_q0v.lo < v.lo);
     759             :         u_q0v.lo -= v.lo;
     760             :     }
     761             : 
     762             :     *r_hi = u_q0v.hi;
     763             :     *r_lo = u_q0v.lo;
     764             : 
     765             :     LIBDIVIDE_ASSERT(q0.hi == 0);
     766             :     return q0.lo;
     767             : #endif
     768             : }
     769             : 
     770             : ////////// UINT16
     771             : 
     772             : static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
     773             :     uint16_t d, int branchfree) {
     774           6 :     if (d == 0) {
     775           0 :         LIBDIVIDE_ERROR("divider must be != 0");
     776             :     }
     777             : 
     778             :     struct libdivide_u16_t result;
     779           6 :     uint8_t floor_log_2_d = (uint8_t)(15 - libdivide_count_leading_zeros16(d));
     780             : 
     781             :     // Power of 2
     782           6 :     if ((d & (d - 1)) == 0) {
     783             :         // We need to subtract 1 from the shift value in case of an unsigned
     784             :         // branchfree divider because there is a hardcoded right shift by 1
     785             :         // in its division algorithm. Because of this we also need to add back
     786             :         // 1 in its recovery algorithm.
     787           3 :         result.magic = 0;
     788           3 :         result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
     789             :     } else {
     790             :         uint8_t more;
     791             :         uint16_t rem, proposed_m;
     792           3 :         proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem);
     793             : 
     794             :         LIBDIVIDE_ASSERT(rem > 0 && rem < d);
     795           3 :         const uint16_t e = d - rem;
     796             : 
     797             :         // This power works if e < 2**floor_log_2_d.
     798           3 :         if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) {
     799             :             // This power works
     800           2 :             more = floor_log_2_d;
     801             :         } else {
     802             :             // We have to use the general 17-bit algorithm.  We need to compute
     803             :             // (2**power) / d. However, we already have (2**(power-1))/d and
     804             :             // its remainder.  By doubling both, and then correcting the
     805             :             // remainder, we can compute the larger division.
     806             :             // don't care about overflow here - in fact, we expect it
     807           1 :             proposed_m += proposed_m;
     808           1 :             const uint16_t twice_rem = rem + rem;
     809           1 :             if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
     810           1 :             more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
     811             :         }
     812           3 :         result.magic = 1 + proposed_m;
     813           3 :         result.more = more;
     814             :         // result.more's shift should in general be ceil_log_2_d. But if we
     815             :         // used the smaller power, we subtract one from the shift because we're
     816             :         // using the smaller power. If we're using the larger power, we
     817             :         // subtract one from the shift because it's taken care of by the add
     818             :         // indicator. So floor_log_2_d happens to be correct in both cases.
     819             :     }
     820           6 :     return result;
     821             : }
     822             : 
     823             : static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d) {
     824           6 :     return libdivide_internal_u16_gen(d, 0);
     825             : }
     826             : 
     827             : static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
     828             :     if (d == 1) {
     829             :         LIBDIVIDE_ERROR("branchfree divider must be != 1");
     830             :     }
     831             :     struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1);
     832             :     struct libdivide_u16_branchfree_t ret = {
     833             :         tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK)};
     834             :     return ret;
     835             : }
     836             : 
     837             : // The original libdivide_u16_do takes a const pointer. However, this cannot be used
     838             : // with a compile time constant libdivide_u16_t: it will generate a warning about
     839             : // taking the address of a temporary. Hence this overload.
     840             : static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) {
     841             :     if (!magic) {
     842             :         return numer >> more;
     843             :     } else {
     844             :         uint16_t q = libdivide_mullhi_u16(numer, magic);
     845             :         if (more & LIBDIVIDE_ADD_MARKER) {
     846             :             uint16_t t = ((numer - q) >> 1) + q;
     847             :             return t >> (more & LIBDIVIDE_16_SHIFT_MASK);
     848             :         } else {
     849             :             // All upper bits are 0,
     850             :             // don't need to mask them off.
     851             :             return q >> more;
     852             :         }
     853             :     }
     854             : }
     855             : 
     856             : static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t *denom) {
     857             :     return libdivide_u16_do_raw(numer, denom->magic, denom->more);
     858             : }
     859             : 
     860             : static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(
     861             :     uint16_t numer, const struct libdivide_u16_branchfree_t *denom) {
     862             :     uint16_t q = libdivide_mullhi_u16(numer, denom->magic);
     863             :     uint16_t t = ((numer - q) >> 1) + q;
     864             :     return t >> denom->more;
     865             : }
     866             : 
     867             : static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) {
     868             :     uint8_t more = denom->more;
     869             :     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
     870             : 
     871             :     if (!denom->magic) {
     872             :         return (uint16_t)1 << shift;
     873             :     } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
     874             :         // We compute q = n/d = n*m / 2^(16 + shift)
     875             :         // Therefore we have d = 2^(16 + shift) / m
     876             :         // We need to ceil it.
     877             :         // We know d is not a power of 2, so m is not a power of 2,
     878             :         // so we can just add 1 to the floor
     879             :         uint16_t hi_dividend = (uint16_t)1 << shift;
     880             :         uint16_t rem_ignored;
     881             :         return 1 + libdivide_32_div_16_to_16(hi_dividend, 0, denom->magic, &rem_ignored);
     882             :     } else {
     883             :         // Here we wish to compute d = 2^(16+shift+1)/(m+2^16).
     884             :         // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now
     885             :         // Also note that shift may be as high as 15, so shift + 1 will
     886             :         // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and
     887             :         // then double the quotient and remainder.
     888             :         uint32_t half_n = (uint32_t)1 << (16 + shift);
     889             :         uint32_t d = ((uint32_t)1 << 16) | denom->magic;
     890             :         // Note that the quotient is guaranteed <= 16 bits, but the remainder
     891             :         // may need 17!
     892             :         uint16_t half_q = (uint16_t)(half_n / d);
     893             :         uint32_t rem = half_n % d;
     894             :         // We computed 2^(16+shift)/(m+2^16)
     895             :         // Need to double it, and then add 1 to the quotient if doubling th
     896             :         // remainder would increase the quotient.
     897             :         // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits
     898             :         uint16_t full_q = half_q + half_q + ((rem << 1) >= d);
     899             : 
     900             :         // We rounded down in gen (hence +1)
     901             :         return full_q + 1;
     902             :     }
     903             : }
     904             : 
     905             : static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(const struct libdivide_u16_branchfree_t *denom) {
     906             :     uint8_t more = denom->more;
     907             :     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
     908             : 
     909             :     if (!denom->magic) {
     910             :         return (uint16_t)1 << (shift + 1);
     911             :     } else {
     912             :         // Here we wish to compute d = 2^(16+shift+1)/(m+2^16).
     913             :         // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now
     914             :         // Also note that shift may be as high as 15, so shift + 1 will
     915             :         // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and
     916             :         // then double the quotient and remainder.
     917             :         uint32_t half_n = (uint32_t)1 << (16 + shift);
     918             :         uint32_t d = ((uint32_t)1 << 16) | denom->magic;
     919             :         // Note that the quotient is guaranteed <= 16 bits, but the remainder
     920             :         // may need 17!
     921             :         uint16_t half_q = (uint16_t)(half_n / d);
     922             :         uint32_t rem = half_n % d;
     923             :         // We computed 2^(16+shift)/(m+2^16)
     924             :         // Need to double it, and then add 1 to the quotient if doubling th
     925             :         // remainder would increase the quotient.
     926             :         // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits
     927             :         uint16_t full_q = half_q + half_q + ((rem << 1) >= d);
     928             : 
     929             :         // We rounded down in gen (hence +1)
     930             :         return full_q + 1;
     931             :     }
     932             : }
     933             : 
     934             : ////////// UINT32
     935             : 
     936             : CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW
     937             : static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_internal_u32_gen(
     938             :     uint32_t d, int branchfree) {
     939      106124 :     if (d == 0) {
     940           0 :         LIBDIVIDE_ERROR("divider must be != 0");
     941             :     }
     942             : 
     943             :     struct libdivide_u32_t result;
     944      106124 :     uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(d);
     945             : 
     946             :     // Power of 2
     947      106124 :     if ((d & (d - 1)) == 0) {
     948             :         // We need to subtract 1 from the shift value in case of an unsigned
     949             :         // branchfree divider because there is a hardcoded right shift by 1
     950             :         // in its division algorithm. Because of this we also need to add back
     951             :         // 1 in its recovery algorithm.
     952       48081 :         result.magic = 0;
     953       48081 :         result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
     954             :     } else {
     955             :         uint8_t more;
     956             :         uint32_t rem, proposed_m;
     957       58043 :         proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << floor_log_2_d, 0, d, &rem);
     958             : 
     959             :         LIBDIVIDE_ASSERT(rem > 0 && rem < d);
     960       58043 :         const uint32_t e = d - rem;
     961             : 
     962             :         // This power works if e < 2**floor_log_2_d.
     963       58043 :         if (!branchfree && (e < ((uint32_t)1 << floor_log_2_d))) {
     964             :             // This power works
     965       48474 :             more = (uint8_t)floor_log_2_d;
     966             :         } else {
     967             :             // We have to use the general 33-bit algorithm.  We need to compute
     968             :             // (2**power) / d. However, we already have (2**(power-1))/d and
     969             :             // its remainder.  By doubling both, and then correcting the
     970             :             // remainder, we can compute the larger division.
     971             :             // don't care about overflow here - in fact, we expect it
     972        9569 :             proposed_m += proposed_m;
     973        9569 :             const uint32_t twice_rem = rem + rem;
     974        9569 :             if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
     975        9569 :             more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
     976             :         }
     977       58043 :         result.magic = 1 + proposed_m;
     978       58043 :         result.more = more;
     979             :         // result.more's shift should in general be ceil_log_2_d. But if we
     980             :         // used the smaller power, we subtract one from the shift because we're
     981             :         // using the smaller power. If we're using the larger power, we
     982             :         // subtract one from the shift because it's taken care of by the add
     983             :         // indicator. So floor_log_2_d happens to be correct in both cases.
     984             :     }
     985      106124 :     return result;
     986             : }
     987             : 
     988             : static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d) {
     989      106124 :     return libdivide_internal_u32_gen(d, 0);
     990             : }
     991             : 
     992             : static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) {
     993             :     if (d == 1) {
     994             :         LIBDIVIDE_ERROR("branchfree divider must be != 1");
     995             :     }
     996             :     struct libdivide_u32_t tmp = libdivide_internal_u32_gen(d, 1);
     997             :     struct libdivide_u32_branchfree_t ret = {
     998             :         tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)};
     999             :     return ret;
    1000             : }
    1001             : 
    1002             : static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(uint32_t numer, uint32_t magic, uint8_t more) {
    1003     5093324 :     if (!magic) {
    1004     2307500 :         return numer >> more;
    1005             :     } else {
    1006     2785824 :         uint32_t q = libdivide_mullhi_u32(numer, magic);
    1007     2785824 :         if (more & LIBDIVIDE_ADD_MARKER) {
    1008      459312 :             uint32_t t = ((numer - q) >> 1) + q;
    1009      459312 :             return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
    1010             :         } else {
    1011             :             // All upper bits are 0,
    1012             :             // don't need to mask them off.
    1013     2326514 :             return q >> more;
    1014             :         }
    1015             :     }
    1016             : }
    1017             : 
    1018             : static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
    1019     5093324 :     return libdivide_u32_do_raw(numer, denom->magic, denom->more);
    1020             : }
    1021             : 
    1022             : static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do(
    1023             :     uint32_t numer, const struct libdivide_u32_branchfree_t *denom) {
    1024             :     uint32_t q = libdivide_mullhi_u32(numer, denom->magic);
    1025             :     uint32_t t = ((numer - q) >> 1) + q;
    1026             :     return t >> denom->more;
    1027             : }
    1028             : 
    1029             : static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) {
    1030             :     uint8_t more = denom->more;
    1031             :     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    1032             : 
    1033             :     if (!denom->magic) {
    1034             :         return (uint32_t)1 << shift;
    1035             :     } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
    1036             :         // We compute q = n/d = n*m / 2^(32 + shift)
    1037             :         // Therefore we have d = 2^(32 + shift) / m
    1038             :         // We need to ceil it.
    1039             :         // We know d is not a power of 2, so m is not a power of 2,
    1040             :         // so we can just add 1 to the floor
    1041             :         uint32_t hi_dividend = (uint32_t)1 << shift;
    1042             :         uint32_t rem_ignored;
    1043             :         return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored);
    1044             :     } else {
    1045             :         // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).
    1046             :         // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now
    1047             :         // Also note that shift may be as high as 31, so shift + 1 will
    1048             :         // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and
    1049             :         // then double the quotient and remainder.
    1050             :         uint64_t half_n = (uint64_t)1 << (32 + shift);
    1051             :         uint64_t d = ((uint64_t)1 << 32) | denom->magic;
    1052             :         // Note that the quotient is guaranteed <= 32 bits, but the remainder
    1053             :         // may need 33!
    1054             :         uint32_t half_q = (uint32_t)(half_n / d);
    1055             :         uint64_t rem = half_n % d;
    1056             :         // We computed 2^(32+shift)/(m+2^32)
    1057             :         // Need to double it, and then add 1 to the quotient if doubling th
    1058             :         // remainder would increase the quotient.
    1059             :         // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits
    1060             :         uint32_t full_q = half_q + half_q + ((rem << 1) >= d);
    1061             : 
    1062             :         // We rounded down in gen (hence +1)
    1063             :         return full_q + 1;
    1064             :     }
    1065             : }
    1066             : 
    1067             : static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom) {
    1068             :     uint8_t more = denom->more;
    1069             :     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    1070             : 
    1071             :     if (!denom->magic) {
    1072             :         return (uint32_t)1 << (shift + 1);
    1073             :     } else {
    1074             :         // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).
    1075             :         // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now
    1076             :         // Also note that shift may be as high as 31, so shift + 1 will
    1077             :         // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and
    1078             :         // then double the quotient and remainder.
    1079             :         uint64_t half_n = (uint64_t)1 << (32 + shift);
    1080             :         uint64_t d = ((uint64_t)1 << 32) | denom->magic;
    1081             :         // Note that the quotient is guaranteed <= 32 bits, but the remainder
    1082             :         // may need 33!
    1083             :         uint32_t half_q = (uint32_t)(half_n / d);
    1084             :         uint64_t rem = half_n % d;
    1085             :         // We computed 2^(32+shift)/(m+2^32)
    1086             :         // Need to double it, and then add 1 to the quotient if doubling th
    1087             :         // remainder would increase the quotient.
    1088             :         // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits
    1089             :         uint32_t full_q = half_q + half_q + ((rem << 1) >= d);
    1090             : 
    1091             :         // We rounded down in gen (hence +1)
    1092             :         return full_q + 1;
    1093             :     }
    1094             : }
    1095             : 
    1096             : ////////// UINT64
    1097             : 
    1098             : static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_internal_u64_gen(
    1099             :     uint64_t d, int branchfree) {
    1100             :     if (d == 0) {
    1101             :         LIBDIVIDE_ERROR("divider must be != 0");
    1102             :     }
    1103             : 
    1104             :     struct libdivide_u64_t result;
    1105             :     uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(d);
    1106             : 
    1107             :     // Power of 2
    1108             :     if ((d & (d - 1)) == 0) {
    1109             :         // We need to subtract 1 from the shift value in case of an unsigned
    1110             :         // branchfree divider because there is a hardcoded right shift by 1
    1111             :         // in its division algorithm. Because of this we also need to add back
    1112             :         // 1 in its recovery algorithm.
    1113             :         result.magic = 0;
    1114             :         result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
    1115             :     } else {
    1116             :         uint64_t proposed_m, rem;
    1117             :         uint8_t more;
    1118             :         // (1 << (64 + floor_log_2_d)) / d
    1119             :         proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << floor_log_2_d, 0, d, &rem);
    1120             : 
    1121             :         LIBDIVIDE_ASSERT(rem > 0 && rem < d);
    1122             :         const uint64_t e = d - rem;
    1123             : 
    1124             :         // This power works if e < 2**floor_log_2_d.
    1125             :         if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) {
    1126             :             // This power works
    1127             :             more = (uint8_t)floor_log_2_d;
    1128             :         } else {
    1129             :             // We have to use the general 65-bit algorithm.  We need to compute
    1130             :             // (2**power) / d. However, we already have (2**(power-1))/d and
    1131             :             // its remainder. By doubling both, and then correcting the
    1132             :             // remainder, we can compute the larger division.
    1133             :             // don't care about overflow here - in fact, we expect it
    1134             :             proposed_m += proposed_m;
    1135             :             const uint64_t twice_rem = rem + rem;
    1136             :             if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
    1137             :             more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
    1138             :         }
    1139             :         result.magic = 1 + proposed_m;
    1140             :         result.more = more;
    1141             :         // result.more's shift should in general be ceil_log_2_d. But if we
    1142             :         // used the smaller power, we subtract one from the shift because we're
    1143             :         // using the smaller power. If we're using the larger power, we
    1144             :         // subtract one from the shift because it's taken care of by the add
    1145             :         // indicator. So floor_log_2_d happens to be correct in both cases,
    1146             :         // which is why we do it outside of the if statement.
    1147             :     }
    1148             :     return result;
    1149             : }
    1150             : 
    1151             : static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d) {
    1152             :     return libdivide_internal_u64_gen(d, 0);
    1153             : }
    1154             : 
    1155             : static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) {
    1156             :     if (d == 1) {
    1157             :         LIBDIVIDE_ERROR("branchfree divider must be != 1");
    1158             :     }
    1159             :     struct libdivide_u64_t tmp = libdivide_internal_u64_gen(d, 1);
    1160             :     struct libdivide_u64_branchfree_t ret = {
    1161             :         tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)};
    1162             :     return ret;
    1163             : }
    1164             : 
    1165             : static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(uint64_t numer, uint64_t magic, uint8_t more) {
    1166             :    if (!magic) {
    1167             :         return numer >> more;
    1168             :     } else {
    1169             :         uint64_t q = libdivide_mullhi_u64(numer, magic);
    1170             :         if (more & LIBDIVIDE_ADD_MARKER) {
    1171             :             uint64_t t = ((numer - q) >> 1) + q;
    1172             :             return t >> (more & LIBDIVIDE_64_SHIFT_MASK);
    1173             :         } else {
    1174             :             // All upper bits are 0,
    1175             :             // don't need to mask them off.
    1176             :             return q >> more;
    1177             :         }
    1178             :     }
    1179             : }
    1180             : 
    1181             : static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
    1182             :     return libdivide_u64_do_raw(numer, denom->magic, denom->more);
    1183             : }
    1184             : 
    1185             : static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do(
    1186             :     uint64_t numer, const struct libdivide_u64_branchfree_t *denom) {
    1187             :     uint64_t q = libdivide_mullhi_u64(numer, denom->magic);
    1188             :     uint64_t t = ((numer - q) >> 1) + q;
    1189             :     return t >> denom->more;
    1190             : }
    1191             : 
    1192             : static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) {
    1193             :     uint8_t more = denom->more;
    1194             :     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    1195             : 
    1196             :     if (!denom->magic) {
    1197             :         return (uint64_t)1 << shift;
    1198             :     } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
    1199             :         // We compute q = n/d = n*m / 2^(64 + shift)
    1200             :         // Therefore we have d = 2^(64 + shift) / m
    1201             :         // We need to ceil it.
    1202             :         // We know d is not a power of 2, so m is not a power of 2,
    1203             :         // so we can just add 1 to the floor
    1204             :         uint64_t hi_dividend = (uint64_t)1 << shift;
    1205             :         uint64_t rem_ignored;
    1206             :         return 1 + libdivide_128_div_64_to_64(hi_dividend, 0, denom->magic, &rem_ignored);
    1207             :     } else {
    1208             :         // Here we wish to compute d = 2^(64+shift+1)/(m+2^64).
    1209             :         // Notice (m + 2^64) is a 65 bit number. This gets hairy. See
    1210             :         // libdivide_u32_recover for more on what we do here.
    1211             :         // TODO: do something better than 128 bit math
    1212             : 
    1213             :         // Full n is a (potentially) 129 bit value
    1214             :         // half_n is a 128 bit value
    1215             :         // Compute the hi half of half_n. Low half is 0.
    1216             :         uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0;
    1217             :         // d is a 65 bit value. The high bit is always set to 1.
    1218             :         const uint64_t d_hi = 1, d_lo = denom->magic;
    1219             :         // Note that the quotient is guaranteed <= 64 bits,
    1220             :         // but the remainder may need 65!
    1221             :         uint64_t r_hi, r_lo;
    1222             :         uint64_t half_q =
    1223             :             libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo);
    1224             :         // We computed 2^(64+shift)/(m+2^64)
    1225             :         // Double the remainder ('dr') and check if that is larger than d
    1226             :         // Note that d is a 65 bit value, so r1 is small and so r1 + r1
    1227             :         // cannot overflow
    1228             :         uint64_t dr_lo = r_lo + r_lo;
    1229             :         uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo);  // last term is carry
    1230             :         int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo);
    1231             :         uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0);
    1232             :         return full_q + 1;
    1233             :     }
    1234             : }
    1235             : 
    1236             : static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom) {
    1237             :     uint8_t more = denom->more;
    1238             :     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    1239             : 
    1240             :     if (!denom->magic) {
    1241             :         return (uint64_t)1 << (shift + 1);
    1242             :     } else {
    1243             :         // Here we wish to compute d = 2^(64+shift+1)/(m+2^64).
    1244             :         // Notice (m + 2^64) is a 65 bit number. This gets hairy. See
    1245             :         // libdivide_u32_recover for more on what we do here.
    1246             :         // TODO: do something better than 128 bit math
    1247             : 
    1248             :         // Full n is a (potentially) 129 bit value
    1249             :         // half_n is a 128 bit value
    1250             :         // Compute the hi half of half_n. Low half is 0.
    1251             :         uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0;
    1252             :         // d is a 65 bit value. The high bit is always set to 1.
    1253             :         const uint64_t d_hi = 1, d_lo = denom->magic;
    1254             :         // Note that the quotient is guaranteed <= 64 bits,
    1255             :         // but the remainder may need 65!
    1256             :         uint64_t r_hi, r_lo;
    1257             :         uint64_t half_q =
    1258             :             libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo);
    1259             :         // We computed 2^(64+shift)/(m+2^64)
    1260             :         // Double the remainder ('dr') and check if that is larger than d
    1261             :         // Note that d is a 65 bit value, so r1 is small and so r1 + r1
    1262             :         // cannot overflow
    1263             :         uint64_t dr_lo = r_lo + r_lo;
    1264             :         uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo);  // last term is carry
    1265             :         int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo);
    1266             :         uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0);
    1267             :         return full_q + 1;
    1268             :     }
    1269             : }
    1270             : 
    1271             : ////////// SINT16
    1272             : 
    1273             : static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen(
    1274             :     int16_t d, int branchfree) {
    1275             :     if (d == 0) {
    1276             :         LIBDIVIDE_ERROR("divider must be != 0");
    1277             :     }
    1278             : 
    1279             :     struct libdivide_s16_t result;
    1280             : 
    1281             :     // If d is a power of 2, or negative a power of 2, we have to use a shift.
    1282             :     // This is especially important because the magic algorithm fails for -1.
    1283             :     // To check if d is a power of 2 or its inverse, it suffices to check
    1284             :     // whether its absolute value has exactly one bit set. This works even for
    1285             :     // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
    1286             :     // and is a power of 2.
    1287             :     uint16_t ud = (uint16_t)d;
    1288             :     uint16_t absD = (d < 0) ? -ud : ud;
    1289             :     uint16_t floor_log_2_d = 15 - libdivide_count_leading_zeros16(absD);
    1290             :     // check if exactly one bit is set,
    1291             :     // don't care if absD is 0 since that's divide by zero
    1292             :     if ((absD & (absD - 1)) == 0) {
    1293             :         // Branchfree and normal paths are exactly the same
    1294             :         result.magic = 0;
    1295             :         result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
    1296             :     } else {
    1297             :         LIBDIVIDE_ASSERT(floor_log_2_d >= 1);
    1298             : 
    1299             :         uint8_t more;
    1300             :         // the dividend here is 2**(floor_log_2_d + 31), so the low 16 bit word
    1301             :         // is 0 and the high word is floor_log_2_d - 1
    1302             :         uint16_t rem, proposed_m;
    1303             :         proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << (floor_log_2_d - 1), 0, absD, &rem);
    1304             :         const uint16_t e = absD - rem;
    1305             : 
    1306             :         // We are going to start with a power of floor_log_2_d - 1.
    1307             :         // This works if works if e < 2**floor_log_2_d.
    1308             :         if (!branchfree && e < ((uint16_t)1 << floor_log_2_d)) {
    1309             :             // This power works
    1310             :             more = (uint8_t)(floor_log_2_d - 1);
    1311             :         } else {
    1312             :             // We need to go one higher. This should not make proposed_m
    1313             :             // overflow, but it will make it negative when interpreted as an
    1314             :             // int16_t.
    1315             :             proposed_m += proposed_m;
    1316             :             const uint16_t twice_rem = rem + rem;
    1317             :             if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
    1318             :             more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
    1319             :         }
    1320             : 
    1321             :         proposed_m += 1;
    1322             :         int16_t magic = (int16_t)proposed_m;
    1323             : 
    1324             :         // Mark if we are negative. Note we only negate the magic number in the
    1325             :         // branchfull case.
    1326             :         if (d < 0) {
    1327             :             more |= LIBDIVIDE_NEGATIVE_DIVISOR;
    1328             :             if (!branchfree) {
    1329             :                 magic = -magic;
    1330             :             }
    1331             :         }
    1332             : 
    1333             :         result.more = more;
    1334             :         result.magic = magic;
    1335             :     }
    1336             :     return result;
    1337             : }
    1338             : 
    1339             : static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d) {
    1340             :     return libdivide_internal_s16_gen(d, 0);
    1341             : }
    1342             : 
    1343             : static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d) {
    1344             :     struct libdivide_s16_t tmp = libdivide_internal_s16_gen(d, 1);
    1345             :     struct libdivide_s16_branchfree_t result = {tmp.magic, tmp.more};
    1346             :     return result;
    1347             : }
    1348             : 
    1349             : // The original libdivide_s16_do takes a const pointer. However, this cannot be used
    1350             : // with a compile time constant libdivide_s16_t: it will generate a warning about
    1351             : // taking the address of a temporary. Hence this overload.
    1352             : static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more) {
    1353             :     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    1354             : 
    1355             :     if (!magic) {
    1356             :         uint16_t sign = (int8_t)more >> 7;
    1357             :         uint16_t mask = ((uint16_t)1 << shift) - 1;
    1358             :         uint16_t uq = numer + ((numer >> 15) & mask);
    1359             :         int16_t q = (int16_t)uq;
    1360             :         q >>= shift;
    1361             :         q = (q ^ sign) - sign;
    1362             :         return q;
    1363             :     } else {
    1364             :         uint16_t uq = (uint16_t)libdivide_mullhi_s16(numer, magic);
    1365             :         if (more & LIBDIVIDE_ADD_MARKER) {
    1366             :             // must be arithmetic shift and then sign extend
    1367             :             int16_t sign = (int8_t)more >> 7;
    1368             :             // q += (more < 0 ? -numer : numer)
    1369             :             // cast required to avoid UB
    1370             :             uq += ((uint16_t)numer ^ sign) - sign;
    1371             :         }
    1372             :         int16_t q = (int16_t)uq;
    1373             :         q >>= shift;
    1374             :         q += (q < 0);
    1375             :         return q;
    1376             :     }
    1377             : }
    1378             : 
    1379             : static LIBDIVIDE_INLINE int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) {
    1380             :     return libdivide_s16_do_raw(numer, denom->magic, denom->more);
    1381             : }
    1382             : 
    1383             : static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(int16_t numer, const struct libdivide_s16_branchfree_t *denom) {
    1384             :     uint8_t more = denom->more;
    1385             :     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    1386             :     // must be arithmetic shift and then sign extend
    1387             :     int16_t sign = (int8_t)more >> 7;
    1388             :     int16_t magic = denom->magic;
    1389             :     int16_t q = libdivide_mullhi_s16(numer, magic);
    1390             :     q += numer;
    1391             : 
    1392             :     // If q is non-negative, we have nothing to do
    1393             :     // If q is negative, we want to add either (2**shift)-1 if d is a power of
    1394             :     // 2, or (2**shift) if it is not a power of 2
    1395             :     uint16_t is_power_of_2 = (magic == 0);
    1396             :     uint16_t q_sign = (uint16_t)(q >> 15);
    1397             :     q += q_sign & (((uint16_t)1 << shift) - is_power_of_2);
    1398             : 
    1399             :     // Now arithmetic right shift
    1400             :     q >>= shift;
    1401             :     // Negate if needed
    1402             :     q = (q ^ sign) - sign;
    1403             : 
    1404             :     return q;
    1405             : }
    1406             : 
    1407             : static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom) {
    1408             :     uint8_t more = denom->more;
    1409             :     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    1410             :     if (!denom->magic) {
    1411             :         uint16_t absD = (uint16_t)1 << shift;
    1412             :         if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
    1413             :             absD = -absD;
    1414             :         }
    1415             :         return (int16_t)absD;
    1416             :     } else {
    1417             :         // Unsigned math is much easier
    1418             :         // We negate the magic number only in the branchfull case, and we don't
    1419             :         // know which case we're in. However we have enough information to
    1420             :         // determine the correct sign of the magic number. The divisor was
    1421             :         // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set,
    1422             :         // the magic number's sign is opposite that of the divisor.
    1423             :         // We want to compute the positive magic number.
    1424             :         int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
    1425             :         int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;
    1426             : 
    1427             :         // Handle the power of 2 case (including branchfree)
    1428             :         if (denom->magic == 0) {
    1429             :             int16_t result = (uint16_t)1 << shift;
    1430             :             return negative_divisor ? -result : result;
    1431             :         }
    1432             : 
    1433             :         uint16_t d = (uint16_t)(magic_was_negated ? -denom->magic : denom->magic);
    1434             :         uint32_t n = (uint32_t)1 << (16 + shift);  // this shift cannot exceed 30
    1435             :         uint16_t q = (uint16_t)(n / d);
    1436             :         int16_t result = (int16_t)q;
    1437             :         result += 1;
    1438             :         return negative_divisor ? -result : result;
    1439             :     }
    1440             : }
    1441             : 
    1442             : static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(const struct libdivide_s16_branchfree_t *denom) {
    1443             :     const struct libdivide_s16_t den = {denom->magic, denom->more};
    1444             :     return libdivide_s16_recover(&den);
    1445             : }
    1446             : 
    1447             : ////////// SINT32
    1448             : 
    1449             : static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_internal_s32_gen(
    1450             :     int32_t d, int branchfree) {
    1451             :     if (d == 0) {
    1452             :         LIBDIVIDE_ERROR("divider must be != 0");
    1453             :     }
    1454             : 
    1455             :     struct libdivide_s32_t result;
    1456             : 
    1457             :     // If d is a power of 2, or negative a power of 2, we have to use a shift.
    1458             :     // This is especially important because the magic algorithm fails for -1.
    1459             :     // To check if d is a power of 2 or its inverse, it suffices to check
    1460             :     // whether its absolute value has exactly one bit set. This works even for
    1461             :     // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
    1462             :     // and is a power of 2.
    1463             :     uint32_t ud = (uint32_t)d;
    1464             :     uint32_t absD = (d < 0) ? -ud : ud;
    1465             :     uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(absD);
    1466             :     // check if exactly one bit is set,
    1467             :     // don't care if absD is 0 since that's divide by zero
    1468             :     if ((absD & (absD - 1)) == 0) {
    1469             :         // Branchfree and normal paths are exactly the same
    1470             :         result.magic = 0;
    1471             :         result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
    1472             :     } else {
    1473             :         LIBDIVIDE_ASSERT(floor_log_2_d >= 1);
    1474             : 
    1475             :         uint8_t more;
    1476             :         // the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word
    1477             :         // is 0 and the high word is floor_log_2_d - 1
    1478             :         uint32_t rem, proposed_m;
    1479             :         proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << (floor_log_2_d - 1), 0, absD, &rem);
    1480             :         const uint32_t e = absD - rem;
    1481             : 
    1482             :         // We are going to start with a power of floor_log_2_d - 1.
    1483             :         // This works if works if e < 2**floor_log_2_d.
    1484             :         if (!branchfree && e < ((uint32_t)1 << floor_log_2_d)) {
    1485             :             // This power works
    1486             :             more = (uint8_t)(floor_log_2_d - 1);
    1487             :         } else {
    1488             :             // We need to go one higher. This should not make proposed_m
    1489             :             // overflow, but it will make it negative when interpreted as an
    1490             :             // int32_t.
    1491             :             proposed_m += proposed_m;
    1492             :             const uint32_t twice_rem = rem + rem;
    1493             :             if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
    1494             :             more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
    1495             :         }
    1496             : 
    1497             :         proposed_m += 1;
    1498             :         int32_t magic = (int32_t)proposed_m;
    1499             : 
    1500             :         // Mark if we are negative. Note we only negate the magic number in the
    1501             :         // branchfull case.
    1502             :         if (d < 0) {
    1503             :             more |= LIBDIVIDE_NEGATIVE_DIVISOR;
    1504             :             if (!branchfree) {
    1505             :                 magic = -magic;
    1506             :             }
    1507             :         }
    1508             : 
    1509             :         result.more = more;
    1510             :         result.magic = magic;
    1511             :     }
    1512             :     return result;
    1513             : }
    1514             : 
    1515             : static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d) {
    1516             :     return libdivide_internal_s32_gen(d, 0);
    1517             : }
    1518             : 
    1519             : static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) {
    1520             :     struct libdivide_s32_t tmp = libdivide_internal_s32_gen(d, 1);
    1521             :     struct libdivide_s32_branchfree_t result = {tmp.magic, tmp.more};
    1522             :     return result;
    1523             : }
    1524             : 
    1525             : static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) {
    1526             :     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    1527             : 
    1528             :     if (!magic) {
    1529             :         uint32_t sign = (int8_t)more >> 7;
    1530             :         uint32_t mask = ((uint32_t)1 << shift) - 1;
    1531             :         uint32_t uq = numer + ((numer >> 31) & mask);
    1532             :         int32_t q = (int32_t)uq;
    1533             :         q >>= shift;
    1534             :         q = (q ^ sign) - sign;
    1535             :         return q;
    1536             :     } else {
    1537             :         uint32_t uq = (uint32_t)libdivide_mullhi_s32(numer, magic);
    1538             :         if (more & LIBDIVIDE_ADD_MARKER) {
    1539             :             // must be arithmetic shift and then sign extend
    1540             :             int32_t sign = (int8_t)more >> 7;
    1541             :             // q += (more < 0 ? -numer : numer)
    1542             :             // cast required to avoid UB
    1543             :             uq += ((uint32_t)numer ^ sign) - sign;
    1544             :         }
    1545             :         int32_t q = (int32_t)uq;
    1546             :         q >>= shift;
    1547             :         q += (q < 0);
    1548             :         return q;
    1549             :     }
    1550             : }
    1551             : 
    1552             : static LIBDIVIDE_INLINE int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
    1553             :     return libdivide_s32_do_raw(numer, denom->magic, denom->more);
    1554             : }
    1555             : 
    1556             : static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) {
    1557             :     uint8_t more = denom->more;
    1558             :     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    1559             :     // must be arithmetic shift and then sign extend
    1560             :     int32_t sign = (int8_t)more >> 7;
    1561             :     int32_t magic = denom->magic;
    1562             :     int32_t q = libdivide_mullhi_s32(numer, magic);
    1563             :     q += numer;
    1564             : 
    1565             :     // If q is non-negative, we have nothing to do
    1566             :     // If q is negative, we want to add either (2**shift)-1 if d is a power of
    1567             :     // 2, or (2**shift) if it is not a power of 2
    1568             :     uint32_t is_power_of_2 = (magic == 0);
    1569             :     uint32_t q_sign = (uint32_t)(q >> 31);
    1570             :     q += q_sign & (((uint32_t)1 << shift) - is_power_of_2);
    1571             : 
    1572             :     // Now arithmetic right shift
    1573             :     q >>= shift;
    1574             :     // Negate if needed
    1575             :     q = (q ^ sign) - sign;
    1576             : 
    1577             :     return q;
    1578             : }
    1579             : 
    1580             : static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) {
    1581             :     uint8_t more = denom->more;
    1582             :     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    1583             :     if (!denom->magic) {
    1584             :         uint32_t absD = (uint32_t)1 << shift;
    1585             :         if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
    1586             :             absD = -absD;
    1587             :         }
    1588             :         return (int32_t)absD;
    1589             :     } else {
    1590             :         // Unsigned math is much easier
    1591             :         // We negate the magic number only in the branchfull case, and we don't
    1592             :         // know which case we're in. However we have enough information to
    1593             :         // determine the correct sign of the magic number. The divisor was
    1594             :         // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set,
    1595             :         // the magic number's sign is opposite that of the divisor.
    1596             :         // We want to compute the positive magic number.
    1597             :         int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
    1598             :         int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;
    1599             : 
    1600             :         // Handle the power of 2 case (including branchfree)
    1601             :         if (denom->magic == 0) {
    1602             :             int32_t result = (uint32_t)1 << shift;
    1603             :             return negative_divisor ? -result : result;
    1604             :         }
    1605             : 
    1606             :         uint32_t d = (uint32_t)(magic_was_negated ? -denom->magic : denom->magic);
    1607             :         uint64_t n = (uint64_t)1 << (32 + shift);  // this shift cannot exceed 30
    1608             :         uint32_t q = (uint32_t)(n / d);
    1609             :         int32_t result = (int32_t)q;
    1610             :         result += 1;
    1611             :         return negative_divisor ? -result : result;
    1612             :     }
    1613             : }
    1614             : 
    1615             : static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom) {
    1616             :     const struct libdivide_s32_t den = {denom->magic, denom->more};
    1617             :     return libdivide_s32_recover(&den);
    1618             : }
    1619             : 
    1620             : ////////// SINT64
    1621             : 
    1622             : static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_internal_s64_gen(
    1623             :     int64_t d, int branchfree) {
    1624             :     if (d == 0) {
    1625             :         LIBDIVIDE_ERROR("divider must be != 0");
    1626             :     }
    1627             : 
    1628             :     struct libdivide_s64_t result;
    1629             : 
    1630             :     // If d is a power of 2, or negative a power of 2, we have to use a shift.
    1631             :     // This is especially important because the magic algorithm fails for -1.
    1632             :     // To check if d is a power of 2 or its inverse, it suffices to check
    1633             :     // whether its absolute value has exactly one bit set.  This works even for
    1634             :     // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
    1635             :     // and is a power of 2.
    1636             :     uint64_t ud = (uint64_t)d;
    1637             :     uint64_t absD = (d < 0) ? -ud : ud;
    1638             :     uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(absD);
    1639             :     // check if exactly one bit is set,
    1640             :     // don't care if absD is 0 since that's divide by zero
    1641             :     if ((absD & (absD - 1)) == 0) {
    1642             :         // Branchfree and non-branchfree cases are the same
    1643             :         result.magic = 0;
    1644             :         result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
    1645             :     } else {
    1646             :         // the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word
    1647             :         // is 0 and the high word is floor_log_2_d - 1
    1648             :         uint8_t more;
    1649             :         uint64_t rem, proposed_m;
    1650             :         proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << (floor_log_2_d - 1), 0, absD, &rem);
    1651             :         const uint64_t e = absD - rem;
    1652             : 
    1653             :         // We are going to start with a power of floor_log_2_d - 1.
    1654             :         // This works if works if e < 2**floor_log_2_d.
    1655             :         if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) {
    1656             :             // This power works
    1657             :             more = (uint8_t)(floor_log_2_d - 1);
    1658             :         } else {
    1659             :             // We need to go one higher. This should not make proposed_m
    1660             :             // overflow, but it will make it negative when interpreted as an
    1661             :             // int32_t.
    1662             :             proposed_m += proposed_m;
    1663             :             const uint64_t twice_rem = rem + rem;
    1664             :             if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
    1665             :             // note that we only set the LIBDIVIDE_NEGATIVE_DIVISOR bit if we
    1666             :             // also set ADD_MARKER this is an annoying optimization that
    1667             :             // enables algorithm #4 to avoid the mask. However we always set it
    1668             :             // in the branchfree case
    1669             :             more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
    1670             :         }
    1671             :         proposed_m += 1;
    1672             :         int64_t magic = (int64_t)proposed_m;
    1673             : 
    1674             :         // Mark if we are negative
    1675             :         if (d < 0) {
    1676             :             more |= LIBDIVIDE_NEGATIVE_DIVISOR;
    1677             :             if (!branchfree) {
    1678             :                 magic = -magic;
    1679             :             }
    1680             :         }
    1681             : 
    1682             :         result.more = more;
    1683             :         result.magic = magic;
    1684             :     }
    1685             :     return result;
    1686             : }
    1687             : 
    1688             : static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d) {
    1689             :     return libdivide_internal_s64_gen(d, 0);
    1690             : }
    1691             : 
    1692             : static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) {
    1693             :     struct libdivide_s64_t tmp = libdivide_internal_s64_gen(d, 1);
    1694             :     struct libdivide_s64_branchfree_t ret = {tmp.magic, tmp.more};
    1695             :     return ret;
    1696             : }
    1697             : 
    1698             : static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) {
    1699             :     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    1700             : 
    1701             :     if (!magic) {  // shift path
    1702             :         uint64_t mask = ((uint64_t)1 << shift) - 1;
    1703             :         uint64_t uq = numer + ((numer >> 63) & mask);
    1704             :         int64_t q = (int64_t)uq;
    1705             :         q >>= shift;
    1706             :         // must be arithmetic shift and then sign-extend
    1707             :         int64_t sign = (int8_t)more >> 7;
    1708             :         q = (q ^ sign) - sign;
    1709             :         return q;
    1710             :     } else {
    1711             :         uint64_t uq = (uint64_t)libdivide_mullhi_s64(numer, magic);
    1712             :         if (more & LIBDIVIDE_ADD_MARKER) {
    1713             :             // must be arithmetic shift and then sign extend
    1714             :             int64_t sign = (int8_t)more >> 7;
    1715             :             // q += (more < 0 ? -numer : numer)
    1716             :             // cast required to avoid UB
    1717             :             uq += ((uint64_t)numer ^ sign) - sign;
    1718             :         }
    1719             :         int64_t q = (int64_t)uq;
    1720             :         q >>= shift;
    1721             :         q += (q < 0);
    1722             :         return q;
    1723             :     }
    1724             : }
    1725             : 
    1726             : static LIBDIVIDE_INLINE int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
    1727             :     return libdivide_s64_do_raw(numer, denom->magic, denom->more);
    1728             : }
    1729             : 
    1730             : static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) {
    1731             :     uint8_t more = denom->more;
    1732             :     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    1733             :     // must be arithmetic shift and then sign extend
    1734             :     int64_t sign = (int8_t)more >> 7;
    1735             :     int64_t magic = denom->magic;
    1736             :     int64_t q = libdivide_mullhi_s64(numer, magic);
    1737             :     q += numer;
    1738             : 
    1739             :     // If q is non-negative, we have nothing to do.
    1740             :     // If q is negative, we want to add either (2**shift)-1 if d is a power of
    1741             :     // 2, or (2**shift) if it is not a power of 2.
    1742             :     uint64_t is_power_of_2 = (magic == 0);
    1743             :     uint64_t q_sign = (uint64_t)(q >> 63);
    1744             :     q += q_sign & (((uint64_t)1 << shift) - is_power_of_2);
    1745             : 
    1746             :     // Arithmetic right shift
    1747             :     q >>= shift;
    1748             :     // Negate if needed
    1749             :     q = (q ^ sign) - sign;
    1750             : 
    1751             :     return q;
    1752             : }
    1753             : 
    1754             : static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) {
    1755             :     uint8_t more = denom->more;
    1756             :     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    1757             :     if (denom->magic == 0) {  // shift path
    1758             :         uint64_t absD = (uint64_t)1 << shift;
    1759             :         if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
    1760             :             absD = -absD;
    1761             :         }
    1762             :         return (int64_t)absD;
    1763             :     } else {
    1764             :         // Unsigned math is much easier
    1765             :         int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
    1766             :         int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;
    1767             : 
    1768             :         uint64_t d = (uint64_t)(magic_was_negated ? -denom->magic : denom->magic);
    1769             :         uint64_t n_hi = (uint64_t)1 << shift, n_lo = 0;
    1770             :         uint64_t rem_ignored;
    1771             :         uint64_t q = libdivide_128_div_64_to_64(n_hi, n_lo, d, &rem_ignored);
    1772             :         int64_t result = (int64_t)(q + 1);
    1773             :         if (negative_divisor) {
    1774             :             result = -result;
    1775             :         }
    1776             :         return result;
    1777             :     }
    1778             : }
    1779             : 
    1780             : static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom) {
    1781             :     const struct libdivide_s64_t den = {denom->magic, denom->more};
    1782             :     return libdivide_s64_recover(&den);
    1783             : }
    1784             : 
    1785             : // Simplest possible vector type division: treat the vector type as an array
    1786             : // of underlying native type.
    1787             : //
    1788             : // Use a union to read a vector via pointer-to-integer, without violating strict
    1789             : // aliasing.
    1790             : #define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo)                          \
    1791             :     const size_t count = sizeof(VecT) / sizeof(IntT);                     \
    1792             :     union type_pun_vec {                                                  \
    1793             :         VecT vec;                                                         \
    1794             :         IntT arr[sizeof(VecT) / sizeof(IntT)];                            \
    1795             :     };                                                                    \
    1796             :     union type_pun_vec result;                                            \
    1797             :     union type_pun_vec input;                                             \
    1798             :     input.vec = numers;                                                   \
    1799             :     for (size_t loop = 0; loop < count; ++loop) {                         \
    1800             :         result.arr[loop] = libdivide_##Algo##_do(input.arr[loop], denom); \
    1801             :     }                                                                     \
    1802             :     return result.vec;
    1803             : 
    1804             : #if defined(LIBDIVIDE_NEON)
    1805             : 
    1806             : static LIBDIVIDE_INLINE uint16x8_t libdivide_u16_do_vec128(
    1807             :     uint16x8_t numers, const struct libdivide_u16_t *denom);
    1808             : static LIBDIVIDE_INLINE int16x8_t libdivide_s16_do_vec128(
    1809             :     int16x8_t numers, const struct libdivide_s16_t *denom);
    1810             : static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_do_vec128(
    1811             :     uint32x4_t numers, const struct libdivide_u32_t *denom);
    1812             : static LIBDIVIDE_INLINE int32x4_t libdivide_s32_do_vec128(
    1813             :     int32x4_t numers, const struct libdivide_s32_t *denom);
    1814             : static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_do_vec128(
    1815             :     uint64x2_t numers, const struct libdivide_u64_t *denom);
    1816             : static LIBDIVIDE_INLINE int64x2_t libdivide_s64_do_vec128(
    1817             :     int64x2_t numers, const struct libdivide_s64_t *denom);
    1818             : 
    1819             : static LIBDIVIDE_INLINE uint16x8_t libdivide_u16_branchfree_do_vec128(
    1820             :     uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom);
    1821             : static LIBDIVIDE_INLINE int16x8_t libdivide_s16_branchfree_do_vec128(
    1822             :     int16x8_t numers, const struct libdivide_s16_branchfree_t *denom);
    1823             : static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_branchfree_do_vec128(
    1824             :     uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom);
    1825             : static LIBDIVIDE_INLINE int32x4_t libdivide_s32_branchfree_do_vec128(
    1826             :     int32x4_t numers, const struct libdivide_s32_branchfree_t *denom);
    1827             : static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_branchfree_do_vec128(
    1828             :     uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom);
    1829             : static LIBDIVIDE_INLINE int64x2_t libdivide_s64_branchfree_do_vec128(
    1830             :     int64x2_t numers, const struct libdivide_s64_branchfree_t *denom);
    1831             : 
    1832             : //////// Internal Utility Functions
    1833             : 
    1834             : // Logical right shift by runtime value.
    1835             : // NEON implements right shift as left shits by negative values.
    1836             : static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_neon_srl(uint32x4_t v, uint8_t amt) {
    1837             :     int32_t wamt = (int32_t)(amt);
    1838             :     return vshlq_u32(v, vdupq_n_s32(-wamt));
    1839             : }
    1840             : 
    1841             : static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_neon_srl(uint64x2_t v, uint8_t amt) {
    1842             :     int64_t wamt = (int64_t)(amt);
    1843             :     return vshlq_u64(v, vdupq_n_s64(-wamt));
    1844             : }
    1845             : 
    1846             : // Arithmetic right shift by runtime value.
    1847             : static LIBDIVIDE_INLINE int32x4_t libdivide_s32_neon_sra(int32x4_t v, uint8_t amt) {
    1848             :     int32_t wamt = (int32_t)(amt);
    1849             :     return vshlq_s32(v, vdupq_n_s32(-wamt));
    1850             : }
    1851             : 
    1852             : static LIBDIVIDE_INLINE int64x2_t libdivide_s64_neon_sra(int64x2_t v, uint8_t amt) {
    1853             :     int64_t wamt = (int64_t)(amt);
    1854             :     return vshlq_s64(v, vdupq_n_s64(-wamt));
    1855             : }
    1856             : 
    1857             : static LIBDIVIDE_INLINE int64x2_t libdivide_s64_signbits(int64x2_t v) { return vshrq_n_s64(v, 63); }
    1858             : 
    1859             : static LIBDIVIDE_INLINE uint32x4_t libdivide_mullhi_u32_vec128(uint32x4_t a, uint32_t b) {
    1860             :     // Desire is [x0, x1, x2, x3]
    1861             :     uint32x4_t w1 = vreinterpretq_u32_u64(vmull_n_u32(vget_low_u32(a), b));  // [_, x0, _, x1]
    1862             :     uint32x4_t w2 = vreinterpretq_u32_u64(vmull_high_n_u32(a, b));           //[_, x2, _, x3]
    1863             :     return vuzp2q_u32(w1, w2);                                               // [x0, x1, x2, x3]
    1864             : }
    1865             : 
    1866             : static LIBDIVIDE_INLINE int32x4_t libdivide_mullhi_s32_vec128(int32x4_t a, int32_t b) {
    1867             :     int32x4_t w1 = vreinterpretq_s32_s64(vmull_n_s32(vget_low_s32(a), b));  // [_, x0, _, x1]
    1868             :     int32x4_t w2 = vreinterpretq_s32_s64(vmull_high_n_s32(a, b));           //[_, x2, _, x3]
    1869             :     return vuzp2q_s32(w1, w2);                                              // [x0, x1, x2, x3]
    1870             : }
    1871             : 
    1872             : static LIBDIVIDE_INLINE uint64x2_t libdivide_mullhi_u64_vec128(uint64x2_t x, uint64_t sy) {
    1873             :     // full 128 bits product is:
    1874             :     // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64)
    1875             :     // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64.
    1876             : 
    1877             :     // Get low and high words. x0 contains low 32 bits, x1 is high 32 bits.
    1878             :     uint64x2_t y = vdupq_n_u64(sy);
    1879             :     uint32x2_t x0 = vmovn_u64(x);
    1880             :     uint32x2_t y0 = vmovn_u64(y);
    1881             :     uint32x2_t x1 = vshrn_n_u64(x, 32);
    1882             :     uint32x2_t y1 = vshrn_n_u64(y, 32);
    1883             : 
    1884             :     // Compute x0*y0.
    1885             :     uint64x2_t x0y0 = vmull_u32(x0, y0);
    1886             :     uint64x2_t x0y0_hi = vshrq_n_u64(x0y0, 32);
    1887             : 
    1888             :     // Compute other intermediate products.
    1889             :     uint64x2_t temp = vmlal_u32(x0y0_hi, x1, y0);  // temp = x0y0_hi + x1*y0;
    1890             :     // We want to split temp into its low 32 bits and high 32 bits, both
    1891             :     // in the low half of 64 bit registers.
    1892             :     // Use shifts to avoid needing a reg for the mask.
    1893             :     uint64x2_t temp_lo = vshrq_n_u64(vshlq_n_u64(temp, 32), 32);  // temp_lo = temp & 0xFFFFFFFF;
    1894             :     uint64x2_t temp_hi = vshrq_n_u64(temp, 32);                   // temp_hi = temp >> 32;
    1895             : 
    1896             :     temp_lo = vmlal_u32(temp_lo, x0, y1);  // temp_lo += x0*y0
    1897             :     temp_lo = vshrq_n_u64(temp_lo, 32);    // temp_lo >>= 32
    1898             :     temp_hi = vmlal_u32(temp_hi, x1, y1);  // temp_hi += x1*y1
    1899             :     uint64x2_t result = vaddq_u64(temp_hi, temp_lo);
    1900             :     return result;
    1901             : }
    1902             : 
    1903             : static LIBDIVIDE_INLINE int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) {
    1904             :     int64x2_t p = vreinterpretq_s64_u64(
    1905             :         libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), (uint64_t)(sy)));
    1906             :     int64x2_t y = vdupq_n_s64(sy);
    1907             :     int64x2_t t1 = vandq_s64(libdivide_s64_signbits(x), y);
    1908             :     int64x2_t t2 = vandq_s64(libdivide_s64_signbits(y), x);
    1909             :     p = vsubq_s64(p, t1);
    1910             :     p = vsubq_s64(p, t2);
    1911             :     return p;
    1912             : }
    1913             : 
    1914             : ////////// UINT16
    1915             : 
    1916             : uint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom){
    1917             :     SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16)}
    1918             : 
    1919             : uint16x8_t libdivide_u16_branchfree_do_vec128(
    1920             :     uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom){
    1921             :     SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree)}
    1922             : 
    1923             : ////////// UINT32
    1924             : 
    1925             : uint32x4_t libdivide_u32_do_vec128(uint32x4_t numers, const struct libdivide_u32_t *denom) {
    1926             :     uint8_t more = denom->more;
    1927             :     if (!denom->magic) {
    1928             :         return libdivide_u32_neon_srl(numers, more);
    1929             :     } else {
    1930             :         uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic);
    1931             :         if (more & LIBDIVIDE_ADD_MARKER) {
    1932             :             // uint32_t t = ((numer - q) >> 1) + q;
    1933             :             // return t >> denom->shift;
    1934             :             // Note we can use halving-subtract to avoid the shift.
    1935             :             uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    1936             :             uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q);
    1937             :             return libdivide_u32_neon_srl(t, shift);
    1938             :         } else {
    1939             :             return libdivide_u32_neon_srl(q, more);
    1940             :         }
    1941             :     }
    1942             : }
    1943             : 
    1944             : uint32x4_t libdivide_u32_branchfree_do_vec128(
    1945             :     uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom) {
    1946             :     uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic);
    1947             :     uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q);
    1948             :     return libdivide_u32_neon_srl(t, denom->more);
    1949             : }
    1950             : 
    1951             : ////////// UINT64
    1952             : 
    1953             : uint64x2_t libdivide_u64_do_vec128(uint64x2_t numers, const struct libdivide_u64_t *denom) {
    1954             :     uint8_t more = denom->more;
    1955             :     if (!denom->magic) {
    1956             :         return libdivide_u64_neon_srl(numers, more);
    1957             :     } else {
    1958             :         uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic);
    1959             :         if (more & LIBDIVIDE_ADD_MARKER) {
    1960             :             // uint32_t t = ((numer - q) >> 1) + q;
    1961             :             // return t >> denom->shift;
    1962             :             // No 64-bit halving subtracts in NEON :(
    1963             :             uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    1964             :             uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q);
    1965             :             return libdivide_u64_neon_srl(t, shift);
    1966             :         } else {
    1967             :             return libdivide_u64_neon_srl(q, more);
    1968             :         }
    1969             :     }
    1970             : }
    1971             : 
    1972             : uint64x2_t libdivide_u64_branchfree_do_vec128(
    1973             :     uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom) {
    1974             :     uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic);
    1975             :     uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q);
    1976             :     return libdivide_u64_neon_srl(t, denom->more);
    1977             : }
    1978             : 
    1979             : ////////// SINT16
    1980             : 
    1981             : int16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom){
    1982             :     SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16)}
    1983             : 
    1984             : int16x8_t libdivide_s16_branchfree_do_vec128(
    1985             :     int16x8_t numers, const struct libdivide_s16_branchfree_t *denom){
    1986             :     SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree)}
    1987             : 
    1988             : ////////// SINT32
    1989             : 
    1990             : int32x4_t libdivide_s32_do_vec128(int32x4_t numers, const struct libdivide_s32_t *denom) {
    1991             :     uint8_t more = denom->more;
    1992             :     if (!denom->magic) {
    1993             :         uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    1994             :         uint32_t mask = ((uint32_t)1 << shift) - 1;
    1995             :         int32x4_t roundToZeroTweak = vdupq_n_s32((int)mask);
    1996             :         // q = numer + ((numer >> 31) & roundToZeroTweak);
    1997             :         int32x4_t q = vaddq_s32(numers, vandq_s32(vshrq_n_s32(numers, 31), roundToZeroTweak));
    1998             :         q = libdivide_s32_neon_sra(q, shift);
    1999             :         int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);
    2000             :         // q = (q ^ sign) - sign;
    2001             :         q = vsubq_s32(veorq_s32(q, sign), sign);
    2002             :         return q;
    2003             :     } else {
    2004             :         int32x4_t q = libdivide_mullhi_s32_vec128(numers, denom->magic);
    2005             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2006             :             // must be arithmetic shift
    2007             :             int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);
    2008             :             // q += ((numer ^ sign) - sign);
    2009             :             q = vaddq_s32(q, vsubq_s32(veorq_s32(numers, sign), sign));
    2010             :         }
    2011             :         // q >>= shift
    2012             :         q = libdivide_s32_neon_sra(q, more & LIBDIVIDE_32_SHIFT_MASK);
    2013             :         q = vaddq_s32(
    2014             :             q, vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(q), 31)));  // q += (q < 0)
    2015             :         return q;
    2016             :     }
    2017             : }
    2018             : 
    2019             : int32x4_t libdivide_s32_branchfree_do_vec128(
    2020             :     int32x4_t numers, const struct libdivide_s32_branchfree_t *denom) {
    2021             :     int32_t magic = denom->magic;
    2022             :     uint8_t more = denom->more;
    2023             :     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    2024             :     // must be arithmetic shift
    2025             :     int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);
    2026             :     int32x4_t q = libdivide_mullhi_s32_vec128(numers, magic);
    2027             :     q = vaddq_s32(q, numers);  // q += numers
    2028             : 
    2029             :     // If q is non-negative, we have nothing to do
    2030             :     // If q is negative, we want to add either (2**shift)-1 if d is
    2031             :     // a power of 2, or (2**shift) if it is not a power of 2
    2032             :     uint32_t is_power_of_2 = (magic == 0);
    2033             :     int32x4_t q_sign = vshrq_n_s32(q, 31);  // q_sign = q >> 31
    2034             :     int32x4_t mask = vdupq_n_s32(((uint32_t)1 << shift) - is_power_of_2);
    2035             :     q = vaddq_s32(q, vandq_s32(q_sign, mask));  // q = q + (q_sign & mask)
    2036             :     q = libdivide_s32_neon_sra(q, shift);       // q >>= shift
    2037             :     q = vsubq_s32(veorq_s32(q, sign), sign);    // q = (q ^ sign) - sign
    2038             :     return q;
    2039             : }
    2040             : 
    2041             : ////////// SINT64
    2042             : 
    2043             : int64x2_t libdivide_s64_do_vec128(int64x2_t numers, const struct libdivide_s64_t *denom) {
    2044             :     uint8_t more = denom->more;
    2045             :     int64_t magic = denom->magic;
    2046             :     if (magic == 0) {  // shift path
    2047             :         uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    2048             :         uint64_t mask = ((uint64_t)1 << shift) - 1;
    2049             :         int64x2_t roundToZeroTweak = vdupq_n_s64(mask);  // TODO: no need to sign extend
    2050             :         // q = numer + ((numer >> 63) & roundToZeroTweak);
    2051             :         int64x2_t q =
    2052             :             vaddq_s64(numers, vandq_s64(libdivide_s64_signbits(numers), roundToZeroTweak));
    2053             :         q = libdivide_s64_neon_sra(q, shift);
    2054             :         // q = (q ^ sign) - sign;
    2055             :         int64x2_t sign = vreinterpretq_s64_s8(vdupq_n_s8((int8_t)more >> 7));
    2056             :         q = vsubq_s64(veorq_s64(q, sign), sign);
    2057             :         return q;
    2058             :     } else {
    2059             :         int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic);
    2060             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2061             :             // must be arithmetic shift
    2062             :             int64x2_t sign = vdupq_n_s64((int8_t)more >> 7);  // TODO: no need to widen
    2063             :             // q += ((numer ^ sign) - sign);
    2064             :             q = vaddq_s64(q, vsubq_s64(veorq_s64(numers, sign), sign));
    2065             :         }
    2066             :         // q >>= denom->mult_path.shift
    2067             :         q = libdivide_s64_neon_sra(q, more & LIBDIVIDE_64_SHIFT_MASK);
    2068             :         q = vaddq_s64(
    2069             :             q, vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(q), 63)));  // q += (q < 0)
    2070             :         return q;
    2071             :     }
    2072             : }
    2073             : 
    2074             : int64x2_t libdivide_s64_branchfree_do_vec128(
    2075             :     int64x2_t numers, const struct libdivide_s64_branchfree_t *denom) {
    2076             :     int64_t magic = denom->magic;
    2077             :     uint8_t more = denom->more;
    2078             :     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    2079             :     // must be arithmetic shift
    2080             :     int64x2_t sign = vdupq_n_s64((int8_t)more >> 7);  // TODO: avoid sign extend
    2081             : 
    2082             :     // libdivide_mullhi_s64(numers, magic);
    2083             :     int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic);
    2084             :     q = vaddq_s64(q, numers);  // q += numers
    2085             : 
    2086             :     // If q is non-negative, we have nothing to do.
    2087             :     // If q is negative, we want to add either (2**shift)-1 if d is
    2088             :     // a power of 2, or (2**shift) if it is not a power of 2.
    2089             :     uint32_t is_power_of_2 = (magic == 0);
    2090             :     int64x2_t q_sign = libdivide_s64_signbits(q);  // q_sign = q >> 63
    2091             :     int64x2_t mask = vdupq_n_s64(((uint64_t)1 << shift) - is_power_of_2);
    2092             :     q = vaddq_s64(q, vandq_s64(q_sign, mask));  // q = q + (q_sign & mask)
    2093             :     q = libdivide_s64_neon_sra(q, shift);       // q >>= shift
    2094             :     q = vsubq_s64(veorq_s64(q, sign), sign);    // q = (q ^ sign) - sign
    2095             :     return q;
    2096             : }
    2097             : 
    2098             : #endif
    2099             : 
    2100             : #if defined(LIBDIVIDE_AVX512)
    2101             : 
    2102             : static LIBDIVIDE_INLINE __m512i libdivide_u16_do_vec512(
    2103             :     __m512i numers, const struct libdivide_u16_t *denom);
    2104             : static LIBDIVIDE_INLINE __m512i libdivide_s16_do_vec512(
    2105             :     __m512i numers, const struct libdivide_s16_t *denom);
    2106             : static LIBDIVIDE_INLINE __m512i libdivide_u32_do_vec512(
    2107             :     __m512i numers, const struct libdivide_u32_t *denom);
    2108             : static LIBDIVIDE_INLINE __m512i libdivide_s32_do_vec512(
    2109             :     __m512i numers, const struct libdivide_s32_t *denom);
    2110             : static LIBDIVIDE_INLINE __m512i libdivide_u64_do_vec512(
    2111             :     __m512i numers, const struct libdivide_u64_t *denom);
    2112             : static LIBDIVIDE_INLINE __m512i libdivide_s64_do_vec512(
    2113             :     __m512i numers, const struct libdivide_s64_t *denom);
    2114             : 
    2115             : static LIBDIVIDE_INLINE __m512i libdivide_u16_branchfree_do_vec512(
    2116             :     __m512i numers, const struct libdivide_u16_branchfree_t *denom);
    2117             : static LIBDIVIDE_INLINE __m512i libdivide_s16_branchfree_do_vec512(
    2118             :     __m512i numers, const struct libdivide_s16_branchfree_t *denom);
    2119             : static LIBDIVIDE_INLINE __m512i libdivide_u32_branchfree_do_vec512(
    2120             :     __m512i numers, const struct libdivide_u32_branchfree_t *denom);
    2121             : static LIBDIVIDE_INLINE __m512i libdivide_s32_branchfree_do_vec512(
    2122             :     __m512i numers, const struct libdivide_s32_branchfree_t *denom);
    2123             : static LIBDIVIDE_INLINE __m512i libdivide_u64_branchfree_do_vec512(
    2124             :     __m512i numers, const struct libdivide_u64_branchfree_t *denom);
    2125             : static LIBDIVIDE_INLINE __m512i libdivide_s64_branchfree_do_vec512(
    2126             :     __m512i numers, const struct libdivide_s64_branchfree_t *denom);
    2127             : 
    2128             : //////// Internal Utility Functions
    2129             : 
    2130             : static LIBDIVIDE_INLINE __m512i libdivide_s64_signbits_vec512(__m512i v) {
    2131             :     ;
    2132             :     return _mm512_srai_epi64(v, 63);
    2133             : }
    2134             : 
    2135             : static LIBDIVIDE_INLINE __m512i libdivide_s64_shift_right_vec512(__m512i v, int amt) {
    2136             :     return _mm512_srai_epi64(v, amt);
    2137             : }
    2138             : 
    2139             : // Here, b is assumed to contain one 32-bit value repeated.
    2140             : static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u32_vec512(__m512i a, __m512i b) {
    2141             :     __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epu32(a, b), 32);
    2142             :     __m512i a1X3X = _mm512_srli_epi64(a, 32);
    2143             :     __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
    2144             :     __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epu32(a1X3X, b), mask);
    2145             :     return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3);
    2146             : }
    2147             : 
    2148             : // b is one 32-bit value repeated.
    2149             : static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s32_vec512(__m512i a, __m512i b) {
    2150             :     __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epi32(a, b), 32);
    2151             :     __m512i a1X3X = _mm512_srli_epi64(a, 32);
    2152             :     __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
    2153             :     __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epi32(a1X3X, b), mask);
    2154             :     return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3);
    2155             : }
    2156             : 
    2157             : // Here, y is assumed to contain one 64-bit value repeated.
    2158             : static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y) {
    2159             :     // see m128i variant for comments.
    2160             :     __m512i x0y0 = _mm512_mul_epu32(x, y);
    2161             :     __m512i x0y0_hi = _mm512_srli_epi64(x0y0, 32);
    2162             : 
    2163             :     __m512i x1 = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1));
    2164             :     __m512i y1 = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1));
    2165             : 
    2166             :     __m512i x0y1 = _mm512_mul_epu32(x, y1);
    2167             :     __m512i x1y0 = _mm512_mul_epu32(x1, y);
    2168             :     __m512i x1y1 = _mm512_mul_epu32(x1, y1);
    2169             : 
    2170             :     __m512i mask = _mm512_set1_epi64(0xFFFFFFFF);
    2171             :     __m512i temp = _mm512_add_epi64(x1y0, x0y0_hi);
    2172             :     __m512i temp_lo = _mm512_and_si512(temp, mask);
    2173             :     __m512i temp_hi = _mm512_srli_epi64(temp, 32);
    2174             : 
    2175             :     temp_lo = _mm512_srli_epi64(_mm512_add_epi64(temp_lo, x0y1), 32);
    2176             :     temp_hi = _mm512_add_epi64(x1y1, temp_hi);
    2177             :     return _mm512_add_epi64(temp_lo, temp_hi);
    2178             : }
    2179             : 
    2180             : // y is one 64-bit value repeated.
    2181             : static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y) {
    2182             :     __m512i p = libdivide_mullhi_u64_vec512(x, y);
    2183             :     __m512i t1 = _mm512_and_si512(libdivide_s64_signbits_vec512(x), y);
    2184             :     __m512i t2 = _mm512_and_si512(libdivide_s64_signbits_vec512(y), x);
    2185             :     p = _mm512_sub_epi64(p, t1);
    2186             :     p = _mm512_sub_epi64(p, t2);
    2187             :     return p;
    2188             : }
    2189             : 
    2190             : ////////// UINT16
    2191             : 
    2192             : __m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom){
    2193             :     SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16)}
    2194             : 
    2195             : __m512i libdivide_u16_branchfree_do_vec512(
    2196             :     __m512i numers, const struct libdivide_u16_branchfree_t *denom){
    2197             :     SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree)}
    2198             : 
    2199             : ////////// UINT32
    2200             : 
    2201             : __m512i libdivide_u32_do_vec512(__m512i numers, const struct libdivide_u32_t *denom) {
    2202             :     uint8_t more = denom->more;
    2203             :     if (!denom->magic) {
    2204             :         return _mm512_srli_epi32(numers, more);
    2205             :     } else {
    2206             :         __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic));
    2207             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2208             :             // uint32_t t = ((numer - q) >> 1) + q;
    2209             :             // return t >> denom->shift;
    2210             :             uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    2211             :             __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q);
    2212             :             return _mm512_srli_epi32(t, shift);
    2213             :         } else {
    2214             :             return _mm512_srli_epi32(q, more);
    2215             :         }
    2216             :     }
    2217             : }
    2218             : 
    2219             : __m512i libdivide_u32_branchfree_do_vec512(
    2220             :     __m512i numers, const struct libdivide_u32_branchfree_t *denom) {
    2221             :     __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic));
    2222             :     __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q);
    2223             :     return _mm512_srli_epi32(t, denom->more);
    2224             : }
    2225             : 
    2226             : ////////// UINT64
    2227             : 
    2228             : __m512i libdivide_u64_do_vec512(__m512i numers, const struct libdivide_u64_t *denom) {
    2229             :     uint8_t more = denom->more;
    2230             :     if (!denom->magic) {
    2231             :         return _mm512_srli_epi64(numers, more);
    2232             :     } else {
    2233             :         __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic));
    2234             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2235             :             // uint32_t t = ((numer - q) >> 1) + q;
    2236             :             // return t >> denom->shift;
    2237             :             uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    2238             :             __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q);
    2239             :             return _mm512_srli_epi64(t, shift);
    2240             :         } else {
    2241             :             return _mm512_srli_epi64(q, more);
    2242             :         }
    2243             :     }
    2244             : }
    2245             : 
    2246             : __m512i libdivide_u64_branchfree_do_vec512(
    2247             :     __m512i numers, const struct libdivide_u64_branchfree_t *denom) {
    2248             :     __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic));
    2249             :     __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q);
    2250             :     return _mm512_srli_epi64(t, denom->more);
    2251             : }
    2252             : 
    2253             : ////////// SINT16
    2254             : 
    2255             : __m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom){
    2256             :     SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16)}
    2257             : 
    2258             : __m512i libdivide_s16_branchfree_do_vec512(
    2259             :     __m512i numers, const struct libdivide_s16_branchfree_t *denom){
    2260             :     SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree)}
    2261             : 
    2262             : ////////// SINT32
    2263             : 
    2264             : __m512i libdivide_s32_do_vec512(__m512i numers, const struct libdivide_s32_t *denom) {
    2265             :     uint8_t more = denom->more;
    2266             :     if (!denom->magic) {
    2267             :         uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    2268             :         uint32_t mask = ((uint32_t)1 << shift) - 1;
    2269             :         __m512i roundToZeroTweak = _mm512_set1_epi32(mask);
    2270             :         // q = numer + ((numer >> 31) & roundToZeroTweak);
    2271             :         __m512i q = _mm512_add_epi32(
    2272             :             numers, _mm512_and_si512(_mm512_srai_epi32(numers, 31), roundToZeroTweak));
    2273             :         q = _mm512_srai_epi32(q, shift);
    2274             :         __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
    2275             :         // q = (q ^ sign) - sign;
    2276             :         q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign);
    2277             :         return q;
    2278             :     } else {
    2279             :         __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(denom->magic));
    2280             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2281             :             // must be arithmetic shift
    2282             :             __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
    2283             :             // q += ((numer ^ sign) - sign);
    2284             :             q = _mm512_add_epi32(q, _mm512_sub_epi32(_mm512_xor_si512(numers, sign), sign));
    2285             :         }
    2286             :         // q >>= shift
    2287             :         q = _mm512_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
    2288             :         q = _mm512_add_epi32(q, _mm512_srli_epi32(q, 31));  // q += (q < 0)
    2289             :         return q;
    2290             :     }
    2291             : }
    2292             : 
    2293             : __m512i libdivide_s32_branchfree_do_vec512(
    2294             :     __m512i numers, const struct libdivide_s32_branchfree_t *denom) {
    2295             :     int32_t magic = denom->magic;
    2296             :     uint8_t more = denom->more;
    2297             :     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    2298             :     // must be arithmetic shift
    2299             :     __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
    2300             :     __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(magic));
    2301             :     q = _mm512_add_epi32(q, numers);  // q += numers
    2302             : 
    2303             :     // If q is non-negative, we have nothing to do
    2304             :     // If q is negative, we want to add either (2**shift)-1 if d is
    2305             :     // a power of 2, or (2**shift) if it is not a power of 2
    2306             :     uint32_t is_power_of_2 = (magic == 0);
    2307             :     __m512i q_sign = _mm512_srai_epi32(q, 31);  // q_sign = q >> 31
    2308             :     __m512i mask = _mm512_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);
    2309             :     q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask));  // q = q + (q_sign & mask)
    2310             :     q = _mm512_srai_epi32(q, shift);                          // q >>= shift
    2311             :     q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign);    // q = (q ^ sign) - sign
    2312             :     return q;
    2313             : }
    2314             : 
    2315             : ////////// SINT64
    2316             : 
    2317             : __m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *denom) {
    2318             :     uint8_t more = denom->more;
    2319             :     int64_t magic = denom->magic;
    2320             :     if (magic == 0) {  // shift path
    2321             :         uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    2322             :         uint64_t mask = ((uint64_t)1 << shift) - 1;
    2323             :         __m512i roundToZeroTweak = _mm512_set1_epi64(mask);
    2324             :         // q = numer + ((numer >> 63) & roundToZeroTweak);
    2325             :         __m512i q = _mm512_add_epi64(
    2326             :             numers, _mm512_and_si512(libdivide_s64_signbits_vec512(numers), roundToZeroTweak));
    2327             :         q = libdivide_s64_shift_right_vec512(q, shift);
    2328             :         __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
    2329             :         // q = (q ^ sign) - sign;
    2330             :         q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign);
    2331             :         return q;
    2332             :     } else {
    2333             :         __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic));
    2334             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2335             :             // must be arithmetic shift
    2336             :             __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
    2337             :             // q += ((numer ^ sign) - sign);
    2338             :             q = _mm512_add_epi64(q, _mm512_sub_epi64(_mm512_xor_si512(numers, sign), sign));
    2339             :         }
    2340             :         // q >>= denom->mult_path.shift
    2341             :         q = libdivide_s64_shift_right_vec512(q, more & LIBDIVIDE_64_SHIFT_MASK);
    2342             :         q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63));  // q += (q < 0)
    2343             :         return q;
    2344             :     }
    2345             : }
    2346             : 
    2347             : __m512i libdivide_s64_branchfree_do_vec512(
    2348             :     __m512i numers, const struct libdivide_s64_branchfree_t *denom) {
    2349             :     int64_t magic = denom->magic;
    2350             :     uint8_t more = denom->more;
    2351             :     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    2352             :     // must be arithmetic shift
    2353             :     __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
    2354             : 
    2355             :     // libdivide_mullhi_s64(numers, magic);
    2356             :     __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic));
    2357             :     q = _mm512_add_epi64(q, numers);  // q += numers
    2358             : 
    2359             :     // If q is non-negative, we have nothing to do.
    2360             :     // If q is negative, we want to add either (2**shift)-1 if d is
    2361             :     // a power of 2, or (2**shift) if it is not a power of 2.
    2362             :     uint32_t is_power_of_2 = (magic == 0);
    2363             :     __m512i q_sign = libdivide_s64_signbits_vec512(q);  // q_sign = q >> 63
    2364             :     __m512i mask = _mm512_set1_epi64(((uint64_t)1 << shift) - is_power_of_2);
    2365             :     q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask));  // q = q + (q_sign & mask)
    2366             :     q = libdivide_s64_shift_right_vec512(q, shift);           // q >>= shift
    2367             :     q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign);    // q = (q ^ sign) - sign
    2368             :     return q;
    2369             : }
    2370             : 
    2371             : #endif
    2372             : 
    2373             : #if defined(LIBDIVIDE_AVX2)
    2374             : 
    2375             : static LIBDIVIDE_INLINE __m256i libdivide_u16_do_vec256(
    2376             :     __m256i numers, const struct libdivide_u16_t *denom);
    2377             : static LIBDIVIDE_INLINE __m256i libdivide_s16_do_vec256(
    2378             :     __m256i numers, const struct libdivide_s16_t *denom);
    2379             : static LIBDIVIDE_INLINE __m256i libdivide_u32_do_vec256(
    2380             :     __m256i numers, const struct libdivide_u32_t *denom);
    2381             : static LIBDIVIDE_INLINE __m256i libdivide_s32_do_vec256(
    2382             :     __m256i numers, const struct libdivide_s32_t *denom);
    2383             : static LIBDIVIDE_INLINE __m256i libdivide_u64_do_vec256(
    2384             :     __m256i numers, const struct libdivide_u64_t *denom);
    2385             : static LIBDIVIDE_INLINE __m256i libdivide_s64_do_vec256(
    2386             :     __m256i numers, const struct libdivide_s64_t *denom);
    2387             : 
    2388             : static LIBDIVIDE_INLINE __m256i libdivide_u16_branchfree_do_vec256(
    2389             :     __m256i numers, const struct libdivide_u16_branchfree_t *denom);
    2390             : static LIBDIVIDE_INLINE __m256i libdivide_s16_branchfree_do_vec256(
    2391             :     __m256i numers, const struct libdivide_s16_branchfree_t *denom);
    2392             : static LIBDIVIDE_INLINE __m256i libdivide_u32_branchfree_do_vec256(
    2393             :     __m256i numers, const struct libdivide_u32_branchfree_t *denom);
    2394             : static LIBDIVIDE_INLINE __m256i libdivide_s32_branchfree_do_vec256(
    2395             :     __m256i numers, const struct libdivide_s32_branchfree_t *denom);
    2396             : static LIBDIVIDE_INLINE __m256i libdivide_u64_branchfree_do_vec256(
    2397             :     __m256i numers, const struct libdivide_u64_branchfree_t *denom);
    2398             : static LIBDIVIDE_INLINE __m256i libdivide_s64_branchfree_do_vec256(
    2399             :     __m256i numers, const struct libdivide_s64_branchfree_t *denom);
    2400             : 
    2401             : //////// Internal Utility Functions
    2402             : 
    2403             : // Implementation of _mm256_srai_epi64(v, 63) (from AVX512).
    2404             : static LIBDIVIDE_INLINE __m256i libdivide_s64_signbits_vec256(__m256i v) {
    2405             :     __m256i hiBitsDuped = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
    2406             :     __m256i signBits = _mm256_srai_epi32(hiBitsDuped, 31);
    2407             :     return signBits;
    2408             : }
    2409             : 
    2410             : // Implementation of _mm256_srai_epi64 (from AVX512).
    2411             : static LIBDIVIDE_INLINE __m256i libdivide_s64_shift_right_vec256(__m256i v, int amt) {
    2412             :     const int b = 64 - amt;
    2413             :     __m256i m = _mm256_set1_epi64x((uint64_t)1 << (b - 1));
    2414             :     __m256i x = _mm256_srli_epi64(v, amt);
    2415             :     __m256i result = _mm256_sub_epi64(_mm256_xor_si256(x, m), m);
    2416             :     return result;
    2417             : }
    2418             : 
    2419             : // Here, b is assumed to contain one 32-bit value repeated.
    2420             : static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u32_vec256(__m256i a, __m256i b) {
    2421             :     __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epu32(a, b), 32);
    2422             :     __m256i a1X3X = _mm256_srli_epi64(a, 32);
    2423             :     __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
    2424             :     __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epu32(a1X3X, b), mask);
    2425             :     return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3);
    2426             : }
    2427             : 
    2428             : // b is one 32-bit value repeated.
    2429             : static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s32_vec256(__m256i a, __m256i b) {
    2430             :     __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epi32(a, b), 32);
    2431             :     __m256i a1X3X = _mm256_srli_epi64(a, 32);
    2432             :     __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
    2433             :     __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epi32(a1X3X, b), mask);
    2434             :     return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3);
    2435             : }
    2436             : 
    2437             : // Here, y is assumed to contain one 64-bit value repeated.
    2438             : static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y) {
    2439             :     // see m128i variant for comments.
    2440             :     __m256i x0y0 = _mm256_mul_epu32(x, y);
    2441             :     __m256i x0y0_hi = _mm256_srli_epi64(x0y0, 32);
    2442             : 
    2443             :     __m256i x1 = _mm256_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1));
    2444             :     __m256i y1 = _mm256_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1));
    2445             : 
    2446             :     __m256i x0y1 = _mm256_mul_epu32(x, y1);
    2447             :     __m256i x1y0 = _mm256_mul_epu32(x1, y);
    2448             :     __m256i x1y1 = _mm256_mul_epu32(x1, y1);
    2449             : 
    2450             :     __m256i mask = _mm256_set1_epi64x(0xFFFFFFFF);
    2451             :     __m256i temp = _mm256_add_epi64(x1y0, x0y0_hi);
    2452             :     __m256i temp_lo = _mm256_and_si256(temp, mask);
    2453             :     __m256i temp_hi = _mm256_srli_epi64(temp, 32);
    2454             : 
    2455             :     temp_lo = _mm256_srli_epi64(_mm256_add_epi64(temp_lo, x0y1), 32);
    2456             :     temp_hi = _mm256_add_epi64(x1y1, temp_hi);
    2457             :     return _mm256_add_epi64(temp_lo, temp_hi);
    2458             : }
    2459             : 
    2460             : // y is one 64-bit value repeated.
    2461             : static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y) {
    2462             :     __m256i p = libdivide_mullhi_u64_vec256(x, y);
    2463             :     __m256i t1 = _mm256_and_si256(libdivide_s64_signbits_vec256(x), y);
    2464             :     __m256i t2 = _mm256_and_si256(libdivide_s64_signbits_vec256(y), x);
    2465             :     p = _mm256_sub_epi64(p, t1);
    2466             :     p = _mm256_sub_epi64(p, t2);
    2467             :     return p;
    2468             : }
    2469             : 
    2470             : ////////// UINT16
    2471             : 
    2472             : __m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) {
    2473             :     uint8_t more = denom->more;
    2474             :     if (!denom->magic) {
    2475             :         return _mm256_srli_epi16(numers, more);
    2476             :     } else {
    2477             :         __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
    2478             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2479             :             __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);
    2480             :             return _mm256_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
    2481             :         } else {
    2482             :             return _mm256_srli_epi16(q, more);
    2483             :         }
    2484             :     }
    2485             : }
    2486             : 
    2487             : __m256i libdivide_u16_branchfree_do_vec256(
    2488             :     __m256i numers, const struct libdivide_u16_branchfree_t *denom) {
    2489             :     __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
    2490             :     __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);
    2491             :     return _mm256_srli_epi16(t, denom->more);
    2492             : }
    2493             : 
    2494             : ////////// UINT32
    2495             : 
    2496             : __m256i libdivide_u32_do_vec256(__m256i numers, const struct libdivide_u32_t *denom) {
    2497             :     uint8_t more = denom->more;
    2498             :     if (!denom->magic) {
    2499             :         return _mm256_srli_epi32(numers, more);
    2500             :     } else {
    2501             :         __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic));
    2502             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2503             :             // uint32_t t = ((numer - q) >> 1) + q;
    2504             :             // return t >> denom->shift;
    2505             :             uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    2506             :             __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q);
    2507             :             return _mm256_srli_epi32(t, shift);
    2508             :         } else {
    2509             :             return _mm256_srli_epi32(q, more);
    2510             :         }
    2511             :     }
    2512             : }
    2513             : 
    2514             : __m256i libdivide_u32_branchfree_do_vec256(
    2515             :     __m256i numers, const struct libdivide_u32_branchfree_t *denom) {
    2516             :     __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic));
    2517             :     __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q);
    2518             :     return _mm256_srli_epi32(t, denom->more);
    2519             : }
    2520             : 
    2521             : ////////// UINT64
    2522             : 
    2523             : __m256i libdivide_u64_do_vec256(__m256i numers, const struct libdivide_u64_t *denom) {
    2524             :     uint8_t more = denom->more;
    2525             :     if (!denom->magic) {
    2526             :         return _mm256_srli_epi64(numers, more);
    2527             :     } else {
    2528             :         __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic));
    2529             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2530             :             // uint32_t t = ((numer - q) >> 1) + q;
    2531             :             // return t >> denom->shift;
    2532             :             uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    2533             :             __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q);
    2534             :             return _mm256_srli_epi64(t, shift);
    2535             :         } else {
    2536             :             return _mm256_srli_epi64(q, more);
    2537             :         }
    2538             :     }
    2539             : }
    2540             : 
    2541             : __m256i libdivide_u64_branchfree_do_vec256(
    2542             :     __m256i numers, const struct libdivide_u64_branchfree_t *denom) {
    2543             :     __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic));
    2544             :     __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q);
    2545             :     return _mm256_srli_epi64(t, denom->more);
    2546             : }
    2547             : 
    2548             : ////////// SINT16
    2549             : 
    2550             : __m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) {
    2551             :     uint8_t more = denom->more;
    2552             :     if (!denom->magic) {
    2553             :         uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    2554             :         uint16_t mask = ((uint16_t)1 << shift) - 1;
    2555             :         __m256i roundToZeroTweak = _mm256_set1_epi16(mask);
    2556             :         // q = numer + ((numer >> 15) & roundToZeroTweak);
    2557             :         __m256i q = _mm256_add_epi16(
    2558             :             numers, _mm256_and_si256(_mm256_srai_epi16(numers, 15), roundToZeroTweak));
    2559             :         q = _mm256_srai_epi16(q, shift);
    2560             :         __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
    2561             :         // q = (q ^ sign) - sign;
    2562             :         q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign);
    2563             :         return q;
    2564             :     } else {
    2565             :         __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(denom->magic));
    2566             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2567             :             // must be arithmetic shift
    2568             :             __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
    2569             :             // q += ((numer ^ sign) - sign);
    2570             :             q = _mm256_add_epi16(q, _mm256_sub_epi16(_mm256_xor_si256(numers, sign), sign));
    2571             :         }
    2572             :         // q >>= shift
    2573             :         q = _mm256_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
    2574             :         q = _mm256_add_epi16(q, _mm256_srli_epi16(q, 15));  // q += (q < 0)
    2575             :         return q;
    2576             :     }
    2577             : }
    2578             : 
    2579             : __m256i libdivide_s16_branchfree_do_vec256(
    2580             :     __m256i numers, const struct libdivide_s16_branchfree_t *denom) {
    2581             :     int16_t magic = denom->magic;
    2582             :     uint8_t more = denom->more;
    2583             :     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    2584             :     // must be arithmetic shift
    2585             :     __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
    2586             :     __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(magic));
    2587             :     q = _mm256_add_epi16(q, numers);  // q += numers
    2588             : 
    2589             :     // If q is non-negative, we have nothing to do
    2590             :     // If q is negative, we want to add either (2**shift)-1 if d is
    2591             :     // a power of 2, or (2**shift) if it is not a power of 2
    2592             :     uint16_t is_power_of_2 = (magic == 0);
    2593             :     __m256i q_sign = _mm256_srai_epi16(q, 15);  // q_sign = q >> 15
    2594             :     __m256i mask = _mm256_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
    2595             :     q = _mm256_add_epi16(q, _mm256_and_si256(q_sign, mask));  // q = q + (q_sign & mask)
    2596             :     q = _mm256_srai_epi16(q, shift);                          // q >>= shift
    2597             :     q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign);    // q = (q ^ sign) - sign
    2598             :     return q;
    2599             : }
    2600             : 
    2601             : ////////// SINT32
    2602             : 
    2603             : __m256i libdivide_s32_do_vec256(__m256i numers, const struct libdivide_s32_t *denom) {
    2604             :     uint8_t more = denom->more;
    2605             :     if (!denom->magic) {
    2606             :         uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    2607             :         uint32_t mask = ((uint32_t)1 << shift) - 1;
    2608             :         __m256i roundToZeroTweak = _mm256_set1_epi32(mask);
    2609             :         // q = numer + ((numer >> 31) & roundToZeroTweak);
    2610             :         __m256i q = _mm256_add_epi32(
    2611             :             numers, _mm256_and_si256(_mm256_srai_epi32(numers, 31), roundToZeroTweak));
    2612             :         q = _mm256_srai_epi32(q, shift);
    2613             :         __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
    2614             :         // q = (q ^ sign) - sign;
    2615             :         q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign);
    2616             :         return q;
    2617             :     } else {
    2618             :         __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(denom->magic));
    2619             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2620             :             // must be arithmetic shift
    2621             :             __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
    2622             :             // q += ((numer ^ sign) - sign);
    2623             :             q = _mm256_add_epi32(q, _mm256_sub_epi32(_mm256_xor_si256(numers, sign), sign));
    2624             :         }
    2625             :         // q >>= shift
    2626             :         q = _mm256_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
    2627             :         q = _mm256_add_epi32(q, _mm256_srli_epi32(q, 31));  // q += (q < 0)
    2628             :         return q;
    2629             :     }
    2630             : }
    2631             : 
    2632             : __m256i libdivide_s32_branchfree_do_vec256(
    2633             :     __m256i numers, const struct libdivide_s32_branchfree_t *denom) {
    2634             :     int32_t magic = denom->magic;
    2635             :     uint8_t more = denom->more;
    2636             :     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    2637             :     // must be arithmetic shift
    2638             :     __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
    2639             :     __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(magic));
    2640             :     q = _mm256_add_epi32(q, numers);  // q += numers
    2641             : 
    2642             :     // If q is non-negative, we have nothing to do
    2643             :     // If q is negative, we want to add either (2**shift)-1 if d is
    2644             :     // a power of 2, or (2**shift) if it is not a power of 2
    2645             :     uint32_t is_power_of_2 = (magic == 0);
    2646             :     __m256i q_sign = _mm256_srai_epi32(q, 31);  // q_sign = q >> 31
    2647             :     __m256i mask = _mm256_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);
    2648             :     q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask));  // q = q + (q_sign & mask)
    2649             :     q = _mm256_srai_epi32(q, shift);                          // q >>= shift
    2650             :     q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign);    // q = (q ^ sign) - sign
    2651             :     return q;
    2652             : }
    2653             : 
    2654             : ////////// SINT64
    2655             : 
    2656             : __m256i libdivide_s64_do_vec256(__m256i numers, const struct libdivide_s64_t *denom) {
    2657             :     uint8_t more = denom->more;
    2658             :     int64_t magic = denom->magic;
    2659             :     if (magic == 0) {  // shift path
    2660             :         uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    2661             :         uint64_t mask = ((uint64_t)1 << shift) - 1;
    2662             :         __m256i roundToZeroTweak = _mm256_set1_epi64x(mask);
    2663             :         // q = numer + ((numer >> 63) & roundToZeroTweak);
    2664             :         __m256i q = _mm256_add_epi64(
    2665             :             numers, _mm256_and_si256(libdivide_s64_signbits_vec256(numers), roundToZeroTweak));
    2666             :         q = libdivide_s64_shift_right_vec256(q, shift);
    2667             :         __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
    2668             :         // q = (q ^ sign) - sign;
    2669             :         q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign);
    2670             :         return q;
    2671             :     } else {
    2672             :         __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic));
    2673             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2674             :             // must be arithmetic shift
    2675             :             __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
    2676             :             // q += ((numer ^ sign) - sign);
    2677             :             q = _mm256_add_epi64(q, _mm256_sub_epi64(_mm256_xor_si256(numers, sign), sign));
    2678             :         }
    2679             :         // q >>= denom->mult_path.shift
    2680             :         q = libdivide_s64_shift_right_vec256(q, more & LIBDIVIDE_64_SHIFT_MASK);
    2681             :         q = _mm256_add_epi64(q, _mm256_srli_epi64(q, 63));  // q += (q < 0)
    2682             :         return q;
    2683             :     }
    2684             : }
    2685             : 
    2686             : __m256i libdivide_s64_branchfree_do_vec256(
    2687             :     __m256i numers, const struct libdivide_s64_branchfree_t *denom) {
    2688             :     int64_t magic = denom->magic;
    2689             :     uint8_t more = denom->more;
    2690             :     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    2691             :     // must be arithmetic shift
    2692             :     __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
    2693             : 
    2694             :     // libdivide_mullhi_s64(numers, magic);
    2695             :     __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic));
    2696             :     q = _mm256_add_epi64(q, numers);  // q += numers
    2697             : 
    2698             :     // If q is non-negative, we have nothing to do.
    2699             :     // If q is negative, we want to add either (2**shift)-1 if d is
    2700             :     // a power of 2, or (2**shift) if it is not a power of 2.
    2701             :     uint32_t is_power_of_2 = (magic == 0);
    2702             :     __m256i q_sign = libdivide_s64_signbits_vec256(q);  // q_sign = q >> 63
    2703             :     __m256i mask = _mm256_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2);
    2704             :     q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask));  // q = q + (q_sign & mask)
    2705             :     q = libdivide_s64_shift_right_vec256(q, shift);           // q >>= shift
    2706             :     q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign);    // q = (q ^ sign) - sign
    2707             :     return q;
    2708             : }
    2709             : 
    2710             : #endif
    2711             : 
    2712             : #if defined(LIBDIVIDE_SSE2)
    2713             : 
    2714             : static LIBDIVIDE_INLINE __m128i libdivide_u16_do_vec128(
    2715             :     __m128i numers, const struct libdivide_u16_t *denom);
    2716             : static LIBDIVIDE_INLINE __m128i libdivide_s16_do_vec128(
    2717             :     __m128i numers, const struct libdivide_s16_t *denom);
    2718             : static LIBDIVIDE_INLINE __m128i libdivide_u32_do_vec128(
    2719             :     __m128i numers, const struct libdivide_u32_t *denom);
    2720             : static LIBDIVIDE_INLINE __m128i libdivide_s32_do_vec128(
    2721             :     __m128i numers, const struct libdivide_s32_t *denom);
    2722             : static LIBDIVIDE_INLINE __m128i libdivide_u64_do_vec128(
    2723             :     __m128i numers, const struct libdivide_u64_t *denom);
    2724             : static LIBDIVIDE_INLINE __m128i libdivide_s64_do_vec128(
    2725             :     __m128i numers, const struct libdivide_s64_t *denom);
    2726             : 
    2727             : static LIBDIVIDE_INLINE __m128i libdivide_u16_branchfree_do_vec128(
    2728             :     __m128i numers, const struct libdivide_u16_branchfree_t *denom);
    2729             : static LIBDIVIDE_INLINE __m128i libdivide_s16_branchfree_do_vec128(
    2730             :     __m128i numers, const struct libdivide_s16_branchfree_t *denom);
    2731             : static LIBDIVIDE_INLINE __m128i libdivide_u32_branchfree_do_vec128(
    2732             :     __m128i numers, const struct libdivide_u32_branchfree_t *denom);
    2733             : static LIBDIVIDE_INLINE __m128i libdivide_s32_branchfree_do_vec128(
    2734             :     __m128i numers, const struct libdivide_s32_branchfree_t *denom);
    2735             : static LIBDIVIDE_INLINE __m128i libdivide_u64_branchfree_do_vec128(
    2736             :     __m128i numers, const struct libdivide_u64_branchfree_t *denom);
    2737             : static LIBDIVIDE_INLINE __m128i libdivide_s64_branchfree_do_vec128(
    2738             :     __m128i numers, const struct libdivide_s64_branchfree_t *denom);
    2739             : 
    2740             : //////// Internal Utility Functions
    2741             : 
    2742             : // Implementation of _mm_srai_epi64(v, 63) (from AVX512).
    2743             : static LIBDIVIDE_INLINE __m128i libdivide_s64_signbits_vec128(__m128i v) {
    2744             :     __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
    2745             :     __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31);
    2746             :     return signBits;
    2747             : }
    2748             : 
    2749             : // Implementation of _mm_srai_epi64 (from AVX512).
    2750             : static LIBDIVIDE_INLINE __m128i libdivide_s64_shift_right_vec128(__m128i v, int amt) {
    2751             :     const int b = 64 - amt;
    2752             :     __m128i m = _mm_set1_epi64x((uint64_t)1 << (b - 1));
    2753             :     __m128i x = _mm_srli_epi64(v, amt);
    2754             :     __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m);
    2755             :     return result;
    2756             : }
    2757             : 
    2758             : // Here, b is assumed to contain one 32-bit value repeated.
    2759             : static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u32_vec128(__m128i a, __m128i b) {
    2760        2436 :     __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32);
    2761        1218 :     __m128i a1X3X = _mm_srli_epi64(a, 32);
    2762        1218 :     __m128i mask = _mm_set_epi32(-1, 0, -1, 0);
    2763        2436 :     __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), mask);
    2764        1218 :     return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3);
    2765             : }
    2766             : 
    2767             : // SSE2 does not have a signed multiplication instruction, but we can convert
    2768             : // unsigned to signed pretty efficiently. Again, b is just a 32 bit value
    2769             : // repeated four times.
    2770             : static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s32_vec128(__m128i a, __m128i b) {
    2771             :     __m128i p = libdivide_mullhi_u32_vec128(a, b);
    2772             :     // t1 = (a >> 31) & y, arithmetic shift
    2773             :     __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b);
    2774             :     __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a);
    2775             :     p = _mm_sub_epi32(p, t1);
    2776             :     p = _mm_sub_epi32(p, t2);
    2777             :     return p;
    2778             : }
    2779             : 
    2780             : // Here, y is assumed to contain one 64-bit value repeated.
    2781             : static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y) {
    2782             :     // full 128 bits product is:
    2783             :     // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64)
    2784             :     // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64.
    2785             : 
    2786             :     // Compute x0*y0.
    2787             :     // Note x1, y1 are ignored by mul_epu32.
    2788             :     __m128i x0y0 = _mm_mul_epu32(x, y);
    2789             :     __m128i x0y0_hi = _mm_srli_epi64(x0y0, 32);
    2790             : 
    2791             :     // Get x1, y1 in the low bits.
    2792             :     // We could shuffle or right shift. Shuffles are preferred as they preserve
    2793             :     // the source register for the next computation.
    2794             :     __m128i x1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1));
    2795             :     __m128i y1 = _mm_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1));
    2796             : 
    2797             :     // No need to mask off top 32 bits for mul_epu32.
    2798             :     __m128i x0y1 = _mm_mul_epu32(x, y1);
    2799             :     __m128i x1y0 = _mm_mul_epu32(x1, y);
    2800             :     __m128i x1y1 = _mm_mul_epu32(x1, y1);
    2801             : 
    2802             :     // Mask here selects low bits only.
    2803             :     __m128i mask = _mm_set1_epi64x(0xFFFFFFFF);
    2804             :     __m128i temp = _mm_add_epi64(x1y0, x0y0_hi);
    2805             :     __m128i temp_lo = _mm_and_si128(temp, mask);
    2806             :     __m128i temp_hi = _mm_srli_epi64(temp, 32);
    2807             : 
    2808             :     temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32);
    2809             :     temp_hi = _mm_add_epi64(x1y1, temp_hi);
    2810             :     return _mm_add_epi64(temp_lo, temp_hi);
    2811             : }
    2812             : 
    2813             : // y is one 64-bit value repeated.
    2814             : static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y) {
    2815             :     __m128i p = libdivide_mullhi_u64_vec128(x, y);
    2816             :     __m128i t1 = _mm_and_si128(libdivide_s64_signbits_vec128(x), y);
    2817             :     __m128i t2 = _mm_and_si128(libdivide_s64_signbits_vec128(y), x);
    2818             :     p = _mm_sub_epi64(p, t1);
    2819             :     p = _mm_sub_epi64(p, t2);
    2820             :     return p;
    2821             : }
    2822             : 
    2823             : ////////// UINT26
    2824             : 
    2825             : __m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) {
    2826       10030 :     uint8_t more = denom->more;
    2827       10030 :     if (!denom->magic) {
    2828          36 :         return _mm_srli_epi16(numers, more);
    2829             :     } else {
    2830       20024 :         __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
    2831       10012 :         if (more & LIBDIVIDE_ADD_MARKER) {
    2832          12 :             __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
    2833          12 :             return _mm_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
    2834             :         } else {
    2835       20012 :             return _mm_srli_epi16(q, more);
    2836             :         }
    2837             :     }
    2838             : }
    2839             : 
    2840             : __m128i libdivide_u16_branchfree_do_vec128(
    2841             :     __m128i numers, const struct libdivide_u16_branchfree_t *denom) {
    2842             :     __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
    2843             :     __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
    2844             :     return _mm_srli_epi16(t, denom->more);
    2845             : }
    2846             : 
    2847             : ////////// UINT32
    2848             : 
    2849             : __m128i libdivide_u32_do_vec128(__m128i numers, const struct libdivide_u32_t *denom) {
    2850        1272 :     uint8_t more = denom->more;
    2851        1272 :     if (!denom->magic) {
    2852         108 :         return _mm_srli_epi32(numers, more);
    2853             :     } else {
    2854        2436 :         __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic));
    2855        1218 :         if (more & LIBDIVIDE_ADD_MARKER) {
    2856             :             // uint32_t t = ((numer - q) >> 1) + q;
    2857             :             // return t >> denom->shift;
    2858           0 :             uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    2859           0 :             __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
    2860           0 :             return _mm_srli_epi32(t, shift);
    2861             :         } else {
    2862        2436 :             return _mm_srli_epi32(q, more);
    2863             :         }
    2864             :     }
    2865             : }
    2866             : 
    2867             : __m128i libdivide_u32_branchfree_do_vec128(
    2868             :     __m128i numers, const struct libdivide_u32_branchfree_t *denom) {
    2869             :     __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic));
    2870             :     __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
    2871             :     return _mm_srli_epi32(t, denom->more);
    2872             : }
    2873             : 
    2874             : ////////// UINT64
    2875             : 
    2876             : __m128i libdivide_u64_do_vec128(__m128i numers, const struct libdivide_u64_t *denom) {
    2877             :     uint8_t more = denom->more;
    2878             :     if (!denom->magic) {
    2879             :         return _mm_srli_epi64(numers, more);
    2880             :     } else {
    2881             :         __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic));
    2882             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2883             :             // uint32_t t = ((numer - q) >> 1) + q;
    2884             :             // return t >> denom->shift;
    2885             :             uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    2886             :             __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
    2887             :             return _mm_srli_epi64(t, shift);
    2888             :         } else {
    2889             :             return _mm_srli_epi64(q, more);
    2890             :         }
    2891             :     }
    2892             : }
    2893             : 
    2894             : __m128i libdivide_u64_branchfree_do_vec128(
    2895             :     __m128i numers, const struct libdivide_u64_branchfree_t *denom) {
    2896             :     __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic));
    2897             :     __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
    2898             :     return _mm_srli_epi64(t, denom->more);
    2899             : }
    2900             : 
    2901             : ////////// SINT16
    2902             : 
    2903             : __m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) {
    2904             :     uint8_t more = denom->more;
    2905             :     if (!denom->magic) {
    2906             :         uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    2907             :         uint16_t mask = ((uint16_t)1 << shift) - 1;
    2908             :         __m128i roundToZeroTweak = _mm_set1_epi16(mask);
    2909             :         // q = numer + ((numer >> 15) & roundToZeroTweak);
    2910             :         __m128i q =
    2911             :             _mm_add_epi16(numers, _mm_and_si128(_mm_srai_epi16(numers, 15), roundToZeroTweak));
    2912             :         q = _mm_srai_epi16(q, shift);
    2913             :         __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
    2914             :         // q = (q ^ sign) - sign;
    2915             :         q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign);
    2916             :         return q;
    2917             :     } else {
    2918             :         __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(denom->magic));
    2919             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2920             :             // must be arithmetic shift
    2921             :             __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
    2922             :             // q += ((numer ^ sign) - sign);
    2923             :             q = _mm_add_epi16(q, _mm_sub_epi16(_mm_xor_si128(numers, sign), sign));
    2924             :         }
    2925             :         // q >>= shift
    2926             :         q = _mm_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
    2927             :         q = _mm_add_epi16(q, _mm_srli_epi16(q, 15));  // q += (q < 0)
    2928             :         return q;
    2929             :     }
    2930             : }
    2931             : 
    2932             : __m128i libdivide_s16_branchfree_do_vec128(
    2933             :     __m128i numers, const struct libdivide_s16_branchfree_t *denom) {
    2934             :     int16_t magic = denom->magic;
    2935             :     uint8_t more = denom->more;
    2936             :     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    2937             :     // must be arithmetic shift
    2938             :     __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
    2939             :     __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(magic));
    2940             :     q = _mm_add_epi16(q, numers);  // q += numers
    2941             : 
    2942             :     // If q is non-negative, we have nothing to do
    2943             :     // If q is negative, we want to add either (2**shift)-1 if d is
    2944             :     // a power of 2, or (2**shift) if it is not a power of 2
    2945             :     uint16_t is_power_of_2 = (magic == 0);
    2946             :     __m128i q_sign = _mm_srai_epi16(q, 15);  // q_sign = q >> 15
    2947             :     __m128i mask = _mm_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
    2948             :     q = _mm_add_epi16(q, _mm_and_si128(q_sign, mask));  // q = q + (q_sign & mask)
    2949             :     q = _mm_srai_epi16(q, shift);                       // q >>= shift
    2950             :     q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign);    // q = (q ^ sign) - sign
    2951             :     return q;
    2952             : }
    2953             : 
    2954             : ////////// SINT32
    2955             : 
    2956             : __m128i libdivide_s32_do_vec128(__m128i numers, const struct libdivide_s32_t *denom) {
    2957             :     uint8_t more = denom->more;
    2958             :     if (!denom->magic) {
    2959             :         uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    2960             :         uint32_t mask = ((uint32_t)1 << shift) - 1;
    2961             :         __m128i roundToZeroTweak = _mm_set1_epi32(mask);
    2962             :         // q = numer + ((numer >> 31) & roundToZeroTweak);
    2963             :         __m128i q =
    2964             :             _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak));
    2965             :         q = _mm_srai_epi32(q, shift);
    2966             :         __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
    2967             :         // q = (q ^ sign) - sign;
    2968             :         q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign);
    2969             :         return q;
    2970             :     } else {
    2971             :         __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(denom->magic));
    2972             :         if (more & LIBDIVIDE_ADD_MARKER) {
    2973             :             // must be arithmetic shift
    2974             :             __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
    2975             :             // q += ((numer ^ sign) - sign);
    2976             :             q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign));
    2977             :         }
    2978             :         // q >>= shift
    2979             :         q = _mm_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
    2980             :         q = _mm_add_epi32(q, _mm_srli_epi32(q, 31));  // q += (q < 0)
    2981             :         return q;
    2982             :     }
    2983             : }
    2984             : 
    2985             : __m128i libdivide_s32_branchfree_do_vec128(
    2986             :     __m128i numers, const struct libdivide_s32_branchfree_t *denom) {
    2987             :     int32_t magic = denom->magic;
    2988             :     uint8_t more = denom->more;
    2989             :     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    2990             :     // must be arithmetic shift
    2991             :     __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
    2992             :     __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(magic));
    2993             :     q = _mm_add_epi32(q, numers);  // q += numers
    2994             : 
    2995             :     // If q is non-negative, we have nothing to do
    2996             :     // If q is negative, we want to add either (2**shift)-1 if d is
    2997             :     // a power of 2, or (2**shift) if it is not a power of 2
    2998             :     uint32_t is_power_of_2 = (magic == 0);
    2999             :     __m128i q_sign = _mm_srai_epi32(q, 31);  // q_sign = q >> 31
    3000             :     __m128i mask = _mm_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);
    3001             :     q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask));  // q = q + (q_sign & mask)
    3002             :     q = _mm_srai_epi32(q, shift);                       // q >>= shift
    3003             :     q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign);    // q = (q ^ sign) - sign
    3004             :     return q;
    3005             : }
    3006             : 
    3007             : ////////// SINT64
    3008             : 
    3009             : __m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *denom) {
    3010             :     uint8_t more = denom->more;
    3011             :     int64_t magic = denom->magic;
    3012             :     if (magic == 0) {  // shift path
    3013             :         uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    3014             :         uint64_t mask = ((uint64_t)1 << shift) - 1;
    3015             :         __m128i roundToZeroTweak = _mm_set1_epi64x(mask);
    3016             :         // q = numer + ((numer >> 63) & roundToZeroTweak);
    3017             :         __m128i q = _mm_add_epi64(
    3018             :             numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak));
    3019             :         q = libdivide_s64_shift_right_vec128(q, shift);
    3020             :         __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
    3021             :         // q = (q ^ sign) - sign;
    3022             :         q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign);
    3023             :         return q;
    3024             :     } else {
    3025             :         __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic));
    3026             :         if (more & LIBDIVIDE_ADD_MARKER) {
    3027             :             // must be arithmetic shift
    3028             :             __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
    3029             :             // q += ((numer ^ sign) - sign);
    3030             :             q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign));
    3031             :         }
    3032             :         // q >>= denom->mult_path.shift
    3033             :         q = libdivide_s64_shift_right_vec128(q, more & LIBDIVIDE_64_SHIFT_MASK);
    3034             :         q = _mm_add_epi64(q, _mm_srli_epi64(q, 63));  // q += (q < 0)
    3035             :         return q;
    3036             :     }
    3037             : }
    3038             : 
    3039             : __m128i libdivide_s64_branchfree_do_vec128(
    3040             :     __m128i numers, const struct libdivide_s64_branchfree_t *denom) {
    3041             :     int64_t magic = denom->magic;
    3042             :     uint8_t more = denom->more;
    3043             :     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    3044             :     // must be arithmetic shift
    3045             :     __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
    3046             : 
    3047             :     // libdivide_mullhi_s64(numers, magic);
    3048             :     __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic));
    3049             :     q = _mm_add_epi64(q, numers);  // q += numers
    3050             : 
    3051             :     // If q is non-negative, we have nothing to do.
    3052             :     // If q is negative, we want to add either (2**shift)-1 if d is
    3053             :     // a power of 2, or (2**shift) if it is not a power of 2.
    3054             :     uint32_t is_power_of_2 = (magic == 0);
    3055             :     __m128i q_sign = libdivide_s64_signbits_vec128(q);  // q_sign = q >> 63
    3056             :     __m128i mask = _mm_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2);
    3057             :     q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask));  // q = q + (q_sign & mask)
    3058             :     q = libdivide_s64_shift_right_vec128(q, shift);     // q >>= shift
    3059             :     q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign);    // q = (q ^ sign) - sign
    3060             :     return q;
    3061             : }
    3062             : 
    3063             : #endif
    3064             : 
    3065             : ////////// C++ stuff
    3066             : 
    3067             : #ifdef __cplusplus
    3068             : 
    3069             : enum Branching {
    3070             :     BRANCHFULL,  // use branching algorithms
    3071             :     BRANCHFREE   // use branchfree algorithms
    3072             : };
    3073             : 
    3074             : namespace detail {
    3075             : enum Signedness {
    3076             :     SIGNED,
    3077             :     UNSIGNED,
    3078             : };
    3079             : 
    3080             : #if defined(LIBDIVIDE_NEON)
    3081             : // Helper to deduce NEON vector type for integral type.
    3082             : template <int _WIDTH, Signedness _SIGN>
    3083             : struct NeonVec {};
    3084             : 
    3085             : template <>
    3086             : struct NeonVec<16, UNSIGNED> {
    3087             :     typedef uint16x8_t type;
    3088             : };
    3089             : 
    3090             : template <>
    3091             : struct NeonVec<16, SIGNED> {
    3092             :     typedef int16x8_t type;
    3093             : };
    3094             : 
    3095             : template <>
    3096             : struct NeonVec<32, UNSIGNED> {
    3097             :     typedef uint32x4_t type;
    3098             : };
    3099             : 
    3100             : template <>
    3101             : struct NeonVec<32, SIGNED> {
    3102             :     typedef int32x4_t type;
    3103             : };
    3104             : 
    3105             : template <>
    3106             : struct NeonVec<64, UNSIGNED> {
    3107             :     typedef uint64x2_t type;
    3108             : };
    3109             : 
    3110             : template <>
    3111             : struct NeonVec<64, SIGNED> {
    3112             :     typedef int64x2_t type;
    3113             : };
    3114             : 
    3115             : template <typename T>
    3116             : struct NeonVecFor {
    3117             :     // See 'class divider' for an explanation of these template parameters.
    3118             :     typedef typename NeonVec<sizeof(T) * 8, (((T)0 >> 0) > (T)(-1) ? SIGNED : UNSIGNED)>::type type;
    3119             : };
    3120             : 
    3121             : #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE)                    \
    3122             :     LIBDIVIDE_INLINE typename NeonVecFor<INT_TYPE>::type divide( \
    3123             :         typename NeonVecFor<INT_TYPE>::type n) const {           \
    3124             :         return libdivide_##ALGO##_do_vec128(n, &denom);          \
    3125             :     }
    3126             : #else
    3127             : #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE)
    3128             : #endif
    3129             : 
    3130             : #if defined(LIBDIVIDE_SSE2)
    3131             : #define LIBDIVIDE_DIVIDE_SSE2(ALGO)                     \
    3132             :     LIBDIVIDE_INLINE __m128i divide(__m128i n) const {  \
    3133             :         return libdivide_##ALGO##_do_vec128(n, &denom); \
    3134             :     }
    3135             : #else
    3136             : #define LIBDIVIDE_DIVIDE_SSE2(ALGO)
    3137             : #endif
    3138             : 
    3139             : #if defined(LIBDIVIDE_AVX2)
    3140             : #define LIBDIVIDE_DIVIDE_AVX2(ALGO)                     \
    3141             :     LIBDIVIDE_INLINE __m256i divide(__m256i n) const {  \
    3142             :         return libdivide_##ALGO##_do_vec256(n, &denom); \
    3143             :     }
    3144             : #else
    3145             : #define LIBDIVIDE_DIVIDE_AVX2(ALGO)
    3146             : #endif
    3147             : 
    3148             : #if defined(LIBDIVIDE_AVX512)
    3149             : #define LIBDIVIDE_DIVIDE_AVX512(ALGO)                   \
    3150             :     LIBDIVIDE_INLINE __m512i divide(__m512i n) const {  \
    3151             :         return libdivide_##ALGO##_do_vec512(n, &denom); \
    3152             :     }
    3153             : #else
    3154             : #define LIBDIVIDE_DIVIDE_AVX512(ALGO)
    3155             : #endif
    3156             : 
    3157             : // The DISPATCHER_GEN() macro generates C++ methods (for the given integer
    3158             : // and algorithm types) that redirect to libdivide's C API.
    3159             : #define DISPATCHER_GEN(T, ALGO)                                                       \
    3160             :     libdivide_##ALGO##_t denom;                                                       \
    3161             :     LIBDIVIDE_INLINE dispatcher() {}                                                  \
    3162             :     explicit LIBDIVIDE_CONSTEXPR dispatcher(decltype(nullptr)) : denom{} {}              \
    3163             :     LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {}            \
    3164             :     LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \
    3165             :     LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \
    3166             :     LIBDIVIDE_DIVIDE_NEON(ALGO, T)                                                    \
    3167             :     LIBDIVIDE_DIVIDE_SSE2(ALGO)                                                       \
    3168             :     LIBDIVIDE_DIVIDE_AVX2(ALGO)                                                       \
    3169             :     LIBDIVIDE_DIVIDE_AVX512(ALGO)
    3170             : 
    3171             : // The dispatcher selects a specific division algorithm for a given
    3172             : // width, signedness, and ALGO using partial template specialization.
    3173             : template <int _WIDTH, Signedness _SIGN, Branching _ALGO>
    3174             : struct dispatcher {};
    3175             : 
    3176             : template <>
    3177             : struct dispatcher<16, SIGNED, BRANCHFULL> {
    3178             :     DISPATCHER_GEN(int16_t, s16)
    3179             : };
    3180             : template <>
    3181             : struct dispatcher<16, SIGNED, BRANCHFREE> {
    3182             :     DISPATCHER_GEN(int16_t, s16_branchfree)
    3183             : };
    3184             : template <>
    3185             : struct dispatcher<16, UNSIGNED, BRANCHFULL> {
    3186       10036 :     DISPATCHER_GEN(uint16_t, u16)
    3187             : };
    3188             : template <>
    3189             : struct dispatcher<16, UNSIGNED, BRANCHFREE> {
    3190             :     DISPATCHER_GEN(uint16_t, u16_branchfree)
    3191             : };
    3192             : template <>
    3193             : struct dispatcher<32, SIGNED, BRANCHFULL> {
    3194             :     DISPATCHER_GEN(int32_t, s32)
    3195             : };
    3196             : template <>
    3197             : struct dispatcher<32, SIGNED, BRANCHFREE> {
    3198             :     DISPATCHER_GEN(int32_t, s32_branchfree)
    3199             : };
    3200             : template <>
    3201             : struct dispatcher<32, UNSIGNED, BRANCHFULL> {
    3202     5200722 :     DISPATCHER_GEN(uint32_t, u32)
    3203             : };
    3204             : template <>
    3205             : struct dispatcher<32, UNSIGNED, BRANCHFREE> {
    3206             :     DISPATCHER_GEN(uint32_t, u32_branchfree)
    3207             : };
    3208             : template <>
    3209             : struct dispatcher<64, SIGNED, BRANCHFULL> {
    3210             :     DISPATCHER_GEN(int64_t, s64)
    3211             : };
    3212             : template <>
    3213             : struct dispatcher<64, SIGNED, BRANCHFREE> {
    3214             :     DISPATCHER_GEN(int64_t, s64_branchfree)
    3215             : };
    3216             : template <>
    3217             : struct dispatcher<64, UNSIGNED, BRANCHFULL> {
    3218             :     DISPATCHER_GEN(uint64_t, u64)
    3219             : };
    3220             : template <>
    3221             : struct dispatcher<64, UNSIGNED, BRANCHFREE> {
    3222             :     DISPATCHER_GEN(uint64_t, u64_branchfree)
    3223             : };
    3224             : }  // namespace detail
    3225             : 
    3226             : #if defined(LIBDIVIDE_NEON)
    3227             : // Allow NeonVecFor outside of detail namespace.
    3228             : template <typename T>
    3229             : struct NeonVecFor {
    3230             :     typedef typename detail::NeonVecFor<T>::type type;
    3231             : };
    3232             : #endif
    3233             : 
    3234             : // This is the main divider class for use by the user (C++ API).
    3235             : // The actual division algorithm is selected using the dispatcher struct
    3236             : // based on the integer width and algorithm template parameters.
    3237             : template <typename T, Branching ALGO = BRANCHFULL>
    3238             : class divider {
    3239             :    private:
    3240             :     // Dispatch based on the size and signedness.
    3241             :     // We avoid using type_traits as it's not available in AVR.
    3242             :     // Detect signedness by checking if T(-1) is less than T(0).
    3243             :     // Also throw in a shift by 0, which prevents floating point types from being passed.
    3244             :     typedef detail::dispatcher<sizeof(T) * 8,
    3245             :         (((T)0 >> 0) > (T)(-1) ? detail::SIGNED : detail::UNSIGNED), ALGO>
    3246             :         dispatcher_t;
    3247             : 
    3248             :    public:
    3249             :     // We leave the default constructor empty so that creating
    3250             :     // an array of dividers and then initializing them
    3251             :     // later doesn't slow us down.
    3252             :     divider() {}
    3253             : 
    3254             :     // constexpr zero-initialization to allow for use w/ static constinit
    3255             :     explicit LIBDIVIDE_CONSTEXPR divider(decltype(nullptr)) : div(nullptr) {}
    3256             : 
    3257             :     // Constructor that takes the divisor as a parameter
    3258      212260 :     LIBDIVIDE_INLINE divider(T d) : div(d) {}
    3259             : 
    3260             :     // Divides n by the divisor
    3261    10186648 :     LIBDIVIDE_INLINE T divide(T n) const { return div.divide(n); }
    3262             : 
    3263             :     // Recovers the divisor, returns the value that was
    3264             :     // used to initialize this divider object.
    3265             :     T recover() const { return div.recover(); }
    3266             : 
    3267             :     bool operator==(const divider<T, ALGO> &other) const {
    3268             :         return div.denom.magic == other.div.denom.magic && div.denom.more == other.div.denom.more;
    3269             :     }
    3270             : 
    3271             :     bool operator!=(const divider<T, ALGO> &other) const { return !(*this == other); }
    3272             : 
    3273             :     // Vector variants treat the input as packed integer values with the same type as the divider
    3274             :     // (e.g. s32, u32, s64, u64) and divides each of them by the divider, returning the packed
    3275             :     // quotients.
    3276             : #if defined(LIBDIVIDE_SSE2)
    3277       11302 :     LIBDIVIDE_INLINE __m128i divide(__m128i n) const { return div.divide(n); }
    3278             : #endif
    3279             : #if defined(LIBDIVIDE_AVX2)
    3280             :     LIBDIVIDE_INLINE __m256i divide(__m256i n) const { return div.divide(n); }
    3281             : #endif
    3282             : #if defined(LIBDIVIDE_AVX512)
    3283             :     LIBDIVIDE_INLINE __m512i divide(__m512i n) const { return div.divide(n); }
    3284             : #endif
    3285             : #if defined(LIBDIVIDE_NEON)
    3286             :     LIBDIVIDE_INLINE typename NeonVecFor<T>::type divide(typename NeonVecFor<T>::type n) const {
    3287             :         return div.divide(n);
    3288             :     }
    3289             : #endif
    3290             : 
    3291             :    private:
    3292             :     // Storage for the actual divisor
    3293             :     dispatcher_t div;
    3294             : };
    3295             : 
    3296             : // Overload of operator / for scalar division
    3297             : template <typename T, Branching ALGO>
    3298             : LIBDIVIDE_INLINE T operator/(T n, const divider<T, ALGO> &div) {
    3299     5093324 :     return div.divide(n);
    3300             : }
    3301             : 
    3302             : // Overload of operator /= for scalar division
    3303             : template <typename T, Branching ALGO>
    3304             : LIBDIVIDE_INLINE T &operator/=(T &n, const divider<T, ALGO> &div) {
    3305             :     n = div.divide(n);
    3306             :     return n;
    3307             : }
    3308             : 
    3309             : // Overloads for vector types.
    3310             : #if defined(LIBDIVIDE_SSE2)
    3311             : template <typename T, Branching ALGO>
    3312             : LIBDIVIDE_INLINE __m128i operator/(__m128i n, const divider<T, ALGO> &div) {
    3313             :     return div.divide(n);
    3314             : }
    3315             : 
    3316             : template <typename T, Branching ALGO>
    3317             : LIBDIVIDE_INLINE __m128i operator/=(__m128i &n, const divider<T, ALGO> &div) {
    3318       11302 :     n = div.divide(n);
    3319       11302 :     return n;
    3320             : }
    3321             : #endif
    3322             : #if defined(LIBDIVIDE_AVX2)
    3323             : template <typename T, Branching ALGO>
    3324             : LIBDIVIDE_INLINE __m256i operator/(__m256i n, const divider<T, ALGO> &div) {
    3325             :     return div.divide(n);
    3326             : }
    3327             : 
    3328             : template <typename T, Branching ALGO>
    3329             : LIBDIVIDE_INLINE __m256i operator/=(__m256i &n, const divider<T, ALGO> &div) {
    3330             :     n = div.divide(n);
    3331             :     return n;
    3332             : }
    3333             : #endif
    3334             : #if defined(LIBDIVIDE_AVX512)
    3335             : template <typename T, Branching ALGO>
    3336             : LIBDIVIDE_INLINE __m512i operator/(__m512i n, const divider<T, ALGO> &div) {
    3337             :     return div.divide(n);
    3338             : }
    3339             : 
    3340             : template <typename T, Branching ALGO>
    3341             : LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider<T, ALGO> &div) {
    3342             :     n = div.divide(n);
    3343             :     return n;
    3344             : }
    3345             : #endif
    3346             : 
    3347             : #if defined(LIBDIVIDE_NEON)
    3348             : template <typename T, Branching ALGO>
    3349             : LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(
    3350             :     typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) {
    3351             :     return div.divide(n);
    3352             : }
    3353             : 
    3354             : template <typename T, Branching ALGO>
    3355             : LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(
    3356             :     typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) {
    3357             :     n = div.divide(n);
    3358             :     return n;
    3359             : }
    3360             : #endif
    3361             : 
    3362             : #if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900)
    3363             : // libdivide::branchfree_divider<T>
    3364             : template <typename T>
    3365             : using branchfree_divider = divider<T, BRANCHFREE>;
    3366             : #endif
    3367             : 
    3368             : }  // namespace libdivide
    3369             : 
    3370             : #endif  // __cplusplus
    3371             : 
    3372             : #if defined(_MSC_VER) && !defined(__clang__)
    3373             : #pragma warning(pop)
    3374             : #endif
    3375             : 
    3376             : #endif  // LIBDIVIDE_H

Generated by: LCOV version 1.14