LCOV - gdal_filtered.info

LCOV - code coverage report

Current view:	top level - port - utf8.h (source / functions)		Hit	Total	Coverage
Test:	gdal_filtered.info	Lines:	70	337	20.8 %
Date:	2025-01-18 12:42:00	Functions:	4	4	100.0 %

          Line data    Source code

       1             : // NOTE: for GDAL, this is an extract from the https://github.com/sheredom/utf8.h
       2             : // code
       3             : 
       4             : /* The latest version of this library is available on GitHub;
       5             :  * https://github.com/sheredom/utf8.h */
       6             : 
       7             : /* This is free and unencumbered software released into the public domain.
       8             :  *
       9             :  * Anyone is free to copy, modify, publish, use, compile, sell, or
      10             :  * distribute this software, either in source code form or as a compiled
      11             :  * binary, for any purpose, commercial or non-commercial, and by any
      12             :  * means.
      13             :  *
      14             :  * In jurisdictions that recognize copyright laws, the author or authors
      15             :  * of this software dedicate any and all copyright interest in the
      16             :  * software to the public domain. We make this dedication for the benefit
      17             :  * of the public at large and to the detriment of our heirs and
      18             :  * successors. We intend this dedication to be an overt act of
      19             :  * relinquishment in perpetuity of all present and future rights to this
      20             :  * software under copyright law.
      21             :  *
      22             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
      23             :  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
      24             :  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
      25             :  * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
      26             :  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
      27             :  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
      28             :  * OTHER DEALINGS IN THE SOFTWARE.
      29             :  *
      30             :  * For more information, please refer to <http://unlicense.org/> */
      31             : 
      32             : #ifndef SHEREDOM_UTF8_H_INCLUDED
      33             : #define SHEREDOM_UTF8_H_INCLUDED
      34             : 
      35             : #if defined(_MSC_VER)
      36             : #pragma warning(push)
      37             : 
      38             : /* disable warning: no function prototype given: converting '()' to '(void)' */
      39             : #pragma warning(disable : 4255)
      40             : 
      41             : /* disable warning: '__cplusplus' is not defined as a preprocessor macro,
      42             :  * replacing with '0' for '#if/#elif' */
      43             : #pragma warning(disable : 4668)
      44             : 
      45             : /* disable warning: bytes padding added after construct */
      46             : #pragma warning(disable : 4820)
      47             : #endif
      48             : 
      49             : #include <stddef.h>
      50             : #include <stdlib.h>
      51             : 
      52             : #if defined(_MSC_VER)
      53             : #pragma warning(pop)
      54             : #endif
      55             : 
      56             : #if defined(_MSC_VER) && (_MSC_VER < 1920)
      57             : typedef __int32 utf8_int32_t;
      58             : #else
      59             : #include <stdint.h>
      60             : typedef int32_t utf8_int32_t;
      61             : #endif
      62             : 
      63             : #if defined(__clang__)
      64             : #pragma clang diagnostic push
      65             : #pragma clang diagnostic ignored "-Wold-style-cast"
      66             : #pragma clang diagnostic ignored "-Wcast-qual"
      67             : 
      68             : #if __has_warning("-Wunsafe-buffer-usage")
      69             : #pragma clang diagnostic ignored "-Wunsafe-buffer-usage"
      70             : #endif
      71             : #endif
      72             : 
      73             : namespace {
      74             : 
      75             : #if defined(_MSC_VER)
      76             : #define utf8_nonnull
      77             : #define utf8_pure
      78             : #define utf8_restrict __restrict
      79             : #define utf8_weak __inline
      80             : #elif defined(__clang__) || defined(__GNUC__)
      81             : #define utf8_nonnull __attribute__((nonnull))
      82             : #define utf8_pure __attribute__((pure))
      83             : #define utf8_restrict __restrict__
      84             : #define utf8_weak __attribute__((weak))
      85             : #else
      86             : #define utf8_nonnull
      87             : #define utf8_pure
      88             : #define utf8_restrict
      89             : #define utf8_weak
      90             : #endif
      91             : 
      92             : #ifdef __cplusplus
      93             : #define utf8_null NULL
      94             : #else
      95             : #define utf8_null 0
      96             : #endif
      97             : 
      98             : #if (defined(__cplusplus) && __cplusplus >= 201402L)
      99             : #define utf8_constexpr14 constexpr
     100             : #define utf8_constexpr14_impl constexpr
     101             : #else
     102             : /* constexpr and weak are incompatible, so only enable one of them. */
     103             : #define utf8_constexpr14 utf8_weak
     104             : #define utf8_constexpr14_impl
     105             : #endif
     106             : 
     107             : #if defined(__cplusplus) && __cplusplus >= 202002L
     108             : using utf8_int8_t = char8_t; /* Introduced in C++20 */
     109             : #else
     110             : typedef char utf8_int8_t;
     111             : #endif
     112             : 
     113             : #if 0
     114             : /* Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
     115             :  * src2 respectively, case insensitive. */
     116             : utf8_constexpr14 utf8_nonnull utf8_pure int
     117             : utf8casecmp(const utf8_int8_t *src1, const utf8_int8_t *src2);
     118             : 
     119             : /* Append the utf8 string src onto the utf8 string dst. */
     120             : utf8_nonnull utf8_weak utf8_int8_t *
     121             : utf8cat(utf8_int8_t *utf8_restrict dst, const utf8_int8_t *utf8_restrict src);
     122             : 
     123             : /* Find the first match of the utf8 codepoint chr in the utf8 string src. */
     124             : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
     125             : utf8chr(const utf8_int8_t *src, utf8_int32_t chr);
     126             : 
     127             : /* Return less than 0, 0, greater than 0 if src1 < src2,
     128             :  * src1 == src2, src1 > src2 respectively. */
     129             : utf8_constexpr14 utf8_nonnull utf8_pure int utf8cmp(const utf8_int8_t *src1,
     130             :                                                     const utf8_int8_t *src2);
     131             : 
     132             : /* Copy the utf8 string src onto the memory allocated in dst. */
     133             : utf8_nonnull utf8_weak utf8_int8_t *
     134             : utf8cpy(utf8_int8_t *utf8_restrict dst, const utf8_int8_t *utf8_restrict src);
     135             : 
     136             : /* Number of utf8 codepoints in the utf8 string src that consists entirely
     137             :  * of utf8 codepoints not from the utf8 string reject. */
     138             : utf8_constexpr14 utf8_nonnull utf8_pure size_t
     139             : utf8cspn(const utf8_int8_t *src, const utf8_int8_t *reject);
     140             : 
     141             : /* Duplicate the utf8 string src by getting its size, malloc'ing a new buffer
     142             :  * copying over the data, and returning that. Or 0 if malloc failed. */
     143             : utf8_weak utf8_int8_t *utf8dup(const utf8_int8_t *src);
     144             : 
     145             : /* Number of utf8 codepoints in the utf8 string str,
     146             :  * excluding the null terminating byte. */
     147             : utf8_constexpr14 utf8_nonnull utf8_pure size_t utf8len(const utf8_int8_t *str);
     148             : 
     149             : /* Similar to utf8len, except that at most n bytes of str are examined. */
     150             : utf8_constexpr14 utf8_nonnull utf8_pure size_t utf8nlen(const utf8_int8_t *str,
     151             :                                                         size_t n);
     152             : 
     153             : /* Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
     154             :  * src2 respectively, case insensitive. Checking at most n bytes of each utf8
     155             :  * string. */
     156             : utf8_constexpr14 utf8_nonnull utf8_pure int
     157             : utf8ncasecmp(const utf8_int8_t *src1, const utf8_int8_t *src2, size_t n);
     158             : 
     159             : /* Append the utf8 string src onto the utf8 string dst,
     160             :  * writing at most n+1 bytes. Can produce an invalid utf8
     161             :  * string if n falls partway through a utf8 codepoint. */
     162             : utf8_nonnull utf8_weak utf8_int8_t *
     163             : utf8ncat(utf8_int8_t *utf8_restrict dst, const utf8_int8_t *utf8_restrict src,
     164             :          size_t n);
     165             : 
     166             : /* Return less than 0, 0, greater than 0 if src1 < src2,
     167             :  * src1 == src2, src1 > src2 respectively. Checking at most n
     168             :  * bytes of each utf8 string. */
     169             : utf8_constexpr14 utf8_nonnull utf8_pure int
     170             : utf8ncmp(const utf8_int8_t *src1, const utf8_int8_t *src2, size_t n);
     171             : 
     172             : /* Copy the utf8 string src onto the memory allocated in dst.
     173             :  * Copies at most n bytes. If n falls partway through a utf8
     174             :  * codepoint, or if dst doesn't have enough room for a null
     175             :  * terminator, the final string will be cut short to preserve
     176             :  * utf8 validity. */
     177             : 
     178             : utf8_nonnull utf8_weak utf8_int8_t *
     179             : utf8ncpy(utf8_int8_t *utf8_restrict dst, const utf8_int8_t *utf8_restrict src,
     180             :          size_t n);
     181             : 
     182             : /* Similar to utf8dup, except that at most n bytes of src are copied. If src is
     183             :  * longer than n, only n bytes are copied and a null byte is added.
     184             :  *
     185             :  * Returns a new string if successful, 0 otherwise */
     186             : utf8_weak utf8_int8_t *utf8ndup(const utf8_int8_t *src, size_t n);
     187             : 
     188             : /* Locates the first occurrence in the utf8 string str of any byte in the
     189             :  * utf8 string accept, or 0 if no match was found. */
     190             : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
     191             : utf8pbrk(const utf8_int8_t *str, const utf8_int8_t *accept);
     192             : 
     193             : /* Find the last match of the utf8 codepoint chr in the utf8 string src. */
     194             : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
     195             : utf8rchr(const utf8_int8_t *src, int chr);
     196             : 
     197             : /* Number of bytes in the utf8 string str,
     198             :  * including the null terminating byte. */
     199             : utf8_constexpr14 utf8_nonnull utf8_pure size_t utf8size(const utf8_int8_t *str);
     200             : 
     201             : /* Similar to utf8size, except that the null terminating byte is excluded. */
     202             : utf8_constexpr14 utf8_nonnull utf8_pure size_t
     203             : utf8size_lazy(const utf8_int8_t *str);
     204             : 
     205             : /* Similar to utf8size, except that at most n bytes of str are examined and
     206             :  * the null terminating byte is excluded. */
     207             : utf8_constexpr14 utf8_nonnull utf8_pure size_t
     208             : utf8nsize_lazy(const utf8_int8_t *str, size_t n);
     209             : 
     210             : /* Number of utf8 codepoints in the utf8 string src that consists entirely
     211             :  * of utf8 codepoints from the utf8 string accept. */
     212             : utf8_constexpr14 utf8_nonnull utf8_pure size_t
     213             : utf8spn(const utf8_int8_t *src, const utf8_int8_t *accept);
     214             : 
     215             : /* The position of the utf8 string needle in the utf8 string haystack. */
     216             : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
     217             : utf8str(const utf8_int8_t *haystack, const utf8_int8_t *needle);
     218             : 
     219             : /* The position of the utf8 string needle in the utf8 string haystack, case
     220             :  * insensitive. */
     221             : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
     222             : utf8casestr(const utf8_int8_t *haystack, const utf8_int8_t *needle);
     223             : 
     224             : /* Return 0 on success, or the position of the invalid
     225             :  * utf8 codepoint on failure. */
     226             : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
     227             : utf8valid(const utf8_int8_t *str);
     228             : 
     229             : /* Similar to utf8valid, except that at most n bytes of str are examined. */
     230             : utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
     231             : utf8nvalid(const utf8_int8_t *str, size_t n);
     232             : 
     233             : /* Given a null-terminated string, makes the string valid by replacing invalid
     234             :  * codepoints with a 1-byte replacement. Returns 0 on success. */
     235             : utf8_nonnull utf8_weak int utf8makevalid(utf8_int8_t *str,
     236             :                                          const utf8_int32_t replacement);
     237             : #endif
     238             : /* Sets out_codepoint to the current utf8 codepoint in str, and returns the
     239             :  * address of the next utf8 codepoint after the current one in str. */
     240             : utf8_constexpr14 utf8_nonnull utf8_int8_t *
     241             : utf8codepoint(const utf8_int8_t *utf8_restrict str,
     242             :               utf8_int32_t *utf8_restrict out_codepoint);
     243             : 
     244             : /* Calculates the size of the next utf8 codepoint in str. */
     245             : utf8_constexpr14 utf8_nonnull size_t
     246             : utf8codepointcalcsize(const utf8_int8_t *str);
     247             : 
     248             : #if 0
     249             : /* Returns the size of the given codepoint in bytes. */
     250             : utf8_constexpr14 size_t utf8codepointsize(utf8_int32_t chr);
     251             : 
     252             : /* Write a codepoint to the given string, and return the address to the next
     253             :  * place after the written codepoint. Pass how many bytes left in the buffer to
     254             :  * n. If there is not enough space for the codepoint, this function returns
     255             :  * null. */
     256             : utf8_nonnull utf8_weak utf8_int8_t *
     257             : utf8catcodepoint(utf8_int8_t *str, utf8_int32_t chr, size_t n);
     258             : 
     259             : /* Returns 1 if the given character is lowercase, or 0 if it is not. */
     260             : utf8_constexpr14 int utf8islower(utf8_int32_t chr);
     261             : 
     262             : /* Returns 1 if the given character is uppercase, or 0 if it is not. */
     263             : utf8_constexpr14 int utf8isupper(utf8_int32_t chr);
     264             : 
     265             : /* Transform the given string into all lowercase codepoints. */
     266             : utf8_nonnull utf8_weak void utf8lwr(utf8_int8_t *utf8_restrict str);
     267             : 
     268             : /* Transform the given string into all uppercase codepoints. */
     269             : utf8_nonnull utf8_weak void utf8upr(utf8_int8_t *utf8_restrict str);
     270             : #endif
     271             : 
     272             : /* Make a codepoint lower case if possible. */
     273             : utf8_constexpr14 utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp);
     274             : 
     275             : /* Make a codepoint upper case if possible. */
     276             : utf8_constexpr14 utf8_int32_t utf8uprcodepoint(utf8_int32_t cp);
     277             : 
     278             : #if 0
     279             : /* Sets out_codepoint to the current utf8 codepoint in str, and returns the
     280             :  * address of the previous utf8 codepoint before the current one in str. */
     281             : utf8_constexpr14 utf8_nonnull utf8_int8_t *
     282             : utf8rcodepoint(const utf8_int8_t *utf8_restrict str,
     283             :                utf8_int32_t *utf8_restrict out_codepoint);
     284             : 
     285             : /* Duplicate the utf8 string src by getting its size, calling alloc_func_ptr to
     286             :  * copy over data to a new buffer, and returning that. Or 0 if alloc_func_ptr
     287             :  * returned null. */
     288             : utf8_weak utf8_int8_t *utf8dup_ex(const utf8_int8_t *src,
     289             :                                   utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *,
     290             :                                                                  size_t),
     291             :                                   utf8_int8_t *user_data);
     292             : 
     293             : /* Similar to utf8dup, except that at most n bytes of src are copied. If src is
     294             :  * longer than n, only n bytes are copied and a null byte is added.
     295             :  *
     296             :  * Returns a new string if successful, 0 otherwise. */
     297             : utf8_weak utf8_int8_t *utf8ndup_ex(const utf8_int8_t *src, size_t n,
     298             :                                    utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *,
     299             :                                                                   size_t),
     300             :                                    utf8_int8_t *user_data);
     301             : #endif
     302             : 
     303             : #undef utf8_weak
     304             : #undef utf8_pure
     305             : #undef utf8_nonnull
     306             : 
     307             : #if 0
     308             : utf8_constexpr14_impl int utf8casecmp(const utf8_int8_t *src1,
     309             :                                       const utf8_int8_t *src2) {
     310             :   utf8_int32_t src1_lwr_cp = 0, src2_lwr_cp = 0, src1_upr_cp = 0,
     311             :                src2_upr_cp = 0, src1_orig_cp = 0, src2_orig_cp = 0;
     312             :   /* case-insensitive compare: one codepoint from each string per pass */
     313             :   for (;;) {
     314             :     src1 = utf8codepoint(src1, &src1_orig_cp);
     315             :     src2 = utf8codepoint(src2, &src2_orig_cp);
     316             : 
     317             :     /* lower the srcs if required */
     318             :     src1_lwr_cp = utf8lwrcodepoint(src1_orig_cp);
     319             :     src2_lwr_cp = utf8lwrcodepoint(src2_orig_cp);
     320             : 
     321             :     /* upper the srcs if required */
     322             :     src1_upr_cp = utf8uprcodepoint(src1_orig_cp);
     323             :     src2_upr_cp = utf8uprcodepoint(src2_orig_cp);
     324             : 
     325             :     /* equal if both ended; keep going if lower or upper forms match */
     326             :     if ((0 == src1_orig_cp) && (0 == src2_orig_cp)) {
     327             :       return 0;
     328             :     } else if ((src1_lwr_cp == src2_lwr_cp) || (src1_upr_cp == src2_upr_cp)) {
     329             :       continue;
     330             :     }
     331             : 
     332             :     /* if they don't match, return the difference of the lowered codepoints
     333             :      */
     334             :     return src1_lwr_cp - src2_lwr_cp;
     335             :   }
     336             : }
     337             : 
     338             : utf8_int8_t *utf8cat(utf8_int8_t *utf8_restrict dst,
     339             :                      const utf8_int8_t *utf8_restrict src) {
     340             :   utf8_int8_t *d = dst; /* write cursor; dst is kept for the return value */
     341             :   /* find the null terminating byte in dst */
     342             :   while ('\0' != *d) {
     343             :     d++;
     344             :   }
     345             : 
     346             :   /* overwriting the null terminating byte in dst, append src byte-by-byte;
     347             :    * no size is checked here, so dst must have room for the result */
     348             :   while ('\0' != *src) {
     349             :     *d++ = *src++;
     350             :   }
     351             : 
     352             :   /* write out a new null terminating byte into dst */
     353             :   *d = '\0';
     354             :   /* hand back the start of dst, not the cursor */
     355             :   return dst;
     356             : }
     356             : 
     357             : utf8_constexpr14_impl utf8_int8_t *utf8chr(const utf8_int8_t *src,
     358             :                                            utf8_int32_t chr) {
     359             :   utf8_int8_t c[5] = {'\0', '\0', '\0', '\0', '\0'};
     360             :   /* c receives the utf8 encoding of chr; the zero fill keeps it terminated */
     361             :   if (0 == chr) {
     362             :     /* being asked to return position of null terminating byte, so
     363             :      * just run src to the end, and return! */
     364             :     while ('\0' != *src) {
     365             :       src++;
     366             :     }
     367             :     return (utf8_int8_t *)src;
     368             :   } else if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
     369             :     /* 1-byte/7-bit ascii
     370             :      * (0b0xxxxxxx) */
     371             :     c[0] = (utf8_int8_t)chr;
     372             :   } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
     373             :     /* 2-byte/11-bit utf8 code point
     374             :      * (0b110xxxxx 0b10xxxxxx) */
     375             :     c[0] = (utf8_int8_t)(0xc0 | (utf8_int8_t)(chr >> 6));
     376             :     c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
     377             :   } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
     378             :     /* 3-byte/16-bit utf8 code point
     379             :      * (0b1110xxxx 0b10xxxxxx 0b10xxxxxx) */
     380             :     c[0] = (utf8_int8_t)(0xe0 | (utf8_int8_t)(chr >> 12));
     381             :     c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
     382             :     c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
     383             :   } else { /* if (0 == ((int)0xffe00000 & chr)) { */
     384             :     /* 4-byte/21-bit utf8 code point
     385             :      * (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx) */
     386             :     c[0] = (utf8_int8_t)(0xf0 | (utf8_int8_t)(chr >> 18));
     387             :     c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 12) & 0x3f));
     388             :     c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
     389             :     c[3] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
     390             :   }
     391             : 
     392             :   /* c now holds the encoding of chr followed by a null terminating byte,
     393             :    * i.e. a utf8 string containing just the codepoint we are seeking. Now
     394             :    * use utf8str to search */
     395             :   return utf8str(src, c);
     396             : }
     397             : 
     398             : utf8_constexpr14_impl int utf8cmp(const utf8_int8_t *src1,
     399             :                                   const utf8_int8_t *src2) {
     400             :   while (('\0' != *src1) || ('\0' != *src2)) {
     401             :     if ((unsigned char)*src1 < (unsigned char)*src2) {
     402             :       return -1;
     403             :     } else if ((unsigned char)*src1 > (unsigned char)*src2) {
     404             :       return 1;
     405             :     }
     406             :     /* equal bytes (compared unsigned, so char signedness is irrelevant) */
     407             :     src1++;
     408             :     src2++;
     409             :   }
     410             : 
     411             :   /* both utf8 strings matched up to and including the terminator */
     412             :   return 0;
     413             : }
     414             : 
     415             : utf8_constexpr14_impl int utf8coll(const utf8_int8_t *src1,
     416             :                                    const utf8_int8_t *src2);
     417             : 
     418             : utf8_int8_t *utf8cpy(utf8_int8_t *utf8_restrict dst,
     419             :                      const utf8_int8_t *utf8_restrict src) {
     420             :   utf8_int8_t *d = dst; /* write cursor; dst is kept for the return value */
     421             : 
     422             :   /* overwriting anything previously in dst, write byte-by-byte
     423             :    * from src; no size is checked, so dst must be large enough */
     424             :   while ('\0' != *src) {
     425             :     *d++ = *src++;
     426             :   }
     427             : 
     428             :   /* append null terminating byte */
     429             :   *d = '\0';
     430             :   /* hand back the start of dst, not the cursor */
     431             :   return dst;
     432             : }
     433             : 
     434             : utf8_constexpr14_impl size_t utf8cspn(const utf8_int8_t *src,
     435             :                                       const utf8_int8_t *reject) {
     436             :   size_t chars = 0; /* codepoints of src accepted so far */
     437             : 
     438             :   while ('\0' != *src) {
     439             :     const utf8_int8_t *r = reject; /* rescan reject for each src codepoint */
     440             :     size_t offset = 0; /* bytes of the current src codepoint matched so far */
     441             : 
     442             :     while ('\0' != *r) {
     443             :       /* checking that if *r is the start of a utf8 codepoint
     444             :        * (it is not 0b10xxxxxx) and we have successfully matched
     445             :        * a previous character (0 < offset) - we found a match */
     446             :       if ((0x80 != (0xc0 & *r)) && (0 < offset)) {
     447             :         return chars;
     448             :       } else {
     449             :         if (*r == src[offset]) {
     450             :           /* part of a utf8 codepoint matched, so move our checking
     451             :            * onwards to the next byte */
     452             :           offset++;
     453             :           r++;
     454             :         } else {
     455             :           /* r could be in the middle of an unmatching utf8 code point,
     456             :            * so we need to march it on to the next character beginning. */
     457             : 
     458             :           do {
     459             :             r++;
     460             :           } while (0x80 == (0xc0 & *r));
     461             : 
     462             :           /* reset offset too as we found a mismatch */
     463             :           offset = 0;
     464             :         }
     465             :       }
     466             :     }
     467             : 
     468             :     /* found a match at the end of *r, so didn't get a chance to test it */
     469             :     if (0 < offset) {
     470             :       return chars;
     471             :     }
     472             : 
     473             :     /* the current utf8 codepoint in src did not match reject, but src
     474             :      * could have been partway through a utf8 codepoint, so we need to
     475             :      * march it onto the next utf8 codepoint starting byte */
     476             :     do {
     477             :       src++;
     478             :     } while ((0x80 == (0xc0 & *src)));
     479             :     chars++;
     480             :   }
     481             :   /* reached the end of src without meeting any codepoint from reject */
     482             :   return chars;
     483             : }
     484             : 
     485             : utf8_int8_t *utf8dup(const utf8_int8_t *src) {
     486             :   return utf8dup_ex(src, utf8_null, utf8_null); /* no custom allocator */
     487             : }
     488             : 
     489             : utf8_int8_t *utf8dup_ex(const utf8_int8_t *src,
     490             :                         utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *, size_t),
     491             :                         utf8_int8_t *user_data) {
     492             :   utf8_int8_t *n = utf8_null; /* the new buffer; null until allocated */
     493             : 
     494             :   /* figure out how many bytes (including the terminator) we need to copy first
     495             :    */
     496             :   size_t bytes = utf8size(src);
     497             :   /* use the caller's allocator when given, else malloc unless compiled out */
     498             :   if (alloc_func_ptr) {
     499             :     n = alloc_func_ptr(user_data, bytes);
     500             :   } else {
     501             : #if !defined(UTF8_NO_STD_MALLOC)
     502             :     n = (utf8_int8_t *)malloc(bytes);
     503             : #else
     504             :     return utf8_null;
     505             : #endif
     506             :   }
     507             : 
     508             :   if (utf8_null == n) {
     509             :     /* the allocator returned null, so we bail */
     510             :     return utf8_null;
     511             :   } else {
     512             :     bytes = 0; /* reused from here on as the copy index */
     513             : 
     514             :     /* copy src byte-by-byte into our new utf8 string */
     515             :     while ('\0' != src[bytes]) {
     516             :       n[bytes] = src[bytes];
     517             :       bytes++;
     518             :     }
     519             : 
     520             :     /* append null terminating byte */
     521             :     n[bytes] = '\0';
     522             :     return n;
     523             :   }
     524             : }
     525             : 
     526             : utf8_constexpr14_impl utf8_int8_t *utf8fry(const utf8_int8_t *str);
     527             : 
     528             : utf8_constexpr14_impl size_t utf8len(const utf8_int8_t *str) {
     529             :   return utf8nlen(str, SIZE_MAX); /* largest byte limit utf8nlen accepts */
     530             : }
     531             : 
     532             : utf8_constexpr14_impl size_t utf8nlen(const utf8_int8_t *str, size_t n) {
     533             :   const utf8_int8_t *t = str; /* start of the string, for byte offsets */
     534             :   size_t length = 0; /* codepoints counted so far */
     535             :   /* step by lead byte until the n-byte limit or the terminator is reached */
     536             :   while ((size_t)(str - t) < n && '\0' != *str) {
     537             :     if (0xf0 == (0xf8 & *str)) {
     538             :       /* 4-byte utf8 code point (began with 0b11110xxx) */
     539             :       str += 4;
     540             :     } else if (0xe0 == (0xf0 & *str)) {
     541             :       /* 3-byte utf8 code point (began with 0b1110xxxx) */
     542             :       str += 3;
     543             :     } else if (0xc0 == (0xe0 & *str)) {
     544             :       /* 2-byte utf8 code point (began with 0b110xxxxx) */
     545             :       str += 2;
     546             :     } else { /* if (0x00 == (0x80 & *str)) { */
     547             :       /* 1-byte ascii (began with 0b0xxxxxxx) */
     548             :       str += 1;
     549             :     }
     550             : 
     551             :     /* no matter the bytes we marched str forward by, it was
     552             :      * only 1 utf8 codepoint */
     553             :     length++;
     554             :   }
     555             :   /* the last codepoint ran past the n-byte limit, so do not count it */
     556             :   if ((size_t)(str - t) > n) {
     557             :     length--;
     558             :   }
     559             :   return length;
     560             : }
     561             : 
     562             : utf8_constexpr14_impl int utf8ncasecmp(const utf8_int8_t *src1,
     563             :                                        const utf8_int8_t *src2, size_t n) {
     564             :   utf8_int32_t src1_lwr_cp = 0, src2_lwr_cp = 0, src1_upr_cp = 0,
     565             :                src2_upr_cp = 0, src1_orig_cp = 0, src2_orig_cp = 0;
     566             : 
     567             :   do {
     568             :     const utf8_int8_t *const s1 = src1;
     569             :     const utf8_int8_t *const s2 = src2;
     570             : 
     571             :     /* first check that we have enough bytes left in n to contain an entire
     572             :      * codepoint */
     573             :     if (0 == n) {
     574             :       return 0;
     575             :     }
     576             : 
     577             :     if ((1 == n) && ((0xc0 == (0xe0 & *s1)) || (0xc0 == (0xe0 & *s2)))) {
     578             :       const utf8_int32_t c1 = (0xe0 & *s1);
     579             :       const utf8_int32_t c2 = (0xe0 & *s2);
     580             : 
     581             :       if (c1 < c2) {
     582             :         return c1 - c2;
     583             :       } else {
     584             :         return 0;
     585             :       }
     586             :     }
     587             : 
     588             :     if ((2 >= n) && ((0xe0 == (0xf0 & *s1)) || (0xe0 == (0xf0 & *s2)))) {
     589             :       const utf8_int32_t c1 = (0xf0 & *s1);
     590             :       const utf8_int32_t c2 = (0xf0 & *s2);
     591             : 
     592             :       if (c1 < c2) {
     593             :         return c1 - c2;
     594             :       } else {
     595             :         return 0;
     596             :       }
     597             :     }
     598             : 
     599             :     if ((3 >= n) && ((0xf0 == (0xf8 & *s1)) || (0xf0 == (0xf8 & *s2)))) {
     600             :       const utf8_int32_t c1 = (0xf8 & *s1);
     601             :       const utf8_int32_t c2 = (0xf8 & *s2);
     602             : 
     603             :       if (c1 < c2) {
     604             :         return c1 - c2;
     605             :       } else {
     606             :         return 0;
     607             :       }
     608             :     }
     609             : 
     610             :     src1 = utf8codepoint(src1, &src1_orig_cp);
     611             :     src2 = utf8codepoint(src2, &src2_orig_cp);
     612             :     n -= utf8codepointsize(src1_orig_cp);
     613             : 
     614             :     src1_lwr_cp = utf8lwrcodepoint(src1_orig_cp);
     615             :     src2_lwr_cp = utf8lwrcodepoint(src2_orig_cp);
     616             : 
     617             :     src1_upr_cp = utf8uprcodepoint(src1_orig_cp);
     618             :     src2_upr_cp = utf8uprcodepoint(src2_orig_cp);
     619             : 
     620             :     /* check if the lowered codepoints match */
     621             :     if ((0 == src1_orig_cp) && (0 == src2_orig_cp)) {
     622             :       return 0;
     623             :     } else if ((src1_lwr_cp == src2_lwr_cp) || (src1_upr_cp == src2_upr_cp)) {
     624             :       continue;
     625             :     }
     626             : 
     627             :     /* if they don't match, then we return the difference between the characters
     628             :      */
     629             :     return src1_lwr_cp - src2_lwr_cp;
     630             :   } while (0 < n);
     631             : 
     632             :   /* both utf8 strings matched */
     633             :   return 0;
     634             : }
     635             : 
     636             : utf8_int8_t *utf8ncat(utf8_int8_t *utf8_restrict dst,
     637             :                       const utf8_int8_t *utf8_restrict src, size_t n) {
     638             :   utf8_int8_t *d = dst;
     639             : 
     640             :   /* find the null terminating byte in dst */
     641             :   while ('\0' != *d) {
     642             :     d++;
     643             :   }
     644             : 
     645             :   /* overwriting the null terminating byte in dst, append src byte-by-byte
     646             :    * stopping if we run out of space */
     647             :   while (('\0' != *src) && (0 != n--)) {
     648             :     *d++ = *src++;
     649             :   }
     650             : 
     651             :   /* write out a new null terminating byte into dst */
     652             :   *d = '\0';
     653             : 
     654             :   return dst;
     655             : }
     656             : 
     657             : utf8_constexpr14_impl int utf8ncmp(const utf8_int8_t *src1,
     658             :                                    const utf8_int8_t *src2, size_t n) {
     659             :   while ((0 != n--) && (('\0' != *src1) || ('\0' != *src2))) {
     660             :     if (*src1 < *src2) {
     661             :       return -1;
     662             :     } else if (*src1 > *src2) {
     663             :       return 1;
     664             :     }
     665             : 
     666             :     src1++;
     667             :     src2++;
     668             :   }
     669             : 
     670             :   /* both utf8 strings matched */
     671             :   return 0;
     672             : }
     673             : 
     674             : utf8_int8_t *utf8ncpy(utf8_int8_t *utf8_restrict dst,
     675             :                       const utf8_int8_t *utf8_restrict src, size_t n) {
     676             :   utf8_int8_t *d = dst;
     677             :   size_t index = 0, check_index = 0;
     678             : 
     679             :   if (n == 0) {
     680             :     return dst;
     681             :   }
     682             : 
     683             :   /* overwriting anything previously in dst, write byte-by-byte
     684             :    * from src */
     685             :   for (index = 0; index < n; index++) {
     686             :     d[index] = src[index];
     687             :     if ('\0' == src[index]) {
     688             :       break;
     689             :     }
     690             :   }
     691             : 
     692             :   for (check_index = index - 1;
     693             :        check_index > 0 && 0x80 == (0xc0 & d[check_index]); check_index--) {
     694             :     /* just moving the index */
     695             :   }
     696             : 
     697             :   if (check_index < index &&
     698             :       ((index - check_index) < utf8codepointcalcsize(&d[check_index]) ||
     699             :        (index - check_index) == n)) {
     700             :     index = check_index;
     701             :   }
     702             : 
     703             :   /* append null terminating byte */
     704             :   for (; index < n; index++) {
     705             :     d[index] = 0;
     706             :   }
     707             : 
     708             :   return dst;
     709             : }
     710             : 
     711             : utf8_int8_t *utf8ndup(const utf8_int8_t *src, size_t n) {
     712             :   return utf8ndup_ex(src, n, utf8_null, utf8_null);
     713             : }
     714             : 
     715             : utf8_int8_t *utf8ndup_ex(const utf8_int8_t *src, size_t n,
     716             :                          utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *, size_t),
     717             :                          utf8_int8_t *user_data) {
     718             :   utf8_int8_t *c = utf8_null;
     719             :   size_t bytes = 0;
     720             : 
     721             :   /* Find the end of the string or stop when n is reached */
     722             :   while ('\0' != src[bytes] && bytes < n) {
     723             :     bytes++;
     724             :   }
     725             : 
     726             :   /* In case bytes is actually less than n, we need to set it
     727             :    * to be used later in the copy byte by byte. */
     728             :   n = bytes;
     729             : 
     730             :   if (alloc_func_ptr) {
     731             :     c = alloc_func_ptr(user_data, bytes + 1);
     732             :   } else {
     733             : #if !defined(UTF8_NO_STD_MALLOC)
     734             :     c = (utf8_int8_t *)malloc(bytes + 1);
     735             : #else
     736             :     c = utf8_null;
     737             : #endif
     738             :   }
     739             : 
     740             :   if (utf8_null == c) {
     741             :     /* out of memory so we bail */
     742             :     return utf8_null;
     743             :   }
     744             : 
     745             :   bytes = 0;
     746             : 
     747             :   /* copy src byte-by-byte into our new utf8 string */
     748             :   while ('\0' != src[bytes] && bytes < n) {
     749             :     c[bytes] = src[bytes];
     750             :     bytes++;
     751             :   }
     752             : 
     753             :   /* append null terminating byte */
     754             :   c[bytes] = '\0';
     755             :   return c;
     756             : }
     757             : 
     758             : utf8_constexpr14_impl utf8_int8_t *utf8rchr(const utf8_int8_t *src, int chr) {
     759             : 
     760             :   utf8_int8_t *match = utf8_null;
     761             :   utf8_int8_t c[5] = {'\0', '\0', '\0', '\0', '\0'};
     762             : 
     763             :   if (0 == chr) {
     764             :     /* being asked to return position of null terminating byte, so
     765             :      * just run s to the end, and return! */
     766             :     while ('\0' != *src) {
     767             :       src++;
     768             :     }
     769             :     return (utf8_int8_t *)src;
     770             :   } else if (0 == ((int)0xffffff80 & chr)) {
     771             :     /* 1-byte/7-bit ascii
     772             :      * (0b0xxxxxxx) */
     773             :     c[0] = (utf8_int8_t)chr;
     774             :   } else if (0 == ((int)0xfffff800 & chr)) {
     775             :     /* 2-byte/11-bit utf8 code point
     776             :      * (0b110xxxxx 0b10xxxxxx) */
     777             :     c[0] = (utf8_int8_t)(0xc0 | (utf8_int8_t)(chr >> 6));
     778             :     c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
     779             :   } else if (0 == ((int)0xffff0000 & chr)) {
     780             :     /* 3-byte/16-bit utf8 code point
     781             :      * (0b1110xxxx 0b10xxxxxx 0b10xxxxxx) */
     782             :     c[0] = (utf8_int8_t)(0xe0 | (utf8_int8_t)(chr >> 12));
     783             :     c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
     784             :     c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
     785             :   } else { /* if (0 == ((int)0xffe00000 & chr)) { */
     786             :     /* 4-byte/21-bit utf8 code point
     787             :      * (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx) */
     788             :     c[0] = (utf8_int8_t)(0xf0 | (utf8_int8_t)(chr >> 18));
     789             :     c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 12) & 0x3f));
     790             :     c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
     791             :     c[3] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
     792             :   }
     793             : 
     794             :   /* we've created a 2 utf8 codepoint string in c that is
     795             :    * the utf8 character asked for by chr, and a null
     796             :    * terminating byte */
     797             : 
     798             :   while ('\0' != *src) {
     799             :     size_t offset = 0;
     800             : 
     801             :     while ((src[offset] == c[offset]) && ('\0' != src[offset])) {
     802             :       offset++;
     803             :     }
     804             : 
     805             :     if ('\0' == c[offset]) {
     806             :       /* we found a matching utf8 code point */
     807             :       match = (utf8_int8_t *)src;
     808             :       src += offset;
     809             : 
     810             :       if ('\0' == *src) {
     811             :         break;
     812             :       }
     813             :     } else {
     814             :       src += offset;
     815             : 
     816             :       /* need to march s along to next utf8 codepoint start
     817             :        * (the next byte that doesn't match 0b10xxxxxx) */
     818             :       if ('\0' != *src) {
     819             :         do {
     820             :           src++;
     821             :         } while (0x80 == (0xc0 & *src));
     822             :       }
     823             :     }
     824             :   }
     825             : 
     826             :   /* return the last match we found (or 0 if no match was found) */
     827             :   return match;
     828             : }
     829             : 
     830             : utf8_constexpr14_impl utf8_int8_t *utf8pbrk(const utf8_int8_t *str,
     831             :                                             const utf8_int8_t *accept) {
     832             :   while ('\0' != *str) {
     833             :     const utf8_int8_t *a = accept;
     834             :     size_t offset = 0;
     835             : 
     836             :     while ('\0' != *a) {
     837             :       /* checking that if *a is the start of a utf8 codepoint
     838             :        * (it is not 0b10xxxxxx) and we have successfully matched
     839             :        * a previous character (0 < offset) - we found a match */
     840             :       if ((0x80 != (0xc0 & *a)) && (0 < offset)) {
     841             :         return (utf8_int8_t *)str;
     842             :       } else {
     843             :         if (*a == str[offset]) {
     844             :           /* part of a utf8 codepoint matched, so move our checking
     845             :            * onwards to the next byte */
     846             :           offset++;
     847             :           a++;
     848             :         } else {
     849             :           /* r could be in the middle of an unmatching utf8 code point,
     850             :            * so we need to march it on to the next character beginning, */
     851             : 
     852             :           do {
     853             :             a++;
     854             :           } while (0x80 == (0xc0 & *a));
     855             : 
     856             :           /* reset offset too as we found a mismatch */
     857             :           offset = 0;
     858             :         }
     859             :       }
     860             :     }
     861             : 
     862             :     /* we found a match on the last utf8 codepoint */
     863             :     if (0 < offset) {
     864             :       return (utf8_int8_t *)str;
     865             :     }
     866             : 
     867             :     /* the current utf8 codepoint in src did not match accept, but src
     868             :      * could have been partway through a utf8 codepoint, so we need to
     869             :      * march it onto the next utf8 codepoint starting byte */
     870             :     do {
     871             :       str++;
     872             :     } while ((0x80 == (0xc0 & *str)));
     873             :   }
     874             : 
     875             :   return utf8_null;
     876             : }
     877             : 
     878             : utf8_constexpr14_impl size_t utf8size(const utf8_int8_t *str) {
     879             :   return utf8size_lazy(str) + 1;
     880             : }
     881             : 
     882             : utf8_constexpr14_impl size_t utf8size_lazy(const utf8_int8_t *str) {
     883             :   return utf8nsize_lazy(str, SIZE_MAX);
     884             : }
     885             : 
     886             : utf8_constexpr14_impl size_t utf8nsize_lazy(const utf8_int8_t *str, size_t n) {
     887             :   size_t size = 0;
     888             :   while (size < n && '\0' != str[size]) {
     889             :     size++;
     890             :   }
     891             :   return size;
     892             : }
     893             : 
     894             : utf8_constexpr14_impl size_t utf8spn(const utf8_int8_t *src,
     895             :                                      const utf8_int8_t *accept) {
     896             :   size_t chars = 0;
     897             : 
     898             :   while ('\0' != *src) {
     899             :     const utf8_int8_t *a = accept;
     900             :     size_t offset = 0;
     901             : 
     902             :     while ('\0' != *a) {
     903             :       /* checking that if *r is the start of a utf8 codepoint
     904             :        * (it is not 0b10xxxxxx) and we have successfully matched
     905             :        * a previous character (0 < offset) - we found a match */
     906             :       if ((0x80 != (0xc0 & *a)) && (0 < offset)) {
     907             :         /* found a match, so increment the number of utf8 codepoints
     908             :          * that have matched and stop checking whether any other utf8
     909             :          * codepoints in a match */
     910             :         chars++;
     911             :         src += offset;
     912             :         offset = 0;
     913             :         break;
     914             :       } else {
     915             :         if (*a == src[offset]) {
     916             :           offset++;
     917             :           a++;
     918             :         } else {
     919             :           /* a could be in the middle of an unmatching utf8 codepoint,
     920             :            * so we need to march it on to the next character beginning, */
     921             :           do {
     922             :             a++;
     923             :           } while (0x80 == (0xc0 & *a));
     924             : 
     925             :           /* reset offset too as we found a mismatch */
     926             :           offset = 0;
     927             :         }
     928             :       }
     929             :     }
     930             : 
     931             :     /* found a match at the end of *a, so didn't get a chance to test it */
     932             :     if (0 < offset) {
     933             :       chars++;
     934             :       src += offset;
     935             :       continue;
     936             :     }
     937             : 
     938             :     /* if a got to its terminating null byte, then we didn't find a match.
     939             :      * Return the current number of matched utf8 codepoints */
     940             :     if ('\0' == *a) {
     941             :       return chars;
     942             :     }
     943             :   }
     944             : 
     945             :   return chars;
     946             : }
     947             : 
     948             : utf8_constexpr14_impl utf8_int8_t *utf8str(const utf8_int8_t *haystack,
     949             :                                            const utf8_int8_t *needle) {
     950             :   utf8_int32_t throwaway_codepoint = 0;
     951             : 
     952             :   /* if needle has no utf8 codepoints before the null terminating
     953             :    * byte then return haystack */
     954             :   if ('\0' == *needle) {
     955             :     return (utf8_int8_t *)haystack;
     956             :   }
     957             : 
     958             :   while ('\0' != *haystack) {
     959             :     const utf8_int8_t *maybeMatch = haystack;
     960             :     const utf8_int8_t *n = needle;
     961             : 
     962             :     while (*haystack == *n && (*haystack != '\0' && *n != '\0')) {
     963             :       n++;
     964             :       haystack++;
     965             :     }
     966             : 
     967             :     if ('\0' == *n) {
     968             :       /* we found the whole utf8 string for needle in haystack at
     969             :        * maybeMatch, so return it */
     970             :       return (utf8_int8_t *)maybeMatch;
     971             :     } else {
     972             :       /* h could be in the middle of an unmatching utf8 codepoint,
     973             :        * so we need to march it on to the next character beginning
     974             :        * starting from the current character */
     975             :       haystack = utf8codepoint(maybeMatch, &throwaway_codepoint);
     976             :     }
     977             :   }
     978             : 
     979             :   /* no match */
     980             :   return utf8_null;
     981             : }
     982             : 
     983             : utf8_constexpr14_impl utf8_int8_t *utf8casestr(const utf8_int8_t *haystack,
     984             :                                                const utf8_int8_t *needle) {
     985             :   /* if needle has no utf8 codepoints before the null terminating
     986             :    * byte then return haystack */
     987             :   if ('\0' == *needle) {
     988             :     return (utf8_int8_t *)haystack;
     989             :   }
     990             : 
     991             :   for (;;) {
     992             :     const utf8_int8_t *maybeMatch = haystack;
     993             :     const utf8_int8_t *n = needle;
     994             :     utf8_int32_t h_cp = 0, n_cp = 0;
     995             : 
     996             :     /* Get the next code point and track it */
     997             :     const utf8_int8_t *nextH = haystack = utf8codepoint(haystack, &h_cp);
     998             :     n = utf8codepoint(n, &n_cp);
     999             : 
    1000             :     while ((0 != h_cp) && (0 != n_cp)) {
    1001             :       h_cp = utf8lwrcodepoint(h_cp);
    1002             :       n_cp = utf8lwrcodepoint(n_cp);
    1003             : 
    1004             :       /* if we find a mismatch, bail out! */
    1005             :       if (h_cp != n_cp) {
    1006             :         break;
    1007             :       }
    1008             : 
    1009             :       haystack = utf8codepoint(haystack, &h_cp);
    1010             :       n = utf8codepoint(n, &n_cp);
    1011             :     }
    1012             : 
    1013             :     if (0 == n_cp) {
    1014             :       /* we found the whole utf8 string for needle in haystack at
    1015             :        * maybeMatch, so return it */
    1016             :       return (utf8_int8_t *)maybeMatch;
    1017             :     }
    1018             : 
    1019             :     if (0 == h_cp) {
    1020             :       /* no match */
    1021             :       return utf8_null;
    1022             :     }
    1023             : 
    1024             :     /* Roll back to the next code point in the haystack to test */
    1025             :     haystack = nextH;
    1026             :   }
    1027             : }
    1028             : 
    1029             : utf8_constexpr14_impl utf8_int8_t *utf8valid(const utf8_int8_t *str) {
    1030             :   return utf8nvalid(str, SIZE_MAX);
    1031             : }
    1032             : 
    1033             : utf8_constexpr14_impl utf8_int8_t *utf8nvalid(const utf8_int8_t *str,
    1034             :                                               size_t n) {
    1035             :   const utf8_int8_t *t = str;
    1036             :   size_t consumed = 0;
    1037             : 
    1038             :   while ((void)(consumed = (size_t)(str - t)), consumed < n && '\0' != *str) {
    1039             :     const size_t remaining = n - consumed;
    1040             : 
    1041             :     if (0xf0 == (0xf8 & *str)) {
    1042             :       /* ensure that there's 4 bytes or more remaining */
    1043             :       if (remaining < 4) {
    1044             :         return (utf8_int8_t *)str;
    1045             :       }
    1046             : 
    1047             :       /* ensure each of the 3 following bytes in this 4-byte
    1048             :        * utf8 codepoint began with 0b10xxxxxx */
    1049             :       if ((0x80 != (0xc0 & str[1])) || (0x80 != (0xc0 & str[2])) ||
    1050             :           (0x80 != (0xc0 & str[3]))) {
    1051             :         return (utf8_int8_t *)str;
    1052             :       }
    1053             : 
    1054             :       /* ensure that our utf8 codepoint ended after 4 bytes */
    1055             :       if ((remaining != 4) && (0x80 == (0xc0 & str[4]))) {
    1056             :         return (utf8_int8_t *)str;
    1057             :       }
    1058             : 
    1059             :       /* ensure that the top 5 bits of this 4-byte utf8
    1060             :        * codepoint were not 0, as then we could have used
    1061             :        * one of the smaller encodings */
    1062             :       if ((0 == (0x07 & str[0])) && (0 == (0x30 & str[1]))) {
    1063             :         return (utf8_int8_t *)str;
    1064             :       }
    1065             : 
    1066             :       /* 4-byte utf8 code point (began with 0b11110xxx) */
    1067             :       str += 4;
    1068             :     } else if (0xe0 == (0xf0 & *str)) {
    1069             :       /* ensure that there's 3 bytes or more remaining */
    1070             :       if (remaining < 3) {
    1071             :         return (utf8_int8_t *)str;
    1072             :       }
    1073             : 
    1074             :       /* ensure each of the 2 following bytes in this 3-byte
    1075             :        * utf8 codepoint began with 0b10xxxxxx */
    1076             :       if ((0x80 != (0xc0 & str[1])) || (0x80 != (0xc0 & str[2]))) {
    1077             :         return (utf8_int8_t *)str;
    1078             :       }
    1079             : 
    1080             :       /* ensure that our utf8 codepoint ended after 3 bytes */
    1081             :       if ((remaining != 3) && (0x80 == (0xc0 & str[3]))) {
    1082             :         return (utf8_int8_t *)str;
    1083             :       }
    1084             : 
    1085             :       /* ensure that the top 5 bits of this 3-byte utf8
    1086             :        * codepoint were not 0, as then we could have used
    1087             :        * one of the smaller encodings */
    1088             :       if ((0 == (0x0f & str[0])) && (0 == (0x20 & str[1]))) {
    1089             :         return (utf8_int8_t *)str;
    1090             :       }
    1091             : 
    1092             :       /* 3-byte utf8 code point (began with 0b1110xxxx) */
    1093             :       str += 3;
    1094             :     } else if (0xc0 == (0xe0 & *str)) {
    1095             :       /* ensure that there's 2 bytes or more remaining */
    1096             :       if (remaining < 2) {
    1097             :         return (utf8_int8_t *)str;
    1098             :       }
    1099             : 
    1100             :       /* ensure the 1 following byte in this 2-byte
    1101             :        * utf8 codepoint began with 0b10xxxxxx */
    1102             :       if (0x80 != (0xc0 & str[1])) {
    1103             :         return (utf8_int8_t *)str;
    1104             :       }
    1105             : 
    1106             :       /* ensure that our utf8 codepoint ended after 2 bytes */
    1107             :       if ((remaining != 2) && (0x80 == (0xc0 & str[2]))) {
    1108             :         return (utf8_int8_t *)str;
    1109             :       }
    1110             : 
    1111             :       /* ensure that the top 4 bits of this 2-byte utf8
    1112             :        * codepoint were not 0, as then we could have used
    1113             :        * one of the smaller encodings */
    1114             :       if (0 == (0x1e & str[0])) {
    1115             :         return (utf8_int8_t *)str;
    1116             :       }
    1117             : 
    1118             :       /* 2-byte utf8 code point (began with 0b110xxxxx) */
    1119             :       str += 2;
    1120             :     } else if (0x00 == (0x80 & *str)) {
    1121             :       /* 1-byte ascii (began with 0b0xxxxxxx) */
    1122             :       str += 1;
    1123             :     } else {
    1124             :       /* we have an invalid 0b1xxxxxxx utf8 code point entry */
    1125             :       return (utf8_int8_t *)str;
    1126             :     }
    1127             :   }
    1128             : 
    1129             :   return utf8_null;
    1130             : }
    1131             : 
    1132             : int utf8makevalid(utf8_int8_t *str, const utf8_int32_t replacement) {
    1133             :   utf8_int8_t *read = str;
    1134             :   utf8_int8_t *write = read;
    1135             :   const utf8_int8_t r = (utf8_int8_t)replacement;
    1136             :   utf8_int32_t codepoint = 0;
    1137             : 
    1138             :   if (replacement > 0x7f) {
    1139             :     return -1;
    1140             :   }
    1141             : 
    1142             :   while ('\0' != *read) {
    1143             :     if (0xf0 == (0xf8 & *read)) {
    1144             :       /* ensure each of the 3 following bytes in this 4-byte
    1145             :        * utf8 codepoint began with 0b10xxxxxx */
    1146             :       if ((0x80 != (0xc0 & read[1])) || (0x80 != (0xc0 & read[2])) ||
    1147             :           (0x80 != (0xc0 & read[3]))) {
    1148             :         *write++ = r;
    1149             :         read++;
    1150             :         continue;
    1151             :       }
    1152             : 
    1153             :       /* 4-byte utf8 code point (began with 0b11110xxx) */
    1154             :       read = utf8codepoint(read, &codepoint);
    1155             :       write = utf8catcodepoint(write, codepoint, 4);
    1156             :     } else if (0xe0 == (0xf0 & *read)) {
    1157             :       /* ensure each of the 2 following bytes in this 3-byte
    1158             :        * utf8 codepoint began with 0b10xxxxxx */
    1159             :       if ((0x80 != (0xc0 & read[1])) || (0x80 != (0xc0 & read[2]))) {
    1160             :         *write++ = r;
    1161             :         read++;
    1162             :         continue;
    1163             :       }
    1164             : 
    1165             :       /* 3-byte utf8 code point (began with 0b1110xxxx) */
    1166             :       read = utf8codepoint(read, &codepoint);
    1167             :       write = utf8catcodepoint(write, codepoint, 3);
    1168             :     } else if (0xc0 == (0xe0 & *read)) {
    1169             :       /* ensure the 1 following byte in this 2-byte
    1170             :        * utf8 codepoint began with 0b10xxxxxx */
    1171             :       if (0x80 != (0xc0 & read[1])) {
    1172             :         *write++ = r;
    1173             :         read++;
    1174             :         continue;
    1175             :       }
    1176             : 
    1177             :       /* 2-byte utf8 code point (began with 0b110xxxxx) */
    1178             :       read = utf8codepoint(read, &codepoint);
    1179             :       write = utf8catcodepoint(write, codepoint, 2);
    1180             :     } else if (0x00 == (0x80 & *read)) {
    1181             :       /* 1-byte ascii (began with 0b0xxxxxxx) */
    1182             :       read = utf8codepoint(read, &codepoint);
    1183             :       write = utf8catcodepoint(write, codepoint, 1);
    1184             :     } else {
    1185             :       /* if we got here then we've got a dangling continuation (0b10xxxxxx) */
    1186             :       *write++ = r;
    1187             :       read++;
    1188             :       continue;
    1189             :     }
    1190             :   }
    1191             : 
    1192             :   *write = '\0';
    1193             : 
    1194             :   return 0;
    1195             : }
    1196             : #endif
    1197             : 
    /* Decode the single UTF-8 encoded codepoint that starts at str.
     *
     * The sequence length is chosen from the high bits of str[0] alone:
     * 11110xxx -> 4 bytes, 1110xxxx -> 3, 110xxxxx -> 2, anything else -> 1.
     * The following bytes are read and masked with 0x3f without checking that
     * they are continuation bytes or that the string is long enough, so the
     * input has to be well formed.
     *
     * str           - first byte of the sequence to decode.
     * out_codepoint - receives the decoded value.
     * Returns a non-const pointer to the byte that follows the decoded
     * sequence.
     */
    1198             : utf8_constexpr14_impl utf8_int8_t *
    1199   146582188 : utf8codepoint(const utf8_int8_t *utf8_restrict str,
    1200             :               utf8_int32_t *utf8_restrict out_codepoint) {
    1201   146582188 :   if (0xf0 == (0xf8 & str[0])) {
    1202             :     /* 4 byte utf8 codepoint: 3 payload bits from the lead byte, 6 from
    1202             :      * each of the three following bytes */
    1203           0 :     *out_codepoint = ((0x07 & str[0]) << 18) | ((0x3f & str[1]) << 12) |
    1204           0 :                      ((0x3f & str[2]) << 6) | (0x3f & str[3]);
    1205           0 :     str += 4;
    1206   146582188 :   } else if (0xe0 == (0xf0 & str[0])) {
    1207             :     /* 3 byte utf8 codepoint: 4 payload bits from the lead byte */
    1208           0 :     *out_codepoint =
    1209           0 :         ((0x0f & str[0]) << 12) | ((0x3f & str[1]) << 6) | (0x3f & str[2]);
    1210           0 :     str += 3;
    1211   146582188 :   } else if (0xc0 == (0xe0 & str[0])) {
    1212             :     /* 2 byte utf8 codepoint: 5 payload bits from the lead byte */
    1213         229 :     *out_codepoint = ((0x1f & str[0]) << 6) | (0x3f & str[1]);
    1214         229 :     str += 2;
    1215             :   } else {
    1216             :     /* 1 byte utf8 codepoint otherwise; the byte is stored as-is, which
    1216             :      * also covers bytes that match none of the lead patterns above */
    1217   146582000 :     *out_codepoint = str[0];
    1218   146582000 :     str += 1;
    1219             :   }
    1220             : 
    1221   146582188 :   return const_cast<utf8_int8_t *>(str); /* const qualifier of the input is cast away */
    1222             : }
    1223             : 
    /* Report how many bytes the UTF-8 sequence starting at str occupies.
     *
     * Only str[0] is examined. Returns 4, 3 or 2 when it matches the
     * 11110xxx, 1110xxxx or 110xxxxx lead-byte pattern, and 1 for every other
     * byte value (continuation bytes included).
     */
    1224   146582190 : utf8_constexpr14_impl size_t utf8codepointcalcsize(const utf8_int8_t *str) {
    1225   146582190 :   if (0xf0 == (0xf8 & str[0])) {
    1226             :     /* 4 byte utf8 codepoint */
    1227           0 :     return 4;
    1228   146582190 :   } else if (0xe0 == (0xf0 & str[0])) {
    1229             :     /* 3 byte utf8 codepoint */
    1230           1 :     return 3;
    1231   146582189 :   } else if (0xc0 == (0xe0 & str[0])) {
    1232             :     /* 2 byte utf8 codepoint */
    1233         230 :     return 2;
    1234             :   }
    1235             : 
    1236             :   /* 1 byte utf8 codepoint otherwise */
    1237   146582000 :   return 1;
    1238             : }
    1239             : 
    1240             : #if 0
    /* Return the number of bytes needed to encode codepoint chr in UTF-8.
     *
     * 1 when no bit above the low 7 is set, 2 when none above the low 11,
     * 3 when none above the low 16, otherwise 4. There is no upper range
     * check: the test against the 21-bit limit is commented out, so every
     * remaining value falls into the 4-byte case.
     * Compiled out by the enclosing "#if 0".
     */
    1241             : utf8_constexpr14_impl size_t utf8codepointsize(utf8_int32_t chr) {
    1242             :   if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
    1243             :     return 1; /* fits in 7 bits */
    1244             :   } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
    1245             :     return 2; /* fits in 11 bits */
    1246             :   } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
    1247             :     return 3; /* fits in 16 bits */
    1248             :   } else { /* if (0 == ((int)0xffe00000 & chr)) { */
    1249             :     return 4;
    1250             :   }
    1251             : }
    1252             : 
    /* Write the UTF-8 encoding of chr at str.
     *
     * str - destination; receives 1 to 4 bytes, no terminating NUL is added.
     * chr - codepoint to encode; the encoded length is selected with the same
     *       bit tests as utf8codepointsize().
     * n   - number of bytes that may be written at str.
     * Returns the position just past the bytes written, or utf8_null (with
     * nothing written) when n is smaller than the encoded length.
     * Compiled out by the enclosing "#if 0".
     */
    1253             : utf8_int8_t *utf8catcodepoint(utf8_int8_t *str, utf8_int32_t chr, size_t n) {
    1254             :   if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
    1255             :     /* 1-byte/7-bit ascii
    1256             :      * (0b0xxxxxxx) */
    1257             :     if (n < 1) {
    1258             :       return utf8_null;
    1259             :     }
    1260             :     str[0] = (utf8_int8_t)chr;
    1261             :     str += 1;
    1262             :   } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
    1263             :     /* 2-byte/11-bit utf8 code point
    1264             :      * (0b110xxxxx 0b10xxxxxx) */
    1265             :     if (n < 2) {
    1266             :       return utf8_null;
    1267             :     }
    1268             :     str[0] = (utf8_int8_t)(0xc0 | (utf8_int8_t)((chr >> 6) & 0x1f));
    1269             :     str[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
    1270             :     str += 2;
    1271             :   } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
    1272             :     /* 3-byte/16-bit utf8 code point
    1273             :      * (0b1110xxxx 0b10xxxxxx 0b10xxxxxx) */
    1274             :     if (n < 3) {
    1275             :       return utf8_null;
    1276             :     }
    1277             :     str[0] = (utf8_int8_t)(0xe0 | (utf8_int8_t)((chr >> 12) & 0x0f));
    1278             :     str[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
    1279             :     str[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
    1280             :     str += 3;
    1281             :   } else { /* if (0 == ((int)0xffe00000 & chr)) { */
    1282             :     /* 4-byte/21-bit utf8 code point
    1283             :      * (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx)
    1283             :      * Reached by every value the three tests above reject; bits above
    1283             :      * the low 21 are dropped by the 0x07 mask on the lead byte. */
    1284             :     if (n < 4) {
    1285             :       return utf8_null;
    1286             :     }
    1287             :     str[0] = (utf8_int8_t)(0xf0 | (utf8_int8_t)((chr >> 18) & 0x07));
    1288             :     str[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 12) & 0x3f));
    1289             :     str[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
    1290             :     str[3] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
    1291             :     str += 4;
    1292             :   }
    1293             : 
    1294             :   return str;
    1295             : }
    1296             : 
    /* Non-zero when utf8uprcodepoint() maps chr to a different codepoint,
     * i.e. when chr has an uppercase counterpart in that function's tables;
     * zero for every other value. Compiled out by the enclosing "#if 0".
     */
    1297             : utf8_constexpr14_impl int utf8islower(utf8_int32_t chr) {
    1298             :   return chr != utf8uprcodepoint(chr);
    1299             : }
    1300             : 
    /* Non-zero when utf8lwrcodepoint() maps chr to a different codepoint,
     * i.e. when chr has a lowercase counterpart in that function's tables;
     * zero for every other value. Compiled out by the enclosing "#if 0".
     */
    1301             : utf8_constexpr14_impl int utf8isupper(utf8_int32_t chr) {
    1302             :   return chr != utf8lwrcodepoint(chr);
    1303             : }
    1304             : 
    /* Lowercase the NUL-terminated UTF-8 string str in place.
     *
     * Each codepoint is decoded with utf8codepoint(); when utf8lwrcodepoint()
     * yields a different value, its encoding is written over the bytes at the
     * current position. Scanning then resumes at the end of the original
     * sequence and stops once a zero codepoint is decoded.
     * NOTE(review): this presumes the lowercase encoding has the same byte
     * length as the original; nothing here checks that, and the return value
     * of utf8catcodepoint() is ignored.
     * Compiled out by the enclosing "#if 0".
     */
    1305             : void utf8lwr(utf8_int8_t *utf8_restrict str) {
    1306             :   utf8_int32_t cp = 0;
    1307             :   utf8_int8_t *pn = utf8codepoint(str, &cp); /* pn: start of the next codepoint */
    1308             : 
    1309             :   while (cp != 0) {
    1310             :     const utf8_int32_t lwr_cp = utf8lwrcodepoint(cp);
    1311             :     const size_t size = utf8codepointsize(lwr_cp);
    1312             : 
    1313             :     if (lwr_cp != cp) {
    1314             :       utf8catcodepoint(str, lwr_cp, size); /* overwrite in place */
    1315             :     }
    1316             : 
    1317             :     str = pn;
    1318             :     pn = utf8codepoint(str, &cp);
    1319             :   }
    1320             : }
    1321             : 
    /* Uppercase the NUL-terminated UTF-8 string str in place.
     *
     * Each codepoint is decoded with utf8codepoint(); when utf8uprcodepoint()
     * yields a different value, its encoding is written over the bytes at the
     * current position. Scanning then resumes at the end of the original
     * sequence and stops once a zero codepoint is decoded.
     * NOTE(review): this presumes the uppercase encoding has the same byte
     * length as the original; nothing here checks that, and the return value
     * of utf8catcodepoint() is ignored.
     * Compiled out by the enclosing "#if 0".
     */
    1322             : void utf8upr(utf8_int8_t *utf8_restrict str) {
    1323             :   utf8_int32_t cp = 0;
    1324             :   utf8_int8_t *pn = utf8codepoint(str, &cp); /* pn: start of the next codepoint */
    1325             : 
    1326             :   while (cp != 0) {
    1327             :     const utf8_int32_t lwr_cp = utf8uprcodepoint(cp); /* holds the uppercase value despite its name */
    1328             :     const size_t size = utf8codepointsize(lwr_cp);
    1329             : 
    1330             :     if (lwr_cp != cp) {
    1331             :       utf8catcodepoint(str, lwr_cp, size); /* overwrite in place */
    1332             :     }
    1333             : 
    1334             :     str = pn;
    1335             :     pn = utf8codepoint(str, &cp);
    1336             :   }
    1337             : }
    1338             : #endif
    1339             : 
    /* Return the lowercase counterpart of codepoint cp, or cp unchanged when
     * none of the tables below match.
     *
     * Four groups of ranges are handled arithmetically, in this order:
     *   1. ranges whose counterpart lies 32 code points higher (this includes
     *      ASCII 'A'..'Z', 0x41..0x5a);
     *   2. 0x0400..0x040f, whose counterpart lies 80 code points higher;
     *   3. ranges of adjacent pairs where setting bit 0 selects the result
     *      (an odd input in these ranges comes back unchanged);
     *   4. ranges of adjacent pairs where rounding up to the next even value
     *      selects the result (an even input comes back unchanged).
     * Everything else goes through the switch of irregular one-off mappings.
     * No code point above 0x04ff appears in these tables.
     *
     * NOTE(review): the 0x048a..0x04ff parity range in group 3 also covers
     * 0x04c0..0x04cf. Unicode appears to pair those letters differently
     * (0x04c0 with 0x04cf, and 0x04c1..0x04ce with the capital on the odd
     * code point) - verify against UnicodeData.txt.
     */
    1340   136534000 : utf8_constexpr14_impl utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp) {
    1341   136534000 :   if (((0x0041 <= cp) && (0x005a >= cp)) ||
    1342    63465400 :       ((0x00c0 <= cp) && (0x00d6 >= cp)) ||
    1343    63465400 :       ((0x00d8 <= cp) && (0x00de >= cp)) ||
    1344    63465400 :       ((0x0391 <= cp) && (0x03a1 >= cp)) ||
    1345    63465400 :       ((0x03a3 <= cp) && (0x03ab >= cp)) ||
    1346           0 :       ((0x0410 <= cp) && (0x042f >= cp))) {
    1347    73068300 :     cp += 32; /* group 1: counterpart is 32 code points higher */
    1348    63465400 :   } else if ((0x0400 <= cp) && (0x040f >= cp)) {
    1349           0 :     cp += 80; /* group 2: counterpart is 80 code points higher */
    1350    63465400 :   } else if (((0x0100 <= cp) && (0x012f >= cp)) ||
    1351    63465400 :              ((0x0132 <= cp) && (0x0137 >= cp)) ||
    1352    63465400 :              ((0x014a <= cp) && (0x0177 >= cp)) ||
    1353    63465400 :              ((0x0182 <= cp) && (0x0185 >= cp)) ||
    1354    63465400 :              ((0x01a0 <= cp) && (0x01a5 >= cp)) ||
    1355    63465400 :              ((0x01de <= cp) && (0x01ef >= cp)) ||
    1356    63465400 :              ((0x01f8 <= cp) && (0x021f >= cp)) ||
    1357    63465400 :              ((0x0222 <= cp) && (0x0233 >= cp)) ||
    1358    63465400 :              ((0x0246 <= cp) && (0x024f >= cp)) ||
    1359    63465400 :              ((0x03d8 <= cp) && (0x03ef >= cp)) ||
    1360    63465400 :              ((0x0460 <= cp) && (0x0481 >= cp)) ||
    1361           0 :              ((0x048a <= cp) && (0x04ff >= cp))) {
    1362           0 :     cp |= 0x1; /* group 3: even -> following odd value, odd unchanged */
    1363    63465400 :   } else if (((0x0139 <= cp) && (0x0148 >= cp)) ||
    1364    63465400 :              ((0x0179 <= cp) && (0x017e >= cp)) ||
    1365    63465400 :              ((0x01af <= cp) && (0x01b0 >= cp)) ||
    1366    63465400 :              ((0x01b3 <= cp) && (0x01b6 >= cp)) ||
    1367           0 :              ((0x01cd <= cp) && (0x01dc >= cp))) {
    1368           0 :     cp += 1; /* group 4: odd -> following even value ... */
    1369           0 :     cp &= ~0x1; /* ... while an even input ends up unchanged */
    1370             :   } else {
    1371    63465400 :     switch (cp) { /* irregular one-off mappings */
    1372    63465400 :     default:
    1373    63465400 :       break;
    1374           0 :     case 0x0178:
    1375           0 :       cp = 0x00ff;
    1376           0 :       break;
    1377           0 :     case 0x0243:
    1378           0 :       cp = 0x0180;
    1379           0 :       break;
    1380           0 :     case 0x018e:
    1381           0 :       cp = 0x01dd;
    1382           0 :       break;
    1383           0 :     case 0x023d:
    1384           0 :       cp = 0x019a;
    1385           0 :       break;
    1386           0 :     case 0x0220:
    1387           0 :       cp = 0x019e;
    1388           0 :       break;
    1389           0 :     case 0x01b7:
    1390           0 :       cp = 0x0292;
    1391           0 :       break;
    1392           0 :     case 0x01c4:
    1393           0 :       cp = 0x01c6;
    1394           0 :       break;
    1395           0 :     case 0x01c7:
    1396           0 :       cp = 0x01c9;
    1397           0 :       break;
    1398           0 :     case 0x01ca:
    1399           0 :       cp = 0x01cc;
    1400           0 :       break;
    1401           0 :     case 0x01f1:
    1402           0 :       cp = 0x01f3;
    1403           0 :       break;
    1404           0 :     case 0x01f7:
    1405           0 :       cp = 0x01bf;
    1406           0 :       break;
    1407           0 :     case 0x0187:
    1408           0 :       cp = 0x0188;
    1409           0 :       break;
    1410           0 :     case 0x018b:
    1411           0 :       cp = 0x018c;
    1412           0 :       break;
    1413           0 :     case 0x0191:
    1414           0 :       cp = 0x0192;
    1415           0 :       break;
    1416           0 :     case 0x0198:
    1417           0 :       cp = 0x0199;
    1418           0 :       break;
    1419           0 :     case 0x01a7:
    1420           0 :       cp = 0x01a8;
    1421           0 :       break;
    1422           0 :     case 0x01ac:
    1423           0 :       cp = 0x01ad;
    1424           0 :       break;
    1425           0 :     case 0x01b8:
    1426           0 :       cp = 0x01b9;
    1427           0 :       break;
    1428           0 :     case 0x01bc:
    1429           0 :       cp = 0x01bd;
    1430           0 :       break;
    1431           0 :     case 0x01f4:
    1432           0 :       cp = 0x01f5;
    1433           0 :       break;
    1434           0 :     case 0x023b:
    1435           0 :       cp = 0x023c;
    1436           0 :       break;
    1437           0 :     case 0x0241:
    1438           0 :       cp = 0x0242;
    1439           0 :       break;
    1440           0 :     case 0x03fd:
    1441           0 :       cp = 0x037b;
    1442           0 :       break;
    1443           0 :     case 0x03fe:
    1444           0 :       cp = 0x037c;
    1445           0 :       break;
    1446           0 :     case 0x03ff:
    1447           0 :       cp = 0x037d;
    1448           0 :       break;
    1449           0 :     case 0x037f:
    1450           0 :       cp = 0x03f3;
    1451           0 :       break;
    1452           0 :     case 0x0386:
    1453           0 :       cp = 0x03ac;
    1454           0 :       break;
    1455           0 :     case 0x0388:
    1456           0 :       cp = 0x03ad;
    1457           0 :       break;
    1458           0 :     case 0x0389:
    1459           0 :       cp = 0x03ae;
    1460           0 :       break;
    1461           0 :     case 0x038a:
    1462           0 :       cp = 0x03af;
    1463           0 :       break;
    1464           0 :     case 0x038c:
    1465           0 :       cp = 0x03cc;
    1466           0 :       break;
    1467           0 :     case 0x038e:
    1468           0 :       cp = 0x03cd;
    1469           0 :       break;
    1470           0 :     case 0x038f:
    1471           0 :       cp = 0x03ce;
    1472           0 :       break;
    1473           0 :     case 0x0370:
    1474           0 :       cp = 0x0371;
    1475           0 :       break;
    1476           0 :     case 0x0372:
    1477           0 :       cp = 0x0373;
    1478           0 :       break;
    1479           0 :     case 0x0376:
    1480           0 :       cp = 0x0377;
    1481           0 :       break;
    1482           0 :     case 0x03f4:
    1483           0 :       cp = 0x03b8;
    1484           0 :       break;
    1485           0 :     case 0x03cf:
    1486           0 :       cp = 0x03d7;
    1487           0 :       break;
    1488           0 :     case 0x03f9:
    1489           0 :       cp = 0x03f2;
    1490           0 :       break;
    1491           0 :     case 0x03f7:
    1492           0 :       cp = 0x03f8;
    1493           0 :       break;
    1494           0 :     case 0x03fa:
    1495           0 :       cp = 0x03fb;
    1496           0 :       break;
    1497             :     }
    1498             :   }
    1499             : 
    1500   136534000 :   return cp;
    1501             : }
    1502             : 
    /* Return the uppercase counterpart of codepoint cp, or cp unchanged when
     * none of the tables below match.
     *
     * Four groups of ranges are handled arithmetically, in this order:
     *   1. ranges whose counterpart lies 32 code points lower (this includes
     *      ASCII 'a'..'z', 0x61..0x7a);
     *   2. 0x0450..0x045f, whose counterpart lies 80 code points lower;
     *   3. ranges of adjacent pairs where clearing bit 0 selects the result
     *      (an even input in these ranges comes back unchanged);
     *   4. ranges of adjacent pairs where rounding down to the previous odd
     *      value selects the result (an odd input comes back unchanged).
     * Everything else goes through the switch of irregular one-off mappings.
     * No code point above 0x04ff appears in these tables.
     *
     * This is not a strict inverse of utf8lwrcodepoint(): for example 0x03d1
     * maps to 0x0398 here, while utf8lwrcodepoint() sends 0x0398 to 0x03b8.
     *
     * NOTE(review): the 0x048a..0x04ff parity range in group 3 also covers
     * 0x04c0..0x04cf. Unicode appears to pair those letters differently
     * (0x04cf with 0x04c0, and 0x04c1..0x04ce with the capital on the odd
     * code point) - verify against UnicodeData.txt.
     */
    1503   141827000 : utf8_constexpr14_impl utf8_int32_t utf8uprcodepoint(utf8_int32_t cp) {
    1504   141827000 :   if (((0x0061 <= cp) && (0x007a >= cp)) ||
    1505    93287800 :       ((0x00e0 <= cp) && (0x00f6 >= cp)) ||
    1506    93287700 :       ((0x00f8 <= cp) && (0x00fe >= cp)) ||
    1507    93287700 :       ((0x03b1 <= cp) && (0x03c1 >= cp)) ||
    1508    93287700 :       ((0x03c3 <= cp) && (0x03cb >= cp)) ||
    1509           0 :       ((0x0430 <= cp) && (0x044f >= cp))) {
    1510    48539100 :     cp -= 32; /* group 1: counterpart is 32 code points lower */
    1511    93287700 :   } else if ((0x0450 <= cp) && (0x045f >= cp)) {
    1512           0 :     cp -= 80; /* group 2: counterpart is 80 code points lower */
    1513    93287700 :   } else if (((0x0100 <= cp) && (0x012f >= cp)) ||
    1514    93287700 :              ((0x0132 <= cp) && (0x0137 >= cp)) ||
    1515    93287700 :              ((0x014a <= cp) && (0x0177 >= cp)) ||
    1516    93287700 :              ((0x0182 <= cp) && (0x0185 >= cp)) ||
    1517    93287700 :              ((0x01a0 <= cp) && (0x01a5 >= cp)) ||
    1518    93287700 :              ((0x01de <= cp) && (0x01ef >= cp)) ||
    1519    93287700 :              ((0x01f8 <= cp) && (0x021f >= cp)) ||
    1520    93287700 :              ((0x0222 <= cp) && (0x0233 >= cp)) ||
    1521    93287700 :              ((0x0246 <= cp) && (0x024f >= cp)) ||
    1522    93287700 :              ((0x03d8 <= cp) && (0x03ef >= cp)) ||
    1523    93287700 :              ((0x0460 <= cp) && (0x0481 >= cp)) ||
    1524           0 :              ((0x048a <= cp) && (0x04ff >= cp))) {
    1525           0 :     cp &= ~0x1; /* group 3: odd -> preceding even value, even unchanged */
    1526    93287700 :   } else if (((0x0139 <= cp) && (0x0148 >= cp)) ||
    1527    93287700 :              ((0x0179 <= cp) && (0x017e >= cp)) ||
    1528    93287700 :              ((0x01af <= cp) && (0x01b0 >= cp)) ||
    1529    93287700 :              ((0x01b3 <= cp) && (0x01b6 >= cp)) ||
    1530           0 :              ((0x01cd <= cp) && (0x01dc >= cp))) {
    1531           0 :     cp -= 1; /* group 4: even -> preceding odd value ... */
    1532           0 :     cp |= 0x1; /* ... while an odd input ends up unchanged */
    1533             :   } else {
    1534    93287700 :     switch (cp) { /* irregular one-off mappings */
    1535    93287700 :     default:
    1536    93287700 :       break;
    1537           0 :     case 0x00ff:
    1538           0 :       cp = 0x0178;
    1539           0 :       break;
    1540           0 :     case 0x0180:
    1541           0 :       cp = 0x0243;
    1542           0 :       break;
    1543           0 :     case 0x01dd:
    1544           0 :       cp = 0x018e;
    1545           0 :       break;
    1546           0 :     case 0x019a:
    1547           0 :       cp = 0x023d;
    1548           0 :       break;
    1549           0 :     case 0x019e:
    1550           0 :       cp = 0x0220;
    1551           0 :       break;
    1552           0 :     case 0x0292:
    1553           0 :       cp = 0x01b7;
    1554           0 :       break;
    1555           0 :     case 0x01c6:
    1556           0 :       cp = 0x01c4;
    1557           0 :       break;
    1558           0 :     case 0x01c9:
    1559           0 :       cp = 0x01c7;
    1560           0 :       break;
    1561           0 :     case 0x01cc:
    1562           0 :       cp = 0x01ca;
    1563           0 :       break;
    1564           0 :     case 0x01f3:
    1565           0 :       cp = 0x01f1;
    1566           0 :       break;
    1567           0 :     case 0x01bf:
    1568           0 :       cp = 0x01f7;
    1569           0 :       break;
    1570           0 :     case 0x0188:
    1571           0 :       cp = 0x0187;
    1572           0 :       break;
    1573           0 :     case 0x018c:
    1574           0 :       cp = 0x018b;
    1575           0 :       break;
    1576           0 :     case 0x0192:
    1577           0 :       cp = 0x0191;
    1578           0 :       break;
    1579           0 :     case 0x0199:
    1580           0 :       cp = 0x0198;
    1581           0 :       break;
    1582           0 :     case 0x01a8:
    1583           0 :       cp = 0x01a7;
    1584           0 :       break;
    1585           0 :     case 0x01ad:
    1586           0 :       cp = 0x01ac;
    1587           0 :       break;
    1588           0 :     case 0x01b9:
    1589           0 :       cp = 0x01b8;
    1590           0 :       break;
    1591           0 :     case 0x01bd:
    1592           0 :       cp = 0x01bc;
    1593           0 :       break;
    1594           0 :     case 0x01f5:
    1595           0 :       cp = 0x01f4;
    1596           0 :       break;
    1597           0 :     case 0x023c:
    1598           0 :       cp = 0x023b;
    1599           0 :       break;
    1600           0 :     case 0x0242:
    1601           0 :       cp = 0x0241;
    1602           0 :       break;
    1603           0 :     case 0x037b:
    1604           0 :       cp = 0x03fd;
    1605           0 :       break;
    1606           0 :     case 0x037c:
    1607           0 :       cp = 0x03fe;
    1608           0 :       break;
    1609           0 :     case 0x037d:
    1610           0 :       cp = 0x03ff;
    1611           0 :       break;
    1612           0 :     case 0x03f3:
    1613           0 :       cp = 0x037f;
    1614           0 :       break;
    1615           0 :     case 0x03ac:
    1616           0 :       cp = 0x0386;
    1617           0 :       break;
    1618           0 :     case 0x03ad:
    1619           0 :       cp = 0x0388;
    1620           0 :       break;
    1621           0 :     case 0x03ae:
    1622           0 :       cp = 0x0389;
    1623           0 :       break;
    1624           0 :     case 0x03af:
    1625           0 :       cp = 0x038a;
    1626           0 :       break;
    1627           0 :     case 0x03cc:
    1628           0 :       cp = 0x038c;
    1629           0 :       break;
    1630           0 :     case 0x03cd:
    1631           0 :       cp = 0x038e;
    1632           0 :       break;
    1633           0 :     case 0x03ce:
    1634           0 :       cp = 0x038f;
    1635           0 :       break;
    1636           0 :     case 0x0371:
    1637           0 :       cp = 0x0370;
    1638           0 :       break;
    1639           0 :     case 0x0373:
    1640           0 :       cp = 0x0372;
    1641           0 :       break;
    1642           0 :     case 0x0377:
    1643           0 :       cp = 0x0376;
    1644           0 :       break;
    1645           0 :     case 0x03d1:
    1646           0 :       cp = 0x0398;
    1647           0 :       break;
    1648           0 :     case 0x03d7:
    1649           0 :       cp = 0x03cf;
    1650           0 :       break;
    1651           0 :     case 0x03f2:
    1652           0 :       cp = 0x03f9;
    1653           0 :       break;
    1654           0 :     case 0x03f8:
    1655           0 :       cp = 0x03f7;
    1656           0 :       break;
    1657           0 :     case 0x03fb:
    1658           0 :       cp = 0x03fa;
    1659           0 :       break;
    1660             :     }
    1661             :   }
    1662             : 
    1663   141827000 :   return cp;
    1664             : }
    1665             : 
    /* Decode the codepoint that starts at str, then step backwards to the
     * start of the sequence that precedes it.
     *
     * Decoding uses the same lead-byte dispatch as utf8codepoint() and does
     * not validate the following bytes.
     *
     * str           - first byte of the sequence to decode.
     * out_codepoint - receives the decoded value.
     * Returns a non-const pointer to the nearest byte before str that is not
     * a continuation byte (10xxxxxx).
     * The backward walk always moves at least one byte and has no lower
     * bound, so str must not be the first byte of its buffer.
     */
    1666             : utf8_constexpr14_impl utf8_int8_t *
    1667             : utf8rcodepoint(const utf8_int8_t *utf8_restrict str,
    1668             :                utf8_int32_t *utf8_restrict out_codepoint) {
    1669             :   const utf8_int8_t *s = static_cast<const utf8_int8_t *>(str);
    1670             : 
    1671             :   if (0xf0 == (0xf8 & s[0])) {
    1672             :     /* 4 byte utf8 codepoint */
    1673             :     *out_codepoint = ((0x07 & s[0]) << 18) | ((0x3f & s[1]) << 12) |
    1674             :                      ((0x3f & s[2]) << 6) | (0x3f & s[3]);
    1675             :   } else if (0xe0 == (0xf0 & s[0])) {
    1676             :     /* 3 byte utf8 codepoint */
    1677             :     *out_codepoint =
    1678             :         ((0x0f & s[0]) << 12) | ((0x3f & s[1]) << 6) | (0x3f & s[2]);
    1679             :   } else if (0xc0 == (0xe0 & s[0])) {
    1680             :     /* 2 byte utf8 codepoint */
    1681             :     *out_codepoint = ((0x1f & s[0]) << 6) | (0x3f & s[1]);
    1682             :   } else {
    1683             :     /* 1 byte utf8 codepoint otherwise */
    1684             :     *out_codepoint = s[0];
    1685             :   }
    1686             : 
    1687             :   do { /* skip back over continuation bytes to the previous lead byte */
    1688             :     s--;
    1689             :   } while ((0 != (0x80 & s[0])) && (0x80 == (0xc0 & s[0]))); /* the 0x80 test is implied by the 0xc0 test */
    1690             : 
    1691             :   return const_cast<utf8_int8_t *>(s); /* const qualifier of the input is cast away */
    1692             : }
    1693             : 
    1694             : #undef utf8_restrict
    1695             : #undef utf8_constexpr14
    1696             : #undef utf8_null
    1697             : 
    1698             : } // namespace
    1699             : 
    1700             : #if defined(__clang__)
    1701             : #pragma clang diagnostic pop
    1702             : #endif
    1703             : 
    1704             : #endif /* SHEREDOM_UTF8_H_INCLUDED */

Generated by: LCOV version 1.14