LCOV - code coverage report
Current view: top level - port - cpl_recode.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 105 125 84.0 %
Date: 2024-11-21 22:18:42 Functions: 10 10 100.0 %

          Line data    Source code
       1             : /**********************************************************************
       2             :  *
       3             :  * Name:     cpl_recode.cpp
       4             :  * Project:  CPL - Common Portability Library
       5             :  * Purpose:  Character set recoding and char/wchar_t conversions.
       6             :  * Author:   Andrey Kiselev, dron@ak4719.spb.edu
       7             :  *
       8             :  **********************************************************************
       9             :  * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
      10             :  * Copyright (c) 2008, Frank Warmerdam
      11             :  * Copyright (c) 2011-2014, Even Rouault <even dot rouault at spatialys.com>
      12             :  *
      13             :  * Permission to use, copy, modify, and distribute this software for any
      14             :  * purpose with or without fee is hereby granted, provided that the above
      15             :  * copyright notice and this permission notice appear in all copies.
      16             :  *
      17             :  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
      18             :  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
      19             :  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
      20             :  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
      21             :  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
      22             :  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
      23             :  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
      24             :  **********************************************************************/
      25             : 
      26             : #include "cpl_port.h"
      27             : #include "cpl_string.h"
      28             : 
      29             : #include <cstring>
      30             : 
      31             : #include "cpl_conv.h"
      32             : #include "cpl_character_sets.h"
      33             : 
      34             : #include "utf8.h"
      35             : 
      36             : #ifdef CPL_RECODE_ICONV  // iconv-backed converters are declared only in iconv-enabled builds
      37             : extern void CPLClearRecodeIconvWarningFlags();
      38             : extern char *CPLRecodeIconv(const char *, const char *,
      39             :                             const char *) CPL_RETURNS_NONNULL;
      40             : extern char *CPLRecodeFromWCharIconv(const wchar_t *, const char *,
      41             :                                      const char *);
      42             : extern wchar_t *CPLRecodeToWCharIconv(const char *, const char *, const char *);
      43             : #endif  // CPL_RECODE_ICONV
      44             : // "Stub" converters: declared unconditionally. Callers below pass (text, source encoding, destination encoding).
      45             : extern void CPLClearRecodeStubWarningFlags();
      46             : extern char *CPLRecodeStub(const char *, const char *,
      47             :                            const char *) CPL_RETURNS_NONNULL;
      48             : extern char *CPLRecodeFromWCharStub(const wchar_t *, const char *,
      49             :                                     const char *);
      50             : extern wchar_t *CPLRecodeToWCharStub(const char *, const char *, const char *);
      51             : extern int CPLIsUTF8Stub(const char *, int);
      52             : 
      53             : /************************************************************************/
      54             : /*                             CPLRecode()                              */
      55             : /************************************************************************/
      56             : 
      57             : /**
      58             :  * Convert a string from a source encoding to a destination encoding.
      59             :  *
      60             :  * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
      61             :  * and CPL_ENC_ISO8859_1. The following conversions are always available:
      62             :  * <ul>
      63             :  *  <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (the input is
      64             :  *  simply duplicated, as it is when both encodings are equal)</li>
      65             :  *  <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
      66             :  *  <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
      67             :  * </ul>
      68             :  * Whether other pairs work depends on the build (stub tables or iconv()).
      69             :  * If an error occurs, an error may or may not be posted with CPLError().
      70             :  *
      71             :  * @param pszSource a NUL-terminated string.
      72             :  * @param pszSrcEncoding the source encoding.
      73             :  * @param pszDstEncoding the destination encoding.
      74             :  *
      75             :  * @return a NUL-terminated string which should be freed with CPLFree().
      76             :  *
      77             :  * @since GDAL 1.6.0
      78             :  */
      79             : 
      80      783245 : char CPL_DLL *CPLRecode(const char *pszSource, const char *pszSrcEncoding,
      81             :                         const char *pszDstEncoding)
      82             : 
      83             : {
      84             :     /* -------------------------------------------------------------------- */
      85             :     /*      Short cuts: nothing to convert, return a copy of the input.     */
      86             :     /* -------------------------------------------------------------------- */
      87      783245 :     if (EQUAL(pszSrcEncoding, pszDstEncoding))
      88          96 :         return CPLStrdup(pszSource);
      89             : 
      90      783149 :     if (EQUAL(pszSrcEncoding, CPL_ENC_ASCII) &&
      91           0 :         (EQUAL(pszDstEncoding, CPL_ENC_UTF8) ||
      92           0 :          EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1)))
      93           0 :         return CPLStrdup(pszSource);
      94             : 
      95             :     // Hard coded CPxxx/ISO-8859-x to UTF-8 tables: converted by the stub
      96     1516800 :     if (EQUAL(pszDstEncoding, CPL_ENC_UTF8) &&
      97      733647 :         CPLGetConversionTableToUTF8(pszSrcEncoding))
      98             :     {
      99       21020 :         return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
     100             :     }
     101             : 
     102             : #ifdef CPL_RECODE_ICONV
     103             :     /* -------------------------------------------------------------------- */
     104             :     /*      CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8                               */
     105             :     /*      and CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1 conversions are handled   */
     106             :     /*      very well by the stub implementation which is faster than the   */
     107             :     /*      iconv() route. Use a stub for these two ones and iconv()        */
     108             :     /*      everything else.                                                */
     109             :     /* -------------------------------------------------------------------- */
     110      762129 :     if ((EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1) &&
     111      712490 :          EQUAL(pszDstEncoding, CPL_ENC_UTF8)) ||
     112       49639 :         (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) &&
     113       49502 :          EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1)))
     114             :     {
     115      761846 :         return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
     116             :     }
     117             : #ifdef _WIN32  // "CP_ACP" / "CP_OEMCP" <-> UTF-8 also go through the stub on Windows
     118             :     else if (((EQUAL(pszSrcEncoding, "CP_ACP") ||
     119             :                EQUAL(pszSrcEncoding, "CP_OEMCP")) &&
     120             :               EQUAL(pszDstEncoding, CPL_ENC_UTF8)) ||
     121             :              (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) &&
     122             :               (EQUAL(pszDstEncoding, "CP_ACP") ||
     123             :                EQUAL(pszDstEncoding, "CP_OEMCP"))))
     124             :     {
     125             :         return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
     126             :     }
     127             : #endif  // _WIN32
     128             :     else
     129             :     {
     130         283 :         return CPLRecodeIconv(pszSource, pszSrcEncoding, pszDstEncoding);
     131             :     }
     132             : #else   // !CPL_RECODE_ICONV: the stub is the only implementation
     133             :     return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
     134             : #endif  // CPL_RECODE_ICONV
     135             : }
     136             : 
     137             : /************************************************************************/
     138             : /*                         CPLRecodeFromWChar()                         */
     139             : /************************************************************************/
     140             : 
     141             : /**
     142             :  * Convert a wchar_t string to a multi-byte string (typically UTF-8).
     143             :  *
     144             :  * Convert a wchar_t string into a multibyte string.  The only
     145             :  * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
     146             :  * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
     147             :  * and CPL_ENC_ISO8859_1.  In some cases (i.e. using iconv()) other encodings
     148             :  * may also be supported.
     149             :  *
     150             :  * Note that the wchar_t type varies in size on different systems. On
     151             :  * win32 it is normally 2 bytes, and on UNIX 4 bytes.
     152             :  *
     153             :  * If an error occurs, an error may or may not be posted with CPLError().
     154             :  *
     155             :  * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
     156             :  * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
     157             :  * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
     158             :  *
     159             :  * @return a zero terminated multi-byte string which should be freed with
     160             :  * CPLFree(), or NULL if an error occurs.
     161             :  *
     162             :  * @since GDAL 1.6.0
     163             :  */
     164             : 
     165      111145 : char CPL_DLL *CPLRecodeFromWChar(const wchar_t *pwszSource,
     166             :                                  const char *pszSrcEncoding,
     167             :                                  const char *pszDstEncoding)
     168             : 
     169             : {
     170             : #ifdef CPL_RECODE_ICONV
     171             :     /* -------------------------------------------------------------------- */
     172             :     /*      Conversions from CPL_ENC_UCS2 (or "WCHAR_T")                    */
     173             :     /*      to CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well   */
     174             :     /*      handled by the stub implementation.                             */
     175             :     /* -------------------------------------------------------------------- */
     176      111145 :     if ((EQUAL(pszSrcEncoding, CPL_ENC_UCS2) ||
     177        1360 :          EQUAL(pszSrcEncoding, "WCHAR_T")) &&
     178      111144 :         (EQUAL(pszDstEncoding, CPL_ENC_UTF8) ||
     179           0 :          EQUAL(pszDstEncoding, CPL_ENC_ASCII) ||
     180           0 :          EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1)))
     181             :     {
     182      111144 :         return CPLRecodeFromWCharStub(pwszSource, pszSrcEncoding,
     183      111144 :                                       pszDstEncoding);
     184             :     }
     185             : 
     186           1 :     return CPLRecodeFromWCharIconv(pwszSource, pszSrcEncoding, pszDstEncoding);  // every other pair goes to iconv
     187             : 
     188             : #else   // !CPL_RECODE_ICONV: the stub is the only implementation
     189             :     return CPLRecodeFromWCharStub(pwszSource, pszSrcEncoding, pszDstEncoding);
     190             : #endif  // CPL_RECODE_ICONV
     191             : }
     192             : 
     193             : /************************************************************************/
     194             : /*                          CPLRecodeToWChar()                          */
     195             : /************************************************************************/
     196             : 
     197             : /**
     198             :  * Convert a multi-byte string (typically UTF-8) to a wchar_t string.
     199             :  *
     200             :  * Convert an 8-bit, multi-byte per character input string into a wide
     201             :  * character (wchar_t) string.  The only guaranteed supported source encodings
     202             :  * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1 (LATIN1).  The only
     203             :  * guaranteed supported destination encoding is CPL_ENC_UCS2.  Other source
     204             :  * and destination encodings may be supported depending on the underlying
     205             :  * implementation.
     206             :  *
     207             :  * Note that the wchar_t type varies in size on different systems. On
     208             :  * win32 it is normally 2 bytes, and on UNIX 4 bytes.
     209             :  *
     210             :  * If an error occurs, an error may or may not be posted with CPLError().
     211             :  *
     212             :  * @param pszSource input multi-byte character string.
     213             :  * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
     214             :  * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
     215             :  *
     216             :  * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
     217             :  * NULL on error.
     218             :  *
     219             :  * @since GDAL 1.6.0
     220             :  */
     221             : 
     222       52345 : wchar_t CPL_DLL *CPLRecodeToWChar(const char *pszSource,
     223             :                                   const char *pszSrcEncoding,
     224             :                                   const char *pszDstEncoding)
     225             : 
     226             : {
     227             : #ifdef CPL_RECODE_ICONV
     228             :     /* -------------------------------------------------------------------- */
     229             :     /*      Conversions to CPL_ENC_UCS2 (or "WCHAR_T")                      */
     230             :     /*      from CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */
     231             :     /*      handled by the stub implementation.                             */
     232             :     /* -------------------------------------------------------------------- */
     233       52345 :     if ((EQUAL(pszDstEncoding, CPL_ENC_UCS2) ||
     234           0 :          EQUAL(pszDstEncoding, "WCHAR_T")) &&
     235       52345 :         (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) ||
     236           0 :          EQUAL(pszSrcEncoding, CPL_ENC_ASCII) ||
     237           0 :          EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1)))
     238             :     {
     239       52345 :         return CPLRecodeToWCharStub(pszSource, pszSrcEncoding, pszDstEncoding);
     240             :     }
     241             : 
     242           0 :     return CPLRecodeToWCharIconv(pszSource, pszSrcEncoding, pszDstEncoding);  // every other pair goes to iconv
     243             : 
     244             : #else   // !CPL_RECODE_ICONV: the stub is the only implementation
     245             :     return CPLRecodeToWCharStub(pszSource, pszSrcEncoding, pszDstEncoding);
     246             : #endif  // CPL_RECODE_ICONV
     247             : }
     248             : 
     249             : /************************************************************************/
     250             : /*                               CPLIsASCII()                           */
     251             : /************************************************************************/
     252             : 
     253             : /**
     254             :  * Test whether a string contains only ASCII bytes (values 0 to 127).
     255             :  *
     256             :  * @param pabyData input string to test
     257             :  * @param nLen length of the input string in bytes, or static_cast<size_t>(-1)
     258             :  *             to have it computed with strlen() (NUL-terminated input required).
     259             :  * @return true if no byte is above 127 (also for an empty string), else false.
     260             :  *
     261             :  * @since GDAL 3.6.0
     262             :  */
     263         750 : bool CPLIsASCII(const char *pabyData, size_t nLen)
     264             : {
     265         750 :     if (nLen == static_cast<size_t>(-1))  // sentinel: length not supplied by the caller
     266          21 :         nLen = strlen(pabyData);
     267       12791 :     for (size_t i = 0; i < nLen; ++i)
     268             :     {
     269       12043 :         if (static_cast<unsigned char>(pabyData[i]) > 127)  // compare as unsigned so the test does not depend on char signedness
     270           2 :             return false;
     271             :     }
     272         748 :     return true;
     273             : }
     274             : 
     275             : /************************************************************************/
     276             : /*                          CPLForceToASCII()                           */
     277             : /************************************************************************/
     278             : 
     279             : /**
     280             :  * Return a new string that is made only of ASCII characters. Each byte of
     281             :  * the input above 127 is replaced, one for one, by the provided replacement
     282             :  * character; all other bytes are copied unchanged.
     283             :  *
     284             :  * This function does not make any assumption on the encoding of the input
     285             :  * string (except it must be NUL-terminated if nLen is negative, or have at
     286             :  * least nLen bytes otherwise). CPLUTF8ForceToASCII() can be used instead when
     287             :  * the input string is known to be UTF-8 encoded.
     288             :  *
     289             :  * @param pabyData input string to convert
     290             :  * @param nLen length of the input string, or a negative value (e.g. -1) if the
     291             :  *             function must compute it, in which case it must be NUL-terminated.
     292             :  *
     293             :  * @param chReplacementChar character which will be used when the input stream
     294             :  *                          contains a non ASCII character. Must be valid ASCII!
     295             :  *
     296             :  * @return a new string that must be freed with CPLFree().
     297             :  *
     298             :  * @since GDAL 1.7.0
     299             :  */
     300           5 : char *CPLForceToASCII(const char *pabyData, int nLen, char chReplacementChar)
     301             : {
     302           5 :     const size_t nRealLen =
     303           5 :         (nLen >= 0) ? static_cast<size_t>(nLen) : strlen(pabyData);  // any negative nLen means "measure it"
     304           5 :     char *pszOutputString = static_cast<char *>(CPLMalloc(nRealLen + 1));  // +1 for the terminator
     305           5 :     const char *pszPtr = pabyData;
     306           5 :     const char *pszEnd = pabyData + nRealLen;
     307           5 :     size_t i = 0;
     308          19 :     while (pszPtr != pszEnd)
     309             :     {
     310          14 :         if (*reinterpret_cast<const unsigned char *>(pszPtr) > 127)  // non-ASCII byte: substitute
     311             :         {
     312           3 :             pszOutputString[i] = chReplacementChar;
     313           3 :             ++pszPtr;
     314           3 :             ++i;
     315             :         }
     316             :         else  // ASCII byte: copy as is
     317             :         {
     318          11 :             pszOutputString[i] = *pszPtr;
     319          11 :             ++pszPtr;
     320          11 :             ++i;
     321             :         }
     322             :     }
     323           5 :     pszOutputString[i] = '\0';  // i == nRealLen here: one output byte per input byte
     324           5 :     return pszOutputString;
     325             : }
     326             : 
     327             : /************************************************************************/
     328             : /*                       CPLUTF8ForceToASCII()                          */
     329             : /************************************************************************/
     330             : 
     331             : /**
     332             :  * Return a new string that is made only of ASCII characters. If non-ASCII
     333             :  * characters are found in the input string, for which an "equivalent" ASCII
     334             :  * character is not found, they will be replaced by the provided replacement
     335             :  * character.
     336             :  *
     337             :  * This function is aware of https://en.wikipedia.org/wiki/Latin-1_Supplement
     338             :  * and https://en.wikipedia.org/wiki/Latin_Extended-A to provide sensible
     339             :  * replacements for accented characters.
     340             : 
     341             :  * @param pszStr NUL-terminated UTF-8 string.
     342             :  * @param chReplacementChar character which will be used when the input stream
     343             :  *                          contains a non ASCII character that cannot be
     344             :  *                          substituted with an equivalent ASCII character.
     345             :  *                          Must be valid ASCII!
     346             :  *
     347             :  * @return a new string that must be freed with CPLFree().
     348             :  *
     349             :  * @since GDAL 3.9
     350             :  */
     351          16 : char *CPLUTF8ForceToASCII(const char *pszStr, char chReplacementChar)
     352             : {
     353             :     static const struct
     354             :     {
     355             :         short nCodePoint;
     356             :         char chFirst;
     357             :         char chSecond;
     358             :     } aLatinCharacters[] = {
     359             :         // https://en.wikipedia.org/wiki/Latin-1_Supplement
     360             :         {0xC0, 'A', 0},    // Latin Capital Letter A with grave
     361             :         {0xC1, 'A', 0},    // Latin Capital letter A with acute
     362             :         {0xC2, 'A', 0},    // Latin Capital letter A with circumflex
     363             :         {0xC3, 'A', 0},    // Latin Capital letter A with tilde
     364             :         {0xC4, 'A', 0},    // Latin Capital letter A with diaeresis
     365             :         {0xC5, 'A', 0},    // Latin Capital letter A with ring above
     366             :         {0xC6, 'A', 'E'},  // Latin Capital letter AE
     367             :         {0xC7, 'C', 0},    // Latin Capital letter C with cedilla
     368             :         {0xC8, 'E', 0},    // Latin Capital letter E with grave
     369             :         {0xC9, 'E', 0},    // Latin Capital letter E with acute
     370             :         {0xCA, 'E', 0},    // Latin Capital letter E with circumflex
     371             :         {0xCB, 'E', 0},    // Latin Capital letter E with diaeresis
     372             :         {0xCC, 'I', 0},    // Latin Capital letter I with grave
     373             :         {0xCD, 'I', 0},    // Latin Capital letter I with acute
     374             :         {0xCE, 'I', 0},    // Latin Capital letter I with circumflex
     375             :         {0xCF, 'I', 0},    // Latin Capital letter I with diaeresis
     376             :         // { 0xD0, '?', 0 }, // Latin Capital letter Eth
     377             :         {0xD1, 'N', 0},  // Latin Capital letter N with tilde
     378             :         {0xD2, 'O', 0},  // Latin Capital letter O with grave
     379             :         {0xD3, 'O', 0},  // Latin Capital letter O with acute
     380             :         {0xD4, 'O', 0},  // Latin Capital letter O with circumflex
     381             :         {0xD5, 'O', 0},  // Latin Capital letter O with tilde
     382             :         {0xD6, 'O', 0},  // Latin Capital letter O with diaeresis
     383             :         {0xD8, 'O', 0},  // Latin Capital letter O with stroke
     384             :         {0xD9, 'U', 0},  // Latin Capital letter U with grave
     385             :         {0xDA, 'U', 0},  // Latin Capital letter U with acute
     386             :         {0xDB, 'U', 0},  // Latin Capital Letter U with circumflex
     387             :         {0xDC, 'U', 0},  // Latin Capital Letter U with diaeresis
     388             :         {0xDD, 'Y', 0},  // Latin Capital Letter Y with acute
     389             :         // { 0xDE, '?', 0 }, // Latin Capital Letter Thorn
     390             :         {0xDF, 'S', 'S'},  // Latin Small Letter sharp S
     391             :         {0xE0, 'a', 0},    // Latin Small Letter A with grave
     392             :         {0xE1, 'a', 0},    // Latin Small Letter A with acute
     393             :         {0xE2, 'a', 0},    // Latin Small Letter A with circumflex
     394             :         {0xE3, 'a', 0},    // Latin Small Letter A with tilde
     395             :         {0xE4, 'a', 0},    // Latin Small Letter A with diaeresis
     396             :         {0xE5, 'a', 0},    // Latin Small Letter A with ring above
     397             :         {0xE6, 'a', 'e'},  // Latin Small Letter AE
     398             :         {0xE7, 'c', 0},    // Latin Small Letter C with cedilla
     399             :         {0xE8, 'e', 0},    // Latin Small Letter E with grave
     400             :         {0xE9, 'e', 0},    // Latin Small Letter E with acute
     401             :         {0xEA, 'e', 0},    // Latin Small Letter E with circumflex
     402             :         {0xEB, 'e', 0},    // Latin Small Letter E with diaeresis
     403             :         {0xEC, 'i', 0},    // Latin Small Letter I with grave
     404             :         {0xED, 'i', 0},    // Latin Small Letter I with acute
     405             :         {0xEE, 'i', 0},    // Latin Small Letter I with circumflex
     406             :         {0xEF, 'i', 0},    // Latin Small Letter I with diaeresis
     407             :         // { 0xF0, '?', 0 }, // Latin Small Letter Eth
     408             :         {0xF1, 'n', 0},  // Latin Small Letter N with tilde
     409             :         {0xF2, 'o', 0},  // Latin Small Letter O with grave
     410             :         {0xF3, 'o', 0},  // Latin Small Letter O with acute
     411             :         {0xF4, 'o', 0},  // Latin Small Letter O with circumflex
     412             :         {0xF5, 'o', 0},  // Latin Small Letter O with tilde
     413             :         {0xF6, 'o', 0},  // Latin Small Letter O with diaeresis
     414             :         {0xF8, 'o', 0},  // Latin Small Letter O with stroke
     415             :         {0xF9, 'u', 0},  // Latin Small Letter U with grave
     416             :         {0xFA, 'u', 0},  // Latin Small Letter U with acute
     417             :         {0xFB, 'u', 0},  // Latin Small Letter U with circumflex
     418             :         {0xFC, 'u', 0},  // Latin Small Letter U with diaeresis
     419             :         {0xFD, 'y', 0},  // Latin Small Letter Y with acute
     420             :         // { 0xFE, '?', 0 }, // Latin Small Letter Thorn
     421             :         {0xFF, 'u', 0},  // Latin Small Letter Y with diaeresis
     422             : 
     423             :         // https://en.wikipedia.org/wiki/Latin_Extended-A
     424             :         {
     425             :             0x0100,
     426             :             'A',
     427             :             0,
     428             :         },  // Latin Capital letter A with macron
     429             :         {
     430             :             0x0101,
     431             :             'a',
     432             :             0,
     433             :         },  // Latin Small letter A with macron
     434             :         {
     435             :             0x0102,
     436             :             'A',
     437             :             0,
     438             :         },  // Latin Capital letter A with breve
     439             :         {
     440             :             0x0103,
     441             :             'a',
     442             :             0,
     443             :         },  // Latin Small letter A with breve
     444             :         {
     445             :             0x0104,
     446             :             'A',
     447             :             0,
     448             :         },  // Latin Capital letter A with ogonek
     449             :         {
     450             :             0x0105,
     451             :             'a',
     452             :             0,
     453             :         },  // Latin Small letter A with ogonek
     454             :         {
     455             :             0x0106,
     456             :             'C',
     457             :             0,
     458             :         },  // Latin Capital letter C with acute
     459             :         {
     460             :             0x0107,
     461             :             'c',
     462             :             0,
     463             :         },  // Latin Small letter C with acute
     464             :         {
     465             :             0x0108,
     466             :             'C',
     467             :             0,
     468             :         },  // Latin Capital letter C with circumflex
     469             :         {
     470             :             0x0109,
     471             :             'c',
     472             :             0,
     473             :         },  // Latin Small letter C with circumflex
     474             :         {
     475             :             0x010A,
     476             :             'C',
     477             :             0,
     478             :         },  // Latin Capital letter C with dot above
     479             :         {
     480             :             0x010B,
     481             :             'c',
     482             :             0,
     483             :         },  // Latin Small letter C with dot above
     484             :         {
     485             :             0x010C,
     486             :             'C',
     487             :             0,
     488             :         },  // Latin Capital letter C with caron
     489             :         {
     490             :             0x010D,
     491             :             'c',
     492             :             0,
     493             :         },  // Latin Small letter C with caron
     494             :         {
     495             :             0x010E,
     496             :             'D',
     497             :             0,
     498             :         },  // Latin Capital letter D with caron
     499             :         {
     500             :             0x010F,
     501             :             'd',
     502             :             0,
     503             :         },  // Latin Small letter D with caron
     504             :         {
     505             :             0x0110,
     506             :             'D',
     507             :             0,
     508             :         },  // Latin Capital letter D with stroke
     509             :         {
     510             :             0x0111,
     511             :             'd',
     512             :             0,
     513             :         },  // Latin Small letter D with stroke
     514             :         {
     515             :             0x0112,
     516             :             'E',
     517             :             0,
     518             :         },  // Latin Capital letter E with macron
     519             :         {
     520             :             0x0113,
     521             :             'e',
     522             :             0,
     523             :         },  // Latin Small letter E with macron
     524             :         {
     525             :             0x0114,
     526             :             'E',
     527             :             0,
     528             :         },  // Latin Capital letter E with breve
     529             :         {
     530             :             0x0115,
     531             :             'e',
     532             :             0,
     533             :         },  // Latin Small letter E with breve
     534             :         {
     535             :             0x0116,
     536             :             'E',
     537             :             0,
     538             :         },  // Latin Capital letter E with dot above
     539             :         {
     540             :             0x0117,
     541             :             'e',
     542             :             0,
     543             :         },  // Latin Small letter E with dot above
     544             :         {
     545             :             0x0118,
     546             :             'E',
     547             :             0,
     548             :         },  // Latin Capital letter E with ogonek
     549             :         {
     550             :             0x0119,
     551             :             'e',
     552             :             0,
     553             :         },  // Latin Small letter E with ogonek
     554             :         {
     555             :             0x011A,
     556             :             'E',
     557             :             0,
     558             :         },  // Latin Capital letter E with caron
     559             :         {
     560             :             0x011B,
     561             :             'e',
     562             :             0,
     563             :         },  // Latin Small letter E with caron
     564             :         {
     565             :             0x011C,
     566             :             'G',
     567             :             0,
     568             :         },  // Latin Capital letter G with circumflex
     569             :         {
     570             :             0x011D,
     571             :             'g',
     572             :             0,
     573             :         },  // Latin Small letter G with circumflex
     574             :         {
     575             :             0x011E,
     576             :             'G',
     577             :             0,
     578             :         },  // Latin Capital letter G with breve
     579             :         {
     580             :             0x011F,
     581             :             'g',
     582             :             0,
     583             :         },  // Latin Small letter G with breve
     584             :         {
     585             :             0x0120,
     586             :             'G',
     587             :             0,
     588             :         },  // Latin Capital letter G with dot above
     589             :         {
     590             :             0x0121,
     591             :             'g',
     592             :             0,
     593             :         },  // Latin Small letter G with dot above
     594             :         {
     595             :             0x0122,
     596             :             'G',
     597             :             0,
     598             :         },  // Latin Capital letter G with cedilla
     599             :         {
     600             :             0x0123,
     601             :             'g',
     602             :             0,
     603             :         },  // Latin Small letter G with cedilla
     604             :         {
     605             :             0x0124,
     606             :             'H',
     607             :             0,
     608             :         },  // Latin Capital letter H with circumflex
     609             :         {
     610             :             0x0125,
     611             :             'h',
     612             :             0,
     613             :         },  // Latin Small letter H with circumflex
     614             :         {
     615             :             0x0126,
     616             :             'H',
     617             :             0,
     618             :         },  // Latin Capital letter H with stroke
     619             :         {
     620             :             0x0127,
     621             :             'h',
     622             :             0,
     623             :         },  // Latin Small letter H with stroke
     624             :         {
     625             :             0x0128,
     626             :             'I',
     627             :             0,
     628             :         },  // Latin Capital letter I with tilde
     629             :         {
     630             :             0x0129,
     631             :             'i',
     632             :             0,
     633             :         },  // Latin Small letter I with tilde
     634             :         {
     635             :             0x012A,
     636             :             'I',
     637             :             0,
     638             :         },  // Latin Capital letter I with macron
     639             :         {
     640             :             0x012B,
     641             :             'i',
     642             :             0,
     643             :         },  // Latin Small letter I with macron
     644             :         {
     645             :             0x012C,
     646             :             'I',
     647             :             0,
     648             :         },  // Latin Capital letter I with breve
     649             :         {
     650             :             0x012D,
     651             :             'i',
     652             :             0,
     653             :         },  // Latin Small letter I with breve
     654             :         {
     655             :             0x012E,
     656             :             'I',
     657             :             0,
     658             :         },  // Latin Capital letter I with ogonek
     659             :         {
     660             :             0x012F,
     661             :             'i',
     662             :             0,
     663             :         },  // Latin Small letter I with ogonek
     664             :         {
     665             :             0x0130,
     666             :             'I',
     667             :             0,
     668             :         },  // Latin Capital letter I with dot above
     669             :         {
     670             :             0x0131,
     671             :             'i',
     672             :             0,
     673             :         },  // Latin Small letter dotless I
     674             :         {
     675             :             0x0132,
     676             :             'I',
     677             :             'J',
     678             :         },  // Latin Capital Ligature IJ
     679             :         {
     680             :             0x0133,
     681             :             'i',
     682             :             'j',
     683             :         },  // Latin Small Ligature IJ
     684             :         {
     685             :             0x0134,
     686             :             'J',
     687             :             0,
     688             :         },  // Latin Capital letter J with circumflex
     689             :         {
     690             :             0x0135,
     691             :             'j',
     692             :             0,
     693             :         },  // Latin Small letter J with circumflex
     694             :         {
     695             :             0x0136,
     696             :             'K',
     697             :             0,
     698             :         },  // Latin Capital letter K with cedilla
     699             :         {
     700             :             0x0137,
     701             :             'k',
     702             :             0,
     703             :         },  // Latin Small letter K with cedilla
     704             :         {
     705             :             0x0138,
     706             :             'k',
     707             :             0,
     708             :         },  // Latin Small letter Kra
     709             :         {
     710             :             0x0139,
     711             :             'L',
     712             :             0,
     713             :         },  // Latin Capital letter L with acute
     714             :         {
     715             :             0x013A,
     716             :             'l',
     717             :             0,
     718             :         },  // Latin Small letter L with acute
     719             :         {
     720             :             0x013B,
     721             :             'L',
     722             :             0,
     723             :         },  // Latin Capital letter L with cedilla
     724             :         {
     725             :             0x013C,
     726             :             'l',
     727             :             0,
     728             :         },  // Latin Small letter L with cedilla
     729             :         {
     730             :             0x013D,
     731             :             'L',
     732             :             0,
     733             :         },  // Latin Capital letter L with caron
     734             :         {
     735             :             0x013E,
     736             :             'l',
     737             :             0,
     738             :         },  // Latin Small letter L with caron
     739             :         {
     740             :             0x013F,
     741             :             'L',
     742             :             0,
     743             :         },  // Latin Capital letter L with middle dot
     744             :         {
     745             :             0x0140,
     746             :             'l',
     747             :             0,
     748             :         },  // Latin Small letter L with middle dot
     749             :         {
     750             :             0x0141,
     751             :             'L',
     752             :             0,
     753             :         },  // Latin Capital letter L with stroke
     754             :         {
     755             :             0x0142,
     756             :             'l',
     757             :             0,
     758             :         },  // Latin Small letter L with stroke
     759             :         {
     760             :             0x0143,
     761             :             'N',
     762             :             0,
     763             :         },  // Latin Capital letter N with acute
     764             :         {
     765             :             0x0144,
     766             :             'n',
     767             :             0,
     768             :         },  // Latin Small letter N with acute
     769             :         {
     770             :             0x0145,
     771             :             'N',
     772             :             0,
     773             :         },  // Latin Capital letter N with cedilla
     774             :         {
     775             :             0x0146,
     776             :             'n',
     777             :             0,
     778             :         },  // Latin Small letter N with cedilla
     779             :         {
     780             :             0x0147,
     781             :             'N',
     782             :             0,
     783             :         },  // Latin Capital letter N with caron
     784             :         {
     785             :             0x0148,
     786             :             'n',
     787             :             0,
     788             :         },  // Latin Small letter N with caron
     789             :         // { 0x014A , '?' , 0, }, // Latin Capital letter Eng
     790             :         // { 0x014B , '?' , 0, }, // Latin Small letter Eng
     791             :         {
     792             :             0x014C,
     793             :             'O',
     794             :             0,
     795             :         },  // Latin Capital letter O with macron
     796             :         {
     797             :             0x014D,
     798             :             'o',
     799             :             0,
     800             :         },  // Latin Small letter O with macron
     801             :         {
     802             :             0x014E,
     803             :             'O',
     804             :             0,
     805             :         },  // Latin Capital letter O with breve
     806             :         {
     807             :             0x014F,
     808             :             'o',
     809             :             0,
     810             :         },  // Latin Small letter O with breve
     811             :         {
     812             :             0x0150,
     813             :             'O',
     814             :             0,
     815             :         },  // Latin Capital Letter O with double acute
     816             :         {
     817             :             0x0151,
     818             :             'o',
     819             :             0,
     820             :         },  // Latin Small Letter O with double acute
     821             :         {
     822             :             0x0152,
     823             :             'O',
     824             :             'E',
     825             :         },  // Latin Capital Ligature OE
     826             :         {
     827             :             0x0153,
     828             :             'o',
     829             :             'e',
     830             :         },  // Latin Small Ligature OE
     831             :         {
     832             :             0x0154,
     833             :             'R',
     834             :             0,
     835             :         },  // Latin Capital letter R with acute
     836             :         {
     837             :             0x0155,
     838             :             'r',
     839             :             0,
     840             :         },  // Latin Small letter R with acute
     841             :         {
     842             :             0x0156,
     843             :             'R',
     844             :             0,
     845             :         },  // Latin Capital letter R with cedilla
     846             :         {
     847             :             0x0157,
     848             :             'r',
     849             :             0,
     850             :         },  // Latin Small letter R with cedilla
     851             :         {
     852             :             0x0158,
     853             :             'R',
     854             :             0,
     855             :         },  // Latin Capital letter R with caron
     856             :         {
     857             :             0x0159,
     858             :             'r',
     859             :             0,
     860             :         },  // Latin Small letter R with caron
     861             :         {
     862             :             0x015A,
     863             :             'S',
     864             :             0,
     865             :         },  // Latin Capital letter S with acute
     866             :         {
     867             :             0x015B,
     868             :             's',
     869             :             0,
     870             :         },  // Latin Small letter S with acute
     871             :         {
     872             :             0x015C,
     873             :             'S',
     874             :             0,
     875             :         },  // Latin Capital letter S with circumflex
     876             :         {
     877             :             0x015D,
     878             :             's',
     879             :             0,
     880             :         },  // Latin Small letter S with circumflex
     881             :         {
     882             :             0x015E,
     883             :             'S',
     884             :             0,
     885             :         },  // Latin Capital letter S with cedilla
     886             :         {
     887             :             0x015F,
     888             :             's',
     889             :             0,
     890             :         },  // Latin Small letter S with cedilla
     891             :         {
     892             :             0x0160,
     893             :             'S',
     894             :             0,
     895             :         },  // Latin Capital letter S with caron
     896             :         {
     897             :             0x0161,
     898             :             's',
     899             :             0,
     900             :         },  // Latin Small letter S with caron
     901             :         {
     902             :             0x0162,
     903             :             'T',
     904             :             0,
     905             :         },  // Latin Capital letter T with cedilla
     906             :         {
     907             :             0x0163,
     908             :             't',
     909             :             0,
     910             :         },  // Latin Small letter T with cedilla
     911             :         {
     912             :             0x0164,
     913             :             'T',
     914             :             0,
     915             :         },  // Latin Capital letter T with caron
     916             :         {
     917             :             0x0165,
     918             :             't',
     919             :             0,
     920             :         },  // Latin Small letter T with caron
     921             :         {
     922             :             0x0166,
     923             :             'T',
     924             :             0,
     925             :         },  // Latin Capital letter T with stroke
     926             :         {
     927             :             0x0167,
     928             :             't',
     929             :             0,
     930             :         },  // Latin Small letter T with stroke
     931             :         {
     932             :             0x0168,
     933             :             'U',
     934             :             0,
     935             :         },  // Latin Capital letter U with tilde
     936             :         {
     937             :             0x0169,
     938             :             'u',
     939             :             0,
     940             :         },  // Latin Small letter U with tilde
     941             :         {
     942             :             0x016A,
     943             :             'U',
     944             :             0,
     945             :         },  // Latin Capital letter U with macron
     946             :         {
     947             :             0x016B,
     948             :             'u',
     949             :             0,
     950             :         },  // Latin Small letter U with macron
     951             :         {
     952             :             0x016C,
     953             :             'U',
     954             :             0,
     955             :         },  // Latin Capital letter U with breve
     956             :         {
     957             :             0x016D,
     958             :             'u',
     959             :             0,
     960             :         },  // Latin Small letter U with breve
     961             :         {
     962             :             0x016E,
     963             :             'U',
     964             :             0,
     965             :         },  // Latin Capital letter U with ring above
     966             :         {
     967             :             0x016F,
     968             :             'u',
     969             :             0,
     970             :         },  // Latin Small letter U with ring above
     971             :         {
     972             :             0x0170,
     973             :             'U',
     974             :             0,
     975             :         },  // Latin Capital Letter U with double acute
     976             :         {
     977             :             0x0171,
     978             :             'u',
     979             :             0,
     980             :         },  // Latin Small Letter U with double acute
     981             :         {
     982             :             0x0172,
     983             :             'U',
     984             :             0,
     985             :         },  // Latin Capital letter U with ogonek
     986             :         {
     987             :             0x0173,
     988             :             'u',
     989             :             0,
     990             :         },  // Latin Small letter U with ogonek
     991             :         {
     992             :             0x0174,
     993             :             'W',
     994             :             0,
     995             :         },  // Latin Capital letter W with circumflex
     996             :         {
     997             :             0x0175,
     998             :             'w',
     999             :             0,
    1000             :         },  // Latin Small letter W with circumflex
    1001             :         {
    1002             :             0x0176,
    1003             :             'Y',
    1004             :             0,
    1005             :         },  // Latin Capital letter Y with circumflex
    1006             :         {
    1007             :             0x0177,
    1008             :             'y',
    1009             :             0,
    1010             :         },  // Latin Small letter Y with circumflex
    1011             :         {
    1012             :             0x0178,
    1013             :             'Y',
    1014             :             0,
    1015             :         },  // Latin Capital letter Y with diaeresis
    1016             :         {
    1017             :             0x0179,
    1018             :             'Z',
    1019             :             0,
    1020             :         },  // Latin Capital letter Z with acute
    1021             :         {
    1022             :             0x017A,
    1023             :             'z',
    1024             :             0,
    1025             :         },  // Latin Small letter Z with acute
    1026             :         {
    1027             :             0x017B,
    1028             :             'Z',
    1029             :             0,
    1030             :         },  // Latin Capital letter Z with dot above
    1031             :         {
    1032             :             0x017C,
    1033             :             'z',
    1034             :             0,
    1035             :         },  // Latin Small letter Z with dot above
    1036             :         {
    1037             :             0x017D,
    1038             :             'Z',
    1039             :             0,
    1040             :         },  // Latin Capital letter Z with caron
    1041             :         {
    1042             :             0x017E,
    1043             :             'z',
    1044             :             0,
    1045             :         },  // Latin Small letter Z with caron
    1046             :     };
    1047             : 
    1048          16 :     const size_t nLen = strlen(pszStr);
    1049          16 :     char *pszOutputString = static_cast<char *>(CPLMalloc(nLen + 1));
    1050          16 :     const char *pszPtr = pszStr;
    1051          16 :     const char *pszEnd = pszStr + nLen;
    1052          16 :     size_t i = 0;
    1053         248 :     while (pszPtr != pszEnd)
    1054             :     {
    1055         233 :         if (*reinterpret_cast<const unsigned char *>(pszPtr) > 127)
    1056             :         {
    1057             :             utf8_int32_t codepoint;
    1058         189 :             if (pszPtr + utf8codepointcalcsize(
    1059         189 :                              reinterpret_cast<const utf8_int8_t *>(pszPtr)) >
    1060             :                 pszEnd)
    1061           1 :                 break;
    1062         188 :             auto pszNext = reinterpret_cast<const char *>(utf8codepoint(
    1063             :                 reinterpret_cast<const utf8_int8_t *>(pszPtr), &codepoint));
    1064         188 :             char ch = chReplacementChar;
    1065       17075 :             for (const auto &latin1char : aLatinCharacters)
    1066             :             {
    1067       17073 :                 if (codepoint == latin1char.nCodePoint)
    1068             :                 {
    1069         186 :                     pszOutputString[i] = latin1char.chFirst;
    1070         186 :                     ++i;
    1071         186 :                     if (latin1char.chSecond)
    1072             :                     {
    1073           7 :                         pszOutputString[i] = latin1char.chSecond;
    1074           7 :                         ++i;
    1075             :                     }
    1076         186 :                     ch = 0;
    1077         186 :                     break;
    1078             :                 }
    1079             :             }
    1080         188 :             if (ch)
    1081             :             {
    1082           2 :                 pszOutputString[i] = ch;
    1083           2 :                 ++i;
    1084             :             }
    1085         188 :             pszPtr = pszNext;
    1086             :         }
    1087             :         else
    1088             :         {
    1089          44 :             pszOutputString[i] = *pszPtr;
    1090          44 :             ++pszPtr;
    1091          44 :             ++i;
    1092             :         }
    1093             :     }
    1094          16 :     pszOutputString[i] = '\0';
    1095          16 :     return pszOutputString;
    1096             : }
    1097             : 
    1098             : /************************************************************************/
    1099             : /*                        CPLEncodingCharSize()                         */
    1100             : /************************************************************************/
    1101             : 
    1102             : /**
    1103             :  * Return the minimal number of bytes per character for an encoding.
    1104             :  *
    1105             :  * This function returns the size in bytes of the smallest character
    1106             :  * in this encoding.  For fixed-width encodings (ASCII, UCS-2, UCS-4) this
    1107             :  * is straightforward.  For encodings like UTF8 and UTF16, which represent
    1108             :  * some characters as a sequence of atomic character sizes, the function
    1109             :  * still returns the atomic character size (1 for UTF8, 2 for UTF16).
    1110             :  *
    1111             :  * This function will return the correct value for well-known encodings
    1112             :  * with corresponding CPL_ENC_ values.  It may not return the correct value
    1113             :  * for other encodings even if they are supported by the underlying iconv
    1114             :  * or Windows transliteration services: any name not matched below gives -1.
    1115             :  *
    1116             :  * @param pszEncoding the name of the encoding (e.g. CPL_ENC_UTF8, "UTF-16LE").
    1117             :  *
    1118             :  * @return the size of a minimal character in bytes, or -1 if the size is
    1119             :  * unknown.
    1120             :  */
    1121             : 
    1122           1 : int CPLEncodingCharSize(const char *pszEncoding)
    1123             : 
    1124             : {
    1125           1 :     if (EQUAL(pszEncoding, CPL_ENC_UTF8))  // NOTE(review): EQUAL / STARTS_WITH_CI presumably compare case-insensitively - confirm in cpl_port.h
    1126           0 :         return 1;
    1127           1 :     else if (EQUAL(pszEncoding, CPL_ENC_UTF16) ||
    1128           1 :              EQUAL(pszEncoding, "UTF-16LE"))  // explicit little-endian spelling is accepted too
    1129           1 :         return 2;
    1130           0 :     else if (EQUAL(pszEncoding, CPL_ENC_UCS2) || EQUAL(pszEncoding, "UCS-2LE"))
    1131           0 :         return 2;  // UCS-2 / "UCS-2LE": two bytes per character
    1132           0 :     else if (EQUAL(pszEncoding, CPL_ENC_UCS4))
    1133           0 :         return 4;
    1134           0 :     else if (EQUAL(pszEncoding, CPL_ENC_ASCII))
    1135           0 :         return 1;
    1136           0 :     else if (STARTS_WITH_CI(pszEncoding, "ISO-8859-"))
    1137           0 :         return 1;  // any name with the "ISO-8859-" prefix is treated as one byte per character
    1138             : 
    1139           0 :     return -1;  // no branch matched: size unknown
    1140             : }
    1141             : 
    1142             : /************************************************************************/
    1143             : /*                    CPLClearRecodeWarningFlags()                      */
    1144             : /************************************************************************/
    1145             : 
    1146       10404 : void CPLClearRecodeWarningFlags()  // forwards to the iconv (when built in) and stub flag-clearing helpers
    1147             : {
    1148             : #ifdef CPL_RECODE_ICONV
    1149       10404 :     CPLClearRecodeIconvWarningFlags();  // compiled only when CPL_RECODE_ICONV is defined
    1150             : #endif
    1151       10404 :     CPLClearRecodeStubWarningFlags();  // always called, with or without iconv
    1152       10404 : }
    1153             : 
    1154             : /************************************************************************/
    1155             : /*                         CPLStrlenUTF8()                              */
    1156             : /************************************************************************/
    1157             : 
    1158             : /**
    1159             :  * Return the number of UTF-8 characters of a nul-terminated string.
    1160             :  *
    1161             :  * This is different from strlen(), which returns the number of bytes.
    1162             :  *
    1163             :  * @param pszUTF8Str a nul-terminated UTF-8 string (not validated; must not be NULL)
    1164             :  *
    1165             :  * @return the number of UTF-8 characters, i.e. of bytes other than 10xxxxxx ones.
    1166             :  */
    1167             : 
    1168      358245 : int CPLStrlenUTF8(const char *pszUTF8Str)
    1169             : {
    1170      358245 :     int nCharacterCount = 0;
    1171    18609400 :     for (int i = 0; pszUTF8Str[i] != '\0'; ++i)  // NOTE(review): int index and counter overflow once the input exceeds INT_MAX bytes
    1172             :     {
    1173    18251100 :         if ((pszUTF8Str[i] & 0xc0) != 0x80)  // top bits are not 10: not a continuation byte, so it starts a character
    1174    18251100 :             ++nCharacterCount;
    1175             :     }
    1176      358245 :     return nCharacterCount;
    1177             : }
    1178             : 
    1179             : /************************************************************************/
    1180             : /*                           CPLCanRecode()                             */
    1181             : /************************************************************************/
    1182             : 
    1183             : /**
    1184             :  * Checks if a string can be recoded from one encoding to another by trying it.
    1185             :  *
    1186             :  * @param pszTestStr a NUL-terminated string to attempt to recode.
    1187             :  * @param pszSrcEncoding the source encoding.
    1188             :  * @param pszDstEncoding the destination encoding.
    1189             :  *
    1190             :  * @return TRUE if CPLRecode() gave a non-null result and no error was recorded, else FALSE.
    1191             :  *
    1192             :  * @since GDAL 3.1.0
    1193             :  */
    1194        5586 : int CPLCanRecode(const char *pszTestStr, const char *pszSrcEncoding,
    1195             :                  const char *pszDstEncoding)
    1196             : {
    1197        5586 :     CPLClearRecodeWarningFlags();  // side effect: recode warning flags are cleared
    1198        5586 :     CPLErrorReset();  // side effect: last error is cleared, so the check below sees only this attempt
    1199             : 
    1200        5586 :     CPLPushErrorHandler(CPLQuietErrorHandler);  // CPLQuietErrorHandler is installed only around the attempt
    1201        5586 :     char *pszRec(CPLRecode(pszTestStr, pszSrcEncoding, pszDstEncoding));
    1202        5586 :     CPLPopErrorHandler();
    1203             : 
    1204        5586 :     if (pszRec == nullptr)
    1205             :     {
    1206           0 :         return FALSE;
    1207             :     }
    1208             : 
    1209        5586 :     CPLFree(pszRec);  // only the outcome matters; the recoded copy is discarded
    1210             : 
    1211        5586 :     if (CPLGetLastErrorType() != 0)  // a non-zero last-error type left by the attempt counts as failure
    1212             :     {
    1213           1 :         return FALSE;
    1214             :     }
    1215             : 
    1216        5585 :     return TRUE;
    1217             : }

Generated by: LCOV version 1.14