LCOV - code coverage report
Current view: top level - port - cpl_recode_stub.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 208 314 66.2 %
Date: 2025-10-22 13:51:22 Functions: 11 11 100.0 %

          Line data    Source code
       1             : /**********************************************************************
       2             :  *
       3             :  * Name:     cpl_recode_stub.cpp
       4             :  * Project:  CPL - Common Portability Library
       5             :  * Purpose:  Character set recoding and char/wchar_t conversions, stub
       6             :  *           implementation to be used if iconv() functionality is not
       7             :  *           available.
       8             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       9             :  *
      10             :  * The bulk of this code is derived from the utf.c module from FLTK. It
      11             :  * was originally downloaded from:
      12             :  *    http://svn.easysw.com/public/fltk/fltk/trunk/src/utf.c
      13             :  *
      14             :  **********************************************************************
      15             :  * Copyright (c) 2008, Frank Warmerdam
      16             :  * Copyright 2006 by Bill Spitzak and others.
      17             :  * Copyright (c) 2009-2014, Even Rouault <even dot rouault at spatialys.com>
      18             :  *
      19             :  * Permission to use, copy, modify, and distribute this software for any
      20             :  * purpose with or without fee is hereby granted, provided that the above
      21             :  * copyright notice and this permission notice appear in all copies.
      22             :  *
      23             :  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
      24             :  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
      25             :  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
      26             :  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
      27             :  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
      28             :  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
      29             :  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
      30             :  **********************************************************************/
      31             : 
      32             : #include "cpl_port.h"
      33             : #include "cpl_string.h"
      34             : 
      35             : #include <cstring>
      36             : 
      37             : #include "cpl_conv.h"
      38             : #include "cpl_error.h"
      39             : #include "cpl_character_sets.c"
      40             : 
      41             : static unsigned utf8decode(const char *p, const char *end, int *len);
      42             : static unsigned utf8towc(const char *src, unsigned srclen, wchar_t *dst,
      43             :                          unsigned dstlen);
      44             : static unsigned utf8toa(const char *src, unsigned srclen, char *dst,
      45             :                         unsigned dstlen);
      46             : static unsigned utf8fromwc(char *dst, unsigned dstlen, const wchar_t *src,
      47             :                            unsigned srclen);
      48             : static unsigned utf8froma(char *dst, unsigned dstlen, const char *src,
      49             :                           unsigned srclen);
      50             : static int utf8test(const char *src, unsigned srclen);
      51             : 
      52             : #ifdef _WIN32
      53             : 
      54             : #include <windows.h>
      55             : #include <winnls.h>
      56             : 
      57             : static char *CPLWin32Recode(const char *src, unsigned src_code_page,
      58             :                             unsigned dst_code_page) CPL_RETURNS_NONNULL;
      59             : #endif
      60             : 
      61             : /* used by cpl_recode.cpp */
      62             : extern void CPLClearRecodeStubWarningFlags();
      63             : extern char *CPLRecodeStub(const char *, const char *,
      64             :                            const char *) CPL_RETURNS_NONNULL;
      65             : extern char *CPLRecodeFromWCharStub(const wchar_t *, const char *,
      66             :                                     const char *);
      67             : extern wchar_t *CPLRecodeToWCharStub(const char *, const char *, const char *);
      68             : 
      69             : /************************************************************************/
      70             : /* ==================================================================== */
      71             : /*      Stub Implementation not depending on iconv() or WIN32 API.      */
      72             : /* ==================================================================== */
      73             : /************************************************************************/
      74             : 
      75             : static bool bHaveWarned1 = false;
      76             : static bool bHaveWarned2 = false;
      77             : static bool bHaveWarned3 = false;
      78             : static bool bHaveWarned4 = false;
      79             : static bool bHaveWarned5 = false;
      80             : static bool bHaveWarned6 = false;
      81             : 
      82             : /************************************************************************/
      83             : /*                 CPLClearRecodeStubWarningFlags()                     */
      84             : /************************************************************************/
      85             : 
      86       13601 : void CPLClearRecodeStubWarningFlags()
      87             : {
      88       13601 :     bHaveWarned1 = false;
      89       13601 :     bHaveWarned2 = false;
      90       13601 :     bHaveWarned3 = false;
      91       13601 :     bHaveWarned4 = false;
      92       13601 :     bHaveWarned5 = false;
      93       13601 :     bHaveWarned6 = false;
      94       13601 : }
      95             : 
      96             : /************************************************************************/
      97             : /*                           CPLRecodeStub()                            */
      98             : /************************************************************************/
      99             : 
     100             : /**
     101             :  * Convert a string from a source encoding to a destination encoding.
     102             :  *
     103             :  * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
     104             :  * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
     105             :  * <ul>
     106             :  *  <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in
     107             :  *  fact)</li>
     108             :  *  <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
     109             :  *  <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
     110             :  * </ul>
     111             :  *
     112             :  * If an error occurs, an error may or may not be posted with CPLError().
     113             :  *
     114             :  * @param pszSource a NULL terminated string.
     115             :  * @param pszSrcEncoding the source encoding.
     116             :  * @param pszDstEncoding the destination encoding.
     117             :  *
     118             :  * @return a NULL terminated string which should be freed with CPLFree().
     119             :  */
     120             : 
     121     1243320 : char *CPLRecodeStub(const char *pszSource, const char *pszSrcEncoding,
     122             :                     const char *pszDstEncoding)
     123             : 
     124             : {
     125             :     /* -------------------------------------------------------------------- */
     126             :     /*      If the source or destination is current locale(), we change     */
     127             :     /*      it to ISO8859-1 since our stub implementation does not          */
     128             :     /*      attempt to address locales properly.                            */
     129             :     /* -------------------------------------------------------------------- */
     130             : 
     131     1243320 :     if (pszSrcEncoding[0] == '\0')
     132           0 :         pszSrcEncoding = CPL_ENC_ISO8859_1;
     133             : 
     134     1243320 :     if (pszDstEncoding[0] == '\0')
     135           0 :         pszDstEncoding = CPL_ENC_ISO8859_1;
     136             : 
     137             :     /* -------------------------------------------------------------------- */
     138             :     /*      ISO8859 to UTF8                                                 */
     139             :     /* -------------------------------------------------------------------- */
     140     1243320 :     if (strcmp(pszSrcEncoding, CPL_ENC_ISO8859_1) == 0 &&
     141     1171150 :         strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
     142             :     {
     143     1171150 :         const int nCharCount = static_cast<int>(strlen(pszSource));
     144     1171150 :         char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1));
     145             : 
     146     1171150 :         utf8froma(pszResult, nCharCount * 2 + 1, pszSource, nCharCount);
     147             : 
     148     1171150 :         return pszResult;
     149             :     }
     150             : 
     151             :     /* -------------------------------------------------------------------- */
     152             :     /*      UTF8 to ISO8859                                                 */
     153             :     /* -------------------------------------------------------------------- */
     154       72170 :     if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0 &&
     155       47577 :         strcmp(pszDstEncoding, CPL_ENC_ISO8859_1) == 0)
     156             :     {
     157       47577 :         int nCharCount = static_cast<int>(strlen(pszSource));
     158       47577 :         char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount + 1));
     159             : 
     160       47577 :         utf8toa(pszSource, nCharCount, pszResult, nCharCount + 1);
     161             : 
     162       47577 :         return pszResult;
     163             :     }
     164             : 
     165             :     // A few hard coded CPxxx/ISO-8859-x to UTF-8 tables
     166       24593 :     if (EQUAL(pszDstEncoding, CPL_ENC_UTF8))
     167             :     {
     168       24593 :         const auto pConvTable = CPLGetConversionTableToUTF8(pszSrcEncoding);
     169       24593 :         if (pConvTable)
     170             :         {
     171       24593 :             const auto convTable = *pConvTable;
     172       24593 :             const size_t nCharCount = strlen(pszSource);
     173             :             char *pszResult =
     174       24593 :                 static_cast<char *>(CPLCalloc(1, nCharCount * 3 + 1));
     175       24593 :             size_t iDst = 0;
     176       24593 :             unsigned char *pabyResult =
     177             :                 reinterpret_cast<unsigned char *>(pszResult);
     178      580111 :             for (size_t i = 0; i < nCharCount; ++i)
     179             :             {
     180      555518 :                 const unsigned char nChar =
     181      555518 :                     static_cast<unsigned char>(pszSource[i]);
     182      555518 :                 if (nChar <= 127)
     183             :                 {
     184      522071 :                     pszResult[iDst] = pszSource[i];
     185      522071 :                     ++iDst;
     186             :                 }
     187             :                 else
     188             :                 {
     189       33447 :                     const unsigned char nShiftedChar = nChar - 128;
     190       33447 :                     if (convTable[nShiftedChar][0])
     191             :                     {
     192       33446 :                         pabyResult[iDst] = convTable[nShiftedChar][0];
     193       33446 :                         ++iDst;
     194       33446 :                         CPLAssert(convTable[nShiftedChar][1]);
     195       33446 :                         pabyResult[iDst] = convTable[nShiftedChar][1];
     196       33446 :                         ++iDst;
     197       33446 :                         if (convTable[nShiftedChar][2])
     198             :                         {
     199          13 :                             pabyResult[iDst] = convTable[nShiftedChar][2];
     200          13 :                             ++iDst;
     201             :                         }
     202             :                     }
     203             :                     else
     204             :                     {
     205             :                         // Skip the invalid sequence in the input string.
     206           1 :                         if (!bHaveWarned2)
     207             :                         {
     208           1 :                             bHaveWarned2 = true;
     209           1 :                             CPLError(CE_Warning, CPLE_AppDefined,
     210             :                                      "One or several characters couldn't be "
     211             :                                      "converted correctly from %s to %s. "
     212             :                                      "This warning will not be emitted anymore",
     213             :                                      pszSrcEncoding, pszDstEncoding);
     214             :                         }
     215             :                     }
     216             :                 }
     217             :             }
     218             : 
     219       24593 :             pszResult[iDst] = 0;
     220       24593 :             return pszResult;
     221             :         }
     222             :     }
     223             : 
     224             : #ifdef _WIN32
     225             :     const auto MapEncodingToWindowsCodePage = [](const char *pszEncoding)
     226             :     {
     227             :         // Cf https://learn.microsoft.com/fr-fr/windows/win32/intl/code-page-identifiers
     228             :         if (STARTS_WITH(pszEncoding, "CP"))
     229             :         {
     230             :             const int nCode = atoi(pszEncoding + strlen("CP"));
     231             :             if (nCode > 0)
     232             :                 return nCode;
     233             :             else if (EQUAL(pszEncoding, "CP_OEMCP"))
     234             :                 return CP_OEMCP;
     235             :             else if (EQUAL(pszEncoding, "CP_ACP"))
     236             :                 return CP_ACP;
     237             :         }
     238             :         else if (STARTS_WITH(pszEncoding, "WINDOWS-"))
     239             :         {
     240             :             const int nCode = atoi(pszEncoding + strlen("WINDOWS-"));
     241             :             if (nCode > 0)
     242             :                 return nCode;
     243             :         }
     244             :         else if (STARTS_WITH(pszEncoding, "ISO-8859-"))
     245             :         {
     246             :             const int nCode = atoi(pszEncoding + strlen("ISO-8859-"));
     247             :             if ((nCode >= 1 && nCode <= 9) || nCode == 13 || nCode == 15)
     248             :                 return 28590 + nCode;
     249             :         }
     250             : 
     251             :         // Return a negative value, since CP_ACP = 0
     252             :         return -1;
     253             :     };
     254             : 
     255             :     /* ---------------------------------------------------------------------*/
     256             :     /*     XXX to UTF8                                                      */
     257             :     /* ---------------------------------------------------------------------*/
     258             :     if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
     259             :     {
     260             :         const int nCode = MapEncodingToWindowsCodePage(pszSrcEncoding);
     261             :         if (nCode >= 0)
     262             :         {
     263             :             return CPLWin32Recode(pszSource, nCode, CP_UTF8);
     264             :         }
     265             :     }
     266             : 
     267             :     /* ---------------------------------------------------------------------*/
     268             :     /*      UTF8 to XXX                                                     */
     269             :     /* ---------------------------------------------------------------------*/
     270             :     if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0)
     271             :     {
     272             :         const int nCode = MapEncodingToWindowsCodePage(pszDstEncoding);
     273             :         if (nCode >= 0)
     274             :         {
     275             :             return CPLWin32Recode(pszSource, CP_UTF8, nCode);
     276             :         }
     277             :     }
     278             : #endif
     279             : 
     280             :     /* -------------------------------------------------------------------- */
     281             :     /*      Anything else to UTF-8 is treated as ISO8859-1 to UTF-8 with    */
     282             :     /*      a one-time warning.                                             */
     283             :     /* -------------------------------------------------------------------- */
     284           0 :     if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
     285             :     {
     286           0 :         const int nCharCount = static_cast<int>(strlen(pszSource));
     287           0 :         char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1));
     288             : 
     289           0 :         if (!bHaveWarned1)
     290             :         {
     291           0 :             bHaveWarned1 = true;
     292           0 :             CPLError(CE_Warning, CPLE_AppDefined,
     293             :                      "Recode from %s to UTF-8 not supported, "
     294             :                      "treated as ISO-8859-1 to UTF-8.",
     295             :                      pszSrcEncoding);
     296             :         }
     297             : 
     298           0 :         utf8froma(pszResult, nCharCount * 2 + 1, pszSource, nCharCount);
     299             : 
     300           0 :         return pszResult;
     301             :     }
     302             : 
     303             :     /* -------------------------------------------------------------------- */
     304             :     /*      Everything else is treated as a no-op with a warning.           */
     305             :     /* -------------------------------------------------------------------- */
     306             :     {
     307           0 :         if (!bHaveWarned3)
     308             :         {
     309           0 :             bHaveWarned3 = true;
     310           0 :             CPLError(CE_Warning, CPLE_AppDefined,
     311             :                      "Recode from %s to %s not supported, no change applied.",
     312             :                      pszSrcEncoding, pszDstEncoding);
     313             :         }
     314             : 
     315           0 :         return CPLStrdup(pszSource);
     316             :     }
     317             : }
     318             : 
     319             : /************************************************************************/
     320             : /*                       CPLRecodeFromWCharStub()                       */
     321             : /************************************************************************/
     322             : 
     323             : /**
     324             :  * Convert wchar_t string to UTF-8.
     325             :  *
     326             :  * Convert a wchar_t string into a multibyte utf-8 string.  The only
     327             :  * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
     328             :  * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
     329             :  * and CPL_ENC_ISO8859_1.  In some cases (i.e. using iconv()) other encodings
     330             :  * may also be supported.
     331             :  *
     332             :  * Note that the wchar_t type varies in size on different systems. On
     333             :  * win32 it is normally 2 bytes, and on unix 4 bytes.
     334             :  *
     335             :  * If an error occurs, an error may or may not be posted with CPLError().
     336             :  *
     337             :  * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
     338             :  * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
     339             :  * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
     340             :  *
     341             :  * @return a zero terminated multi-byte string which should be freed with
     342             :  * CPLFree(), or NULL if an error occurs.
     343             :  */
     344             : 
     345      130852 : char *CPLRecodeFromWCharStub(const wchar_t *pwszSource,
     346             :                              const char *pszSrcEncoding,
     347             :                              const char *pszDstEncoding)
     348             : 
     349             : {
     350             :     /* -------------------------------------------------------------------- */
     351             :     /*      We try to avoid changes of character set.  We are just          */
     352             :     /*      providing for unicode to unicode.                               */
     353             :     /* -------------------------------------------------------------------- */
     354      130852 :     if (strcmp(pszSrcEncoding, "WCHAR_T") != 0 &&
     355      129190 :         strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0 &&
     356      129190 :         strcmp(pszSrcEncoding, CPL_ENC_UTF16) != 0 &&
     357      129190 :         strcmp(pszSrcEncoding, CPL_ENC_UCS2) != 0 &&
     358           0 :         strcmp(pszSrcEncoding, CPL_ENC_UCS4) != 0)
     359             :     {
     360           0 :         CPLError(CE_Failure, CPLE_AppDefined,
     361             :                  "Stub recoding implementation does not support "
     362             :                  "CPLRecodeFromWCharStub(...,%s,%s)",
     363             :                  pszSrcEncoding, pszDstEncoding);
     364           0 :         return nullptr;
     365             :     }
     366             : 
     367             :     /* -------------------------------------------------------------------- */
     368             :     /*      What is the source length.                                      */
     369             :     /* -------------------------------------------------------------------- */
     370      130852 :     int nSrcLen = 0;
     371             : 
     372     1926710 :     while (pwszSource[nSrcLen] != 0)
     373     1795860 :         nSrcLen++;
     374             : 
     375             :     /* -------------------------------------------------------------------- */
     376             :     /*      Allocate destination buffer plenty big.                         */
     377             :     /* -------------------------------------------------------------------- */
     378      130852 :     const int nDstBufSize = nSrcLen * 4 + 1;
     379             :     // Nearly worst case.
     380      130852 :     char *pszResult = static_cast<char *>(CPLMalloc(nDstBufSize));
     381             : 
     382      130852 :     if (nSrcLen == 0)
     383             :     {
     384       57926 :         pszResult[0] = '\0';
     385       57926 :         return pszResult;
     386             :     }
     387             : 
     388             :     /* -------------------------------------------------------------------- */
     389             :     /*      Convert, and confirm we had enough space.                       */
     390             :     /* -------------------------------------------------------------------- */
     391       72926 :     const int nDstLen = utf8fromwc(pszResult, nDstBufSize, pwszSource, nSrcLen);
     392       72926 :     if (nDstLen >= nDstBufSize)
     393             :     {
     394           0 :         CPLAssert(false);  // too small!
     395             :         return nullptr;
     396             :     }
     397             : 
     398             :     /* -------------------------------------------------------------------- */
     399             :     /*      If something other than UTF-8 was requested, recode now.        */
     400             :     /* -------------------------------------------------------------------- */
     401       72926 :     if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
     402       72926 :         return pszResult;
     403             : 
     404             :     char *pszFinalResult =
     405           0 :         CPLRecodeStub(pszResult, CPL_ENC_UTF8, pszDstEncoding);
     406             : 
     407           0 :     CPLFree(pszResult);
     408             : 
     409           0 :     return pszFinalResult;
     410             : }
     411             : 
     412             : /************************************************************************/
     413             : /*                        CPLRecodeToWCharStub()                        */
     414             : /************************************************************************/
     415             : 
     416             : /**
     417             :  * Convert UTF-8 string to a wchar_t string.
     418             :  *
     419             :  * Convert a 8bit, multi-byte per character input string into a wide
     420             :  * character (wchar_t) string.  The only guaranteed supported source encodings
     421             :  * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1 (LATIN1).  The only
     422             :  * guaranteed supported destination encoding is CPL_ENC_UCS2.  Other source
     423             :  * and destination encodings may be supported depending on the underlying
     424             :  * implementation.
     425             :  *
     426             :  * Note that the wchar_t type varies in size on different systems. On
     427             :  * win32 it is normally 2 bytes, and on unix 4 bytes.
     428             :  *
     429             :  * If an error occurs, an error may or may not be posted with CPLError().
     430             :  *
     431             :  * @param pszSource input multi-byte character string.
     432             :  * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
     433             :  * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
     434             :  *
     435             :  * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
     436             :  * NULL on error.
     437             :  *
     438             :  */
     439             : 
     440       41083 : wchar_t *CPLRecodeToWCharStub(const char *pszSource, const char *pszSrcEncoding,
     441             :                               const char *pszDstEncoding)
     442             : 
     443             : {
     444       41083 :     char *pszUTF8Source = const_cast<char *>(pszSource);
     445             : 
     446       41083 :     if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0 &&
     447           0 :         strcmp(pszSrcEncoding, CPL_ENC_ASCII) != 0)
     448             :     {
     449           0 :         pszUTF8Source = CPLRecodeStub(pszSource, pszSrcEncoding, CPL_ENC_UTF8);
     450           0 :         if (pszUTF8Source == nullptr)
     451           0 :             return nullptr;
     452             :     }
     453             : 
     454             :     /* -------------------------------------------------------------------- */
     455             :     /*      We try to avoid changes of character set.  We are just          */
     456             :     /*      providing for unicode to unicode.                               */
     457             :     /* -------------------------------------------------------------------- */
     458       41083 :     if (strcmp(pszDstEncoding, "WCHAR_T") != 0 &&
     459       41083 :         strcmp(pszDstEncoding, CPL_ENC_UCS2) != 0 &&
     460           0 :         strcmp(pszDstEncoding, CPL_ENC_UCS4) != 0 &&
     461           0 :         strcmp(pszDstEncoding, CPL_ENC_UTF16) != 0)
     462             :     {
     463           0 :         CPLError(CE_Failure, CPLE_AppDefined,
     464             :                  "Stub recoding implementation does not support "
     465             :                  "CPLRecodeToWCharStub(...,%s,%s)",
     466             :                  pszSrcEncoding, pszDstEncoding);
     467           0 :         if (pszUTF8Source != pszSource)
     468           0 :             CPLFree(pszUTF8Source);
     469           0 :         return nullptr;
     470             :     }
     471             : 
     472             :     /* -------------------------------------------------------------------- */
     473             :     /*      Do the UTF-8 to UCS-2 recoding.                                 */
     474             :     /* -------------------------------------------------------------------- */
     475       41083 :     int nSrcLen = static_cast<int>(strlen(pszUTF8Source));
     476             :     wchar_t *pwszResult =
     477       41083 :         static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), nSrcLen + 1));
     478             : 
     479       41083 :     utf8towc(pszUTF8Source, nSrcLen, pwszResult, nSrcLen + 1);
     480             : 
     481       41083 :     if (pszUTF8Source != pszSource)
     482           0 :         CPLFree(pszUTF8Source);
     483             : 
     484       41083 :     return pwszResult;
     485             : }
     486             : 
     487             : /************************************************************************/
     488             : /*                                 CPLIsUTF8()                          */
     489             : /************************************************************************/
     490             : 
     491             : /**
     492             :  * Test if a string is encoded as UTF-8.
     493             :  *
     494             :  * @param pabyData input string to test
     495             :  * @param nLen length of the input string, or -1 if the function must compute
     496             :  *             the string length. In which case it must be null terminated.
     497             :  * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
     498             :  *
     499             :  */
     500       18933 : int CPLIsUTF8(const char *pabyData, int nLen)
     501             : {
     502       18933 :     if (nLen < 0)
     503       14319 :         nLen = static_cast<int>(strlen(pabyData));
     504       18933 :     return utf8test(pabyData, static_cast<unsigned>(nLen)) != 0;
     505             : }
     506             : 
     507             : /************************************************************************/
     508             : /* ==================================================================== */
     509             : /*      UTF.C code from FLTK with some modifications.                   */
     510             : /* ==================================================================== */
     511             : /************************************************************************/
     512             : 
/* Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero
   they are instead turned into the Unicode REPLACEMENT CHARACTER, of
   value 0xfffd.
   If this is on, utf8decode will correctly map most (perhaps all)
   human-readable text that is in ISO-8859-1. This may allow you
   to completely ignore character sets in your code because virtually
   everything is either ISO-8859-1 or UTF-8.
*/
#define ERRORS_TO_ISO8859_1 1

/* Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
   Unicode index for Microsoft's CP1252 character set. You should
   also set ERRORS_TO_ISO8859_1. With this, a much larger amount of
   available text (such as all web pages) is correctly converted
   to Unicode.
*/
#define ERRORS_TO_CP1252 1

/* A number of Unicode code points are in fact illegal and should not
   be produced by a UTF-8 converter. Turning this on will replace the
   bytes in those encodings with errors. If you do this then converting
   arbitrary 16-bit data to UTF-8 and then back is not an identity,
   which will probably break a lot of software.
*/
#define STRICT_RFC3629 0

#if ERRORS_TO_CP1252
// Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
// to Unicode. Indexed by (byte - 0x80).
constexpr unsigned short cp1252[32] = {
    0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
    0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
    0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
    0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178};
#endif
     548             : 
     549             : /************************************************************************/
     550             : /*                             utf8decode()                             */
     551             : /************************************************************************/
     552             : 
     553             : /*
     554             :     Decode a single UTF-8 encoded character starting at \e p. The
     555             :     resulting Unicode value (in the range 0-0x10ffff) is returned,
     556             :     and \e len is set the number of bytes in the UTF-8 encoding
     557             :     (adding \e len to \e p will point at the next character).
     558             : 
     559             :     If \a p points at an illegal UTF-8 encoding, including one that
     560             :     would go past \e end, or where a code is uses more bytes than
     561             :     necessary, then *reinterpret_cast<const unsigned char*>(p) is translated as
     562             : though it is in the Microsoft CP1252 character set and \e len is set to 1.
     563             :     Treating errors this way allows this to decode almost any
     564             :     ISO-8859-1 or CP1252 text that has been mistakenly placed where
     565             :     UTF-8 is expected, and has proven very useful.
     566             : 
     567             :     If you want errors to be converted to error characters (as the
     568             :     standards recommend), adding a test to see if the length is
     569             :     unexpectedly 1 will work:
     570             : 
     571             : \code
     572             :     if( *p & 0x80 )
     573             :     {  // What should be a multibyte encoding.
     574             :       code = utf8decode(p, end, &len);
     575             :       if( len<2 ) code = 0xFFFD;  // Turn errors into REPLACEMENT CHARACTER.
     576             :     }
     577             :     else
     578             :     {  // Handle the 1-byte utf8 encoding:
     579             :       code = *p;
     580             :       len = 1;
     581             :     }
     582             : \endcode
     583             : 
     584             :     Direct testing for the 1-byte case (as shown above) will also
     585             :     speed up the scanning of strings where the majority of characters
     586             :     are ASCII.
     587             : */
     588        2615 : static unsigned utf8decode(const char *p, const char *end, int *len)
     589             : {
     590        2615 :     unsigned char c = *reinterpret_cast<const unsigned char *>(p);
     591        2615 :     if (c < 0x80)
     592             :     {
     593           0 :         *len = 1;
     594           0 :         return c;
     595             : #if ERRORS_TO_CP1252
     596             :     }
     597        2615 :     else if (c < 0xa0)
     598             :     {
     599          39 :         *len = 1;
     600          39 :         return cp1252[c - 0x80];
     601             : #endif
     602             :     }
     603        2576 :     else if (c < 0xc2)
     604             :     {
     605          10 :         goto FAIL;
     606             :     }
     607        2566 :     if (p + 1 >= end || (p[1] & 0xc0) != 0x80)
     608          70 :         goto FAIL;
     609        2496 :     if (c < 0xe0)
     610             :     {
     611        2488 :         *len = 2;
     612        2488 :         return ((p[0] & 0x1f) << 6) + ((p[1] & 0x3f));
     613             :     }
     614           8 :     else if (c == 0xe0)
     615             :     {
     616           0 :         if ((reinterpret_cast<const unsigned char *>(p))[1] < 0xa0)
     617           0 :             goto FAIL;
     618           0 :         goto UTF8_3;
     619             : #if STRICT_RFC3629
     620             :     }
     621             :     else if (c == 0xed)
     622             :     {
     623             :         // RFC 3629 says surrogate chars are illegal.
     624             :         if ((reinterpret_cast<const unsigned char *>(p))[1] >= 0xa0)
     625             :             goto FAIL;
     626             :         goto UTF8_3;
     627             :     }
     628             :     else if (c == 0xef)
     629             :     {
     630             :         // 0xfffe and 0xffff are also illegal characters.
     631             :         if ((reinterpret_cast<const unsigned char *>(p))[1] == 0xbf &&
     632             :             (reinterpret_cast<const unsigned char *>(p))[2] >= 0xbe)
     633             :             goto FAIL;
     634             :         goto UTF8_3;
     635             : #endif
     636             :     }
     637           8 :     else if (c < 0xf0)
     638             :     {
     639           4 :     UTF8_3:
     640           4 :         if (p + 2 >= end || (p[2] & 0xc0) != 0x80)
     641           0 :             goto FAIL;
     642           4 :         *len = 3;
     643           4 :         return ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + ((p[2] & 0x3f));
     644             :     }
     645           4 :     else if (c == 0xf0)
     646             :     {
     647           4 :         if ((reinterpret_cast<const unsigned char *>(p))[1] < 0x90)
     648           0 :             goto FAIL;
     649           4 :         goto UTF8_4;
     650             :     }
     651           0 :     else if (c < 0xf4)
     652             :     {
     653           0 :     UTF8_4:
     654           4 :         if (p + 3 >= end || (p[2] & 0xc0) != 0x80 || (p[3] & 0xc0) != 0x80)
     655           0 :             goto FAIL;
     656           4 :         *len = 4;
     657             : #if STRICT_RFC3629
     658             :         // RFC 3629 says all codes ending in fffe or ffff are illegal:
     659             :         if ((p[1] & 0xf) == 0xf &&
     660             :             (reinterpret_cast<const unsigned char *>(p))[2] == 0xbf &&
     661             :             (reinterpret_cast<const unsigned char *>(p))[3] >= 0xbe)
     662             :             goto FAIL;
     663             : #endif
     664           4 :         return ((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12) +
     665           4 :                ((p[2] & 0x3f) << 6) + ((p[3] & 0x3f));
     666             :     }
     667           0 :     else if (c == 0xf4)
     668             :     {
     669           0 :         if ((reinterpret_cast<const unsigned char *>(p))[1] > 0x8f)
     670           0 :             goto FAIL;  // After 0x10ffff.
     671           0 :         goto UTF8_4;
     672             :     }
     673             :     else
     674             :     {
     675           0 :     FAIL:
     676          80 :         *len = 1;
     677             : #if ERRORS_TO_ISO8859_1
     678          80 :         return c;
     679             : #else
     680             :         return 0xfffd;  // Unicode REPLACEMENT CHARACTER
     681             : #endif
     682             :     }
     683             : }
     684             : 
     685             : /************************************************************************/
     686             : /*                              utf8towc()                              */
     687             : /************************************************************************/
     688             : 
     689             : /*  Convert a UTF-8 sequence into an array of wchar_t. These
     690             :     are used by some system calls, especially on Windows.
     691             : 
     692             :     \a src points at the UTF-8, and \a srclen is the number of bytes to
     693             :     convert.
     694             : 
     695             :     \a dst points at an array to write, and \a dstlen is the number of
     696             :     locations in this array. At most \a dstlen-1 words will be
     697             :     written there, plus a 0 terminating word. Thus this function
     698             :     will never overwrite the buffer and will always return a
     699             :     zero-terminated string. If \a dstlen is zero then \a dst can be
     700             :     null and no data is written, but the length is returned.
     701             : 
     702             :     The return value is the number of words that \e would be written
     703             :     to \a dst if it were long enough, not counting the terminating
     704             :     zero. If the return value is greater or equal to \a dstlen it
     705             :     indicates truncation, you can then allocate a new array of size
     706             :     return+1 and call this again.
     707             : 
     708             :     Errors in the UTF-8 are converted as though each byte in the
     709             :     erroneous string is in the Microsoft CP1252 encoding. This allows
     710             :     ISO-8859-1 text mistakenly identified as UTF-8 to be printed
     711             :     correctly.
     712             : 
     713             :     Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
     714             :     and most other systems. Where wchar_t is 16 bits, Unicode
     715             :     characters in the range 0x10000 to 0x10ffff are converted to
     716             :     "surrogate pairs" which take two words each (this is called UTF-16
     717             :     encoding). If wchar_t is 32 bits this rather nasty problem is
     718             :     avoided.
     719             : */
     720       41083 : static unsigned utf8towc(const char *src, unsigned srclen, wchar_t *dst,
     721             :                          unsigned dstlen)
     722             : {
     723       41083 :     const char *p = src;
     724       41083 :     const char *e = src + srclen;
     725       41083 :     unsigned count = 0;
     726       41083 :     if (dstlen)
     727             :         while (true)
     728             :         {
     729      299522 :             if (p >= e)
     730             :             {
     731       41083 :                 dst[count] = 0;
     732       41083 :                 return count;
     733             :             }
     734      258439 :             if (!(*p & 0x80))
     735             :             {
     736             :                 // ASCII
     737      258237 :                 dst[count] = *p++;
     738             :             }
     739             :             else
     740             :             {
     741         202 :                 int len = 0;
     742         202 :                 unsigned ucs = utf8decode(p, e, &len);
     743         202 :                 p += len;
     744             : #ifdef _WIN32
     745             :                 if (ucs < 0x10000)
     746             :                 {
     747             :                     dst[count] = static_cast<wchar_t>(ucs);
     748             :                 }
     749             :                 else
     750             :                 {
     751             :                     // Make a surrogate pair:
     752             :                     if (count + 2 >= dstlen)
     753             :                     {
     754             :                         dst[count] = 0;
     755             :                         count += 2;
     756             :                         break;
     757             :                     }
     758             :                     dst[count] = static_cast<wchar_t>(
     759             :                         (((ucs - 0x10000u) >> 10) & 0x3ff) | 0xd800);
     760             :                     dst[++count] = static_cast<wchar_t>((ucs & 0x3ff) | 0xdc00);
     761             :                 }
     762             : #else
     763         202 :                 dst[count] = static_cast<wchar_t>(ucs);
     764             : #endif
     765             :             }
     766      258439 :             if (++count == dstlen)
     767             :             {
     768           0 :                 dst[count - 1] = 0;
     769           0 :                 break;
     770             :             }
     771      258439 :         }
     772             :     // We filled dst, measure the rest:
     773           0 :     while (p < e)
     774             :     {
     775           0 :         if (!(*p & 0x80))
     776             :         {
     777           0 :             p++;
     778             :         }
     779             :         else
     780             :         {
     781           0 :             int len = 0;
     782             : #ifdef _WIN32
     783             :             const unsigned ucs = utf8decode(p, e, &len);
     784             :             p += len;
     785             :             if (ucs >= 0x10000)
     786             :                 ++count;
     787             : #else
     788           0 :             utf8decode(p, e, &len);
     789           0 :             p += len;
     790             : #endif
     791             :         }
     792           0 :         ++count;
     793             :     }
     794             : 
     795           0 :     return count;
     796             : }
     797             : 
     798             : /************************************************************************/
     799             : /*                              utf8toa()                               */
     800             : /************************************************************************/
     801             : /* Convert a UTF-8 sequence into an array of 1-byte characters.
     802             : 
     803             :     If the UTF-8 decodes to a character greater than 0xff then it is
     804             :     replaced with '?'.
     805             : 
     806             :     Errors in the UTF-8 are converted as individual bytes, same as
     807             :     utf8decode() does. This allows ISO-8859-1 text mistakenly identified
     808             :     as UTF-8 to be printed correctly (and possibly CP1512 on Windows).
     809             : 
     810             :     \a src points at the UTF-8, and \a srclen is the number of bytes to
     811             :     convert.
     812             : 
     813             :     Up to \a dstlen bytes are written to \a dst, including a null
     814             :     terminator. The return value is the number of bytes that would be
     815             :     written, not counting the null terminator. If greater or equal to
     816             :     \a dstlen then if you malloc a new array of size n+1 you will have
     817             :     the space needed for the entire string. If \a dstlen is zero then
     818             :     nothing is written and this call just measures the storage space
     819             :     needed.
     820             : */
     821       47577 : static unsigned int utf8toa(const char *src, unsigned srclen, char *dst,
     822             :                             unsigned dstlen)
     823             : {
     824       47577 :     const char *p = src;
     825       47577 :     const char *e = src + srclen;
     826       47577 :     unsigned int count = 0;
     827       47577 :     if (dstlen)
     828             :         while (true)
     829             :         {
     830      172257 :             if (p >= e)
     831             :             {
     832       47577 :                 dst[count] = 0;
     833       47577 :                 return count;
     834             :             }
     835      124680 :             unsigned char c = *reinterpret_cast<const unsigned char *>(p);
     836      124680 :             if (c < 0xC2)
     837             :             {
     838             :                 // ASCII or bad code.
     839      123880 :                 dst[count] = c;
     840      123880 :                 p++;
     841             :             }
     842             :             else
     843             :             {
     844         800 :                 int len = 0;
     845         800 :                 const unsigned int ucs = utf8decode(p, e, &len);
     846         800 :                 p += len;
     847         800 :                 if (ucs < 0x100)
     848             :                 {
     849         796 :                     dst[count] = static_cast<char>(ucs);
     850             :                 }
     851             :                 else
     852             :                 {
     853           4 :                     if (!bHaveWarned4)
     854             :                     {
     855           2 :                         bHaveWarned4 = true;
     856           2 :                         CPLError(
     857             :                             CE_Warning, CPLE_AppDefined,
     858             :                             "One or several characters couldn't be converted "
     859             :                             "correctly from UTF-8 to ISO-8859-1.  "
     860             :                             "This warning will not be emitted anymore.");
     861             :                     }
     862           4 :                     dst[count] = '?';
     863             :                 }
     864             :             }
     865      124680 :             if (++count >= dstlen)
     866             :             {
     867           0 :                 dst[count - 1] = 0;
     868           0 :                 break;
     869             :             }
     870      124680 :         }
     871             :     // We filled dst, measure the rest:
     872           0 :     while (p < e)
     873             :     {
     874           0 :         if (!(*p & 0x80))
     875             :         {
     876           0 :             p++;
     877             :         }
     878             :         else
     879             :         {
     880           0 :             int len = 0;
     881           0 :             utf8decode(p, e, &len);
     882           0 :             p += len;
     883             :         }
     884           0 :         ++count;
     885             :     }
     886           0 :     return count;
     887             : }
     888             : 
     889             : /************************************************************************/
     890             : /*                             utf8fromwc()                             */
     891             : /************************************************************************/
     892             : /* Turn "wide characters" as returned by some system calls
     893             :     (especially on Windows) into UTF-8.
     894             : 
     895             :     Up to \a dstlen bytes are written to \a dst, including a null
     896             :     terminator. The return value is the number of bytes that would be
     897             :     written, not counting the null terminator. If greater or equal to
     898             :     \a dstlen then if you malloc a new array of size n+1 you will have
     899             :     the space needed for the entire string. If \a dstlen is zero then
     900             :     nothing is written and this call just measures the storage space
     901             :     needed.
     902             : 
     903             :     \a srclen is the number of words in \a src to convert. On Windows
     904             :     this is not necessarily the number of characters, due to there
     905             :     possibly being "surrogate pairs" in the UTF-16 encoding used.
     906             :     On Unix wchar_t is 32 bits and each location is a character.
     907             : 
     908             :     On Unix if a src word is greater than 0x10ffff then this is an
     909             :     illegal character according to RFC 3629. These are converted as
     910             :     though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
     911             :     range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
     912             :     illegal according to RFC 3629. However I encode these as though
     913             :     they are legal, so that utf8towc will return the original data.
     914             : 
     915             :     On Windows "surrogate pairs" are converted to a single character
     916             :     and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
     917             :     pairs are converted as though they are individual characters.
     918             : */
     919       72926 : static unsigned int utf8fromwc(char *dst, unsigned dstlen, const wchar_t *src,
     920             :                                unsigned srclen)
     921             : {
     922       72926 :     unsigned int i = 0;
     923       72926 :     unsigned int count = 0;
     924       72926 :     if (dstlen)
     925             :         while (true)
     926             :         {
     927     1868780 :             if (i >= srclen)
     928             :             {
     929       72926 :                 dst[count] = 0;
     930       72926 :                 return count;
     931             :             }
     932     1795860 :             unsigned int ucs = src[i++];
     933     1795860 :             if (ucs < 0x80U)
     934             :             {
     935     1788950 :                 dst[count++] = static_cast<char>(ucs);
     936     1788950 :                 if (count >= dstlen)
     937             :                 {
     938           0 :                     dst[count - 1] = 0;
     939           0 :                     break;
     940             :                 }
     941             :             }
     942        6909 :             else if (ucs < 0x800U)
     943             :             {
     944             :                 // 2 bytes.
     945        4263 :                 if (count + 2 >= dstlen)
     946             :                 {
     947           0 :                     dst[count] = 0;
     948           0 :                     count += 2;
     949           0 :                     break;
     950             :                 }
     951        4263 :                 dst[count++] = 0xc0 | static_cast<char>(ucs >> 6);
     952        4263 :                 dst[count++] = 0x80 | static_cast<char>(ucs & 0x3F);
     953             : #ifdef _WIN32
     954             :             }
     955             :             else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
     956             :                      src[i] >= 0xdc00 && src[i] <= 0xdfff)
     957             :             {
     958             :                 // Surrogate pair.
     959             :                 unsigned int ucs2 = src[i++];
     960             :                 ucs = 0x10000U + ((ucs & 0x3ff) << 10) + (ucs2 & 0x3ff);
     961             :                 // All surrogate pairs turn into 4-byte utf8.
     962             : #else
     963             :             }
     964        2646 :             else if (ucs >= 0x10000)
     965             :             {
     966           1 :                 if (ucs > 0x10ffff)
     967             :                 {
     968           1 :                     ucs = 0xfffd;
     969           1 :                     goto J1;
     970             :                 }
     971             : #endif
     972           0 :                 if (count + 4 >= dstlen)
     973             :                 {
     974           0 :                     dst[count] = 0;
     975           0 :                     count += 4;
     976           0 :                     break;
     977             :                 }
     978           0 :                 dst[count++] = 0xf0 | static_cast<char>(ucs >> 18);
     979           0 :                 dst[count++] = 0x80 | static_cast<char>((ucs >> 12) & 0x3F);
     980           0 :                 dst[count++] = 0x80 | static_cast<char>((ucs >> 6) & 0x3F);
     981           0 :                 dst[count++] = 0x80 | static_cast<char>(ucs & 0x3F);
     982             :             }
     983             :             else
     984             :             {
     985             : #ifndef _WIN32
     986        2645 :             J1:
     987             : #endif
     988             :                 // All others are 3 bytes:
     989        2646 :                 if (count + 3 >= dstlen)
     990             :                 {
     991           0 :                     dst[count] = 0;
     992           0 :                     count += 3;
     993           0 :                     break;
     994             :                 }
     995        2646 :                 dst[count++] = 0xe0 | static_cast<char>(ucs >> 12);
     996        2646 :                 dst[count++] = 0x80 | static_cast<char>((ucs >> 6) & 0x3F);
     997        2646 :                 dst[count++] = 0x80 | static_cast<char>(ucs & 0x3F);
     998             :             }
     999     1795860 :         }
    1000             : 
    1001             :     // We filled dst, measure the rest:
    1002           0 :     while (i < srclen)
    1003             :     {
    1004           0 :         unsigned int ucs = src[i++];
    1005           0 :         if (ucs < 0x80U)
    1006             :         {
    1007           0 :             count++;
    1008             :         }
    1009           0 :         else if (ucs < 0x800U)
    1010             :         {
    1011             :             // 2 bytes.
    1012           0 :             count += 2;
    1013             : #ifdef _WIN32
    1014             :         }
    1015             :         else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen - 1 &&
    1016             :                  src[i + 1] >= 0xdc00 && src[i + 1] <= 0xdfff)
    1017             :         {
    1018             :             // Surrogate pair.
    1019             :             ++i;
    1020             : #else
    1021             :         }
    1022           0 :         else if (ucs >= 0x10000 && ucs <= 0x10ffff)
    1023             :         {
    1024             : #endif
    1025           0 :             count += 4;
    1026             :         }
    1027             :         else
    1028             :         {
    1029           0 :             count += 3;
    1030             :         }
    1031             :     }
    1032           0 :     return count;
    1033             : }
    1034             : 
    1035             : /************************************************************************/
    1036             : /*                             utf8froma()                              */
    1037             : /************************************************************************/
    1038             : 
    1039             : /* Convert an ISO-8859-1 (i.e. normal c-string) byte stream to UTF-8.
    1040             : 
    1041             :     It is possible this should convert Microsoft's CP1252 to UTF-8
    1042             :     instead. This would translate the codes in the range 0x80-0x9f
    1043             :     to different characters. Currently it does not do this.
    1044             : 
    1045             :     Up to \a dstlen bytes are written to \a dst, including a null
    1046             :     terminator. The return value is the number of bytes that would be
    1047             :     written, not counting the null terminator. If greater or equal to
    1048             :     \a dstlen then if you malloc a new array of size n+1 you will have
    1049             :     the space needed for the entire string. If \a dstlen is zero then
    1050             :     nothing is written and this call just measures the storage space
    1051             :     needed.
    1052             : 
    1053             :     \a srclen is the number of bytes in \a src to convert.
    1054             : 
    1055             :     If the return value equals \a srclen then this indicates that
    1056             :     no conversion is necessary, as only ASCII characters are in the
    1057             :     string.
    1058             : */
    1059     1171150 : static unsigned utf8froma(char *dst, unsigned dstlen, const char *src,
    1060             :                           unsigned srclen)
    1061             : {  // Widens each byte of src to UTF-8 in dst; returns the full UTF-8 length.
    1062     1171150 :     const char *p = src;  // read cursor
    1063     1171150 :     const char *e = src + srclen;  // one past the last input byte
    1064     1171150 :     unsigned count = 0;  // UTF-8 bytes produced, or that would be produced
    1065     1171150 :     if (dstlen)  // dstlen == 0: write nothing, go straight to the measuring loop
    1066             :         while (true)
    1067             :         {
    1068    17815800 :             if (p >= e)
    1069             :             {
    1070     1171150 :                 dst[count] = 0;  // everything fit: terminate and report the exact length
    1071     1171150 :                 return count;
    1072             :             }
    1073    16644600 :             unsigned char ucs = *reinterpret_cast<const unsigned char *>(p);  // read as unsigned so bytes >= 0x80 are not negative
    1074    16644600 :             p++;
    1075    16644600 :             if (ucs < 0x80U)
    1076             :             {
    1077    16597600 :                 dst[count++] = ucs;  // ASCII maps to itself
    1078    16597600 :                 if (count >= dstlen)  // that byte took the slot needed for the terminator
    1079             :                 {
    1080           0 :                     dst[count - 1] = 0;  // overwrite it with the terminator; count still includes the byte
    1081           0 :                     break;  // continue below in measuring mode
    1082             :                 }
    1083             :             }
    1084             :             else
    1085             :             {
    1086             :                 // 2 bytes (note that CP1252 translate could make 3 bytes!)
    1087       47068 :                 if (count + 2 >= dstlen)  // no room for two bytes plus the terminator
    1088             :                 {
    1089           0 :                     dst[count] = 0;
    1090           0 :                     count += 2;  // still count the two bytes that did not fit
    1091           0 :                     break;
    1092             :                 }
    1093       47068 :                 dst[count++] = 0xc0 | (ucs >> 6);  // lead byte 110xxxxx: top two bits of ucs
    1094       47068 :                 dst[count++] = 0x80 | (ucs & 0x3F);  // continuation byte 10xxxxxx: low six bits
    1095             :             }
    1096    16644600 :         }
    1097             : 
    1098             :     // We filled dst, measure the rest:
    1099           0 :     while (p < e)  // nothing is written from here on, only lengths are accumulated
    1100             :     {
    1101           0 :         unsigned char ucs = *reinterpret_cast<const unsigned char *>(p);
    1102           0 :         p++;
    1103           0 :         if (ucs < 0x80U)
    1104             :         {
    1105           0 :             count++;
    1106             :         }
    1107             :         else
    1108             :         {
    1109           0 :             count += 2;  // every byte >= 0x80 becomes a two-byte sequence
    1110             :         }
    1111             :     }
    1112             : 
    1113           0 :     return count;  // length needed for the whole string, excluding the terminator
    1114             : }
    1115             : 
    1116             : #ifdef _WIN32
    1117             : 
    1118             : /************************************************************************/
    1119             : /*                            CPLWin32Recode()                          */
    1120             : /************************************************************************/
    1121             : 
    1122             : /* Convert a CODEPAGE (i.e. normal c-string) byte stream
    1123             :      to another CODEPAGE (i.e. normal c-string) byte stream.
    1124             : 
    1125             :     \a src is source c-string byte stream (including a null terminator).
    1126             :     \a src_code_page is source c-string byte code page.
    1127             :     \a dst_code_page is destination c-string byte code page.
    1128             : 
    1129             :    UTF7          65000
    1130             :    UTF8          65001
    1131             :    OEM-US          437
    1132             :    OEM-ARABIC      720
    1133             :    OEM-GREEK       737
    1134             :    OEM-BALTIC      775
    1135             :    OEM-MLATIN1     850
    1136             :    OEM-LATIN2      852
    1137             :    OEM-CYRILLIC    855
    1138             :    OEM-TURKISH     857
    1139             :    OEM-MLATIN1P    858
    1140             :    OEM-HEBREW      862
    1141             :    OEM-RUSSIAN     866
    1142             : 
    1143             :    THAI            874
    1144             :    SJIS            932
    1145             :    GBK             936
    1146             :    KOREA           949
    1147             :    BIG5            950
    1148             : 
    1149             :    EUROPE         1250
    1150             :    CYRILLIC       1251
    1151             :    LATIN1         1252
    1152             :    GREEK          1253
    1153             :    TURKISH        1254
    1154             :    HEBREW         1255
    1155             :    ARABIC         1256
    1156             :    BALTIC         1257
    1157             :    VIETNAM        1258
    1158             : 
    1159             :    ISO-LATIN1    28591
    1160             :    ISO-LATIN2    28592
    1161             :    ISO-LATIN3    28593
    1162             :    ISO-BALTIC    28594
    1163             :    ISO-CYRILLIC  28595
    1164             :    ISO-ARABIC    28596
    1165             :    ISO-HEBREW    28598
    1166             :    ISO-TURKISH   28599
    1167             :    ISO-LATIN9    28605
    1168             : 
    1169             :    ISO-2022-JP   50220
    1170             : 
    1171             : */
    1172             : 
    1173             : char *CPLWin32Recode(const char *src, unsigned src_code_page,
    1174             :                      unsigned dst_code_page)
    1175             : {  // Recodes src in two steps: source code page -> wide chars -> destination code page.
    1176             :     // Convert from source code page to Unicode.
    1177             : 
    1178             :     // Compute the length in wide characters.
    1179             :     int wlen = MultiByteToWideChar(src_code_page, MB_ERR_INVALID_CHARS, src, -1,  // -1: src is taken as NUL-terminated
    1180             :                                    nullptr, 0);  // no output buffer: measuring call only
    1181             :     if (wlen == 0 && GetLastError() == ERROR_NO_UNICODE_TRANSLATION)  // only this failure is retried; any other zero result is not handled here
    1182             :     {
    1183             :         if (!bHaveWarned5)  // warn-once flag, declared outside this excerpt
    1184             :         {
    1185             :             bHaveWarned5 = true;
    1186             :             CPLError(
    1187             :                 CE_Warning, CPLE_AppDefined,
    1188             :                 "One or several characters could not be translated from CP%d. "
    1189             :                 "This warning will not be emitted anymore.",
    1190             :                 src_code_page);
    1191             :         }
    1192             : 
    1193             :         // Retry now without MB_ERR_INVALID_CHARS flag.
    1194             :         wlen = MultiByteToWideChar(src_code_page, 0, src, -1, nullptr, 0);
    1195             :     }
    1196             : 
    1197             :     // Do the actual conversion.
    1198             :     wchar_t *tbuf =
    1199             :         static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), wlen + 1));  // one extra element for the terminator written below
    1200             :     tbuf[wlen] = 0;
    1201             :     MultiByteToWideChar(src_code_page, 0, src, -1, tbuf, wlen + 1);  // flags 0, as in the retry above; return value is not checked
    1202             : 
    1203             :     // Convert from Unicode to destination code page.
    1204             : 
    1205             :     // Compute the length in chars.
    1206             :     BOOL bUsedDefaultChar = FALSE;  // stays FALSE on the UTF-7/UTF-8 path, which never passes its address
    1207             :     int len = 0;
    1208             :     if (dst_code_page == CP_UTF7 || dst_code_page == CP_UTF8)  // per the Win32 docs, lpUsedDefaultChar must be NULL for these code pages
    1209             :         len = WideCharToMultiByte(dst_code_page, 0, tbuf, -1, nullptr, 0,
    1210             :                                   nullptr, nullptr);
    1211             :     else
    1212             :         len = WideCharToMultiByte(dst_code_page, 0, tbuf, -1, nullptr, 0,
    1213             :                                   nullptr, &bUsedDefaultChar);
    1214             :     if (bUsedDefaultChar)
    1215             :     {
    1216             :         if (!bHaveWarned6)  // warn-once flag, declared outside this excerpt
    1217             :         {
    1218             :             bHaveWarned6 = true;
    1219             :             CPLError(
    1220             :                 CE_Warning, CPLE_AppDefined,
    1221             :                 "One or several characters could not be translated to CP%d. "
    1222             :                 "This warning will not be emitted anymore.",
    1223             :                 dst_code_page);
    1224             :         }
    1225             :     }
    1226             : 
    1227             :     // Do the actual conversion.
    1228             :     char *pszResult = static_cast<char *>(CPLCalloc(sizeof(char), len + 1));
    1229             :     WideCharToMultiByte(dst_code_page, 0, tbuf, -1, pszResult, len + 1, nullptr,
    1230             :                         nullptr);  // return value is not checked
    1231             :     pszResult[len] = 0;  // explicit terminator regardless of what the call wrote
    1232             : 
    1233             :     CPLFree(tbuf);  // the intermediate wide buffer is released here
    1234             : 
    1235             :     return pszResult;  // CPLCalloc-allocated; presumably the caller releases it with CPLFree - confirm
    1236             : }
    1237             : 
    1238             : #endif
    1239             : 
    1240             : /*
    1241             : ** For now we disable the rest which is locale() related.  We may need
    1242             : ** parts of it later.
    1243             : */
    1244             : 
    1245             : #ifdef notdef
    1246             : 
    1247             : #ifdef _WIN32
    1248             : #include <windows.h>
    1249             : #endif
    1250             : 
    1251             : /*! Return true if the "locale" seems to indicate that UTF-8 encoding
    1252             :     is used. If true the utf8tomb and utf8frommb don't do anything
    1253             :     useful.
    1254             : 
    1255             :     <i>It is highly recommended that you change your system so this
    1256             :     does return true.</i> On Windows this is done by setting the
    1257             :     "codepage" to CP_UTF8.  On Unix this is done by setting $LC_CTYPE
    1258             :     to a string containing the letters "utf" or "UTF" in it, or by
    1259             :     deleting all $LC* and $LANG environment variables. In the future
    1260             :     it is likely that all non-Asian Unix systems will return true,
    1261             :     due to the compatibility of UTF-8 with ISO-8859-1.
    1262             : */
    1263             : int utf8locale(void)
    1264             : {  // NOTE: compiled only when `notdef` is defined (see the enclosing #ifdef).
    1265             :     static int ret = 2;  // 2 = not computed yet; afterwards holds the cached 0/1 answer
    1266             :     if (ret == 2)
    1267             :     {
    1268             : #ifdef _WIN32
    1269             :         ret = GetACP() == CP_UTF8;  // 1 when the code page returned by GetACP() is CP_UTF8
    1270             : #else
    1271             :         char *s;
    1272             :         ret = 1;  // assume UTF-8 if no locale
    1273             :         if (((s = getenv("LC_CTYPE")) && *s) ||  // the first non-empty variable wins; checked in this order
    1274             :             ((s = getenv("LC_ALL")) && *s) || ((s = getenv("LANG")) && *s))
    1275             :         {
    1276             :             ret = strstr(s, "utf") || strstr(s, "UTF");  // only these two spellings match; mixed case such as "Utf" does not
    1277             :         }
    1278             : #endif
    1279             :     }
    1280             : 
    1281             :     return ret;
    1282             : }
    1283             : 
    1284             : /*! Convert the UTF-8 used by FLTK to the locale-specific encoding
    1285             :     used for filenames (and sometimes used for data in files).
    1286             :     Unfortunately due to stupid design you will have to do this as
    1287             :     needed for filenames. This is a bug on both Unix and Windows.
    1288             : 
    1289             :     Up to \a dstlen bytes are written to \a dst, including a null
    1290             :     terminator. The return value is the number of bytes that would be
    1291             :     written, not counting the null terminator. If greater or equal to
    1292             :     \a dstlen then if you malloc a new array of size n+1 you will have
    1293             :     the space needed for the entire string. If \a dstlen is zero then
    1294             :     nothing is written and this call just measures the storage space
    1295             :     needed.
    1296             : 
    1297             :     If utf8locale() returns true then this does not change the data.
    1298             :     It is copied and truncated as necessary to
    1299             :     the destination buffer and \a srclen is always returned.  */
    1300             : unsigned utf8tomb(const char *src, unsigned srclen, char *dst, unsigned dstlen)
    1301             : {  // NOTE: compiled only when `notdef` is defined (see the enclosing #ifdef).
    1302             :     if (!utf8locale())  // non-UTF-8 locale: go through wide characters
    1303             :     {
    1304             : #ifdef _WIN32
    1305             :         wchar_t lbuf[1024] = {};  // stack buffer used when the converted string is short enough
    1306             :         wchar_t *buf = lbuf;
    1307             :         unsigned length = utf8towc(src, srclen, buf, 1024);  // utf8towc is outside this excerpt; presumably returns the wide length needed - confirm
    1308             :         unsigned ret;  // not initialized; the dstlen == 0 test below short-circuits before it is read
    1309             :         if (length >= 1024)  // did not fit in lbuf: redo the conversion into a heap buffer
    1310             :         {
    1311             :             buf =
    1312             :                 static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t)));  // allocation result is not checked
    1313             :             utf8towc(src, srclen, buf, length + 1);
    1314             :         }
    1315             :         if (dstlen)
    1316             :         {
    1317             :             // apparently this does not null-terminate, even though msdn
    1318             :             // documentation claims it does:
    1319             :             ret = WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen, 0,
    1320             :                                       0);
    1321             :             dst[ret] = 0;  // NOTE(review): if ret == dstlen this index is one past the buffer - confirm
    1322             :         }
    1323             :         // if it overflows or measuring length, get the actual length:
    1324             :         if (dstlen == 0 || ret >= dstlen - 1)
    1325             :             ret = WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
    1326             :         if (buf != lbuf)  // free only when the heap buffer was used
    1327             :             free((void *)buf);
    1328             :         return ret;
    1329             : #else
    1330             :         wchar_t lbuf[1024] = {};  // stack buffer used when the converted string is short enough
    1331             :         wchar_t *buf = lbuf;
    1332             :         unsigned length = utf8towc(src, srclen, buf, 1024);  // utf8towc is outside this excerpt; presumably returns the wide length needed - confirm
    1333             :         if (length >= 1024)
    1334             :         {
    1335             :             buf =
    1336             :                 static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t)));  // allocation result is not checked
    1337             :             utf8towc(src, srclen, buf, length + 1);
    1338             :         }
    1339             :         int ret = 0;
    1340             :         if (dstlen)
    1341             :         {
    1342             :             ret = wcstombs(dst, buf, dstlen);  // size_t result narrowed to int; the (size_t)-1 error value typically becomes -1
    1343             :             if (ret >= dstlen - 1)  // NOTE(review): int vs unsigned comparison; a negative ret converts to a large unsigned value
    1344             :                 ret = wcstombs(0, buf, 0);  // null destination: measuring call
    1345             :         }
    1346             :         else
    1347             :         {
    1348             :             ret = wcstombs(0, buf, 0);
    1349             :         }
    1350             :         if (buf != lbuf)
    1351             :             free((void *)buf);
    1352             :         if (ret >= 0)
    1353             :             return (unsigned)ret;
    1354             :             // On any errors we return the UTF-8 as raw text...
    1355             : #endif
    1356             :     }
    1357             :     // Identity transform:
    1358             :     if (srclen < dstlen)  // whole string plus terminator fits
    1359             :     {
    1360             :         memcpy(dst, src, srclen);
    1361             :         dst[srclen] = 0;
    1362             :     }
    1363             :     else
    1364             :     {
    1365             :         memcpy(dst, src, dstlen - 1);  // NOTE(review): dstlen == 0 makes dstlen - 1 wrap around (unsigned)
    1366             :         dst[dstlen - 1] = 0;
    1367             :     }
    1368             :     return srclen;
    1369             : }
    1370             : 
    1371             : /*! Convert a filename from the locale-specific multibyte encoding
    1372             :     used by Windows to UTF-8 as used by FLTK.
    1373             : 
    1374             :     Up to \a dstlen bytes are written to \a dst, including a null
    1375             :     terminator. The return value is the number of bytes that would be
    1376             :     written, not counting the null terminator. If greater or equal to
    1377             :     \a dstlen then if you malloc a new array of size n+1 you will have
    1378             :     the space needed for the entire string. If \a dstlen is zero then
    1379             :     nothing is written and this call just measures the storage space
    1380             :     needed.
    1381             : 
    1382             :     On Unix or on Windows when a UTF-8 locale is in effect, this
    1383             :     does not change the data. It is copied and truncated as necessary to
    1384             :     the destination buffer and \a srclen is always returned.
    1385             :     You may also want to check if utf8test() returns non-zero, so that
    1386             :     the filesystem can store filenames in UTF-8 encoding regardless of
    1387             :     the locale.
    1388             : */
    1389             : unsigned utf8frommb(char *dst, unsigned dstlen, const char *src,
    1390             :                     unsigned srclen)
    1391             : {  // NOTE: compiled only when `notdef` is defined (see the enclosing #ifdef).
    1392             :     if (!utf8locale())  // non-UTF-8 locale: go through wide characters
    1393             :     {
    1394             : #ifdef _WIN32
    1395             :         wchar_t lbuf[1024] = {};  // stack buffer used when the converted string is short enough
    1396             :         wchar_t *buf = lbuf;
    1397             :         unsigned ret;
    1398             :         const unsigned length =  // NOTE(review): declared const but assigned again in the branch below
    1399             :             MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
    1400             :         if (length >= 1024)
    1401             :         {
    1402             :             length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);  // null buffer: measuring call
    1403             :             buf = static_cast<wchar_t *>(malloc(length * sizeof(wchar_t)));  // allocation result is not checked
    1404             :             MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);
    1405             :         }
    1406             :         ret = utf8fromwc(dst, dstlen, buf, length);  // utf8fromwc is outside this excerpt
    1407             :         if (buf != lbuf)  // free only when the heap buffer was used
    1408             :             free(buf);
    1409             :         return ret;
    1410             : #else
    1411             :         wchar_t lbuf[1024] = {};  // stack buffer used when the converted string is short enough
    1412             :         wchar_t *buf = lbuf;
    1413             :         const int length = mbstowcs(buf, src, 1024);  // NOTE(review): declared const but assigned again in the branch below
    1414             :         if (length >= 1024)
    1415             :         {
    1416             :             length = mbstowcs(0, src, 0) + 1;  // null destination: measuring call, plus one for the terminator
    1417             :             buf =
    1418             :                 static_cast<wchar_t *>(malloc(length * sizeof(unsigned short)));  // NOTE(review): sized with unsigned short although buf is wchar_t *
    1419             :             mbstowcs(buf, src, length);
    1420             :         }
    1421             :         if (length >= 0)  // a negative length falls through to the identity copy
    1422             :         {
    1423             :             const unsigned ret = utf8fromwc(dst, dstlen, buf, length);  // utf8fromwc is outside this excerpt
    1424             :             if (buf != lbuf)
    1425             :                 free(buf);
    1426             :             return ret;
    1427             :         }
    1428             :         // Errors in conversion return the UTF-8 unchanged.
    1429             : #endif
    1430             :     }
    1431             :     // Identity transform:
    1432             :     if (srclen < dstlen)  // whole string plus terminator fits
    1433             :     {
    1434             :         memcpy(dst, src, srclen);
    1435             :         dst[srclen] = 0;
    1436             :     }
    1437             :     else
    1438             :     {
    1439             :         memcpy(dst, src, dstlen - 1);  // NOTE(review): dstlen == 0 makes dstlen - 1 wrap around (unsigned)
    1440             :         dst[dstlen - 1] = 0;
    1441             :     }
    1442             :     return srclen;
    1443             : }
    1444             : 
    1445             : #endif  // def notdef - disabled locale specific stuff.
    1446             : 
    1447             : /*! Examines the first \a srclen bytes in \a src and returns a verdict
    1448             :     on whether it is UTF-8 or not.
    1449             :     - Returns 0 if there are any illegal UTF-8 sequences, using the
    1450             :       same rules as utf8decode(). Note that some UCS values considered
    1451             :       illegal by RFC 3629, such as 0xffff, are considered legal by this.
    1452             :     - Returns 1 if there are only single-byte characters (i.e. no bytes
    1453             :       have the high bit set). This is legal UTF-8, but also indicates
    1454             :       plain ASCII. It also returns 1 if \a srclen is zero.
    1455             :     - Returns 2 if there are only characters less than 0x800.
    1456             :     - Returns 3 if there are only characters less than 0x10000.
    1457             :     - Returns 4 if there are characters in the 0x10000 to 0x10ffff range.
    1458             : 
    1459             :     Because there are many illegal sequences in UTF-8, it is almost
    1460             :     impossible for a string in another encoding to be confused with
    1461             :     UTF-8. This is very useful for transitioning Unix to UTF-8
    1462             :     filenames, you can simply test each filename with this to decide
    1463             :     if it is UTF-8 or in the locale encoding. My hope is that if
    1464             :     this is done we will be able to cleanly transition to a locale-less
    1465             :     encoding.
    1466             : */
    1467             : 
    1468       18933 : static int utf8test(const char *src, unsigned srclen)
    1469             : {
    1470       18933 :     int ret = 1;  // 1 = only single-byte characters seen so far
    1471       18933 :     const char *p = src;  // read cursor
    1472       18933 :     const char *e = src + srclen;  // one past the last byte to examine
    1473     1766760 :     while (p < e)
    1474             :     {
    1475     1747880 :         if (*p == 0)  // a NUL byte inside the first srclen bytes is rejected
    1476           0 :             return 0;
    1477     1747880 :         if (*p & 0x80)  // high bit set: not a plain single-byte character
    1478             :         {
    1479        1613 :             int len = 0;
    1480        1613 :             utf8decode(p, e, &len);  // utf8decode is outside this excerpt; its return value is ignored, only len is used
    1481        1613 :             if (len < 2)  // presumably utf8decode reports a length below 2 for an invalid sequence - confirm
    1482          53 :                 return 0;
    1483        1560 :             if (len > ret)  // remember the longest sequence seen
    1484         555 :                 ret = len;
    1485        1560 :             p += len;  // skip the whole multi-byte sequence
    1486             :         }
    1487             :         else
    1488             :         {
    1489     1746270 :             p++;
    1490             :         }
    1491             :     }
    1492       18880 :     return ret;  // 1 if nothing had the high bit set, otherwise the largest len seen
    1493             : }

Generated by: LCOV version 1.14