LCOV - gdal_filtered.info - port/cpl_recode

LCOV - code coverage report

Current view:	top level - port - cpl_recode_stub.cpp (source / functions)		Hit	Total	Coverage
Test:	gdal_filtered.info	Lines:	208	314	66.2 %
Date:	2025-08-19 18:03:11	Functions:	11	11	100.0 %

          Line data    Source code

       1             : /**********************************************************************
       2             :  *
       3             :  * Name:     cpl_recode_stub.cpp
       4             :  * Project:  CPL - Common Portability Library
       5             :  * Purpose:  Character set recoding and char/wchar_t conversions, stub
       6             :  *           implementation to be used if iconv() functionality is not
       7             :  *           available.
       8             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       9             :  *
      10             :  * The bulk of this code is derived from the utf.c module from FLTK. It
      11             :  * was originally downloaded from:
      12             :  *    http://svn.easysw.com/public/fltk/fltk/trunk/src/utf.c
      13             :  *
      14             :  **********************************************************************
      15             :  * Copyright (c) 2008, Frank Warmerdam
      16             :  * Copyright 2006 by Bill Spitzak and others.
      17             :  * Copyright (c) 2009-2014, Even Rouault <even dot rouault at spatialys.com>
      18             :  *
      19             :  * Permission to use, copy, modify, and distribute this software for any
      20             :  * purpose with or without fee is hereby granted, provided that the above
      21             :  * copyright notice and this permission notice appear in all copies.
      22             :  *
      23             :  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
      24             :  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
      25             :  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
      26             :  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
      27             :  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
      28             :  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
      29             :  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
      30             :  **********************************************************************/
      31             : 
      32             : #include "cpl_port.h"
      33             : #include "cpl_string.h"
      34             : 
      35             : #include <cstring>
      36             : 
      37             : #include "cpl_conv.h"
      38             : #include "cpl_error.h"
      39             : #include "cpl_character_sets.c"
      40             : 
      41             : static unsigned utf8decode(const char *p, const char *end, int *len);
      42             : static unsigned utf8towc(const char *src, unsigned srclen, wchar_t *dst,
      43             :                          unsigned dstlen);
      44             : static unsigned utf8toa(const char *src, unsigned srclen, char *dst,
      45             :                         unsigned dstlen);
      46             : static unsigned utf8fromwc(char *dst, unsigned dstlen, const wchar_t *src,
      47             :                            unsigned srclen);
      48             : static unsigned utf8froma(char *dst, unsigned dstlen, const char *src,
      49             :                           unsigned srclen);
      50             : static int utf8test(const char *src, unsigned srclen);
      51             : 
      52             : #ifdef _WIN32
      53             : 
      54             : #include <windows.h>
      55             : #include <winnls.h>
      56             : 
      57             : static char *CPLWin32Recode(const char *src, unsigned src_code_page,
      58             :                             unsigned dst_code_page) CPL_RETURNS_NONNULL;
      59             : #endif
      60             : 
      61             : /* used by cpl_recode.cpp */
      62             : extern void CPLClearRecodeStubWarningFlags();
      63             : extern char *CPLRecodeStub(const char *, const char *,
      64             :                            const char *) CPL_RETURNS_NONNULL;
      65             : extern char *CPLRecodeFromWCharStub(const wchar_t *, const char *,
      66             :                                     const char *);
      67             : extern wchar_t *CPLRecodeToWCharStub(const char *, const char *, const char *);
      68             : 
      69             : /************************************************************************/
      70             : /* ==================================================================== */
      71             : /*      Stub Implementation not depending on iconv() or WIN32 API.      */
      72             : /* ==================================================================== */
      73             : /************************************************************************/
      74             : 
      75             : static bool bHaveWarned1 = false;
      76             : static bool bHaveWarned2 = false;
      77             : static bool bHaveWarned3 = false;
      78             : static bool bHaveWarned4 = false;
      79             : static bool bHaveWarned5 = false;
      80             : static bool bHaveWarned6 = false;
      81             : 
      82             : /************************************************************************/
      83             : /*                 CPLClearRecodeStubWarningFlags()                     */
      84             : /************************************************************************/
      85             : 
      86       10930 : void CPLClearRecodeStubWarningFlags()
      87             : {
      88       10930 :     bHaveWarned1 = false;
      89       10930 :     bHaveWarned2 = false;
      90       10930 :     bHaveWarned3 = false;
      91       10930 :     bHaveWarned4 = false;
      92       10930 :     bHaveWarned5 = false;
      93       10930 :     bHaveWarned6 = false;
      94       10930 : }
      95             : 
      96             : /************************************************************************/
      97             : /*                           CPLRecodeStub()                            */
      98             : /************************************************************************/
      99             : 
     100             : /**
     101             :  * Convert a string from a source encoding to a destination encoding.
     102             :  *
     103             :  * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
     104             :  * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
     105             :  * <ul>
     106             :  *  <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in
     107             :  *  fact)</li>
     108             :  *  <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
     109             :  *  <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
     110             :  * </ul>
     111             :  *
      112             :  * If an error occurs, an error may or may not be posted with CPLError().
     113             :  *
     114             :  * @param pszSource a NULL terminated string.
     115             :  * @param pszSrcEncoding the source encoding.
     116             :  * @param pszDstEncoding the destination encoding.
     117             :  *
     118             :  * @return a NULL terminated string which should be freed with CPLFree().
     119             :  */
     120             : 
     121      785226 : char *CPLRecodeStub(const char *pszSource, const char *pszSrcEncoding,
     122             :                     const char *pszDstEncoding)
     123             : 
     124             : {
     125             :     /* -------------------------------------------------------------------- */
     126             :     /*      If the source or destination is current locale(), we change     */
     127             :     /*      it to ISO8859-1 since our stub implementation does not          */
     128             :     /*      attempt to address locales properly.                            */
     129             :     /* -------------------------------------------------------------------- */
     130             : 
     131      785226 :     if (pszSrcEncoding[0] == '\0')
     132           0 :         pszSrcEncoding = CPL_ENC_ISO8859_1;
     133             : 
     134      785226 :     if (pszDstEncoding[0] == '\0')
     135           0 :         pszDstEncoding = CPL_ENC_ISO8859_1;
     136             : 
     137             :     /* -------------------------------------------------------------------- */
     138             :     /*      ISO8859 to UTF8                                                 */
     139             :     /* -------------------------------------------------------------------- */
     140      785226 :     if (strcmp(pszSrcEncoding, CPL_ENC_ISO8859_1) == 0 &&
     141      713610 :         strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
     142             :     {
     143      713610 :         const int nCharCount = static_cast<int>(strlen(pszSource));
     144      713610 :         char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1));
     145             : 
     146      713610 :         utf8froma(pszResult, nCharCount * 2 + 1, pszSource, nCharCount);
     147             : 
     148      713610 :         return pszResult;
     149             :     }
     150             : 
     151             :     /* -------------------------------------------------------------------- */
     152             :     /*      UTF8 to ISO8859                                                 */
     153             :     /* -------------------------------------------------------------------- */
     154       71616 :     if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0 &&
     155       49939 :         strcmp(pszDstEncoding, CPL_ENC_ISO8859_1) == 0)
     156             :     {
     157       49939 :         int nCharCount = static_cast<int>(strlen(pszSource));
     158       49939 :         char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount + 1));
     159             : 
     160       49939 :         utf8toa(pszSource, nCharCount, pszResult, nCharCount + 1);
     161             : 
     162       49939 :         return pszResult;
     163             :     }
     164             : 
     165             :     // A few hard coded CPxxx/ISO-8859-x to UTF-8 tables
     166       21677 :     if (EQUAL(pszDstEncoding, CPL_ENC_UTF8))
     167             :     {
     168       21677 :         const auto pConvTable = CPLGetConversionTableToUTF8(pszSrcEncoding);
     169       21677 :         if (pConvTable)
     170             :         {
     171       21677 :             const auto convTable = *pConvTable;
     172       21677 :             const size_t nCharCount = strlen(pszSource);
     173             :             char *pszResult =
     174       21677 :                 static_cast<char *>(CPLCalloc(1, nCharCount * 3 + 1));
     175       21677 :             size_t iDst = 0;
     176       21677 :             unsigned char *pabyResult =
     177             :                 reinterpret_cast<unsigned char *>(pszResult);
     178      550181 :             for (size_t i = 0; i < nCharCount; ++i)
     179             :             {
     180      528504 :                 const unsigned char nChar =
     181      528504 :                     static_cast<unsigned char>(pszSource[i]);
     182      528504 :                 if (nChar <= 127)
     183             :                 {
     184      495466 :                     pszResult[iDst] = pszSource[i];
     185      495466 :                     ++iDst;
     186             :                 }
     187             :                 else
     188             :                 {
     189       33038 :                     const unsigned char nShiftedChar = nChar - 128;
     190       33038 :                     if (convTable[nShiftedChar][0])
     191             :                     {
     192       33037 :                         pabyResult[iDst] = convTable[nShiftedChar][0];
     193       33037 :                         ++iDst;
     194       33037 :                         CPLAssert(convTable[nShiftedChar][1]);
     195       33037 :                         pabyResult[iDst] = convTable[nShiftedChar][1];
     196       33037 :                         ++iDst;
     197       33037 :                         if (convTable[nShiftedChar][2])
     198             :                         {
     199           3 :                             pabyResult[iDst] = convTable[nShiftedChar][2];
     200           3 :                             ++iDst;
     201             :                         }
     202             :                     }
     203             :                     else
     204             :                     {
     205             :                         // Skip the invalid sequence in the input string.
     206           1 :                         if (!bHaveWarned2)
     207             :                         {
     208           1 :                             bHaveWarned2 = true;
     209           1 :                             CPLError(CE_Warning, CPLE_AppDefined,
     210             :                                      "One or several characters couldn't be "
     211             :                                      "converted correctly from %s to %s. "
     212             :                                      "This warning will not be emitted anymore",
     213             :                                      pszSrcEncoding, pszDstEncoding);
     214             :                         }
     215             :                     }
     216             :                 }
     217             :             }
     218             : 
     219       21677 :             pszResult[iDst] = 0;
     220       21677 :             return pszResult;
     221             :         }
     222             :     }
     223             : 
     224             : #ifdef _WIN32
     225             :     const auto MapEncodingToWindowsCodePage = [](const char *pszEncoding)
     226             :     {
     227             :         // Cf https://learn.microsoft.com/fr-fr/windows/win32/intl/code-page-identifiers
     228             :         if (STARTS_WITH(pszEncoding, "CP"))
     229             :         {
     230             :             const int nCode = atoi(pszEncoding + strlen("CP"));
     231             :             if (nCode > 0)
     232             :                 return nCode;
     233             :             else if (EQUAL(pszEncoding, "CP_OEMCP"))
     234             :                 return CP_OEMCP;
     235             :             else if (EQUAL(pszEncoding, "CP_ACP"))
     236             :                 return CP_ACP;
     237             :         }
     238             :         else if (STARTS_WITH(pszEncoding, "WINDOWS-"))
     239             :         {
     240             :             const int nCode = atoi(pszEncoding + strlen("WINDOWS-"));
     241             :             if (nCode > 0)
     242             :                 return nCode;
     243             :         }
     244             :         else if (STARTS_WITH(pszEncoding, "ISO-8859-"))
     245             :         {
     246             :             const int nCode = atoi(pszEncoding + strlen("ISO-8859-"));
     247             :             if ((nCode >= 1 && nCode <= 9) || nCode == 13 || nCode == 15)
     248             :                 return 28590 + nCode;
     249             :         }
     250             : 
     251             :         // Return a negative value, since CP_ACP = 0
     252             :         return -1;
     253             :     };
     254             : 
     255             :     /* ---------------------------------------------------------------------*/
     256             :     /*     XXX to UTF8                                                      */
     257             :     /* ---------------------------------------------------------------------*/
     258             :     if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
     259             :     {
     260             :         const int nCode = MapEncodingToWindowsCodePage(pszSrcEncoding);
     261             :         if (nCode >= 0)
     262             :         {
     263             :             return CPLWin32Recode(pszSource, nCode, CP_UTF8);
     264             :         }
     265             :     }
     266             : 
     267             :     /* ---------------------------------------------------------------------*/
     268             :     /*      UTF8 to XXX                                                     */
     269             :     /* ---------------------------------------------------------------------*/
     270             :     if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0)
     271             :     {
     272             :         const int nCode = MapEncodingToWindowsCodePage(pszDstEncoding);
     273             :         if (nCode >= 0)
     274             :         {
     275             :             return CPLWin32Recode(pszSource, CP_UTF8, nCode);
     276             :         }
     277             :     }
     278             : #endif
     279             : 
     280             :     /* -------------------------------------------------------------------- */
     281             :     /*      Anything else to UTF-8 is treated as ISO8859-1 to UTF-8 with    */
     282             :     /*      a one-time warning.                                             */
     283             :     /* -------------------------------------------------------------------- */
     284           0 :     if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
     285             :     {
     286           0 :         const int nCharCount = static_cast<int>(strlen(pszSource));
     287           0 :         char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1));
     288             : 
     289           0 :         if (!bHaveWarned1)
     290             :         {
     291           0 :             bHaveWarned1 = true;
     292           0 :             CPLError(CE_Warning, CPLE_AppDefined,
     293             :                      "Recode from %s to UTF-8 not supported, "
     294             :                      "treated as ISO-8859-1 to UTF-8.",
     295             :                      pszSrcEncoding);
     296             :         }
     297             : 
     298           0 :         utf8froma(pszResult, nCharCount * 2 + 1, pszSource, nCharCount);
     299             : 
     300           0 :         return pszResult;
     301             :     }
     302             : 
     303             :     /* -------------------------------------------------------------------- */
     304             :     /*      Everything else is treated as a no-op with a warning.           */
     305             :     /* -------------------------------------------------------------------- */
     306             :     {
     307           0 :         if (!bHaveWarned3)
     308             :         {
     309           0 :             bHaveWarned3 = true;
     310           0 :             CPLError(CE_Warning, CPLE_AppDefined,
     311             :                      "Recode from %s to %s not supported, no change applied.",
     312             :                      pszSrcEncoding, pszDstEncoding);
     313             :         }
     314             : 
     315           0 :         return CPLStrdup(pszSource);
     316             :     }
     317             : }
     318             : 
     319             : /************************************************************************/
     320             : /*                       CPLRecodeFromWCharStub()                       */
     321             : /************************************************************************/
     322             : 
     323             : /**
     324             :  * Convert wchar_t string to UTF-8.
     325             :  *
     326             :  * Convert a wchar_t string into a multibyte utf-8 string.  The only
     327             :  * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
     328             :  * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
     329             :  * and CPL_ENC_ISO8859_1.  In some cases (i.e. using iconv()) other encodings
     330             :  * may also be supported.
     331             :  *
     332             :  * Note that the wchar_t type varies in size on different systems. On
     333             :  * win32 it is normally 2 bytes, and on unix 4 bytes.
     334             :  *
      335             :  * If an error occurs, an error may or may not be posted with CPLError().
     336             :  *
     337             :  * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
     338             :  * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
     339             :  * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
     340             :  *
     341             :  * @return a zero terminated multi-byte string which should be freed with
     342             :  * CPLFree(), or NULL if an error occurs.
     343             :  */
     344             : 
     345      130502 : char *CPLRecodeFromWCharStub(const wchar_t *pwszSource,
     346             :                              const char *pszSrcEncoding,
     347             :                              const char *pszDstEncoding)
     348             : 
     349             : {
     350             :     /* -------------------------------------------------------------------- */
     351             :     /*      We try to avoid changes of character set.  We are just          */
     352             :     /*      providing for unicode to unicode.                               */
     353             :     /* -------------------------------------------------------------------- */
     354      130502 :     if (strcmp(pszSrcEncoding, "WCHAR_T") != 0 &&
     355      129143 :         strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0 &&
     356      129143 :         strcmp(pszSrcEncoding, CPL_ENC_UTF16) != 0 &&
     357      129143 :         strcmp(pszSrcEncoding, CPL_ENC_UCS2) != 0 &&
     358           0 :         strcmp(pszSrcEncoding, CPL_ENC_UCS4) != 0)
     359             :     {
     360           0 :         CPLError(CE_Failure, CPLE_AppDefined,
     361             :                  "Stub recoding implementation does not support "
     362             :                  "CPLRecodeFromWCharStub(...,%s,%s)",
     363             :                  pszSrcEncoding, pszDstEncoding);
     364           0 :         return nullptr;
     365             :     }
     366             : 
     367             :     /* -------------------------------------------------------------------- */
     368             :     /*      What is the source length.                                      */
     369             :     /* -------------------------------------------------------------------- */
     370      130502 :     int nSrcLen = 0;
     371             : 
     372     1764300 :     while (pwszSource[nSrcLen] != 0)
     373     1633790 :         nSrcLen++;
     374             : 
     375             :     /* -------------------------------------------------------------------- */
     376             :     /*      Allocate destination buffer plenty big.                         */
     377             :     /* -------------------------------------------------------------------- */
     378      130502 :     const int nDstBufSize = nSrcLen * 4 + 1;
     379             :     // Nearly worst case.
     380      130502 :     char *pszResult = static_cast<char *>(CPLMalloc(nDstBufSize));
     381             : 
     382      130502 :     if (nSrcLen == 0)
     383             :     {
     384       57928 :         pszResult[0] = '\0';
     385       57928 :         return pszResult;
     386             :     }
     387             : 
     388             :     /* -------------------------------------------------------------------- */
     389             :     /*      Convert, and confirm we had enough space.                       */
     390             :     /* -------------------------------------------------------------------- */
     391       72574 :     const int nDstLen = utf8fromwc(pszResult, nDstBufSize, pwszSource, nSrcLen);
     392       72574 :     if (nDstLen >= nDstBufSize)
     393             :     {
     394           0 :         CPLAssert(false);  // too small!
     395             :         return nullptr;
     396             :     }
     397             : 
     398             :     /* -------------------------------------------------------------------- */
     399             :     /*      If something other than UTF-8 was requested, recode now.        */
     400             :     /* -------------------------------------------------------------------- */
     401       72574 :     if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
     402       72574 :         return pszResult;
     403             : 
     404             :     char *pszFinalResult =
     405           0 :         CPLRecodeStub(pszResult, CPL_ENC_UTF8, pszDstEncoding);
     406             : 
     407           0 :     CPLFree(pszResult);
     408             : 
     409           0 :     return pszFinalResult;
     410             : }
     411             : 
     412             : /************************************************************************/
     413             : /*                        CPLRecodeToWCharStub()                        */
     414             : /************************************************************************/
     415             : 
     416             : /**
     417             :  * Convert UTF-8 string to a wchar_t string.
     418             :  *
     419             :  * Convert a 8bit, multi-byte per character input string into a wide
     420             :  * character (wchar_t) string.  The only guaranteed supported source encodings
      421             :  * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1 (LATIN1).  The only
     422             :  * guaranteed supported destination encoding is CPL_ENC_UCS2.  Other source
     423             :  * and destination encodings may be supported depending on the underlying
     424             :  * implementation.
     425             :  *
     426             :  * Note that the wchar_t type varies in size on different systems. On
     427             :  * win32 it is normally 2 bytes, and on unix 4 bytes.
     428             :  *
      429             :  * If an error occurs, an error may or may not be posted with CPLError().
     430             :  *
     431             :  * @param pszSource input multi-byte character string.
     432             :  * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
     433             :  * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
     434             :  *
     435             :  * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
     436             :  * NULL on error.
     437             :  *
     438             :  * @since GDAL 1.6.0
     439             :  */
     440             : 
     441       40952 : wchar_t *CPLRecodeToWCharStub(const char *pszSource, const char *pszSrcEncoding,
     442             :                               const char *pszDstEncoding)
     443             : 
     444             : {
     445       40952 :     char *pszUTF8Source = const_cast<char *>(pszSource);
     446             : 
     447       40952 :     if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0 &&
     448           0 :         strcmp(pszSrcEncoding, CPL_ENC_ASCII) != 0)
     449             :     {
     450           0 :         pszUTF8Source = CPLRecodeStub(pszSource, pszSrcEncoding, CPL_ENC_UTF8);
     451           0 :         if (pszUTF8Source == nullptr)
     452           0 :             return nullptr;
     453             :     }
     454             : 
     455             :     /* -------------------------------------------------------------------- */
     456             :     /*      We try to avoid changes of character set.  We are just          */
     457             :     /*      providing for unicode to unicode.                               */
     458             :     /* -------------------------------------------------------------------- */
     459       40952 :     if (strcmp(pszDstEncoding, "WCHAR_T") != 0 &&
     460       40952 :         strcmp(pszDstEncoding, CPL_ENC_UCS2) != 0 &&
     461           0 :         strcmp(pszDstEncoding, CPL_ENC_UCS4) != 0 &&
     462           0 :         strcmp(pszDstEncoding, CPL_ENC_UTF16) != 0)
     463             :     {
     464           0 :         CPLError(CE_Failure, CPLE_AppDefined,
     465             :                  "Stub recoding implementation does not support "
     466             :                  "CPLRecodeToWCharStub(...,%s,%s)",
     467             :                  pszSrcEncoding, pszDstEncoding);
     468           0 :         if (pszUTF8Source != pszSource)
     469           0 :             CPLFree(pszUTF8Source);
     470           0 :         return nullptr;
     471             :     }
     472             : 
     473             :     /* -------------------------------------------------------------------- */
     474             :     /*      Do the UTF-8 to UCS-2 recoding.                                 */
     475             :     /* -------------------------------------------------------------------- */
     476       40952 :     int nSrcLen = static_cast<int>(strlen(pszUTF8Source));
     477             :     wchar_t *pwszResult =
     478       40952 :         static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), nSrcLen + 1));
     479             : 
     480       40952 :     utf8towc(pszUTF8Source, nSrcLen, pwszResult, nSrcLen + 1);
     481             : 
     482       40952 :     if (pszUTF8Source != pszSource)
     483           0 :         CPLFree(pszUTF8Source);
     484             : 
     485       40952 :     return pwszResult;
     486             : }
     487             : 
     488             : /************************************************************************/
     489             : /*                                 CPLIsUTF8()                          */
     490             : /************************************************************************/
     491             : 
     492             : /**
     493             :  * Test if a string is encoded as UTF-8.
     494             :  *
     495             :  * @param pabyData input string to test
     496             :  * @param nLen length of the input string, or -1 if the function must compute
     497             :  *             the string length. In which case it must be null terminated.
     498             :  * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
     499             :  *
     500             :  * @since GDAL 1.7.0
     501             :  */
     502       19550 : int CPLIsUTF8(const char *pabyData, int nLen)
     503             : {
     504       19550 :     if (nLen < 0)
     505       14949 :         nLen = static_cast<int>(strlen(pabyData));
     506       19550 :     return utf8test(pabyData, static_cast<unsigned>(nLen)) != 0;
     507             : }
     508             : 
     509             : /************************************************************************/
     510             : /* ==================================================================== */
     511             : /*      UTF.C code from FLTK with some modifications.                   */
     512             : /* ==================================================================== */
     513             : /************************************************************************/
     514             : 
     515             : /* Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero
     516             :    they are instead turned into the Unicode REPLACEMENT CHARACTER, of
     517             :    value 0xfffd.
     518             :    If this is on, utf8decode will correctly map most (perhaps all)
     519             :    human-readable text that is in ISO-8859-1. This may allow you
     520             :    to completely ignore character sets in your code because virtually
     521             :    everything is either ISO-8859-1 or UTF-8.
     522             : */
     523             : #define ERRORS_TO_ISO8859_1 1
     524             : 
     525             : /* Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
     526             :    Unicode index for Microsoft's CP1252 character set. You should
     527             :    also set ERRORS_TO_ISO8859_1. With this, a huge amount of additional
     528             :    available text (such as all web pages) is correctly converted
     529             :    to Unicode.
     530             : */
     531             : #define ERRORS_TO_CP1252 1
     532             : 
     533             : /* A number of Unicode code points are in fact illegal and should not
     534             :    be produced by a UTF-8 converter. Turning this on will replace the
     535             :    bytes in those encodings with errors. If you do this then converting
     536             :    arbitrary 16-bit data to UTF-8 and then back is not an identity,
     537             :    which will probably break a lot of software.
     538             : */
     539             : #define STRICT_RFC3629 0
     540             : 
     541             : #if ERRORS_TO_CP1252
     542             : // Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
     543             : // to Unicode:
     544             : constexpr unsigned short cp1252[32] = {
     545             :     0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
     546             :     0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
     547             :     0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
     548             :     0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178};
     549             : #endif
     550             : 
     551             : /************************************************************************/
     552             : /*                             utf8decode()                             */
     553             : /************************************************************************/
     554             : 
     555             : /*
     556             :     Decode a single UTF-8 encoded character starting at \e p. The
     557             :     resulting Unicode value (in the range 0-0x10ffff) is returned,
     558             :     and \e len is set to the number of bytes in the UTF-8 encoding
     559             :     (adding \e len to \e p will point at the next character).
     560             : 
     561             :     If \a p points at an illegal UTF-8 encoding, including one that
     562             :     would go past \e end, or where a code uses more bytes than
     563             :     necessary, then *reinterpret_cast<const unsigned char*>(p) is translated as
     564             :     though it is in the Microsoft CP1252 character set and \e len is set to 1.
     565             :     Treating errors this way allows this to decode almost any
     566             :     ISO-8859-1 or CP1252 text that has been mistakenly placed where
     567             :     UTF-8 is expected, and has proven very useful.
     568             : 
     569             :     If you want errors to be converted to error characters (as the
     570             :     standards recommend), adding a test to see if the length is
     571             :     unexpectedly 1 will work:
     572             : 
     573             : \code
     574             :     if( *p & 0x80 )
     575             :     {  // What should be a multibyte encoding.
     576             :       code = utf8decode(p, end, &len);
     577             :       if( len<2 ) code = 0xFFFD;  // Turn errors into REPLACEMENT CHARACTER.
     578             :     }
     579             :     else
     580             :     {  // Handle the 1-byte utf8 encoding:
     581             :       code = *p;
     582             :       len = 1;
     583             :     }
     584             : \endcode
     585             : 
     586             :     Direct testing for the 1-byte case (as shown above) will also
     587             :     speed up the scanning of strings where the majority of characters
     588             :     are ASCII.
     589             : */
     590        2707 : static unsigned utf8decode(const char *p, const char *end, int *len)
     591             : {
     592        2707 :     unsigned char c = *reinterpret_cast<const unsigned char *>(p);
     593        2707 :     if (c < 0x80)
     594             :     {
     595           0 :         *len = 1;
     596           0 :         return c;
     597             : #if ERRORS_TO_CP1252
     598             :     }
     599        2707 :     else if (c < 0xa0)
     600             :     {
     601          39 :         *len = 1;
     602          39 :         return cp1252[c - 0x80];
     603             : #endif
     604             :     }
     605        2668 :     else if (c < 0xc2)
     606             :     {
     607          10 :         goto FAIL;
     608             :     }
     609        2658 :     if (p + 1 >= end || (p[1] & 0xc0) != 0x80)
     610          72 :         goto FAIL;
     611        2586 :     if (c < 0xe0)
     612             :     {
     613        2578 :         *len = 2;
     614        2578 :         return ((p[0] & 0x1f) << 6) + ((p[1] & 0x3f));
     615             :     }
     616           8 :     else if (c == 0xe0)
     617             :     {
     618           0 :         if ((reinterpret_cast<const unsigned char *>(p))[1] < 0xa0)
     619           0 :             goto FAIL;
     620           0 :         goto UTF8_3;
     621             : #if STRICT_RFC3629
     622             :     }
     623             :     else if (c == 0xed)
     624             :     {
     625             :         // RFC 3629 says surrogate chars are illegal.
     626             :         if ((reinterpret_cast<const unsigned char *>(p))[1] >= 0xa0)
     627             :             goto FAIL;
     628             :         goto UTF8_3;
     629             :     }
     630             :     else if (c == 0xef)
     631             :     {
     632             :         // 0xfffe and 0xffff are also illegal characters.
     633             :         if ((reinterpret_cast<const unsigned char *>(p))[1] == 0xbf &&
     634             :             (reinterpret_cast<const unsigned char *>(p))[2] >= 0xbe)
     635             :             goto FAIL;
     636             :         goto UTF8_3;
     637             : #endif
     638             :     }
     639           8 :     else if (c < 0xf0)
     640             :     {
     641           4 :     UTF8_3:
     642           4 :         if (p + 2 >= end || (p[2] & 0xc0) != 0x80)
     643           0 :             goto FAIL;
     644           4 :         *len = 3;
     645           4 :         return ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + ((p[2] & 0x3f));
     646             :     }
     647           4 :     else if (c == 0xf0)
     648             :     {
     649           4 :         if ((reinterpret_cast<const unsigned char *>(p))[1] < 0x90)
     650           0 :             goto FAIL;
     651           4 :         goto UTF8_4;
     652             :     }
     653           0 :     else if (c < 0xf4)
     654             :     {
     655           0 :     UTF8_4:
     656           4 :         if (p + 3 >= end || (p[2] & 0xc0) != 0x80 || (p[3] & 0xc0) != 0x80)
     657           0 :             goto FAIL;
     658           4 :         *len = 4;
     659             : #if STRICT_RFC3629
     660             :         // RFC 3629 says all codes ending in fffe or ffff are illegal:
     661             :         if ((p[1] & 0xf) == 0xf &&
     662             :             (reinterpret_cast<const unsigned char *>(p))[2] == 0xbf &&
     663             :             (reinterpret_cast<const unsigned char *>(p))[3] >= 0xbe)
     664             :             goto FAIL;
     665             : #endif
     666           4 :         return ((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12) +
     667           4 :                ((p[2] & 0x3f) << 6) + ((p[3] & 0x3f));
     668             :     }
     669           0 :     else if (c == 0xf4)
     670             :     {
     671           0 :         if ((reinterpret_cast<const unsigned char *>(p))[1] > 0x8f)
     672           0 :             goto FAIL;  // After 0x10ffff.
     673           0 :         goto UTF8_4;
     674             :     }
     675             :     else
     676             :     {
     677           0 :     FAIL:
     678          82 :         *len = 1;
     679             : #if ERRORS_TO_ISO8859_1
     680          82 :         return c;
     681             : #else
     682             :         return 0xfffd;  // Unicode REPLACEMENT CHARACTER
     683             : #endif
     684             :     }
     685             : }
     686             : 
     687             : /************************************************************************/
     688             : /*                              utf8towc()                              */
     689             : /************************************************************************/
     690             : 
     691             : /*  Convert a UTF-8 sequence into an array of wchar_t. These
     692             :     are used by some system calls, especially on Windows.
     693             : 
     694             :     \a src points at the UTF-8, and \a srclen is the number of bytes to
     695             :     convert.
     696             : 
     697             :     \a dst points at an array to write, and \a dstlen is the number of
     698             :     locations in this array. At most \a dstlen-1 words will be
     699             :     written there, plus a 0 terminating word. Thus this function
     700             :     will never overwrite the buffer and will always return a
     701             :     zero-terminated string. If \a dstlen is zero then \a dst can be
     702             :     null and no data is written, but the length is returned.
     703             : 
     704             :     The return value is the number of words that \e would be written
     705             :     to \a dst if it were long enough, not counting the terminating
     706             :     zero. If the return value is greater than or equal to \a dstlen, it
     707             :     indicates truncation; you can then allocate a new array of size
     708             :     return+1 and call this again.
     709             : 
     710             :     Errors in the UTF-8 are converted as though each byte in the
     711             :     erroneous string is in the Microsoft CP1252 encoding. This allows
     712             :     ISO-8859-1 text mistakenly identified as UTF-8 to be printed
     713             :     correctly.
     714             : 
     715             :     Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
     716             :     and most other systems. Where wchar_t is 16 bits, Unicode
     717             :     characters in the range 0x10000 to 0x10ffff are converted to
     718             :     "surrogate pairs" which take two words each (this is called UTF-16
     719             :     encoding). If wchar_t is 32 bits this rather nasty problem is
     720             :     avoided.
     721             : */
     722       40952 : static unsigned utf8towc(const char *src, unsigned srclen, wchar_t *dst,
     723             :                          unsigned dstlen)
     724             : {
     725       40952 :     const char *p = src;
     726       40952 :     const char *e = src + srclen;
     727       40952 :     unsigned count = 0;
     728       40952 :     if (dstlen)
     729             :         while (true)
     730             :         {
     731      298216 :             if (p >= e)
     732             :             {
     733       40952 :                 dst[count] = 0;
     734       40952 :                 return count;
     735             :             }
     736      257264 :             if (!(*p & 0x80))
     737             :             {
     738             :                 // ASCII
     739      257060 :                 dst[count] = *p++;
     740             :             }
     741             :             else
     742             :             {
     743         204 :                 int len = 0;
     744         204 :                 unsigned ucs = utf8decode(p, e, &len);
     745         204 :                 p += len;
     746             : #ifdef _WIN32
     747             :                 if (ucs < 0x10000)
     748             :                 {
     749             :                     dst[count] = static_cast<wchar_t>(ucs);
     750             :                 }
     751             :                 else
     752             :                 {
     753             :                     // Make a surrogate pair:
     754             :                     if (count + 2 >= dstlen)
     755             :                     {
     756             :                         dst[count] = 0;
     757             :                         count += 2;
     758             :                         break;
     759             :                     }
     760             :                     dst[count] = static_cast<wchar_t>(
     761             :                         (((ucs - 0x10000u) >> 10) & 0x3ff) | 0xd800);
     762             :                     dst[++count] = static_cast<wchar_t>((ucs & 0x3ff) | 0xdc00);
     763             :                 }
     764             : #else
     765         204 :                 dst[count] = static_cast<wchar_t>(ucs);
     766             : #endif
     767             :             }
     768      257264 :             if (++count == dstlen)
     769             :             {
     770           0 :                 dst[count - 1] = 0;
     771           0 :                 break;
     772             :             }
     773      257264 :         }
     774             :     // We filled dst, measure the rest:
     775           0 :     while (p < e)
     776             :     {
     777           0 :         if (!(*p & 0x80))
     778             :         {
     779           0 :             p++;
     780             :         }
     781             :         else
     782             :         {
     783           0 :             int len = 0;
     784             : #ifdef _WIN32
     785             :             const unsigned ucs = utf8decode(p, e, &len);
     786             :             p += len;
     787             :             if (ucs >= 0x10000)
     788             :                 ++count;
     789             : #else
     790           0 :             utf8decode(p, e, &len);
     791           0 :             p += len;
     792             : #endif
     793             :         }
     794           0 :         ++count;
     795             :     }
     796             : 
     797           0 :     return count;
     798             : }
     799             : 
     800             : /************************************************************************/
     801             : /*                              utf8toa()                               */
     802             : /************************************************************************/
     803             : /* Convert a UTF-8 sequence into an array of 1-byte characters.
     804             : 
     805             :     If the UTF-8 decodes to a character greater than 0xff then it is
     806             :     replaced with '?'.
     807             : 
     808             :     Errors in the UTF-8 are converted as individual bytes, same as
     809             :     utf8decode() does. This allows ISO-8859-1 text mistakenly identified
     810             :     as UTF-8 to be printed correctly (and possibly CP1252 on Windows).
     811             : 
     812             :     \a src points at the UTF-8, and \a srclen is the number of bytes to
     813             :     convert.
     814             : 
     815             :     Up to \a dstlen bytes are written to \a dst, including a null
     816             :     terminator. The return value is the number of bytes that would be
     817             :     written, not counting the null terminator. If greater or equal to
     818             :     \a dstlen then if you malloc a new array of size n+1 you will have
     819             :     the space needed for the entire string. If \a dstlen is zero then
     820             :     nothing is written and this call just measures the storage space
     821             :     needed.
     822             : */
     823       49939 : static unsigned int utf8toa(const char *src, unsigned srclen, char *dst,
     824             :                             unsigned dstlen)
     825             : {
     826       49939 :     const char *p = src;
     827       49939 :     const char *e = src + srclen;
     828       49939 :     unsigned int count = 0;
     829       49939 :     if (dstlen)
     830             :         while (true)
     831             :         {
     832      193955 :             if (p >= e)
     833             :             {
     834       49939 :                 dst[count] = 0;
     835       49939 :                 return count;
     836             :             }
     837      144016 :             unsigned char c = *reinterpret_cast<const unsigned char *>(p);
     838      144016 :             if (c < 0xC2)
     839             :             {
     840             :                 // ASCII or bad code.
     841      143118 :                 dst[count] = c;
     842      143118 :                 p++;
     843             :             }
     844             :             else
     845             :             {
     846         898 :                 int len = 0;
     847         898 :                 const unsigned int ucs = utf8decode(p, e, &len);
     848         898 :                 p += len;
     849         898 :                 if (ucs < 0x100)
     850             :                 {
     851         894 :                     dst[count] = static_cast<char>(ucs);
     852             :                 }
     853             :                 else
     854             :                 {
     855           4 :                     if (!bHaveWarned4)
     856             :                     {
     857           2 :                         bHaveWarned4 = true;
     858           2 :                         CPLError(
     859             :                             CE_Warning, CPLE_AppDefined,
     860             :                             "One or several characters couldn't be converted "
     861             :                             "correctly from UTF-8 to ISO-8859-1.  "
     862             :                             "This warning will not be emitted anymore.");
     863             :                     }
     864           4 :                     dst[count] = '?';
     865             :                 }
     866             :             }
     867      144016 :             if (++count >= dstlen)
     868             :             {
     869           0 :                 dst[count - 1] = 0;
     870           0 :                 break;
     871             :             }
     872      144016 :         }
     873             :     // We filled dst, measure the rest:
     874           0 :     while (p < e)
     875             :     {
     876           0 :         if (!(*p & 0x80))
     877             :         {
     878           0 :             p++;
     879             :         }
     880             :         else
     881             :         {
     882           0 :             int len = 0;
     883           0 :             utf8decode(p, e, &len);
     884           0 :             p += len;
     885             :         }
     886           0 :         ++count;
     887             :     }
     888           0 :     return count;
     889             : }
     890             : 
     891             : /************************************************************************/
     892             : /*                             utf8fromwc()                             */
     893             : /************************************************************************/
     894             : /* Turn "wide characters" as returned by some system calls
     895             :     (especially on Windows) into UTF-8.
     896             : 
     897             :     Up to \a dstlen bytes are written to \a dst, including a null
     898             :     terminator. The return value is the number of bytes that would be
     899             :     written, not counting the null terminator. If greater or equal to
     900             :     \a dstlen then if you malloc a new array of size n+1 you will have
     901             :     the space needed for the entire string. If \a dstlen is zero then
     902             :     nothing is written and this call just measures the storage space
     903             :     needed.
     904             : 
     905             :     \a srclen is the number of words in \a src to convert. On Windows
     906             :     this is not necessarily the number of characters, due to there
     907             :     possibly being "surrogate pairs" in the UTF-16 encoding used.
     908             :     On Unix wchar_t is 32 bits and each location is a character.
     909             : 
     910             :     On Unix if a src word is greater than 0x10ffff then this is an
     911             :     illegal character according to RFC 3629. These are converted as
     912             :     though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
     913             :     range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
     914             :     illegal according to RFC 3629. However I encode these as though
     915             :     they are legal, so that utf8towc will return the original data.
     916             : 
     917             :     On Windows "surrogate pairs" are converted to a single character
     918             :     and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
     919             :     pairs are converted as though they are individual characters.
     920             : */
     921       72574 : static unsigned int utf8fromwc(char *dst, unsigned dstlen, const wchar_t *src,
     922             :                                unsigned srclen)
     923             : {
     924       72574 :     unsigned int i = 0;
     925       72574 :     unsigned int count = 0;
     926       72574 :     if (dstlen)
     927             :         while (true)
     928             :         {
     929     1706370 :             if (i >= srclen)
     930             :             {
     931       72574 :                 dst[count] = 0;
     932       72574 :                 return count;
     933             :             }
     934     1633790 :             unsigned int ucs = src[i++];
     935     1633790 :             if (ucs < 0x80U)
     936             :             {
     937     1627390 :                 dst[count++] = static_cast<char>(ucs);
     938     1627390 :                 if (count >= dstlen)
     939             :                 {
     940           0 :                     dst[count - 1] = 0;
     941           0 :                     break;
     942             :                 }
     943             :             }
     944        6406 :             else if (ucs < 0x800U)
     945             :             {
     946             :                 // 2 bytes.
     947        4033 :                 if (count + 2 >= dstlen)
     948             :                 {
     949           0 :                     dst[count] = 0;
     950           0 :                     count += 2;
     951           0 :                     break;
     952             :                 }
     953        4033 :                 dst[count++] = 0xc0 | static_cast<char>(ucs >> 6);
     954        4033 :                 dst[count++] = 0x80 | static_cast<char>(ucs & 0x3F);
     955             : #ifdef _WIN32
     956             :             }
     957             :             else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
     958             :                      src[i] >= 0xdc00 && src[i] <= 0xdfff)
     959             :             {
     960             :                 // Surrogate pair.
     961             :                 unsigned int ucs2 = src[i++];
     962             :                 ucs = 0x10000U + ((ucs & 0x3ff) << 10) + (ucs2 & 0x3ff);
     963             :                 // All surrogate pairs turn into 4-byte utf8.
     964             : #else
     965             :             }
     966        2373 :             else if (ucs >= 0x10000)
     967             :             {
     968           1 :                 if (ucs > 0x10ffff)
     969             :                 {
     970           1 :                     ucs = 0xfffd;
     971           1 :                     goto J1;
     972             :                 }
     973             : #endif
     974           0 :                 if (count + 4 >= dstlen)
     975             :                 {
     976           0 :                     dst[count] = 0;
     977           0 :                     count += 4;
     978           0 :                     break;
     979             :                 }
     980           0 :                 dst[count++] = 0xf0 | static_cast<char>(ucs >> 18);
     981           0 :                 dst[count++] = 0x80 | static_cast<char>((ucs >> 12) & 0x3F);
     982           0 :                 dst[count++] = 0x80 | static_cast<char>((ucs >> 6) & 0x3F);
     983           0 :                 dst[count++] = 0x80 | static_cast<char>(ucs & 0x3F);
     984             :             }
     985             :             else
     986             :             {
     987             : #ifndef _WIN32
     988        2372 :             J1:
     989             : #endif
     990             :                 // All others are 3 bytes:
     991        2373 :                 if (count + 3 >= dstlen)
     992             :                 {
     993           0 :                     dst[count] = 0;
     994           0 :                     count += 3;
     995           0 :                     break;
     996             :                 }
     997        2373 :                 dst[count++] = 0xe0 | static_cast<char>(ucs >> 12);
     998        2373 :                 dst[count++] = 0x80 | static_cast<char>((ucs >> 6) & 0x3F);
     999        2373 :                 dst[count++] = 0x80 | static_cast<char>(ucs & 0x3F);
    1000             :             }
    1001     1633790 :         }
    1002             : 
    1003             :     // We filled dst, measure the rest:
    1004           0 :     while (i < srclen)
    1005             :     {
    1006           0 :         unsigned int ucs = src[i++];
    1007           0 :         if (ucs < 0x80U)
    1008             :         {
    1009           0 :             count++;
    1010             :         }
    1011           0 :         else if (ucs < 0x800U)
    1012             :         {
    1013             :             // 2 bytes.
    1014           0 :             count += 2;
    1015             : #ifdef _WIN32
    1016             :         }
    1017             :         else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen - 1 &&
    1018             :                  src[i + 1] >= 0xdc00 && src[i + 1] <= 0xdfff)
    1019             :         {
    1020             :             // Surrogate pair.
    1021             :             ++i;
    1022             : #else
    1023             :         }
    1024           0 :         else if (ucs >= 0x10000 && ucs <= 0x10ffff)
    1025             :         {
    1026             : #endif
    1027           0 :             count += 4;
    1028             :         }
    1029             :         else
    1030             :         {
    1031           0 :             count += 3;
    1032             :         }
    1033             :     }
    1034           0 :     return count;
    1035             : }
    1036             : 
    1037             : /************************************************************************/
    1038             : /*                             utf8froma()                              */
    1039             : /************************************************************************/
    1040             : 
    1041             : /* Convert an ISO-8859-1 (i.e. normal c-string) byte stream to UTF-8.
    1042             : 
    1043             :     It is possible this should convert Microsoft's CP1252 to UTF-8
    1044             :     instead. This would translate the codes in the range 0x80-0x9f
    1045             :     to different characters. Currently it does not do this.
    1046             : 
    1047             :     Up to \a dstlen bytes are written to \a dst, including a null
    1048             :     terminator. The return value is the number of bytes that would be
    1049             :     written, not counting the null terminator. If greater or equal to
    1050             :     \a dstlen then if you malloc a new array of size n+1 you will have
    1051             :     the space needed for the entire string. If \a dstlen is zero then
    1052             :     nothing is written and this call just measures the storage space
    1053             :     needed.
    1054             : 
    1055             :     \a srclen is the number of bytes in \a src to convert.
    1056             : 
    1057             :     If the return value equals \a srclen then this indicates that
    1058             :     no conversion is necessary, as only ASCII characters are in the
    1059             :     string.
    1060             : */
    1061      713610 : static unsigned utf8froma(char *dst, unsigned dstlen, const char *src,
    1062             :                           unsigned srclen)
    1063             : {
    1064      713610 :     const char *p = src;
    1065      713610 :     const char *e = src + srclen;
    1066      713610 :     unsigned count = 0;
    1067      713610 :     if (dstlen)
    1068             :         while (true)
    1069             :         {
    1070     8448250 :             if (p >= e)
    1071             :             {
    1072      713610 :                 dst[count] = 0;
    1073      713610 :                 return count;
    1074             :             }
    1075     7734640 :             unsigned char ucs = *reinterpret_cast<const unsigned char *>(p);
    1076     7734640 :             p++;
    1077     7734640 :             if (ucs < 0x80U)
    1078             :             {
    1079     7696850 :                 dst[count++] = ucs;
    1080     7696850 :                 if (count >= dstlen)
    1081             :                 {
    1082           0 :                     dst[count - 1] = 0;
    1083           0 :                     break;
    1084             :                 }
    1085             :             }
    1086             :             else
    1087             :             {
    1088             :                 // 2 bytes (note that CP1252 translate could make 3 bytes!)
    1089       37785 :                 if (count + 2 >= dstlen)
    1090             :                 {
    1091           0 :                     dst[count] = 0;
    1092           0 :                     count += 2;
    1093           0 :                     break;
    1094             :                 }
    1095       37785 :                 dst[count++] = 0xc0 | (ucs >> 6);
    1096       37785 :                 dst[count++] = 0x80 | (ucs & 0x3F);
    1097             :             }
    1098     7734640 :         }
    1099             : 
    1100             :     // We filled dst, measure the rest:
    1101           0 :     while (p < e)
    1102             :     {
    1103           0 :         unsigned char ucs = *reinterpret_cast<const unsigned char *>(p);
    1104           0 :         p++;
    1105           0 :         if (ucs < 0x80U)
    1106             :         {
    1107           0 :             count++;
    1108             :         }
    1109             :         else
    1110             :         {
    1111           0 :             count += 2;
    1112             :         }
    1113             :     }
    1114             : 
    1115           0 :     return count;
    1116             : }
    1117             : 
    1118             : #ifdef _WIN32
    1119             : 
    1120             : /************************************************************************/
    1121             : /*                            CPLWin32Recode()                          */
    1122             : /************************************************************************/
    1123             : 
    1124             : /* Convert a CODEPAGE (i.e. normal c-string) byte stream
    1125             :      to another CODEPAGE (i.e. normal c-string) byte stream.
    1126             : 
    1127             :     \a src is source c-string byte stream (including a null terminator).
    1128             :     \a src_code_page is source c-string byte code page.
    1129             :     \a dst_code_page is destination c-string byte code page.
    1130             : 
    1131             :    UTF7          65000
    1132             :    UTF8          65001
    1133             :    OEM-US          437
    1134             :    OEM-ARABIC      720
    1135             :    OEM-GREEK       737
    1136             :    OEM-BALTIC      775
    1137             :    OEM-MLATIN1     850
    1138             :    OEM-LATIN2      852
    1139             :    OEM-CYRILLIC    855
    1140             :    OEM-TURKISH     857
    1141             :    OEM-MLATIN1P    858
    1142             :    OEM-HEBREW      862
    1143             :    OEM-RUSSIAN     866
    1144             : 
    1145             :    THAI            874
    1146             :    SJIS            932
    1147             :    GBK             936
    1148             :    KOREA           949
    1149             :    BIG5            950
    1150             : 
    1151             :    EUROPE         1250
    1152             :    CYRILLIC       1251
    1153             :    LATIN1         1252
    1154             :    GREEK          1253
    1155             :    TURKISH        1254
    1156             :    HEBREW         1255
    1157             :    ARABIC         1256
    1158             :    BALTIC         1257
    1159             :    VIETNAM        1258
    1160             : 
    1161             :    ISO-LATIN1    28591
    1162             :    ISO-LATIN2    28592
    1163             :    ISO-LATIN3    28593
    1164             :    ISO-BALTIC    28594
    1165             :    ISO-CYRILLIC  28595
    1166             :    ISO-ARABIC    28596
    1167             :    ISO-HEBREW    28598
    1168             :    ISO-TURKISH   28599
    1169             :    ISO-LATIN9    28605
    1170             : 
    1171             :    ISO-2022-JP   50220
    1172             : 
    1173             : */
    1174             : 
    1175             : char *CPLWin32Recode(const char *src, unsigned src_code_page,
    1176             :                      unsigned dst_code_page)
    1177             : {
    1178             :     // Convert from source code page to Unicode.
    1179             : 
    1180             :     // Compute the length in wide characters.
    1181             :     int wlen = MultiByteToWideChar(src_code_page, MB_ERR_INVALID_CHARS, src, -1,
    1182             :                                    nullptr, 0);
    1183             :     if (wlen == 0 && GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
    1184             :     {
    1185             :         if (!bHaveWarned5)
    1186             :         {
    1187             :             bHaveWarned5 = true;
    1188             :             CPLError(
    1189             :                 CE_Warning, CPLE_AppDefined,
    1190             :                 "One or several characters could not be translated from CP%d. "
    1191             :                 "This warning will not be emitted anymore.",
    1192             :                 src_code_page);
    1193             :         }
    1194             : 
    1195             :         // Retry now without MB_ERR_INVALID_CHARS flag.
    1196             :         wlen = MultiByteToWideChar(src_code_page, 0, src, -1, nullptr, 0);
    1197             :     }
    1198             : 
    1199             :     // Do the actual conversion.
    1200             :     wchar_t *tbuf =
    1201             :         static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), wlen + 1));
    1202             :     tbuf[wlen] = 0;
    1203             :     MultiByteToWideChar(src_code_page, 0, src, -1, tbuf, wlen + 1);
    1204             : 
    1205             :     // Convert from Unicode to destination code page.
    1206             : 
    1207             :     // Compute the length in chars.
    1208             :     BOOL bUsedDefaultChar = FALSE;
    1209             :     int len = 0;
    1210             :     if (dst_code_page == CP_UTF7 || dst_code_page == CP_UTF8)
    1211             :         len = WideCharToMultiByte(dst_code_page, 0, tbuf, -1, nullptr, 0,
    1212             :                                   nullptr, nullptr);
    1213             :     else
    1214             :         len = WideCharToMultiByte(dst_code_page, 0, tbuf, -1, nullptr, 0,
    1215             :                                   nullptr, &bUsedDefaultChar);
    1216             :     if (bUsedDefaultChar)
    1217             :     {
    1218             :         if (!bHaveWarned6)
    1219             :         {
    1220             :             bHaveWarned6 = true;
    1221             :             CPLError(
    1222             :                 CE_Warning, CPLE_AppDefined,
    1223             :                 "One or several characters could not be translated to CP%d. "
    1224             :                 "This warning will not be emitted anymore.",
    1225             :                 dst_code_page);
    1226             :         }
    1227             :     }
    1228             : 
    1229             :     // Do the actual conversion.
    1230             :     char *pszResult = static_cast<char *>(CPLCalloc(sizeof(char), len + 1));
    1231             :     WideCharToMultiByte(dst_code_page, 0, tbuf, -1, pszResult, len + 1, nullptr,
    1232             :                         nullptr);
    1233             :     pszResult[len] = 0;
    1234             : 
    1235             :     CPLFree(tbuf);
    1236             : 
    1237             :     return pszResult;
    1238             : }
    1239             : 
    1240             : #endif
    1241             : 
    1242             : /*
    1243             : ** For now we disable the rest which is locale() related.  We may need
    1244             : ** parts of it later.
    1245             : */
    1246             : 
    1247             : #ifdef notdef
    1248             : 
    1249             : #ifdef _WIN32
    1250             : #include <windows.h>
    1251             : #endif
    1252             : 
    1253             : /*! Return true if the "locale" seems to indicate that UTF-8 encoding
    1254             :     is used. If true the utf8tomb and utf8frommb don't do anything
    1255             :     useful.
    1256             : 
    1257             :     <i>It is highly recommended that you change your system so this
    1258             :     does return true.</i> On Windows this is done by setting the
    1259             :     "codepage" to CP_UTF8.  On Unix this is done by setting $LC_CTYPE
    1260             :     to a string containing the letters "utf" or "UTF" in it, or by
    1261             :     deleting all $LC* and $LANG environment variables. In the future
    1262             :     it is likely that all non-Asian Unix systems will return true,
    1263             :     due to the compatibility of UTF-8 with ISO-8859-1.
    1264             : */
    1265             : int utf8locale(void)
    1266             : {
    1267             :     static int ret = 2;
    1268             :     if (ret == 2)
    1269             :     {
    1270             : #ifdef _WIN32
    1271             :         ret = GetACP() == CP_UTF8;
    1272             : #else
    1273             :         char *s;
    1274             :         ret = 1;  // assume UTF-8 if no locale
    1275             :         if (((s = getenv("LC_CTYPE")) && *s) ||
    1276             :             ((s = getenv("LC_ALL")) && *s) || ((s = getenv("LANG")) && *s))
    1277             :         {
    1278             :             ret = strstr(s, "utf") || strstr(s, "UTF");
    1279             :         }
    1280             : #endif
    1281             :     }
    1282             : 
    1283             :     return ret;
    1284             : }
    1285             : 
    1286             : /*! Convert the UTF-8 used by FLTK to the locale-specific encoding
    1287             :     used for filenames (and sometimes used for data in files).
    1288             :     Unfortunately due to stupid design you will have to do this as
    1289             :     needed for filenames. This is a bug on both Unix and Windows.
    1290             : 
    1291             :     Up to \a dstlen bytes are written to \a dst, including a null
    1292             :     terminator. The return value is the number of bytes that would be
    1293             :     written, not counting the null terminator. If greater or equal to
    1294             :     \a dstlen then if you malloc a new array of size n+1 you will have
    1295             :     the space needed for the entire string. If \a dstlen is zero then
    1296             :     nothing is written and this call just measures the storage space
    1297             :     needed.
    1298             : 
    1299             :     If utf8locale() returns true then this does not change the data.
    1300             :     It is copied and truncated as necessary to
    1301             :     the destination buffer and \a srclen is always returned.  */
    1302             : unsigned utf8tomb(const char *src, unsigned srclen, char *dst, unsigned dstlen)
    1303             : {
    1304             :     if (!utf8locale())
    1305             :     {
    1306             : #ifdef _WIN32
    1307             :         wchar_t lbuf[1024] = {};
    1308             :         wchar_t *buf = lbuf;
    1309             :         unsigned length = utf8towc(src, srclen, buf, 1024);
    1310             :         unsigned ret;
    1311             :         if (length >= 1024)
    1312             :         {
    1313             :             buf =
    1314             :                 static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t)));
    1315             :             utf8towc(src, srclen, buf, length + 1);
    1316             :         }
    1317             :         if (dstlen)
    1318             :         {
    1319             :             // apparently this does not null-terminate, even though msdn
    1320             :             // documentation claims it does:
    1321             :             ret = WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen, 0,
    1322             :                                       0);
    1323             :             dst[ret] = 0;
    1324             :         }
    1325             :         // if it overflows or measuring length, get the actual length:
    1326             :         if (dstlen == 0 || ret >= dstlen - 1)
    1327             :             ret = WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
    1328             :         if (buf != lbuf)
    1329             :             free((void *)buf);
    1330             :         return ret;
    1331             : #else
    1332             :         wchar_t lbuf[1024] = {};
    1333             :         wchar_t *buf = lbuf;
    1334             :         unsigned length = utf8towc(src, srclen, buf, 1024);
    1335             :         if (length >= 1024)
    1336             :         {
    1337             :             buf =
    1338             :                 static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t)));
    1339             :             utf8towc(src, srclen, buf, length + 1);
    1340             :         }
    1341             :         int ret = 0;
    1342             :         if (dstlen)
    1343             :         {
    1344             :             ret = wcstombs(dst, buf, dstlen);
    1345             :             if (ret >= dstlen - 1)
    1346             :                 ret = wcstombs(0, buf, 0);
    1347             :         }
    1348             :         else
    1349             :         {
    1350             :             ret = wcstombs(0, buf, 0);
    1351             :         }
    1352             :         if (buf != lbuf)
    1353             :             free((void *)buf);
    1354             :         if (ret >= 0)
    1355             :             return (unsigned)ret;
    1356             :             // On any errors we return the UTF-8 as raw text...
    1357             : #endif
    1358             :     }
    1359             :     // Identity transform:
    1360             :     if (srclen < dstlen)
    1361             :     {
    1362             :         memcpy(dst, src, srclen);
    1363             :         dst[srclen] = 0;
    1364             :     }
    1365             :     else
    1366             :     {
    1367             :         memcpy(dst, src, dstlen - 1);
    1368             :         dst[dstlen - 1] = 0;
    1369             :     }
    1370             :     return srclen;
    1371             : }
    1372             : 
    1373             : /*! Convert a filename from the locale-specific multibyte encoding
    1374             :     used by Windows to UTF-8 as used by FLTK.
    1375             : 
    1376             :     Up to \a dstlen bytes are written to \a dst, including a null
    1377             :     terminator. The return value is the number of bytes that would be
    1378             :     written, not counting the null terminator. If greater or equal to
    1379             :     \a dstlen then if you malloc a new array of size n+1 you will have
    1380             :     the space needed for the entire string. If \a dstlen is zero then
    1381             :     nothing is written and this call just measures the storage space
    1382             :     needed.
    1383             : 
    1384             :     On Unix or on Windows when a UTF-8 locale is in effect, this
    1385             :     does not change the data. It is copied and truncated as necessary to
    1386             :     the destination buffer and \a srclen is always returned.
    1387             :     You may also want to check if utf8test() returns non-zero, so that
    1388             :     the filesystem can store filenames in UTF-8 encoding regardless of
    1389             :     the locale.
    1390             : */
    1391             : unsigned utf8frommb(char *dst, unsigned dstlen, const char *src,
    1392             :                     unsigned srclen)
    1393             : {
    1394             :     if (!utf8locale())
    1395             :     {
    1396             : #ifdef _WIN32
    1397             :         wchar_t lbuf[1024] = {};
    1398             :         wchar_t *buf = lbuf;
    1399             :         unsigned ret;
    1400             :         const unsigned length =
    1401             :             MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
    1402             :         if (length >= 1024)
    1403             :         {
    1404             :             length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);
    1405             :             buf = static_cast<wchar_t *>(malloc(length * sizeof(wchar_t)));
    1406             :             MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);
    1407             :         }
    1408             :         ret = utf8fromwc(dst, dstlen, buf, length);
    1409             :         if (buf != lbuf)
    1410             :             free(buf);
    1411             :         return ret;
    1412             : #else
    1413             :         wchar_t lbuf[1024] = {};
    1414             :         wchar_t *buf = lbuf;
    1415             :         const int length = mbstowcs(buf, src, 1024);
    1416             :         if (length >= 1024)
    1417             :         {
    1418             :             length = mbstowcs(0, src, 0) + 1;
    1419             :             buf =
    1420             :                 static_cast<wchar_t *>(malloc(length * sizeof(unsigned short)));
    1421             :             mbstowcs(buf, src, length);
    1422             :         }
    1423             :         if (length >= 0)
    1424             :         {
    1425             :             const unsigned ret = utf8fromwc(dst, dstlen, buf, length);
    1426             :             if (buf != lbuf)
    1427             :                 free(buf);
    1428             :             return ret;
    1429             :         }
    1430             :         // Errors in conversion return the UTF-8 unchanged.
    1431             : #endif
    1432             :     }
    1433             :     // Identity transform:
    1434             :     if (srclen < dstlen)
    1435             :     {
    1436             :         memcpy(dst, src, srclen);
    1437             :         dst[srclen] = 0;
    1438             :     }
    1439             :     else
    1440             :     {
    1441             :         memcpy(dst, src, dstlen - 1);
    1442             :         dst[dstlen - 1] = 0;
    1443             :     }
    1444             :     return srclen;
    1445             : }
    1446             : 
    1447             : #endif  // def notdef - disabled locale specific stuff.
    1448             : 
    1449             : /*! Examines the first \a srclen bytes in \a src and return a verdict
    1450             :     on whether it is UTF-8 or not.
    1451             :     - Returns 0 if there is any illegal UTF-8 sequences, using the
    1452             :       same rules as utf8decode(). Note that some UCS values considered
    1453             :       illegal by RFC 3629, such as 0xffff, are considered legal by this.
    1454             :     - Returns 1 if there are only single-byte characters (i.e. no bytes
    1455             :       have the high bit set). This is legal UTF-8, but also indicates
    1456             :       plain ASCII. It also returns 1 if \a srclen is zero.
    1457             :     - Returns 2 if there are only characters less than 0x800.
    1458             :     - Returns 3 if there are only characters less than 0x10000.
    1459             :     - Returns 4 if there are characters in the 0x10000 to 0x10ffff range.
    1460             : 
    1461             :     Because there are many illegal sequences in UTF-8, it is almost
    1462             :     impossible for a string in another encoding to be confused with
    1463             :     UTF-8. This is very useful for transitioning Unix to UTF-8
    1464             :     filenames, you can simply test each filename with this to decide
    1465             :     if it is UTF-8 or in the locale encoding. My hope is that if
    1466             :     this is done we will be able to cleanly transition to a locale-less
    1467             :     encoding.
    1468             : */
    1469             : 
    1470       19550 : static int utf8test(const char *src, unsigned srclen)
    1471             : {
    1472       19550 :     int ret = 1;
    1473       19550 :     const char *p = src;
    1474       19550 :     const char *e = src + srclen;
    1475     1861660 :     while (p < e)
    1476             :     {
    1477     1842160 :         if (*p == 0)
    1478           0 :             return 0;
    1479     1842160 :         if (*p & 0x80)
    1480             :         {
    1481        1605 :             int len = 0;
    1482        1605 :             utf8decode(p, e, &len);
    1483        1605 :             if (len < 2)
    1484          53 :                 return 0;
    1485        1552 :             if (len > ret)
    1486         553 :                 ret = len;
    1487        1552 :             p += len;
    1488             :         }
    1489             :         else
    1490             :         {
    1491     1840550 :             p++;
    1492             :         }
    1493             :     }
    1494       19497 :     return ret;
    1495             : }

Generated by: LCOV version 1.14