LCOV - code coverage report
Current view: top level - port - cpl_recode_stub.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 179 294 60.9 %
Date: 2024-04-28 23:18:46 Functions: 11 11 100.0 %

          Line data    Source code
       1             : /**********************************************************************
       2             :  *
       3             :  * Name:     cpl_recode_stub.cpp
       4             :  * Project:  CPL - Common Portability Library
       5             :  * Purpose:  Character set recoding and char/wchar_t conversions, stub
       6             :  *           implementation to be used if iconv() functionality is not
       7             :  *           available.
       8             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       9             :  *
      10             :  * The bulk of this code is derived from the utf.c module from FLTK. It
      11             :  * was originally downloaded from:
      12             :  *    http://svn.easysw.com/public/fltk/fltk/trunk/src/utf.c
      13             :  *
      14             :  **********************************************************************
      15             :  * Copyright (c) 2008, Frank Warmerdam
      16             :  * Copyright 2006 by Bill Spitzak and others.
      17             :  * Copyright (c) 2009-2014, Even Rouault <even dot rouault at spatialys.com>
      18             :  *
      19             :  * Permission to use, copy, modify, and distribute this software for any
      20             :  * purpose with or without fee is hereby granted, provided that the above
      21             :  * copyright notice and this permission notice appear in all copies.
      22             :  *
      23             :  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
      24             :  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
      25             :  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
      26             :  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
      27             :  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
      28             :  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
      29             :  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
      30             :  **********************************************************************/
      31             : 
      32             : #include "cpl_port.h"
      33             : #include "cpl_string.h"
      34             : 
      35             : #include <cstring>
      36             : 
      37             : #include "cpl_conv.h"
      38             : #include "cpl_error.h"
      39             : 
      40             : static unsigned utf8decode(const char *p, const char *end, int *len);
      41             : static unsigned utf8towc(const char *src, unsigned srclen, wchar_t *dst,
      42             :                          unsigned dstlen);
      43             : static unsigned utf8toa(const char *src, unsigned srclen, char *dst,
      44             :                         unsigned dstlen);
      45             : static unsigned utf8fromwc(char *dst, unsigned dstlen, const wchar_t *src,
      46             :                            unsigned srclen);
      47             : static unsigned utf8froma(char *dst, unsigned dstlen, const char *src,
      48             :                           unsigned srclen);
      49             : static int utf8test(const char *src, unsigned srclen);
      50             : 
      51             : #ifdef _WIN32
      52             : 
      53             : #include <windows.h>
      54             : #include <winnls.h>
      55             : 
      56             : static char *CPLWin32Recode(const char *src, unsigned src_code_page,
      57             :                             unsigned dst_code_page) CPL_RETURNS_NONNULL;
      58             : #endif
      59             : 
      60             : /* used by cpl_recode.cpp */
      61             : extern void CPLClearRecodeStubWarningFlags();
      62             : extern char *CPLRecodeStub(const char *, const char *,
      63             :                            const char *) CPL_RETURNS_NONNULL;
      64             : extern char *CPLRecodeFromWCharStub(const wchar_t *, const char *,
      65             :                                     const char *);
      66             : extern wchar_t *CPLRecodeToWCharStub(const char *, const char *, const char *);
      67             : 
      68             : /************************************************************************/
      69             : /* ==================================================================== */
      70             : /*      Stub Implementation not depending on iconv() or WIN32 API.      */
      71             : /* ==================================================================== */
      72             : /************************************************************************/
      73             : 
      74             : static bool bHaveWarned1 = false;
      75             : static bool bHaveWarned2 = false;
      76             : static bool bHaveWarned3 = false;
      77             : static bool bHaveWarned4 = false;
      78             : static bool bHaveWarned5 = false;
      79             : static bool bHaveWarned6 = false;
      80             : 
      81             : /************************************************************************/
      82             : /*                 CPLClearRecodeStubWarningFlags()                     */
      83             : /************************************************************************/
      84             : 
      85       10377 : void CPLClearRecodeStubWarningFlags()
      86             : {
      87       10377 :     bHaveWarned1 = false;
      88       10377 :     bHaveWarned2 = false;
      89       10377 :     bHaveWarned3 = false;
      90       10377 :     bHaveWarned4 = false;
      91       10377 :     bHaveWarned5 = false;
      92       10377 :     bHaveWarned6 = false;
      93       10377 : }
      94             : 
      95             : /************************************************************************/
      96             : /*                           CPLRecodeStub()                            */
      97             : /************************************************************************/
      98             : 
      99             : /**
     100             :  * Convert a string from a source encoding to a destination encoding.
     101             :  *
     102             :  * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
     103             :  * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
     104             :  * <ul>
     105             :  *  <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in
     106             :  *  fact)</li>
     107             :  *  <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
     108             :  *  <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
     109             :  * </ul>
     110             :  *
     111             :  * If an error occurs an error may, or may not be posted with CPLError().
     112             :  *
     113             :  * @param pszSource a NULL terminated string.
     114             :  * @param pszSrcEncoding the source encoding.
     115             :  * @param pszDstEncoding the destination encoding.
     116             :  *
     117             :  * @return a NULL terminated string which should be freed with CPLFree().
     118             :  */
     119             : 
     120      714482 : char *CPLRecodeStub(const char *pszSource, const char *pszSrcEncoding,
     121             :                     const char *pszDstEncoding)
     122             : 
     123             : {
     124             :     /* -------------------------------------------------------------------- */
     125             :     /*      If the source or destination is current locale(), we change     */
     126             :     /*      it to ISO8859-1 since our stub implementation does not          */
     127             :     /*      attempt to address locales properly.                            */
     128             :     /* -------------------------------------------------------------------- */
     129             : 
     130      714482 :     if (pszSrcEncoding[0] == '\0')
     131           0 :         pszSrcEncoding = CPL_ENC_ISO8859_1;
     132             : 
     133      714482 :     if (pszDstEncoding[0] == '\0')
     134           0 :         pszDstEncoding = CPL_ENC_ISO8859_1;
     135             : 
     136             :     /* -------------------------------------------------------------------- */
     137             :     /*      ISO8859 to UTF8                                                 */
     138             :     /* -------------------------------------------------------------------- */
     139      714482 :     if (strcmp(pszSrcEncoding, CPL_ENC_ISO8859_1) == 0 &&
     140      664723 :         strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
     141             :     {
     142      664723 :         const int nCharCount = static_cast<int>(strlen(pszSource));
     143      664723 :         char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1));
     144             : 
     145      664723 :         utf8froma(pszResult, nCharCount * 2 + 1, pszSource, nCharCount);
     146             : 
     147      664723 :         return pszResult;
     148             :     }
     149             : 
     150             :     /* -------------------------------------------------------------------- */
     151             :     /*      UTF8 to ISO8859                                                 */
     152             :     /* -------------------------------------------------------------------- */
     153       49759 :     if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0 &&
     154       49759 :         strcmp(pszDstEncoding, CPL_ENC_ISO8859_1) == 0)
     155             :     {
     156       49759 :         int nCharCount = static_cast<int>(strlen(pszSource));
     157       49759 :         char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount + 1));
     158             : 
     159       49759 :         utf8toa(pszSource, nCharCount, pszResult, nCharCount + 1);
     160             : 
     161       49759 :         return pszResult;
     162             :     }
     163             : 
     164             : #ifdef _WIN32
     165             :     /* ---------------------------------------------------------------------*/
     166             :     /*      CPXXX to UTF8                                                   */
     167             :     /* ---------------------------------------------------------------------*/
     168             :     if (STARTS_WITH(pszSrcEncoding, "CP") &&
     169             :         strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
     170             :     {
     171             :         int nCode = atoi(pszSrcEncoding + 2);
     172             :         if (nCode > 0)
     173             :         {
     174             :             return CPLWin32Recode(pszSource, nCode, CP_UTF8);
     175             :         }
     176             :         else if (EQUAL(pszSrcEncoding, "CP_OEMCP"))
     177             :             return CPLWin32Recode(pszSource, CP_OEMCP, CP_UTF8);
     178             :         else if (EQUAL(pszSrcEncoding, "CP_ACP"))
     179             :             return CPLWin32Recode(pszSource, CP_ACP, CP_UTF8);
     180             :     }
     181             : 
     182             :     /* ---------------------------------------------------------------------*/
     183             :     /*      UTF8 to CPXXX                                                   */
     184             :     /* ---------------------------------------------------------------------*/
     185             :     if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0 &&
     186             :         STARTS_WITH(pszDstEncoding, "CP"))
     187             :     {
     188             :         int nCode = atoi(pszDstEncoding + 2);
     189             :         if (nCode > 0)
     190             :         {
     191             :             return CPLWin32Recode(pszSource, CP_UTF8, nCode);
     192             :         }
     193             :         else if (EQUAL(pszDstEncoding, "CP_OEMCP"))
     194             :             return CPLWin32Recode(pszSource, CP_UTF8, CP_OEMCP);
     195             :         else if (EQUAL(pszDstEncoding, "CP_ACP"))
     196             :             return CPLWin32Recode(pszSource, CP_UTF8, CP_ACP);
     197             :     }
     198             : #endif
     199             : 
     200             :     /* -------------------------------------------------------------------- */
     201             :     /*      Anything else to UTF-8 is treated as ISO8859-1 to UTF-8 with    */
     202             :     /*      a one-time warning.                                             */
     203             :     /* -------------------------------------------------------------------- */
     204           0 :     if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
     205             :     {
     206           0 :         const int nCharCount = static_cast<int>(strlen(pszSource));
     207           0 :         char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1));
     208             : 
     209           0 :         if (!bHaveWarned1)
     210             :         {
     211           0 :             bHaveWarned1 = true;
     212           0 :             CPLError(CE_Warning, CPLE_AppDefined,
     213             :                      "Recode from %s to UTF-8 not supported, "
     214             :                      "treated as ISO-8859-1 to UTF-8.",
     215             :                      pszSrcEncoding);
     216             :         }
     217             : 
     218           0 :         utf8froma(pszResult, nCharCount * 2 + 1, pszSource, nCharCount);
     219             : 
     220           0 :         return pszResult;
     221             :     }
     222             : 
     223             :     /* -------------------------------------------------------------------- */
     224             :     /*      UTF-8 to anything else is treated as UTF-8 to ISO-8859-1        */
     225             :     /*      with a warning.                                                 */
     226             :     /* -------------------------------------------------------------------- */
     227           0 :     if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0 &&
     228           0 :         strcmp(pszDstEncoding, CPL_ENC_ISO8859_1) == 0)
     229             :     {
     230           0 :         int nCharCount = static_cast<int>(strlen(pszSource));
     231           0 :         char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount + 1));
     232             : 
     233           0 :         if (!bHaveWarned2)
     234             :         {
     235           0 :             bHaveWarned2 = true;
     236           0 :             CPLError(CE_Warning, CPLE_AppDefined,
     237             :                      "Recode from UTF-8 to %s not supported, "
     238             :                      "treated as UTF-8 to ISO-8859-1.",
     239             :                      pszDstEncoding);
     240             :         }
     241             : 
     242           0 :         utf8toa(pszSource, nCharCount, pszResult, nCharCount + 1);
     243             : 
     244           0 :         return pszResult;
     245             :     }
     246             : 
     247             :     /* -------------------------------------------------------------------- */
     248             :     /*      Everything else is treated as a no-op with a warning.           */
     249             :     /* -------------------------------------------------------------------- */
     250             :     {
     251           0 :         if (!bHaveWarned3)
     252             :         {
     253           0 :             bHaveWarned3 = true;
     254           0 :             CPLError(CE_Warning, CPLE_AppDefined,
     255             :                      "Recode from %s to %s not supported, no change applied.",
     256             :                      pszSrcEncoding, pszDstEncoding);
     257             :         }
     258             : 
     259           0 :         return CPLStrdup(pszSource);
     260             :     }
     261             : }
     262             : 
     263             : /************************************************************************/
     264             : /*                       CPLRecodeFromWCharStub()                       */
     265             : /************************************************************************/
     266             : 
     267             : /**
     268             :  * Convert wchar_t string to UTF-8.
     269             :  *
     270             :  * Convert a wchar_t string into a multibyte utf-8 string.  The only
     271             :  * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
     272             :  * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
     273             :  * and CPL_ENC_ISO8859_1.  In some cases (i.e. using iconv()) other encodings
     274             :  * may also be supported.
     275             :  *
     276             :  * Note that the wchar_t type varies in size on different systems. On
     277             :  * win32 it is normally 2 bytes, and on unix 4 bytes.
     278             :  *
     279             :  * If an error occurs an error may, or may not be posted with CPLError().
     280             :  *
     281             :  * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
     282             :  * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
     283             :  * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
     284             :  *
     285             :  * @return a zero terminated multi-byte string which should be freed with
     286             :  * CPLFree(), or NULL if an error occurs.
     287             :  */
     288             : 
     289      107675 : char *CPLRecodeFromWCharStub(const wchar_t *pwszSource,
     290             :                              const char *pszSrcEncoding,
     291             :                              const char *pszDstEncoding)
     292             : 
     293             : {
     294             :     /* -------------------------------------------------------------------- */
     295             :     /*      We try to avoid changes of character set.  We are just          */
     296             :     /*      providing for unicode to unicode.                               */
     297             :     /* -------------------------------------------------------------------- */
     298      107675 :     if (strcmp(pszSrcEncoding, "WCHAR_T") != 0 &&
     299      106316 :         strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0 &&
     300      106316 :         strcmp(pszSrcEncoding, CPL_ENC_UTF16) != 0 &&
     301      106316 :         strcmp(pszSrcEncoding, CPL_ENC_UCS2) != 0 &&
     302           0 :         strcmp(pszSrcEncoding, CPL_ENC_UCS4) != 0)
     303             :     {
     304           0 :         CPLError(CE_Failure, CPLE_AppDefined,
     305             :                  "Stub recoding implementation does not support "
     306             :                  "CPLRecodeFromWCharStub(...,%s,%s)",
     307             :                  pszSrcEncoding, pszDstEncoding);
     308           0 :         return nullptr;
     309             :     }
     310             : 
     311             :     /* -------------------------------------------------------------------- */
     312             :     /*      What is the source length.                                      */
     313             :     /* -------------------------------------------------------------------- */
     314      107675 :     int nSrcLen = 0;
     315             : 
     316     1578860 :     while (pwszSource[nSrcLen] != 0)
     317     1471180 :         nSrcLen++;
     318             : 
     319             :     /* -------------------------------------------------------------------- */
     320             :     /*      Allocate destination buffer plenty big.                         */
     321             :     /* -------------------------------------------------------------------- */
     322      107675 :     const int nDstBufSize = nSrcLen * 4 + 1;
     323             :     // Nearly worst case.
     324      107675 :     char *pszResult = static_cast<char *>(CPLMalloc(nDstBufSize));
     325             : 
     326      107675 :     if (nSrcLen == 0)
     327             :     {
     328       44803 :         pszResult[0] = '\0';
     329       44803 :         return pszResult;
     330             :     }
     331             : 
     332             :     /* -------------------------------------------------------------------- */
     333             :     /*      Convert, and confirm we had enough space.                       */
     334             :     /* -------------------------------------------------------------------- */
     335       62872 :     const int nDstLen = utf8fromwc(pszResult, nDstBufSize, pwszSource, nSrcLen);
     336       62872 :     if (nDstLen >= nDstBufSize)
     337             :     {
     338           0 :         CPLAssert(false);  // too small!
     339             :         return nullptr;
     340             :     }
     341             : 
     342             :     /* -------------------------------------------------------------------- */
     343             :     /*      If something other than UTF-8 was requested, recode now.        */
     344             :     /* -------------------------------------------------------------------- */
     345       62872 :     if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
     346       62872 :         return pszResult;
     347             : 
     348             :     char *pszFinalResult =
     349           0 :         CPLRecodeStub(pszResult, CPL_ENC_UTF8, pszDstEncoding);
     350             : 
     351           0 :     CPLFree(pszResult);
     352             : 
     353           0 :     return pszFinalResult;
     354             : }
     355             : 
     356             : /************************************************************************/
     357             : /*                        CPLRecodeToWCharStub()                        */
     358             : /************************************************************************/
     359             : 
     360             : /**
     361             :  * Convert UTF-8 string to a wchar_t string.
     362             :  *
     363             :  * Convert an 8-bit, multi-byte per character input string into a wide
     364             :  * character (wchar_t) string.  The only guaranteed supported source encodings
     365             :  * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1 (LATIN1).  The only
     366             :  * guaranteed supported destination encoding is CPL_ENC_UCS2.  Other source
     367             :  * and destination encodings may be supported depending on the underlying
     368             :  * implementation.
     369             :  *
     370             :  * Note that the wchar_t type varies in size on different systems. On
     371             :  * win32 it is normally 2 bytes, and on unix 4 bytes.
     372             :  *
     373             :  * If an error occurs an error may, or may not be posted with CPLError().
     374             :  *
     375             :  * @param pszSource input multi-byte character string.
     376             :  * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
     377             :  * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
     378             :  *
     379             :  * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
     380             :  * NULL on error.
     381             :  *
     382             :  * @since GDAL 1.6.0
     383             :  */
     384             : 
     385       51932 : wchar_t *CPLRecodeToWCharStub(const char *pszSource, const char *pszSrcEncoding,
     386             :                               const char *pszDstEncoding)
     387             : 
     388             : {
     389       51932 :     char *pszUTF8Source = const_cast<char *>(pszSource);
     390             : 
     391       51932 :     if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0 &&
     392           0 :         strcmp(pszSrcEncoding, CPL_ENC_ASCII) != 0)
     393             :     {
     394           0 :         pszUTF8Source = CPLRecodeStub(pszSource, pszSrcEncoding, CPL_ENC_UTF8);
     395           0 :         if (pszUTF8Source == nullptr)
     396           0 :             return nullptr;
     397             :     }
     398             : 
     399             :     /* -------------------------------------------------------------------- */
     400             :     /*      We try to avoid changes of character set.  We are just          */
     401             :     /*      providing for unicode to unicode.                               */
     402             :     /* -------------------------------------------------------------------- */
     403       51932 :     if (strcmp(pszDstEncoding, "WCHAR_T") != 0 &&
     404       51932 :         strcmp(pszDstEncoding, CPL_ENC_UCS2) != 0 &&
     405           0 :         strcmp(pszDstEncoding, CPL_ENC_UCS4) != 0 &&
     406           0 :         strcmp(pszDstEncoding, CPL_ENC_UTF16) != 0)
     407             :     {
     408           0 :         CPLError(CE_Failure, CPLE_AppDefined,
     409             :                  "Stub recoding implementation does not support "
     410             :                  "CPLRecodeToWCharStub(...,%s,%s)",
     411             :                  pszSrcEncoding, pszDstEncoding);
     412           0 :         if (pszUTF8Source != pszSource)
     413           0 :             CPLFree(pszUTF8Source);
     414           0 :         return nullptr;
     415             :     }
     416             : 
     417             :     /* -------------------------------------------------------------------- */
     418             :     /*      Do the UTF-8 to UCS-2 recoding.                                 */
     419             :     /* -------------------------------------------------------------------- */
     420       51932 :     int nSrcLen = static_cast<int>(strlen(pszUTF8Source));
     421             :     wchar_t *pwszResult =
     422       51932 :         static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), nSrcLen + 1));
     423             : 
     424       51932 :     utf8towc(pszUTF8Source, nSrcLen, pwszResult, nSrcLen + 1);
     425             : 
     426       51932 :     if (pszUTF8Source != pszSource)
     427           0 :         CPLFree(pszUTF8Source);
     428             : 
     429       51932 :     return pwszResult;
     430             : }
     431             : 
     432             : /************************************************************************/
     433             : /*                                 CPLIsUTF8()                          */
     434             : /************************************************************************/
     435             : 
     436             : /**
     437             :  * Test if a string is encoded as UTF-8.
     438             :  *
     439             :  * @param pabyData input string to test
     440             :  * @param nLen length of the input string, or -1 if the function must compute
     441             :  *             the string length. In which case it must be null terminated.
     442             :  * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
     443             :  *
     444             :  * @since GDAL 1.7.0
     445             :  */
     446       14858 : int CPLIsUTF8(const char *pabyData, int nLen)
     447             : {
     448       14858 :     if (nLen < 0)
     449       10270 :         nLen = static_cast<int>(strlen(pabyData));
     450       14858 :     return utf8test(pabyData, static_cast<unsigned>(nLen)) != 0;
     451             : }
     452             : 
     453             : /************************************************************************/
     454             : /* ==================================================================== */
     455             : /*      UTF.C code from FLTK with some modifications.                   */
     456             : /* ==================================================================== */
     457             : /************************************************************************/
     458             : 
     459             : /* Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero
     460             :    they are instead turned into the Unicode REPLACEMENT CHARACTER, of
     461             :    value 0xfffd.
     462             :    If this is on, utf8decode will correctly map most (perhaps all)
     463             :    human-readable text that is in ISO-8859-1. This may allow you
     464             :    to completely ignore character sets in your code because virtually
     465             :    everything is either ISO-8859-1 or UTF-8.
     466             : */
     467             : #define ERRORS_TO_ISO8859_1 1
     468             : 
     469             : /* Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
     470             :    Unicode index for Microsoft's CP1252 character set. You should
     471             :    also set ERRORS_TO_ISO8859_1. With this, a much larger amount of
     472             :    available text (such as all web pages) is correctly converted
     473             :    to Unicode.
     474             : */
     475             : #define ERRORS_TO_CP1252 1
     476             : 
     477             : /* A number of Unicode code points are in fact illegal and should not
     478             :    be produced by a UTF-8 converter. Turning this on will replace the
     479             :    bytes in those encodings with errors. If you do this then converting
     480             :    arbitrary 16-bit data to UTF-8 and then back is not an identity,
     481             :    which will probably break a lot of software.
     482             : */
     483             : #define STRICT_RFC3629 0
     484             : 
     485             : #if ERRORS_TO_CP1252
     486             : // Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
     487             : // to Unicode:
     488             : constexpr unsigned short cp1252[32] = {
     489             :     0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
     490             :     0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
     491             :     0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
     492             :     0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178};
     493             : #endif
     494             : 
     495             : /************************************************************************/
     496             : /*                             utf8decode()                             */
     497             : /************************************************************************/
     498             : 
     499             : /*
     500             :     Decode a single UTF-8 encoded character starting at \e p. The
     501             :     resulting Unicode value (in the range 0-0x10ffff) is returned,
     502             :     and \e len is set to the number of bytes in the UTF-8 encoding
     503             :     (adding \e len to \e p will point at the next character).
     504             : 
     505             :     If \a p points at an illegal UTF-8 encoding, including one that
     506             :     would go past \e end, or where a code uses more bytes than
     507             :     necessary, then *reinterpret_cast<const unsigned char*>(p) is translated as
     508             :     though it is in the Microsoft CP1252 character set and \e len is set to 1.
     509             :     Treating errors this way allows this to decode almost any
     510             :     ISO-8859-1 or CP1252 text that has been mistakenly placed where
     511             :     UTF-8 is expected, and has proven very useful.
     512             : 
     513             :     If you want errors to be converted to error characters (as the
     514             :     standards recommend), adding a test to see if the length is
     515             :     unexpectedly 1 will work:
     516             : 
     517             : \code
     518             :     if( *p & 0x80 )
     519             :     {  // What should be a multibyte encoding.
     520             :       code = utf8decode(p, end, &len);
     521             :       if( len<2 ) code = 0xFFFD;  // Turn errors into REPLACEMENT CHARACTER.
     522             :     }
     523             :     else
     524             :     {  // Handle the 1-byte utf8 encoding:
     525             :       code = *p;
     526             :       len = 1;
     527             :     }
     528             : \endcode
     529             : 
     530             :     Direct testing for the 1-byte case (as shown above) will also
     531             :     speed up the scanning of strings where the majority of characters
     532             :     are ASCII.
     533             : */
     534        3922 : static unsigned utf8decode(const char *p, const char *end, int *len)
     535             : {
     536        3922 :     unsigned char c = *reinterpret_cast<const unsigned char *>(p);  // lead byte, read as unsigned
     537        3922 :     if (c < 0x80)
     538             :     {
     539           0 :         *len = 1;  // plain ASCII: one byte, value returned unchanged
     540           0 :         return c;
     541             : #if ERRORS_TO_CP1252
     542             :     }
     543        3922 :     else if (c < 0xa0)
     544             :     {
     545          39 :         *len = 1;  // 0x80..0x9f is not a valid lead byte: translate it through the cp1252 table
     546          39 :         return cp1252[c - 0x80];
     547             : #endif
     548             :     }
     549        3883 :     else if (c < 0xc2)
     550             :     {
     551          10 :         goto FAIL;  // remaining bytes below 0xc2 (continuation bytes, 0xc0, 0xc1) cannot start a sequence
     552             :     }
     553        3873 :     if (p + 1 >= end || (p[1] & 0xc0) != 0x80)  // every multi-byte form needs a 10xxxxxx byte at p[1] inside [p, end)
     554          71 :         goto FAIL;
     555        3802 :     if (c < 0xe0)
     556             :     {
     557        3794 :         *len = 2;  // 110xxxxx 10xxxxxx
     558        3794 :         return ((p[0] & 0x1f) << 6) + ((p[1] & 0x3f));
     559             :     }
     560           8 :     else if (c == 0xe0)
     561             :     {
     562           0 :         if ((reinterpret_cast<const unsigned char *>(p))[1] < 0xa0)  // would decode below 0x800: overlong form
     563           0 :             goto FAIL;
     564           0 :         goto UTF8_3;
     565             : #if STRICT_RFC3629
     566             :     }
     567             :     else if (c == 0xed)
     568             :     {
     569             :         // RFC 3629 says surrogate chars are illegal.
     570             :         if ((reinterpret_cast<const unsigned char *>(p))[1] >= 0xa0)
     571             :             goto FAIL;
     572             :         goto UTF8_3;
     573             :     }
     574             :     else if (c == 0xef)
     575             :     {
     576             :         // 0xfffe and 0xffff are also illegal characters.
     577             :         if ((reinterpret_cast<const unsigned char *>(p))[1] == 0xbf &&
     578             :             (reinterpret_cast<const unsigned char *>(p))[2] >= 0xbe)  // NOTE(review): p[2] is read before the p + 2 >= end check at UTF8_3
     579             :             goto FAIL;
     580             :         goto UTF8_3;
     581             : #endif
     582             :     }
     583           8 :     else if (c < 0xf0)
     584             :     {
     585           4 :     UTF8_3:  // shared tail of all 3-byte forms; p[1] was validated above
     586           4 :         if (p + 2 >= end || (p[2] & 0xc0) != 0x80)
     587           0 :             goto FAIL;
     588           4 :         *len = 3;  // 1110xxxx 10xxxxxx 10xxxxxx
     589           4 :         return ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + ((p[2] & 0x3f));
     590             :     }
     591           4 :     else if (c == 0xf0)
     592             :     {
     593           4 :         if ((reinterpret_cast<const unsigned char *>(p))[1] < 0x90)  // would decode below 0x10000: overlong form
     594           0 :             goto FAIL;
     595           4 :         goto UTF8_4;
     596             :     }
     597           0 :     else if (c < 0xf4)
     598             :     {
     599           0 :     UTF8_4:  // shared tail of all 4-byte forms; p[1] was validated above
     600           4 :         if (p + 3 >= end || (p[2] & 0xc0) != 0x80 || (p[3] & 0xc0) != 0x80)
     601           0 :             goto FAIL;
     602           4 :         *len = 4;  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     603             : #if STRICT_RFC3629
     604             :         // RFC 3629 says all codes ending in fffe or ffff are illegal:
     605             :         if ((p[1] & 0xf) == 0xf &&
     606             :             (reinterpret_cast<const unsigned char *>(p))[2] == 0xbf &&
     607             :             (reinterpret_cast<const unsigned char *>(p))[3] >= 0xbe)
     608             :             goto FAIL;
     609             : #endif
     610           4 :         return ((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12) +
     611           4 :                ((p[2] & 0x3f) << 6) + ((p[3] & 0x3f));
     612             :     }
     613           0 :     else if (c == 0xf4)
     614             :     {
     615           0 :         if ((reinterpret_cast<const unsigned char *>(p))[1] > 0x8f)
     616           0 :             goto FAIL;  // After 0x10ffff.
     617           0 :         goto UTF8_4;
     618             :     }
     619             :     else
     620             :     {
     621           0 :     FAIL:  // also reached directly for lead bytes 0xf5..0xff
     622          81 :         *len = 1;  // consume exactly one byte so the caller can resynchronise
     623             : #if ERRORS_TO_ISO8859_1
     624          81 :         return c;  // ERRORS_TO_ISO8859_1 is defined outside this excerpt; the raw byte value is returned
     625             : #else
     626             :         return 0xfffd;  // Unicode REPLACEMENT CHARACTER
     627             : #endif
     628             :     }
     629             : }
     630             : 
     631             : /************************************************************************/
     632             : /*                              utf8towc()                              */
     633             : /************************************************************************/
     634             : 
     635             : /*  Convert a UTF-8 sequence into an array of wchar_t. These
     636             :     are used by some system calls, especially on Windows.
     637             : 
     638             :     \a src points at the UTF-8, and \a srclen is the number of bytes to
     639             :     convert.
     640             : 
     641             :     \a dst points at an array to write, and \a dstlen is the number of
     642             :     locations in this array. At most \a dstlen-1 words will be
     643             :     written there, plus a 0 terminating word. Thus this function
     644             :     will never overwrite the buffer and will always return a
     645             :     zero-terminated string. If \a dstlen is zero then \a dst can be
     646             :     null and no data is written, but the length is returned.
     647             : 
     648             :     The return value is the number of words that \e would be written
     649             :     to \a dst if it were long enough, not counting the terminating
     650             :     zero. If the return value is greater or equal to \a dstlen it
     651             :     indicates truncation, you can then allocate a new array of size
     652             :     return+1 and call this again.
     653             : 
     654             :     Errors in the UTF-8 are converted as though each byte in the
     655             :     erroneous string is in the Microsoft CP1252 encoding. This allows
     656             :     ISO-8859-1 text mistakenly identified as UTF-8 to be printed
     657             :     correctly.
     658             : 
     659             :     Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
     660             :     and most other systems. Where wchar_t is 16 bits, Unicode
     661             :     characters in the range 0x10000 to 0x10ffff are converted to
     662             :     "surrogate pairs" which take two words each (this is called UTF-16
     663             :     encoding). If wchar_t is 32 bits this rather nasty problem is
     664             :     avoided.
     665             : */
     666       51932 : static unsigned utf8towc(const char *src, unsigned srclen, wchar_t *dst,
     667             :                          unsigned dstlen)
     668             : {
     669       51932 :     const char *p = src;  // read cursor
     670       51932 :     const char *e = src + srclen;  // one past the last input byte
     671       51932 :     unsigned count = 0;  // words produced so far (written or merely measured)
     672       51932 :     if (dstlen)  // with dstlen == 0 nothing is written; only the measuring loop below runs
     673             :         while (true)
     674             :         {
     675      380839 :             if (p >= e)
     676             :             {
     677       51932 :                 dst[count] = 0;  // input exhausted before dst filled up: terminate and report the length
     678       51932 :                 return count;
     679             :             }
     680      328907 :             if (!(*p & 0x80))
     681             :             {
     682             :                 // ASCII
     683      327489 :                 dst[count] = *p++;
     684             :             }
     685             :             else
     686             :             {
     687        1418 :                 int len = 0;
     688        1418 :                 unsigned ucs = utf8decode(p, e, &len);
     689        1418 :                 p += len;  // utf8decode reports how many bytes it consumed
     690             : #ifdef _WIN32
     691             :                 if (ucs < 0x10000)
     692             :                 {
     693             :                     dst[count] = static_cast<wchar_t>(ucs);
     694             :                 }
     695             :                 else
     696             :                 {
     697             :                     // Make a surrogate pair:
     698             :                     if (count + 2 >= dstlen)  // no room for two words plus the terminator
     699             :                     {
     700             :                         dst[count] = 0;
     701             :                         count += 2;  // still count the pair towards the required length
     702             :                         break;
     703             :                     }
     704             :                     dst[count] = static_cast<wchar_t>(
     705             :                         (((ucs - 0x10000u) >> 10) & 0x3ff) | 0xd800);  // high surrogate
     706             :                     dst[++count] = static_cast<wchar_t>((ucs & 0x3ff) | 0xdc00);  // low surrogate
     707             :                 }
     708             : #else
     709        1418 :                 dst[count] = static_cast<wchar_t>(ucs);  // non-Windows build: the code point is stored in one word
     710             : #endif
     711             :             }
     712      328907 :             if (++count == dstlen)
     713             :             {
     714           0 :                 dst[count - 1] = 0;  // dst is full: overwrite the last word with the terminator
     715           0 :                 break;
     716             :             }
     717      328907 :         }
     718             :     // We filled dst, measure the rest:
     719           0 :     while (p < e)
     720             :     {
     721           0 :         if (!(*p & 0x80))
     722             :         {
     723           0 :             p++;
     724             :         }
     725             :         else
     726             :         {
     727           0 :             int len = 0;
     728             : #ifdef _WIN32
     729             :             const unsigned ucs = utf8decode(p, e, &len);
     730             :             p += len;
     731             :             if (ucs >= 0x10000)
     732             :                 ++count;  // extra word for the second half of a surrogate pair
     733             : #else
     734           0 :             utf8decode(p, e, &len);  // only the consumed length matters while measuring
     735           0 :             p += len;
     736             : #endif
     737             :         }
     738           0 :         ++count;
     739             :     }
     740             : 
     741           0 :     return count;  // >= dstlen here when the output was truncated
     742             : }
     743             : 
     744             : /************************************************************************/
     745             : /*                              utf8toa()                               */
     746             : /************************************************************************/
     747             : /* Convert a UTF-8 sequence into an array of 1-byte characters.
     748             : 
     749             :     If the UTF-8 decodes to a character greater than 0xff then it is
     750             :     replaced with '?'.
     751             : 
     752             :     Errors in the UTF-8 are converted as individual bytes, same as
     753             :     utf8decode() does. This allows ISO-8859-1 text mistakenly identified
     754             :     as UTF-8 to be printed correctly (and possibly CP1252 on Windows).
     755             : 
     756             :     \a src points at the UTF-8, and \a srclen is the number of bytes to
     757             :     convert.
     758             : 
     759             :     Up to \a dstlen bytes are written to \a dst, including a null
     760             :     terminator. The return value is the number of bytes that would be
     761             :     written, not counting the null terminator. If greater or equal to
     762             :     \a dstlen then if you malloc a new array of size n+1 you will have
     763             :     the space needed for the entire string. If \a dstlen is zero then
     764             :     nothing is written and this call just measures the storage space
     765             :     needed.
     766             : */
     767       49759 : static unsigned int utf8toa(const char *src, unsigned srclen, char *dst,
     768             :                             unsigned dstlen)
     769             : {
     770       49759 :     const char *p = src;  // read cursor
     771       49759 :     const char *e = src + srclen;  // one past the last input byte
     772       49759 :     unsigned int count = 0;  // bytes produced so far (written or merely measured)
     773       49759 :     if (dstlen)  // with dstlen == 0 nothing is written; only the measuring loop below runs
     774             :         while (true)
     775             :         {
     776      189612 :             if (p >= e)
     777             :             {
     778       49759 :                 dst[count] = 0;  // input exhausted before dst filled up: terminate and report the length
     779       49759 :                 return count;
     780             :             }
     781      139853 :             unsigned char c = *reinterpret_cast<const unsigned char *>(p);
     782      139853 :             if (c < 0xC2)  // bytes below 0xC2 are copied through as-is without calling utf8decode
     783             :             {
     784             :                 // ASCII or bad code.
     785      138951 :                 dst[count] = c;
     786      138951 :                 p++;
     787             :             }
     788             :             else
     789             :             {
     790         902 :                 int len = 0;
     791         902 :                 const unsigned int ucs = utf8decode(p, e, &len);
     792         902 :                 p += len;  // utf8decode reports how many bytes it consumed
     793         902 :                 if (ucs < 0x100)
     794             :                 {
     795         898 :                     dst[count] = static_cast<char>(ucs);  // fits in a single output byte
     796             :                 }
     797             :                 else
     798             :                 {
     799           4 :                     if (!bHaveWarned4)  // flag declared outside this excerpt; makes the warning one-shot
     800             :                     {
     801           2 :                         bHaveWarned4 = true;
     802           2 :                         CPLError(
     803             :                             CE_Warning, CPLE_AppDefined,
     804             :                             "One or several characters couldn't be converted "
     805             :                             "correctly from UTF-8 to ISO-8859-1.  "
     806             :                             "This warning will not be emitted anymore.");
     807             :                     }
     808           4 :                     dst[count] = '?';  // placeholder for a code point above 0xff
     809             :                 }
     810             :             }
     811      139853 :             if (++count >= dstlen)
     812             :             {
     813           0 :                 dst[count - 1] = 0;  // dst is full: overwrite the last byte with the terminator
     814           0 :                 break;
     815             :             }
     816      139853 :         }
     817             :     // We filled dst, measure the rest:
     818           0 :     while (p < e)
     819             :     {
     820           0 :         if (!(*p & 0x80))
     821             :         {
     822           0 :             p++;
     823             :         }
     824             :         else
     825             :         {
     826           0 :             int len = 0;
     827           0 :             utf8decode(p, e, &len);  // only the consumed length matters while measuring
     828           0 :             p += len;
     829             :         }
     830           0 :         ++count;  // every decoded character needs exactly one output byte
     831             :     }
     832           0 :     return count;  // >= dstlen here when the output was truncated
     833             : }
     834             : 
     835             : /************************************************************************/
     836             : /*                             utf8fromwc()                             */
     837             : /************************************************************************/
     838             : /* Turn "wide characters" as returned by some system calls
     839             :     (especially on Windows) into UTF-8.
     840             : 
     841             :     Up to \a dstlen bytes are written to \a dst, including a null
     842             :     terminator. The return value is the number of bytes that would be
     843             :     written, not counting the null terminator. If greater or equal to
     844             :     \a dstlen then if you malloc a new array of size n+1 you will have
     845             :     the space needed for the entire string. If \a dstlen is zero then
     846             :     nothing is written and this call just measures the storage space
     847             :     needed.
     848             : 
     849             :     \a srclen is the number of words in \a src to convert. On Windows
     850             :     this is not necessarily the number of characters, due to there
     851             :     possibly being "surrogate pairs" in the UTF-16 encoding used.
     852             :     On Unix wchar_t is 32 bits and each location is a character.
     853             : 
     854             :     On Unix if a src word is greater than 0x10ffff then this is an
     855             :     illegal character according to RFC 3629. These are converted as
     856             :     though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
     857             :     range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
     858             :     illegal according to RFC 3629. However I encode these as though
     859             :     they are legal, so that utf8towc will return the original data.
     860             : 
     861             :     On Windows "surrogate pairs" are converted to a single character
     862             :     and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
     863             :     pairs are converted as though they are individual characters.
     864             : */
     865       62872 : static unsigned int utf8fromwc(char *dst, unsigned dstlen, const wchar_t *src,
     866             :                                unsigned srclen)
     867             : {
     868       62872 :     unsigned int i = 0;  // index of the next unread word in src
     869       62872 :     unsigned int count = 0;  // bytes produced so far (written or merely measured)
     870       62872 :     if (dstlen)  // with dstlen == 0 nothing is written; only the measuring loop below runs
     871             :         while (true)
     872             :         {
     873     1534060 :             if (i >= srclen)
     874             :             {
     875       62872 :                 dst[count] = 0;  // input exhausted before dst filled up: terminate and report the length
     876       62872 :                 return count;
     877             :             }
     878     1471180 :             unsigned int ucs = src[i++];  // from here on i refers to the word after ucs
     879     1471180 :             if (ucs < 0x80U)
     880             :             {
     881     1464520 :                 dst[count++] = static_cast<char>(ucs);
     882     1464520 :                 if (count >= dstlen)
     883             :                 {
     884           0 :                     dst[count - 1] = 0;  // dst is full: overwrite the last byte with the terminator
     885           0 :                     break;
     886             :                 }
     887             :             }
     888        6664 :             else if (ucs < 0x800U)
     889             :             {
     890             :                 // 2 bytes.
     891        4291 :                 if (count + 2 >= dstlen)  // no room for two bytes plus the terminator
     892             :                 {
     893           0 :                     dst[count] = 0;
     894           0 :                     count += 2;  // still count the sequence towards the required length
     895           0 :                     break;
     896             :                 }
     897        4291 :                 dst[count++] = 0xc0 | static_cast<char>(ucs >> 6);
     898        4291 :                 dst[count++] = 0x80 | static_cast<char>(ucs & 0x3F);
     899             : #ifdef _WIN32
     900             :             }
     901             :             else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
     902             :                      src[i] >= 0xdc00 && src[i] <= 0xdfff)
     903             :             {
     904             :                 // Surrogate pair.
     905             :                 unsigned int ucs2 = src[i++];
     906             :                 ucs = 0x10000U + ((ucs & 0x3ff) << 10) + (ucs2 & 0x3ff);
     907             :                 // All surrogate pairs turn into 4-byte utf8.
     908             : #else
     909             :             }
     910        2373 :             else if (ucs >= 0x10000)
     911             :             {
     912           1 :                 if (ucs > 0x10ffff)
     913             :                 {
     914           1 :                     ucs = 0xfffd;  // out of Unicode range: emit REPLACEMENT CHARACTER via the 3-byte path
     915           1 :                     goto J1;
     916             :                 }
     917             : #endif
     918           0 :                 if (count + 4 >= dstlen)  // both preprocessor variants share this 4-byte encoder
     919             :                 {
     920           0 :                     dst[count] = 0;
     921           0 :                     count += 4;
     922           0 :                     break;
     923             :                 }
     924           0 :                 dst[count++] = 0xf0 | static_cast<char>(ucs >> 18);
     925           0 :                 dst[count++] = 0x80 | static_cast<char>((ucs >> 12) & 0x3F);
     926           0 :                 dst[count++] = 0x80 | static_cast<char>((ucs >> 6) & 0x3F);
     927           0 :                 dst[count++] = 0x80 | static_cast<char>(ucs & 0x3F);
     928             :             }
     929             :             else
     930             :             {
     931             : #ifndef _WIN32
     932        2372 :             J1:
     933             : #endif
     934             :                 // All others are 3 bytes:
     935        2373 :                 if (count + 3 >= dstlen)
     936             :                 {
     937           0 :                     dst[count] = 0;
     938           0 :                     count += 3;
     939           0 :                     break;
     940             :                 }
     941        2373 :                 dst[count++] = 0xe0 | static_cast<char>(ucs >> 12);
     942        2373 :                 dst[count++] = 0x80 | static_cast<char>((ucs >> 6) & 0x3F);
     943        2373 :                 dst[count++] = 0x80 | static_cast<char>(ucs & 0x3F);
     944             :             }
     945     1471180 :         }
     946             : 
     947             :     // We filled dst, measure the rest:
     948           0 :     while (i < srclen)
     949             :     {
     950           0 :         unsigned int ucs = src[i++];  // from here on i refers to the word after ucs
     951           0 :         if (ucs < 0x80U)
     952             :         {
     953           0 :             count++;
     954             :         }
     955           0 :         else if (ucs < 0x800U)
     956             :         {
     957             :             // 2 bytes.
     958           0 :             count += 2;
     959             : #ifdef _WIN32
     960             :         }
     961             :         else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&  // i was already advanced, so the low
     962             :                  src[i] >= 0xdc00 && src[i] <= 0xdfff)  // surrogate is src[i], same test as the writing loop
     963             :         {
     964             :             // Surrogate pair.
     965             :             ++i;
     966             : #else
     967             :         }
     968           0 :         else if (ucs >= 0x10000 && ucs <= 0x10ffff)  // values above 0x10ffff fall through to the 3-byte count
     969             :         {
     970             : #endif
     971           0 :             count += 4;
     972             :         }
     973             :         else
     974             :         {
     975           0 :             count += 3;
     976             :         }
     977             :     }
     978           0 :     return count;  // >= dstlen here when the output was truncated
     979             : }
     980             : 
     981             : /************************************************************************/
     982             : /*                             utf8froma()                              */
     983             : /************************************************************************/
     984             : 
     985             : /* Convert an ISO-8859-1 (i.e. normal c-string) byte stream to UTF-8.
     986             : 
     987             :     It is possible this should convert Microsoft's CP1252 to UTF-8
     988             :     instead. This would translate the codes in the range 0x80-0x9f
     989             :     to different characters. Currently it does not do this.
     990             : 
     991             :     Up to \a dstlen bytes are written to \a dst, including a null
     992             :     terminator. The return value is the number of bytes that would be
     993             :     written, not counting the null terminator. If greater or equal to
     994             :     \a dstlen then if you malloc a new array of size n+1 you will have
     995             :     the space needed for the entire string. If \a dstlen is zero then
     996             :     nothing is written and this call just measures the storage space
     997             :     needed.
     998             : 
     999             :     \a srclen is the number of bytes in \a src to convert.
    1000             : 
    1001             :     If the return value equals \a srclen then this indicates that
    1002             :     no conversion is necessary, as only ASCII characters are in the
    1003             :     string.
    1004             : */
    1005      664723 : static unsigned utf8froma(char *dst, unsigned dstlen, const char *src,
    1006             :                           unsigned srclen)
    1007             : {
    1008      664723 :     const char *p = src;  // read cursor
    1009      664723 :     const char *e = src + srclen;  // one past the last input byte
    1010      664723 :     unsigned count = 0;  // bytes produced so far (written or merely measured)
    1011      664723 :     if (dstlen)  // with dstlen == 0 nothing is written; only the measuring loop below runs
    1012             :         while (true)
    1013             :         {
    1014     7665170 :             if (p >= e)
    1015             :             {
    1016      664723 :                 dst[count] = 0;  // input exhausted before dst filled up: terminate and report the length
    1017      664723 :                 return count;
    1018             :             }
    1019     7000450 :             unsigned char ucs = *reinterpret_cast<const unsigned char *>(p);  // read as unsigned so bytes >= 0x80 compare correctly
    1020     7000450 :             p++;
    1021     7000450 :             if (ucs < 0x80U)
    1022             :             {
    1023     6963070 :                 dst[count++] = ucs;
    1024     6963070 :                 if (count >= dstlen)
    1025             :                 {
    1026           0 :                     dst[count - 1] = 0;  // dst is full: overwrite the last byte with the terminator
    1027           0 :                     break;
    1028             :                 }
    1029             :             }
    1030             :             else
    1031             :             {
    1032             :                 // 2 bytes (note that CP1252 translate could make 3 bytes!)
    1033       37383 :                 if (count + 2 >= dstlen)  // no room for two bytes plus the terminator
    1034             :                 {
    1035           0 :                     dst[count] = 0;
    1036           0 :                     count += 2;  // still count the sequence towards the required length
    1037           0 :                     break;
    1038             :                 }
    1039       37383 :                 dst[count++] = 0xc0 | (ucs >> 6);  // lead byte carries the top two bits of the input byte
    1040       37383 :                 dst[count++] = 0x80 | (ucs & 0x3F);  // continuation byte carries the low six bits
    1041             :             }
    1042     7000450 :         }
    1043             : 
    1044             :     // We filled dst, measure the rest:
    1045           0 :     while (p < e)
    1046             :     {
    1047           0 :         unsigned char ucs = *reinterpret_cast<const unsigned char *>(p);
    1048           0 :         p++;
    1049           0 :         if (ucs < 0x80U)
    1050             :         {
    1051           0 :             count++;
    1052             :         }
    1053             :         else
    1054             :         {
    1055           0 :             count += 2;  // every byte >= 0x80 expands to a 2-byte sequence
    1056             :         }
    1057             :     }
    1058             : 
    1059           0 :     return count;  // >= dstlen here when the output was truncated
    1060             : }
    1061             : 
    1062             : #ifdef _WIN32
    1063             : 
    1064             : /************************************************************************/
    1065             : /*                            CPLWin32Recode()                          */
    1066             : /************************************************************************/
    1067             : 
    1068             : /* Convert a CODEPAGE (i.e. normal c-string) byte stream
    1069             :      to another CODEPAGE (i.e. normal c-string) byte stream.
    1070             : 
    1071             :     \a src is the source c-string byte stream (including a null terminator).
    1072             :     \a src_code_page is the code page of the source c-string.
    1073             :     \a dst_code_page is the code page of the destination c-string.
    1074             : 
    1075             :    UTF7          65000
    1076             :    UTF8          65001
    1077             :    OEM-US          437
    1078             :    OEM-ARABIC      720
    1079             :    OEM-GREEK       737
    1080             :    OEM-BALTIC      775
    1081             :    OEM-MLATIN1     850
    1082             :    OEM-LATIN2      852
    1083             :    OEM-CYRILLIC    855
    1084             :    OEM-TURKISH     857
    1085             :    OEM-MLATIN1P    858
    1086             :    OEM-HEBREW      862
    1087             :    OEM-RUSSIAN     866
    1088             : 
    1089             :    THAI            874
    1090             :    SJIS            932
    1091             :    GBK             936
    1092             :    KOREA           949
    1093             :    BIG5            950
    1094             : 
    1095             :    EUROPE         1250
    1096             :    CYRILLIC       1251
    1097             :    LATIN1         1252
    1098             :    GREEK          1253
    1099             :    TURKISH        1254
    1100             :    HEBREW         1255
    1101             :    ARABIC         1256
    1102             :    BALTIC         1257
    1103             :    VIETNAM        1258
    1104             : 
    1105             :    ISO-LATIN1    28591
    1106             :    ISO-LATIN2    28592
    1107             :    ISO-LATIN3    28593
    1108             :    ISO-BALTIC    28594
    1109             :    ISO-CYRILLIC  28595
    1110             :    ISO-ARABIC    28596
    1111             :    ISO-HEBREW    28598
    1112             :    ISO-TURKISH   28599
    1113             :    ISO-LATIN9    28605
    1114             : 
    1115             :    ISO-2022-JP   50220
    1116             : 
    1117             : */
    1118             : 
    1119             : char *CPLWin32Recode(const char *src, unsigned src_code_page,
    1120             :                      unsigned dst_code_page)
    1121             : {
    1122             :     // Convert from source code page to Unicode.
    1123             : 
    1124             :     // Compute the length in wide characters.
    1125             :     int wlen = MultiByteToWideChar(src_code_page, MB_ERR_INVALID_CHARS, src, -1,
    1126             :                                    nullptr, 0);
    1127             :     if (wlen == 0 && GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
    1128             :     {
    1129             :         if (!bHaveWarned5)
    1130             :         {
    1131             :             bHaveWarned5 = true;
    1132             :             CPLError(
    1133             :                 CE_Warning, CPLE_AppDefined,
    1134             :                 "One or several characters could not be translated from CP%d. "
    1135             :                 "This warning will not be emitted anymore.",
    1136             :                 src_code_page);
    1137             :         }
    1138             : 
    1139             :         // Retry now without MB_ERR_INVALID_CHARS flag.
    1140             :         wlen = MultiByteToWideChar(src_code_page, 0, src, -1, nullptr, 0);
    1141             :     }
    1142             : 
    1143             :     // Do the actual conversion.
    1144             :     wchar_t *tbuf =
    1145             :         static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), wlen + 1));
    1146             :     tbuf[wlen] = 0;
    1147             :     MultiByteToWideChar(src_code_page, 0, src, -1, tbuf, wlen + 1);
    1148             : 
    1149             :     // Convert from Unicode to destination code page.
    1150             : 
    1151             :     // Compute the length in chars.
    1152             :     BOOL bUsedDefaultChar = FALSE;
    1153             :     int len = 0;
    1154             :     if (dst_code_page == CP_UTF7 || dst_code_page == CP_UTF8)
    1155             :         len = WideCharToMultiByte(dst_code_page, 0, tbuf, -1, nullptr, 0,
    1156             :                                   nullptr, nullptr);
    1157             :     else
    1158             :         len = WideCharToMultiByte(dst_code_page, 0, tbuf, -1, nullptr, 0,
    1159             :                                   nullptr, &bUsedDefaultChar);
    1160             :     if (bUsedDefaultChar)
    1161             :     {
    1162             :         if (!bHaveWarned6)
    1163             :         {
    1164             :             bHaveWarned6 = true;
    1165             :             CPLError(
    1166             :                 CE_Warning, CPLE_AppDefined,
    1167             :                 "One or several characters could not be translated to CP%d. "
    1168             :                 "This warning will not be emitted anymore.",
    1169             :                 dst_code_page);
    1170             :         }
    1171             :     }
    1172             : 
    1173             :     // Do the actual conversion.
    1174             :     char *pszResult = static_cast<char *>(CPLCalloc(sizeof(char), len + 1));
    1175             :     WideCharToMultiByte(dst_code_page, 0, tbuf, -1, pszResult, len + 1, nullptr,
    1176             :                         nullptr);
    1177             :     pszResult[len] = 0;
    1178             : 
    1179             :     CPLFree(tbuf);
    1180             : 
    1181             :     return pszResult;
    1182             : }
    1183             : 
    1184             : #endif
    1185             : 
    1186             : /*
    1187             : ** For now we disable the rest which is locale() related.  We may need
    1188             : ** parts of it later.
    1189             : */
    1190             : 
    1191             : #ifdef notdef
    1192             : 
    1193             : #ifdef _WIN32
    1194             : #include <windows.h>
    1195             : #endif
    1196             : 
    1197             : /*! Return true if the "locale" seems to indicate that UTF-8 encoding
    1198             :     is used. If true, utf8tomb() and utf8frommb() don't do anything
    1199             :     useful.
    1200             : 
    1201             :     <i>It is highly recommended that you change your system so this
    1202             :     does return true.</i> On Windows this is done by setting the
    1203             :     "codepage" to CP_UTF8.  On Unix this is done by setting $LC_CTYPE
    1204             :     to a string containing the letters "utf" or "UTF" in it, or by
    1205             :     deleting all $LC* and $LANG environment variables. In the future
    1206             :     it is likely that all non-Asian Unix systems will return true,
    1207             :     due to the compatibility of UTF-8 with ISO-8859-1.
    1208             : */
    1209             : int utf8locale(void)
    1210             : {
    1211             :     static int ret = 2;  // 2 = not yet determined; computed on first call, then reused.
    1212             :     if (ret == 2)
    1213             :     {
    1214             : #ifdef _WIN32
    1215             :         ret = GetACP() == CP_UTF8;
    1216             : #else
    1217             :         char *s;
    1218             :         ret = 1;  // assume UTF-8 if none of the variables below is set and non-empty
    1219             :         if (((s = getenv("LC_CTYPE")) && *s) ||
    1220             :             ((s = getenv("LC_ALL")) && *s) || ((s = getenv("LANG")) && *s))  // first non-empty variable wins
    1221             :         {
    1222             :             ret = strstr(s, "utf") || strstr(s, "UTF");  // NOTE(review): mixed case such as "Utf" is not matched.
    1223             :         }
    1224             : #endif
    1225             :     }
    1226             : 
    1227             :     return ret;
    1228             : }
    1229             : 
    1230             : /*! Convert the UTF-8 used by FLTK to the locale-specific encoding
    1231             :     used for filenames (and sometimes used for data in files).
    1232             :     Unfortunately due to stupid design you will have to do this as
    1233             :     needed for filenames. This is a bug on both Unix and Windows.
    1234             : 
    1235             :     Up to \a dstlen bytes are written to \a dst, including a null
    1236             :     terminator. The return value is the number of bytes that would be
    1237             :     written, not counting the null terminator. If greater or equal to
    1238             :     \a dstlen then if you malloc a new array of size n+1 you will have
    1239             :     the space needed for the entire string. If \a dstlen is zero then
    1240             :     nothing is written and this call just measures the storage space
    1241             :     needed.
    1242             : 
    1243             :     If utf8locale() returns true then this does not change the data.
    1244             :     It is copied and truncated as necessary to
    1245             :     the destination buffer and \a srclen is always returned.  */
    1246             : unsigned utf8tomb(const char *src, unsigned srclen, char *dst, unsigned dstlen)
    1247             : {
    1248             :     if (!utf8locale())
    1249             :     {
    1250             : #ifdef _WIN32
    1251             :         wchar_t lbuf[1024] = {};  // Local buffer used when the wide string fits in 1024 elements.
    1252             :         wchar_t *buf = lbuf;
    1253             :         unsigned length = utf8towc(src, srclen, buf, 1024);
    1254             :         unsigned ret;  // Left unset when dstlen == 0; only read after the dstlen == 0 test below.
    1255             :         if (length >= 1024)
    1256             :         {
    1257             :             buf =
    1258             :                 static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t)));
    1259             :             utf8towc(src, srclen, buf, length + 1);  // NOTE(review): malloc result is not checked.
    1260             :         }
    1261             :         if (dstlen)
    1262             :         {
    1263             :             // apparently this does not null-terminate, even though msdn
    1264             :             // documentation claims it does:
    1265             :             ret = WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen, 0,
    1266             :                                       0);
    1267             :             dst[ret] = 0;  // NOTE(review): writes dst[dstlen] if ret == dstlen — confirm this cannot happen.
    1268             :         }
    1269             :         // if it overflows or measuring length, get the actual length:
    1270             :         if (dstlen == 0 || ret >= dstlen - 1)
    1271             :             ret = WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
    1272             :         if (buf != lbuf)
    1273             :             free((void *)buf);
    1274             :         return ret;
    1275             : #else
    1276             :         wchar_t lbuf[1024] = {};  // Local buffer used when the wide string fits in 1024 elements.
    1277             :         wchar_t *buf = lbuf;
    1278             :         unsigned length = utf8towc(src, srclen, buf, 1024);
    1279             :         if (length >= 1024)
    1280             :         {
    1281             :             buf =
    1282             :                 static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t)));
    1283             :             utf8towc(src, srclen, buf, length + 1);  // NOTE(review): malloc result is not checked.
    1284             :         }
    1285             :         int ret = 0;
    1286             :         if (dstlen)
    1287             :         {
    1288             :             ret = wcstombs(dst, buf, dstlen);
    1289             :             if (ret >= dstlen - 1)  // NOTE(review): int vs unsigned comparison; a negative ret converts to a large unsigned value.
    1290             :                 ret = wcstombs(0, buf, 0);
    1291             :         }
    1292             :         else
    1293             :         {
    1294             :             ret = wcstombs(0, buf, 0);  // Measure only: no destination buffer.
    1295             :         }
    1296             :         if (buf != lbuf)
    1297             :             free((void *)buf);
    1298             :         if (ret >= 0)
    1299             :             return (unsigned)ret;
    1300             :             // On any errors we return the UTF-8 as raw text...
    1301             : #endif
    1302             :     }
    1303             :     // Identity transform (UTF-8 locale, or a conversion error on the non-Windows path):
    1304             :     if (srclen < dstlen)
    1305             :     {
    1306             :         memcpy(dst, src, srclen);
    1307             :         dst[srclen] = 0;
    1308             :     }
    1309             :     else
    1310             :     {
    1311             :         memcpy(dst, src, dstlen - 1);  // NOTE(review): dstlen == 0 makes dstlen - 1 wrap around; confirm callers never pass 0 here.
    1312             :         dst[dstlen - 1] = 0;
    1313             :     }
    1314             :     return srclen;
    1315             : }
    1316             : 
    1317             : /*! Convert a filename from the locale-specific multibyte encoding
    1318             :     used by Windows to UTF-8 as used by FLTK.
    1319             : 
    1320             :     Up to \a dstlen bytes are written to \a dst, including a null
    1321             :     terminator. The return value is the number of bytes that would be
    1322             :     written, not counting the null terminator. If greater or equal to
    1323             :     \a dstlen then if you malloc a new array of size n+1 you will have
    1324             :     the space needed for the entire string. If \a dstlen is zero then
    1325             :     nothing is written and this call just measures the storage space
    1326             :     needed.
    1327             : 
    1328             :     When utf8locale() returns true, this
    1329             :     does not change the data. It is copied and truncated as necessary to
    1330             :     the destination buffer and \a srclen is always returned.
    1331             :     You may also want to check if utf8test() returns non-zero, so that
    1332             :     the filesystem can store filenames in UTF-8 encoding regardless of
    1333             :     the locale.
    1334             : */
    1335             : unsigned utf8frommb(char *dst, unsigned dstlen, const char *src,
    1336             :                     unsigned srclen)
    1337             : {
    1338             :     if (!utf8locale())
    1339             :     {
    1340             : #ifdef _WIN32
    1341             :         wchar_t lbuf[1024] = {};  // Local buffer used when the wide string fits in 1024 elements.
    1342             :         wchar_t *buf = lbuf;
    1343             :         unsigned ret;
    1344             :         const unsigned length =
    1345             :             MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
    1346             :         if (length >= 1024)
    1347             :         {
    1348             :             length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);  // NOTE(review): assigns to a variable declared const; would not compile if this section were enabled.
    1349             :             buf = static_cast<wchar_t *>(malloc(length * sizeof(wchar_t)));
    1350             :             MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);  // NOTE(review): malloc result is not checked.
    1351             :         }
    1352             :         ret = utf8fromwc(dst, dstlen, buf, length);
    1353             :         if (buf != lbuf)
    1354             :             free(buf);
    1355             :         return ret;
    1356             : #else
    1357             :         wchar_t lbuf[1024] = {};  // Local buffer used when the wide string fits in 1024 elements.
    1358             :         wchar_t *buf = lbuf;
    1359             :         const int length = mbstowcs(buf, src, 1024);
    1360             :         if (length >= 1024)
    1361             :         {
    1362             :             length = mbstowcs(0, src, 0) + 1;  // NOTE(review): assigns to a variable declared const; would not compile if this section were enabled.
    1363             :             buf =
    1364             :                 static_cast<wchar_t *>(malloc(length * sizeof(unsigned short)));  // NOTE(review): sized with unsigned short, not wchar_t.
    1365             :             mbstowcs(buf, src, length);
    1366             :         }
    1367             :         if (length >= 0)
    1368             :         {
    1369             :             const unsigned ret = utf8fromwc(dst, dstlen, buf, length);
    1370             :             if (buf != lbuf)
    1371             :                 free(buf);
    1372             :             return ret;
    1373             :         }
    1374             :         // A negative length (conversion error) falls through to the identity copy below.
    1375             : #endif
    1376             :     }
    1377             :     // Identity transform (UTF-8 locale, or a conversion error on the non-Windows path):
    1378             :     if (srclen < dstlen)
    1379             :     {
    1380             :         memcpy(dst, src, srclen);
    1381             :         dst[srclen] = 0;
    1382             :     }
    1383             :     else
    1384             :     {
    1385             :         memcpy(dst, src, dstlen - 1);  // NOTE(review): dstlen == 0 makes dstlen - 1 wrap around; confirm callers never pass 0 here.
    1386             :         dst[dstlen - 1] = 0;
    1387             :     }
    1388             :     return srclen;
    1389             : }
    1390             : 
    1391             : #endif  // def notdef - disabled locale specific stuff.
    1392             : 
    1393             : /*! Examines the first \a srclen bytes in \a src and returns a verdict
    1394             :     on whether it is UTF-8 or not.
    1395             :     - Returns 0 if there are any illegal UTF-8 sequences, using the
    1396             :       same rules as utf8decode(), or if a NUL byte is found. Note that some UCS values considered
    1397             :       illegal by RFC 3629, such as 0xffff, are considered legal by this.
    1398             :     - Returns 1 if there are only single-byte characters (i.e. no bytes
    1399             :       have the high bit set). This is legal UTF-8, but also indicates
    1400             :       plain ASCII. It also returns 1 if \a srclen is zero.
    1401             :     - Returns 2 if there are only characters less than 0x800.
    1402             :     - Returns 3 if there are only characters less than 0x10000.
    1403             :     - Returns 4 if there are characters in the 0x10000 to 0x10ffff range.
    1404             : 
    1405             :     Because there are many illegal sequences in UTF-8, it is almost
    1406             :     impossible for a string in another encoding to be confused with
    1407             :     UTF-8. This is very useful for transitioning Unix to UTF-8
    1408             :     filenames: you can simply test each filename with this to decide
    1409             :     if it is UTF-8 or in the locale encoding. My hope is that if
    1410             :     this is done we will be able to cleanly transition to a locale-less
    1411             :     encoding.
    1412             : */
    1413             : 
    1414       14858 : static int utf8test(const char *src, unsigned srclen)
    1415             : {
    1416       14858 :     int ret = 1;  // Longest sequence length seen so far; 1 means single-byte only.
    1417       14858 :     const char *p = src;
    1418       14858 :     const char *e = src + srclen;
    1419     1801830 :     while (p < e)
    1420             :     {
    1421     1787020 :         if (*p == 0)  // An embedded NUL byte makes the verdict 0.
    1422           0 :             return 0;
    1423     1787020 :         if (*p & 0x80)  // High bit set: not a single-byte character.
    1424             :         {
    1425        1602 :             int len = 0;
    1426        1602 :             utf8decode(p, e, &len);  // Only the reported length is used; the decoded value is discarded.
    1427        1602 :             if (len < 2)  // A high-bit byte with a reported length below 2 is treated as invalid.
    1428          52 :                 return 0;
    1429        1550 :             if (len > ret)
    1430         552 :                 ret = len;
    1431        1550 :             p += len;
    1432             :         }
    1433             :         else
    1434             :         {
    1435     1785420 :             p++;
    1436             :         }
    1437             :     }
    1438       14806 :     return ret;
    1439             : }

Generated by: LCOV version 1.14