LCOV - code coverage report
Current view: top level - port - cpl_recode_iconv.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 76 110 69.1 %
Date: 2024-04-28 23:18:46 Functions: 4 5 80.0 %

          Line data    Source code
       1             : /**********************************************************************
       2             :  *
       3             :  * Name:     cpl_recode_iconv.cpp
       4             :  * Project:  CPL - Common Portability Library
       5             :  * Purpose:  Character set recoding and char/wchar_t conversions implemented
       6             :  *           using the iconv() functionality.
       7             :  * Author:   Andrey Kiselev, dron@ak4719.spb.edu
       8             :  *
       9             :  **********************************************************************
      10             :  * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
      11             :  * Copyright (c) 2011-2012, Even Rouault <even dot rouault at spatialys.com>
      12             :  *
      13             :  * Permission to use, copy, modify, and distribute this software for any
      14             :  * purpose with or without fee is hereby granted, provided that the above
      15             :  * copyright notice and this permission notice appear in all copies.
      16             :  *
      17             :  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
      18             :  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
      19             :  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
      20             :  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
      21             :  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
      22             :  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
      23             :  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
      24             :  **********************************************************************/
      25             : 
      26             : #include "cpl_port.h"
      27             : 
      28             : #include <algorithm>
      29             : 
      30             : #ifdef CPL_RECODE_ICONV
      31             : 
      32             : #include <iconv.h>
      33             : #include "cpl_string.h"
      34             : 
      35             : #ifndef ICONV_CPP_CONST
      36             : #define ICONV_CPP_CONST ICONV_CONST
      37             : #endif
      38             : 
      39             : constexpr size_t CPL_RECODE_DSTBUF_SIZE = 32768;  // minimum size, in bytes, of the initial iconv() output buffer
      40             : 
      41             : /* Entry points defined below and used by cpl_recode.cpp. */
      42             : extern void CPLClearRecodeIconvWarningFlags();
      43             : extern char *CPLRecodeIconv(const char *, const char *,
      44             :                             const char *) CPL_RETURNS_NONNULL;
      45             : extern char *CPLRecodeFromWCharIconv(const wchar_t *, const char *,
      46             :                                      const char *);
      47             : extern wchar_t *CPLRecodeToWCharIconv(const char *, const char *, const char *);
      48             : 
      49             : /************************************************************************/
      50             : /*                 CPLClearRecodeIconvWarningFlags()                    */
      51             : /************************************************************************/
      52             : 
      53             : static bool bHaveWarned1 = false;  // set once CPLRecodeIconv() has warned about an EILSEQ failure
      54             : static bool bHaveWarned2 = false;  // same, for CPLRecodeFromWCharIconv()
      55             : 
      56       10377 : void CPLClearRecodeIconvWarningFlags()
      57             : {
      58       10377 :     bHaveWarned1 = false;  // allow the "couldn't be converted" warnings to be emitted again
      59       10377 :     bHaveWarned2 = false;
      60       10377 : }
      61             : 
      62             : /************************************************************************/
      63             : /*                      CPLFixInputEncoding()                           */
      64             : /************************************************************************/
      65             : 
      66        6362 : static const char *CPLFixInputEncoding(const char *pszSrcEncoding,
      67             :                                        int nFirstVal)  // nFirstVal: first byte or wchar_t of the source text
      68             : {
      69             : #if CPL_IS_LSB
      70             :     // iconv on Alpine Linux seems to assume BE order, when it is not explicit
      71        6362 :     if (EQUAL(pszSrcEncoding, CPL_ENC_UCS2))
      72           1 :         pszSrcEncoding = "UCS-2LE";
      73        6361 :     else if (EQUAL(pszSrcEncoding, CPL_ENC_UTF16) && nFirstVal != 0xFF &&
      74           1 :              nFirstVal != 0xFE && nFirstVal != 0xFFFE && nFirstVal != 0xFEFF)
      75             :     {
      76             :         // Only force UTF-16LE if there's no starting endianness marker
      77           1 :         pszSrcEncoding = "UTF-16LE";
      78             :     }
      79             : #else
      80             :     CPL_IGNORE_RET_VAL(nFirstVal);
      81             : #endif
      82        6362 :     return pszSrcEncoding;  // the caller's name, unless a branch above replaced it
      83             : }
      84             : 
      85             : /************************************************************************/
      86             : /*                          CPLRecodeIconv()                            */
      87             : /************************************************************************/
      88             : 
      89             : /**
      90             :  * Convert a string from a source encoding to a destination encoding
      91             :  * using the iconv() function.
      92             :  *
      93             :  * Errors may be posted as warnings with CPLError(); a string is always returned.
      94             :  *
      95             :  * @param pszSource a NUL terminated string.
      96             :  * @param pszSrcEncoding the source encoding.
      97             :  * @param pszDstEncoding the destination encoding.
      98             :  *
      99             :  * @return a NUL terminated string which should be freed with CPLFree().
     100             :  */
     101             : 
     102        6361 : char *CPLRecodeIconv(const char *pszSource, const char *pszSrcEncoding,
     103             :                      const char *pszDstEncoding)
     104             : 
     105             : {
     106        6361 :     pszSrcEncoding = CPLFixInputEncoding(
     107        6361 :         pszSrcEncoding, static_cast<unsigned char>(pszSource[0]));  // first byte, for the endianness-marker check
     108             : 
     109             :     iconv_t sConv;
     110             : 
     111        6361 :     sConv = iconv_open(pszDstEncoding, pszSrcEncoding);
     112             : 
     113             : #ifdef __GNUC__
     114             : #pragma GCC diagnostic push
     115             : #pragma GCC diagnostic ignored "-Wold-style-cast"
     116             : #endif
     117             :     // iconv_t might be an integer or a pointer, so we have to fall back to
     118             :     // a C-style cast
     119        6361 :     if (sConv == (iconv_t)(-1))
     120             : #ifdef __GNUC__
     121             : #pragma GCC diagnostic pop
     122             : #endif
     123             :     {
     124           1 :         CPLError(CE_Warning, CPLE_AppDefined,
     125             :                  "Recode from %s to %s failed with the error: \"%s\".",
     126           1 :                  pszSrcEncoding, pszDstEncoding, strerror(errno));
     127             : 
     128           1 :         return CPLStrdup(pszSource);  // no converter available: hand back an unconverted copy
     129             :     }
     130             : 
     131             :     /* -------------------------------------------------------------------- */
     132             :     /*      XXX: There is a portability issue: iconv() function could be    */
     133             :     /*      declared differently on different platforms. The second         */
     134             :     /*      argument could be declared as char** (as POSIX defines) or      */
     135             :     /*      as a const char**. Handle it with the ICONV_CPP_CONST macro here. */
     136             :     /* -------------------------------------------------------------------- */
     137        6360 :     ICONV_CPP_CONST char *pszSrcBuf =
     138             :         const_cast<ICONV_CPP_CONST char *>(pszSource);
     139        6360 :     size_t nSrcLen = strlen(pszSource);  // input bytes still to convert
     140        6360 :     size_t nDstCurLen = std::max(CPL_RECODE_DSTBUF_SIZE, nSrcLen);  // output capacity, terminator excluded
     141        6360 :     size_t nDstLen = nDstCurLen;  // output bytes still free
     142             :     char *pszDestination =
     143        6360 :         static_cast<char *>(CPLCalloc(nDstCurLen + 1, sizeof(char)));  // +1 for the final '\0'
     144        6360 :     char *pszDstBuf = pszDestination;
     145             : 
     146       12710 :     while (nSrcLen > 0)
     147             :     {
     148             :         size_t nConverted =
     149        6355 :             iconv(sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen);
     150             : 
     151        6355 :         if (nConverted == static_cast<size_t>(-1))
     152             :         {
     153          18 :             if (errno == EILSEQ)
     154             :             {
     155             :                 // Skip the invalid sequence in the input string.
     156          12 :                 if (!bHaveWarned1)
     157             :                 {
     158           1 :                     bHaveWarned1 = true;
     159           1 :                     CPLError(CE_Warning, CPLE_AppDefined,
     160             :                              "One or several characters couldn't be converted "
     161             :                              "correctly from %s to %s.  "
     162             :                              "This warning will not be emitted anymore",
     163             :                              pszSrcEncoding, pszDstEncoding);
     164             :                 }
     165          12 :                 if (nSrcLen == 0)
     166           0 :                     break;
     167          12 :                 nSrcLen--;  // drop a single input byte, then retry from the next one
     168          12 :                 pszSrcBuf++;
     169          12 :                 continue;
     170             :             }
     171             : 
     172           6 :             else if (errno == E2BIG)
     173             :             {
     174             :                 // We are running out of the output buffer.
     175             :                 // Dynamically increase the buffer size.
     176           1 :                 size_t nTmp = nDstCurLen;  // capacity before growing
     177           1 :                 nDstCurLen *= 2;
     178             :                 pszDestination = static_cast<char *>(
     179           1 :                     CPLRealloc(pszDestination, nDstCurLen + 1));
     180           1 :                 pszDstBuf = pszDestination + nTmp - nDstLen;  // write position inside the reallocated buffer
     181           1 :                 nDstLen += nTmp;  // capacity doubled, so nTmp more bytes are free
     182           1 :                 continue;
     183             :             }
     184             : 
     185             :             else
     186           5 :                 break;  // any other errno: stop and keep what was converted so far
     187             :         }
     188             :     }
     189             : 
     190        6360 :     pszDestination[nDstCurLen - nDstLen] = '\0';  // nDstCurLen - nDstLen is the number of bytes written
     191             : 
     192        6360 :     iconv_close(sConv);
     193             : 
     194        6360 :     return pszDestination;
     195             : }
     196             : 
     197             : /************************************************************************/
     198             : /*                      CPLRecodeFromWCharIconv()                       */
     199             : /************************************************************************/
     200             : 
     201             : /**
     202             :  * Convert wchar_t string to UTF-8.
     203             :  *
     204             :  * Convert a wchar_t string into a multibyte utf-8 string
     205             :  * using the iconv() function.
     206             :  *
     207             :  * Note that the wchar_t type varies in size on different systems. On
     208             :  * win32 it is normally 2 bytes, and on unix 4 bytes.
     209             :  *
     210             :  * If an error occurs an error may, or may not be posted with CPLError().
     211             :  *
     212             :  * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
     213             :  * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
     214             :  * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
     215             :  *
     216             :  * @return a zero terminated multi-byte string which should be freed with
     217             :  * CPLFree(); an empty string is returned if the conversion cannot be set up.
     218             :  */
     219             : 
     220           1 : char *CPLRecodeFromWCharIconv(const wchar_t *pwszSource,
     221             :                               const char *pszSrcEncoding,
     222             :                               const char *pszDstEncoding)
     223             : 
     224             : {
     225           1 :     pszSrcEncoding = CPLFixInputEncoding(pszSrcEncoding, pwszSource[0]);
     226             : 
     227             :     /* -------------------------------------------------------------------- */
     228             :     /*      What is the source length.                                      */
     229             :     /* -------------------------------------------------------------------- */
     230           1 :     size_t nSrcLen = 0;  // counted in wchar_t elements here; converted to bytes further down
     231             : 
     232        2049 :     while (pwszSource[nSrcLen] != 0)
     233        2048 :         nSrcLen++;
     234             : 
     235             :     /* -------------------------------------------------------------------- */
     236             :     /*      iconv() does not support wchar_t so we need to repack the       */
     237             :     /*      characters according to the width of a character in the         */
     238             :     /*      source encoding.  For instance if wchar_t is 4 bytes but our    */
     239             :     /*      source is UTF16 then we need to pack down into 2 byte           */
     240             :     /*      characters before passing to iconv().                           */
     241             :     /* -------------------------------------------------------------------- */
     242           1 :     const int nTargetCharWidth = CPLEncodingCharSize(pszSrcEncoding);
     243             : 
     244           1 :     if (nTargetCharWidth < 1)
     245             :     {
     246           0 :         CPLError(CE_Warning, CPLE_AppDefined,
     247             :                  "Recode from %s with CPLRecodeFromWChar() failed because"
     248             :                  " the width of characters in the encoding are not known.",
     249             :                  pszSrcEncoding);
     250           0 :         return CPLStrdup("");  // nothing allocated yet, so nothing to release
     251             :     }
     252             : 
     253             :     GByte *pszIconvSrcBuf =
     254           1 :         static_cast<GByte *>(CPLCalloc((nSrcLen + 1), nTargetCharWidth));  // +1 element for the terminating 0
     255             : 
     256        2050 :     for (unsigned int iSrc = 0; iSrc <= nSrcLen; iSrc++)  // <= so the terminating 0 is repacked too
     257             :     {  // NOTE(review): iSrc is unsigned int while nSrcLen is size_t - confirm very long inputs cannot occur
     258        2049 :         if (nTargetCharWidth == 1)
     259           0 :             pszIconvSrcBuf[iSrc] = static_cast<GByte>(pwszSource[iSrc]);
     260        2049 :         else if (nTargetCharWidth == 2)
     261        2049 :             (reinterpret_cast<short *>(pszIconvSrcBuf))[iSrc] =
     262        2049 :                 static_cast<short>(pwszSource[iSrc]);  // narrows the value when wchar_t is wider than short
     263           0 :         else if (nTargetCharWidth == 4)
     264           0 :             (reinterpret_cast<GInt32 *>(pszIconvSrcBuf))[iSrc] =
     265           0 :                 pwszSource[iSrc];
     266             :     }
     267             : 
     268             :     /* -------------------------------------------------------------------- */
     269             :     /*      Create the iconv() translation object.                          */
     270             :     /* -------------------------------------------------------------------- */
     271             :     iconv_t sConv;
     272             : 
     273           1 :     sConv = iconv_open(pszDstEncoding, pszSrcEncoding);
     274             : 
     275             : #ifdef __GNUC__
     276             : #pragma GCC diagnostic push
     277             : #pragma GCC diagnostic ignored "-Wold-style-cast"
     278             : #endif
     279             :     // iconv_t might be an integer or a pointer, so we have to fall back to
     280             :     // a C-style cast
     281           1 :     if (sConv == (iconv_t)(-1))
     282             : #ifdef __GNUC__
     283             : #pragma GCC diagnostic pop
     284             : #endif
     285             :     {
     286           0 :         CPLFree(pszIconvSrcBuf);
     287           0 :         CPLError(CE_Warning, CPLE_AppDefined,
     288             :                  "Recode from %s to %s failed with the error: \"%s\".",
     289           0 :                  pszSrcEncoding, pszDstEncoding, strerror(errno));
     290             : 
     291           0 :         return CPLStrdup("");
     292             :     }
     293             : 
     294             :     /* -------------------------------------------------------------------- */
     295             :     /*      XXX: There is a portability issue: iconv() function could be    */
     296             :     /*      declared differently on different platforms. The second         */
     297             :     /*      argument could be declared as char** (as POSIX defines) or      */
     298             :     /*      as a const char**. Handle it with the ICONV_CPP_CONST macro here. */
     299             :     /* -------------------------------------------------------------------- */
     300           1 :     ICONV_CPP_CONST char *pszSrcBuf = const_cast<ICONV_CPP_CONST char *>(
     301             :         reinterpret_cast<char *>(pszIconvSrcBuf));
     302             : 
     303             :     /* iconv expects a number of bytes, not characters */
     304           1 :     nSrcLen *= nTargetCharWidth;
     305             : 
     306             :     /* -------------------------------------------------------------------- */
     307             :     /*      Allocate destination buffer.                                    */
     308             :     /* -------------------------------------------------------------------- */
     309           1 :     size_t nDstCurLen = std::max(CPL_RECODE_DSTBUF_SIZE, nSrcLen + 1);  // total allocated size of the output buffer
     310           1 :     size_t nDstLen = nDstCurLen;  // output bytes still free
     311             :     char *pszDestination =
     312           1 :         static_cast<char *>(CPLCalloc(nDstCurLen, sizeof(char)));
     313           1 :     char *pszDstBuf = pszDestination;
     314             : 
     315           2 :     while (nSrcLen > 0)
     316             :     {
     317             :         const size_t nConverted =
     318           1 :             iconv(sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen);
     319             : 
     320           1 :         if (nConverted == static_cast<size_t>(-1))
     321             :         {
     322           0 :             if (errno == EILSEQ)
     323             :             {
     324             :                 // Skip the invalid sequence in the input string.
     325           0 :                 nSrcLen -= nTargetCharWidth;  // NOTE(review): assumes at least nTargetCharWidth bytes remain - confirm
     326           0 :                 pszSrcBuf += nTargetCharWidth;
     327           0 :                 if (!bHaveWarned2)
     328             :                 {
     329           0 :                     bHaveWarned2 = true;
     330           0 :                     CPLError(CE_Warning, CPLE_AppDefined,
     331             :                              "One or several characters couldn't be converted "
     332             :                              "correctly from %s to %s.  "
     333             :                              "This warning will not be emitted anymore",
     334             :                              pszSrcEncoding, pszDstEncoding);
     335             :                 }
     336           0 :                 continue;
     337             :             }
     338             : 
     339           0 :             else if (errno == E2BIG)
     340             :             {
     341             :                 // We are running out of the output buffer.
     342             :                 // Dynamically increase the buffer size.
     343           0 :                 size_t nTmp = nDstCurLen;  // size before growing
     344           0 :                 nDstCurLen *= 2;
     345             :                 pszDestination =
     346           0 :                     static_cast<char *>(CPLRealloc(pszDestination, nDstCurLen));
     347           0 :                 pszDstBuf = pszDestination + nTmp - nDstLen;  // write position inside the reallocated buffer
     348           0 :                 nDstLen += nDstCurLen - nTmp;  // add the newly gained space
     349           0 :                 continue;
     350             :             }
     351             : 
     352             :             else
     353           0 :                 break;  // any other errno: stop and keep what was converted so far
     354             :         }
     355             :     }
     356             : 
     357           1 :     if (nDstLen == 0)  // output filled the buffer exactly: grow by one byte for the terminator
     358             :     {
     359           0 :         ++nDstCurLen;
     360             :         pszDestination =
     361           0 :             static_cast<char *>(CPLRealloc(pszDestination, nDstCurLen));
     362           0 :         ++nDstLen;
     363             :     }
     364           1 :     pszDestination[nDstCurLen - nDstLen] = '\0';  // nDstCurLen - nDstLen is the number of bytes written
     365             : 
     366           1 :     iconv_close(sConv);
     367             : 
     368           1 :     CPLFree(pszIconvSrcBuf);
     369             : 
     370           1 :     return pszDestination;
     371             : }
     372             : 
     373             : /************************************************************************/
     374             : /*                        CPLRecodeToWCharIconv()                       */
     375             : /************************************************************************/
     376             : 
     377             : /**
     378             :  * Convert UTF-8 string to a wchar_t string.
     379             :  *
     380             :  * Convert an 8-bit, multi-byte per character input string into a wide
     381             :  * character (wchar_t) string using the iconv() function.
     382             :  *
     383             :  * Note that the wchar_t type varies in size on different systems. On
     384             :  * win32 it is normally 2 bytes, and on unix 4 bytes.
     385             :  *
     386             :  * If an error occurs an error may, or may not be posted with CPLError().
     387             :  *
     388             :  * @param pszSource input multi-byte character string.
     389             :  * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
     390             :  * @param pszDstEncoding destination encoding. Must be "WCHAR_T".
     391             :  *
     392             :  * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
     393             :  * NULL if pszDstEncoding is not "WCHAR_T".
     394             :  */
     395             : 
     396           0 : wchar_t *CPLRecodeToWCharIconv(const char *pszSource,
     397             :                                const char *pszSrcEncoding,
     398             :                                const char *pszDstEncoding)
     399             : 
     400             : {
     401           0 :     if (strcmp(pszDstEncoding, "WCHAR_T") != 0)  // exact, case-sensitive match required
     402             :     {
     403           0 :         CPLError(CE_Failure, CPLE_AppDefined,
     404             :                  "Stub recoding implementation does not support "
     405             :                  "CPLRecodeToWCharIconv(...,%s,%s)",
     406             :                  pszSrcEncoding, pszDstEncoding);
     407           0 :         return nullptr;
     408             :     }
     409             : 
     410             :     // Using double static_cast<> makes CodeQL cpp/incorrect-string-type-conversion
     411             :     // check happy...
     412             :     return static_cast<wchar_t *>(static_cast<void *>(
     413           0 :         CPLRecodeIconv(pszSource, pszSrcEncoding, pszDstEncoding)));  // NOTE(review): CPLRecodeIconv() writes a one-byte '\0' terminator; confirm the wchar_t result is fully terminated
     414             : }
     415             : 
     416             : #endif /* CPL_RECODE_ICONV */

Generated by: LCOV version 1.14