Line data Source code
1 : /**********************************************************************
2 : *
3 : * Name: cpl_recode_iconv.cpp
4 : * Project: CPL - Common Portability Library
5 : * Purpose: Character set recoding and char/wchar_t conversions implemented
6 : * using the iconv() functionality.
7 : * Author: Andrey Kiselev, dron@ak4719.spb.edu
8 : *
9 : **********************************************************************
10 : * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
11 : * Copyright (c) 2011-2012, Even Rouault <even dot rouault at spatialys.com>
12 : *
13 : * Permission to use, copy, modify, and distribute this software for any
14 : * purpose with or without fee is hereby granted, provided that the above
15 : * copyright notice and this permission notice appear in all copies.
16 : *
17 : * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
18 : * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
19 : * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
20 : * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
21 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
22 : * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
23 : * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24 : **********************************************************************/
25 :
26 : #include "cpl_port.h"
27 :
28 : #include <algorithm>
29 :
30 : #ifdef CPL_RECODE_ICONV
31 :
32 : #include <iconv.h>
33 : #include "cpl_string.h"
34 :
35 : #ifndef ICONV_CPP_CONST
36 : #define ICONV_CPP_CONST ICONV_CONST
37 : #endif
38 :
39 : constexpr size_t CPL_RECODE_DSTBUF_SIZE = 32768;
40 :
41 : /* used by cpl_recode.cpp */
42 : extern void CPLClearRecodeIconvWarningFlags();
43 : extern char *CPLRecodeIconv(const char *, const char *,
44 : const char *) CPL_RETURNS_NONNULL;
45 : extern char *CPLRecodeFromWCharIconv(const wchar_t *, const char *,
46 : const char *);
47 : extern wchar_t *CPLRecodeToWCharIconv(const char *, const char *, const char *);
48 :
49 : /************************************************************************/
50 : /* CPLClearRecodeIconvWarningFlags() */
51 : /************************************************************************/
52 :
// Latches that limit the "characters couldn't be converted" warning to a
// single emission: bHaveWarned1 is used by CPLRecodeIconv() and
// bHaveWarned2 by CPLRecodeFromWCharIconv().
static bool bHaveWarned1 = false;
static bool bHaveWarned2 = false;

/** Reset both warning latches so that each warning can be emitted again. */
void CPLClearRecodeIconvWarningFlags()
{
    bHaveWarned1 = bHaveWarned2 = false;
}
61 :
62 : /************************************************************************/
63 : /* CPLFixInputEncoding() */
64 : /************************************************************************/
65 :
66 284 : static const char *CPLFixInputEncoding(const char *pszSrcEncoding,
67 : int nFirstVal)
68 : {
69 : #if CPL_IS_LSB
70 : // iconv on Alpine Linux seems to assume BE order, when it is not explicit
71 284 : if (EQUAL(pszSrcEncoding, CPL_ENC_UCS2))
72 1 : pszSrcEncoding = "UCS-2LE";
73 283 : else if (EQUAL(pszSrcEncoding, CPL_ENC_UTF16) && nFirstVal != 0xFF &&
74 1 : nFirstVal != 0xFE && nFirstVal != 0xFFFE && nFirstVal != 0xFEFF)
75 : {
76 : // Only force UTF-16LE if there's no starting endianness marker
77 1 : pszSrcEncoding = "UTF-16LE";
78 : }
79 : #else
80 : CPL_IGNORE_RET_VAL(nFirstVal);
81 : #endif
82 284 : return pszSrcEncoding;
83 : }
84 :
85 : /************************************************************************/
86 : /* CPLRecodeIconv() */
87 : /************************************************************************/
88 :
89 : /**
90 : * Convert a string from a source encoding to a destination encoding
91 : * using the iconv() function.
92 : *
93 : * If an error occurs an error may, or may not be posted with CPLError().
94 : *
95 : * @param pszSource a NULL terminated string.
96 : * @param pszSrcEncoding the source encoding.
97 : * @param pszDstEncoding the destination encoding.
98 : *
99 : * @return a NULL terminated string which should be freed with CPLFree().
100 : */
101 :
102 283 : char *CPLRecodeIconv(const char *pszSource, const char *pszSrcEncoding,
103 : const char *pszDstEncoding)
104 :
105 : {
106 283 : pszSrcEncoding = CPLFixInputEncoding(
107 283 : pszSrcEncoding, static_cast<unsigned char>(pszSource[0]));
108 :
109 : iconv_t sConv;
110 :
111 283 : sConv = iconv_open(pszDstEncoding, pszSrcEncoding);
112 :
113 : #ifdef __GNUC__
114 : #pragma GCC diagnostic push
115 : #pragma GCC diagnostic ignored "-Wold-style-cast"
116 : #endif
117 : // iconv_t might be a integer or a pointer, so we have to fallback to
118 : // C-style cast
119 283 : if (sConv == (iconv_t)(-1))
120 : #ifdef __GNUC__
121 : #pragma GCC diagnostic pop
122 : #endif
123 : {
124 1 : CPLError(CE_Warning, CPLE_AppDefined,
125 : "Recode from %s to %s failed with the error: \"%s\".",
126 1 : pszSrcEncoding, pszDstEncoding, strerror(errno));
127 :
128 1 : return CPLStrdup(pszSource);
129 : }
130 :
131 : /* -------------------------------------------------------------------- */
132 : /* XXX: There is a portability issue: iconv() function could be */
133 : /* declared differently on different platforms. The second */
134 : /* argument could be declared as char** (as POSIX defines) or */
135 : /* as a const char**. Handle it with the ICONV_CPP_CONST macro here. */
136 : /* -------------------------------------------------------------------- */
137 282 : ICONV_CPP_CONST char *pszSrcBuf =
138 : const_cast<ICONV_CPP_CONST char *>(pszSource);
139 282 : size_t nSrcLen = strlen(pszSource);
140 282 : size_t nDstCurLen = std::max(CPL_RECODE_DSTBUF_SIZE, nSrcLen);
141 282 : size_t nDstLen = nDstCurLen;
142 : char *pszDestination =
143 282 : static_cast<char *>(CPLCalloc(nDstCurLen + 1, sizeof(char)));
144 282 : char *pszDstBuf = pszDestination;
145 :
146 567 : while (nSrcLen > 0)
147 : {
148 : size_t nConverted =
149 290 : iconv(sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen);
150 :
151 290 : if (nConverted == static_cast<size_t>(-1))
152 : {
153 18 : if (errno == EILSEQ)
154 : {
155 : // Skip the invalid sequence in the input string.
156 12 : if (!bHaveWarned1)
157 : {
158 1 : bHaveWarned1 = true;
159 1 : CPLError(CE_Warning, CPLE_AppDefined,
160 : "One or several characters couldn't be converted "
161 : "correctly from %s to %s. "
162 : "This warning will not be emitted anymore",
163 : pszSrcEncoding, pszDstEncoding);
164 : }
165 12 : if (nSrcLen == 0)
166 0 : break;
167 12 : nSrcLen--;
168 12 : pszSrcBuf++;
169 12 : continue;
170 : }
171 :
172 6 : else if (errno == E2BIG)
173 : {
174 : // We are running out of the output buffer.
175 : // Dynamically increase the buffer size.
176 1 : size_t nTmp = nDstCurLen;
177 1 : nDstCurLen *= 2;
178 : pszDestination = static_cast<char *>(
179 1 : CPLRealloc(pszDestination, nDstCurLen + 1));
180 1 : pszDstBuf = pszDestination + nTmp - nDstLen;
181 1 : nDstLen += nTmp;
182 1 : continue;
183 : }
184 :
185 : else
186 5 : break;
187 : }
188 : }
189 :
190 282 : pszDestination[nDstCurLen - nDstLen] = '\0';
191 :
192 282 : iconv_close(sConv);
193 :
194 282 : return pszDestination;
195 : }
196 :
197 : /************************************************************************/
198 : /* CPLRecodeFromWCharIconv() */
199 : /************************************************************************/
200 :
201 : /**
202 : * Convert wchar_t string to UTF-8.
203 : *
204 : * Convert a wchar_t string into a multibyte utf-8 string
205 : * using the iconv() function.
206 : *
207 : * Note that the wchar_t type varies in size on different systems. On
208 : * win32 it is normally 2 bytes, and on unix 4 bytes.
209 : *
210 : * If an error occurs an error may, or may not be posted with CPLError().
211 : *
212 : * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
213 : * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
214 : * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
215 : *
216 : * @return a zero terminated multi-byte string which should be freed with
217 : * CPLFree(), or NULL if an error occurs.
218 : */
219 :
220 1 : char *CPLRecodeFromWCharIconv(const wchar_t *pwszSource,
221 : const char *pszSrcEncoding,
222 : const char *pszDstEncoding)
223 :
224 : {
225 1 : pszSrcEncoding = CPLFixInputEncoding(pszSrcEncoding, pwszSource[0]);
226 :
227 : /* -------------------------------------------------------------------- */
228 : /* What is the source length. */
229 : /* -------------------------------------------------------------------- */
230 1 : size_t nSrcLen = 0;
231 :
232 2049 : while (pwszSource[nSrcLen] != 0)
233 2048 : nSrcLen++;
234 :
235 : /* -------------------------------------------------------------------- */
236 : /* iconv() does not support wchar_t so we need to repack the */
237 : /* characters according to the width of a character in the */
238 : /* source encoding. For instance if wchar_t is 4 bytes but our */
239 : /* source is UTF16 then we need to pack down into 2 byte */
240 : /* characters before passing to iconv(). */
241 : /* -------------------------------------------------------------------- */
242 1 : const int nTargetCharWidth = CPLEncodingCharSize(pszSrcEncoding);
243 :
244 1 : if (nTargetCharWidth < 1)
245 : {
246 0 : CPLError(CE_Warning, CPLE_AppDefined,
247 : "Recode from %s with CPLRecodeFromWChar() failed because"
248 : " the width of characters in the encoding are not known.",
249 : pszSrcEncoding);
250 0 : return CPLStrdup("");
251 : }
252 :
253 : GByte *pszIconvSrcBuf =
254 1 : static_cast<GByte *>(CPLCalloc((nSrcLen + 1), nTargetCharWidth));
255 :
256 2050 : for (unsigned int iSrc = 0; iSrc <= nSrcLen; iSrc++)
257 : {
258 2049 : if (nTargetCharWidth == 1)
259 0 : pszIconvSrcBuf[iSrc] = static_cast<GByte>(pwszSource[iSrc]);
260 2049 : else if (nTargetCharWidth == 2)
261 2049 : (reinterpret_cast<short *>(pszIconvSrcBuf))[iSrc] =
262 2049 : static_cast<short>(pwszSource[iSrc]);
263 0 : else if (nTargetCharWidth == 4)
264 0 : (reinterpret_cast<GInt32 *>(pszIconvSrcBuf))[iSrc] =
265 0 : pwszSource[iSrc];
266 : }
267 :
268 : /* -------------------------------------------------------------------- */
269 : /* Create the iconv() translation object. */
270 : /* -------------------------------------------------------------------- */
271 : iconv_t sConv;
272 :
273 1 : sConv = iconv_open(pszDstEncoding, pszSrcEncoding);
274 :
275 : #ifdef __GNUC__
276 : #pragma GCC diagnostic push
277 : #pragma GCC diagnostic ignored "-Wold-style-cast"
278 : #endif
279 : // iconv_t might be a integer or a pointer, so we have to fallback to
280 : // C-style cast
281 1 : if (sConv == (iconv_t)(-1))
282 : #ifdef __GNUC__
283 : #pragma GCC diagnostic pop
284 : #endif
285 : {
286 0 : CPLFree(pszIconvSrcBuf);
287 0 : CPLError(CE_Warning, CPLE_AppDefined,
288 : "Recode from %s to %s failed with the error: \"%s\".",
289 0 : pszSrcEncoding, pszDstEncoding, strerror(errno));
290 :
291 0 : return CPLStrdup("");
292 : }
293 :
294 : /* -------------------------------------------------------------------- */
295 : /* XXX: There is a portability issue: iconv() function could be */
296 : /* declared differently on different platforms. The second */
297 : /* argument could be declared as char** (as POSIX defines) or */
298 : /* as a const char**. Handle it with the ICONV_CPP_CONST macro here. */
299 : /* -------------------------------------------------------------------- */
300 1 : ICONV_CPP_CONST char *pszSrcBuf = const_cast<ICONV_CPP_CONST char *>(
301 : reinterpret_cast<char *>(pszIconvSrcBuf));
302 :
303 : /* iconv expects a number of bytes, not characters */
304 1 : nSrcLen *= nTargetCharWidth;
305 :
306 : /* -------------------------------------------------------------------- */
307 : /* Allocate destination buffer. */
308 : /* -------------------------------------------------------------------- */
309 1 : size_t nDstCurLen = std::max(CPL_RECODE_DSTBUF_SIZE, nSrcLen + 1);
310 1 : size_t nDstLen = nDstCurLen;
311 : char *pszDestination =
312 1 : static_cast<char *>(CPLCalloc(nDstCurLen, sizeof(char)));
313 1 : char *pszDstBuf = pszDestination;
314 :
315 2 : while (nSrcLen > 0)
316 : {
317 : const size_t nConverted =
318 1 : iconv(sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen);
319 :
320 1 : if (nConverted == static_cast<size_t>(-1))
321 : {
322 0 : if (errno == EILSEQ)
323 : {
324 : // Skip the invalid sequence in the input string.
325 0 : nSrcLen -= nTargetCharWidth;
326 0 : pszSrcBuf += nTargetCharWidth;
327 0 : if (!bHaveWarned2)
328 : {
329 0 : bHaveWarned2 = true;
330 0 : CPLError(CE_Warning, CPLE_AppDefined,
331 : "One or several characters couldn't be converted "
332 : "correctly from %s to %s. "
333 : "This warning will not be emitted anymore",
334 : pszSrcEncoding, pszDstEncoding);
335 : }
336 0 : continue;
337 : }
338 :
339 0 : else if (errno == E2BIG)
340 : {
341 : // We are running out of the output buffer.
342 : // Dynamically increase the buffer size.
343 0 : size_t nTmp = nDstCurLen;
344 0 : nDstCurLen *= 2;
345 : pszDestination =
346 0 : static_cast<char *>(CPLRealloc(pszDestination, nDstCurLen));
347 0 : pszDstBuf = pszDestination + nTmp - nDstLen;
348 0 : nDstLen += nDstCurLen - nTmp;
349 0 : continue;
350 : }
351 :
352 : else
353 0 : break;
354 : }
355 : }
356 :
357 1 : if (nDstLen == 0)
358 : {
359 0 : ++nDstCurLen;
360 : pszDestination =
361 0 : static_cast<char *>(CPLRealloc(pszDestination, nDstCurLen));
362 0 : ++nDstLen;
363 : }
364 1 : pszDestination[nDstCurLen - nDstLen] = '\0';
365 :
366 1 : iconv_close(sConv);
367 :
368 1 : CPLFree(pszIconvSrcBuf);
369 :
370 1 : return pszDestination;
371 : }
372 :
373 : /************************************************************************/
374 : /* CPLRecodeToWCharIconv() */
375 : /************************************************************************/
376 :
377 : /**
378 : * Convert UTF-8 string to a wchar_t string.
379 : *
380 : * Convert a 8bit, multi-byte per character input string into a wide
381 : * character (wchar_t) string using the iconv() function.
382 : *
383 : * Note that the wchar_t type varies in size on different systems. On
384 : * win32 it is normally 2 bytes, and on unix 4 bytes.
385 : *
386 : * If an error occurs an error may, or may not be posted with CPLError().
387 : *
388 : * @param pszSource input multi-byte character string.
389 : * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
390 : * @param pszDstEncoding destination encoding. Must be "WCHAR_T".
391 : *
392 : * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
393 : * NULL on error.
394 : */
395 :
396 0 : wchar_t *CPLRecodeToWCharIconv(const char *pszSource,
397 : const char *pszSrcEncoding,
398 : const char *pszDstEncoding)
399 :
400 : {
401 0 : if (strcmp(pszDstEncoding, "WCHAR_T") != 0)
402 : {
403 0 : CPLError(CE_Failure, CPLE_AppDefined,
404 : "Stub recoding implementation does not support "
405 : "CPLRecodeToWCharIconv(...,%s,%s)",
406 : pszSrcEncoding, pszDstEncoding);
407 0 : return nullptr;
408 : }
409 :
410 : // Using double static_cast<> makes CodeQL cpp/incorrect-string-type-conversion
411 : // check happy...
412 : return static_cast<wchar_t *>(static_cast<void *>(
413 0 : CPLRecodeIconv(pszSource, pszSrcEncoding, pszDstEncoding)));
414 : }
415 :
416 : #endif /* CPL_RECODE_ICONV */
|