Line data Source code
1 : /**********************************************************************
2 : *
3 : * Name: cpl_recode_stub.cpp
4 : * Project: CPL - Common Portability Library
5 : * Purpose: Character set recoding and char/wchar_t conversions, stub
6 : * implementation to be used if iconv() functionality is not
7 : * available.
8 : * Author: Frank Warmerdam, warmerdam@pobox.com
9 : *
10 : * The bulk of this code is derived from the utf.c module from FLTK. It
11 : * was originally downloaded from:
12 : * http://svn.easysw.com/public/fltk/fltk/trunk/src/utf.c
13 : *
14 : **********************************************************************
15 : * Copyright (c) 2008, Frank Warmerdam
16 : * Copyright 2006 by Bill Spitzak and others.
17 : * Copyright (c) 2009-2014, Even Rouault <even dot rouault at spatialys.com>
18 : *
19 : * Permission to use, copy, modify, and distribute this software for any
20 : * purpose with or without fee is hereby granted, provided that the above
21 : * copyright notice and this permission notice appear in all copies.
22 : *
23 : * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
24 : * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
25 : * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
26 : * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
27 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
28 : * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
29 : * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
30 : **********************************************************************/
31 :
32 : #include "cpl_port.h"
33 : #include "cpl_string.h"
34 :
35 : #include <cstring>
36 :
37 : #include "cpl_conv.h"
38 : #include "cpl_error.h"
39 : #include "cpl_character_sets.c"
40 :
41 : static unsigned utf8decode(const char *p, const char *end, int *len);
42 : static unsigned utf8towc(const char *src, unsigned srclen, wchar_t *dst,
43 : unsigned dstlen);
44 : static unsigned utf8toa(const char *src, unsigned srclen, char *dst,
45 : unsigned dstlen);
46 : static unsigned utf8fromwc(char *dst, unsigned dstlen, const wchar_t *src,
47 : unsigned srclen);
48 : static unsigned utf8froma(char *dst, unsigned dstlen, const char *src,
49 : unsigned srclen);
50 : static int utf8test(const char *src, unsigned srclen);
51 :
52 : #ifdef _WIN32
53 :
54 : #include <windows.h>
55 : #include <winnls.h>
56 :
57 : static char *CPLWin32Recode(const char *src, unsigned src_code_page,
58 : unsigned dst_code_page) CPL_RETURNS_NONNULL;
59 : #endif
60 :
61 : /* used by cpl_recode.cpp */
62 : extern void CPLClearRecodeStubWarningFlags();
63 : extern char *CPLRecodeStub(const char *, const char *,
64 : const char *) CPL_RETURNS_NONNULL;
65 : extern char *CPLRecodeFromWCharStub(const wchar_t *, const char *,
66 : const char *);
67 : extern wchar_t *CPLRecodeToWCharStub(const char *, const char *, const char *);
68 :
69 : /************************************************************************/
70 : /* ==================================================================== */
71 : /* Stub Implementation not depending on iconv() or WIN32 API. */
72 : /* ==================================================================== */
73 : /************************************************************************/
74 :
75 : static bool bHaveWarned1 = false;
76 : static bool bHaveWarned2 = false;
77 : static bool bHaveWarned3 = false;
78 : static bool bHaveWarned4 = false;
79 : static bool bHaveWarned5 = false;
80 : static bool bHaveWarned6 = false;
81 :
82 : /************************************************************************/
83 : /* CPLClearRecodeStubWarningFlags() */
84 : /************************************************************************/
85 :
86 10404 : void CPLClearRecodeStubWarningFlags()
87 : {
88 10404 : bHaveWarned1 = false;
89 10404 : bHaveWarned2 = false;
90 10404 : bHaveWarned3 = false;
91 10404 : bHaveWarned4 = false;
92 10404 : bHaveWarned5 = false;
93 10404 : bHaveWarned6 = false;
94 10404 : }
95 :
96 : /************************************************************************/
97 : /* CPLRecodeStub() */
98 : /************************************************************************/
99 :
100 : /**
101 : * Convert a string from a source encoding to a destination encoding.
102 : *
103 : * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
104 : * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
105 : * <ul>
106 : * <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in
107 : * fact)</li>
108 : * <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
109 : * <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
110 : * </ul>
111 : *
112 : * If an error occurs an error may, or may not be posted with CPLError().
113 : *
114 : * @param pszSource a NULL terminated string.
115 : * @param pszSrcEncoding the source encoding.
116 : * @param pszDstEncoding the destination encoding.
117 : *
118 : * @return a NULL terminated string which should be freed with CPLFree().
119 : */
120 :
121 782866 : char *CPLRecodeStub(const char *pszSource, const char *pszSrcEncoding,
122 : const char *pszDstEncoding)
123 :
124 : {
125 : /* -------------------------------------------------------------------- */
126 : /* If the source or destination is current locale(), we change */
127 : /* it to ISO8859-1 since our stub implementation does not */
128 : /* attempt to address locales properly. */
129 : /* -------------------------------------------------------------------- */
130 :
131 782866 : if (pszSrcEncoding[0] == '\0')
132 0 : pszSrcEncoding = CPL_ENC_ISO8859_1;
133 :
134 782866 : if (pszDstEncoding[0] == '\0')
135 0 : pszDstEncoding = CPL_ENC_ISO8859_1;
136 :
137 : /* -------------------------------------------------------------------- */
138 : /* ISO8859 to UTF8 */
139 : /* -------------------------------------------------------------------- */
140 782866 : if (strcmp(pszSrcEncoding, CPL_ENC_ISO8859_1) == 0 &&
141 712490 : strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
142 : {
143 712490 : const int nCharCount = static_cast<int>(strlen(pszSource));
144 712490 : char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1));
145 :
146 712490 : utf8froma(pszResult, nCharCount * 2 + 1, pszSource, nCharCount);
147 :
148 712490 : return pszResult;
149 : }
150 :
151 : /* -------------------------------------------------------------------- */
152 : /* UTF8 to ISO8859 */
153 : /* -------------------------------------------------------------------- */
154 70376 : if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0 &&
155 49356 : strcmp(pszDstEncoding, CPL_ENC_ISO8859_1) == 0)
156 : {
157 49356 : int nCharCount = static_cast<int>(strlen(pszSource));
158 49356 : char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount + 1));
159 :
160 49356 : utf8toa(pszSource, nCharCount, pszResult, nCharCount + 1);
161 :
162 49356 : return pszResult;
163 : }
164 :
165 : // A few hard coded CPxxx/ISO-8859-x to UTF-8 tables
166 21020 : if (EQUAL(pszDstEncoding, CPL_ENC_UTF8))
167 : {
168 21020 : const auto pConvTable = CPLGetConversionTableToUTF8(pszSrcEncoding);
169 21020 : if (pConvTable)
170 : {
171 21020 : const auto convTable = *pConvTable;
172 21020 : const size_t nCharCount = strlen(pszSource);
173 : char *pszResult =
174 21020 : static_cast<char *>(CPLCalloc(1, nCharCount * 3 + 1));
175 21020 : size_t iDst = 0;
176 21020 : unsigned char *pabyResult =
177 : reinterpret_cast<unsigned char *>(pszResult);
178 538745 : for (size_t i = 0; i < nCharCount; ++i)
179 : {
180 517725 : const unsigned char nChar =
181 517725 : static_cast<unsigned char>(pszSource[i]);
182 517725 : if (nChar <= 127)
183 : {
184 484687 : pszResult[iDst] = pszSource[i];
185 484687 : ++iDst;
186 : }
187 : else
188 : {
189 33038 : const unsigned char nShiftedChar = nChar - 128;
190 33038 : if (convTable[nShiftedChar][0])
191 : {
192 33037 : pabyResult[iDst] = convTable[nShiftedChar][0];
193 33037 : ++iDst;
194 33037 : CPLAssert(convTable[nShiftedChar][1]);
195 33037 : pabyResult[iDst] = convTable[nShiftedChar][1];
196 33037 : ++iDst;
197 33037 : if (convTable[nShiftedChar][2])
198 : {
199 3 : pabyResult[iDst] = convTable[nShiftedChar][2];
200 3 : ++iDst;
201 : }
202 : }
203 : else
204 : {
205 : // Skip the invalid sequence in the input string.
206 1 : if (!bHaveWarned2)
207 : {
208 1 : bHaveWarned2 = true;
209 1 : CPLError(CE_Warning, CPLE_AppDefined,
210 : "One or several characters couldn't be "
211 : "converted correctly from %s to %s. "
212 : "This warning will not be emitted anymore",
213 : pszSrcEncoding, pszDstEncoding);
214 : }
215 : }
216 : }
217 : }
218 :
219 21020 : pszResult[iDst] = 0;
220 21020 : return pszResult;
221 : }
222 : }
223 :
224 : #ifdef _WIN32
225 : const auto MapEncodingToWindowsCodePage = [](const char *pszEncoding)
226 : {
227 : // Cf https://learn.microsoft.com/fr-fr/windows/win32/intl/code-page-identifiers
228 : if (STARTS_WITH(pszEncoding, "CP"))
229 : {
230 : const int nCode = atoi(pszEncoding + strlen("CP"));
231 : if (nCode > 0)
232 : return nCode;
233 : else if (EQUAL(pszEncoding, "CP_OEMCP"))
234 : return CP_OEMCP;
235 : else if (EQUAL(pszEncoding, "CP_ACP"))
236 : return CP_ACP;
237 : }
238 : else if (STARTS_WITH(pszEncoding, "WINDOWS-"))
239 : {
240 : const int nCode = atoi(pszEncoding + strlen("WINDOWS-"));
241 : if (nCode > 0)
242 : return nCode;
243 : }
244 : else if (STARTS_WITH(pszEncoding, "ISO-8859-"))
245 : {
246 : const int nCode = atoi(pszEncoding + strlen("ISO-8859-"));
247 : if ((nCode >= 1 && nCode <= 9) || nCode == 13 || nCode == 15)
248 : return 28590 + nCode;
249 : }
250 :
251 : // Return a negative value, since CP_ACP = 0
252 : return -1;
253 : };
254 :
255 : /* ---------------------------------------------------------------------*/
256 : /* XXX to UTF8 */
257 : /* ---------------------------------------------------------------------*/
258 : if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
259 : {
260 : const int nCode = MapEncodingToWindowsCodePage(pszSrcEncoding);
261 : if (nCode >= 0)
262 : {
263 : return CPLWin32Recode(pszSource, nCode, CP_UTF8);
264 : }
265 : }
266 :
267 : /* ---------------------------------------------------------------------*/
268 : /* UTF8 to XXX */
269 : /* ---------------------------------------------------------------------*/
270 : if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0)
271 : {
272 : const int nCode = MapEncodingToWindowsCodePage(pszDstEncoding);
273 : if (nCode >= 0)
274 : {
275 : return CPLWin32Recode(pszSource, CP_UTF8, nCode);
276 : }
277 : }
278 : #endif
279 :
280 : /* -------------------------------------------------------------------- */
281 : /* Anything else to UTF-8 is treated as ISO8859-1 to UTF-8 with */
282 : /* a one-time warning. */
283 : /* -------------------------------------------------------------------- */
284 0 : if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
285 : {
286 0 : const int nCharCount = static_cast<int>(strlen(pszSource));
287 0 : char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1));
288 :
289 0 : if (!bHaveWarned1)
290 : {
291 0 : bHaveWarned1 = true;
292 0 : CPLError(CE_Warning, CPLE_AppDefined,
293 : "Recode from %s to UTF-8 not supported, "
294 : "treated as ISO-8859-1 to UTF-8.",
295 : pszSrcEncoding);
296 : }
297 :
298 0 : utf8froma(pszResult, nCharCount * 2 + 1, pszSource, nCharCount);
299 :
300 0 : return pszResult;
301 : }
302 :
303 : /* -------------------------------------------------------------------- */
304 : /* Everything else is treated as a no-op with a warning. */
305 : /* -------------------------------------------------------------------- */
306 : {
307 0 : if (!bHaveWarned3)
308 : {
309 0 : bHaveWarned3 = true;
310 0 : CPLError(CE_Warning, CPLE_AppDefined,
311 : "Recode from %s to %s not supported, no change applied.",
312 : pszSrcEncoding, pszDstEncoding);
313 : }
314 :
315 0 : return CPLStrdup(pszSource);
316 : }
317 : }
318 :
319 : /************************************************************************/
320 : /* CPLRecodeFromWCharStub() */
321 : /************************************************************************/
322 :
323 : /**
324 : * Convert wchar_t string to UTF-8.
325 : *
326 : * Convert a wchar_t string into a multibyte utf-8 string. The only
327 : * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
328 : * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
329 : * and CPL_ENC_ISO8859_1. In some cases (i.e. using iconv()) other encodings
330 : * may also be supported.
331 : *
332 : * Note that the wchar_t type varies in size on different systems. On
333 : * win32 it is normally 2 bytes, and on unix 4 bytes.
334 : *
335 : * If an error occurs an error may, or may not be posted with CPLError().
336 : *
337 : * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
338 : * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
339 : * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
340 : *
341 : * @return a zero terminated multi-byte string which should be freed with
342 : * CPLFree(), or NULL if an error occurs.
343 : */
344 :
345 111144 : char *CPLRecodeFromWCharStub(const wchar_t *pwszSource,
346 : const char *pszSrcEncoding,
347 : const char *pszDstEncoding)
348 :
349 : {
350 : /* -------------------------------------------------------------------- */
351 : /* We try to avoid changes of character set. We are just */
352 : /* providing for unicode to unicode. */
353 : /* -------------------------------------------------------------------- */
354 111144 : if (strcmp(pszSrcEncoding, "WCHAR_T") != 0 &&
355 109785 : strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0 &&
356 109785 : strcmp(pszSrcEncoding, CPL_ENC_UTF16) != 0 &&
357 109785 : strcmp(pszSrcEncoding, CPL_ENC_UCS2) != 0 &&
358 0 : strcmp(pszSrcEncoding, CPL_ENC_UCS4) != 0)
359 : {
360 0 : CPLError(CE_Failure, CPLE_AppDefined,
361 : "Stub recoding implementation does not support "
362 : "CPLRecodeFromWCharStub(...,%s,%s)",
363 : pszSrcEncoding, pszDstEncoding);
364 0 : return nullptr;
365 : }
366 :
367 : /* -------------------------------------------------------------------- */
368 : /* What is the source length. */
369 : /* -------------------------------------------------------------------- */
370 111144 : int nSrcLen = 0;
371 :
372 1613350 : while (pwszSource[nSrcLen] != 0)
373 1502200 : nSrcLen++;
374 :
375 : /* -------------------------------------------------------------------- */
376 : /* Allocate destination buffer plenty big. */
377 : /* -------------------------------------------------------------------- */
378 111144 : const int nDstBufSize = nSrcLen * 4 + 1;
379 : // Nearly worst case.
380 111144 : char *pszResult = static_cast<char *>(CPLMalloc(nDstBufSize));
381 :
382 111144 : if (nSrcLen == 0)
383 : {
384 46286 : pszResult[0] = '\0';
385 46286 : return pszResult;
386 : }
387 :
388 : /* -------------------------------------------------------------------- */
389 : /* Convert, and confirm we had enough space. */
390 : /* -------------------------------------------------------------------- */
391 64858 : const int nDstLen = utf8fromwc(pszResult, nDstBufSize, pwszSource, nSrcLen);
392 64858 : if (nDstLen >= nDstBufSize)
393 : {
394 0 : CPLAssert(false); // too small!
395 : return nullptr;
396 : }
397 :
398 : /* -------------------------------------------------------------------- */
399 : /* If something other than UTF-8 was requested, recode now. */
400 : /* -------------------------------------------------------------------- */
401 64858 : if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
402 64858 : return pszResult;
403 :
404 : char *pszFinalResult =
405 0 : CPLRecodeStub(pszResult, CPL_ENC_UTF8, pszDstEncoding);
406 :
407 0 : CPLFree(pszResult);
408 :
409 0 : return pszFinalResult;
410 : }
411 :
412 : /************************************************************************/
413 : /* CPLRecodeToWCharStub() */
414 : /************************************************************************/
415 :
416 : /**
417 : * Convert UTF-8 string to a wchar_t string.
418 : *
419 : * Convert a 8bit, multi-byte per character input string into a wide
420 : * character (wchar_t) string. The only guaranteed supported source encodings
420 : * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1 (LATIN1). The only
422 : * guaranteed supported destination encoding is CPL_ENC_UCS2. Other source
423 : * and destination encodings may be supported depending on the underlying
424 : * implementation.
425 : *
426 : * Note that the wchar_t type varies in size on different systems. On
427 : * win32 it is normally 2 bytes, and on unix 4 bytes.
428 : *
429 : * If an error occurs an error may, or may not be posted with CPLError().
430 : *
431 : * @param pszSource input multi-byte character string.
432 : * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
433 : * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
434 : *
435 : * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
436 : * NULL on error.
437 : *
438 : * @since GDAL 1.6.0
439 : */
440 :
441 52345 : wchar_t *CPLRecodeToWCharStub(const char *pszSource, const char *pszSrcEncoding,
442 : const char *pszDstEncoding)
443 :
444 : {
445 52345 : char *pszUTF8Source = const_cast<char *>(pszSource);
446 :
447 52345 : if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0 &&
448 0 : strcmp(pszSrcEncoding, CPL_ENC_ASCII) != 0)
449 : {
450 0 : pszUTF8Source = CPLRecodeStub(pszSource, pszSrcEncoding, CPL_ENC_UTF8);
451 0 : if (pszUTF8Source == nullptr)
452 0 : return nullptr;
453 : }
454 :
455 : /* -------------------------------------------------------------------- */
456 : /* We try to avoid changes of character set. We are just */
457 : /* providing for unicode to unicode. */
458 : /* -------------------------------------------------------------------- */
459 52345 : if (strcmp(pszDstEncoding, "WCHAR_T") != 0 &&
460 52345 : strcmp(pszDstEncoding, CPL_ENC_UCS2) != 0 &&
461 0 : strcmp(pszDstEncoding, CPL_ENC_UCS4) != 0 &&
462 0 : strcmp(pszDstEncoding, CPL_ENC_UTF16) != 0)
463 : {
464 0 : CPLError(CE_Failure, CPLE_AppDefined,
465 : "Stub recoding implementation does not support "
466 : "CPLRecodeToWCharStub(...,%s,%s)",
467 : pszSrcEncoding, pszDstEncoding);
468 0 : if (pszUTF8Source != pszSource)
469 0 : CPLFree(pszUTF8Source);
470 0 : return nullptr;
471 : }
472 :
473 : /* -------------------------------------------------------------------- */
474 : /* Do the UTF-8 to UCS-2 recoding. */
475 : /* -------------------------------------------------------------------- */
476 52345 : int nSrcLen = static_cast<int>(strlen(pszUTF8Source));
477 : wchar_t *pwszResult =
478 52345 : static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), nSrcLen + 1));
479 :
480 52345 : utf8towc(pszUTF8Source, nSrcLen, pwszResult, nSrcLen + 1);
481 :
482 52345 : if (pszUTF8Source != pszSource)
483 0 : CPLFree(pszUTF8Source);
484 :
485 52345 : return pwszResult;
486 : }
487 :
488 : /************************************************************************/
489 : /* CPLIsUTF8() */
490 : /************************************************************************/
491 :
492 : /**
493 : * Test if a string is encoded as UTF-8.
494 : *
495 : * @param pabyData input string to test
496 : * @param nLen length of the input string, or -1 if the function must compute
497 : * the string length. In which case it must be null terminated.
498 : * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
499 : *
500 : * @since GDAL 1.7.0
501 : */
502 18353 : int CPLIsUTF8(const char *pabyData, int nLen)
503 : {
504 18353 : if (nLen < 0)
505 13752 : nLen = static_cast<int>(strlen(pabyData));
506 18353 : return utf8test(pabyData, static_cast<unsigned>(nLen)) != 0;
507 : }
508 :
509 : /************************************************************************/
510 : /* ==================================================================== */
511 : /* UTF.C code from FLTK with some modifications. */
512 : /* ==================================================================== */
513 : /************************************************************************/
514 :
515 : /* Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero
516 : they are instead turned into the Unicode REPLACEMENT CHARACTER, of
517 : value 0xfffd.
518 : If this is on utf8decode will correctly map most (perhaps all)
519 : human-readable text that is in ISO-8859-1. This may allow you
520 : to completely ignore character sets in your code because virtually
521 : everything is either ISO-8859-1 or UTF-8.
522 : */
523 : #define ERRORS_TO_ISO8859_1 1
524 :
525 : /* Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
526 : Unicode index for Microsoft's CP1252 character set. You should
527 : also set ERRORS_TO_ISO8859_1. With this a huge amount of more
528 : available text (such as all web pages) are correctly converted
529 : to Unicode.
530 : */
531 : #define ERRORS_TO_CP1252 1
532 :
533 : /* A number of Unicode code points are in fact illegal and should not
534 : be produced by a UTF-8 converter. Turning this on will replace the
535 : bytes in those encodings with errors. If you do this then converting
536 : arbitrary 16-bit data to UTF-8 and then back is not an identity,
537 : which will probably break a lot of software.
538 : */
539 : #define STRICT_RFC3629 0
540 :
541 : #if ERRORS_TO_CP1252
542 : // Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
543 : // to Unicode:
544 : constexpr unsigned short cp1252[32] = {
545 : 0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
546 : 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
547 : 0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
548 : 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178};
549 : #endif
550 :
551 : /************************************************************************/
552 : /* utf8decode() */
553 : /************************************************************************/
554 :
555 : /*
556 : Decode a single UTF-8 encoded character starting at \e p. The
557 : resulting Unicode value (in the range 0-0x10ffff) is returned,
558 : and \e len is set to the number of bytes in the UTF-8 encoding
559 : (adding \e len to \e p will point at the next character).
560 :
561 : If \a p points at an illegal UTF-8 encoding, including one that
562 : would go past \e end, or where a code uses more bytes than
563 : necessary, then *reinterpret_cast<const unsigned char*>(p) is translated as
564 : though it is in the Microsoft CP1252 character set and \e len is set to 1.
565 : Treating errors this way allows this to decode almost any
566 : ISO-8859-1 or CP1252 text that has been mistakenly placed where
567 : UTF-8 is expected, and has proven very useful.
568 :
569 : If you want errors to be converted to error characters (as the
570 : standards recommend), adding a test to see if the length is
571 : unexpectedly 1 will work:
572 :
573 : \code
574 : if( *p & 0x80 )
575 : { // What should be a multibyte encoding.
576 : code = utf8decode(p, end, &len);
577 : if( len<2 ) code = 0xFFFD; // Turn errors into REPLACEMENT CHARACTER.
578 : }
579 : else
580 : { // Handle the 1-byte utf8 encoding:
581 : code = *p;
582 : len = 1;
583 : }
584 : \endcode
585 :
586 : Direct testing for the 1-byte case (as shown above) will also
587 : speed up the scanning of strings where the majority of characters
588 : are ASCII.
589 : */
static unsigned utf8decode(const char *p, const char *end, int *len)
{
    const unsigned char *pby = reinterpret_cast<const unsigned char *>(p);
    const unsigned char byLead = pby[0];

    // Value reported for any malformed sequence.
#if ERRORS_TO_ISO8859_1
    const unsigned nOnError = byLead;
#else
    const unsigned nOnError = 0xfffd;  // Unicode REPLACEMENT CHARACTER
#endif

    // Every failure path, and the single-byte paths, consume exactly one
    // byte. Successful multi-byte paths overwrite this below.
    *len = 1;

    // Plain ASCII.
    if (byLead < 0x80)
        return byLead;

#if ERRORS_TO_CP1252
    // 0x80..0x9f can never start a sequence: map through the CP1252 table.
    if (byLead < 0xa0)
        return cp1252[byLead - 0x80];
#endif

    // Remaining bytes below 0xc2 are stray continuation bytes or
    // overlong 2-byte lead bytes.
    if (byLead < 0xc2)
        return nOnError;

    // Every remaining form needs at least one continuation byte.
    if (p + 1 >= end || (pby[1] & 0xc0) != 0x80)
        return nOnError;

    // 2-byte form.
    if (byLead < 0xe0)
    {
        *len = 2;
        return ((byLead & 0x1f) << 6) + ((pby[1] & 0x3f));
    }

    // 3-byte form.
    if (byLead < 0xf0)
    {
        // Lead 0xe0 with a second byte below 0xa0 is an overlong encoding.
        if (byLead == 0xe0 && pby[1] < 0xa0)
            return nOnError;
#if STRICT_RFC3629
        // RFC 3629 says surrogate chars are illegal.
        if (byLead == 0xed && pby[1] >= 0xa0)
            return nOnError;
        // 0xfffe and 0xffff are also illegal characters.
        if (byLead == 0xef && pby[1] == 0xbf && pby[2] >= 0xbe)
            return nOnError;
#endif
        if (p + 2 >= end || (pby[2] & 0xc0) != 0x80)
            return nOnError;
        *len = 3;
        return ((byLead & 0x0f) << 12) + ((pby[1] & 0x3f) << 6) +
               ((pby[2] & 0x3f));
    }

    // Lead bytes above 0xf4 are never valid.
    if (byLead > 0xf4)
        return nOnError;

    // 4-byte form: reject overlong encodings (lead 0xf0) and values after
    // 0x10ffff (lead 0xf4).
    if (byLead == 0xf0 && pby[1] < 0x90)
        return nOnError;
    if (byLead == 0xf4 && pby[1] > 0x8f)
        return nOnError;

    if (p + 3 >= end || (pby[2] & 0xc0) != 0x80 || (pby[3] & 0xc0) != 0x80)
        return nOnError;
#if STRICT_RFC3629
    // RFC 3629 says all codes ending in fffe or ffff are illegal:
    if ((pby[1] & 0xf) == 0xf && pby[2] == 0xbf && pby[3] >= 0xbe)
        return nOnError;
#endif
    *len = 4;
    return ((byLead & 0x07) << 18) + ((pby[1] & 0x3f) << 12) +
           ((pby[2] & 0x3f) << 6) + ((pby[3] & 0x3f));
}
686 :
687 : /************************************************************************/
688 : /* utf8towc() */
689 : /************************************************************************/
690 :
691 : /* Convert a UTF-8 sequence into an array of wchar_t. These
692 : are used by some system calls, especially on Windows.
693 :
694 : \a src points at the UTF-8, and \a srclen is the number of bytes to
695 : convert.
696 :
697 : \a dst points at an array to write, and \a dstlen is the number of
698 : locations in this array. At most \a dstlen-1 words will be
699 : written there, plus a 0 terminating word. Thus this function
700 : will never overwrite the buffer and will always return a
701 : zero-terminated string. If \a dstlen is zero then \a dst can be
702 : null and no data is written, but the length is returned.
703 :
704 : The return value is the number of words that \e would be written
705 : to \a dst if it were long enough, not counting the terminating
706 : zero. If the return value is greater or equal to \a dstlen it
707 : indicates truncation, you can then allocate a new array of size
708 : return+1 and call this again.
709 :
710 : Errors in the UTF-8 are converted as though each byte in the
711 : erroneous string is in the Microsoft CP1252 encoding. This allows
712 : ISO-8859-1 text mistakenly identified as UTF-8 to be printed
713 : correctly.
714 :
715 : Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
716 : and most other systems. Where wchar_t is 16 bits, Unicode
717 : characters in the range 0x10000 to 0x10ffff are converted to
718 : "surrogate pairs" which take two words each (this is called UTF-16
719 : encoding). If wchar_t is 32 bits this rather nasty problem is
720 : avoided.
721 : */
722 52345 : static unsigned utf8towc(const char *src, unsigned srclen, wchar_t *dst,
723 : unsigned dstlen)
724 : {
725 52345 : const char *p = src;
726 52345 : const char *e = src + srclen;
727 52345 : unsigned count = 0;
728 52345 : if (dstlen)
729 : while (true)
730 : {
731 383923 : if (p >= e)
732 : {
733 52345 : dst[count] = 0;
734 52345 : return count;
735 : }
736 331578 : if (!(*p & 0x80))
737 : {
738 : // ASCII
739 330160 : dst[count] = *p++;
740 : }
741 : else
742 : {
743 1418 : int len = 0;
744 1418 : unsigned ucs = utf8decode(p, e, &len);
745 1418 : p += len;
746 : #ifdef _WIN32
747 : if (ucs < 0x10000)
748 : {
749 : dst[count] = static_cast<wchar_t>(ucs);
750 : }
751 : else
752 : {
753 : // Make a surrogate pair:
754 : if (count + 2 >= dstlen)
755 : {
756 : dst[count] = 0;
757 : count += 2;
758 : break;
759 : }
760 : dst[count] = static_cast<wchar_t>(
761 : (((ucs - 0x10000u) >> 10) & 0x3ff) | 0xd800);
762 : dst[++count] = static_cast<wchar_t>((ucs & 0x3ff) | 0xdc00);
763 : }
764 : #else
765 1418 : dst[count] = static_cast<wchar_t>(ucs);
766 : #endif
767 : }
768 331578 : if (++count == dstlen)
769 : {
770 0 : dst[count - 1] = 0;
771 0 : break;
772 : }
773 331578 : }
774 : // We filled dst, measure the rest:
775 0 : while (p < e)
776 : {
777 0 : if (!(*p & 0x80))
778 : {
779 0 : p++;
780 : }
781 : else
782 : {
783 0 : int len = 0;
784 : #ifdef _WIN32
785 : const unsigned ucs = utf8decode(p, e, &len);
786 : p += len;
787 : if (ucs >= 0x10000)
788 : ++count;
789 : #else
790 0 : utf8decode(p, e, &len);
791 0 : p += len;
792 : #endif
793 : }
794 0 : ++count;
795 : }
796 :
797 0 : return count;
798 : }
799 :
800 : /************************************************************************/
801 : /* utf8toa() */
802 : /************************************************************************/
803 : /* Convert a UTF-8 sequence into an array of 1-byte characters.
804 :
805 : If the UTF-8 decodes to a character greater than 0xff then it is
806 : replaced with '?'.
807 :
808 : Errors in the UTF-8 are converted as individual bytes, same as
809 : utf8decode() does. This allows ISO-8859-1 text mistakenly identified
810 : as UTF-8 to be printed correctly (and possibly CP1252 on Windows).
811 :
812 : \a src points at the UTF-8, and \a srclen is the number of bytes to
813 : convert.
814 :
815 : Up to \a dstlen bytes are written to \a dst, including a null
816 : terminator. The return value is the number of bytes that would be
817 : written, not counting the null terminator. If greater or equal to
818 : \a dstlen then if you malloc a new array of size n+1 you will have
819 : the space needed for the entire string. If \a dstlen is zero then
820 : nothing is written and this call just measures the storage space
821 : needed.
822 : */
823 49356 : static unsigned int utf8toa(const char *src, unsigned srclen, char *dst,
824 : unsigned dstlen)
825 : {
826 49356 : const char *p = src;
827 49356 : const char *e = src + srclen;
828 49356 : unsigned int count = 0;
829 49356 : if (dstlen)
830 : while (true)
831 : {
832 188587 : if (p >= e)
833 : {
834 49356 : dst[count] = 0;
835 49356 : return count;
836 : }
837 139231 : unsigned char c = *reinterpret_cast<const unsigned char *>(p);
838 139231 : if (c < 0xC2)
839 : {
840 : // ASCII or bad code.
841 138327 : dst[count] = c;
842 138327 : p++;
843 : }
844 : else
845 : {
846 904 : int len = 0;
847 904 : const unsigned int ucs = utf8decode(p, e, &len);
848 904 : p += len;
849 904 : if (ucs < 0x100)
850 : {
851 900 : dst[count] = static_cast<char>(ucs);
852 : }
853 : else
854 : {
855 4 : if (!bHaveWarned4)
856 : {
857 2 : bHaveWarned4 = true;
858 2 : CPLError(
859 : CE_Warning, CPLE_AppDefined,
860 : "One or several characters couldn't be converted "
861 : "correctly from UTF-8 to ISO-8859-1. "
862 : "This warning will not be emitted anymore.");
863 : }
864 4 : dst[count] = '?';
865 : }
866 : }
867 139231 : if (++count >= dstlen)
868 : {
869 0 : dst[count - 1] = 0;
870 0 : break;
871 : }
872 139231 : }
873 : // We filled dst, measure the rest:
874 0 : while (p < e)
875 : {
876 0 : if (!(*p & 0x80))
877 : {
878 0 : p++;
879 : }
880 : else
881 : {
882 0 : int len = 0;
883 0 : utf8decode(p, e, &len);
884 0 : p += len;
885 : }
886 0 : ++count;
887 : }
888 0 : return count;
889 : }
890 :
891 : /************************************************************************/
892 : /* utf8fromwc() */
893 : /************************************************************************/
894 : /* Turn "wide characters" as returned by some system calls
895 : (especially on Windows) into UTF-8.
896 :
897 : Up to \a dstlen bytes are written to \a dst, including a null
898 : terminator. The return value is the number of bytes that would be
899 : written, not counting the null terminator. If greater or equal to
900 : \a dstlen then if you malloc a new array of size n+1 you will have
901 : the space needed for the entire string. If \a dstlen is zero then
902 : nothing is written and this call just measures the storage space
903 : needed.
904 :
905 : \a srclen is the number of words in \a src to convert. On Windows
906 : this is not necessarily the number of characters, due to there
907 : possibly being "surrogate pairs" in the UTF-16 encoding used.
908 : On Unix wchar_t is 32 bits and each location is a character.
909 :
910 : On Unix if a src word is greater than 0x10ffff then this is an
911 : illegal character according to RFC 3629. These are converted as
912 : though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
913 : range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
914 : illegal according to RFC 3629. However I encode these as though
915 : they are legal, so that utf8towc will return the original data.
916 :
917 : On Windows "surrogate pairs" are converted to a single character
918 : and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
919 : pairs are converted as though they are individual characters.
920 : */
static unsigned int utf8fromwc(char *dst, unsigned dstlen, const wchar_t *src,
                               unsigned srclen)
{
    // The function works in two passes over src:
    //  1. a conversion pass that writes UTF-8 into dst until either the input
    //     is exhausted (-> terminate and return) or dst cannot hold the next
    //     character plus the terminator (-> terminate and fall through);
    //  2. a measuring pass that only adds up the size of what is left.
    // Both passes normalise each input word into a code point in exactly the
    // same way before sizing it, so the measured length always agrees with
    // what the conversion pass would have produced.
    unsigned int i = 0;      // Index of the next unread word in src.
    unsigned int count = 0;  // Bytes produced (or that would be produced).

    if (dstlen)
    {
        while (true)
        {
            if (i >= srclen)
            {
                dst[count] = 0;
                return count;
            }

            unsigned int ucs = src[i++];
#ifdef _WIN32
            // UTF-16: a high surrogate immediately followed by a low
            // surrogate forms one code point >= 0x10000. Unpaired halves are
            // encoded individually as 3-byte sequences.
            if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
                src[i] >= 0xdc00 && src[i] <= 0xdfff)
            {
                const unsigned int ucs2 = src[i++];
                ucs = 0x10000U + ((ucs & 0x3ff) << 10) + (ucs2 & 0x3ff);
            }
#else
            // Values above 0x10ffff are replaced by U+FFFD
            // (REPLACEMENT CHARACTER).
            if (ucs > 0x10ffff)
                ucs = 0xfffd;
#endif

            if (ucs < 0x80U)
            {
                dst[count++] = static_cast<char>(ucs);
                if (count >= dstlen)
                {
                    // The byte just stored took the terminator's slot:
                    // overwrite it but keep it counted.
                    dst[count - 1] = 0;
                    break;
                }
                continue;
            }

            const unsigned int nBytes =
                ucs < 0x800U ? 2U : (ucs < 0x10000U ? 3U : 4U);
            if (count + nBytes >= dstlen)
            {
                // Sequence plus terminator does not fit: stop writing here.
                dst[count] = 0;
                count += nBytes;
                break;
            }

            // Lead byte, then the middle continuation bytes (if any).
            if (nBytes == 2U)
            {
                dst[count++] = static_cast<char>(0xc0 | (ucs >> 6));
            }
            else if (nBytes == 3U)
            {
                dst[count++] = static_cast<char>(0xe0 | (ucs >> 12));
                dst[count++] = static_cast<char>(0x80 | ((ucs >> 6) & 0x3F));
            }
            else
            {
                dst[count++] = static_cast<char>(0xf0 | (ucs >> 18));
                dst[count++] = static_cast<char>(0x80 | ((ucs >> 12) & 0x3F));
                dst[count++] = static_cast<char>(0x80 | ((ucs >> 6) & 0x3F));
            }
            // Every multi-byte sequence ends with the low six bits.
            dst[count++] = static_cast<char>(0x80 | (ucs & 0x3F));
        }
    }

    // We filled dst (or were asked to measure only): size up the rest.
    while (i < srclen)
    {
        unsigned int ucs = src[i++];
#ifdef _WIN32
        // Same pairing rule as in the conversion pass: i already points at
        // the word following the high surrogate.
        if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
            src[i] >= 0xdc00 && src[i] <= 0xdfff)
        {
            ++i;
            ucs = 0x10000U;  // Any value in the 4-byte range will do here.
        }
#else
        if (ucs > 0x10ffff)
            ucs = 0xfffd;
#endif
        if (ucs < 0x80U)
            count += 1;
        else if (ucs < 0x800U)
            count += 2;
        else if (ucs < 0x10000U)
            count += 3;
        else
            count += 4;
    }
    return count;
}
1036 :
1037 : /************************************************************************/
1038 : /* utf8froma() */
1039 : /************************************************************************/
1040 :
1041 : /* Convert an ISO-8859-1 (i.e. normal c-string) byte stream to UTF-8.
1042 :
1043 : It is possible this should convert Microsoft's CP1252 to UTF-8
1044 : instead. This would translate the codes in the range 0x80-0x9f
1045 : to different characters. Currently it does not do this.
1046 :
1047 : Up to \a dstlen bytes are written to \a dst, including a null
1048 : terminator. The return value is the number of bytes that would be
1049 : written, not counting the null terminator. If greater or equal to
1050 : \a dstlen then if you malloc a new array of size n+1 you will have
1051 : the space needed for the entire string. If \a dstlen is zero then
1052 : nothing is written and this call just measures the storage space
1053 : needed.
1054 :
1055 : \a srclen is the number of bytes in \a src to convert.
1056 :
1057 : If the return value equals \a srclen then this indicates that
1058 : no conversion is necessary, as only ASCII characters are in the
1059 : string.
1060 : */
static unsigned utf8froma(char *dst, unsigned dstlen, const char *src,
                          unsigned srclen)
{
    // Treat the input as raw unsigned bytes: each one is either ASCII
    // (copied verbatim) or a value in 0x80-0xFF (expanded to two bytes).
    const unsigned char *pabyCur =
        reinterpret_cast<const unsigned char *>(src);
    const unsigned char *const pabyEnd = pabyCur + srclen;
    unsigned nOut = 0;

    if (dstlen != 0)
    {
        for (;;)
        {
            if (pabyCur >= pabyEnd)
            {
                // Everything fitted: terminate and report the length.
                dst[nOut] = 0;
                return nOut;
            }

            const unsigned char byVal = *pabyCur++;
            if (byVal >= 0x80U)
            {
                // Needs two output bytes plus room for the terminator.
                if (nOut + 2 >= dstlen)
                {
                    dst[nOut] = 0;
                    nOut += 2;
                    break;
                }
                dst[nOut] = static_cast<char>(0xc0 | (byVal >> 6));
                dst[nOut + 1] = static_cast<char>(0x80 | (byVal & 0x3F));
                nOut += 2;
            }
            else
            {
                dst[nOut] = static_cast<char>(byVal);
                ++nOut;
                if (nOut >= dstlen)
                {
                    // The byte just stored took the terminator's slot:
                    // overwrite it but keep it counted.
                    dst[nOut - 1] = 0;
                    break;
                }
            }
        }
    }

    // dst is full (or measuring only): add up the size of the remainder.
    for (; pabyCur < pabyEnd; ++pabyCur)
    {
        nOut += (*pabyCur < 0x80U) ? 1U : 2U;
    }

    return nOut;
}
1117 :
1118 : #ifdef _WIN32
1119 :
1120 : /************************************************************************/
1121 : /* CPLWin32Recode() */
1122 : /************************************************************************/
1123 :
1124 : /* Convert a byte stream (i.e. normal c-string) encoded in one code page
1125 : to a byte stream (i.e. normal c-string) encoded in another code page.
1126 :
1127 : \a src is the source c-string byte stream (including a null terminator).
1128 : \a src_code_page is the code page of the source c-string.
1129 : \a dst_code_page is the code page of the destination c-string.
1130 :
1131 : UTF7 65000
1132 : UTF8 65001
1133 : OEM-US 437
1134 : OEM-ARABIC 720
1135 : OEM-GREEK 737
1136 : OEM-BALTIC 775
1137 : OEM-MLATIN1 850
1138 : OEM-LATIN2 852
1139 : OEM-CYRILLIC 855
1140 : OEM-TURKISH 857
1141 : OEM-MLATIN1P 858
1142 : OEM-HEBREW 862
1143 : OEM-RUSSIAN 866
1144 :
1145 : THAI 874
1146 : SJIS 932
1147 : GBK 936
1148 : KOREA 949
1149 : BIG5 950
1150 :
1151 : EUROPE 1250
1152 : CYRILLIC 1251
1153 : LATIN1 1252
1154 : GREEK 1253
1155 : TURKISH 1254
1156 : HEBREW 1255
1157 : ARABIC 1256
1158 : BALTIC 1257
1159 : VIETNAM 1258
1160 :
1161 : ISO-LATIN1 28591
1162 : ISO-LATIN2 28592
1163 : ISO-LATIN3 28593
1164 : ISO-BALTIC 28594
1165 : ISO-CYRILLIC 28595
1166 : ISO-ARABIC 28596
1167 : ISO-HEBREW 28598
1168 : ISO-TURKISH 28599
1169 : ISO-LATIN9 28605
1170 :
1171 : ISO-2022-JP 50220
1172 :
1173 : */
1174 :
1175 : char *CPLWin32Recode(const char *src, unsigned src_code_page,
1176 : unsigned dst_code_page)
1177 : {
1178 : // Convert from source code page to Unicode.
1179 :
1180 : // Compute the length in wide characters.
1181 : int wlen = MultiByteToWideChar(src_code_page, MB_ERR_INVALID_CHARS, src, -1,
1182 : nullptr, 0);
1183 : if (wlen == 0 && GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
1184 : {
1185 : if (!bHaveWarned5)
1186 : {
1187 : bHaveWarned5 = true;
1188 : CPLError(
1189 : CE_Warning, CPLE_AppDefined,
1190 : "One or several characters could not be translated from CP%d. "
1191 : "This warning will not be emitted anymore.",
1192 : src_code_page);
1193 : }
1194 :
1195 : // Retry now without MB_ERR_INVALID_CHARS flag.
1196 : wlen = MultiByteToWideChar(src_code_page, 0, src, -1, nullptr, 0);
1197 : }
1198 :
1199 : // Do the actual conversion.
1200 : wchar_t *tbuf =
1201 : static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), wlen + 1));
1202 : tbuf[wlen] = 0;
1203 : MultiByteToWideChar(src_code_page, 0, src, -1, tbuf, wlen + 1);
1204 :
1205 : // Convert from Unicode to destination code page.
1206 :
1207 : // Compute the length in chars.
1208 : BOOL bUsedDefaultChar = FALSE;
1209 : int len = 0;
1210 : if (dst_code_page == CP_UTF7 || dst_code_page == CP_UTF8)
1211 : len = WideCharToMultiByte(dst_code_page, 0, tbuf, -1, nullptr, 0,
1212 : nullptr, nullptr);
1213 : else
1214 : len = WideCharToMultiByte(dst_code_page, 0, tbuf, -1, nullptr, 0,
1215 : nullptr, &bUsedDefaultChar);
1216 : if (bUsedDefaultChar)
1217 : {
1218 : if (!bHaveWarned6)
1219 : {
1220 : bHaveWarned6 = true;
1221 : CPLError(
1222 : CE_Warning, CPLE_AppDefined,
1223 : "One or several characters could not be translated to CP%d. "
1224 : "This warning will not be emitted anymore.",
1225 : dst_code_page);
1226 : }
1227 : }
1228 :
1229 : // Do the actual conversion.
1230 : char *pszResult = static_cast<char *>(CPLCalloc(sizeof(char), len + 1));
1231 : WideCharToMultiByte(dst_code_page, 0, tbuf, -1, pszResult, len + 1, nullptr,
1232 : nullptr);
1233 : pszResult[len] = 0;
1234 :
1235 : CPLFree(tbuf);
1236 :
1237 : return pszResult;
1238 : }
1239 :
1240 : #endif
1241 :
1242 : /*
1243 : ** For now we disable the rest which is locale() related. We may need
1244 : ** parts of it later.
1245 : */
1246 :
1247 : #ifdef notdef
1248 :
1249 : #ifdef _WIN32
1250 : #include <windows.h>
1251 : #endif
1252 :
1253 : /*! Return true if the "locale" seems to indicate that UTF-8 encoding
1254 : is used. If true the utf8tomb and utf8frommb don't do anything
1255 : useful.
1256 :
1257 : <i>It is highly recommended that you change your system so this
1258 : does return true.</i> On Windows this is done by setting the
1259 : "codepage" to CP_UTF8. On Unix this is done by setting $LC_CTYPE
1260 : to a string containing the letters "utf" or "UTF" in it, or by
1261 : deleting all $LC* and $LANG environment variables. In the future
1262 : it is likely that all non-Asian Unix systems will return true,
1263 : due to the compatibility of UTF-8 with ISO-8859-1.
1264 : */
int utf8locale(void)
{
    // The verdict is computed once and cached; 2 means "not computed yet".
    static int cached = 2;
    if (cached != 2)
        return cached;

#ifdef _WIN32
    cached = GetACP() == CP_UTF8;
#else
    // Only the first of these variables that is set to a non-empty value is
    // examined. With none of them set, UTF-8 is assumed.
    static const char *const apszVars[] = {"LC_CTYPE", "LC_ALL", "LANG"};
    cached = 1;
    for (unsigned k = 0; k < sizeof(apszVars) / sizeof(apszVars[0]); ++k)
    {
        const char *pszValue = getenv(apszVars[k]);
        if (pszValue && *pszValue)
        {
            cached = (strstr(pszValue, "utf") || strstr(pszValue, "UTF"))
                         ? 1
                         : 0;
            break;
        }
    }
#endif

    return cached;
}
1285 :
1286 : /*! Convert the UTF-8 used by FLTK to the locale-specific encoding
1287 : used for filenames (and sometimes used for data in files).
1288 : Unfortunately due to stupid design you will have to do this as
1289 : needed for filenames. This is a bug on both Unix and Windows.
1290 :
1291 : Up to \a dstlen bytes are written to \a dst, including a null
1292 : terminator. The return value is the number of bytes that would be
1293 : written, not counting the null terminator. If greater or equal to
1294 : \a dstlen then if you malloc a new array of size n+1 you will have
1295 : the space needed for the entire string. If \a dstlen is zero then
1296 : nothing is written and this call just measures the storage space
1297 : needed.
1298 :
1299 : If utf8locale() returns true then this does not change the data.
1300 : It is copied and truncated as necessary to
1301 : the destination buffer and \a srclen is always returned. */
1302 : unsigned utf8tomb(const char *src, unsigned srclen, char *dst, unsigned dstlen)
1303 : {
1304 : if (!utf8locale())
1305 : {
1306 : #ifdef _WIN32
1307 : wchar_t lbuf[1024] = {};
1308 : wchar_t *buf = lbuf;
1309 : unsigned length = utf8towc(src, srclen, buf, 1024);
1310 : unsigned ret;
1311 : if (length >= 1024)
1312 : {
1313 : buf =
1314 : static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t)));
1315 : utf8towc(src, srclen, buf, length + 1);
1316 : }
1317 : if (dstlen)
1318 : {
1319 : // apparently this does not null-terminate, even though msdn
1320 : // documentation claims it does:
1321 : ret = WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen, 0,
1322 : 0);
1323 : dst[ret] = 0;
1324 : }
1325 : // if it overflows or measuring length, get the actual length:
1326 : if (dstlen == 0 || ret >= dstlen - 1)
1327 : ret = WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
1328 : if (buf != lbuf)
1329 : free((void *)buf);
1330 : return ret;
1331 : #else
1332 : wchar_t lbuf[1024] = {};
1333 : wchar_t *buf = lbuf;
1334 : unsigned length = utf8towc(src, srclen, buf, 1024);
1335 : if (length >= 1024)
1336 : {
1337 : buf =
1338 : static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t)));
1339 : utf8towc(src, srclen, buf, length + 1);
1340 : }
1341 : int ret = 0;
1342 : if (dstlen)
1343 : {
1344 : ret = wcstombs(dst, buf, dstlen);
1345 : if (ret >= dstlen - 1)
1346 : ret = wcstombs(0, buf, 0);
1347 : }
1348 : else
1349 : {
1350 : ret = wcstombs(0, buf, 0);
1351 : }
1352 : if (buf != lbuf)
1353 : free((void *)buf);
1354 : if (ret >= 0)
1355 : return (unsigned)ret;
1356 : // On any errors we return the UTF-8 as raw text...
1357 : #endif
1358 : }
1359 : // Identity transform:
1360 : if (srclen < dstlen)
1361 : {
1362 : memcpy(dst, src, srclen);
1363 : dst[srclen] = 0;
1364 : }
1365 : else
1366 : {
1367 : memcpy(dst, src, dstlen - 1);
1368 : dst[dstlen - 1] = 0;
1369 : }
1370 : return srclen;
1371 : }
1372 :
1373 : /*! Convert a filename from the locale-specific multibyte encoding
1374 : used by Windows to UTF-8 as used by FLTK.
1375 :
1376 : Up to \a dstlen bytes are written to \a dst, including a null
1377 : terminator. The return value is the number of bytes that would be
1378 : written, not counting the null terminator. If greater or equal to
1379 : \a dstlen then if you malloc a new array of size n+1 you will have
1380 : the space needed for the entire string. If \a dstlen is zero then
1381 : nothing is written and this call just measures the storage space
1382 : needed.
1383 :
1384 : On Unix or on Windows when a UTF-8 locale is in effect, this
1385 : does not change the data. It is copied and truncated as necessary to
1386 : the destination buffer and \a srclen is always returned.
1387 : You may also want to check if utf8test() returns non-zero, so that
1388 : the filesystem can store filenames in UTF-8 encoding regardless of
1389 : the locale.
1390 : */
1391 : unsigned utf8frommb(char *dst, unsigned dstlen, const char *src,
1392 : unsigned srclen)
1393 : {
1394 : if (!utf8locale())
1395 : {
1396 : #ifdef _WIN32
1397 : wchar_t lbuf[1024] = {};
1398 : wchar_t *buf = lbuf;
1399 : unsigned ret;
1400 : const unsigned length =
1401 : MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
1402 : if (length >= 1024)
1403 : {
1404 : length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);
1405 : buf = static_cast<wchar_t *>(malloc(length * sizeof(wchar_t)));
1406 : MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);
1407 : }
1408 : ret = utf8fromwc(dst, dstlen, buf, length);
1409 : if (buf != lbuf)
1410 : free(buf);
1411 : return ret;
1412 : #else
1413 : wchar_t lbuf[1024] = {};
1414 : wchar_t *buf = lbuf;
1415 : const int length = mbstowcs(buf, src, 1024);
1416 : if (length >= 1024)
1417 : {
1418 : length = mbstowcs(0, src, 0) + 1;
1419 : buf =
1420 : static_cast<wchar_t *>(malloc(length * sizeof(unsigned short)));
1421 : mbstowcs(buf, src, length);
1422 : }
1423 : if (length >= 0)
1424 : {
1425 : const unsigned ret = utf8fromwc(dst, dstlen, buf, length);
1426 : if (buf != lbuf)
1427 : free(buf);
1428 : return ret;
1429 : }
1430 : // Errors in conversion return the UTF-8 unchanged.
1431 : #endif
1432 : }
1433 : // Identity transform:
1434 : if (srclen < dstlen)
1435 : {
1436 : memcpy(dst, src, srclen);
1437 : dst[srclen] = 0;
1438 : }
1439 : else
1440 : {
1441 : memcpy(dst, src, dstlen - 1);
1442 : dst[dstlen - 1] = 0;
1443 : }
1444 : return srclen;
1445 : }
1446 :
1447 : #endif // def notdef - disabled locale specific stuff.
1448 :
1449 : /*! Examines the first \a srclen bytes in \a src and return a verdict
1450 : on whether it is UTF-8 or not.
1451 : - Returns 0 if there is any illegal UTF-8 sequences, using the
1452 : same rules as utf8decode(). Note that some UCS values considered
1453 : illegal by RFC 3629, such as 0xffff, are considered legal by this.
1454 : - Returns 1 if there are only single-byte characters (i.e. no bytes
1455 : have the high bit set). This is legal UTF-8, but also indicates
1456 : plain ASCII. It also returns 1 if \a srclen is zero.
1457 : - Returns 2 if there are only characters less than 0x800.
1458 : - Returns 3 if there are only characters less than 0x10000.
1459 : - Returns 4 if there are characters in the 0x10000 to 0x10ffff range.
1460 :
1461 : Because there are many illegal sequences in UTF-8, it is almost
1462 : impossible for a string in another encoding to be confused with
1463 : UTF-8. This is very useful for transitioning Unix to UTF-8
1464 : filenames, you can simply test each filename with this to decide
1465 : if it is UTF-8 or in the locale encoding. My hope is that if
1466 : this is done we will be able to cleanly transition to a locale-less
1467 : encoding.
1468 : */
1469 :
1470 18353 : static int utf8test(const char *src, unsigned srclen)
1471 : {
1472 18353 : int ret = 1;
1473 18353 : const char *p = src;
1474 18353 : const char *e = src + srclen;
1475 1846420 : while (p < e)
1476 : {
1477 1828120 : if (*p == 0)
1478 0 : return 0;
1479 1828120 : if (*p & 0x80)
1480 : {
1481 1602 : int len = 0;
1482 1602 : utf8decode(p, e, &len);
1483 1602 : if (len < 2)
1484 52 : return 0;
1485 1550 : if (len > ret)
1486 552 : ret = len;
1487 1550 : p += len;
1488 : }
1489 : else
1490 : {
1491 1826520 : p++;
1492 : }
1493 : }
1494 18301 : return ret;
1495 : }
|