Line data Source code
1 : /**********************************************************************
2 : *
3 : * Name: cpl_recode_stub.cpp
4 : * Project: CPL - Common Portability Library
5 : * Purpose: Character set recoding and char/wchar_t conversions, stub
6 : * implementation to be used if iconv() functionality is not
7 : * available.
8 : * Author: Frank Warmerdam, warmerdam@pobox.com
9 : *
10 : * The bulk of this code is derived from the utf.c module from FLTK. It
11 : * was originally downloaded from:
12 : * http://svn.easysw.com/public/fltk/fltk/trunk/src/utf.c
13 : *
14 : **********************************************************************
15 : * Copyright (c) 2008, Frank Warmerdam
16 : * Copyright 2006 by Bill Spitzak and others.
17 : * Copyright (c) 2009-2014, Even Rouault <even dot rouault at spatialys.com>
18 : *
19 : * Permission to use, copy, modify, and distribute this software for any
20 : * purpose with or without fee is hereby granted, provided that the above
21 : * copyright notice and this permission notice appear in all copies.
22 : *
23 : * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
24 : * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
25 : * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
26 : * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
27 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
28 : * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
29 : * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
30 : **********************************************************************/
31 :
32 : #include "cpl_port.h"
33 : #include "cpl_string.h"
34 :
35 : #include <cstring>
36 :
37 : #include "cpl_conv.h"
38 : #include "cpl_error.h"
39 : #include "cpl_character_sets.c"
40 :
41 : static unsigned utf8decode(const char *p, const char *end, int *len);
42 : static unsigned utf8towc(const char *src, unsigned srclen, wchar_t *dst,
43 : unsigned dstlen);
44 : static unsigned utf8toa(const char *src, unsigned srclen, char *dst,
45 : unsigned dstlen);
46 : static unsigned utf8fromwc(char *dst, unsigned dstlen, const wchar_t *src,
47 : unsigned srclen);
48 : static unsigned utf8froma(char *dst, unsigned dstlen, const char *src,
49 : unsigned srclen);
50 : static int utf8test(const char *src, unsigned srclen);
51 :
52 : #ifdef _WIN32
53 :
54 : #include <windows.h>
55 : #include <winnls.h>
56 :
57 : static char *CPLWin32Recode(const char *src, unsigned src_code_page,
58 : unsigned dst_code_page) CPL_RETURNS_NONNULL;
59 : #endif
60 :
61 : /* used by cpl_recode.cpp */
62 : extern void CPLClearRecodeStubWarningFlags();
63 : extern char *CPLRecodeStub(const char *, const char *,
64 : const char *) CPL_RETURNS_NONNULL;
65 : extern char *CPLRecodeFromWCharStub(const wchar_t *, const char *,
66 : const char *);
67 : extern wchar_t *CPLRecodeToWCharStub(const char *, const char *, const char *);
68 :
69 : /************************************************************************/
70 : /* ==================================================================== */
71 : /* Stub Implementation not depending on iconv() or WIN32 API. */
72 : /* ==================================================================== */
73 : /************************************************************************/
74 :
// One-shot warning latches. Each flag is set the first time its warning is
// emitted, and all of them are cleared by CPLClearRecodeStubWarningFlags().

// CPLRecodeStub(): unsupported source encoding handled as ISO-8859-1 -> UTF-8.
static bool bHaveWarned1 = false;
// CPLRecodeStub(): input byte with no entry in a hard-coded conversion table.
static bool bHaveWarned2 = false;
// CPLRecodeStub(): unsupported recoding, input returned unchanged.
static bool bHaveWarned3 = false;
// utf8toa(): code point above 0xff replaced by '?'.
static bool bHaveWarned4 = false;
// NOTE(review): flags 5 and 6 are only reset in the code visible here;
// presumably they guard warnings in the remaining helpers -- confirm.
static bool bHaveWarned5 = false;
static bool bHaveWarned6 = false;
81 :
82 : /************************************************************************/
83 : /* CPLClearRecodeStubWarningFlags() */
84 : /************************************************************************/
85 :
86 13601 : void CPLClearRecodeStubWarningFlags()
87 : {
88 13601 : bHaveWarned1 = false;
89 13601 : bHaveWarned2 = false;
90 13601 : bHaveWarned3 = false;
91 13601 : bHaveWarned4 = false;
92 13601 : bHaveWarned5 = false;
93 13601 : bHaveWarned6 = false;
94 13601 : }
95 :
96 : /************************************************************************/
97 : /* CPLRecodeStub() */
98 : /************************************************************************/
99 :
100 : /**
101 : * Convert a string from a source encoding to a destination encoding.
102 : *
103 : * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
104 : * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
105 : * <ul>
106 : * <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in
107 : * fact)</li>
108 : * <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
109 : * <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
110 : * </ul>
111 : *
112 : * If an error occurs an error may, or may not be posted with CPLError().
113 : *
114 : * @param pszSource a NULL terminated string.
115 : * @param pszSrcEncoding the source encoding.
116 : * @param pszDstEncoding the destination encoding.
117 : *
118 : * @return a NULL terminated string which should be freed with CPLFree().
119 : */
120 :
121 1243320 : char *CPLRecodeStub(const char *pszSource, const char *pszSrcEncoding,
122 : const char *pszDstEncoding)
123 :
124 : {
125 : /* -------------------------------------------------------------------- */
126 : /* If the source or destination is current locale(), we change */
127 : /* it to ISO8859-1 since our stub implementation does not */
128 : /* attempt to address locales properly. */
129 : /* -------------------------------------------------------------------- */
130 :
131 1243320 : if (pszSrcEncoding[0] == '\0')
132 0 : pszSrcEncoding = CPL_ENC_ISO8859_1;
133 :
134 1243320 : if (pszDstEncoding[0] == '\0')
135 0 : pszDstEncoding = CPL_ENC_ISO8859_1;
136 :
137 : /* -------------------------------------------------------------------- */
138 : /* ISO8859 to UTF8 */
139 : /* -------------------------------------------------------------------- */
140 1243320 : if (strcmp(pszSrcEncoding, CPL_ENC_ISO8859_1) == 0 &&
141 1171150 : strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
142 : {
143 1171150 : const int nCharCount = static_cast<int>(strlen(pszSource));
144 1171150 : char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1));
145 :
146 1171150 : utf8froma(pszResult, nCharCount * 2 + 1, pszSource, nCharCount);
147 :
148 1171150 : return pszResult;
149 : }
150 :
151 : /* -------------------------------------------------------------------- */
152 : /* UTF8 to ISO8859 */
153 : /* -------------------------------------------------------------------- */
154 72170 : if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0 &&
155 47577 : strcmp(pszDstEncoding, CPL_ENC_ISO8859_1) == 0)
156 : {
157 47577 : int nCharCount = static_cast<int>(strlen(pszSource));
158 47577 : char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount + 1));
159 :
160 47577 : utf8toa(pszSource, nCharCount, pszResult, nCharCount + 1);
161 :
162 47577 : return pszResult;
163 : }
164 :
165 : // A few hard coded CPxxx/ISO-8859-x to UTF-8 tables
166 24593 : if (EQUAL(pszDstEncoding, CPL_ENC_UTF8))
167 : {
168 24593 : const auto pConvTable = CPLGetConversionTableToUTF8(pszSrcEncoding);
169 24593 : if (pConvTable)
170 : {
171 24593 : const auto convTable = *pConvTable;
172 24593 : const size_t nCharCount = strlen(pszSource);
173 : char *pszResult =
174 24593 : static_cast<char *>(CPLCalloc(1, nCharCount * 3 + 1));
175 24593 : size_t iDst = 0;
176 24593 : unsigned char *pabyResult =
177 : reinterpret_cast<unsigned char *>(pszResult);
178 580111 : for (size_t i = 0; i < nCharCount; ++i)
179 : {
180 555518 : const unsigned char nChar =
181 555518 : static_cast<unsigned char>(pszSource[i]);
182 555518 : if (nChar <= 127)
183 : {
184 522071 : pszResult[iDst] = pszSource[i];
185 522071 : ++iDst;
186 : }
187 : else
188 : {
189 33447 : const unsigned char nShiftedChar = nChar - 128;
190 33447 : if (convTable[nShiftedChar][0])
191 : {
192 33446 : pabyResult[iDst] = convTable[nShiftedChar][0];
193 33446 : ++iDst;
194 33446 : CPLAssert(convTable[nShiftedChar][1]);
195 33446 : pabyResult[iDst] = convTable[nShiftedChar][1];
196 33446 : ++iDst;
197 33446 : if (convTable[nShiftedChar][2])
198 : {
199 13 : pabyResult[iDst] = convTable[nShiftedChar][2];
200 13 : ++iDst;
201 : }
202 : }
203 : else
204 : {
205 : // Skip the invalid sequence in the input string.
206 1 : if (!bHaveWarned2)
207 : {
208 1 : bHaveWarned2 = true;
209 1 : CPLError(CE_Warning, CPLE_AppDefined,
210 : "One or several characters couldn't be "
211 : "converted correctly from %s to %s. "
212 : "This warning will not be emitted anymore",
213 : pszSrcEncoding, pszDstEncoding);
214 : }
215 : }
216 : }
217 : }
218 :
219 24593 : pszResult[iDst] = 0;
220 24593 : return pszResult;
221 : }
222 : }
223 :
224 : #ifdef _WIN32
225 : const auto MapEncodingToWindowsCodePage = [](const char *pszEncoding)
226 : {
227 : // Cf https://learn.microsoft.com/fr-fr/windows/win32/intl/code-page-identifiers
228 : if (STARTS_WITH(pszEncoding, "CP"))
229 : {
230 : const int nCode = atoi(pszEncoding + strlen("CP"));
231 : if (nCode > 0)
232 : return nCode;
233 : else if (EQUAL(pszEncoding, "CP_OEMCP"))
234 : return CP_OEMCP;
235 : else if (EQUAL(pszEncoding, "CP_ACP"))
236 : return CP_ACP;
237 : }
238 : else if (STARTS_WITH(pszEncoding, "WINDOWS-"))
239 : {
240 : const int nCode = atoi(pszEncoding + strlen("WINDOWS-"));
241 : if (nCode > 0)
242 : return nCode;
243 : }
244 : else if (STARTS_WITH(pszEncoding, "ISO-8859-"))
245 : {
246 : const int nCode = atoi(pszEncoding + strlen("ISO-8859-"));
247 : if ((nCode >= 1 && nCode <= 9) || nCode == 13 || nCode == 15)
248 : return 28590 + nCode;
249 : }
250 :
251 : // Return a negative value, since CP_ACP = 0
252 : return -1;
253 : };
254 :
255 : /* ---------------------------------------------------------------------*/
256 : /* XXX to UTF8 */
257 : /* ---------------------------------------------------------------------*/
258 : if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
259 : {
260 : const int nCode = MapEncodingToWindowsCodePage(pszSrcEncoding);
261 : if (nCode >= 0)
262 : {
263 : return CPLWin32Recode(pszSource, nCode, CP_UTF8);
264 : }
265 : }
266 :
267 : /* ---------------------------------------------------------------------*/
268 : /* UTF8 to XXX */
269 : /* ---------------------------------------------------------------------*/
270 : if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0)
271 : {
272 : const int nCode = MapEncodingToWindowsCodePage(pszDstEncoding);
273 : if (nCode >= 0)
274 : {
275 : return CPLWin32Recode(pszSource, CP_UTF8, nCode);
276 : }
277 : }
278 : #endif
279 :
280 : /* -------------------------------------------------------------------- */
281 : /* Anything else to UTF-8 is treated as ISO8859-1 to UTF-8 with */
282 : /* a one-time warning. */
283 : /* -------------------------------------------------------------------- */
284 0 : if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
285 : {
286 0 : const int nCharCount = static_cast<int>(strlen(pszSource));
287 0 : char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1));
288 :
289 0 : if (!bHaveWarned1)
290 : {
291 0 : bHaveWarned1 = true;
292 0 : CPLError(CE_Warning, CPLE_AppDefined,
293 : "Recode from %s to UTF-8 not supported, "
294 : "treated as ISO-8859-1 to UTF-8.",
295 : pszSrcEncoding);
296 : }
297 :
298 0 : utf8froma(pszResult, nCharCount * 2 + 1, pszSource, nCharCount);
299 :
300 0 : return pszResult;
301 : }
302 :
303 : /* -------------------------------------------------------------------- */
304 : /* Everything else is treated as a no-op with a warning. */
305 : /* -------------------------------------------------------------------- */
306 : {
307 0 : if (!bHaveWarned3)
308 : {
309 0 : bHaveWarned3 = true;
310 0 : CPLError(CE_Warning, CPLE_AppDefined,
311 : "Recode from %s to %s not supported, no change applied.",
312 : pszSrcEncoding, pszDstEncoding);
313 : }
314 :
315 0 : return CPLStrdup(pszSource);
316 : }
317 : }
318 :
319 : /************************************************************************/
320 : /* CPLRecodeFromWCharStub() */
321 : /************************************************************************/
322 :
323 : /**
324 : * Convert wchar_t string to UTF-8.
325 : *
326 : * Convert a wchar_t string into a multibyte utf-8 string. The only
327 : * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
328 : * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
329 : * and CPL_ENC_ISO8859_1. In some cases (i.e. using iconv()) other encodings
330 : * may also be supported.
331 : *
332 : * Note that the wchar_t type varies in size on different systems. On
333 : * win32 it is normally 2 bytes, and on unix 4 bytes.
334 : *
335 : * If an error occurs an error may, or may not be posted with CPLError().
336 : *
337 : * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
338 : * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
339 : * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
340 : *
341 : * @return a zero terminated multi-byte string which should be freed with
342 : * CPLFree(), or NULL if an error occurs.
343 : */
344 :
345 130852 : char *CPLRecodeFromWCharStub(const wchar_t *pwszSource,
346 : const char *pszSrcEncoding,
347 : const char *pszDstEncoding)
348 :
349 : {
350 : /* -------------------------------------------------------------------- */
351 : /* We try to avoid changes of character set. We are just */
352 : /* providing for unicode to unicode. */
353 : /* -------------------------------------------------------------------- */
354 130852 : if (strcmp(pszSrcEncoding, "WCHAR_T") != 0 &&
355 129190 : strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0 &&
356 129190 : strcmp(pszSrcEncoding, CPL_ENC_UTF16) != 0 &&
357 129190 : strcmp(pszSrcEncoding, CPL_ENC_UCS2) != 0 &&
358 0 : strcmp(pszSrcEncoding, CPL_ENC_UCS4) != 0)
359 : {
360 0 : CPLError(CE_Failure, CPLE_AppDefined,
361 : "Stub recoding implementation does not support "
362 : "CPLRecodeFromWCharStub(...,%s,%s)",
363 : pszSrcEncoding, pszDstEncoding);
364 0 : return nullptr;
365 : }
366 :
367 : /* -------------------------------------------------------------------- */
368 : /* What is the source length. */
369 : /* -------------------------------------------------------------------- */
370 130852 : int nSrcLen = 0;
371 :
372 1926710 : while (pwszSource[nSrcLen] != 0)
373 1795860 : nSrcLen++;
374 :
375 : /* -------------------------------------------------------------------- */
376 : /* Allocate destination buffer plenty big. */
377 : /* -------------------------------------------------------------------- */
378 130852 : const int nDstBufSize = nSrcLen * 4 + 1;
379 : // Nearly worst case.
380 130852 : char *pszResult = static_cast<char *>(CPLMalloc(nDstBufSize));
381 :
382 130852 : if (nSrcLen == 0)
383 : {
384 57926 : pszResult[0] = '\0';
385 57926 : return pszResult;
386 : }
387 :
388 : /* -------------------------------------------------------------------- */
389 : /* Convert, and confirm we had enough space. */
390 : /* -------------------------------------------------------------------- */
391 72926 : const int nDstLen = utf8fromwc(pszResult, nDstBufSize, pwszSource, nSrcLen);
392 72926 : if (nDstLen >= nDstBufSize)
393 : {
394 0 : CPLAssert(false); // too small!
395 : return nullptr;
396 : }
397 :
398 : /* -------------------------------------------------------------------- */
399 : /* If something other than UTF-8 was requested, recode now. */
400 : /* -------------------------------------------------------------------- */
401 72926 : if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
402 72926 : return pszResult;
403 :
404 : char *pszFinalResult =
405 0 : CPLRecodeStub(pszResult, CPL_ENC_UTF8, pszDstEncoding);
406 :
407 0 : CPLFree(pszResult);
408 :
409 0 : return pszFinalResult;
410 : }
411 :
412 : /************************************************************************/
413 : /* CPLRecodeToWCharStub() */
414 : /************************************************************************/
415 :
416 : /**
417 : * Convert UTF-8 string to a wchar_t string.
418 : *
419 : * Convert a 8bit, multi-byte per character input string into a wide
420 : * character (wchar_t) string. The only guaranteed supported source encodings
421 : * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8869_1 (LATIN1). The only
422 : * guaranteed supported destination encoding is CPL_ENC_UCS2. Other source
423 : * and destination encodings may be supported depending on the underlying
424 : * implementation.
425 : *
426 : * Note that the wchar_t type varies in size on different systems. On
427 : * win32 it is normally 2 bytes, and on unix 4 bytes.
428 : *
429 : * If an error occurs an error may, or may not be posted with CPLError().
430 : *
431 : * @param pszSource input multi-byte character string.
432 : * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
433 : * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
434 : *
435 : * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
436 : * NULL on error.
437 : *
438 : */
439 :
440 41083 : wchar_t *CPLRecodeToWCharStub(const char *pszSource, const char *pszSrcEncoding,
441 : const char *pszDstEncoding)
442 :
443 : {
444 41083 : char *pszUTF8Source = const_cast<char *>(pszSource);
445 :
446 41083 : if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0 &&
447 0 : strcmp(pszSrcEncoding, CPL_ENC_ASCII) != 0)
448 : {
449 0 : pszUTF8Source = CPLRecodeStub(pszSource, pszSrcEncoding, CPL_ENC_UTF8);
450 0 : if (pszUTF8Source == nullptr)
451 0 : return nullptr;
452 : }
453 :
454 : /* -------------------------------------------------------------------- */
455 : /* We try to avoid changes of character set. We are just */
456 : /* providing for unicode to unicode. */
457 : /* -------------------------------------------------------------------- */
458 41083 : if (strcmp(pszDstEncoding, "WCHAR_T") != 0 &&
459 41083 : strcmp(pszDstEncoding, CPL_ENC_UCS2) != 0 &&
460 0 : strcmp(pszDstEncoding, CPL_ENC_UCS4) != 0 &&
461 0 : strcmp(pszDstEncoding, CPL_ENC_UTF16) != 0)
462 : {
463 0 : CPLError(CE_Failure, CPLE_AppDefined,
464 : "Stub recoding implementation does not support "
465 : "CPLRecodeToWCharStub(...,%s,%s)",
466 : pszSrcEncoding, pszDstEncoding);
467 0 : if (pszUTF8Source != pszSource)
468 0 : CPLFree(pszUTF8Source);
469 0 : return nullptr;
470 : }
471 :
472 : /* -------------------------------------------------------------------- */
473 : /* Do the UTF-8 to UCS-2 recoding. */
474 : /* -------------------------------------------------------------------- */
475 41083 : int nSrcLen = static_cast<int>(strlen(pszUTF8Source));
476 : wchar_t *pwszResult =
477 41083 : static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), nSrcLen + 1));
478 :
479 41083 : utf8towc(pszUTF8Source, nSrcLen, pwszResult, nSrcLen + 1);
480 :
481 41083 : if (pszUTF8Source != pszSource)
482 0 : CPLFree(pszUTF8Source);
483 :
484 41083 : return pwszResult;
485 : }
486 :
487 : /************************************************************************/
488 : /* CPLIsUTF8() */
489 : /************************************************************************/
490 :
491 : /**
492 : * Test if a string is encoded as UTF-8.
493 : *
494 : * @param pabyData input string to test
495 : * @param nLen length of the input string, or -1 if the function must compute
496 : * the string length. In which case it must be null terminated.
497 : * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
498 : *
499 : */
500 18933 : int CPLIsUTF8(const char *pabyData, int nLen)
501 : {
502 18933 : if (nLen < 0)
503 14319 : nLen = static_cast<int>(strlen(pabyData));
504 18933 : return utf8test(pabyData, static_cast<unsigned>(nLen)) != 0;
505 : }
506 :
507 : /************************************************************************/
508 : /* ==================================================================== */
509 : /* UTF.C code from FLTK with some modifications. */
510 : /* ==================================================================== */
511 : /************************************************************************/
512 :
/* Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero
   they are instead turned into the Unicode REPLACEMENT CHARACTER, of
   value 0xfffd.
   If this is on, utf8decode will correctly map most (perhaps all)
   human-readable text that is in ISO-8859-1. This may allow you
   to completely ignore character sets in your code because virtually
   everything is either ISO-8859-1 or UTF-8.
*/
521 : #define ERRORS_TO_ISO8859_1 1
522 :
523 : /* Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
524 : Unicode index for Microsoft's CP1252 character set. You should
525 : also set ERRORS_TO_ISO8859_1. With this a huge amount of more
526 : available text (such as all web pages) are correctly converted
527 : to Unicode.
528 : */
529 : #define ERRORS_TO_CP1252 1
530 :
/* A number of Unicode code points are in fact illegal and should not
   be produced by a UTF-8 converter. Turning this on will replace the
   bytes in those encodings with errors. If you do this then converting
   arbitrary 16-bit data to UTF-8 and then back is not an identity,
   which will probably break a lot of software.
*/
537 : #define STRICT_RFC3629 0
538 :
#if ERRORS_TO_CP1252
// Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
// to Unicode. utf8decode() indexes this table with (byte - 0x80).
// The entries for 0x81, 0x8d, 0x8f, 0x90 and 0x9d hold the byte's own value.
constexpr unsigned short cp1252[32] = {
    0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
    0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
    0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
    0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178};
#endif
548 :
549 : /************************************************************************/
550 : /* utf8decode() */
551 : /************************************************************************/
552 :
/*
    Decode the single UTF-8 encoded character that starts at \e p.

    The decoded Unicode value (in the range 0-0x10ffff) is returned and
    \e len receives the number of bytes that make up the encoding, so that
    \e p + \e len points at the next character.

    An encoding is rejected when it is illegal, when it would extend past
    \e end, or when it uses more bytes than necessary. In that case \e len
    is set to 1 and the first byte alone is translated:
    - with ERRORS_TO_CP1252, bytes 0x80-0x9f are mapped through the
      Microsoft CP1252 table;
    - with ERRORS_TO_ISO8859_1, any other offending byte is returned as is;
    - otherwise the Unicode REPLACEMENT CHARACTER (0xfffd) is returned.
    Treating errors this way allows almost any ISO-8859-1 or CP1252 text
    that has been mistakenly placed where UTF-8 is expected to be decoded.

    Callers that prefer standard error characters can test for a length
    that is unexpectedly 1, which also gives a fast path for ASCII:

    \code
    if( *p & 0x80 )
    {   // What should be a multibyte encoding.
        code = utf8decode(p, end, &len);
        if( len<2 ) code = 0xFFFD;  // Turn errors into REPLACEMENT CHARACTER.
    }
    else
    {   // Handle the 1-byte utf8 encoding:
        code = *p;
        len = 1;
    }
    \endcode
*/
static unsigned utf8decode(const char *p, const char *end, int *len)
{
    const unsigned char *const bytes =
        reinterpret_cast<const unsigned char *>(p);
    const unsigned char lead = bytes[0];

    // Common result for every rejected sequence: consume a single byte.
    const auto fail = [&]() -> unsigned
    {
        *len = 1;
#if ERRORS_TO_ISO8859_1
        return lead;
#else
        return 0xfffd;  // Unicode REPLACEMENT CHARACTER
#endif
    };

    // Plain ASCII.
    if (lead < 0x80)
    {
        *len = 1;
        return lead;
    }
#if ERRORS_TO_CP1252
    // Stray byte in 0x80..0x9f: interpret it as CP1252.
    if (lead < 0xa0)
    {
        *len = 1;
        return cp1252[lead - 0x80];
    }
#endif
    // Continuation bytes and the overlong lead bytes 0xc0/0xc1.
    if (lead < 0xc2)
        return fail();

    // Every remaining lead byte needs at least one continuation byte.
    if (p + 1 >= end || (bytes[1] & 0xc0) != 0x80)
        return fail();

    // Two byte sequence.
    if (lead < 0xe0)
    {
        *len = 2;
        return ((lead & 0x1f) << 6) + (bytes[1] & 0x3f);
    }

    // Three byte sequence.
    if (lead < 0xf0)
    {
        // 0xe0 followed by less than 0xa0 is an overlong encoding.
        if (lead == 0xe0 && bytes[1] < 0xa0)
            return fail();
#if STRICT_RFC3629
        // RFC 3629 says surrogate chars are illegal.
        if (lead == 0xed && bytes[1] >= 0xa0)
            return fail();
        // 0xfffe and 0xffff are also illegal characters.
        if (lead == 0xef && bytes[1] == 0xbf && bytes[2] >= 0xbe)
            return fail();
#endif
        if (p + 2 >= end || (bytes[2] & 0xc0) != 0x80)
            return fail();
        *len = 3;
        return ((lead & 0x0f) << 12) + ((bytes[1] & 0x3f) << 6) +
               (bytes[2] & 0x3f);
    }

    // Four byte sequence.
    if (lead <= 0xf4)
    {
        // 0xf0 followed by less than 0x90 is an overlong encoding.
        if (lead == 0xf0 && bytes[1] < 0x90)
            return fail();
        // 0xf4 followed by more than 0x8f lies after 0x10ffff.
        if (lead == 0xf4 && bytes[1] > 0x8f)
            return fail();
        if (p + 3 >= end || (bytes[2] & 0xc0) != 0x80 ||
            (bytes[3] & 0xc0) != 0x80)
            return fail();
#if STRICT_RFC3629
        // RFC 3629 says all codes ending in fffe or ffff are illegal:
        if ((bytes[1] & 0xf) == 0xf && bytes[2] == 0xbf && bytes[3] >= 0xbe)
            return fail();
#endif
        *len = 4;
        return ((lead & 0x07) << 18) + ((bytes[1] & 0x3f) << 12) +
               ((bytes[2] & 0x3f) << 6) + (bytes[3] & 0x3f);
    }

    // Lead bytes 0xf5..0xff never start a valid sequence.
    return fail();
}
684 :
685 : /************************************************************************/
686 : /* utf8towc() */
687 : /************************************************************************/
688 :
689 : /* Convert a UTF-8 sequence into an array of wchar_t. These
690 : are used by some system calls, especially on Windows.
691 :
692 : \a src points at the UTF-8, and \a srclen is the number of bytes to
693 : convert.
694 :
695 : \a dst points at an array to write, and \a dstlen is the number of
696 : locations in this array. At most \a dstlen-1 words will be
697 : written there, plus a 0 terminating word. Thus this function
698 : will never overwrite the buffer and will always return a
699 : zero-terminated string. If \a dstlen is zero then \a dst can be
700 : null and no data is written, but the length is returned.
701 :
702 : The return value is the number of words that \e would be written
703 : to \a dst if it were long enough, not counting the terminating
704 : zero. If the return value is greater or equal to \a dstlen it
705 : indicates truncation, you can then allocate a new array of size
706 : return+1 and call this again.
707 :
708 : Errors in the UTF-8 are converted as though each byte in the
709 : erroneous string is in the Microsoft CP1252 encoding. This allows
710 : ISO-8859-1 text mistakenly identified as UTF-8 to be printed
711 : correctly.
712 :
713 : Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
714 : and most other systems. Where wchar_t is 16 bits, Unicode
715 : characters in the range 0x10000 to 0x10ffff are converted to
716 : "surrogate pairs" which take two words each (this is called UTF-16
717 : encoding). If wchar_t is 32 bits this rather nasty problem is
718 : avoided.
719 : */
720 41083 : static unsigned utf8towc(const char *src, unsigned srclen, wchar_t *dst,
721 : unsigned dstlen)
722 : {
723 41083 : const char *p = src;
724 41083 : const char *e = src + srclen;
725 41083 : unsigned count = 0;
726 41083 : if (dstlen)
727 : while (true)
728 : {
729 299522 : if (p >= e)
730 : {
731 41083 : dst[count] = 0;
732 41083 : return count;
733 : }
734 258439 : if (!(*p & 0x80))
735 : {
736 : // ASCII
737 258237 : dst[count] = *p++;
738 : }
739 : else
740 : {
741 202 : int len = 0;
742 202 : unsigned ucs = utf8decode(p, e, &len);
743 202 : p += len;
744 : #ifdef _WIN32
745 : if (ucs < 0x10000)
746 : {
747 : dst[count] = static_cast<wchar_t>(ucs);
748 : }
749 : else
750 : {
751 : // Make a surrogate pair:
752 : if (count + 2 >= dstlen)
753 : {
754 : dst[count] = 0;
755 : count += 2;
756 : break;
757 : }
758 : dst[count] = static_cast<wchar_t>(
759 : (((ucs - 0x10000u) >> 10) & 0x3ff) | 0xd800);
760 : dst[++count] = static_cast<wchar_t>((ucs & 0x3ff) | 0xdc00);
761 : }
762 : #else
763 202 : dst[count] = static_cast<wchar_t>(ucs);
764 : #endif
765 : }
766 258439 : if (++count == dstlen)
767 : {
768 0 : dst[count - 1] = 0;
769 0 : break;
770 : }
771 258439 : }
772 : // We filled dst, measure the rest:
773 0 : while (p < e)
774 : {
775 0 : if (!(*p & 0x80))
776 : {
777 0 : p++;
778 : }
779 : else
780 : {
781 0 : int len = 0;
782 : #ifdef _WIN32
783 : const unsigned ucs = utf8decode(p, e, &len);
784 : p += len;
785 : if (ucs >= 0x10000)
786 : ++count;
787 : #else
788 0 : utf8decode(p, e, &len);
789 0 : p += len;
790 : #endif
791 : }
792 0 : ++count;
793 : }
794 :
795 0 : return count;
796 : }
797 :
798 : /************************************************************************/
799 : /* utf8toa() */
800 : /************************************************************************/
801 : /* Convert a UTF-8 sequence into an array of 1-byte characters.
802 :
803 : If the UTF-8 decodes to a character greater than 0xff then it is
804 : replaced with '?'.
805 :
806 : Errors in the UTF-8 are converted as individual bytes, same as
807 : utf8decode() does. This allows ISO-8859-1 text mistakenly identified
808 : as UTF-8 to be printed correctly (and possibly CP1512 on Windows).
809 :
810 : \a src points at the UTF-8, and \a srclen is the number of bytes to
811 : convert.
812 :
813 : Up to \a dstlen bytes are written to \a dst, including a null
814 : terminator. The return value is the number of bytes that would be
815 : written, not counting the null terminator. If greater or equal to
816 : \a dstlen then if you malloc a new array of size n+1 you will have
817 : the space needed for the entire string. If \a dstlen is zero then
818 : nothing is written and this call just measures the storage space
819 : needed.
820 : */
821 47577 : static unsigned int utf8toa(const char *src, unsigned srclen, char *dst,
822 : unsigned dstlen)
823 : {
824 47577 : const char *p = src;
825 47577 : const char *e = src + srclen;
826 47577 : unsigned int count = 0;
827 47577 : if (dstlen)
828 : while (true)
829 : {
830 172257 : if (p >= e)
831 : {
832 47577 : dst[count] = 0;
833 47577 : return count;
834 : }
835 124680 : unsigned char c = *reinterpret_cast<const unsigned char *>(p);
836 124680 : if (c < 0xC2)
837 : {
838 : // ASCII or bad code.
839 123880 : dst[count] = c;
840 123880 : p++;
841 : }
842 : else
843 : {
844 800 : int len = 0;
845 800 : const unsigned int ucs = utf8decode(p, e, &len);
846 800 : p += len;
847 800 : if (ucs < 0x100)
848 : {
849 796 : dst[count] = static_cast<char>(ucs);
850 : }
851 : else
852 : {
853 4 : if (!bHaveWarned4)
854 : {
855 2 : bHaveWarned4 = true;
856 2 : CPLError(
857 : CE_Warning, CPLE_AppDefined,
858 : "One or several characters couldn't be converted "
859 : "correctly from UTF-8 to ISO-8859-1. "
860 : "This warning will not be emitted anymore.");
861 : }
862 4 : dst[count] = '?';
863 : }
864 : }
865 124680 : if (++count >= dstlen)
866 : {
867 0 : dst[count - 1] = 0;
868 0 : break;
869 : }
870 124680 : }
871 : // We filled dst, measure the rest:
872 0 : while (p < e)
873 : {
874 0 : if (!(*p & 0x80))
875 : {
876 0 : p++;
877 : }
878 : else
879 : {
880 0 : int len = 0;
881 0 : utf8decode(p, e, &len);
882 0 : p += len;
883 : }
884 0 : ++count;
885 : }
886 0 : return count;
887 : }
888 :
889 : /************************************************************************/
890 : /* utf8fromwc() */
891 : /************************************************************************/
/* Turn "wide characters" as returned by some system calls
   (especially on Windows) into UTF-8.

   Up to \a dstlen bytes are written to \a dst, including a null
   terminator. The return value is the number of bytes that would be
   written, not counting the null terminator. If it is greater than or
   equal to \a dstlen the output was truncated; allocating a buffer of
   size n+1 and calling again yields the entire string. If \a dstlen is
   zero nothing is written and the call only measures the storage space
   needed. A multi-byte sequence is never split: when it does not fit,
   \a dst is terminated before it.

   \a srclen is the number of words in \a src to convert. On Windows
   this is not necessarily the number of characters, because UTF-16
   "surrogate pairs" occupy two words.

   When not built for Windows, a src word greater than 0x10ffff (illegal
   according to RFC 3629) is converted as though it were 0xFFFD
   (REPLACEMENT CHARACTER). Words in the range 0xd800 to 0xdfff, or
   ending with 0xfffe or 0xffff, are also illegal according to RFC 3629
   but are encoded as though they were legal, so that utf8towc can return
   the original data.

   On Windows a high surrogate immediately followed by a low surrogate is
   combined into a single character and encoded as 4 bytes. Mismatched
   halves of surrogate pairs are encoded as individual 3-byte characters.
*/
static unsigned int utf8fromwc(char *dst, unsigned dstlen, const wchar_t *src,
                               unsigned srclen)
{
    unsigned int i = 0;
    unsigned int count = 0;
    if (dstlen)
        while (true)
        {
            if (i >= srclen)
            {
                dst[count] = 0;
                return count;
            }
            unsigned int ucs = src[i++];
            if (ucs < 0x80U)
            {
                dst[count++] = static_cast<char>(ucs);
                if (count >= dstlen)
                {
                    // The byte just stored took the terminator's slot.
                    dst[count - 1] = 0;
                    break;
                }
                continue;
            }
            if (ucs < 0x800U)
            {
                // 2 bytes.
                if (count + 2 >= dstlen)
                {
                    dst[count] = 0;
                    count += 2;
                    break;
                }
                dst[count++] = static_cast<char>(0xc0 | (ucs >> 6));
                dst[count++] = static_cast<char>(0x80 | (ucs & 0x3F));
                continue;
            }

#ifdef _WIN32
            // Only a well-formed surrogate pair produces a 4-byte sequence.
            bool bFourBytes = false;
            if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
                src[i] >= 0xdc00 && src[i] <= 0xdfff)
            {
                const unsigned int ucs2 = src[i++];
                ucs = 0x10000U + ((ucs & 0x3ff) << 10) + (ucs2 & 0x3ff);
                bFourBytes = true;
            }
#else
            // Out-of-range values become U+FFFD, which takes the 3-byte path.
            if (ucs > 0x10ffff)
                ucs = 0xfffd;
            const bool bFourBytes = (ucs >= 0x10000);
#endif

            if (bFourBytes)
            {
                if (count + 4 >= dstlen)
                {
                    dst[count] = 0;
                    count += 4;
                    break;
                }
                dst[count++] = static_cast<char>(0xf0 | (ucs >> 18));
                dst[count++] = static_cast<char>(0x80 | ((ucs >> 12) & 0x3F));
                dst[count++] = static_cast<char>(0x80 | ((ucs >> 6) & 0x3F));
                dst[count++] = static_cast<char>(0x80 | (ucs & 0x3F));
            }
            else
            {
                // All others are 3 bytes.
                if (count + 3 >= dstlen)
                {
                    dst[count] = 0;
                    count += 3;
                    break;
                }
                dst[count++] = static_cast<char>(0xe0 | (ucs >> 12));
                dst[count++] = static_cast<char>(0x80 | ((ucs >> 6) & 0x3F));
                dst[count++] = static_cast<char>(0x80 | (ucs & 0x3F));
            }
        }

    // We filled dst (or dstlen was zero): measure the rest. The
    // classification must mirror the conversion loop above.
    while (i < srclen)
    {
        const unsigned int ucs = src[i++];
        if (ucs < 0x80U)
        {
            count++;
        }
        else if (ucs < 0x800U)
        {
            // 2 bytes.
            count += 2;
        }
#ifdef _WIN32
        else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
                 src[i] >= 0xdc00 && src[i] <= 0xdfff)
        {
            // Surrogate pair: i already points at the low half, so test
            // src[i] (not src[i + 1]) and consume it.
            ++i;
            count += 4;
        }
#else
        else if (ucs >= 0x10000 && ucs <= 0x10ffff)
        {
            count += 4;
        }
#endif
        else
        {
            count += 3;
        }
    }
    return count;
}
1034 :
1035 : /************************************************************************/
1036 : /* utf8froma() */
1037 : /************************************************************************/
1038 :
/* Convert an ISO-8859-1 (i.e. normal c-string) byte stream to UTF-8.

   Bytes below 0x80 are copied unchanged; every other byte becomes a
   2-byte UTF-8 sequence. No CP1252 remapping of the 0x80-0x9f range is
   performed.

   Up to \a dstlen bytes are written to \a dst, including a null
   terminator. The return value is the number of bytes that would be
   written, not counting the null terminator. If it is greater than or
   equal to \a dstlen the output was truncated; a buffer of size n+1
   holds the entire string. If \a dstlen is zero nothing is written and
   the call only measures the storage space needed.

   \a srclen is the number of bytes in \a src to convert.

   A return value equal to \a srclen means the input contained only
   ASCII characters, so no conversion was necessary.
*/
static unsigned utf8froma(char *dst, unsigned dstlen, const char *src,
                          unsigned srclen)
{
    const unsigned char *pabyCur =
        reinterpret_cast<const unsigned char *>(src);
    const unsigned char *const pabyEnd = pabyCur + srclen;
    unsigned nOut = 0;

    // Conversion phase: runs until the input is exhausted (early return)
    // or dst cannot take the next character.
    bool bDstFull = (dstlen == 0);
    while (!bDstFull)
    {
        if (pabyCur >= pabyEnd)
        {
            dst[nOut] = 0;
            return nOut;
        }

        const unsigned char byCur = *pabyCur++;
        if (byCur < 0x80U)
        {
            dst[nOut++] = static_cast<char>(byCur);
            if (nOut >= dstlen)
            {
                // The byte just stored took the terminator's slot.
                dst[nOut - 1] = 0;
                bDstFull = true;
            }
        }
        else if (nOut + 2 >= dstlen)
        {
            // No room for a 2-byte sequence plus terminator.
            dst[nOut] = 0;
            nOut += 2;
            bDstFull = true;
        }
        else
        {
            // 2 bytes (note that a CP1252 translation could need 3 bytes!)
            dst[nOut++] = static_cast<char>(0xc0 | (byCur >> 6));
            dst[nOut++] = static_cast<char>(0x80 | (byCur & 0x3F));
        }
    }

    // Measuring phase: count what the remaining input would occupy.
    for (; pabyCur < pabyEnd; ++pabyCur)
    {
        nOut += (*pabyCur < 0x80U) ? 1U : 2U;
    }

    return nOut;
}
1115 :
1116 : #ifdef _WIN32
1117 :
1118 : /************************************************************************/
1119 : /* CPLWin32Recode() */
1120 : /************************************************************************/
1121 :
1122 : /* Convert an CODEPAGE (i.e. normal c-string) byte stream
1123 : to another CODEPAGE (i.e. normal c-string) byte stream.
1124 :
1125 : \a src is target c-string byte stream (including a null terminator).
1126 : \a src_code_page is target c-string byte code page.
1127 : \a dst_code_page is destination c-string byte code page.
1128 :
1129 : UTF7 65000
1130 : UTF8 65001
1131 : OEM-US 437
1132 : OEM-ALABIC 720
1133 : OEM-GREEK 737
1134 : OEM-BALTIC 775
1135 : OEM-MLATIN1 850
1136 : OEM-LATIN2 852
1137 : OEM-CYRILLIC 855
1138 : OEM-TURKISH 857
1139 : OEM-MLATIN1P 858
1140 : OEM-HEBREW 862
1141 : OEM-RUSSIAN 866
1142 :
1143 : THAI 874
1144 : SJIS 932
1145 : GBK 936
1146 : KOREA 949
1147 : BIG5 950
1148 :
1149 : EUROPE 1250
1150 : CYRILLIC 1251
1151 : LATIN1 1252
1152 : GREEK 1253
1153 : TURKISH 1254
1154 : HEBREW 1255
1155 : ARABIC 1256
1156 : BALTIC 1257
1157 : VIETNAM 1258
1158 :
1159 : ISO-LATIN1 28591
1160 : ISO-LATIN2 28592
1161 : ISO-LATIN3 28593
1162 : ISO-BALTIC 28594
1163 : ISO-CYRILLIC 28595
1164 : ISO-ARABIC 28596
1165 : ISO-HEBREW 28598
1166 : ISO-TURKISH 28599
1167 : ISO-LATIN9 28605
1168 :
1169 : ISO-2022-JP 50220
1170 :
1171 : */
1172 :
1173 : char *CPLWin32Recode(const char *src, unsigned src_code_page,
1174 : unsigned dst_code_page)
1175 : {
1176 : // Convert from source code page to Unicode.
1177 :
1178 : // Compute the length in wide characters.
1179 : int wlen = MultiByteToWideChar(src_code_page, MB_ERR_INVALID_CHARS, src, -1,
1180 : nullptr, 0);
1181 : if (wlen == 0 && GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
1182 : {
1183 : if (!bHaveWarned5)
1184 : {
1185 : bHaveWarned5 = true;
1186 : CPLError(
1187 : CE_Warning, CPLE_AppDefined,
1188 : "One or several characters could not be translated from CP%d. "
1189 : "This warning will not be emitted anymore.",
1190 : src_code_page);
1191 : }
1192 :
1193 : // Retry now without MB_ERR_INVALID_CHARS flag.
1194 : wlen = MultiByteToWideChar(src_code_page, 0, src, -1, nullptr, 0);
1195 : }
1196 :
1197 : // Do the actual conversion.
1198 : wchar_t *tbuf =
1199 : static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), wlen + 1));
1200 : tbuf[wlen] = 0;
1201 : MultiByteToWideChar(src_code_page, 0, src, -1, tbuf, wlen + 1);
1202 :
1203 : // Convert from Unicode to destination code page.
1204 :
1205 : // Compute the length in chars.
1206 : BOOL bUsedDefaultChar = FALSE;
1207 : int len = 0;
1208 : if (dst_code_page == CP_UTF7 || dst_code_page == CP_UTF8)
1209 : len = WideCharToMultiByte(dst_code_page, 0, tbuf, -1, nullptr, 0,
1210 : nullptr, nullptr);
1211 : else
1212 : len = WideCharToMultiByte(dst_code_page, 0, tbuf, -1, nullptr, 0,
1213 : nullptr, &bUsedDefaultChar);
1214 : if (bUsedDefaultChar)
1215 : {
1216 : if (!bHaveWarned6)
1217 : {
1218 : bHaveWarned6 = true;
1219 : CPLError(
1220 : CE_Warning, CPLE_AppDefined,
1221 : "One or several characters could not be translated to CP%d. "
1222 : "This warning will not be emitted anymore.",
1223 : dst_code_page);
1224 : }
1225 : }
1226 :
1227 : // Do the actual conversion.
1228 : char *pszResult = static_cast<char *>(CPLCalloc(sizeof(char), len + 1));
1229 : WideCharToMultiByte(dst_code_page, 0, tbuf, -1, pszResult, len + 1, nullptr,
1230 : nullptr);
1231 : pszResult[len] = 0;
1232 :
1233 : CPLFree(tbuf);
1234 :
1235 : return pszResult;
1236 : }
1237 :
1238 : #endif
1239 :
1240 : /*
1241 : ** For now we disable the rest which is locale() related. We may need
1242 : ** parts of it later.
1243 : */
1244 :
1245 : #ifdef notdef
1246 :
1247 : #ifdef _WIN32
1248 : #include <windows.h>
1249 : #endif
1250 :
/*! Report whether the "locale" appears to use UTF-8 encoding. When it
    does, utf8tomb and utf8frommb have nothing useful to do.

    On Windows the answer is whether GetACP() equals CP_UTF8. Elsewhere
    the first non-empty variable among $LC_CTYPE, $LC_ALL and $LANG is
    inspected and the result is true if it contains "utf" or "UTF"; if
    none of them is set to a non-empty value, UTF-8 is assumed.

    The result is computed on the first call and cached for later calls.

    \return 1 if the locale looks like UTF-8, 0 otherwise.
*/
int utf8locale(void)
{
    // 2 is the "not determined yet" marker; real answers are 0 or 1.
    static int nCached = 2;
    if (nCached != 2)
        return nCached;

#ifdef _WIN32
    nCached = (GetACP() == CP_UTF8) ? 1 : 0;
#else
    static const char *const apszVars[] = {"LC_CTYPE", "LC_ALL", "LANG"};

    nCached = 1;  // assume UTF-8 if no locale
    for (size_t iVar = 0; iVar < sizeof(apszVars) / sizeof(apszVars[0]);
         ++iVar)
    {
        const char *pszValue = getenv(apszVars[iVar]);
        if (pszValue && *pszValue)
        {
            // The first non-empty variable decides; later ones are ignored.
            if (strstr(pszValue, "utf") || strstr(pszValue, "UTF"))
                nCached = 1;
            else
                nCached = 0;
            break;
        }
    }
#endif

    return nCached;
}
1283 :
1284 : /*! Convert the UTF-8 used by FLTK to the locale-specific encoding
1285 : used for filenames (and sometimes used for data in files).
1286 : Unfortunately due to stupid design you will have to do this as
1287 : needed for filenames. This is a bug on both Unix and Windows.
1288 :
1289 : Up to \a dstlen bytes are written to \a dst, including a null
1290 : terminator. The return value is the number of bytes that would be
1291 : written, not counting the null terminator. If greater or equal to
1292 : \a dstlen then if you malloc a new array of size n+1 you will have
1293 : the space needed for the entire string. If \a dstlen is zero then
1294 : nothing is written and this call just measures the storage space
1295 : needed.
1296 :
1297 : If utf8locale() returns true then this does not change the data.
1298 : It is copied and truncated as necessary to
1299 : the destination buffer and \a srclen is always returned. */
1300 : unsigned utf8tomb(const char *src, unsigned srclen, char *dst, unsigned dstlen)
1301 : {
1302 : if (!utf8locale())
1303 : {
1304 : #ifdef _WIN32
1305 : wchar_t lbuf[1024] = {};
1306 : wchar_t *buf = lbuf;
1307 : unsigned length = utf8towc(src, srclen, buf, 1024);
1308 : unsigned ret;
1309 : if (length >= 1024)
1310 : {
1311 : buf =
1312 : static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t)));
1313 : utf8towc(src, srclen, buf, length + 1);
1314 : }
1315 : if (dstlen)
1316 : {
1317 : // apparently this does not null-terminate, even though msdn
1318 : // documentation claims it does:
1319 : ret = WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen, 0,
1320 : 0);
1321 : dst[ret] = 0;
1322 : }
1323 : // if it overflows or measuring length, get the actual length:
1324 : if (dstlen == 0 || ret >= dstlen - 1)
1325 : ret = WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
1326 : if (buf != lbuf)
1327 : free((void *)buf);
1328 : return ret;
1329 : #else
1330 : wchar_t lbuf[1024] = {};
1331 : wchar_t *buf = lbuf;
1332 : unsigned length = utf8towc(src, srclen, buf, 1024);
1333 : if (length >= 1024)
1334 : {
1335 : buf =
1336 : static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t)));
1337 : utf8towc(src, srclen, buf, length + 1);
1338 : }
1339 : int ret = 0;
1340 : if (dstlen)
1341 : {
1342 : ret = wcstombs(dst, buf, dstlen);
1343 : if (ret >= dstlen - 1)
1344 : ret = wcstombs(0, buf, 0);
1345 : }
1346 : else
1347 : {
1348 : ret = wcstombs(0, buf, 0);
1349 : }
1350 : if (buf != lbuf)
1351 : free((void *)buf);
1352 : if (ret >= 0)
1353 : return (unsigned)ret;
1354 : // On any errors we return the UTF-8 as raw text...
1355 : #endif
1356 : }
1357 : // Identity transform:
1358 : if (srclen < dstlen)
1359 : {
1360 : memcpy(dst, src, srclen);
1361 : dst[srclen] = 0;
1362 : }
1363 : else
1364 : {
1365 : memcpy(dst, src, dstlen - 1);
1366 : dst[dstlen - 1] = 0;
1367 : }
1368 : return srclen;
1369 : }
1370 :
1371 : /*! Convert a filename from the locale-specific multibyte encoding
1372 : used by Windows to UTF-8 as used by FLTK.
1373 :
1374 : Up to \a dstlen bytes are written to \a dst, including a null
1375 : terminator. The return value is the number of bytes that would be
1376 : written, not counting the null terminator. If greater or equal to
1377 : \a dstlen then if you malloc a new array of size n+1 you will have
1378 : the space needed for the entire string. If \a dstlen is zero then
1379 : nothing is written and this call just measures the storage space
1380 : needed.
1381 :
1382 : On Unix or on Windows when a UTF-8 locale is in effect, this
1383 : does not change the data. It is copied and truncated as necessary to
1384 : the destination buffer and \a srclen is always returned.
1385 : You may also want to check if utf8test() returns non-zero, so that
1386 : the filesystem can store filenames in UTF-8 encoding regardless of
1387 : the locale.
1388 : */
1389 : unsigned utf8frommb(char *dst, unsigned dstlen, const char *src,
1390 : unsigned srclen)
1391 : {
1392 : if (!utf8locale())
1393 : {
1394 : #ifdef _WIN32
1395 : wchar_t lbuf[1024] = {};
1396 : wchar_t *buf = lbuf;
1397 : unsigned ret;
1398 : const unsigned length =
1399 : MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
1400 : if (length >= 1024)
1401 : {
1402 : length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);
1403 : buf = static_cast<wchar_t *>(malloc(length * sizeof(wchar_t)));
1404 : MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);
1405 : }
1406 : ret = utf8fromwc(dst, dstlen, buf, length);
1407 : if (buf != lbuf)
1408 : free(buf);
1409 : return ret;
1410 : #else
1411 : wchar_t lbuf[1024] = {};
1412 : wchar_t *buf = lbuf;
1413 : const int length = mbstowcs(buf, src, 1024);
1414 : if (length >= 1024)
1415 : {
1416 : length = mbstowcs(0, src, 0) + 1;
1417 : buf =
1418 : static_cast<wchar_t *>(malloc(length * sizeof(unsigned short)));
1419 : mbstowcs(buf, src, length);
1420 : }
1421 : if (length >= 0)
1422 : {
1423 : const unsigned ret = utf8fromwc(dst, dstlen, buf, length);
1424 : if (buf != lbuf)
1425 : free(buf);
1426 : return ret;
1427 : }
1428 : // Errors in conversion return the UTF-8 unchanged.
1429 : #endif
1430 : }
1431 : // Identity transform:
1432 : if (srclen < dstlen)
1433 : {
1434 : memcpy(dst, src, srclen);
1435 : dst[srclen] = 0;
1436 : }
1437 : else
1438 : {
1439 : memcpy(dst, src, dstlen - 1);
1440 : dst[dstlen - 1] = 0;
1441 : }
1442 : return srclen;
1443 : }
1444 :
1445 : #endif // def notdef - disabled locale specific stuff.
1446 :
1447 : /*! Examines the first \a srclen bytes in \a src and return a verdict
1448 : on whether it is UTF-8 or not.
1449 : - Returns 0 if there is any illegal UTF-8 sequences, using the
1450 : same rules as utf8decode(). Note that some UCS values considered
1451 : illegal by RFC 3629, such as 0xffff, are considered legal by this.
1452 : - Returns 1 if there are only single-byte characters (i.e. no bytes
1453 : have the high bit set). This is legal UTF-8, but also indicates
1454 : plain ASCII. It also returns 1 if \a srclen is zero.
1455 : - Returns 2 if there are only characters less than 0x800.
1456 : - Returns 3 if there are only characters less than 0x10000.
1457 : - Returns 4 if there are characters in the 0x10000 to 0x10ffff range.
1458 :
1459 : Because there are many illegal sequences in UTF-8, it is almost
1460 : impossible for a string in another encoding to be confused with
1461 : UTF-8. This is very useful for transitioning Unix to UTF-8
1462 : filenames, you can simply test each filename with this to decide
1463 : if it is UTF-8 or in the locale encoding. My hope is that if
1464 : this is done we will be able to cleanly transition to a locale-less
1465 : encoding.
1466 : */
1467 :
1468 18933 : static int utf8test(const char *src, unsigned srclen)
1469 : {
1470 18933 : int ret = 1;
1471 18933 : const char *p = src;
1472 18933 : const char *e = src + srclen;
1473 1766760 : while (p < e)
1474 : {
1475 1747880 : if (*p == 0)
1476 0 : return 0;
1477 1747880 : if (*p & 0x80)
1478 : {
1479 1613 : int len = 0;
1480 1613 : utf8decode(p, e, &len);
1481 1613 : if (len < 2)
1482 53 : return 0;
1483 1560 : if (len > ret)
1484 555 : ret = len;
1485 1560 : p += len;
1486 : }
1487 : else
1488 : {
1489 1746270 : p++;
1490 : }
1491 : }
1492 18880 : return ret;
1493 : }
|