Line data Source code
1 : /**********************************************************************
2 : *
3 : * Name: cpl_recode_stub.cpp
4 : * Project: CPL - Common Portability Library
5 : * Purpose: Character set recoding and char/wchar_t conversions, stub
6 : * implementation to be used if iconv() functionality is not
7 : * available.
8 : * Author: Frank Warmerdam, warmerdam@pobox.com
9 : *
10 : * The bulk of this code is derived from the utf.c module from FLTK. It
11 : * was originally downloaded from:
12 : * http://svn.easysw.com/public/fltk/fltk/trunk/src/utf.c
13 : *
14 : **********************************************************************
15 : * Copyright (c) 2008, Frank Warmerdam
16 : * Copyright 2006 by Bill Spitzak and others.
17 : * Copyright (c) 2009-2014, Even Rouault <even dot rouault at spatialys.com>
18 : *
19 : * Permission to use, copy, modify, and distribute this software for any
20 : * purpose with or without fee is hereby granted, provided that the above
21 : * copyright notice and this permission notice appear in all copies.
22 : *
23 : * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
24 : * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
25 : * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
26 : * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
27 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
28 : * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
29 : * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
30 : **********************************************************************/
31 :
32 : #include "cpl_port.h"
33 : #include "cpl_string.h"
34 :
35 : #include <cstring>
36 :
37 : #include "cpl_conv.h"
38 : #include "cpl_error.h"
39 :
40 : static unsigned utf8decode(const char *p, const char *end, int *len);
41 : static unsigned utf8towc(const char *src, unsigned srclen, wchar_t *dst,
42 : unsigned dstlen);
43 : static unsigned utf8toa(const char *src, unsigned srclen, char *dst,
44 : unsigned dstlen);
45 : static unsigned utf8fromwc(char *dst, unsigned dstlen, const wchar_t *src,
46 : unsigned srclen);
47 : static unsigned utf8froma(char *dst, unsigned dstlen, const char *src,
48 : unsigned srclen);
49 : static int utf8test(const char *src, unsigned srclen);
50 :
51 : #ifdef _WIN32
52 :
53 : #include <windows.h>
54 : #include <winnls.h>
55 :
56 : static char *CPLWin32Recode(const char *src, unsigned src_code_page,
57 : unsigned dst_code_page) CPL_RETURNS_NONNULL;
58 : #endif
59 :
60 : /* used by cpl_recode.cpp */
61 : extern void CPLClearRecodeStubWarningFlags();
62 : extern char *CPLRecodeStub(const char *, const char *,
63 : const char *) CPL_RETURNS_NONNULL;
64 : extern char *CPLRecodeFromWCharStub(const wchar_t *, const char *,
65 : const char *);
66 : extern wchar_t *CPLRecodeToWCharStub(const char *, const char *, const char *);
67 :
68 : /************************************************************************/
69 : /* ==================================================================== */
70 : /* Stub Implementation not depending on iconv() or WIN32 API. */
71 : /* ==================================================================== */
72 : /************************************************************************/
73 :
// One-shot latches used to emit each "unsupported recoding" style warning
// only once until they are cleared again.
static bool bHaveWarned1 = false;  // set by CPLRecodeStub(): "X to UTF-8"
static bool bHaveWarned2 = false;  // set by CPLRecodeStub(): "UTF-8 to X"
static bool bHaveWarned3 = false;  // set by CPLRecodeStub(): no-op recode
static bool bHaveWarned4 = false;  // set by utf8toa(): lossy conversion
static bool bHaveWarned5 = false;  // NOTE(review): user not checked here
static bool bHaveWarned6 = false;  // NOTE(review): user not checked here

/************************************************************************/
/*                 CPLClearRecodeStubWarningFlags()                     */
/************************************************************************/

/* Re-arm every one-shot warning so each may be emitted once more. */
void CPLClearRecodeStubWarningFlags()
{
    bHaveWarned1 = bHaveWarned2 = bHaveWarned3 = false;
    bHaveWarned4 = bHaveWarned5 = bHaveWarned6 = false;
}
94 :
95 : /************************************************************************/
96 : /* CPLRecodeStub() */
97 : /************************************************************************/
98 :
99 : /**
100 : * Convert a string from a source encoding to a destination encoding.
101 : *
102 : * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
103 : * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
104 : * <ul>
105 : * <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in
106 : * fact)</li>
107 : * <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
108 : * <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
109 : * </ul>
110 : *
111 : * If an error occurs an error may, or may not be posted with CPLError().
112 : *
113 : * @param pszSource a NULL terminated string.
114 : * @param pszSrcEncoding the source encoding.
115 : * @param pszDstEncoding the destination encoding.
116 : *
117 : * @return a NULL terminated string which should be freed with CPLFree().
118 : */
119 :
120 714482 : char *CPLRecodeStub(const char *pszSource, const char *pszSrcEncoding,
121 : const char *pszDstEncoding)
122 :
123 : {
124 : /* -------------------------------------------------------------------- */
125 : /* If the source or destination is current locale(), we change */
126 : /* it to ISO8859-1 since our stub implementation does not */
127 : /* attempt to address locales properly. */
128 : /* -------------------------------------------------------------------- */
129 :
130 714482 : if (pszSrcEncoding[0] == '\0')
131 0 : pszSrcEncoding = CPL_ENC_ISO8859_1;
132 :
133 714482 : if (pszDstEncoding[0] == '\0')
134 0 : pszDstEncoding = CPL_ENC_ISO8859_1;
135 :
136 : /* -------------------------------------------------------------------- */
137 : /* ISO8859 to UTF8 */
138 : /* -------------------------------------------------------------------- */
139 714482 : if (strcmp(pszSrcEncoding, CPL_ENC_ISO8859_1) == 0 &&
140 664723 : strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
141 : {
142 664723 : const int nCharCount = static_cast<int>(strlen(pszSource));
143 664723 : char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1));
144 :
145 664723 : utf8froma(pszResult, nCharCount * 2 + 1, pszSource, nCharCount);
146 :
147 664723 : return pszResult;
148 : }
149 :
150 : /* -------------------------------------------------------------------- */
151 : /* UTF8 to ISO8859 */
152 : /* -------------------------------------------------------------------- */
153 49759 : if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0 &&
154 49759 : strcmp(pszDstEncoding, CPL_ENC_ISO8859_1) == 0)
155 : {
156 49759 : int nCharCount = static_cast<int>(strlen(pszSource));
157 49759 : char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount + 1));
158 :
159 49759 : utf8toa(pszSource, nCharCount, pszResult, nCharCount + 1);
160 :
161 49759 : return pszResult;
162 : }
163 :
164 : #ifdef _WIN32
165 : /* ---------------------------------------------------------------------*/
166 : /* CPXXX to UTF8 */
167 : /* ---------------------------------------------------------------------*/
168 : if (STARTS_WITH(pszSrcEncoding, "CP") &&
169 : strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
170 : {
171 : int nCode = atoi(pszSrcEncoding + 2);
172 : if (nCode > 0)
173 : {
174 : return CPLWin32Recode(pszSource, nCode, CP_UTF8);
175 : }
176 : else if (EQUAL(pszSrcEncoding, "CP_OEMCP"))
177 : return CPLWin32Recode(pszSource, CP_OEMCP, CP_UTF8);
178 : else if (EQUAL(pszSrcEncoding, "CP_ACP"))
179 : return CPLWin32Recode(pszSource, CP_ACP, CP_UTF8);
180 : }
181 :
182 : /* ---------------------------------------------------------------------*/
183 : /* UTF8 to CPXXX */
184 : /* ---------------------------------------------------------------------*/
185 : if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0 &&
186 : STARTS_WITH(pszDstEncoding, "CP"))
187 : {
188 : int nCode = atoi(pszDstEncoding + 2);
189 : if (nCode > 0)
190 : {
191 : return CPLWin32Recode(pszSource, CP_UTF8, nCode);
192 : }
193 : else if (EQUAL(pszDstEncoding, "CP_OEMCP"))
194 : return CPLWin32Recode(pszSource, CP_UTF8, CP_OEMCP);
195 : else if (EQUAL(pszDstEncoding, "CP_ACP"))
196 : return CPLWin32Recode(pszSource, CP_UTF8, CP_ACP);
197 : }
198 : #endif
199 :
200 : /* -------------------------------------------------------------------- */
201 : /* Anything else to UTF-8 is treated as ISO8859-1 to UTF-8 with */
202 : /* a one-time warning. */
203 : /* -------------------------------------------------------------------- */
204 0 : if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
205 : {
206 0 : const int nCharCount = static_cast<int>(strlen(pszSource));
207 0 : char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount * 2 + 1));
208 :
209 0 : if (!bHaveWarned1)
210 : {
211 0 : bHaveWarned1 = true;
212 0 : CPLError(CE_Warning, CPLE_AppDefined,
213 : "Recode from %s to UTF-8 not supported, "
214 : "treated as ISO-8859-1 to UTF-8.",
215 : pszSrcEncoding);
216 : }
217 :
218 0 : utf8froma(pszResult, nCharCount * 2 + 1, pszSource, nCharCount);
219 :
220 0 : return pszResult;
221 : }
222 :
223 : /* -------------------------------------------------------------------- */
224 : /* UTF-8 to anything else is treated as UTF-8 to ISO-8859-1 */
225 : /* with a warning. */
226 : /* -------------------------------------------------------------------- */
227 0 : if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0 &&
228 0 : strcmp(pszDstEncoding, CPL_ENC_ISO8859_1) == 0)
229 : {
230 0 : int nCharCount = static_cast<int>(strlen(pszSource));
231 0 : char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount + 1));
232 :
233 0 : if (!bHaveWarned2)
234 : {
235 0 : bHaveWarned2 = true;
236 0 : CPLError(CE_Warning, CPLE_AppDefined,
237 : "Recode from UTF-8 to %s not supported, "
238 : "treated as UTF-8 to ISO-8859-1.",
239 : pszDstEncoding);
240 : }
241 :
242 0 : utf8toa(pszSource, nCharCount, pszResult, nCharCount + 1);
243 :
244 0 : return pszResult;
245 : }
246 :
247 : /* -------------------------------------------------------------------- */
248 : /* Everything else is treated as a no-op with a warning. */
249 : /* -------------------------------------------------------------------- */
250 : {
251 0 : if (!bHaveWarned3)
252 : {
253 0 : bHaveWarned3 = true;
254 0 : CPLError(CE_Warning, CPLE_AppDefined,
255 : "Recode from %s to %s not supported, no change applied.",
256 : pszSrcEncoding, pszDstEncoding);
257 : }
258 :
259 0 : return CPLStrdup(pszSource);
260 : }
261 : }
262 :
263 : /************************************************************************/
264 : /* CPLRecodeFromWCharStub() */
265 : /************************************************************************/
266 :
267 : /**
268 : * Convert wchar_t string to UTF-8.
269 : *
270 : * Convert a wchar_t string into a multibyte utf-8 string. The only
271 : * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
272 : * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
273 : * and CPL_ENC_ISO8859_1. In some cases (i.e. using iconv()) other encodings
274 : * may also be supported.
275 : *
276 : * Note that the wchar_t type varies in size on different systems. On
277 : * win32 it is normally 2 bytes, and on unix 4 bytes.
278 : *
279 : * If an error occurs an error may, or may not be posted with CPLError().
280 : *
281 : * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
282 : * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
283 : * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
284 : *
285 : * @return a zero terminated multi-byte string which should be freed with
286 : * CPLFree(), or NULL if an error occurs.
287 : */
288 :
289 107675 : char *CPLRecodeFromWCharStub(const wchar_t *pwszSource,
290 : const char *pszSrcEncoding,
291 : const char *pszDstEncoding)
292 :
293 : {
294 : /* -------------------------------------------------------------------- */
295 : /* We try to avoid changes of character set. We are just */
296 : /* providing for unicode to unicode. */
297 : /* -------------------------------------------------------------------- */
298 107675 : if (strcmp(pszSrcEncoding, "WCHAR_T") != 0 &&
299 106316 : strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0 &&
300 106316 : strcmp(pszSrcEncoding, CPL_ENC_UTF16) != 0 &&
301 106316 : strcmp(pszSrcEncoding, CPL_ENC_UCS2) != 0 &&
302 0 : strcmp(pszSrcEncoding, CPL_ENC_UCS4) != 0)
303 : {
304 0 : CPLError(CE_Failure, CPLE_AppDefined,
305 : "Stub recoding implementation does not support "
306 : "CPLRecodeFromWCharStub(...,%s,%s)",
307 : pszSrcEncoding, pszDstEncoding);
308 0 : return nullptr;
309 : }
310 :
311 : /* -------------------------------------------------------------------- */
312 : /* What is the source length. */
313 : /* -------------------------------------------------------------------- */
314 107675 : int nSrcLen = 0;
315 :
316 1578860 : while (pwszSource[nSrcLen] != 0)
317 1471180 : nSrcLen++;
318 :
319 : /* -------------------------------------------------------------------- */
320 : /* Allocate destination buffer plenty big. */
321 : /* -------------------------------------------------------------------- */
322 107675 : const int nDstBufSize = nSrcLen * 4 + 1;
323 : // Nearly worst case.
324 107675 : char *pszResult = static_cast<char *>(CPLMalloc(nDstBufSize));
325 :
326 107675 : if (nSrcLen == 0)
327 : {
328 44803 : pszResult[0] = '\0';
329 44803 : return pszResult;
330 : }
331 :
332 : /* -------------------------------------------------------------------- */
333 : /* Convert, and confirm we had enough space. */
334 : /* -------------------------------------------------------------------- */
335 62872 : const int nDstLen = utf8fromwc(pszResult, nDstBufSize, pwszSource, nSrcLen);
336 62872 : if (nDstLen >= nDstBufSize)
337 : {
338 0 : CPLAssert(false); // too small!
339 : return nullptr;
340 : }
341 :
342 : /* -------------------------------------------------------------------- */
343 : /* If something other than UTF-8 was requested, recode now. */
344 : /* -------------------------------------------------------------------- */
345 62872 : if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0)
346 62872 : return pszResult;
347 :
348 : char *pszFinalResult =
349 0 : CPLRecodeStub(pszResult, CPL_ENC_UTF8, pszDstEncoding);
350 :
351 0 : CPLFree(pszResult);
352 :
353 0 : return pszFinalResult;
354 : }
355 :
356 : /************************************************************************/
357 : /* CPLRecodeToWCharStub() */
358 : /************************************************************************/
359 :
360 : /**
361 : * Convert UTF-8 string to a wchar_t string.
362 : *
363 : * Convert a 8bit, multi-byte per character input string into a wide
364 : * character (wchar_t) string. The only guaranteed supported source encodings
365 : * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1 (LATIN1). The only
366 : * guaranteed supported destination encoding is CPL_ENC_UCS2. Other source
367 : * and destination encodings may be supported depending on the underlying
368 : * implementation.
369 : *
370 : * Note that the wchar_t type varies in size on different systems. On
371 : * win32 it is normally 2 bytes, and on unix 4 bytes.
372 : *
373 : * If an error occurs an error may, or may not be posted with CPLError().
374 : *
375 : * @param pszSource input multi-byte character string.
376 : * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
377 : * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
378 : *
379 : * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
380 : * NULL on error.
381 : *
382 : * @since GDAL 1.6.0
383 : */
384 :
385 51932 : wchar_t *CPLRecodeToWCharStub(const char *pszSource, const char *pszSrcEncoding,
386 : const char *pszDstEncoding)
387 :
388 : {
389 51932 : char *pszUTF8Source = const_cast<char *>(pszSource);
390 :
391 51932 : if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) != 0 &&
392 0 : strcmp(pszSrcEncoding, CPL_ENC_ASCII) != 0)
393 : {
394 0 : pszUTF8Source = CPLRecodeStub(pszSource, pszSrcEncoding, CPL_ENC_UTF8);
395 0 : if (pszUTF8Source == nullptr)
396 0 : return nullptr;
397 : }
398 :
399 : /* -------------------------------------------------------------------- */
400 : /* We try to avoid changes of character set. We are just */
401 : /* providing for unicode to unicode. */
402 : /* -------------------------------------------------------------------- */
403 51932 : if (strcmp(pszDstEncoding, "WCHAR_T") != 0 &&
404 51932 : strcmp(pszDstEncoding, CPL_ENC_UCS2) != 0 &&
405 0 : strcmp(pszDstEncoding, CPL_ENC_UCS4) != 0 &&
406 0 : strcmp(pszDstEncoding, CPL_ENC_UTF16) != 0)
407 : {
408 0 : CPLError(CE_Failure, CPLE_AppDefined,
409 : "Stub recoding implementation does not support "
410 : "CPLRecodeToWCharStub(...,%s,%s)",
411 : pszSrcEncoding, pszDstEncoding);
412 0 : if (pszUTF8Source != pszSource)
413 0 : CPLFree(pszUTF8Source);
414 0 : return nullptr;
415 : }
416 :
417 : /* -------------------------------------------------------------------- */
418 : /* Do the UTF-8 to UCS-2 recoding. */
419 : /* -------------------------------------------------------------------- */
420 51932 : int nSrcLen = static_cast<int>(strlen(pszUTF8Source));
421 : wchar_t *pwszResult =
422 51932 : static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), nSrcLen + 1));
423 :
424 51932 : utf8towc(pszUTF8Source, nSrcLen, pwszResult, nSrcLen + 1);
425 :
426 51932 : if (pszUTF8Source != pszSource)
427 0 : CPLFree(pszUTF8Source);
428 :
429 51932 : return pwszResult;
430 : }
431 :
432 : /************************************************************************/
433 : /* CPLIsUTF8() */
434 : /************************************************************************/
435 :
436 : /**
437 : * Test if a string is encoded as UTF-8.
438 : *
439 : * @param pabyData input string to test
440 : * @param nLen length of the input string, or -1 if the function must compute
441 : * the string length. In which case it must be null terminated.
442 : * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
443 : *
444 : * @since GDAL 1.7.0
445 : */
446 14858 : int CPLIsUTF8(const char *pabyData, int nLen)
447 : {
448 14858 : if (nLen < 0)
449 10270 : nLen = static_cast<int>(strlen(pabyData));
450 14858 : return utf8test(pabyData, static_cast<unsigned>(nLen)) != 0;
451 : }
452 :
453 : /************************************************************************/
454 : /* ==================================================================== */
455 : /* UTF.C code from FLTK with some modifications. */
456 : /* ==================================================================== */
457 : /************************************************************************/
458 :
459 : /* Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero
460 : they are instead turned into the Unicode REPLACEMENT CHARACTER, of
461 : value 0xfffd.
462 : If this is on utf8decode will correctly map most (perhaps all)
463 : human-readable text that is in ISO-8859-1. This may allow you
464 : to completely ignore character sets in your code because virtually
465 : everything is either ISO-8859-1 or UTF-8.
466 : */
467 : #define ERRORS_TO_ISO8859_1 1
468 :
469 : /* Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
470 : Unicode index for Microsoft's CP1252 character set. You should
471 : also set ERRORS_TO_ISO8859_1. With this a huge amount of more
472 : available text (such as all web pages) are correctly converted
473 : to Unicode.
474 : */
475 : #define ERRORS_TO_CP1252 1
476 :
477 : /* A number of Unicode code points are in fact illegal and should not
478 : be produced by a UTF-8 converter. Turning this on will replace the
479 : bytes in those encodings with errors. If you do this then converting
480 : arbitrary 16-bit data to UTF-8 and then back is not an identity,
481 : which will probably break a lot of software.
482 : */
483 : #define STRICT_RFC3629 0
484 :
485 : #if ERRORS_TO_CP1252
486 : // Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
487 : // to Unicode:
488 : constexpr unsigned short cp1252[32] = {
489 : 0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
490 : 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
491 : 0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
492 : 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178};
493 : #endif
494 :
495 : /************************************************************************/
496 : /* utf8decode() */
497 : /************************************************************************/
498 :
499 : /*
500 : Decode a single UTF-8 encoded character starting at \e p. The
501 : resulting Unicode value (in the range 0-0x10ffff) is returned,
502 : and \e len is set to the number of bytes in the UTF-8 encoding
503 : (adding \e len to \e p will point at the next character).
504 :
505 : If \a p points at an illegal UTF-8 encoding, including one that
506 : would go past \e end, or where a code uses more bytes than
507 : necessary, then *reinterpret_cast<const unsigned char*>(p) is translated as
508 : though it is in the Microsoft CP1252 character set and \e len is set to 1.
509 : Treating errors this way allows this to decode almost any
510 : ISO-8859-1 or CP1252 text that has been mistakenly placed where
511 : UTF-8 is expected, and has proven very useful.
512 :
513 : If you want errors to be converted to error characters (as the
514 : standards recommend), adding a test to see if the length is
515 : unexpectedly 1 will work:
516 :
517 : \code
518 : if( *p & 0x80 )
519 : { // What should be a multibyte encoding.
520 : code = utf8decode(p, end, &len);
521 : if( len<2 ) code = 0xFFFD; // Turn errors into REPLACEMENT CHARACTER.
522 : }
523 : else
524 : { // Handle the 1-byte utf8 encoding:
525 : code = *p;
526 : len = 1;
527 : }
528 : \endcode
529 :
530 : Direct testing for the 1-byte case (as shown above) will also
531 : speed up the scanning of strings where the majority of characters
532 : are ASCII.
533 : */
static unsigned utf8decode(const char *p, const char *end, int *len)
{
    const unsigned char *const s = reinterpret_cast<const unsigned char *>(p);
    const unsigned char *const limit =
        reinterpret_cast<const unsigned char *>(end);
    const unsigned char lead = s[0];

    // Single bytes and every failure consume exactly one byte; the valid
    // multi-byte cases below overwrite this just before returning.
    *len = 1;

    // Plain ASCII.
    if (lead < 0x80)
        return lead;

#if ERRORS_TO_CP1252
    // 0x80..0x9f can never start a sequence: map through CP1252.
    if (lead < 0xa0)
        return cp1252[lead - 0x80];
#endif

    // Value reported for any malformed or truncated sequence.
#if ERRORS_TO_ISO8859_1
    const unsigned invalid = lead;
#else
    const unsigned invalid = 0xfffd;  // Unicode REPLACEMENT CHARACTER
#endif

    // Stray continuation bytes, overlong 2-byte leads (0xc0, 0xc1) and
    // leads that would encode beyond 0x10ffff.
    if (lead < 0xc2 || lead > 0xf4)
        return invalid;

    // Every remaining form needs at least one continuation byte.
    if (s + 1 >= limit || (s[1] & 0xc0) != 0x80)
        return invalid;

    // Two byte sequence.
    if (lead < 0xe0)
    {
        *len = 2;
        return ((lead & 0x1f) << 6) + (s[1] & 0x3f);
    }

    // Three byte sequence.
    if (lead < 0xf0)
    {
        // Overlong encoding.
        if (lead == 0xe0 && s[1] < 0xa0)
            return invalid;
#if STRICT_RFC3629
        // RFC 3629 says surrogate chars are illegal.
        if (lead == 0xed && s[1] >= 0xa0)
            return invalid;
        // 0xfffe and 0xffff are also illegal characters.
        if (lead == 0xef && s[1] == 0xbf && s[2] >= 0xbe)
            return invalid;
#endif
        if (s + 2 >= limit || (s[2] & 0xc0) != 0x80)
            return invalid;
        *len = 3;
        return ((lead & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
    }

    // Four byte sequence (lead is 0xf0..0xf4 here).
    if (lead == 0xf0 && s[1] < 0x90)
        return invalid;  // Overlong encoding.
    if (lead == 0xf4 && s[1] > 0x8f)
        return invalid;  // After 0x10ffff.
    if (s + 3 >= limit || (s[2] & 0xc0) != 0x80 || (s[3] & 0xc0) != 0x80)
        return invalid;
#if STRICT_RFC3629
    // RFC 3629 says all codes ending in fffe or ffff are illegal:
    if ((s[1] & 0xf) == 0xf && s[2] == 0xbf && s[3] >= 0xbe)
        return invalid;
#endif
    *len = 4;
    return ((lead & 0x07) << 18) + ((s[1] & 0x3f) << 12) +
           ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
}
630 :
631 : /************************************************************************/
632 : /* utf8towc() */
633 : /************************************************************************/
634 :
635 : /* Convert a UTF-8 sequence into an array of wchar_t. These
636 : are used by some system calls, especially on Windows.
637 :
638 : \a src points at the UTF-8, and \a srclen is the number of bytes to
639 : convert.
640 :
641 : \a dst points at an array to write, and \a dstlen is the number of
642 : locations in this array. At most \a dstlen-1 words will be
643 : written there, plus a 0 terminating word. Thus this function
644 : will never overwrite the buffer and will always return a
645 : zero-terminated string. If \a dstlen is zero then \a dst can be
646 : null and no data is written, but the length is returned.
647 :
648 : The return value is the number of words that \e would be written
649 : to \a dst if it were long enough, not counting the terminating
650 : zero. If the return value is greater or equal to \a dstlen it
651 : indicates truncation, you can then allocate a new array of size
652 : return+1 and call this again.
653 :
654 : Errors in the UTF-8 are converted as though each byte in the
655 : erroneous string is in the Microsoft CP1252 encoding. This allows
656 : ISO-8859-1 text mistakenly identified as UTF-8 to be printed
657 : correctly.
658 :
659 : Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
660 : and most other systems. Where wchar_t is 16 bits, Unicode
661 : characters in the range 0x10000 to 0x10ffff are converted to
662 : "surrogate pairs" which take two words each (this is called UTF-16
663 : encoding). If wchar_t is 32 bits this rather nasty problem is
664 : avoided.
665 : */
666 51932 : static unsigned utf8towc(const char *src, unsigned srclen, wchar_t *dst,
667 : unsigned dstlen)
668 : {
669 51932 : const char *p = src;
670 51932 : const char *e = src + srclen;
671 51932 : unsigned count = 0;
672 51932 : if (dstlen)
673 : while (true)
674 : {
675 380839 : if (p >= e)
676 : {
677 51932 : dst[count] = 0;
678 51932 : return count;
679 : }
680 328907 : if (!(*p & 0x80))
681 : {
682 : // ASCII
683 327489 : dst[count] = *p++;
684 : }
685 : else
686 : {
687 1418 : int len = 0;
688 1418 : unsigned ucs = utf8decode(p, e, &len);
689 1418 : p += len;
690 : #ifdef _WIN32
691 : if (ucs < 0x10000)
692 : {
693 : dst[count] = static_cast<wchar_t>(ucs);
694 : }
695 : else
696 : {
697 : // Make a surrogate pair:
698 : if (count + 2 >= dstlen)
699 : {
700 : dst[count] = 0;
701 : count += 2;
702 : break;
703 : }
704 : dst[count] = static_cast<wchar_t>(
705 : (((ucs - 0x10000u) >> 10) & 0x3ff) | 0xd800);
706 : dst[++count] = static_cast<wchar_t>((ucs & 0x3ff) | 0xdc00);
707 : }
708 : #else
709 1418 : dst[count] = static_cast<wchar_t>(ucs);
710 : #endif
711 : }
712 328907 : if (++count == dstlen)
713 : {
714 0 : dst[count - 1] = 0;
715 0 : break;
716 : }
717 328907 : }
718 : // We filled dst, measure the rest:
719 0 : while (p < e)
720 : {
721 0 : if (!(*p & 0x80))
722 : {
723 0 : p++;
724 : }
725 : else
726 : {
727 0 : int len = 0;
728 : #ifdef _WIN32
729 : const unsigned ucs = utf8decode(p, e, &len);
730 : p += len;
731 : if (ucs >= 0x10000)
732 : ++count;
733 : #else
734 0 : utf8decode(p, e, &len);
735 0 : p += len;
736 : #endif
737 : }
738 0 : ++count;
739 : }
740 :
741 0 : return count;
742 : }
743 :
744 : /************************************************************************/
745 : /* utf8toa() */
746 : /************************************************************************/
747 : /* Convert a UTF-8 sequence into an array of 1-byte characters.
748 :
749 : If the UTF-8 decodes to a character greater than 0xff then it is
750 : replaced with '?'.
751 :
752 : Errors in the UTF-8 are converted as individual bytes, same as
753 : utf8decode() does. This allows ISO-8859-1 text mistakenly identified
754 : as UTF-8 to be printed correctly (and possibly CP1252 on Windows).
755 :
756 : \a src points at the UTF-8, and \a srclen is the number of bytes to
757 : convert.
758 :
759 : Up to \a dstlen bytes are written to \a dst, including a null
760 : terminator. The return value is the number of bytes that would be
761 : written, not counting the null terminator. If greater or equal to
762 : \a dstlen then if you malloc a new array of size n+1 you will have
763 : the space needed for the entire string. If \a dstlen is zero then
764 : nothing is written and this call just measures the storage space
765 : needed.
766 : */
767 49759 : static unsigned int utf8toa(const char *src, unsigned srclen, char *dst,
768 : unsigned dstlen)
769 : {
770 49759 : const char *p = src;
771 49759 : const char *e = src + srclen;
772 49759 : unsigned int count = 0;
773 49759 : if (dstlen)
774 : while (true)
775 : {
776 189612 : if (p >= e)
777 : {
778 49759 : dst[count] = 0;
779 49759 : return count;
780 : }
781 139853 : unsigned char c = *reinterpret_cast<const unsigned char *>(p);
782 139853 : if (c < 0xC2)
783 : {
784 : // ASCII or bad code.
785 138951 : dst[count] = c;
786 138951 : p++;
787 : }
788 : else
789 : {
790 902 : int len = 0;
791 902 : const unsigned int ucs = utf8decode(p, e, &len);
792 902 : p += len;
793 902 : if (ucs < 0x100)
794 : {
795 898 : dst[count] = static_cast<char>(ucs);
796 : }
797 : else
798 : {
799 4 : if (!bHaveWarned4)
800 : {
801 2 : bHaveWarned4 = true;
802 2 : CPLError(
803 : CE_Warning, CPLE_AppDefined,
804 : "One or several characters couldn't be converted "
805 : "correctly from UTF-8 to ISO-8859-1. "
806 : "This warning will not be emitted anymore.");
807 : }
808 4 : dst[count] = '?';
809 : }
810 : }
811 139853 : if (++count >= dstlen)
812 : {
813 0 : dst[count - 1] = 0;
814 0 : break;
815 : }
816 139853 : }
817 : // We filled dst, measure the rest:
818 0 : while (p < e)
819 : {
820 0 : if (!(*p & 0x80))
821 : {
822 0 : p++;
823 : }
824 : else
825 : {
826 0 : int len = 0;
827 0 : utf8decode(p, e, &len);
828 0 : p += len;
829 : }
830 0 : ++count;
831 : }
832 0 : return count;
833 : }
834 :
835 : /************************************************************************/
836 : /* utf8fromwc() */
837 : /************************************************************************/
/* Fetch the next code point of a wide string and report its UTF-8 size.

   Reads src[*pi], advances *pi past every word consumed (two words for a
   Windows surrogate pair, one otherwise), stores the value to encode in
   *pucs and returns the number of UTF-8 bytes (1 to 4) needed for it.

   Both the converting and the measuring loops of utf8fromwc() go through
   this helper so that they always agree on the size of the output.
*/
static unsigned int utf8fromwcNext(const wchar_t *src, unsigned srclen,
                                   unsigned int *pi, unsigned int *pucs)
{
    unsigned int ucs = static_cast<unsigned int>(src[(*pi)++]);
    unsigned int nBytes = 3;  // Everything not matched below takes 3 bytes.

    if (ucs < 0x80U)
    {
        nBytes = 1;
    }
    else if (ucs < 0x800U)
    {
        nBytes = 2;
    }
#ifdef _WIN32
    else if (ucs >= 0xd800 && ucs <= 0xdbff && *pi < srclen &&
             src[*pi] >= 0xdc00 && src[*pi] <= 0xdfff)
    {
        // Surrogate pair: *pi already designates the low half.
        const unsigned int ucs2 = src[(*pi)++];
        ucs = 0x10000U + ((ucs & 0x3ff) << 10) + (ucs2 & 0x3ff);
        // All surrogate pairs turn into 4-byte utf8.
        nBytes = 4;
    }
#else
    else if (ucs > 0x10ffff)
    {
        // Out of the Unicode range: encode REPLACEMENT CHARACTER (3 bytes).
        ucs = 0xfffd;
    }
    else if (ucs >= 0x10000)
    {
        nBytes = 4;
    }
    (void)srclen;  // Only needed for the surrogate look-ahead on Windows.
#endif

    *pucs = ucs;
    return nBytes;
}

/* Turn "wide characters" as returned by some system calls
   (especially on Windows) into UTF-8.

   Up to \a dstlen bytes are written to \a dst, including a null
   terminator. The return value is the number of bytes that would be
   written, not counting the null terminator. If greater or equal to
   \a dstlen then if you malloc a new array of size n+1 you will have
   the space needed for the entire string. If \a dstlen is zero then
   nothing is written and this call just measures the storage space
   needed.

   \a srclen is the number of words in \a src to convert. On Windows
   this is not necessarily the number of characters, due to there
   possibly being "surrogate pairs" in the UTF-16 encoding used.
   On Unix wchar_t is 32 bits and each location is a character.

   On Unix if a src word is greater than 0x10ffff then this is an
   illegal character according to RFC 3629. These are converted as
   though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
   range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
   illegal according to RFC 3629. However they are encoded as though
   they are legal, so that utf8towc will return the original data.

   On Windows "surrogate pairs" are converted to a single character
   and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
   pairs are converted as though they are individual characters.
*/
static unsigned int utf8fromwc(char *dst, unsigned dstlen, const wchar_t *src,
                               unsigned srclen)
{
    unsigned int i = 0;
    unsigned int count = 0;
    unsigned int ucs = 0;

    if (dstlen)
    {
        while (true)
        {
            if (i >= srclen)
            {
                // Whole input converted: terminate and report the length.
                dst[count] = 0;
                return count;
            }

            const unsigned int nBytes = utf8fromwcNext(src, srclen, &i, &ucs);

            // The sequence plus the terminator must fit; otherwise terminate
            // here, account for the sequence, and only measure from now on.
            if (count + nBytes >= dstlen)
            {
                dst[count] = 0;
                count += nBytes;
                break;
            }

            switch (nBytes)
            {
                case 1:
                    dst[count++] = static_cast<char>(ucs);
                    break;
                case 2:
                    dst[count++] = static_cast<char>(0xc0 | (ucs >> 6));
                    dst[count++] = static_cast<char>(0x80 | (ucs & 0x3F));
                    break;
                case 3:
                    dst[count++] = static_cast<char>(0xe0 | (ucs >> 12));
                    dst[count++] =
                        static_cast<char>(0x80 | ((ucs >> 6) & 0x3F));
                    dst[count++] = static_cast<char>(0x80 | (ucs & 0x3F));
                    break;
                default:
                    dst[count++] = static_cast<char>(0xf0 | (ucs >> 18));
                    dst[count++] =
                        static_cast<char>(0x80 | ((ucs >> 12) & 0x3F));
                    dst[count++] =
                        static_cast<char>(0x80 | ((ucs >> 6) & 0x3F));
                    dst[count++] = static_cast<char>(0x80 | (ucs & 0x3F));
                    break;
            }
        }
    }

    // We filled dst (or dstlen was zero), measure the rest:
    while (i < srclen)
    {
        count += utf8fromwcNext(src, srclen, &i, &ucs);
    }
    return count;
}
980 :
981 : /************************************************************************/
982 : /* utf8froma() */
983 : /************************************************************************/
984 :
985 : /* Convert an ISO-8859-1 (i.e. normal c-string) byte stream to UTF-8.
986 :
987 : It is possible this should convert Microsoft's CP1252 to UTF-8
988 : instead. This would translate the codes in the range 0x80-0x9f
989 : to different characters. Currently it does not do this.
990 :
991 : Up to \a dstlen bytes are written to \a dst, including a null
992 : terminator. The return value is the number of bytes that would be
993 : written, not counting the null terminator. If greater or equal to
994 : \a dstlen then if you malloc a new array of size n+1 you will have
995 : the space needed for the entire string. If \a dstlen is zero then
996 : nothing is written and this call just measures the storage space
997 : needed.
998 :
999 : \a srclen is the number of bytes in \a src to convert.
1000 :
1001 : If the return value equals \a srclen then this indicates that
1002 : no conversion is necessary, as only ASCII characters are in the
1003 : string.
1004 : */
static unsigned utf8froma(char *dst, unsigned dstlen, const char *src,
                          unsigned srclen)
{
    // Walk the input as unsigned bytes so Latin-1 values compare naturally.
    const unsigned char *pabyIn = reinterpret_cast<const unsigned char *>(src);
    const unsigned char *const pabyEnd = pabyIn + srclen;
    unsigned nOut = 0;

    if (dstlen != 0)
    {
        for (;;)
        {
            if (pabyIn >= pabyEnd)
            {
                // Everything fitted: terminate and return the real length.
                dst[nOut] = 0;
                return nOut;
            }

            const unsigned char byCur = *pabyIn++;
            // ASCII is copied verbatim; 0x80-0xff expand to two UTF-8 bytes
            // (note that a CP1252 translation could make 3 bytes!).
            const unsigned nNeeded = (byCur < 0x80U) ? 1U : 2U;

            // Sequence plus terminator must fit, else stop writing here.
            if (nOut + nNeeded >= dstlen)
            {
                dst[nOut] = 0;
                nOut += nNeeded;
                break;
            }

            if (nNeeded == 1U)
            {
                dst[nOut++] = static_cast<char>(byCur);
            }
            else
            {
                dst[nOut++] = static_cast<char>(0xc0 | (byCur >> 6));
                dst[nOut++] = static_cast<char>(0x80 | (byCur & 0x3F));
            }
        }
    }

    // dst is full (or dstlen was zero): only tally the remaining bytes.
    for (; pabyIn < pabyEnd; ++pabyIn)
    {
        nOut += (*pabyIn < 0x80U) ? 1U : 2U;
    }

    return nOut;
}
1061 :
1062 : #ifdef _WIN32
1063 :
1064 : /************************************************************************/
1065 : /* CPLWin32Recode() */
1066 : /************************************************************************/
1067 :
1068 : /* Convert a CODEPAGE (i.e. normal c-string) byte stream
1069 : to another CODEPAGE (i.e. normal c-string) byte stream.
1070 :
1071 : \a src is the source c-string byte stream (including a null terminator).
1072 : \a src_code_page is the code page of the source c-string.
1073 : \a dst_code_page is the code page of the destination c-string.
1074 :
1075 : UTF7 65000
1076 : UTF8 65001
1077 : OEM-US 437
1078 : OEM-ARABIC 720
1079 : OEM-GREEK 737
1080 : OEM-BALTIC 775
1081 : OEM-MLATIN1 850
1082 : OEM-LATIN2 852
1083 : OEM-CYRILLIC 855
1084 : OEM-TURKISH 857
1085 : OEM-MLATIN1P 858
1086 : OEM-HEBREW 862
1087 : OEM-RUSSIAN 866
1088 :
1089 : THAI 874
1090 : SJIS 932
1091 : GBK 936
1092 : KOREA 949
1093 : BIG5 950
1094 :
1095 : EUROPE 1250
1096 : CYRILLIC 1251
1097 : LATIN1 1252
1098 : GREEK 1253
1099 : TURKISH 1254
1100 : HEBREW 1255
1101 : ARABIC 1256
1102 : BALTIC 1257
1103 : VIETNAM 1258
1104 :
1105 : ISO-LATIN1 28591
1106 : ISO-LATIN2 28592
1107 : ISO-LATIN3 28593
1108 : ISO-BALTIC 28594
1109 : ISO-CYRILLIC 28595
1110 : ISO-ARABIC 28596
1111 : ISO-HEBREW 28598
1112 : ISO-TURKISH 28599
1113 : ISO-LATIN9 28605
1114 :
1115 : ISO-2022-JP 50220
1116 :
1117 : */
1118 :
1119 : char *CPLWin32Recode(const char *src, unsigned src_code_page,
1120 : unsigned dst_code_page)
1121 : {
1122 : // Convert from source code page to Unicode.
1123 :
1124 : // Compute the length in wide characters.
1125 : int wlen = MultiByteToWideChar(src_code_page, MB_ERR_INVALID_CHARS, src, -1,
1126 : nullptr, 0);
1127 : if (wlen == 0 && GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
1128 : {
1129 : if (!bHaveWarned5)
1130 : {
1131 : bHaveWarned5 = true;
1132 : CPLError(
1133 : CE_Warning, CPLE_AppDefined,
1134 : "One or several characters could not be translated from CP%d. "
1135 : "This warning will not be emitted anymore.",
1136 : src_code_page);
1137 : }
1138 :
1139 : // Retry now without MB_ERR_INVALID_CHARS flag.
1140 : wlen = MultiByteToWideChar(src_code_page, 0, src, -1, nullptr, 0);
1141 : }
1142 :
1143 : // Do the actual conversion.
1144 : wchar_t *tbuf =
1145 : static_cast<wchar_t *>(CPLCalloc(sizeof(wchar_t), wlen + 1));
1146 : tbuf[wlen] = 0;
1147 : MultiByteToWideChar(src_code_page, 0, src, -1, tbuf, wlen + 1);
1148 :
1149 : // Convert from Unicode to destination code page.
1150 :
1151 : // Compute the length in chars.
1152 : BOOL bUsedDefaultChar = FALSE;
1153 : int len = 0;
1154 : if (dst_code_page == CP_UTF7 || dst_code_page == CP_UTF8)
1155 : len = WideCharToMultiByte(dst_code_page, 0, tbuf, -1, nullptr, 0,
1156 : nullptr, nullptr);
1157 : else
1158 : len = WideCharToMultiByte(dst_code_page, 0, tbuf, -1, nullptr, 0,
1159 : nullptr, &bUsedDefaultChar);
1160 : if (bUsedDefaultChar)
1161 : {
1162 : if (!bHaveWarned6)
1163 : {
1164 : bHaveWarned6 = true;
1165 : CPLError(
1166 : CE_Warning, CPLE_AppDefined,
1167 : "One or several characters could not be translated to CP%d. "
1168 : "This warning will not be emitted anymore.",
1169 : dst_code_page);
1170 : }
1171 : }
1172 :
1173 : // Do the actual conversion.
1174 : char *pszResult = static_cast<char *>(CPLCalloc(sizeof(char), len + 1));
1175 : WideCharToMultiByte(dst_code_page, 0, tbuf, -1, pszResult, len + 1, nullptr,
1176 : nullptr);
1177 : pszResult[len] = 0;
1178 :
1179 : CPLFree(tbuf);
1180 :
1181 : return pszResult;
1182 : }
1183 :
1184 : #endif
1185 :
1186 : /*
1187 : ** For now we disable the rest which is locale() related. We may need
1188 : ** parts of it later.
1189 : */
1190 :
1191 : #ifdef notdef
1192 :
1193 : #ifdef _WIN32
1194 : #include <windows.h>
1195 : #endif
1196 :
1197 : /*! Return true if the "locale" seems to indicate that UTF-8 encoding
1198 : is used. If true the utf8tomb and utf8frommb don't do anything
1199 : useful.
1200 :
1201 : <i>It is highly recommended that you change your system so this
1202 : does return true.</i> On Windows this is done by setting the
1203 : "codepage" to CP_UTF8. On Unix this is done by setting $LC_CTYPE
1204 : to a string containing the letters "utf" or "UTF" in it, or by
1205 : deleting all $LC* and $LANG environment variables. In the future
1206 : it is likely that all non-Asian Unix systems will return true,
1207 : due to the compatibility of UTF-8 with ISO-8859-1.
1208 : */
int utf8locale(void)
{
    // Cached verdict: 2 means "not determined yet", otherwise 0 or 1.
    static int ret = 2;
    if (ret == 2)
    {
#ifdef _WIN32
        ret = GetACP() == CP_UTF8;
#else
        ret = 1;  // assume UTF-8 if no locale

        // POSIX precedence for the character-type category is
        // LC_ALL, then LC_CTYPE, then LANG; empty values count as unset.
        const char *s = getenv("LC_ALL");
        if (!(s && *s))
            s = getenv("LC_CTYPE");
        if (!(s && *s))
            s = getenv("LANG");

        if (s && *s)
        {
            ret = (strstr(s, "utf") || strstr(s, "UTF")) ? 1 : 0;
        }
#endif
    }

    return ret;
}
1229 :
1230 : /*! Convert the UTF-8 used by FLTK to the locale-specific encoding
1231 : used for filenames (and sometimes used for data in files).
1232 : Unfortunately due to stupid design you will have to do this as
1233 : needed for filenames. This is a bug on both Unix and Windows.
1234 :
1235 : Up to \a dstlen bytes are written to \a dst, including a null
1236 : terminator. The return value is the number of bytes that would be
1237 : written, not counting the null terminator. If greater or equal to
1238 : \a dstlen then if you malloc a new array of size n+1 you will have
1239 : the space needed for the entire string. If \a dstlen is zero then
1240 : nothing is written and this call just measures the storage space
1241 : needed.
1242 :
1243 : If utf8locale() returns true then this does not change the data.
1244 : It is copied and truncated as necessary to
1245 : the destination buffer and \a srclen is always returned. */
1246 : unsigned utf8tomb(const char *src, unsigned srclen, char *dst, unsigned dstlen)
1247 : {
1248 : if (!utf8locale())
1249 : {
1250 : #ifdef _WIN32
1251 : wchar_t lbuf[1024] = {};
1252 : wchar_t *buf = lbuf;
1253 : unsigned length = utf8towc(src, srclen, buf, 1024);
1254 : unsigned ret;
1255 : if (length >= 1024)
1256 : {
1257 : buf =
1258 : static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t)));
1259 : utf8towc(src, srclen, buf, length + 1);
1260 : }
1261 : if (dstlen)
1262 : {
1263 : // apparently this does not null-terminate, even though msdn
1264 : // documentation claims it does:
1265 : ret = WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen, 0,
1266 : 0);
1267 : dst[ret] = 0;
1268 : }
1269 : // if it overflows or measuring length, get the actual length:
1270 : if (dstlen == 0 || ret >= dstlen - 1)
1271 : ret = WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
1272 : if (buf != lbuf)
1273 : free((void *)buf);
1274 : return ret;
1275 : #else
1276 : wchar_t lbuf[1024] = {};
1277 : wchar_t *buf = lbuf;
1278 : unsigned length = utf8towc(src, srclen, buf, 1024);
1279 : if (length >= 1024)
1280 : {
1281 : buf =
1282 : static_cast<wchar_t *>(malloc((length + 1) * sizeof(wchar_t)));
1283 : utf8towc(src, srclen, buf, length + 1);
1284 : }
1285 : int ret = 0;
1286 : if (dstlen)
1287 : {
1288 : ret = wcstombs(dst, buf, dstlen);
1289 : if (ret >= dstlen - 1)
1290 : ret = wcstombs(0, buf, 0);
1291 : }
1292 : else
1293 : {
1294 : ret = wcstombs(0, buf, 0);
1295 : }
1296 : if (buf != lbuf)
1297 : free((void *)buf);
1298 : if (ret >= 0)
1299 : return (unsigned)ret;
1300 : // On any errors we return the UTF-8 as raw text...
1301 : #endif
1302 : }
1303 : // Identity transform:
1304 : if (srclen < dstlen)
1305 : {
1306 : memcpy(dst, src, srclen);
1307 : dst[srclen] = 0;
1308 : }
1309 : else
1310 : {
1311 : memcpy(dst, src, dstlen - 1);
1312 : dst[dstlen - 1] = 0;
1313 : }
1314 : return srclen;
1315 : }
1316 :
1317 : /*! Convert a filename from the locale-specific multibyte encoding
1318 : used by Windows to UTF-8 as used by FLTK.
1319 :
1320 : Up to \a dstlen bytes are written to \a dst, including a null
1321 : terminator. The return value is the number of bytes that would be
1322 : written, not counting the null terminator. If greater or equal to
1323 : \a dstlen then if you malloc a new array of size n+1 you will have
1324 : the space needed for the entire string. If \a dstlen is zero then
1325 : nothing is written and this call just measures the storage space
1326 : needed.
1327 :
1328 : On Unix or on Windows when a UTF-8 locale is in effect, this
1329 : does not change the data. It is copied and truncated as necessary to
1330 : the destination buffer and \a srclen is always returned.
1331 : You may also want to check if utf8test() returns non-zero, so that
1332 : the filesystem can store filenames in UTF-8 encoding regardless of
1333 : the locale.
1334 : */
1335 : unsigned utf8frommb(char *dst, unsigned dstlen, const char *src,
1336 : unsigned srclen)
1337 : {
1338 : if (!utf8locale())
1339 : {
1340 : #ifdef _WIN32
1341 : wchar_t lbuf[1024] = {};
1342 : wchar_t *buf = lbuf;
1343 : unsigned ret;
1344 : const unsigned length =
1345 : MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
1346 : if (length >= 1024)
1347 : {
1348 : length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);
1349 : buf = static_cast<wchar_t *>(malloc(length * sizeof(wchar_t)));
1350 : MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);
1351 : }
1352 : ret = utf8fromwc(dst, dstlen, buf, length);
1353 : if (buf != lbuf)
1354 : free(buf);
1355 : return ret;
1356 : #else
1357 : wchar_t lbuf[1024] = {};
1358 : wchar_t *buf = lbuf;
1359 : const int length = mbstowcs(buf, src, 1024);
1360 : if (length >= 1024)
1361 : {
1362 : length = mbstowcs(0, src, 0) + 1;
1363 : buf =
1364 : static_cast<wchar_t *>(malloc(length * sizeof(unsigned short)));
1365 : mbstowcs(buf, src, length);
1366 : }
1367 : if (length >= 0)
1368 : {
1369 : const unsigned ret = utf8fromwc(dst, dstlen, buf, length);
1370 : if (buf != lbuf)
1371 : free(buf);
1372 : return ret;
1373 : }
1374 : // Errors in conversion return the UTF-8 unchanged.
1375 : #endif
1376 : }
1377 : // Identity transform:
1378 : if (srclen < dstlen)
1379 : {
1380 : memcpy(dst, src, srclen);
1381 : dst[srclen] = 0;
1382 : }
1383 : else
1384 : {
1385 : memcpy(dst, src, dstlen - 1);
1386 : dst[dstlen - 1] = 0;
1387 : }
1388 : return srclen;
1389 : }
1390 :
1391 : #endif // def notdef - disabled locale specific stuff.
1392 :
1393 : /*! Examines the first \a srclen bytes in \a src and returns a verdict
1394 : on whether it is UTF-8 or not.
1395 : - Returns 0 if there is a NUL byte or any illegal UTF-8 sequence, using
1396 : the same rules as utf8decode(). Note that some UCS values considered
1397 : illegal by RFC 3629, such as 0xffff, are considered legal by this.
1398 : - Returns 1 if there are only single-byte characters (i.e. no bytes
1399 : have the high bit set). This is legal UTF-8, but also indicates
1400 : plain ASCII. It also returns 1 if \a srclen is zero.
1401 : - Returns 2 if there are only characters less than 0x800.
1402 : - Returns 3 if there are only characters less than 0x10000.
1403 : - Returns 4 if there are characters in the 0x10000 to 0x10ffff range.
1404 :
1405 : Because there are many illegal sequences in UTF-8, it is almost
1406 : impossible for a string in another encoding to be confused with
1407 : UTF-8. This is very useful for transitioning Unix to UTF-8
1408 : filenames, you can simply test each filename with this to decide
1409 : if it is UTF-8 or in the locale encoding. My hope is that if
1410 : this is done we will be able to cleanly transition to a locale-less
1411 : encoding.
1412 : */
1413 :
1414 14858 : static int utf8test(const char *src, unsigned srclen)
1415 : {
1416 14858 : int ret = 1;
1417 14858 : const char *p = src;
1418 14858 : const char *e = src + srclen;
1419 1801830 : while (p < e)
1420 : {
1421 1787020 : if (*p == 0)
1422 0 : return 0;
1423 1787020 : if (*p & 0x80)
1424 : {
1425 1602 : int len = 0;
1426 1602 : utf8decode(p, e, &len);
1427 1602 : if (len < 2)
1428 52 : return 0;
1429 1550 : if (len > ret)
1430 552 : ret = len;
1431 1550 : p += len;
1432 : }
1433 : else
1434 : {
1435 1785420 : p++;
1436 : }
1437 : }
1438 14806 : return ret;
1439 : }
|