Line data Source code
1 : /**********************************************************************
2 : *
3 : * Name: cpl_recode.cpp
4 : * Project: CPL - Common Portability Library
5 : * Purpose: Character set recoding and char/wchar_t conversions.
6 : * Author: Andrey Kiselev, dron@ak4719.spb.edu
7 : *
8 : **********************************************************************
9 : * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
10 : * Copyright (c) 2008, Frank Warmerdam
11 : * Copyright (c) 2011-2014, Even Rouault <even dot rouault at spatialys.com>
12 : *
13 : * Permission to use, copy, modify, and distribute this software for any
14 : * purpose with or without fee is hereby granted, provided that the above
15 : * copyright notice and this permission notice appear in all copies.
16 : *
17 : * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
18 : * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
19 : * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
20 : * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
21 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
22 : * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
23 : * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24 : **********************************************************************/
25 :
26 : #include "cpl_port.h"
27 : #include "cpl_string.h"
28 :
29 : #include <cstring>
30 :
31 : #include "cpl_conv.h"
32 : #include "cpl_character_sets.h"
33 :
34 : #include "utf8.h"
35 :
36 : #ifdef CPL_RECODE_ICONV
37 : extern void CPLClearRecodeIconvWarningFlags();
38 : extern char *CPLRecodeIconv(const char *, const char *,
39 : const char *) CPL_RETURNS_NONNULL;
40 : extern char *CPLRecodeFromWCharIconv(const wchar_t *, const char *,
41 : const char *);
42 : extern wchar_t *CPLRecodeToWCharIconv(const char *, const char *, const char *);
43 : #endif // CPL_RECODE_ICONV
44 :
45 : extern void CPLClearRecodeStubWarningFlags();
46 : extern char *CPLRecodeStub(const char *, const char *,
47 : const char *) CPL_RETURNS_NONNULL;
48 : extern char *CPLRecodeFromWCharStub(const wchar_t *, const char *,
49 : const char *);
50 : extern wchar_t *CPLRecodeToWCharStub(const char *, const char *, const char *);
51 : extern int CPLIsUTF8Stub(const char *, int);
52 :
53 : /************************************************************************/
54 : /* CPLRecode() */
55 : /************************************************************************/
56 :
57 : /**
58 : * Convert a string from a source encoding to a destination encoding.
59 : *
60 : * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
61 : * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
62 : * <ul>
63 : * <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in
64 : * fact)</li>
65 : * <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
66 : * <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
67 : * </ul>
68 : *
69 : * If an error occurs an error may, or may not be posted with CPLError().
70 : *
71 : * @param pszSource a NULL terminated string.
72 : * @param pszSrcEncoding the source encoding.
73 : * @param pszDstEncoding the destination encoding.
74 : *
75 : * @return a NULL terminated string which should be freed with CPLFree().
76 : *
77 : */
78 :
79 1245990 : char CPL_DLL *CPLRecode(const char *pszSource, const char *pszSrcEncoding,
80 : const char *pszDstEncoding)
81 :
82 : {
83 : /* -------------------------------------------------------------------- */
84 : /* Handle a few common short cuts. */
85 : /* -------------------------------------------------------------------- */
86 1245990 : if (EQUAL(pszSrcEncoding, pszDstEncoding))
87 256 : return CPLStrdup(pszSource);
88 :
89 1245740 : if (EQUAL(pszSrcEncoding, CPL_ENC_ASCII) &&
90 0 : (EQUAL(pszDstEncoding, CPL_ENC_UTF8) ||
91 0 : EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1)))
92 0 : return CPLStrdup(pszSource);
93 :
94 : // A few hard coded CPxxx/ISO-8859-x to UTF-8 tables
95 2441620 : if (EQUAL(pszDstEncoding, CPL_ENC_UTF8) &&
96 1195880 : CPLGetConversionTableToUTF8(pszSrcEncoding))
97 : {
98 24593 : return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
99 : }
100 :
101 : #ifdef CPL_RECODE_ICONV
102 : /* -------------------------------------------------------------------- */
103 : /* CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8 */
104 : /* and CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1 conversions are handled */
105 : /* very well by the stub implementation which is faster than the */
106 : /* iconv() route. Use a stub for these two ones and iconv() */
107 : /* everything else. */
108 : /* -------------------------------------------------------------------- */
109 1221140 : if ((EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1) &&
110 1171150 : EQUAL(pszDstEncoding, CPL_ENC_UTF8)) ||
111 49991 : (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) &&
112 49854 : EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1)))
113 : {
114 1218730 : return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
115 : }
116 : #ifdef _WIN32
117 : else if (((EQUAL(pszSrcEncoding, "CP_ACP") ||
118 : EQUAL(pszSrcEncoding, "CP_OEMCP")) &&
119 : EQUAL(pszDstEncoding, CPL_ENC_UTF8)) ||
120 : (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) &&
121 : (EQUAL(pszDstEncoding, "CP_ACP") ||
122 : EQUAL(pszDstEncoding, "CP_OEMCP"))))
123 : {
124 : return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
125 : }
126 : #endif
127 : else
128 : {
129 2414 : return CPLRecodeIconv(pszSource, pszSrcEncoding, pszDstEncoding);
130 : }
131 : #else // CPL_RECODE_STUB
132 : return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
133 : #endif // CPL_RECODE_ICONV
134 : }
135 :
136 : /************************************************************************/
137 : /* CPLRecodeFromWChar() */
138 : /************************************************************************/
139 :
140 : /**
141 : * Convert wchar_t string to UTF-8.
142 : *
143 : * Convert a wchar_t string into a multibyte utf-8 string. The only
144 : * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
145 : * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
146 : * and CPL_ENC_ISO8859_1. In some cases (i.e. using iconv()) other encodings
147 : * may also be supported.
148 : *
149 : * Note that the wchar_t type varies in size on different systems. On
150 : * win32 it is normally 2 bytes, and on UNIX 4 bytes.
151 : *
152 : * If an error occurs an error may, or may not be posted with CPLError().
153 : *
154 : * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
155 : * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
156 : * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
157 : *
158 : * @return a zero terminated multi-byte string which should be freed with
159 : * CPLFree(), or NULL if an error occurs.
160 : *
161 : */
162 :
163 130853 : char CPL_DLL *CPLRecodeFromWChar(const wchar_t *pwszSource,
164 : const char *pszSrcEncoding,
165 : const char *pszDstEncoding)
166 :
167 : {
168 : #ifdef CPL_RECODE_ICONV
169 : /* -------------------------------------------------------------------- */
170 : /* Conversions from CPL_ENC_UCS2 */
171 : /* to CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */
172 : /* handled by the stub implementation. */
173 : /* -------------------------------------------------------------------- */
174 130853 : if ((EQUAL(pszSrcEncoding, CPL_ENC_UCS2) ||
175 1663 : EQUAL(pszSrcEncoding, "WCHAR_T")) &&
176 130852 : (EQUAL(pszDstEncoding, CPL_ENC_UTF8) ||
177 0 : EQUAL(pszDstEncoding, CPL_ENC_ASCII) ||
178 0 : EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1)))
179 : {
180 130852 : return CPLRecodeFromWCharStub(pwszSource, pszSrcEncoding,
181 130852 : pszDstEncoding);
182 : }
183 :
184 1 : return CPLRecodeFromWCharIconv(pwszSource, pszSrcEncoding, pszDstEncoding);
185 :
186 : #else // CPL_RECODE_STUB
187 : return CPLRecodeFromWCharStub(pwszSource, pszSrcEncoding, pszDstEncoding);
188 : #endif // CPL_RECODE_ICONV
189 : }
190 :
191 : /************************************************************************/
192 : /* CPLRecodeToWChar() */
193 : /************************************************************************/
194 :
195 : /**
196 : * Convert UTF-8 string to a wchar_t string.
197 : *
198 : * Convert a 8bit, multi-byte per character input string into a wide
199 : * character (wchar_t) string. The only guaranteed supported source encodings
200 : * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8869_1 (LATIN1). The only
201 : * guaranteed supported destination encoding is CPL_ENC_UCS2. Other source
202 : * and destination encodings may be supported depending on the underlying
203 : * implementation.
204 : *
205 : * Note that the wchar_t type varies in size on different systems. On
206 : * win32 it is normally 2 bytes, and on UNIX 4 bytes.
207 : *
208 : * If an error occurs an error may, or may not be posted with CPLError().
209 : *
210 : * @param pszSource input multi-byte character string.
211 : * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
212 : * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
213 : *
214 : * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
215 : * NULL on error.
216 : *
217 : */
218 :
219 41083 : wchar_t CPL_DLL *CPLRecodeToWChar(const char *pszSource,
220 : const char *pszSrcEncoding,
221 : const char *pszDstEncoding)
222 :
223 : {
224 : #ifdef CPL_RECODE_ICONV
225 : /* -------------------------------------------------------------------- */
226 : /* Conversions to CPL_ENC_UCS2 */
227 : /* from CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */
228 : /* handled by the stub implementation. */
229 : /* -------------------------------------------------------------------- */
230 41083 : if ((EQUAL(pszDstEncoding, CPL_ENC_UCS2) ||
231 0 : EQUAL(pszDstEncoding, "WCHAR_T")) &&
232 41083 : (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) ||
233 0 : EQUAL(pszSrcEncoding, CPL_ENC_ASCII) ||
234 0 : EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1)))
235 : {
236 41083 : return CPLRecodeToWCharStub(pszSource, pszSrcEncoding, pszDstEncoding);
237 : }
238 :
239 0 : return CPLRecodeToWCharIconv(pszSource, pszSrcEncoding, pszDstEncoding);
240 :
241 : #else // CPL_RECODE_STUB
242 : return CPLRecodeToWCharStub(pszSource, pszSrcEncoding, pszDstEncoding);
243 : #endif // CPL_RECODE_ICONV
244 : }
245 :
246 : /************************************************************************/
247 : /* CPLIsASCII() */
248 : /************************************************************************/
249 :
/**
 * Test if a string is encoded as ASCII.
 *
 * A buffer is considered ASCII when none of its bytes has a value above 127.
 *
 * @param pabyData input string to test
 * @param nLen length of the input string, or -1 (i.e. static_cast<size_t>(-1))
 *             if the function must compute the string length, in which case
 *             the input must be nul-terminated.
 * @return true if the string is encoded as ASCII. false otherwise
 *
 * @since GDAL 3.6.0
 */
bool CPLIsASCII(const char *pabyData, size_t nLen)
{
    // The all-ones length is the "measure it yourself" sentinel.
    const size_t nByteCount =
        (nLen == static_cast<size_t>(-1)) ? strlen(pabyData) : nLen;

    const unsigned char *pabyCur =
        reinterpret_cast<const unsigned char *>(pabyData);
    const unsigned char *const pabyStop = pabyCur + nByteCount;
    for (; pabyCur != pabyStop; ++pabyCur)
    {
        // Any byte with the high bit set (> 127) is outside ASCII.
        if ((*pabyCur & 0x80) != 0)
            return false;
    }
    return true;
}
271 :
272 : /************************************************************************/
273 : /* CPLForceToASCII() */
274 : /************************************************************************/
275 :
276 : /**
277 : * Return a new string that is made only of ASCII characters. If non-ASCII
278 : * characters are found in the input string, they will be replaced by the
279 : * provided replacement character.
280 : *
281 : * This function does not make any assumption on the encoding of the input
282 : * string (except it must be nul-terminated if nLen equals -1, or have at
283 : * least nLen bytes otherwise). CPLUTF8ForceToASCII() can be used instead when
284 : * the input string is known to be UTF-8 encoded.
285 : *
286 : * @param pabyData input string to test
287 : * @param nLen length of the input string, or -1 if the function must compute
288 : * the string length. In which case it must be null terminated.
289 :
290 : * @param chReplacementChar character which will be used when the input stream
291 : * contains a non ASCII character. Must be valid ASCII!
292 : *
293 : * @return a new string that must be freed with CPLFree().
294 : *
295 : */
296 5 : char *CPLForceToASCII(const char *pabyData, int nLen, char chReplacementChar)
297 : {
298 5 : const size_t nRealLen =
299 5 : (nLen >= 0) ? static_cast<size_t>(nLen) : strlen(pabyData);
300 5 : char *pszOutputString = static_cast<char *>(CPLMalloc(nRealLen + 1));
301 5 : const char *pszPtr = pabyData;
302 5 : const char *pszEnd = pabyData + nRealLen;
303 5 : size_t i = 0;
304 19 : while (pszPtr != pszEnd)
305 : {
306 14 : if (*reinterpret_cast<const unsigned char *>(pszPtr) > 127)
307 : {
308 3 : pszOutputString[i] = chReplacementChar;
309 3 : ++pszPtr;
310 3 : ++i;
311 : }
312 : else
313 : {
314 11 : pszOutputString[i] = *pszPtr;
315 11 : ++pszPtr;
316 11 : ++i;
317 : }
318 : }
319 5 : pszOutputString[i] = '\0';
320 5 : return pszOutputString;
321 : }
322 :
323 : /************************************************************************/
324 : /* CPLUTF8ForceToASCII() */
325 : /************************************************************************/
326 :
327 : /**
328 : * Return a new string that is made only of ASCII characters. If non-ASCII
329 : * characters are found in the input string, for which an "equivalent" ASCII
330 : * character is not found, they will be replaced by the provided replacement
331 : * character.
332 : *
333 : * This function is aware of https://en.wikipedia.org/wiki/Latin-1_Supplement
334 : * and https://en.wikipedia.org/wiki/Latin_Extended-A to provide sensible
335 : * replacements for accented characters.
336 :
337 : * @param pszStr NUL-terminated UTF-8 string.
338 : * @param chReplacementChar character which will be used when the input stream
339 : * contains a non ASCII character that cannot be
340 : * substituted with an equivalent ASCII character.
341 : * Must be valid ASCII!
342 : *
343 : * @return a new string that must be freed with CPLFree().
344 : *
345 : * @since GDAL 3.9
346 : */
347 17 : char *CPLUTF8ForceToASCII(const char *pszStr, char chReplacementChar)
348 : {
349 : static const struct
350 : {
351 : short nCodePoint;
352 : char chFirst;
353 : char chSecond;
354 : } aLatinCharacters[] = {
355 : // https://en.wikipedia.org/wiki/Latin-1_Supplement
356 : {0xC0, 'A', 0}, // Latin Capital Letter A with grave
357 : {0xC1, 'A', 0}, // Latin Capital letter A with acute
358 : {0xC2, 'A', 0}, // Latin Capital letter A with circumflex
359 : {0xC3, 'A', 0}, // Latin Capital letter A with tilde
360 : {0xC4, 'A', 0}, // Latin Capital letter A with diaeresis
361 : {0xC5, 'A', 0}, // Latin Capital letter A with ring above
362 : {0xC6, 'A', 'E'}, // Latin Capital letter AE
363 : {0xC7, 'C', 0}, // Latin Capital letter C with cedilla
364 : {0xC8, 'E', 0}, // Latin Capital letter E with grave
365 : {0xC9, 'E', 0}, // Latin Capital letter E with acute
366 : {0xCA, 'E', 0}, // Latin Capital letter E with circumflex
367 : {0xCB, 'E', 0}, // Latin Capital letter E with diaeresis
368 : {0xCC, 'I', 0}, // Latin Capital letter I with grave
369 : {0xCD, 'I', 0}, // Latin Capital letter I with acute
370 : {0xCE, 'I', 0}, // Latin Capital letter I with circumflex
371 : {0xCF, 'I', 0}, // Latin Capital letter I with diaeresis
372 : // { 0xD0, '?', 0 }, // Latin Capital letter Eth
373 : {0xD1, 'N', 0}, // Latin Capital letter N with tilde
374 : {0xD2, 'O', 0}, // Latin Capital letter O with grave
375 : {0xD3, 'O', 0}, // Latin Capital letter O with acute
376 : {0xD4, 'O', 0}, // Latin Capital letter O with circumflex
377 : {0xD5, 'O', 0}, // Latin Capital letter O with tilde
378 : {0xD6, 'O', 0}, // Latin Capital letter O with diaeresis
379 : {0xD8, 'O', 0}, // Latin Capital letter O with stroke
380 : {0xD9, 'U', 0}, // Latin Capital letter U with grave
381 : {0xDA, 'U', 0}, // Latin Capital letter U with acute
382 : {0xDB, 'U', 0}, // Latin Capital Letter U with circumflex
383 : {0xDC, 'U', 0}, // Latin Capital Letter U with diaeresis
384 : {0xDD, 'Y', 0}, // Latin Capital Letter Y with acute
385 : // { 0xDE, '?', 0 }, // Latin Capital Letter Thorn
386 : {0xDF, 'S', 'S'}, // Latin Small Letter sharp S
387 : {0xE0, 'a', 0}, // Latin Small Letter A with grave
388 : {0xE1, 'a', 0}, // Latin Small Letter A with acute
389 : {0xE2, 'a', 0}, // Latin Small Letter A with circumflex
390 : {0xE3, 'a', 0}, // Latin Small Letter A with tilde
391 : {0xE4, 'a', 0}, // Latin Small Letter A with diaeresis
392 : {0xE5, 'a', 0}, // Latin Small Letter A with ring above
393 : {0xE6, 'a', 'e'}, // Latin Small Letter AE
394 : {0xE7, 'c', 0}, // Latin Small Letter C with cedilla
395 : {0xE8, 'e', 0}, // Latin Small Letter E with grave
396 : {0xE9, 'e', 0}, // Latin Small Letter E with acute
397 : {0xEA, 'e', 0}, // Latin Small Letter E with circumflex
398 : {0xEB, 'e', 0}, // Latin Small Letter E with diaeresis
399 : {0xEC, 'i', 0}, // Latin Small Letter I with grave
400 : {0xED, 'i', 0}, // Latin Small Letter I with acute
401 : {0xEE, 'i', 0}, // Latin Small Letter I with circumflex
402 : {0xEF, 'i', 0}, // Latin Small Letter I with diaeresis
403 : // { 0xF0, '?', 0 }, // Latin Small Letter Eth
404 : {0xF1, 'n', 0}, // Latin Small Letter N with tilde
405 : {0xF2, 'o', 0}, // Latin Small Letter O with grave
406 : {0xF3, 'o', 0}, // Latin Small Letter O with acute
407 : {0xF4, 'o', 0}, // Latin Small Letter O with circumflex
408 : {0xF5, 'o', 0}, // Latin Small Letter O with tilde
409 : {0xF6, 'o', 0}, // Latin Small Letter O with diaeresis
410 : {0xF8, 'o', 0}, // Latin Small Letter O with stroke
411 : {0xF9, 'u', 0}, // Latin Small Letter U with grave
412 : {0xFA, 'u', 0}, // Latin Small Letter U with acute
413 : {0xFB, 'u', 0}, // Latin Small Letter U with circumflex
414 : {0xFC, 'u', 0}, // Latin Small Letter U with diaeresis
415 : {0xFD, 'y', 0}, // Latin Small Letter Y with acute
416 : // { 0xFE, '?', 0 }, // Latin Small Letter Thorn
417 : {0xFF, 'u', 0}, // Latin Small Letter Y with diaeresis
418 :
419 : // https://en.wikipedia.org/wiki/Latin_Extended-A
420 : {
421 : 0x0100,
422 : 'A',
423 : 0,
424 : }, // Latin Capital letter A with macron
425 : {
426 : 0x0101,
427 : 'a',
428 : 0,
429 : }, // Latin Small letter A with macron
430 : {
431 : 0x0102,
432 : 'A',
433 : 0,
434 : }, // Latin Capital letter A with breve
435 : {
436 : 0x0103,
437 : 'a',
438 : 0,
439 : }, // Latin Small letter A with breve
440 : {
441 : 0x0104,
442 : 'A',
443 : 0,
444 : }, // Latin Capital letter A with ogonek
445 : {
446 : 0x0105,
447 : 'a',
448 : 0,
449 : }, // Latin Small letter A with ogonek
450 : {
451 : 0x0106,
452 : 'C',
453 : 0,
454 : }, // Latin Capital letter C with acute
455 : {
456 : 0x0107,
457 : 'c',
458 : 0,
459 : }, // Latin Small letter C with acute
460 : {
461 : 0x0108,
462 : 'C',
463 : 0,
464 : }, // Latin Capital letter C with circumflex
465 : {
466 : 0x0109,
467 : 'c',
468 : 0,
469 : }, // Latin Small letter C with circumflex
470 : {
471 : 0x010A,
472 : 'C',
473 : 0,
474 : }, // Latin Capital letter C with dot above
475 : {
476 : 0x010B,
477 : 'c',
478 : 0,
479 : }, // Latin Small letter C with dot above
480 : {
481 : 0x010C,
482 : 'C',
483 : 0,
484 : }, // Latin Capital letter C with caron
485 : {
486 : 0x010D,
487 : 'c',
488 : 0,
489 : }, // Latin Small letter C with caron
490 : {
491 : 0x010E,
492 : 'D',
493 : 0,
494 : }, // Latin Capital letter D with caron
495 : {
496 : 0x010F,
497 : 'd',
498 : 0,
499 : }, // Latin Small letter D with caron
500 : {
501 : 0x0110,
502 : 'D',
503 : 0,
504 : }, // Latin Capital letter D with stroke
505 : {
506 : 0x0111,
507 : 'd',
508 : 0,
509 : }, // Latin Small letter D with stroke
510 : {
511 : 0x0112,
512 : 'E',
513 : 0,
514 : }, // Latin Capital letter E with macron
515 : {
516 : 0x0113,
517 : 'e',
518 : 0,
519 : }, // Latin Small letter E with macron
520 : {
521 : 0x0114,
522 : 'E',
523 : 0,
524 : }, // Latin Capital letter E with breve
525 : {
526 : 0x0115,
527 : 'e',
528 : 0,
529 : }, // Latin Small letter E with breve
530 : {
531 : 0x0116,
532 : 'E',
533 : 0,
534 : }, // Latin Capital letter E with dot above
535 : {
536 : 0x0117,
537 : 'e',
538 : 0,
539 : }, // Latin Small letter E with dot above
540 : {
541 : 0x0118,
542 : 'E',
543 : 0,
544 : }, // Latin Capital letter E with ogonek
545 : {
546 : 0x0119,
547 : 'e',
548 : 0,
549 : }, // Latin Small letter E with ogonek
550 : {
551 : 0x011A,
552 : 'E',
553 : 0,
554 : }, // Latin Capital letter E with caron
555 : {
556 : 0x011B,
557 : 'e',
558 : 0,
559 : }, // Latin Small letter E with caron
560 : {
561 : 0x011C,
562 : 'G',
563 : 0,
564 : }, // Latin Capital letter G with circumflex
565 : {
566 : 0x011D,
567 : 'g',
568 : 0,
569 : }, // Latin Small letter G with circumflex
570 : {
571 : 0x011E,
572 : 'G',
573 : 0,
574 : }, // Latin Capital letter G with breve
575 : {
576 : 0x011F,
577 : 'g',
578 : 0,
579 : }, // Latin Small letter G with breve
580 : {
581 : 0x0120,
582 : 'G',
583 : 0,
584 : }, // Latin Capital letter G with dot above
585 : {
586 : 0x0121,
587 : 'g',
588 : 0,
589 : }, // Latin Small letter G with dot above
590 : {
591 : 0x0122,
592 : 'G',
593 : 0,
594 : }, // Latin Capital letter G with cedilla
595 : {
596 : 0x0123,
597 : 'g',
598 : 0,
599 : }, // Latin Small letter G with cedilla
600 : {
601 : 0x0124,
602 : 'H',
603 : 0,
604 : }, // Latin Capital letter H with circumflex
605 : {
606 : 0x0125,
607 : 'h',
608 : 0,
609 : }, // Latin Small letter H with circumflex
610 : {
611 : 0x0126,
612 : 'H',
613 : 0,
614 : }, // Latin Capital letter H with stroke
615 : {
616 : 0x0127,
617 : 'h',
618 : 0,
619 : }, // Latin Small letter H with stroke
620 : {
621 : 0x0128,
622 : 'I',
623 : 0,
624 : }, // Latin Capital letter I with tilde
625 : {
626 : 0x0129,
627 : 'i',
628 : 0,
629 : }, // Latin Small letter I with tilde
630 : {
631 : 0x012A,
632 : 'I',
633 : 0,
634 : }, // Latin Capital letter I with macron
635 : {
636 : 0x012B,
637 : 'i',
638 : 0,
639 : }, // Latin Small letter I with macron
640 : {
641 : 0x012C,
642 : 'I',
643 : 0,
644 : }, // Latin Capital letter I with breve
645 : {
646 : 0x012D,
647 : 'i',
648 : 0,
649 : }, // Latin Small letter I with breve
650 : {
651 : 0x012E,
652 : 'I',
653 : 0,
654 : }, // Latin Capital letter I with ogonek
655 : {
656 : 0x012F,
657 : 'i',
658 : 0,
659 : }, // Latin Small letter I with ogonek
660 : {
661 : 0x0130,
662 : 'I',
663 : 0,
664 : }, // Latin Capital letter I with dot above
665 : {
666 : 0x0131,
667 : 'i',
668 : 0,
669 : }, // Latin Small letter dotless I
670 : {
671 : 0x0132,
672 : 'I',
673 : 'J',
674 : }, // Latin Capital Ligature IJ
675 : {
676 : 0x0133,
677 : 'i',
678 : 'j',
679 : }, // Latin Small Ligature IJ
680 : {
681 : 0x0134,
682 : 'J',
683 : 0,
684 : }, // Latin Capital letter J with circumflex
685 : {
686 : 0x0135,
687 : 'j',
688 : 0,
689 : }, // Latin Small letter J with circumflex
690 : {
691 : 0x0136,
692 : 'K',
693 : 0,
694 : }, // Latin Capital letter K with cedilla
695 : {
696 : 0x0137,
697 : 'k',
698 : 0,
699 : }, // Latin Small letter K with cedilla
700 : {
701 : 0x0138,
702 : 'k',
703 : 0,
704 : }, // Latin Small letter Kra
705 : {
706 : 0x0139,
707 : 'L',
708 : 0,
709 : }, // Latin Capital letter L with acute
710 : {
711 : 0x013A,
712 : 'l',
713 : 0,
714 : }, // Latin Small letter L with acute
715 : {
716 : 0x013B,
717 : 'L',
718 : 0,
719 : }, // Latin Capital letter L with cedilla
720 : {
721 : 0x013C,
722 : 'l',
723 : 0,
724 : }, // Latin Small letter L with cedilla
725 : {
726 : 0x013D,
727 : 'L',
728 : 0,
729 : }, // Latin Capital letter L with caron
730 : {
731 : 0x013E,
732 : 'l',
733 : 0,
734 : }, // Latin Small letter L with caron
735 : {
736 : 0x013F,
737 : 'L',
738 : 0,
739 : }, // Latin Capital letter L with middle dot
740 : {
741 : 0x0140,
742 : 'l',
743 : 0,
744 : }, // Latin Small letter L with middle dot
745 : {
746 : 0x0141,
747 : 'L',
748 : 0,
749 : }, // Latin Capital letter L with stroke
750 : {
751 : 0x0142,
752 : 'l',
753 : 0,
754 : }, // Latin Small letter L with stroke
755 : {
756 : 0x0143,
757 : 'N',
758 : 0,
759 : }, // Latin Capital letter N with acute
760 : {
761 : 0x0144,
762 : 'n',
763 : 0,
764 : }, // Latin Small letter N with acute
765 : {
766 : 0x0145,
767 : 'N',
768 : 0,
769 : }, // Latin Capital letter N with cedilla
770 : {
771 : 0x0146,
772 : 'n',
773 : 0,
774 : }, // Latin Small letter N with cedilla
775 : {
776 : 0x0147,
777 : 'N',
778 : 0,
779 : }, // Latin Capital letter N with caron
780 : {
781 : 0x0148,
782 : 'n',
783 : 0,
784 : }, // Latin Small letter N with caron
785 : // { 0x014A , '?' , 0, }, // Latin Capital letter Eng
786 : // { 0x014B , '?' , 0, }, // Latin Small letter Eng
787 : {
788 : 0x014C,
789 : 'O',
790 : 0,
791 : }, // Latin Capital letter O with macron
792 : {
793 : 0x014D,
794 : 'o',
795 : 0,
796 : }, // Latin Small letter O with macron
797 : {
798 : 0x014E,
799 : 'O',
800 : 0,
801 : }, // Latin Capital letter O with breve
802 : {
803 : 0x014F,
804 : 'o',
805 : 0,
806 : }, // Latin Small letter O with breve
807 : {
808 : 0x0150,
809 : 'O',
810 : 0,
811 : }, // Latin Capital Letter O with double acute
812 : {
813 : 0x0151,
814 : 'o',
815 : 0,
816 : }, // Latin Small Letter O with double acute
817 : {
818 : 0x0152,
819 : 'O',
820 : 'E',
821 : }, // Latin Capital Ligature OE
822 : {
823 : 0x0153,
824 : 'o',
825 : 'e',
826 : }, // Latin Small Ligature OE
827 : {
828 : 0x0154,
829 : 'R',
830 : 0,
831 : }, // Latin Capital letter R with acute
832 : {
833 : 0x0155,
834 : 'r',
835 : 0,
836 : }, // Latin Small letter R with acute
837 : {
838 : 0x0156,
839 : 'R',
840 : 0,
841 : }, // Latin Capital letter R with cedilla
842 : {
843 : 0x0157,
844 : 'r',
845 : 0,
846 : }, // Latin Small letter R with cedilla
847 : {
848 : 0x0158,
849 : 'R',
850 : 0,
851 : }, // Latin Capital letter R with caron
852 : {
853 : 0x0159,
854 : 'r',
855 : 0,
856 : }, // Latin Small letter R with caron
857 : {
858 : 0x015A,
859 : 'S',
860 : 0,
861 : }, // Latin Capital letter S with acute
862 : {
863 : 0x015B,
864 : 's',
865 : 0,
866 : }, // Latin Small letter S with acute
867 : {
868 : 0x015C,
869 : 'S',
870 : 0,
871 : }, // Latin Capital letter S with circumflex
872 : {
873 : 0x015D,
874 : 's',
875 : 0,
876 : }, // Latin Small letter S with circumflex
877 : {
878 : 0x015E,
879 : 'S',
880 : 0,
881 : }, // Latin Capital letter S with cedilla
882 : {
883 : 0x015F,
884 : 's',
885 : 0,
886 : }, // Latin Small letter S with cedilla
887 : {
888 : 0x0160,
889 : 'S',
890 : 0,
891 : }, // Latin Capital letter S with caron
892 : {
893 : 0x0161,
894 : 's',
895 : 0,
896 : }, // Latin Small letter S with caron
897 : {
898 : 0x0162,
899 : 'T',
900 : 0,
901 : }, // Latin Capital letter T with cedilla
902 : {
903 : 0x0163,
904 : 't',
905 : 0,
906 : }, // Latin Small letter T with cedilla
907 : {
908 : 0x0164,
909 : 'T',
910 : 0,
911 : }, // Latin Capital letter T with caron
912 : {
913 : 0x0165,
914 : 't',
915 : 0,
916 : }, // Latin Small letter T with caron
917 : {
918 : 0x0166,
919 : 'T',
920 : 0,
921 : }, // Latin Capital letter T with stroke
922 : {
923 : 0x0167,
924 : 't',
925 : 0,
926 : }, // Latin Small letter T with stroke
927 : {
928 : 0x0168,
929 : 'U',
930 : 0,
931 : }, // Latin Capital letter U with tilde
932 : {
933 : 0x0169,
934 : 'u',
935 : 0,
936 : }, // Latin Small letter U with tilde
937 : {
938 : 0x016A,
939 : 'U',
940 : 0,
941 : }, // Latin Capital letter U with macron
942 : {
943 : 0x016B,
944 : 'u',
945 : 0,
946 : }, // Latin Small letter U with macron
947 : {
948 : 0x016C,
949 : 'U',
950 : 0,
951 : }, // Latin Capital letter U with breve
952 : {
953 : 0x016D,
954 : 'u',
955 : 0,
956 : }, // Latin Small letter U with breve
957 : {
958 : 0x016E,
959 : 'U',
960 : 0,
961 : }, // Latin Capital letter U with ring above
962 : {
963 : 0x016F,
964 : 'u',
965 : 0,
966 : }, // Latin Small letter U with ring above
967 : {
968 : 0x0170,
969 : 'U',
970 : 0,
971 : }, // Latin Capital Letter U with double acute
972 : {
973 : 0x0171,
974 : 'u',
975 : 0,
976 : }, // Latin Small Letter U with double acute
977 : {
978 : 0x0172,
979 : 'U',
980 : 0,
981 : }, // Latin Capital letter U with ogonek
982 : {
983 : 0x0173,
984 : 'u',
985 : 0,
986 : }, // Latin Small letter U with ogonek
987 : {
988 : 0x0174,
989 : 'W',
990 : 0,
991 : }, // Latin Capital letter W with circumflex
992 : {
993 : 0x0175,
994 : 'w',
995 : 0,
996 : }, // Latin Small letter W with circumflex
997 : {
998 : 0x0176,
999 : 'Y',
1000 : 0,
1001 : }, // Latin Capital letter Y with circumflex
1002 : {
1003 : 0x0177,
1004 : 'y',
1005 : 0,
1006 : }, // Latin Small letter Y with circumflex
1007 : {
1008 : 0x0178,
1009 : 'Y',
1010 : 0,
1011 : }, // Latin Capital letter Y with diaeresis
1012 : {
1013 : 0x0179,
1014 : 'Z',
1015 : 0,
1016 : }, // Latin Capital letter Z with acute
1017 : {
1018 : 0x017A,
1019 : 'z',
1020 : 0,
1021 : }, // Latin Small letter Z with acute
1022 : {
1023 : 0x017B,
1024 : 'Z',
1025 : 0,
1026 : }, // Latin Capital letter Z with dot above
1027 : {
1028 : 0x017C,
1029 : 'z',
1030 : 0,
1031 : }, // Latin Small letter Z with dot above
1032 : {
1033 : 0x017D,
1034 : 'Z',
1035 : 0,
1036 : }, // Latin Capital letter Z with caron
1037 : {
1038 : 0x017E,
1039 : 'z',
1040 : 0,
1041 : }, // Latin Small letter Z with caron
1042 : };
1043 :
1044 17 : const size_t nLen = strlen(pszStr);
1045 17 : char *pszOutputString = static_cast<char *>(CPLMalloc(nLen + 1));
1046 17 : const char *pszPtr = pszStr;
1047 17 : const char *pszEnd = pszStr + nLen;
1048 17 : size_t i = 0;
1049 255 : while (pszPtr != pszEnd)
1050 : {
1051 240 : if (*reinterpret_cast<const unsigned char *>(pszPtr) > 127)
1052 : {
1053 : utf8_int32_t codepoint;
1054 190 : if (pszPtr + utf8codepointcalcsize(
1055 190 : reinterpret_cast<const utf8_int8_t *>(pszPtr)) >
1056 : pszEnd)
1057 2 : break;
1058 188 : auto pszNext = reinterpret_cast<const char *>(utf8codepoint(
1059 : reinterpret_cast<const utf8_int8_t *>(pszPtr), &codepoint));
1060 188 : char ch = chReplacementChar;
1061 17075 : for (const auto &latin1char : aLatinCharacters)
1062 : {
1063 17073 : if (codepoint == latin1char.nCodePoint)
1064 : {
1065 186 : pszOutputString[i] = latin1char.chFirst;
1066 186 : ++i;
1067 186 : if (latin1char.chSecond)
1068 : {
1069 7 : pszOutputString[i] = latin1char.chSecond;
1070 7 : ++i;
1071 : }
1072 186 : ch = 0;
1073 186 : break;
1074 : }
1075 : }
1076 188 : if (ch)
1077 : {
1078 2 : pszOutputString[i] = ch;
1079 2 : ++i;
1080 : }
1081 188 : pszPtr = pszNext;
1082 : }
1083 : else
1084 : {
1085 50 : pszOutputString[i] = *pszPtr;
1086 50 : ++pszPtr;
1087 50 : ++i;
1088 : }
1089 : }
1090 17 : pszOutputString[i] = '\0';
1091 17 : return pszOutputString;
1092 : }
1093 :
1094 : /************************************************************************/
1095 : /* CPLEncodingCharSize() */
1096 : /************************************************************************/
1097 :
1098 : /**
1099 : * Return bytes per character for encoding.
1100 : *
1101 : * This function returns the size in bytes of the smallest character
1102 : * in this encoding. For fixed width encodings (ASCII, UCS-2, UCS-4) this
1103 : * is straight forward. For encodings like UTF8 and UTF16 which represent
1104 : * some characters as a sequence of atomic character sizes the function
1105 : * still returns the atomic character size (1 for UTF8, 2 for UTF16).
1106 : *
1107 : * This function will return the correct value for well known encodings
1108 : * with corresponding CPL_ENC_ values. It may not return the correct value
1109 : * for other encodings even if they are supported by the underlying iconv
1110 : * or windows transliteration services. Hopefully it will improve over time.
1111 : *
1112 : * @param pszEncoding the name of the encoding.
1113 : *
1114 : * @return the size of a minimal character in bytes or -1 if the size is
1115 : * unknown.
1116 : */
1117 :
1118 1 : int CPLEncodingCharSize(const char *pszEncoding)
1119 :
1120 : {
1121 1 : if (EQUAL(pszEncoding, CPL_ENC_UTF8))
1122 0 : return 1;
1123 1 : else if (EQUAL(pszEncoding, CPL_ENC_UTF16) ||
1124 1 : EQUAL(pszEncoding, "UTF-16LE"))
1125 1 : return 2;
1126 0 : else if (EQUAL(pszEncoding, CPL_ENC_UCS2) || EQUAL(pszEncoding, "UCS-2LE"))
1127 0 : return 2;
1128 0 : else if (EQUAL(pszEncoding, CPL_ENC_UCS4))
1129 0 : return 4;
1130 0 : else if (EQUAL(pszEncoding, CPL_ENC_ASCII))
1131 0 : return 1;
1132 0 : else if (STARTS_WITH_CI(pszEncoding, "ISO-8859-"))
1133 0 : return 1;
1134 :
1135 0 : return -1;
1136 : }
1137 :
1138 : /************************************************************************/
1139 : /* CPLClearRecodeWarningFlags() */
1140 : /************************************************************************/
1141 :
1142 13601 : void CPLClearRecodeWarningFlags()
1143 : {
1144 : #ifdef CPL_RECODE_ICONV
1145 13601 : CPLClearRecodeIconvWarningFlags();
1146 : #endif
1147 13601 : CPLClearRecodeStubWarningFlags();
1148 13601 : }
1149 :
1150 : /************************************************************************/
1151 : /* CPLStrlenUTF8() */
1152 : /************************************************************************/
1153 :
1154 : /**
1155 : * Return the number of UTF-8 characters of a nul-terminated string.
1156 : *
1157 : * This is different from strlen() which returns the number of bytes.
1158 : *
1159 : * @param pszUTF8Str a nul-terminated UTF-8 string
1160 : *
1161 : * @return the number of UTF-8 characters.
1162 : */
1163 :
1164 2 : int CPLStrlenUTF8(const char *pszUTF8Str)
1165 : {
1166 2 : int nCharacterCount = 0;
1167 7 : for (size_t i = 0; pszUTF8Str[i] != '\0'; ++i)
1168 : {
1169 5 : if ((pszUTF8Str[i] & 0xc0) != 0x80)
1170 : {
1171 4 : if (nCharacterCount == INT_MAX)
1172 : {
1173 0 : CPLError(CE_Failure, CPLE_AppDefined,
1174 : "CPLStrlenUTF8(): nCharacterCount > INT_MAX. Use "
1175 : "CPLStrlenUTF8Ex() instead");
1176 0 : break;
1177 : }
1178 4 : ++nCharacterCount;
1179 : }
1180 : }
1181 2 : return nCharacterCount;
1182 : }
1183 :
1184 : /************************************************************************/
1185 : /* CPLStrlenUTF8Ex() */
1186 : /************************************************************************/
1187 :
/**
 * Return the number of UTF-8 characters of a nul-terminated string.
 *
 * This is different from strlen() which returns the number of bytes:
 * every byte that is not a continuation byte (bit pattern 10xxxxxx) is
 * counted as the start of one character. The input is not validated.
 *
 * @param pszUTF8Str a nul-terminated UTF-8 string
 *
 * @return the number of UTF-8 characters.
 */

size_t CPLStrlenUTF8Ex(const char *pszUTF8Str)
{
    size_t nCount = 0;
    const char *pszIter = pszUTF8Str;
    while (*pszIter != '\0')
    {
        // Lead bytes and plain ASCII bytes add one; continuation bytes
        // (10xxxxxx) add nothing.
        nCount += ((*pszIter & 0xc0) != 0x80) ? 1 : 0;
        ++pszIter;
    }
    return nCount;
}
1210 :
1211 : /************************************************************************/
1212 : /* CPLCanRecode() */
1213 : /************************************************************************/
1214 :
1215 : /**
1216 : * Checks if it is possible to recode a string from one encoding to another.
1217 : *
1218 : * @param pszTestStr a NULL terminated string.
1219 : * @param pszSrcEncoding the source encoding.
1220 : * @param pszDstEncoding the destination encoding.
1221 : *
1222 : * @return a TRUE if recode is possible.
1223 : *
1224 : * @since GDAL 3.1.0
1225 : */
1226 8537 : int CPLCanRecode(const char *pszTestStr, const char *pszSrcEncoding,
1227 : const char *pszDstEncoding)
1228 : {
1229 8537 : CPLClearRecodeWarningFlags();
1230 8537 : CPLErrorReset();
1231 :
1232 8537 : CPLPushErrorHandler(CPLQuietErrorHandler);
1233 8537 : char *pszRec(CPLRecode(pszTestStr, pszSrcEncoding, pszDstEncoding));
1234 8537 : CPLPopErrorHandler();
1235 :
1236 8537 : if (pszRec == nullptr)
1237 : {
1238 0 : return FALSE;
1239 : }
1240 :
1241 8537 : CPLFree(pszRec);
1242 :
1243 8537 : if (CPLGetLastErrorType() != 0)
1244 : {
1245 1 : return FALSE;
1246 : }
1247 :
1248 8536 : return TRUE;
1249 : }
|