Line data Source code
1 : /**********************************************************************
2 : *
3 : * Name: cpl_recode.cpp
4 : * Project: CPL - Common Portability Library
5 : * Purpose: Character set recoding and char/wchar_t conversions.
6 : * Author: Andrey Kiselev, dron@ak4719.spb.edu
7 : *
8 : **********************************************************************
9 : * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
10 : * Copyright (c) 2008, Frank Warmerdam
11 : * Copyright (c) 2011-2014, Even Rouault <even dot rouault at spatialys.com>
12 : *
13 : * Permission to use, copy, modify, and distribute this software for any
14 : * purpose with or without fee is hereby granted, provided that the above
15 : * copyright notice and this permission notice appear in all copies.
16 : *
17 : * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
18 : * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
19 : * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
20 : * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
21 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
22 : * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
23 : * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24 : **********************************************************************/
25 :
26 : #include "cpl_port.h"
27 : #include "cpl_string.h"
28 :
29 : #include <cstring>
30 :
31 : #include "cpl_conv.h"
32 : #include "cpl_character_sets.h"
33 :
34 : #include "utf8.h"
35 :
36 : #ifdef CPL_RECODE_ICONV
37 : extern void CPLClearRecodeIconvWarningFlags();
38 : extern char *CPLRecodeIconv(const char *, const char *,
39 : const char *) CPL_RETURNS_NONNULL;
40 : extern char *CPLRecodeFromWCharIconv(const wchar_t *, const char *,
41 : const char *);
42 : extern wchar_t *CPLRecodeToWCharIconv(const char *, const char *, const char *);
43 : #endif // CPL_RECODE_ICONV
44 :
45 : extern void CPLClearRecodeStubWarningFlags();
46 : extern char *CPLRecodeStub(const char *, const char *,
47 : const char *) CPL_RETURNS_NONNULL;
48 : extern char *CPLRecodeFromWCharStub(const wchar_t *, const char *,
49 : const char *);
50 : extern wchar_t *CPLRecodeToWCharStub(const char *, const char *, const char *);
51 : extern int CPLIsUTF8Stub(const char *, int);
52 :
53 : /************************************************************************/
54 : /* CPLRecode() */
55 : /************************************************************************/
56 :
57 : /**
58 : * Convert a string from a source encoding to a destination encoding.
59 : *
60 : * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
61 : * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
62 : * <ul>
63 : * <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in
64 : * fact)</li>
65 : * <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
66 : * <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
67 : * </ul>
68 : *
69 : * If an error occurs an error may, or may not be posted with CPLError().
70 : *
71 : * @param pszSource a NULL terminated string.
72 : * @param pszSrcEncoding the source encoding.
73 : * @param pszDstEncoding the destination encoding.
74 : *
75 : * @return a NULL terminated string which should be freed with CPLFree().
76 : *
77 : * @since GDAL 1.6.0
78 : */
79 :
80 783245 : char CPL_DLL *CPLRecode(const char *pszSource, const char *pszSrcEncoding,
81 : const char *pszDstEncoding)
82 :
83 : {
84 : /* -------------------------------------------------------------------- */
85 : /* Handle a few common short cuts. */
86 : /* -------------------------------------------------------------------- */
87 783245 : if (EQUAL(pszSrcEncoding, pszDstEncoding))
88 96 : return CPLStrdup(pszSource);
89 :
90 783149 : if (EQUAL(pszSrcEncoding, CPL_ENC_ASCII) &&
91 0 : (EQUAL(pszDstEncoding, CPL_ENC_UTF8) ||
92 0 : EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1)))
93 0 : return CPLStrdup(pszSource);
94 :
95 : // A few hard coded CPxxx/ISO-8859-x to UTF-8 tables
96 1516800 : if (EQUAL(pszDstEncoding, CPL_ENC_UTF8) &&
97 733647 : CPLGetConversionTableToUTF8(pszSrcEncoding))
98 : {
99 21020 : return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
100 : }
101 :
102 : #ifdef CPL_RECODE_ICONV
103 : /* -------------------------------------------------------------------- */
104 : /* CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8 */
105 : /* and CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1 conversions are handled */
106 : /* very well by the stub implementation which is faster than the */
107 : /* iconv() route. Use a stub for these two ones and iconv() */
108 : /* everything else. */
109 : /* -------------------------------------------------------------------- */
110 762129 : if ((EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1) &&
111 712490 : EQUAL(pszDstEncoding, CPL_ENC_UTF8)) ||
112 49639 : (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) &&
113 49502 : EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1)))
114 : {
115 761846 : return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
116 : }
117 : #ifdef _WIN32
118 : else if (((EQUAL(pszSrcEncoding, "CP_ACP") ||
119 : EQUAL(pszSrcEncoding, "CP_OEMCP")) &&
120 : EQUAL(pszDstEncoding, CPL_ENC_UTF8)) ||
121 : (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) &&
122 : (EQUAL(pszDstEncoding, "CP_ACP") ||
123 : EQUAL(pszDstEncoding, "CP_OEMCP"))))
124 : {
125 : return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
126 : }
127 : #endif
128 : else
129 : {
130 283 : return CPLRecodeIconv(pszSource, pszSrcEncoding, pszDstEncoding);
131 : }
132 : #else // CPL_RECODE_STUB
133 : return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
134 : #endif // CPL_RECODE_ICONV
135 : }
136 :
137 : /************************************************************************/
138 : /* CPLRecodeFromWChar() */
139 : /************************************************************************/
140 :
141 : /**
142 : * Convert wchar_t string to UTF-8.
143 : *
144 : * Convert a wchar_t string into a multibyte utf-8 string. The only
145 : * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
146 : * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
147 : * and CPL_ENC_ISO8859_1. In some cases (i.e. using iconv()) other encodings
148 : * may also be supported.
149 : *
150 : * Note that the wchar_t type varies in size on different systems. On
151 : * win32 it is normally 2 bytes, and on UNIX 4 bytes.
152 : *
153 : * If an error occurs an error may, or may not be posted with CPLError().
154 : *
155 : * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
156 : * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
157 : * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
158 : *
159 : * @return a zero terminated multi-byte string which should be freed with
160 : * CPLFree(), or NULL if an error occurs.
161 : *
162 : * @since GDAL 1.6.0
163 : */
164 :
165 111145 : char CPL_DLL *CPLRecodeFromWChar(const wchar_t *pwszSource,
166 : const char *pszSrcEncoding,
167 : const char *pszDstEncoding)
168 :
169 : {
170 : #ifdef CPL_RECODE_ICONV
171 : /* -------------------------------------------------------------------- */
172 : /* Conversions from CPL_ENC_UCS2 */
173 : /* to CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */
174 : /* handled by the stub implementation. */
175 : /* -------------------------------------------------------------------- */
176 111145 : if ((EQUAL(pszSrcEncoding, CPL_ENC_UCS2) ||
177 1360 : EQUAL(pszSrcEncoding, "WCHAR_T")) &&
178 111144 : (EQUAL(pszDstEncoding, CPL_ENC_UTF8) ||
179 0 : EQUAL(pszDstEncoding, CPL_ENC_ASCII) ||
180 0 : EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1)))
181 : {
182 111144 : return CPLRecodeFromWCharStub(pwszSource, pszSrcEncoding,
183 111144 : pszDstEncoding);
184 : }
185 :
186 1 : return CPLRecodeFromWCharIconv(pwszSource, pszSrcEncoding, pszDstEncoding);
187 :
188 : #else // CPL_RECODE_STUB
189 : return CPLRecodeFromWCharStub(pwszSource, pszSrcEncoding, pszDstEncoding);
190 : #endif // CPL_RECODE_ICONV
191 : }
192 :
193 : /************************************************************************/
194 : /* CPLRecodeToWChar() */
195 : /************************************************************************/
196 :
197 : /**
198 : * Convert UTF-8 string to a wchar_t string.
199 : *
200 : * Convert a 8bit, multi-byte per character input string into a wide
201 : * character (wchar_t) string. The only guaranteed supported source encodings
202 : * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8869_1 (LATIN1). The only
203 : * guaranteed supported destination encoding is CPL_ENC_UCS2. Other source
204 : * and destination encodings may be supported depending on the underlying
205 : * implementation.
206 : *
207 : * Note that the wchar_t type varies in size on different systems. On
208 : * win32 it is normally 2 bytes, and on UNIX 4 bytes.
209 : *
210 : * If an error occurs an error may, or may not be posted with CPLError().
211 : *
212 : * @param pszSource input multi-byte character string.
213 : * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
214 : * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
215 : *
216 : * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
217 : * NULL on error.
218 : *
219 : * @since GDAL 1.6.0
220 : */
221 :
222 52345 : wchar_t CPL_DLL *CPLRecodeToWChar(const char *pszSource,
223 : const char *pszSrcEncoding,
224 : const char *pszDstEncoding)
225 :
226 : {
227 : #ifdef CPL_RECODE_ICONV
228 : /* -------------------------------------------------------------------- */
229 : /* Conversions to CPL_ENC_UCS2 */
230 : /* from CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */
231 : /* handled by the stub implementation. */
232 : /* -------------------------------------------------------------------- */
233 52345 : if ((EQUAL(pszDstEncoding, CPL_ENC_UCS2) ||
234 0 : EQUAL(pszDstEncoding, "WCHAR_T")) &&
235 52345 : (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) ||
236 0 : EQUAL(pszSrcEncoding, CPL_ENC_ASCII) ||
237 0 : EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1)))
238 : {
239 52345 : return CPLRecodeToWCharStub(pszSource, pszSrcEncoding, pszDstEncoding);
240 : }
241 :
242 0 : return CPLRecodeToWCharIconv(pszSource, pszSrcEncoding, pszDstEncoding);
243 :
244 : #else // CPL_RECODE_STUB
245 : return CPLRecodeToWCharStub(pszSource, pszSrcEncoding, pszDstEncoding);
246 : #endif // CPL_RECODE_ICONV
247 : }
248 :
249 : /************************************************************************/
250 : /* CPLIsASCII() */
251 : /************************************************************************/
252 :
/**
 * Test if a string is encoded as ASCII.
 *
 * Every byte of the input is inspected; the string is ASCII when none of
 * them has a value above 127.
 *
 * @param pabyData input string to test
 * @param nLen length of the input string, or -1 if the function must compute
 *             the string length. In which case it must be null terminated.
 * @return true if the string is encoded as ASCII. false otherwise
 *
 * @since GDAL 3.6.0
 */
bool CPLIsASCII(const char *pabyData, size_t nLen)
{
    // static_cast<size_t>(-1) is the "measure it yourself" sentinel.
    const size_t nBytes =
        (nLen == static_cast<size_t>(-1)) ? strlen(pabyData) : nLen;

    const char *pszCur = pabyData;
    const char *const pszStop = pabyData + nBytes;
    while (pszCur != pszStop)
    {
        // A set high bit means the byte value is above 127.
        if ((static_cast<unsigned char>(*pszCur) & 0x80) != 0)
            return false;
        ++pszCur;
    }
    return true;
}
274 :
275 : /************************************************************************/
276 : /* CPLForceToASCII() */
277 : /************************************************************************/
278 :
279 : /**
280 : * Return a new string that is made only of ASCII characters. If non-ASCII
281 : * characters are found in the input string, they will be replaced by the
282 : * provided replacement character.
283 : *
284 : * This function does not make any assumption on the encoding of the input
285 : * string (except it must be nul-terminated if nLen equals -1, or have at
286 : * least nLen bytes otherwise). CPLUTF8ForceToASCII() can be used instead when
287 : * the input string is known to be UTF-8 encoded.
288 : *
289 : * @param pabyData input string to test
290 : * @param nLen length of the input string, or -1 if the function must compute
291 : * the string length. In which case it must be null terminated.
292 :
293 : * @param chReplacementChar character which will be used when the input stream
294 : * contains a non ASCII character. Must be valid ASCII!
295 : *
296 : * @return a new string that must be freed with CPLFree().
297 : *
298 : * @since GDAL 1.7.0
299 : */
300 5 : char *CPLForceToASCII(const char *pabyData, int nLen, char chReplacementChar)
301 : {
302 5 : const size_t nRealLen =
303 5 : (nLen >= 0) ? static_cast<size_t>(nLen) : strlen(pabyData);
304 5 : char *pszOutputString = static_cast<char *>(CPLMalloc(nRealLen + 1));
305 5 : const char *pszPtr = pabyData;
306 5 : const char *pszEnd = pabyData + nRealLen;
307 5 : size_t i = 0;
308 19 : while (pszPtr != pszEnd)
309 : {
310 14 : if (*reinterpret_cast<const unsigned char *>(pszPtr) > 127)
311 : {
312 3 : pszOutputString[i] = chReplacementChar;
313 3 : ++pszPtr;
314 3 : ++i;
315 : }
316 : else
317 : {
318 11 : pszOutputString[i] = *pszPtr;
319 11 : ++pszPtr;
320 11 : ++i;
321 : }
322 : }
323 5 : pszOutputString[i] = '\0';
324 5 : return pszOutputString;
325 : }
326 :
327 : /************************************************************************/
328 : /* CPLUTF8ForceToASCII() */
329 : /************************************************************************/
330 :
331 : /**
332 : * Return a new string that is made only of ASCII characters. If non-ASCII
333 : * characters are found in the input string, for which an "equivalent" ASCII
334 : * character is not found, they will be replaced by the provided replacement
335 : * character.
336 : *
337 : * This function is aware of https://en.wikipedia.org/wiki/Latin-1_Supplement
338 : * and https://en.wikipedia.org/wiki/Latin_Extended-A to provide sensible
339 : * replacements for accented characters.
340 :
341 : * @param pszStr NUL-terminated UTF-8 string.
342 : * @param chReplacementChar character which will be used when the input stream
343 : * contains a non ASCII character that cannot be
344 : * substituted with an equivalent ASCII character.
345 : * Must be valid ASCII!
346 : *
347 : * @return a new string that must be freed with CPLFree().
348 : *
349 : * @since GDAL 3.9
350 : */
351 16 : char *CPLUTF8ForceToASCII(const char *pszStr, char chReplacementChar)
352 : {
353 : static const struct
354 : {
355 : short nCodePoint;
356 : char chFirst;
357 : char chSecond;
358 : } aLatinCharacters[] = {
359 : // https://en.wikipedia.org/wiki/Latin-1_Supplement
360 : {0xC0, 'A', 0}, // Latin Capital Letter A with grave
361 : {0xC1, 'A', 0}, // Latin Capital letter A with acute
362 : {0xC2, 'A', 0}, // Latin Capital letter A with circumflex
363 : {0xC3, 'A', 0}, // Latin Capital letter A with tilde
364 : {0xC4, 'A', 0}, // Latin Capital letter A with diaeresis
365 : {0xC5, 'A', 0}, // Latin Capital letter A with ring above
366 : {0xC6, 'A', 'E'}, // Latin Capital letter AE
367 : {0xC7, 'C', 0}, // Latin Capital letter C with cedilla
368 : {0xC8, 'E', 0}, // Latin Capital letter E with grave
369 : {0xC9, 'E', 0}, // Latin Capital letter E with acute
370 : {0xCA, 'E', 0}, // Latin Capital letter E with circumflex
371 : {0xCB, 'E', 0}, // Latin Capital letter E with diaeresis
372 : {0xCC, 'I', 0}, // Latin Capital letter I with grave
373 : {0xCD, 'I', 0}, // Latin Capital letter I with acute
374 : {0xCE, 'I', 0}, // Latin Capital letter I with circumflex
375 : {0xCF, 'I', 0}, // Latin Capital letter I with diaeresis
376 : // { 0xD0, '?', 0 }, // Latin Capital letter Eth
377 : {0xD1, 'N', 0}, // Latin Capital letter N with tilde
378 : {0xD2, 'O', 0}, // Latin Capital letter O with grave
379 : {0xD3, 'O', 0}, // Latin Capital letter O with acute
380 : {0xD4, 'O', 0}, // Latin Capital letter O with circumflex
381 : {0xD5, 'O', 0}, // Latin Capital letter O with tilde
382 : {0xD6, 'O', 0}, // Latin Capital letter O with diaeresis
383 : {0xD8, 'O', 0}, // Latin Capital letter O with stroke
384 : {0xD9, 'U', 0}, // Latin Capital letter U with grave
385 : {0xDA, 'U', 0}, // Latin Capital letter U with acute
386 : {0xDB, 'U', 0}, // Latin Capital Letter U with circumflex
387 : {0xDC, 'U', 0}, // Latin Capital Letter U with diaeresis
388 : {0xDD, 'Y', 0}, // Latin Capital Letter Y with acute
389 : // { 0xDE, '?', 0 }, // Latin Capital Letter Thorn
390 : {0xDF, 'S', 'S'}, // Latin Small Letter sharp S
391 : {0xE0, 'a', 0}, // Latin Small Letter A with grave
392 : {0xE1, 'a', 0}, // Latin Small Letter A with acute
393 : {0xE2, 'a', 0}, // Latin Small Letter A with circumflex
394 : {0xE3, 'a', 0}, // Latin Small Letter A with tilde
395 : {0xE4, 'a', 0}, // Latin Small Letter A with diaeresis
396 : {0xE5, 'a', 0}, // Latin Small Letter A with ring above
397 : {0xE6, 'a', 'e'}, // Latin Small Letter AE
398 : {0xE7, 'c', 0}, // Latin Small Letter C with cedilla
399 : {0xE8, 'e', 0}, // Latin Small Letter E with grave
400 : {0xE9, 'e', 0}, // Latin Small Letter E with acute
401 : {0xEA, 'e', 0}, // Latin Small Letter E with circumflex
402 : {0xEB, 'e', 0}, // Latin Small Letter E with diaeresis
403 : {0xEC, 'i', 0}, // Latin Small Letter I with grave
404 : {0xED, 'i', 0}, // Latin Small Letter I with acute
405 : {0xEE, 'i', 0}, // Latin Small Letter I with circumflex
406 : {0xEF, 'i', 0}, // Latin Small Letter I with diaeresis
407 : // { 0xF0, '?', 0 }, // Latin Small Letter Eth
408 : {0xF1, 'n', 0}, // Latin Small Letter N with tilde
409 : {0xF2, 'o', 0}, // Latin Small Letter O with grave
410 : {0xF3, 'o', 0}, // Latin Small Letter O with acute
411 : {0xF4, 'o', 0}, // Latin Small Letter O with circumflex
412 : {0xF5, 'o', 0}, // Latin Small Letter O with tilde
413 : {0xF6, 'o', 0}, // Latin Small Letter O with diaeresis
414 : {0xF8, 'o', 0}, // Latin Small Letter O with stroke
415 : {0xF9, 'u', 0}, // Latin Small Letter U with grave
416 : {0xFA, 'u', 0}, // Latin Small Letter U with acute
417 : {0xFB, 'u', 0}, // Latin Small Letter U with circumflex
418 : {0xFC, 'u', 0}, // Latin Small Letter U with diaeresis
419 : {0xFD, 'y', 0}, // Latin Small Letter Y with acute
420 : // { 0xFE, '?', 0 }, // Latin Small Letter Thorn
421 : {0xFF, 'u', 0}, // Latin Small Letter Y with diaeresis
422 :
423 : // https://en.wikipedia.org/wiki/Latin_Extended-A
424 : {
425 : 0x0100,
426 : 'A',
427 : 0,
428 : }, // Latin Capital letter A with macron
429 : {
430 : 0x0101,
431 : 'a',
432 : 0,
433 : }, // Latin Small letter A with macron
434 : {
435 : 0x0102,
436 : 'A',
437 : 0,
438 : }, // Latin Capital letter A with breve
439 : {
440 : 0x0103,
441 : 'a',
442 : 0,
443 : }, // Latin Small letter A with breve
444 : {
445 : 0x0104,
446 : 'A',
447 : 0,
448 : }, // Latin Capital letter A with ogonek
449 : {
450 : 0x0105,
451 : 'a',
452 : 0,
453 : }, // Latin Small letter A with ogonek
454 : {
455 : 0x0106,
456 : 'C',
457 : 0,
458 : }, // Latin Capital letter C with acute
459 : {
460 : 0x0107,
461 : 'c',
462 : 0,
463 : }, // Latin Small letter C with acute
464 : {
465 : 0x0108,
466 : 'C',
467 : 0,
468 : }, // Latin Capital letter C with circumflex
469 : {
470 : 0x0109,
471 : 'c',
472 : 0,
473 : }, // Latin Small letter C with circumflex
474 : {
475 : 0x010A,
476 : 'C',
477 : 0,
478 : }, // Latin Capital letter C with dot above
479 : {
480 : 0x010B,
481 : 'c',
482 : 0,
483 : }, // Latin Small letter C with dot above
484 : {
485 : 0x010C,
486 : 'C',
487 : 0,
488 : }, // Latin Capital letter C with caron
489 : {
490 : 0x010D,
491 : 'c',
492 : 0,
493 : }, // Latin Small letter C with caron
494 : {
495 : 0x010E,
496 : 'D',
497 : 0,
498 : }, // Latin Capital letter D with caron
499 : {
500 : 0x010F,
501 : 'd',
502 : 0,
503 : }, // Latin Small letter D with caron
504 : {
505 : 0x0110,
506 : 'D',
507 : 0,
508 : }, // Latin Capital letter D with stroke
509 : {
510 : 0x0111,
511 : 'd',
512 : 0,
513 : }, // Latin Small letter D with stroke
514 : {
515 : 0x0112,
516 : 'E',
517 : 0,
518 : }, // Latin Capital letter E with macron
519 : {
520 : 0x0113,
521 : 'e',
522 : 0,
523 : }, // Latin Small letter E with macron
524 : {
525 : 0x0114,
526 : 'E',
527 : 0,
528 : }, // Latin Capital letter E with breve
529 : {
530 : 0x0115,
531 : 'e',
532 : 0,
533 : }, // Latin Small letter E with breve
534 : {
535 : 0x0116,
536 : 'E',
537 : 0,
538 : }, // Latin Capital letter E with dot above
539 : {
540 : 0x0117,
541 : 'e',
542 : 0,
543 : }, // Latin Small letter E with dot above
544 : {
545 : 0x0118,
546 : 'E',
547 : 0,
548 : }, // Latin Capital letter E with ogonek
549 : {
550 : 0x0119,
551 : 'e',
552 : 0,
553 : }, // Latin Small letter E with ogonek
554 : {
555 : 0x011A,
556 : 'E',
557 : 0,
558 : }, // Latin Capital letter E with caron
559 : {
560 : 0x011B,
561 : 'e',
562 : 0,
563 : }, // Latin Small letter E with caron
564 : {
565 : 0x011C,
566 : 'G',
567 : 0,
568 : }, // Latin Capital letter G with circumflex
569 : {
570 : 0x011D,
571 : 'g',
572 : 0,
573 : }, // Latin Small letter G with circumflex
574 : {
575 : 0x011E,
576 : 'G',
577 : 0,
578 : }, // Latin Capital letter G with breve
579 : {
580 : 0x011F,
581 : 'g',
582 : 0,
583 : }, // Latin Small letter G with breve
584 : {
585 : 0x0120,
586 : 'G',
587 : 0,
588 : }, // Latin Capital letter G with dot above
589 : {
590 : 0x0121,
591 : 'g',
592 : 0,
593 : }, // Latin Small letter G with dot above
594 : {
595 : 0x0122,
596 : 'G',
597 : 0,
598 : }, // Latin Capital letter G with cedilla
599 : {
600 : 0x0123,
601 : 'g',
602 : 0,
603 : }, // Latin Small letter G with cedilla
604 : {
605 : 0x0124,
606 : 'H',
607 : 0,
608 : }, // Latin Capital letter H with circumflex
609 : {
610 : 0x0125,
611 : 'h',
612 : 0,
613 : }, // Latin Small letter H with circumflex
614 : {
615 : 0x0126,
616 : 'H',
617 : 0,
618 : }, // Latin Capital letter H with stroke
619 : {
620 : 0x0127,
621 : 'h',
622 : 0,
623 : }, // Latin Small letter H with stroke
624 : {
625 : 0x0128,
626 : 'I',
627 : 0,
628 : }, // Latin Capital letter I with tilde
629 : {
630 : 0x0129,
631 : 'i',
632 : 0,
633 : }, // Latin Small letter I with tilde
634 : {
635 : 0x012A,
636 : 'I',
637 : 0,
638 : }, // Latin Capital letter I with macron
639 : {
640 : 0x012B,
641 : 'i',
642 : 0,
643 : }, // Latin Small letter I with macron
644 : {
645 : 0x012C,
646 : 'I',
647 : 0,
648 : }, // Latin Capital letter I with breve
649 : {
650 : 0x012D,
651 : 'i',
652 : 0,
653 : }, // Latin Small letter I with breve
654 : {
655 : 0x012E,
656 : 'I',
657 : 0,
658 : }, // Latin Capital letter I with ogonek
659 : {
660 : 0x012F,
661 : 'i',
662 : 0,
663 : }, // Latin Small letter I with ogonek
664 : {
665 : 0x0130,
666 : 'I',
667 : 0,
668 : }, // Latin Capital letter I with dot above
669 : {
670 : 0x0131,
671 : 'i',
672 : 0,
673 : }, // Latin Small letter dotless I
674 : {
675 : 0x0132,
676 : 'I',
677 : 'J',
678 : }, // Latin Capital Ligature IJ
679 : {
680 : 0x0133,
681 : 'i',
682 : 'j',
683 : }, // Latin Small Ligature IJ
684 : {
685 : 0x0134,
686 : 'J',
687 : 0,
688 : }, // Latin Capital letter J with circumflex
689 : {
690 : 0x0135,
691 : 'j',
692 : 0,
693 : }, // Latin Small letter J with circumflex
694 : {
695 : 0x0136,
696 : 'K',
697 : 0,
698 : }, // Latin Capital letter K with cedilla
699 : {
700 : 0x0137,
701 : 'k',
702 : 0,
703 : }, // Latin Small letter K with cedilla
704 : {
705 : 0x0138,
706 : 'k',
707 : 0,
708 : }, // Latin Small letter Kra
709 : {
710 : 0x0139,
711 : 'L',
712 : 0,
713 : }, // Latin Capital letter L with acute
714 : {
715 : 0x013A,
716 : 'l',
717 : 0,
718 : }, // Latin Small letter L with acute
719 : {
720 : 0x013B,
721 : 'L',
722 : 0,
723 : }, // Latin Capital letter L with cedilla
724 : {
725 : 0x013C,
726 : 'l',
727 : 0,
728 : }, // Latin Small letter L with cedilla
729 : {
730 : 0x013D,
731 : 'L',
732 : 0,
733 : }, // Latin Capital letter L with caron
734 : {
735 : 0x013E,
736 : 'l',
737 : 0,
738 : }, // Latin Small letter L with caron
739 : {
740 : 0x013F,
741 : 'L',
742 : 0,
743 : }, // Latin Capital letter L with middle dot
744 : {
745 : 0x0140,
746 : 'l',
747 : 0,
748 : }, // Latin Small letter L with middle dot
749 : {
750 : 0x0141,
751 : 'L',
752 : 0,
753 : }, // Latin Capital letter L with stroke
754 : {
755 : 0x0142,
756 : 'l',
757 : 0,
758 : }, // Latin Small letter L with stroke
759 : {
760 : 0x0143,
761 : 'N',
762 : 0,
763 : }, // Latin Capital letter N with acute
764 : {
765 : 0x0144,
766 : 'n',
767 : 0,
768 : }, // Latin Small letter N with acute
769 : {
770 : 0x0145,
771 : 'N',
772 : 0,
773 : }, // Latin Capital letter N with cedilla
774 : {
775 : 0x0146,
776 : 'n',
777 : 0,
778 : }, // Latin Small letter N with cedilla
779 : {
780 : 0x0147,
781 : 'N',
782 : 0,
783 : }, // Latin Capital letter N with caron
784 : {
785 : 0x0148,
786 : 'n',
787 : 0,
788 : }, // Latin Small letter N with caron
789 : // { 0x014A , '?' , 0, }, // Latin Capital letter Eng
790 : // { 0x014B , '?' , 0, }, // Latin Small letter Eng
791 : {
792 : 0x014C,
793 : 'O',
794 : 0,
795 : }, // Latin Capital letter O with macron
796 : {
797 : 0x014D,
798 : 'o',
799 : 0,
800 : }, // Latin Small letter O with macron
801 : {
802 : 0x014E,
803 : 'O',
804 : 0,
805 : }, // Latin Capital letter O with breve
806 : {
807 : 0x014F,
808 : 'o',
809 : 0,
810 : }, // Latin Small letter O with breve
811 : {
812 : 0x0150,
813 : 'O',
814 : 0,
815 : }, // Latin Capital Letter O with double acute
816 : {
817 : 0x0151,
818 : 'o',
819 : 0,
820 : }, // Latin Small Letter O with double acute
821 : {
822 : 0x0152,
823 : 'O',
824 : 'E',
825 : }, // Latin Capital Ligature OE
826 : {
827 : 0x0153,
828 : 'o',
829 : 'e',
830 : }, // Latin Small Ligature OE
831 : {
832 : 0x0154,
833 : 'R',
834 : 0,
835 : }, // Latin Capital letter R with acute
836 : {
837 : 0x0155,
838 : 'r',
839 : 0,
840 : }, // Latin Small letter R with acute
841 : {
842 : 0x0156,
843 : 'R',
844 : 0,
845 : }, // Latin Capital letter R with cedilla
846 : {
847 : 0x0157,
848 : 'r',
849 : 0,
850 : }, // Latin Small letter R with cedilla
851 : {
852 : 0x0158,
853 : 'R',
854 : 0,
855 : }, // Latin Capital letter R with caron
856 : {
857 : 0x0159,
858 : 'r',
859 : 0,
860 : }, // Latin Small letter R with caron
861 : {
862 : 0x015A,
863 : 'S',
864 : 0,
865 : }, // Latin Capital letter S with acute
866 : {
867 : 0x015B,
868 : 's',
869 : 0,
870 : }, // Latin Small letter S with acute
871 : {
872 : 0x015C,
873 : 'S',
874 : 0,
875 : }, // Latin Capital letter S with circumflex
876 : {
877 : 0x015D,
878 : 's',
879 : 0,
880 : }, // Latin Small letter S with circumflex
881 : {
882 : 0x015E,
883 : 'S',
884 : 0,
885 : }, // Latin Capital letter S with cedilla
886 : {
887 : 0x015F,
888 : 's',
889 : 0,
890 : }, // Latin Small letter S with cedilla
891 : {
892 : 0x0160,
893 : 'S',
894 : 0,
895 : }, // Latin Capital letter S with caron
896 : {
897 : 0x0161,
898 : 's',
899 : 0,
900 : }, // Latin Small letter S with caron
901 : {
902 : 0x0162,
903 : 'T',
904 : 0,
905 : }, // Latin Capital letter T with cedilla
906 : {
907 : 0x0163,
908 : 't',
909 : 0,
910 : }, // Latin Small letter T with cedilla
911 : {
912 : 0x0164,
913 : 'T',
914 : 0,
915 : }, // Latin Capital letter T with caron
916 : {
917 : 0x0165,
918 : 't',
919 : 0,
920 : }, // Latin Small letter T with caron
921 : {
922 : 0x0166,
923 : 'T',
924 : 0,
925 : }, // Latin Capital letter T with stroke
926 : {
927 : 0x0167,
928 : 't',
929 : 0,
930 : }, // Latin Small letter T with stroke
931 : {
932 : 0x0168,
933 : 'U',
934 : 0,
935 : }, // Latin Capital letter U with tilde
936 : {
937 : 0x0169,
938 : 'u',
939 : 0,
940 : }, // Latin Small letter U with tilde
941 : {
942 : 0x016A,
943 : 'U',
944 : 0,
945 : }, // Latin Capital letter U with macron
946 : {
947 : 0x016B,
948 : 'u',
949 : 0,
950 : }, // Latin Small letter U with macron
951 : {
952 : 0x016C,
953 : 'U',
954 : 0,
955 : }, // Latin Capital letter U with breve
956 : {
957 : 0x016D,
958 : 'u',
959 : 0,
960 : }, // Latin Small letter U with breve
961 : {
962 : 0x016E,
963 : 'U',
964 : 0,
965 : }, // Latin Capital letter U with ring above
966 : {
967 : 0x016F,
968 : 'u',
969 : 0,
970 : }, // Latin Small letter U with ring above
971 : {
972 : 0x0170,
973 : 'U',
974 : 0,
975 : }, // Latin Capital Letter U with double acute
976 : {
977 : 0x0171,
978 : 'u',
979 : 0,
980 : }, // Latin Small Letter U with double acute
981 : {
982 : 0x0172,
983 : 'U',
984 : 0,
985 : }, // Latin Capital letter U with ogonek
986 : {
987 : 0x0173,
988 : 'u',
989 : 0,
990 : }, // Latin Small letter U with ogonek
991 : {
992 : 0x0174,
993 : 'W',
994 : 0,
995 : }, // Latin Capital letter W with circumflex
996 : {
997 : 0x0175,
998 : 'w',
999 : 0,
1000 : }, // Latin Small letter W with circumflex
1001 : {
1002 : 0x0176,
1003 : 'Y',
1004 : 0,
1005 : }, // Latin Capital letter Y with circumflex
1006 : {
1007 : 0x0177,
1008 : 'y',
1009 : 0,
1010 : }, // Latin Small letter Y with circumflex
1011 : {
1012 : 0x0178,
1013 : 'Y',
1014 : 0,
1015 : }, // Latin Capital letter Y with diaeresis
1016 : {
1017 : 0x0179,
1018 : 'Z',
1019 : 0,
1020 : }, // Latin Capital letter Z with acute
1021 : {
1022 : 0x017A,
1023 : 'z',
1024 : 0,
1025 : }, // Latin Small letter Z with acute
1026 : {
1027 : 0x017B,
1028 : 'Z',
1029 : 0,
1030 : }, // Latin Capital letter Z with dot above
1031 : {
1032 : 0x017C,
1033 : 'z',
1034 : 0,
1035 : }, // Latin Small letter Z with dot above
1036 : {
1037 : 0x017D,
1038 : 'Z',
1039 : 0,
1040 : }, // Latin Capital letter Z with caron
1041 : {
1042 : 0x017E,
1043 : 'z',
1044 : 0,
1045 : }, // Latin Small letter Z with caron
1046 : };
1047 :
1048 16 : const size_t nLen = strlen(pszStr);
1049 16 : char *pszOutputString = static_cast<char *>(CPLMalloc(nLen + 1));
1050 16 : const char *pszPtr = pszStr;
1051 16 : const char *pszEnd = pszStr + nLen;
1052 16 : size_t i = 0;
1053 248 : while (pszPtr != pszEnd)
1054 : {
1055 233 : if (*reinterpret_cast<const unsigned char *>(pszPtr) > 127)
1056 : {
1057 : utf8_int32_t codepoint;
1058 189 : if (pszPtr + utf8codepointcalcsize(
1059 189 : reinterpret_cast<const utf8_int8_t *>(pszPtr)) >
1060 : pszEnd)
1061 1 : break;
1062 188 : auto pszNext = reinterpret_cast<const char *>(utf8codepoint(
1063 : reinterpret_cast<const utf8_int8_t *>(pszPtr), &codepoint));
1064 188 : char ch = chReplacementChar;
1065 17075 : for (const auto &latin1char : aLatinCharacters)
1066 : {
1067 17073 : if (codepoint == latin1char.nCodePoint)
1068 : {
1069 186 : pszOutputString[i] = latin1char.chFirst;
1070 186 : ++i;
1071 186 : if (latin1char.chSecond)
1072 : {
1073 7 : pszOutputString[i] = latin1char.chSecond;
1074 7 : ++i;
1075 : }
1076 186 : ch = 0;
1077 186 : break;
1078 : }
1079 : }
1080 188 : if (ch)
1081 : {
1082 2 : pszOutputString[i] = ch;
1083 2 : ++i;
1084 : }
1085 188 : pszPtr = pszNext;
1086 : }
1087 : else
1088 : {
1089 44 : pszOutputString[i] = *pszPtr;
1090 44 : ++pszPtr;
1091 44 : ++i;
1092 : }
1093 : }
1094 16 : pszOutputString[i] = '\0';
1095 16 : return pszOutputString;
1096 : }
1097 :
1098 : /************************************************************************/
1099 : /* CPLEncodingCharSize() */
1100 : /************************************************************************/
1101 :
1102 : /**
1103 : * Return bytes per character for encoding.
1104 : *
1105 : * This function returns the size in bytes of the smallest character
1106 : * in this encoding. For fixed width encodings (ASCII, UCS-2, UCS-4) this
1107 : * is straight forward. For encodings like UTF8 and UTF16 which represent
1108 : * some characters as a sequence of atomic character sizes the function
1109 : * still returns the atomic character size (1 for UTF8, 2 for UTF16).
1110 : *
1111 : * This function will return the correct value for well known encodings
1112 : * with corresponding CPL_ENC_ values. It may not return the correct value
1113 : * for other encodings even if they are supported by the underlying iconv
1114 : * or windows transliteration services. Hopefully it will improve over time.
1115 : *
1116 : * @param pszEncoding the name of the encoding.
1117 : *
1118 : * @return the size of a minimal character in bytes or -1 if the size is
1119 : * unknown.
1120 : */
1121 :
1122 1 : int CPLEncodingCharSize(const char *pszEncoding)
1123 :
1124 : {
1125 1 : if (EQUAL(pszEncoding, CPL_ENC_UTF8))
1126 0 : return 1;
1127 1 : else if (EQUAL(pszEncoding, CPL_ENC_UTF16) ||
1128 1 : EQUAL(pszEncoding, "UTF-16LE"))
1129 1 : return 2;
1130 0 : else if (EQUAL(pszEncoding, CPL_ENC_UCS2) || EQUAL(pszEncoding, "UCS-2LE"))
1131 0 : return 2;
1132 0 : else if (EQUAL(pszEncoding, CPL_ENC_UCS4))
1133 0 : return 4;
1134 0 : else if (EQUAL(pszEncoding, CPL_ENC_ASCII))
1135 0 : return 1;
1136 0 : else if (STARTS_WITH_CI(pszEncoding, "ISO-8859-"))
1137 0 : return 1;
1138 :
1139 0 : return -1;
1140 : }
1141 :
1142 : /************************************************************************/
1143 : /* CPLClearRecodeWarningFlags() */
1144 : /************************************************************************/
1145 :
1146 10404 : void CPLClearRecodeWarningFlags()
1147 : {
1148 : #ifdef CPL_RECODE_ICONV
1149 10404 : CPLClearRecodeIconvWarningFlags();
1150 : #endif
1151 10404 : CPLClearRecodeStubWarningFlags();
1152 10404 : }
1153 :
1154 : /************************************************************************/
1155 : /* CPLStrlenUTF8() */
1156 : /************************************************************************/
1157 :
/**
 * Return the number of UTF-8 characters of a nul-terminated string.
 *
 * This is different from strlen() which returns the number of bytes.
 * The count is obtained by counting every byte that is not a UTF-8
 * continuation byte (i.e. whose two top bits are not 10).
 *
 * @param pszUTF8Str a nul-terminated UTF-8 string
 *
 * @return the number of UTF-8 characters.
 */

int CPLStrlenUTF8(const char *pszUTF8Str)
{
    int nChars = 0;
    const char *pszIter = pszUTF8Str;
    while (*pszIter != '\0')
    {
        // Continuation bytes match the bit pattern 10xxxxxx and do not
        // start a new character.
        nChars += ((*pszIter & 0xc0) != 0x80) ? 1 : 0;
        ++pszIter;
    }
    return nChars;
}
1178 :
1179 : /************************************************************************/
1180 : /* CPLCanRecode() */
1181 : /************************************************************************/
1182 :
1183 : /**
1184 : * Checks if it is possible to recode a string from one encoding to another.
1185 : *
1186 : * @param pszTestStr a NULL terminated string.
1187 : * @param pszSrcEncoding the source encoding.
1188 : * @param pszDstEncoding the destination encoding.
1189 : *
1190 : * @return a TRUE if recode is possible.
1191 : *
1192 : * @since GDAL 3.1.0
1193 : */
1194 5586 : int CPLCanRecode(const char *pszTestStr, const char *pszSrcEncoding,
1195 : const char *pszDstEncoding)
1196 : {
1197 5586 : CPLClearRecodeWarningFlags();
1198 5586 : CPLErrorReset();
1199 :
1200 5586 : CPLPushErrorHandler(CPLQuietErrorHandler);
1201 5586 : char *pszRec(CPLRecode(pszTestStr, pszSrcEncoding, pszDstEncoding));
1202 5586 : CPLPopErrorHandler();
1203 :
1204 5586 : if (pszRec == nullptr)
1205 : {
1206 0 : return FALSE;
1207 : }
1208 :
1209 5586 : CPLFree(pszRec);
1210 :
1211 5586 : if (CPLGetLastErrorType() != 0)
1212 : {
1213 1 : return FALSE;
1214 : }
1215 :
1216 5585 : return TRUE;
1217 : }
|