Line data Source code
1 : /**********************************************************************
2 : *
3 : * Name: cpl_recode.cpp
4 : * Project: CPL - Common Portability Library
5 : * Purpose: Character set recoding and char/wchar_t conversions.
6 : * Author: Andrey Kiselev, dron@ak4719.spb.edu
7 : *
8 : **********************************************************************
9 : * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
10 : * Copyright (c) 2008, Frank Warmerdam
11 : * Copyright (c) 2011-2014, Even Rouault <even dot rouault at spatialys.com>
12 : *
13 : * Permission to use, copy, modify, and distribute this software for any
14 : * purpose with or without fee is hereby granted, provided that the above
15 : * copyright notice and this permission notice appear in all copies.
16 : *
17 : * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
18 : * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
19 : * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
20 : * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
21 : * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
22 : * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
23 : * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24 : **********************************************************************/
25 :
#include "cpl_port.h"
#include "cpl_string.h"

#include <climits>
#include <cstring>

#include "cpl_conv.h"

#include "utf8.h"
34 :
35 : #ifdef CPL_RECODE_ICONV
36 : extern void CPLClearRecodeIconvWarningFlags();
37 : extern char *CPLRecodeIconv(const char *, const char *,
38 : const char *) CPL_RETURNS_NONNULL;
39 : extern char *CPLRecodeFromWCharIconv(const wchar_t *, const char *,
40 : const char *);
41 : extern wchar_t *CPLRecodeToWCharIconv(const char *, const char *, const char *);
42 : #endif // CPL_RECODE_ICONV
43 :
44 : extern void CPLClearRecodeStubWarningFlags();
45 : extern char *CPLRecodeStub(const char *, const char *,
46 : const char *) CPL_RETURNS_NONNULL;
47 : extern char *CPLRecodeFromWCharStub(const wchar_t *, const char *,
48 : const char *);
49 : extern wchar_t *CPLRecodeToWCharStub(const char *, const char *, const char *);
50 : extern int CPLIsUTF8Stub(const char *, int);
51 :
52 : /************************************************************************/
53 : /* CPLRecode() */
54 : /************************************************************************/
55 :
56 : /**
57 : * Convert a string from a source encoding to a destination encoding.
58 : *
59 : * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
60 : * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
61 : * <ul>
62 : * <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in
63 : * fact)</li>
64 : * <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
65 : * <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
66 : * </ul>
67 : *
68 : * If an error occurs an error may, or may not be posted with CPLError().
69 : *
70 : * @param pszSource a NULL terminated string.
71 : * @param pszSrcEncoding the source encoding.
72 : * @param pszDstEncoding the destination encoding.
73 : *
74 : * @return a NULL terminated string which should be freed with CPLFree().
75 : *
76 : * @since GDAL 1.6.0
77 : */
78 :
79 734058 : char CPL_DLL *CPLRecode(const char *pszSource, const char *pszSrcEncoding,
80 : const char *pszDstEncoding)
81 :
82 : {
83 : /* -------------------------------------------------------------------- */
84 : /* Handle a few common short cuts. */
85 : /* -------------------------------------------------------------------- */
86 734058 : if (EQUAL(pszSrcEncoding, pszDstEncoding))
87 95 : return CPLStrdup(pszSource);
88 :
89 733963 : if (EQUAL(pszSrcEncoding, CPL_ENC_ASCII) &&
90 0 : (EQUAL(pszDstEncoding, CPL_ENC_UTF8) ||
91 0 : EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1)))
92 0 : return CPLStrdup(pszSource);
93 :
94 : /* -------------------------------------------------------------------- */
95 : /* For ZIP file handling */
96 : /* (CP437 might be missing even on some iconv, like on Mac) */
97 : /* -------------------------------------------------------------------- */
98 733963 : if (EQUAL(pszSrcEncoding, "CP437") &&
99 13120 : EQUAL(pszDstEncoding, CPL_ENC_UTF8)) //
100 : {
101 13120 : bool bIsAllPrintableASCII = true;
102 13120 : const size_t nCharCount = strlen(pszSource);
103 392941 : for (size_t i = 0; i < nCharCount; i++)
104 : {
105 379821 : if (pszSource[i] < 32 || pszSource[i] > 126)
106 : {
107 0 : bIsAllPrintableASCII = false;
108 0 : break;
109 : }
110 : }
111 13120 : if (bIsAllPrintableASCII)
112 : {
113 13120 : return CPLStrdup(pszSource);
114 : }
115 : }
116 :
117 : #ifdef CPL_RECODE_ICONV
118 : /* -------------------------------------------------------------------- */
119 : /* CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8 */
120 : /* and CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1 conversions are handled */
121 : /* very well by the stub implementation which is faster than the */
122 : /* iconv() route. Use a stub for these two ones and iconv() */
123 : /* everything else. */
124 : /* -------------------------------------------------------------------- */
125 720843 : if ((EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1) &&
126 664723 : EQUAL(pszDstEncoding, CPL_ENC_UTF8)) ||
127 56120 : (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) &&
128 49905 : EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1)))
129 : {
130 714482 : return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
131 : }
132 : #ifdef _WIN32
133 : else if (((EQUAL(pszSrcEncoding, "CP_ACP") ||
134 : EQUAL(pszSrcEncoding, "CP_OEMCP")) &&
135 : EQUAL(pszDstEncoding, CPL_ENC_UTF8)) ||
136 : (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) &&
137 : (EQUAL(pszDstEncoding, "CP_ACP") ||
138 : EQUAL(pszDstEncoding, "CP_OEMCP"))))
139 : {
140 : return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
141 : }
142 : #endif
143 : else
144 : {
145 6361 : return CPLRecodeIconv(pszSource, pszSrcEncoding, pszDstEncoding);
146 : }
147 : #else // CPL_RECODE_STUB
148 : return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
149 : #endif // CPL_RECODE_ICONV
150 : }
151 :
152 : /************************************************************************/
153 : /* CPLRecodeFromWChar() */
154 : /************************************************************************/
155 :
156 : /**
157 : * Convert wchar_t string to UTF-8.
158 : *
159 : * Convert a wchar_t string into a multibyte utf-8 string. The only
160 : * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
161 : * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
162 : * and CPL_ENC_ISO8859_1. In some cases (i.e. using iconv()) other encodings
163 : * may also be supported.
164 : *
165 : * Note that the wchar_t type varies in size on different systems. On
166 : * win32 it is normally 2 bytes, and on UNIX 4 bytes.
167 : *
168 : * If an error occurs an error may, or may not be posted with CPLError().
169 : *
170 : * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
171 : * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
172 : * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
173 : *
174 : * @return a zero terminated multi-byte string which should be freed with
175 : * CPLFree(), or NULL if an error occurs.
176 : *
177 : * @since GDAL 1.6.0
178 : */
179 :
180 107676 : char CPL_DLL *CPLRecodeFromWChar(const wchar_t *pwszSource,
181 : const char *pszSrcEncoding,
182 : const char *pszDstEncoding)
183 :
184 : {
185 : #ifdef CPL_RECODE_ICONV
186 : /* -------------------------------------------------------------------- */
187 : /* Conversions from CPL_ENC_UCS2 */
188 : /* to CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */
189 : /* handled by the stub implementation. */
190 : /* -------------------------------------------------------------------- */
191 107676 : if ((EQUAL(pszSrcEncoding, CPL_ENC_UCS2) ||
192 1360 : EQUAL(pszSrcEncoding, "WCHAR_T")) &&
193 107675 : (EQUAL(pszDstEncoding, CPL_ENC_UTF8) ||
194 0 : EQUAL(pszDstEncoding, CPL_ENC_ASCII) ||
195 0 : EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1)))
196 : {
197 107675 : return CPLRecodeFromWCharStub(pwszSource, pszSrcEncoding,
198 107675 : pszDstEncoding);
199 : }
200 :
201 1 : return CPLRecodeFromWCharIconv(pwszSource, pszSrcEncoding, pszDstEncoding);
202 :
203 : #else // CPL_RECODE_STUB
204 : return CPLRecodeFromWCharStub(pwszSource, pszSrcEncoding, pszDstEncoding);
205 : #endif // CPL_RECODE_ICONV
206 : }
207 :
208 : /************************************************************************/
209 : /* CPLRecodeToWChar() */
210 : /************************************************************************/
211 :
212 : /**
213 : * Convert UTF-8 string to a wchar_t string.
214 : *
215 : * Convert a 8bit, multi-byte per character input string into a wide
216 : * character (wchar_t) string. The only guaranteed supported source encodings
217 : * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8869_1 (LATIN1). The only
218 : * guaranteed supported destination encoding is CPL_ENC_UCS2. Other source
219 : * and destination encodings may be supported depending on the underlying
220 : * implementation.
221 : *
222 : * Note that the wchar_t type varies in size on different systems. On
223 : * win32 it is normally 2 bytes, and on UNIX 4 bytes.
224 : *
225 : * If an error occurs an error may, or may not be posted with CPLError().
226 : *
227 : * @param pszSource input multi-byte character string.
228 : * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
229 : * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
230 : *
231 : * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
232 : * NULL on error.
233 : *
234 : * @since GDAL 1.6.0
235 : */
236 :
237 51932 : wchar_t CPL_DLL *CPLRecodeToWChar(const char *pszSource,
238 : const char *pszSrcEncoding,
239 : const char *pszDstEncoding)
240 :
241 : {
242 : #ifdef CPL_RECODE_ICONV
243 : /* -------------------------------------------------------------------- */
244 : /* Conversions to CPL_ENC_UCS2 */
245 : /* from CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */
246 : /* handled by the stub implementation. */
247 : /* -------------------------------------------------------------------- */
248 51932 : if ((EQUAL(pszDstEncoding, CPL_ENC_UCS2) ||
249 0 : EQUAL(pszDstEncoding, "WCHAR_T")) &&
250 51932 : (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) ||
251 0 : EQUAL(pszSrcEncoding, CPL_ENC_ASCII) ||
252 0 : EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1)))
253 : {
254 51932 : return CPLRecodeToWCharStub(pszSource, pszSrcEncoding, pszDstEncoding);
255 : }
256 :
257 0 : return CPLRecodeToWCharIconv(pszSource, pszSrcEncoding, pszDstEncoding);
258 :
259 : #else // CPL_RECODE_STUB
260 : return CPLRecodeToWCharStub(pszSource, pszSrcEncoding, pszDstEncoding);
261 : #endif // CPL_RECODE_ICONV
262 : }
263 :
264 : /************************************************************************/
265 : /* CPLIsASCII() */
266 : /************************************************************************/
267 :
/**
 * Check whether a buffer holds only ASCII bytes (values 0 to 127).
 *
 * @param pabyData buffer to inspect.
 * @param nLen number of bytes to inspect, or static_cast<size_t>(-1) to have
 *             the length computed with strlen(), in which case pabyData
 *             must be NUL terminated.
 * @return true if no inspected byte is greater than 127, false otherwise.
 *
 * @since GDAL 3.6.0
 */
bool CPLIsASCII(const char *pabyData, size_t nLen)
{
    size_t nRemaining =
        (nLen == static_cast<size_t>(-1)) ? strlen(pabyData) : nLen;

    // Inspect the bytes as unsigned so the test does not depend on whether
    // plain char is signed on the target platform.
    const unsigned char *pabyIter =
        reinterpret_cast<const unsigned char *>(pabyData);
    for (; nRemaining > 0; --nRemaining, ++pabyIter)
    {
        if (*pabyIter > 127)
            return false;
    }
    return true;
}
289 :
290 : /************************************************************************/
291 : /* CPLForceToASCII() */
292 : /************************************************************************/
293 :
294 : /**
295 : * Return a new string that is made only of ASCII characters. If non-ASCII
296 : * characters are found in the input string, they will be replaced by the
297 : * provided replacement character.
298 : *
299 : * This function does not make any assumption on the encoding of the input
300 : * string (except it must be nul-terminated if nLen equals -1, or have at
301 : * least nLen bytes otherwise). CPLUTF8ForceToASCII() can be used instead when
302 : * the input string is known to be UTF-8 encoded.
303 : *
304 : * @param pabyData input string to test
305 : * @param nLen length of the input string, or -1 if the function must compute
306 : * the string length. In which case it must be null terminated.
307 :
308 : * @param chReplacementChar character which will be used when the input stream
309 : * contains a non ASCII character. Must be valid ASCII!
310 : *
311 : * @return a new string that must be freed with CPLFree().
312 : *
313 : * @since GDAL 1.7.0
314 : */
315 5 : char *CPLForceToASCII(const char *pabyData, int nLen, char chReplacementChar)
316 : {
317 5 : const size_t nRealLen =
318 5 : (nLen >= 0) ? static_cast<size_t>(nLen) : strlen(pabyData);
319 5 : char *pszOutputString = static_cast<char *>(CPLMalloc(nRealLen + 1));
320 5 : const char *pszPtr = pabyData;
321 5 : const char *pszEnd = pabyData + nRealLen;
322 5 : size_t i = 0;
323 19 : while (pszPtr != pszEnd)
324 : {
325 14 : if (*reinterpret_cast<const unsigned char *>(pszPtr) > 127)
326 : {
327 3 : pszOutputString[i] = chReplacementChar;
328 3 : ++pszPtr;
329 3 : ++i;
330 : }
331 : else
332 : {
333 11 : pszOutputString[i] = *pszPtr;
334 11 : ++pszPtr;
335 11 : ++i;
336 : }
337 : }
338 5 : pszOutputString[i] = '\0';
339 5 : return pszOutputString;
340 : }
341 :
342 : /************************************************************************/
343 : /* CPLUTF8ForceToASCII() */
344 : /************************************************************************/
345 :
346 : /**
347 : * Return a new string that is made only of ASCII characters. If non-ASCII
348 : * characters are found in the input string, for which an "equivalent" ASCII
349 : * character is not found, they will be replaced by the provided replacement
350 : * character.
351 : *
352 : * This function is aware of https://en.wikipedia.org/wiki/Latin-1_Supplement
353 : * and https://en.wikipedia.org/wiki/Latin_Extended-A to provide sensible
354 : * replacements for accented characters.
355 :
356 : * @param pszStr NUL-terminated UTF-8 string.
357 : * @param chReplacementChar character which will be used when the input stream
358 : * contains a non ASCII character that cannot be
359 : * substituted with an equivalent ASCII character.
360 : * Must be valid ASCII!
361 : *
362 : * @return a new string that must be freed with CPLFree().
363 : *
364 : * @since GDAL 3.9
365 : */
366 16 : char *CPLUTF8ForceToASCII(const char *pszStr, char chReplacementChar)
367 : {
368 : static const struct
369 : {
370 : short nCodePoint;
371 : char chFirst;
372 : char chSecond;
373 : } aLatinCharacters[] = {
374 : // https://en.wikipedia.org/wiki/Latin-1_Supplement
375 : {0xC0, 'A', 0}, // Latin Capital Letter A with grave
376 : {0xC1, 'A', 0}, // Latin Capital letter A with acute
377 : {0xC2, 'A', 0}, // Latin Capital letter A with circumflex
378 : {0xC3, 'A', 0}, // Latin Capital letter A with tilde
379 : {0xC4, 'A', 0}, // Latin Capital letter A with diaeresis
380 : {0xC5, 'A', 0}, // Latin Capital letter A with ring above
381 : {0xC6, 'A', 'E'}, // Latin Capital letter AE
382 : {0xC7, 'C', 0}, // Latin Capital letter C with cedilla
383 : {0xC8, 'E', 0}, // Latin Capital letter E with grave
384 : {0xC9, 'E', 0}, // Latin Capital letter E with acute
385 : {0xCA, 'E', 0}, // Latin Capital letter E with circumflex
386 : {0xCB, 'E', 0}, // Latin Capital letter E with diaeresis
387 : {0xCC, 'I', 0}, // Latin Capital letter I with grave
388 : {0xCD, 'I', 0}, // Latin Capital letter I with acute
389 : {0xCE, 'I', 0}, // Latin Capital letter I with circumflex
390 : {0xCF, 'I', 0}, // Latin Capital letter I with diaeresis
391 : // { 0xD0, '?', 0 }, // Latin Capital letter Eth
392 : {0xD1, 'N', 0}, // Latin Capital letter N with tilde
393 : {0xD2, 'O', 0}, // Latin Capital letter O with grave
394 : {0xD3, 'O', 0}, // Latin Capital letter O with acute
395 : {0xD4, 'O', 0}, // Latin Capital letter O with circumflex
396 : {0xD5, 'O', 0}, // Latin Capital letter O with tilde
397 : {0xD6, 'O', 0}, // Latin Capital letter O with diaeresis
398 : {0xD8, 'O', 0}, // Latin Capital letter O with stroke
399 : {0xD9, 'U', 0}, // Latin Capital letter U with grave
400 : {0xDA, 'U', 0}, // Latin Capital letter U with acute
401 : {0xDB, 'U', 0}, // Latin Capital Letter U with circumflex
402 : {0xDC, 'U', 0}, // Latin Capital Letter U with diaeresis
403 : {0xDD, 'Y', 0}, // Latin Capital Letter Y with acute
404 : // { 0xDE, '?', 0 }, // Latin Capital Letter Thorn
405 : {0xDF, 'S', 'S'}, // Latin Small Letter sharp S
406 : {0xE0, 'a', 0}, // Latin Small Letter A with grave
407 : {0xE1, 'a', 0}, // Latin Small Letter A with acute
408 : {0xE2, 'a', 0}, // Latin Small Letter A with circumflex
409 : {0xE3, 'a', 0}, // Latin Small Letter A with tilde
410 : {0xE4, 'a', 0}, // Latin Small Letter A with diaeresis
411 : {0xE5, 'a', 0}, // Latin Small Letter A with ring above
412 : {0xE6, 'a', 'e'}, // Latin Small Letter AE
413 : {0xE7, 'c', 0}, // Latin Small Letter C with cedilla
414 : {0xE8, 'e', 0}, // Latin Small Letter E with grave
415 : {0xE9, 'e', 0}, // Latin Small Letter E with acute
416 : {0xEA, 'e', 0}, // Latin Small Letter E with circumflex
417 : {0xEB, 'e', 0}, // Latin Small Letter E with diaeresis
418 : {0xEC, 'i', 0}, // Latin Small Letter I with grave
419 : {0xED, 'i', 0}, // Latin Small Letter I with acute
420 : {0xEE, 'i', 0}, // Latin Small Letter I with circumflex
421 : {0xEF, 'i', 0}, // Latin Small Letter I with diaeresis
422 : // { 0xF0, '?', 0 }, // Latin Small Letter Eth
423 : {0xF1, 'n', 0}, // Latin Small Letter N with tilde
424 : {0xF2, 'o', 0}, // Latin Small Letter O with grave
425 : {0xF3, 'o', 0}, // Latin Small Letter O with acute
426 : {0xF4, 'o', 0}, // Latin Small Letter O with circumflex
427 : {0xF5, 'o', 0}, // Latin Small Letter O with tilde
428 : {0xF6, 'o', 0}, // Latin Small Letter O with diaeresis
429 : {0xF8, 'o', 0}, // Latin Small Letter O with stroke
430 : {0xF9, 'u', 0}, // Latin Small Letter U with grave
431 : {0xFA, 'u', 0}, // Latin Small Letter U with acute
432 : {0xFB, 'u', 0}, // Latin Small Letter U with circumflex
433 : {0xFC, 'u', 0}, // Latin Small Letter U with diaeresis
434 : {0xFD, 'y', 0}, // Latin Small Letter Y with acute
435 : // { 0xFE, '?', 0 }, // Latin Small Letter Thorn
436 : {0xFF, 'u', 0}, // Latin Small Letter Y with diaeresis
437 :
438 : // https://en.wikipedia.org/wiki/Latin_Extended-A
439 : {
440 : 0x0100,
441 : 'A',
442 : 0,
443 : }, // Latin Capital letter A with macron
444 : {
445 : 0x0101,
446 : 'a',
447 : 0,
448 : }, // Latin Small letter A with macron
449 : {
450 : 0x0102,
451 : 'A',
452 : 0,
453 : }, // Latin Capital letter A with breve
454 : {
455 : 0x0103,
456 : 'a',
457 : 0,
458 : }, // Latin Small letter A with breve
459 : {
460 : 0x0104,
461 : 'A',
462 : 0,
463 : }, // Latin Capital letter A with ogonek
464 : {
465 : 0x0105,
466 : 'a',
467 : 0,
468 : }, // Latin Small letter A with ogonek
469 : {
470 : 0x0106,
471 : 'C',
472 : 0,
473 : }, // Latin Capital letter C with acute
474 : {
475 : 0x0107,
476 : 'c',
477 : 0,
478 : }, // Latin Small letter C with acute
479 : {
480 : 0x0108,
481 : 'C',
482 : 0,
483 : }, // Latin Capital letter C with circumflex
484 : {
485 : 0x0109,
486 : 'c',
487 : 0,
488 : }, // Latin Small letter C with circumflex
489 : {
490 : 0x010A,
491 : 'C',
492 : 0,
493 : }, // Latin Capital letter C with dot above
494 : {
495 : 0x010B,
496 : 'c',
497 : 0,
498 : }, // Latin Small letter C with dot above
499 : {
500 : 0x010C,
501 : 'C',
502 : 0,
503 : }, // Latin Capital letter C with caron
504 : {
505 : 0x010D,
506 : 'c',
507 : 0,
508 : }, // Latin Small letter C with caron
509 : {
510 : 0x010E,
511 : 'D',
512 : 0,
513 : }, // Latin Capital letter D with caron
514 : {
515 : 0x010F,
516 : 'd',
517 : 0,
518 : }, // Latin Small letter D with caron
519 : {
520 : 0x0110,
521 : 'D',
522 : 0,
523 : }, // Latin Capital letter D with stroke
524 : {
525 : 0x0111,
526 : 'd',
527 : 0,
528 : }, // Latin Small letter D with stroke
529 : {
530 : 0x0112,
531 : 'E',
532 : 0,
533 : }, // Latin Capital letter E with macron
534 : {
535 : 0x0113,
536 : 'e',
537 : 0,
538 : }, // Latin Small letter E with macron
539 : {
540 : 0x0114,
541 : 'E',
542 : 0,
543 : }, // Latin Capital letter E with breve
544 : {
545 : 0x0115,
546 : 'e',
547 : 0,
548 : }, // Latin Small letter E with breve
549 : {
550 : 0x0116,
551 : 'E',
552 : 0,
553 : }, // Latin Capital letter E with dot above
554 : {
555 : 0x0117,
556 : 'e',
557 : 0,
558 : }, // Latin Small letter E with dot above
559 : {
560 : 0x0118,
561 : 'E',
562 : 0,
563 : }, // Latin Capital letter E with ogonek
564 : {
565 : 0x0119,
566 : 'e',
567 : 0,
568 : }, // Latin Small letter E with ogonek
569 : {
570 : 0x011A,
571 : 'E',
572 : 0,
573 : }, // Latin Capital letter E with caron
574 : {
575 : 0x011B,
576 : 'e',
577 : 0,
578 : }, // Latin Small letter E with caron
579 : {
580 : 0x011C,
581 : 'G',
582 : 0,
583 : }, // Latin Capital letter G with circumflex
584 : {
585 : 0x011D,
586 : 'g',
587 : 0,
588 : }, // Latin Small letter G with circumflex
589 : {
590 : 0x011E,
591 : 'G',
592 : 0,
593 : }, // Latin Capital letter G with breve
594 : {
595 : 0x011F,
596 : 'g',
597 : 0,
598 : }, // Latin Small letter G with breve
599 : {
600 : 0x0120,
601 : 'G',
602 : 0,
603 : }, // Latin Capital letter G with dot above
604 : {
605 : 0x0121,
606 : 'g',
607 : 0,
608 : }, // Latin Small letter G with dot above
609 : {
610 : 0x0122,
611 : 'G',
612 : 0,
613 : }, // Latin Capital letter G with cedilla
614 : {
615 : 0x0123,
616 : 'g',
617 : 0,
618 : }, // Latin Small letter G with cedilla
619 : {
620 : 0x0124,
621 : 'H',
622 : 0,
623 : }, // Latin Capital letter H with circumflex
624 : {
625 : 0x0125,
626 : 'h',
627 : 0,
628 : }, // Latin Small letter H with circumflex
629 : {
630 : 0x0126,
631 : 'H',
632 : 0,
633 : }, // Latin Capital letter H with stroke
634 : {
635 : 0x0127,
636 : 'h',
637 : 0,
638 : }, // Latin Small letter H with stroke
639 : {
640 : 0x0128,
641 : 'I',
642 : 0,
643 : }, // Latin Capital letter I with tilde
644 : {
645 : 0x0129,
646 : 'i',
647 : 0,
648 : }, // Latin Small letter I with tilde
649 : {
650 : 0x012A,
651 : 'I',
652 : 0,
653 : }, // Latin Capital letter I with macron
654 : {
655 : 0x012B,
656 : 'i',
657 : 0,
658 : }, // Latin Small letter I with macron
659 : {
660 : 0x012C,
661 : 'I',
662 : 0,
663 : }, // Latin Capital letter I with breve
664 : {
665 : 0x012D,
666 : 'i',
667 : 0,
668 : }, // Latin Small letter I with breve
669 : {
670 : 0x012E,
671 : 'I',
672 : 0,
673 : }, // Latin Capital letter I with ogonek
674 : {
675 : 0x012F,
676 : 'i',
677 : 0,
678 : }, // Latin Small letter I with ogonek
679 : {
680 : 0x0130,
681 : 'I',
682 : 0,
683 : }, // Latin Capital letter I with dot above
684 : {
685 : 0x0131,
686 : 'i',
687 : 0,
688 : }, // Latin Small letter dotless I
689 : {
690 : 0x0132,
691 : 'I',
692 : 'J',
693 : }, // Latin Capital Ligature IJ
694 : {
695 : 0x0133,
696 : 'i',
697 : 'j',
698 : }, // Latin Small Ligature IJ
699 : {
700 : 0x0134,
701 : 'J',
702 : 0,
703 : }, // Latin Capital letter J with circumflex
704 : {
705 : 0x0135,
706 : 'j',
707 : 0,
708 : }, // Latin Small letter J with circumflex
709 : {
710 : 0x0136,
711 : 'K',
712 : 0,
713 : }, // Latin Capital letter K with cedilla
714 : {
715 : 0x0137,
716 : 'k',
717 : 0,
718 : }, // Latin Small letter K with cedilla
719 : {
720 : 0x0138,
721 : 'k',
722 : 0,
723 : }, // Latin Small letter Kra
724 : {
725 : 0x0139,
726 : 'L',
727 : 0,
728 : }, // Latin Capital letter L with acute
729 : {
730 : 0x013A,
731 : 'l',
732 : 0,
733 : }, // Latin Small letter L with acute
734 : {
735 : 0x013B,
736 : 'L',
737 : 0,
738 : }, // Latin Capital letter L with cedilla
739 : {
740 : 0x013C,
741 : 'l',
742 : 0,
743 : }, // Latin Small letter L with cedilla
744 : {
745 : 0x013D,
746 : 'L',
747 : 0,
748 : }, // Latin Capital letter L with caron
749 : {
750 : 0x013E,
751 : 'l',
752 : 0,
753 : }, // Latin Small letter L with caron
754 : {
755 : 0x013F,
756 : 'L',
757 : 0,
758 : }, // Latin Capital letter L with middle dot
759 : {
760 : 0x0140,
761 : 'l',
762 : 0,
763 : }, // Latin Small letter L with middle dot
764 : {
765 : 0x0141,
766 : 'L',
767 : 0,
768 : }, // Latin Capital letter L with stroke
769 : {
770 : 0x0142,
771 : 'l',
772 : 0,
773 : }, // Latin Small letter L with stroke
774 : {
775 : 0x0143,
776 : 'N',
777 : 0,
778 : }, // Latin Capital letter N with acute
779 : {
780 : 0x0144,
781 : 'n',
782 : 0,
783 : }, // Latin Small letter N with acute
784 : {
785 : 0x0145,
786 : 'N',
787 : 0,
788 : }, // Latin Capital letter N with cedilla
789 : {
790 : 0x0146,
791 : 'n',
792 : 0,
793 : }, // Latin Small letter N with cedilla
794 : {
795 : 0x0147,
796 : 'N',
797 : 0,
798 : }, // Latin Capital letter N with caron
799 : {
800 : 0x0148,
801 : 'n',
802 : 0,
803 : }, // Latin Small letter N with caron
804 : // { 0x014A , '?' , 0, }, // Latin Capital letter Eng
805 : // { 0x014B , '?' , 0, }, // Latin Small letter Eng
806 : {
807 : 0x014C,
808 : 'O',
809 : 0,
810 : }, // Latin Capital letter O with macron
811 : {
812 : 0x014D,
813 : 'o',
814 : 0,
815 : }, // Latin Small letter O with macron
816 : {
817 : 0x014E,
818 : 'O',
819 : 0,
820 : }, // Latin Capital letter O with breve
821 : {
822 : 0x014F,
823 : 'o',
824 : 0,
825 : }, // Latin Small letter O with breve
826 : {
827 : 0x0150,
828 : 'O',
829 : 0,
830 : }, // Latin Capital Letter O with double acute
831 : {
832 : 0x0151,
833 : 'o',
834 : 0,
835 : }, // Latin Small Letter O with double acute
836 : {
837 : 0x0152,
838 : 'O',
839 : 'E',
840 : }, // Latin Capital Ligature OE
841 : {
842 : 0x0153,
843 : 'o',
844 : 'e',
845 : }, // Latin Small Ligature OE
846 : {
847 : 0x0154,
848 : 'R',
849 : 0,
850 : }, // Latin Capital letter R with acute
851 : {
852 : 0x0155,
853 : 'r',
854 : 0,
855 : }, // Latin Small letter R with acute
856 : {
857 : 0x0156,
858 : 'R',
859 : 0,
860 : }, // Latin Capital letter R with cedilla
861 : {
862 : 0x0157,
863 : 'r',
864 : 0,
865 : }, // Latin Small letter R with cedilla
866 : {
867 : 0x0158,
868 : 'R',
869 : 0,
870 : }, // Latin Capital letter R with caron
871 : {
872 : 0x0159,
873 : 'r',
874 : 0,
875 : }, // Latin Small letter R with caron
876 : {
877 : 0x015A,
878 : 'S',
879 : 0,
880 : }, // Latin Capital letter S with acute
881 : {
882 : 0x015B,
883 : 's',
884 : 0,
885 : }, // Latin Small letter S with acute
886 : {
887 : 0x015C,
888 : 'S',
889 : 0,
890 : }, // Latin Capital letter S with circumflex
891 : {
892 : 0x015D,
893 : 's',
894 : 0,
895 : }, // Latin Small letter S with circumflex
896 : {
897 : 0x015E,
898 : 'S',
899 : 0,
900 : }, // Latin Capital letter S with cedilla
901 : {
902 : 0x015F,
903 : 's',
904 : 0,
905 : }, // Latin Small letter S with cedilla
906 : {
907 : 0x0160,
908 : 'S',
909 : 0,
910 : }, // Latin Capital letter S with caron
911 : {
912 : 0x0161,
913 : 's',
914 : 0,
915 : }, // Latin Small letter S with caron
916 : {
917 : 0x0162,
918 : 'T',
919 : 0,
920 : }, // Latin Capital letter T with cedilla
921 : {
922 : 0x0163,
923 : 't',
924 : 0,
925 : }, // Latin Small letter T with cedilla
926 : {
927 : 0x0164,
928 : 'T',
929 : 0,
930 : }, // Latin Capital letter T with caron
931 : {
932 : 0x0165,
933 : 't',
934 : 0,
935 : }, // Latin Small letter T with caron
936 : {
937 : 0x0166,
938 : 'T',
939 : 0,
940 : }, // Latin Capital letter T with stroke
941 : {
942 : 0x0167,
943 : 't',
944 : 0,
945 : }, // Latin Small letter T with stroke
946 : {
947 : 0x0168,
948 : 'U',
949 : 0,
950 : }, // Latin Capital letter U with tilde
951 : {
952 : 0x0169,
953 : 'u',
954 : 0,
955 : }, // Latin Small letter U with tilde
956 : {
957 : 0x016A,
958 : 'U',
959 : 0,
960 : }, // Latin Capital letter U with macron
961 : {
962 : 0x016B,
963 : 'u',
964 : 0,
965 : }, // Latin Small letter U with macron
966 : {
967 : 0x016C,
968 : 'U',
969 : 0,
970 : }, // Latin Capital letter U with breve
971 : {
972 : 0x016D,
973 : 'u',
974 : 0,
975 : }, // Latin Small letter U with breve
976 : {
977 : 0x016E,
978 : 'U',
979 : 0,
980 : }, // Latin Capital letter U with ring above
981 : {
982 : 0x016F,
983 : 'u',
984 : 0,
985 : }, // Latin Small letter U with ring above
986 : {
987 : 0x0170,
988 : 'U',
989 : 0,
990 : }, // Latin Capital Letter U with double acute
991 : {
992 : 0x0171,
993 : 'u',
994 : 0,
995 : }, // Latin Small Letter U with double acute
996 : {
997 : 0x0172,
998 : 'U',
999 : 0,
1000 : }, // Latin Capital letter U with ogonek
1001 : {
1002 : 0x0173,
1003 : 'u',
1004 : 0,
1005 : }, // Latin Small letter U with ogonek
1006 : {
1007 : 0x0174,
1008 : 'W',
1009 : 0,
1010 : }, // Latin Capital letter W with circumflex
1011 : {
1012 : 0x0175,
1013 : 'w',
1014 : 0,
1015 : }, // Latin Small letter W with circumflex
1016 : {
1017 : 0x0176,
1018 : 'Y',
1019 : 0,
1020 : }, // Latin Capital letter Y with circumflex
1021 : {
1022 : 0x0177,
1023 : 'y',
1024 : 0,
1025 : }, // Latin Small letter Y with circumflex
1026 : {
1027 : 0x0178,
1028 : 'Y',
1029 : 0,
1030 : }, // Latin Capital letter Y with diaeresis
1031 : {
1032 : 0x0179,
1033 : 'Z',
1034 : 0,
1035 : }, // Latin Capital letter Z with acute
1036 : {
1037 : 0x017A,
1038 : 'z',
1039 : 0,
1040 : }, // Latin Small letter Z with acute
1041 : {
1042 : 0x017B,
1043 : 'Z',
1044 : 0,
1045 : }, // Latin Capital letter Z with dot above
1046 : {
1047 : 0x017C,
1048 : 'z',
1049 : 0,
1050 : }, // Latin Small letter Z with dot above
1051 : {
1052 : 0x017D,
1053 : 'Z',
1054 : 0,
1055 : }, // Latin Capital letter Z with caron
1056 : {
1057 : 0x017E,
1058 : 'z',
1059 : 0,
1060 : }, // Latin Small letter Z with caron
1061 : };
1062 :
1063 16 : const size_t nLen = strlen(pszStr);
1064 16 : char *pszOutputString = static_cast<char *>(CPLMalloc(nLen + 1));
1065 16 : const char *pszPtr = pszStr;
1066 16 : const char *pszEnd = pszStr + nLen;
1067 16 : size_t i = 0;
1068 248 : while (pszPtr != pszEnd)
1069 : {
1070 233 : if (*reinterpret_cast<const unsigned char *>(pszPtr) > 127)
1071 : {
1072 : utf8_int32_t codepoint;
1073 189 : if (pszPtr + utf8codepointcalcsize(pszPtr) > pszEnd)
1074 1 : break;
1075 188 : auto pszNext = utf8codepoint(pszPtr, &codepoint);
1076 188 : char ch = chReplacementChar;
1077 17075 : for (const auto &latin1char : aLatinCharacters)
1078 : {
1079 17073 : if (codepoint == latin1char.nCodePoint)
1080 : {
1081 186 : pszOutputString[i] = latin1char.chFirst;
1082 186 : ++i;
1083 186 : if (latin1char.chSecond)
1084 : {
1085 7 : pszOutputString[i] = latin1char.chSecond;
1086 7 : ++i;
1087 : }
1088 186 : ch = 0;
1089 186 : break;
1090 : }
1091 : }
1092 188 : if (ch)
1093 : {
1094 2 : pszOutputString[i] = ch;
1095 2 : ++i;
1096 : }
1097 188 : pszPtr = pszNext;
1098 : }
1099 : else
1100 : {
1101 44 : pszOutputString[i] = *pszPtr;
1102 44 : ++pszPtr;
1103 44 : ++i;
1104 : }
1105 : }
1106 16 : pszOutputString[i] = '\0';
1107 16 : return pszOutputString;
1108 : }
1109 :
1110 : /************************************************************************/
1111 : /* CPLEncodingCharSize() */
1112 : /************************************************************************/
1113 :
1114 : /**
1115 : * Return bytes per character for encoding.
1116 : *
1117 : * This function returns the size in bytes of the smallest character
1118 : * in this encoding. For fixed width encodings (ASCII, UCS-2, UCS-4) this
1119 : * is straight forward. For encodings like UTF8 and UTF16 which represent
1120 : * some characters as a sequence of atomic character sizes the function
1121 : * still returns the atomic character size (1 for UTF8, 2 for UTF16).
1122 : *
1123 : * This function will return the correct value for well known encodings
1124 : * with corresponding CPL_ENC_ values. It may not return the correct value
1125 : * for other encodings even if they are supported by the underlying iconv
1126 : * or windows transliteration services. Hopefully it will improve over time.
1127 : *
1128 : * @param pszEncoding the name of the encoding.
1129 : *
1130 : * @return the size of a minimal character in bytes or -1 if the size is
1131 : * unknown.
1132 : */
1133 :
1134 1 : int CPLEncodingCharSize(const char *pszEncoding)
1135 :
1136 : {
1137 1 : if (EQUAL(pszEncoding, CPL_ENC_UTF8))
1138 0 : return 1;
1139 1 : else if (EQUAL(pszEncoding, CPL_ENC_UTF16) ||
1140 1 : EQUAL(pszEncoding, "UTF-16LE"))
1141 1 : return 2;
1142 0 : else if (EQUAL(pszEncoding, CPL_ENC_UCS2) || EQUAL(pszEncoding, "UCS-2LE"))
1143 0 : return 2;
1144 0 : else if (EQUAL(pszEncoding, CPL_ENC_UCS4))
1145 0 : return 4;
1146 0 : else if (EQUAL(pszEncoding, CPL_ENC_ASCII))
1147 0 : return 1;
1148 0 : else if (STARTS_WITH_CI(pszEncoding, "ISO-8859-"))
1149 0 : return 1;
1150 :
1151 0 : return -1;
1152 : }
1153 :
1154 : /************************************************************************/
1155 : /* CPLClearRecodeWarningFlags() */
1156 : /************************************************************************/
1157 :
1158 10377 : void CPLClearRecodeWarningFlags()
1159 : {
1160 : #ifdef CPL_RECODE_ICONV
1161 10377 : CPLClearRecodeIconvWarningFlags();
1162 : #endif
1163 10377 : CPLClearRecodeStubWarningFlags();
1164 10377 : }
1165 :
1166 : /************************************************************************/
1167 : /* CPLStrlenUTF8() */
1168 : /************************************************************************/
1169 :
/**
 * Return the number of UTF-8 characters of a nul-terminated string.
 *
 * This is different from strlen() which returns the number of bytes: only
 * the bytes that are not UTF-8 continuation bytes (10xxxxxx) are counted.
 *
 * @param pszUTF8Str a nul-terminated UTF-8 string
 *
 * @return the number of UTF-8 characters. The result is capped at INT_MAX
 * for strings holding more characters than an int can represent.
 */

int CPLStrlenUTF8(const char *pszUTF8Str)
{
    int nCharacterCount = 0;
    // Walk the string through a pointer rather than an int index, so that
    // inputs of 2 GiB or more cannot overflow the index.
    for (const char *pszIter = pszUTF8Str; *pszIter != '\0'; ++pszIter)
    {
        // Count every byte that starts a character; saturate instead of
        // overflowing the signed counter.
        if ((*pszIter & 0xc0) != 0x80 && nCharacterCount < INT_MAX)
            ++nCharacterCount;
    }
    return nCharacterCount;
}
1190 :
1191 : /************************************************************************/
1192 : /* CPLCanRecode() */
1193 : /************************************************************************/
1194 :
1195 : /**
1196 : * Checks if it is possible to recode a string from one encoding to another.
1197 : *
1198 : * @param pszTestStr a NULL terminated string.
1199 : * @param pszSrcEncoding the source encoding.
1200 : * @param pszDstEncoding the destination encoding.
1201 : *
1202 : * @return a TRUE if recode is possible.
1203 : *
1204 : * @since GDAL 3.1.0
1205 : */
1206 5569 : int CPLCanRecode(const char *pszTestStr, const char *pszSrcEncoding,
1207 : const char *pszDstEncoding)
1208 : {
1209 5569 : CPLClearRecodeWarningFlags();
1210 5569 : CPLErrorReset();
1211 :
1212 5569 : CPLPushErrorHandler(CPLQuietErrorHandler);
1213 5569 : char *pszRec(CPLRecode(pszTestStr, pszSrcEncoding, pszDstEncoding));
1214 5569 : CPLPopErrorHandler();
1215 :
1216 5569 : if (pszRec == nullptr)
1217 : {
1218 0 : return FALSE;
1219 : }
1220 :
1221 5569 : CPLFree(pszRec);
1222 :
1223 5569 : if (CPLGetLastErrorType() != 0)
1224 : {
1225 1 : return FALSE;
1226 : }
1227 :
1228 5568 : return TRUE;
1229 : }
|