LCOV - code coverage report
Current view: top level - ogr/ogrsf_frmts/avc - avc_mbyte.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 18 153 11.8 %
Date: 2024-05-06 18:28:20 Functions: 4 9 44.4 %

          Line data    Source code
       1             : /* $Id$
       2             :  *
       3             :  * Name:     avc_mbyte.c
       4             :  * Project:  Arc/Info vector coverage (AVC)  E00->BIN conversion library
       5             :  * Language: ANSI C
       6             :  * Purpose:  Functions to handle multibyte character conversions.
       7             :  * Author:   Daniel Morissette, dmorissette@dmsolutions.ca
       8             :  *
       9             :  **********************************************************************
      10             :  * Copyright (c) 1999-2005, Daniel Morissette
      11             :  *
      12             :  * Permission is hereby granted, free of charge, to any person obtaining a
      13             :  * copy of this software and associated documentation files (the "Software"),
      14             :  * to deal in the Software without restriction, including without limitation
      15             :  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      16             :  * and/or sell copies of the Software, and to permit persons to whom the
      17             :  * Software is furnished to do so, subject to the following conditions:
      18             :  *
      19             :  * The above copyright notice and this permission notice shall be included
      20             :  * in all copies or substantial portions of the Software.
      21             :  *
      22             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
      23             :  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      24             :  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
      25             :  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
      26             :  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
      27             :  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
      28             :  * DEALINGS IN THE SOFTWARE.
      29             :  **********************************************************************/
      30             : 
      31             : #include "avc.h"
      32             : 
      33             : #ifdef _WIN32
      34             : #include <mbctype.h>
      35             : #endif
      36             : 
      37             : static int _AVCDetectJapaneseEncoding(const GByte *pszLine);
      38             : static const GByte *_AVCJapanese2ArcDBCS(AVCDBCSInfo *psDBCSInfo,
      39             :                                          const GByte *pszLine,
      40             :                                          int nMaxOutputLen);
      41             : static const GByte *_AVCArcDBCS2JapaneseShiftJIS(AVCDBCSInfo *psDBCSInfo,
      42             :                                                  const GByte *pszLine,
      43             :                                                  int nMaxOutputLen);
      44             : 
      45             : /*=====================================================================
      46             :  * Functions to handle multibyte char conversions
      47             :  *====================================================================*/
      48             : 
      49             : #define IS_ASCII(c) ((c) < 0x80)
      50             : 
      51             : /**********************************************************************
      52             :  *                          AVCAllocDBCSInfo()
      53             :  *
      54             :  * Alloc and init a new AVCDBCSInfo structure.
      55             :  **********************************************************************/
      56           3 : AVCDBCSInfo *AVCAllocDBCSInfo(void)
      57             : {
      58             :     AVCDBCSInfo *psInfo;
      59             : 
      60           3 :     psInfo = (AVCDBCSInfo *)CPLCalloc(1, sizeof(AVCDBCSInfo));
      61             : 
      62           3 :     psInfo->nDBCSCodePage = AVCGetDBCSCodePage();
      63           3 :     psInfo->nDBCSEncoding = AVC_CODE_UNKNOWN;
      64           3 :     psInfo->pszDBCSBuf = nullptr;
      65           3 :     psInfo->nDBCSBufSize = 0;
      66             : 
      67           3 :     return psInfo;
      68             : }
      69             : 
      70             : /**********************************************************************
      71             :  *                          AVCFreeDBCSInfo()
      72             :  *
      73             :  * Release all memory associated with an AVCDBCSInfo structure.
      74             :  **********************************************************************/
      75           3 : void AVCFreeDBCSInfo(AVCDBCSInfo *psInfo)
      76             : {
      77           3 :     if (psInfo)
      78             :     {
      79           3 :         CPLFree(psInfo->pszDBCSBuf);
      80           3 :         CPLFree(psInfo);
      81             :     }
      82           3 : }
      83             : 
      84             : /**********************************************************************
      85             :  *                          AVCGetDBCSCodePage()
      86             :  *
      87             :  * Fetch current multibyte codepage on the system.
      88             :  * Returns a valid codepage number, or 0 if the codepage is single byte or
      89             :  * unsupported.
      90             :  **********************************************************************/
      91           3 : int AVCGetDBCSCodePage(void)
      92             : {
      93             : #ifdef _WIN32
      94             :     int nCP;
      95             :     nCP = _getmbcp();
      96             : 
      97             :     /* Check if that's a supported codepage */
      98             :     if (nCP == AVC_DBCS_JAPANESE)
      99             :         return nCP;
     100             : #endif
     101             : 
     102           3 :     return 0;
     103             : }
     104             : 
     105             : /**********************************************************************
     106             :  *                          AVCE00DetectEncoding()
     107             :  *
     108             :  * Try to detect the encoding used in the current file by examining lines
     109             :  * of input.
     110             :  *
     111             :  * Returns TRUE once the encoding is established, or FALSE if more lines
     112             :  * of input are required to establish the encoding.
     113             :  **********************************************************************/
     114           0 : GBool AVCE00DetectEncoding(AVCDBCSInfo *psDBCSInfo, const GByte *pszLine)
     115             : {
     116           0 :     if (psDBCSInfo == nullptr || psDBCSInfo->nDBCSCodePage == 0 ||
     117           0 :         psDBCSInfo->nDBCSEncoding != AVC_CODE_UNKNOWN)
     118             :     {
     119             :         /* Either single byte codepage, or encoding has already been detected
     120             :          */
     121           0 :         return TRUE;
     122             :     }
     123             : 
     124           0 :     switch (psDBCSInfo->nDBCSCodePage)
     125             :     {
     126           0 :         case AVC_DBCS_JAPANESE:
     127           0 :             psDBCSInfo->nDBCSEncoding = _AVCDetectJapaneseEncoding(pszLine);
     128           0 :             break;
     129           0 :         default:
     130           0 :             psDBCSInfo->nDBCSEncoding = AVC_CODE_UNKNOWN;
     131           0 :             return TRUE; /* Codepage not supported... no need to scan more
     132             :                             lines*/
     133             :     }
     134             : 
     135           0 :     if (psDBCSInfo->nDBCSEncoding != AVC_CODE_UNKNOWN)
     136           0 :         return TRUE; /* We detected the encoding! */
     137             : 
     138           0 :     return FALSE;
     139             : }
     140             : 
     141             : /**********************************************************************
     142             :  *                          AVCE00Convert2ArcDBCS()
     143             :  *
     144             :  * If encoding is still unknown, try to detect the encoding used in the
     145             :  * current file, and then convert the string to an encoding valid for output
     146             :  * to a coverage.
     147             :  *
     148             :  * Returns a reference to a const buffer that should not be freed by the
     149             :  * caller.  It can be either the original string buffer or a ref. to an
     150             :  * internal buffer.
     151             :  **********************************************************************/
     152           0 : const GByte *AVCE00Convert2ArcDBCS(AVCDBCSInfo *psDBCSInfo,
     153             :                                    const GByte *pszLine, int nMaxOutputLen)
     154             : {
     155           0 :     const GByte *pszOutBuf = nullptr;
     156           0 :     GByte *pszTmp = nullptr;
     157             :     GBool bAllAscii;
     158             : 
     159           0 :     if (psDBCSInfo == nullptr || psDBCSInfo->nDBCSCodePage == 0 ||
     160             :         pszLine == nullptr)
     161             :     {
     162             :         /* Single byte codepage... nothing to do
     163             :          */
     164           0 :         return pszLine;
     165             :     }
     166             : 
     167             :     /* If string is all ASCII then there is nothing to do...
     168             :      */
     169           0 :     pszTmp = (GByte *)pszLine;
     170           0 :     for (bAllAscii = TRUE; bAllAscii && pszTmp && *pszTmp; pszTmp++)
     171             :     {
     172           0 :         if (!IS_ASCII(*pszTmp))
     173           0 :             bAllAscii = FALSE;
     174             :     }
     175           0 :     if (bAllAscii)
     176           0 :         return pszLine;
     177             : 
     178             :     /* Make sure output buffer is large enough.
     179             :      * We add 2 chars to buffer size to simplify processing... no need to
     180             :      * check if second byte of a pair would overflow buffer.
     181             :      */
     182           0 :     if (psDBCSInfo->pszDBCSBuf == nullptr ||
     183           0 :         psDBCSInfo->nDBCSBufSize < nMaxOutputLen + 2)
     184             :     {
     185           0 :         psDBCSInfo->nDBCSBufSize = nMaxOutputLen + 2;
     186           0 :         psDBCSInfo->pszDBCSBuf = (GByte *)CPLRealloc(
     187           0 :             psDBCSInfo->pszDBCSBuf, psDBCSInfo->nDBCSBufSize * sizeof(GByte));
     188             :     }
     189             : 
     190             :     /* Do the conversion according to current code page
     191             :      */
     192           0 :     switch (psDBCSInfo->nDBCSCodePage)
     193             :     {
     194           0 :         case AVC_DBCS_JAPANESE:
     195             :             pszOutBuf =
     196           0 :                 _AVCJapanese2ArcDBCS(psDBCSInfo, pszLine, nMaxOutputLen);
     197           0 :             break;
     198           0 :         default:
     199             :             /* We should never get here anyways, but just in case return pszLine
     200             :              */
     201           0 :             CPLAssert(FALSE); /* Should never get here. */
     202             :             pszOutBuf = pszLine;
     203             :     }
     204             : 
     205           0 :     return pszOutBuf;
     206             : }
     207             : 
     208             : /**********************************************************************
     209             :  *                          AVCE00ConvertFromArcDBCS()
     210             :  *
     211             :  * Convert DBCS encoding in binary coverage files to E00 encoding.
     212             :  *
     213             :  * Returns a reference to a const buffer that should not be freed by the
     214             :  * caller.  It can be either the original string buffer or a ref. to an
     215             :  * internal buffer.
     216             :  **********************************************************************/
     217         286 : const GByte *AVCE00ConvertFromArcDBCS(AVCDBCSInfo *psDBCSInfo,
     218             :                                       const GByte *pszLine, int nMaxOutputLen)
     219             : {
     220         286 :     const GByte *pszOutBuf = nullptr;
     221             :     GByte *pszTmp;
     222             :     GBool bAllAscii;
     223             : 
     224         286 :     if (psDBCSInfo == nullptr || psDBCSInfo->nDBCSCodePage == 0 ||
     225             :         pszLine == nullptr)
     226             :     {
     227             :         /* Single byte codepage... nothing to do
     228             :          */
     229         286 :         return pszLine;
     230             :     }
     231             : 
     232             :     /* If string is all ASCII then there is nothing to do...
     233             :      */
     234           0 :     pszTmp = (GByte *)pszLine;
     235           0 :     for (bAllAscii = TRUE; bAllAscii && pszTmp && *pszTmp; pszTmp++)
     236             :     {
     237           0 :         if (!IS_ASCII(*pszTmp))
     238           0 :             bAllAscii = FALSE;
     239             :     }
     240           0 :     if (bAllAscii)
     241           0 :         return pszLine;
     242             : 
     243             :     /* Make sure output buffer is large enough.
     244             :      * We add 2 chars to buffer size to simplify processing... no need to
     245             :      * check if second byte of a pair would overflow buffer.
     246             :      */
     247           0 :     if (psDBCSInfo->pszDBCSBuf == nullptr ||
     248           0 :         psDBCSInfo->nDBCSBufSize < nMaxOutputLen + 2)
     249             :     {
     250           0 :         psDBCSInfo->nDBCSBufSize = nMaxOutputLen + 2;
     251           0 :         psDBCSInfo->pszDBCSBuf = (GByte *)CPLRealloc(
     252           0 :             psDBCSInfo->pszDBCSBuf, psDBCSInfo->nDBCSBufSize * sizeof(GByte));
     253             :     }
     254             : 
     255             :     /* Do the conversion according to current code page
     256             :      */
     257           0 :     switch (psDBCSInfo->nDBCSCodePage)
     258             :     {
     259           0 :         case AVC_DBCS_JAPANESE:
     260           0 :             pszOutBuf = _AVCArcDBCS2JapaneseShiftJIS(psDBCSInfo, pszLine,
     261             :                                                      nMaxOutputLen);
     262           0 :             break;
     263           0 :         default:
     264             :             /* We should never get here anyways, but just in case return pszLine
     265             :              */
     266           0 :             pszOutBuf = pszLine;
     267             :     }
     268             : 
     269           0 :     return pszOutBuf;
     270             : }
     271             : 
     272             : /*=====================================================================
     273             :  *=====================================================================
     274             :  * Functions Specific to Japanese encoding (CodePage 932).
     275             :  *
     276             :  * For now we assume that we can receive only Katakana, Shift-JIS, or EUC
     277             :  * encoding as input.  Coverages use EUC encoding in most cases, except
     278             :  * for Katakana characters that are prefixed with a 0x8e byte.
     279             :  *
     280             :  * Most of the Japanese conversion functions are based on information and
     281             :  * algorithms found at:
     282             :  *  http://www.mars.dti.ne.jp/~torao/program/appendix/japanese-en.html
     283             :  *=====================================================================
     284             :  *====================================================================*/
     285             : 
     286             : /**********************************************************************
     287             :  *                          _AVCDetectJapaneseEncoding()
     288             :  *
     289             :  * Scan a line of text to try to establish the type of Japanese encoding.
     290             :  *
     291             :  * Returns the encoding number (AVC_CODE_JAP_*), or AVC_CODE_UNKNOWN if no
     292             :  * specific encoding was detected.
     293             :  **********************************************************************/
     294             : 
     295             : #define IS_JAP_SHIFTJIS_1(c) ((c) >= 0x81 && (c) <= 0x9f)
     296             : #define IS_JAP_SHIFTJIS_2(c)                                                   \
     297             :     (((c) >= 0x40 && (c) <= 0x7e) || ((c) >= 0x80 && (c) <= 0xA0))
     298             : #define IS_JAP_EUC_1(c) ((c) >= 0xF0 && (c) <= 0xFE)
     299             : #define IS_JAP_EUC_2(c) ((c) >= 0xFD && (c) <= 0xFE)
     300             : #define IS_JAP_KANA(c) ((c) >= 0xA1 && (c) <= 0xDF)
     301             : 
     302           0 : static int _AVCDetectJapaneseEncoding(const GByte *pszLine)
     303             : {
     304           0 :     int nEncoding = AVC_CODE_UNKNOWN;
     305             : 
     306           0 :     for (; nEncoding == AVC_CODE_UNKNOWN && pszLine && *pszLine; pszLine++)
     307             :     {
     308           0 :         if (IS_ASCII(*pszLine))
     309           0 :             continue;
     310           0 :         else if (IS_JAP_SHIFTJIS_1(*pszLine))
     311             :         {
     312           0 :             nEncoding = AVC_CODE_JAP_SHIFTJIS;
     313           0 :             break;
     314             :         }
     315           0 :         else if (IS_JAP_KANA(*pszLine) && *(pszLine + 1) &&
     316           0 :                  *(pszLine + 1) <= 0xA0)
     317             :         {
     318           0 :             nEncoding = AVC_CODE_JAP_SHIFTJIS; /* SHIFT-JIS + Kana */
     319           0 :             break;
     320             :         }
     321           0 :         else if (IS_JAP_EUC_1(*pszLine))
     322             :         {
     323           0 :             nEncoding = AVC_CODE_JAP_EUC;
     324           0 :             break;
     325             :         }
     326             : 
     327           0 :         if (*(++pszLine) == '\0')
     328           0 :             break;
     329             : 
     330           0 :         if (IS_JAP_SHIFTJIS_2(*pszLine))
     331             :         {
     332           0 :             nEncoding = AVC_CODE_JAP_SHIFTJIS;
     333           0 :             break;
     334             :         }
     335           0 :         else if (IS_JAP_EUC_2(*pszLine))
     336             :         {
     337           0 :             nEncoding = AVC_CODE_JAP_EUC;
     338           0 :             break;
     339             :         }
     340             :     }
     341             : 
     342           0 :     return nEncoding;
     343             : }
     344             : 
     345             : /**********************************************************************
     346             :  *                          _AVCJapanese2ArcDBCS()
     347             :  *
     348             :  * Try to detect type of Japanese encoding if not done yet, and convert
     349             :  * string from Japanese to proper coverage DBCS encoding.
     350             :  **********************************************************************/
     351           0 : static const GByte *_AVCJapanese2ArcDBCS(AVCDBCSInfo *psDBCSInfo,
     352             :                                          const GByte *pszLine,
     353             :                                          int nMaxOutputLen)
     354             : {
     355             :     GByte *pszOut;
     356             :     int iDst;
     357             : 
     358           0 :     pszOut = psDBCSInfo->pszDBCSBuf;
     359             : 
     360           0 :     if (psDBCSInfo->nDBCSEncoding == AVC_CODE_UNKNOWN)
     361             :     {
     362             :         /* Type of encoding (Shift-JIS or EUC) not known yet... try to
     363             :          * detect it now.
     364             :          */
     365           0 :         psDBCSInfo->nDBCSEncoding = _AVCDetectJapaneseEncoding(pszLine);
     366             : 
     367             : #if 0
     368             :         if (psDBCSInfo->nDBCSEncoding == AVC_CODE_JAP_SHIFTJIS)
     369             :         {
     370             :             printf("Found Japanese Shift-JIS encoding\n");/*ok*/
     371             :         }
     372             :         else if (psDBCSInfo->nDBCSEncoding == AVC_CODE_JAP_EUC)
     373             :         {
     374             :             printf("Found Japanese EUC encoding\n");/*ok*/
     375             :         }
     376             : #endif
     377             :     }
     378             : 
     379           0 :     for (iDst = 0; *pszLine && iDst < nMaxOutputLen; pszLine++)
     380             :     {
     381           0 :         if (IS_ASCII(*pszLine))
     382             :         {
     383             :             /* No transformation required for ASCII */
     384           0 :             pszOut[iDst++] = *pszLine;
     385             :         }
     386           0 :         else if (psDBCSInfo->nDBCSEncoding == AVC_CODE_JAP_EUC &&
     387           0 :                  *(pszLine + 1))
     388             :         {
     389             :             /* This must be a pair of EUC chars and both should be in
     390             :              * the range 0xA1-0xFE
     391             :              */
     392           0 :             pszOut[iDst++] = *(pszLine++);
     393           0 :             pszOut[iDst++] = *pszLine;
     394             :         }
     395           0 :         else if (IS_JAP_KANA(*pszLine))
     396             :         {
     397             :             /* Katakana char. prefix it with 0x8e */
     398           0 :             pszOut[iDst++] = 0x8e;
     399           0 :             pszOut[iDst++] = *pszLine;
     400             :         }
     401           0 :         else if (*(pszLine + 1))
     402             :         {
     403             :             /* This must be a pair of Shift-JIS chars... convert them to EUC
     404             :              *
     405             :              * If we haven't been able to establish the encoding for sure
     406             :              * yet, then it is possible that a pair of EUC chars could be
     407             :              * treated as shift-JIS here... but there is not much we can do
     408             :              * about that unless we scan the whole E00 input before we
     409             :              * start the conversion.
     410             :              */
     411             :             unsigned char leader, trailer;
     412           0 :             leader = *(pszLine++);
     413           0 :             trailer = *pszLine;
     414             : 
     415           0 :             if (leader <= 0x9F)
     416           0 :                 leader -= 0x71;
     417             :             else
     418           0 :                 leader -= 0xB1;
     419           0 :             leader = (leader << 1) + 1;
     420             : 
     421           0 :             if (trailer > 0x7F)
     422           0 :                 trailer--;
     423           0 :             if (trailer >= 0x9E)
     424             :             {
     425           0 :                 trailer -= 0x7D;
     426           0 :                 leader++;
     427             :             }
     428             :             else
     429             :             {
     430           0 :                 trailer -= 0x1F;
     431             :             }
     432             : 
     433           0 :             pszOut[iDst++] = leader | 0x80;
     434           0 :             pszOut[iDst++] = trailer | 0x80;
     435             :         }
     436             :         else
     437             :         {
     438             :             /* We should never get here unless a double-byte pair was
     439             :              * truncated... but just in case...
     440             :              */
     441           0 :             pszOut[iDst++] = *pszLine;
     442             :         }
     443             :     }
     444             : 
     445           0 :     pszOut[iDst] = '\0';
     446             : 
     447           0 :     return psDBCSInfo->pszDBCSBuf;
     448             : }
     449             : 
     450             : /**********************************************************************
     451             :  *                          _AVCArcDBCS2JapaneseShiftJIS()
     452             :  *
     453             :  * Convert string from coverage DBCS (EUC) to Japanese Shift-JIS.
     454             :  *
     455             :  * We know that binary coverages use a custom EUC encoding for Japanese
     456             :  * which is EUC + all Katakana chars are prefixed with 0x8e.  So this
     457             :  * function just does a simple conversion.
     458             :  **********************************************************************/
     459           0 : static const GByte *_AVCArcDBCS2JapaneseShiftJIS(AVCDBCSInfo *psDBCSInfo,
     460             :                                                  const GByte *pszLine,
     461             :                                                  int nMaxOutputLen)
     462             : {
     463             :     GByte *pszOut;
     464             :     int iDst;
     465             : 
     466           0 :     pszOut = psDBCSInfo->pszDBCSBuf;
     467             : 
     468           0 :     for (iDst = 0; *pszLine && iDst < nMaxOutputLen; pszLine++)
     469             :     {
     470           0 :         if (IS_ASCII(*pszLine))
     471             :         {
     472             :             /* No transformation required for ASCII */
     473           0 :             pszOut[iDst++] = *pszLine;
     474             :         }
     475           0 :         else if (*pszLine == 0x8e && *(pszLine + 1))
     476             :         {
     477           0 :             pszLine++; /* Flush the 0x8e */
     478           0 :             pszOut[iDst++] = *pszLine;
     479             :         }
     480           0 :         else if (*(pszLine + 1))
     481             :         {
     482             :             /* This is a pair of EUC chars... convert them to Shift-JIS
     483             :              */
     484             :             unsigned char leader, trailer;
     485           0 :             leader = *(pszLine++) & 0x7F;
     486           0 :             trailer = *pszLine & 0x7F;
     487             : 
     488           0 :             if ((leader & 0x01) != 0)
     489           0 :                 trailer += 0x1F;
     490             :             else
     491           0 :                 trailer += 0x7D;
     492           0 :             if (trailer >= 0x7F)
     493           0 :                 trailer++;
     494             : 
     495           0 :             leader = ((leader - 0x21) >> 1) + 0x81;
     496           0 :             if (leader > 0x9F)
     497           0 :                 leader += 0x40;
     498             : 
     499           0 :             pszOut[iDst++] = leader;
     500           0 :             pszOut[iDst++] = trailer;
     501             :         }
     502             :         else
     503             :         {
     504             :             /* We should never get here unless a double-byte pair was
     505             :              * truncated... but just in case...
     506             :              */
     507           0 :             pszOut[iDst++] = *pszLine;
     508             :         }
     509             :     }
     510             : 
     511           0 :     pszOut[iDst] = '\0';
     512             : 
     513           0 :     return psDBCSInfo->pszDBCSBuf;
     514             : }

Generated by: LCOV version 1.14