429 lines
13 KiB
C
Executable File
429 lines
13 KiB
C
Executable File
/* Codepage translation to Unicode, and UTF-8 support
|
|
*
|
|
* The translation is based on codepage mapping files that are distributed
|
|
* by the Unicode consortium, see ftp://ftp.unicode.org/Public/MAPPINGS/.
|
|
*
|
|
* Character sets with a maximum of 256 codes are translated via a lookup
|
|
* table (these are Single-Byte Character Sets). Character sets like Shift-JIS
|
|
* with single-byte characters and multi-byte characters (introduced by a
|
|
* leader byte) are split into two tables: the 256-entry lookup table for
|
|
* the single-byte characters and an extended table for the multi-byte
|
|
* characters. The extended table is allocated dynamically; the lookup table
|
|
* is allocated statically, so loading SBCS tables cannot fail (if the tables
|
|
* themselves are valid, of course).
|
|
*
|
|
* Copyright (c) ITB CompuPhase, 2004-2005
|
|
*
|
|
* This software is provided "as-is", without any express or implied warranty.
|
|
* In no event will the authors be held liable for any damages arising from
|
|
* the use of this software.
|
|
*
|
|
* Permission is granted to anyone to use this software for any purpose,
|
|
* including commercial applications, and to alter it and redistribute it
|
|
* freely, subject to the following restrictions:
|
|
*
|
|
* 1. The origin of this software must not be misrepresented; you must not
|
|
* claim that you wrote the original software. If you use this software in
|
|
* a product, an acknowledgment in the product documentation would be
|
|
* appreciated but is not required.
|
|
* 2. Altered source versions must be plainly marked as such, and must not be
|
|
* misrepresented as being the original software.
|
|
* 3. This notice may not be removed or altered from any source distribution.
|
|
*
|
|
* Version: $Id$
|
|
*/
|
|
#include <assert.h>
|
|
#include <stdio.h>
|
|
#include <stddef.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include "sc.h"
|
|
|
|
#if !defined TRUE
|
|
#define FALSE 0
|
|
#define TRUE 1
|
|
#endif
|
|
#if !defined _MAX_PATH
|
|
#define _MAX_PATH 250
|
|
#endif
|
|
#if !defined DIRSEP_CHAR
|
|
#if defined LINUX || defined __FreeBSD__ || defined __OpenBSD__
|
|
#define DIRSEP_CHAR '/'
|
|
#elif defined macintosh
|
|
#define DIRSEP_CHAR ':'
|
|
#else
|
|
#define DIRSEP_CHAR '\\'
|
|
#endif
|
|
#endif
|
|
|
|
#if !defined ELEMENTS
|
|
#define ELEMENTS(array) (sizeof(array) / sizeof(array[0]))
|
|
#endif
|
|
|
|
#if !defined NO_CODEPAGE
|
|
|
|
#if !defined MAXCODEPAGE
|
|
#define MAXCODEPAGE 12 /* typically "cp" + 4 digits + ".txt" */
|
|
#endif
|
|
#define INVALID 0xffffu /* 0xffff and 0xfffe are invalid Unicode characters */
|
|
#define LEADBYTE 0xfffeu
|
|
|
|
struct wordpair {
|
|
unsigned short index;
|
|
wchar_t code;
|
|
};
|
|
static char cprootpath[_MAX_PATH] = { DIRSEP_CHAR, '\0' };
|
|
static wchar_t bytetable[256];
|
|
static struct wordpair *wordtable = NULL;
|
|
static unsigned wordtablesize = 0;
|
|
static unsigned wordtabletop = 0;
|
|
|
|
|
|
/* cp_readline() reads one line from "fp" into "string". The buffer has room
 * for "size" bytes, including the terminating zero byte ("size" must be at
 * least 2).
 *
 * A line ends at a '\r' or a '\n'; the line terminator is NOT stored in the
 * string. Empty lines are skipped, which also takes care of "\r\n" line ends
 * (the '\n' is then seen as an empty line).
 *
 * A line with more than "size"-1 characters is returned in pieces: the next
 * call continues at the first character that did not fit in the buffer.
 *
 * Returns 1 if at least one character was stored, and 0 otherwise (end of
 * file, or a read error, before any character of a line was read).
 */
static int cp_readline(FILE *fp,char *string,size_t size)
{
  size_t count=0;
  int c;

  assert(size>1);
  /* check for room in the buffer BEFORE fetching a character; in the reverse
   * order, the character that does not fit is read from the stream and then
   * lost
   */
  while (count<size-1 && (c=fgetc(fp))!=EOF) {
    if (c=='\r' || c=='\n') {
      if (count>0)      /* '\r' or '\n' ends a string */
        break;
      /* if count==0, the line started with a '\r' or '\n', or perhaps line
       * ends in the file are '\r\n' and we read and stopped on the '\r' of
       * the preceding line
       */
    } else {
      string[count++]=(char)c;
    } /* if */
  } /* while */
  string[count]='\0';
  return count>0;
}
|
|
|
|
/* cp_path() sets the directory where all codepage files must be found (if
|
|
* the parameter to cp_set() specifies a full path, that is used instead).
|
|
* The path is specified into two parts: root and directory; the full path
|
|
* for the codepage direcory is just the concatenation of the two, with a
|
|
* directory separator in between. The directory is given in two parts,
|
|
* because often a program already retrieves its "home" directory and the
|
|
* codepages are most conveniently stored in a subdirectory of this home
|
|
* directory.
|
|
*/
|
|
SC_FUNC int cp_path(const char *root, const char *directory)
|
|
{
|
|
size_t len1,len2;
|
|
int add_slash1,add_slash2;
|
|
|
|
len1= (root!=NULL) ? strlen(root) : 0;
|
|
add_slash1= (len1==0 || root[len1-1]!=DIRSEP_CHAR);
|
|
len2= (directory!=NULL) ? strlen(directory) : 0;
|
|
add_slash2= (len2>0 && root[len2-1]!=DIRSEP_CHAR);
|
|
if (len1+add_slash1+len2+add_slash2>=(_MAX_PATH-MAXCODEPAGE))
|
|
return FALSE; /* full filename may not fit */
|
|
if (root!=NULL)
|
|
strcpy(cprootpath,root);
|
|
if (add_slash1) {
|
|
assert(len1==0 || cprootpath[len1]=='\0');
|
|
cprootpath[len1]=DIRSEP_CHAR;
|
|
cprootpath[len1+1]='\0';
|
|
} /* if */
|
|
if (directory!=NULL)
|
|
strcat(cprootpath,directory);
|
|
if (add_slash2) {
|
|
assert(cprootpath[len1+add_slash1+len2]=='\0');
|
|
cprootpath[len1+add_slash1+len2]=DIRSEP_CHAR;
|
|
cprootpath[len1+add_slash1+len2+1]='\0';
|
|
} /* if */
|
|
cp_set(NULL); /* start with a "linear" table (no translation) */
|
|
return TRUE;
|
|
}
|
|
|
|
/* cp_set() loads a codepage from a file. The name parameter may be a
|
|
* filename (including a full path) or it may be a partial codepage name.
|
|
* If the name parameter is NULL, the codepage is cleared to be a "linear"
|
|
* table (no translation).
|
|
* The following files are attempted to open (where <name> specifies the
|
|
* value of the parameter):
|
|
* <name>
|
|
* <cprootpath>/<name>
|
|
* <cprootpath>/<name>.txt
|
|
* <cprootpath>/cp<name>
|
|
* <cprootpath>/cp<name>.txt
|
|
*/
|
|
SC_FUNC int cp_set(const char *name)
|
|
{
|
|
char filename[_MAX_PATH];
|
|
FILE *fp=NULL;
|
|
unsigned index;
|
|
|
|
/* for name==NULL, set up an identity table */
|
|
if (name==NULL || *name=='\0') {
|
|
if (wordtable!=NULL) {
|
|
free(wordtable);
|
|
wordtable=NULL;
|
|
wordtablesize=0;
|
|
wordtabletop=0;
|
|
} /* if */
|
|
for (index=0; index<ELEMENTS(bytetable); index++)
|
|
bytetable[index]=(wchar_t)index;
|
|
return TRUE;
|
|
} /* if */
|
|
|
|
/* try to open the file as-is */
|
|
if (strchr(name,DIRSEP_CHAR)!=NULL)
|
|
fp=fopen(name,"rt");
|
|
if (fp==NULL) {
|
|
/* try opening the file in the "root path" for codepages */
|
|
if (strlen(name)>MAXCODEPAGE)
|
|
return 0;
|
|
assert(strlen(name)+strlen(cprootpath)<_MAX_PATH);
|
|
strcpy(filename,cprootpath);
|
|
strcat(filename,name);
|
|
fp=fopen(filename,"rt");
|
|
} /* if */
|
|
if (fp==NULL) {
|
|
/* try opening the file in the "root path" for codepages, with a ".txt" extension */
|
|
if (strlen(name)+4>=MAXCODEPAGE)
|
|
return 0;
|
|
assert(strlen(filename)+4<_MAX_PATH);
|
|
strcat(filename,".txt");
|
|
fp=fopen(filename,"rt");
|
|
} /* if */
|
|
if (fp==NULL) {
|
|
/* try opening the file in the "root path" for codepages, with "cp" prefixed before the name */
|
|
if (strlen(name)+2>MAXCODEPAGE)
|
|
return 0;
|
|
assert(2+strlen(name)+strlen(cprootpath)<_MAX_PATH);
|
|
strcpy(filename,cprootpath);
|
|
strcat(filename,"cp");
|
|
strcat(filename,name);
|
|
fp=fopen(filename,"rt");
|
|
} /* if */
|
|
if (fp==NULL) {
|
|
/* try opening the file in the "root path" for codepages, with "cp" prefixed an ".txt" appended */
|
|
if (strlen(name)+2+4>MAXCODEPAGE)
|
|
return 0;
|
|
assert(strlen(filename)+4<_MAX_PATH);
|
|
strcat(filename,".txt");
|
|
fp=fopen(filename,"rt");
|
|
} /* if */
|
|
if (fp==NULL)
|
|
return FALSE; /* all failed */
|
|
|
|
/* clear the tables */
|
|
for (index=0; index<ELEMENTS(bytetable); index++)
|
|
bytetable[index]=INVALID; /* special code meaning "not found" */
|
|
assert(wordtablesize==0 && wordtabletop==0 && wordtable==NULL
|
|
|| wordtablesize>0 && wordtable!=NULL);
|
|
if (wordtable!=NULL) {
|
|
free(wordtable);
|
|
wordtable=NULL;
|
|
wordtablesize=0;
|
|
wordtabletop=0;
|
|
} /* if */
|
|
|
|
/* read in the table */
|
|
while (cp_readline(fp,filename,sizeof filename)) {
|
|
char *ptr;
|
|
if ((ptr=strchr(filename,'#'))!=NULL)
|
|
*ptr='\0'; /* strip of comment */
|
|
for (ptr=filename; *ptr>0 && *ptr<' '; ptr++)
|
|
/* nothing */; /* skip leading whitespace */
|
|
if (*ptr!='\0') {
|
|
/* content on line */
|
|
unsigned code=LEADBYTE;
|
|
int num=sscanf(ptr,"%i %i",&index,&code);
|
|
/* if sscanf() returns 1 and the index is in range 0..255, then the
|
|
* code is a DBCS lead byte; if sscanf() returns 2 and index>=256, this
|
|
* is a double byte pair (lead byte + follower)
|
|
*/
|
|
if (num>=1 && index<256) {
|
|
bytetable[index]=(wchar_t)code;
|
|
} else if (num==2 && index>=256 && index<LEADBYTE) {
|
|
/* store the DBCS character in wordtable */
|
|
if (wordtabletop>=wordtablesize) {
|
|
/* grow the list */
|
|
int newsize;
|
|
struct wordpair *newblock;
|
|
newsize= (wordtablesize==0) ? 128 : 2*wordtablesize;
|
|
newblock=(struct wordpair *)malloc(newsize*sizeof(*wordtable));
|
|
if (newblock!=NULL) {
|
|
memcpy(newblock,wordtable,wordtabletop*sizeof(*wordtable));
|
|
free(wordtable);
|
|
wordtable=newblock;
|
|
wordtablesize=newsize;
|
|
} /* if */
|
|
} /* if */
|
|
if (wordtabletop<wordtablesize) {
|
|
/* insert at sorted position */
|
|
int pos=wordtabletop;
|
|
assert(wordtable!=NULL);
|
|
while (pos>0 && (unsigned)wordtable[pos-1].index>index) {
|
|
wordtable[pos]=wordtable[pos-1];
|
|
pos--;
|
|
} /* while */
|
|
wordtable[pos].index=(unsigned short)index;
|
|
wordtable[pos].code=(wchar_t)code;
|
|
} /* if */
|
|
} /* if */
|
|
} /* if */
|
|
} /* while */
|
|
|
|
fclose(fp);
|
|
return TRUE;
|
|
}
|
|
|
|
/* cp_translate() translates the character at the start of "string" through
 * the tables of the active codepage and returns the translated code.
 *
 * The first byte is looked up in the byte table. If that gives the LEADBYTE
 * marker and a word table is loaded, the first two bytes are combined into
 * a 16-bit key, which is looked up in the (sorted) word table. When the key
 * is not in the word table, the LEADBYTE marker itself is returned.
 *
 * If "endptr" is not NULL, it is set to the byte behind the one or two bytes
 * that were consumed.
 */
SC_FUNC cell cp_translate(const unsigned char *string,const unsigned char **endptr)
{
  const unsigned char *next=string+1;   /* a single byte is always consumed */
  wchar_t result=bytetable[string[0]];

  /* check whether this is a leader code */
  if ((unsigned)result==LEADBYTE && wordtable!=NULL) {
    unsigned short key=(unsigned short)((string[0] << 8) | string[1]);
    int first=0;
    int last=wordtabletop-1;

    next=string+2;                      /* the trail byte is consumed too */
    assert(wordtabletop>0);
    /* binary search for the lowest entry that is not below the key */
    while (first<last) {
      int probe=(first+last)/2;
      assert(first<=probe && probe<last);
      if (wordtable[probe].index<key)
        first=probe+1;
      else
        last=probe;
    } /* while */
    assert(first==last);
    if (key==wordtable[first].index)
      result=wordtable[first].code;
  } /* if */

  if (endptr!=NULL)
    *endptr=next;
  return (cell)result;
}
|
|
|
|
#endif /* NO_CODEPAGE */
|
|
|
|
#if !defined NO_UTF8
|
|
/* get_utf8_char() decodes the UTF-8 sequence at the start of "string" and
 * returns its code. Sequences of 1 to 6 bytes are accepted.
 *
 * The function returns -1 on invalid UTF-8:
 *   - a follower byte (10xxxxxx) without a leader byte, or the bytes 0xfe/0xff
 *   - a leader byte that is not followed by the required number of follower
 *     bytes (this includes a zero terminator inside a sequence)
 *   - a code that is stored in more bytes than needed ("overlong" encoding)
 *   - the codes 0xd800..0xdfff, 0xfffe and 0xffff
 *
 * If "endptr" is not NULL, it is set to the byte behind the decoded sequence
 * on success. On failure it is set to "string" (nothing is consumed).
 */
SC_FUNC cell get_utf8_char(const unsigned char *string,const unsigned char **endptr)
{
  int follow=0;         /* number of follower bytes that are still expected */
  long lowmark=0;       /* lowest code that needs a sequence of this length */
  unsigned char ch;
  cell result=0;

  if (endptr!=NULL)
    *endptr=string;     /* preset for the error exits */

  for ( ;; ) {
    ch=*string++;

    if (follow>0 && (ch & 0xc0)==0x80) {
      /* leader code is active, combine with earlier code */
      result=(result << 6) | (ch & 0x3f);
      if (--follow==0) {
        /* encoding a character in more bytes than is strictly needed,
         * is not really valid UTF-8; we are strict here to increase
         * the chance of heuristic detection of non-UTF-8 text
         * (JAVA writes zero bytes as a 2-byte code UTF-8, which is invalid)
         */
        if (result<lowmark)
          return -1;
        /* the code positions 0xd800--0xdfff and 0xfffe & 0xffff do not
         * exist in UCS-4 (and hence, they do not exist in Unicode)
         */
        if ((result>=0xd800 && result<=0xdfff) || result==0xfffe || result==0xffff)
          return -1;
        /* the sequence is complete; the loop must only be left here, because
         * leaving it while follower bytes are still pending truncates every
         * sequence of three or more bytes
         */
        break;
      } /* if */
    } else if (follow==0 && (ch & 0x80)==0x80) {
      /* UTF-8 leader code */
      if ((ch & 0xe0)==0xc0) {
        /* 110xxxxx 10xxxxxx */
        follow=1;
        lowmark=0x80L;
        result=ch & 0x1f;
      } else if ((ch & 0xf0)==0xe0) {
        /* 1110xxxx 10xxxxxx 10xxxxxx (16 bits, BMP plane) */
        follow=2;
        lowmark=0x800L;
        result=ch & 0x0f;
      } else if ((ch & 0xf8)==0xf0) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        follow=3;
        lowmark=0x10000L;
        result=ch & 0x07;
      } else if ((ch & 0xfc)==0xf8) {
        /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
        follow=4;
        lowmark=0x200000L;
        result=ch & 0x03;
      } else if ((ch & 0xfe)==0xfc) {
        /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (32 bits) */
        follow=5;
        lowmark=0x4000000L;
        result=ch & 0x01;
      } else {
        /* this is invalid UTF-8 */
        return -1;
      } /* if */
    } else if (follow==0 && (ch & 0x80)==0x00) {
      /* 0xxxxxxx (US-ASCII) */
      result=ch;
      break;
    } else {
      /* this is invalid UTF-8 (a leader code that is not followed by enough
       * follower bytes)
       */
      return -1;
    } /* if */

  } /* for */

  if (endptr!=NULL)
    *endptr=string;
  return result;
}
|
|
#endif
|
|
|
|
SC_FUNC int scan_utf8(FILE *fp,const char *filename)
|
|
{
|
|
#if defined NO_UTF8
|
|
return 0;
|
|
#else
|
|
void *resetpos=pc_getpossrc(fp);
|
|
int utf8=TRUE;
|
|
int firstchar=TRUE,bom_found=FALSE;
|
|
const unsigned char *ptr;
|
|
|
|
while (utf8 && pc_readsrc(fp,pline,sLINEMAX)!=NULL) {
|
|
ptr=pline;
|
|
if (firstchar) {
|
|
/* check whether the very first character on the very first line
|
|
* starts with a BYTE order mark
|
|
*/
|
|
cell c=get_utf8_char(ptr,&ptr);
|
|
bom_found= (c==0xfeff);
|
|
utf8= (c>=0);
|
|
firstchar=FALSE;
|
|
} /* if */
|
|
while (utf8 && *ptr!='\0')
|
|
utf8= (get_utf8_char(ptr,&ptr)>=0);
|
|
} /* while */
|
|
pc_resetsrc(fp,resetpos);
|
|
if (bom_found) {
|
|
unsigned char bom[3];
|
|
if (!utf8)
|
|
error(77,filename); /* malformed UTF-8 encoding */
|
|
pc_readsrc(fp,bom,3);
|
|
assert(bom[0]==0xef && bom[1]==0xbb && bom[2]==0xbf);
|
|
} /* if */
|
|
return utf8;
|
|
#endif /* NO_UTF8 */
|
|
}
|