Compiler: Fix broken UTF8 support (#329)
This commit is contained in:
parent
1f0dc33875
commit
a54b9f05a7
|
@ -726,8 +726,6 @@ int mfputs(MEMFILE *mf,char *string);
|
||||||
SC_FUNC int cp_path(const char *root,const char *directory);
|
SC_FUNC int cp_path(const char *root,const char *directory);
|
||||||
SC_FUNC int cp_set(const char *name);
|
SC_FUNC int cp_set(const char *name);
|
||||||
SC_FUNC cell cp_translate(const unsigned char *string,const unsigned char **endptr);
|
SC_FUNC cell cp_translate(const unsigned char *string,const unsigned char **endptr);
|
||||||
SC_FUNC cell get_utf8_char(const unsigned char *string,const unsigned char **endptr);
|
|
||||||
SC_FUNC int scan_utf8(FILE *fp,const char *filename);
|
|
||||||
|
|
||||||
/* function prototypes in SCSTATE.C */
|
/* function prototypes in SCSTATE.C */
|
||||||
SC_FUNC constvalue *automaton_add(const char *name);
|
SC_FUNC constvalue *automaton_add(const char *name);
|
||||||
|
@ -803,7 +801,6 @@ SC_VDECL int sc_status; /* read/write status */
|
||||||
SC_VDECL int sc_rationaltag; /* tag for rational numbers */
|
SC_VDECL int sc_rationaltag; /* tag for rational numbers */
|
||||||
SC_VDECL int rational_digits; /* number of fractional digits */
|
SC_VDECL int rational_digits; /* number of fractional digits */
|
||||||
SC_VDECL int sc_allowproccall;/* allow/detect tagnames in lex() */
|
SC_VDECL int sc_allowproccall;/* allow/detect tagnames in lex() */
|
||||||
SC_VDECL short sc_is_utf8; /* is this source file in UTF-8 encoding */
|
|
||||||
SC_VDECL char *pc_deprecate; /* if non-NULL, mark next declaration as deprecated */
|
SC_VDECL char *pc_deprecate; /* if non-NULL, mark next declaration as deprecated */
|
||||||
SC_VDECL int sc_warnings_are_errors;
|
SC_VDECL int sc_warnings_are_errors;
|
||||||
|
|
||||||
|
|
|
@ -152,7 +152,6 @@ static char *extensions[] = { ".inc", ".p", ".pawn" };
|
||||||
PUSHSTK_I(iflevel);
|
PUSHSTK_I(iflevel);
|
||||||
assert(!SKIPPING);
|
assert(!SKIPPING);
|
||||||
assert(skiplevel==iflevel); /* these two are always the same when "parsing" */
|
assert(skiplevel==iflevel); /* these two are always the same when "parsing" */
|
||||||
PUSHSTK_I(sc_is_utf8);
|
|
||||||
PUSHSTK_I(icomment);
|
PUSHSTK_I(icomment);
|
||||||
PUSHSTK_I(fcurrent);
|
PUSHSTK_I(fcurrent);
|
||||||
PUSHSTK_I(fline);
|
PUSHSTK_I(fline);
|
||||||
|
@ -169,7 +168,6 @@ static char *extensions[] = { ".inc", ".p", ".pawn" };
|
||||||
assert(sc_status == statFIRST || strcmp(get_inputfile(fcurrent), inpfname) == 0);
|
assert(sc_status == statFIRST || strcmp(get_inputfile(fcurrent), inpfname) == 0);
|
||||||
setfiledirect(inpfname); /* (optionally) set in the list file */
|
setfiledirect(inpfname); /* (optionally) set in the list file */
|
||||||
listline=-1; /* force a #line directive when changing the file */
|
listline=-1; /* force a #line directive when changing the file */
|
||||||
sc_is_utf8=(short)scan_utf8(inpf,name);
|
|
||||||
return TRUE;
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -319,7 +317,6 @@ static void readline(unsigned char *line)
|
||||||
fline=i;
|
fline=i;
|
||||||
fcurrent=(short)POPSTK_I();
|
fcurrent=(short)POPSTK_I();
|
||||||
icomment=(short)POPSTK_I();
|
icomment=(short)POPSTK_I();
|
||||||
sc_is_utf8=(short)POPSTK_I();
|
|
||||||
iflevel=(short)POPSTK_I();
|
iflevel=(short)POPSTK_I();
|
||||||
skiplevel=iflevel; /* this condition held before including the file */
|
skiplevel=iflevel; /* this condition held before including the file */
|
||||||
assert(!SKIPPING); /* idem ditto */
|
assert(!SKIPPING); /* idem ditto */
|
||||||
|
@ -2395,21 +2392,12 @@ static cell litchar(const unsigned char **lptr,int flags)
|
||||||
|
|
||||||
cptr=*lptr;
|
cptr=*lptr;
|
||||||
if ((flags & RAWMODE)!=0 || *cptr!=sc_ctrlchar) { /* no escape character */
|
if ((flags & RAWMODE)!=0 || *cptr!=sc_ctrlchar) { /* no escape character */
|
||||||
#if !defined NO_UTF8
|
|
||||||
if (sc_is_utf8 && (flags & UTF8MODE)!=0) {
|
|
||||||
c=get_utf8_char(cptr,&cptr);
|
|
||||||
assert(c>=0); /* file was already scanned for conformance to UTF-8 */
|
|
||||||
} else {
|
|
||||||
#endif
|
|
||||||
#if !defined NO_CODEPAGE
|
#if !defined NO_CODEPAGE
|
||||||
c=cp_translate(cptr,&cptr);
|
c=cp_translate(cptr,&cptr);
|
||||||
#else
|
#else
|
||||||
c=*cptr;
|
c=*cptr;
|
||||||
cptr+=1;
|
cptr+=1;
|
||||||
#endif
|
#endif
|
||||||
#if !defined NO_UTF8
|
|
||||||
} /* if */
|
|
||||||
#endif
|
|
||||||
} else {
|
} else {
|
||||||
cptr+=1;
|
cptr+=1;
|
||||||
if (*cptr==sc_ctrlchar) {
|
if (*cptr==sc_ctrlchar) {
|
||||||
|
|
|
@ -309,120 +309,3 @@ SC_FUNC cell cp_translate(const unsigned char *string,const unsigned char **endp
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* NO_CODEPAGE */
|
#endif /* NO_CODEPAGE */
|
||||||
|
|
||||||
#if !defined NO_UTF8
|
|
||||||
/*  get_utf8_char
 *
 *  Decodes a single UTF-8 encoded character.
 *
 *  string    points at the first byte of the sequence to decode
 *  endptr    may be NULL; otherwise it is set to "string" on entry and, when
 *            decoding succeeds, to the byte that follows the decoded sequence
 *            (on failure it keeps pointing at "string")
 *
 *  Returns the decoded code point, or -1 when the bytes are not acceptable:
 *  an invalid leader byte, a missing or unexpected continuation byte, an
 *  encoding that uses more bytes than needed, or one of the code positions
 *  0xd800--0xdfff, 0xfffe and 0xffff.
 *
 *  A zero byte decodes as the character 0 (and is consumed); a zero byte in
 *  the middle of a multi-byte sequence is rejected, so the function does not
 *  read beyond a string terminator.
 */
SC_FUNC cell get_utf8_char(const unsigned char *string,const unsigned char **endptr)
{
  int follow=0;         /* number of continuation bytes that must still follow */
  long lowmark=0;       /* lowest value that requires the current sequence length */
  unsigned char ch;
  cell result=0;

  if (endptr!=NULL)
    *endptr=string;

  for ( ;; ) {
    ch=*string++;

    if (follow>0 && (ch & 0xc0)==0x80) {
      /* leader code is active, combine with earlier code */
      result=(result << 6) | (ch & 0x3f);
      if (--follow==0) {
        /* encoding a character in more bytes than is strictly needed,
         * is not really valid UTF-8; we are strict here to increase
         * the chance of heuristic detection of non-UTF-8 text
         * (JAVA writes zero bytes as a 2-byte code UTF-8, which is invalid)
         */
        if (result<lowmark)
          return -1;
        /* the code positions 0xd800--0xdfff and 0xfffe & 0xffff do not
         * exist in UCS-4 (and hence, they do not exist in Unicode)
         */
        if ((result>=0xd800 && result<=0xdfff) || result==0xfffe || result==0xffff)
          return -1;
        /* only leave the loop once the last continuation byte was handled;
         * stopping after the first continuation byte would truncate every
         * sequence of three or more bytes
         */
        break;
      } /* if */
    } else if (follow==0 && (ch & 0x80)==0x80) {
      /* UTF-8 leader code */
      if ((ch & 0xe0)==0xc0) {
        /* 110xxxxx 10xxxxxx */
        follow=1;
        lowmark=0x80L;
        result=ch & 0x1f;
      } else if ((ch & 0xf0)==0xe0) {
        /* 1110xxxx 10xxxxxx 10xxxxxx (16 bits, BMP plane) */
        follow=2;
        lowmark=0x800L;
        result=ch & 0x0f;
      } else if ((ch & 0xf8)==0xf0) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        follow=3;
        lowmark=0x10000L;
        result=ch & 0x07;
      } else if ((ch & 0xfc)==0xf8) {
        /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
        follow=4;
        lowmark=0x200000L;
        result=ch & 0x03;
      } else if ((ch & 0xfe)==0xfc) {
        /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (32 bits) */
        follow=5;
        lowmark=0x4000000L;
        result=ch & 0x01;
      } else {
        /* this is invalid UTF-8 (a continuation byte without a leader,
         * or the bytes 0xfe/0xff)
         */
        return -1;
      } /* if */
    } else if (follow==0 && (ch & 0x80)==0x00) {
      /* 0xxxxxxx (US-ASCII) */
      result=ch;
      break;
    } else {
      /* this is invalid UTF-8 (a leader was seen, but this byte is not a
       * continuation byte)
       */
      return -1;
    } /* if */

  } /* for */

  if (endptr!=NULL)
    *endptr=string;
  return result;
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
SC_FUNC int scan_utf8(FILE *fp,const char *filename)
|
|
||||||
{
|
|
||||||
#if defined NO_UTF8
|
|
||||||
return 0;
|
|
||||||
#else
|
|
||||||
static void *resetpos=NULL;
|
|
||||||
int utf8=TRUE;
|
|
||||||
int firstchar=TRUE,bom_found=FALSE;
|
|
||||||
const unsigned char *ptr;
|
|
||||||
|
|
||||||
resetpos=pc_getpossrc(fp);
|
|
||||||
while (utf8 && pc_readsrc(fp,pline,sLINEMAX)!=NULL) {
|
|
||||||
ptr=pline;
|
|
||||||
if (firstchar) {
|
|
||||||
/* check whether the very first character on the very first line
|
|
||||||
* starts with a BYTE order mark
|
|
||||||
*/
|
|
||||||
cell c=get_utf8_char(ptr,&ptr);
|
|
||||||
bom_found= (c==0xfeff);
|
|
||||||
utf8= (c>=0);
|
|
||||||
firstchar=FALSE;
|
|
||||||
} /* if */
|
|
||||||
while (utf8 && *ptr!='\0')
|
|
||||||
utf8= (get_utf8_char(ptr,&ptr)>=0);
|
|
||||||
} /* while */
|
|
||||||
pc_resetsrc(fp,resetpos);
|
|
||||||
if (bom_found) {
|
|
||||||
unsigned char bom[3];
|
|
||||||
if (!utf8)
|
|
||||||
error(77,filename); /* malformed UTF-8 encoding */
|
|
||||||
pc_readsrc(fp,bom,3);
|
|
||||||
assert(bom[0]==0xef && bom[1]==0xbb && bom[2]==0xbf);
|
|
||||||
} /* if */
|
|
||||||
return utf8;
|
|
||||||
#endif /* NO_UTF8 */
|
|
||||||
}
|
|
||||||
|
|
|
@ -84,7 +84,6 @@ SC_VDEFINE int sc_status; /* read/write status */
|
||||||
SC_VDEFINE int sc_rationaltag=0; /* tag for rational numbers */
|
SC_VDEFINE int sc_rationaltag=0; /* tag for rational numbers */
|
||||||
SC_VDEFINE int rational_digits=0; /* number of fractional digits */
|
SC_VDEFINE int rational_digits=0; /* number of fractional digits */
|
||||||
SC_VDEFINE int sc_allowproccall=0; /* allow/detect tagnames in lex() */
|
SC_VDEFINE int sc_allowproccall=0; /* allow/detect tagnames in lex() */
|
||||||
SC_VDEFINE short sc_is_utf8=FALSE; /* is this source file in UTF-8 encoding */
|
|
||||||
SC_VDEFINE char *pc_deprecate = NULL;/* if non-null, mark next declaration as deprecated */
|
SC_VDEFINE char *pc_deprecate = NULL;/* if non-null, mark next declaration as deprecated */
|
||||||
SC_VDEFINE int sc_showincludes=0; /* show include files */
|
SC_VDEFINE int sc_showincludes=0; /* show include files */
|
||||||
SC_VDEFINE int sc_warnings_are_errors=0;
|
SC_VDEFINE int sc_warnings_are_errors=0;
|
||||||
|
|
Loading…
Reference in New Issue
Block a user