Compiler: Fix broken UTF8 support (#329)
This commit is contained in:
parent
1f0dc33875
commit
a54b9f05a7
@ -726,8 +726,6 @@ int mfputs(MEMFILE *mf,char *string);
|
||||
SC_FUNC int cp_path(const char *root,const char *directory);
|
||||
SC_FUNC int cp_set(const char *name);
|
||||
SC_FUNC cell cp_translate(const unsigned char *string,const unsigned char **endptr);
|
||||
SC_FUNC cell get_utf8_char(const unsigned char *string,const unsigned char **endptr);
|
||||
SC_FUNC int scan_utf8(FILE *fp,const char *filename);
|
||||
|
||||
/* function prototypes in SCSTATE.C */
|
||||
SC_FUNC constvalue *automaton_add(const char *name);
|
||||
@ -803,7 +801,6 @@ SC_VDECL int sc_status; /* read/write status */
|
||||
SC_VDECL int sc_rationaltag; /* tag for rational numbers */
|
||||
SC_VDECL int rational_digits; /* number of fractional digits */
|
||||
SC_VDECL int sc_allowproccall;/* allow/detect tagnames in lex() */
|
||||
SC_VDECL short sc_is_utf8; /* is this source file in UTF-8 encoding */
|
||||
SC_VDECL char *pc_deprecate; /* if non-NULL, mark next declaration as deprecated */
|
||||
SC_VDECL int sc_warnings_are_errors;
|
||||
|
||||
|
@ -152,7 +152,6 @@ static char *extensions[] = { ".inc", ".p", ".pawn" };
|
||||
PUSHSTK_I(iflevel);
|
||||
assert(!SKIPPING);
|
||||
assert(skiplevel==iflevel); /* these two are always the same when "parsing" */
|
||||
PUSHSTK_I(sc_is_utf8);
|
||||
PUSHSTK_I(icomment);
|
||||
PUSHSTK_I(fcurrent);
|
||||
PUSHSTK_I(fline);
|
||||
@ -169,7 +168,6 @@ static char *extensions[] = { ".inc", ".p", ".pawn" };
|
||||
assert(sc_status == statFIRST || strcmp(get_inputfile(fcurrent), inpfname) == 0);
|
||||
setfiledirect(inpfname); /* (optionally) set in the list file */
|
||||
listline=-1; /* force a #line directive when changing the file */
|
||||
sc_is_utf8=(short)scan_utf8(inpf,name);
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
@ -319,7 +317,6 @@ static void readline(unsigned char *line)
|
||||
fline=i;
|
||||
fcurrent=(short)POPSTK_I();
|
||||
icomment=(short)POPSTK_I();
|
||||
sc_is_utf8=(short)POPSTK_I();
|
||||
iflevel=(short)POPSTK_I();
|
||||
skiplevel=iflevel; /* this condition held before including the file */
|
||||
assert(!SKIPPING); /* idem ditto */
|
||||
@ -2395,21 +2392,12 @@ static cell litchar(const unsigned char **lptr,int flags)
|
||||
|
||||
cptr=*lptr;
|
||||
if ((flags & RAWMODE)!=0 || *cptr!=sc_ctrlchar) { /* no escape character */
|
||||
#if !defined NO_UTF8
|
||||
if (sc_is_utf8 && (flags & UTF8MODE)!=0) {
|
||||
c=get_utf8_char(cptr,&cptr);
|
||||
assert(c>=0); /* file was already scanned for conformance to UTF-8 */
|
||||
} else {
|
||||
#endif
|
||||
#if !defined NO_CODEPAGE
|
||||
c=cp_translate(cptr,&cptr);
|
||||
#else
|
||||
c=*cptr;
|
||||
cptr+=1;
|
||||
#endif
|
||||
#if !defined NO_UTF8
|
||||
} /* if */
|
||||
#endif
|
||||
} else {
|
||||
cptr+=1;
|
||||
if (*cptr==sc_ctrlchar) {
|
||||
|
@ -309,120 +309,3 @@ SC_FUNC cell cp_translate(const unsigned char *string,const unsigned char **endp
|
||||
}
|
||||
|
||||
#endif /* NO_CODEPAGE */
|
||||
|
||||
#if !defined NO_UTF8
|
||||
/* get_utf8_char
 *
 * Decodes one UTF-8 encoded character that starts at "string".
 *
 * string   pointer to the first byte of the (possibly multi-byte) character
 * endptr   optional (may be NULL); on entry it is set to "string", and on
 *          success it is advanced to the byte following the decoded
 *          character. When decoding fails, it is left at "string".
 *
 * Returns the decoded code point, or -1 if the bytes are not valid UTF-8.
 * Sequences of up to 6 bytes are accepted. Rejected are: over-long
 * encodings, the surrogate range 0xd800--0xdfff, the code points 0xfffe and
 * 0xffff, stray continuation bytes and truncated sequences.
 */
SC_FUNC cell get_utf8_char(const unsigned char *string,const unsigned char **endptr)
{
  int follow=0;         /* number of continuation bytes still expected */
  long lowmark=0;       /* smallest value that legitimately needs this length */
  unsigned char ch;
  cell result=0;

  if (endptr!=NULL)
    *endptr=string;

  for ( ;; ) {
    ch=*string++;

    if (follow>0 && (ch & 0xc0)==0x80) {
      /* leader code is active, combine with earlier code */
      result=(result << 6) | (ch & 0x3f);
      if (--follow==0) {
        /* encoding a character in more bytes than is strictly needed,
         * is not really valid UTF-8; we are strict here to increase
         * the chance of heuristic detection of non-UTF-8 text
         * (JAVA writes zero bytes as a 2-byte code UTF-8, which is invalid)
         */
        if (result<lowmark)
          return -1;
        /* the code positions 0xd800--0xdfff and 0xfffe & 0xffff do not
         * exist in UCS-4 (and hence, they do not exist in Unicode)
         */
        if ((result>=0xd800 && result<=0xdfff) || result==0xfffe || result==0xffff)
          return -1;
        /* the sequence is complete; only now may the loop be left (leaving
         * it after the first continuation byte would truncate every
         * sequence of 3 or more bytes)
         */
        break;
      } /* if */
      /* otherwise, more continuation bytes must follow: keep reading */
    } else if (follow==0 && (ch & 0x80)==0x80) {
      /* UTF-8 leader code */
      if ((ch & 0xe0)==0xc0) {
        /* 110xxxxx 10xxxxxx */
        follow=1;
        lowmark=0x80L;
        result=ch & 0x1f;
      } else if ((ch & 0xf0)==0xe0) {
        /* 1110xxxx 10xxxxxx 10xxxxxx (16 bits, BMP plane) */
        follow=2;
        lowmark=0x800L;
        result=ch & 0x0f;
      } else if ((ch & 0xf8)==0xf0) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        follow=3;
        lowmark=0x10000L;
        result=ch & 0x07;
      } else if ((ch & 0xfc)==0xf8) {
        /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
        follow=4;
        lowmark=0x200000L;
        result=ch & 0x03;
      } else if ((ch & 0xfe)==0xfc) {
        /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (32 bits) */
        follow=5;
        lowmark=0x4000000L;
        result=ch & 0x01;
      } else {
        /* this is invalid UTF-8 (a continuation byte without a leader,
         * or the bytes 0xfe/0xff)
         */
        return -1;
      } /* if */
    } else if (follow==0 && (ch & 0x80)==0x00) {
      /* 0xxxxxxx (US-ASCII) */
      result=ch;
      break;
    } else {
      /* this is invalid UTF-8 (a leader code that is not followed by the
       * required number of continuation bytes)
       */
      return -1;
    } /* if */

  } /* for */

  if (endptr!=NULL)
    *endptr=string;
  return result;
}
|
||||
#endif
|
||||
|
||||
SC_FUNC int scan_utf8(FILE *fp,const char *filename)
|
||||
{
|
||||
#if defined NO_UTF8
|
||||
return 0;
|
||||
#else
|
||||
static void *resetpos=NULL;
|
||||
int utf8=TRUE;
|
||||
int firstchar=TRUE,bom_found=FALSE;
|
||||
const unsigned char *ptr;
|
||||
|
||||
resetpos=pc_getpossrc(fp);
|
||||
while (utf8 && pc_readsrc(fp,pline,sLINEMAX)!=NULL) {
|
||||
ptr=pline;
|
||||
if (firstchar) {
|
||||
/* check whether the very first character on the very first line
|
||||
* starts with a BYTE order mark
|
||||
*/
|
||||
cell c=get_utf8_char(ptr,&ptr);
|
||||
bom_found= (c==0xfeff);
|
||||
utf8= (c>=0);
|
||||
firstchar=FALSE;
|
||||
} /* if */
|
||||
while (utf8 && *ptr!='\0')
|
||||
utf8= (get_utf8_char(ptr,&ptr)>=0);
|
||||
} /* while */
|
||||
pc_resetsrc(fp,resetpos);
|
||||
if (bom_found) {
|
||||
unsigned char bom[3];
|
||||
if (!utf8)
|
||||
error(77,filename); /* malformed UTF-8 encoding */
|
||||
pc_readsrc(fp,bom,3);
|
||||
assert(bom[0]==0xef && bom[1]==0xbb && bom[2]==0xbf);
|
||||
} /* if */
|
||||
return utf8;
|
||||
#endif /* NO_UTF8 */
|
||||
}
|
||||
|
@ -84,7 +84,6 @@ SC_VDEFINE int sc_status; /* read/write status */
|
||||
SC_VDEFINE int sc_rationaltag=0; /* tag for rational numbers */
|
||||
SC_VDEFINE int rational_digits=0; /* number of fractional digits */
|
||||
SC_VDEFINE int sc_allowproccall=0; /* allow/detect tagnames in lex() */
|
||||
SC_VDEFINE short sc_is_utf8=FALSE; /* is this source file in UTF-8 encoding */
|
||||
SC_VDEFINE char *pc_deprecate = NULL;/* if non-null, mark next declaration as deprecated */
|
||||
SC_VDEFINE int sc_showincludes=0; /* show include files */
|
||||
SC_VDEFINE int sc_warnings_are_errors=0;
|
||||
|
Loading…
Reference in New Issue
Block a user