Compiler: Fix broken UTF8 support (#329)

This commit is contained in:
IgnacioFDM 2018-09-20 15:27:15 -03:00 committed by Vincent Herbet
parent 1f0dc33875
commit a54b9f05a7
4 changed files with 0 additions and 133 deletions

View File

@ -726,8 +726,6 @@ int mfputs(MEMFILE *mf,char *string);
SC_FUNC int cp_path(const char *root,const char *directory);
SC_FUNC int cp_set(const char *name);
SC_FUNC cell cp_translate(const unsigned char *string,const unsigned char **endptr);
SC_FUNC cell get_utf8_char(const unsigned char *string,const unsigned char **endptr);
SC_FUNC int scan_utf8(FILE *fp,const char *filename);
/* function prototypes in SCSTATE.C */
SC_FUNC constvalue *automaton_add(const char *name);
@ -803,7 +801,6 @@ SC_VDECL int sc_status; /* read/write status */
SC_VDECL int sc_rationaltag; /* tag for rational numbers */
SC_VDECL int rational_digits; /* number of fractional digits */
SC_VDECL int sc_allowproccall;/* allow/detect tagnames in lex() */
SC_VDECL short sc_is_utf8; /* is this source file in UTF-8 encoding */
SC_VDECL char *pc_deprecate; /* if non-NULL, mark next declaration as deprecated */
SC_VDECL int sc_warnings_are_errors;

View File

@ -152,7 +152,6 @@ static char *extensions[] = { ".inc", ".p", ".pawn" };
PUSHSTK_I(iflevel);
assert(!SKIPPING);
assert(skiplevel==iflevel); /* these two are always the same when "parsing" */
PUSHSTK_I(sc_is_utf8);
PUSHSTK_I(icomment);
PUSHSTK_I(fcurrent);
PUSHSTK_I(fline);
@ -169,7 +168,6 @@ static char *extensions[] = { ".inc", ".p", ".pawn" };
assert(sc_status == statFIRST || strcmp(get_inputfile(fcurrent), inpfname) == 0);
setfiledirect(inpfname); /* (optionally) set in the list file */
listline=-1; /* force a #line directive when changing the file */
sc_is_utf8=(short)scan_utf8(inpf,name);
return TRUE;
}
@ -319,7 +317,6 @@ static void readline(unsigned char *line)
fline=i;
fcurrent=(short)POPSTK_I();
icomment=(short)POPSTK_I();
sc_is_utf8=(short)POPSTK_I();
iflevel=(short)POPSTK_I();
skiplevel=iflevel; /* this condition held before including the file */
assert(!SKIPPING); /* idem ditto */
@ -2395,21 +2392,12 @@ static cell litchar(const unsigned char **lptr,int flags)
cptr=*lptr;
if ((flags & RAWMODE)!=0 || *cptr!=sc_ctrlchar) { /* no escape character */
#if !defined NO_UTF8
if (sc_is_utf8 && (flags & UTF8MODE)!=0) {
c=get_utf8_char(cptr,&cptr);
assert(c>=0); /* file was already scanned for conformance to UTF-8 */
} else {
#endif
#if !defined NO_CODEPAGE
c=cp_translate(cptr,&cptr);
#else
c=*cptr;
cptr+=1;
#endif
#if !defined NO_UTF8
} /* if */
#endif
} else {
cptr+=1;
if (*cptr==sc_ctrlchar) {

View File

@ -309,120 +309,3 @@ SC_FUNC cell cp_translate(const unsigned char *string,const unsigned char **endp
}
#endif /* NO_CODEPAGE */
#if !defined NO_UTF8
SC_FUNC cell get_utf8_char(const unsigned char *string,const unsigned char **endptr)
{
int follow=0;
long lowmark=0;
unsigned char ch;
cell result=0;
if (endptr!=NULL)
*endptr=string;
for ( ;; ) {
ch=*string++;
if (follow>0 && (ch & 0xc0)==0x80) {
/* leader code is active, combine with earlier code */
result=(result << 6) | (ch & 0x3f);
if (--follow==0) {
/* encoding a character in more bytes than is strictly needed,
* is not really valid UTF-8; we are strict here to increase
* the chance of heuristic dectection of non-UTF-8 text
* (JAVA writes zero bytes as a 2-byte code UTF-8, which is invalid)
*/
if (result<lowmark)
return -1;
/* the code positions 0xd800--0xdfff and 0xfffe & 0xffff do not
* exist in UCS-4 (and hence, they do not exist in Unicode)
*/
if ((result>=0xd800 && result<=0xdfff) || result==0xfffe || result==0xffff)
return -1;
} /* if */
break;
} else if (follow==0 && (ch & 0x80)==0x80) {
/* UTF-8 leader code */
if ((ch & 0xe0)==0xc0) {
/* 110xxxxx 10xxxxxx */
follow=1;
lowmark=0x80L;
result=ch & 0x1f;
} else if ((ch & 0xf0)==0xe0) {
/* 1110xxxx 10xxxxxx 10xxxxxx (16 bits, BMP plane) */
follow=2;
lowmark=0x800L;
result=ch & 0x0f;
} else if ((ch & 0xf8)==0xf0) {
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
follow=3;
lowmark=0x10000L;
result=ch & 0x07;
} else if ((ch & 0xfc)==0xf8) {
/* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
follow=4;
lowmark=0x200000L;
result=ch & 0x03;
} else if ((ch & 0xfe)==0xfc) {
/* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (32 bits) */
follow=5;
lowmark=0x4000000L;
result=ch & 0x01;
} else {
/* this is invalid UTF-8 */
return -1;
} /* if */
} else if (follow==0 && (ch & 0x80)==0x00) {
/* 0xxxxxxx (US-ASCII) */
result=ch;
break;
} else {
/* this is invalid UTF-8 */
return -1;
} /* if */
} /* for */
if (endptr!=NULL)
*endptr=string;
return result;
}
#endif
SC_FUNC int scan_utf8(FILE *fp,const char *filename)
{
#if defined NO_UTF8
return 0;
#else
static void *resetpos=NULL;
int utf8=TRUE;
int firstchar=TRUE,bom_found=FALSE;
const unsigned char *ptr;
resetpos=pc_getpossrc(fp);
while (utf8 && pc_readsrc(fp,pline,sLINEMAX)!=NULL) {
ptr=pline;
if (firstchar) {
/* check whether the very first character on the very first line
* starts with a BYTE order mark
*/
cell c=get_utf8_char(ptr,&ptr);
bom_found= (c==0xfeff);
utf8= (c>=0);
firstchar=FALSE;
} /* if */
while (utf8 && *ptr!='\0')
utf8= (get_utf8_char(ptr,&ptr)>=0);
} /* while */
pc_resetsrc(fp,resetpos);
if (bom_found) {
unsigned char bom[3];
if (!utf8)
error(77,filename); /* malformed UTF-8 encoding */
pc_readsrc(fp,bom,3);
assert(bom[0]==0xef && bom[1]==0xbb && bom[2]==0xbf);
} /* if */
return utf8;
#endif /* NO_UTF8 */
}

View File

@ -84,7 +84,6 @@ SC_VDEFINE int sc_status; /* read/write status */
SC_VDEFINE int sc_rationaltag=0; /* tag for rational numbers */
SC_VDEFINE int rational_digits=0; /* number of fractional digits */
SC_VDEFINE int sc_allowproccall=0; /* allow/detect tagnames in lex() */
SC_VDEFINE short sc_is_utf8=FALSE; /* is this source file in UTF-8 encoding */
SC_VDEFINE char *pc_deprecate = NULL;/* if non-null, mark next declaration as deprecated */
SC_VDEFINE int sc_showincludes=0; /* show include files */
SC_VDEFINE int sc_warnings_are_errors=0;