From ddf0cc69dd5a25c61c5422a439a463982102f138 Mon Sep 17 00:00:00 2001 From: IgnacioFDM Date: Thu, 20 Sep 2018 15:27:15 -0300 Subject: [PATCH] Compiler: Fix broken UTF8 support (#329) --- compiler/libpc300/sc.h | 3 - compiler/libpc300/sc2.c | 12 ---- compiler/libpc300/sci18n.c | 117 ------------------------------------- compiler/libpc300/scvars.c | 1 - 4 files changed, 133 deletions(-) diff --git a/compiler/libpc300/sc.h b/compiler/libpc300/sc.h index 0f8b3212..82a7616d 100755 --- a/compiler/libpc300/sc.h +++ b/compiler/libpc300/sc.h @@ -726,8 +726,6 @@ int mfputs(MEMFILE *mf,char *string); SC_FUNC int cp_path(const char *root,const char *directory); SC_FUNC int cp_set(const char *name); SC_FUNC cell cp_translate(const unsigned char *string,const unsigned char **endptr); -SC_FUNC cell get_utf8_char(const unsigned char *string,const unsigned char **endptr); -SC_FUNC int scan_utf8(FILE *fp,const char *filename); /* function prototypes in SCSTATE.C */ SC_FUNC constvalue *automaton_add(const char *name); @@ -803,7 +801,6 @@ SC_VDECL int sc_status; /* read/write status */ SC_VDECL int sc_rationaltag; /* tag for rational numbers */ SC_VDECL int rational_digits; /* number of fractional digits */ SC_VDECL int sc_allowproccall;/* allow/detect tagnames in lex() */ -SC_VDECL short sc_is_utf8; /* is this source file in UTF-8 encoding */ SC_VDECL char *pc_deprecate; /* if non-NULL, mark next declaration as deprecated */ SC_VDECL int sc_warnings_are_errors; diff --git a/compiler/libpc300/sc2.c b/compiler/libpc300/sc2.c index 5bd3735b..09ce6621 100755 --- a/compiler/libpc300/sc2.c +++ b/compiler/libpc300/sc2.c @@ -152,7 +152,6 @@ static char *extensions[] = { ".inc", ".p", ".pawn" }; PUSHSTK_I(iflevel); assert(!SKIPPING); assert(skiplevel==iflevel); /* these two are always the same when "parsing" */ - PUSHSTK_I(sc_is_utf8); PUSHSTK_I(icomment); PUSHSTK_I(fcurrent); PUSHSTK_I(fline); @@ -169,7 +168,6 @@ static char *extensions[] = { ".inc", ".p", ".pawn" }; assert(sc_status == 
statFIRST || strcmp(get_inputfile(fcurrent), inpfname) == 0); setfiledirect(inpfname); /* (optionally) set in the list file */ listline=-1; /* force a #line directive when changing the file */ - sc_is_utf8=(short)scan_utf8(inpf,name); return TRUE; } @@ -319,7 +317,6 @@ static void readline(unsigned char *line) fline=i; fcurrent=(short)POPSTK_I(); icomment=(short)POPSTK_I(); - sc_is_utf8=(short)POPSTK_I(); iflevel=(short)POPSTK_I(); skiplevel=iflevel; /* this condition held before including the file */ assert(!SKIPPING); /* idem ditto */ @@ -2395,21 +2392,12 @@ static cell litchar(const unsigned char **lptr,int flags) cptr=*lptr; if ((flags & RAWMODE)!=0 || *cptr!=sc_ctrlchar) { /* no escape character */ - #if !defined NO_UTF8 - if (sc_is_utf8 && (flags & UTF8MODE)!=0) { - c=get_utf8_char(cptr,&cptr); - assert(c>=0); /* file was already scanned for conformance to UTF-8 */ - } else { - #endif #if !defined NO_CODEPAGE c=cp_translate(cptr,&cptr); #else c=*cptr; cptr+=1; #endif - #if !defined NO_UTF8 - } /* if */ - #endif } else { cptr+=1; if (*cptr==sc_ctrlchar) { diff --git a/compiler/libpc300/sci18n.c b/compiler/libpc300/sci18n.c index b106fa9c..40912c46 100755 --- a/compiler/libpc300/sci18n.c +++ b/compiler/libpc300/sci18n.c @@ -309,120 +309,3 @@ SC_FUNC cell cp_translate(const unsigned char *string,const unsigned char **endp } #endif /* NO_CODEPAGE */ - -#if !defined NO_UTF8 -SC_FUNC cell get_utf8_char(const unsigned char *string,const unsigned char **endptr) -{ - int follow=0; - long lowmark=0; - unsigned char ch; - cell result=0; - - if (endptr!=NULL) - *endptr=string; - - for ( ;; ) { - ch=*string++; - - if (follow>0 && (ch & 0xc0)==0x80) { - /* leader code is active, combine with earlier code */ - result=(result << 6) | (ch & 0x3f); - if (--follow==0) { - /* encoding a character in more bytes than is strictly needed, - * is not really valid UTF-8; we are strict here to increase - * the chance of heuristic dectection of non-UTF-8 text - * (JAVA writes zero bytes 
as a 2-byte code UTF-8, which is invalid) - */ - if (result<lowmark) - return -1; - /* the code positions 0xd800--0xdfff and 0xfffe & 0xffff do not - * exist in UCS-4 (and hence, they do not exist in Unicode) - */ - if ((result>=0xd800 && result<=0xdfff) || result==0xfffe || result==0xffff) - return -1; - } /* if */ - break; - } else if (follow==0 && (ch & 0x80)==0x80) { - /* UTF-8 leader code */ - if ((ch & 0xe0)==0xc0) { - /* 110xxxxx 10xxxxxx */ - follow=1; - lowmark=0x80L; - result=ch & 0x1f; - } else if ((ch & 0xf0)==0xe0) { - /* 1110xxxx 10xxxxxx 10xxxxxx (16 bits, BMP plane) */ - follow=2; - lowmark=0x800L; - result=ch & 0x0f; - } else if ((ch & 0xf8)==0xf0) { - /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ - follow=3; - lowmark=0x10000L; - result=ch & 0x07; - } else if ((ch & 0xfc)==0xf8) { - /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ - follow=4; - lowmark=0x200000L; - result=ch & 0x03; - } else if ((ch & 0xfe)==0xfc) { - /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (32 bits) */ - follow=5; - lowmark=0x4000000L; - result=ch & 0x01; - } else { - /* this is invalid UTF-8 */ - return -1; - } /* if */ - } else if (follow==0 && (ch & 0x80)==0x00) { - /* 0xxxxxxx (US-ASCII) */ - result=ch; - break; - } else { - /* this is invalid UTF-8 */ - return -1; - } /* if */ - - } /* for */ - - if (endptr!=NULL) - *endptr=string; - return result; -} -#endif - -SC_FUNC int scan_utf8(FILE *fp,const char *filename) -{ - #if defined NO_UTF8 - return 0; - #else - static void *resetpos=NULL; - int utf8=TRUE; - int firstchar=TRUE,bom_found=FALSE; - const unsigned char *ptr; - - resetpos=pc_getpossrc(fp); - while (utf8 && pc_readsrc(fp,pline,sLINEMAX)!=NULL) { - ptr=pline; - if (firstchar) { - /* check whether the very first character on the very first line - * starts with a BYTE order mark - */ - cell c=get_utf8_char(ptr,&ptr); - bom_found= (c==0xfeff); - utf8= (c>=0); - firstchar=FALSE; - } /* if */ - while (utf8 && *ptr!='\0') - utf8= (get_utf8_char(ptr,&ptr)>=0); - } /* while */ - pc_resetsrc(fp,resetpos); - if (bom_found) { - unsigned char bom[3]; - if (!utf8) - error(77,filename); /* malformed UTF-8 encoding */ - 
pc_readsrc(fp,bom,3); - assert(bom[0]==0xef && bom[1]==0xbb && bom[2]==0xbf); - } /* if */ - return utf8; - #endif /* NO_UTF8 */ -} diff --git a/compiler/libpc300/scvars.c b/compiler/libpc300/scvars.c index 4a38f149..89c3b339 100755 --- a/compiler/libpc300/scvars.c +++ b/compiler/libpc300/scvars.c @@ -84,7 +84,6 @@ SC_VDEFINE int sc_status; /* read/write status */ SC_VDEFINE int sc_rationaltag=0; /* tag for rational numbers */ SC_VDEFINE int rational_digits=0; /* number of fractional digits */ SC_VDEFINE int sc_allowproccall=0; /* allow/detect tagnames in lex() */ -SC_VDEFINE short sc_is_utf8=FALSE; /* is this source file in UTF-8 encoding */ SC_VDEFINE char *pc_deprecate = NULL;/* if non-null, mark next declaration as deprecated */ SC_VDEFINE int sc_showincludes=0; /* show include files */ SC_VDEFINE int sc_warnings_are_errors=0;