Compiler: Fix broken UTF8 support (#329)

2018-09-20 15:27:15 -03:00
parent 9f00bf4f55
commit ddf0cc69dd
4 changed files with 0 additions and 133 deletions
--- a/compiler/libpc300/sc.h
+++ b/compiler/libpc300/sc.h
@@ -726,8 +726,6 @@ int mfputs(MEMFILE *mf,char *string);
 SC_FUNC int cp_path(const char *root,const char *directory);
 SC_FUNC int cp_set(const char *name);
 SC_FUNC cell cp_translate(const unsigned char *string,const unsigned char **endptr);
-SC_FUNC cell get_utf8_char(const unsigned char *string,const unsigned char **endptr);
-SC_FUNC int scan_utf8(FILE *fp,const char *filename);

 /* function prototypes in SCSTATE.C */
 SC_FUNC constvalue *automaton_add(const char *name);
@@ -803,7 +801,6 @@ SC_VDECL int sc_status;       /* read/write status */
 SC_VDECL int sc_rationaltag;  /* tag for rational numbers */
 SC_VDECL int rational_digits; /* number of fractional digits */
 SC_VDECL int sc_allowproccall;/* allow/detect tagnames in lex() */
-SC_VDECL short sc_is_utf8;    /* is this source file in UTF-8 encoding */
 SC_VDECL char *pc_deprecate;  /* if non-NULL, mark next declaration as deprecated */
 SC_VDECL int sc_warnings_are_errors;

--- a/compiler/libpc300/sc2.c
+++ b/compiler/libpc300/sc2.c
@@ -152,7 +152,6 @@ static char *extensions[] = { ".inc", ".p", ".pawn" };
  PUSHSTK_I(iflevel);
  assert(!SKIPPING);
  assert(skiplevel==iflevel);   /* these two are always the same when "parsing" */
-  PUSHSTK_I(sc_is_utf8);
  PUSHSTK_I(icomment);
  PUSHSTK_I(fcurrent);
  PUSHSTK_I(fline);
@@ -169,7 +168,6 @@ static char *extensions[] = { ".inc", ".p", ".pawn" };
  assert(sc_status == statFIRST || strcmp(get_inputfile(fcurrent), inpfname) == 0);
  setfiledirect(inpfname);      /* (optionally) set in the list file */
  listline=-1;                  /* force a #line directive when changing the file */
-  sc_is_utf8=(short)scan_utf8(inpf,name);
  return TRUE;
 }

@@ -319,7 +317,6 @@ static void readline(unsigned char *line)
      fline=i;
      fcurrent=(short)POPSTK_I();
      icomment=(short)POPSTK_I();
-      sc_is_utf8=(short)POPSTK_I();
      iflevel=(short)POPSTK_I();
      skiplevel=iflevel;        /* this condition held before including the file */
      assert(!SKIPPING);        /* idem ditto */
@@ -2395,21 +2392,12 @@ static cell litchar(const unsigned char **lptr,int flags)

  cptr=*lptr;
  if ((flags & RAWMODE)!=0 || *cptr!=sc_ctrlchar) {  /* no escape character */
-    #if !defined NO_UTF8
-      if (sc_is_utf8 && (flags & UTF8MODE)!=0) {
-        c=get_utf8_char(cptr,&cptr);
-        assert(c>=0);   /* file was already scanned for conformance to UTF-8 */
-      } else {
-    #endif
      #if !defined NO_CODEPAGE
        c=cp_translate(cptr,&cptr);
      #else
        c=*cptr;
        cptr+=1;
      #endif
-    #if !defined NO_UTF8
-      } /* if */
-    #endif
  } else {
    cptr+=1;
    if (*cptr==sc_ctrlchar) {
--- a/compiler/libpc300/sci18n.c
+++ b/compiler/libpc300/sci18n.c
@@ -309,120 +309,3 @@ SC_FUNC cell cp_translate(const unsigned char *string,const unsigned char **endp
 }

 #endif  /* NO_CODEPAGE */
-
-#if !defined NO_UTF8
-SC_FUNC cell get_utf8_char(const unsigned char *string,const unsigned char **endptr) /* decode one UTF-8 sequence starting at string */
-{ /* returns the decoded value, or -1 if invalid; *endptr (if non-NULL) ends past the bytes read, or stays at string on -1 */
-  int follow=0;         /* continuation bytes still expected after a leader byte */
-  long lowmark=0;       /* minimum value accepted for the current sequence length (overlong check) */
-  unsigned char ch;
-  cell result=0;        /* value assembled so far */
-
-  if (endptr!=NULL)
-    *endptr=string;     /* provisional: every "return -1" below leaves it here */
-
-  for ( ;; ) {
-    ch=*string++;
-
-    if (follow>0 && (ch & 0xc0)==0x80) {
-      /* leader code is active, combine with earlier code */
-      result=(result << 6) | (ch & 0x3f);
-      if (--follow==0) {
-        /* encoding a character in more bytes than is strictly needed,
-         * is not really valid UTF-8; we are strict here to increase
-         * the chance of heuristic detection of non-UTF-8 text
-         * (JAVA writes zero bytes as a 2-byte code UTF-8, which is invalid)
-         */
-        if (result<lowmark)
-          return -1;
-        /* the code positions 0xd800--0xdfff and 0xfffe & 0xffff do not
-         * exist in UCS-4 (and hence, they do not exist in Unicode)
-         */
-        if ((result>=0xd800 && result<=0xdfff) || result==0xfffe || result==0xffff)
-          return -1;
-      } /* if */
-      break;    /* NOTE(review): runs after every continuation byte, not only when follow reaches 0, so a leader announcing 2+ continuation bytes returns after the first one with the checks above skipped -- confirm */
-    } else if (follow==0 && (ch & 0x80)==0x80) {
-      /* UTF-8 leader code */
-      if ((ch & 0xe0)==0xc0) {
-        /* 110xxxxx 10xxxxxx */
-        follow=1;
-        lowmark=0x80L;
-        result=ch & 0x1f;
-      } else if ((ch & 0xf0)==0xe0) {
-        /* 1110xxxx 10xxxxxx 10xxxxxx (16 bits, BMP plane) */
-        follow=2;
-        lowmark=0x800L;
-        result=ch & 0x0f;
-      } else if ((ch & 0xf8)==0xf0) {
-        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
-        follow=3;
-        lowmark=0x10000L;
-        result=ch & 0x07;
-      } else if ((ch & 0xfc)==0xf8) {
-        /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
-        follow=4;
-        lowmark=0x200000L;
-        result=ch & 0x03;
-      } else if ((ch & 0xfe)==0xfc) {
-        /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (32 bits) */
-        follow=5;
-        lowmark=0x4000000L;
-        result=ch & 0x01;
-      } else {
-        /* this is invalid UTF-8 (0xfe or 0xff, or a continuation byte with no leader) */
-        return -1;
-      } /* if */
-    } else if (follow==0 && (ch & 0x80)==0x00) {
-      /* 0xxxxxxx (US-ASCII) */
-      result=ch;
-      break;
-    } else {
-      /* this is invalid UTF-8 (a leader was active but this byte is not a continuation byte) */
-      return -1;
-    } /* if */
-
-  } /* for */
-
-  if (endptr!=NULL)
-    *endptr=string;     /* success: string was advanced past every byte read */
-  return result;
-}
-#endif
-
-SC_FUNC int scan_utf8(FILE *fp,const char *filename) /* checks whether the lines read from fp all decode as UTF-8 */
-{ /* returns TRUE unless get_utf8_char() rejected a sequence; always 0 when NO_UTF8 is defined */
-  #if defined NO_UTF8
-    return 0;
-  #else
-    static void *resetpos=NULL;   /* value from pc_getpossrc(), handed back to pc_resetsrc() after the scan */
-    int utf8=TRUE;                /* cleared at the first sequence get_utf8_char() rejects; this also ends the scan */
-    int firstchar=TRUE,bom_found=FALSE;
-    const unsigned char *ptr;
-
-    resetpos=pc_getpossrc(fp);
-    while (utf8 && pc_readsrc(fp,pline,sLINEMAX)!=NULL) {   /* pline and sLINEMAX are declared outside this function */
-      ptr=pline;
-      if (firstchar) {
-        /* check whether the very first character on the very first line
-         * starts with a BYTE order mark
-         */
-        cell c=get_utf8_char(ptr,&ptr);
-        bom_found= (c==0xfeff);
-        utf8= (c>=0);
-        firstchar=FALSE;
-      } /* if */
-      while (utf8 && *ptr!='\0')
-        utf8= (get_utf8_char(ptr,&ptr)>=0);
-    } /* while */
-    pc_resetsrc(fp,resetpos);     /* presumably rewinds fp to where the scan started -- helper not visible here */
-    if (bom_found) {
-      unsigned char bom[3];
-      if (!utf8)
-        error(77,filename);     /* malformed UTF-8 encoding (only reported when a BOM was seen) */
-      pc_readsrc(fp,bom,3);       /* presumably consumes the BOM; return value unchecked. NOTE(review): if pc_readsrc is fgets-like, size 3 yields only 2 bytes -- verify */
-      assert(bom[0]==0xef && bom[1]==0xbb && bom[2]==0xbf);
-    } /* if */
-    return utf8;
-  #endif  /* NO_UTF8 */
-}
--- a/compiler/libpc300/scvars.c
+++ b/compiler/libpc300/scvars.c
@@ -84,7 +84,6 @@ SC_VDEFINE int sc_status;          /* read/write status */
 SC_VDEFINE int sc_rationaltag=0;   /* tag for rational numbers */
 SC_VDEFINE int rational_digits=0;  /* number of fractional digits */
 SC_VDEFINE int sc_allowproccall=0; /* allow/detect tagnames in lex() */
-SC_VDEFINE short sc_is_utf8=FALSE; /* is this source file in UTF-8 encoding */
 SC_VDEFINE char *pc_deprecate = NULL;/* if non-null, mark next declaration as deprecated */
 SC_VDEFINE int sc_showincludes=0;  /* show include files */
 SC_VDEFINE int sc_warnings_are_errors=0;