amxmodx/compiler/libpc300/sci18n.c

/*  Codepage translation to Unicode, and UTF-8 support
 *
 *  The translation is based on codepage mapping files that are distributed
 *  by the Unicode consortium, see ftp://ftp.unicode.org/Public/MAPPINGS/.
 *
 *  Character sets with a maximum of 256 codes are translated via a lookup
 *  table (these are Single-Byte Character Sets). Character sets like Shift-JIS
 *  with single-byte characters and multi-byte characters (introduced by a
 *  leader byte) are split into two tables: the 256-entry lookup table for
 *  the single-byte characters and an extended table for the multi-byte
 *  characters. The extended table is allocated dynamically; the lookup table
 *  is allocated statically, so loading SBCS tables cannot fail (if the tables
 *  themselves are valid, of course).
 *
 *  Copyright (c) ITB CompuPhase, 2004-2005
 *
 *  This software is provided "as-is", without any express or implied warranty.
 *  In no event will the authors be held liable for any damages arising from
 *  the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  1.  The origin of this software must not be misrepresented; you must not
 *      claim that you wrote the original software. If you use this software in
 *      a product, an acknowledgment in the product documentation would be
 *      appreciated but is not required.
 *  2.  Altered source versions must be plainly marked as such, and must not be
 *      misrepresented as being the original software.
 *  3.  This notice may not be removed or altered from any source distribution.
 *
 *  Version: $Id: sci18n.c 1724 2005-07-24 20:00:55Z dvander $
 */
#include <assert.h>
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include "sc.h"

/* Boolean constants, only defined here when no earlier header provides them */
#if !defined TRUE
  #define FALSE         0
  #define TRUE          1
#endif
/* Size of the path buffers in this file, unless the environment already
 * defines _MAX_PATH
 */
#if !defined _MAX_PATH
  #define _MAX_PATH     250
#endif
/* Directory separator of the host system, unless already defined elsewhere.
 * Note that the predefined macro for Mac OS X is __APPLE__ (two trailing
 * underscores).
 */
#if !defined DIRSEP_CHAR
  #if defined LINUX || defined __FreeBSD__ || defined __OpenBSD__ || defined __APPLE__
    #define DIRSEP_CHAR '/'
  #elif defined macintosh
    #define DIRSEP_CHAR ':'
  #else
    #define DIRSEP_CHAR '\\'
  #endif
#endif

/* Number of elements in a true array (do not use on a pointer); the argument
 * is parenthesized so that expressions can be passed safely
 */
#if !defined ELEMENTS
  #define ELEMENTS(array)       (sizeof(array) / sizeof((array)[0]))
#endif

#if !defined NO_CODEPAGE

#if !defined MAXCODEPAGE
  #define MAXCODEPAGE   12      /* typically "cp" + 4 digits + ".txt" */
#endif
#define INVALID         0xffffu /* 0xffff and 0xfffe are invalid Unicode characters */
#define LEADBYTE        0xfffeu

/* One entry of the table for double-byte characters */
struct wordpair {
  unsigned short index;   /* lookup key; cp_translate() builds it as
                           * (lead byte << 8) | following byte */
  wchar_t code;           /* the code that the mapping file assigns to "index" */
};
/* directory prefix (ending in a separator) that cp_set() prepends to
 * codepage names; changed by cp_path()
 */
static char cprootpath[_MAX_PATH] = { DIRSEP_CHAR, '\0' };
/* translation of every single byte; an entry may also hold INVALID or LEADBYTE */
static wchar_t bytetable[256];
/* dynamically allocated table for double-byte characters, kept sorted on
 * the "index" field (cp_translate() does a binary search on it); NULL when
 * no such table is loaded
 */
static struct wordpair *wordtable = NULL;
static unsigned wordtablesize = 0;      /* number of entries allocated in wordtable */
static unsigned wordtabletop = 0;       /* number of entries in use in wordtable */


/* read in a line delimited by '\r' or '\n'; do NOT store the '\r' or '\n' into
 * the string and ignore empty lines
 *
 *  fp      the stream to read from
 *  string  receives the zero-terminated line
 *  size    size of "string" in characters, must be at least 2
 *
 * A line that does not fit in the buffer is returned in pieces: every call
 * stores at most size-1 characters and the next call continues with the
 * character that follows (no character is lost in between).
 *
 * returns 1 for success and 0 for failure (nothing could be read before the
 * end of the file, or a read error)
 */
static int cp_readline(FILE *fp,char *string,size_t size)
{
  size_t count=0;
  int c;
  assert(size>1);
  /* test for room in the buffer BEFORE reading from the stream, so that a
   * character is never fetched and then thrown away because it does not fit
   */
  while (count<size-1 && (c=fgetc(fp))!=EOF) {
    if (c=='\r' || c=='\n') {
      if (count>0)  /* '\r' or '\n' ends a string */
        break;
      /* if count==0, the line started with a '\r' or '\n', or perhaps line
       * ends in the file are '\r\n' and we read and stopped on the '\r' of
       * the preceding line
       */
    } else {
      string[count++]=(char)c;
    } /* if */
  } /* while */
  string[count]='\0';
  return count>0;
}

/* cp_path() sets the directory where all codepage files must be found (if
 * the parameter to cp_set() specifies a full path, that is used instead).
 * The path is specified into two parts: root and directory; the full path
 * for the codepage directory is just the concatenation of the two, with a
 * directory separator in between. The directory is given in two parts,
 * because often a program already retrieves its "home" directory and the
 * codepages are most conveniently stored in a subdirectory of this home
 * directory.
 *
 *  root       first part of the path; may be NULL or empty, in which case
 *             the path starts with a single directory separator
 *  directory  second part of the path; may be NULL or empty
 *
 * Returns FALSE (and leaves the current path untouched) if the combined path
 * would not leave room for a codepage file name of MAXCODEPAGE characters;
 * returns TRUE otherwise. On success, the translation tables are also reset
 * to the "linear" table.
 */
SC_FUNC int cp_path(const char *root, const char *directory)
{
  size_t len1,len2;
  int add_slash1,add_slash2;

  len1= (root!=NULL) ? strlen(root) : 0;
  add_slash1= (len1==0 || root[len1-1]!=DIRSEP_CHAR);
  len2= (directory!=NULL) ? strlen(directory) : 0;
  /* the trailing separator must be tested on "directory" itself (testing
   * "root" here would look at the wrong string, and "root" may be NULL)
   */
  add_slash2= (len2>0 && directory[len2-1]!=DIRSEP_CHAR);
  if (len1+add_slash1+len2+add_slash2>=(_MAX_PATH-MAXCODEPAGE))
    return FALSE;       /* full filename may not fit */
  if (root!=NULL)
    strcpy(cprootpath,root);
  if (add_slash1) {
    assert(len1==0 || cprootpath[len1]=='\0');
    cprootpath[len1]=DIRSEP_CHAR;
    cprootpath[len1+1]='\0';
  } /* if */
  if (directory!=NULL)
    strcat(cprootpath,directory);
  if (add_slash2) {
    assert(cprootpath[len1+add_slash1+len2]=='\0');
    cprootpath[len1+add_slash1+len2]=DIRSEP_CHAR;
    cprootpath[len1+add_slash1+len2+1]='\0';
  } /* if */
  cp_set(NULL);         /* start with a "linear" table (no translation) */
  return TRUE;
}

/* cp_set() loads a codepage from a file. The name parameter may be a
 * filename (including a full path) or it may be a partial codepage name.
 * If the name parameter is NULL (or an empty string), the codepage is cleared
 * to be a "linear" table (no translation).
 * The following files are attempted to open (where <name> specifies the
 * value of the parameter):
 *    <name>                      (only if it contains a directory separator)
 *    <cprootpath>/<name>
 *    <cprootpath>/<name>.txt
 *    <cprootpath>/cp<name>
 *    <cprootpath>/cp<name>.txt
 * A candidate whose file name part would exceed MAXCODEPAGE characters ends
 * the search.
 *
 * Each non-empty line of the file (after stripping a '#' comment) holds one
 * or two numbers in C notation: "<index> <code>" or just "<index>".
 *    index < 256, with code     byte "index" translates to "code"
 *    index < 256, without code  byte "index" is the lead byte of a pair
 *    index >= 256, with code    the byte pair "index" translates to "code"
 *
 * Returns TRUE on success and FALSE if no file could be opened; in the latter
 * case the tables that were active before the call are left untouched. A pair
 * that cannot be stored because memory runs out is silently skipped.
 */
SC_FUNC int cp_set(const char *name)
{
  char filename[_MAX_PATH];
  FILE *fp=NULL;
  unsigned index;

  /* for name==NULL, set up an identity table */
  if (name==NULL || *name=='\0') {
    if (wordtable!=NULL) {
      free(wordtable);
      wordtable=NULL;
      wordtablesize=0;
      wordtabletop=0;
    } /* if */
    for (index=0; index<ELEMENTS(bytetable); index++)
      bytetable[index]=(wchar_t)index;
    return TRUE;
  } /* if */

  /* try to open the file as-is */
  if (strchr(name,DIRSEP_CHAR)!=NULL)
    fp=fopen(name,"rt");
  if (fp==NULL) {
    /* try opening the file in the "root path" for codepages */
    if (strlen(name)>MAXCODEPAGE)
      return 0;
    assert(strlen(name)+strlen(cprootpath)<_MAX_PATH);
    strcpy(filename,cprootpath);
    strcat(filename,name);
    fp=fopen(filename,"rt");
  } /* if */
  if (fp==NULL) {
    /* try opening the file in the "root path" for codepages, with a ".txt"
     * extension (same length limit as for the other candidates: the file
     * name part may be up to MAXCODEPAGE characters)
     */
    if (strlen(name)+4>MAXCODEPAGE)
      return 0;
    assert(strlen(filename)+4<_MAX_PATH);
    strcat(filename,".txt");
    fp=fopen(filename,"rt");
  } /* if */
  if (fp==NULL) {
    /* try opening the file in the "root path" for codepages, with "cp" prefixed before the name */
    if (strlen(name)+2>MAXCODEPAGE)
      return 0;
    assert(2+strlen(name)+strlen(cprootpath)<_MAX_PATH);
    strcpy(filename,cprootpath);
    strcat(filename,"cp");
    strcat(filename,name);
    fp=fopen(filename,"rt");
  } /* if */
  if (fp==NULL) {
    /* try opening the file in the "root path" for codepages, with "cp" prefixed and ".txt" appended */
    if (strlen(name)+2+4>MAXCODEPAGE)
      return 0;
    assert(strlen(filename)+4<_MAX_PATH);
    strcat(filename,".txt");
    fp=fopen(filename,"rt");
  } /* if */
  if (fp==NULL)
    return FALSE;       /* all failed */

  /* clear the tables */
  for (index=0; index<ELEMENTS(bytetable); index++)
    bytetable[index]=INVALID;   /* special code meaning "not found" */
  assert((wordtablesize==0 && wordtabletop==0 && wordtable==NULL)
         || (wordtablesize>0 && wordtable!=NULL));
  if (wordtable!=NULL) {
    free(wordtable);
    wordtable=NULL;
    wordtablesize=0;
    wordtabletop=0;
  } /* if */

  /* read in the table (the filename buffer is no longer needed for the name
   * of the file, it is reused as the line buffer)
   */
  while (cp_readline(fp,filename,sizeof filename)) {
    char *ptr;
    if ((ptr=strchr(filename,'#'))!=NULL)
      *ptr='\0';                /* strip off comment */
    for (ptr=filename; *ptr>0 && *ptr<' '; ptr++)
      /* nothing */;            /* skip leading whitespace */
    if (*ptr!='\0') {
      /* content on line */
      unsigned code=LEADBYTE;
      int num=sscanf(ptr,"%i %i",&index,&code);
      /* if sscanf() returns 1 and the index is in range 0..255, then the
       * code is a DBCS lead byte; if sscanf() returns 2 and index>=256, this
       * is a double byte pair (lead byte + follower)
       */
      if (num>=1 && index<256) {
        bytetable[index]=(wchar_t)code;
      } else if (num==2 && index>=256 && index<LEADBYTE) {
        /* store the DBCS character in wordtable */
        if (wordtabletop>=wordtablesize) {
          /* grow the list; realloc() also handles the initial allocation
           * (wordtable==NULL) and leaves the old block intact on failure
           */
          unsigned newsize= (wordtablesize==0) ? 128 : 2*wordtablesize;
          struct wordpair *newblock=(struct wordpair *)realloc(wordtable,newsize*sizeof(*wordtable));
          if (newblock!=NULL) {
            wordtable=newblock;
            wordtablesize=newsize;
          } /* if */
        } /* if */
        if (wordtabletop<wordtablesize) {
          /* insert at sorted position */
          unsigned pos=wordtabletop;
          assert(wordtable!=NULL);
          while (pos>0 && (unsigned)wordtable[pos-1].index>index) {
            wordtable[pos]=wordtable[pos-1];
            pos--;
          } /* while */
          wordtable[pos].index=(unsigned short)index;
          wordtable[pos].code=(wchar_t)code;
          wordtabletop++;       /* one more entry in use */
        } /* if */
      } /* if */
    } /* if */
  } /* while */

  fclose(fp);
  return TRUE;
}

/* cp_translate() translates the character at the start of "string" through
 * the tables that were loaded by cp_set().
 *
 *  string  points at the (zero-terminated) text to translate; one byte is
 *          consumed, or two bytes if the first byte is a lead byte of a
 *          double-byte character
 *  endptr  if not NULL, receives the position behind the consumed byte(s)
 *
 * Returns the code from the table. For a byte pair that is not present in the
 * word table, and for a lead byte that is directly followed by the string
 * terminator, the return value is LEADBYTE; in the latter case only the lead
 * byte is consumed, so that the end pointer never moves beyond the terminator.
 */
SC_FUNC cell cp_translate(const unsigned char *string,const unsigned char **endptr)
{
  wchar_t result;

  result=bytetable[*string++];
  /* check whether this is a leader code (with a follower byte present) */
  if ((unsigned)result==LEADBYTE && wordtable!=NULL && *string!='\0') {
    /* look up the code via binary search */
    int low,high,mid;
    unsigned short index=(unsigned short)(((*(string-1)) << 8) | *string);
    string++;
    assert(wordtabletop>0);
    low=0;
    high=wordtabletop-1;
    while (low<high) {
      mid=(low+high)/2;
      assert(low<=mid && mid<high);
      if (index>wordtable[mid].index)
        low=mid+1;
      else
        high=mid;
    } /* while */
    assert(low==high);
    if (wordtable[low].index==index)
      result=wordtable[low].code;
  } /* if */

  if (endptr!=NULL)
    *endptr=string;
  return (cell)result;
}

#endif  /* NO_CODEPAGE */

#if !defined NO_UTF8
/* get_utf8_char() decodes a single UTF-8 encoded character.
 *
 *  string  points at the first byte of the (zero-terminated) text
 *  endptr  if not NULL, receives the position behind the decoded character;
 *          on failure it is set to "string" (nothing consumed)
 *
 * Returns the decoded code, or -1 if the bytes at "string" are not valid
 * UTF-8: a follower byte without a leader, a leader without enough follower
 * bytes, an encoding in more bytes than needed, or one of the codes
 * 0xd800..0xdfff, 0xfffe and 0xffff. Sequences of up to 6 bytes are accepted.
 */
SC_FUNC cell get_utf8_char(const unsigned char *string,const unsigned char **endptr)
{
  int follow=0;         /* number of follower bytes still expected */
  long lowmark=0;       /* lowest code that needs the current sequence length */
  unsigned char ch;
  cell result=0;

  if (endptr!=NULL)
    *endptr=string;

  for ( ;; ) {
    ch=*string++;

    if (follow>0 && (ch & 0xc0)==0x80) {
      /* leader code is active, combine with earlier code */
      result=(result << 6) | (ch & 0x3f);
      if (--follow==0) {
        /* encoding a character in more bytes than is strictly needed,
         * is not really valid UTF-8; we are strict here to increase
         * the chance of heuristic detection of non-UTF-8 text
         * (JAVA writes zero bytes as a 2-byte code UTF-8, which is invalid)
         */
        if (result<lowmark)
          return -1;
        /* the code positions 0xd800--0xdfff and 0xfffe & 0xffff do not
         * exist in UCS-4 (and hence, they do not exist in Unicode)
         */
        if ((result>=0xd800 && result<=0xdfff) || result==0xfffe || result==0xffff)
          return -1;
        break;          /* last follower byte handled, the character is complete */
      } /* if */
      /* more follower bytes are expected, continue with the next byte */
    } else if (follow==0 && (ch & 0x80)==0x80) {
      /* UTF-8 leader code */
      if ((ch & 0xe0)==0xc0) {
        /* 110xxxxx 10xxxxxx */
        follow=1;
        lowmark=0x80L;
        result=ch & 0x1f;
      } else if ((ch & 0xf0)==0xe0) {
        /* 1110xxxx 10xxxxxx 10xxxxxx (16 bits, BMP plane) */
        follow=2;
        lowmark=0x800L;
        result=ch & 0x0f;
      } else if ((ch & 0xf8)==0xf0) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        follow=3;
        lowmark=0x10000L;
        result=ch & 0x07;
      } else if ((ch & 0xfc)==0xf8) {
        /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
        follow=4;
        lowmark=0x200000L;
        result=ch & 0x03;
      } else if ((ch & 0xfe)==0xfc) {
        /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (32 bits) */
        follow=5;
        lowmark=0x4000000L;
        result=ch & 0x01;
      } else {
        /* this is invalid UTF-8 */
        return -1;
      } /* if */
    } else if (follow==0 && (ch & 0x80)==0x00) {
      /* 0xxxxxxx (US-ASCII) */
      result=ch;
      break;
    } else {
      /* this is invalid UTF-8 (this includes a string that ends in the
       * middle of a multi-byte sequence)
       */
      return -1;
    } /* if */

  } /* for */

  if (endptr!=NULL)
    *endptr=string;
  return result;
}
#endif

/* scan_utf8() checks whether all text that can be read from "fp" decodes as
 * valid UTF-8, and then moves the read position back to where it was on
 * entry. If the text starts with a BYTE order mark (code 0xfeff), the mark is
 * read again afterwards, so that it is skipped; if such a text turns out not
 * to be valid UTF-8 after all, error 77 is issued with "filename" as its
 * parameter.
 *
 * Returns non-zero if the text is valid UTF-8 and zero otherwise; when UTF-8
 * support is compiled out (NO_UTF8), the function always returns 0.
 */
SC_FUNC int scan_utf8(FILE *fp,const char *filename)
{
  #if defined NO_UTF8
    return 0;
  #else
    void *startpos=pc_getpossrc(fp);    /* position to return to afterwards */
    int is_utf8=TRUE;
    int at_start=TRUE;
    int has_bom=FALSE;
    const unsigned char *cursor;

    for ( ;; ) {
      /* stop at the first invalid character, or when nothing more can be read */
      if (!is_utf8 || pc_readsrc(fp,pline,sLINEMAX)==NULL)
        break;
      cursor=pline;
      if (at_start) {
        /* only the first character of the first line can be a BYTE order mark */
        cell first=get_utf8_char(cursor,&cursor);
        has_bom= (first==0xfeff);
        is_utf8= (first>=0);
        at_start=FALSE;
      } /* if */
      /* decode the rest of the line, one character at a time */
      while (is_utf8 && *cursor!='\0')
        is_utf8= (get_utf8_char(cursor,&cursor)>=0);
    } /* for */

    pc_resetsrc(fp,startpos);
    if (has_bom) {
      unsigned char bom[3];
      if (!is_utf8)
        error(77,filename);     /* malformed UTF-8 encoding */
      /* read the mark again; NOTE(review): presumably this consumes the 3
       * bytes of the mark, confirm against pc_readsrc()
       */
      pc_readsrc(fp,bom,3);
      assert(bom[0]==0xef && bom[1]==0xbb && bom[2]==0xbf);
    } /* if */
    return is_utf8;
  #endif  /* NO_UTF8 */
}
Initial import of Pawn (Small 3.0) 2005-07-25 00:00:55 +04:00			`/* Codepage translation to Unicode, and UTF-8 support`
			`*`
			`* The translation is based on codepage mapping files that are distributed`
			`* by the Unicode consortium, see ftp://ftp.unicode.org/Public/MAPPINGS/.`
			`*`
			`* Character sets with a maximum of 256 codes are translated via a lookup`
			`* table (these are Single-Byte Character Sets). Character sets like Shift-JIS`
			`* with single-byte characters and multi-byte characters (introduced by a`
			`* leader byte) are split into two tables: the 256-entry lookup table for`
			`* the single-byte characters and an extended table for the multi-byte`
			`* characters. The extended table is allocated dynamically; the lookup table`
			`* is allocated statically, so loading SBCS tables cannot fail (if the tables`
			`* themselves are valid, of course).`
			`*`
			`* Copyright (c) ITB CompuPhase, 2004-2005`
			`*`
			`* This software is provided "as-is", without any express or implied warranty.`
			`* In no event will the authors be held liable for any damages arising from`
			`* the use of this software.`
			`*`
			`* Permission is granted to anyone to use this software for any purpose,`
			`* including commercial applications, and to alter it and redistribute it`
			`* freely, subject to the following restrictions:`
			`*`
			`* 1. The origin of this software must not be misrepresented; you must not`
			`* claim that you wrote the original software. If you use this software in`
			`* a product, an acknowledgment in the product documentation would be`
			`* appreciated but is not required.`
			`* 2. Altered source versions must be plainly marked as such, and must not be`
			`* misrepresented as being the original software.`
			`* 3. This notice may not be removed or altered from any source distribution.`
			`*`
Squashed commit of the following: commit 011d9b6b07d904ad1e81ef7c747269903e2d47c4 Author: David Anderson <dvander@alliedmods.net> Date: Mon Jan 11 00:17:08 2010 -0600 Initial import from Subversion (amxmodx/trunk rev 3757). 2014-02-07 11:06:54 +04:00			`* Version: $Id: sci18n.c 1724 2005-07-24 20:00:55Z dvander $`
Initial import of Pawn (Small 3.0) 2005-07-25 00:00:55 +04:00			`*/`
			`#include <assert.h>`
			`#include <stdio.h>`
			`#include <stddef.h>`
			`#include <stdlib.h>`
			`#include <string.h>`
			`#include "sc.h"`

			`#if !defined TRUE`
			`#define FALSE 0`
			`#define TRUE 1`
			`#endif`
			`#if !defined _MAX_PATH`
			`#define _MAX_PATH 250`
			`#endif`
			`#if !defined DIRSEP_CHAR`
Added support for Mac OS X and building with clang (bug 5601, r=dvander). 2013-02-13 11:14:37 +04:00			`#if defined LINUX \|\| defined __FreeBSD__ \|\| defined __OpenBSD__ \|\| defined __APPLE___`
Initial import of Pawn (Small 3.0) 2005-07-25 00:00:55 +04:00			`#define DIRSEP_CHAR '/'`
			`#elif defined macintosh`
			`#define DIRSEP_CHAR ':'`
			`#else`
			`#define DIRSEP_CHAR '\\'`
			`#endif`
			`#endif`

			`#if !defined ELEMENTS`
			`#define ELEMENTS(array) (sizeof(array) / sizeof(array[0]))`
			`#endif`

			`#if !defined NO_CODEPAGE`

			`#if !defined MAXCODEPAGE`
			`#define MAXCODEPAGE 12 /* typically "cp" + 4 digits + ".txt" */`
			`#endif`
			`#define INVALID 0xffffu /* 0xffff and 0xfffe are invalid Unicode characters */`
			`#define LEADBYTE 0xfffeu`

			`struct wordpair {`
			`unsigned short index;`
			`wchar_t code;`
			`};`
			`static char cprootpath[_MAX_PATH] = { DIRSEP_CHAR, '\0' };`
			`static wchar_t bytetable[256];`
			`static struct wordpair *wordtable = NULL;`
			`static unsigned wordtablesize = 0;`
			`static unsigned wordtabletop = 0;`


			`/* read in a line delimited by '\r' or '\n'; do NOT store the '\r' or '\n' into`
			`* the string and ignore empty lines`
			`* returns 1 for success and 0 for failure`
			`*/`
			`static int cp_readline(FILE fp,char string,size_t size)`
			`{`
			`size_t count=0;`
			`int c;`
			`assert(size>1);`
			`while ((c=fgetc(fp))!=EOF && count<size-1) {`
			`if (c=='\r' \|\| c=='\n') {`
			`if (count>0) /* '\r' or '\n' ends a string */`
			`break;`
			`/* if count==0, the line started with a '\r' or '\n', or perhaps line`
			`* ends in the file are '\r\n' and we read and stopped on the '\r' of`
			`* the preceding line`
			`*/`
			`} else {`
			`string[count++]=(char)c;`
			`} /* if */`
			`} /* while */`
			`string[count]='\0';`
			`return count>0;`
			`}`

			`/* cp_path() sets the directory where all codepage files must be found (if`
			`* the parameter to cp_set() specifies a full path, that is used instead).`
			`* The path is specified into two parts: root and directory; the full path`
			`* for the codepage direcory is just the concatenation of the two, with a`
			`* directory separator in between. The directory is given in two parts,`
			`* because often a program already retrieves its "home" directory and the`
			`* codepages are most conveniently stored in a subdirectory of this home`
			`* directory.`
			`*/`
			`SC_FUNC int cp_path(const char root, const char directory)`
			`{`
			`size_t len1,len2;`
			`int add_slash1,add_slash2;`

			`len1= (root!=NULL) ? strlen(root) : 0;`
			`add_slash1= (len1==0 \|\| root[len1-1]!=DIRSEP_CHAR);`
			`len2= (directory!=NULL) ? strlen(directory) : 0;`
			`add_slash2= (len2>0 && root[len2-1]!=DIRSEP_CHAR);`
			`if (len1+add_slash1+len2+add_slash2>=(_MAX_PATH-MAXCODEPAGE))`
			`return FALSE; /* full filename may not fit */`
			`if (root!=NULL)`
			`strcpy(cprootpath,root);`
			`if (add_slash1) {`
			`assert(len1==0 \|\| cprootpath[len1]=='\0');`
			`cprootpath[len1]=DIRSEP_CHAR;`
			`cprootpath[len1+1]='\0';`
			`} /* if */`
			`if (directory!=NULL)`
			`strcat(cprootpath,directory);`
			`if (add_slash2) {`
			`assert(cprootpath[len1+add_slash1+len2]=='\0');`
			`cprootpath[len1+add_slash1+len2]=DIRSEP_CHAR;`
			`cprootpath[len1+add_slash1+len2+1]='\0';`
			`} /* if */`
			`cp_set(NULL); /* start with a "linear" table (no translation) */`
			`return TRUE;`
			`}`

			`/* cp_set() loads a codepage from a file. The name parameter may be a`
			`* filename (including a full path) or it may be a partial codepage name.`
			`* If the name parameter is NULL, the codepage is cleared to be a "linear"`
			`* table (no translation).`
			`* The following files are attempted to open (where <name> specifies the`
			`* value of the parameter):`
			`* <name>`
			`* <cprootpath>/<name>`
			`* <cprootpath>/<name>.txt`
			`* <cprootpath>/cp<name>`
			`* <cprootpath>/cp<name>.txt`
			`*/`
			`SC_FUNC int cp_set(const char *name)`
			`{`
			`char filename[_MAX_PATH];`
			`FILE *fp=NULL;`
			`unsigned index;`

			`/* for name==NULL, set up an identity table */`
			`if (name==NULL \|\| *name=='\0') {`
			`if (wordtable!=NULL) {`
			`free(wordtable);`
			`wordtable=NULL;`
			`wordtablesize=0;`
			`wordtabletop=0;`
			`} /* if */`
			`for (index=0; index<ELEMENTS(bytetable); index++)`
			`bytetable[index]=(wchar_t)index;`
			`return TRUE;`
			`} /* if */`

			`/* try to open the file as-is */`
			`if (strchr(name,DIRSEP_CHAR)!=NULL)`
			`fp=fopen(name,"rt");`
			`if (fp==NULL) {`
			`/* try opening the file in the "root path" for codepages */`
			`if (strlen(name)>MAXCODEPAGE)`
			`return 0;`
			`assert(strlen(name)+strlen(cprootpath)<_MAX_PATH);`
			`strcpy(filename,cprootpath);`
			`strcat(filename,name);`
			`fp=fopen(filename,"rt");`
			`} /* if */`
			`if (fp==NULL) {`
			`/* try opening the file in the "root path" for codepages, with a ".txt" extension */`
			`if (strlen(name)+4>=MAXCODEPAGE)`
			`return 0;`
			`assert(strlen(filename)+4<_MAX_PATH);`
			`strcat(filename,".txt");`
			`fp=fopen(filename,"rt");`
			`} /* if */`
			`if (fp==NULL) {`
			`/* try opening the file in the "root path" for codepages, with "cp" prefixed before the name */`
			`if (strlen(name)+2>MAXCODEPAGE)`
			`return 0;`
			`assert(2+strlen(name)+strlen(cprootpath)<_MAX_PATH);`
			`strcpy(filename,cprootpath);`
			`strcat(filename,"cp");`
			`strcat(filename,name);`
			`fp=fopen(filename,"rt");`
			`} /* if */`
			`if (fp==NULL) {`
			`/* try opening the file in the "root path" for codepages, with "cp" prefixed an ".txt" appended */`
			`if (strlen(name)+2+4>MAXCODEPAGE)`
			`return 0;`
			`assert(strlen(filename)+4<_MAX_PATH);`
			`strcat(filename,".txt");`
			`fp=fopen(filename,"rt");`
			`} /* if */`
			`if (fp==NULL)`
			`return FALSE; /* all failed */`

			`/* clear the tables */`
			`for (index=0; index<ELEMENTS(bytetable); index++)`
			`bytetable[index]=INVALID; /* special code meaning "not found" */`
			`assert(wordtablesize==0 && wordtabletop==0 && wordtable==NULL`
			`\|\| wordtablesize>0 && wordtable!=NULL);`
			`if (wordtable!=NULL) {`
			`free(wordtable);`
			`wordtable=NULL;`
			`wordtablesize=0;`
			`wordtabletop=0;`
			`} /* if */`

			`/* read in the table */`
			`while (cp_readline(fp,filename,sizeof filename)) {`
			`char *ptr;`
			`if ((ptr=strchr(filename,'#'))!=NULL)`
			`ptr='\0'; / strip of comment */`
			`for (ptr=filename; ptr>0 && ptr<' '; ptr++)`
			`/* nothing /; / skip leading whitespace */`
			`if (*ptr!='\0') {`
			`/* content on line */`
			`unsigned code=LEADBYTE;`
			`int num=sscanf(ptr,"%i %i",&index,&code);`
			`/* if sscanf() returns 1 and the index is in range 0..255, then the`
			`* code is a DBCS lead byte; if sscanf() returns 2 and index>=256, this`
			`* is a double byte pair (lead byte + follower)`
			`*/`
			`if (num>=1 && index<256) {`
			`bytetable[index]=(wchar_t)code;`
			`} else if (num==2 && index>=256 && index<LEADBYTE) {`
			`/* store the DBCS character in wordtable */`
			`if (wordtabletop>=wordtablesize) {`
			`/* grow the list */`
			`int newsize;`
			`struct wordpair *newblock;`
			`newsize= (wordtablesize==0) ? 128 : 2*wordtablesize;`
			`newblock=(struct wordpair )malloc(newsizesizeof(*wordtable));`
			`if (newblock!=NULL) {`
			`memcpy(newblock,wordtable,wordtabletopsizeof(wordtable));`
			`free(wordtable);`
			`wordtable=newblock;`
			`wordtablesize=newsize;`
			`} /* if */`
			`} /* if */`
			`if (wordtabletop<wordtablesize) {`
			`/* insert at sorted position */`
			`int pos=wordtabletop;`
			`assert(wordtable!=NULL);`
			`while (pos>0 && (unsigned)wordtable[pos-1].index>index) {`
			`wordtable[pos]=wordtable[pos-1];`
			`pos--;`
			`} /* while */`
			`wordtable[pos].index=(unsigned short)index;`
			`wordtable[pos].code=(wchar_t)code;`
			`} /* if */`
			`} /* if */`
			`} /* if */`
			`} /* while */`

			`fclose(fp);`
			`return TRUE;`
			`}`

			`SC_FUNC cell cp_translate(const unsigned char string,const unsigned char *endptr)`
			`{`
			`wchar_t result;`

			`result=bytetable[*string++];`
			`/* check whether this is a leader code */`
			`if ((unsigned)result==LEADBYTE && wordtable!=NULL) {`
			`/* look up the code via binary search */`
			`int low,high,mid;`
			`unsigned short index=(unsigned short)((((string-1)) << 8) \| string);`
			`string++;`
			`assert(wordtabletop>0);`
			`low=0;`
			`high=wordtabletop-1;`
			`while (low<high) {`
			`mid=(low+high)/2;`
			`assert(low<=mid && mid<high);`
			`if (index>wordtable[mid].index)`
			`low=mid+1;`
			`else`
			`high=mid;`
			`} /* while */`
			`assert(low==high);`
			`if (wordtable[low].index==index)`
			`result=wordtable[low].code;`
			`} /* if */`

			`if (endptr!=NULL)`
			`*endptr=string;`
			`return (cell)result;`
			`}`

			`#endif /* NO_CODEPAGE */`

			`#if !defined NO_UTF8`
			`SC_FUNC cell get_utf8_char(const unsigned char string,const unsigned char *endptr)`
			`{`
			`int follow=0;`
			`long lowmark=0;`
			`unsigned char ch;`
			`cell result=0;`

			`if (endptr!=NULL)`
			`*endptr=string;`

			`for ( ;; ) {`
			`ch=*string++;`

			`if (follow>0 && (ch & 0xc0)==0x80) {`
			`/* leader code is active, combine with earlier code */`
			`result=(result << 6) \| (ch & 0x3f);`
			`if (--follow==0) {`
			`/* encoding a character in more bytes than is strictly needed,`
			`* is not really valid UTF-8; we are strict here to increase`
			`* the chance of heuristic dectection of non-UTF-8 text`
			`* (JAVA writes zero bytes as a 2-byte code UTF-8, which is invalid)`
			`*/`
			`if (result<lowmark)`
			`return -1;`
			`/* the code positions 0xd800--0xdfff and 0xfffe & 0xffff do not`
			`* exist in UCS-4 (and hence, they do not exist in Unicode)`
			`*/`
Added support for Mac OS X and building with clang (bug 5601, r=dvander). 2013-02-13 11:14:37 +04:00			`if ((result>=0xd800 && result<=0xdfff) \|\| result==0xfffe \|\| result==0xffff)`
Initial import of Pawn (Small 3.0) 2005-07-25 00:00:55 +04:00			`return -1;`
			`} /* if */`
			`break;`
			`} else if (follow==0 && (ch & 0x80)==0x80) {`
			`/* UTF-8 leader code */`
			`if ((ch & 0xe0)==0xc0) {`
			`/* 110xxxxx 10xxxxxx */`
			`follow=1;`
			`lowmark=0x80L;`
			`result=ch & 0x1f;`
			`} else if ((ch & 0xf0)==0xe0) {`
			`/* 1110xxxx 10xxxxxx 10xxxxxx (16 bits, BMP plane) */`
			`follow=2;`
			`lowmark=0x800L;`
			`result=ch & 0x0f;`
			`} else if ((ch & 0xf8)==0xf0) {`
			`/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */`
			`follow=3;`
			`lowmark=0x10000L;`
			`result=ch & 0x07;`
			`} else if ((ch & 0xfc)==0xf8) {`
			`/* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */`
			`follow=4;`
			`lowmark=0x200000L;`
			`result=ch & 0x03;`
			`} else if ((ch & 0xfe)==0xfc) {`
			`/* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (32 bits) */`
			`follow=5;`
			`lowmark=0x4000000L;`
			`result=ch & 0x01;`
			`} else {`
			`/* this is invalid UTF-8 */`
			`return -1;`
			`} /* if */`
			`} else if (follow==0 && (ch & 0x80)==0x00) {`
			`/* 0xxxxxxx (US-ASCII) */`
			`result=ch;`
			`break;`
			`} else {`
			`/* this is invalid UTF-8 */`
			`return -1;`
			`} /* if */`

			`} /* for */`

			`if (endptr!=NULL)`
			`*endptr=string;`
			`return result;`
			`}`
			`#endif`

			`SC_FUNC int scan_utf8(FILE fp,const char filename)`
			`{`
			`#if defined NO_UTF8`
			`return 0;`
			`#else`
			`void *resetpos=pc_getpossrc(fp);`
			`int utf8=TRUE;`
			`int firstchar=TRUE,bom_found=FALSE;`
			`const unsigned char *ptr;`

			`while (utf8 && pc_readsrc(fp,pline,sLINEMAX)!=NULL) {`
			`ptr=pline;`
			`if (firstchar) {`
			`/* check whether the very first character on the very first line`
			`* starts with a BYTE order mark`
			`*/`
			`cell c=get_utf8_char(ptr,&ptr);`
			`bom_found= (c==0xfeff);`
			`utf8= (c>=0);`
			`firstchar=FALSE;`
			`} /* if */`
			`while (utf8 && *ptr!='\0')`
			`utf8= (get_utf8_char(ptr,&ptr)>=0);`
			`} /* while */`
			`pc_resetsrc(fp,resetpos);`
			`if (bom_found) {`
			`unsigned char bom[3];`
			`if (!utf8)`
			`error(77,filename); /* malformed UTF-8 encoding */`
			`pc_readsrc(fp,bom,3);`
			`assert(bom[0]==0xef && bom[1]==0xbb && bom[2]==0xbf);`
			`} /* if */`
			`return utf8;`
			`#endif /* NO_UTF8 */`
			`}`