429 lines
13 KiB
C
Executable File

/* Codepage translation to Unicode, and UTF-8 support
*
* The translation is based on codepage mapping files that are distributed
* by the Unicode consortium, see ftp://ftp.unicode.org/Public/MAPPINGS/.
*
* Character sets with a maximum of 256 codes are translated via a lookup
* table (these are Single-Byte Character Sets). Character sets like Shift-JIS
* with single-byte characters and multi-byte characters (introduced by a
* leader byte) are split into two tables: the 256-entry lookup table for
* the single-byte characters and an extended table for the multi-byte
* characters. The extended table is allocated dynamically; the lookup table
* is allocated statically, so loading SBCS tables cannot fail (if the tables
* themselves are valid, of course).
*
* Copyright (c) ITB CompuPhase, 2004-2005
*
* This software is provided "as-is", without any express or implied warranty.
* In no event will the authors be held liable for any damages arising from
* the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software in
* a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#include <assert.h>
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include "sc.h"
/* boolean constants used for the return values of the functions in this
 * file; only defined here when TRUE is not already defined
 */
#if !defined TRUE
#define FALSE 0
#define TRUE 1
#endif
/* size of the path buffers in this file (cprootpath and the local file name
 * buffer in cp_set()); only defined here when it is not already defined
 */
#if !defined _MAX_PATH
#define _MAX_PATH 250
#endif
/* DIRSEP_CHAR is the character that separates directory names in a path on
 * the host platform. It is only defined here when it was not defined before.
 * Note that the macro that compilers predefine for Apple platforms is
 * __APPLE__ (two trailing underscores).
 */
#if !defined DIRSEP_CHAR
  #if defined LINUX || defined __FreeBSD__ || defined __OpenBSD__ || defined __APPLE__
    #define DIRSEP_CHAR '/'
  #elif defined macintosh
    #define DIRSEP_CHAR ':'
  #else
    #define DIRSEP_CHAR '\\'
  #endif
#endif
/* number of elements in a true array (not a pointer); only defined here when
 * it is not already defined
 */
#if !defined ELEMENTS
#define ELEMENTS(array) (sizeof(array) / sizeof(array[0]))
#endif
/* the codepage translation code below (up to the matching #endif) is only
 * compiled when NO_CODEPAGE is not defined
 */
#if !defined NO_CODEPAGE
/* maximum length of a codepage file name (without path); cp_path() reserves
 * this many characters in the path buffer and cp_set() rejects longer names
 */
#if !defined MAXCODEPAGE
#define MAXCODEPAGE 12 /* typically "cp" + 4 digits + ".txt" */
#endif
/* INVALID marks entries in the byte table for which the codepage file gives
 * no mapping; LEADBYTE marks bytes that are listed in the file without a
 * translation (these introduce a double-byte character)
 */
#define INVALID 0xffffu /* 0xffff and 0xfffe are invalid Unicode characters */
#define LEADBYTE 0xfffeu
/* one entry of the extended table for double-byte characters */
struct wordpair {
/* lookup key; cp_translate() searches for (lead byte << 8) | follower byte */
unsigned short index;
/* the code that the byte pair translates to (second number on a line of the
 * codepage file)
 */
wchar_t code;
};
/* path that cp_set() prepends to a codepage name; built by cp_path() */
static char cprootpath[_MAX_PATH] = { DIRSEP_CHAR, '\0' };
/* translation for every single byte value; cp_set() fills it with either an
 * identity mapping or the mapping read from a codepage file
 */
static wchar_t bytetable[256];
/* dynamically allocated table for double-byte characters (NULL if absent);
 * cp_set() inserts entries at their sorted position on the "index" field
 */
static struct wordpair *wordtable = NULL;
/* number of entries for which "wordtable" has room */
static unsigned wordtablesize = 0;
/* used by cp_translate() as the number of valid entries in "wordtable" */
static unsigned wordtabletop = 0;
/* cp_readline() reads one line from "fp" into "string", which is a buffer of
 * "size" bytes ("size" must be at least 2). A line ends at a '\r' or a '\n';
 * the line terminator is NOT stored in the string and empty lines are
 * skipped. A line that does not fit in the buffer is split: the first size-1
 * characters are returned and the remainder is returned by the next call.
 *
 * returns 1 for success (at least one character was stored) and 0 for
 * failure (end of file, or a read error)
 */
static int cp_readline(FILE *fp,char *string,size_t size)
{
  size_t count=0;
  int c;

  assert(size>1);
  /* check for room in the buffer BEFORE reading the next character, so that
   * no character is read from the file and then dropped when the buffer is
   * full
   */
  while (count<size-1 && (c=fgetc(fp))!=EOF) {
    if (c=='\r' || c=='\n') {
      if (count>0)      /* '\r' or '\n' ends a string */
        break;
      /* if count==0, the line started with a '\r' or '\n', or perhaps line
       * ends in the file are '\r\n' and we read and stopped on the '\r' of
       * the preceding line
       */
    } else {
      string[count++]=(char)c;
    } /* if */
  } /* while */
  string[count]='\0';
  return count>0;
}
/* cp_path() sets the directory where all codepage files must be found (if
 * the parameter to cp_set() specifies a full path, that is used instead).
 * The path is specified in two parts: root and directory; the full path
 * for the codepage directory is just the concatenation of the two, with a
 * directory separator in between. The directory is given in two parts,
 * because often a program already retrieves its "home" directory and the
 * codepages are most conveniently stored in a subdirectory of this home
 * directory.
 *
 * Either part may be NULL or an empty string. The stored path always ends
 * with a directory separator, so that cp_set() can append a file name to it.
 *
 * Returns TRUE on success. Returns FALSE, without changing the current path,
 * when the combined path would not leave room for a codepage file name of
 * MAXCODEPAGE characters in a buffer of _MAX_PATH characters.
 * On success, the active codepage is reset to the "linear" table.
 */
SC_FUNC int cp_path(const char *root, const char *directory)
{
  size_t len1,len2;
  int add_slash1,add_slash2;

  len1= (root!=NULL) ? strlen(root) : 0;
  add_slash1= (len1==0 || root[len1-1]!=DIRSEP_CHAR);
  len2= (directory!=NULL) ? strlen(directory) : 0;
  /* whether a trailing separator is needed depends on the last character of
   * "directory" (not of "root", which may be shorter than len2 or even NULL)
   */
  add_slash2= (len2>0 && directory[len2-1]!=DIRSEP_CHAR);
  if (len1+add_slash1+len2+add_slash2>=(_MAX_PATH-MAXCODEPAGE))
    return FALSE;       /* full filename may not fit */

  if (root!=NULL)
    strcpy(cprootpath,root);
  if (add_slash1) {
    assert(len1==0 || cprootpath[len1]=='\0');
    cprootpath[len1]=DIRSEP_CHAR;
    cprootpath[len1+1]='\0';
  } /* if */
  if (directory!=NULL)
    strcat(cprootpath,directory);
  if (add_slash2) {
    assert(cprootpath[len1+add_slash1+len2]=='\0');
    cprootpath[len1+add_slash1+len2]=DIRSEP_CHAR;
    cprootpath[len1+add_slash1+len2+1]='\0';
  } /* if */

  cp_set(NULL);         /* start with a "linear" table (no translation) */
  return TRUE;
}
/* cp_set() loads a codepage from a file. The name parameter may be a
 * filename (including a full path) or it may be a partial codepage name.
 * If the name parameter is NULL or an empty string, the codepage is cleared
 * to be a "linear" table (no translation).
 * The following files are attempted to open (where <name> specifies the
 * value of the parameter; the first one is only tried when <name> contains
 * a directory separator):
 *    <name>
 *    <cprootpath>/<name>
 *    <cprootpath>/<name>.txt
 *    <cprootpath>/cp<name>
 *    <cprootpath>/cp<name>.txt
 *
 * Each line of the file holds one or two numbers (in any notation that the
 * "%i" format of sscanf() accepts), optionally followed by a comment that
 * starts with '#'. An index below 256 sets an entry in the single-byte
 * table (if the second number is absent, the byte is marked as a lead
 * byte); an index of 256 or higher, with a code, is added to the sorted
 * table for double-byte characters.
 *
 * Returns TRUE on success and FALSE if no file could be opened (or if the
 * name is too long for a codepage name). If the double-byte table cannot be
 * grown due to lack of memory, the entries that do not fit are dropped.
 */
SC_FUNC int cp_set(const char *name)
{
  char filename[_MAX_PATH];
  FILE *fp=NULL;
  unsigned index;

  /* for name==NULL, set up an identity table */
  if (name==NULL || *name=='\0') {
    if (wordtable!=NULL) {
      free(wordtable);
      wordtable=NULL;
      wordtablesize=0;
      wordtabletop=0;
    } /* if */
    for (index=0; index<ELEMENTS(bytetable); index++)
      bytetable[index]=(wchar_t)index;
    return TRUE;
  } /* if */

  /* try to open the file as-is */
  if (strchr(name,DIRSEP_CHAR)!=NULL)
    fp=fopen(name,"rt");
  if (fp==NULL) {
    /* try opening the file in the "root path" for codepages */
    if (strlen(name)>MAXCODEPAGE)
      return 0;
    assert(strlen(name)+strlen(cprootpath)<_MAX_PATH);
    strcpy(filename,cprootpath);
    strcat(filename,name);
    fp=fopen(filename,"rt");
  } /* if */
  if (fp==NULL) {
    /* try opening the file in the "root path" for codepages, with a ".txt" extension */
    if (strlen(name)+4>=MAXCODEPAGE)
      return 0;
    assert(strlen(filename)+4<_MAX_PATH);
    strcat(filename,".txt");
    fp=fopen(filename,"rt");
  } /* if */
  if (fp==NULL) {
    /* try opening the file in the "root path" for codepages, with "cp" prefixed before the name */
    if (strlen(name)+2>MAXCODEPAGE)
      return 0;
    assert(2+strlen(name)+strlen(cprootpath)<_MAX_PATH);
    strcpy(filename,cprootpath);
    strcat(filename,"cp");
    strcat(filename,name);
    fp=fopen(filename,"rt");
  } /* if */
  if (fp==NULL) {
    /* try opening the file in the "root path" for codepages, with "cp" prefixed and ".txt" appended */
    if (strlen(name)+2+4>MAXCODEPAGE)
      return 0;
    assert(strlen(filename)+4<_MAX_PATH);
    strcat(filename,".txt");
    fp=fopen(filename,"rt");
  } /* if */
  if (fp==NULL)
    return FALSE;       /* all failed */

  /* clear the tables */
  for (index=0; index<ELEMENTS(bytetable); index++)
    bytetable[index]=INVALID;   /* special code meaning "not found" */
  assert((wordtablesize==0 && wordtabletop==0 && wordtable==NULL)
         || (wordtablesize>0 && wordtable!=NULL));
  if (wordtable!=NULL) {
    free(wordtable);
    wordtable=NULL;
    wordtablesize=0;
    wordtabletop=0;
  } /* if */

  /* read in the table (the file name buffer is re-used as the line buffer) */
  while (cp_readline(fp,filename,sizeof filename)) {
    char *ptr;
    /* "%i" stores into an int, so scan into int variables and convert */
    int rawindex=0;
    int rawcode=(int)LEADBYTE;
    unsigned code;
    int num;

    if ((ptr=strchr(filename,'#'))!=NULL)
      *ptr='\0';                /* strip off comment */
    for (ptr=filename; *ptr>0 && *ptr<=' '; ptr++)
      /* nothing */;            /* skip leading whitespace (including spaces) */
    if (*ptr=='\0')
      continue;                 /* no content on this line */

    num=sscanf(ptr,"%i %i",&rawindex,&rawcode);
    index=(unsigned)rawindex;
    code=(unsigned)rawcode;
    /* if sscanf() returns 1 and the index is in range 0..255, then the
     * code is a DBCS lead byte; if sscanf() returns 2 and index>=256, this
     * is a double byte pair (lead byte + follower)
     */
    if (num>=1 && index<256) {
      bytetable[index]=(wchar_t)code;
    } else if (num==2 && index>=256 && index<LEADBYTE) {
      /* store the DBCS character in wordtable */
      if (wordtabletop>=wordtablesize) {
        /* grow the list; on failure the old block remains valid */
        unsigned newsize= (wordtablesize==0) ? 128 : 2*wordtablesize;
        struct wordpair *newblock=(struct wordpair *)realloc(wordtable,newsize*sizeof(*wordtable));
        if (newblock!=NULL) {
          wordtable=newblock;
          wordtablesize=newsize;
        } /* if */
      } /* if */
      if (wordtabletop<wordtablesize) {
        /* insert at sorted position */
        unsigned pos=wordtabletop;
        assert(wordtable!=NULL);
        while (pos>0 && (unsigned)wordtable[pos-1].index>index) {
          wordtable[pos]=wordtable[pos-1];
          pos--;
        } /* while */
        wordtable[pos].index=(unsigned short)index;
        wordtable[pos].code=(wchar_t)code;
        wordtabletop++;         /* the table now holds one more entry */
      } /* if */
    } /* if */
  } /* while */

  fclose(fp);
  return TRUE;
}
/* cp_translate() translates the character at "string" through the tables
 * that cp_set() prepared. The first byte is looked up in the single-byte
 * table. If that byte is marked as a lead byte and a double-byte table is
 * present, the next byte is consumed as well and the pair is looked up in
 * the double-byte table; when the pair is not found, the lead byte marker
 * itself is returned.
 * If "endptr" is not NULL, it is set to point behind the byte(s) consumed.
 *
 * NOTE(review): the follower byte is consumed without checking whether it
 * is the string terminator -- confirm that callers cannot pass a string
 * that ends in a lead byte.
 */
SC_FUNC cell cp_translate(const unsigned char *string,const unsigned char **endptr)
{
  const unsigned char *cursor=string;
  unsigned char lead=*cursor++;
  wchar_t translated=bytetable[lead];

  if ((unsigned)translated==LEADBYTE && wordtable!=NULL) {
    /* the key is the lead byte in the high 8 bits, the follower in the low */
    unsigned short key=(unsigned short)((lead << 8) | *cursor);
    int first,last;

    cursor++;
    assert(wordtabletop>0);
    /* binary search for the first entry that is not below the key */
    first=0;
    last=wordtabletop-1;
    while (first<last) {
      int middle=(first+last)/2;
      assert(first<=middle && middle<last);
      if (wordtable[middle].index<key)
        first=middle+1;
      else
        last=middle;
    } /* while */
    assert(first==last);
    if (wordtable[first].index==key)
      translated=wordtable[first].code;
  } /* if */

  if (endptr!=NULL)
    *endptr=cursor;
  return (cell)translated;
}
#endif /* NO_CODEPAGE */
#if !defined NO_UTF8
/* get_utf8_char() decodes the single UTF-8 encoded character that starts at
 * "string". Sequences of 1 to 6 bytes are accepted.
 *
 * Returns the decoded code, or -1 if the bytes are not valid UTF-8: a
 * follower byte without a leader, a leader that is not followed by the
 * required number of follower bytes (this includes a string that ends in
 * the middle of a sequence), an encoding in more bytes than needed, or one
 * of the codes 0xd800..0xdfff, 0xfffe and 0xffff.
 * If "endptr" is not NULL, it is set to point behind the decoded character
 * on success; on failure it is set to "string".
 */
SC_FUNC cell get_utf8_char(const unsigned char *string,const unsigned char **endptr)
{
  int follow=0;         /* number of follower bytes that are still expected */
  long lowmark=0;       /* lowest code that needs a sequence of this length */
  unsigned char ch;
  cell result=0;

  if (endptr!=NULL)
    *endptr=string;     /* the value that is reported on failure */

  for ( ;; ) {
    ch=*string++;

    if (follow>0 && (ch & 0xc0)==0x80) {
      /* leader code is active, combine with earlier code */
      result=(result << 6) | (ch & 0x3f);
      if (--follow==0) {
        /* encoding a character in more bytes than is strictly needed,
         * is not really valid UTF-8; we are strict here to increase
         * the chance of heuristic detection of non-UTF-8 text
         * (JAVA writes zero bytes as a 2-byte code UTF-8, which is invalid)
         */
        if (result<lowmark)
          return -1;
        /* the code positions 0xd800--0xdfff and 0xfffe & 0xffff do not
         * exist in UCS-4 (and hence, they do not exist in Unicode)
         */
        if ((result>=0xd800 && result<=0xdfff) || result==0xfffe || result==0xffff)
          return -1;
        /* only stop when the last follower byte was handled; for longer
         * sequences, continue with the next follower byte
         */
        break;
      } /* if */
    } else if (follow==0 && (ch & 0x80)==0x80) {
      /* UTF-8 leader code */
      if ((ch & 0xe0)==0xc0) {
        /* 110xxxxx 10xxxxxx */
        follow=1;
        lowmark=0x80L;
        result=ch & 0x1f;
      } else if ((ch & 0xf0)==0xe0) {
        /* 1110xxxx 10xxxxxx 10xxxxxx (16 bits, BMP plane) */
        follow=2;
        lowmark=0x800L;
        result=ch & 0x0f;
      } else if ((ch & 0xf8)==0xf0) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        follow=3;
        lowmark=0x10000L;
        result=ch & 0x07;
      } else if ((ch & 0xfc)==0xf8) {
        /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
        follow=4;
        lowmark=0x200000L;
        result=ch & 0x03;
      } else if ((ch & 0xfe)==0xfc) {
        /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (32 bits) */
        follow=5;
        lowmark=0x4000000L;
        result=ch & 0x01;
      } else {
        /* this is invalid UTF-8 */
        return -1;
      } /* if */
    } else if (follow==0 && (ch & 0x80)==0x00) {
      /* 0xxxxxxx (US-ASCII) */
      result=ch;
      break;
    } else {
      /* this is invalid UTF-8 */
      return -1;
    } /* if */

  } /* for */

  if (endptr!=NULL)
    *endptr=string;
  return result;
}
#endif
/* scan_utf8() reads the source file "fp" line by line (through pc_readsrc())
 * and checks whether every character decodes with get_utf8_char(). The scan
 * stops at the first character that fails to decode. Afterwards, the file is
 * set back (through pc_resetsrc()) to the position that it had on entry.
 *
 * If the very first character of the file is a byte order mark (0xfeff),
 * error 77 is issued with "filename" when the file is not valid UTF-8, and
 * three bytes are read from the file after the reset.
 *
 * Returns non-zero if all text that was scanned is valid UTF-8 and zero
 * otherwise. When NO_UTF8 is defined, the function always returns 0.
 */
SC_FUNC int scan_utf8(FILE *fp,const char *filename)
{
  #if defined NO_UTF8
    return 0;
  #else
    static void *resetpos=NULL;
    const unsigned char *cursor;
    int valid=TRUE;
    int at_start=TRUE;
    int has_bom=FALSE;

    resetpos=pc_getpossrc(fp);
    while (valid && pc_readsrc(fp,pline,sLINEMAX)!=NULL) {
      cursor=pline;
      if (at_start) {
        /* only the first character of the first line can be a BYTE order
         * mark
         */
        cell first=get_utf8_char(cursor,&cursor);
        at_start=FALSE;
        has_bom= (first==0xfeff);
        valid= (first>=0);
      } /* if */
      /* decode the rest of the line, give up on the first failure */
      while (valid && *cursor!='\0') {
        if (get_utf8_char(cursor,&cursor)<0)
          valid=FALSE;
      } /* while */
    } /* while */
    pc_resetsrc(fp,resetpos);

    if (has_bom) {
      unsigned char bom[3];
      if (!valid)
        error(77,filename);     /* malformed UTF-8 encoding */
      pc_readsrc(fp,bom,3);
      assert(bom[0]==0xef && bom[1]==0xbb && bom[2]==0xbf);
    } /* if */
    return valid;
  #endif  /* NO_UTF8 */
}