451 lines
13 KiB
C

/* compress.c -- Byte Pair Encoding compression */
/* Copyright 1996 Philip Gage */
/* This program appeared in the September 1997 issue of
* C/C++ Users Journal. The original source code may still
* be found at the web site of the magazine (www.cuj.com).
*
* It has been modified by me (Thiadmer Riemersma) to
* compress only a section of the input file and to store
* the compressed output along with the input as "C" strings.
*
* Compiling instructions:
* Borland C++ 16-bit (large memory model is required):
* bcc -ml scpack.c
*
* Watcom C/C++ 32-bit:
* wcl386 scpack.c
*
* GNU C (Linux), 32-bit:
* gcc scpack.c -o scpack
*/
#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if UINT_MAX > 0xFFFFU
#define MAXSIZE 1024*1024L
#else
#define MAXSIZE UINT_MAX /* Input file buffer size */
#endif
#define HASHSIZE 8192 /* Hash table size, power of 2 */
#define THRESHOLD 3 /* Increase for speed, min 3 */
#define START_TOKEN "#ifdef SCPACK" /* start reading the buffer here */
#define NAME_TOKEN "#define SCPACK_TABLE"
#define SEP_TOKEN "#define SCPACK_SEPARATOR"
#define TERM_TOKEN "#define SCPACK_TERMINATOR"
#define TEMPFILE "~SCPACK.TMP"
static char tablename[32+1] = "scpack_table";
static char separator[16]=",";
static char terminator[16]="";
int compress(unsigned char *buffer, unsigned buffersize, unsigned char pairtable[128][2])
{
unsigned char *left, *right, *count;
unsigned char a, b, bestcount;
unsigned i, j, index, bestindex, code=128;
/* Dynamically allocate buffers and check for errors */
left = (unsigned char *)malloc(HASHSIZE);
right = (unsigned char *)malloc(HASHSIZE);
count = (unsigned char *)malloc(HASHSIZE);
if (left==NULL || right==NULL || count==NULL) {
printf("Error allocating memory\n");
exit(1);
}
/* Check for errors */
for (i=0; i<buffersize; i++)
if (buffer[i] > 127) {
printf("This program works only on text files (7-bit ASCII)\n");
exit(1);
}
memset(pairtable, 0, 128*2*sizeof(char));
do { /* Replace frequent pairs with bytes 128..255 */
/* Enter counts of all byte pairs into hash table */
memset(count,0,HASHSIZE);
for (i=0; i<buffersize-1; i++) {
a = buffer[i];
b = buffer[i+1];
/* ignore any pair with a '\0' */
if (a == 0 || b == 0)
continue;
index = (a ^ (b << 6)) & (HASHSIZE-1);
while ((left[index] != a || right[index] != b) &&
count[index] != 0)
index = (index + 1) & (HASHSIZE-1);
left[index] = a;
right[index] = b;
if (count[index] < 255)
count[index] += (unsigned char)1;
}
/* Search hash table for most frequent pair */
bestcount = THRESHOLD - 1;
for (i=0; i<HASHSIZE; i++) {
if (count[i] > bestcount) {
bestcount = count[i];
bestindex = i;
}
}
/* Compress if enough occurrences of pair */
if (bestcount >= THRESHOLD) {
/* Add pair to table using code as index */
a = pairtable[code-128][0] = left[bestindex];
b = pairtable[code-128][1] = right[bestindex];
/* Replace all pair occurrences with unused byte */
for (i=0, j=0; i<buffersize; i++, j++)
if (a == buffer[i] && b == buffer[i+1]) {
buffer[j] = (unsigned char)code;
++i;
}
else
buffer[j] = buffer[i];
buffersize = j;
}
else
break;
} while (++code < 255);
/* done */
free(left); free(right); free(count);
return buffersize; /* return adjusted buffersize */
}
static int strmatch(char *str, char *token, int *indent)
{
int i = 0;
/* skip whitespace */
while (*str==' ' || *str=='\t') {
str++;
i++;
} /* while */
if (strncmp(str,token,strlen(token))!=0)
return 0;
if (indent != NULL)
*indent = i;
return 1;
}
static void check_if(char *str,int linenr)
{
if (strmatch(str,"#if",NULL)) {
printf("Error: \"#if...\" preprocessor statement should not be in SCPACK section "
"(line %d)\n", linenr);
exit(1);
} /* if */
}
static int check_tablename(char *str)
{
int i;
if (strmatch(str,NAME_TOKEN,NULL)) {
str += strlen(NAME_TOKEN);
while (*str==' ' || *str=='\t')
str++;
for (i=0; i<(sizeof tablename - 1) && *str!='\0' && strchr(" \t\n",*str)==NULL; i++, str++)
tablename[i] = *str;
tablename[i] = '\0';
return 1;
} /* if */
return 0;
}
static int check_separator(char *str)
{
int i;
if (strmatch(str,SEP_TOKEN,NULL)) {
str += strlen(SEP_TOKEN);
while (*str==' ' || *str=='\t')
str++;
for (i=0; i<(sizeof separator - 1) && *str!='\0' && strchr(" \t\n",*str)==NULL; i++, str++)
separator[i] = *str;
separator[i] = '\0';
return 1;
} /* if */
if (strmatch(str,TERM_TOKEN,NULL)) {
str += strlen(TERM_TOKEN);
while (*str==' ' || *str=='\t')
str++;
for (i=0; i<(sizeof terminator - 1) && *str!='\0' && strchr(" \t\n",*str)==NULL; i++, str++)
terminator[i] = *str;
terminator[i] = '\0';
return 1;
} /* if */
return 0;
}
/* readbuffer
* Reads in the input file and stores all strings in the
* section between "#ifdef SCPACK" and "#else" in a buffer.
* Only text that is between double quotes is added to the
* buffer; the \" escape code is handled. Multiple strings
* on one line are handled.
*/
unsigned readbuffer(FILE *input, unsigned char *buffer)
{
char str[256];
unsigned buffersize;
int i,linenr;
linenr=0;
buffersize=0;
rewind(input);
while (!feof(input)) {
while (fgets(str,sizeof str,input)!=NULL) {
linenr++;
check_tablename(str);
check_separator(str);
if (strmatch(str,START_TOKEN,NULL))
break;
} /* while */
if (!strmatch(str,START_TOKEN,NULL))
return buffersize; /* no (more) section found, quit */
while (fgets(str,sizeof str,input)!=NULL) {
linenr++;
check_if(str,linenr);
if (check_tablename(str))
printf("Error: table name definition should not be in SCPACK section (line %d)\n", linenr);
check_separator(str);
if (strmatch(str,"#else",NULL))
break; /* done */
/* add to the buffer only what is between double quotes */
i=0;
do {
while (str[i]!='\0' && str[i]!='"')
i++;
if (str[i]=='"') {
/* we are in a string */
i++;
while (str[i]!='\0' && str[i]!='"') {
/* handle escape sequences */
if (str[i]=='\\') {
i++;
switch (str[i]) {
case 'a': /* alarm */
buffer[buffersize++]='\a';
i++;
break;
case 'b': /* backspace */
buffer[buffersize++]='\b';
i++;
break;
case 'f': /* form feed */
buffer[buffersize++]='\f';
i++;
break;
case 'n': /* newline */
buffer[buffersize++]='\n';
i++;
break;
case 'r': /* carriage return */
buffer[buffersize++]='\n';
i++;
break;
case 't': /* tab */
buffer[buffersize++]='\t';
i++;
break;
case '\'':
buffer[buffersize++]='\'';
i++;
break;
case '"':
buffer[buffersize++]='"';
i++;
break;
default:
// ??? octal character code escapes and hexadecimal escapes
// not supported
printf("Unknown escape sequence '\\%c' on line %d\n",
str[i], linenr);
} /* switch */
} else {
buffer[buffersize++]=str[i++];
} /* if */
} /* while */
if (str[i]=='"') {
buffer[buffersize++]='\0'; /* terminate each string */
i++;
} else {
printf("Error: unterminated string on line %d\n",linenr);
} /* if */
} /* if */
} while (str[i]!='\0');
} /* while - in SCPACK section */
/* put in another '\0' to terminate the section */
buffer[buffersize++]='\0';
} /* while - !feof(input) */
return buffersize;
}
static void write_pairtable(FILE *output, unsigned char pairtable[128][2], char *tablename)
{
int i;
/* dump the pair table */
fprintf(output, "/*-*SCPACK start of pair table, do not change or remove this line */\n");
fprintf(output, "unsigned char %s[][2] = {", tablename);
for (i=0; i<128 && pairtable[i][0]!=0 && pairtable[i][1]!=0; i++) {
if ((i % 16)==0)
fprintf(output, "\n ");
else
fprintf(output, " ");
fprintf(output, "{%d,%d}", pairtable[i][0], pairtable[i][1]);
/* check if something follows this pair */
if (i+1<128 && pairtable[i+1][0]!=0 && pairtable[i+1][1]!=0)
fprintf(output, ",");
} /* for */
fprintf(output, "\n};\n");
fprintf(output, "/*-*SCPACK end of pair table, do not change or remove this line */\n");
}
void writefile(FILE *input, FILE *output, unsigned char *buffer, unsigned buffersize, unsigned char pairtable[128][2])
{
char str[256];
int insection, indent, needseparator;
unsigned char *bufptr;
bufptr = buffer;
insection = 0;
rewind(input);
while (!feof(input)) {
while (fgets(str,sizeof str,input)!=NULL) {
fprintf(output,"%s",str);
if (check_tablename(str)) {
write_pairtable(output, pairtable, tablename);
/* strip an existing pair table from the file */
if (fgets(str,sizeof str,input)!=NULL) {
if (strmatch(str,"/*-*SCPACK",NULL)) {
while (fgets(str,sizeof str,input)!=NULL)
if (strmatch(str,"/*-*SCPACK",NULL))
break;
} else {
fprintf(output,"%s",str);
} /* if */
} /* if */
} /* if */
if (strmatch(str,START_TOKEN,NULL))
insection = 1;
if (insection && strmatch(str,"#else",NULL))
break;
} /* while */
if (!strmatch(str,"#else",&indent))
return; /* no (more) section found, quit */
insection=0;
/* dump the buffer as strings, separated with commas */
needseparator = 0;
while (*bufptr != '\0') {
assert((unsigned)(bufptr-buffer) < buffersize);
if (needseparator)
fprintf(output, "%s\n",separator);
fprintf(output, "%*c\"",indent+2,' ');
/* loop over string */
while (*bufptr != '\0') {
if (*bufptr<' ' || *bufptr >= 128 || *bufptr == '"' || *bufptr == '\\')
fprintf(output, "\\%03o", *bufptr);
else
fprintf(output, "%c", *bufptr);
bufptr++;
} /* while */
fprintf(output, "\"");
needseparator = 1;
bufptr++; /* skip '\0' */
} /* while */
fprintf(output, "%s\n",terminator);
bufptr++;
/* skip the input file until the #endif section */
while (fgets(str,sizeof str,input)!=NULL) {
if (strmatch(str,"#endif",NULL)) {
fprintf(output,"%s",str);
break; /* done */
} /* if */
} /* while */
} /* while - !feof(input) */
}
static void usage(void)
{
printf("Usage: scpack <filename> [output file]\n");
exit(1);
}
int main(int argc, char **argv)
{
FILE *in, *out;
unsigned char *buffer;
unsigned buffersize, orgbuffersize;
unsigned char pairtable[128][2];
if (argc < 2 || argc > 3)
usage();
if ((in=fopen(argv[1],"rt"))==NULL) {
printf("SCPACK: error opening input %s\n",argv[1]);
usage();
} /* if */
if (argc == 2) {
if ((out=fopen(TEMPFILE,"wt"))==NULL) {
printf("SCPACK: error opening temporary file %s\n",TEMPFILE);
usage();
} /* if */
} else {
if ((out=fopen(argv[2],"wt"))==NULL) {
printf("SCPACK: error opening output file %s\n",argv[2]);
usage();
} /* if */
} /* if */
buffer = (unsigned char *)malloc(MAXSIZE);
if (buffer == NULL) {
printf("SCPACK: error allocating memory\n");
return 1;
} /* if */
/* 1. read the buffer
* 2. compress the buffer
* 3. copy the file, insert the compressed buffer
*/
buffersize = readbuffer(in, buffer);
orgbuffersize = buffersize;
if (buffersize > 0) {
buffersize = compress(buffer, buffersize, pairtable);
writefile(in, out, buffer, buffersize, pairtable);
printf("SCPACK: compression ratio: %ld%% (%d -> %d)\n",
100L-(100L*buffersize)/orgbuffersize, orgbuffersize, buffersize);
} else {
printf("SCPACK: no SCPACK section found, nothing to do\n");
} /* if */
fclose(out);
fclose(in);
/* let the new file replace the old file */
if (buffersize == 0) {
if (argc == 2)
remove(TEMPFILE);
else
remove(argv[2]);
} else if (argc == 2) {
remove(argv[1]);
rename(TEMPFILE,argv[1]);
} /* if */
return 0;
}