451 lines
13 KiB
C
451 lines
13 KiB
C
|
/* compress.c -- Byte Pair Encoding compression */
|
||
|
/* Copyright 1996 Philip Gage */
|
||
|
|
||
|
/* This program appeared in the September 1997 issue of
|
||
|
* C/C++ Users Journal. The original source code may still
|
||
|
* be found at the web site of the magazine (www.cuj.com).
|
||
|
*
|
||
|
* It has been modified by me (Thiadmer Riemersma) to
|
||
|
* compress only a section of the input file and to store
|
||
|
* the compressed output along with the input as "C" strings.
|
||
|
*
|
||
|
* Compiling instructions:
|
||
|
* Borland C++ 16-bit (large memory model is required):
|
||
|
* bcc -ml scpack.c
|
||
|
*
|
||
|
* Watcom C/C++ 32-bit:
|
||
|
* wcl386 scpack.c
|
||
|
*
|
||
|
* GNU C (Linux), 32-bit:
|
||
|
* gcc scpack.c -o scpack
|
||
|
*/
|
||
|
|
||
|
#include <assert.h>
|
||
|
#include <limits.h>
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <string.h>
|
||
|
|
||
|
/* Size, in bytes, of the buffer that receives the strings collected from the
 * input file. When "unsigned" is only 16 bits wide the buffer is limited to
 * UINT_MAX bytes; otherwise 1 MiB is used. The expansion is parenthesized so
 * that the macro acts as a single value inside a larger expression (for
 * example "x / MAXSIZE").
 */
#if UINT_MAX > 0xFFFFU
  #define MAXSIZE   (1024*1024L)
#else
  #define MAXSIZE   UINT_MAX    /* Input file buffer size */
#endif
#define HASHSIZE  8192   /* Hash table size, power of 2 */
#define THRESHOLD    3   /* Increase for speed, min 3 */

/* Directives that are recognized at the start of a line of the input file */
#define START_TOKEN "#ifdef SCPACK" /* start reading the buffer here */
#define NAME_TOKEN  "#define SCPACK_TABLE"      /* followed by the name of the pair table */
#define SEP_TOKEN   "#define SCPACK_SEPARATOR"  /* followed by the text written between strings */
#define TERM_TOKEN  "#define SCPACK_TERMINATOR" /* followed by the text written after the last string */
#define TEMPFILE    "~SCPACK.TMP"               /* output file when no output name is given */

/* Current settings; the defaults below are replaced when the matching
 * directive is found in the input file.
 */
static char tablename[32+1] = "scpack_table";
static char separator[16]=",";
static char terminator[16]="";
|
||
|
|
||
|
int compress(unsigned char *buffer, unsigned buffersize, unsigned char pairtable[128][2])
|
||
|
{
|
||
|
unsigned char *left, *right, *count;
|
||
|
unsigned char a, b, bestcount;
|
||
|
unsigned i, j, index, bestindex, code=128;
|
||
|
|
||
|
/* Dynamically allocate buffers and check for errors */
|
||
|
left = (unsigned char *)malloc(HASHSIZE);
|
||
|
right = (unsigned char *)malloc(HASHSIZE);
|
||
|
count = (unsigned char *)malloc(HASHSIZE);
|
||
|
if (left==NULL || right==NULL || count==NULL) {
|
||
|
printf("Error allocating memory\n");
|
||
|
exit(1);
|
||
|
}
|
||
|
|
||
|
/* Check for errors */
|
||
|
for (i=0; i<buffersize; i++)
|
||
|
if (buffer[i] > 127) {
|
||
|
printf("This program works only on text files (7-bit ASCII)\n");
|
||
|
exit(1);
|
||
|
}
|
||
|
|
||
|
memset(pairtable, 0, 128*2*sizeof(char));
|
||
|
|
||
|
do { /* Replace frequent pairs with bytes 128..255 */
|
||
|
|
||
|
/* Enter counts of all byte pairs into hash table */
|
||
|
memset(count,0,HASHSIZE);
|
||
|
for (i=0; i<buffersize-1; i++) {
|
||
|
a = buffer[i];
|
||
|
b = buffer[i+1];
|
||
|
/* ignore any pair with a '\0' */
|
||
|
if (a == 0 || b == 0)
|
||
|
continue;
|
||
|
index = (a ^ (b << 6)) & (HASHSIZE-1);
|
||
|
while ((left[index] != a || right[index] != b) &&
|
||
|
count[index] != 0)
|
||
|
index = (index + 1) & (HASHSIZE-1);
|
||
|
left[index] = a;
|
||
|
right[index] = b;
|
||
|
if (count[index] < 255)
|
||
|
count[index] += (unsigned char)1;
|
||
|
}
|
||
|
|
||
|
/* Search hash table for most frequent pair */
|
||
|
bestcount = THRESHOLD - 1;
|
||
|
for (i=0; i<HASHSIZE; i++) {
|
||
|
if (count[i] > bestcount) {
|
||
|
bestcount = count[i];
|
||
|
bestindex = i;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* Compress if enough occurrences of pair */
|
||
|
if (bestcount >= THRESHOLD) {
|
||
|
|
||
|
/* Add pair to table using code as index */
|
||
|
a = pairtable[code-128][0] = left[bestindex];
|
||
|
b = pairtable[code-128][1] = right[bestindex];
|
||
|
|
||
|
/* Replace all pair occurrences with unused byte */
|
||
|
for (i=0, j=0; i<buffersize; i++, j++)
|
||
|
if (a == buffer[i] && b == buffer[i+1]) {
|
||
|
buffer[j] = (unsigned char)code;
|
||
|
++i;
|
||
|
}
|
||
|
else
|
||
|
buffer[j] = buffer[i];
|
||
|
buffersize = j;
|
||
|
}
|
||
|
else
|
||
|
break;
|
||
|
} while (++code < 255);
|
||
|
|
||
|
/* done */
|
||
|
free(left); free(right); free(count);
|
||
|
return buffersize; /* return adjusted buffersize */
|
||
|
}
|
||
|
|
||
|
/* strmatch
 * Checks whether "str" starts with "token" once leading spaces and tabs have
 * been skipped.
 *
 * str     line to examine
 * token   text that must follow the leading white space
 * indent  if not NULL and the token matches, receives the number of skipped
 *         white space characters; it is left untouched when there is no match
 *
 * Returns 1 on a match and 0 otherwise.
 */
static int strmatch(char *str, char *token, int *indent)
{
  char *first = str;

  /* move to the first character that is neither a space nor a tab */
  while (*first==' ' || *first=='\t')
    first++;

  if (strncmp(first,token,strlen(token))==0) {
    if (indent != NULL)
      *indent = (int)(first - str);
    return 1;
  } /* if */
  return 0;
}
|
||
|
|
||
|
/* check_if
 * Aborts the program with an error message when the line "str" starts (after
 * optional white space) with "#if". Because only the prefix is compared,
 * directives such as "#ifdef" and "#ifndef" are rejected as well.
 *
 * str     line read from the input file
 * linenr  line number, used in the error message
 */
static void check_if(char *str,int linenr)
{
  if (!strmatch(str,"#if",NULL))
    return;     /* not a conditional, nothing to report */

  printf("Error: \"#if...\" preprocessor statement should not be in SCPACK section "
         "(line %d)\n", linenr);
  exit(1);
}
|
||
|
|
||
|
static int check_tablename(char *str)
|
||
|
{
|
||
|
int i;
|
||
|
|
||
|
if (strmatch(str,NAME_TOKEN,NULL)) {
|
||
|
str += strlen(NAME_TOKEN);
|
||
|
while (*str==' ' || *str=='\t')
|
||
|
str++;
|
||
|
for (i=0; i<(sizeof tablename - 1) && *str!='\0' && strchr(" \t\n",*str)==NULL; i++, str++)
|
||
|
tablename[i] = *str;
|
||
|
tablename[i] = '\0';
|
||
|
return 1;
|
||
|
} /* if */
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
static int check_separator(char *str)
|
||
|
{
|
||
|
int i;
|
||
|
|
||
|
if (strmatch(str,SEP_TOKEN,NULL)) {
|
||
|
str += strlen(SEP_TOKEN);
|
||
|
while (*str==' ' || *str=='\t')
|
||
|
str++;
|
||
|
for (i=0; i<(sizeof separator - 1) && *str!='\0' && strchr(" \t\n",*str)==NULL; i++, str++)
|
||
|
separator[i] = *str;
|
||
|
separator[i] = '\0';
|
||
|
return 1;
|
||
|
} /* if */
|
||
|
|
||
|
if (strmatch(str,TERM_TOKEN,NULL)) {
|
||
|
str += strlen(TERM_TOKEN);
|
||
|
while (*str==' ' || *str=='\t')
|
||
|
str++;
|
||
|
for (i=0; i<(sizeof terminator - 1) && *str!='\0' && strchr(" \t\n",*str)==NULL; i++, str++)
|
||
|
terminator[i] = *str;
|
||
|
terminator[i] = '\0';
|
||
|
return 1;
|
||
|
} /* if */
|
||
|
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
/* readbuffer
|
||
|
* Reads in the input file and stores all strings in the
|
||
|
* section between "#ifdef SCPACK" and "#else" in a buffer.
|
||
|
* Only text that is between double quotes is added to the
|
||
|
* buffer; the \" escape code is handled. Multiple strings
|
||
|
* on one line are handled.
|
||
|
*/
|
||
|
unsigned readbuffer(FILE *input, unsigned char *buffer)
|
||
|
{
|
||
|
char str[256];
|
||
|
unsigned buffersize;
|
||
|
int i,linenr;
|
||
|
|
||
|
linenr=0;
|
||
|
buffersize=0;
|
||
|
|
||
|
rewind(input);
|
||
|
while (!feof(input)) {
|
||
|
while (fgets(str,sizeof str,input)!=NULL) {
|
||
|
linenr++;
|
||
|
check_tablename(str);
|
||
|
check_separator(str);
|
||
|
if (strmatch(str,START_TOKEN,NULL))
|
||
|
break;
|
||
|
} /* while */
|
||
|
if (!strmatch(str,START_TOKEN,NULL))
|
||
|
return buffersize; /* no (more) section found, quit */
|
||
|
|
||
|
while (fgets(str,sizeof str,input)!=NULL) {
|
||
|
linenr++;
|
||
|
check_if(str,linenr);
|
||
|
if (check_tablename(str))
|
||
|
printf("Error: table name definition should not be in SCPACK section (line %d)\n", linenr);
|
||
|
check_separator(str);
|
||
|
if (strmatch(str,"#else",NULL))
|
||
|
break; /* done */
|
||
|
/* add to the buffer only what is between double quotes */
|
||
|
i=0;
|
||
|
do {
|
||
|
while (str[i]!='\0' && str[i]!='"')
|
||
|
i++;
|
||
|
if (str[i]=='"') {
|
||
|
/* we are in a string */
|
||
|
i++;
|
||
|
while (str[i]!='\0' && str[i]!='"') {
|
||
|
/* handle escape sequences */
|
||
|
if (str[i]=='\\') {
|
||
|
i++;
|
||
|
switch (str[i]) {
|
||
|
case 'a': /* alarm */
|
||
|
buffer[buffersize++]='\a';
|
||
|
i++;
|
||
|
break;
|
||
|
case 'b': /* backspace */
|
||
|
buffer[buffersize++]='\b';
|
||
|
i++;
|
||
|
break;
|
||
|
case 'f': /* form feed */
|
||
|
buffer[buffersize++]='\f';
|
||
|
i++;
|
||
|
break;
|
||
|
case 'n': /* newline */
|
||
|
buffer[buffersize++]='\n';
|
||
|
i++;
|
||
|
break;
|
||
|
case 'r': /* carriage return */
|
||
|
buffer[buffersize++]='\n';
|
||
|
i++;
|
||
|
break;
|
||
|
case 't': /* tab */
|
||
|
buffer[buffersize++]='\t';
|
||
|
i++;
|
||
|
break;
|
||
|
case '\'':
|
||
|
buffer[buffersize++]='\'';
|
||
|
i++;
|
||
|
break;
|
||
|
case '"':
|
||
|
buffer[buffersize++]='"';
|
||
|
i++;
|
||
|
break;
|
||
|
default:
|
||
|
// ??? octal character code escapes and hexadecimal escapes
|
||
|
// not supported
|
||
|
printf("Unknown escape sequence '\\%c' on line %d\n",
|
||
|
str[i], linenr);
|
||
|
} /* switch */
|
||
|
} else {
|
||
|
buffer[buffersize++]=str[i++];
|
||
|
} /* if */
|
||
|
} /* while */
|
||
|
if (str[i]=='"') {
|
||
|
buffer[buffersize++]='\0'; /* terminate each string */
|
||
|
i++;
|
||
|
} else {
|
||
|
printf("Error: unterminated string on line %d\n",linenr);
|
||
|
} /* if */
|
||
|
} /* if */
|
||
|
} while (str[i]!='\0');
|
||
|
} /* while - in SCPACK section */
|
||
|
/* put in another '\0' to terminate the section */
|
||
|
buffer[buffersize++]='\0';
|
||
|
} /* while - !feof(input) */
|
||
|
return buffersize;
|
||
|
}
|
||
|
|
||
|
/* write_pairtable
 * Writes the pair table to "output" as the definition of a C array called
 * "tablename", enclosed between two "/*-*SCPACK" marker lines. Only the
 * leading entries of which both bytes are non-zero are written; the entries
 * are printed as decimal {left,right} pairs, sixteen to a line.
 *
 * output     file to write to
 * pairtable  table with 128 pairs
 * tablename  name for the generated array
 */
static void write_pairtable(FILE *output, unsigned char pairtable[128][2], char *tablename)
{
  int used, k;

  /* count the entries up to the first one that has a zero byte */
  used = 0;
  while (used<128 && pairtable[used][0]!=0 && pairtable[used][1]!=0)
    used++;

  /* dump the pair table */
  fputs("/*-*SCPACK start of pair table, do not change or remove this line */\n", output);
  fprintf(output, "unsigned char %s[][2] = {", tablename);
  for (k=0; k<used; k++) {
    /* start a new line before every sixteenth entry */
    fputs((k % 16)==0 ? "\n " : " ", output);
    fprintf(output, "{%d,%d}", pairtable[k][0], pairtable[k][1]);
    /* a comma follows every entry except the last one */
    if (k+1<used)
      fputc(',', output);
  } /* for */
  fputs("\n};\n", output);
  fputs("/*-*SCPACK end of pair table, do not change or remove this line */\n", output);
}
|
||
|
|
||
|
void writefile(FILE *input, FILE *output, unsigned char *buffer, unsigned buffersize, unsigned char pairtable[128][2])
|
||
|
{
|
||
|
char str[256];
|
||
|
int insection, indent, needseparator;
|
||
|
unsigned char *bufptr;
|
||
|
|
||
|
bufptr = buffer;
|
||
|
insection = 0;
|
||
|
|
||
|
rewind(input);
|
||
|
while (!feof(input)) {
|
||
|
while (fgets(str,sizeof str,input)!=NULL) {
|
||
|
fprintf(output,"%s",str);
|
||
|
if (check_tablename(str)) {
|
||
|
write_pairtable(output, pairtable, tablename);
|
||
|
/* strip an existing pair table from the file */
|
||
|
if (fgets(str,sizeof str,input)!=NULL) {
|
||
|
if (strmatch(str,"/*-*SCPACK",NULL)) {
|
||
|
while (fgets(str,sizeof str,input)!=NULL)
|
||
|
if (strmatch(str,"/*-*SCPACK",NULL))
|
||
|
break;
|
||
|
} else {
|
||
|
fprintf(output,"%s",str);
|
||
|
} /* if */
|
||
|
} /* if */
|
||
|
} /* if */
|
||
|
if (strmatch(str,START_TOKEN,NULL))
|
||
|
insection = 1;
|
||
|
if (insection && strmatch(str,"#else",NULL))
|
||
|
break;
|
||
|
} /* while */
|
||
|
if (!strmatch(str,"#else",&indent))
|
||
|
return; /* no (more) section found, quit */
|
||
|
insection=0;
|
||
|
|
||
|
/* dump the buffer as strings, separated with commas */
|
||
|
needseparator = 0;
|
||
|
while (*bufptr != '\0') {
|
||
|
assert((unsigned)(bufptr-buffer) < buffersize);
|
||
|
if (needseparator)
|
||
|
fprintf(output, "%s\n",separator);
|
||
|
fprintf(output, "%*c\"",indent+2,' ');
|
||
|
/* loop over string */
|
||
|
while (*bufptr != '\0') {
|
||
|
if (*bufptr<' ' || *bufptr >= 128 || *bufptr == '"' || *bufptr == '\\')
|
||
|
fprintf(output, "\\%03o", *bufptr);
|
||
|
else
|
||
|
fprintf(output, "%c", *bufptr);
|
||
|
bufptr++;
|
||
|
} /* while */
|
||
|
fprintf(output, "\"");
|
||
|
needseparator = 1;
|
||
|
bufptr++; /* skip '\0' */
|
||
|
} /* while */
|
||
|
fprintf(output, "%s\n",terminator);
|
||
|
bufptr++;
|
||
|
|
||
|
/* skip the input file until the #endif section */
|
||
|
while (fgets(str,sizeof str,input)!=NULL) {
|
||
|
if (strmatch(str,"#endif",NULL)) {
|
||
|
fprintf(output,"%s",str);
|
||
|
break; /* done */
|
||
|
} /* if */
|
||
|
} /* while */
|
||
|
} /* while - !feof(input) */
|
||
|
}
|
||
|
|
||
|
/* usage
 * Prints the command line syntax on standard output and ends the program
 * with exit code 1. This function does not return.
 */
static void usage(void)
{
  fputs("Usage: scpack <filename> [output file]\n", stdout);
  exit(1);
}
|
||
|
|
||
|
int main(int argc, char **argv)
|
||
|
{
|
||
|
FILE *in, *out;
|
||
|
unsigned char *buffer;
|
||
|
unsigned buffersize, orgbuffersize;
|
||
|
unsigned char pairtable[128][2];
|
||
|
|
||
|
if (argc < 2 || argc > 3)
|
||
|
usage();
|
||
|
if ((in=fopen(argv[1],"rt"))==NULL) {
|
||
|
printf("SCPACK: error opening input %s\n",argv[1]);
|
||
|
usage();
|
||
|
} /* if */
|
||
|
if (argc == 2) {
|
||
|
if ((out=fopen(TEMPFILE,"wt"))==NULL) {
|
||
|
printf("SCPACK: error opening temporary file %s\n",TEMPFILE);
|
||
|
usage();
|
||
|
} /* if */
|
||
|
} else {
|
||
|
if ((out=fopen(argv[2],"wt"))==NULL) {
|
||
|
printf("SCPACK: error opening output file %s\n",argv[2]);
|
||
|
usage();
|
||
|
} /* if */
|
||
|
} /* if */
|
||
|
|
||
|
buffer = (unsigned char *)malloc(MAXSIZE);
|
||
|
if (buffer == NULL) {
|
||
|
printf("SCPACK: error allocating memory\n");
|
||
|
return 1;
|
||
|
} /* if */
|
||
|
/* 1. read the buffer
|
||
|
* 2. compress the buffer
|
||
|
* 3. copy the file, insert the compressed buffer
|
||
|
*/
|
||
|
buffersize = readbuffer(in, buffer);
|
||
|
orgbuffersize = buffersize;
|
||
|
if (buffersize > 0) {
|
||
|
buffersize = compress(buffer, buffersize, pairtable);
|
||
|
writefile(in, out, buffer, buffersize, pairtable);
|
||
|
printf("SCPACK: compression ratio: %ld%% (%d -> %d)\n",
|
||
|
100L-(100L*buffersize)/orgbuffersize, orgbuffersize, buffersize);
|
||
|
} else {
|
||
|
printf("SCPACK: no SCPACK section found, nothing to do\n");
|
||
|
} /* if */
|
||
|
fclose(out);
|
||
|
fclose(in);
|
||
|
/* let the new file replace the old file */
|
||
|
if (buffersize == 0) {
|
||
|
if (argc == 2)
|
||
|
remove(TEMPFILE);
|
||
|
else
|
||
|
remove(argv[2]);
|
||
|
} else if (argc == 2) {
|
||
|
remove(argv[1]);
|
||
|
rename(TEMPFILE,argv[1]);
|
||
|
} /* if */
|
||
|
return 0;
|
||
|
}
|