Attached is an extremely simple but very effective program to compress a
sorted word list. The list can then be compressed even further by bzip2
or gzip.
It will generally bring a word list to the size of a "munched" ispell word
list.
To Compile:
g++ -O compress.cc -o wlc
To compress: ./wlc c < IN > OUT
To decompress: ./wlc d < IN > OUT
In order for it to work the word list must be sorted, not have any control
characters except for space characters, and all words must be less than
256 characters.
If you think it is useful I will include it in the next major version of
Aspell.
---
Kevin Atkinson
[EMAIL PROTECTED]
http://metalab.unc.edu/kevina/
#include <cassert>
#include <cstdlib>
#include <iostream>
// Returns the number of leading characters that the two NUL-terminated
// strings have in common (0 when they differ at the first character or
// either one is empty).
static inline int prefix_len(const char * rhs, const char * lhs)
{
  const char * a = rhs;
  const char * b = lhs;
  // If *a is non-NUL and equals *b, then *b is non-NUL as well, so a
  // single terminator test is enough to stay inside both strings.
  while (*a != '\0' && *a == *b) {
    ++a;
    ++b;
  }
  return static_cast<int>(a - rhs);
}
// Prints the command-line synopsis to standard error and terminates the
// program with exit status 1. This function does not return.
void usage ()
{
  // The stream objects and exit() live in namespace std; qualify them so
  // the program builds with standard-conforming compilers.
  std::cerr << "Usage: wlc c|d" << std::endl;
  std::exit(1);
}
// Reads the next word from `in` into `w`.
//
// A word is a maximal run of bytes whose value is greater than 32; bytes
// with value <= 32 (space and control characters) act as separators and
// are skipped before the word.
//
// Parameters:
//   in       - stream to read from.
//   w        - destination buffer; receives the NUL-terminated word.
//   capacity - size of `w` in bytes, including room for the terminator
//              (defaults to 256, the buffer size used by this program).
//
// Returns true when a word was stored in `w`, false when the end of the
// input was reached before any word character was found (in which case
// `w` is left untouched).
//
// If a word does not fit in `capacity - 1` characters, an error message
// is written to standard error and the program exits with status 1
// instead of writing past the end of the buffer.
bool get_word(std::istream & in, char * w, int capacity = 256)
{
  int c;
  // Skip separators in front of the word.
  while (c = in.peek(), c != EOF && c <= 32)
    in.get();
  if (c == EOF) return false;

  int len = 0;
  do {
    if (len >= capacity - 1) {
      std::cerr << "wlc: word longer than " << (capacity - 1)
                << " characters" << std::endl;
      std::exit(1);
    }
    w[len++] = static_cast<char>(in.get());
  } while (c = in.peek(), c != EOF && c > 32);
  w[len] = '\0';

  // A word was read even if it was terminated by end-of-input rather than
  // by a separator, so report success; the next call will return false.
  return true;
}
// Entry point.
//
//   wlc c   reads a sorted word list from standard input and writes the
//           compressed form to standard output.
//   wlc d   reads the compressed form from standard input and writes one
//           word per line to standard output.
//
// Compressed format, one record per word:
//   [0x00]   optional escape byte, present when the shared prefix is
//            longer than 31 characters
//   count    one byte holding (shared prefix length + 1)
//   suffix   the remaining characters of the word (all bytes > 32)
// Without the escape the count byte is 1..32, so every record starts with
// a byte <= 32 and that byte also terminates the previous suffix.
//
// Returns 0 on success and 1 when the compressed input is malformed; any
// other argument (or argument count) prints the usage text via usage().
int main (int argc, const char *argv[]) {
  if (argc != 2)
    usage();
  if (argv[1][0] == 'c') {
    char s1[256];
    char s2[256];
    char * prev = s2;
    *prev = '\0';
    char * cur = s1;
    while (get_word(std::cin, cur)) {
      int i = prefix_len(prev, cur);
      // The count is stored as the single byte i+1; a prefix of 255 would
      // wrap to 0 and collide with the escape byte, so cap it and emit
      // the remaining character as part of the suffix instead.
      if (i > 254)
        i = 254;
      if (i > 31)
        std::cout << '\0';
      std::cout << static_cast<char>(i + 1);
      std::cout << cur + i;
      // The word just written becomes the reference for the next one.
      char * tmp = prev;
      prev = cur;
      cur = tmp;
    }
  } else if (argv[1][0] == 'd') {
    const int eof = std::istream::traits_type::eof();
    char cur[256];
    int prev_len = 0;   // length of the previously decoded word
    int i;
    while (i = std::cin.get(), i != eof) {
      if (i == 0)       // escape byte: the real count follows
        i = std::cin.get();
      --i;              // stored value is prefix length + 1
      // The prefix is taken from the previous word, so it can be neither
      // negative (truncated stream or zero count) nor longer than it.
      if (i < 0 || i > prev_len) {
        std::cerr << "wlc: corrupt compressed input" << std::endl;
        return 1;
      }
      while (std::cin.peek() > 32) {
        if (i >= 255) { // keep one byte free for the terminator
          std::cerr << "wlc: corrupt compressed input" << std::endl;
          return 1;
        }
        cur[i++] = static_cast<char>(std::cin.get());
      }
      cur[i] = '\0';
      prev_len = i;
      std::cout << cur << '\n';
    }
  } else {
    usage();
  }
}