/*
* hash tries.
*
* Herewith a program that can dynamically construct and run a
* deterministic finite state machine, such as for matching to a trie
* or a regular expression, over a sequence of bytes, at under 100
* clock cycles per byte, and under 50 (potentially 25) bytes of
* memory per state graph arc. Regular expressions are as yet
* unimplemented.
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/* dummy definitions to avoid #include for faster compiles during
development */
/* void *malloc(unsigned);
* extern void *stderr; */
/*
* Representing tries, or generally, representing deterministic state
* machines that run over bytes, we have a big tradeoff when it comes
* to branching factors. If we use a small branching factor, we get a
* lot of extra nodes, and traversing the trie is slow. But if we use
* a large branching factor, each individual node is very large.
* Here's a compromise --- a way for all of the nodes in a trie to
* share space. I think Don Knuth invented this data structure for an
* article in the 1980s about literate programming; the topic he chose
* was counting the distinct words in a file, with a count of each
* word, and he stored the words in a hash trie. Unfortunately, I
* read that article in 1995, and now it's 2004, and I didn't
* thoroughly understand the algorithm at the time. So this data
* structure may or may not have anything in common with that one,
* other than the name and the problem it tries to solve.
*/
/* This program can test "sideways" for membership in a small trie in
* about 1.2 microseconds on my 500MHz machine, so it takes about
* 150ns, or about 74 clock cycles, to follow a single pointer from
* one state to the next. I wrote a little spell-check program that
* inserts the 206590 words (containing 887115 letters) in my
* /usr/share/dict/words into a trie, which takes about 1.7 seconds,
* and then ran /usr/share/dict/words through it again to see how long
* it took to probe the table --- another 0.82 seconds. That's about
* 4.0 microseconds per word, noticeably longer but not outrageously
* so, especially since it includes time to tokenize the words etc.
*
* The program reports,
* 482855 arcs in 1000003 buckets (0.482854 per bucket), max 7 in a bucket
* which seems pretty reasonably balanced. It also explains the small
* 23088kiB memory usage: that's 48 bytes per arc, which is plausible
* given the data structures, but adds up to a relatively slim 26
* bytes per input letter, or 112 bytes per word. (Man, it's enough
* to make you want to binary-search a sorted list of
* newline-separated strings or something!)
*/
/* Modifying the spell-checker to report the number of "words" it
* checked in the input, and searching for each one a configurable
* number of times, gave me 273345 total words. Searching for each
* one in the trie 10 times took 5.45 user seconds; searching for each
* one 20 times took 8.42 seconds. That's about 2.7 million searches
* (totaling about 17163480 characters) in about 3 seconds, so about
* 5.7 million arcs followed per second. These numbers are remarkably
* close to the microbenchmark numbers, so there aren't any big
* performance bugs hidden herein; the 4 microseconds from the more
* realistic test suggests that caching effects aren't costing more
* than about a factor of 3.
*
* One could probably cut this guy's memory usage from a calculated 44
* bytes per arc (with a half-full hashtable, so 2 buckets per arc) to
* 24 bytes per arc (by using a smaller hashtable, so 1 bucket per
* arc, and removing malloc overhead).
*/
/*
* interface:
* hashtrie *new_hashtrie(): make a new one
* hashtrie_node *follow_arc(hashtrie*, hashtrie_node *start, char c, int create):
* Follows the arc labeled 'c' from node 'start' (which may be
* NULL) to the node it points to. If that arc doesn't exist, it
* either creates it (if create is true) or returns NULL (if create
* is false). hashtrie_node contains a void *data you can get or set.
* hashtrie_node *follow_arcs(hashtrie*, hashtrie_node *start, char *s, int create):
* Analogously, but for a whole null-terminated string.
*/
/* 12 bytes incl. malloc overhead */
/* A trie node carries only its client payload; its outgoing arcs live
 * in the shared hash table, keyed by (node pointer, label byte). */
typedef struct hashtrie_node { void *data; } hashtrie_node;
/* We look up an arc in the hash table by its source and label,
* returning its destination. So all three pieces of data must be in
* the arc data structure. Since more than one arc may point to a
* single hashtrie_node, we can't merge this data with the
* hashtrie_node data.
*/
/* 24 bytes incl. malloc overhead */
typedef struct hashtrie_arc {
struct hashtrie_arc *next; /* next arc in this hash bucket's chain */
hashtrie_node *source; /* arc runs from 'source'... */
char label; /* ...on byte 'label'... */
hashtrie_node *destination; /* ...to 'destination' */
} hashtrie_arc;
/* Bucket count: a large prime, so pointer-derived hash values spread
 * reasonably well across buckets. */
#define HASHTRIESIZE 1000003
typedef struct hashtrie {
hashtrie_arc* arcs[HASHTRIESIZE]; /* chained hash buckets of arcs */
} hashtrie;
/* Allocate a new, empty hash trie.
 * Returns NULL on allocation failure (the original dereferenced an
 * unchecked malloc result).  calloc zero-fills the bucket array,
 * replacing the separate malloc + memset. */
hashtrie *new_hashtrie(void) {
    return calloc(1, sizeof(hashtrie));
}
/* Hash an arc by its (source node, label) pair, so arcs leaving the
 * same node on different bytes land in different buckets.
 * BUG FIX: the old (unsigned) cast truncated the pointer to 32 bits on
 * LP64 platforms (and is a constraint violation under strict C); cast
 * through size_t so the whole pointer participates. */
static int hash(hashtrie_node *start, char label) {
    return (int)(((size_t)start ^ (unsigned char)label * 257u) % HASHTRIESIZE);
}
/* Follow the arc labeled 'c' out of node 'start' (NULL means the root).
 * If the arc doesn't exist: create it (and a fresh destination node)
 * when 'create' is true, else return NULL.  Also returns NULL if an
 * allocation fails during creation. */
hashtrie_node *follow_arc(hashtrie *ht, hashtrie_node *start, char c, int create) {
    hashtrie_arc **a = ht->arcs + hash(start, c);
    /* Walk this bucket's chain looking for a matching arc. */
    while (*a) {
        if (((*a)->source == start) && ((*a)->label == c))
            return (*a)->destination;
        a = &((*a)->next);
    }
    if (!create) return 0;
    /* 'a' now points at the chain's terminating NULL link; append. */
    *a = malloc(sizeof(hashtrie_arc));
    if (!*a) return 0;
    (*a)->next = 0; /* BUG FIX: 'next' was left uninitialized, so later
                     * chain walks through this arc read garbage */
    (*a)->source = start;
    (*a)->label = c;
    (*a)->destination = malloc(sizeof(hashtrie_node));
    if (!(*a)->destination) { /* unwind the half-built arc */
        free(*a);
        *a = 0;
        return 0;
    }
    (*a)->destination->data = 0;
    return (*a)->destination;
}
/* Follow one arc per byte of the NUL-terminated string 's', creating
 * missing arcs iff 'create' is true.  Returns the node reached, or
 * NULL as soon as any arc is absent (when not creating).  An empty
 * string returns 'start' unchanged. */
hashtrie_node *follow_arcs(hashtrie *ht, hashtrie_node *start, char *s, int create) {
    hashtrie_node *node = start;
    for (; *s; s++) {
        node = follow_arc(ht, node, *s, create);
        if (!node) return 0;
    }
    return node;
}
/* Print hash-table load statistics to stderr: total arcs, bucket
 * count, mean chain length, and the longest chain. */
void hashtrie_report(hashtrie *ht) {
    int total = 0, longest = 0;
    int i;
    for (i = 0; i < HASHTRIESIZE; i++) {
        hashtrie_arc *arc = ht->arcs[i];
        int chain = 0;
        for (; arc; arc = arc->next) chain++;
        total += chain;
        if (chain > longest) longest = chain;
    }
    fprintf(stderr, "%d arcs in %d buckets (%.6f per bucket), max %d in a bucket\n",
            total, HASHTRIESIZE, ((double)total/HASHTRIESIZE), longest);
}
/*** test code ***/
#define ok(got,exp,msg) (_ok(__FILE__,__LINE__,(got),(exp),(msg)))
/* Assert pointer equality of 'a' and 'b'; on mismatch, log the failure
 * site and message to stderr.  Returns 1 on match, 0 otherwise. */
static int _ok(char *f, int ln, void *a, void *b, char *msg) {
    if (a == b) return 1;
    fprintf(stderr, "%s:%d: Failure: '%s'\n", f, ln, msg);
    return 0;
}
/* Sentinel strings: it's the pointer identity, not the contents, that
 * the tests compare via _ok(). */
static char *success = "success";
static char *failure = "failure";
#define test_string(ht,s,exp) (_test_string(__FILE__,__LINE__,(ht),(s),(exp)))
/* Probe the trie for 's' (without creating) and check presence against
 * 'expected' (nonzero = should be found).  Returns 1 on agreement. */
static int _test_string(char *file, int line, hashtrie *ht, char *s, int expected) {
    char *got = follow_arcs(ht, 0, s, 0) ? success : failure;
    char *want = expected ? success : failure;
    return _ok(file, line, got, want, s);
}
/* Exercise the trie: emptiness, insertion, prefix membership, node
 * identity across repeated lookups, and per-node data storage.
 * Returns 1 iff every check passed (accumulated in 'nobad'). */
static int test() {
hashtrie *ht = new_hashtrie();
hashtrie_node *n;
char *s = "foo";
int nobad;
int i;
nobad = 1;
/* it starts out empty */
nobad &= test_string(ht, "", 0);
nobad &= test_string(ht, "s", 0);
nobad &= test_string(ht, "x", 0);
/* fill in hash trie */
follow_arcs(ht, 0, "siddeley", 1);
n = follow_arcs(ht, 0, "sideways", 1);
/* see what's in hash trie */
nobad &= test_string(ht, "", 0); /* due to a quirk of encoding, ""
* is never present */
nobad &= test_string(ht, "s", 1);
nobad &= test_string(ht, "x", 0);
/* every prefix of an inserted word is present... */
nobad &= test_string(ht, "sid", 1);
nobad &= test_string(ht, "siddeley", 1);
/* ...but non-prefixes and suffixes are not */
nobad &= test_string(ht, "sideley", 0);
nobad &= test_string(ht, "iddeley", 0);
nobad &= test_string(ht, "sid", 1);
nobad &= test_string(ht, "sideway", 1);
nobad &= test_string(ht, "sideways", 1);
nobad &= test_string(ht, "sidewayx", 0);
/* crude timing loop: 8M probes for the microbenchmark numbers quoted
* in the header comment (results intentionally not accumulated) */
for (i = 0; i < 8000000; i++) test_string(ht, "sideways", 1);
nobad &= test_string(ht, "sidewayse", 0);
/* distinct words must reach distinct nodes */
nobad &= ok(
(n == follow_arcs(ht, 0, "siddeley", 0) ? success : failure),
failure, "node collision!");
/* repeated lookup (and even create=1 on an existing word) must return
* the same node, not a duplicate */
nobad &= ok(n, follow_arcs(ht, 0, "sideways", 0), "Couldn't find sideways");
nobad &= ok(n, follow_arcs(ht, 0, "sideways", 1), "Found the wrong sideways");
/* per-node data: starts NULL, sticks once set, doesn't leak between nodes */
nobad &= ok(n->data, 0, "Data started out non-null");
n->data = s;
nobad &= ok(follow_arcs(ht, 0, "siddeley", 0)->data, 0, "Data somehow got set");
nobad &= ok(follow_arcs(ht, 0, "sideways", 0)->data, s, "Data didn't stay set");
return nobad;
}
/* This declaration makes 'main' a weak symbol, so I can compile this
* file into a program by itself, or link it as a library,
* Python-like. */
/* Run the self-test.  BUG FIX: test() returns 1 on success, but a
 * process must exit 0 on success by shell convention; the old
 * 'return test()' reported failure (status 0) exactly when everything
 * passed.  Invert it. */
int __attribute__((weak)) main(int argc, char **argv) {
    return test() ? 0 : 1;
}
/* The "spell-checker" program previously mentioned follows: */
/* Really simple dumb "spell checker" to sort of test the hash trie
* implementation. */
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include "hashtrie.h"
#define LINESZ 1024
static int total_words = 0; /* words examined by spellcheck_word() */
/* Look up one word in the trie; print it to stdout if absent (i.e.
 * treat it as misspelled).  Empty strings are ignored and not counted. */
void spellcheck_word(hashtrie *ht, char *word) {
    int i;
    if (!word[0]) return; /* was !strlen(word): O(n) just to test emptiness */
    total_words++;
    /* change the 'i < 0' to some other number for benchmarking */
    for (i = 0; i < 0; i++) follow_arcs(ht, 0, word, 0);
    if (!follow_arcs(ht, 0, word, 0)) puts(word);
}
/* Load a dictionary (one word per line, argv[1]) into the trie, then
 * read text from stdin, tokenizing on runs of alphanumerics, and print
 * every token not found in the dictionary.  Stats go to stderr. */
int main(int argc, char **argv) {
    char s[LINESZ];
    hashtrie *ht;
    FILE *in;
    if (argc < 2) { /* BUG FIX: argv[1] was dereferenced unchecked */
        fprintf(stderr, "usage: %s dictionary-file < text\n", argv[0]);
        return 2;
    }
    ht = new_hashtrie();
    in = fopen(argv[1], "r"); /* e.g. /usr/share/dict/words */
    if (!in) {
        perror(argv[1]);
        return 1;
    }
    while (fgets(s, LINESZ, in)) {
        size_t len = strlen(s);
        /* BUG FIX: chop the newline only if one is present; the old
         * unconditional chop corrupted lines longer than the buffer */
        if (len && s[len-1] == '\n') s[len-1] = '\0';
        if (s[0]) follow_arcs(ht, 0, s, 1); /* add to trie */
    }
    fclose(in);
    while (fgets(s, LINESZ, stdin)) {
        char *start = s, *end;
        for (;;) {
            /* BUG FIX: cast to unsigned char before isalnum(); passing
             * a negative plain char is undefined behavior */
            while (*start && !isalnum((unsigned char)*start)) start++;
            end = start;
            while (isalnum((unsigned char)*end)) end++;
            if (*end) {
                *end = '\0';
                spellcheck_word(ht, start);
                start = end + 1;
            } else {
                spellcheck_word(ht, start); /* last (possibly empty) token */
                break;
            }
        }
    }
    hashtrie_report(ht);
    fprintf(stderr, "Checked %d total words.\n", total_words);
    return 0;
}