/*
 * hash tries.
 *
 * Herewith a program that can dynamically construct and run a
 * deterministic finite state machine, such as for matching to a trie
 * or a regular expression, over a sequence of bytes, at under 100
 * clock cycles per byte, and under 50 (potentially 25) bytes of
 * memory per state graph arc.  Regular expressions are as yet
 * unimplemented.
 */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* dummy definitions to avoid #include for faster compiles during
   development */
/* void *malloc(unsigned);
 * extern void *stderr; */

/*
 * Representing tries, or generally, representing deterministic state
 * machines that run over bytes, we have a big tradeoff when it comes
 * to branching factors.  If we use a small branching factor, we get a
 * lot of extra nodes, and traversing the trie is slow.  But if we use
 * a large branching factor, each individual node is very large.
 * Here's a compromise --- a way for all of the nodes in a trie to
 * share space.  I think Don Knuth invented this data structure for an
 * article in the 1980s about literate programming; the topic he chose
 * was counting the distinct words in a file, with a count of each
 * word, and he stored the words in a hash trie.  Unfortunately, I
 * read that article in 1995, and now it's 2004, and I didn't
 * thoroughly understand the algorithm at the time.  So this data
 * structure may or may not have anything in common with that one,
 * other than the name and the problem it tries to solve.
 */

/* This program can test "sideways" for membership in a small trie in
 * about 1.2 microseconds on my 500MHz machine, so it takes about
 * 150ns, or about 74 clock cycles, to follow a single pointer from
 * one state to the next.  I wrote a little spell-check program that
 * inserts the 206590 words (containing 887115 letters) in my
 * /usr/share/dict/words into a trie, which takes about 1.7 seconds,
 * and then ran /usr/share/dict/words through it again to see how long
 * it took to probe the table --- another 0.82 seconds.  That's about
 * 4.0 microseconds per word, noticeably longer but not outrageously
 * so, especially since it includes time to tokenize the words etc.
 *
 * The program reports,
 * 482855 arcs in 1000003 buckets (0.482854 per bucket), max 7 in a bucket
 * which seems pretty reasonably balanced.  It also explains the small
 * 23088kiB memory usage: that's 48 bytes per arc, which is plausible
 * given the data structures, but adds up to a relatively slim 26
 * bytes per input letter, or 112 bytes per word.  (Man, it's enough
 * to make you want to binary-search a sorted list of
 * newline-separated strings or something!)
 */

/* Modifying the spell-checker to report the number of "words" it
 * checked in the input, and searching for each one a configurable
 * number of times, gave me 273345 total words.  Searching for each
 * one in the trie 10 times took 5.45 user seconds; searching for each
 * one 20 times took 8.42 seconds.  That's about 2.7 million searches
 * (totaling about 17163480 characters) in about 3 seconds, so about
 * 5.7 million arcs followed per second.  These numbers are remarkably
 * close to the microbenchmark numbers, so there aren't any big
 * performance bugs hidden herein; the 4 microseconds from the more
 * realistic test suggests that caching effects aren't costing more
 * than about a factor of 3.
 *
 * One could probably cut this guy's memory usage from a calculated 44
 * bytes per arc (with a half-full hashtable, so 2 buckets per arc) to
 * 24 bytes per arc (by using a smaller hashtable, so 1 bucket per
 * arc, and removing malloc overhead).
 */

/*
 * interface:
 * hashtrie *new_hashtrie(): make a new one
 * hashtrie_node *follow_arc(hashtrie*, hashtrie_node *start, char c, int create): 
 *   Follows the arc labeled 'c' from node 'start' (which may be
 *   NULL) to the node it points to.  If that arc doesn't exist, it
 *   either creates it (if create is true) or returns NULL (if create
 *   is false).  hashtrie_node contains a void *data you can get or set.
 * hashtrie_node *follow_arcs(hashtrie*, hashtrie_node *start, char *s, int create):
 *   Analogously, but for a whole null-terminated string.
 */

/* 12 bytes incl. malloc overhead */
typedef struct hashtrie_node { void *data; } hashtrie_node; /* one state in the trie; 'data' is an arbitrary caller-owned payload, 0 until set */

/* We look up an arc in the hash table by its source and label,
 * returning its destination.  So all three pieces of data must be in
 * the arc data structure.  Since more than one arc may point to a
 * single hashtrie_node, we can't merge this data with the
 * hashtrie_node data.
 */

/* 24 bytes incl. malloc overhead */
typedef struct hashtrie_arc {
  struct hashtrie_arc *next;   /* next arc in this hash bucket's chain */
  hashtrie_node *source;       /* state the arc leaves; NULL means the root */
  char label;                  /* the byte consumed by following this arc */
  hashtrie_node *destination;  /* state the arc enters */
} hashtrie_arc;

#define HASHTRIESIZE 1000003  /* prime bucket count; the table is never resized */

typedef struct hashtrie { 
  hashtrie_arc* arcs[HASHTRIESIZE]; /* chained hash table of all arcs, keyed on (source, label) */
} hashtrie;

/* Allocate a hashtrie with every bucket empty.  Returns NULL if
 * allocation fails.  (No destructor is provided in this file.) */
hashtrie *new_hashtrie() {
  hashtrie *rv = malloc(sizeof *rv);
  if (!rv) return NULL;  /* previously unchecked: memset on NULL would crash */
  memset(rv->arcs, '\0', sizeof(rv->arcs));
  return rv;
}

/* Bucket index for the arc (start, label).  label is forced to
 * unsigned char so negative chars can't flip the hash's sign. */
static int hash(hashtrie_node *start, char label) {
  /* size_t is pointer-sized on the usual platforms; the old (unsigned)
   * cast truncated the pointer to 32 bits on LP64 systems, discarding
   * its high half before hashing. */
  return (int)(((size_t)start ^ (size_t)(unsigned char)label * 257) % HASHTRIESIZE);
}

/* Follow the arc labeled c out of node start (NULL means the root).
 * If the arc doesn't exist: with create true, add it along with a
 * fresh destination node; with create false, return NULL.  Also
 * returns NULL if allocation fails during creation. */
hashtrie_node *follow_arc(hashtrie *ht, hashtrie_node *start, char c, int create) {
  hashtrie_arc **a = ht->arcs + hash(start, c);
  /* scan this bucket's chain; on miss, a ends at the NULL tail slot */
  while (*a) {
    if (((*a)->source == start) && ((*a)->label == c))
      return (*a)->destination;
    a = &((*a)->next);
  }
  if (!create) return 0;
  *a = malloc(sizeof(hashtrie_arc));
  if (!*a) return 0;
  (*a)->next = 0;  /* BUG FIX: next was left uninitialized, so a later
                    * scan of this bucket could chase a garbage pointer */
  (*a)->source = start;
  (*a)->label = c;
  (*a)->destination = malloc(sizeof(hashtrie_node));
  if (!(*a)->destination) {  /* unlink the half-built arc on OOM */
    free(*a);
    *a = 0;
    return 0;
  }
  (*a)->destination->data = 0;
  return (*a)->destination;
}

/* Follow one arc per byte of the NUL-terminated string s, starting at
 * start (NULL = root).  Returns the final node, or NULL as soon as a
 * lookup misses (only possible when create is false or on OOM). */
hashtrie_node *follow_arcs(hashtrie *ht, hashtrie_node *start, char *s, int create) {
  char *p;
  for (p = s; *p; p++) {
    start = follow_arc(ht, start, *p, create);
    if (start == 0)
      break;
  }
  return start;
}

/* Print chain-length statistics for the whole table to stderr: total
 * arcs, bucket count, load factor, and the longest chain. */
void hashtrie_report(hashtrie *ht) {
  int total = 0, longest = 0;
  int bucket;
  for (bucket = 0; bucket < HASHTRIESIZE; bucket++) {
    int chain = 0;
    hashtrie_arc *arc;
    for (arc = ht->arcs[bucket]; arc; arc = arc->next)
      chain++;
    total += chain;
    if (chain > longest)
      longest = chain;
  }
  fprintf(stderr, "%d arcs in %d buckets (%.6f per bucket), max %d in a bucket\n",
         total, HASHTRIESIZE, ((double)total/HASHTRIESIZE), longest);
}

/*** test code ***/

#define ok(got,exp,msg) (_ok(__FILE__,__LINE__,(got),(exp),(msg)))

/* Pointer-equality check for the tests: on mismatch, log file:line and
 * msg to stderr and return 0; on match, return 1. */
static int _ok(char *file, int line, void *got, void *want, char *msg) { 
  if (got == want)
    return 1;
  fprintf(stderr, "%s:%d: Failure: '%s'\n", file, line, msg);
  return 0;
}

static char *success = "success";
static char *failure = "failure";

#define test_string(ht,s,exp) (_test_string(__FILE__,__LINE__,(ht),(s),(exp)))

/* Probe the trie for s (without creating) and compare presence against
 * expected, reporting through _ok with the caller's file/line. */
static int _test_string(char *file, int line, hashtrie *ht, char *s, int expected) {
  void *got = follow_arcs(ht, 0, s, 0) ? success : failure;
  void *want = expected ? success : failure;
  return _ok(file, line, got, want, s);
}

/* Exercise the trie: emptiness, membership after inserts, prefix
 * behavior, node identity, and the per-node data payload.  Returns 1
 * if every check passed, 0 otherwise. */
static int test() {
  hashtrie *ht = new_hashtrie();
  hashtrie_node *n;
  char *payload = "foo";
  int all_ok = 1;
  int i;
  /* a fresh trie contains nothing */
  all_ok &= test_string(ht, "", 0);
  all_ok &= test_string(ht, "s", 0);
  all_ok &= test_string(ht, "x", 0);
  /* insert two words sharing a prefix */
  follow_arcs(ht, 0, "siddeley", 1);
  n = follow_arcs(ht, 0, "sideways", 1);
  /* membership and prefix checks */
  all_ok &= test_string(ht, "", 0);  /* due to a quirk of encoding, ""
                                      * is never present */
  all_ok &= test_string(ht, "s", 1);
  all_ok &= test_string(ht, "x", 0);
  all_ok &= test_string(ht, "sid", 1);
  all_ok &= test_string(ht, "siddeley", 1);
  all_ok &= test_string(ht, "sideley", 0);
  all_ok &= test_string(ht, "iddeley", 0);
  all_ok &= test_string(ht, "sid", 1);
  all_ok &= test_string(ht, "sideway", 1);
  all_ok &= test_string(ht, "sideways", 1);
  all_ok &= test_string(ht, "sidewayx", 0);
  /* crude timing loop; results deliberately unchecked */
  for (i = 0; i < 8000000; i++) test_string(ht, "sideways", 1);
  all_ok &= test_string(ht, "sidewayse", 0);
  /* distinct words must end at distinct nodes */
  all_ok &= ok(
      (n == follow_arcs(ht, 0, "siddeley", 0) ? success : failure),
    failure, "node collision!");
  all_ok &= ok(n, follow_arcs(ht, 0, "sideways", 0), "Couldn't find sideways");
  all_ok &= ok(n, follow_arcs(ht, 0, "sideways", 1), "Found the wrong sideways");
  /* the payload is per-node: setting one word's data leaves the other's alone */
  all_ok &= ok(n->data, 0, "Data started out non-null");
  n->data = payload;
  all_ok &= ok(follow_arcs(ht, 0, "siddeley", 0)->data, 0, "Data somehow got set");
  all_ok &= ok(follow_arcs(ht, 0, "sideways", 0)->data, payload, "Data didn't stay set");

  return all_ok;
}

/* This declaration makes 'main' a weak symbol, so I can compile this
 * file into a program by itself, or link it as a library,
 * Python-like. */
int __attribute__((weak)) main(int argc, char **argv) {
   /* test() returns 1 on success; shell convention wants exit status 0
    * for success, so invert (previously a passing run exited with 1) */
   return test() ? 0 : 1;
}





The "spell-checker" program previously mentioned follows:

/* Really simple dumb "spell checker" to sort of test the hash trie
 * implementation. */

#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include "hashtrie.h"

#define LINESZ 1024

static int total_words = 0;  /* words actually probed, for the final report */

/* Look word up in the trie and print it to stdout if absent (i.e.
 * "misspelled").  Empty words are ignored and not counted. */
void spellcheck_word(hashtrie *ht, char *word) {
  int i;
  if (!word[0]) return;  /* O(1) emptiness test; strlen scanned the whole word */
  total_words++;
  /* change the 'i < 0' to some other number for benchmarking */
  for (i = 0; i < 0; i++) follow_arcs(ht, 0, word, 0);
  if (!follow_arcs(ht, 0, word, 0)) puts(word);
}

/* Usage: spellcheck DICTFILE < text.  Loads one word per line of
 * DICTFILE into the trie, then prints every alphanumeric token of
 * stdin that is not in the trie, plus statistics on stderr. */
int main(int argc, char **argv) {
  char s[LINESZ];
  hashtrie *ht;
  FILE *in;

  if (argc < 2) {  /* previously argv[1] was dereferenced unchecked */
    fprintf(stderr, "usage: %s dictionary-file < text\n", argv[0]);
    return 1;
  }
  ht = new_hashtrie();
  in = fopen(argv[1], "r");  /* e.g. /usr/share/dict/words */
  if (!in) {
    perror(argv[1]);
    return 1;
  }
  while (fgets(s, LINESZ, in)) {
    size_t len = strlen(s);
    /* chop the newline only if present: the file's last line may lack
     * one, and fgets splits over-long lines; the old unconditional
     * chop ate the final character in those cases */
    if (len && s[len-1] == '\n') s[len-1] = '\0';
    follow_arcs(ht, 0, s, 1); /* add to trie */
  }
  fclose(in);
  while (fgets(s, LINESZ, stdin)) {
    char *start = s, *end;
    for (;;) {
      /* cast to unsigned char: isalnum on a negative plain char is UB */
      while (*start && !isalnum((unsigned char)*start)) start++;
      end = start;
      while (isalnum((unsigned char)*end)) end++;
      if (*end) {
        *end = '\0';
        spellcheck_word(ht, start);
        start = end + 1;
      } else {
        spellcheck_word(ht, start);
        break;
      }
    }
  }
  hashtrie_report(ht);
  fprintf(stderr, "Checked %d total words.\n", total_words);
  return 0;
}

Reply via email to