Sasha Pachev wrote:
Anybody wants to shake his C/C++ muscle?

My muscles are pretty puny, but here's my attempt in C. It runs in 1.9 seconds on my 2.4GHz RedHat box on the kjv100 file. I believe this is one of the fastest times. The algorithm uses a constant-time hash table of my own design, which lives in a very sparse array of 1 million (mostly empty) entries. It runs with just 99 collisions in the hash's key space. Interestingly, it turns out that I/O is not the big part of the time (0.2 seconds to load both files). The fun part is that I didn't do any string copies from the original data. That was kind of groovy. The con is that it uses a TON of memory. It loads both files completely into memory before working on the data. I could have worked around this, to allow arbitrarily large files, but I've spent too much time now. :)

Bryan, could you run this code on your benchmark box? I'm curious how it compares with the others.

Enjoy!

--Dave


P.S. To whomever it was that was whining about not having incentive: Don't be a wimp. The code is the incentive.

P.P.S. I am NOT looking for employment. I already have the best job in the world.
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <string.h>
#include <sys/mman.h>
#include <fcntl.h>

#define DICTIONARY "/usr/share/dict/words"
#define READ_SIZE (1024*1024)
#define HASH_BUCKETS 10485760
#define INPUT_BUFFER_SIZE 10485760

struct HashEntry 
{
     char *word;
     int count;
     void *next;
};

inline char isStopChar( char c )
{
     switch( c )
     {
         case ' ':
         case '\n':
         case '\r':
         case '!':
         case '@':
         case '#':
         case '$':
         case '%':
         case '^':
         case '&':
         case '*':
         case '(':
         case ')':
         case '\t':
         case ':':
         case '.':
         case '/':
         case ',':
         case '?':
         case '>':
         case '<':
         case ';':
         case '\'':
         case '"':
         case ']':
         case '[':
         case '{':
         case '}':
         case '\\':
         case '|':
         case '=':
         case '0':
         case '1':
         case '2':
         case '3':
         case '4':
         case '5':
         case '6':
         case '7':
         case '8':
         case '9':
         case '-':
             return 1;

         default:
             return 0;
     }
}

unsigned int hash( char *word )
{
     unsigned int hash = 0;
     unsigned int i = 0;

     for( ; word[i] != 0; ++i )
     {
         hash += word[i];
         hash += ( hash << 10 );
         hash ^= ( hash >> 6  );
     }
     //hash += ( hash << 3  );
     //hash ^= ( hash >> 11 );
     //hash += ( hash << 15 );

     return hash % HASH_BUCKETS;
}

int main( int argc, char **argv )
{
    if( argc != 2 )
        return 1;

    int i;
    struct stat statBuf;
    stat( DICTIONARY, &statBuf );
    int dictionarySize = statBuf.st_size;
    char *dictionaryBuffer = (char*)malloc( dictionarySize * sizeof(char) );

    FILE *dictionaryFile = fopen( DICTIONARY, "r" );
    while( fread( dictionaryBuffer, READ_SIZE, 1, dictionaryFile ) );

    struct HashEntry *hashEntries = (struct HashEntry*)malloc( HASH_BUCKETS * 
sizeof(struct HashEntry) );
    memset( hashEntries, 0, sizeof(struct HashEntry) );

    char *dictionaryWord = dictionaryBuffer;
    for( i=0; i<dictionarySize; i++ )
    {
         if( dictionaryBuffer[i] == '\n' )
         {
             dictionaryBuffer[i] = 0;
             unsigned int hashVal = hash( dictionaryWord );

             struct HashEntry *newEntry = &( hashEntries[hashVal] );
             if( newEntry->word == 0 )
             {
                 newEntry->word = dictionaryWord;
                 newEntry->count = 0;
                 newEntry->next = 0;
             }
             else
             {
                 newEntry = (struct HashEntry*) malloc( sizeof( struct 
HashEntry ) );
                 newEntry->word = dictionaryWord;
                 newEntry->count = 0;
                 newEntry->next = 0;

                 struct HashEntry *existingEntry = &( hashEntries[hashVal] );
                 while( existingEntry->next != 0 )
                 {
                     existingEntry = existingEntry->next;
                 }

                 existingEntry->next = newEntry;
             }

             dictionaryWord = dictionaryBuffer + i + 1;
         }
    }

    stat( argv[1], &statBuf );
    int inputFileSize = statBuf.st_size;
    FILE *inputFile = fopen( argv[1], "r" );
    char *inputBuffer = (char*)malloc( inputFileSize );
    int counter=0;
    while( fread( inputBuffer+(READ_SIZE*counter++), READ_SIZE, 1, inputFile ) 
);

    // For every word in the input file, count those that are also in the 
dictionary
    char *inputWord = inputBuffer;
    for( i=0; i<inputFileSize; i++ )
    {
         // Case insensitive
         //inputBuffer[i] = tolower( inputBuffer[i] );

         if( isStopChar( inputBuffer[i] ) )
         {
             inputBuffer[i] = 0;

             while( isStopChar( inputBuffer[++i] ) && i < inputFileSize );

             unsigned int hashVal = hash( inputWord );
             struct HashEntry *entry = &( hashEntries[hashVal] );
             if( entry->word )
             {
                 if( !strcmp( entry->word, inputWord ) )
                 {
                     entry->count++;
                 }
                 else
                 {
                     while( (entry = entry->next) )
                     {
                         if( !strcmp( entry->word, inputWord ) )
                         {
                             entry->count++;
                             break;
                         }
                     }
                 }
             }

             inputWord = inputBuffer + i;
             i--;
         }

    }

    // Print results:
    for( i=0; i<HASH_BUCKETS; i++ )
    {
         struct HashEntry *entry = &( hashEntries[i] );
         if( entry->word )
         {
             if( entry->count > 0 )
                 printf( "%s: %d\n", entry->word, entry->count );

             while( entry->next )
             {
                 entry = entry->next;
                 if( entry->count > 0 )
                     printf( "%s: %d\n", entry->word, entry->count );
             }
         }
    }

    return 0;
}
/*
PLUG: http://plug.org, #utah on irc.freenode.net
Unsubscribe: http://plug.org/mailman/options/plug
Don't fear the penguin.
*/

Reply via email to