Sasha Pachev wrote:
Anybody wants to shake his C/C++ muscle?
My muscles are pretty puny, but here's my attempt in C. It runs in 1.9
seconds on my 2.4GHz RedHat box on the kjv100 file. I believe this is
one of the fastest times. The algorithm uses a constant-time hash table
of my own design, which lives in a very sparse array of 1 million
(mostly empty) entries. It runs with just 99 collisions in the hash's
key space. Interestingly, it turns out that I/O is not the big part of
the time (0.2 seconds to load both files). The fun part is that I didn't
do any string copies from the original data. That was kind of groovy.
The con is that it uses a TON of memory. It loads both files completely
into memory before working on the data. I could have worked around this,
to allow arbitrarily large files, but I've spent too much time now. :)
Bryan, could you run this code on your benchmark box? I'm curious how it
compares with the others.
Enjoy!
--Dave
P.S. To whomever it was that was whining about not having incentive:
Don't be a wimp. The code is the incentive.
P.P.S. I am NOT looking for employment. I already have the best job in
the world.
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <string.h>
#include <sys/mman.h>
#include <fcntl.h>
#define DICTIONARY "/usr/share/dict/words"
#define READ_SIZE (1024*1024)
#define HASH_BUCKETS 10485760
#define INPUT_BUFFER_SIZE 10485760
struct HashEntry
{
char *word;
int count;
void *next;
};
inline char isStopChar( char c )
{
switch( c )
{
case ' ':
case '\n':
case '\r':
case '!':
case '@':
case '#':
case '$':
case '%':
case '^':
case '&':
case '*':
case '(':
case ')':
case '\t':
case ':':
case '.':
case '/':
case ',':
case '?':
case '>':
case '<':
case ';':
case '\'':
case '"':
case ']':
case '[':
case '{':
case '}':
case '\\':
case '|':
case '=':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '-':
return 1;
default:
return 0;
}
}
unsigned int hash( char *word )
{
unsigned int hash = 0;
unsigned int i = 0;
for( ; word[i] != 0; ++i )
{
hash += word[i];
hash += ( hash << 10 );
hash ^= ( hash >> 6 );
}
//hash += ( hash << 3 );
//hash ^= ( hash >> 11 );
//hash += ( hash << 15 );
return hash % HASH_BUCKETS;
}
int main( int argc, char **argv )
{
if( argc != 2 )
return 1;
int i;
struct stat statBuf;
stat( DICTIONARY, &statBuf );
int dictionarySize = statBuf.st_size;
char *dictionaryBuffer = (char*)malloc( dictionarySize * sizeof(char) );
FILE *dictionaryFile = fopen( DICTIONARY, "r" );
while( fread( dictionaryBuffer, READ_SIZE, 1, dictionaryFile ) );
struct HashEntry *hashEntries = (struct HashEntry*)malloc( HASH_BUCKETS *
sizeof(struct HashEntry) );
memset( hashEntries, 0, sizeof(struct HashEntry) );
char *dictionaryWord = dictionaryBuffer;
for( i=0; i<dictionarySize; i++ )
{
if( dictionaryBuffer[i] == '\n' )
{
dictionaryBuffer[i] = 0;
unsigned int hashVal = hash( dictionaryWord );
struct HashEntry *newEntry = &( hashEntries[hashVal] );
if( newEntry->word == 0 )
{
newEntry->word = dictionaryWord;
newEntry->count = 0;
newEntry->next = 0;
}
else
{
newEntry = (struct HashEntry*) malloc( sizeof( struct
HashEntry ) );
newEntry->word = dictionaryWord;
newEntry->count = 0;
newEntry->next = 0;
struct HashEntry *existingEntry = &( hashEntries[hashVal] );
while( existingEntry->next != 0 )
{
existingEntry = existingEntry->next;
}
existingEntry->next = newEntry;
}
dictionaryWord = dictionaryBuffer + i + 1;
}
}
stat( argv[1], &statBuf );
int inputFileSize = statBuf.st_size;
FILE *inputFile = fopen( argv[1], "r" );
char *inputBuffer = (char*)malloc( inputFileSize );
int counter=0;
while( fread( inputBuffer+(READ_SIZE*counter++), READ_SIZE, 1, inputFile )
);
// For every word in the input file, count those that are also in the
dictionary
char *inputWord = inputBuffer;
for( i=0; i<inputFileSize; i++ )
{
// Case insensitive
//inputBuffer[i] = tolower( inputBuffer[i] );
if( isStopChar( inputBuffer[i] ) )
{
inputBuffer[i] = 0;
while( isStopChar( inputBuffer[++i] ) && i < inputFileSize );
unsigned int hashVal = hash( inputWord );
struct HashEntry *entry = &( hashEntries[hashVal] );
if( entry->word )
{
if( !strcmp( entry->word, inputWord ) )
{
entry->count++;
}
else
{
while( (entry = entry->next) )
{
if( !strcmp( entry->word, inputWord ) )
{
entry->count++;
break;
}
}
}
}
inputWord = inputBuffer + i;
i--;
}
}
// Print results:
for( i=0; i<HASH_BUCKETS; i++ )
{
struct HashEntry *entry = &( hashEntries[i] );
if( entry->word )
{
if( entry->count > 0 )
printf( "%s: %d\n", entry->word, entry->count );
while( entry->next )
{
entry = entry->next;
if( entry->count > 0 )
printf( "%s: %d\n", entry->word, entry->count );
}
}
}
return 0;
}
/*
PLUG: http://plug.org, #utah on irc.freenode.net
Unsubscribe: http://plug.org/mailman/options/plug
Don't fear the penguin.
*/