I thought I'd try implementing a Bloom filter in Python, just for fun,
since Python does have a nice abstraction for large bit vectors: the
long int, or bignum. Turns out it's grossly inefficient to use a single
long int for a Bloom filter because there's no way to set or clear bits
in it in place: Python longs are immutable, so every bit update copies
the entire number.
I wrote this on Beatrice's Mac with Python 2.3, which sucked, but was
better than not programming.
#!/usr/bin/python
"""Bloom filters in Python, using SHA-1 and Python longs.
My first attempt stored the whole filter in a single arbitrary-size integer,
but that was about 100x slower than storing it in a bunch of 256-bit
integers: Python's integers are immutable, so setting one bit rebuilds the
whole filter.
"""
import hashlib
import sha
def nbits_required(n):
    """Return the number of bits required to represent any integer in [0, n).

    nbits_required(0) and nbits_required(1) are both 0: no bits are needed
    to distinguish among one (or zero) possible values.
    """
    if n <= 0:
        # Guard: the old code looped forever here, because n - 1 is
        # negative and arithmetic right shifts never bring it to zero.
        return 0
    n -= 1
    rv = 0
    while n:
        n >>= 1
        rv += 1
    return rv
class Bloom:
    """Bloom filter: compact hash table for membership tests with false pos.

    The bit vector is stored as a list of `bucketbits`-wide integers rather
    than one huge integer: Python integers are immutable, so setting a bit
    in a single giant one copies the whole filter, while small buckets keep
    each update cheap.  May report false positives, never false negatives.
    """

    # default bits per bucket is 256 to cut down on pickle overhead
    def __init__(self, size, nhashes, bucketbits=256):
        """size: number of bits.  Should be a power of 2; _hashes() is
            substantially nonuniform otherwise.
        nhashes: number of separate hashes to use.
        Making nhashes larger will make it slower.  There are also tradeoffs
        between size, performance, and false-positive rate, which you can
        look up elsewhere.
        """
        self.bucketbits = bucketbits
        # Round up so all `size` bits fit; // keeps this an integer
        # division under both classic and true division.
        self.filter = [0] * ((size + bucketbits - 1) // bucketbits)
        self.size = size
        self.nhashes = nhashes
        self.hashbits = nbits_required(size)
        assert self.hashbits * nhashes <= 160  # 160 bits is all SHA-1 gives us

    def add(self, astr):
        """Add a string to the membership of the filter."""
        for offset in self._hashes(astr):
            bucket, bit = divmod(offset, self.bucketbits)
            self.filter[bucket] |= 1 << bit

    def __contains__(self, astr):
        """Returns true if the string is in the filter or it feels like it."""
        for offset in self._hashes(astr):
            bucket, bit = divmod(offset, self.bucketbits)
            if not self.filter[bucket] & (1 << bit):
                return 0  # a clear bit means definitely absent
        return 1  # all bits set: present, or a false positive

    def _hashes(self, astr):
        """The nhashes bit offsets derived from SHA-1 of a particular string."""
        # hashlib replaces the `sha` module, which was removed in Python 3.
        # Encoding only happens on Python 3, where str isn't bytes; ASCII
        # inputs therefore hash identically on both versions.
        if not isinstance(astr, bytes):
            astr = astr.encode('utf-8')
        digest = hashlib.sha1(astr).digest()
        # Fold the 20-byte digest into one integer, first byte most
        # significant; bytearray iteration yields ints on Python 2 and 3.
        hashlong = 0
        for byte in bytearray(digest):
            hashlong = (hashlong << 8) | byte
        rv = []
        mask = (1 << self.hashbits) - 1
        for ii in range(self.nhashes):
            # Note that this will give substantially nonuniform results if
            # self.size is not a power of 2, in order to avoid wasting hash
            # bits or doing long division:
            rv.append((hashlong & mask) % self.size)
            hashlong >>= self.hashbits
        return rv
def test_bloom():
    """Very basic sanity test for Bloom filter implementation."""
    def ok(a, b):
        assert a == b, (a, b)
    # list() so the comparisons hold whether map/filter return lists
    # (Python 2) or iterators (Python 3).
    ok(list(map(nbits_required, range(1, 10))), [0, 1, 2, 2, 3, 3, 3, 3, 4])
    ok(nbits_required(131072), 17)
    ok(nbits_required(131073), 18)
    b = Bloom(1024, 5)
    assert 'asdf' not in b
    assert 'fdsa' not in b
    b.add('asdf')
    assert 'asdf' in b
    assert 'fdsa' not in b
    # false positives (depends on hash function):
    x = Bloom(8, 3)
    x.add('asdf')  # about a 5% chance of false positives
    assert 'asdf' in x
    assert 'fdsa' not in x
    ok(list(filter(x.__contains__, ['foo%d' % ii for ii in range(25)])),
       ['foo22'])

test_bloom()
def misspellings(infile):
    """Demo: spell check.

    Prints (space-separated, to stdout) the words from infile that don't
    appear to be in /usr/share/dict/words, using a Bloom filter cached as
    a pickle in dict.pck.
    """
    import re, sys
    try:
        import cPickle as pickle  # Python 2; C-accelerated
    except ImportError:
        import pickle
    try:
        pck = open('dict.pck', 'rb')
        try:
            bf = pickle.load(pck)
        finally:
            pck.close()
        # EOFError covers a truncated or empty cache file, which the old
        # `except IOError` let crash the program.
    except (IOError, EOFError):
        # /usr/share/dict/words has 234936 words on this Mac and is 2.4 megs
        sys.stderr.write("reading dictionary...\n")
        words = open('/usr/share/dict/words')
        # 2^21 bits, 8.9 per word, would give us 1.5% false positives with 5
        # hashes or 1.7% with 6, so we use 4194304 = 2^22 bits, or 17.8 per
        # word, for 0.09% false positives; that's still only half a mebibyte,
        # although pickle overhead pushes it up to 559K, 22% of the dictionary.
        bf = Bloom(4194304, 5)
        lineno = 0
        for line in words:
            bf.add(line[:-1].lower())
            lineno += 1
            if not lineno % 100:  # progress indicator
                sys.stderr.write('%s%s\r' % (lineno, ' ' * 40))
                sys.stderr.flush()
        words.close()
        sys.stderr.write("done reading dictionary;\n")
        # Best-effort cache write: the filter works fine without it, so
        # ignore failures (read-only dir, full disk, unpicklable, ...) --
        # but unlike the old bare `except:`, don't swallow
        # KeyboardInterrupt/SystemExit.
        try:
            out = open('dict.pck', 'wb')
            try:
                pickle.dump(bf, out, 2)
            finally:
                out.close()
        except Exception:
            pass

    def candidates(word):
        """Words you might find in the dictionary in English."""
        yield word
        # plain suffix stripping ('s' was accidentally listed twice before)
        for suffix in ['s', 'ing', 'ed', 'es', 'er', 'ly']:
            if word.endswith(suffix):
                yield word[:-len(suffix)]
        # suffixes that replace a trailing letter
        for suffix, repl in [('ed', 'e'), ('er', 'e'), ('ing', 'e'),
                             ('ies', 'y'), ('ied', 'y')]:
            if word.endswith(suffix):
                yield word[:-len(suffix)] + repl

    for line in infile:
        for word in re.findall(r"['\w]+", line):
            # we drop the "'" because our dictionary has "didnt" but not
            # "didn't"
            for chance in candidates(word.replace("'", '').lower()):
                if chance in bf:
                    break
            else:
                sys.stdout.write(word + ' ')
                sys.stdout.flush()
                # to prevent repeating it...
                bf.add(word)
    sys.stdout.write('\n')
if __name__ == '__main__':
    import sys
    # open() instead of file(): file() was removed in Python 3, and open()
    # has been the preferred spelling since early Python 2.
    misspellings(open(sys.argv[1]))