Like other things posted here without notices to the contrary, this code is in the public domain.
#!/usr/bin/python
"""Slow birthday-attack program for finding partial SHA-1 collisions.

You can find partial hash collisions by sorting the hashes and then
walking through the sorted lists looking for successive entries with
long common prefixes.  I was going to write a program to do that, and
had gotten as far as doing the hashjoin bit, but then I realized the
problem was profoundly irrelevant to my life.

Does only about 1000 hashes per second.  Can practically find partial
hash collisions up to 32 bits without much trouble on my 128MB-RAM
laptop.  But its data access pattern is pretty random, so it degrades
spectacularly once it starts hitting swap.

Ported from Python 2: the ``sha`` and ``bsddb`` modules were removed
in Python 3; ``hashlib.sha1`` replaces ``sha.new``.
"""
import hashlib
import sys

# NOTE(review): the original imported bsddb for the commented-out
# on-disk dict below; bsddb is gone in Python 3 (``dbm`` is the
# closest stdlib replacement if that experiment is ever revived).


def strings(maxlen):
    """Yield byte strings of length 0..maxlen.

    Reproduces the original recursion exactly, including its order and
    its duplicates for lengths < maxlen (later dict stores simply
    overwrite earlier ones, so the repeats are harmless here).
    """
    if maxlen == 0:
        yield b''
    else:
        for item in strings(maxlen - 1):
            yield item
        for num in range(256):
            char = bytes([num])
            for item in strings(maxlen - 1):
                yield char + item


def smartcollide(a, b, minlen):
    """Find suffixes giving sha1(a+sa) and sha1(b+sb) a common hex prefix.

    a, b: prefix strings (str is encoded as UTF-8; bytes used as-is).
    minlen: number of leading hex digits (4 bits each) that must match.
    Returns (a + sa, b + sb, count) where the first two are bytes whose
    SHA-1 hex digests agree in their first ``minlen`` characters and
    ``count`` is the number of hashes computed.
    Raises RuntimeError if the (astronomically large) suffix space is
    exhausted without a match.
    """
    if isinstance(a, str):
        a = a.encode('utf-8')
    if isinstance(b, str):
        b = b.encode('utf-8')
    count = 0
    # Hash the fixed prefixes once; each candidate suffix is hashed
    # from a copy() of these partial states.
    ahasher, bhasher = hashlib.sha1(a), hashlib.sha1(b)
    ahits = {}  # hex-digest prefix -> suffix that was appended to a
    # A bsddb.hashopen('collider.db', 'n') dict here made it 5-7x
    # slower, but may degrade more gracefully once it exceeds memory
    # size; no evidence yet that it helps.
    for eachstring in strings(minlen * 4):
        aht = ahasher.copy()
        aht.update(eachstring)
        ahits[aht.hexdigest()[:minlen]] = eachstring
        bht = bhasher.copy()
        bht.update(eachstring)
        count += 2
        bkey = bht.hexdigest()[:minlen]  # hoisted: was computed twice
        if bkey in ahits:  # dict.has_key was removed in Python 3
            return a + ahits[bkey], b + eachstring, count
    # The original wrapped the loop in ``while 1``, but retrying the
    # identical deterministic sequence could never make progress, so
    # exhaustion was a silent infinite loop.  Fail loudly instead.
    raise RuntimeError('search space exhausted without a collision')


def main(maxlen, str1, str2, collider):
    """Print a partial collision for every prefix length 0..maxlen-1."""
    for minlen in range(maxlen):
        a, b, count = collider(str1, str2, minlen)
        ahash = hashlib.sha1(a).hexdigest()
        bhash = hashlib.sha1(b).hexdigest()
        print((a, ahash, b, bhash, count))


if __name__ == '__main__':
    main(11, sys.argv[1], sys.argv[2], smartcollide)