On Feb 22, 2009, at 12:51 PM, Greg Landrum wrote:
BTW: looks like you forgot the attachment.
D'oh!
# Written by Andrew Dalke <da...@dalkescientific.com> # Andrew Dalke Scientific, AB, Gothenburg, Sweden # This document has been released to the public domain. # Data set comes CACTVS keys, generated by # gzcat ~/databases/Compound_241*.sdf.gz | # awk 'flg==1{id=$1;flg=0} /PUBCHEM_COMPOUND_CID/ {flg=1} /^AAAD.*==/ {print $1, id}' > # cactvs_keys.dat # Result contains lines which look like: # # AAADccB6OAAHAAAAAAAAAAAAAAAAAAAAAAAwYAAAAAAAAAABQAAAHgIQAAAACA6hkCIwzoLABACIACXSWAKCCAAhJ0AIiABGb4gPJiPFs5/HOCjk1BHa6AeQQAAAACBAAACACBAAQIAAAQAQIAAAAAAAAA== 24100001 # AAADceB7MQBEAAAAAAAAAAAAAAAAAAAAAAAwYMEAAAAAAACBUAAAHwYQAAAADQqF2CizwIPAAAioAiVSdACCEAFlBxAJiAEAZsgIIDrB35GEIYhglADIyccYiMCOiABAAAACAAAQAIAAAAQAAAAAAAAAAA== 24100002 #### Here are results from running this for different data sets and queries # The known hit comes from cid:24100068 # The non-hit comes from cid:24776000 # This test set (Compound_241*) has 98837 fingerprints. # Trie uses 59 MB, bits use 14 MB # Trie takes 11* time of regular search for a known hit """ trie creation 8.29011487961 int creation 1.60190391541 trie search time 0.557299137115 int search time 0.0491080284119 int search finds 15 trie search finds 15 Are they the same? True Number of fingerprints: 98837 """ # Same data set, with a fingerprint which isn't in the data set # Trie is 1.2* the speed of regular search (Note: no hits) """ trie creation 8.09642100334 int creation 1.41214299202 trie search time 0.0511682033539 int search time 0.0443210601807 int search finds 0 trie search finds 0 Are they the same? True Number of fingerprints: 98837 """ # Using Compound_241* and Compound_242*; together 198819 fingerprints # Trie uses 105 MB, bits use 28 MB # With a known hit, trie takes 10* the time of regular search """ trie creation 17.7659249306 int creation 3.26757502556 trie search time 0.725470066071 int search time 0.0877990722656 int search finds 30 trie search finds 30 Are they the same? 
True Number of fingerprints: 198819 """ # With a known negative, trie is 20% faster than regular search (NOTE! no hits) """ trie creation 17.7014229298 int creation 3.35999107361 trie search time 0.0733439922333 int search time 0.0909128189087 int search finds 0 trie search finds 0 Are they the same? True Number of fingerprints: 198819 """ import time # Decode and grab the first 384 bits, to match Smellie's fp size def decode_cactvs_fp(encoded_fp): return encoded_fp[4:].decode("base64")[:384//8] # The order of the bits is very important. Reverse the bits and the trie search time is very slow # This is because in reverse order most fingerprints have a lot of leading 0000000000s # No branching occurs, so more work during the search. #return encoded_fp[4:].decode("base64")[::-1][:384//8] # convert into a Python integer. This is the fastest way to # do the substructure tests in pure Python. def decode_cactvs_fp_as_int(encoded_fp): return int(encoded_fp[4:].decode("base64")[:384//8].encode("hex"), 16) return int(encoded_fp[4:].decode("base64")[::-1][:384//8].encode("hex"), 16) # This is the leaf node. All values in the node have the same # fingerprint. The "tail" is non-empty when there is only one leaf # node under a given entry in a node. This occurs because most leaves # have no siblings (Proof: that would require about 2**(384/8) == # 2**48 == a lot of structures) # In that case, "tail" is a compact representation of the linear chain # of nodes along with the leaf which would contain the same set of # identifier values. class Leaf(object): # memory optimization __slots__ = ["tail", "values"] def __init__(self, tail): self.tail = tail self.values = set() def dump(self, depth): print " "*depth, self.tail.encode("hex"), for value in sorted(self.values): print value, print # A Node contains up to 2**8 == 256 branches, one for each possible byte value. # The child can be another Node or a Leaf. 
class Node(object):
    """Interior trie node: maps one fingerprint byte to a child Node or Leaf."""
    # memory optimization
    __slots__ = ["children"]
    def __init__(self):
        # children: byte (1-char string) -> Node or Leaf
        self.children = {}
    def add(self, text, value, offset=0):
        # Insert 'value' under the fingerprint bytes text[offset:].
        # Assumes all fingerprints added to one trie have the same length.
        c = text[offset]
        x = self.children.get(c)
        if x is None:
            # No child for this byte yet: the rest of the fingerprint
            # becomes the compact tail of a brand-new Leaf.
            self.children[c] = y = Leaf(text[offset+1:])
            y.values.add(value)
        elif isinstance(x, Node):
            # Interior node already exists: descend one byte.
            x.add(text, value, offset+1)
        elif isinstance(x, Leaf):
            #print "compare", repr(x.tail), repr(text[offset+1:])
            if x.tail == text[offset+1:]:
                # Identical fingerprint: share the existing leaf.
                x.values.add(value)
            elif not x.tail:
                # Empty-tail leaf: only legal at the very end of the
                # fingerprint, given the equal-length assumption.
                assert offset+1 == len(text)
                x.values.add(value)
            else:
                # Tails diverge: split off the leaf's first tail byte into
                # a new interior node, then retry the insert one level down.
                c2 = x.tail[0]
                x.tail = x.tail[1:]
                self.children[c] = y = Node()
                y.children[c2] = x
                y.add(text, value, offset+1)
        else:
            raise AssertionError
    # Useful for debugging
    def dump(self, depth=0):
        for k, v in sorted(self.children.items()):
            print " "*depth, k.encode("hex")
            v.dump(depth+1)

# When I descend the trie, I need to try all target patterns
# contain the query pattern.  I compute that once.
# If the         Then the target
# query has      must have
# bitpattern     bitpattern
# =========      ==============
# 0              0, 1
# 1              1
#
# 00             00, 01, 10, 11
# 01             01, 11
# 10             10, 11
# 11             11
# Apply this to 8 bits at a time

# query_to_target_bits: query byte -> list of all 256-or-fewer target bytes
# that have every query bit set (i.e. the bytes a superset match may contain).
query_to_target_bits = {}
for query_bits in range(256):
    for target_bits in range(256):
        if query_bits & target_bits == query_bits:
            query_to_target_bits.setdefault(chr(query_bits), []).append(chr(target_bits))
# Sanity checks: a zero query byte matches anything; an all-ones query
# byte matches only the all-ones target byte.
assert len(query_to_target_bits["\0"]) == 256
assert query_to_target_bits["\xff"] == ["\xff"]

def substructure_search_trie(node, text, offset=0):
    # Generator: yield the 'values' set of every leaf whose fingerprint is a
    # superset of the query fingerprint 'text' (byte-wise bit containment).
    if isinstance(node, Leaf):
        #print "Have node", repr(node.tail), repr(text[offset:])
        # Compare the rest of the query against the leaf's compact tail.
        for query_char, target_char in zip(text[offset:], node.tail):
            if target_char not in query_to_target_bits[query_char]:
                #print "They differ at", query_char.encode("hex"), target_char.encode("hex")
                #print text[offset:].encode("hex")
                #print node.tail.encode("hex")
                break
        else:
            # for/else: no byte failed containment, so this leaf matches.
            yield node.values
    else:
        # Interior node: descend into every child byte that could still
        # contain the current query byte.
        for target_bits in query_to_target_bits[text[offset]]:
            child = node.children.get(target_bits)
            if child is not None:
                #print "Try child", repr(target_bits)
                for values in substructure_search_trie(child, text, offset+1):
                    yield values
                #print "Done with child", repr(target_bits)

"""
# used during initial development
a = Node()
a.add("Andrew", "apd")
a.add("Axyzzy", "apd2")
a.add("Andrew", "apd3")
a.add("Andrez", "pad3")
a.dump()
print list(substructure_search_trie(a, "A\0\0\0\0\0"))
raise SytemExit()
"""

# Used to select only a few of the input structures
# Currently disabled.
subset = set(['24100068', '24103053'])

def read_fingerprints():
    # Generator: yield (encoded_fp, id) pairs from the whitespace-separated
    # two-column file produced by the awk command at the top of this file.
    for line in open("cactvs_keys.dat"):
        encoded_fp, id = line.split()
        ## Used when trying to narrow down the differences between the two searches
        #if id not in subset:
        #    continue
        yield encoded_fp, id

def create_trie():
    # Build the byte trie over all decoded fingerprints; ids are the values.
    root = Node()
    for encoded_fp, id in read_fingerprints():
        fp = decode_cactvs_fp(encoded_fp)
        root.add(fp, id)
    return root

# This is the data for doing a regular brute-force screening
def create_bits():
    # Return a list of (fingerprint-as-int, id) pairs for the linear scan.
    int_fp_data = []
    for encoded_fp, id in read_fingerprints():
        fp = decode_cactvs_fp_as_int(encoded_fp)
        int_fp_data.append( (fp, id) )
    return int_fp_data

# Brute-force searching.
def substructure_search_int(int_fp_data, query): hits = [] for (fp, id) in int_fp_data: if query & fp == query: hits.append(id) return hits # I used 'top' to get the size from VSIZE; manually #size1 = raw_input("Get size: ") t1 = time.time() root = create_trie() t2 = time.time() #root.dump() #size2 = raw_input("Get end size: ") #print float(size2)-float(size1) print "trie creation", t2-t1 #size1 = raw_input("Get size: ") #size1=size2 t1 = time.time() int_fp_data = create_bits() t2 = time.time() #size2 = raw_input("Get end size: ") #print float(size2)-float(size1) print "int creation", t2-t1 #a.dump() # This is cid:24776000; does not exist in my data set encoded_query = "AAADceB7MYAEAAAAAAAAAAAAAAAAAAAAAAA8eIEAAAAAAAABQAAAHwIQAAAADwLBmCwwAIPAAACIAiFSEACCAAAgBQAIiAEIBogIIDKBlxGEIAhglCCIiAcYi8CPhAAAAAAAAAAIAAAAAAAAAAAAAAAAAA==" # This is cid:24100068; exists in my data set encoded_query = "AAADceBzMABAAAAAAAAAAAAAAAAAAWLAAAAsAAAAAAAAAAABgAAAHgQAAAAACCjh1gYCiRMIFAisAQVwXACA8KBFCDgAGBU4REACABpgwAANAYhABgDwAUAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA==" query_bytes = decode_cactvs_fp(encoded_query) #print "This is the query" #print query_bytes.encode("hex") query_int = decode_cactvs_fp_as_int(encoded_query) t1 = time.time() trie_hits = set() for hits in substructure_search_trie(root, query_bytes): trie_hits |= hits t2 = time.time() print "trie search time", t2-t1 t1 = time.time() int_hits = set(substructure_search_int(int_fp_data, query_int)) t2 = time.time() print "int search time", t2-t1 print "int search finds", len(int_hits) print "trie search finds", len(trie_hits) print "Are they the same?", int_hits == trie_hits #print sorted(int_hits) #print sorted(trie_hits) assert int_hits == trie_hits print "Number of fingerprints:", len(int_fp_data)
Andrew da...@dalkescientific.com