On Feb 22, 2009, at 12:51 PM, Greg Landrum wrote:
BTW: looks like you forgot the attachment.

D'oh!

# Written by Andrew Dalke <da...@dalkescientific.com>
# Andrew Dalke Scientific, AB, Gothenburg, Sweden
# This document has been released to the public domain.

# Data set comes from CACTVS keys, generated by
# gzcat ~/databases/Compound_241*.sdf.gz | 
#   awk 'flg==1{id=$1;flg=0} /PUBCHEM_COMPOUND_CID/ {flg=1} /^AAAD.*==/ {print $1, id}' >
#   cactvs_keys.dat
# Result contains lines which look like:
#
# AAADccB6OAAHAAAAAAAAAAAAAAAAAAAAAAAwYAAAAAAAAAABQAAAHgIQAAAACA6hkCIwzoLABACIACXSWAKCCAAhJ0AIiABGb4gPJiPFs5/HOCjk1BHa6AeQQAAAACBAAACACBAAQIAAAQAQIAAAAAAAAA== 24100001
# AAADceB7MQBEAAAAAAAAAAAAAAAAAAAAAAAwYMEAAAAAAACBUAAAHwYQAAAADQqF2CizwIPAAAioAiVSdACCEAFlBxAJiAEAZsgIIDrB35GEIYhglADIyccYiMCOiABAAAACAAAQAIAAAAQAAAAAAAAAAA== 24100002

#### Here are results from running this for different data sets and queries
# The known hit comes from cid:24100068
# The non-hit comes from cid:24776000

# This test set (Compound_241*) has 98837 fingerprints.
# Trie uses 59 MB, bits use 14 MB
# Trie takes 11* the time of regular search for a known hit
"""
trie creation 8.29011487961
int creation 1.60190391541
trie search time 0.557299137115
int search time 0.0491080284119
int search finds 15
trie search finds 15
Are they the same? True
Number of fingerprints: 98837
"""

# Same data set, with a fingerprint which isn't in the data set
# Trie is 1.2* the speed of regular search (Note: no hits)
"""
trie creation 8.09642100334
int creation 1.41214299202
trie search time 0.0511682033539
int search time 0.0443210601807
int search finds 0
trie search finds 0
Are they the same? True
Number of fingerprints: 98837
"""

# Using Compound_241* and Compound_242*; together 198819 fingerprints
# Trie uses 105 MB, bits use 28 MB
# With a known hit, trie takes 10* the time of regular search
"""
trie creation 17.7659249306
int creation 3.26757502556
trie search time 0.725470066071
int search time 0.0877990722656
int search finds 30
trie search finds 30
Are they the same? True
Number of fingerprints: 198819
"""

# With a known negative, trie is 20% faster than regular search (NOTE! no hits)
"""
trie creation 17.7014229298
int creation 3.35999107361
trie search time 0.0733439922333
int search time 0.0909128189087
int search finds 0
trie search finds 0
Are they the same? True
Number of fingerprints: 198819
"""


import time

# Decode and grab the first 384 bits, to match Smellie's fp size
def decode_cactvs_fp(encoded_fp):
    """Return the first 384 bits (48 bytes) of a CACTVS fingerprint.

    Skips the 4-character prefix and base64-decodes the remainder,
    keeping only the leading 48 bytes to match Smellie's fp size.
    """
    decoded = encoded_fp[4:].decode("base64")
    # The order of the bits is very important.  Reversing the bytes
    # (decoded[::-1]) makes the trie search very slow: in reversed order
    # most fingerprints start with long runs of 0x00 bytes, so the trie
    # barely branches and the search does much more work.
    return decoded[:384 // 8]

# convert into a Python integer. This is the fastest way to
# do the substructure tests in pure Python.
def decode_cactvs_fp_as_int(encoded_fp):
    """Decode a CACTVS fingerprint to a Python integer (first 384 bits).

    Same decoding as decode_cactvs_fp(), with the 48 bytes then read as
    one big hex number.  An integer lets the substructure test be a
    single 'query & fp == query' bit operation, which is the fastest
    way to do it in pure Python.
    """
    # (An unreachable second 'return' with reversed byte order was dead
    # code and has been removed.)
    return int(encoded_fp[4:].decode("base64")[:384//8].encode("hex"), 16)
    

# This is the leaf node. All values in the node have the same
# fingerprint.  The "tail" is non-empty when there is only one leaf
# node under a given entry in a node. This occurs because most leaves
# have no siblings (Proof: that would require about 2**(384/8) ==
# 2**48 == a lot of structures)

# In that case, "tail" is a compact representation of the linear chain
# of nodes along with the leaf which would contain the same set of
# identifier values.

class Leaf(object):
    # memory optimization
    __slots__ = ["tail", "values"]

    def __init__(self, tail):
        self.tail = tail
        self.values = set()
    def dump(self, depth):
        print " "*depth, self.tail.encode("hex"),
        for value in sorted(self.values):
            print value,
        print


# A Node contains up to 2**8 == 256 branches, one for each possible byte value.
# The child can be another Node or a Leaf.
class Node(object):
    """Interior trie node: maps one fingerprint byte to a child Node or Leaf."""
    # memory optimization
    __slots__ = ["children"]

    def __init__(self):
        self.children = {}
    def add(self, text, value, offset=0):
        """Insert identifier 'value' for fingerprint 'text', branching on text[offset].

        New entries start life as a Leaf that holds the whole remaining
        suffix (path compression); a Leaf is only split into a Node when
        a second, different suffix arrives under the same byte.
        """
        c = text[offset]
        x = self.children.get(c)
        if x is None:
            # First fingerprint with this byte: store the rest of it
            # compactly in one Leaf instead of a chain of Nodes.
            self.children[c] = y = Leaf(text[offset+1:])
            y.values.add(value)
        elif isinstance(x, Node):
            x.add(text, value, offset+1)
        elif isinstance(x, Leaf):
            #print "compare", repr(x.tail), repr(text[offset+1:])
            if x.tail == text[offset+1:]:
                # Identical suffix: same fingerprint, just collect the id.
                x.values.add(value)
            elif not x.tail:
                # Defensive branch: an empty tail should only occur at
                # full depth (all fingerprints are the same length), so
                # the assert guards that invariant — TODO confirm this
                # branch is actually reachable given the equality test above.
                assert offset+1 == len(text)
                x.values.add(value)
            else:
                # Suffixes differ: split the Leaf.  Push the existing
                # leaf one level down under its first tail byte, then
                # re-add the new fingerprint into the fresh Node.
                c2 = x.tail[0]
                x.tail = x.tail[1:]
                self.children[c] = y = Node()
                y.children[c2]  = x
                y.add(text, value, offset+1)
        else:
            raise AssertionError
            
            
    # Useful for debugging
    def dump(self, depth=0):
        """Print the subtree, indented by depth; branch bytes shown as hex."""
        for k, v in sorted(self.children.items()):
            print " "*depth, k.encode("hex")
            v.dump(depth+1)

# When I descend the trie, I need to try all target patterns
# that contain the query pattern. I compute that once.

#  If the     Then the target
# query has    must have
# bitpattern   bitpattern
# =========   ==============
#    0            0, 1
#    1            1
#
#    00           00, 01, 10, 11
#    01           01, 11
#    10           10, 11
#    11           11

# Apply this to 8 bits at a time
# For each possible query byte, the (ascending) list of target bytes
# that contain every set bit of the query byte: query & target == query.
query_to_target_bits = dict(
    (chr(query_bits),
     [chr(target_bits) for target_bits in range(256)
      if query_bits & target_bits == query_bits])
    for query_bits in range(256))

# Sanity checks: an all-zero query byte matches every byte; an all-ones
# query byte matches only itself.
assert len(query_to_target_bits["\0"]) == 256
assert query_to_target_bits["\xff"] == ["\xff"]

def substructure_search_trie(node, text, offset=0):
    """Yield the value-sets of all fingerprints containing query 'text'.

    Walks the trie from 'node', descending only into children whose
    byte is a bit-superset of the query byte at 'offset'.  Yields each
    matching Leaf's 'values' set; the caller unions them together.
    """
    if isinstance(node, Leaf):
        # Compacted suffix: compare the remaining query bytes directly
        # against the stored tail.  A target byte matches when it holds
        # every bit of the query byte.  (The original tested membership
        # in query_to_target_bits[query_char] — an O(256) list scan per
        # byte; this ord() test is the same predicate in O(1).)
        for query_char, target_char in zip(text[offset:], node.tail):
            if ord(query_char) & ord(target_char) != ord(query_char):
                break
        else:
            yield node.values
    else:
        # Interior node: only superset bytes can lead to hits.
        for target_bits in query_to_target_bits[text[offset]]:
            child = node.children.get(target_bits)
            if child is not None:
                for values in substructure_search_trie(child, text, offset+1):
                    yield values

"""
# used during initial development
a = Node()
a.add("Andrew", "apd")
a.add("Axyzzy", "apd2")
a.add("Andrew", "apd3")
a.add("Andrez", "pad3")
a.dump()
print list(substructure_search_trie(a, "A\0\0\0\0\0"))
raise SystemExit()
"""

# Used to select only a few of the input structures while debugging;
# see the commented-out filter in read_fingerprints().  Currently disabled.
subset = set(['24100068', '24103053'])

def read_fingerprints():
    """Iterate (encoded_fp, id) pairs from cactvs_keys.dat.

    Each line looks like '<base64 CACTVS fp> <compound id>' (see the
    awk pipeline at the top of the file).
    """
    # 'with' closes the file deterministically when iteration finishes,
    # instead of leaving the handle to the garbage collector.
    with open("cactvs_keys.dat") as infile:
        for line in infile:
            encoded_fp, cid = line.split()
            ## Used when trying to narrow down the differences between the two searches
            #if cid not in subset:
            #    continue
            yield encoded_fp, cid
            

def create_trie():
    """Build and return the fingerprint trie for the whole data file."""
    trie_root = Node()
    for encoded, ident in read_fingerprints():
        trie_root.add(decode_cactvs_fp(encoded), ident)
    return trie_root

# This is the data for doing a regular brute-force screening
def create_bits():
    """Return a list of (fingerprint-as-int, id) pairs for brute-force screening."""
    # List comprehension replaces the original append loop (same order,
    # same contents).
    return [(decode_cactvs_fp_as_int(encoded_fp), cid)
            for encoded_fp, cid in read_fingerprints()]

# Brute-force searching.
def substructure_search_int(int_fp_data, query):
    """Return the ids whose integer fingerprint contains all bits of 'query'.

    A target passes the substructure screen when query & target == query.
    Preserves the input order of int_fp_data.
    """
    # Comprehension replaces the append loop; 'cid' avoids shadowing
    # the builtin id().
    return [cid for fp, cid in int_fp_data if query & fp == query]

# ---- Benchmark driver ----
# Build both data structures, run the same query through each, and
# compare timings and results.

# I used 'top' to get the size from VSIZE; manually
#size1 = raw_input("Get size: ")
t1 = time.time()
root = create_trie()
t2 = time.time()
#root.dump()
#size2 = raw_input("Get end size: ")
#print float(size2)-float(size1)
print "trie creation", t2-t1

#size1 = raw_input("Get size: ")
#size1=size2
t1 = time.time()
int_fp_data = create_bits()
t2 = time.time()
#size2 = raw_input("Get end size: ")
#print float(size2)-float(size1)
print "int creation", t2-t1

#a.dump()

# NOTE: the second assignment below overrides the first — comment one
# out to switch between the no-hit query and the known-hit query.

# This is cid:24776000; does not exist in my data set
encoded_query = "AAADceB7MYAEAAAAAAAAAAAAAAAAAAAAAAA8eIEAAAAAAAABQAAAHwIQAAAADwLBmCwwAIPAAACIAiFSEACCAAAgBQAIiAEIBogIIDKBlxGEIAhglCCIiAcYi8CPhAAAAAAAAAAIAAAAAAAAAAAAAAAAAA=="

# This is cid:24100068; exists in my data set
encoded_query = "AAADceBzMABAAAAAAAAAAAAAAAAAAWLAAAAsAAAAAAAAAAABgAAAHgQAAAAACCjh1gYCiRMIFAisAQVwXACA8KBFCDgAGBU4REACABpgwAANAYhABgDwAUAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=="


# Decode the query once in each representation.
query_bytes = decode_cactvs_fp(encoded_query)
#print "This is the query"
#print query_bytes.encode("hex")
query_int = decode_cactvs_fp_as_int(encoded_query)

# Trie search: union the value-sets yielded for each matching leaf.
t1 = time.time()
trie_hits = set()
for hits in substructure_search_trie(root, query_bytes):
    trie_hits |= hits
t2 = time.time()
print "trie search time", t2-t1

# Brute-force integer search over every fingerprint.
t1 = time.time()
int_hits = set(substructure_search_int(int_fp_data, query_int))
t2 = time.time()
print "int search time", t2-t1

# Both searches must find exactly the same compound ids.
print "int search finds", len(int_hits)
print "trie search finds", len(trie_hits)
print "Are they the same?", int_hits == trie_hits
#print sorted(int_hits)
#print sorted(trie_hits)
assert int_hits == trie_hits

print "Number of fingerprints:", len(int_fp_data)



                                Andrew
                                da...@dalkescientific.com


Reply via email to