Author: tack
Date: Wed Apr 19 22:10:36 2006
New Revision: 1425

Modified:
   trunk/base/src/db.py

Log:
Some minor optimizations to keyword searching.


Modified: trunk/base/src/db.py
==============================================================================
--- trunk/base/src/db.py        (original)
+++ trunk/base/src/db.py        Wed Apr 19 22:10:36 2006
@@ -36,7 +36,7 @@
         object_id       INTEGER,
         frequency       FLOAT
     );
-    CREATE INDEX words_map_word_idx ON words_map (word_id, rank, object_type);
+    CREATE INDEX words_map_word_idx ON words_map (word_id, rank, object_type, 
object_id);
     CREATE INDEX words_map_object_idx ON words_map (object_id, object_type);
     CREATE TRIGGER delete_words_map DELETE ON words_map
     BEGIN
@@ -1137,7 +1137,8 @@
             words[row[0]] = {
                 "word": row[1],
                 "count": row[2],
-                "idf_t": math.log(objectcount / row[2] + 1) + order_weight
+                "idf_t": math.log(objectcount / row[2] + 1) + order_weight,
+                "ids": {}
             }
             ids.append(row[0])
             # print "WORD: %s (%d), freq=%d/%d, idf_t=%f" % (row[1], row[0], 
row[2], objectcount, words[row[0]]["idf_t"])
@@ -1156,26 +1157,35 @@
             results[id] = {}
             state[id] = {
                 "offset": [0]*11,
-                "more": [True]*11
+                "more": [True]*11,
+                "count": 0,
+                "done": False
             }
 
         all_results = {}
         if limit == None:
             limit = objectcount
 
-        sql_limit = max(limit*3, 100)
+        sql_limit = min(limit*3, 200)
         finished = False
         nqueries = 0
 
+        # Keep a dict keyed on object_id that we can use to narrow queries
+        # once we have a full list of all objects that match a given word.
+        id_constraints = None
         while not finished:
             for rank in range(10, -1, -1):
                 for id in ids:
-                    if not state[id]["more"][rank]:
+                    if not state[id]["more"][rank] or state[id]["done"]:
+                        # If there's no more results at this rank, or we know
+                        # we've already seen all the results for this word, we
+                        # don't bother with the query.
                         continue
 
                     q = "SELECT object_type,object_id,frequency FROM " \
-                        "words_map WHERE word_id=? AND rank=? %s " \
+                        "words_map WHERE word_id=? AND rank=? %s %%s" \
                         "LIMIT ? OFFSET ?"
+
                     if object_type == None:
                         q %= ""
                         v = (id, rank, sql_limit, state[id]["offset"][rank])
@@ -1183,12 +1193,38 @@
                         q %= "AND object_type=?"
                         v = (id, rank, object_type, sql_limit, 
state[id]["offset"][rank])
 
+                    if id_constraints:
+                        # We know about all objects that match one or more of 
the other
+                        # search words, so we add the constraint that all rows 
for this 
+                        # word match the others as well.  Effectively we push 
the logic
+                        # to generate the intersection into the db.
+                        # This can't benefit from the index if object_type is 
not specified.
+                        q %= " AND object_id IN %s" % 
_list_to_printable(tuple(id_constraints))
+                    else:
+                        q %= ""
+
                     rows = self._db_query(q, v)
                     nqueries += 1
                     state[id]["more"][rank] = len(rows) == sql_limit
+                    state[id]["count"] += len(rows)
 
                     for row in rows:
                         results[id][row[0], row[1]] = row[2] * 
words[id]["idf_t"]
+                        words[id]["ids"][row[1]] = 1
+
+                    if state[id]["count"] >= words[id]["count"] or \
+                       (id_constraints and len(rows) == len(id_constraints)):
+                        # If we've now retrieved all objects for this word, or 
if
+                        # all the results we just got now intersect with our
+                        # constraints set, we're done this word and don't 
bother
+                        # querying it at other ranks.
+                        #print "Done word '%s' at rank %d" % 
(words[id]["word"], rank)
+                        state[id]["done"] = True
+                        if id_constraints is not None:
+                            id_constraints = 
id_constraints.intersection(words[id]["ids"])
+                        else:
+                            id_constraints = Set(words[id]["ids"])
+
 
                 # end loop over words
                 for r in reduce(lambda a, b: Set(a).intersection(Set(b)), 
results.values()):
@@ -1236,7 +1272,7 @@
                 # see if more exists at any rank, increasing offset and
                 # unsetting finished flag so we iterate again.
                 for rank in range(10, -1, -1):
-                    if state[id]["more"][rank]:
+                    if state[id]["more"][rank] and not state[id]["done"]:
                         state[id]["offset"][rank] += sql_limit
                         finished = False
 


-------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________
Freevo-cvslog mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/freevo-cvslog

Reply via email to