Author: j16sdiz
Date: 2008-12-15 02:42:56 +0000 (Mon, 15 Dec 2008)
New Revision: 24337

Modified:
   trunk/plugins/XMLSpider/XMLSpider.java
Log:
Store word/md5 in db4o
(has a synchronization problem; will fix when we get full db4o)

Modified: trunk/plugins/XMLSpider/XMLSpider.java
===================================================================
--- trunk/plugins/XMLSpider/XMLSpider.java      2008-12-14 23:17:17 UTC (rev 
24336)
+++ trunk/plugins/XMLSpider/XMLSpider.java      2008-12-15 02:42:56 UTC (rev 
24337)
@@ -21,7 +21,6 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.TreeMap;
 import java.util.Vector;
 import java.util.concurrent.atomic.AtomicLong;
 
@@ -133,14 +132,25 @@
                }
        }
 
+       static class Term { // word/md5 pair; instances are stored via db.store(new Term(word))
+               /** Hex MD5 of {@link #word}, computed by MD5(String) in the one-arg constructor */
+               String md5;
+               /** The term (word) text itself */
+               String word;
+
+               public Term(String word) { // keeps the word and derives md5 from it
+                       this.word = word;
+                       md5 = MD5(word);
+               }
+
+               public Term() { // leaves both fields unset; presumably for db4o (callConstructor(true)) - confirm
+               }
+       }
+       
        /** Document ID of fetching documents */
        protected Map<Page, ClientGetter> runningFetch = new HashMap<Page, 
ClientGetter>();
 
        long tProducedIndex;
-       /**
-        * Stores the found words along with md5
-        */
-       public TreeMap<String, String> tMap = new TreeMap<String, String>();
        protected AtomicLong maxPageId;
        
        private final HashMap<String, Long[]> idsByWord = new HashMap<String, 
Long[]>();
@@ -557,21 +567,25 @@
                        return;
                }
 
+               Query query = db.query();
+               query.constrain(Term.class);
+               query.descend("md5").orderAscending();
+               ObjectSet<Term> termSet = query.execute();
+               
                indices = new Vector<String>();
-               int prefix = (int)(( Math.log(tMap.size()) - 
Math.log(MAX_ENTRIES) ) / Math.log(16)) - 1;
+               int prefix = (int) ((Math.log(termSet.size()) - 
Math.log(MAX_ENTRIES)) / Math.log(16)) - 1;
                if (prefix <= 0) prefix = 1;
                match = 1;
                Vector<String> list = new Vector<String>();
-               Iterator<String> it = tMap.keySet().iterator();
 
-               String str = it.next();
+               String str = termSet.get(0).md5;
                String currentPrefix = str.substring(0, prefix);
                list.add(str);
                                
                int i = 0;
-               while(it.hasNext())
+               for (Term term : termSet)
                {
-                       String key = it.next();
+                       String key = term.md5;
                        //create a list of the words to be added in the same 
subindex
                        if (key.startsWith(currentPrefix)) 
                        {i++;
@@ -695,7 +709,7 @@
                for(int i =0;i<list.size();i++)
                {
                        Element wordElement = xmlDoc.createElement("word");
-                       String str = tMap.get(list.get(i));
+                       String str = getTermByMd5(list.get(i)).word;
                        wordElement.setAttribute("v",str );
                        Long[] idsForWord = idsByWord.get(str);
                        for (int j = 0; j < idsForWord.length; j++) {
@@ -797,14 +811,19 @@
        /*
         * calculate the md5 for a given string
         */
-       private static String MD5(String text) throws NoSuchAlgorithmException, 
UnsupportedEncodingException  {
-               MessageDigest md;
-               md = MessageDigest.getInstance("MD5");
-               byte[] md5hash = new byte[32];
-               byte[] b = text.getBytes("UTF-8");
-               md.update(b, 0, b.length);
-               md5hash = md.digest();
-               return convertToHex(md5hash);
+       private static String MD5(String text) {
+               try {
+                       MessageDigest md = MessageDigest.getInstance("MD5");
+                       byte[] md5hash = new byte[32];
+                       byte[] b = text.getBytes("UTF-8");
+                       md.update(b, 0, b.length);
+                       md5hash = md.digest();
+                       return convertToHex(md5hash);
+               } catch (UnsupportedEncodingException e) {
+                       throw new RuntimeException("UTF-8 not supported", e);
+               } catch (NoSuchAlgorithmException e) {
+                       throw new RuntimeException("MD5 not supported", e);
+               }
        }
 
        public void generateSubIndex(String filename){
@@ -1324,7 +1343,10 @@
                                idsByWord.put(word, newIDs);
                        }
 
-                       tMap.put(MD5(word), word);
+                       synchronized (db) {
+                                       if (getTermByWord(word) == null)
+                                               db.store(new Term(word));
+                               }
                        //long time_indexing = System.currentTimeMillis();
 //                     FileWriter outp = new FileWriter("logfile",true);
                        mustWriteIndex = true;
@@ -1422,10 +1444,20 @@
                cfg.objectClass(Page.class).cascadeOnActivate(true);
                cfg.objectClass(Page.class).cascadeOnUpdate(true);
                cfg.objectClass(Page.class).cascadeOnDelete(true);
+               
+               //- Term
+               cfg.objectClass(Term.class).objectField("md5").indexed(true);
+               cfg.objectClass(Term.class).objectField("word").indexed(true);
 
+               cfg.objectClass(Term.class).callConstructor(true);
+
+               cfg.objectClass(Term.class).cascadeOnActivate(true);
+               cfg.objectClass(Term.class).cascadeOnUpdate(true);
+               cfg.objectClass(Term.class).cascadeOnDelete(true);
+
                //- Other
-               cfg.activationDepth(4);
-               cfg.updateDepth(4);
+               cfg.activationDepth(1);
+               cfg.updateDepth(1);
                cfg.queries().evaluationMode(QueryEvaluationMode.LAZY);
                cfg.diagnostic().addListener(new DiagnosticToConsole());
 
@@ -1461,4 +1493,28 @@
                else
                        return null;
        }
+       
+       protected Term getTermByMd5(String md5) { // returns a stored Term whose md5 field matches, or null
+               Query query = db.query();
+               query.constrain(Term.class); // restrict the query to Term objects
+               query.descend("md5").constrain(md5); // match on the md5 field
+               ObjectSet<Term> set = query.execute();
+
+               if (set.hasNext())
+                       return set.next(); // only the first match is returned; further matches are ignored
+               else
+                       return null; // no Term with this md5
+       }
+
+       protected Term getTermByWord(String word) { // returns a stored Term whose word field matches, or null
+               Query query = db.query();
+               query.constrain(Term.class); // restrict the query to Term objects
+               query.descend("word").constrain(word); // match on the word field
+               ObjectSet<Term> set = query.execute();
+
+               if (set.hasNext())
+                       return set.next(); // only the first match is returned; further matches are ignored
+               else
+                       return null; // no Term with this word
+       }
 }

_______________________________________________
cvs mailing list
[email protected]
http://emu.freenetproject.org/cgi-bin/mailman/listinfo/cvs

Reply via email to