Author: j16sdiz
Date: 2008-12-15 05:58:44 +0000 (Mon, 15 Dec 2008)
New Revision: 24345
Modified:
trunk/plugins/XMLSpider/XMLSpider.java
Log:
Move everything into db4o; please test. Optimization will follow.
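
For reviewers unfamiliar with db4o, here is a minimal, self-contained sketch (illustrative only, not part of the commit) of the store-and-query pattern the diff below switches to. It assumes the same db4o API the patch itself uses (db.query(), db.store()); the nested class mirrors the new TermPosition, and the database file name and sample values are placeholders.

    import com.db4o.Db4o;
    import com.db4o.ObjectContainer;
    import com.db4o.ObjectSet;
    import com.db4o.query.Query;

    public class TermPositionSketch {
        // Same shape as the nested TermPosition class added in the diff.
        static class TermPosition {
            String word;
            long pageId;
            int[] positions;
        }

        public static void main(String[] args) {
            // Placeholder database file; XMLSpider configures and opens its own container.
            ObjectContainer db = Db4o.openFile("termpos-sketch.db4o");
            try {
                // Store one record, as addWord() now does via db.store(termPos).
                TermPosition tp = new TermPosition();
                tp.word = "freenet";
                tp.pageId = 42L;
                tp.positions = new int[] { 3, 17 };
                db.store(tp);

                // Look it up by word and page id, as getTermPosition() does.
                Query query = db.query();
                query.constrain(TermPosition.class);
                query.descend("word").constrain("freenet");
                query.descend("pageId").constrain(42L);
                ObjectSet<TermPosition> set = query.execute();
                if (set.hasNext())
                    System.out.println(set.next().positions.length + " positions stored");
            } finally {
                db.close();
            }
        }
    }
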
Modified: trunk/plugins/XMLSpider/XMLSpider.java
===================================================================
--- trunk/plugins/XMLSpider/XMLSpider.java 2008-12-15 04:20:26 UTC (rev 24344)
+++ trunk/plugins/XMLSpider/XMLSpider.java 2008-12-15 05:58:44 UTC (rev 24345)
@@ -147,14 +147,24 @@
}
}
+ static class TermPosition {
+ /** Term */
+ String word;
+ /** Page id */
+ long pageId;
+ /** Position List */
+ int[] positions;
+
+ public TermPosition() {
+ }
+ }
+
/** Document ID of fetching documents */
protected Map<Page, ClientGetter> runningFetch = new HashMap<Page, ClientGetter>();
long tProducedIndex;
protected AtomicLong maxPageId;
-
- private final HashMap<String, Long[]> idsByWord = new HashMap<String, Long[]>();
-
+
private Vector<String> indices;
private int match;
private long time_taken;
@@ -190,9 +200,6 @@
private static final String indexOwner = "Freenet";
private static final String indexOwnerEmail = null;
-// private final HashMap lastPositionByURI = new HashMap(); /* String (URI) -> Integer */ /* Use to determine word position on each uri */
-// private final HashMap positionsByWordByURI = new HashMap(); /* String (URI) -> HashMap (String (word) -> Integer[] (Positions)) */
- private final HashMap<Long, HashMap<String, Integer[]>> positionsByWordById = new HashMap<Long, HashMap<String, Integer[]>>();
// Can have many; this limit only exists to save memory.
private static final int maxParallelRequests = 100;
private int maxShownURIs = 15;
@@ -361,8 +368,16 @@
*/
public void onSuccess(FetchResult result, ClientGetter state, Page page) {
synchronized (this) {
+ while (writingIndex && !stopped) {
+ try {
+ wait();
+ } catch (InterruptedException e) {
+ return;
+ }
+ }
+
if (stopped)
- return;
+ return;
}
FreenetURI uri = state.getURI();
@@ -449,10 +464,6 @@
// Produce the main index file.
Logger.minor(this, "Producing top index...");
- if (idsByWord.isEmpty()) {
- System.out.println("No URIs with words");
- return;
- }
//the main index file
File outputFile = new File(DEFAULT_INDEX_DIR+"index.xml");
// Use a stream so we can explicitly close - minimise number of filehandles used.
@@ -568,11 +579,6 @@
*/
private synchronized void makeSubIndices() throws Exception{
Logger.normal(this, "Generating index...");
- //using the tMap generate the xml indices
- if (idsByWord.isEmpty()) {
- System.out.println("No URIs with words");
- return;
- }
Query query = db.query();
query.constrain(Term.class);
@@ -716,19 +722,21 @@
for(int i =0;i<list.size();i++)
{
Element wordElement = xmlDoc.createElement("word");
- String str = getTermByMd5(list.get(i)).word;
- wordElement.setAttribute("v",str );
- Long[] idsForWord = idsByWord.get(str);
- for (int j = 0; j < idsForWord.length; j++) {
- Long id = idsForWord[j];
- Long x = id;
- if (x == null) {
- Logger.error(this, "Eh?");
- continue;
- }
+ Term term = getTermByMd5(list.get(i));
+ wordElement.setAttribute("v", term.word);
+
+ Query query = db.query();
+ query.constrain(TermPosition.class);
+
+ query.descend("word").constrain(term.word);
+ ObjectSet<TermPosition> set = query.execute();
+
+ for (TermPosition termPos : set) {
+ synchronized (termPos) {
+ Page page = getPageById(termPos.pageId);
+
+ synchronized (page) {
- Page page = getPageById(id);
-
/*
* adding file information
* uriElement - lists the id of the file containing a particular word
@@ -736,29 +744,30 @@
*/
Element uriElement = xmlDoc.createElement("file");
Element fileElement = xmlDoc.createElement("file");
- uriElement.setAttribute("id", x.toString());
- fileElement.setAttribute("id", x.toString());
+ uriElement.setAttribute("id", Long.toString(page.id));
+ fileElement.setAttribute("id", Long.toString(page.id));
fileElement.setAttribute("key", page.uri);
fileElement.setAttribute("title",
page.pageTitle != null ? page.pageTitle : page.uri);
/* Position by position */
-
- HashMap<String, Integer[]> positionsForGivenWord = positionsByWordById.get(x);
- Integer[] positions = (Integer[])positionsForGivenWord.get(str);
+ int[] positions = termPos.positions;
+
StringBuilder positionList = new StringBuilder();
for(int k=0; k < positions.length ; k++) {
if(k!=0)
positionList.append(',');
- positionList.append(positions[k].toString());
+ positionList.append(positions[k]);
}
uriElement.appendChild(xmlDoc.createTextNode(positionList.toString()));
wordElement.appendChild(uriElement);
- if(!fileid.contains(x))
+ if (!fileid.contains(page.id))
{
- fileid.add(x);
+ fileid.add(page.id);
filesElement.appendChild(fileElement);
}
+ }
+ }
}
keywordsElement.appendChild(wordElement);
}
@@ -1245,9 +1254,9 @@
word = word.intern();
try{
if(type == null)
- addWord(word, lastPosition.intValue() + i, page.id);
+ addWord(word, lastPosition.intValue() + i);
else
- addWord(word, -1 * (i + 1), page.id);
+ addWord(word, -1 * (i + 1));
}
catch (Exception e){}
}
@@ -1257,55 +1266,22 @@
}
}
- private void addWord(String word, int position, Long id) throws Exception {
- synchronized(XMLSpider.this) {
- if (word.length() < 3)
- return;
+ private void addWord(String word, int position) throws Exception {
+ if (word.length() < 3)
+ return;
+ Term term = getTermByWord(word, true);
+ TermPosition termPos = getTermPosition(term, page, true);
- Long[] ids = idsByWord.get(word);
+ synchronized (termPos) {
+ int[] newPositions = new int[termPos.positions.length + 1];
+ System.arraycopy(termPos.positions, 0, newPositions, 0, termPos.positions.length);
+ newPositions[termPos.positions.length] = position;
- /* Word position indexation */
- HashMap<String, Integer[]> wordPositionsForOneUri = positionsByWordById.get(id);
- /* For a given URI , take as key a word , and gives position */
- if (wordPositionsForOneUri == null) {
- wordPositionsForOneUri = new HashMap<String, Integer[]>();
- wordPositionsForOneUri.put(word, new Integer[] { position });
- positionsByWordById.put(id, wordPositionsForOneUri);
- }
- else {
- Integer[] positions = wordPositionsForOneUri.get(word);
- if (positions == null) {
- positions = new Integer[] { position };
- wordPositionsForOneUri.put(word, positions);
- } else {
- Integer[] newPositions = new Integer[positions.length + 1];
- System.arraycopy(positions, 0, newPositions, 0, positions.length);
- newPositions[positions.length] = position;
- wordPositionsForOneUri.put(word, newPositions);
- }
- }
+ termPos.positions = newPositions;
+ db.store(termPos);
+ }
- if (ids == null) {
- idsByWord.put(word, new Long[] { id });
- } else {
- for (int i = 0; i < ids.length; i++) {
- if (ids[i].equals(id))
- return;
- }
- Long[] newIDs = new Long[ids.length + 1];
- System.arraycopy(ids, 0, newIDs, 0, ids.length);
- newIDs[ids.length] = id;
- idsByWord.put(word, newIDs);
- }
-
- synchronized (db) {
- if (getTermByWord(word) == null)
- db.store(new Term(word));
- }
- //long time_indexing = System.currentTimeMillis();
- // FileWriter outp = new FileWriter("logfile",true);
- mustWriteIndex = true;
- }
+ mustWriteIndex = true;
}
}
@@ -1343,7 +1319,10 @@
} finally {
if (!stopped)
scheduleMakeIndex();
- writingIndex = false;
+ synchronized (this) {
+ writingIndex = false;
+ notifyAll();
+ }
}
}
@@ -1421,6 +1400,16 @@
cfg.objectClass(Term.class).cascadeOnUpdate(true);
cfg.objectClass(Term.class).cascadeOnDelete(true);
+ //- TermPosition
+ cfg.objectClass(TermPosition.class).objectField("pageId").indexed(true);
+ cfg.objectClass(TermPosition.class).objectField("word").indexed(true);
+
+ cfg.objectClass(TermPosition.class).callConstructor(true);
+
+ cfg.objectClass(TermPosition.class).cascadeOnActivate(true);
+ cfg.objectClass(TermPosition.class).cascadeOnUpdate(true);
+ cfg.objectClass(TermPosition.class).cascadeOnDelete(true);
+
//- Other
cfg.activationDepth(1);
cfg.updateDepth(1);
@@ -1472,7 +1461,8 @@
return null;
}
- protected Term getTermByWord(String word) {
+ protected Term getTermByWord(String word, boolean create) {
+ synchronized (this) {
Query query = db.query();
query.constrain(Term.class);
query.descend("word").constrain(word);
@@ -1480,7 +1470,39 @@
if (set.hasNext())
return set.next();
- else
+ else if (create) {
+ Term term = new Term(word);
+ db.store(term);
+ return term;
+ } else
return null;
+ }
}
+
+ protected TermPosition getTermPosition(Term term, Page page, boolean create) {
+ synchronized (term) {
+ synchronized (page) {
+ Query query = db.query();
+ query.constrain(TermPosition.class);
+
+ query.descend("word").constrain(term.word);
+ query.descend("pageId").constrain(page.id);
+ ObjectSet<TermPosition> set = query.execute();
+
+ if (set.hasNext()) {
+ return set.next();
+ } else if (create) {
+ TermPosition termPos = new TermPosition();
+ termPos.word = term.word;
+ termPos.pageId = page.id;
+ termPos.positions = new int[0];
+
+ db.store(termPos);
+ return termPos;
+ } else {
+ return null;
+ }
+ }
+ }
+ }
}
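
A note on the concurrency change: onSuccess() now blocks while the index is being written and is woken by notifyAll() when writing finishes. A stripped-down sketch of that handshake (illustrative only; the writingIndex/stopped fields follow the diff, the method names are invented for the example):

    public class WriteIndexHandshake {
        private boolean writingIndex;
        private boolean stopped;

        // Mirrors the new loop at the top of onSuccess(): block while an index
        // write is in progress, bail out if interrupted or the spider stops.
        synchronized boolean waitUntilIndexWritten() {
            while (writingIndex && !stopped) {
                try {
                    wait();
                } catch (InterruptedException e) {
                    return false;
                }
            }
            return !stopped;
        }

        // Mirrors the new finally block in the index writer: clear the flag and
        // wake every waiting fetch callback.
        synchronized void indexWritingFinished() {
            writingIndex = false;
            notifyAll();
        }
    }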