Author: jflesch
Date: 2006-06-04 23:46:10 +0000 (Sun, 04 Jun 2006)
New Revision: 9053

Modified:
   trunk/freenet/src/freenet/clients/http/Spider.java
   trunk/freenet/src/freenet/clients/http/filter/FilterCallback.java
   trunk/freenet/src/freenet/clients/http/filter/FoundURICallback.java
   trunk/freenet/src/freenet/clients/http/filter/GenericReadFilterCallback.java
   trunk/freenet/src/freenet/clients/http/filter/HTMLFilter.java
   trunk/freenet/src/freenet/clients/http/filter/NullFilterCallback.java
Log:
Spider is now able to put titles in indexes

Modified: trunk/freenet/src/freenet/clients/http/Spider.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/Spider.java  2006-06-04 22:28:24 UTC 
(rev 9052)
+++ trunk/freenet/src/freenet/clients/http/Spider.java  2006-06-04 23:46:10 UTC 
(rev 9053)
@@ -55,7 +55,10 @@
        private final LinkedList queuedURIList = new LinkedList();
        private final HashMap runningFetchesByURI = new HashMap();
        private final HashMap urisByWord = new HashMap();
+       private final HashMap titlesOfURIs = new HashMap();

+       private final int minTimeBetweenEachIndexRewriting = 10;
+
        // Can have many; this limit only exists to save memory.
        private final int maxParallelRequests = 20;
        private int maxShownURIs = 50;
@@ -160,7 +163,8 @@
                startSomeRequests();
        }

-       public void onText(String s, URI baseURI) {
+       public void onText(String s, String type, URI baseURI) {
+
                FreenetURI uri;
                try {
                        uri = new FreenetURI(baseURI.getPath());
@@ -168,6 +172,14 @@
                        Logger.error(this, "Caught " + e, e);
                        return;
                }
+
+               if(type != null && type.length() != 0 && 
type.toLowerCase().equals("title")
+                  && s != null && s.length() != 0 && !s.contains("\n")) {
+                       /* We should have a correct title */
+                       titlesOfURIs.put(uri.toString(false), s);
+               }
+
+
                String[] words = s.split("[^A-Za-z0-9]");
                for (int i = 0; i < words.length; i++) {
                        String word = words[i];
@@ -193,7 +205,7 @@
                        newURIs[uris.length] = uri;
                        urisByWord.put(word, newURIs);
                }
-               if (tProducedIndex + 10 * 1000 < System.currentTimeMillis()) {
+               if (tProducedIndex + minTimeBetweenEachIndexRewriting * 1000 < 
System.currentTimeMillis()) {
                        try {
                                produceIndex();
                        } catch (IOException e) {
@@ -224,6 +236,7 @@
                for (int i = 0; i < uris.length; i++) {
                        urisToNumbers.put(uris[i], new Integer(i));
                        bw.write("!" + uris[i].toString(false) + "\n");
+                       bw.write("+" + 
titlesOfURIs.get(uris[i].toString(false)) + "\n");
                }
                for (int i = 0; i < words.length; i++) {
                        StringBuffer s = new StringBuffer();

Modified: trunk/freenet/src/freenet/clients/http/filter/FilterCallback.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/filter/FilterCallback.java   
2006-06-04 22:28:24 UTC (rev 9052)
+++ trunk/freenet/src/freenet/clients/http/filter/FilterCallback.java   
2006-06-04 23:46:10 UTC (rev 9053)
@@ -30,7 +30,9 @@

        /**
         * Process plain-text. Notification only; can't modify.
+        * Type can be null, or can be, for example, the name of the HTML tag
+        *    surrounding the text (for example: "title")
         */
-       public void onText(String s);
+       public void onText(String s, String type);

 }

Modified: trunk/freenet/src/freenet/clients/http/filter/FoundURICallback.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/filter/FoundURICallback.java 
2006-06-04 22:28:24 UTC (rev 9052)
+++ trunk/freenet/src/freenet/clients/http/filter/FoundURICallback.java 
2006-06-04 23:46:10 UTC (rev 9053)
@@ -8,6 +8,9 @@

        public void foundURI(FreenetURI uri);

-       public void onText(String s, URI baseURI);
-       
+       /* type can be null */
+       /* but type can also be, for example, HTML tag name around text */
+       /* Useful to find things like titles */
+       public void onText(String s, String type, URI baseURI);
+
 }

Modified: 
trunk/freenet/src/freenet/clients/http/filter/GenericReadFilterCallback.java
===================================================================
--- 
trunk/freenet/src/freenet/clients/http/filter/GenericReadFilterCallback.java    
    2006-06-04 22:28:24 UTC (rev 9052)
+++ 
trunk/freenet/src/freenet/clients/http/filter/GenericReadFilterCallback.java    
    2006-06-04 23:46:10 UTC (rev 9053)
@@ -168,9 +168,9 @@
                }
        }

-       public void onText(String s) {
+       public void onText(String s, String type) {
                if(cb != null)
-                       cb.onText(s, baseURI);
+                       cb.onText(s, type, baseURI);
        }

 }

Modified: trunk/freenet/src/freenet/clients/http/filter/HTMLFilter.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/filter/HTMLFilter.java       
2006-06-04 22:28:24 UTC (rev 9052)
+++ trunk/freenet/src/freenet/clients/http/filter/HTMLFilter.java       
2006-06-04 23:46:10 UTC (rev 9053)
@@ -138,16 +138,18 @@
                        StringBuffer b = new StringBuffer(100);
                        StringBuffer balt = new StringBuffer(4000);
                        Vector splitTag = new Vector();
+                       String currentTag = null;
                        char pprevC = 0;
                        char prevC = 0;
                        char c = 0;
                        mode = INTEXT;
+
                        while (true) {
                                int x = r.read();
                                if (x == -1) {
                                        switch (mode) {
                                                case INTEXT :
-                                                       saveText(b, w, this);
+                                                       saveText(b, currentTag, 
w, this);
                                                        break;
                                                default :
                                                        // Dump unfinished tag
@@ -161,7 +163,9 @@
                                        switch (mode) {
                                                case INTEXT :
                                                        if (c == '<') {
-                                                               saveText(b, w, 
this);
+
+                                                               saveText(b, 
currentTag, w, this);
+
                                                                b.setLength(0);
                                                                
balt.setLength(0);
                                                                mode = INTAG;
@@ -177,7 +181,8 @@
                                                                b.setLength(0);
                                                        } else if (c == '<' && 
Character.isWhitespace(balt.charAt(0))) {
                                                                // Previous was 
an un-escaped < in a script.
-                                                               saveText(balt, 
w, this);
+                                                               saveText(b, 
currentTag, w, this);
+
                                                                
balt.setLength(0);
                                                                b.setLength(0);
                                                                
splitTag.clear();
@@ -185,6 +190,7 @@
                                                                
splitTag.add(b.toString());
                                                                b.setLength(0);
                                                                
processTag(splitTag, w, this);
+                                                               currentTag = 
(String)splitTag.get(0);
                                                                
splitTag.clear();
                                                                
balt.setLength(0);
                                                                mode = INTEXT;
@@ -201,6 +207,9 @@
                                                        } else if (c == '\'') {
                                                                mode = 
INTAGSQUOTES;
                                                                b.append(c);
+                                                       } else if (c == '/') { 
/* Probable end tag */
+                                                               currentTag = 
null; /* We didn't remember what was the last tag, so ... */
+                                                               b.append(c);
                                                        } else {
                                                                b.append(c);
                                                        }
@@ -310,13 +319,14 @@
                                                                if (!killTag)
                                                                        
processTag(splitTag, w, this);
                                                                killTag = false;
+                                                               currentTag = 
(String)splitTag.get(0);
                                                                
splitTag.clear();
                                                                b.setLength(0);
                                                                
balt.setLength(0);
                                                                mode = INTEXT;
                                                        } else if (c == '<' && 
Character.isWhitespace(balt.charAt(0))) {
                                                                // Previous was 
an un-escaped < in a script.
-                                                               saveText(balt, 
w, this);
+                                                               saveText(balt, 
currentTag, w, this);
                                                                
balt.setLength(0);
                                                                b.setLength(0);
                                                                
splitTag.clear();
@@ -354,8 +364,10 @@
                String writeAfterTag = "";
        }

-       void saveText(StringBuffer s, Writer w, HTMLParseContext pc)
+
+       void saveText(StringBuffer s, String tagName, Writer w, 
HTMLParseContext pc)
                throws IOException {
+
                Logger.minor(this, "Saving text: "+s.toString());
                if (pc.killText) {
                        return;
@@ -387,7 +399,8 @@
                }
                String sout = out.toString();
                if(pc.cb != null)
-                       pc.cb.onText(sout);
+                       pc.cb.onText(sout, tagName); /* Tag name is given as 
type for the text */
+               
                w.write(sout);
        }


Modified: trunk/freenet/src/freenet/clients/http/filter/NullFilterCallback.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/filter/NullFilterCallback.java       
2006-06-04 22:28:24 UTC (rev 9052)
+++ trunk/freenet/src/freenet/clients/http/filter/NullFilterCallback.java       
2006-06-04 23:46:10 UTC (rev 9053)
@@ -19,7 +19,7 @@
                return null;
        }

-       public void onText(String s) {
+       public void onText(String s, String type) {
                // Do nothing
        }



Reply via email to