Author: jflesch
Date: 2006-06-04 23:46:10 +0000 (Sun, 04 Jun 2006)
New Revision: 9053
Modified:
trunk/freenet/src/freenet/clients/http/Spider.java
trunk/freenet/src/freenet/clients/http/filter/FilterCallback.java
trunk/freenet/src/freenet/clients/http/filter/FoundURICallback.java
trunk/freenet/src/freenet/clients/http/filter/GenericReadFilterCallback.java
trunk/freenet/src/freenet/clients/http/filter/HTMLFilter.java
trunk/freenet/src/freenet/clients/http/filter/NullFilterCallback.java
Log:
Spider is now able to put titles in indexes
Modified: trunk/freenet/src/freenet/clients/http/Spider.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/Spider.java 2006-06-04 22:28:24 UTC
(rev 9052)
+++ trunk/freenet/src/freenet/clients/http/Spider.java 2006-06-04 23:46:10 UTC
(rev 9053)
@@ -55,7 +55,10 @@
private final LinkedList queuedURIList = new LinkedList();
private final HashMap runningFetchesByURI = new HashMap();
private final HashMap urisByWord = new HashMap();
+ private final HashMap titlesOfURIs = new HashMap();
+ private final int minTimeBetweenEachIndexRewriting = 10;
+
// Can have many; this limit only exists to save memory.
private final int maxParallelRequests = 20;
private int maxShownURIs = 50;
@@ -160,7 +163,8 @@
startSomeRequests();
}
- public void onText(String s, URI baseURI) {
+ public void onText(String s, String type, URI baseURI) {
+
FreenetURI uri;
try {
uri = new FreenetURI(baseURI.getPath());
@@ -168,6 +172,14 @@
Logger.error(this, "Caught " + e, e);
return;
}
+
+ if(type != null && type.length() != 0 &&
type.toLowerCase().equals("title")
+ && s != null && s.length() != 0 && !s.contains("\n")) {
+ /* We should have a correct title */
+ titlesOfURIs.put(uri.toString(false), s);
+ }
+
+
String[] words = s.split("[^A-Za-z0-9]");
for (int i = 0; i < words.length; i++) {
String word = words[i];
@@ -193,7 +205,7 @@
newURIs[uris.length] = uri;
urisByWord.put(word, newURIs);
}
- if (tProducedIndex + 10 * 1000 < System.currentTimeMillis()) {
+ if (tProducedIndex + minTimeBetweenEachIndexRewriting * 1000 <
System.currentTimeMillis()) {
try {
produceIndex();
} catch (IOException e) {
@@ -224,6 +236,7 @@
for (int i = 0; i < uris.length; i++) {
urisToNumbers.put(uris[i], new Integer(i));
bw.write("!" + uris[i].toString(false) + "\n");
+ bw.write("+" +
titlesOfURIs.get(uris[i].toString(false)) + "\n");
}
for (int i = 0; i < words.length; i++) {
StringBuffer s = new StringBuffer();
Modified: trunk/freenet/src/freenet/clients/http/filter/FilterCallback.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/filter/FilterCallback.java
2006-06-04 22:28:24 UTC (rev 9052)
+++ trunk/freenet/src/freenet/clients/http/filter/FilterCallback.java
2006-06-04 23:46:10 UTC (rev 9053)
@@ -30,7 +30,9 @@
/**
* Process plain-text. Notification only; can't modify.
+ * Type can be null, or can correspond to, for example, the HTML tag name
around the text
+ * (for example: "title")
*/
- public void onText(String s);
+ public void onText(String s, String type);
}
Modified: trunk/freenet/src/freenet/clients/http/filter/FoundURICallback.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/filter/FoundURICallback.java
2006-06-04 22:28:24 UTC (rev 9052)
+++ trunk/freenet/src/freenet/clients/http/filter/FoundURICallback.java
2006-06-04 23:46:10 UTC (rev 9053)
@@ -8,6 +8,9 @@
public void foundURI(FreenetURI uri);
- public void onText(String s, URI baseURI);
-
+ /* type can be null */
+ /* but type can also be, for example, HTML tag name around text */
+ /* Useful for finding things like titles */
+ public void onText(String s, String type, URI baseURI);
+
}
Modified:
trunk/freenet/src/freenet/clients/http/filter/GenericReadFilterCallback.java
===================================================================
---
trunk/freenet/src/freenet/clients/http/filter/GenericReadFilterCallback.java
2006-06-04 22:28:24 UTC (rev 9052)
+++
trunk/freenet/src/freenet/clients/http/filter/GenericReadFilterCallback.java
2006-06-04 23:46:10 UTC (rev 9053)
@@ -168,9 +168,9 @@
}
}
- public void onText(String s) {
+ public void onText(String s, String type) {
if(cb != null)
- cb.onText(s, baseURI);
+ cb.onText(s, type, baseURI);
}
}
Modified: trunk/freenet/src/freenet/clients/http/filter/HTMLFilter.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/filter/HTMLFilter.java
2006-06-04 22:28:24 UTC (rev 9052)
+++ trunk/freenet/src/freenet/clients/http/filter/HTMLFilter.java
2006-06-04 23:46:10 UTC (rev 9053)
@@ -138,16 +138,18 @@
StringBuffer b = new StringBuffer(100);
StringBuffer balt = new StringBuffer(4000);
Vector splitTag = new Vector();
+ String currentTag = null;
char pprevC = 0;
char prevC = 0;
char c = 0;
mode = INTEXT;
+
while (true) {
int x = r.read();
if (x == -1) {
switch (mode) {
case INTEXT :
- saveText(b, w, this);
+ saveText(b, currentTag,
w, this);
break;
default :
// Dump unfinished tag
@@ -161,7 +163,9 @@
switch (mode) {
case INTEXT :
if (c == '<') {
- saveText(b, w,
this);
+
+ saveText(b,
currentTag, w, this);
+
b.setLength(0);
balt.setLength(0);
mode = INTAG;
@@ -177,7 +181,8 @@
b.setLength(0);
} else if (c == '<' &&
Character.isWhitespace(balt.charAt(0))) {
// Previous was
an un-escaped < in a script.
- saveText(balt,
w, this);
+ saveText(b,
currentTag, w, this);
+
balt.setLength(0);
b.setLength(0);
splitTag.clear();
@@ -185,6 +190,7 @@
splitTag.add(b.toString());
b.setLength(0);
processTag(splitTag, w, this);
+ currentTag =
(String)splitTag.get(0);
splitTag.clear();
balt.setLength(0);
mode = INTEXT;
@@ -201,6 +207,9 @@
} else if (c == '\'') {
mode =
INTAGSQUOTES;
b.append(c);
+ } else if (c == '/') {
/* Probable end tag */
+ currentTag =
null; /* We didn't remember what was the last tag, so ... */
+ b.append(c);
} else {
b.append(c);
}
@@ -310,13 +319,14 @@
if (!killTag)
processTag(splitTag, w, this);
killTag = false;
+ currentTag =
(String)splitTag.get(0);
splitTag.clear();
b.setLength(0);
balt.setLength(0);
mode = INTEXT;
} else if (c == '<' &&
Character.isWhitespace(balt.charAt(0))) {
// Previous was
an un-escaped < in a script.
- saveText(balt,
w, this);
+ saveText(balt,
currentTag, w, this);
balt.setLength(0);
b.setLength(0);
splitTag.clear();
@@ -354,8 +364,10 @@
String writeAfterTag = "";
}
- void saveText(StringBuffer s, Writer w, HTMLParseContext pc)
+
+ void saveText(StringBuffer s, String tagName, Writer w,
HTMLParseContext pc)
throws IOException {
+
Logger.minor(this, "Saving text: "+s.toString());
if (pc.killText) {
return;
@@ -387,7 +399,8 @@
}
String sout = out.toString();
if(pc.cb != null)
- pc.cb.onText(sout);
+ pc.cb.onText(sout, tagName); /* Tag name is given as
type for the text */
+
w.write(sout);
}
Modified: trunk/freenet/src/freenet/clients/http/filter/NullFilterCallback.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/filter/NullFilterCallback.java
2006-06-04 22:28:24 UTC (rev 9052)
+++ trunk/freenet/src/freenet/clients/http/filter/NullFilterCallback.java
2006-06-04 23:46:10 UTC (rev 9053)
@@ -19,7 +19,7 @@
return null;
}
- public void onText(String s) {
+ public void onText(String s, String type) {
// Do nothing
}