Author: jflesch
Date: 2006-06-07 01:15:07 +0000 (Wed, 07 Jun 2006)
New Revision: 9060
Modified:
trunk/freenet/src/freenet/clients/http/Spider.java
Log:
Spider now removes anchors from URIs before adding them to the queue. This
should avoid re-indexing the same file many times.
Modified: trunk/freenet/src/freenet/clients/http/Spider.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/Spider.java 2006-06-06 12:35:28 UTC
(rev 9059)
+++ trunk/freenet/src/freenet/clients/http/Spider.java 2006-06-07 01:15:07 UTC
(rev 9060)
@@ -69,6 +69,23 @@
private boolean stopped = true;
private synchronized void queueURI(FreenetURI uri) {
+ String uriStr = null;
+
+ /* We remove HTML targets from URI
(http://my.server/file#target) */
+ /* Else we re-index already indexed file */
+ try {
+ uriStr = uri.toString(false);
+ if(uriStr.indexOf("#") > 0)
+ {
+ uriStr = uriStr.substring(0,
uriStr.indexOf("#"));
+ uri = new FreenetURI(uriStr);
+ }
+ } catch (MalformedURLException e) {
+ Logger.error(this, "Spider: MalformedURLException:
"+uriStr+":"+e);
+ return;
+ }
+
+
if ((!visitedURIs.contains(uri)) && queuedURISet.add(uri)) {
queuedURIList.addLast(uri);
visitedURIs.add(uri);