Author: jflesch
Date: 2006-06-07 01:15:07 +0000 (Wed, 07 Jun 2006)
New Revision: 9060

Modified:
   trunk/freenet/src/freenet/clients/http/Spider.java
Log:
Spider now removes anchors from URIs before adding them to the queue. This
should avoid re-indexing the same file many times.



Modified: trunk/freenet/src/freenet/clients/http/Spider.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/Spider.java  2006-06-06 12:35:28 UTC 
(rev 9059)
+++ trunk/freenet/src/freenet/clients/http/Spider.java  2006-06-07 01:15:07 UTC 
(rev 9060)
@@ -69,6 +69,23 @@
        private boolean stopped = true;

        private synchronized void queueURI(FreenetURI uri) {
+               String uriStr = null;
+               
+               /* We remove HTML targets from URI 
(http://my.server/file#target) */
+               /* Else we re-index already indexed file */
+               try {
+                       uriStr = uri.toString(false);
+                       if(uriStr.indexOf("#") > 0)
+                               {
+                                       uriStr = uriStr.substring(0, 
uriStr.indexOf("#"));
+                                       uri = new FreenetURI(uriStr);
+                               }
+               } catch (MalformedURLException e) {
+                       Logger.error(this, "Spider: MalformedURLException: 
"+uriStr+":"+e);
+                       return;
+               }
+
+               
                if ((!visitedURIs.contains(uri)) && queuedURISet.add(uri)) {
                        queuedURIList.addLast(uri);
                        visitedURIs.add(uri);


Reply via email to