Author: toad
Date: 2006-05-26 01:20:55 +0000 (Fri, 26 May 2006)
New Revision: 8873

Added:
   trunk/freenet/src/freenet/clients/http/Spider.java
   trunk/freenet/src/freenet/clients/http/filter/FoundURICallback.java
Modified:
   trunk/freenet/src/freenet/client/async/SingleFileFetcher.java
   trunk/freenet/src/freenet/clients/http/BookmarkManager.java
   trunk/freenet/src/freenet/clients/http/FProxyToadlet.java
   trunk/freenet/src/freenet/clients/http/WelcomeToadlet.java
   trunk/freenet/src/freenet/clients/http/filter/ContentFilter.java
   trunk/freenet/src/freenet/clients/http/filter/FilterCallback.java
   trunk/freenet/src/freenet/clients/http/filter/GenericReadFilterCallback.java
   trunk/freenet/src/freenet/clients/http/filter/HTMLFilter.java
   trunk/freenet/src/freenet/clients/http/filter/NullFilterCallback.java
   trunk/freenet/src/freenet/node/Node.java
   trunk/freenet/src/freenet/node/Version.java
Log:
742: Index spider (tested on testnet). Off unless you hack Node.start().

Modified: trunk/freenet/src/freenet/client/async/SingleFileFetcher.java
===================================================================
--- trunk/freenet/src/freenet/client/async/SingleFileFetcher.java       2006-05-25 22:15:22 UTC (rev 8872)
+++ trunk/freenet/src/freenet/client/async/SingleFileFetcher.java       2006-05-26 01:20:55 UTC (rev 8873)
@@ -238,7 +238,7 @@
                                clientMetadata.mergeNoOverwrite(metadata.getClientMetadata());
                                // Fetch it from the archive
                                if(ah == null)
-                                       throw new FetchException(FetchException.UNKNOWN_METADATA, "Archive redirect not in an archive");
+                                       throw new FetchException(FetchException.UNKNOWN_METADATA, "Archive redirect not in an archive manifest");
                                String filename = metadata.getZIPInternalName();
                                Logger.minor(this, "Fetching "+filename);
                                Bucket dataBucket = ah.get(filename, actx, null, recursionLevel+1, true);

Modified: trunk/freenet/src/freenet/clients/http/BookmarkManager.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/BookmarkManager.java 2006-05-25 22:15:22 UTC (rev 8872)
+++ trunk/freenet/src/freenet/clients/http/BookmarkManager.java 2006-05-26 01:20:55 UTC (rev 8873)
@@ -85,6 +85,15 @@
                return this.bookmarks.elements();
        }

+       public FreenetURI[] getBookmarkURIs() {
+               Bookmark[] b = (Bookmark[]) bookmarks.toArray(new Bookmark[bookmarks.size()]);
+               FreenetURI[] uris = new FreenetURI[b.length];
+               for(int i=0;i<uris.length;i++) {
+                       uris[i] = b[i].key;
+               }
+               return uris;
+       }
+       
        public void clear() {
                for (Enumeration e = this.bookmarks.elements(); e.hasMoreElements(); ) {
                        Bookmark i = (Bookmark)e.nextElement();

Modified: trunk/freenet/src/freenet/clients/http/FProxyToadlet.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/FProxyToadlet.java   2006-05-25 22:15:22 UTC (rev 8872)
+++ trunk/freenet/src/freenet/clients/http/FProxyToadlet.java   2006-05-26 01:20:55 UTC (rev 8873)
@@ -179,7 +179,7 @@

                        try {
                                if(!force && !forcedownload) {
-                                       data = ContentFilter.filter(data, ctx.getBucketFactory(), typeName, uri);
+                                       data = ContentFilter.filter(data, ctx.getBucketFactory(), typeName, uri, null);
                                }

                                if (forcedownload) {
@@ -415,5 +415,4 @@
                }
                return sb.toString();
        }
-       
 }

Added: trunk/freenet/src/freenet/clients/http/Spider.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/Spider.java  2006-05-25 22:15:22 UTC (rev 8872)
+++ trunk/freenet/src/freenet/clients/http/Spider.java  2006-05-26 01:20:55 UTC (rev 8873)
@@ -0,0 +1,253 @@
+package freenet.clients.http;
+
+import java.io.BufferedWriter;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.Vector;
+
+import freenet.client.ClientMetadata;
+import freenet.client.FetchException;
+import freenet.client.FetchResult;
+import freenet.client.FetcherContext;
+import freenet.client.InserterException;
+import freenet.client.async.BaseClientPutter;
+import freenet.client.async.ClientCallback;
+import freenet.client.async.ClientGetter;
+import freenet.clients.http.filter.ContentFilter;
+import freenet.clients.http.filter.FoundURICallback;
+import freenet.clients.http.filter.UnsafeContentTypeException;
+import freenet.keys.FreenetURI;
+import freenet.node.Node;
+import freenet.node.RequestStarter;
+import freenet.support.Bucket;
+import freenet.support.Logger;
+
+/**
+ * Spider. Produces an index.
+ */
+public class Spider implements ClientCallback, FoundURICallback {
+       
+       long tProducedIndex;
+       
+       // URIs visited, or fetching, or queued. Added once then forgotten about.
+       private final HashSet visitedURIs = new HashSet();
+       private final HashSet urisWithWords = new HashSet();
+       private final HashSet failedURIs = new HashSet();
+       private final HashSet queuedURISet = new HashSet();
+       private final LinkedList queuedURIList = new LinkedList();
+       private final HashMap runningFetchesByURI = new HashMap();
+       private final HashMap urisByWord = new HashMap();
+       // Can have many; this limit only exists to save memory.
+       private final int maxParallelRequests = 200;
+       private final Node node;
+       private final FetcherContext ctx;
+       private final short PRIORITY_CLASS = RequestStarter.PREFETCH_PRIORITY_CLASS;
+       
+       public Spider(BookmarkManager bm, Node node) {
+               this.node = node;
+               this.ctx = node.makeClient((short)0).getFetcherContext();
+               ctx.maxSplitfileBlockRetries = 10;
+               ctx.maxNonSplitfileRetries = 10;
+               ctx.maxTempLength = 2*1024*1024;
+               ctx.maxOutputLength = 2*1024*1024;
+               FreenetURI[] initialURIs = bm.getBookmarkURIs();
+               for(int i=0;i<initialURIs.length;i++)
+                       queueURI(initialURIs[i]);
+               tProducedIndex = System.currentTimeMillis();
+               startSomeRequests();
+       }
+
+       private synchronized void queueURI(FreenetURI uri) {
+               if((!visitedURIs.contains(uri)) && queuedURISet.add(uri)) {
+                       Logger.minor(this, "Spider queueing URI: "+uri);
+                       queuedURIList.addLast(uri);
+                       visitedURIs.add(uri);
+               }
+       }
+
+       private void startSomeRequests() {
+               Vector toStart = null;
+               synchronized(this) {
+                       int running = runningFetchesByURI.size();
+                       int queued = queuedURIList.size();
+                       if(running == maxParallelRequests || queued == 0) return;
+                       if(toStart == null)
+                               toStart = new Vector(Math.min(maxParallelRequests-running, queued));
+                       for(int i=running;i<maxParallelRequests;i++) {
+                               if(queuedURIList.isEmpty()) break;
+                               FreenetURI uri = (FreenetURI) queuedURIList.removeFirst();
+                               queuedURISet.remove(uri);
+                               ClientGetter getter = makeGetter(uri);
+                               toStart.add(getter);
+                       }
+               }
+               if(toStart != null) {
+                       for(int i=0;i<toStart.size();i++) {
+                               ClientGetter g = (ClientGetter) toStart.get(i);
+                               try {
+                                       Logger.minor(this, "Starting "+g+" for "+g.getURI());
+                                       g.start();
+                                       Logger.minor(this, "Started "+g+" for "+g.getURI());
+                                       runningFetchesByURI.put(g.getURI(), g);
+                               } catch (FetchException e) {
+                                       onFailure(e, g);
+                               }
+                       }
+               }
+       }
+
+       private ClientGetter makeGetter(FreenetURI uri) {
+               Logger.minor(this, "Starting getter for "+uri);
+               ClientGetter g = new ClientGetter(this, node.chkFetchScheduler, node.sskFetchScheduler, uri, ctx, PRIORITY_CLASS, this, null);
+               return g;
+       }
+
+       public void onSuccess(FetchResult result, ClientGetter state) {
+               FreenetURI uri = state.getURI();
+               synchronized(this) {
+                       runningFetchesByURI.remove(uri);
+               }
+               Logger.minor(this, "Success: "+uri);
+               startSomeRequests();
+               ClientMetadata cm = result.getMetadata();
+               Bucket data = result.asBucket();
+               String mimeType = cm.getMIMEType();
+               try {
+                       ContentFilter.filter(data, ctx.bucketFactory, mimeType, new URI("http://127.0.0.1:8888/"+uri.toString(false)), this);
+               } catch (UnsafeContentTypeException e) {
+                       return; // Ignore
+               } catch (IOException e) {
+                       Logger.error(this, "Bucket error?: "+e, e);
+               } catch (URISyntaxException e) {
+                       Logger.error(this, "Internal error: "+e, e);
+               } finally {
+                       data.free();
+               }
+       }
+
+       public void onFailure(FetchException e, ClientGetter state) {
+               FreenetURI uri = state.getURI();
+               Logger.minor(this, "Failed: "+uri);
+               synchronized(this) {
+                       failedURIs.add(uri);
+                       runningFetchesByURI.remove(uri);
+               }
+               if(e.newURI != null)
+                       queueURI(e.newURI);
+               startSomeRequests();
+       }
+
+       public void onSuccess(BaseClientPutter state) {
+               // Ignore
+       }
+
+       public void onFailure(InserterException e, BaseClientPutter state) {
+               // Ignore
+       }
+
+       public void onGeneratedURI(FreenetURI uri, BaseClientPutter state) {
+               // Ignore
+       }
+
+       public void foundURI(FreenetURI uri) {
+               queueURI(uri);
+               startSomeRequests();
+       }
+
+       public void onText(String s, URI baseURI) {
+               FreenetURI uri;
+               try {
+                       uri = new FreenetURI(baseURI.getPath());
+               } catch (MalformedURLException e) {
+                       Logger.error(this, "Caught "+e, e);
+                       return;
+               }
+               String[] words = s.split("[^A-Za-z0-9]");
+               for(int i=0;i<words.length;i++) {
+                       String word = words[i];
+                       if(word == null || word.length() == 0) continue;
+                       word = word.toLowerCase();
+                       addWord(word, uri);
+               }
+       }
+
+       private synchronized void addWord(String word, FreenetURI uri) {
+               FreenetURI[] uris = (FreenetURI[]) urisByWord.get(word);
+               urisWithWords.add(uri);
+               if(uris == null) {
+                       urisByWord.put(word, new FreenetURI[] { uri });
+               } else {
+                       for(int i=0;i<uris.length;i++) {
+                               if(uris[i].equals(uri))
+                                       return;
+                       }
+                       FreenetURI[] newURIs = new FreenetURI[uris.length+1];
+                       System.arraycopy(uris, 0, newURIs, 0, uris.length);
+                       newURIs[uris.length] = uri;
+                       urisByWord.put(word, newURIs);
+               }
+               Logger.minor(this, "Added word: "+word+" for "+uri);
+               if(tProducedIndex + 10*1000 < System.currentTimeMillis()) {
+                       try {
+                               produceIndex();
+                       } catch (IOException e) {
+                               Logger.error(this, "Caught "+e+" while creating index", e);
+                       }
+                       tProducedIndex = System.currentTimeMillis();
+               }
+       }
+
+       private synchronized void produceIndex() throws IOException {
+               // Produce an index file.
+               FileOutputStream fos = new FileOutputStream("index.new");
+               OutputStreamWriter osw;
+               try {
+                       osw = new OutputStreamWriter(fos, "UTF-8");
+               } catch (UnsupportedEncodingException e) {
+                       throw new Error(e);
+               }
+               BufferedWriter bw = new BufferedWriter(osw);
+               if(urisByWord.isEmpty() || urisWithWords.isEmpty()) {
+                       Logger.minor(this, "No URIs with words");
+                       return;
+               }
+               String[] words = (String[]) urisByWord.keySet().toArray(new String[urisByWord.size()]);
+               Arrays.sort(words);
+               FreenetURI[] uris = (FreenetURI[]) urisWithWords.toArray(new FreenetURI[urisWithWords.size()]);
+               HashMap urisToNumbers = new HashMap();
+               for(int i=0;i<uris.length;i++) {
+                       urisToNumbers.put(uris[i], new Integer(i));
+                       bw.write("!" + uris[i].toString(false)+"\n");
+               }
+               for(int i=0;i<words.length;i++) {
+                       StringBuffer s = new StringBuffer();
+                       s.append('?');
+                       s.append(words[i]);
+                       FreenetURI[] urisForWord = (FreenetURI[]) urisByWord.get(words[i]);
+                       for(int j=0;j<urisForWord.length;j++) {
+                               FreenetURI uri = urisForWord[j];
+                               Integer x = (Integer) urisToNumbers.get(uri);
+                               if(x == null)
+                                       Logger.error(this, "Eh?");
+                               else {
+                                       s.append(' ');
+                                       s.append(x.toString());
+                               }
+                       }
+                       s.append('\n');
+                       bw.write(s.toString());
+               }
+               bw.close();
+       }
+       
+}

Modified: trunk/freenet/src/freenet/clients/http/WelcomeToadlet.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/WelcomeToadlet.java  2006-05-25 22:15:22 UTC (rev 8872)
+++ trunk/freenet/src/freenet/clients/http/WelcomeToadlet.java  2006-05-26 01:20:55 UTC (rev 8873)
@@ -38,6 +38,7 @@
                this.node = n;
                this.config = sc;
                this.bookmarks = new BookmarkManager(n);
+               node.bookmarkManager = bookmarks;

                sc.register("bookmarks", n.isTestnetEnabled() ? DEFAULT_TESTNET_BOOKMARKS : DEFAULT_DARKNET_BOOKMARKS, 0, false, "List of bookmarks", "A list of bookmarked freesites", this.bookmarks.makeCB());


Modified: trunk/freenet/src/freenet/clients/http/filter/ContentFilter.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/filter/ContentFilter.java    2006-05-25 22:15:22 UTC (rev 8872)
+++ trunk/freenet/src/freenet/clients/http/filter/ContentFilter.java    2006-05-26 01:20:55 UTC (rev 8873)
@@ -104,7 +104,7 @@
         * Filter some data.
         * @throws IOException If an internal error involving buckets occurred.
         */
-       public static Bucket filter(Bucket data, BucketFactory bf, String typeName, URI baseURI) throws UnsafeContentTypeException, IOException {
+       public static Bucket filter(Bucket data, BucketFactory bf, String typeName, URI baseURI, FoundURICallback cb) throws UnsafeContentTypeException, IOException {
                String type = typeName;
                String options = "";
                String charset = null;
@@ -154,7 +154,7 @@
                                        charset = detectCharset(data, handler);
                                }

-                               return handler.readFilter.readFilter(data, bf, charset, otherParams, new GenericReadFilterCallback(baseURI));
+                               return handler.readFilter.readFilter(data, bf, charset, otherParams, new GenericReadFilterCallback(baseURI, cb));
                        }
                        handler.throwUnsafeContentTypeException();
                        return null;

Modified: trunk/freenet/src/freenet/clients/http/filter/FilterCallback.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/filter/FilterCallback.java   2006-05-25 22:15:22 UTC (rev 8872)
+++ trunk/freenet/src/freenet/clients/http/filter/FilterCallback.java   2006-05-26 01:20:55 UTC (rev 8873)
@@ -28,4 +28,9 @@
         */
        public String onBaseHref(String baseHref);

+       /**
+        * Process plain-text. Notification only; can't modify.
+        */
+       public void onText(String s);
+       
 }

Added: trunk/freenet/src/freenet/clients/http/filter/FoundURICallback.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/filter/FoundURICallback.java 2006-05-25 22:15:22 UTC (rev 8872)
+++ trunk/freenet/src/freenet/clients/http/filter/FoundURICallback.java 2006-05-26 01:20:55 UTC (rev 8873)
@@ -0,0 +1,13 @@
+package freenet.clients.http.filter;
+
+import java.net.URI;
+
+import freenet.keys.FreenetURI;
+
+public interface FoundURICallback {
+
+       public void foundURI(FreenetURI uri);
+
+       public void onText(String s, URI baseURI);
+       
+}

Modified: trunk/freenet/src/freenet/clients/http/filter/GenericReadFilterCallback.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/filter/GenericReadFilterCallback.java        2006-05-25 22:15:22 UTC (rev 8872)
+++ trunk/freenet/src/freenet/clients/http/filter/GenericReadFilterCallback.java        2006-05-26 01:20:55 UTC (rev 8873)
@@ -14,14 +14,17 @@
 public class GenericReadFilterCallback implements FilterCallback {

        private URI baseURI;
+       private final FoundURICallback cb;

-       public GenericReadFilterCallback(URI uri) {
+       public GenericReadFilterCallback(URI uri, FoundURICallback cb) {
                this.baseURI = uri;
+               this.cb = cb;
        }

-       public GenericReadFilterCallback(FreenetURI uri) {
+       public GenericReadFilterCallback(FreenetURI uri, FoundURICallback cb) {
                try {
                        this.baseURI = new URI("/" + uri.toString(false));
+                       this.cb = cb;
                } catch (URISyntaxException e) {
                        throw new Error(e);
                }
@@ -143,6 +146,7 @@
                // Valid freenet URI, allow it
                // Now what about the queries?
                HTTPRequest req = new HTTPRequest(uri);
+               if(cb != null) cb.foundURI(furi);
                return finishProcess(req, overrideType, "/" + furi.toString(false), uri, noRelative);
        }

@@ -160,5 +164,10 @@
                        return baseURI.toASCIIString();
                }
        }
+
+       public void onText(String s) {
+               if(cb != null)
+                       cb.onText(s, baseURI);
+       }

 }

Modified: trunk/freenet/src/freenet/clients/http/filter/HTMLFilter.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/filter/HTMLFilter.java       2006-05-25 22:15:22 UTC (rev 8872)
+++ trunk/freenet/src/freenet/clients/http/filter/HTMLFilter.java       2006-05-26 01:20:55 UTC (rev 8873)
@@ -384,7 +384,10 @@
                                out.append(c);
                        }
                }
-               w.write(out.toString());
+               String sout = out.toString();
+               if(pc.cb != null)
+                       pc.cb.onText(sout);
+               w.write(sout);
        }

        void processTag(Vector splitTag, Writer w, HTMLParseContext pc)

Modified: trunk/freenet/src/freenet/clients/http/filter/NullFilterCallback.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/filter/NullFilterCallback.java       2006-05-25 22:15:22 UTC (rev 8872)
+++ trunk/freenet/src/freenet/clients/http/filter/NullFilterCallback.java       2006-05-26 01:20:55 UTC (rev 8873)
@@ -19,4 +19,8 @@
                return null;
        }

+       public void onText(String s) {
+               // Do nothing
+       }
+
 }

Modified: trunk/freenet/src/freenet/node/Node.java
===================================================================
--- trunk/freenet/src/freenet/node/Node.java    2006-05-25 22:15:22 UTC (rev 8872)
+++ trunk/freenet/src/freenet/node/Node.java    2006-05-26 01:20:55 UTC (rev 8873)
@@ -41,9 +41,10 @@
 import freenet.client.async.ClientPutter;
 import freenet.client.async.ClientRequestScheduler;
 import freenet.client.async.USKManager;
+import freenet.clients.http.BookmarkManager;
 import freenet.clients.http.FProxyToadlet;
 import freenet.clients.http.SimpleToadletServer;
-import freenet.config.BooleanCallback;
+import freenet.clients.http.Spider;
 import freenet.config.Config;
 import freenet.config.FilePersistentConfig;
 import freenet.config.IntCallback;
@@ -87,11 +88,11 @@
 import freenet.keys.SSKBlock;
 import freenet.keys.SSKVerifyException;
 import freenet.node.fcp.FCPServer;
+import freenet.node.updater.NodeUpdater;
 import freenet.node.useralerts.BuildOldAgeUserAlert;
 import freenet.node.useralerts.IPUndetectedUserAlert;
 import freenet.node.useralerts.MeaningfulNodeNameUserAlert;
 import freenet.node.useralerts.UserAlertManager;
-import freenet.node.updater.NodeUpdater;
 import freenet.pluginmanager.PluginManager;
 import freenet.store.BerkeleyDBFreenetStore;
 import freenet.store.FreenetStore;
@@ -1404,6 +1405,11 @@
                if(testnetHandler != null)
                        testnetHandler.start();

+               // Spider. FIXME.
+               
+               //if(testnetEnabled)
+               //      new Spider(bookmarkManager, this);
+               
         persistentTempBucketFactory.completedInit();

         redetectAddress();
@@ -2476,6 +2482,7 @@
        }

        FreenetInetAddress lastIP;
+       public BookmarkManager bookmarkManager;

        public void redetectAddress() {
                FreenetInetAddress newIP = detectPrimaryIPAddress();

Modified: trunk/freenet/src/freenet/node/Version.java
===================================================================
--- trunk/freenet/src/freenet/node/Version.java 2006-05-25 22:15:22 UTC (rev 8872)
+++ trunk/freenet/src/freenet/node/Version.java 2006-05-26 01:20:55 UTC (rev 8873)
@@ -18,7 +18,7 @@
        public static final String protocolVersion = "1.0";

        /** The build number of the current revision */
-       private static final int buildNumber = 741;
+       private static final int buildNumber = 742;

        /** Oldest build of Fred we will talk to */
        private static final int lastGoodBuild = 732;


Reply via email to