Author: swatig0
Date: 2007-08-16 20:51:22 +0000 (Thu, 16 Aug 2007)
New Revision: 14732

Modified:
   trunk/plugins/XMLSpider/XMLSpider.java
Log:
Outlinks-inlinks for a page

Modified: trunk/plugins/XMLSpider/XMLSpider.java
===================================================================
--- trunk/plugins/XMLSpider/XMLSpider.java      2007-08-16 20:47:44 UTC (rev 
14731)
+++ trunk/plugins/XMLSpider/XMLSpider.java      2007-08-16 20:51:22 UTC (rev 
14732)
@@ -4,7 +4,6 @@
 package plugins.XMLSpider;

 import java.io.File;
-import java.io.FileWriter;
 import java.io.IOException;
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
@@ -13,14 +12,11 @@
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
 import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
-import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.Vector;
@@ -33,11 +29,10 @@
 import javax.xml.transform.dom.DOMSource;
 import javax.xml.transform.stream.StreamResult;

-import org.w3c.dom.Attr;
+
 import org.w3c.dom.DOMImplementation;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
-import org.w3c.dom.NodeList;
 import org.w3c.dom.Text;

 import freenet.client.ClientMetadata;
@@ -49,7 +44,6 @@
 import freenet.client.async.ClientCallback;
 import freenet.client.async.ClientGetter;
 import freenet.client.async.USKCallback;
-import freenet.clients.http.PageMaker;
 import freenet.clients.http.ToadletContext;
 import freenet.clients.http.ToadletContextClosedException;
 import freenet.clients.http.filter.ContentFilter;
@@ -67,15 +61,13 @@
 import freenet.pluginmanager.FredPluginThreadless;
 import freenet.pluginmanager.PluginHTTPException;
 import freenet.pluginmanager.PluginRespirator;
-import freenet.support.HTMLNode;
 import freenet.support.Logger;
-import freenet.support.MultiValueTable;
 import freenet.support.api.Bucket;
 import freenet.support.api.HTTPRequest;

 /**
- * XMLSpider. Produces index for searching words. 
- * In case the size of the index grows up a specific threshold the index is 
split into several subindices
+ * XMLSpider. Produces xml index for searching words. 
+ * In case the size of the index grows up a specific threshold the index is 
split into several subindices.
  * The indexing key is the md5 hash of the word.
  * 
  *  @author swati goyal
@@ -84,44 +76,70 @@
 public class XMLSpider implements FredPlugin, FredPluginHTTP, 
FredPluginThreadless,  FredPluginHTTPAdvanced,HttpPlugin, ClientCallback, 
USKCallback{

        long tProducedIndex;
-       private TreeMap tMap = new TreeMap();
+       /**
+        * Stores the found words along with md5
+        */
+       public TreeMap tMap = new TreeMap();
        int count;
        // URIs visited, or fetching, or queued. Added once then forgotten 
about.
-       private final HashSet visitedURIs = new HashSet();
-       private final HashSet urisWithWords = new HashSet();
-       private final HashSet idsWithWords = new HashSet();
-       private final HashSet failedURIs = new HashSet();
+       /**
+        * 
+        * Lists the uris that have been visited by the spider
+        */
+       public final HashSet visitedURIs = new HashSet();
+               private final HashSet idsWithWords = new HashSet();
+       /**
+        * 
+        * Lists the uris that were visited but failed.
+        */
+       public final HashSet failedURIs = new HashSet();
+       
        private final HashSet queuedURISet = new HashSet();
-       private final LinkedList queuedURIList = new LinkedList();
+       /**
+        * 
+        * Lists the uris that are still queued.
+        */
+       public final LinkedList queuedURIList = new LinkedList();
        private final HashMap runningFetchesByURI = new HashMap();
-       private final HashMap urisByWord = new HashMap();
+       
        private final HashMap idsByWord = new HashMap();
-       private final HashMap titlesOfURIs = new HashMap();
+       
        private final HashMap titlesOfIds = new HashMap();
        private final HashMap uriIds = new HashMap();
        private final HashMap idUris = new HashMap();
-       private final HashMap outlinks = new HashMap();
-       private final HashMap inlinks = new HashMap();
+       /**
+        * Lists the outlinks from a particular page, 
+        * <br> indexed by the id of page uri
+        */
+       public final HashMap outlinks = new HashMap();
+       /**
+        * Lists the inlinks to a particular page,
+        *  indexed by the id of page uri.
+        */
+       public final HashMap inlinks = new HashMap();
        private Vector indices;
        private int match;
-       private int id;
-       private Vector list;
+       private Integer id;
+       
        private boolean indexing ;

        private static final int minTimeBetweenEachIndexRewriting = 10;
 /**
- * DEFAULT_INDEX_DIR is the directory where the generated indices are stored.
+ * directory where the generated indices are stored. 
  * Needs to be created before it can be used
  */
-       private static final String DEFAULT_INDEX_DIR = "myindex4/";
+       public static final String DEFAULT_INDEX_DIR = "myindex4/";
+       /**
+        * Lists the allowed mime types of the fetched page. 
+        */
        public Set allowedMIMETypes;
-       private static final int MAX_ENTRIES = 2;
+       private static final int MAX_ENTRIES = 10;
        private static final String pluginName = "XML spider";
        /**
-        * This gives the allowed fraction of total time spent on generating 
indices
-        * max value = 1; min value > 0 
+        * Gives the allowed fraction of total time spent on generating indices 
with
+        * maximum value = 1; minimum value = 0. 
         */
-       private static final double MAX_TIME_SPENT_INDEXING = 0.5;
+       public static final double MAX_TIME_SPENT_INDEXING = 0.5;

        private static final String indexTitle= "XMLSpider index";
        private static final String indexOwner = "Freenet";
@@ -135,47 +153,46 @@
        // Can have many; this limit only exists to save memory.
        private static final int maxParallelRequests = 100;
        private int maxShownURIs = 15;
-       private HashMap urisToNumbers;
+       
        private NodeClientCore core;
        private FetchContext ctx;
        private final short PRIORITY_CLASS = 
RequestStarter.BULK_SPLITFILE_PRIORITY_CLASS;
        private boolean stopped = true;
        PluginRespirator pr;

-
-       private synchronized void queueURI(FreenetURI uri) {
-               //not adding the html condition
+/**
+ * Adds the found uri to the list of to-be-retrieved uris. <p>Every usk uri is 
added as an ssk.
+ * @param uri the new uri that needs to be fetched for further indexing
+ */
+       public synchronized void queueURI(FreenetURI uri) {
                if((uri.getKeyType()).equals("USK")){
                        if(uri.getSuggestedEdition() < 0)
                                uri = uri.setSuggestedEdition((-1)* 
uri.getSuggestedEdition());
                        try{
-                       uri = ((USK.create(uri)).getSSK()).getURI();
-                       //all uris are added as ssk
-                       (ctx.uskManager).subscribe(USK.create(uri),this, false, 
this);  
+                               uri = ((USK.create(uri)).getSSK()).getURI();
+                               
(ctx.uskManager).subscribe(USK.create(uri),this, false, this);  
                        }
                        catch(Exception e){}
                }
-               
+
                if ((!visitedURIs.contains(uri)) && queuedURISet.add(uri)) {
                        queuedURIList.addLast(uri);
                        visitedURIs.add(uri);
                        uriIds.put(uri, id);
                        idUris.put(id, uri);
-                       id++;
-                       
-                       //the page object of the client will contain the uri of 
the current page
+                       id = new Integer(id.intValue()+1);
                }
        }

        private void startSomeRequests() {

-               
+
                FreenetURI[] initialURIs = 
core.bookmarkManager.getBookmarkURIs();
                for (int i = 0; i < initialURIs.length; i++)
                {
-               queueURI(initialURIs[i]);
+                       queueURI(initialURIs[i]);
                }
-                                       
+
                ArrayList toStart = null;
                synchronized (this) {
                        if (stopped) {
@@ -183,51 +200,31 @@
                        }
                        int running = runningFetchesByURI.size();
                        int queued = queuedURIList.size();
-                       
+
                        if ((running >= maxParallelRequests) || (queued == 0))
                                return;
-                       
+
                        toStart = new ArrayList(Math.min(maxParallelRequests - 
running, queued));
-                       
+
                        for (int i = running; i < maxParallelRequests; i++) {
                                if (queuedURIList.isEmpty())
                                        break;
                                FreenetURI uri = (FreenetURI) 
queuedURIList.removeFirst();
                                queuedURISet.remove(uri);
-//                             if((uri.getKeyType()).equals("USK")){
-//                             if(uri.getSuggestedEdition() < 0)
-//                                     uri = uri.setSuggestedEdition((-1)* 
uri.getSuggestedEdition());
-//                             try{
-//                                     
(ctx.uskManager).subscribe(USK.create(uri),this, false, this);  
-//                             }catch(Exception e){
-//                                     
-//                             }
-                               
-       //                      }
                                ClientGetter getter = makeGetter(uri);
                                toStart.add(getter);
-                               }
+                       }
                }
-                       for (int i = 0; i < toStart.size(); i++) {
-                               
+               for (int i = 0; i < toStart.size(); i++) {
+
                        ClientGetter g = (ClientGetter) toStart.get(i);
                        try {
                                runningFetchesByURI.put(g.getURI(), g);
                                g.start();
-                               FileWriter outp = new 
FileWriter("logfile2",true);
-                               outp.write("URI "+g.getURI().toString()+'\n');
-                               
-                               outp.close();
-                               } catch (FetchException e) {
-                                       onFailure(e, g);
-                               }
-                               catch (IOException e){
-                                       Logger.error(this, "the logfile can not 
be written"+e.toString(), e);
-                               }
-               
+                       } catch (FetchException e) {
+                               onFailure(e, g);
                        }
-               //}
-                               
+               }
        }


@@ -235,7 +232,12 @@
                ClientGetter g = new ClientGetter(this, 
core.requestStarters.chkFetchScheduler, core.requestStarters.sskFetchScheduler, 
uri, ctx, PRIORITY_CLASS, this, null, null);
                return g;
        }
-
+/**
+ * Processes the successfully fetched uri for further outlinks.
+ * 
+ * @param result
+ * @param state
+ */
        public void onSuccess(FetchResult result, ClientGetter state) {
                FreenetURI uri = state.getURI();

@@ -246,22 +248,16 @@
                ClientMetadata cm = result.getMetadata();
                Bucket data = result.asBucket();
                String mimeType = cm.getMIMEType();
-               
+
                sizeOfURIs.put(uri.toString(), new Long(data.size()));
                mimeOfURIs.put(uri.toString(), mimeType);
                PageCallBack page = new PageCallBack();
                page.id = (Integer) uriIds.get(uri);
                inlinks.put(page.id, new Vector());
                outlinks.put(page.id, new Vector());
-               
-               try{
-           FileWriter output = new FileWriter("logfile",true);
-           output.write(uri.toString()+" page " + page.id +"\n");
-           output.close();
-               }
-               catch(Exception e){
-                       Logger.error(this, "The uri could not be removed from 
running "+e.toString(), e);
-               }
+
+               //instead of passing the current object, the pagecallback 
object for every page is passed to the content filter
+               // this is to allow inlinks and outlinks be indexed by specific 
pages
                try {
                        ContentFilter.filter(data, ctx.bucketFactory, mimeType, 
uri.toURI("http://127.0.0.1:8888/";), page);
                } catch (UnsafeContentTypeException e) {
@@ -277,25 +273,15 @@

        public void onFailure(FetchException e, ClientGetter state) {
                FreenetURI uri = state.getURI();
-//             try{
-//                     FileWriter outp = new FileWriter("failed",true);
-//                     outp.write("failed "+e.toString()+" for "+uri+'\n');
-//                     outp.close();
-//                     
-//             }catch(Exception e2){
-//                     
-//             }
+
                synchronized (this) {
                        runningFetchesByURI.remove(uri);
                        failedURIs.add(uri);
                }
                if (e.newURI != null)
                        queueURI(e.newURI);
-//             else
-//                     queueURI(uri);
+
                startSomeRequests();
-               
-               
        }

        public void onSuccess(BaseClientPutter state) {
@@ -321,19 +307,12 @@
  */
        private synchronized void produceIndex2() throws 
IOException,NoSuchAlgorithmException {
                // Produce the main index file.
-               
-               //the number of bits to consider for matching 
-               
-       
-//             if (urisByWord.isEmpty() || urisWithWords.isEmpty()) {
-//                     System.out.println("No URIs with words");
-//                     return;
-//             }
-               
+
                if (idsByWord.isEmpty() || idsWithWords.isEmpty()) {
                        System.out.println("No URIs with words");
                        return;
                }
+               //the main index file 
                File outputFile = new File(DEFAULT_INDEX_DIR+"index.xml");
                StreamResult resultStream;
                resultStream = new StreamResult(outputFile);
@@ -351,7 +330,7 @@
                try {
                        xmlBuilder = xmlFactory.newDocumentBuilder();
                } catch(javax.xml.parsers.ParserConfigurationException e) {
-                       /* Will (should ?) never happen */
+
                        Logger.error(this, "Spider: Error while initializing 
XML generator: "+e.toString());
                        return;
                }
@@ -367,57 +346,42 @@
                /* -> title */
                Element subHeaderElement = xmlDoc.createElement("title");
                Text subHeaderText = xmlDoc.createTextNode(indexTitle);
-               
+
                subHeaderElement.appendChild(subHeaderText);
                headerElement.appendChild(subHeaderElement);

                /* -> owner */
                subHeaderElement = xmlDoc.createElement("owner");
                subHeaderText = xmlDoc.createTextNode(indexOwner);
-               
+
                subHeaderElement.appendChild(subHeaderText);
                headerElement.appendChild(subHeaderElement);
-               
+
                /* -> owner email */
                if(indexOwnerEmail != null) {
                        subHeaderElement = xmlDoc.createElement("email");
                        subHeaderText = xmlDoc.createTextNode(indexOwnerEmail);
-                       
+
                        subHeaderElement.appendChild(subHeaderText);
                        headerElement.appendChild(subHeaderElement);
                }
-
-               
-               //String[] words = (String[]) urisByWord.keySet().toArray(new 
String[urisByWord.size()]);
-               //Arrays.sort(words);
-               
+               /*
+                * the max number of digits in md5 to be used for matching with 
the search query is stored in the xml
+                */
                Element prefixElement = xmlDoc.createElement("prefix");
-               //prefixElement.setAttribute("value",match+"");
-               //this match will be set after processing the TreeMap
-       
-
-               
-               //all index files are ready
                /* Adding word index */
                Element keywordsElement = xmlDoc.createElement("keywords");
                for(int i = 0;i<indices.size();i++){
-                       
//generateSubIndex(DEFAULT_INDEX_DIR+"index_"+Integer.toHexString(i)+".xml");
+
                        Element subIndexElement = 
xmlDoc.createElement("subIndex");
-//                     if(i<=9)
-//                     subIndexElement.setAttribute("key",i+"");
-//                     else
-//                             
subIndexElement.setAttribute("key",Integer.toHexString(i));
                        subIndexElement.setAttribute("key", (String) 
indices.elementAt(i));
                        //the subindex element key will contain the bits used 
for matching in that subindex
                        keywordsElement.appendChild(subIndexElement);
                }
-               
+
                prefixElement.setAttribute("value",match+"");
-               // make sure that prefix is the first child of root Element
                rootElement.appendChild(prefixElement);
                rootElement.appendChild(headerElement);
-               
-               //rootElement.appendChild(filesElement);
                rootElement.appendChild(keywordsElement);

                /* Serialization */
@@ -434,7 +398,7 @@

                serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
                serializer.setOutputProperty(OutputKeys.INDENT,"yes");
-               
+
                /* final step */
                try {
                        serializer.transform(domSource, resultStream);
@@ -445,12 +409,11 @@

                if(Logger.shouldLog(Logger.MINOR, this))
                        Logger.minor(this, "Spider: indexes regenerated.");
-       
-       //the main xml file is generated 
-       //now as each word is generated enter it into the respective subindex
-       //now the parsing will start and nodes will be added as needed 
-               

+               //The main xml file is generated 
+               //As each word is generated enter it into the respective 
subindex
+               //The parsing will start and nodes will be added as needed 
+
        }
        /**
         * Generates the subindices. 
@@ -460,48 +423,36 @@
         * @throws Exception
         */
        private synchronized void generateIndex2() throws Exception{
-               // now we the tree map and we need to use the sorted (md5s) to 
generate the xml indices
-       
-               
+               //using the tMap generate the xml indices
                if (idsByWord.isEmpty() || idsWithWords.isEmpty()) {
                        System.out.println("No URIs with words");
                        return;
                }

-       //      FreenetURI[] uris = (FreenetURI[]) urisWithWords.toArray(new 
FreenetURI[urisWithWords.size()]);
-               Integer[] ids = (Integer[]) idsWithWords.toArray(new 
Integer[idsWithWords.size()]);
-//             urisToNumbers = new HashMap();
-//             for (int i = 0; i < uris.length; i++) {
-//                     urisToNumbers.put(uris[i], new Integer(i));
-//                     }
                indices = new Vector();
                int prefix = 1;
                match = 1;
                Vector list = new Vector();
-               //String str = tMap.firstKey();
                Iterator it = tMap.keySet().iterator();
-                FileWriter outp = new FileWriter("indexing");
-               outp.write("size = "+tMap.size()+"\n");
-               outp.close();
+
                String str = (String) it.next();
-               int i = 0,index =0;
+               int i = 0;
                while(it.hasNext())
                {
-                outp = new FileWriter("indexing",true);
                        String key =(String) it.next();
-                       outp.write(key + "\n");
-                       outp.close();
+                       //create a list of the words to be added in the same 
subindex
                        if(key.substring(0, prefix).equals(str.substring(0, 
prefix))) 
-                               {i++;
-                               list.add(key);
-                               }
+                       {i++;
+                       list.add(key);
+                       }
                        else {
-               generateSubIndex(prefix,list);
-               str = key;
-               list = new Vector();
+                               //generate the appropriate subindex with the 
current list
+                               generateSubIndex(prefix,list);
+                               str = key;
+                               list = new Vector();
+                       }
                }
-               }
-               
+
                generateSubIndex(prefix,list);
        }
        private synchronized Vector subVector(Vector list, int begin, int end){
@@ -511,15 +462,19 @@
        }

        private synchronized void generateSubIndex(int p,Vector list) throws 
Exception{
-               
+               /*
+                * if the list is less than max allowed entries in a file then 
directly generate the xml 
+                * otherwise split the list into further sublists
+                * and iterate till the number of entries per subindex is less 
than the allowed value
+                */
+
                if(list.size() < MAX_ENTRIES)
-               {
-                       //the index can be generated from this list
+               {               
                        generateXML(list,p);
                }
                else
                {
-                       //this means that prefix needs to be incremented
+                       //prefix needs to be incremented
                        if(match <= p) match = p+1; 
                        int prefix = p+1;
                        int i =0;
@@ -529,35 +484,30 @@
                        {
                                String key = (String) list.elementAt(i);
                                if((key.substring(0, 
prefix)).equals(str.substring(0, prefix))) 
-                                       {
-                                       //index = i;
+                               {
                                        i++;
-                                       }
+                               }
                                else {
-                                       
//generateXML(subVector(list,index,i-1),prefix);
                                        
generateSubIndex(prefix,subVector(list,index,i-1));
                                        index = i;
                                        str = key;
                                }
-                               
-
                        }
                        generateSubIndex(prefix,subVector(list,index,i-1));
                }
        }       
-               

-       private synchronized void generateXML (Vector list, int prefix) throws 
Exception
+/**
+ * generates the xml index with the given list of words with prefix number of 
matching bits in md5
+ * @param list  list of the words to be added in the index
+ * @param prefix number of matching bits of md5
+ * @throws Exception
+ */
+       public synchronized void generateXML (Vector list, int prefix) throws 
Exception
        {
-               FileWriter outp = new FileWriter("gen",true);
-               
-               
                String p = ((String) list.elementAt(0)).substring(0, prefix);
-               outp.write("inside gen xml + "+p+"\n");
-               
                indices.add(p);
                File outputFile = new File(DEFAULT_INDEX_DIR+"index_"+p+".xml");
-               //indices.add(p);
                StreamResult resultStream;
                resultStream = new StreamResult(outputFile);

@@ -567,92 +517,66 @@
                DocumentBuilder xmlBuilder = null;
                DOMImplementation impl = null;
                Element rootElement = null;
-
                xmlFactory = DocumentBuilderFactory.newInstance();

-
                try {
                        xmlBuilder = xmlFactory.newDocumentBuilder();
                } catch(javax.xml.parsers.ParserConfigurationException e) {
-                       /* Will (should ?) never happen */
                        Logger.error(this, "Spider: Error while initializing 
XML generator: "+e.toString());
                        return;
                }

-
                impl = xmlBuilder.getDOMImplementation();
-
                /* Starting to generate index */
-
                xmlDoc = impl.createDocument(null, "sub_index", null);
                rootElement = xmlDoc.getDocumentElement();

                /* Adding header to the index */
                Element headerElement = xmlDoc.createElement("header");
-
                /* -> title */
                Element subHeaderElement = xmlDoc.createElement("title");
                Text subHeaderText = xmlDoc.createTextNode(indexTitle);
-               
                subHeaderElement.appendChild(subHeaderText);
                headerElement.appendChild(subHeaderElement);

-                       
-               
                Element filesElement = xmlDoc.createElement("files"); /* 
filesElement != fileElement */
-
                Element EntriesElement = xmlDoc.createElement("entries");
-               
                EntriesElement.setNodeValue(list.size()+"");
-               outp.write("size = "+list.size()+"\n");
                EntriesElement.setAttribute("value", list.size()+"");
-               //all index files are ready
+
                /* Adding word index */
                Element keywordsElement = xmlDoc.createElement("keywords");
-               //words to be added 
                Vector fileid = new Vector();
                for(int i =0;i<list.size();i++)
                {
                        Element wordElement = xmlDoc.createElement("word");
                        String str = (String) tMap.get(list.elementAt(i));
-                       outp.write("word "+str+"\n");
                        wordElement.setAttribute("v",str );
-                       //FreenetURI[] urisForWord = (FreenetURI[]) 
urisByWord.get(str);
                        Integer[] idsForWord = (Integer[]) idsByWord.get(str);
-//                     
                        for (int j = 0; j < idsForWord.length; j++) {
                                Integer id = idsForWord[j];
-                               //Integer x = (Integer) urisToNumbers.get(uri);
                                Integer x = id;
-                               outp.write("x "+x+"\n");
                                if (x == null) {
                                        Logger.error(this, "Eh?");
                                        continue;
                                }
-//
                                Element uriElement = 
xmlDoc.createElement("file");
                                Element fileElement = 
xmlDoc.createElement("file");
                                uriElement.setAttribute("id", x.toString());
                                fileElement.setAttribute("id", x.toString());
-                               //fileElement.setAttribute("key", 
uri.toString());
-                               outp.write("uri 
"+(idUris.get(id)).toString()+"\n");
                                
fileElement.setAttribute("key",(idUris.get(id)).toString());
-////                           /* Position by position */
-                               //HashMap positionsForGivenWord = 
(HashMap)positionsByWordByURI.get(uri.toString());
+                               /* Position by position */
+
                                HashMap positionsForGivenWord = 
(HashMap)positionsByWordById.get(x);
                                Integer[] positions = 
(Integer[])positionsForGivenWord.get(str);
-
                                StringBuffer positionList = new StringBuffer();

                                for(int k=0; k < positions.length ; k++) {
                                        if(k!=0)
                                                positionList.append(',');
-
                                        
positionList.append(positions[k].toString());
                                }
-                               
                                
uriElement.appendChild(xmlDoc.createTextNode(positionList.toString()));
-                       
                                wordElement.appendChild(uriElement);
                                if(!fileid.contains(x))
                                {
@@ -660,12 +584,8 @@
                                        filesElement.appendChild(fileElement);
                                }
                        }
-                       
-                       //Element keywordsElement = (Element) 
root.getElementsByTagName("keywords").item(0);
                        keywordsElement.appendChild(wordElement);
-                               
                }
-               
                rootElement.appendChild(EntriesElement);
                rootElement.appendChild(headerElement);
                rootElement.appendChild(filesElement);
@@ -682,11 +602,8 @@
                        Logger.error(this, "Spider: Error while serializing XML 
(transformFactory.newTransformer()): "+e.toString());
                        return;
                }
-
-
                serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
                serializer.setOutputProperty(OutputKeys.INDENT,"yes");
-               
                /* final step */
                try {
                        serializer.transform(domSource, resultStream);
@@ -697,152 +614,22 @@

                if(Logger.shouldLog(Logger.MINOR, this))
                        Logger.minor(this, "Spider: indexes regenerated.");
-               outp.close();
        }


-       public String search(String str,NodeList list) throws Exception
-       {
-               int prefix = str.length();
-               for(int i = 0;i<list.getLength();i++){
-                       Element subIndex = (Element) list.item(i);
-                       String key = subIndex.getAttribute("key");
-                       if(key.equals(str)) return key;
-               }
-               return search(str.substring(0, prefix-1),list);
-       }
-
-       
        public void handleGet(HTTPRequest request, ToadletContext context) 
throws IOException, ToadletContextClosedException {
-               String action = request.getParam("action");
-               PageMaker pageMaker = context.getPageMaker();
-               if ((action == null) || (action.length() == 0)) {
-                       MultiValueTable responseHeaders = new MultiValueTable();
-                       responseHeaders.put("Location", "?action=list");
-                       context.sendReplyHeaders(301, "Redirect", 
responseHeaders, "text/html; charset=utf-8", 0);
-                       return;
-               } else if ("list".equals(action)) {
-                       
-                       String listName = request.getParam("listName", null);
-                       HTMLNode pageNode = pageMaker.getPageNode("The XML 
Spider", context);
-                       HTMLNode contentNode = 
pageMaker.getContentNode(pageNode);
-                       /* create copies for multi-threaded use */
-                       if (listName == null) {
-                               Map runningFetches = new 
HashMap(runningFetchesByURI);
-                               List queued = new ArrayList(queuedURIList);
-                               Set visited = new HashSet(visitedURIs);
-                               Set failed = new HashSet(failedURIs);
-                               contentNode.addChild(createNavbar(pageMaker, 
runningFetches.size(), queued.size(), visited.size(), failed.size()));
-                               contentNode.addChild(createAddBox(pageMaker, 
context));
-                               contentNode.addChild(createList(pageMaker, 
"Running FetcheIIIs", "running", runningFetches.keySet(), maxShownURIs));
-                               contentNode.addChild(createList(pageMaker, 
"Queued URIs", "queued", queued, maxShownURIs));
-                               contentNode.addChild(createList(pageMaker, 
"Visited URIs", "visited", visited, maxShownURIs));
-                               contentNode.addChild(createList(pageMaker, 
"Failed URIs", "failed", failed, maxShownURIs));
-                       } else {
-                               contentNode.addChild(createBackBox(pageMaker));
-                               if ("failed".equals(listName)) {
-                                       Set failed = new HashSet(failedURIs);
-                                       
contentNode.addChild(createList(pageMaker, "Failed URIs", "failed", failed, 
-1));       
-                               } else if ("visited".equals(listName)) {
-                                       Set visited = new HashSet(visitedURIs);
-                                       
contentNode.addChild(createList(pageMaker, "Visited URIs", "visited", visited, 
-1));
-                               } else if ("queued".equals(listName)) {
-                                       List queued = new 
ArrayList(queuedURIList);
-                                       
contentNode.addChild(createList(pageMaker, "Queued URIs", "queued", queued, 
-1));
-                               } else if ("running".equals(listName)) {
-                                       Map runningFetches = new 
HashMap(runningFetchesByURI);
-                                       
contentNode.addChild(createList(pageMaker, "Running Fetches", "running", 
runningFetches.keySet(), -1));
-                               }
-                       }
-                       MultiValueTable responseHeaders = new MultiValueTable();
-                       byte[] responseBytes = 
pageNode.generate().getBytes("utf-8");
-                       context.sendReplyHeaders(200, "OK", responseHeaders, 
"text/html; charset=utf-8", responseBytes.length);
-                       context.writeData(responseBytes);
-               } else if ("add".equals(action)) {
-                       String uriParam = request.getParam("key");
-                       try {
-                               FreenetURI uri = new FreenetURI(uriParam);
-                               synchronized (this) {
-                                       failedURIs.remove(uri);
-                                       visitedURIs.remove(uri);
-                               }
-                               queueURI(uri);
-                               startSomeRequests();
-                       } catch (MalformedURLException mue1) {
-                               sendSimpleResponse(context, "URL invalid", "The 
given URI is not valid.");
-                               return;
-                       }
-                       MultiValueTable responseHeaders = new MultiValueTable();
-                       responseHeaders.put("Location", "?action=list");
-                       context.sendReplyHeaders(301, "Redirect", 
responseHeaders, "text/html; charset=utf-8", 0);
-                       return;
-               }
+               /*
+                * ignore
+                */
        }

+       
        public void handlePost(HTTPRequest request, ToadletContext context) 
throws IOException {
+               /*
+                * ignore
+                */
        }

-       private void sendSimpleResponse(ToadletContext context, String title, 
String message) throws ToadletContextClosedException, IOException {
-               PageMaker pageMaker = context.getPageMaker();
-               HTMLNode pageNode = pageMaker.getPageNode(title, context);
-               HTMLNode contentNode = pageMaker.getContentNode(pageNode);
-               HTMLNode infobox = 
contentNode.addChild(pageMaker.getInfobox("infobox-alter", title));
-               HTMLNode infoboxContent = pageMaker.getContentNode(infobox);
-               infoboxContent.addChild("#", message);
-               byte[] responseBytes = pageNode.generate().getBytes("utf-8");
-               context.sendReplyHeaders(200, "OK", new MultiValueTable(), 
"text/html; charset=utf-8", responseBytes.length);
-               context.writeData(responseBytes);
-       }
-       
-       private HTMLNode createBackBox(PageMaker pageMaker) {
-               HTMLNode backbox = pageMaker.getInfobox((String) null);
-               HTMLNode backContent = pageMaker.getContentNode(backbox);
-               backContent.addChild("#", "Return to the ");
-               backContent.addChild("a", "href", "?action=list", "list of all 
URIs");
-               backContent.addChild("#", ".");
-               return backbox;
-       }
-       
-       private HTMLNode createAddBox(PageMaker pageMaker, ToadletContext ctx) {
-               HTMLNode addBox = pageMaker.getInfobox("Add a URI");
-               HTMLNode formNode = 
pageMaker.getContentNode(addBox).addChild("form", new String[] { "action", 
"method" }, new String[] { "", "get" });
-               formNode.addChild("input", new String[] { "type", "name", 
"value" }, new String[] { "hidden", "action", "add" });
-               formNode.addChild("input", new String[] { "type", "size", 
"name", "value" }, new String[] { "text", "40", "key", "" });
-               formNode.addChild("input", new String[] { "type", "value" }, 
new String[] { "submit", "Add URI" });
-               return addBox;
-       }
-
-       private HTMLNode createNavbar(PageMaker pageMaker, int running, int 
queued, int visited, int failed) {
-               HTMLNode navbar = pageMaker.getInfobox("navbar", "Page 
Navigation");
-               HTMLNode list = pageMaker.getContentNode(navbar).addChild("ul");
-               list.addChild("li").addChild("a", "href", "#running", "Running 
(" + running + ')');
-               list.addChild("li").addChild("a", "href", "#queued", "Queued (" 
+ queued + ')');
-               list.addChild("li").addChild("a", "href", "#visited", "Visited 
(" + visited + ')');
-               list.addChild("li").addChild("a", "href", "#failed", "Failed (" 
+ failed + ')');
-               return navbar;
-       }
-
-       private HTMLNode createList(PageMaker pageMaker, String listName, 
String anchorName, Collection collection, int maxCount) {
-               HTMLNode listNode = new HTMLNode("div");
-               listNode.addChild("a", "name", anchorName);
-               HTMLNode listBox = pageMaker.getInfobox(listName);
-               HTMLNode listContent = pageMaker.getContentNode(listBox);
-               listNode.addChild(listBox);
-               Iterator collectionItems = collection.iterator();
-               int itemCount = 0;
-               while (collectionItems.hasNext()) {
-                       FreenetURI uri = (FreenetURI) collectionItems.next();
-                       listContent.addChild("#", uri.toString());
-                       listContent.addChild("br");
-                       if (itemCount++ == maxCount) {
-                               listContent.addChild("br");
-                               listContent.addChild("a", "href", 
"?action=list&listName=" + anchorName, "Show all\u2026");
-                               break;
-                       }
-               }
-               return listNode;
-       }
-
        /**
         * @see freenet.oldplugins.plugin.Plugin#getPluginName()
         */
@@ -912,8 +699,11 @@
         }
         return buf.toString();
     }
-       //this function will return the String representation of the MD5 hash 
for the input string 
-       public static String MD5(String text) throws NoSuchAlgorithmException, 
UnsupportedEncodingException  {
+       
+       /*
+        * calculate the md5 for a given string
+        */
+       private static String MD5(String text) throws NoSuchAlgorithmException, 
UnsupportedEncodingException  {
                MessageDigest md;
                md = MessageDigest.getInstance("MD5");
                byte[] md5hash = new byte[32];
@@ -1033,7 +823,7 @@

 public void runPlugin(PluginRespirator pr){
        this.pr = pr;
-       this.id = 0;
+       this.id = new Integer(0);
        this.core = pr.getNode().clientCore;
        this.ctx = core.makeClient((short) 0).getFetchContext();
        ctx.maxSplitfileBlockRetries = 10;
@@ -1044,9 +834,9 @@
        allowedMIMETypes.add(new String("text/html"));
        allowedMIMETypes.add(new String("text/plain"));
        allowedMIMETypes.add(new String("application/xhtml+xml"));
-//     allowedMIMETypes.add(new String("application/zip"));
+
        ctx.allowedMIMETypes = new HashSet(allowedMIMETypes);
-//     ctx.allowedMIMETypes.add("text/html"); 
+
        tProducedIndex = System.currentTimeMillis();
        indexing = true;
        stopped = false;
@@ -1065,11 +855,12 @@
        starterThread.start();
 }

+/**
+ * Interface to the Spider data
+ */
 public String handleHTTPGet(HTTPRequest request) throws PluginHTTPException{
        StringBuffer out = new StringBuffer();
-       // need to produce pretty html
-       //later fredpluginhttpadvanced will give the interface
-       //this brings us to the page from visit
+       
        String listname = request.getParam("list");
        if(listname.length() != 0)
        {
@@ -1097,6 +888,7 @@
                }
        return out.toString();
 }
+
 private void appendList(String listname, StringBuffer out, String stylesheet)
 {
        Iterator it = (runningFetchesByURI.keySet()).iterator();
@@ -1111,6 +903,7 @@
        while(it.hasNext())
                out.append("<code>"+it.next().toString()+"</code><br/>");
 }
+
 private void appendDefaultPageStart(StringBuffer out, String stylesheet) {

        out.append("<HTML><HEAD><TITLE>" + pluginName + "</TITLE>");
@@ -1128,10 +921,11 @@

        Set failed = new HashSet(failedURIs);
        Iterator it=queued.iterator();
-       out.append("<br/>Size :"+runningFetches.size());
+       out.append("<br/>Size :"+runningFetches.size()+"<br/>");
        appendList(runningFetches,out,stylesheet);
        out.append("<p><a href=\"?list="+"running"+"\">Show all</a><br/></p>");
-       out.append("<br/>Size :"+queued.size());
+       out.append("<p><h3>Queued URIs</h3></p>");
+       out.append("<br/>Size :"+queued.size()+"<br/>");
        int i = 0;
        while(it.hasNext()){
                if(i<=maxShownURIs){
@@ -1141,15 +935,19 @@
                i++;
        }
        out.append("<p><a href=\"?list="+"queued"+"\">Show all</a><br/></p>");
-       out.append("<br/>Size :"+visited.size());
+       out.append("<p><h3>Visited URIs</h3></p>");
+       out.append("<br/>Size :"+visited.size()+"<br/>");
        appendList(visited,out,stylesheet);
        out.append("<p><a href=\"?list="+"visited"+"\">Show all</a><br/></p>");
-       out.append("<br/>Size :"+failed.size());
+       out.append("<p><h3>Failed URIs</h3></p>");
+       out.append("<br/>Size :"+failed.size()+"<br/>");
        appendList(failed,out,stylesheet);
        out.append("<p><a href=\"?list="+"failed"+"\">Show all</a><br/></p>");


 }
+
+
 private void appendDefaultHeader(StringBuffer out, String stylesheet){
        out.append("<HTML><HEAD><TITLE>" + pluginName + "</TITLE>");
        if(stylesheet != null)
@@ -1160,6 +958,8 @@
        out.append("<form method=\"GET\"><input type=\"text\" name=\"adduri\" 
/><br/><br/>");
        out.append("<input type=\"submit\" value=\"Add uri\" /></form>");
 }
+
+
 private void appendList(Set  list,StringBuffer out, String stylesheet){
        Iterator it = list.iterator();
        int i = 0;
@@ -1168,169 +968,130 @@
                out.append("<code>"+it.next().toString()+"</code><br/>");
                }
                else{
-                       //out.append("<form method=\"GET\"><input 
type=\"submit\" name=\"Showall\" />");
-//                     if(listname.equals("visited"))
-//                     out.append("<p><a href=\"?list="+listname+">Showall 
visited</a><br/></p>");
-//                     if(listname.equals("failed"))
-//                             out.append("<p><a 
href=\"?list="+listname+">Showall failed</a><br/></p>");
                        break;
                }
                i++;
-               
+                       }
        }
-       
-}

+/**
+ * creates the callback object for each page.
+ *<p>Used to create inlinks and outlinks for each page separately.
+ * @author swati
+ *
+ */
 public class PageCallBack implements FoundURICallback{
-       int id;
-               
+       Integer id;
+       /*
+        * id of the page as referenced in uriIds
+        */     
        PageCallBack(){
-               id = 0;
+               id = new Integer(0);
        }
+       
        public void foundURI(FreenetURI uri){
-               //now we have the id of the page that had called this link
+
                queueURI(uri);
-               int iduri = (Integer) uriIds.get(uri);
-               Vector outlink = (Vector) outlinks.get(id);
-               if(!outlink.contains(iduri))    
+               Integer iduri = (Integer) uriIds.get(uri);
+
+               if(outlinks.containsKey(id)){
+                       Vector outlink = (Vector) outlinks.get(id);
+                       if(!outlink.contains(iduri))    
+                               outlink.add(iduri);
+                       outlinks.remove(id);
+                       outlinks.put(id, outlink);
+               }
+               else 
+               {
+                       Vector outlink = new Vector();
                        outlink.add(iduri);
-               outlinks.remove(id);
-               outlinks.put(id, outlink);
-               try{
-               FileWriter out = new FileWriter("outlink",true);
-               out.write(" id "+id+" size "+ outlink.size()+" \n");
-               out.close();
-               }catch(Exception e){}
+                       outlinks.put(id, outlink);
+               }

                if(inlinks.containsKey(iduri)){
                        Vector inlink = (Vector) inlinks.get(iduri);
-                       try{
-                               FileWriter out = new FileWriter("inlink",true);
-                               out.write(" id "+iduri+" size "+ 
inlink.size()+" \n");
-                               out.close();
-                               }catch(Exception e){}
-               
                        if(!inlink.contains(id)) inlink.add(id);
                        inlinks.remove(iduri);
                        inlinks.put(iduri, inlink);
-                       
                }
+               else 
+               {
+                       Vector inlink = new Vector();
+                       inlink.add(id);
+                       inlinks.put(iduri, inlink);
+               }
+
                startSomeRequests();
        }
+       
+       
        public void onText(String s, String type, URI baseURI){
-               try{
-                       FileWriter outp = new FileWriter("ontext",true);
-                       outp.write("inside on text with id"+id+" \n");
-                       outp.close();
-               }catch(Exception e){}
-//             FreenetURI uri;
-//             try {
-//                     uri = new FreenetURI(baseURI.getPath().substring(1));
-//             } catch (MalformedURLException e) {
-//                     Logger.error(this, "Caught " + e, e);
-//                     return;
-//             }
-                
-               
-      
+
                if((type != null) && (type.length() != 0) && 
type.toLowerCase().equals("title")
-                  && (s != null) && (s.length() != 0) && (s.indexOf('\n') < 
0)) {
+                               && (s != null) && (s.length() != 0) && 
(s.indexOf('\n') < 0)) {
                        /* We should have a correct title */
-               //      titlesOfURIs.put(uri.toString(), s);
                        titlesOfIds.put(id, s);
-                       
                        type = "title";
-                       
                }
                else type = null;

-
                String[] words = s.split("[^A-Za-z0-9]");
-
                Integer lastPosition = null;
-
-               //lastPosition = (Integer)lastPositionByURI.get(uri.toString());
                lastPosition = (Integer)lastPositionById.get(id);
+
                if(lastPosition == null)
                        lastPosition = new Integer(1); /* We start to count 
from 1 */
-
                for (int i = 0; i < words.length; i++) {
                        String word = words[i];
                        if ((word == null) || (word.length() == 0))
                                continue;
                        word = word.toLowerCase();
                        try{
-                       if(type == null)
-                               addWord(word, lastPosition.intValue() + i, id);
-                       else
-                               addWord(word, -1 * (i+1), id);
+                               if(type == null)
+                                       addWord(word, lastPosition.intValue() + 
i, id);
+                               else
+                                       addWord(word, -1 * (i+1), id);
                        }
                        catch (Exception e){}
                }
-               
+
                if(type == null) {
                        lastPosition = new Integer(lastPosition.intValue() + 
words.length);
-               //      lastPositionByURI.put(uri.toString(), lastPosition);
                        lastPositionById.put(id, lastPosition);
                }
-               
+
        }
-private synchronized void addWord(String word, int position,int id) throws 
Exception{
-               
-               
+       
+       private synchronized void addWord(String word, int position,Integer id) 
throws Exception{
                if(word.length() < 3)
                        return;
-               
-               //word = word.intern();

-
-               //FreenetURI[] uris = (FreenetURI[]) urisByWord.get(word);
                Integer[] ids = (Integer[]) idsByWord.get(word);
-               
-       //      urisWithWords.add(uri);
                idsWithWords.add(id);
-               try{
-                       FileWriter outp = new FileWriter("addWord",true);
-                       outp.write("ID ="+id+" uri ="+idUris.get(id)+"\n");
-                       outp.close();
-               }catch(Exception e){}
-//     FileWriter outp = new FileWriter("uricheck",true);
-//     outp.write(uri.getDocName()+"\n");
-//     outp.write(uri.getKeyType()+"\n");
-//     outp.write(uri.getMetaString()+"\n");
-//     outp.write(uri.getGuessableKey()+"\n");
-//     outp.write(uri.hashCode()+"\n");
-//     outp.write(uri.getPreferredFilename()+"\n");
-//     
-//     outp.close();

                /* Word position indexation */
                HashMap wordPositionsForOneUri = 
(HashMap)positionsByWordById.get(id); /* For a given URI, take as key a word, 
and gives position */
-               
                if(wordPositionsForOneUri == null) {
                        wordPositionsForOneUri = new HashMap();
                        wordPositionsForOneUri.put(word, new Integer[] { new 
Integer(position) });
-                       //positionsByWordByURI.put(uri.toString(), 
wordPositionsForOneUri);
                        positionsByWordById.put(id, wordPositionsForOneUri);
-               } else {
+               } 
+               else {
                        Integer[] positions = 
(Integer[])wordPositionsForOneUri.get(word);
-
                        if(positions == null) {
                                positions = new Integer[] { new 
Integer(position) };
                                wordPositionsForOneUri.put(word, positions);
-                       } else {
+                       } 
+                       else {
                                Integer[] newPositions = new 
Integer[positions.length + 1];
-
                                System.arraycopy(positions, 0, newPositions, 0, 
positions.length);
                                newPositions[positions.length] = new 
Integer(position);
-
                                wordPositionsForOneUri.put(word, newPositions);
                        }
                }
-       
+
                if (ids == null) {
                        idsByWord.put(word, new Integer[] { id });
-                       
                } else {
                        for (int i = 0; i < ids.length; i++) {
                                if (ids[i].equals(id))
@@ -1341,30 +1102,29 @@
                        newIDs[ids.length] = id;
                        idsByWord.put(word, newIDs);
                }
-               //the new word is added here in urisByWord
+
                tMap.put(MD5(word), word);
                long time_indexing = System.currentTimeMillis();
                if (tProducedIndex + minTimeBetweenEachIndexRewriting * 10 < 
System.currentTimeMillis()) {
                        try {
-                               //produceIndex();
-                               //check();
-                               
                                if(indexing){
-                               generateIndex2();
-                               produceIndex2();
-                               if((System.currentTimeMillis() - 
time_indexing)/(System.currentTimeMillis() - tProducedIndex) > 
MAX_TIME_SPENT_INDEXING) indexing= false;
-                               else indexing = true;
+                                       generateIndex2();
+                                       produceIndex2();
+                                       /*
+                                        * ensures that index production 
doesn't eat up the processor time 
+                                        */
+                                       if((System.currentTimeMillis() - 
time_indexing)/(System.currentTimeMillis() - tProducedIndex) > 
MAX_TIME_SPENT_INDEXING) indexing= false;
+                                       else indexing = true;
                                }
-                               
                        } catch (IOException e) {
                                Logger.error(this, "Caught " + e + " while 
creating index", e);
                        }
                        tProducedIndex = System.currentTimeMillis();
                }
-               
        }
-       
 }
+
+
 public String handleHTTPPut(HTTPRequest request) throws PluginHTTPException{
        return null;
 }
@@ -1379,5 +1139,4 @@
        queueURI(uri);
 }

-       
 }


Reply via email to