Author: swatig0
Date: 2007-08-16 20:47:44 +0000 (Thu, 16 Aug 2007)
New Revision: 14731
Modified:
trunk/plugins/XMLSpider/XMLSpider.java
Log:
Track outlinks and inlinks for each page
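
This revision hands each fetched page its own PageCallBack, so the content filter reports links against the id of the page being parsed; for every page id the spider then records which ids it links to (outlinks) and which ids link to it (inlinks). A minimal sketch of that bookkeeping, condensed from the committed foundURI code (the recordLink helper itself is hypothetical):

    // outlinks: page id -> Vector of ids this page links to
    // inlinks:  page id -> Vector of ids linking to this page
    private synchronized void recordLink(Integer from, Integer to) {
        Vector out = (Vector) outlinks.get(from);
        if (out == null) out = new Vector();
        if (!out.contains(to)) out.add(to);
        outlinks.put(from, out);

        Vector in = (Vector) inlinks.get(to);
        if (in == null) in = new Vector();
        if (!in.contains(from)) in.add(from);
        inlinks.put(to, in);
    }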
Modified: trunk/plugins/XMLSpider/XMLSpider.java
===================================================================
--- trunk/plugins/XMLSpider/XMLSpider.java 2007-08-16 20:45:08 UTC (rev 14730)
+++ trunk/plugins/XMLSpider/XMLSpider.java 2007-08-16 20:47:44 UTC (rev 14731)
@@ -4,6 +4,7 @@
package plugins.XMLSpider;
import java.io.File;
+import java.io.FileWriter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
@@ -12,11 +13,14 @@
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
+import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.Vector;
@@ -29,10 +33,11 @@
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
-
+import org.w3c.dom.Attr;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import freenet.client.ClientMetadata;
@@ -44,6 +49,7 @@
import freenet.client.async.ClientCallback;
import freenet.client.async.ClientGetter;
import freenet.client.async.USKCallback;
+import freenet.clients.http.PageMaker;
import freenet.clients.http.ToadletContext;
import freenet.clients.http.ToadletContextClosedException;
import freenet.clients.http.filter.ContentFilter;
@@ -61,13 +67,15 @@
import freenet.pluginmanager.FredPluginThreadless;
import freenet.pluginmanager.PluginHTTPException;
import freenet.pluginmanager.PluginRespirator;
+import freenet.support.HTMLNode;
import freenet.support.Logger;
+import freenet.support.MultiValueTable;
import freenet.support.api.Bucket;
import freenet.support.api.HTTPRequest;
/**
- * XMLSpider. Produces xml index for searching words.
- * In case the size of the index grows up a specific threshold the index is split into several subindices.
+ * XMLSpider. Produces an index for searching words.
+ * In case the size of the index grows beyond a specific threshold, the index is split into several subindices.
* The indexing key is the md5 hash of the word.
*
* @author swati goyal
@@ -76,70 +84,44 @@
public class XMLSpider implements FredPlugin, FredPluginHTTP, FredPluginThreadless, FredPluginHTTPAdvanced, HttpPlugin, ClientCallback, USKCallback{
long tProducedIndex;
- /**
- * Stores the found words along with md5
- */
- public TreeMap tMap = new TreeMap();
+ private TreeMap tMap = new TreeMap();
int count;
// URIs visited, or fetching, or queued. Added once then forgotten about.
- /**
- *
- * Lists the uris that have been vistied by the spider
- */
- public final HashSet visitedURIs = new HashSet();
- private final HashSet idsWithWords = new HashSet();
- /**
- *
- * Lists the uris that were visited but failed.
- */
- public final HashSet failedURIs = new HashSet();
-
+ private final HashSet visitedURIs = new HashSet();
+ private final HashSet urisWithWords = new HashSet();
+ private final HashSet idsWithWords = new HashSet();
+ private final HashSet failedURIs = new HashSet();
private final HashSet queuedURISet = new HashSet();
- /**
- *
- * Lists the uris that are still queued.
- */
- public final LinkedList queuedURIList = new LinkedList();
+ private final LinkedList queuedURIList = new LinkedList();
private final HashMap runningFetchesByURI = new HashMap();
-
+ private final HashMap urisByWord = new HashMap();
private final HashMap idsByWord = new HashMap();
-
+ private final HashMap titlesOfURIs = new HashMap();
private final HashMap titlesOfIds = new HashMap();
private final HashMap uriIds = new HashMap();
private final HashMap idUris = new HashMap();
- /**
- * Lists the outlinks from a particular page,
- * </br> indexed by the id of page uri
- */
- public final HashMap outlinks = new HashMap();
- /**
- * Lists the inlinks to a particular page,
- * indexed by the id of page uri.
- */
- public final HashMap inlinks = new HashMap();
+ private final HashMap outlinks = new HashMap();
+ private final HashMap inlinks = new HashMap();
private Vector indices;
private int match;
- private Integer id;
-
+ private int id;
+ private Vector list;
private boolean indexing ;
private static final int minTimeBetweenEachIndexRewriting = 10;
/**
- * directory where the generated indices are stored.
+ * DEFAULT_INDEX_DIR is the directory where the generated indices are stored.
* Needs to be created before it can be used
*/
- public static final String DEFAULT_INDEX_DIR = "myindex4/";
- /**
- * Lists the allowed mime types of the fetched page.
- */
+ private static final String DEFAULT_INDEX_DIR = "myindex4/";
public Set allowedMIMETypes;
- private static final int MAX_ENTRIES = 10;
+ private static final int MAX_ENTRIES = 2;
private static final String pluginName = "XML spider";
/**
- * Gives the allowed fraction of total time spent on generating indices with
- * maximum value = 1; minimum value = 0.
+ * This gives the allowed fraction of total time spent on generating indices
+ * max value = 1; min value > 0.
*/
- public static final double MAX_TIME_SPENT_INDEXING = 0.5;
+ private static final double MAX_TIME_SPENT_INDEXING = 0.5;
private static final String indexTitle= "XMLSpider index";
private static final String indexOwner = "Freenet";
@@ -153,46 +135,47 @@
// Can have many; this limit only exists to save memory.
private static final int maxParallelRequests = 100;
private int maxShownURIs = 15;
-
+ private HashMap urisToNumbers;
private NodeClientCore core;
private FetchContext ctx;
private final short PRIORITY_CLASS = RequestStarter.BULK_SPLITFILE_PRIORITY_CLASS;
private boolean stopped = true;
PluginRespirator pr;
-/**
- * Adds the found uri to the list of to-be-retrieved uris. <p>Every usk uri added as ssk.
- * @param uri the new uri that needs to be fetched for further indexing
- */
- public synchronized void queueURI(FreenetURI uri) {
+
+ private synchronized void queueURI(FreenetURI uri) {
+ //not adding the html condition
if((uri.getKeyType()).equals("USK")){
if(uri.getSuggestedEdition() < 0)
uri = uri.setSuggestedEdition((-1)* uri.getSuggestedEdition());
try{
- uri = ((USK.create(uri)).getSSK()).getURI();
- (ctx.uskManager).subscribe(USK.create(uri),this, false, this);
+ uri = ((USK.create(uri)).getSSK()).getURI();
+ //all uris are added as ssk
+ (ctx.uskManager).subscribe(USK.create(uri),this, false, this);
}
catch(Exception e){}
}
-
+
if ((!visitedURIs.contains(uri)) && queuedURISet.add(uri)) {
queuedURIList.addLast(uri);
visitedURIs.add(uri);
uriIds.put(uri, id);
idUris.put(id, uri);
- id = new Integer(id.intValue()+1);
+ id++;
+
+ //the page object of the client will contain the uri of the current page
}
}
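
Besides queueing, this method now assigns each new URI the next integer page id and records the mapping in both directions, so later stages can translate between pages and ids; in outline (a sketch, not the committed lines):

    // uriIds : FreenetURI -> Integer,  idUris : Integer -> FreenetURI
    uriIds.put(uri, new Integer(id));
    idUris.put(new Integer(id), uri);
    id++; // `id` is a plain int counter as of this revision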
private void startSomeRequests() {
-
+
FreenetURI[] initialURIs = core.bookmarkManager.getBookmarkURIs();
for (int i = 0; i < initialURIs.length; i++)
{
- queueURI(initialURIs[i]);
+ queueURI(initialURIs[i]);
}
-
+
ArrayList toStart = null;
synchronized (this) {
if (stopped) {
@@ -200,31 +183,51 @@
}
int running = runningFetchesByURI.size();
int queued = queuedURIList.size();
-
+
if ((running >= maxParallelRequests) || (queued == 0))
return;
-
+
toStart = new ArrayList(Math.min(maxParallelRequests - running, queued));
-
+
for (int i = running; i < maxParallelRequests; i++) {
if (queuedURIList.isEmpty())
break;
FreenetURI uri = (FreenetURI) queuedURIList.removeFirst();
queuedURISet.remove(uri);
+// if((uri.getKeyType()).equals("USK")){
+// if(uri.getSuggestedEdition() < 0)
+// uri = uri.setSuggestedEdition((-1)* uri.getSuggestedEdition());
+// try{
+// (ctx.uskManager).subscribe(USK.create(uri),this, false, this);
+// }catch(Exception e){
+//
+// }
+
+ // }
ClientGetter getter = makeGetter(uri);
toStart.add(getter);
- }
+ }
}
- for (int i = 0; i < toStart.size(); i++) {
-
+ for (int i = 0; i < toStart.size(); i++) {
+
ClientGetter g = (ClientGetter) toStart.get(i);
try {
runningFetchesByURI.put(g.getURI(), g);
g.start();
- } catch (FetchException e) {
- onFailure(e, g);
+ FileWriter outp = new FileWriter("logfile2",true);
+ outp.write("URI "+g.getURI().toString()+'\n');
+
+ outp.close();
+ } catch (FetchException e) {
+ onFailure(e, g);
+ }
+ catch (IOException e){
+ Logger.error(this, "the logfile can not be written"+e.toString(), e);
+ }
+
}
- }
+ //}
+
}
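
Note the ad-hoc debug logging added here and below ("logfile2", "logfile", "indexing", "gen", "outlink", "inlink", "ontext", "addWord"): each site opens a FileWriter in append mode, writes one line, and closes it. A hypothetical helper that the repeated pattern could share (debugLog is not part of the commit):

    private void debugLog(String file, String line) {
        try {
            FileWriter w = new FileWriter(file, true); // append, create if missing
            w.write(line + '\n');
            w.close();
        } catch (IOException e) {
            Logger.error(this, "could not write " + file + ": " + e, e);
        }
    }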
@@ -232,12 +235,7 @@
ClientGetter g = new ClientGetter(this, core.requestStarters.chkFetchScheduler, core.requestStarters.sskFetchScheduler, uri, ctx, PRIORITY_CLASS, this, null, null);
return g;
}
-/**
- * Processes the successfully fetched uri for further outlinks.
- *
- * @param result
- * @param state
- */
+
public void onSuccess(FetchResult result, ClientGetter state) {
FreenetURI uri = state.getURI();
@@ -248,16 +246,22 @@
ClientMetadata cm = result.getMetadata();
Bucket data = result.asBucket();
String mimeType = cm.getMIMEType();
-
+
sizeOfURIs.put(uri.toString(), new Long(data.size()));
mimeOfURIs.put(uri.toString(), mimeType);
PageCallBack page = new PageCallBack();
page.id = (Integer) uriIds.get(uri);
inlinks.put(page.id, new Vector());
outlinks.put(page.id, new Vector());
-
- //instead of passing the current object, the pagecallback object for every page is passed to the content filter
- // this is to allow inlinks and outlinks be indexed by specific pages
+
+ try{
+ FileWriter output = new FileWriter("logfile",true);
+ output.write(uri.toString()+" page " + page.id +"\n");
+ output.close();
+ }
+ catch(Exception e){
+ Logger.error(this, "Could not write to the logfile "+e.toString(), e);
+ }
try {
ContentFilter.filter(data, ctx.bucketFactory, mimeType, uri.toURI("http://127.0.0.1:8888/"), page);
} catch (UnsafeContentTypeException e) {
@@ -273,15 +277,25 @@
public void onFailure(FetchException e, ClientGetter state) {
FreenetURI uri = state.getURI();
-
+// try{
+// FileWriter outp = new FileWriter("failed",true);
+// outp.write("failed "+e.toString()+" for "+uri+'\n');
+// outp.close();
+//
+// }catch(Exception e2){
+//
+// }
synchronized (this) {
runningFetchesByURI.remove(uri);
failedURIs.add(uri);
}
if (e.newURI != null)
queueURI(e.newURI);
-
+// else
+// queueURI(uri);
startSomeRequests();
+
+
}
public void onSuccess(BaseClientPutter state) {
@@ -307,12 +321,19 @@
*/
private synchronized void produceIndex2() throws IOException,NoSuchAlgorithmException {
// Produce the main index file.
-
+
+ //the number of bits to consider for matching
+
+
+// if (urisByWord.isEmpty() || urisWithWords.isEmpty()) {
+// System.out.println("No URIs with words");
+// return;
+// }
+
if (idsByWord.isEmpty() || idsWithWords.isEmpty()) {
System.out.println("No URIs with words");
return;
}
- //the main index file
File outputFile = new File(DEFAULT_INDEX_DIR+"index.xml");
StreamResult resultStream;
resultStream = new StreamResult(outputFile);
@@ -330,7 +351,7 @@
try {
xmlBuilder = xmlFactory.newDocumentBuilder();
} catch(javax.xml.parsers.ParserConfigurationException e) {
-
+ /* Will (should ?) never happen */
Logger.error(this, "Spider: Error while initializing XML generator: "+e.toString());
return;
}
@@ -346,42 +367,57 @@
/* -> title */
Element subHeaderElement = xmlDoc.createElement("title");
Text subHeaderText = xmlDoc.createTextNode(indexTitle);
-
+
subHeaderElement.appendChild(subHeaderText);
headerElement.appendChild(subHeaderElement);
/* -> owner */
subHeaderElement = xmlDoc.createElement("owner");
subHeaderText = xmlDoc.createTextNode(indexOwner);
-
+
subHeaderElement.appendChild(subHeaderText);
headerElement.appendChild(subHeaderElement);
-
+
/* -> owner email */
if(indexOwnerEmail != null) {
subHeaderElement = xmlDoc.createElement("email");
subHeaderText = xmlDoc.createTextNode(indexOwnerEmail);
-
+
subHeaderElement.appendChild(subHeaderText);
headerElement.appendChild(subHeaderElement);
}
- /*
- * the max number of digits in md5 to be used for matching with the search query is stored in the xml
- */
+
+
+ //String[] words = (String[]) urisByWord.keySet().toArray(new String[urisByWord.size()]);
+ //Arrays.sort(words);
+
Element prefixElement = xmlDoc.createElement("prefix");
+ //prefixElement.setAttribute("value",match+"");
+ //this match will be set after processing the TreeMap
+
+
+
+ //all index files are ready
/* Adding word index */
Element keywordsElement = xmlDoc.createElement("keywords");
for(int i = 0;i<indices.size();i++){
-
+
//generateSubIndex(DEFAULT_INDEX_DIR+"index_"+Integer.toHexString(i)+".xml");
Element subIndexElement = xmlDoc.createElement("subIndex");
+// if(i<=9)
+// subIndexElement.setAttribute("key",i+"");
+// else
+// subIndexElement.setAttribute("key",Integer.toHexString(i));
subIndexElement.setAttribute("key", (String) indices.elementAt(i));
//the subindex element key will contain the bits used for matching in that subindex
keywordsElement.appendChild(subIndexElement);
}
-
+
prefixElement.setAttribute("value",match+"");
+ // make sure that prefix is the first child of root Element
rootElement.appendChild(prefixElement);
rootElement.appendChild(headerElement);
+
+ //rootElement.appendChild(filesElement);
rootElement.appendChild(keywordsElement);
/* Serialization */
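
The main index produced here therefore has roughly the following shape (a sketch; the root element is created earlier in the method and is assumed here to be main_index, and each subIndex key is the md5 prefix served by index_<key>.xml):

    <main_index>
      <prefix value="2"/>
      <header>
        <title>XMLSpider index</title>
        <owner>Freenet</owner>
      </header>
      <keywords>
        <subIndex key="a"/>
        <subIndex key="b3"/>
      </keywords>
    </main_index>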
@@ -398,7 +434,7 @@
serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
serializer.setOutputProperty(OutputKeys.INDENT,"yes");
-
+
/* final step */
try {
serializer.transform(domSource, resultStream);
@@ -409,11 +445,12 @@
if(Logger.shouldLog(Logger.MINOR, this))
Logger.minor(this, "Spider: indexes regenerated.");
+
+ //the main xml file is generated
+ //now as each word is generated enter it into the respective subindex
+ //now the parsing will start and nodes will be added as needed
+
- //The main xml file is generated
- //As each word is generated enter it into the respective
subindex
- //The parsing will start and nodes will be added as needed
-
}
/**
* Generates the subindices.
@@ -423,36 +460,48 @@
* @throws Exception
*/
private synchronized void generateIndex2() throws Exception{
- //using the tMap generate the xml indices
+ // now we have the tree map; use the sorted md5 keys to generate the xml indices
+
+
if (idsByWord.isEmpty() || idsWithWords.isEmpty()) {
System.out.println("No URIs with words");
return;
}
+ // FreenetURI[] uris = (FreenetURI[]) urisWithWords.toArray(new FreenetURI[urisWithWords.size()]);
+ Integer[] ids = (Integer[]) idsWithWords.toArray(new Integer[idsWithWords.size()]);
+// urisToNumbers = new HashMap();
+// for (int i = 0; i < uris.length; i++) {
+// urisToNumbers.put(uris[i], new Integer(i));
+// }
indices = new Vector();
int prefix = 1;
match = 1;
Vector list = new Vector();
+ //String str = tMap.firstKey();
Iterator it = tMap.keySet().iterator();
-
+ FileWriter outp = new FileWriter("indexing");
+ outp.write("size = "+tMap.size()+"\n");
+ outp.close();
String str = (String) it.next();
- int i = 0;
+ int i = 0,index =0;
while(it.hasNext())
{
+ outp = new FileWriter("indexing",true);
String key =(String) it.next();
- //create a list of the words to be added in the same subindex
+ outp.write(key + "\n");
+ outp.close();
if(key.substring(0, prefix).equals(str.substring(0, prefix)))
- {i++;
- list.add(key);
- }
+ {i++;
+ list.add(key);
+ }
else {
- //generate the appropriate subindex with the current list
- generateSubIndex(prefix,list);
- str = key;
- list = new Vector();
- }
+ generateSubIndex(prefix,list);
+ str = key;
+ list = new Vector();
}
-
+ }
+
generateSubIndex(prefix,list);
}
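
generateIndex2 walks the md5-sorted keys of tMap, grouping consecutive keys that share a one-character prefix, and generateSubIndex below recursively lengthens the shared prefix until each group fits in MAX_ENTRIES. The recursion reduced to its core (a sketch using java.util.List; writeSubIndex stands in for generateXML):

    // keys: sorted md5 hex strings; prefix: number of leading chars shared so far
    void split(List keys, int prefix) throws Exception {
        if (keys.size() < MAX_ENTRIES) {
            writeSubIndex(keys, prefix); // emits index_<shared prefix>.xml
            return;
        }
        if (match <= prefix) match = prefix + 1; // longest prefix used anywhere
        int start = 0;
        for (int i = 1; i <= keys.size(); i++) {
            if (i == keys.size()
                    || !((String) keys.get(i)).startsWith(((String) keys.get(start)).substring(0, prefix + 1))) {
                split(keys.subList(start, i), prefix + 1);
                start = i;
            }
        }
    }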
private synchronized Vector subVector(Vector list, int begin, int end){
@@ -462,19 +511,15 @@
}
private synchronized void generateSubIndex(int p,Vector list) throws Exception{
- /*
- * if the list is less than max allowed entries in a file then directly generate the xml
- * otherwise split the list into further sublists
- * and iterate till the number of entries per subindex is less than the allowed value
- */
-
+
if(list.size() < MAX_ENTRIES)
- {
+ {
+ //the index can be generated from this list
generateXML(list,p);
}
else
{
- //prefix needs to be incremented
+ //this means that prefix needs to be incremented
if(match <= p) match = p+1;
int prefix = p+1;
int i =0;
@@ -484,30 +529,35 @@
{
String key = (String) list.elementAt(i);
if((key.substring(0, prefix)).equals(str.substring(0, prefix)))
- {
+ {
+ //index = i;
i++;
- }
+ }
else {
+
//generateXML(subVector(list,index,i-1),prefix);
generateSubIndex(prefix,subVector(list,index,i-1));
index = i;
str = key;
}
+
+
}
generateSubIndex(prefix,subVector(list,index,i-1));
}
}
+
-/**
- * generates the xml index with the given list of words with prefix number of matching bits in md5
- * @param list list of the words to be added in the index
- * @param prefix number of matching bits of md5
- * @throws Exception
- */
- public synchronized void generateXML (Vector list, int prefix) throws Exception
+ private synchronized void generateXML (Vector list, int prefix) throws Exception
{
+ FileWriter outp = new FileWriter("gen",true);
+
+
String p = ((String) list.elementAt(0)).substring(0, prefix);
+ outp.write("inside gen xml + "+p+"\n");
+
indices.add(p);
File outputFile = new File(DEFAULT_INDEX_DIR+"index_"+p+".xml");
+ //indices.add(p);
StreamResult resultStream;
resultStream = new StreamResult(outputFile);
@@ -517,66 +567,92 @@
DocumentBuilder xmlBuilder = null;
DOMImplementation impl = null;
Element rootElement = null;
+
xmlFactory = DocumentBuilderFactory.newInstance();
+
try {
xmlBuilder = xmlFactory.newDocumentBuilder();
} catch(javax.xml.parsers.ParserConfigurationException e) {
+ /* Will (should ?) never happen */
Logger.error(this, "Spider: Error while initializing XML generator: "+e.toString());
return;
}
+
impl = xmlBuilder.getDOMImplementation();
+
/* Starting to generate index */
+
xmlDoc = impl.createDocument(null, "sub_index", null);
rootElement = xmlDoc.getDocumentElement();
/* Adding header to the index */
Element headerElement = xmlDoc.createElement("header");
+
/* -> title */
Element subHeaderElement = xmlDoc.createElement("title");
Text subHeaderText = xmlDoc.createTextNode(indexTitle);
+
subHeaderElement.appendChild(subHeaderText);
headerElement.appendChild(subHeaderElement);
+
+
Element filesElement = xmlDoc.createElement("files"); /* filesElement != fileElement */
+
Element EntriesElement = xmlDoc.createElement("entries");
+
EntriesElement.setNodeValue(list.size()+"");
+ outp.write("size = "+list.size()+"\n");
EntriesElement.setAttribute("value", list.size()+"");
-
+ //all index files are ready
/* Adding word index */
Element keywordsElement = xmlDoc.createElement("keywords");
+ //words to be added
Vector fileid = new Vector();
for(int i =0;i<list.size();i++)
{
Element wordElement = xmlDoc.createElement("word");
String str = (String) tMap.get(list.elementAt(i));
+ outp.write("word "+str+"\n");
wordElement.setAttribute("v",str );
+ //FreenetURI[] urisForWord = (FreenetURI[]) urisByWord.get(str);
Integer[] idsForWord = (Integer[]) idsByWord.get(str);
+//
for (int j = 0; j < idsForWord.length; j++) {
Integer id = idsForWord[j];
+ //Integer x = (Integer) urisToNumbers.get(uri);
Integer x = id;
+ outp.write("x "+x+"\n");
if (x == null) {
Logger.error(this, "Eh?");
continue;
}
+//
Element uriElement = xmlDoc.createElement("file");
Element fileElement = xmlDoc.createElement("file");
uriElement.setAttribute("id", x.toString());
fileElement.setAttribute("id", x.toString());
+ //fileElement.setAttribute("key", uri.toString());
+ outp.write("uri "+(idUris.get(id)).toString()+"\n");
fileElement.setAttribute("key",(idUris.get(id)).toString());
- /* Position by position */
-
+//// /* Position by position */
+ //HashMap positionsForGivenWord = (HashMap)positionsByWordByURI.get(uri.toString());
HashMap positionsForGivenWord = (HashMap)positionsByWordById.get(x);
Integer[] positions = (Integer[])positionsForGivenWord.get(str);
+
StringBuffer positionList = new StringBuffer();
for(int k=0; k < positions.length ; k++) {
if(k!=0)
positionList.append(',');
+
positionList.append(positions[k].toString());
}
+
uriElement.appendChild(xmlDoc.createTextNode(positionList.toString()));
+
wordElement.appendChild(uriElement);
if(!fileid.contains(x))
{
@@ -584,8 +660,12 @@
filesElement.appendChild(fileElement);
}
}
+
+ //Element keywordsElement = (Element) root.getElementsByTagName("keywords").item(0);
keywordsElement.appendChild(wordElement);
+
}
+
rootElement.appendChild(EntriesElement);
rootElement.appendChild(headerElement);
rootElement.appendChild(filesElement);
@@ -602,8 +682,11 @@
Logger.error(this, "Spider: Error while serializing XML (transformFactory.newTransformer()): "+e.toString());
return;
}
+
+
serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
serializer.setOutputProperty(OutputKeys.INDENT,"yes");
+
/* final step */
try {
serializer.transform(domSource, resultStream);
@@ -614,22 +697,152 @@
if(Logger.shouldLog(Logger.MINOR, this))
Logger.minor(this, "Spider: indexes regenerated.");
+ outp.close();
}
+ public String search(String str,NodeList list) throws Exception
+ {
+ int prefix = str.length();
+ for(int i = 0;i<list.getLength();i++){
+ Element subIndex = (Element) list.item(i);
+ String key = subIndex.getAttribute("key");
+ if(key.equals(str)) return key;
+ }
+ return search(str.substring(0, prefix-1),list);
+ }
+
+
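
search() resolves a query hash to its subindex by repeatedly shortening the hash until it equals one of the subIndex keys from the main index; note it assumes some key will eventually match, otherwise the recursion runs past the empty string. A hypothetical caller (word is the query term and xmlDoc a parsed index.xml):

    String hash = MD5(word); // 32-char hex string of the query word
    String key = search(hash, xmlDoc.getElementsByTagName("subIndex"));
    File subIndexFile = new File(DEFAULT_INDEX_DIR + "index_" + key + ".xml");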
public void handleGet(HTTPRequest request, ToadletContext context) throws IOException, ToadletContextClosedException {
- /*
- * ignore
- */
+ String action = request.getParam("action");
+ PageMaker pageMaker = context.getPageMaker();
+ if ((action == null) || (action.length() == 0)) {
+ MultiValueTable responseHeaders = new MultiValueTable();
+ responseHeaders.put("Location", "?action=list");
+ context.sendReplyHeaders(301, "Redirect", responseHeaders, "text/html; charset=utf-8", 0);
+ return;
+ } else if ("list".equals(action)) {
+
+ String listName = request.getParam("listName", null);
+ HTMLNode pageNode = pageMaker.getPageNode("The XML Spider", context);
+ HTMLNode contentNode = pageMaker.getContentNode(pageNode);
+ /* create copies for multi-threaded use */
+ if (listName == null) {
+ Map runningFetches = new HashMap(runningFetchesByURI);
+ List queued = new ArrayList(queuedURIList);
+ Set visited = new HashSet(visitedURIs);
+ Set failed = new HashSet(failedURIs);
+ contentNode.addChild(createNavbar(pageMaker, runningFetches.size(), queued.size(), visited.size(), failed.size()));
+ contentNode.addChild(createAddBox(pageMaker, context));
+ contentNode.addChild(createList(pageMaker, "Running Fetches", "running", runningFetches.keySet(), maxShownURIs));
+ contentNode.addChild(createList(pageMaker, "Queued URIs", "queued", queued, maxShownURIs));
+ contentNode.addChild(createList(pageMaker, "Visited URIs", "visited", visited, maxShownURIs));
+ contentNode.addChild(createList(pageMaker, "Failed URIs", "failed", failed, maxShownURIs));
+ } else {
+ contentNode.addChild(createBackBox(pageMaker));
+ if ("failed".equals(listName)) {
+ Set failed = new HashSet(failedURIs);
+ contentNode.addChild(createList(pageMaker, "Failed URIs", "failed", failed, -1));
+ } else if ("visited".equals(listName)) {
+ Set visited = new HashSet(visitedURIs);
+ contentNode.addChild(createList(pageMaker, "Visited URIs", "visited", visited, -1));
+ } else if ("queued".equals(listName)) {
+ List queued = new ArrayList(queuedURIList);
+ contentNode.addChild(createList(pageMaker, "Queued URIs", "queued", queued, -1));
+ } else if ("running".equals(listName)) {
+ Map runningFetches = new HashMap(runningFetchesByURI);
+ contentNode.addChild(createList(pageMaker, "Running Fetches", "running", runningFetches.keySet(), -1));
+ }
+ }
+ MultiValueTable responseHeaders = new MultiValueTable();
+ byte[] responseBytes = pageNode.generate().getBytes("utf-8");
+ context.sendReplyHeaders(200, "OK", responseHeaders, "text/html; charset=utf-8", responseBytes.length);
+ context.writeData(responseBytes);
+ } else if ("add".equals(action)) {
+ String uriParam = request.getParam("key");
+ try {
+ FreenetURI uri = new FreenetURI(uriParam);
+ synchronized (this) {
+ failedURIs.remove(uri);
+ visitedURIs.remove(uri);
+ }
+ queueURI(uri);
+ startSomeRequests();
+ } catch (MalformedURLException mue1) {
+ sendSimpleResponse(context, "URL invalid", "The given URI is not valid.");
+ return;
+ }
+ MultiValueTable responseHeaders = new MultiValueTable();
+ responseHeaders.put("Location", "?action=list");
+ context.sendReplyHeaders(301, "Redirect", responseHeaders, "text/html; charset=utf-8", 0);
+ return;
+ }
}
-
public void handlePost(HTTPRequest request, ToadletContext context) throws IOException {
- /*
- * ignore
- */
}
+ private void sendSimpleResponse(ToadletContext context, String title, String message) throws ToadletContextClosedException, IOException {
+ PageMaker pageMaker = context.getPageMaker();
+ HTMLNode pageNode = pageMaker.getPageNode(title, context);
+ HTMLNode contentNode = pageMaker.getContentNode(pageNode);
+ HTMLNode infobox = contentNode.addChild(pageMaker.getInfobox("infobox-alter", title));
+ HTMLNode infoboxContent = pageMaker.getContentNode(infobox);
+ infoboxContent.addChild("#", message);
+ byte[] responseBytes = pageNode.generate().getBytes("utf-8");
+ context.sendReplyHeaders(200, "OK", new MultiValueTable(), "text/html; charset=utf-8", responseBytes.length);
+ context.writeData(responseBytes);
+ }
+
+ private HTMLNode createBackBox(PageMaker pageMaker) {
+ HTMLNode backbox = pageMaker.getInfobox((String) null);
+ HTMLNode backContent = pageMaker.getContentNode(backbox);
+ backContent.addChild("#", "Return to the ");
+ backContent.addChild("a", "href", "?action=list", "list of all URIs");
+ backContent.addChild("#", ".");
+ return backbox;
+ }
+
+ private HTMLNode createAddBox(PageMaker pageMaker, ToadletContext ctx) {
+ HTMLNode addBox = pageMaker.getInfobox("Add a URI");
+ HTMLNode formNode = pageMaker.getContentNode(addBox).addChild("form", new String[] { "action", "method" }, new String[] { "", "get" });
+ formNode.addChild("input", new String[] { "type", "name", "value" }, new String[] { "hidden", "action", "add" });
+ formNode.addChild("input", new String[] { "type", "size", "name", "value" }, new String[] { "text", "40", "key", "" });
+ formNode.addChild("input", new String[] { "type", "value" }, new String[] { "submit", "Add URI" });
+ return addBox;
+ }
+
+ private HTMLNode createNavbar(PageMaker pageMaker, int running, int queued, int visited, int failed) {
+ HTMLNode navbar = pageMaker.getInfobox("navbar", "Page Navigation");
+ HTMLNode list = pageMaker.getContentNode(navbar).addChild("ul");
+ list.addChild("li").addChild("a", "href", "#running", "Running (" + running + ')');
+ list.addChild("li").addChild("a", "href", "#queued", "Queued (" + queued + ')');
+ list.addChild("li").addChild("a", "href", "#visited", "Visited (" + visited + ')');
+ list.addChild("li").addChild("a", "href", "#failed", "Failed (" + failed + ')');
+ return navbar;
+ }
+
+ private HTMLNode createList(PageMaker pageMaker, String listName, String anchorName, Collection collection, int maxCount) {
+ HTMLNode listNode = new HTMLNode("div");
+ listNode.addChild("a", "name", anchorName);
+ HTMLNode listBox = pageMaker.getInfobox(listName);
+ HTMLNode listContent = pageMaker.getContentNode(listBox);
+ listNode.addChild(listBox);
+ Iterator collectionItems = collection.iterator();
+ int itemCount = 0;
+ while (collectionItems.hasNext()) {
+ FreenetURI uri = (FreenetURI) collectionItems.next();
+ listContent.addChild("#", uri.toString());
+ listContent.addChild("br");
+ if (itemCount++ == maxCount) {
+ listContent.addChild("br");
+ listContent.addChild("a", "href", "?action=list&listName=" + anchorName, "Show all\u2026");
+ break;
+ }
+ }
+ return listNode;
+ }
+
/**
* @see freenet.oldplugins.plugin.Plugin#getPluginName()
*/
@@ -699,11 +912,8 @@
}
return buf.toString();
}
-
- /*
- * calculate the md5 for a given string
- */
- private static String MD5(String text) throws NoSuchAlgorithmException, UnsupportedEncodingException {
+ //this function will return the String representation of the MD5 hash for the input string
+ public static String MD5(String text) throws NoSuchAlgorithmException, UnsupportedEncodingException {
MessageDigest md;
md = MessageDigest.getInstance("MD5");
byte[] md5hash = new byte[32];
@@ -823,7 +1033,7 @@
public void runPlugin(PluginRespirator pr){
this.pr = pr;
- this.id = new Integer(0);
+ this.id = 0;
this.core = pr.getNode().clientCore;
this.ctx = core.makeClient((short) 0).getFetchContext();
ctx.maxSplitfileBlockRetries = 10;
@@ -834,9 +1044,9 @@
allowedMIMETypes.add(new String("text/html"));
allowedMIMETypes.add(new String("text/plain"));
allowedMIMETypes.add(new String("application/xhtml+xml"));
-
+// allowedMIMETypes.add(new String("application/zip"));
ctx.allowedMIMETypes = new HashSet(allowedMIMETypes);
-
+// ctx.allowedMIMETypes.add("text/html");
tProducedIndex = System.currentTimeMillis();
indexing = true;
stopped = false;
@@ -855,12 +1065,11 @@
starterThread.start();
}
-/**
- * Interface to the Spider data
- */
public String handleHTTPGet(HTTPRequest request) throws PluginHTTPException{
StringBuffer out = new StringBuffer();
-
+ // need to produce pretty html
+ //later fredpluginhttpadvanced will give the interface
+ //this brings us to the page from visit
String listname = request.getParam("list");
if(listname.length() != 0)
{
@@ -888,7 +1097,6 @@
}
return out.toString();
}
-
private void appendList(String listname, StringBuffer out, String stylesheet)
{
Iterator it = (runningFetchesByURI.keySet()).iterator();
@@ -903,7 +1111,6 @@
while(it.hasNext())
out.append("<code>"+it.next().toString()+"</code><br/>");
}
-
private void appendDefaultPageStart(StringBuffer out, String stylesheet) {
out.append("<HTML><HEAD><TITLE>" + pluginName + "</TITLE>");
@@ -921,11 +1128,10 @@
Set failed = new HashSet(failedURIs);
Iterator it=queued.iterator();
- out.append("<br/>Size :"+runningFetches.size()+"<br/>");
+ out.append("<br/>Size :"+runningFetches.size());
appendList(runningFetches,out,stylesheet);
out.append("<p><a href=\"?list="+"running"+"\">Show all</a><br/></p>");
- out.append("<p><h3>Queued URIs</h3></p>");
- out.append("<br/>Size :"+queued.size()+"<br/>");
+ out.append("<br/>Size :"+queued.size());
int i = 0;
while(it.hasNext()){
if(i<=maxShownURIs){
@@ -935,19 +1141,15 @@
i++;
}
out.append("<p><a href=\"?list="+"queued"+"\">Show all</a><br/></p>");
- out.append("<p><h3>Visited URIs</h3></p>");
- out.append("<br/>Size :"+visited.size()+"<br/>");
+ out.append("<br/>Size :"+visited.size());
appendList(visited,out,stylesheet);
out.append("<p><a href=\"?list="+"visited"+"\">Show all</a><br/></p>");
- out.append("<p><h3>Failed URIs</h3></p>");
- out.append("<br/>Size :"+failed.size()+"<br/>");
+ out.append("<br/>Size :"+failed.size());
appendList(failed,out,stylesheet);
out.append("<p><a href=\"?list="+"failed"+"\">Show all</a><br/></p>");
}
-
-
private void appendDefaultHeader(StringBuffer out, String stylesheet){
out.append("<HTML><HEAD><TITLE>" + pluginName + "</TITLE>");
if(stylesheet != null)
@@ -958,8 +1160,6 @@
out.append("<form method=\"GET\"><input type=\"text\" name=\"adduri\" /><br/><br/>");
out.append("<input type=\"submit\" value=\"Add uri\" /></form>");
}
-
-
private void appendList(Set list,StringBuffer out, String stylesheet){
Iterator it = list.iterator();
int i = 0;
@@ -968,130 +1168,169 @@
out.append("<code>"+it.next().toString()+"</code><br/>");
}
else{
+ //out.append("<form method=\"GET\"><input type=\"submit\" name=\"Showall\" />");
+// if(listname.equals("visited"))
+// out.append("<p><a href=\"?list="+listname+">Showall visited</a><br/></p>");
+// if(listname.equals("failed"))
+// out.append("<p><a href=\"?list="+listname+">Showall failed</a><br/></p>");
break;
}
i++;
- }
+
}
+
+}
-/**
- * creates the callback object for each page.
- *<p>Used to create inlinks and outlinks for each page separately.
- * @author swati
- *
- */
public class PageCallBack implements FoundURICallback{
- Integer id;
- /*
- * id of the page as refrenced in uriIds
- */
+ int id;
+
PageCallBack(){
- id = new Integer(0);
+ id = 0;
}
-
public void foundURI(FreenetURI uri){
-
+ //now we have the id of the page that contains this link
queueURI(uri);
int iduri = (Integer) uriIds.get(uri);
-
- if(outlinks.containsKey(id)){
- Vector outlink = (Vector) outlinks.get(id);
- if(!outlink.contains(iduri))
- outlink.add(iduri);
- outlinks.remove(id);
- outlinks.put(id, outlink);
- }
- else
- {
- Vector outlink = new Vector();
+ Vector outlink = (Vector) outlinks.get(id);
+ if(!outlink.contains(iduri))
outlink.add(iduri);
- outlinks.put(id, outlink);
- }
+ outlinks.remove(id);
+ outlinks.put(id, outlink);
+ try{
+ FileWriter out = new FileWriter("outlink",true);
+ out.write(" id "+id+" size "+ outlink.size()+" \n");
+ out.close();
+ }catch(Exception e){}
if(inlinks.containsKey(iduri)){
Vector inlink = (Vector) inlinks.get(iduri);
+ try{
+ FileWriter out = new FileWriter("inlink",true);
+ out.write(" id "+iduri+" size "+ inlink.size()+" \n");
+ out.close();
+ }catch(Exception e){}
+
if(!inlink.contains(id)) inlink.add(id);
inlinks.remove(iduri);
inlinks.put(iduri, inlink);
+
}
- else
- {
- Vector inlink = new Vector();
- inlink.add(id);
- inlinks.put(iduri, inlink);
- }
-
startSomeRequests();
}
-
-
public void onText(String s, String type, URI baseURI){
-
+ try{
+ FileWriter outp = new FileWriter("ontext",true);
+ outp.write("inside on text with id"+id+" \n");
+ outp.close();
+ }catch(Exception e){}
+// FreenetURI uri;
+// try {
+// uri = new FreenetURI(baseURI.getPath().substring(1));
+// } catch (MalformedURLException e) {
+// Logger.error(this, "Caught " + e, e);
+// return;
+// }
+
+
+
if((type != null) && (type.length() != 0) && type.toLowerCase().equals("title")
- && (s != null) && (s.length() != 0) && (s.indexOf('\n') < 0)) {
+ && (s != null) && (s.length() != 0) && (s.indexOf('\n') < 0)) {
/* We should have a correct title */
+ // titlesOfURIs.put(uri.toString(), s);
titlesOfIds.put(id, s);
+
type = "title";
+
}
else type = null;
+
String[] words = s.split("[^A-Za-z0-9]");
+
Integer lastPosition = null;
+
+ //lastPosition = (Integer)lastPositionByURI.get(uri.toString());
lastPosition = (Integer)lastPositionById.get(id);
-
if(lastPosition == null)
lastPosition = new Integer(1); /* We start to count from 1 */
+
for (int i = 0; i < words.length; i++) {
String word = words[i];
if ((word == null) || (word.length() == 0))
continue;
word = word.toLowerCase();
try{
- if(type == null)
- addWord(word, lastPosition.intValue() + i, id);
- else
- addWord(word, -1 * (i+1), id);
+ if(type == null)
+ addWord(word, lastPosition.intValue() + i, id);
+ else
+ addWord(word, -1 * (i+1), id);
}
catch (Exception e){}
}
-
+
if(type == null) {
lastPosition = new Integer(lastPosition.intValue() + words.length);
+ // lastPositionByURI.put(uri.toString(), lastPosition);
lastPositionById.put(id, lastPosition);
}
-
+
}
-
- private synchronized void addWord(String word, int position,Integer id) throws Exception{
+private synchronized void addWord(String word, int position,int id) throws Exception{
+
+
if(word.length() < 3)
return;
+
+ //word = word.intern();
+
+ //FreenetURI[] uris = (FreenetURI[]) urisByWord.get(word);
Integer[] ids = (Integer[]) idsByWord.get(word);
+
+ // urisWithWords.add(uri);
idsWithWords.add(id);
+ try{
+ FileWriter outp = new FileWriter("addWord",true);
+ outp.write("ID ="+id+" uri ="+idUris.get(id)+"\n");
+ outp.close();
+ }catch(Exception e){}
+// FileWriter outp = new FileWriter("uricheck",true);
+// outp.write(uri.getDocName()+"\n");
+// outp.write(uri.getKeyType()+"\n");
+// outp.write(uri.getMetaString()+"\n");
+// outp.write(uri.getGuessableKey()+"\n");
+// outp.write(uri.hashCode()+"\n");
+// outp.write(uri.getPreferredFilename()+"\n");
+//
+// outp.close();
/* Word position indexation */
HashMap wordPositionsForOneUri = (HashMap)positionsByWordById.get(id); /* For a given URI, take as key a word, and gives position */
+
if(wordPositionsForOneUri == null) {
wordPositionsForOneUri = new HashMap();
wordPositionsForOneUri.put(word, new Integer[] { new Integer(position) });
+ //positionsByWordByURI.put(uri.toString(), wordPositionsForOneUri);
positionsByWordById.put(id, wordPositionsForOneUri);
- }
- else {
+ } else {
Integer[] positions = (Integer[])wordPositionsForOneUri.get(word);
+
if(positions == null) {
positions = new Integer[] { new Integer(position) };
wordPositionsForOneUri.put(word, positions);
- }
- else {
+ } else {
Integer[] newPositions = new Integer[positions.length + 1];
+
System.arraycopy(positions, 0, newPositions, 0, positions.length);
newPositions[positions.length] = new Integer(position);
+
wordPositionsForOneUri.put(word, newPositions);
}
}
-
+
if (ids == null) {
idsByWord.put(word, new Integer[] { id });
+
} else {
for (int i = 0; i < ids.length; i++) {
if (ids[i].equals(id))
@@ -1102,29 +1341,30 @@
newIDs[ids.length] = id;
idsByWord.put(word, newIDs);
}
-
+ //the new word is entered into tMap here, keyed by its md5
tMap.put(MD5(word), word);
long time_indexing = System.currentTimeMillis();
if (tProducedIndex + minTimeBetweenEachIndexRewriting * 10 < System.currentTimeMillis()) {
try {
+ //produceIndex();
+ //check();
+
if(indexing){
- generateIndex2();
- produceIndex2();
- /*
- * ensures that index production doesn't eat up the processor time
- */
- if((System.currentTimeMillis() - time_indexing)/(System.currentTimeMillis() - tProducedIndex) > MAX_TIME_SPENT_INDEXING) indexing= false;
- else indexing = true;
+ generateIndex2();
+ produceIndex2();
+ if((System.currentTimeMillis() - time_indexing)/(System.currentTimeMillis() - tProducedIndex) > MAX_TIME_SPENT_INDEXING) indexing= false;
+ else indexing = true;
}
+
} catch (IOException e) {
Logger.error(this, "Caught " + e + " while creating index", e);
}
tProducedIndex = System.currentTimeMillis();
}
+
}
+
}
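
For the position index, addWord keeps one HashMap per page id, mapping each word to a growing Integer[] of positions (body words count up from 1, title words get negative positions). The copy-and-grow step, isolated as a sketch of the committed logic:

    // positionsByWordById : Integer id -> HashMap(word -> Integer[] positions)
    private static Integer[] appendPosition(Integer[] positions, int position) {
        if (positions == null)
            return new Integer[] { new Integer(position) };
        Integer[] grown = new Integer[positions.length + 1];
        System.arraycopy(positions, 0, grown, 0, positions.length);
        grown[positions.length] = new Integer(position);
        return grown;
    }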
-
-
public String handleHTTPPut(HTTPRequest request) throws PluginHTTPException{
return null;
}
@@ -1139,4 +1379,5 @@
queueURI(uri);
}
+
}