Author: swatig0
Date: 2007-08-16 20:35:21 +0000 (Thu, 16 Aug 2007)
New Revision: 14728

Modified:
   trunk/plugins/XMLSpider/XMLSpider.java
Log:
URI-id transformation

Modified: trunk/plugins/XMLSpider/XMLSpider.java
===================================================================
--- trunk/plugins/XMLSpider/XMLSpider.java      2007-08-16 20:30:56 UTC (rev 
14727)
+++ trunk/plugins/XMLSpider/XMLSpider.java      2007-08-16 20:35:21 UTC (rev 
14728)
@@ -4,7 +4,6 @@
 package plugins.XMLSpider;

 import java.io.File;
-import java.io.FileWriter;
 import java.io.IOException;
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
@@ -13,14 +12,11 @@
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
 import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
-import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.Vector;
@@ -33,11 +29,10 @@
 import javax.xml.transform.dom.DOMSource;
 import javax.xml.transform.stream.StreamResult;

-import org.w3c.dom.Attr;
+
 import org.w3c.dom.DOMImplementation;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
-import org.w3c.dom.NodeList;
 import org.w3c.dom.Text;

 import freenet.client.ClientMetadata;
@@ -49,7 +44,6 @@
 import freenet.client.async.ClientCallback;
 import freenet.client.async.ClientGetter;
 import freenet.client.async.USKCallback;
-import freenet.clients.http.PageMaker;
 import freenet.clients.http.ToadletContext;
 import freenet.clients.http.ToadletContextClosedException;
 import freenet.clients.http.filter.ContentFilter;
@@ -67,9 +61,7 @@
 import freenet.pluginmanager.FredPluginThreadless;
 import freenet.pluginmanager.PluginHTTPException;
 import freenet.pluginmanager.PluginRespirator;
-import freenet.support.HTMLNode;
 import freenet.support.Logger;
-import freenet.support.MultiValueTable;
 import freenet.support.api.Bucket;
 import freenet.support.api.HTTPRequest;

@@ -95,8 +87,7 @@
         * Lists the uris that have been vistied by the spider
         */
        public final HashSet visitedURIs = new HashSet();
-       private final HashSet urisWithWords = new HashSet();
-       private final HashSet idsWithWords = new HashSet();
+               private final HashSet idsWithWords = new HashSet();
        /**
         * 
         * Lists the uris that were visited but failed.
@@ -110,9 +101,9 @@
         */
        public final LinkedList queuedURIList = new LinkedList();
        private final HashMap runningFetchesByURI = new HashMap();
-       private final HashMap urisByWord = new HashMap();
+       
        private final HashMap idsByWord = new HashMap();
-       private final HashMap titlesOfURIs = new HashMap();
+       
        private final HashMap titlesOfIds = new HashMap();
        private final HashMap uriIds = new HashMap();
        private final HashMap idUris = new HashMap();
@@ -129,7 +120,7 @@
        private Vector indices;
        private int match;
        private int id;
-       private Vector list;
+       
        private boolean indexing ;

        private static final int minTimeBetweenEachIndexRewriting = 10;
@@ -627,135 +618,18 @@


        public void handleGet(HTTPRequest request, ToadletContext context) 
throws IOException, ToadletContextClosedException {
-               String action = request.getParam("action");
-               PageMaker pageMaker = context.getPageMaker();
-               if ((action == null) || (action.length() == 0)) {
-                       MultiValueTable responseHeaders = new MultiValueTable();
-                       responseHeaders.put("Location", "?action=list");
-                       context.sendReplyHeaders(301, "Redirect", 
responseHeaders, "text/html; charset=utf-8", 0);
-                       return;
-               } else if ("list".equals(action)) {
-                       
-                       String listName = request.getParam("listName", null);
-                       HTMLNode pageNode = pageMaker.getPageNode("The XML 
Spider", context);
-                       HTMLNode contentNode = 
pageMaker.getContentNode(pageNode);
-                       /* create copies for multi-threaded use */
-                       if (listName == null) {
-                               Map runningFetches = new 
HashMap(runningFetchesByURI);
-                               List queued = new ArrayList(queuedURIList);
-                               Set visited = new HashSet(visitedURIs);
-                               Set failed = new HashSet(failedURIs);
-                               contentNode.addChild(createNavbar(pageMaker, 
runningFetches.size(), queued.size(), visited.size(), failed.size()));
-                               contentNode.addChild(createAddBox(pageMaker, 
context));
-                               contentNode.addChild(createList(pageMaker, 
"Running FetcheIIIs", "running", runningFetches.keySet(), maxShownURIs));
-                               contentNode.addChild(createList(pageMaker, 
"Queued URIs", "queued", queued, maxShownURIs));
-                               contentNode.addChild(createList(pageMaker, 
"Visited URIs", "visited", visited, maxShownURIs));
-                               contentNode.addChild(createList(pageMaker, 
"Failed URIs", "failed", failed, maxShownURIs));
-                       } else {
-                               contentNode.addChild(createBackBox(pageMaker));
-                               if ("failed".equals(listName)) {
-                                       Set failed = new HashSet(failedURIs);
-                                       
contentNode.addChild(createList(pageMaker, "Failed URIs", "failed", failed, 
-1));       
-                               } else if ("visited".equals(listName)) {
-                                       Set visited = new HashSet(visitedURIs);
-                                       
contentNode.addChild(createList(pageMaker, "Visited URIs", "visited", visited, 
-1));
-                               } else if ("queued".equals(listName)) {
-                                       List queued = new 
ArrayList(queuedURIList);
-                                       
contentNode.addChild(createList(pageMaker, "Queued URIs", "queued", queued, 
-1));
-                               } else if ("running".equals(listName)) {
-                                       Map runningFetches = new 
HashMap(runningFetchesByURI);
-                                       
contentNode.addChild(createList(pageMaker, "Running Fetches", "running", 
runningFetches.keySet(), -1));
-                               }
-                       }
-                       MultiValueTable responseHeaders = new MultiValueTable();
-                       byte[] responseBytes = 
pageNode.generate().getBytes("utf-8");
-                       context.sendReplyHeaders(200, "OK", responseHeaders, 
"text/html; charset=utf-8", responseBytes.length);
-                       context.writeData(responseBytes);
-               } else if ("add".equals(action)) {
-                       String uriParam = request.getParam("key");
-                       try {
-                               FreenetURI uri = new FreenetURI(uriParam);
-                               synchronized (this) {
-                                       failedURIs.remove(uri);
-                                       visitedURIs.remove(uri);
-                               }
-                               queueURI(uri);
-                               startSomeRequests();
-                       } catch (MalformedURLException mue1) {
-                               sendSimpleResponse(context, "URL invalid", "The 
given URI is not valid.");
-                               return;
-                       }
-                       MultiValueTable responseHeaders = new MultiValueTable();
-                       responseHeaders.put("Location", "?action=list");
-                       context.sendReplyHeaders(301, "Redirect", 
responseHeaders, "text/html; charset=utf-8", 0);
-                       return;
-               }
+               /*
+                * ignore
+                */
        }

+       
        public void handlePost(HTTPRequest request, ToadletContext context) 
throws IOException {
+               /*
+                * ignore
+                */
        }

-       private void sendSimpleResponse(ToadletContext context, String title, 
String message) throws ToadletContextClosedException, IOException {
-               PageMaker pageMaker = context.getPageMaker();
-               HTMLNode pageNode = pageMaker.getPageNode(title, context);
-               HTMLNode contentNode = pageMaker.getContentNode(pageNode);
-               HTMLNode infobox = 
contentNode.addChild(pageMaker.getInfobox("infobox-alter", title));
-               HTMLNode infoboxContent = pageMaker.getContentNode(infobox);
-               infoboxContent.addChild("#", message);
-               byte[] responseBytes = pageNode.generate().getBytes("utf-8");
-               context.sendReplyHeaders(200, "OK", new MultiValueTable(), 
"text/html; charset=utf-8", responseBytes.length);
-               context.writeData(responseBytes);
-       }
-       
-       private HTMLNode createBackBox(PageMaker pageMaker) {
-               HTMLNode backbox = pageMaker.getInfobox((String) null);
-               HTMLNode backContent = pageMaker.getContentNode(backbox);
-               backContent.addChild("#", "Return to the ");
-               backContent.addChild("a", "href", "?action=list", "list of all 
URIs");
-               backContent.addChild("#", ".");
-               return backbox;
-       }
-       
-       private HTMLNode createAddBox(PageMaker pageMaker, ToadletContext ctx) {
-               HTMLNode addBox = pageMaker.getInfobox("Add a URI");
-               HTMLNode formNode = 
pageMaker.getContentNode(addBox).addChild("form", new String[] { "action", 
"method" }, new String[] { "", "get" });
-               formNode.addChild("input", new String[] { "type", "name", 
"value" }, new String[] { "hidden", "action", "add" });
-               formNode.addChild("input", new String[] { "type", "size", 
"name", "value" }, new String[] { "text", "40", "key", "" });
-               formNode.addChild("input", new String[] { "type", "value" }, 
new String[] { "submit", "Add URI" });
-               return addBox;
-       }
-
-       private HTMLNode createNavbar(PageMaker pageMaker, int running, int 
queued, int visited, int failed) {
-               HTMLNode navbar = pageMaker.getInfobox("navbar", "Page 
Navigation");
-               HTMLNode list = pageMaker.getContentNode(navbar).addChild("ul");
-               list.addChild("li").addChild("a", "href", "#running", "Running 
(" + running + ')');
-               list.addChild("li").addChild("a", "href", "#queued", "Queued (" 
+ queued + ')');
-               list.addChild("li").addChild("a", "href", "#visited", "Visited 
(" + visited + ')');
-               list.addChild("li").addChild("a", "href", "#failed", "Failed (" 
+ failed + ')');
-               return navbar;
-       }
-
-       private HTMLNode createList(PageMaker pageMaker, String listName, 
String anchorName, Collection collection, int maxCount) {
-               HTMLNode listNode = new HTMLNode("div");
-               listNode.addChild("a", "name", anchorName);
-               HTMLNode listBox = pageMaker.getInfobox(listName);
-               HTMLNode listContent = pageMaker.getContentNode(listBox);
-               listNode.addChild(listBox);
-               Iterator collectionItems = collection.iterator();
-               int itemCount = 0;
-               while (collectionItems.hasNext()) {
-                       FreenetURI uri = (FreenetURI) collectionItems.next();
-                       listContent.addChild("#", uri.toString());
-                       listContent.addChild("br");
-                       if (itemCount++ == maxCount) {
-                               listContent.addChild("br");
-                               listContent.addChild("a", "href", 
"?action=list&listName=" + anchorName, "Show all\u2026");
-                               break;
-                       }
-               }
-               return listNode;
-       }
-
        /**
         * @see freenet.oldplugins.plugin.Plugin#getPluginName()
         */
@@ -825,8 +699,11 @@
         }
         return buf.toString();
     }
-       //this function will return the String representation of the MD5 hash 
for the input string 
-       public static String MD5(String text) throws NoSuchAlgorithmException, 
UnsupportedEncodingException  {
+       
+       /*
+        * calculate the md5 for a given string
+        */
+       private static String MD5(String text) throws NoSuchAlgorithmException, 
UnsupportedEncodingException  {
                MessageDigest md;
                md = MessageDigest.getInstance("MD5");
                byte[] md5hash = new byte[32];
@@ -957,9 +834,9 @@
        allowedMIMETypes.add(new String("text/html"));
        allowedMIMETypes.add(new String("text/plain"));
        allowedMIMETypes.add(new String("application/xhtml+xml"));
-//     allowedMIMETypes.add(new String("application/zip"));
+
        ctx.allowedMIMETypes = new HashSet(allowedMIMETypes);
-//     ctx.allowedMIMETypes.add("text/html"); 
+
        tProducedIndex = System.currentTimeMillis();
        indexing = true;
        stopped = false;
@@ -978,11 +855,12 @@
        starterThread.start();
 }

+/**
+ * Interface to the Spider data
+ */
 public String handleHTTPGet(HTTPRequest request) throws PluginHTTPException{
        StringBuffer out = new StringBuffer();
-       // need to produce pretty html
-       //later fredpluginhttpadvanced will give the interface
-       //this brings us to the page from visit
+       
        String listname = request.getParam("list");
        if(listname.length() != 0)
        {
@@ -1010,6 +888,7 @@
                }
        return out.toString();
 }
+
 private void appendList(String listname, StringBuffer out, String stylesheet)
 {
        Iterator it = (runningFetchesByURI.keySet()).iterator();
@@ -1024,6 +903,7 @@
        while(it.hasNext())
                out.append("<code>"+it.next().toString()+"</code><br/>");
 }
+
 private void appendDefaultPageStart(StringBuffer out, String stylesheet) {

        out.append("<HTML><HEAD><TITLE>" + pluginName + "</TITLE>");
@@ -1041,10 +921,11 @@

        Set failed = new HashSet(failedURIs);
        Iterator it=queued.iterator();
-       out.append("<br/>Size :"+runningFetches.size());
+       out.append("<br/>Size :"+runningFetches.size()+"<br/>");
        appendList(runningFetches,out,stylesheet);
        out.append("<p><a href=\"?list="+"running"+"\">Show all</a><br/></p>");
-       out.append("<br/>Size :"+queued.size());
+       out.append("<p><h3>Queued URIs</h3></p>");
+       out.append("<br/>Size :"+queued.size()+"<br/>");
        int i = 0;
        while(it.hasNext()){
                if(i<=maxShownURIs){
@@ -1054,15 +935,19 @@
                i++;
        }
        out.append("<p><a href=\"?list="+"queued"+"\">Show all</a><br/></p>");
-       out.append("<br/>Size :"+visited.size());
+       out.append("<p><h3>Visited URIs</h3></p>");
+       out.append("<br/>Size :"+visited.size()+"<br/>");
        appendList(visited,out,stylesheet);
        out.append("<p><a href=\"?list="+"visited"+"\">Show all</a><br/></p>");
-       out.append("<br/>Size :"+failed.size());
+       out.append("<p><h3>Failed URIs</h3></p>");
+       out.append("<br/>Size :"+failed.size()+"<br/>");
        appendList(failed,out,stylesheet);
        out.append("<p><a href=\"?list="+"failed"+"\">Show all</a><br/></p>");


 }
+
+
 private void appendDefaultHeader(StringBuffer out, String stylesheet){
        out.append("<HTML><HEAD><TITLE>" + pluginName + "</TITLE>");
        if(stylesheet != null)
@@ -1073,6 +958,8 @@
        out.append("<form method=\"GET\"><input type=\"text\" name=\"adduri\" 
/><br/><br/>");
        out.append("<input type=\"submit\" value=\"Add uri\" /></form>");
 }
+
+
 private void appendList(Set  list,StringBuffer out, String stylesheet){
        Iterator it = list.iterator();
        int i = 0;
@@ -1081,169 +968,130 @@
                out.append("<code>"+it.next().toString()+"</code><br/>");
                }
                else{
-                       //out.append("<form method=\"GET\"><input 
type=\"submit\" name=\"Showall\" />");
-//                     if(listname.equals("visited"))
-//                     out.append("<p><a href=\"?list="+listname+">Showall 
visited</a><br/></p>");
-//                     if(listname.equals("failed"))
-//                             out.append("<p><a 
href=\"?list="+listname+">Showall failed</a><br/></p>");
                        break;
                }
                i++;
-               
+                       }
        }
-       
-}

+/**
+ * creates the callback object for each page.
+ *<p>Used to create inlinks and outlinks for each page separately.
+ * @author swati
+ *
+ */
 public class PageCallBack implements FoundURICallback{
-       int id;
-               
+       Integer id;
+       /*
+        * id of the page as referenced in uriIds
+        */     
        PageCallBack(){
                id = 0;
        }
+       
        public void foundURI(FreenetURI uri){
-               //now we have the id of the page that had called this link
+
                queueURI(uri);
-               int iduri = (Integer) uriIds.get(uri);
-               Vector outlink = (Vector) outlinks.get(id);
-               if(!outlink.contains(iduri))    
+               Integer iduri = (Integer) uriIds.get(uri);
+
+               if(outlinks.containsKey(id)){
+                       Vector outlink = (Vector) outlinks.get(id);
+                       if(!outlink.contains(iduri))    
+                               outlink.add(iduri);
+                       outlinks.remove(id);
+                       outlinks.put(id, outlink);
+               }
+               else 
+               {
+                       Vector outlink = new Vector();
                        outlink.add(iduri);
-               outlinks.remove(id);
-               outlinks.put(id, outlink);
-               try{
-               FileWriter out = new FileWriter("outlink",true);
-               out.write(" id "+id+" size "+ outlink.size()+" \n");
-               out.close();
-               }catch(Exception e){}
+                       outlinks.put(id, outlink);
+               }

                if(inlinks.containsKey(iduri)){
                        Vector inlink = (Vector) inlinks.get(iduri);
-                       try{
-                               FileWriter out = new FileWriter("inlink",true);
-                               out.write(" id "+iduri+" size "+ 
inlink.size()+" \n");
-                               out.close();
-                               }catch(Exception e){}
-               
                        if(!inlink.contains(id)) inlink.add(id);
                        inlinks.remove(iduri);
                        inlinks.put(iduri, inlink);
-                       
                }
+               else 
+               {
+                       Vector inlink = new Vector();
+                       inlink.add(id);
+                       inlinks.put(iduri, inlink);
+               }
+
                startSomeRequests();
        }
+       
+       
        public void onText(String s, String type, URI baseURI){
-               try{
-                       FileWriter outp = new FileWriter("ontext",true);
-                       outp.write("inside on text with id"+id+" \n");
-                       outp.close();
-               }catch(Exception e){}
-//             FreenetURI uri;
-//             try {
-//                     uri = new FreenetURI(baseURI.getPath().substring(1));
-//             } catch (MalformedURLException e) {
-//                     Logger.error(this, "Caught " + e, e);
-//                     return;
-//             }
-                
-               
-      
+
                if((type != null) && (type.length() != 0) && 
type.toLowerCase().equals("title")
-                  && (s != null) && (s.length() != 0) && (s.indexOf('\n') < 
0)) {
+                               && (s != null) && (s.length() != 0) && 
(s.indexOf('\n') < 0)) {
                        /* We should have a correct title */
-               //      titlesOfURIs.put(uri.toString(), s);
                        titlesOfIds.put(id, s);
-                       
                        type = "title";
-                       
                }
                else type = null;

-
                String[] words = s.split("[^A-Za-z0-9]");
-
                Integer lastPosition = null;
-
-               //lastPosition = (Integer)lastPositionByURI.get(uri.toString());
                lastPosition = (Integer)lastPositionById.get(id);
+
                if(lastPosition == null)
                        lastPosition = new Integer(1); /* We start to count 
from 1 */
-
                for (int i = 0; i < words.length; i++) {
                        String word = words[i];
                        if ((word == null) || (word.length() == 0))
                                continue;
                        word = word.toLowerCase();
                        try{
-                       if(type == null)
-                               addWord(word, lastPosition.intValue() + i, id);
-                       else
-                               addWord(word, -1 * (i+1), id);
+                               if(type == null)
+                                       addWord(word, lastPosition.intValue() + 
i, id);
+                               else
+                                       addWord(word, -1 * (i+1), id);
                        }
                        catch (Exception e){}
                }
-               
+
                if(type == null) {
                        lastPosition = new Integer(lastPosition.intValue() + 
words.length);
-               //      lastPositionByURI.put(uri.toString(), lastPosition);
                        lastPositionById.put(id, lastPosition);
                }
-               
+
        }
-private synchronized void addWord(String word, int position,int id) throws 
Exception{
-               
-               
+       
+       private synchronized void addWord(String word, int position,Integer id) 
throws Exception{
                if(word.length() < 3)
                        return;
-               
-               //word = word.intern();

-
-               //FreenetURI[] uris = (FreenetURI[]) urisByWord.get(word);
                Integer[] ids = (Integer[]) idsByWord.get(word);
-               
-       //      urisWithWords.add(uri);
                idsWithWords.add(id);
-               try{
-                       FileWriter outp = new FileWriter("addWord",true);
-                       outp.write("ID ="+id+" uri ="+idUris.get(id)+"\n");
-                       outp.close();
-               }catch(Exception e){}
-//     FileWriter outp = new FileWriter("uricheck",true);
-//     outp.write(uri.getDocName()+"\n");
-//     outp.write(uri.getKeyType()+"\n");
-//     outp.write(uri.getMetaString()+"\n");
-//     outp.write(uri.getGuessableKey()+"\n");
-//     outp.write(uri.hashCode()+"\n");
-//     outp.write(uri.getPreferredFilename()+"\n");
-//     
-//     outp.close();

                /* Word position indexation */
                HashMap wordPositionsForOneUri = 
(HashMap)positionsByWordById.get(id); /* For a given URI, take as key a word, 
and gives position */
-               
                if(wordPositionsForOneUri == null) {
                        wordPositionsForOneUri = new HashMap();
                        wordPositionsForOneUri.put(word, new Integer[] { new 
Integer(position) });
-                       //positionsByWordByURI.put(uri.toString(), 
wordPositionsForOneUri);
                        positionsByWordById.put(id, wordPositionsForOneUri);
-               } else {
+               } 
+               else {
                        Integer[] positions = 
(Integer[])wordPositionsForOneUri.get(word);
-
                        if(positions == null) {
                                positions = new Integer[] { new 
Integer(position) };
                                wordPositionsForOneUri.put(word, positions);
-                       } else {
+                       } 
+                       else {
                                Integer[] newPositions = new 
Integer[positions.length + 1];
-
                                System.arraycopy(positions, 0, newPositions, 0, 
positions.length);
                                newPositions[positions.length] = new 
Integer(position);
-
                                wordPositionsForOneUri.put(word, newPositions);
                        }
                }
-       
+
                if (ids == null) {
                        idsByWord.put(word, new Integer[] { id });
-                       
                } else {
                        for (int i = 0; i < ids.length; i++) {
                                if (ids[i].equals(id))
@@ -1254,30 +1102,29 @@
                        newIDs[ids.length] = id;
                        idsByWord.put(word, newIDs);
                }
-               //the new word is added here in urisByWord
+
                tMap.put(MD5(word), word);
                long time_indexing = System.currentTimeMillis();
                if (tProducedIndex + minTimeBetweenEachIndexRewriting * 10 < 
System.currentTimeMillis()) {
                        try {
-                               //produceIndex();
-                               //check();
-                               
                                if(indexing){
-                               generateIndex2();
-                               produceIndex2();
-                               if((System.currentTimeMillis() - 
time_indexing)/(System.currentTimeMillis() - tProducedIndex) > 
MAX_TIME_SPENT_INDEXING) indexing= false;
-                               else indexing = true;
+                                       generateIndex2();
+                                       produceIndex2();
+                                       /*
+                                        * ensures that index production 
doesn't eat up the processor time 
+                                        */
+                                       if((System.currentTimeMillis() - 
time_indexing)/(System.currentTimeMillis() - tProducedIndex) > 
MAX_TIME_SPENT_INDEXING) indexing= false;
+                                       else indexing = true;
                                }
-                               
                        } catch (IOException e) {
                                Logger.error(this, "Caught " + e + " while 
creating index", e);
                        }
                        tProducedIndex = System.currentTimeMillis();
                }
-               
        }
-       
 }
+
+
 public String handleHTTPPut(HTTPRequest request) throws PluginHTTPException{
        return null;
 }
@@ -1292,5 +1139,4 @@
        queueURI(uri);
 }

-       
 }


Reply via email to