Author: swatig0
Date: 2007-08-16 20:35:21 +0000 (Thu, 16 Aug 2007)
New Revision: 14728
Modified:
trunk/plugins/XMLSpider/XMLSpider.java
Log:
URI-id transformation
Modified: trunk/plugins/XMLSpider/XMLSpider.java
===================================================================
--- trunk/plugins/XMLSpider/XMLSpider.java 2007-08-16 20:30:56 UTC (rev
14727)
+++ trunk/plugins/XMLSpider/XMLSpider.java 2007-08-16 20:35:21 UTC (rev
14728)
@@ -4,7 +4,6 @@
package plugins.XMLSpider;
import java.io.File;
-import java.io.FileWriter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
@@ -13,14 +12,11 @@
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
-import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.Vector;
@@ -33,11 +29,10 @@
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
-import org.w3c.dom.Attr;
+
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
-import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import freenet.client.ClientMetadata;
@@ -49,7 +44,6 @@
import freenet.client.async.ClientCallback;
import freenet.client.async.ClientGetter;
import freenet.client.async.USKCallback;
-import freenet.clients.http.PageMaker;
import freenet.clients.http.ToadletContext;
import freenet.clients.http.ToadletContextClosedException;
import freenet.clients.http.filter.ContentFilter;
@@ -67,9 +61,7 @@
import freenet.pluginmanager.FredPluginThreadless;
import freenet.pluginmanager.PluginHTTPException;
import freenet.pluginmanager.PluginRespirator;
-import freenet.support.HTMLNode;
import freenet.support.Logger;
-import freenet.support.MultiValueTable;
import freenet.support.api.Bucket;
import freenet.support.api.HTTPRequest;
@@ -95,8 +87,7 @@
* Lists the uris that have been vistied by the spider
*/
public final HashSet visitedURIs = new HashSet();
- private final HashSet urisWithWords = new HashSet();
- private final HashSet idsWithWords = new HashSet();
+ private final HashSet idsWithWords = new HashSet();
/**
*
* Lists the uris that were visited but failed.
@@ -110,9 +101,9 @@
*/
public final LinkedList queuedURIList = new LinkedList();
private final HashMap runningFetchesByURI = new HashMap();
- private final HashMap urisByWord = new HashMap();
+
private final HashMap idsByWord = new HashMap();
- private final HashMap titlesOfURIs = new HashMap();
+
private final HashMap titlesOfIds = new HashMap();
private final HashMap uriIds = new HashMap();
private final HashMap idUris = new HashMap();
@@ -129,7 +120,7 @@
private Vector indices;
private int match;
private int id;
- private Vector list;
+
private boolean indexing ;
private static final int minTimeBetweenEachIndexRewriting = 10;
@@ -627,135 +618,18 @@
public void handleGet(HTTPRequest request, ToadletContext context)
throws IOException, ToadletContextClosedException {
- String action = request.getParam("action");
- PageMaker pageMaker = context.getPageMaker();
- if ((action == null) || (action.length() == 0)) {
- MultiValueTable responseHeaders = new MultiValueTable();
- responseHeaders.put("Location", "?action=list");
- context.sendReplyHeaders(301, "Redirect",
responseHeaders, "text/html; charset=utf-8", 0);
- return;
- } else if ("list".equals(action)) {
-
- String listName = request.getParam("listName", null);
- HTMLNode pageNode = pageMaker.getPageNode("The XML
Spider", context);
- HTMLNode contentNode =
pageMaker.getContentNode(pageNode);
- /* create copies for multi-threaded use */
- if (listName == null) {
- Map runningFetches = new
HashMap(runningFetchesByURI);
- List queued = new ArrayList(queuedURIList);
- Set visited = new HashSet(visitedURIs);
- Set failed = new HashSet(failedURIs);
- contentNode.addChild(createNavbar(pageMaker,
runningFetches.size(), queued.size(), visited.size(), failed.size()));
- contentNode.addChild(createAddBox(pageMaker,
context));
- contentNode.addChild(createList(pageMaker,
"Running FetcheIIIs", "running", runningFetches.keySet(), maxShownURIs));
- contentNode.addChild(createList(pageMaker,
"Queued URIs", "queued", queued, maxShownURIs));
- contentNode.addChild(createList(pageMaker,
"Visited URIs", "visited", visited, maxShownURIs));
- contentNode.addChild(createList(pageMaker,
"Failed URIs", "failed", failed, maxShownURIs));
- } else {
- contentNode.addChild(createBackBox(pageMaker));
- if ("failed".equals(listName)) {
- Set failed = new HashSet(failedURIs);
-
contentNode.addChild(createList(pageMaker, "Failed URIs", "failed", failed,
-1));
- } else if ("visited".equals(listName)) {
- Set visited = new HashSet(visitedURIs);
-
contentNode.addChild(createList(pageMaker, "Visited URIs", "visited", visited,
-1));
- } else if ("queued".equals(listName)) {
- List queued = new
ArrayList(queuedURIList);
-
contentNode.addChild(createList(pageMaker, "Queued URIs", "queued", queued,
-1));
- } else if ("running".equals(listName)) {
- Map runningFetches = new
HashMap(runningFetchesByURI);
-
contentNode.addChild(createList(pageMaker, "Running Fetches", "running",
runningFetches.keySet(), -1));
- }
- }
- MultiValueTable responseHeaders = new MultiValueTable();
- byte[] responseBytes =
pageNode.generate().getBytes("utf-8");
- context.sendReplyHeaders(200, "OK", responseHeaders,
"text/html; charset=utf-8", responseBytes.length);
- context.writeData(responseBytes);
- } else if ("add".equals(action)) {
- String uriParam = request.getParam("key");
- try {
- FreenetURI uri = new FreenetURI(uriParam);
- synchronized (this) {
- failedURIs.remove(uri);
- visitedURIs.remove(uri);
- }
- queueURI(uri);
- startSomeRequests();
- } catch (MalformedURLException mue1) {
- sendSimpleResponse(context, "URL invalid", "The
given URI is not valid.");
- return;
- }
- MultiValueTable responseHeaders = new MultiValueTable();
- responseHeaders.put("Location", "?action=list");
- context.sendReplyHeaders(301, "Redirect",
responseHeaders, "text/html; charset=utf-8", 0);
- return;
- }
+ /*
+ * ignore
+ */
}
+
public void handlePost(HTTPRequest request, ToadletContext context)
throws IOException {
+ /*
+ * ignore
+ */
}
- private void sendSimpleResponse(ToadletContext context, String title,
String message) throws ToadletContextClosedException, IOException {
- PageMaker pageMaker = context.getPageMaker();
- HTMLNode pageNode = pageMaker.getPageNode(title, context);
- HTMLNode contentNode = pageMaker.getContentNode(pageNode);
- HTMLNode infobox =
contentNode.addChild(pageMaker.getInfobox("infobox-alter", title));
- HTMLNode infoboxContent = pageMaker.getContentNode(infobox);
- infoboxContent.addChild("#", message);
- byte[] responseBytes = pageNode.generate().getBytes("utf-8");
- context.sendReplyHeaders(200, "OK", new MultiValueTable(),
"text/html; charset=utf-8", responseBytes.length);
- context.writeData(responseBytes);
- }
-
- private HTMLNode createBackBox(PageMaker pageMaker) {
- HTMLNode backbox = pageMaker.getInfobox((String) null);
- HTMLNode backContent = pageMaker.getContentNode(backbox);
- backContent.addChild("#", "Return to the ");
- backContent.addChild("a", "href", "?action=list", "list of all
URIs");
- backContent.addChild("#", ".");
- return backbox;
- }
-
- private HTMLNode createAddBox(PageMaker pageMaker, ToadletContext ctx) {
- HTMLNode addBox = pageMaker.getInfobox("Add a URI");
- HTMLNode formNode =
pageMaker.getContentNode(addBox).addChild("form", new String[] { "action",
"method" }, new String[] { "", "get" });
- formNode.addChild("input", new String[] { "type", "name",
"value" }, new String[] { "hidden", "action", "add" });
- formNode.addChild("input", new String[] { "type", "size",
"name", "value" }, new String[] { "text", "40", "key", "" });
- formNode.addChild("input", new String[] { "type", "value" },
new String[] { "submit", "Add URI" });
- return addBox;
- }
-
- private HTMLNode createNavbar(PageMaker pageMaker, int running, int
queued, int visited, int failed) {
- HTMLNode navbar = pageMaker.getInfobox("navbar", "Page
Navigation");
- HTMLNode list = pageMaker.getContentNode(navbar).addChild("ul");
- list.addChild("li").addChild("a", "href", "#running", "Running
(" + running + ')');
- list.addChild("li").addChild("a", "href", "#queued", "Queued ("
+ queued + ')');
- list.addChild("li").addChild("a", "href", "#visited", "Visited
(" + visited + ')');
- list.addChild("li").addChild("a", "href", "#failed", "Failed ("
+ failed + ')');
- return navbar;
- }
-
- private HTMLNode createList(PageMaker pageMaker, String listName,
String anchorName, Collection collection, int maxCount) {
- HTMLNode listNode = new HTMLNode("div");
- listNode.addChild("a", "name", anchorName);
- HTMLNode listBox = pageMaker.getInfobox(listName);
- HTMLNode listContent = pageMaker.getContentNode(listBox);
- listNode.addChild(listBox);
- Iterator collectionItems = collection.iterator();
- int itemCount = 0;
- while (collectionItems.hasNext()) {
- FreenetURI uri = (FreenetURI) collectionItems.next();
- listContent.addChild("#", uri.toString());
- listContent.addChild("br");
- if (itemCount++ == maxCount) {
- listContent.addChild("br");
- listContent.addChild("a", "href",
"?action=list&listName=" + anchorName, "Show all\u2026");
- break;
- }
- }
- return listNode;
- }
-
/**
* @see freenet.oldplugins.plugin.Plugin#getPluginName()
*/
@@ -825,8 +699,11 @@
}
return buf.toString();
}
- //this function will return the String representation of the MD5 hash
for the input string
- public static String MD5(String text) throws NoSuchAlgorithmException,
UnsupportedEncodingException {
+
+ /*
+ * calculate the md5 for a given string
+ */
+ private static String MD5(String text) throws NoSuchAlgorithmException,
UnsupportedEncodingException {
MessageDigest md;
md = MessageDigest.getInstance("MD5");
byte[] md5hash = new byte[32];
@@ -957,9 +834,9 @@
allowedMIMETypes.add(new String("text/html"));
allowedMIMETypes.add(new String("text/plain"));
allowedMIMETypes.add(new String("application/xhtml+xml"));
-// allowedMIMETypes.add(new String("application/zip"));
+
ctx.allowedMIMETypes = new HashSet(allowedMIMETypes);
-// ctx.allowedMIMETypes.add("text/html");
+
tProducedIndex = System.currentTimeMillis();
indexing = true;
stopped = false;
@@ -978,11 +855,12 @@
starterThread.start();
}
+/**
+ * Interface to the Spider data
+ */
public String handleHTTPGet(HTTPRequest request) throws PluginHTTPException{
StringBuffer out = new StringBuffer();
- // need to produce pretty html
- //later fredpluginhttpadvanced will give the interface
- //this brings us to the page from visit
+
String listname = request.getParam("list");
if(listname.length() != 0)
{
@@ -1010,6 +888,7 @@
}
return out.toString();
}
+
private void appendList(String listname, StringBuffer out, String stylesheet)
{
Iterator it = (runningFetchesByURI.keySet()).iterator();
@@ -1024,6 +903,7 @@
while(it.hasNext())
out.append("<code>"+it.next().toString()+"</code><br/>");
}
+
private void appendDefaultPageStart(StringBuffer out, String stylesheet) {
out.append("<HTML><HEAD><TITLE>" + pluginName + "</TITLE>");
@@ -1041,10 +921,11 @@
Set failed = new HashSet(failedURIs);
Iterator it=queued.iterator();
- out.append("<br/>Size :"+runningFetches.size());
+ out.append("<br/>Size :"+runningFetches.size()+"<br/>");
appendList(runningFetches,out,stylesheet);
out.append("<p><a href=\"?list="+"running"+"\">Show all</a><br/></p>");
- out.append("<br/>Size :"+queued.size());
+ out.append("<p><h3>Queued URIs</h3></p>");
+ out.append("<br/>Size :"+queued.size()+"<br/>");
int i = 0;
while(it.hasNext()){
if(i<=maxShownURIs){
@@ -1054,15 +935,19 @@
i++;
}
out.append("<p><a href=\"?list="+"queued"+"\">Show all</a><br/></p>");
- out.append("<br/>Size :"+visited.size());
+ out.append("<p><h3>Visited URIs</h3></p>");
+ out.append("<br/>Size :"+visited.size()+"<br/>");
appendList(visited,out,stylesheet);
out.append("<p><a href=\"?list="+"visited"+"\">Show all</a><br/></p>");
- out.append("<br/>Size :"+failed.size());
+ out.append("<p><h3>Failed URIs</h3></p>");
+ out.append("<br/>Size :"+failed.size()+"<br/>");
appendList(failed,out,stylesheet);
out.append("<p><a href=\"?list="+"failed"+"\">Show all</a><br/></p>");
}
+
+
private void appendDefaultHeader(StringBuffer out, String stylesheet){
out.append("<HTML><HEAD><TITLE>" + pluginName + "</TITLE>");
if(stylesheet != null)
@@ -1073,6 +958,8 @@
out.append("<form method=\"GET\"><input type=\"text\" name=\"adduri\"
/><br/><br/>");
out.append("<input type=\"submit\" value=\"Add uri\" /></form>");
}
+
+
private void appendList(Set list,StringBuffer out, String stylesheet){
Iterator it = list.iterator();
int i = 0;
@@ -1081,169 +968,130 @@
out.append("<code>"+it.next().toString()+"</code><br/>");
}
else{
- //out.append("<form method=\"GET\"><input
type=\"submit\" name=\"Showall\" />");
-// if(listname.equals("visited"))
-// out.append("<p><a href=\"?list="+listname+">Showall
visited</a><br/></p>");
-// if(listname.equals("failed"))
-// out.append("<p><a
href=\"?list="+listname+">Showall failed</a><br/></p>");
break;
}
i++;
-
+ }
}
-
-}
+/**
+ * creates the callback object for each page.
+ *<p>Used to create inlinks and outlinks for each page separately.
+ * @author swati
+ *
+ */
public class PageCallBack implements FoundURICallback{
- int id;
-
+ Integer id;
+ /*
+ * id of the page as referenced in uriIds
+ */
PageCallBack(){
id = 0;
}
+
public void foundURI(FreenetURI uri){
- //now we have the id of the page that had called this link
+
queueURI(uri);
- int iduri = (Integer) uriIds.get(uri);
- Vector outlink = (Vector) outlinks.get(id);
- if(!outlink.contains(iduri))
+ Integer iduri = (Integer) uriIds.get(uri);
+
+ if(outlinks.containsKey(id)){
+ Vector outlink = (Vector) outlinks.get(id);
+ if(!outlink.contains(iduri))
+ outlink.add(iduri);
+ outlinks.remove(id);
+ outlinks.put(id, outlink);
+ }
+ else
+ {
+ Vector outlink = new Vector();
outlink.add(iduri);
- outlinks.remove(id);
- outlinks.put(id, outlink);
- try{
- FileWriter out = new FileWriter("outlink",true);
- out.write(" id "+id+" size "+ outlink.size()+" \n");
- out.close();
- }catch(Exception e){}
+ outlinks.put(id, outlink);
+ }
if(inlinks.containsKey(iduri)){
Vector inlink = (Vector) inlinks.get(iduri);
- try{
- FileWriter out = new FileWriter("inlink",true);
- out.write(" id "+iduri+" size "+
inlink.size()+" \n");
- out.close();
- }catch(Exception e){}
-
if(!inlink.contains(id)) inlink.add(id);
inlinks.remove(iduri);
inlinks.put(iduri, inlink);
-
}
+ else
+ {
+ Vector inlink = new Vector();
+ inlink.add(id);
+ inlinks.put(iduri, inlink);
+ }
+
startSomeRequests();
}
+
+
public void onText(String s, String type, URI baseURI){
- try{
- FileWriter outp = new FileWriter("ontext",true);
- outp.write("inside on text with id"+id+" \n");
- outp.close();
- }catch(Exception e){}
-// FreenetURI uri;
-// try {
-// uri = new FreenetURI(baseURI.getPath().substring(1));
-// } catch (MalformedURLException e) {
-// Logger.error(this, "Caught " + e, e);
-// return;
-// }
-
-
-
+
if((type != null) && (type.length() != 0) &&
type.toLowerCase().equals("title")
- && (s != null) && (s.length() != 0) && (s.indexOf('\n') <
0)) {
+ && (s != null) && (s.length() != 0) &&
(s.indexOf('\n') < 0)) {
/* We should have a correct title */
- // titlesOfURIs.put(uri.toString(), s);
titlesOfIds.put(id, s);
-
type = "title";
-
}
else type = null;
-
String[] words = s.split("[^A-Za-z0-9]");
-
Integer lastPosition = null;
-
- //lastPosition = (Integer)lastPositionByURI.get(uri.toString());
lastPosition = (Integer)lastPositionById.get(id);
+
if(lastPosition == null)
lastPosition = new Integer(1); /* We start to count
from 1 */
-
for (int i = 0; i < words.length; i++) {
String word = words[i];
if ((word == null) || (word.length() == 0))
continue;
word = word.toLowerCase();
try{
- if(type == null)
- addWord(word, lastPosition.intValue() + i, id);
- else
- addWord(word, -1 * (i+1), id);
+ if(type == null)
+ addWord(word, lastPosition.intValue() +
i, id);
+ else
+ addWord(word, -1 * (i+1), id);
}
catch (Exception e){}
}
-
+
if(type == null) {
lastPosition = new Integer(lastPosition.intValue() +
words.length);
- // lastPositionByURI.put(uri.toString(), lastPosition);
lastPositionById.put(id, lastPosition);
}
-
+
}
-private synchronized void addWord(String word, int position,int id) throws
Exception{
-
-
+
+ private synchronized void addWord(String word, int position,Integer id)
throws Exception{
if(word.length() < 3)
return;
-
- //word = word.intern();
-
- //FreenetURI[] uris = (FreenetURI[]) urisByWord.get(word);
Integer[] ids = (Integer[]) idsByWord.get(word);
-
- // urisWithWords.add(uri);
idsWithWords.add(id);
- try{
- FileWriter outp = new FileWriter("addWord",true);
- outp.write("ID ="+id+" uri ="+idUris.get(id)+"\n");
- outp.close();
- }catch(Exception e){}
-// FileWriter outp = new FileWriter("uricheck",true);
-// outp.write(uri.getDocName()+"\n");
-// outp.write(uri.getKeyType()+"\n");
-// outp.write(uri.getMetaString()+"\n");
-// outp.write(uri.getGuessableKey()+"\n");
-// outp.write(uri.hashCode()+"\n");
-// outp.write(uri.getPreferredFilename()+"\n");
-//
-// outp.close();
/* Word position indexation */
HashMap wordPositionsForOneUri =
(HashMap)positionsByWordById.get(id); /* For a given URI, take as key a word,
and gives position */
-
if(wordPositionsForOneUri == null) {
wordPositionsForOneUri = new HashMap();
wordPositionsForOneUri.put(word, new Integer[] { new
Integer(position) });
- //positionsByWordByURI.put(uri.toString(),
wordPositionsForOneUri);
positionsByWordById.put(id, wordPositionsForOneUri);
- } else {
+ }
+ else {
Integer[] positions =
(Integer[])wordPositionsForOneUri.get(word);
-
if(positions == null) {
positions = new Integer[] { new
Integer(position) };
wordPositionsForOneUri.put(word, positions);
- } else {
+ }
+ else {
Integer[] newPositions = new
Integer[positions.length + 1];
-
System.arraycopy(positions, 0, newPositions, 0,
positions.length);
newPositions[positions.length] = new
Integer(position);
-
wordPositionsForOneUri.put(word, newPositions);
}
}
-
+
if (ids == null) {
idsByWord.put(word, new Integer[] { id });
-
} else {
for (int i = 0; i < ids.length; i++) {
if (ids[i].equals(id))
@@ -1254,30 +1102,29 @@
newIDs[ids.length] = id;
idsByWord.put(word, newIDs);
}
- //the new word is added here in urisByWord
+
tMap.put(MD5(word), word);
long time_indexing = System.currentTimeMillis();
if (tProducedIndex + minTimeBetweenEachIndexRewriting * 10 <
System.currentTimeMillis()) {
try {
- //produceIndex();
- //check();
-
if(indexing){
- generateIndex2();
- produceIndex2();
- if((System.currentTimeMillis() -
time_indexing)/(System.currentTimeMillis() - tProducedIndex) >
MAX_TIME_SPENT_INDEXING) indexing= false;
- else indexing = true;
+ generateIndex2();
+ produceIndex2();
+ /*
+ * ensures that index production
doesn't eat up the processor time
+ */
+ if((System.currentTimeMillis() -
time_indexing)/(System.currentTimeMillis() - tProducedIndex) >
MAX_TIME_SPENT_INDEXING) indexing= false;
+ else indexing = true;
}
-
} catch (IOException e) {
Logger.error(this, "Caught " + e + " while
creating index", e);
}
tProducedIndex = System.currentTimeMillis();
}
-
}
-
}
+
+
public String handleHTTPPut(HTTPRequest request) throws PluginHTTPException{
return null;
}
@@ -1292,5 +1139,4 @@
queueURI(uri);
}
-
}