Author: swatig0
Date: 2007-08-16 20:47:44 +0000 (Thu, 16 Aug 2007)
New Revision: 14731
Modified:
trunk/plugins/XMLSpider/XMLSpider.java
Log:
Track outlinks and inlinks for each page
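
This revision hands each fetched page its own PageCallBack, so the content filter reports links against the id of the page being parsed; for every page id the spider then records which ids it links to (outlinks) and which ids link to it (inlinks). A minimal sketch of that bookkeeping, condensed from the committed foundURI code (the recordLink helper itself is hypothetical):

    // outlinks: page id -> Vector of ids this page links to
    // inlinks:  page id -> Vector of ids linking to this page
    private synchronized void recordLink(Integer from, Integer to) {
        Vector out = (Vector) outlinks.get(from);
        if (out == null) out = new Vector();
        if (!out.contains(to)) out.add(to);
        outlinks.put(from, out);

        Vector in = (Vector) inlinks.get(to);
        if (in == null) in = new Vector();
        if (!in.contains(from)) in.add(from);
        inlinks.put(to, in);
    }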
Modified: trunk/plugins/XMLSpider/XMLSpider.java
===================================================================
--- trunk/plugins/XMLSpider/XMLSpider.java 2007-08-16 20:45:08 UTC (rev 14730)
+++ trunk/plugins/XMLSpider/XMLSpider.java 2007-08-16 20:47:44 UTC (rev 14731)
@@ -4,6 +4,7 @@
package plugins.XMLSpider;
import java.io.File;
+import java.io.FileWriter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
@@ -12,11 +13,14 @@
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
+import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.Vector;
@@ -29,10 +33,11 @@
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
-
+import org.w3c.dom.Attr;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import freenet.client.ClientMetadata;
@@ -44,6 +49,7 @@
import freenet.client.async.ClientCallback;
import freenet.client.async.ClientGetter;
import freenet.client.async.USKCallback;
+import freenet.clients.http.PageMaker;
import freenet.clients.http.ToadletContext;
import freenet.clients.http.ToadletContextClosedException;
import freenet.clients.http.filter.ContentFilter;
@@ -61,13 +67,15 @@
import freenet.pluginmanager.FredPluginThreadless;
import freenet.pluginmanager.PluginHTTPException;
import freenet.pluginmanager.PluginRespirator;
+import freenet.support.HTMLNode;
import freenet.support.Logger;
+import freenet.support.MultiValueTable;
import freenet.support.api.Bucket;
import freenet.support.api.HTTPRequest;
/**
- * XMLSpider. Produces xml index for searching words.
- * In case the size of the index grows up a specific threshold the index is split into several subindices.
+ * XMLSpider. Produces an index for searching words.
+ * In case the size of the index grows beyond a specific threshold, the index is split into several subindices.
* The indexing key is the md5 hash of the word.
*
* @author swati goyal
@@ -76,70 +84,44 @@
public class XMLSpider implements FredPlugin, FredPluginHTTP, FredPluginThreadless, FredPluginHTTPAdvanced, HttpPlugin, ClientCallback, USKCallback{
long tProducedIndex;
- /**
- * Stores the found words along with md5
- */
- public TreeMap tMap = new TreeMap();
+ private TreeMap tMap = new TreeMap();
int count;
// URIs visited, or fetching, or queued. Added once then forgotten about.
- /**
- *
- * Lists the uris that have been vistied by the spider
- */
- public final HashSet visitedURIs = new HashSet();
- private final HashSet idsWithWords = new HashSet();
- /**
- *
- * Lists the uris that were visited but failed.
- */
- public final HashSet failedURIs = new HashSet();
-
+ private final HashSet visitedURIs = new HashSet();
+ private final HashSet urisWithWords = new HashSet();
+ private final HashSet idsWithWords = new HashSet();
+ private final HashSet failedURIs = new HashSet();
private final HashSet queuedURISet = new HashSet();
- /**
- *
- * Lists the uris that are still queued.
- */
- public final LinkedList queuedURIList = new LinkedList();
+ private final LinkedList queuedURIList = new LinkedList();
private final HashMap runningFetchesByURI = new HashMap();
-
+ private final HashMap urisByWord = new HashMap();
private final HashMap idsByWord = new HashMap();
-
+ private final HashMap titlesOfURIs = new HashMap();
private final HashMap titlesOfIds = new HashMap();
private final HashMap uriIds = new HashMap();
private final HashMap idUris = new HashMap();
- /**
- * Lists the outlinks from a particular page,
- * </br> indexed by the id of page uri
- */
- public final HashMap outlinks = new HashMap();
- /**
- * Lists the inlinks to a particular page,
- * indexed by the id of page uri.
- */
- public final HashMap inlinks = new HashMap();
+ private final HashMap outlinks = new HashMap();
+ private final HashMap inlinks = new HashMap();
private Vector indices;
private int match;
- private Integer id;
-
+ private int id;
+ private Vector list;
private boolean indexing ;
private static final int minTimeBetweenEachIndexRewriting = 10;
/**
- * directory where the generated indices are stored.
+ * DEFAULT_INDEX_DIR is the directory where the generated indices are stored.
* Needs to be created before it can be used
*/
- public static final String DEFAULT_INDEX_DIR = "myindex4/";
- /**
- * Lists the allowed mime types of the fetched page.
- */
+ private static final String DEFAULT_INDEX_DIR = "myindex4/";
public Set allowedMIMETypes;
- private static final int MAX_ENTRIES = 10;
+ private static final int MAX_ENTRIES = 2;
private static final String pluginName = "XML spider";
/**
- * Gives the allowed fraction of total time spent on generating indices with
- * maximum value = 1; minimum value = 0.
+ * This gives the allowed fraction of total time spent on generating indices
+ * max value = 1; min value > 0.
*/
- public static final double MAX_TIME_SPENT_INDEXING = 0.5;
+ private static final double MAX_TIME_SPENT_INDEXING = 0.5;
private static final String indexTitle= "XMLSpider index";
private static final String indexOwner = "Freenet";
@@ -153,46 +135,47 @@
// Can have many; this limit only exists to save memory.
private static final int maxParallelRequests = 100;
private int maxShownURIs = 15;
-
+ private HashMap urisToNumbers;
private NodeClientCore core;
private FetchContext ctx;
private final short PRIORITY_CLASS = RequestStarter.BULK_SPLITFILE_PRIORITY_CLASS;
private boolean stopped = true;
PluginRespirator pr;
-/**
- * Adds the found uri to the list of to-be-retrieved uris. <p>Every usk uri added as ssk.
- * @param uri the new uri that needs to be fetched for further indexing
- */
- public synchronized void queueURI(FreenetURI uri) {
+
+ private synchronized void queueURI(FreenetURI uri) {
+ //not adding the html condition
if((uri.getKeyType()).equals("USK")){
if(uri.getSuggestedEdition() < 0)
uri = uri.setSuggestedEdition((-1)* uri.getSuggestedEdition());
try{
- uri = ((USK.create(uri)).getSSK()).getURI();
- (ctx.uskManager).subscribe(USK.create(uri),this, false, this);
+ uri = ((USK.create(uri)).getSSK()).getURI();
+ //all uris are added as ssk
+ (ctx.uskManager).subscribe(USK.create(uri),this, false, this);
}
catch(Exception e){}
}
-
+
if ((!visitedURIs.contains(uri)) && queuedURISet.add(uri)) {
queuedURIList.addLast(uri);
visitedURIs.add(uri);
uriIds.put(uri, id);
idUris.put(id, uri);
- id = new Integer(id.intValue()+1);
+ id++;
+
+ //the page object of the client will contain the uri of the current page
}
}
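
Besides queueing, this method now assigns each new URI the next integer page id and records the mapping in both directions, so later stages can translate between pages and ids; in outline (a sketch, not the committed lines):

    // uriIds : FreenetURI -> Integer,  idUris : Integer -> FreenetURI
    uriIds.put(uri, new Integer(id));
    idUris.put(new Integer(id), uri);
    id++; // `id` is a plain int counter as of this revision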
private void startSomeRequests() {
-
+
FreenetURI[] initialURIs = core.bookmarkManager.getBookmarkURIs();
for (int i = 0; i < initialURIs.length; i++)
{
- queueURI(initialURIs[i]);
+ queueURI(initialURIs[i]);
}
-
+
ArrayList toStart = null;
synchronized (this) {
if (stopped) {
@@ -200,31 +183,51 @@
}
int running = runningFetchesByURI.size();
int queued = queuedURIList.size();
-
+
if ((running >= maxParallelRequests) || (queued == 0))
return;
-
+
toStart = new ArrayList(Math.min(maxParallelRequests - running, queued));
-
+
for (int i = running; i < maxParallelRequests; i++) {
if (queuedURIList.isEmpty())
break;
FreenetURI uri = (FreenetURI) queuedURIList.removeFirst();
queuedURISet.remove(uri);
+// if((uri.getKeyType()).equals("USK")){
+// if(uri.getSuggestedEdition() < 0)
+// uri = uri.setSuggestedEdition((-1)* uri.getSuggestedEdition());
+// try{
+// (ctx.uskManager).subscribe(USK.create(uri),this, false, this);
+// }catch(Exception e){
+//
+// }
+
+ // }
ClientGetter getter = makeGetter(uri);
toStart.add(getter);
- }
+ }
}
- for (int i = 0; i < toStart.size(); i++) {
-
+ for (int i = 0; i < toStart.size(); i++) {
+
ClientGetter g = (ClientGetter) toStart.get(i);
try {
runningFetchesByURI.put(g.getURI(), g);
g.start();
- } catch (FetchException e) {
- onFailure(e, g);
+ FileWriter outp = new FileWriter("logfile2",true);
+ outp.write("URI "+g.getURI().toString()+'\n');
+
+ outp.close();
+ } catch (FetchException e) {
+ onFailure(e, g);
+ }
+ catch (IOException e){
+ Logger.error(this, "the logfile can not be written"+e.toString(), e);
+ }
+
}
- }
+ //}
+
}
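
Note the ad-hoc debug logging added here and below ("logfile2", "logfile", "indexing", "gen", "outlink", "inlink", "ontext", "addWord"): each site opens a FileWriter in append mode, writes one line, and closes it. A hypothetical helper that the repeated pattern could share (debugLog is not part of the commit):

    private void debugLog(String file, String line) {
        try {
            FileWriter w = new FileWriter(file, true); // append, create if missing
            w.write(line + '\n');
            w.close();
        } catch (IOException e) {
            Logger.error(this, "could not write " + file + ": " + e, e);
        }
    }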
@@ -232,12 +235,7 @@
ClientGetter g = new ClientGetter(this, core.requestStarters.chkFetchScheduler, core.requestStarters.sskFetchScheduler, uri, ctx, PRIORITY_CLASS, this, null, null);
return g;
}
-/**
- * Processes the successfully fetched uri for further outlinks.
- *
- * @param result
- * @param state
- */
+
public void onSuccess(FetchResult result, ClientGetter state) {
FreenetURI uri = state.getURI();
@@ -248,16 +246,22 @@
ClientMetadata cm = result.getMetadata();
Bucket data = result.asBucket();
String mimeType = cm.getMIMEType();
-
+
sizeOfURIs.put(uri.toString(), new Long(data.size()));
mimeOfURIs.put(uri.toString(), mimeType);
PageCallBack page = new PageCallBack();
page.id = (Integer) uriIds.get(uri);
inlinks.put(page.id, new Vector());
outlinks.put(page.id, new Vector());
-
- //instead of passing the current object, the pagecallback object for every page is passed to the content filter
- // this is to allow inlinks and outlinks be indexed by specific pages
+
+ try{
+ FileWriter output = new FileWriter("logfile",true);
+ output.write(uri.toString()+" page " + page.id +"\n");
+ output.close();
+ }
+ catch(Exception e){
+ Logger.error(this, "Could not write to the logfile "+e.toString(), e);
+ }
try {
ContentFilter.filter(data, ctx.bucketFactory, mimeType, uri.toURI("http://127.0.0.1:8888/"), page);
} catch (UnsafeContentTypeException e) {
@@ -273,15 +277,25 @@
public void onFailure(FetchException e, ClientGetter state) {
FreenetURI uri = state.getURI();
-
+// try{
+// FileWriter outp = new FileWriter("failed",true);
+// outp.write("failed "+e.toString()+" for "+uri+'\n');
+// outp.close();
+//
+// }catch(Exception e2){
+//
+// }
synchronized (this) {
runningFetchesByURI.remove(uri);
failedURIs.add(uri);
}
if (e.newURI != null)
queueURI(e.newURI);
-
+// else
+// queueURI(uri);
startSomeRequests();
+
+
}
public void onSuccess(BaseClientPutter state) {
@@ -307,12 +321,19 @@
*/
private synchronized void produceIndex2() throws IOException,NoSuchAlgorithmException {
// Produce the main index file.
-
+
+ //the number of bits to consider for matching
+
+
+// if (urisByWord.isEmpty() || urisWithWords.isEmpty()) {
+// System.out.println("No URIs with words");
+// return;
+// }
+
if (idsByWord.isEmpty() || idsWithWords.isEmpty()) {
System.out.println("No URIs with words");
return;
}
- //the main index file
File outputFile = new File(DEFAULT_INDEX_DIR+"index.xml");
StreamResult resultStream;
resultStream = new StreamResult(outputFile);
@@ -330,7 +351,7 @@
try {
xmlBuilder = xmlFactory.newDocumentBuilder();
} catch(javax.xml.parsers.ParserConfigurationException e) {
-
+ /* Will (should ?) never happen */
Logger.error(this, "Spider: Error while initializing XML generator: "+e.toString());
return;
}
@@ -346,42 +367,57 @@
/* -> title */
Element subHeaderElement = xmlDoc.createElement("title");
Text subHeaderText = xmlDoc.createTextNode(indexTitle);
-
+
subHeaderElement.appendChild(subHeaderText);
headerElement.appendChild(subHeaderElement);
/* -> owner */
subHeaderElement = xmlDoc.createElement("owner");
subHeaderText = xmlDoc.createTextNode(indexOwner);
-
+
subHeaderElement.appendChild(subHeaderText);
headerElement.appendChild(subHeaderElement);
-
+
/* -> owner email */
if(indexOwnerEmail != null) {
subHeaderElement = xmlDoc.createElement("email");
subHeaderText = xmlDoc.createTextNode(indexOwnerEmail);
-
+
subHeaderElement.appendChild(subHeaderText);
headerElement.appendChild(subHeaderElement);
}
- /*
- * the max number of digits in md5 to be used for matching with the search query is stored in the xml
- */
+
+
+ //String[] words = (String[]) urisByWord.keySet().toArray(new String[urisByWord.size()]);
+ //Arrays.sort(words);
+
Element prefixElement = xmlDoc.createElement("prefix");
+ //prefixElement.setAttribute("value",match+"");
+ //this match will be set after processing the TreeMap
+
+
+
+ //all index files are ready
/* Adding word index */
Element keywordsElement = xmlDoc.createElement("keywords");
for(int i = 0;i<indices.size();i++){
-
+
//generateSubIndex(DEFAULT_INDEX_DIR+"index_"+Integer.toHexString(i)+".xml");
Element subIndexElement = xmlDoc.createElement("subIndex");
+// if(i<=9)
+// subIndexElement.setAttribute("key",i+"");
+// else
+// subIndexElement.setAttribute("key",Integer.toHexString(i));
subIndexElement.setAttribute("key", (String) indices.elementAt(i));
//the subindex element key will contain the bits used for matching in that subindex
keywordsElement.appendChild(subIndexElement);
}
-
+
prefixElement.setAttribute("value",match+"");
+ // make sure that prefix is the first child of root Element
rootElement.appendChild(prefixElement);
rootElement.appendChild(headerElement);
+
+ //rootElement.appendChild(filesElement);
rootElement.appendChild(keywordsElement);
/* Serialization */
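
The main index produced here therefore has roughly the following shape (a sketch; the root element is created earlier in the method and is assumed here to be main_index, and each subIndex key is the md5 prefix served by index_<key>.xml):

    <main_index>
      <prefix value="2"/>
      <header>
        <title>XMLSpider index</title>
        <owner>Freenet</owner>
      </header>
      <keywords>
        <subIndex key="a"/>
        <subIndex key="b3"/>
      </keywords>
    </main_index>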
@@ -398,7 +434,7 @@
serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
serializer.setOutputProperty(OutputKeys.INDENT,"yes");
-
+
/* final step */
try {
serializer.transform(domSource, resultStream);
@@ -409,11 +445,12 @@
if(Logger.shouldLog(Logger.MINOR, this))
Logger.minor(this, "Spider: indexes regenerated.");
+
+ //the main xml file is generated
+ //now as each word is generated enter it into the respective subindex
+ //now the parsing will start and nodes will be added as needed
+
- //The main xml file is generated
- //As each word is generated enter it into the respective
subindex
- //The parsing will start and nodes will be added as needed
-
}
/**
* Generates the subindices.
@@ -423,36 +460,48 @@
* @throws Exception
*/
private synchronized void generateIndex2() throws Exception{
- //using the tMap generate the xml indices
+ // now we have the tree map; use the sorted md5 keys to generate the xml indices
+
+
if (idsByWord.isEmpty() || idsWithWords.isEmpty()) {
System.out.println("No URIs with words");
return;
}
+ // FreenetURI[] uris = (FreenetURI[]) urisWithWords.toArray(new FreenetURI[urisWithWords.size()]);
+ Integer[] ids = (Integer[]) idsWithWords.toArray(new Integer[idsWithWords.size()]);
+// urisToNumbers = new HashMap();
+// for (int i = 0; i < uris.length; i++) {
+// urisToNumbers.put(uris[i], new Integer(i));
+// }
indices = new Vector();
int prefix = 1;
match = 1;
Vector list = new Vector();
+ //String str = tMap.firstKey();
Iterator it = tMap.keySet().iterator();
-
+ FileWriter outp = new FileWriter("indexing");
+ outp.write("size = "+tMap.size()+"\n");
+ outp.close();
String str = (String) it.next();
- int i = 0;
+ int i = 0,index =0;
while(it.hasNext())
{
+ outp = new FileWriter("indexing",true);
String key =(String) it.next();
- //create a list of the words to be added in the same subindex
+ outp.write(key + "\n");
+ outp.close();
if(key.substring(0, prefix).equals(str.substring(0, prefix)))
- {i++;
- list.add(key);
- }
+ {i++;
+ list.add(key);
+ }
else {
- //generate the appropriate subindex with the current list
- generateSubIndex(prefix,list);
- str = key;
- list = new Vector();
- }
+ generateSubIndex(prefix,list);
+ str = key;
+ list = new Vector();
}
-
+ }
+
generateSubIndex(prefix,list);
}
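
generateIndex2 walks the md5-sorted keys of tMap, grouping consecutive keys that share a one-character prefix, and generateSubIndex below recursively lengthens the shared prefix until each group fits in MAX_ENTRIES. The recursion reduced to its core (a sketch using java.util.List; writeSubIndex stands in for generateXML):

    // keys: sorted md5 hex strings; prefix: number of leading chars shared so far
    void split(List keys, int prefix) throws Exception {
        if (keys.size() < MAX_ENTRIES) {
            writeSubIndex(keys, prefix); // emits index_<shared prefix>.xml
            return;
        }
        if (match <= prefix) match = prefix + 1; // longest prefix used anywhere
        int start = 0;
        for (int i = 1; i <= keys.size(); i++) {
            if (i == keys.size()
                    || !((String) keys.get(i)).startsWith(((String) keys.get(start)).substring(0, prefix + 1))) {
                split(keys.subList(start, i), prefix + 1);
                start = i;
            }
        }
    }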
private synchronized Vector subVector(Vector list, int begin, int end){
@@ -462,19 +511,15 @@
}
private synchronized void generateSubIndex(int p,Vector list) throws Exception{
- /*
- * if the list is less than max allowed entries in a file then directly generate the xml
- * otherwise split the list into further sublists
- * and iterate till the number of entries per subindex is less than the allowed value
- */
-
+
if(list.size() < MAX_ENTRIES)
- {
+ {
+ //the index can be generated from this list
generateXML(list,p);
}
else
{
- //prefix needs to be incremented
+ //this means that prefix needs to be incremented
if(match <= p) match = p+1;
int prefix = p+1;
int i =0;
@@ -484,30 +529,35 @@
{
String key = (String) list.elementAt(i);
if((key.substring(0, prefix)).equals(str.substring(0, prefix)))
- {
+ {
+ //index = i;
i++;
- }
+ }
else {
+
//generateXML(subVector(list,index,i-1),prefix);
generateSubIndex(prefix,subVector(list,index,i-1));
index = i;
str = key;
}
+
+
}
generateSubIndex(prefix,subVector(list,index,i-1));
}
}
+
-/**
- * generates the xml index with the given list of words with prefix number of matching bits in md5
- * @param list list of the words to be added in the index
- * @param prefix number of matching bits of md5
- * @throws Exception
- */
- public synchronized void generateXML (Vector list, int prefix) throws Exception
+ private synchronized void generateXML (Vector list, int prefix) throws Exception
{
+ FileWriter outp = new FileWriter("gen",true);
+
+
String p = ((String) list.elementAt(0)).substring(0, prefix);
+ outp.write("inside gen xml + "+p+"\n");
+
indices.add(p);
File outputFile = new File(DEFAULT_INDEX_DIR+"index_"+p+".xml");
+ //indices.add(p);
StreamResult resultStream;
resultStream = new StreamResult(outputFile);
@@ -517,66 +567,92 @@
DocumentBuilder xmlBuilder = null;
DOMImplementation impl = null;
Element rootElement = null;
+
xmlFactory = DocumentBuilderFactory.newInstance();
+
try {
xmlBuilder = xmlFactory.newDocumentBuilder();
} catch(javax.xml.parsers.ParserConfigurationException e) {
+ /* Will (should ?) never happen */
Logger.error(this, "Spider: Error while initializing XML generator: "+e.toString());
return;
}
+
impl = xmlBuilder.getDOMImplementation();
+
/* Starting to generate index */
+
xmlDoc = impl.createDocument(null, "sub_index", null);
rootElement = xmlDoc.getDocumentElement();
/* Adding header to the index */
Element headerElement = xmlDoc.createElement("header");
+
/* -> title */
Element subHeaderElement = xmlDoc.createElement("title");
Text subHeaderText = xmlDoc.createTextNode(indexTitle);
+
subHeaderElement.appendChild(subHeaderText);
headerElement.appendChild(subHeaderElement);
+
+
Element filesElement = xmlDoc.createElement("files"); /* filesElement != fileElement */
+
Element EntriesElement = xmlDoc.createElement("entries");
+
EntriesElement.setNodeValue(list.size()+"");
+ outp.write("size = "+list.size()+"\n");
EntriesElement.setAttribute("value", list.size()+"");
-
+ //all index files are ready
/* Adding word index */
Element keywordsElement = xmlDoc.createElement("keywords");
+ //words to be added
Vector fileid = new Vector();
for(int i =0;i<list.size();i++)
{
Element wordElement = xmlDoc.createElement("word");
String str = (String) tMap.get(list.elementAt(i));
+ outp.write("word "+str+"\n");
wordElement.setAttribute("v",str );
+ //FreenetURI[] urisForWord = (FreenetURI[]) urisByWord.get(str);
Integer[] idsForWord = (Integer[]) idsByWord.get(str);
+//
for (int j = 0; j < idsForWord.length; j++) {
Integer id = idsForWord[j];
+ //Integer x = (Integer) urisToNumbers.get(uri);
Integer x = id;
+ outp.write("x "+x+"\n");
if (x == null) {
Logger.error(this, "Eh?");
continue;
}
+//
Element uriElement = xmlDoc.createElement("file");
Element fileElement = xmlDoc.createElement("file");
uriElement.setAttribute("id", x.toString());
fileElement.setAttribute("id", x.toString());
+ //fileElement.setAttribute("key", uri.toString());
+ outp.write("uri "+(idUris.get(id)).toString()+"\n");
fileElement.setAttribute("key",(idUris.get(id)).toString());
- /* Position by position */
-
+//// /* Position by position */
+ //HashMap positionsForGivenWord = (HashMap)positionsByWordByURI.get(uri.toString());
HashMap positionsForGivenWord = (HashMap)positionsByWordById.get(x);
Integer[] positions = (Integer[])positionsForGivenWord.get(str);
+
StringBuffer positionList = new StringBuffer();
for(int k=0; k < positions.length ; k++) {
if(k!=0)
positionList.append(',');
+
positionList.append(positions[k].toString());
}
+
uriElement.appendChild(xmlDoc.createTextNode(positionList.toString()));
+
wordElement.appendChild(uriElement);
if(!fileid.contains(x))
{
@@ -584,8 +660,12 @@
filesElement.appendChild(fileElement);
}
}
+
+ //Element keywordsElement = (Element) root.getElementsByTagName("keywords").item(0);
keywordsElement.appendChild(wordElement);
+
}
+
rootElement.appendChild(EntriesElement);
rootElement.appendChild(headerElement);
rootElement.appendChild(filesElement);
@@ -602,8 +682,11 @@
Logger.error(this, "Spider: Error while serializing XML (transformFactory.newTransformer()): "+e.toString());
return;
}
+
+
serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
serializer.setOutputProperty(OutputKeys.INDENT,"yes");
+
/* final step */
try {
serializer.transform(domSource, resultStream);
@@ -614,22 +697,152 @@
if(Logger.shouldLog(Logger.MINOR, this))
Logger.minor(this, "Spider: indexes regenerated.");
+ outp.close();
}
+ public String search(String str,NodeList list) throws Exception
+ {
+ int prefix = str.length();
+ for(int i = 0;i<list.getLength();i++){
+ Element subIndex = (Element) list.item(i);
+ String key = subIndex.getAttribute("key");
+ if(key.equals(str)) return key;
+ }
+ return search(str.substring(0, prefix-1),list);
+ }
+
+
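
search() resolves a query hash to its subindex by repeatedly shortening the hash until it equals one of the subIndex keys from the main index; note it assumes some key will eventually match, otherwise the recursion runs past the empty string. A hypothetical caller (word is the query term and xmlDoc a parsed index.xml):

    String hash = MD5(word); // 32-char hex string of the query word
    String key = search(hash, xmlDoc.getElementsByTagName("subIndex"));
    File subIndexFile = new File(DEFAULT_INDEX_DIR + "index_" + key + ".xml");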
public void handleGet(HTTPRequest request, ToadletContext context) throws IOException, ToadletContextClosedException {
- /*
- * ignore
- */
+ String action = request.getParam("action");
+ PageMaker pageMaker = context.getPageMaker();
+ if ((action == null) || (action.length() == 0)) {
+ MultiValueTable responseHeaders = new MultiValueTable();
+ responseHeaders.put("Location", "?action=list");
+ context.sendReplyHeaders(301, "Redirect", responseHeaders, "text/html; charset=utf-8", 0);
+ return;
+ } else if ("list".equals(action)) {
+
+ String listName = request.getParam("listName", null);
+ HTMLNode pageNode = pageMaker.getPageNode("The XML Spider", context);
+ HTMLNode contentNode = pageMaker.getContentNode(pageNode);
+ /* create copies for multi-threaded use */
+ if (listName == null) {
+ Map runningFetches = new HashMap(runningFetchesByURI);
+ List queued = new ArrayList(queuedURIList);
+ Set visited = new HashSet(visitedURIs);
+ Set failed = new HashSet(failedURIs);
+ contentNode.addChild(createNavbar(pageMaker, runningFetches.size(), queued.size(), visited.size(), failed.size()));
+ contentNode.addChild(createAddBox(pageMaker, context));
+ contentNode.addChild(createList(pageMaker, "Running Fetches", "running", runningFetches.keySet(), maxShownURIs));
+ contentNode.addChild(createList(pageMaker, "Queued URIs", "queued", queued, maxShownURIs));
+ contentNode.addChild(createList(pageMaker, "Visited URIs", "visited", visited, maxShownURIs));
+ contentNode.addChild(createList(pageMaker, "Failed URIs", "failed", failed, maxShownURIs));
+ } else {
+ contentNode.addChild(createBackBox(pageMaker));
+ if ("failed".equals(listName)) {
+ Set failed = new HashSet(failedURIs);
+ contentNode.addChild(createList(pageMaker, "Failed URIs", "failed", failed, -1));
+ } else if ("visited".equals(listName)) {
+ Set visited = new HashSet(visitedURIs);
+ contentNode.addChild(createList(pageMaker, "Visited URIs", "visited", visited, -1));
+ } else if ("queued".equals(listName)) {
+ List queued = new ArrayList(queuedURIList);
+ contentNode.addChild(createList(pageMaker, "Queued URIs", "queued", queued, -1));
+ } else if ("running".equals(listName)) {
+ Map runningFetches = new HashMap(runningFetchesByURI);
+ contentNode.addChild(createList(pageMaker, "Running Fetches", "running", runningFetches.keySet(), -1));
+ }
+ }
+ MultiValueTable responseHeaders = new MultiValueTable();
+ byte[] responseBytes = pageNode.generate().getBytes("utf-8");
+ context.sendReplyHeaders(200, "OK", responseHeaders, "text/html; charset=utf-8", responseBytes.length);
+ context.writeData(responseBytes);
+ } else if ("add".equals(action)) {
+ String uriParam = request.getParam("key");
+ try {
+ FreenetURI uri = new FreenetURI(uriParam);
+ synchronized (this) {
+ failedURIs.remove(uri);
+ visitedURIs.remove(uri);
+ }
+ queueURI(uri);
+ startSomeRequests();
+ } catch (MalformedURLException mue1) {
+ sendSimpleResponse(context, "URL invalid", "The given URI is not valid.");
+ return;
+ }
+ MultiValueTable responseHeaders = new MultiValueTable();
+ responseHeaders.put("Location", "?action=list");
+ context.sendReplyHeaders(301, "Redirect", responseHeaders, "text/html; charset=utf-8", 0);
+ return;
+ }
}
-
public void handlePost(HTTPRequest request, ToadletContext context) throws IOException {
- /*
- * ignore
- */
}
+ private void sendSimpleResponse(ToadletContext context, String title, String message) throws ToadletContextClosedException, IOException {
+ PageMaker pageMaker = context.getPageMaker();
+ HTMLNode pageNode = pageMaker.getPageNode(title, context);
+ HTMLNode contentNode = pageMaker.getContentNode(pageNode);
+ HTMLNode infobox = contentNode.addChild(pageMaker.getInfobox("infobox-alter", title));
+ HTMLNode infoboxContent = pageMaker.getContentNode(infobox);
+ infoboxContent.addChild("#", message);
+ byte[] responseBytes = pageNode.generate().getBytes("utf-8");
+ context.sendReplyHeaders(200, "OK", new MultiValueTable(), "text/html; charset=utf-8", responseBytes.length);
+ context.writeData(responseBytes);
+ }
+
+ private HTMLNode createBackBox(PageMaker pageMaker) {
+ HTMLNode backbox = pageMaker.getInfobox((String) null);
+ HTMLNode backContent = pageMaker.getContentNode(backbox);
+ backContent.addChild("#", "Return to the ");
+ backContent.addChild("a", "href", "?action=list", "list of all URIs");
+ backContent.addChild("#", ".");
+ return backbox;
+ }
+
+ private HTMLNode createAddBox(PageMaker pageMaker, ToadletContext ctx) {
+ HTMLNode addBox = pageMaker.getInfobox("Add a URI");
+ HTMLNode formNode = pageMaker.getContentNode(addBox).addChild("form", new String[] { "action", "method" }, new String[] { "", "get" });
+ formNode.addChild("input", new String[] { "type", "name", "value" }, new String[] { "hidden", "action", "add" });
+ formNode.addChild("input", new String[] { "type", "size", "name", "value" }, new String[] { "text", "40", "key", "" });
+ formNode.addChild("input", new String[] { "type", "value" }, new String[] { "submit", "Add URI" });
+ return addBox;
+ }
+
+ private HTMLNode createNavbar(PageMaker pageMaker, int running, int queued, int visited, int failed) {
+ HTMLNode navbar = pageMaker.getInfobox("navbar", "Page Navigation");
+ HTMLNode list = pageMaker.getContentNode(navbar).addChild("ul");
+ list.addChild("li").addChild("a", "href", "#running", "Running (" + running + ')');
+ list.addChild("li").addChild("a", "href", "#queued", "Queued (" + queued + ')');
+ list.addChild("li").addChild("a", "href", "#visited", "Visited (" + visited + ')');
+ list.addChild("li").addChild("a", "href", "#failed", "Failed (" + failed + ')');
+ return navbar;
+ }
+
+ private HTMLNode createList(PageMaker pageMaker, String listName, String anchorName, Collection collection, int maxCount) {
+ HTMLNode listNode = new HTMLNode("div");
+ listNode.addChild("a", "name", anchorName);
+ HTMLNode listBox = pageMaker.getInfobox(listName);
+ HTMLNode listContent = pageMaker.getContentNode(listBox);
+ listNode.addChild(listBox);
+ Iterator collectionItems = collection.iterator();
+ int itemCount = 0;
+ while (collectionItems.hasNext()) {
+ FreenetURI uri = (FreenetURI) collectionItems.next();
+ listContent.addChild("#", uri.toString());
+ listContent.addChild("br");
+ if (itemCount++ == maxCount) {
+ listContent.addChild("br");
+ listContent.addChild("a", "href", "?action=list&listName=" + anchorName, "Show all\u2026");
+ break;
+ }
+ }
+ return listNode;
+ }
+
/**
* @see freenet.oldplugins.plugin.Plugin#getPluginName()
*/
@@ -699,11 +912,8 @@
}
return buf.toString();
}
-
- /*
- * calculate the md5 for a given string
- */
- private static String MD5(String text) throws NoSuchAlgorithmException, UnsupportedEncodingException {
+ //this function will return the String representation of the MD5 hash for the input string
+ public static String MD5(String text) throws NoSuchAlgorithmException, UnsupportedEncodingException {
MessageDigest md;
md = MessageDigest.getInstance("MD5");
byte[] md5hash = new byte[32];
@@ -823,7 +1033,7 @@
public void runPlugin(PluginRespirator pr){
this.pr = pr;
- this.id = new Integer(0);
+ this.id = 0;
this.core = pr.getNode().clientCore;
this.ctx = core.makeClient((short) 0).getFetchContext();
ctx.maxSplitfileBlockRetries = 10;
@@ -834,9 +1044,9 @@
allowedMIMETypes.add(new String("text/html"));
allowedMIMETypes.add(new String("text/plain"));
allowedMIMETypes.add(new String("application/xhtml+xml"));
-
+// allowedMIMETypes.add(new String("application/zip"));
ctx.allowedMIMETypes = new HashSet(allowedMIMETypes);
-
+// ctx.allowedMIMETypes.add("text/html");
tProducedIndex = System.currentTimeMillis();
indexing = true;
stopped = false;
@@ -855,12 +1065,11 @@
starterThread.start();
}
-/**
- * Interface to the Spider data
- */
public String handleHTTPGet(HTTPRequest request) throws PluginHTTPException{
StringBuffer out = new StringBuffer();
-
+ // need to produce pretty html
+ //later fredpluginhttpadvanced will give the interface
+ //this brings us to the page from visit
String listname = request.getParam("list");
if(listname.length() != 0)
{
@@ -888,7 +1097,6 @@
}
return out.toString();
}
-
private void appendList(String listname, StringBuffer out, String stylesheet)
{
Iterator it = (runningFetchesByURI.keySet()).iterator();
@@ -903,7 +1111,6 @@
while(it.hasNext())
out.append("<code>"+it.next().toString()+"</code><br/>");
}
-
private void appendDefaultPageStart(StringBuffer out, String stylesheet) {
out.append("<HTML><HEAD><TITLE>" + pluginName + "</TITLE>");
@@ -921,11 +1128,10 @@
Set failed = new HashSet(failedURIs);
Iterator it=queued.iterator();
- out.append("<br/>Size :"+runningFetches.size()+"<br/>");
+ out.append("<br/>Size :"+runningFetches.size());
appendList(runningFetches,out,stylesheet);
out.append("<p><a href=\"?list="+"running"+"\">Show all</a><br/></p>");
- out.append("<p><h3>Queued URIs</h3></p>");
- out.append("<br/>Size :"+queued.size()+"<br/>");
+ out.append("<br/>Size :"+queued.size());
int i = 0;
while(it.hasNext()){
if(i<=maxShownURIs){
@@ -935,19 +1141,15 @@
i++;
}
out.append("<p><a href=\"?list="+"queued"+"\">Show all</a><br/></p>");
- out.append("<p><h3>Visited URIs</h3></p>");
- out.append("<br/>Size :"+visited.size()+"<br/>");
+ out.append("<br/>Size :"+visited.size());
appendList(visited,out,stylesheet);
out.append("<p><a href=\"?list="+"visited"+"\">Show all</a><br/></p>");
- out.append("<p><h3>Failed URIs</h3></p>");
- out.append("<br/>Size :"+failed.size()+"<br/>");
+ out.append("<br/>Size :"+failed.size());
appendList(failed,out,stylesheet);
out.append("<p><a href=\"?list="+"failed"+"\">Show all</a><br/></p>");
}
-
-
private void appendDefaultHeader(StringBuffer out, String stylesheet){
out.append("<HTML><HEAD><TITLE>" + pluginName + "</TITLE>");
if(stylesheet != null)
@@ -958,8 +1160,6 @@
out.append("<form method=\"GET\"><input type=\"text\" name=\"adduri\" /><br/><br/>");
out.append("<input type=\"submit\" value=\"Add uri\" /></form>");
}
-
-
private void appendList(Set list,StringBuffer out, String stylesheet){
Iterator it = list.iterator();
int i = 0;
@@ -968,130 +1168,169 @@
out.append("<code>"+it.next().toString()+"</code><br/>");
}
else{
+ //out.append("<form method=\"GET\"><input type=\"submit\" name=\"Showall\" />");
+// if(listname.equals("visited"))
+// out.append("<p><a href=\"?list="+listname+">Showall visited</a><br/></p>");
+// if(listname.equals("failed"))
+// out.append("<p><a href=\"?list="+listname+">Showall failed</a><br/></p>");
break;
}
i++;
- }
+
}
+
+}
-/**
- * creates the callback object for each page.
- *<p>Used to create inlinks and outlinks for each page separately.
- * @author swati
- *
- */
public class PageCallBack implements FoundURICallback{
- Integer id;
- /*
- * id of the page as refrenced in uriIds
- */
+ int id;
+
PageCallBack(){
- id = new Integer(0);
+ id = 0;
}
-
public void foundURI(FreenetURI uri){
-
+ //now we have the id of the page that contains this link
queueURI(uri);
int iduri = (Integer) uriIds.get(uri);
-
- if(outlinks.containsKey(id)){
- Vector outlink = (Vector) outlinks.get(id);
- if(!outlink.contains(iduri))
- outlink.add(iduri);
- outlinks.remove(id);
- outlinks.put(id, outlink);
- }
- else
- {
- Vector outlink = new Vector();
+ Vector outlink = (Vector) outlinks.get(id);
+ if(!outlink.contains(iduri))
outlink.add(iduri);
- outlinks.put(id, outlink);
- }
+ outlinks.remove(id);
+ outlinks.put(id, outlink);
+ try{
+ FileWriter out = new FileWriter("outlink",true);
+ out.write(" id "+id+" size "+ outlink.size()+" \n");
+ out.close();
+ }catch(Exception e){}
if(inlinks.containsKey(iduri)){
Vector inlink = (Vector) inlinks.get(iduri);
+ try{
+ FileWriter out = new FileWriter("inlink",true);
+ out.write(" id "+iduri+" size "+ inlink.size()+" \n");
+ out.close();
+ }catch(Exception e){}
+
if(!inlink.contains(id)) inlink.add(id);
inlinks.remove(iduri);
inlinks.put(iduri, inlink);
+
}
- else
- {
- Vector inlink = new Vector();
- inlink.add(id);
- inlinks.put(iduri, inlink);
- }
-
startSomeRequests();
}
-
-
public void onText(String s, String type, URI baseURI){
-
+ try{
+ FileWriter outp = new FileWriter("ontext",true);
+ outp.write("inside on text with id"+id+" \n");
+ outp.close();
+ }catch(Exception e){}
+// FreenetURI uri;
+// try {
+// uri = new FreenetURI(baseURI.getPath().substring(1));
+// } catch (MalformedURLException e) {
+// Logger.error(this, "Caught " + e, e);
+// return;
+// }
+
+
+
if((type != null) && (type.length() != 0) && type.toLowerCase().equals("title")
- && (s != null) && (s.length() != 0) && (s.indexOf('\n') < 0)) {
+ && (s != null) && (s.length() != 0) && (s.indexOf('\n') < 0)) {
/* We should have a correct title */
+ // titlesOfURIs.put(uri.toString(), s);
titlesOfIds.put(id, s);
+
type = "title";
+
}
else type = null;
+
String[] words = s.split("[^A-Za-z0-9]");
+
Integer lastPosition = null;
+
+ //lastPosition = (Integer)lastPositionByURI.get(uri.toString());
lastPosition = (Integer)lastPositionById.get(id);
-
if(lastPosition == null)
lastPosition = new Integer(1); /* We start to count from 1 */
+
for (int i = 0; i < words.length; i++) {
String word = words[i];
if ((word == null) || (word.length() == 0))
continue;
word = word.toLowerCase();
try{
- if(type == null)
- addWord(word, lastPosition.intValue() + i, id);
- else
- addWord(word, -1 * (i+1), id);
+ if(type == null)
+ addWord(word, lastPosition.intValue() + i, id);
+ else
+ addWord(word, -1 * (i+1), id);
}
catch (Exception e){}
}
-
+
if(type == null) {
lastPosition = new Integer(lastPosition.intValue() + words.length);
+ // lastPositionByURI.put(uri.toString(), lastPosition);
lastPositionById.put(id, lastPosition);
}
-
+
}
-
- private synchronized void addWord(String word, int position,Integer id) throws Exception{
+private synchronized void addWord(String word, int position,int id) throws Exception{
+
+
if(word.length() < 3)
return;
+
+ //word = word.intern();
+
+ //FreenetURI[] uris = (FreenetURI[]) urisByWord.get(word);
Integer[] ids = (Integer[]) idsByWord.get(word);
+
+ // urisWithWords.add(uri);
idsWithWords.add(id);
+ try{
+ FileWriter outp = new FileWriter("addWord",true);
+ outp.write("ID ="+id+" uri ="+idUris.get(id)+"\n");
+ outp.close();
+ }catch(Exception e){}
+// FileWriter outp = new FileWriter("uricheck",true);
+// outp.write(uri.getDocName()+"\n");
+// outp.write(uri.getKeyType()+"\n");
+// outp.write(uri.getMetaString()+"\n");
+// outp.write(uri.getGuessableKey()+"\n");
+// outp.write(uri.hashCode()+"\n");
+// outp.write(uri.getPreferredFilename()+"\n");
+//
+// outp.close();
/* Word position indexation */
HashMap wordPositionsForOneUri = (HashMap)positionsByWordById.get(id); /* For a given URI, take as key a word, and gives position */
+
if(wordPositionsForOneUri == null) {
wordPositionsForOneUri = new HashMap();
wordPositionsForOneUri.put(word, new Integer[] { new Integer(position) });
+ //positionsByWordByURI.put(uri.toString(), wordPositionsForOneUri);
positionsByWordById.put(id, wordPositionsForOneUri);
- }
- else {
+ } else {
Integer[] positions = (Integer[])wordPositionsForOneUri.get(word);
+
if(positions == null) {
positions = new Integer[] { new Integer(position) };
wordPositionsForOneUri.put(word, positions);
- }
- else {
+ } else {
Integer[] newPositions = new Integer[positions.length + 1];
+
System.arraycopy(positions, 0, newPositions, 0, positions.length);
newPositions[positions.length] = new Integer(position);
+
wordPositionsForOneUri.put(word, newPositions);
}
}
-
+
if (ids == null) {
idsByWord.put(word, new Integer[] { id });
+
} else {
for (int i = 0; i < ids.length; i++) {
if (ids[i].equals(id))
@@ -1102,29 +1341,30 @@
newIDs[ids.length] = id;
idsByWord.put(word, newIDs);
}
-
+ //the new word is entered into tMap here, keyed by its md5
tMap.put(MD5(word), word);
long time_indexing = System.currentTimeMillis();
if (tProducedIndex + minTimeBetweenEachIndexRewriting * 10 < System.currentTimeMillis()) {
try {
+ //produceIndex();
+ //check();
+
if(indexing){
- generateIndex2();
- produceIndex2();
- /*
- * ensures that index production doesn't eat up the processor time
- */
- if((System.currentTimeMillis() - time_indexing)/(System.currentTimeMillis() - tProducedIndex) > MAX_TIME_SPENT_INDEXING) indexing= false;
- else indexing = true;
+ generateIndex2();
+ produceIndex2();
+ if((System.currentTimeMillis() - time_indexing)/(System.currentTimeMillis() - tProducedIndex) > MAX_TIME_SPENT_INDEXING) indexing= false;
+ else indexing = true;
}
+
} catch (IOException e) {
Logger.error(this, "Caught " + e + " while creating index", e);
}
tProducedIndex = System.currentTimeMillis();
}
+
}
+
}
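
For the position index, addWord keeps one HashMap per page id, mapping each word to a growing Integer[] of positions (body words count up from 1, title words get negative positions). The copy-and-grow step, isolated as a sketch of the committed logic:

    // positionsByWordById : Integer id -> HashMap(word -> Integer[] positions)
    private static Integer[] appendPosition(Integer[] positions, int position) {
        if (positions == null)
            return new Integer[] { new Integer(position) };
        Integer[] grown = new Integer[positions.length + 1];
        System.arraycopy(positions, 0, grown, 0, positions.length);
        grown[positions.length] = new Integer(position);
        return grown;
    }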
-
-
public String handleHTTPPut(HTTPRequest request) throws PluginHTTPException{
return null;
}
@@ -1139,4 +1379,5 @@
queueURI(uri);
}
+
}