Author: swatig0
Date: 2007-08-18 19:17:32 +0000 (Sat, 18 Aug 2007)
New Revision: 14795
Modified:
trunk/plugins/XMLSpider/XMLSpider.java
Log:
XMLSpider with titles fixed
Modified: trunk/plugins/XMLSpider/XMLSpider.java
===================================================================
--- trunk/plugins/XMLSpider/XMLSpider.java 2007-08-18 19:17:17 UTC (rev 14794)
+++ trunk/plugins/XMLSpider/XMLSpider.java 2007-08-18 19:17:32 UTC (rev 14795)
@@ -4,7 +4,6 @@
package plugins.XMLSpider;
import java.io.File;
-import java.io.FileWriter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
@@ -87,13 +86,13 @@
* Lists the uris that have been vistied by the spider
*/
public final HashSet visitedURIs = new HashSet();
- private final HashSet idsWithWords = new HashSet();
+ private final HashSet idsWithWords = new HashSet();
/**
*
* Lists the uris that were visited but failed.
*/
public final HashSet failedURIs = new HashSet();
-
+
private final HashSet queuedURISet = new HashSet();
/**
*
@@ -101,9 +100,9 @@
*/
public final LinkedList queuedURIList = new LinkedList();
private final HashMap runningFetchesByURI = new HashMap();
-
+
private final HashMap idsByWord = new HashMap();
-
+
private final HashMap titlesOfIds = new HashMap();
private final HashMap uriIds = new HashMap();
private final HashMap idUris = new HashMap();
@@ -120,14 +119,14 @@
private Vector indices;
private int match;
private Integer id;
-
+
private boolean indexing ;
-
+
private static final int minTimeBetweenEachIndexRewriting = 10;
-/**
- * directory where the generated indices are stored.
- * Needs to be created before it can be used
- */
+ /**
+ * directory where the generated indices are stored.
+ * Needs to be created before it can be used
+ */
public static final String DEFAULT_INDEX_DIR = "myindex/";
/**
* Lists the allowed mime types of the fetched page.
@@ -140,7 +139,7 @@
* maximum value = 1; minimum value = 0.
*/
public static final double MAX_TIME_SPENT_INDEXING = 0.5;
-
+
private static final String indexTitle= "XMLSpider index";
private static final String indexOwner = "Freenet";
private static final String indexOwnerEmail = null;
@@ -153,17 +152,17 @@
// Can have many; this limit only exists to save memory.
private static final int maxParallelRequests = 100;
private int maxShownURIs = 15;
-
+
private NodeClientCore core;
private FetchContext ctx;
private final short PRIORITY_CLASS =
RequestStarter.BULK_SPLITFILE_PRIORITY_CLASS;
private boolean stopped = true;
PluginRespirator pr;
-
-/**
- * Adds the found uri to the list of to-be-retrieved uris. <p>Every usk uri
added as ssk.
- * @param uri the new uri that needs to be fetched for further indexing
- */
+
+ /**
+ * Adds the found uri to the list of to-be-retrieved uris. <p>Every usk
uri added as ssk.
+ * @param uri the new uri that needs to be fetched for further indexing
+ */
public synchronized void queueURI(FreenetURI uri) {
if((uri.getKeyType()).equals("USK")){
if(uri.getSuggestedEdition() < 0)
@@ -226,18 +225,18 @@
}
}
}
-
+
private ClientGetter makeGetter(FreenetURI uri) {
ClientGetter g = new ClientGetter(this,
core.requestStarters.chkFetchScheduler, core.requestStarters.sskFetchScheduler,
uri, ctx, PRIORITY_CLASS, this, null, null);
return g;
}
-/**
- * Processes the successfully fetched uri for further outlinks.
- *
- * @param result
- * @param state
- */
+ /**
+ * Processes the successfully fetched uri for further outlinks.
+ *
+ * @param result
+ * @param state
+ */
public void onSuccess(FetchResult result, ClientGetter state) {
FreenetURI uri = state.getURI();
@@ -255,9 +254,11 @@
page.id = (Integer) uriIds.get(uri);
inlinks.put(page.id, new Vector());
outlinks.put(page.id, new Vector());
-
- //instead of passing the current object, the pagecallback
object for every page is passed to the content filter
- // this is to allow inlinks and outlinks be indexed by specific
pages
+ /*
+ * instead of passing the current object, the pagecallback
object for every page is passed to the content filter
+ * this is to allow inlinks and outlinks be indexed by specific
pages
+ */
+
try {
ContentFilter.filter(data, ctx.bucketFactory, mimeType,
uri.toURI("http://127.0.0.1:8888/"), page);
} catch (UnsafeContentTypeException e) {
@@ -270,7 +271,7 @@
data.free();
}
}
-
+
public void onFailure(FetchException e, ClientGetter state) {
FreenetURI uri = state.getURI();
@@ -297,14 +298,14 @@
}
/**
- * generates the main index file that can be used by librarian for searching
in the list of
- * subindices
- *
- * @param void
- * @author swati
- * @throws IOException
- * @throws NoSuchAlgorithmException
- */
+ * generates the main index file that can be used by librarian for
searching in the list of
+ * subindices
+ *
+ * @param void
+ * @author swati
+ * @throws IOException
+ * @throws NoSuchAlgorithmException
+ */
private synchronized void produceIndex2() throws
IOException,NoSuchAlgorithmException {
// Produce the main index file.
@@ -428,7 +429,7 @@
System.out.println("No URIs with words");
return;
}
-
+
indices = new Vector();
int prefix = 1;
match = 1;
@@ -460,7 +461,7 @@
for(int i = begin;i<end+1;i++) tmp.add(list.elementAt(i));
return tmp;
}
-
+
private synchronized void generateSubIndex(int p,Vector list) throws
Exception{
/*
* if the list is less than max allowed entries in a file then
directly generate the xml
@@ -469,7 +470,7 @@
*/
if(list.size() < MAX_ENTRIES)
- {
+ {
generateXML(list,p);
}
else
@@ -497,12 +498,12 @@
}
}
-/**
- * generates the xml index with the given list of words with prefix number of
matching bits in md5
- * @param list list of the words to be added in the index
- * @param prefix number of matching bits of md5
- * @throws Exception
- */
+ /**
+ * generates the xml index with the given list of words with prefix
number of matching bits in md5
+ * @param list list of the words to be added in the index
+ * @param prefix number of matching bits of md5
+ * @throws Exception
+ */
public synchronized void generateXML (Vector list, int prefix) throws
Exception
{
String p = ((String) list.elementAt(0)).substring(0, prefix);
@@ -560,23 +561,21 @@
Logger.error(this, "Eh?");
continue;
}
+ /*
+ * adding file information
+ * uriElement - lists the id of the file
containing a particular word
+ * fileElement - lists the id,key,title of the
files mentioned in the entire subindex
+ */
Element uriElement =
xmlDoc.createElement("file");
Element fileElement =
xmlDoc.createElement("file");
uriElement.setAttribute("id", x.toString());
fileElement.setAttribute("id", x.toString());
fileElement.setAttribute("key",(idUris.get(id)).toString());
if(titlesOfIds.containsKey(id))
-
fileElement.setAttribute("title",(titlesOfIds.get(id)).toString());
+
fileElement.setAttribute("title",(titlesOfIds.get(id)).toString());
else
fileElement.setAttribute("title",(idUris.get(id)).toString());
- // try{
-// FileWriter outp = new FileWriter("titles",true);
-// outp.write("title =
"+(titlesOfIds.get(id)).toString());
-// outp.close();
-// }
-// catch(Exception e){
-//
-// }
+
/* Position by position */
HashMap positionsForGivenWord =
(HashMap)positionsByWordById.get(x);
@@ -628,20 +627,20 @@
Logger.minor(this, "Spider: indexes regenerated.");
}
-
+
public void handleGet(HTTPRequest request, ToadletContext context)
throws IOException, ToadletContextClosedException {
/*
* ignore
*/
}
-
+
public void handlePost(HTTPRequest request, ToadletContext context)
throws IOException {
/*
* ignore
*/
}
-
+
/**
* @see freenet.oldplugins.plugin.Plugin#getPluginName()
*/
@@ -653,7 +652,7 @@
* @see
freenet.oldplugins.plugin.Plugin#setPluginManager(freenet.oldplugins.plugin.PluginManager)
*/
public void setPluginManager(PluginManager pluginManager) {
-
+
this.core = pluginManager.getClientCore();
this.ctx = core.makeClient((short) 0).getFetchContext();
ctx.maxSplitfileBlockRetries = 10;
@@ -664,9 +663,9 @@
allowedMIMETypes.add(new String("text/html"));
allowedMIMETypes.add(new String("text/plain"));
allowedMIMETypes.add(new String("application/xhtml+xml"));
- // allowedMIMETypes.add(new String("application/zip"));
+ // allowedMIMETypes.add(new String("application/zip"));
ctx.allowedMIMETypes = new HashSet(allowedMIMETypes);
- // ctx.allowedMIMETypes.add("text/html");
+ // ctx.allowedMIMETypes.add("text/html");
tProducedIndex = System.currentTimeMillis();
indexing = true;
}
@@ -697,21 +696,21 @@
// Ignore
}
private static String convertToHex(byte[] data) {
- StringBuffer buf = new StringBuffer();
- for (int i = 0; i < data.length; i++) {
- int halfbyte = (data[i] >>> 4) & 0x0F;
- int two_halfs = 0;
- do {
- if ((0 <= halfbyte) && (halfbyte <= 9))
- buf.append((char) ('0' + halfbyte));
- else
- buf.append((char) ('a' + (halfbyte - 10)));
- halfbyte = data[i] & 0x0F;
- } while(two_halfs++ < 1);
- }
- return buf.toString();
- }
-
+ StringBuffer buf = new StringBuffer();
+ for (int i = 0; i < data.length; i++) {
+ int halfbyte = (data[i] >>> 4) & 0x0F;
+ int two_halfs = 0;
+ do {
+ if ((0 <= halfbyte) && (halfbyte <= 9))
+ buf.append((char) ('0' + halfbyte));
+ else
+ buf.append((char) ('a' + (halfbyte -
10)));
+ halfbyte = data[i] & 0x0F;
+ } while(two_halfs++ < 1);
+ }
+ return buf.toString();
+ }
+
/*
* calculate the md5 for a given string
*/
@@ -723,9 +722,9 @@
md5hash = md.digest();
return convertToHex(md5hash);
}
-
+
public void generateSubIndex(String filename){
-//generates the new subIndex
+// generates the new subIndex
File outputFile = new File(filename);
StreamResult resultStream;
resultStream = new StreamResult(outputFile);
@@ -762,28 +761,28 @@
/* -> title */
Element subHeaderElement = xmlDoc.createElement("title");
Text subHeaderText = xmlDoc.createTextNode(indexTitle);
-
+
subHeaderElement.appendChild(subHeaderText);
headerElement.appendChild(subHeaderElement);
/* -> owner */
subHeaderElement = xmlDoc.createElement("owner");
subHeaderText = xmlDoc.createTextNode(indexOwner);
-
+
subHeaderElement.appendChild(subHeaderText);
headerElement.appendChild(subHeaderElement);
-
-
+
+
/* -> owner email */
if(indexOwnerEmail != null) {
subHeaderElement = xmlDoc.createElement("email");
subHeaderText = xmlDoc.createTextNode(indexOwnerEmail);
-
+
subHeaderElement.appendChild(subHeaderText);
headerElement.appendChild(subHeaderElement);
}
-
+
Element filesElement = xmlDoc.createElement("files"); /*
filesElement != fileElement */
Element EntriesElement = xmlDoc.createElement("entries");
@@ -792,7 +791,7 @@
//all index files are ready
/* Adding word index */
Element keywordsElement = xmlDoc.createElement("keywords");
-
+
rootElement.appendChild(EntriesElement);
rootElement.appendChild(headerElement);
rootElement.appendChild(filesElement);
@@ -813,7 +812,7 @@
serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
serializer.setOutputProperty(OutputKeys.INDENT,"yes");
-
+
/* final step */
try {
serializer.transform(domSource, resultStream);
@@ -825,331 +824,338 @@
if(Logger.shouldLog(Logger.MINOR, this))
Logger.minor(this, "Spider: indexes regenerated.");
}
-
-public void terminate(){
- synchronized (this) {
- stopped = true;
- queuedURIList.clear();
+
+ public void terminate(){
+ synchronized (this) {
+ stopped = true;
+ queuedURIList.clear();
+ }
}
-}
-
-public void runPlugin(PluginRespirator pr){
- this.pr = pr;
- this.id = new Integer(0);
- this.core = pr.getNode().clientCore;
- this.ctx = core.makeClient((short) 0).getFetchContext();
- ctx.maxSplitfileBlockRetries = 10;
- ctx.maxNonSplitfileRetries = 10;
- ctx.maxTempLength = 2 * 1024 * 1024;
- ctx.maxOutputLength = 2 * 1024 * 1024;
- allowedMIMETypes = new HashSet();
- allowedMIMETypes.add(new String("text/html"));
- allowedMIMETypes.add(new String("text/plain"));
- allowedMIMETypes.add(new String("application/xhtml+xml"));
- ctx.allowedMIMETypes = new HashSet(allowedMIMETypes);
+ public void runPlugin(PluginRespirator pr){
+ this.pr = pr;
+ this.id = new Integer(0);
+ this.core = pr.getNode().clientCore;
+ this.ctx = core.makeClient((short) 0).getFetchContext();
+ ctx.maxSplitfileBlockRetries = 10;
+ ctx.maxNonSplitfileRetries = 10;
+ ctx.maxTempLength = 2 * 1024 * 1024;
+ ctx.maxOutputLength = 2 * 1024 * 1024;
+ allowedMIMETypes = new HashSet();
+ allowedMIMETypes.add(new String("text/html"));
+ allowedMIMETypes.add(new String("text/plain"));
+ allowedMIMETypes.add(new String("application/xhtml+xml"));
- tProducedIndex = System.currentTimeMillis();
- indexing = true;
- stopped = false;
- count = 0;
-
- //startPlugin();
- Thread starterThread = new Thread("Spider Plugin Starter") {
- public void run() {
- try{
- Thread.sleep(30 * 1000); // Let the node start
up
- } catch (InterruptedException e){}
- startSomeRequests();
- }
- };
- starterThread.setDaemon(true);
- starterThread.start();
-}
+ ctx.allowedMIMETypes = new HashSet(allowedMIMETypes);
-/**
- * Interface to the Spider data
- */
-public String handleHTTPGet(HTTPRequest request) throws PluginHTTPException{
- StringBuffer out = new StringBuffer();
-
- String listname = request.getParam("list");
- if(listname.length() != 0)
- {
- appendDefaultHeader(out,null);
- out.append("<p><h4>"+listname+" URIs</h4></p>");
- appendList(listname,out,null);
- return out.toString();
+ tProducedIndex = System.currentTimeMillis();
+ indexing = true;
+ stopped = false;
+ count = 0;
+
+ //startPlugin();
+ Thread starterThread = new Thread("Spider Plugin Starter") {
+ public void run() {
+ try{
+ Thread.sleep(30 * 1000); // Let the
node start up
+ } catch (InterruptedException e){}
+ startSomeRequests();
+ }
+ };
+ starterThread.setDaemon(true);
+ starterThread.start();
}
- appendDefaultPageStart(out,null);
- String uriParam = request.getParam("adduri");
- if(uriParam != null && uriParam.length() != 0)
+
+ /**
+ * Interface to the Spider data
+ */
+ public String handleHTTPGet(HTTPRequest request) throws
PluginHTTPException{
+ StringBuffer out = new StringBuffer();
+
+ String listname = request.getParam("list");
+ if(listname.length() != 0)
{
- try {
- FreenetURI uri = new FreenetURI(uriParam);
- synchronized (this) {
- failedURIs.remove(uri);
- visitedURIs.remove(uri);
+ appendDefaultHeader(out,null);
+ out.append("<p><h4>"+listname+" URIs</h4></p>");
+ appendList(listname,out,null);
+ return out.toString();
+ }
+ appendDefaultPageStart(out,null);
+ String uriParam = request.getParam("adduri");
+ if(uriParam != null && uriParam.length() != 0)
+ {
+ try {
+ FreenetURI uri = new FreenetURI(uriParam);
+ synchronized (this) {
+ failedURIs.remove(uri);
+ visitedURIs.remove(uri);
+ }
+ out.append("<p>URI added :"+uriParam+"</p>");
+ queueURI(uri);
+ startSomeRequests();
+ } catch (MalformedURLException mue1) {
+ out.append("<p>MalFormed URI: "+uriParam+"</p");
}
- out.append("<p>URI added :"+uriParam+"</p>");
- queueURI(uri);
- startSomeRequests();
- } catch (MalformedURLException mue1) {
- out.append("<p>MalFormed URI: "+uriParam+"</p");
}
- }
- return out.toString();
-}
+ return out.toString();
+ }
+/*
+ * List the visited, queued, failed and running fetches on the web interface
+ */
+ private void appendList(String listname, StringBuffer out, String
stylesheet)
+ {
+ Iterator it = (runningFetchesByURI.keySet()).iterator();
+ if(listname.equals("running"))
+ it = (runningFetchesByURI.keySet()).iterator();
+ if(listname.equals("visited"))
+ it = (new HashSet(visitedURIs)).iterator();
+ if(listname.equals("queued"))
+ it = (new ArrayList(queuedURIList)).iterator();
+ if(listname.equals("failed"))
+ it = (new HashSet(failedURIs)).iterator();
+ while(it.hasNext())
+
out.append("<code>"+it.next().toString()+"</code><br/>");
+ }
-private void appendList(String listname, StringBuffer out, String stylesheet)
-{
- Iterator it = (runningFetchesByURI.keySet()).iterator();
- if(listname.equals("running"))
- it = (runningFetchesByURI.keySet()).iterator();
- if(listname.equals("visited"))
- it = (new HashSet(visitedURIs)).iterator();
- if(listname.equals("queued"))
- it = (new ArrayList(queuedURIList)).iterator();
- if(listname.equals("failed"))
- it = (new HashSet(failedURIs)).iterator();
- while(it.hasNext())
- out.append("<code>"+it.next().toString()+"</code><br/>");
-}
+ private void appendDefaultPageStart(StringBuffer out, String
stylesheet) {
-private void appendDefaultPageStart(StringBuffer out, String stylesheet) {
-
- out.append("<HTML><HEAD><TITLE>" + pluginName + "</TITLE>");
- if(stylesheet != null)
- out.append("<link href=\""+stylesheet+"\" type=\"text/css\"
rel=\"stylesheet\" />");
- out.append("</HEAD><BODY>\n");
- out.append("<CENTER><H1>" + pluginName + "</H1><BR/><BR/><BR/>\n");
- out.append("Add uri:");
- out.append("<form method=\"GET\"><input type=\"text\" name=\"adduri\"
/><br/><br/>");
- out.append("<input type=\"submit\" value=\"Add uri\" /></form>");
- Set runningFetches = runningFetchesByURI.keySet();
- out.append("<p><h3>Running Fetches</h3></p>");
- Set visited = new HashSet(visitedURIs);
- List queued = new ArrayList(queuedURIList);
-
- Set failed = new HashSet(failedURIs);
- Iterator it=queued.iterator();
- out.append("<br/>Size :"+runningFetches.size()+"<br/>");
- appendList(runningFetches,out,stylesheet);
- out.append("<p><a href=\"?list="+"running"+"\">Show all</a><br/></p>");
- out.append("<p><h3>Queued URIs</h3></p>");
- out.append("<br/>Size :"+queued.size()+"<br/>");
- int i = 0;
- while(it.hasNext()){
- if(i<=maxShownURIs){
- out.append("<code>"+it.next().toString()+"</code><br/>");
+ out.append("<HTML><HEAD><TITLE>" + pluginName + "</TITLE>");
+ if(stylesheet != null)
+ out.append("<link href=\""+stylesheet+"\"
type=\"text/css\" rel=\"stylesheet\" />");
+ out.append("</HEAD><BODY>\n");
+ out.append("<CENTER><H1>" + pluginName +
"</H1><BR/><BR/><BR/>\n");
+ out.append("Add uri:");
+ out.append("<form method=\"GET\"><input type=\"text\"
name=\"adduri\" /><br/><br/>");
+ out.append("<input type=\"submit\" value=\"Add uri\"
/></form>");
+ Set runningFetches = runningFetchesByURI.keySet();
+ out.append("<p><h3>Running Fetches</h3></p>");
+ Set visited = new HashSet(visitedURIs);
+ List queued = new ArrayList(queuedURIList);
+
+ Set failed = new HashSet(failedURIs);
+ Iterator it=queued.iterator();
+ out.append("<br/>Size :"+runningFetches.size()+"<br/>");
+ appendList(runningFetches,out,stylesheet);
+ out.append("<p><a href=\"?list="+"running"+"\">Show
all</a><br/></p>");
+ out.append("<p><h3>Queued URIs</h3></p>");
+ out.append("<br/>Size :"+queued.size()+"<br/>");
+ int i = 0;
+ while(it.hasNext()){
+ if(i<=maxShownURIs){
+
out.append("<code>"+it.next().toString()+"</code><br/>");
+ }
+ else break;
+ i++;
}
- else break;
- i++;
+ out.append("<p><a href=\"?list="+"queued"+"\">Show
all</a><br/></p>");
+ out.append("<p><h3>Visited URIs</h3></p>");
+ out.append("<br/>Size :"+visited.size()+"<br/>");
+ appendList(visited,out,stylesheet);
+ out.append("<p><a href=\"?list="+"visited"+"\">Show
all</a><br/></p>");
+ out.append("<p><h3>Failed URIs</h3></p>");
+ out.append("<br/>Size :"+failed.size()+"<br/>");
+ appendList(failed,out,stylesheet);
+ out.append("<p><a href=\"?list="+"failed"+"\">Show
all</a><br/></p>");
}
- out.append("<p><a href=\"?list="+"queued"+"\">Show all</a><br/></p>");
- out.append("<p><h3>Visited URIs</h3></p>");
- out.append("<br/>Size :"+visited.size()+"<br/>");
- appendList(visited,out,stylesheet);
- out.append("<p><a href=\"?list="+"visited"+"\">Show all</a><br/></p>");
- out.append("<p><h3>Failed URIs</h3></p>");
- out.append("<br/>Size :"+failed.size()+"<br/>");
- appendList(failed,out,stylesheet);
- out.append("<p><a href=\"?list="+"failed"+"\">Show all</a><br/></p>");
-
-
-}
-private void appendDefaultHeader(StringBuffer out, String stylesheet){
- out.append("<HTML><HEAD><TITLE>" + pluginName + "</TITLE>");
- if(stylesheet != null)
- out.append("<link href=\""+stylesheet+"\" type=\"text/css\"
rel=\"stylesheet\" />");
- out.append("</HEAD><BODY>\n");
- out.append("<CENTER><H1>" + pluginName + "</H1><BR/><BR/><BR/>\n");
- out.append("Add uri:");
- out.append("<form method=\"GET\"><input type=\"text\" name=\"adduri\"
/><br/><br/>");
- out.append("<input type=\"submit\" value=\"Add uri\" /></form>");
-}
+ private void appendDefaultHeader(StringBuffer out, String stylesheet){
+ out.append("<HTML><HEAD><TITLE>" + pluginName + "</TITLE>");
+ if(stylesheet != null)
+ out.append("<link href=\""+stylesheet+"\"
type=\"text/css\" rel=\"stylesheet\" />");
+ out.append("</HEAD><BODY>\n");
+ out.append("<CENTER><H1>" + pluginName +
"</H1><BR/><BR/><BR/>\n");
+ out.append("Add uri:");
+ out.append("<form method=\"GET\"><input type=\"text\"
name=\"adduri\" /><br/><br/>");
+ out.append("<input type=\"submit\" value=\"Add uri\"
/></form>");
+ }
-private void appendList(Set list,StringBuffer out, String stylesheet){
- Iterator it = list.iterator();
- int i = 0;
- while(it.hasNext()){
- if(i<=maxShownURIs){
- out.append("<code>"+it.next().toString()+"</code><br/>");
+ private void appendList(Set list,StringBuffer out, String stylesheet){
+ Iterator it = list.iterator();
+ int i = 0;
+ while(it.hasNext()){
+ if(i<=maxShownURIs){
+
out.append("<code>"+it.next().toString()+"</code><br/>");
+ }
+ else{
+ break;
+ }
+ i++;
}
- else{
- break;
- }
- i++;
- }
}
-/**
- * creates the callback object for each page.
- *<p>Used to create inlinks and outlinks for each page separately.
- * @author swati
- *
- */
-public class PageCallBack implements FoundURICallback{
- Integer id;
- /*
- * id of the page as refrenced in uriIds
- */
- PageCallBack(){
- id = new Integer(0);
- }
-
- public void foundURI(FreenetURI uri){
+ /**
+ * creates the callback object for each page.
+ *<p>Used to create inlinks and outlinks for each page separately.
+ * @author swati
+ *
+ */
+ public class PageCallBack implements FoundURICallback{
+ Integer id;
+ /*
+ * id of the page as refrenced in uriIds
+ */
+ PageCallBack(){
+ id = new Integer(0);
+ }
- queueURI(uri);
- Integer iduri = (Integer) uriIds.get(uri);
+ public void foundURI(FreenetURI uri){
- if(outlinks.containsKey(id)){
- Vector outlink = (Vector) outlinks.get(id);
- if(!outlink.contains(iduri))
+ queueURI(uri);
+ Integer iduri = (Integer) uriIds.get(uri);
+/*
+ * update the outlink information for the current page
+ */
+ if(outlinks.containsKey(id)){
+ Vector outlink = (Vector) outlinks.get(id);
+ if(!outlink.contains(iduri))
+ outlink.add(iduri);
+ outlinks.remove(id);
+ outlinks.put(id, outlink);
+ }
+ else
+ {
+ Vector outlink = new Vector();
outlink.add(iduri);
- outlinks.remove(id);
- outlinks.put(id, outlink);
- }
- else
- {
- Vector outlink = new Vector();
- outlink.add(iduri);
- outlinks.put(id, outlink);
- }
+ outlinks.put(id, outlink);
+ }
+/*
+ * update the inlink information for the new link
+ */
+ if(inlinks.containsKey(iduri)){
+ Vector inlink = (Vector) inlinks.get(iduri);
+ if(!inlink.contains(id)) inlink.add(id);
+ inlinks.remove(iduri);
+ inlinks.put(iduri, inlink);
+ }
+ else
+ {
+ Vector inlink = new Vector();
+ inlink.add(id);
+ inlinks.put(iduri, inlink);
+ }
- if(inlinks.containsKey(iduri)){
- Vector inlink = (Vector) inlinks.get(iduri);
- if(!inlink.contains(id)) inlink.add(id);
- inlinks.remove(iduri);
- inlinks.put(iduri, inlink);
+ startSomeRequests();
}
- else
- {
- Vector inlink = new Vector();
- inlink.add(id);
- inlinks.put(iduri, inlink);
- }
- startSomeRequests();
- }
-
-
- public void onText(String s, String type, URI baseURI){
- if((type != null) && (type.length() != 0) &&
type.toLowerCase().equals("title")
- && (s != null) && (s.length() != 0) &&
(s.indexOf('\n') < 0)) {
- /* We should have a correct title */
-
- titlesOfIds.put(id, s);
- type = "title";
- }
- else type = null;
+ public void onText(String s, String type, URI baseURI){
- String[] words = s.split("[^A-Za-z0-9]");
- Integer lastPosition = null;
- lastPosition = (Integer)lastPositionById.get(id);
+ if((type != null) && (type.length() != 0) &&
type.toLowerCase().equals("title")
+ && (s != null) && (s.length() != 0) &&
(s.indexOf('\n') < 0)) {
+ /*
+ * title of the page
+ */
+ titlesOfIds.put(id, s);
+ type = "title";
+ }
+ else type = null;
+ /*
+ * determine the position of the word in the retrieved
page
+ */
+ String[] words = s.split("[^A-Za-z0-9]");
+ Integer lastPosition = null;
+ lastPosition = (Integer)lastPositionById.get(id);
- if(lastPosition == null)
- lastPosition = new Integer(1); /* We start to count
from 1 */
- for (int i = 0; i < words.length; i++) {
- String word = words[i];
- if ((word == null) || (word.length() == 0))
- continue;
- word = word.toLowerCase();
- try{
- if(type == null)
- addWord(word, lastPosition.intValue() +
i, id);
- else
- addWord(word, -1 * (i+1), id);
+ if(lastPosition == null)
+ lastPosition = new Integer(1);
+ for (int i = 0; i < words.length; i++) {
+ String word = words[i];
+ if ((word == null) || (word.length() == 0))
+ continue;
+ word = word.toLowerCase();
+ try{
+ if(type == null)
+ addWord(word,
lastPosition.intValue() + i, id);
+ else
+ addWord(word, -1 * (i+1), id);
+ }
+ catch (Exception e){}
}
- catch (Exception e){}
- }
- if(type == null) {
- lastPosition = new Integer(lastPosition.intValue() +
words.length);
- lastPositionById.put(id, lastPosition);
+ if(type == null) {
+ lastPosition = new
Integer(lastPosition.intValue() + words.length);
+ lastPositionById.put(id, lastPosition);
+ }
+
}
- }
-
- private synchronized void addWord(String word, int position,Integer id)
throws Exception{
- if(word.length() < 3)
- return;
+ private synchronized void addWord(String word, int
position,Integer id) throws Exception{
+ if(word.length() < 3)
+ return;
- Integer[] ids = (Integer[]) idsByWord.get(word);
- idsWithWords.add(id);
+ Integer[] ids = (Integer[]) idsByWord.get(word);
+ idsWithWords.add(id);
- /* Word position indexation */
- HashMap wordPositionsForOneUri =
(HashMap)positionsByWordById.get(id); /* For a given URI, take as key a word,
and gives position */
- if(wordPositionsForOneUri == null) {
- wordPositionsForOneUri = new HashMap();
- wordPositionsForOneUri.put(word, new Integer[] { new
Integer(position) });
- positionsByWordById.put(id, wordPositionsForOneUri);
- }
- else {
- Integer[] positions =
(Integer[])wordPositionsForOneUri.get(word);
- if(positions == null) {
- positions = new Integer[] { new
Integer(position) };
- wordPositionsForOneUri.put(word, positions);
+ /* Word position indexation */
+ HashMap wordPositionsForOneUri =
(HashMap)positionsByWordById.get(id); /* For a given URI, take as key a word,
and gives position */
+ if(wordPositionsForOneUri == null) {
+ wordPositionsForOneUri = new HashMap();
+ wordPositionsForOneUri.put(word, new Integer[]
{ new Integer(position) });
+ positionsByWordById.put(id,
wordPositionsForOneUri);
}
else {
- Integer[] newPositions = new
Integer[positions.length + 1];
- System.arraycopy(positions, 0, newPositions, 0,
positions.length);
- newPositions[positions.length] = new
Integer(position);
- wordPositionsForOneUri.put(word, newPositions);
+ Integer[] positions =
(Integer[])wordPositionsForOneUri.get(word);
+ if(positions == null) {
+ positions = new Integer[] { new
Integer(position) };
+ wordPositionsForOneUri.put(word,
positions);
+ }
+ else {
+ Integer[] newPositions = new
Integer[positions.length + 1];
+ System.arraycopy(positions, 0,
newPositions, 0, positions.length);
+ newPositions[positions.length] = new
Integer(position);
+ wordPositionsForOneUri.put(word,
newPositions);
+ }
}
- }
- if (ids == null) {
- idsByWord.put(word, new Integer[] { id });
- } else {
- for (int i = 0; i < ids.length; i++) {
- if (ids[i].equals(id))
- return;
+ if (ids == null) {
+ idsByWord.put(word, new Integer[] { id });
+ } else {
+ for (int i = 0; i < ids.length; i++) {
+ if (ids[i].equals(id))
+ return;
+ }
+ Integer[] newIDs = new Integer[ids.length + 1];
+ System.arraycopy(ids, 0, newIDs, 0, ids.length);
+ newIDs[ids.length] = id;
+ idsByWord.put(word, newIDs);
}
- Integer[] newIDs = new Integer[ids.length + 1];
- System.arraycopy(ids, 0, newIDs, 0, ids.length);
- newIDs[ids.length] = id;
- idsByWord.put(word, newIDs);
- }
- tMap.put(MD5(word), word);
- long time_indexing = System.currentTimeMillis();
- if (tProducedIndex + minTimeBetweenEachIndexRewriting * 10 <
System.currentTimeMillis()) {
- try {
- if(indexing){
- generateIndex2();
- produceIndex2();
- /*
- * ensures that index production
doesn't eat up the processor time
- */
- if((System.currentTimeMillis() -
time_indexing)/(System.currentTimeMillis() - tProducedIndex) >
MAX_TIME_SPENT_INDEXING) indexing= false;
- else indexing = true;
+ tMap.put(MD5(word), word);
+ long time_indexing = System.currentTimeMillis();
+ if (tProducedIndex + minTimeBetweenEachIndexRewriting *
10 < System.currentTimeMillis()) {
+ try {
+ if(indexing){
+ generateIndex2();
+ produceIndex2();
+ /*
+ * ensures that index
production doesn't eat up the processor time
+ */
+ if((System.currentTimeMillis()
- time_indexing)/(System.currentTimeMillis() - tProducedIndex) >
MAX_TIME_SPENT_INDEXING) indexing= false;
+ else indexing = true;
+ }
+ } catch (IOException e) {
+ Logger.error(this, "Caught " + e + "
while creating index", e);
}
- } catch (IOException e) {
- Logger.error(this, "Caught " + e + " while
creating index", e);
+ tProducedIndex = System.currentTimeMillis();
}
- tProducedIndex = System.currentTimeMillis();
}
}
-}
-public String handleHTTPPut(HTTPRequest request) throws PluginHTTPException{
- return null;
-}
-public String handleHTTPPost(HTTPRequest request) throws PluginHTTPException{
- return null;
-}
+ public String handleHTTPPut(HTTPRequest request) throws
PluginHTTPException{
+ return null;
+ }
+ public String handleHTTPPost(HTTPRequest request) throws
PluginHTTPException{
+ return null;
+ }
-public void onFoundEdition(long l, USK key){
- FreenetURI uri = key.getURI();
- if(runningFetchesByURI.containsKey(uri))
runningFetchesByURI.remove(uri);
- uri = key.getURI().setSuggestedEdition(l);
- queueURI(uri);
+ public void onFoundEdition(long l, USK key){
+ FreenetURI uri = key.getURI();
+ if(runningFetchesByURI.containsKey(uri))
runningFetchesByURI.remove(uri);
+ uri = key.getURI().setSuggestedEdition(l);
+ queueURI(uri);
+ }
+
}
-
-}