Author: swatig0
Date: 2007-06-13 16:37:02 +0000 (Wed, 13 Jun 2007)
New Revision: 13562
Added:
trunk/freenet/src/freenet/clients/http/XMLSpider.java
Log:
msg
Added: trunk/freenet/src/freenet/clients/http/XMLSpider.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/XMLSpider.java
(rev 0)
+++ trunk/freenet/src/freenet/clients/http/XMLSpider.java 2007-06-13
16:37:02 UTC (rev 13562)
@@ -0,0 +1,1084 @@
+/* This code is part of Freenet. It is distributed under the GNU General
+ * Public License, version 2 (or at your option any later version). See
+ * http://www.gnu.org/ for further details of the GPL. */
+package freenet.clients.http;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.w3c.dom.Attr;
+import org.w3c.dom.DOMImplementation;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.Text;
+
+import freenet.client.ClientMetadata;
+import freenet.client.FetchContext;
+import freenet.client.FetchException;
+import freenet.client.FetchResult;
+import freenet.client.InsertException;
+import freenet.client.async.BaseClientPutter;
+import freenet.client.async.ClientCallback;
+import freenet.client.async.ClientGetter;
+import freenet.clients.http.filter.ContentFilter;
+import freenet.clients.http.filter.FoundURICallback;
+import freenet.clients.http.filter.UnsafeContentTypeException;
+import freenet.keys.FreenetURI;
+import freenet.node.NodeClientCore;
+import freenet.node.RequestStarter;
+import freenet.oldplugins.plugin.HttpPlugin;
+import freenet.oldplugins.plugin.PluginManager;
+import freenet.support.HTMLNode;
+import freenet.support.Logger;
+import freenet.support.MultiValueTable;
+import freenet.support.api.Bucket;
+import freenet.support.api.HTTPRequest;
+/**
+ * Spider. Produces an index.
+ */
+public class XMLSpider implements HttpPlugin, ClientCallback, FoundURICallback
{
+
+ /** Time of the last index rewrite, as read from System.currentTimeMillis(). */
+ long tProducedIndex;
+
+ /* ---- Crawl state (elements are FreenetURI unless noted) ---- */
+
+ /** URIs visited, or fetching, or queued. Added once then forgotten about. */
+ private final HashSet visitedURIs = new HashSet();
+ /** URIs for which at least one word has been recorded. */
+ private final HashSet urisWithWords = new HashSet();
+ /** URIs whose fetch reported a failure. */
+ private final HashSet failedURIs = new HashSet();
+ /** Set view of the pending queue, used to reject duplicates. */
+ private final HashSet queuedURISet = new HashSet();
+ /** Pending URIs; appended at the tail, consumed from the head. */
+ private final LinkedList queuedURIList = new LinkedList();
+ /** FreenetURI -> ClientGetter for fetches currently running. */
+ private final HashMap runningFetchesByURI = new HashMap();
+ /** String (word) -> FreenetURI[] (pages containing the word). */
+ private final HashMap urisByWord = new HashMap();
+ /** String (URI) -> String (page title). */
+ private final HashMap titlesOfURIs = new HashMap();
+ /** Writers for the append-mode debug log files kept in DEFAULT_INDEX_DIR. */
+ private FileWriter output;
+ private FileWriter output2;
+
+ /* ---- Index settings ---- */
+
+ /** Multiplied by 10 and compared against a millisecond clock before each rewrite. */
+ private static final int minTimeBetweenEachIndexRewriting = 1;
+ /**
+  * Directory receiving index.xml, the index_*.xml sub-indexes and the debug logs.
+  * NOTE(review): hard-coded to one developer's home directory - confirm before deploying.
+  */
+ private static final String DEFAULT_INDEX_DIR = "/home/swati/myindex/";
+ /** A sub-index that already holds this many entries refuses further words. */
+ private static final int MAX_ENTRIES = 5;
+ private static final String pluginName = "XML spider";
+
+ private static final String indexTitle= "This is an index";
+ private static final String indexOwner = "Another anonymous";
+ private static final String indexOwnerEmail = null;
+
+ /* ---- Per-page metadata, keyed by URI string ---- */
+
+ /** String (URI) -> Long (size of the fetched data). */
+ private final HashMap sizeOfURIs = new HashMap();
+ /** String (URI) -> String (MIME type). */
+ private final HashMap mimeOfURIs = new HashMap();
+ /** String (URI) -> Integer; used to determine word position on each uri. */
+ private final HashMap lastPositionByURI = new HashMap();
+ /** String (URI) -> HashMap (String (word) -> Integer[] (positions)). */
+ private final HashMap positionsByWordByURI = new HashMap();
+
+ /* ---- Fetch settings ---- */
+
+ /** Can have many; this limit only exists to save memory. */
+ private static final int maxParallelRequests = 20;
+ /** Number of entries per list on the overview page. */
+ private int maxShownURIs = 50;
+ /** FreenetURI -> Integer (file id); rebuilt by produceIndex(). */
+ private HashMap urisToNumbers;
+ private NodeClientCore core;
+ private FetchContext ctx;
+ private final short PRIORITY_CLASS = RequestStarter.PREFETCH_PRIORITY_CLASS;
+ /** True until startPlugin() runs and again after stopPlugin(). */
+ private boolean stopped = true;
+
+ /**
+  * Appends a URI to the fetch queue unless it was seen before.
+  * A URI is accepted only when it is neither in visitedURIs nor already
+  * in queuedURISet; on acceptance it is also recorded as visited.
+  *
+  * @param uri the URI to enqueue
+  */
+ private synchronized void queueURI(FreenetURI uri) {
+     // No check on the content type here (HTML or not).
+     if (visitedURIs.contains(uri)) {
+         return;
+     }
+     if (!queuedURISet.add(uri)) {
+         return; // already waiting in the queue
+     }
+     queuedURIList.addLast(uri);
+     visitedURIs.add(uri);
+ }
+
+ /**
+  * Seeds the queue from the node's bookmarks and starts queued fetches
+  * until maxParallelRequests are running.
+  *
+  * NOTE(review): the 30 second sleep and the bookmark re-queue run on EVERY
+  * call, and this method is called from the fetch callbacks, foundURI() and
+  * handleGet(), not only from the starter thread. That looks unintended
+  * (the comment says "let the node start up") but is left unchanged here.
+  */
+ private void startSomeRequests() {
+     try {
+         Thread.sleep(30 * 1000); // Let the node start up
+     } catch (InterruptedException e) {
+         // Keep the interrupt visible to the owner of this thread
+         // instead of silently discarding it.
+         Thread.currentThread().interrupt();
+     }
+
+     FreenetURI[] initialURIs = core.bookmarkManager.getBookmarkURIs();
+     for (int i = 0; i < initialURIs.length; i++) {
+         queueURI(initialURIs[i]);
+     }
+
+     synchronized (this) {
+         if (stopped) {
+             return;
+         }
+         int running = runningFetchesByURI.size();
+         int queued = queuedURIList.size();
+
+         if ((running >= maxParallelRequests) || (queued == 0)) {
+             return;
+         }
+
+         ArrayList toStart = new ArrayList(Math.min(maxParallelRequests - running, queued));
+
+         // First build every getter, then start them: same order as before.
+         for (int i = running; i < maxParallelRequests; i++) {
+             if (queuedURIList.isEmpty()) {
+                 break;
+             }
+             FreenetURI uri = (FreenetURI) queuedURIList.removeFirst();
+             queuedURISet.remove(uri);
+             toStart.add(makeGetter(uri));
+         }
+
+         for (int i = 0; i < toStart.size(); i++) {
+             ClientGetter g = (ClientGetter) toStart.get(i);
+             try {
+                 runningFetchesByURI.put(g.getURI(), g);
+                 g.start();
+             } catch (FetchException e) {
+                 // Routed through the normal failure callback, which
+                 // re-enters this method while the monitor is still held.
+                 onFailure(e, g);
+             }
+         }
+     }
+ }
+
+
+ /**
+  * Builds (but does not start) a fetch for the given URI, with this spider
+  * as callback, the shared fetch context and PRIORITY_CLASS as priority.
+  *
+  * @param uri the URI to fetch
+  * @return the new, unstarted getter
+  */
+ private ClientGetter makeGetter(FreenetURI uri) {
+     return new ClientGetter(this,
+             core.requestStarters.chkFetchScheduler,
+             core.requestStarters.sskFetchScheduler,
+             uri, ctx, PRIORITY_CLASS, this, null, null);
+ }
+
+ /**
+  * Fetch-completed callback: unregisters the fetch, starts more requests,
+  * records size and MIME type of the data and runs it through the content
+  * filter with this spider as the found-URI/text callback.
+  * The data bucket is always freed once filtering was attempted.
+  *
+  * @param result the fetched data and its metadata
+  * @param state  the getter that finished
+  */
+ public void onSuccess(FetchResult result, ClientGetter state) {
+     final FreenetURI fetched = state.getURI();
+
+     synchronized (this) {
+         runningFetchesByURI.remove(fetched);
+     }
+     startSomeRequests();
+
+     final ClientMetadata meta = result.getMetadata();
+     final Bucket payload = result.asBucket();
+     final String contentType = meta.getMIMEType();
+     final String key = fetched.toString();
+
+     sizeOfURIs.put(key, new Long(payload.size()));
+     mimeOfURIs.put(key, contentType);
+
+     try {
+         ContentFilter.filter(payload, ctx.bucketFactory, contentType,
+                 fetched.toURI("http://127.0.0.1:8888/"), this);
+     } catch (UnsafeContentTypeException e) {
+         return; // Ignore
+     } catch (IOException e) {
+         Logger.error(this, "Bucket error?: " + e, e);
+     } catch (URISyntaxException e) {
+         Logger.error(this, "Internal error: " + e, e);
+     } finally {
+         payload.free();
+     }
+ }
+
+ public void onFailure(FetchException e, ClientGetter state) {
+ FreenetURI uri = state.getURI();
+
+ synchronized (this) {
+ failedURIs.add(uri);
+ runningFetchesByURI.remove(uri);
+ }
+ if (e.newURI != null)
+ queueURI(e.newURI);
+ else
+ queueURI(uri);
+ startSomeRequests();
+
+
+ }
+
+ /** Insert-succeeded callback; does nothing. */
+ public void onSuccess(BaseClientPutter state) {
+     // Intentionally empty.
+ }
+
+ /** Insert-failed callback; does nothing. */
+ public void onFailure(InsertException e, BaseClientPutter state) {
+     // Intentionally empty.
+ }
+
+ /** Insert-URI-generated callback; does nothing. */
+ public void onGeneratedURI(FreenetURI uri, BaseClientPutter state) {
+     // Intentionally empty.
+ }
+
+ /**
+  * Content-filter callback for a link found in a fetched document:
+  * queues the link, then tries to start more requests.
+  *
+  * @param uri the URI that was found
+  */
+ public void foundURI(FreenetURI uri) {
+     this.queueURI(uri);
+     this.startSomeRequests();
+ }
+
+ public void onText(String s, String type, URI baseURI) {
+
+ FreenetURI uri;
+ try {
+ uri = new FreenetURI(baseURI.getPath().substring(1));
+ } catch (MalformedURLException e) {
+ Logger.error(this, "Caught " + e, e);
+ return;
+ }
+
+
+
+ if((type != null) && (type.length() != 0) &&
type.toLowerCase().equals("title")
+ && (s != null) && (s.length() != 0) && (s.indexOf('\n') <
0)) {
+ /* We should have a correct title */
+ titlesOfURIs.put(uri.toString(), s);
+ type = "title";
+
+ }
+ else type = null;
+
+//
+// for (int i = 0; i < words.length; i++) {
+// String word = words[i];
+// if ((word == null) || (word.length() == 0))
+// continue;
+// word = word.toLowerCase();
+// addWord(word, uri);
+// }
+ String[] words = s.split("[^A-Za-z0-9]");
+
+ Integer lastPosition = null;
+
+ lastPosition = (Integer)lastPositionByURI.get(uri.toString());
+
+ if(lastPosition == null)
+ lastPosition = new Integer(1); /* We start to count
from 1 */
+
+ for (int i = 0; i < words.length; i++) {
+ String word = words[i];
+ if ((word == null) || (word.length() == 0))
+ continue;
+ word = word.toLowerCase();
+ try{
+ if(type == null)
+ addWord(word, lastPosition.intValue() + i, uri);
+ else
+ addWord(word, -1 * (i+1), uri);
+ }
+ catch (Exception e){}
+ }
+
+ if(type == null) {
+ lastPosition = new Integer(lastPosition.intValue() +
words.length);
+ lastPositionByURI.put(uri.toString(), lastPosition);
+ }
+
+ }
+
+ private synchronized void addWord(String word, int position,FreenetURI
uri) throws Exception{
+
+
+ if(word.length() < 3)
+ return;
+
+
+ FreenetURI[] uris = (FreenetURI[]) urisByWord.get(word);
+
+ //Integer[] positions = (Integer[])
positionsByWordByURI.get(word);
+
+ urisWithWords.add(uri);
+
+
+ /* Word position indexation */
+ HashMap wordPositionsForOneUri =
(HashMap)positionsByWordByURI.get(uri.toString()); /* For a given URI, take as
key a word, and gives position */
+
+ if(wordPositionsForOneUri == null) {
+ wordPositionsForOneUri = new HashMap();
+ wordPositionsForOneUri.put(word, new Integer[] { new
Integer(position) });
+ positionsByWordByURI.put(uri.toString(),
wordPositionsForOneUri);
+ } else {
+ Integer[] positions =
(Integer[])wordPositionsForOneUri.get(word);
+
+ if(positions == null) {
+ positions = new Integer[] { new
Integer(position) };
+ wordPositionsForOneUri.put(word, positions);
+ } else {
+ Integer[] newPositions = new
Integer[positions.length + 1];
+
+ System.arraycopy(positions, 0, newPositions, 0,
positions.length);
+ newPositions[positions.length] = new
Integer(position);
+
+ wordPositionsForOneUri.put(word, newPositions);
+ }
+ }
+
+ if (uris == null) {
+ urisByWord.put(word, new FreenetURI[] { uri });
+
+ } else {
+ for (int i = 0; i < uris.length; i++) {
+ if (uris[i].equals(uri))
+ return;
+ }
+ FreenetURI[] newURIs = new FreenetURI[uris.length + 1];
+ System.arraycopy(uris, 0, newURIs, 0, uris.length);
+ newURIs[uris.length] = uri;
+ urisByWord.put(word, newURIs);
+ }
+ if (tProducedIndex + minTimeBetweenEachIndexRewriting * 10 <
System.currentTimeMillis()) {
+ try {
+ produceIndex();
+ generateIndex();
+ } catch (IOException e) {
+ Logger.error(this, "Caught " + e + " while
creating index", e);
+ }
+ tProducedIndex = System.currentTimeMillis();
+ }
+
+ }
+
+ private synchronized void produceIndex() throws
IOException,NoSuchAlgorithmException {
+ // Produce an index file.
+ //FileOutputStream fos = new FileOutputStream("index2_new.xml");
+
+ //the number of bits to consider for matching
+ int prefix = 1 ;
+
+ if (urisByWord.isEmpty() || urisWithWords.isEmpty()) {
+ System.out.println("No URIs with words");
+ return;
+ }
+ File outputFile = new File(DEFAULT_INDEX_DIR+"index.xml");
+ StreamResult resultStream;
+ resultStream = new StreamResult(outputFile);
+
+ /* Initialize xml builder */
+ Document xmlDoc = null;
+ DocumentBuilderFactory xmlFactory = null;
+ DocumentBuilder xmlBuilder = null;
+ DOMImplementation impl = null;
+ Element rootElement = null;
+
+ xmlFactory = DocumentBuilderFactory.newInstance();
+
+
+ try {
+ xmlBuilder = xmlFactory.newDocumentBuilder();
+ } catch(javax.xml.parsers.ParserConfigurationException e) {
+ /* Will (should ?) never happen */
+ Logger.error(this, "Spider: Error while initializing
XML generator: "+e.toString());
+ return;
+ }
+
+
+ impl = xmlBuilder.getDOMImplementation();
+
+ /* Starting to generate index */
+
+ xmlDoc = impl.createDocument(null, "main_index", null);
+ rootElement = xmlDoc.getDocumentElement();
+
+ /* Adding header to the index */
+ Element headerElement = xmlDoc.createElement("header");
+
+ /* -> title */
+ Element subHeaderElement = xmlDoc.createElement("title");
+ Text subHeaderText = xmlDoc.createTextNode(indexTitle);
+
+ subHeaderElement.appendChild(subHeaderText);
+ headerElement.appendChild(subHeaderElement);
+
+ /* -> owner */
+ subHeaderElement = xmlDoc.createElement("owner");
+ subHeaderText = xmlDoc.createTextNode(indexOwner);
+
+ subHeaderElement.appendChild(subHeaderText);
+ headerElement.appendChild(subHeaderElement);
+
+ /* -> owner email */
+ if(indexOwnerEmail != null) {
+ subHeaderElement = xmlDoc.createElement("email");
+ subHeaderText = xmlDoc.createTextNode(indexOwnerEmail);
+
+ subHeaderElement.appendChild(subHeaderText);
+ headerElement.appendChild(subHeaderElement);
+ }
+
+
+ String[] words = (String[]) urisByWord.keySet().toArray(new
String[urisByWord.size()]);
+ Arrays.sort(words);
+ FreenetURI[] uris = (FreenetURI[]) urisWithWords.toArray(new
FreenetURI[urisWithWords.size()]);
+ urisToNumbers = new HashMap();
+ Element prefixElement = xmlDoc.createElement("prefix");
+ prefixElement.setAttribute("value", prefix+"");
+ Element filesElement = xmlDoc.createElement("files"); /*
filesElement != fileElement */
+
+ for (int i = 0; i < uris.length; i++) {
+ urisToNumbers.put(uris[i], new Integer(i));
+
+ Element fileElement = xmlDoc.createElement("file");
+
+ fileElement.setAttribute("id", Integer.toString(i));
+ fileElement.setAttribute("key", uris[i].toString());
+
+ Long size = (Long)sizeOfURIs.get(uris[i].toString());
+
+ if(size == null) {
+ Logger.error(this, "Spider: size is missing");
+ } else {
+ fileElement.setAttribute("size",
size.toString());
+ }
+ fileElement.setAttribute("mime",
((String)mimeOfURIs.get(uris[i].toString())));
+
+ Element titleElement = xmlDoc.createElement("option");
+ titleElement.setAttribute("name", "title");
+ titleElement.setAttribute("value",
(String)titlesOfURIs.get(uris[i].toString()));
+
+ fileElement.appendChild(titleElement);
+ filesElement.appendChild(fileElement);
+ }
+
+
+
+ //all index files are ready
+ /* Adding word index */
+ Element keywordsElement = xmlDoc.createElement("keywords");
+ for(int i = 0;i<16;i++){
+
generateSubIndex(DEFAULT_INDEX_DIR+"index_"+Integer.toHexString(i)+".xml");
+ Element subIndexElement =
xmlDoc.createElement("subIndex");
+ if(i<=9)
+ subIndexElement.setAttribute("key",i+"");
+ else
+
subIndexElement.setAttribute("key",Integer.toHexString(i));
+ //the subindex element key will contain the bits used
for matching in that subindex
+ keywordsElement.appendChild(subIndexElement);
+ }
+
+
+
+ // make sure that prefix is the first child of root Element
+ rootElement.appendChild(prefixElement);
+ rootElement.appendChild(headerElement);
+
+ rootElement.appendChild(filesElement);
+ rootElement.appendChild(keywordsElement);
+
+ /* Serialization */
+ DOMSource domSource = new DOMSource(xmlDoc);
+ TransformerFactory transformFactory =
TransformerFactory.newInstance();
+ Transformer serializer;
+
+ try {
+ serializer = transformFactory.newTransformer();
+ } catch(javax.xml.transform.TransformerConfigurationException
e) {
+ Logger.error(this, "Spider: Error while serializing XML
(transformFactory.newTransformer()): "+e.toString());
+ return;
+ }
+
+
+ serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
+ serializer.setOutputProperty(OutputKeys.INDENT,"yes");
+
+ /* final step */
+ try {
+ serializer.transform(domSource, resultStream);
+ } catch(javax.xml.transform.TransformerException e) {
+ Logger.error(this, "Spider: Error while serializing XML
(transform()): "+e.toString());
+ return;
+ }
+
+ if(Logger.shouldLog(Logger.MINOR, this))
+ Logger.minor(this, "Spider: indexes regenerated.");
+
+
+ //the main xml file is generated
+ //now as each word is generated enter it into the respective subindex
+ //now the parsing will start and nodes will be added as needed
+
+
+
+
+ }
+
+ /**
+  * Enters words into the sub-index files. At most the first 100 words, in
+  * sorted order, are processed.
+  *
+  * For each word the matching sub-index is looked up with getIndex() and
+  * the word is added. If the add is refused, the sub-index is split into
+  * sixteen children, its words are redistributed, and the add is retried
+  * once against the new sub-index; the steps are appended to the "log3"
+  * file in DEFAULT_INDEX_DIR. A failure on one word is logged and does
+  * not stop the remaining words.
+  *
+  * @throws Exception declared for callers; per-word failures are caught
+  */
+ private synchronized void generateIndex() throws Exception {
+     String[] words = (String[]) urisByWord.keySet().toArray(new String[urisByWord.size()]);
+     Arrays.sort(words);
+
+     // Never read past the end of the array: the old loop ran to 100
+     // unconditionally and relied on a swallowed exception to survive.
+     int limit = Math.min(100, words.length);
+
+     for (int i = 0; i < limit; i++) {
+         try {
+             String prefix_match = getIndex(words[i]);
+
+             if (addWord(prefix_match, words[i])) {
+                 continue;
+             }
+
+             // Only open the log when there is something to log, and
+             // close it even if the split or redistribution fails.
+             output2 = new FileWriter(DEFAULT_INDEX_DIR + "log3", true);
+             try {
+                 output2.write("\naddword failes at " + words[i] + " with prefix " + prefix_match);
+                 split(prefix_match);
+                 regenerateIndex(prefix_match);
+                 output2.write("finished splitting on prefix " + prefix_match);
+                 prefix_match = getIndex(words[i]);
+                 output2.write("the new prefix " + prefix_match);
+                 addWord(prefix_match, words[i]);
+             } finally {
+                 output2.close();
+             }
+         } catch (Exception e2) {
+             Logger.error(this, "Caught " + e2 + " while adding word " + words[i] + " to its sub-index", e2);
+         }
+     }
+ }
+ private void regenerateIndex(String prefix) throws Exception{
+ //redistribute the entries in prefix.xml to prefix(0-f).xml
+ DocumentBuilderFactory docFactory =
DocumentBuilderFactory.newInstance();
+ DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
+ Document doc =
docBuilder.parse(DEFAULT_INDEX_DIR+"index_"+prefix+".xml");
+ Element root = doc.getDocumentElement();
+ NodeList wordList = root.getElementsByTagName("word");
+ for(int i = 0;i<wordList.getLength();i++){
+ Element word = (Element)wordList.item(i);
+ String value = word.getAttribute("v");
+ String prefix_match = getIndex(value);
+ addWord(prefix_match,value);
+ }
+ }
+ /**
+  * Finds the key of the sub-index responsible for a word.
+  *
+  * Reads the current prefix length and the list of "subIndex" elements
+  * from index.xml, takes that many leading characters of the word's hex
+  * MD5 and passes them to search(). Progress is appended to the
+  * "logfile2" and "search" files in DEFAULT_INDEX_DIR.
+  *
+  * @param word the word to place
+  * @return the key of the matching sub-index, as returned by search()
+  * @throws Exception on parse, I/O or hashing errors, or when search()
+  *                   fails
+  */
+ private String getIndex(String word) throws Exception {
+     DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
+     DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
+     Document doc = docBuilder.parse(DEFAULT_INDEX_DIR + "index.xml");
+     Element root = doc.getDocumentElement();
+     Attr prefix_value = (Attr) (root.getElementsByTagName("prefix").item(0))
+             .getAttributes().getNamedItem("value");
+     int prefix = Integer.parseInt(prefix_value.getValue());
+
+     String str;
+     NodeList subindexList;
+     output = new FileWriter(DEFAULT_INDEX_DIR + "logfile2", true);
+     try {
+         output.write("\nword " + word);
+
+         String md5 = MD5(word);
+         output.write(" md5 " + md5);
+
+         subindexList = root.getElementsByTagName("subIndex");
+         str = md5.substring(0, prefix);
+
+         output.write("String " + str);
+         output.write("\n");
+     } finally {
+         // Previously leaked when MD5() or substring() threw.
+         output.close();
+     }
+
+     String prefix_match = search(str, subindexList);
+
+     output = new FileWriter(DEFAULT_INDEX_DIR + "search", true);
+     try {
+         output.write("\nPrefix returned " + prefix_match + " with md5 " + str + " and word " + word);
+     } finally {
+         output.close();
+     }
+
+     return prefix_match;
+ }
+ private boolean addWord(String prefix, String str) throws Exception
+ {
+ //this word has to be added to the particular subindex
+ // modify the corresponding index
+ try{
+
+
+ DocumentBuilderFactory docFactory =
DocumentBuilderFactory.newInstance();
+ DocumentBuilder docBuilder =
docFactory.newDocumentBuilder();
+ Document doc =
docBuilder.parse(DEFAULT_INDEX_DIR+"index_"+prefix+".xml");
+ Element root = doc.getDocumentElement();
+
+ Element entry = (Element)
root.getElementsByTagName("entries").item(0);
+
+ Attr no_entries = (Attr)
entry.getAttributes().getNamedItem("value");
+
+
+ if(Integer.parseInt(no_entries.getValue()) >=
MAX_ENTRIES) return false;
+ else
+ {
+ //increment the number of entries
+
entry.setAttribute("value",(Integer.parseInt(no_entries.getValue())+1)+"");
+ //add the entry
+
+ Element wordElement = doc.createElement("word");
+ wordElement.setAttribute("v", str);
+
+ FreenetURI[] urisForWord = (FreenetURI[])
urisByWord.get(str);
+
+ /* URI by URI */
+ for (int j = 0; j < urisForWord.length; j++) {
+ FreenetURI uri = urisForWord[j];
+ Integer x = (Integer) urisToNumbers.get(uri);
+
+ if (x == null) {
+ Logger.error(this, "Eh?");
+ continue;
+ }
+
+ Element uriElement = doc.createElement("file");
+ uriElement.setAttribute("id", x.toString());
+//
+// /* Position by position */
+ HashMap positionsForGivenWord =
(HashMap)positionsByWordByURI.get(uri.toString());
+ Integer[] positions =
(Integer[])positionsForGivenWord.get(str);
+
+ StringBuffer positionList = new StringBuffer();
+
+ for(int k=0; k < positions.length ; k++) {
+ if(k!=0)
+ positionList.append(',');
+
+
positionList.append(positions[k].toString());
+ }
+
+
uriElement.appendChild(doc.createTextNode(positionList.toString()));
+
+ wordElement.appendChild(uriElement);
+ }
+ Element keywordsElement = (Element)
root.getElementsByTagName("keywords").item(0);
+ keywordsElement.appendChild(wordElement);
+
+
+
+ DOMSource domSource = new DOMSource(doc);
+ TransformerFactory transformFactory =
TransformerFactory.newInstance();
+ Transformer serializer;
+
+
+ serializer = transformFactory.newTransformer();
+
+
+
+ File outputFile = new
File(DEFAULT_INDEX_DIR+"index_"+prefix+".xml");
+ StreamResult resultStream;
+ resultStream = new StreamResult(outputFile);
+
+ serializer.setOutputProperty(OutputKeys.ENCODING,
"UTF-8");
+ serializer.setOutputProperty(OutputKeys.INDENT,"yes");
+
+ /* final step */
+ try {
+ serializer.transform(domSource, resultStream);
+ } catch(javax.xml.transform.TransformerException e) {}
+
+ //i.appendChild(root);
+ //c.replaceChild(root,doc.getDocumentElement());
+
+
+ }
+
+ return true;
+ }
+
+ catch(Exception e){}
+ return false;
+ }
+ private void split(String prefix) throws Exception
+ {
+ //first we need to split the current subindex into 16 newones
+ //then read from the original one and append to the new ones
+
+ // make the entry in the main index..
+ DocumentBuilderFactory docFactory =
DocumentBuilderFactory.newInstance();
+ DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
+ Document doc = docBuilder.parse(DEFAULT_INDEX_DIR+"index.xml");
+ Element root = doc.getDocumentElement();
+ Element prefixElt =(Element)
root.getElementsByTagName("prefix").item(0);
+ int prefix_current =
Integer.parseInt(prefixElt.getAttribute("value"));
+ if (prefix_current <= prefix.length())
+ prefixElt.setAttribute("value", (prefix_current+1)+"");
+
+ Element keywordElement = (Element)
root.getElementsByTagName("keywords").item(0);
+
+ NodeList subIndexElt = root.getElementsByTagName("subIndex");
+ for(int i =0;i<subIndexElt.getLength();i++)
+ {
+ Element subIndex = (Element) subIndexElt.item(i);
+ if((subIndex.getAttribute("key")).equals(prefix)) {
+ keywordElement.removeChild(subIndex);
+ break;
+ }
+ }
+
+ for(int i = 0;i<16;i++)
+ {
+ Element subIndex = doc.createElement("subIndex");
+
generateSubIndex(DEFAULT_INDEX_DIR+"index_"+prefix+Integer.toHexString(i)+".xml");
+
subIndex.setAttribute("key",prefix.concat(Integer.toHexString(i)));
+ keywordElement.appendChild(subIndex);
+ }
+
+
+ DOMSource domSource = new DOMSource(doc);
+ TransformerFactory transformFactory =
TransformerFactory.newInstance();
+ Transformer serializer;
+
+
+ serializer = transformFactory.newTransformer();
+
+
+
+ File outputFile = new File(DEFAULT_INDEX_DIR+"index.xml");
+ StreamResult resultStream;
+ resultStream = new StreamResult(outputFile);
+
+ serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
+ serializer.setOutputProperty(OutputKeys.INDENT,"yes");
+
+ /* final step */
+ try {
+ serializer.transform(domSource, resultStream);
+ } catch(javax.xml.transform.TransformerException e) {}
+
+
+ }
+ /**
+  * Returns the longest leading part of str that equals the "key"
+  * attribute of one of the given elements. The whole string is tried
+  * first, then it is shortened by one character at a time.
+  *
+  * @param str  the string to match, e.g. a hash prefix
+  * @param list elements carrying a "key" attribute
+  * @return the matching key
+  * @throws Exception a StringIndexOutOfBoundsException is raised by
+  *                   substring() once even the empty string matched no key
+  */
+ public String search(String str, NodeList list) throws Exception {
+     String candidate = str;
+     while (true) {
+         for (int i = 0; i < list.getLength(); i++) {
+             String key = ((Element) list.item(i)).getAttribute("key");
+             if (key.equals(candidate)) {
+                 return key;
+             }
+         }
+         // No hit: drop the last character and try again.
+         candidate = candidate.substring(0, candidate.length() - 1);
+     }
+ }
+
+//
+// output.close();
+// return search(str.substring(0,prefix-1),list);
+
+
+
+ /**
+  * Serves the plugin's web page. Behaviour depends on the "action"
+  * request parameter:
+  * missing or empty - redirect (301) to "?action=list";
+  * "list" - overview of the running, queued, visited and failed URIs, each
+  * list cut at maxShownURIs, or a single complete list when "listName" is
+  * one of running/queued/visited/failed;
+  * "add" - queue the URI in the "key" parameter (forgetting that it was
+  * visited or failed before), then redirect to the list.
+  * Any other action produces no reply from this method.
+  *
+  * @param request the HTTP request
+  * @param context the toadlet context used to build and send the reply
+  * @throws IOException                   if writing the reply fails
+  * @throws ToadletContextClosedException if the context is already closed
+  */
+ public void handleGet(HTTPRequest request, ToadletContext context)
+         throws IOException, ToadletContextClosedException {
+     String action = request.getParam("action");
+     PageMaker pageMaker = context.getPageMaker();
+     if ((action == null) || (action.length() == 0)) {
+         MultiValueTable responseHeaders = new MultiValueTable();
+         responseHeaders.put("Location", "?action=list");
+         context.sendReplyHeaders(301, "Redirect", responseHeaders, "text/html; charset=utf-8", 0);
+         return;
+     } else if ("list".equals(action)) {
+         String listName = request.getParam("listName", null);
+         HTMLNode pageNode = pageMaker.getPageNode("The XML Spider", context);
+         HTMLNode contentNode = pageMaker.getContentNode(pageNode);
+
+         /*
+          * Create copies for multi-threaded use. The copies are taken while
+          * holding the monitor that guards every mutation of these
+          * collections; copying without it could observe a concurrent
+          * modification.
+          */
+         if (listName == null) {
+             Map runningFetches;
+             List queued;
+             Set visited;
+             Set failed;
+             synchronized (this) {
+                 runningFetches = new HashMap(runningFetchesByURI);
+                 queued = new ArrayList(queuedURIList);
+                 visited = new HashSet(visitedURIs);
+                 failed = new HashSet(failedURIs);
+             }
+             contentNode.addChild(createNavbar(pageMaker, runningFetches.size(), queued.size(),
+                     visited.size(), failed.size()));
+             contentNode.addChild(createAddBox(pageMaker, context));
+             // Title typo "Running FetcheIIIs" fixed; now matches the single-list view.
+             contentNode.addChild(createList(pageMaker, "Running Fetches", "running",
+                     runningFetches.keySet(), maxShownURIs));
+             contentNode.addChild(createList(pageMaker, "Queued URIs", "queued", queued, maxShownURIs));
+             contentNode.addChild(createList(pageMaker, "Visited URIs", "visited", visited, maxShownURIs));
+             contentNode.addChild(createList(pageMaker, "Failed URIs", "failed", failed, maxShownURIs));
+         } else {
+             contentNode.addChild(createBackBox(pageMaker));
+             if ("failed".equals(listName)) {
+                 Set failed;
+                 synchronized (this) {
+                     failed = new HashSet(failedURIs);
+                 }
+                 contentNode.addChild(createList(pageMaker, "Failed URIs", "failed", failed, -1));
+             } else if ("visited".equals(listName)) {
+                 Set visited;
+                 synchronized (this) {
+                     visited = new HashSet(visitedURIs);
+                 }
+                 contentNode.addChild(createList(pageMaker, "Visited URIs", "visited", visited, -1));
+             } else if ("queued".equals(listName)) {
+                 List queued;
+                 synchronized (this) {
+                     queued = new ArrayList(queuedURIList);
+                 }
+                 contentNode.addChild(createList(pageMaker, "Queued URIs", "queued", queued, -1));
+             } else if ("running".equals(listName)) {
+                 Map runningFetches;
+                 synchronized (this) {
+                     runningFetches = new HashMap(runningFetchesByURI);
+                 }
+                 contentNode.addChild(createList(pageMaker, "Running Fetches", "running",
+                         runningFetches.keySet(), -1));
+             }
+         }
+         MultiValueTable responseHeaders = new MultiValueTable();
+         byte[] responseBytes = pageNode.generate().getBytes("utf-8");
+         context.sendReplyHeaders(200, "OK", responseHeaders, "text/html; charset=utf-8", responseBytes.length);
+         context.writeData(responseBytes);
+     } else if ("add".equals(action)) {
+         String uriParam = request.getParam("key");
+         try {
+             FreenetURI uri = new FreenetURI(uriParam);
+             synchronized (this) {
+                 // Forget earlier attempts so queueURI() accepts it again.
+                 failedURIs.remove(uri);
+                 visitedURIs.remove(uri);
+             }
+             queueURI(uri);
+             startSomeRequests();
+         } catch (MalformedURLException mue1) {
+             sendSimpleResponse(context, "URL invalid", "The given URI is not valid.");
+             return;
+         }
+         MultiValueTable responseHeaders = new MultiValueTable();
+         responseHeaders.put("Location", "?action=list");
+         context.sendReplyHeaders(301, "Redirect", responseHeaders, "text/html; charset=utf-8", 0);
+         return;
+     }
+ }
+
+ /**
+  * POST requests are accepted but not acted upon; no reply is written.
+  *
+  * @see freenet.oldplugins.plugin.HttpPlugin#handlePost(freenet.clients.http.HTTPRequestImpl, freenet.clients.http.ToadletContext)
+  */
+ public void handlePost(HTTPRequest request, ToadletContext context) throws IOException {
+     // Intentionally empty.
+ }
+
+ /**
+  * Sends a complete page (status 200) that shows a single info box.
+  *
+  * @param context the toadlet context to reply on
+  * @param title   used as page title and as info box title
+  * @param message plain text placed inside the info box
+  * @throws ToadletContextClosedException if the context is already closed
+  * @throws IOException                   if writing the reply fails
+  */
+ private void sendSimpleResponse(ToadletContext context, String title, String message)
+         throws ToadletContextClosedException, IOException {
+     PageMaker maker = context.getPageMaker();
+     HTMLNode page = maker.getPageNode(title, context);
+     HTMLNode body = maker.getContentNode(page);
+     HTMLNode box = body.addChild(maker.getInfobox("infobox-alter", title));
+     maker.getContentNode(box).addChild("#", message);
+
+     byte[] payload = page.generate().getBytes("utf-8");
+     MultiValueTable noHeaders = new MultiValueTable();
+     context.sendReplyHeaders(200, "OK", noHeaders, "text/html; charset=utf-8", payload.length);
+     context.writeData(payload);
+ }
+
+ /**
+  * Builds an untitled info box holding the sentence
+  * "Return to the list of all URIs." with a link to "?action=list".
+  *
+  * @param pageMaker factory for the info box
+  * @return the info box node
+  */
+ private HTMLNode createBackBox(PageMaker pageMaker) {
+     HTMLNode box = pageMaker.getInfobox((String) null);
+     HTMLNode sentence = pageMaker.getContentNode(box);
+     sentence.addChild("#", "Return to the ");
+     sentence.addChild("a", "href", "?action=list", "list of all URIs");
+     sentence.addChild("#", ".");
+     return box;
+ }
+
+ /**
+  * Builds the "Add a URI" info box: a GET form posting back to the same
+  * page with a hidden action=add field, a 40 character text field named
+  * "key" and a submit button.
+  *
+  * @param pageMaker factory for the info box
+  * @param ctx       the toadlet context (not used by the visible code)
+  * @return the info box node
+  */
+ private HTMLNode createAddBox(PageMaker pageMaker, ToadletContext ctx) {
+     HTMLNode box = pageMaker.getInfobox("Add a URI");
+     HTMLNode boxContent = pageMaker.getContentNode(box);
+
+     HTMLNode form = boxContent.addChild("form",
+             new String[] { "action", "method" },
+             new String[] { "", "get" });
+     // hidden field selecting the "add" action
+     form.addChild("input",
+             new String[] { "type", "name", "value" },
+             new String[] { "hidden", "action", "add" });
+     // the key to add
+     form.addChild("input",
+             new String[] { "type", "size", "name", "value" },
+             new String[] { "text", "40", "key", "" });
+     form.addChild("input",
+             new String[] { "type", "value" },
+             new String[] { "submit", "Add URI" });
+     return box;
+ }
+
+ /**
+  * Builds the "Page Navigation" box: a list of four in-page links
+  * (#running, #queued, #visited, #failed), each labelled with its count.
+  *
+  * @param pageMaker factory for the info box
+  * @param running   number of running fetches
+  * @param queued    number of queued URIs
+  * @param visited   number of visited URIs
+  * @param failed    number of failed URIs
+  * @return the navigation box node
+  */
+ private HTMLNode createNavbar(PageMaker pageMaker, int running, int queued, int visited, int failed) {
+     HTMLNode navbar = pageMaker.getInfobox("navbar", "Page Navigation");
+     HTMLNode list = pageMaker.getContentNode(navbar).addChild("ul");
+
+     String[] anchors = { "#running", "#queued", "#visited", "#failed" };
+     String[] labels = {
+         "Running (" + running + ')',
+         "Queued (" + queued + ')',
+         "Visited (" + visited + ')',
+         "Failed (" + failed + ')'
+     };
+     for (int i = 0; i < anchors.length; i++) {
+         list.addChild("li").addChild("a", "href", anchors[i], labels[i]);
+     }
+     return navbar;
+ }
+
+ /**
+  * Builds a titled box listing URIs one per line, preceded by a named
+  * anchor so the navigation bar can jump to it.
+  *
+  * At most maxCount URIs are listed; if more remain, a "Show all" link to
+  * the complete list is appended instead. A negative maxCount lists
+  * everything. (The previous check listed maxCount + 1 entries and showed
+  * the link even when that entry was the last one.)
+  *
+  * @param pageMaker  factory for the info box
+  * @param listName   title of the box
+  * @param anchorName anchor name, also used as listName in the link
+  * @param collection the FreenetURI instances to list
+  * @param maxCount   maximum number of entries, or negative for no limit
+  * @return a div node containing the anchor and the box
+  */
+ private HTMLNode createList(PageMaker pageMaker, String listName, String anchorName,
+         Collection collection, int maxCount) {
+     HTMLNode listNode = new HTMLNode("div");
+     listNode.addChild("a", "name", anchorName);
+     HTMLNode listBox = pageMaker.getInfobox(listName);
+     HTMLNode listContent = pageMaker.getContentNode(listBox);
+     listNode.addChild(listBox);
+
+     int shown = 0;
+     for (Iterator items = collection.iterator(); items.hasNext();) {
+         if ((maxCount >= 0) && (shown >= maxCount)) {
+             // Limit reached and at least one more entry exists.
+             listContent.addChild("br");
+             listContent.addChild("a", "href", "?action=list&listName=" + anchorName, "Show all\u2026");
+             break;
+         }
+         FreenetURI uri = (FreenetURI) items.next();
+         listContent.addChild("#", uri.toString());
+         listContent.addChild("br");
+         shown++;
+     }
+     return listNode;
+ }
+
+ /**
+  * Returns the display name of this plugin, the pluginName constant.
+  *
+  * @see freenet.oldplugins.plugin.Plugin#getPluginName()
+  */
+ public String getPluginName() {
+     final String name = pluginName;
+     return name;
+ }
+
+ /**
+  * Takes the client core from the plugin manager, creates the fetch
+  * context used for all spider fetches (10 retries for splitfile blocks
+  * and for non-splitfiles, 2 MiB limits for temporary and output data)
+  * and resets the index timestamp to the current time.
+  *
+  * @see freenet.oldplugins.plugin.Plugin#setPluginManager(freenet.oldplugins.plugin.PluginManager)
+  */
+ public void setPluginManager(PluginManager pluginManager) {
+     final int retries = 10;
+     final int maxBytes = 2 * 1024 * 1024;
+
+     this.core = pluginManager.getClientCore();
+     this.ctx = core.makeClient((short) 0).getFetchContext();
+
+     ctx.maxSplitfileBlockRetries = retries;
+     ctx.maxNonSplitfileRetries = retries;
+     ctx.maxTempLength = maxBytes;
+     ctx.maxOutputLength = maxBytes;
+
+     tProducedIndex = System.currentTimeMillis();
+ }
+
+
+ /**
+  * Marks the spider as running and launches a daemon thread named
+  * "Spider Plugin Starter" that calls startSomeRequests() once.
+  *
+  * @see freenet.oldplugins.plugin.Plugin#startPlugin()
+  */
+ public void startPlugin() {
+     stopped = false;
+
+     Runnable kickOff = new Runnable() {
+         public void run() {
+             startSomeRequests();
+         }
+     };
+     Thread starterThread = new Thread(kickOff, "Spider Plugin Starter");
+     starterThread.setDaemon(true);
+     starterThread.start();
+ }
+
+ /**
+  * Marks the spider as stopped and empties the fetch queue. Fetches that
+  * are already running are not cancelled here.
+  *
+  * @see freenet.oldplugins.plugin.Plugin#stopPlugin()
+  */
+ public void stopPlugin() {
+     synchronized (this) {
+         stopped = true;
+         queuedURIList.clear();
+         // Keep the duplicate-check set in step with the list. Left
+         // populated, queueURI() would reject these URIs forever, even
+         // when re-added by hand through the web page.
+         queuedURISet.clear();
+     }
+ }
+
+ /** Progress callback; does nothing. */
+ public void onMajorProgress() {
+     // Intentionally empty.
+ }
+
+ /** Insert-fetchable callback; does nothing. */
+ public void onFetchable(BaseClientPutter state) {
+     // Intentionally empty.
+ }
+ private static String convertToHex(byte[] data) {
+ StringBuffer buf = new StringBuffer();
+ for (int i = 0; i < data.length; i++) {
+ int halfbyte = (data[i] >>> 4) & 0x0F;
+ int two_halfs = 0;
+ do {
+ if ((0 <= halfbyte) && (halfbyte <= 9))
+ buf.append((char) ('0' + halfbyte));
+ else
+ buf.append((char) ('a' + (halfbyte - 10)));
+ halfbyte = data[i] & 0x0F;
+ } while(two_halfs++ < 1);
+ }
+ return buf.toString();
+ }
+ /**
+  * Returns the MD5 hash of the input string as lower-case hex.
+  * The text is encoded as ISO-8859-1 before hashing.
+  *
+  * @param text the string to hash
+  * @return the hash as produced by convertToHex()
+  * @throws NoSuchAlgorithmException     if no MD5 provider is available
+  * @throws UnsupportedEncodingException if ISO-8859-1 is not supported
+  */
+ public static String MD5(String text) throws NoSuchAlgorithmException, UnsupportedEncodingException {
+     MessageDigest digester = MessageDigest.getInstance("MD5");
+     byte[] encoded = text.getBytes("iso-8859-1");
+     digester.update(encoded, 0, text.length());
+     return convertToHex(digester.digest());
+ }
+
+ public void generateSubIndex(String filename){
+//generates the new subIndex
+ File outputFile = new File(filename);
+ StreamResult resultStream;
+ resultStream = new StreamResult(outputFile);
+
+ /* Initialize xml builder */
+ Document xmlDoc = null;
+ DocumentBuilderFactory xmlFactory = null;
+ DocumentBuilder xmlBuilder = null;
+ DOMImplementation impl = null;
+ Element rootElement = null;
+
+ xmlFactory = DocumentBuilderFactory.newInstance();
+
+
+ try {
+ xmlBuilder = xmlFactory.newDocumentBuilder();
+ } catch(javax.xml.parsers.ParserConfigurationException e) {
+ /* Will (should ?) never happen */
+ Logger.error(this, "Spider: Error while initializing
XML generator: "+e.toString());
+ return;
+ }
+
+
+ impl = xmlBuilder.getDOMImplementation();
+
+ /* Starting to generate index */
+
+ xmlDoc = impl.createDocument(null, "sub_index", null);
+ rootElement = xmlDoc.getDocumentElement();
+
+ /* Adding header to the index */
+ Element headerElement = xmlDoc.createElement("header");
+
+ /* -> title */
+ Element subHeaderElement = xmlDoc.createElement("title");
+ Text subHeaderText = xmlDoc.createTextNode(indexTitle);
+
+ subHeaderElement.appendChild(subHeaderText);
+ headerElement.appendChild(subHeaderElement);
+
+ /* -> owner */
+ subHeaderElement = xmlDoc.createElement("owner");
+ subHeaderText = xmlDoc.createTextNode(indexOwner);
+
+ subHeaderElement.appendChild(subHeaderText);
+ headerElement.appendChild(subHeaderElement);
+
+
+ /* -> owner email */
+ if(indexOwnerEmail != null) {
+ subHeaderElement = xmlDoc.createElement("email");
+ subHeaderText = xmlDoc.createTextNode(indexOwnerEmail);
+
+ subHeaderElement.appendChild(subHeaderText);
+ headerElement.appendChild(subHeaderElement);
+ }
+
+
+ Element filesElement = xmlDoc.createElement("files"); /*
filesElement != fileElement */
+
+ Element EntriesElement = xmlDoc.createElement("entries");
+ EntriesElement.setNodeValue("0");
+ EntriesElement.setAttribute("value", "0");
+ //all index files are ready
+ /* Adding word index */
+ Element keywordsElement = xmlDoc.createElement("keywords");
+
+ rootElement.appendChild(EntriesElement);
+ rootElement.appendChild(headerElement);
+ rootElement.appendChild(filesElement);
+ rootElement.appendChild(keywordsElement);
+
+ /* Serialization */
+ DOMSource domSource = new DOMSource(xmlDoc);
+ TransformerFactory transformFactory =
TransformerFactory.newInstance();
+ Transformer serializer;
+
+ try {
+ serializer = transformFactory.newTransformer();
+ } catch(javax.xml.transform.TransformerConfigurationException
e) {
+ Logger.error(this, "Spider: Error while serializing XML
(transformFactory.newTransformer()): "+e.toString());
+ return;
+ }
+
+
+ serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
+ serializer.setOutputProperty(OutputKeys.INDENT,"yes");
+
+ /* final step */
+ try {
+ serializer.transform(domSource, resultStream);
+ } catch(javax.xml.transform.TransformerException e) {
+ Logger.error(this, "Spider: Error while serializing XML
(transform()): "+e.toString());
+ return;
+ }
+
+ if(Logger.shouldLog(Logger.MINOR, this))
+ Logger.minor(this, "Spider: indexes regenerated.");
+ }
+
+
+
+}