Author: swatig0
Date: 2007-06-13 16:37:02 +0000 (Wed, 13 Jun 2007)
New Revision: 13562

Added:
   trunk/freenet/src/freenet/clients/http/XMLSpider.java
Log:
msg

Added: trunk/freenet/src/freenet/clients/http/XMLSpider.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/XMLSpider.java                       
        (rev 0)
+++ trunk/freenet/src/freenet/clients/http/XMLSpider.java       2007-06-13 
16:37:02 UTC (rev 13562)
@@ -0,0 +1,1084 @@
+/* This code is part of Freenet. It is distributed under the GNU General
+ * Public License, version 2 (or at your option any later version). See
+ * http://www.gnu.org/ for further details of the GPL. */
+package freenet.clients.http;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.w3c.dom.Attr;
+import org.w3c.dom.DOMImplementation;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.Text;
+
+import freenet.client.ClientMetadata;
+import freenet.client.FetchContext;
+import freenet.client.FetchException;
+import freenet.client.FetchResult;
+import freenet.client.InsertException;
+import freenet.client.async.BaseClientPutter;
+import freenet.client.async.ClientCallback;
+import freenet.client.async.ClientGetter;
+import freenet.clients.http.filter.ContentFilter;
+import freenet.clients.http.filter.FoundURICallback;
+import freenet.clients.http.filter.UnsafeContentTypeException;
+import freenet.keys.FreenetURI;
+import freenet.node.NodeClientCore;
+import freenet.node.RequestStarter;
+import freenet.oldplugins.plugin.HttpPlugin;
+import freenet.oldplugins.plugin.PluginManager;
+import freenet.support.HTMLNode;
+import freenet.support.Logger;
+import freenet.support.MultiValueTable;
+import freenet.support.api.Bucket;
+import freenet.support.api.HTTPRequest;
+/**
+ * Spider. Produces an index.
+ */
+public class XMLSpider implements HttpPlugin, ClientCallback, FoundURICallback 
{
+
+       /** Value of System.currentTimeMillis() when the index files were last rewritten. */
+       long tProducedIndex;
+
+       // URIs visited, or fetching, or queued. Added once then forgotten about.
+       private final HashSet visitedURIs = new HashSet();
+       // URIs on which at least one indexable word was found.
+       private final HashSet urisWithWords = new HashSet();
+       // URIs for which a fetch failure was reported.
+       private final HashSet failedURIs = new HashSet();
+       // Set view of queuedURIList, used to reject duplicates cheaply.
+       private final HashSet queuedURISet = new HashSet();
+       // FIFO queue of URIs waiting to be fetched.
+       private final LinkedList queuedURIList = new LinkedList();
+       private final HashMap runningFetchesByURI = new HashMap(); /* FreenetURI -> ClientGetter */
+       private final HashMap urisByWord = new HashMap(); /* String (word) -> FreenetURI[] */
+       private final HashMap titlesOfURIs = new HashMap(); /* String (URI) -> String (title) */
+       // Writers used for the debug log files written by getIndex() and generateIndex().
+       private FileWriter output;
+       private FileWriter output2;
+
+       // Multiplied by 10 and compared against millisecond timestamps in addWord().
+       private static final int minTimeBetweenEachIndexRewriting = 1;
+       //private static final String indexFilename = "index.xml";
+       // NOTE(review): developer-specific absolute path; every index and log file is written here.
+       private static final String DEFAULT_INDEX_DIR = "/home/swati/myindex/";
+       // A sub-index file holding this many entries is reported as full by addWord(String, String).
+       private static final int MAX_ENTRIES = 5;
+       private static final String pluginName = "XML spider";
+
+       // Values written into the <header> element of index.xml.
+       private static final String indexTitle= "This is an index";
+       private static final String indexOwner = "Another anonymous";
+       private static final String indexOwnerEmail = null;
+       private final HashMap sizeOfURIs = new HashMap(); /* String (URI) -> Long */
+       private final HashMap mimeOfURIs = new HashMap(); /* String (URI) -> String */
+       private final HashMap lastPositionByURI = new HashMap(); /* String (URI) -> Integer */ /* Use to determine word position on each uri */
+       private final HashMap positionsByWordByURI = new HashMap(); /* String (URI) -> HashMap (String (word) -> Integer[] (Positions)) */
+
+       // Can have many; this limit only exists to save memory.
+       private static final int maxParallelRequests = 20;
+       private int maxShownURIs = 50;
+       // FreenetURI -> Integer (file id); rebuilt on every produceIndex() call.
+       private HashMap urisToNumbers;
+       private NodeClientCore core;
+       private FetchContext ctx;
+       private final short PRIORITY_CLASS = RequestStarter.PREFETCH_PRIORITY_CLASS;
+       // While true, startSomeRequests() starts no new fetches.
+       private boolean stopped = true;
+
+       /**
+        * Appends a URI to the fetch queue unless it has been seen before.
+        * A URI is accepted only if it is neither in visitedURIs nor already
+        * queued; accepted URIs are marked as visited straight away. No
+        * content-type (HTML) filtering is applied here.
+        */
+       private synchronized void queueURI(FreenetURI uri) {
+               if (visitedURIs.contains(uri))
+                       return;
+               if (!queuedURISet.add(uri))
+                       return;
+               visitedURIs.add(uri);
+               queuedURIList.addLast(uri);
+       }
+
+       /**
+        * Queues the bookmark URIs and starts queued fetches until
+        * maxParallelRequests fetches are running.
+        *
+        * This method is called from every fetch callback and for every link
+        * found, so the 30 second start-up delay is applied only while nothing
+        * has been queued yet (visitedURIs is empty). Getters are picked and
+        * registered under the lock but started outside it, because a failing
+        * start() calls onFailure(), which re-enters this method.
+        */
+       private void startSomeRequests() {
+               boolean nothingSeenYet;
+               synchronized (this) {
+                       nothingSeenYet = visitedURIs.isEmpty();
+               }
+               if (nothingSeenYet) {
+                       try {
+                               Thread.sleep(30 * 1000); // Let the node start up
+                       } catch (InterruptedException e) {
+                               // Keep the interrupt visible to whoever owns this thread.
+                               Thread.currentThread().interrupt();
+                       }
+               }
+
+               // queueURI() ignores URIs it has already seen, so re-queuing is harmless.
+               FreenetURI[] initialURIs = core.bookmarkManager.getBookmarkURIs();
+               for (int i = 0; i < initialURIs.length; i++) {
+                       queueURI(initialURIs[i]);
+               }
+
+               ArrayList toStart;
+               synchronized (this) {
+                       if (stopped) {
+                               return;
+                       }
+                       int running = runningFetchesByURI.size();
+                       int queued = queuedURIList.size();
+
+                       if ((running >= maxParallelRequests) || (queued == 0))
+                               return;
+
+                       toStart = new ArrayList(Math.min(maxParallelRequests - running, queued));
+
+                       for (int i = running; i < maxParallelRequests; i++) {
+                               if (queuedURIList.isEmpty())
+                                       break;
+                               FreenetURI uri = (FreenetURI) queuedURIList.removeFirst();
+                               queuedURISet.remove(uri);
+                               ClientGetter getter = makeGetter(uri);
+                               // Register before starting so the callbacks always find the entry.
+                               runningFetchesByURI.put(getter.getURI(), getter);
+                               toStart.add(getter);
+                       }
+               }
+
+               for (int i = 0; i < toStart.size(); i++) {
+                       ClientGetter g = (ClientGetter) toStart.get(i);
+                       try {
+                               g.start();
+                       } catch (FetchException e) {
+                               onFailure(e, g);
+                       }
+               }
+       }
+       
+
+       /**
+        * Builds (but does not start) a ClientGetter for the given URI, using
+        * this spider as both the callback and the request client, the node's
+        * CHK/SSK fetch schedulers, the spider's fetch context and PRIORITY_CLASS.
+        */
+       private ClientGetter makeGetter(FreenetURI uri) {
+               return new ClientGetter(this, core.requestStarters.chkFetchScheduler, core.requestStarters.sskFetchScheduler, uri, ctx, PRIORITY_CLASS, this, null, null);
+       }
+
+       /**
+        * Fetch-success callback. Removes the fetch from the running set,
+        * starts further requests, records the size and MIME type of the
+        * fetched data and runs it through the content filter with this spider
+        * as the callback (see foundURI() and onText()). The data bucket is
+        * always freed.
+        *
+        * @param result the fetched data and its metadata
+        * @param state  the getter that completed
+        */
+       public void onSuccess(FetchResult result, ClientGetter state) {
+               FreenetURI uri = state.getURI();
+
+               synchronized (this) {
+                       runningFetchesByURI.remove(uri);
+               }
+               startSomeRequests();
+               ClientMetadata cm = result.getMetadata();
+               Bucket data = result.asBucket();
+               String mimeType = cm.getMIMEType();
+               String uriKey = uri.toString();
+
+               // produceIndex() reads these maps while holding the same lock.
+               synchronized (this) {
+                       sizeOfURIs.put(uriKey, new Long(data.size()));
+                       mimeOfURIs.put(uriKey, mimeType);
+               }
+
+               try {
+                       ContentFilter.filter(data, ctx.bucketFactory, mimeType, uri.toURI("http://127.0.0.1:8888/"), this);
+               } catch (UnsafeContentTypeException e) {
+                       return; // Ignore
+               } catch (IOException e) {
+                       Logger.error(this, "Bucket error?: " + e, e);
+               } catch (URISyntaxException e) {
+                       Logger.error(this, "Internal error: " + e, e);
+               } finally {
+                       data.free();
+               }
+       }
+
+       public void onFailure(FetchException e, ClientGetter state) {
+               FreenetURI uri = state.getURI();
+               
+               synchronized (this) {
+                       failedURIs.add(uri);
+                       runningFetchesByURI.remove(uri);
+               }
+               if (e.newURI != null)
+                       queueURI(e.newURI);
+               else
+                       queueURI(uri);
+               startSomeRequests();
+               
+               
+       }
+
+       /** Insert-success callback; deliberately a no-op in this class. */
+       public void onSuccess(BaseClientPutter state) {
+               // Nothing to do.
+       }
+
+       /** Insert-failure callback; deliberately a no-op in this class. */
+       public void onFailure(InsertException e, BaseClientPutter state) {
+               // Nothing to do.
+       }
+
+       /** Generated-URI callback for inserts; deliberately a no-op in this class. */
+       public void onGeneratedURI(FreenetURI uri, BaseClientPutter state) {
+               // Nothing to do.
+       }
+
+       /**
+        * Callback for a URI discovered in fetched content: the URI is queued
+        * (if it has not been seen before) and more fetches are started if
+        * there is spare capacity.
+        */
+       public void foundURI(FreenetURI uri) {
+               queueURI(uri);
+               startSomeRequests();
+       }
+
+       /**
+        * Callback for a run of text found in fetched content. The text is
+        * split on every non-alphanumeric ASCII character and each non-empty
+        * piece is lower-cased and passed to addWord().
+        *
+        * Text of type "title" (any case) that is non-empty and single-line is
+        * stored as the page title and its words get negative positions
+        * (-1, -2, ...). All other text gets positive positions that continue
+        * from the last position recorded for the same URI, starting at 1.
+        *
+        * @param s       the text; null is ignored
+        * @param type    the kind of text, may be null
+        * @param baseURI URI of the page; its path minus the leading '/' is parsed as a FreenetURI
+        */
+       public void onText(String s, String type, URI baseURI) {
+               // The text used to be null-checked only for titles and then split unconditionally.
+               if (s == null)
+                       return;
+
+               FreenetURI uri;
+               try {
+                       uri = new FreenetURI(baseURI.getPath().substring(1));
+               } catch (MalformedURLException e) {
+                       Logger.error(this, "Caught " + e, e);
+                       return;
+               }
+               String uriKey = uri.toString();
+
+               // equalsIgnoreCase does not depend on the default locale, unlike toLowerCase().
+               boolean isTitle = (type != null) && type.equalsIgnoreCase("title")
+                       && (s.length() != 0) && (s.indexOf('\n') < 0);
+
+               Integer lastPosition;
+               synchronized (this) {
+                       if (isTitle) {
+                               /* We should have a correct title */
+                               titlesOfURIs.put(uriKey, s);
+                       }
+                       lastPosition = (Integer) lastPositionByURI.get(uriKey);
+               }
+               if (lastPosition == null)
+                       lastPosition = new Integer(1); /* We start to count from 1 */
+
+               String[] words = s.split("[^A-Za-z0-9]");
+
+               for (int i = 0; i < words.length; i++) {
+                       String word = words[i];
+                       if ((word == null) || (word.length() == 0))
+                               continue;
+                       word = word.toLowerCase();
+                       try {
+                               if (isTitle)
+                                       addWord(word, -1 * (i + 1), uri);
+                               else
+                                       addWord(word, lastPosition.intValue() + i, uri);
+                       } catch (Exception e) {
+                               // Keep indexing the remaining words, but leave a trace of the failure.
+                               Logger.error(this, "Caught " + e + " while indexing word '" + word + "' of " + uriKey, e);
+                       }
+               }
+
+               if (!isTitle) {
+                       // Empty split results count too, as before.
+                       Integer newLastPosition = new Integer(lastPosition.intValue() + words.length);
+                       synchronized (this) {
+                               lastPositionByURI.put(uriKey, newLastPosition);
+                       }
+               }
+       }
+
+       private synchronized void addWord(String word, int position,FreenetURI 
uri) throws Exception{
+               
+               
+               if(word.length() < 3)
+                       return;
+
+
+               FreenetURI[] uris = (FreenetURI[]) urisByWord.get(word);
+
+               //Integer[] positions = (Integer[]) 
positionsByWordByURI.get(word);
+
+               urisWithWords.add(uri);
+
+
+               /* Word position indexation */
+               HashMap wordPositionsForOneUri = 
(HashMap)positionsByWordByURI.get(uri.toString()); /* For a given URI, take as 
key a word, and gives position */
+               
+               if(wordPositionsForOneUri == null) {
+                       wordPositionsForOneUri = new HashMap();
+                       wordPositionsForOneUri.put(word, new Integer[] { new 
Integer(position) });
+                       positionsByWordByURI.put(uri.toString(), 
wordPositionsForOneUri);
+               } else {
+                       Integer[] positions = 
(Integer[])wordPositionsForOneUri.get(word);
+
+                       if(positions == null) {
+                               positions = new Integer[] { new 
Integer(position) };
+                               wordPositionsForOneUri.put(word, positions);
+                       } else {
+                               Integer[] newPositions = new 
Integer[positions.length + 1];
+
+                               System.arraycopy(positions, 0, newPositions, 0, 
positions.length);
+                               newPositions[positions.length] = new 
Integer(position);
+
+                               wordPositionsForOneUri.put(word, newPositions);
+                       }
+               }
+       
+               if (uris == null) {
+                       urisByWord.put(word, new FreenetURI[] { uri });
+                       
+               } else {
+                       for (int i = 0; i < uris.length; i++) {
+                               if (uris[i].equals(uri))
+                                       return;
+                       }
+                       FreenetURI[] newURIs = new FreenetURI[uris.length + 1];
+                       System.arraycopy(uris, 0, newURIs, 0, uris.length);
+                       newURIs[uris.length] = uri;
+                       urisByWord.put(word, newURIs);
+               }
+               if (tProducedIndex + minTimeBetweenEachIndexRewriting * 10 < 
System.currentTimeMillis()) {
+                       try {
+                               produceIndex();
+                               generateIndex();
+                       } catch (IOException e) {
+                               Logger.error(this, "Caught " + e + " while 
creating index", e);
+                       }
+                       tProducedIndex = System.currentTimeMillis();
+               }
+               
+       }
+
+       /**
+        * Writes the main index file, DEFAULT_INDEX_DIR + "index.xml".
+        *
+        * The document has root "main_index" with four children, in this order:
+        * "prefix" (attribute "value"), "header" (title, owner, optional email),
+        * "files" (one "file" element per URI in urisWithWords, carrying id,
+        * key, size, mime and a title option) and "keywords" (sixteen
+        * "subIndex" elements keyed "0".."f"). generateSubIndex() is called for
+        * each of the sixteen sub-index file names, and urisToNumbers is rebuilt
+        * with the ids assigned here.
+        *
+        * Does nothing except print a message when no words have been
+        * collected. XML set-up and serialization errors are logged and end the
+        * method early.
+        *
+        * @throws IOException              declared by the original signature; presumably raised by generateSubIndex()
+        * @throws NoSuchAlgorithmException declared by the original signature; presumably raised by generateSubIndex()
+        */
+       private synchronized void produceIndex() throws IOException,NoSuchAlgorithmException {
+               // Number of leading characters of the MD5 string used to pick a sub-index (see getIndex()).
+               int prefix = 1 ;
+
+               if (urisByWord.isEmpty() || urisWithWords.isEmpty()) {
+                       System.out.println("No URIs with words");
+                       return;
+               }
+               File outputFile = new File(DEFAULT_INDEX_DIR+"index.xml");
+               StreamResult resultStream = new StreamResult(outputFile);
+
+               /* Initialize xml builder */
+               DocumentBuilder xmlBuilder;
+               try {
+                       xmlBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
+               } catch(javax.xml.parsers.ParserConfigurationException e) {
+                       /* Will (should ?) never happen */
+                       Logger.error(this, "Spider: Error while initializing XML generator: "+e.toString(), e);
+                       return;
+               }
+               DOMImplementation impl = xmlBuilder.getDOMImplementation();
+
+               /* Starting to generate index */
+               Document xmlDoc = impl.createDocument(null, "main_index", null);
+               Element rootElement = xmlDoc.getDocumentElement();
+
+               /* Adding header to the index */
+               Element headerElement = xmlDoc.createElement("header");
+
+               /* -> title */
+               Element subHeaderElement = xmlDoc.createElement("title");
+               Text subHeaderText = xmlDoc.createTextNode(indexTitle);
+               subHeaderElement.appendChild(subHeaderText);
+               headerElement.appendChild(subHeaderElement);
+
+               /* -> owner */
+               subHeaderElement = xmlDoc.createElement("owner");
+               subHeaderText = xmlDoc.createTextNode(indexOwner);
+               subHeaderElement.appendChild(subHeaderText);
+               headerElement.appendChild(subHeaderElement);
+
+               /* -> owner email */
+               if(indexOwnerEmail != null) {
+                       subHeaderElement = xmlDoc.createElement("email");
+                       subHeaderText = xmlDoc.createTextNode(indexOwnerEmail);
+                       subHeaderElement.appendChild(subHeaderText);
+                       headerElement.appendChild(subHeaderElement);
+               }
+
+               /* The word list is not needed here: words are written by generateIndex(). */
+               FreenetURI[] uris = (FreenetURI[]) urisWithWords.toArray(new FreenetURI[urisWithWords.size()]);
+               urisToNumbers = new HashMap();
+               Element prefixElement = xmlDoc.createElement("prefix");
+               prefixElement.setAttribute("value", prefix+"");
+               Element filesElement = xmlDoc.createElement("files"); /* filesElement != fileElement */
+
+               for (int i = 0; i < uris.length; i++) {
+                       String uriKey = uris[i].toString();
+                       // The array index doubles as the file id referenced from the sub-indexes.
+                       urisToNumbers.put(uris[i], new Integer(i));
+
+                       Element fileElement = xmlDoc.createElement("file");
+                       fileElement.setAttribute("id", Integer.toString(i));
+                       fileElement.setAttribute("key", uriKey);
+
+                       Long size = (Long)sizeOfURIs.get(uriKey);
+                       if(size == null) {
+                               Logger.error(this, "Spider: size is missing");
+                       } else {
+                               fileElement.setAttribute("size", size.toString());
+                       }
+                       fileElement.setAttribute("mime", ((String)mimeOfURIs.get(uriKey)));
+
+                       Element titleElement = xmlDoc.createElement("option");
+                       titleElement.setAttribute("name", "title");
+                       titleElement.setAttribute("value", (String)titlesOfURIs.get(uriKey));
+
+                       fileElement.appendChild(titleElement);
+                       filesElement.appendChild(fileElement);
+               }
+
+               /* Adding word index: one sub-index per first hex digit */
+               Element keywordsElement = xmlDoc.createElement("keywords");
+               for(int i = 0;i<16;i++){
+                       String hexDigit = Integer.toHexString(i);
+                       generateSubIndex(DEFAULT_INDEX_DIR+"index_"+hexDigit+".xml");
+                       Element subIndexElement = xmlDoc.createElement("subIndex");
+                       //the subindex element key will contain the bits used for matching in that subindex
+                       subIndexElement.setAttribute("key",hexDigit);
+                       keywordsElement.appendChild(subIndexElement);
+               }
+
+               // make sure that prefix is the first child of root Element
+               rootElement.appendChild(prefixElement);
+               rootElement.appendChild(headerElement);
+               rootElement.appendChild(filesElement);
+               rootElement.appendChild(keywordsElement);
+
+               /* Serialization */
+               DOMSource domSource = new DOMSource(xmlDoc);
+               TransformerFactory transformFactory = TransformerFactory.newInstance();
+               Transformer serializer;
+
+               try {
+                       serializer = transformFactory.newTransformer();
+               } catch(javax.xml.transform.TransformerConfigurationException e) {
+                       Logger.error(this, "Spider: Error while serializing XML (transformFactory.newTransformer()): "+e.toString(), e);
+                       return;
+               }
+
+               serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
+               serializer.setOutputProperty(OutputKeys.INDENT,"yes");
+
+               /* final step */
+               try {
+                       serializer.transform(domSource, resultStream);
+               } catch(javax.xml.transform.TransformerException e) {
+                       Logger.error(this, "Spider: Error while serializing XML (transform()): "+e.toString(), e);
+                       return;
+               }
+
+               if(Logger.shouldLog(Logger.MINOR, this))
+                       Logger.minor(this, "Spider: indexes regenerated.");
+
+               // The main xml file is now written; generateIndex() enters each
+               // word into its sub-index afterwards.
+       }
+
+       /**
+        * Enters the collected words, in sorted order, into their sub-index
+        * files.
+        *
+        * For each word the matching sub-index is looked up with getIndex()
+        * and the word is added with addWord(String, String). When that
+        * reports failure the sub-index is split, its old entries are
+        * redistributed with regenerateIndex() and the word is added again
+        * under the new prefix; progress is appended to the "log3" file.
+        * A failure on one word is logged and does not stop the others.
+        *
+        * NOTE(review): at most the first 100 words are processed; the cap is
+        * kept from the original loop bound.
+        */
+       private synchronized void generateIndex() throws Exception{
+               String[] words = (String[]) urisByWord.keySet().toArray(new String[urisByWord.size()]);
+               Arrays.sort(words);
+
+               // The loop used to run 100 times regardless of the number of words.
+               int limit = Math.min(words.length, 100);
+
+               for (int i = 0; i < limit; i++) {
+                       String word = words[i];
+                       try {
+                               String prefix_match = getIndex(word);
+                               if (addWord(prefix_match, word))
+                                       continue;
+
+                               // Open the log only when there is something to log, and always close it.
+                               output2 = new FileWriter(DEFAULT_INDEX_DIR+"log3",true);
+                               try {
+                                       output2.write("\naddword failes at "+word+" with prefix "+prefix_match);
+                                       split(prefix_match);
+                                       regenerateIndex(prefix_match);
+                                       output2.write("finished splitting on prefix "+prefix_match);
+                                       prefix_match = getIndex(word);
+                                       output2.write("the new prefix "+prefix_match);
+                                       addWord(prefix_match, word);
+                               } finally {
+                                       output2.close();
+                               }
+                       } catch (Exception e2) {
+                               Logger.error(this, "Caught " + e2 + " while adding word '" + word + "' to the sub-indexes", e2);
+                       }
+               }
+       }
+       /**
+        * Redistributes the entries of the sub-index file for the given prefix:
+        * every "word" element in index_&lt;prefix&gt;.xml is looked up again with
+        * getIndex() and re-added with addWord(String, String).
+        */
+       private void regenerateIndex(String prefix) throws Exception{
+               DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
+               Document oldSubIndex = builder.parse(DEFAULT_INDEX_DIR+"index_"+prefix+".xml");
+               NodeList wordList = oldSubIndex.getDocumentElement().getElementsByTagName("word");
+
+               int i = 0;
+               while (i < wordList.getLength()) {
+                       String value = ((Element) wordList.item(i)).getAttribute("v");
+                       String target = getIndex(value);
+                       addWord(target, value);
+                       i++;
+               }
+       }
+       /**
+        * Finds the sub-index a word belongs to.
+        *
+        * Reads the current prefix length from the "prefix" element of
+        * index.xml, takes that many leading characters of MD5(word) and passes
+        * them, together with the list of "subIndex" elements, to search().
+        * The lookup is traced in the "logfile2" and "search" files.
+        *
+        * @param word the word to look up
+        * @return the value returned by search() for this word
+        * @throws Exception on any parse, I/O or lookup failure
+        */
+       private String getIndex(String word) throws Exception {
+               DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
+               DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
+               Document doc = docBuilder.parse(DEFAULT_INDEX_DIR+"index.xml");
+               Element root = doc.getDocumentElement();
+               Attr prefix_value = (Attr) (root.getElementsByTagName("prefix").item(0)).getAttributes().getNamedItem("value");
+               int prefix = Integer.parseInt(prefix_value.getValue());
+
+               // Compute everything first so a failure cannot leave the log file open.
+               String md5 = MD5(word);
+               NodeList subindexList = root.getElementsByTagName("subIndex");
+               String str = md5.substring(0,prefix);
+
+               output = new FileWriter(DEFAULT_INDEX_DIR+"logfile2",true);
+               try {
+                       output.write("\nword "+word);
+                       output.write("  md5 "+md5);
+                       output.write("String "+str);
+                       output.write("\n");
+               } finally {
+                       output.close();
+               }
+
+               String prefix_match = search(str,subindexList);
+
+               output = new FileWriter(DEFAULT_INDEX_DIR+"search",true);
+               try {
+                       output.write("\nPrefix returned "+prefix_match+" with md5 "+str+ " and word "+word);
+               } finally {
+                       output.close();
+               }
+
+               return prefix_match;
+       }
+       /**
+        * Adds a word entry to the sub-index file index_&lt;prefix&gt;.xml.
+        *
+        * The file is parsed, its "entries" counter is incremented, a "word"
+        * element (attribute "v") is appended to its "keywords" element with
+        * one "file" child per URI of the word (attribute "id" from
+        * urisToNumbers, text = comma-separated positions), and the file is
+        * rewritten.
+        *
+        * @param prefix sub-index key; selects the file to modify
+        * @param str    the word to add
+        * @return true if the entry was added; false if the sub-index already
+        *         holds MAX_ENTRIES entries or if any error occurred. Errors
+        *         are logged so they can be told apart from a full sub-index.
+        */
+       private boolean addWord(String prefix, String str) throws Exception
+       {
+               String subIndexPath = DEFAULT_INDEX_DIR+"index_"+prefix+".xml";
+               try{
+                       DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
+                       DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
+                       Document doc = docBuilder.parse(subIndexPath);
+                       Element root = doc.getDocumentElement();
+
+                       Element entry = (Element) root.getElementsByTagName("entries").item(0);
+                       Attr no_entries = (Attr) entry.getAttributes().getNamedItem("value");
+                       int entryCount = Integer.parseInt(no_entries.getValue());
+
+                       // A full sub-index is reported to the caller, which then splits it.
+                       if(entryCount >= MAX_ENTRIES)
+                               return false;
+
+                       //increment the number of entries
+                       entry.setAttribute("value",(entryCount+1)+"");
+
+                       //add the entry
+                       Element wordElement = doc.createElement("word");
+                       wordElement.setAttribute("v", str);
+
+                       FreenetURI[] urisForWord = (FreenetURI[]) urisByWord.get(str);
+
+                       /* URI by URI */
+                       for (int j = 0; j < urisForWord.length; j++) {
+                               FreenetURI uri = urisForWord[j];
+                               Integer x = (Integer) urisToNumbers.get(uri);
+
+                               if (x == null) {
+                                       Logger.error(this, "Eh?");
+                                       continue;
+                               }
+
+                               Element uriElement = doc.createElement("file");
+                               uriElement.setAttribute("id", x.toString());
+
+                               /* Position by position */
+                               HashMap positionsForGivenWord = (HashMap)positionsByWordByURI.get(uri.toString());
+                               Integer[] positions = (Integer[])positionsForGivenWord.get(str);
+
+                               StringBuffer positionList = new StringBuffer();
+                               for(int k=0; k < positions.length ; k++) {
+                                       if(k!=0)
+                                               positionList.append(',');
+                                       positionList.append(positions[k].toString());
+                               }
+
+                               uriElement.appendChild(doc.createTextNode(positionList.toString()));
+                               wordElement.appendChild(uriElement);
+                       }
+                       Element keywordsElement = (Element) root.getElementsByTagName("keywords").item(0);
+                       keywordsElement.appendChild(wordElement);
+
+                       /* Write the modified document back over the same file */
+                       DOMSource domSource = new DOMSource(doc);
+                       TransformerFactory transformFactory = TransformerFactory.newInstance();
+                       Transformer serializer = transformFactory.newTransformer();
+
+                       File outputFile = new File(subIndexPath);
+                       StreamResult resultStream = new StreamResult(outputFile);
+
+                       serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
+                       serializer.setOutputProperty(OutputKeys.INDENT,"yes");
+
+                       /* final step */
+                       try {
+                               serializer.transform(domSource, resultStream);
+                       } catch(javax.xml.transform.TransformerException e) {
+                               // The return value stays true as before, but the failure is no longer invisible.
+                               Logger.error(this, "Spider: Error while writing " + subIndexPath + ": " + e, e);
+                       }
+
+                       return true;
+               }
+               catch(Exception e){
+                       Logger.error(this, "Spider: Could not add word '" + str + "' to " + subIndexPath + ": " + e, e);
+                       return false;
+               }
+       }
+       private void split(String prefix) throws Exception
+       {
+               //first we need to split the current subindex into 16 newones
+               //then read from the original one and append to the new ones
+               
+               // make the entry in the main index..
+               DocumentBuilderFactory docFactory = 
DocumentBuilderFactory.newInstance();
+               DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
+               Document doc = docBuilder.parse(DEFAULT_INDEX_DIR+"index.xml");
+               Element root = doc.getDocumentElement();
+               Element prefixElt =(Element) 
root.getElementsByTagName("prefix").item(0);
+               int prefix_current = 
Integer.parseInt(prefixElt.getAttribute("value"));
+               if (prefix_current <= prefix.length())
+               prefixElt.setAttribute("value", (prefix_current+1)+"");
+               
+               Element keywordElement = (Element) 
root.getElementsByTagName("keywords").item(0);
+               
+               NodeList subIndexElt = root.getElementsByTagName("subIndex");
+               for(int i =0;i<subIndexElt.getLength();i++)
+               {
+                       Element subIndex = (Element) subIndexElt.item(i);
+                       if((subIndex.getAttribute("key")).equals(prefix)) {
+                               keywordElement.removeChild(subIndex);
+                               break;
+                       }
+               }
+               
+               for(int i = 0;i<16;i++)
+                       {
+                       Element subIndex = doc.createElement("subIndex");
+                       
generateSubIndex(DEFAULT_INDEX_DIR+"index_"+prefix+Integer.toHexString(i)+".xml");
+                       
subIndex.setAttribute("key",prefix.concat(Integer.toHexString(i)));
+                       keywordElement.appendChild(subIndex);
+                       }
+               
+               
+               DOMSource domSource = new DOMSource(doc);
+               TransformerFactory transformFactory = 
TransformerFactory.newInstance();
+               Transformer serializer;
+
+               
+                       serializer = transformFactory.newTransformer();
+               
+                       
+                                       
+               File outputFile = new File(DEFAULT_INDEX_DIR+"index.xml");
+               StreamResult resultStream;
+               resultStream = new StreamResult(outputFile);
+
+               serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
+               serializer.setOutputProperty(OutputKeys.INDENT,"yes");
+               
+               /* final step */
+               try {
+                       serializer.transform(domSource, resultStream);
+               } catch(javax.xml.transform.TransformerException e) {}
+               
+               
+       }
+       public String search(String str,NodeList list) throws Exception
+       {
+               int prefix = str.length();
+               for(int i = 0;i<list.getLength();i++){
+                       Element subIndex = (Element) list.item(i);
+                       String key = subIndex.getAttribute("key");
+                       if(key.equals(str)) return key;
+               }
+               
+               return search(str.substring(0, prefix-1),list);
+       }
+
+//             
+//             output.close();
+//             return search(str.substring(0,prefix-1),list);  
+               
+
+       
+       public void handleGet(HTTPRequest request, ToadletContext context) 
throws IOException, ToadletContextClosedException {
+               String action = request.getParam("action");
+               PageMaker pageMaker = context.getPageMaker();
+               if ((action == null) || (action.length() == 0)) {
+                       MultiValueTable responseHeaders = new MultiValueTable();
+                       responseHeaders.put("Location", "?action=list");
+                       context.sendReplyHeaders(301, "Redirect", 
responseHeaders, "text/html; charset=utf-8", 0);
+                       return;
+               } else if ("list".equals(action)) {
+                       
+                       String listName = request.getParam("listName", null);
+                       HTMLNode pageNode = pageMaker.getPageNode("The XML 
Spider", context);
+                       HTMLNode contentNode = 
pageMaker.getContentNode(pageNode);
+                       /* create copies for multi-threaded use */
+                       if (listName == null) {
+                               Map runningFetches = new 
HashMap(runningFetchesByURI);
+                               List queued = new ArrayList(queuedURIList);
+                               Set visited = new HashSet(visitedURIs);
+                               Set failed = new HashSet(failedURIs);
+                               contentNode.addChild(createNavbar(pageMaker, 
runningFetches.size(), queued.size(), visited.size(), failed.size()));
+                               contentNode.addChild(createAddBox(pageMaker, 
context));
+                               contentNode.addChild(createList(pageMaker, 
"Running FetcheIIIs", "running", runningFetches.keySet(), maxShownURIs));
+                               contentNode.addChild(createList(pageMaker, 
"Queued URIs", "queued", queued, maxShownURIs));
+                               contentNode.addChild(createList(pageMaker, 
"Visited URIs", "visited", visited, maxShownURIs));
+                               contentNode.addChild(createList(pageMaker, 
"Failed URIs", "failed", failed, maxShownURIs));
+                       } else {
+                               contentNode.addChild(createBackBox(pageMaker));
+                               if ("failed".equals(listName)) {
+                                       Set failed = new HashSet(failedURIs);
+                                       
contentNode.addChild(createList(pageMaker, "Failed URIs", "failed", failed, 
-1));       
+                               } else if ("visited".equals(listName)) {
+                                       Set visited = new HashSet(visitedURIs);
+                                       
contentNode.addChild(createList(pageMaker, "Visited URIs", "visited", visited, 
-1));
+                               } else if ("queued".equals(listName)) {
+                                       List queued = new 
ArrayList(queuedURIList);
+                                       
contentNode.addChild(createList(pageMaker, "Queued URIs", "queued", queued, 
-1));
+                               } else if ("running".equals(listName)) {
+                                       Map runningFetches = new 
HashMap(runningFetchesByURI);
+                                       
contentNode.addChild(createList(pageMaker, "Running Fetches", "running", 
runningFetches.keySet(), -1));
+                               }
+                       }
+                       MultiValueTable responseHeaders = new MultiValueTable();
+                       byte[] responseBytes = 
pageNode.generate().getBytes("utf-8");
+                       context.sendReplyHeaders(200, "OK", responseHeaders, 
"text/html; charset=utf-8", responseBytes.length);
+                       context.writeData(responseBytes);
+               } else if ("add".equals(action)) {
+                       String uriParam = request.getParam("key");
+                       try {
+                               FreenetURI uri = new FreenetURI(uriParam);
+                               synchronized (this) {
+                                       failedURIs.remove(uri);
+                                       visitedURIs.remove(uri);
+                               }
+                               queueURI(uri);
+                               startSomeRequests();
+                       } catch (MalformedURLException mue1) {
+                               sendSimpleResponse(context, "URL invalid", "The 
given URI is not valid.");
+                               return;
+                       }
+                       MultiValueTable responseHeaders = new MultiValueTable();
+                       responseHeaders.put("Location", "?action=list");
+                       context.sendReplyHeaders(301, "Redirect", 
responseHeaders, "text/html; charset=utf-8", 0);
+                       return;
+               }
+       }
+
+       /**
+        * @see 
freenet.oldplugins.plugin.HttpPlugin#handlePost(freenet.clients.http.HTTPRequestImpl,
 freenet.clients.http.ToadletContext)
+        */
+       public void handlePost(HTTPRequest request, ToadletContext context) 
throws IOException {
+       }
+       
+       private void sendSimpleResponse(ToadletContext context, String title, 
String message) throws ToadletContextClosedException, IOException {
+               PageMaker pageMaker = context.getPageMaker();
+               HTMLNode pageNode = pageMaker.getPageNode(title, context);
+               HTMLNode contentNode = pageMaker.getContentNode(pageNode);
+               HTMLNode infobox = 
contentNode.addChild(pageMaker.getInfobox("infobox-alter", title));
+               HTMLNode infoboxContent = pageMaker.getContentNode(infobox);
+               infoboxContent.addChild("#", message);
+               byte[] responseBytes = pageNode.generate().getBytes("utf-8");
+               context.sendReplyHeaders(200, "OK", new MultiValueTable(), 
"text/html; charset=utf-8", responseBytes.length);
+               context.writeData(responseBytes);
+       }
+       
+       private HTMLNode createBackBox(PageMaker pageMaker) {
+               HTMLNode backbox = pageMaker.getInfobox((String) null);
+               HTMLNode backContent = pageMaker.getContentNode(backbox);
+               backContent.addChild("#", "Return to the ");
+               backContent.addChild("a", "href", "?action=list", "list of all 
URIs");
+               backContent.addChild("#", ".");
+               return backbox;
+       }
+       
+       private HTMLNode createAddBox(PageMaker pageMaker, ToadletContext ctx) {
+               HTMLNode addBox = pageMaker.getInfobox("Add a URI");
+               HTMLNode formNode = 
pageMaker.getContentNode(addBox).addChild("form", new String[] { "action", 
"method" }, new String[] { "", "get" });
+               formNode.addChild("input", new String[] { "type", "name", 
"value" }, new String[] { "hidden", "action", "add" });
+               formNode.addChild("input", new String[] { "type", "size", 
"name", "value" }, new String[] { "text", "40", "key", "" });
+               formNode.addChild("input", new String[] { "type", "value" }, 
new String[] { "submit", "Add URI" });
+               return addBox;
+       }
+
+       private HTMLNode createNavbar(PageMaker pageMaker, int running, int 
queued, int visited, int failed) {
+               HTMLNode navbar = pageMaker.getInfobox("navbar", "Page 
Navigation");
+               HTMLNode list = pageMaker.getContentNode(navbar).addChild("ul");
+               list.addChild("li").addChild("a", "href", "#running", "Running 
(" + running + ')');
+               list.addChild("li").addChild("a", "href", "#queued", "Queued (" 
+ queued + ')');
+               list.addChild("li").addChild("a", "href", "#visited", "Visited 
(" + visited + ')');
+               list.addChild("li").addChild("a", "href", "#failed", "Failed (" 
+ failed + ')');
+               return navbar;
+       }
+
+       private HTMLNode createList(PageMaker pageMaker, String listName, 
String anchorName, Collection collection, int maxCount) {
+               HTMLNode listNode = new HTMLNode("div");
+               listNode.addChild("a", "name", anchorName);
+               HTMLNode listBox = pageMaker.getInfobox(listName);
+               HTMLNode listContent = pageMaker.getContentNode(listBox);
+               listNode.addChild(listBox);
+               Iterator collectionItems = collection.iterator();
+               int itemCount = 0;
+               while (collectionItems.hasNext()) {
+                       FreenetURI uri = (FreenetURI) collectionItems.next();
+                       listContent.addChild("#", uri.toString());
+                       listContent.addChild("br");
+                       if (itemCount++ == maxCount) {
+                               listContent.addChild("br");
+                               listContent.addChild("a", "href", 
"?action=list&listName=" + anchorName, "Show all\u2026");
+                               break;
+                       }
+               }
+               return listNode;
+       }
+
+       /**
+        * @see freenet.oldplugins.plugin.Plugin#getPluginName()
+        */
+       public String getPluginName() {
+               return pluginName;
+       }
+
+       /**
+        * @see 
freenet.oldplugins.plugin.Plugin#setPluginManager(freenet.oldplugins.plugin.PluginManager)
+        */
+       public void setPluginManager(PluginManager pluginManager) {
+               this.core = pluginManager.getClientCore();
+               this.ctx = core.makeClient((short) 0).getFetchContext();
+               ctx.maxSplitfileBlockRetries = 10;
+               ctx.maxNonSplitfileRetries = 10;
+               ctx.maxTempLength = 2 * 1024 * 1024;
+               ctx.maxOutputLength = 2 * 1024 * 1024;
+               tProducedIndex = System.currentTimeMillis();
+       }
+
+
+       /**
+        * @see freenet.oldplugins.plugin.Plugin#startPlugin()
+        */
+       public void startPlugin() {
+               stopped = false;
+               
+               Thread starterThread = new Thread("Spider Plugin Starter") {
+                       public void run() {
+                               startSomeRequests();
+                       }
+               };
+               starterThread.setDaemon(true);
+               starterThread.start();
+       }
+
+       /**
+        * @see freenet.oldplugins.plugin.Plugin#stopPlugin()
+        */
+       public void stopPlugin() {
+               synchronized (this) {
+                       stopped = true;
+                       queuedURIList.clear();
+               }
+       }
+
+       public void onMajorProgress() {
+               // Ignore
+       }
+
+       public void onFetchable(BaseClientPutter state) {
+               // Ignore
+       }
+       private static String convertToHex(byte[] data) {
+        StringBuffer buf = new StringBuffer();
+        for (int i = 0; i < data.length; i++) {
+               int halfbyte = (data[i] >>> 4) & 0x0F;
+               int two_halfs = 0;
+               do {
+                       if ((0 <= halfbyte) && (halfbyte <= 9))
+                       buf.append((char) ('0' + halfbyte));
+                   else
+                       buf.append((char) ('a' + (halfbyte - 10)));
+                       halfbyte = data[i] & 0x0F;
+               } while(two_halfs++ < 1);
+        }
+        return buf.toString();
+    }
+       //this function will return the String representation of the MD5 hash 
for the input string 
+       public static String MD5(String text) throws NoSuchAlgorithmException, 
UnsupportedEncodingException  {
+               MessageDigest md;
+               md = MessageDigest.getInstance("MD5");
+               byte[] md5hash = new byte[32];
+               md.update(text.getBytes("iso-8859-1"), 0, text.length());
+               md5hash = md.digest();
+               return convertToHex(md5hash);
+       }
+       
+       public void generateSubIndex(String filename){
+//generates the new subIndex
+               File outputFile = new File(filename);
+               StreamResult resultStream;
+               resultStream = new StreamResult(outputFile);
+
+               /* Initialize xml builder */
+               Document xmlDoc = null;
+               DocumentBuilderFactory xmlFactory = null;
+               DocumentBuilder xmlBuilder = null;
+               DOMImplementation impl = null;
+               Element rootElement = null;
+
+               xmlFactory = DocumentBuilderFactory.newInstance();
+
+
+               try {
+                       xmlBuilder = xmlFactory.newDocumentBuilder();
+               } catch(javax.xml.parsers.ParserConfigurationException e) {
+                       /* Will (should ?) never happen */
+                       Logger.error(this, "Spider: Error while initializing 
XML generator: "+e.toString());
+                       return;
+               }
+
+
+               impl = xmlBuilder.getDOMImplementation();
+
+               /* Starting to generate index */
+
+               xmlDoc = impl.createDocument(null, "sub_index", null);
+               rootElement = xmlDoc.getDocumentElement();
+
+               /* Adding header to the index */
+               Element headerElement = xmlDoc.createElement("header");
+
+               /* -> title */
+               Element subHeaderElement = xmlDoc.createElement("title");
+               Text subHeaderText = xmlDoc.createTextNode(indexTitle);
+               
+               subHeaderElement.appendChild(subHeaderText);
+               headerElement.appendChild(subHeaderElement);
+
+               /* -> owner */
+               subHeaderElement = xmlDoc.createElement("owner");
+               subHeaderText = xmlDoc.createTextNode(indexOwner);
+               
+               subHeaderElement.appendChild(subHeaderText);
+               headerElement.appendChild(subHeaderElement);
+               
+       
+               /* -> owner email */
+               if(indexOwnerEmail != null) {
+                       subHeaderElement = xmlDoc.createElement("email");
+                       subHeaderText = xmlDoc.createTextNode(indexOwnerEmail);
+                       
+                       subHeaderElement.appendChild(subHeaderText);
+                       headerElement.appendChild(subHeaderElement);
+               }
+
+               
+               Element filesElement = xmlDoc.createElement("files"); /* 
filesElement != fileElement */
+
+               Element EntriesElement = xmlDoc.createElement("entries");
+               EntriesElement.setNodeValue("0");
+               EntriesElement.setAttribute("value", "0");
+               //all index files are ready
+               /* Adding word index */
+               Element keywordsElement = xmlDoc.createElement("keywords");
+               
+               rootElement.appendChild(EntriesElement);
+               rootElement.appendChild(headerElement);
+               rootElement.appendChild(filesElement);
+               rootElement.appendChild(keywordsElement);
+
+               /* Serialization */
+               DOMSource domSource = new DOMSource(xmlDoc);
+               TransformerFactory transformFactory = 
TransformerFactory.newInstance();
+               Transformer serializer;
+
+               try {
+                       serializer = transformFactory.newTransformer();
+               } catch(javax.xml.transform.TransformerConfigurationException 
e) {
+                       Logger.error(this, "Spider: Error while serializing XML 
(transformFactory.newTransformer()): "+e.toString());
+                       return;
+               }
+
+
+               serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
+               serializer.setOutputProperty(OutputKeys.INDENT,"yes");
+               
+               /* final step */
+               try {
+                       serializer.transform(domSource, resultStream);
+               } catch(javax.xml.transform.TransformerException e) {
+                       Logger.error(this, "Spider: Error while serializing XML 
(transform()): "+e.toString());
+                       return;
+               }
+
+               if(Logger.shouldLog(Logger.MINOR, this))
+                       Logger.minor(this, "Spider: indexes regenerated.");
+       }
+
+       
+       
+}


Reply via email to