Author: swatig0
Date: 2007-06-21 17:29:08 +0000 (Thu, 21 Jun 2007)
New Revision: 13687

Added:
   trunk/plugins/XMLSpider/
   trunk/plugins/XMLSpider/XMLSpider.java
Log:
trying to port to other api

Added: trunk/plugins/XMLSpider/XMLSpider.java
===================================================================
--- trunk/plugins/XMLSpider/XMLSpider.java	(rev 0)
+++ trunk/plugins/XMLSpider/XMLSpider.java	2007-06-21 17:29:08 UTC (rev 13687)
@@ -0,0 +1,1128 @@
+/* This code is part of Freenet. It is distributed under the GNU General
+ * Public License, version 2 (or at your option any later version). See
+ * http://www.gnu.org/ for further details of the GPL. */
+package plugins.XMLSpider;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.w3c.dom.Attr;
+import org.w3c.dom.DOMImplementation;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.Text;
+
+import freenet.client.ClientMetadata;
+import freenet.client.FetchContext;
+import freenet.client.FetchException;
+import freenet.client.FetchResult;
+import freenet.client.InsertException;
+import freenet.client.async.BaseClientPutter;
+import freenet.client.async.ClientCallback;
+import freenet.client.async.ClientGetter;
+import freenet.client.async.USKCallback;
+import freenet.clients.http.PageMaker;
+import freenet.clients.http.ToadletContext;
+import freenet.clients.http.ToadletContextClosedException;
+import freenet.clients.http.filter.ContentFilter;
+import freenet.clients.http.filter.FoundURICallback;
+import freenet.clients.http.filter.UnsafeContentTypeException;
+import freenet.keys.FreenetURI;
+import freenet.keys.USK;
+import freenet.node.Node;
+import freenet.node.NodeClientCore;
+import freenet.node.RequestStarter;
+import freenet.oldplugins.plugin.HttpPlugin;
+import freenet.oldplugins.plugin.PluginManager;
+import freenet.pluginmanager.FredPlugin;
+import freenet.pluginmanager.FredPluginHTTP;
+import freenet.pluginmanager.FredPluginHTTPAdvanced;
+import freenet.pluginmanager.FredPluginThreadless;
+import freenet.pluginmanager.PluginHTTPException;
+import freenet.pluginmanager.PluginRespirator;
+import freenet.support.HTMLNode;
+import freenet.support.Logger;
+import freenet.support.MultiValueTable;
+import freenet.support.api.Bucket;
+import freenet.support.api.HTTPRequest;
+
+/**
+ * Spider. Produces an index.
+ */
+public class XMLSpider implements FredPlugin, FredPluginHTTP, FredPluginThreadless, FredPluginHTTPAdvanced, HttpPlugin, ClientCallback, FoundURICallback, USKCallback {
+
+	long tProducedIndex;
+
+	// URIs visited, or fetching, or queued. Added once then forgotten about.
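+	// Overview of the bookkeeping structures declared below (all collections are pre-generics):
+	// queuedURISet/queuedURIList together form the fetch queue (the set de-duplicates, the list
+	// preserves FIFO order); runningFetchesByURI maps FreenetURI -> ClientGetter for fetches
+	// currently in flight; urisByWord maps a lower-cased word -> FreenetURI[] of the pages that
+	// contain it; titlesOfURIs maps a URI string -> the page title found while parsing.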
+	private final HashSet visitedURIs = new HashSet();
+	private final HashSet urisWithWords = new HashSet();
+	private final HashSet failedURIs = new HashSet();
+	private final HashSet queuedURISet = new HashSet();
+	private final LinkedList queuedURIList = new LinkedList();
+	private final HashMap runningFetchesByURI = new HashMap();
+	private final HashMap urisByWord = new HashMap();
+	private final HashMap titlesOfURIs = new HashMap();
+
+	private static final int minTimeBetweenEachIndexRewriting = 10;
+	//private static final String indexFilename = "index.xml";
+	private static final String DEFAULT_INDEX_DIR = "myindex/";
+	public Set allowedMIMETypes;
+	private static final int MAX_ENTRIES = 50;
+	private static final String pluginName = "XML spider";
+
+	private static final String indexTitle = "This is an index";
+	private static final String indexOwner = "Another anonymous";
+	private static final String indexOwnerEmail = null;
+	private final HashMap sizeOfURIs = new HashMap(); /* String (URI) -> Long */
+	private final HashMap mimeOfURIs = new HashMap(); /* String (URI) -> String */
+	private final HashMap lastPositionByURI = new HashMap(); /* String (URI) -> Integer */ /* Use to determine word position on each uri */
+	private final HashMap positionsByWordByURI = new HashMap(); /* String (URI) -> HashMap (String (word) -> Integer[] (Positions)) */
+
+	// Can have many; this limit only exists to save memory.
+	private static final int maxParallelRequests = 20;
+	private int maxShownURIs = 50;
+	private HashMap urisToNumbers;
+	private NodeClientCore core;
+	private FetchContext ctx;
+	private final short PRIORITY_CLASS = RequestStarter.IMMEDIATE_SPLITFILE_PRIORITY_CLASS;
+	private boolean stopped = true;
+	PluginRespirator pr;
+
+	private synchronized void queueURI(FreenetURI uri) {
+		//not adding the html condition
+		if ((!visitedURIs.contains(uri)) && queuedURISet.add(uri)) {
+			queuedURIList.addLast(uri);
+			visitedURIs.add(uri);
+		}
+	}
+
+	private void startSomeRequests() {
+		FreenetURI[] initialURIs = core.bookmarkManager.getBookmarkURIs();
+		for (int i = 0; i < initialURIs.length; i++) {
+			queueURI(initialURIs[i]);
+		}
+
+		ArrayList toStart = null;
+		synchronized (this) {
+			if (stopped) {
+				return;
+			}
+			int running = runningFetchesByURI.size();
+			int queued = queuedURIList.size();
+
+			if ((running >= maxParallelRequests) || (queued == 0))
+				return;
+
+			toStart = new ArrayList(Math.min(maxParallelRequests - running, queued));
+
+			for (int i = running; i < maxParallelRequests; i++) {
+				if (queuedURIList.isEmpty())
+					break;
+				FreenetURI uri = (FreenetURI) queuedURIList.removeFirst();
+				queuedURISet.remove(uri);
+				if ((uri.getKeyType()).equals("USK")) {
+					if (uri.getSuggestedEdition() < 0)
+						uri = uri.setSuggestedEdition((-1) * uri.getSuggestedEdition());
+					try {
+						(ctx.uskManager).subscribe(USK.create(uri), this, false, this);
+					} catch (Exception e) {
+					}
+				}
+				ClientGetter getter = makeGetter(uri);
+				toStart.add(getter);
+			}
+		}
+		for (int i = 0; i < toStart.size(); i++) {
+			ClientGetter g = (ClientGetter) toStart.get(i);
+			try {
+				runningFetchesByURI.put(g.getURI(), g);
+				g.start();
+				FileWriter outp = new FileWriter("logfile2", true);
+				outp.write("URI " + g.getURI().toString() + '\n');
+				outp.close();
+			} catch (FetchException e) {
+				onFailure(e, g);
+			} catch (IOException e) {
+				Logger.error(this, "the logfile can not be written" + e.toString(), e);
+			}
+		}
+	}
+
+	private ClientGetter makeGetter(FreenetURI uri) {
+		ClientGetter g = new ClientGetter(this,
core.requestStarters.chkFetchScheduler, core.requestStarters.sskFetchScheduler, uri, ctx, PRIORITY_CLASS, this, null, null); + return g; + } + + public void onSuccess(FetchResult result, ClientGetter state) { + FreenetURI uri = state.getURI(); + try{ + FileWriter output = new FileWriter("logfile",true); + output.write(uri.toString()+"\n"); + output.close(); + } + catch(Exception e){ + Logger.error(this, "The uri could not be removed from running "+e.toString(), e); + } + synchronized (this) { + runningFetchesByURI.remove(uri); + } + startSomeRequests(); + ClientMetadata cm = result.getMetadata(); + Bucket data = result.asBucket(); + String mimeType = cm.getMIMEType(); + + sizeOfURIs.put(uri.toString(), new Long(data.size())); + mimeOfURIs.put(uri.toString(), mimeType); + + try { + ContentFilter.filter(data, ctx.bucketFactory, mimeType, uri.toURI("http://127.0.0.1:8888/"), this); + } catch (UnsafeContentTypeException e) { + return; // Ignore + } catch (IOException e) { + Logger.error(this, "Bucket error?: " + e, e); + } catch (URISyntaxException e) { + Logger.error(this, "Internal error: " + e, e); + } finally { + data.free(); + } + } + + public void onFailure(FetchException e, ClientGetter state) { + FreenetURI uri = state.getURI(); + try{ + FileWriter outp = new FileWriter("failed",true); + outp.write("failed "+e.toString()+" for "+uri+'\n'); + outp.close(); + + }catch(Exception e2){ + + } + synchronized (this) { + runningFetchesByURI.remove(uri); + failedURIs.add(uri); + } + if (e.newURI != null) + queueURI(e.newURI); +// else +// queueURI(uri); + startSomeRequests(); + + + } + + public void onSuccess(BaseClientPutter state) { + // Ignore + } + + public void onFailure(InsertException e, BaseClientPutter state) { + // Ignore + } + + public void onGeneratedURI(FreenetURI uri, BaseClientPutter state) { + // Ignore + } + + public void foundURI(FreenetURI uri) { + queueURI(uri); + startSomeRequests(); + } + + public void onText(String s, String type, URI baseURI) { + + FreenetURI uri; + try { + uri = new FreenetURI(baseURI.getPath().substring(1)); + } catch (MalformedURLException e) { + Logger.error(this, "Caught " + e, e); + return; + } + + + + if((type != null) && (type.length() != 0) && type.toLowerCase().equals("title") + && (s != null) && (s.length() != 0) && (s.indexOf('\n') < 0)) { + /* We should have a correct title */ + titlesOfURIs.put(uri.toString(), s); + type = "title"; + + } + else type = null; + + + String[] words = s.split("[^A-Za-z0-9]"); + + Integer lastPosition = null; + + lastPosition = (Integer)lastPositionByURI.get(uri.toString()); + + if(lastPosition == null) + lastPosition = new Integer(1); /* We start to count from 1 */ + + for (int i = 0; i < words.length; i++) { + String word = words[i]; + if ((word == null) || (word.length() == 0)) + continue; + word = word.toLowerCase(); + try{ + if(type == null) + addWord(word, lastPosition.intValue() + i, uri); + else + addWord(word, -1 * (i+1), uri); + } + catch (Exception e){} + } + + if(type == null) { + lastPosition = new Integer(lastPosition.intValue() + words.length); + lastPositionByURI.put(uri.toString(), lastPosition); + } + + } + + private synchronized void addWord(String word, int position,FreenetURI uri) throws Exception{ + + + if(word.length() < 3) + return; + + + FreenetURI[] uris = (FreenetURI[]) urisByWord.get(word); + + //Integer[] positions = (Integer[]) positionsByWordByURI.get(word); + + urisWithWords.add(uri); + + + /* Word position indexation */ + HashMap wordPositionsForOneUri = 
(HashMap)positionsByWordByURI.get(uri.toString()); /* For a given URI, take as key a word, and gives position */ + + if(wordPositionsForOneUri == null) { + wordPositionsForOneUri = new HashMap(); + wordPositionsForOneUri.put(word, new Integer[] { new Integer(position) }); + positionsByWordByURI.put(uri.toString(), wordPositionsForOneUri); + } else { + Integer[] positions = (Integer[])wordPositionsForOneUri.get(word); + + if(positions == null) { + positions = new Integer[] { new Integer(position) }; + wordPositionsForOneUri.put(word, positions); + } else { + Integer[] newPositions = new Integer[positions.length + 1]; + + System.arraycopy(positions, 0, newPositions, 0, positions.length); + newPositions[positions.length] = new Integer(position); + + wordPositionsForOneUri.put(word, newPositions); + } + } + + if (uris == null) { + urisByWord.put(word, new FreenetURI[] { uri }); + + } else { + for (int i = 0; i < uris.length; i++) { + if (uris[i].equals(uri)) + return; + } + FreenetURI[] newURIs = new FreenetURI[uris.length + 1]; + System.arraycopy(uris, 0, newURIs, 0, uris.length); + newURIs[uris.length] = uri; + urisByWord.put(word, newURIs); + } + if (tProducedIndex + minTimeBetweenEachIndexRewriting * 1000 < System.currentTimeMillis()) { + try { + produceIndex(); + generateIndex(); + } catch (IOException e) { + Logger.error(this, "Caught " + e + " while creating index", e); + } + tProducedIndex = System.currentTimeMillis(); + } + + } + + private synchronized void produceIndex() throws IOException,NoSuchAlgorithmException { + // Produce the main index file. + + //the number of bits to consider for matching + int prefix = 1 ; + + if (urisByWord.isEmpty() || urisWithWords.isEmpty()) { + System.out.println("No URIs with words"); + return; + } + File outputFile = new File(DEFAULT_INDEX_DIR+"index.xml"); + StreamResult resultStream; + resultStream = new StreamResult(outputFile); + + /* Initialize xml builder */ + Document xmlDoc = null; + DocumentBuilderFactory xmlFactory = null; + DocumentBuilder xmlBuilder = null; + DOMImplementation impl = null; + Element rootElement = null; + + xmlFactory = DocumentBuilderFactory.newInstance(); + + + try { + xmlBuilder = xmlFactory.newDocumentBuilder(); + } catch(javax.xml.parsers.ParserConfigurationException e) { + /* Will (should ?) 
never happen */ + Logger.error(this, "Spider: Error while initializing XML generator: "+e.toString()); + return; + } + + impl = xmlBuilder.getDOMImplementation(); + /* Starting to generate index */ + xmlDoc = impl.createDocument(null, "main_index", null); + rootElement = xmlDoc.getDocumentElement(); + + /* Adding header to the index */ + Element headerElement = xmlDoc.createElement("header"); + + /* -> title */ + Element subHeaderElement = xmlDoc.createElement("title"); + Text subHeaderText = xmlDoc.createTextNode(indexTitle); + + subHeaderElement.appendChild(subHeaderText); + headerElement.appendChild(subHeaderElement); + + /* -> owner */ + subHeaderElement = xmlDoc.createElement("owner"); + subHeaderText = xmlDoc.createTextNode(indexOwner); + + subHeaderElement.appendChild(subHeaderText); + headerElement.appendChild(subHeaderElement); + + /* -> owner email */ + if(indexOwnerEmail != null) { + subHeaderElement = xmlDoc.createElement("email"); + subHeaderText = xmlDoc.createTextNode(indexOwnerEmail); + + subHeaderElement.appendChild(subHeaderText); + headerElement.appendChild(subHeaderElement); + } + + + String[] words = (String[]) urisByWord.keySet().toArray(new String[urisByWord.size()]); + Arrays.sort(words); + FreenetURI[] uris = (FreenetURI[]) urisWithWords.toArray(new FreenetURI[urisWithWords.size()]); + urisToNumbers = new HashMap(); + Element prefixElement = xmlDoc.createElement("prefix"); + prefixElement.setAttribute("value", prefix+""); + + + for (int i = 0; i < uris.length; i++) { + urisToNumbers.put(uris[i], new Integer(i)); + } + + //all index files are ready + /* Adding word index */ + Element keywordsElement = xmlDoc.createElement("keywords"); + for(int i = 0;i<16;i++){ + generateSubIndex(DEFAULT_INDEX_DIR+"index_"+Integer.toHexString(i)+".xml"); + Element subIndexElement = xmlDoc.createElement("subIndex"); + if(i<=9) + subIndexElement.setAttribute("key",i+""); + else + subIndexElement.setAttribute("key",Integer.toHexString(i)); + //the subindex element key will contain the bits used for matching in that subindex + keywordsElement.appendChild(subIndexElement); + } + + + // make sure that prefix is the first child of root Element + rootElement.appendChild(prefixElement); + rootElement.appendChild(headerElement); + + //rootElement.appendChild(filesElement); + rootElement.appendChild(keywordsElement); + + /* Serialization */ + DOMSource domSource = new DOMSource(xmlDoc); + TransformerFactory transformFactory = TransformerFactory.newInstance(); + Transformer serializer; + + try { + serializer = transformFactory.newTransformer(); + } catch(javax.xml.transform.TransformerConfigurationException e) { + Logger.error(this, "Spider: Error while serializing XML (transformFactory.newTransformer()): "+e.toString()); + return; + } + + serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); + serializer.setOutputProperty(OutputKeys.INDENT,"yes"); + + /* final step */ + try { + serializer.transform(domSource, resultStream); + } catch(javax.xml.transform.TransformerException e) { + Logger.error(this, "Spider: Error while serializing XML (transform()): "+e.toString()); + return; + } + + if(Logger.shouldLog(Logger.MINOR, this)) + Logger.minor(this, "Spider: indexes regenerated."); + + //the main xml file is generated + //now as each word is generated enter it into the respective subindex + //now the parsing will start and nodes will be added as needed + + + } + + private synchronized void generateIndex() throws Exception{ + String[] words = (String[]) urisByWord.keySet().toArray(new 
String[urisByWord.size()]); + Arrays.sort(words); + for (int i = 0; i < words.length; i++) { + try{ + + String prefix_match = getIndex(words[i]); + + boolean addedWord = addWord(prefix_match,words[i]); + + if(addedWord == false) + { + split(prefix_match); + regenerateIndex(prefix_match); + prefix_match = getIndex(words[i]); + addWord(prefix_match,words[i]); + } + } + catch(Exception e2){Logger.error(this,"The Word could not be added"+ e2.toString(), e2); } + } + + + } + private void regenerateIndex(String prefix) throws Exception{ + //redistribute the entries in prefix.xml to prefix(0-f).xml + DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); + DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); + Document doc = docBuilder.parse(DEFAULT_INDEX_DIR+"index_"+prefix+".xml"); + Element root = doc.getDocumentElement(); + NodeList wordList = root.getElementsByTagName("word"); + for(int i = 0;i<wordList.getLength();i++){ + Element word = (Element)wordList.item(i); + String value = word.getAttribute("v"); + String prefix_match = getIndex(value); + addWord(prefix_match,value); + } + } + + private String getIndex(String word) throws Exception { + DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); + DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); + Document doc = docBuilder.parse(DEFAULT_INDEX_DIR+"index.xml"); + Element root = doc.getDocumentElement(); + Attr prefix_value = (Attr) (root.getElementsByTagName("prefix").item(0)).getAttributes().getNamedItem("value"); + int prefix = Integer.parseInt(prefix_value.getValue()); + String md5 = MD5(word); + NodeList subindexList = root.getElementsByTagName("subIndex"); + String str = md5.substring(0,prefix); + String prefix_match = search(str,subindexList); + + return prefix_match; + } + + private boolean addWord(String prefix, String str) throws Exception + { + //this word has to be added to the particular subindex + // modify the corresponding index + try{ + DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); + DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); + Document doc = docBuilder.parse(DEFAULT_INDEX_DIR+"index_"+prefix+".xml"); + Element root = doc.getDocumentElement(); + + Element entry = (Element) root.getElementsByTagName("entries").item(0); + + Attr no_entries = (Attr) entry.getAttributes().getNamedItem("value"); + + Element filesElement = (Element) root.getElementsByTagName("files").item(0); + NodeList filesList = filesElement.getElementsByTagName("file"); + if(Integer.parseInt(no_entries.getValue()) >= MAX_ENTRIES) return false; + else + { + //increment the number of entries + entry.setAttribute("value",(Integer.parseInt(no_entries.getValue())+1)+""); + //add the entry + + Element wordElement = doc.createElement("word"); + wordElement.setAttribute("v", str); + + FreenetURI[] urisForWord = (FreenetURI[]) urisByWord.get(str); + + /* URI by URI */ + for (int j = 0; j < urisForWord.length; j++) { + FreenetURI uri = urisForWord[j]; + Integer x = (Integer) urisToNumbers.get(uri); + + if (x == null) { + Logger.error(this, "Eh?"); + continue; + } + + Element uriElement = doc.createElement("file"); + Element fileElement = doc.createElement("file"); + uriElement.setAttribute("id", x.toString()); + fileElement.setAttribute("id", x.toString()); + fileElement.setAttribute("key", uri.toString()); +// /* Position by position */ + HashMap positionsForGivenWord = (HashMap)positionsByWordByURI.get(uri.toString()); + Integer[] positions = 
(Integer[])positionsForGivenWord.get(str); + + StringBuffer positionList = new StringBuffer(); + + for(int k=0; k < positions.length ; k++) { + if(k!=0) + positionList.append(','); + + positionList.append(positions[k].toString()); + } + + uriElement.appendChild(doc.createTextNode(positionList.toString())); + int l; + for(l = 0;l<filesList.getLength();l++) + { Element file = (Element) filesList.item(l); + if(file.getAttribute("id").equals(x.toString())) + + break; + } + wordElement.appendChild(uriElement); + if(l>=filesList.getLength()) + filesElement.appendChild(fileElement); + } + Element keywordsElement = (Element) root.getElementsByTagName("keywords").item(0); + keywordsElement.appendChild(wordElement); + + + + DOMSource domSource = new DOMSource(doc); + TransformerFactory transformFactory = TransformerFactory.newInstance(); + Transformer serializer; + + + serializer = transformFactory.newTransformer(); + + + + File outputFile = new File(DEFAULT_INDEX_DIR+"index_"+prefix+".xml"); + StreamResult resultStream; + resultStream = new StreamResult(outputFile); + + serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); + serializer.setOutputProperty(OutputKeys.INDENT,"yes"); + + /* final step */ + try { + serializer.transform(domSource, resultStream); + } catch(javax.xml.transform.TransformerException e) {} + } + + return true; + } + + catch(Exception e){Logger.error(this,"Word could not be added to the subindex"+ e.toString(), e);} + return false; + } + private void split(String prefix) throws Exception + { + //first we need to split the current subindex into 16 newones + //then read from the original one and append to the new ones + // make the entry in the main index.. + DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); + DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); + Document doc = docBuilder.parse(DEFAULT_INDEX_DIR+"index.xml"); + Element root = doc.getDocumentElement(); + Element prefixElt =(Element) root.getElementsByTagName("prefix").item(0); + int prefix_current = Integer.parseInt(prefixElt.getAttribute("value")); + if (prefix_current <= prefix.length()) + prefixElt.setAttribute("value", (prefix_current+1)+""); + + Element keywordElement = (Element) root.getElementsByTagName("keywords").item(0); + + NodeList subIndexElt = root.getElementsByTagName("subIndex"); + for(int i =0;i<subIndexElt.getLength();i++) + { + Element subIndex = (Element) subIndexElt.item(i); + if((subIndex.getAttribute("key")).equals(prefix)) { + keywordElement.removeChild(subIndex); + break; + } + } + + for(int i = 0;i<16;i++) + { + Element subIndex = doc.createElement("subIndex"); + generateSubIndex(DEFAULT_INDEX_DIR+"index_"+prefix+Integer.toHexString(i)+".xml"); + subIndex.setAttribute("key",prefix.concat(Integer.toHexString(i))); + keywordElement.appendChild(subIndex); + } + + + DOMSource domSource = new DOMSource(doc); + TransformerFactory transformFactory = TransformerFactory.newInstance(); + Transformer serializer; + serializer = transformFactory.newTransformer(); + File outputFile = new File(DEFAULT_INDEX_DIR+"index.xml"); + StreamResult resultStream; + resultStream = new StreamResult(outputFile); + + serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); + serializer.setOutputProperty(OutputKeys.INDENT,"yes"); + + /* final step */ + try { + serializer.transform(domSource, resultStream); + } catch(javax.xml.transform.TransformerException e) {} + } + + public String search(String str,NodeList list) throws Exception + { + int prefix = str.length(); + for(int 
i = 0;i<list.getLength();i++){ + Element subIndex = (Element) list.item(i); + String key = subIndex.getAttribute("key"); + if(key.equals(str)) return key; + } + return search(str.substring(0, prefix-1),list); + } + + + public void handleGet(HTTPRequest request, ToadletContext context) throws IOException, ToadletContextClosedException { + String action = request.getParam("action"); + PageMaker pageMaker = context.getPageMaker(); + if ((action == null) || (action.length() == 0)) { + MultiValueTable responseHeaders = new MultiValueTable(); + responseHeaders.put("Location", "?action=list"); + context.sendReplyHeaders(301, "Redirect", responseHeaders, "text/html; charset=utf-8", 0); + return; + } else if ("list".equals(action)) { + + String listName = request.getParam("listName", null); + HTMLNode pageNode = pageMaker.getPageNode("The XML Spider", context); + HTMLNode contentNode = pageMaker.getContentNode(pageNode); + /* create copies for multi-threaded use */ + if (listName == null) { + Map runningFetches = new HashMap(runningFetchesByURI); + List queued = new ArrayList(queuedURIList); + Set visited = new HashSet(visitedURIs); + Set failed = new HashSet(failedURIs); + contentNode.addChild(createNavbar(pageMaker, runningFetches.size(), queued.size(), visited.size(), failed.size())); + contentNode.addChild(createAddBox(pageMaker, context)); + contentNode.addChild(createList(pageMaker, "Running FetcheIIIs", "running", runningFetches.keySet(), maxShownURIs)); + contentNode.addChild(createList(pageMaker, "Queued URIs", "queued", queued, maxShownURIs)); + contentNode.addChild(createList(pageMaker, "Visited URIs", "visited", visited, maxShownURIs)); + contentNode.addChild(createList(pageMaker, "Failed URIs", "failed", failed, maxShownURIs)); + } else { + contentNode.addChild(createBackBox(pageMaker)); + if ("failed".equals(listName)) { + Set failed = new HashSet(failedURIs); + contentNode.addChild(createList(pageMaker, "Failed URIs", "failed", failed, -1)); + } else if ("visited".equals(listName)) { + Set visited = new HashSet(visitedURIs); + contentNode.addChild(createList(pageMaker, "Visited URIs", "visited", visited, -1)); + } else if ("queued".equals(listName)) { + List queued = new ArrayList(queuedURIList); + contentNode.addChild(createList(pageMaker, "Queued URIs", "queued", queued, -1)); + } else if ("running".equals(listName)) { + Map runningFetches = new HashMap(runningFetchesByURI); + contentNode.addChild(createList(pageMaker, "Running Fetches", "running", runningFetches.keySet(), -1)); + } + } + MultiValueTable responseHeaders = new MultiValueTable(); + byte[] responseBytes = pageNode.generate().getBytes("utf-8"); + context.sendReplyHeaders(200, "OK", responseHeaders, "text/html; charset=utf-8", responseBytes.length); + context.writeData(responseBytes); + } else if ("add".equals(action)) { + String uriParam = request.getParam("key"); + try { + FreenetURI uri = new FreenetURI(uriParam); + synchronized (this) { + failedURIs.remove(uri); + visitedURIs.remove(uri); + } + queueURI(uri); + startSomeRequests(); + } catch (MalformedURLException mue1) { + sendSimpleResponse(context, "URL invalid", "The given URI is not valid."); + return; + } + MultiValueTable responseHeaders = new MultiValueTable(); + responseHeaders.put("Location", "?action=list"); + context.sendReplyHeaders(301, "Redirect", responseHeaders, "text/html; charset=utf-8", 0); + return; + } + } + + /** + * @see freenet.oldplugins.plugin.HttpPlugin#handlePost(freenet.clients.http.HTTPRequestImpl, freenet.clients.http.ToadletContext) 
+ */ + public void handlePost(HTTPRequest request, ToadletContext context) throws IOException { + } + + private void sendSimpleResponse(ToadletContext context, String title, String message) throws ToadletContextClosedException, IOException { + PageMaker pageMaker = context.getPageMaker(); + HTMLNode pageNode = pageMaker.getPageNode(title, context); + HTMLNode contentNode = pageMaker.getContentNode(pageNode); + HTMLNode infobox = contentNode.addChild(pageMaker.getInfobox("infobox-alter", title)); + HTMLNode infoboxContent = pageMaker.getContentNode(infobox); + infoboxContent.addChild("#", message); + byte[] responseBytes = pageNode.generate().getBytes("utf-8"); + context.sendReplyHeaders(200, "OK", new MultiValueTable(), "text/html; charset=utf-8", responseBytes.length); + context.writeData(responseBytes); + } + + private HTMLNode createBackBox(PageMaker pageMaker) { + HTMLNode backbox = pageMaker.getInfobox((String) null); + HTMLNode backContent = pageMaker.getContentNode(backbox); + backContent.addChild("#", "Return to the "); + backContent.addChild("a", "href", "?action=list", "list of all URIs"); + backContent.addChild("#", "."); + return backbox; + } + + private HTMLNode createAddBox(PageMaker pageMaker, ToadletContext ctx) { + HTMLNode addBox = pageMaker.getInfobox("Add a URI"); + HTMLNode formNode = pageMaker.getContentNode(addBox).addChild("form", new String[] { "action", "method" }, new String[] { "", "get" }); + formNode.addChild("input", new String[] { "type", "name", "value" }, new String[] { "hidden", "action", "add" }); + formNode.addChild("input", new String[] { "type", "size", "name", "value" }, new String[] { "text", "40", "key", "" }); + formNode.addChild("input", new String[] { "type", "value" }, new String[] { "submit", "Add URI" }); + return addBox; + } + + private HTMLNode createNavbar(PageMaker pageMaker, int running, int queued, int visited, int failed) { + HTMLNode navbar = pageMaker.getInfobox("navbar", "Page Navigation"); + HTMLNode list = pageMaker.getContentNode(navbar).addChild("ul"); + list.addChild("li").addChild("a", "href", "#running", "Running (" + running + ')'); + list.addChild("li").addChild("a", "href", "#queued", "Queued (" + queued + ')'); + list.addChild("li").addChild("a", "href", "#visited", "Visited (" + visited + ')'); + list.addChild("li").addChild("a", "href", "#failed", "Failed (" + failed + ')'); + return navbar; + } + + private HTMLNode createList(PageMaker pageMaker, String listName, String anchorName, Collection collection, int maxCount) { + HTMLNode listNode = new HTMLNode("div"); + listNode.addChild("a", "name", anchorName); + HTMLNode listBox = pageMaker.getInfobox(listName); + HTMLNode listContent = pageMaker.getContentNode(listBox); + listNode.addChild(listBox); + Iterator collectionItems = collection.iterator(); + int itemCount = 0; + while (collectionItems.hasNext()) { + FreenetURI uri = (FreenetURI) collectionItems.next(); + listContent.addChild("#", uri.toString()); + listContent.addChild("br"); + if (itemCount++ == maxCount) { + listContent.addChild("br"); + listContent.addChild("a", "href", "?action=list&listName=" + anchorName, "Show all\u2026"); + break; + } + } + return listNode; + } + + /** + * @see freenet.oldplugins.plugin.Plugin#getPluginName() + */ + public String getPluginName() { + return pluginName; + } + + /** + * @see freenet.oldplugins.plugin.Plugin#setPluginManager(freenet.oldplugins.plugin.PluginManager) + */ + public void setPluginManager(PluginManager pluginManager) { + + this.core = 
pluginManager.getClientCore(); + this.ctx = core.makeClient((short) 0).getFetchContext(); + ctx.maxSplitfileBlockRetries = 10; + ctx.maxNonSplitfileRetries = 10; + ctx.maxTempLength = 2 * 1024 * 1024; + ctx.maxOutputLength = 2 * 1024 * 1024; + allowedMIMETypes = new HashSet(); + allowedMIMETypes.add(new String("text/html")); + allowedMIMETypes.add(new String("text/plain")); + allowedMIMETypes.add(new String("application/xhtml+xml")); + // allowedMIMETypes.add(new String("application/zip")); + ctx.allowedMIMETypes = new HashSet(allowedMIMETypes); + // ctx.allowedMIMETypes.add("text/html"); + tProducedIndex = System.currentTimeMillis(); + } + + + /** + * @see freenet.oldplugins.plugin.Plugin#startPlugin() + */ + public void startPlugin() { + stopped = false; + + Thread starterThread = new Thread("Spider Plugin Starter") { + public void run() { + try{ + Thread.sleep(30 * 1000); // Let the node start up + } catch (InterruptedException e){} + startSomeRequests(); + } + }; + starterThread.setDaemon(true); + starterThread.start(); + } + + /** + * @see freenet.oldplugins.plugin.Plugin#stopPlugin() + */ + public void stopPlugin() { + synchronized (this) { + stopped = true; + queuedURIList.clear(); + } + } + + public void onMajorProgress() { + // Ignore + } + + public void onFetchable(BaseClientPutter state) { + // Ignore + } + private static String convertToHex(byte[] data) { + StringBuffer buf = new StringBuffer(); + for (int i = 0; i < data.length; i++) { + int halfbyte = (data[i] >>> 4) & 0x0F; + int two_halfs = 0; + do { + if ((0 <= halfbyte) && (halfbyte <= 9)) + buf.append((char) ('0' + halfbyte)); + else + buf.append((char) ('a' + (halfbyte - 10))); + halfbyte = data[i] & 0x0F; + } while(two_halfs++ < 1); + } + return buf.toString(); + } + //this function will return the String representation of the MD5 hash for the input string + public static String MD5(String text) throws NoSuchAlgorithmException, UnsupportedEncodingException { + MessageDigest md; + md = MessageDigest.getInstance("MD5"); + byte[] md5hash = new byte[32]; + md.update(text.getBytes("iso-8859-1"), 0, text.length()); + md5hash = md.digest(); + return convertToHex(md5hash); + } + + public void generateSubIndex(String filename){ +//generates the new subIndex + File outputFile = new File(filename); + StreamResult resultStream; + resultStream = new StreamResult(outputFile); + + /* Initialize xml builder */ + Document xmlDoc = null; + DocumentBuilderFactory xmlFactory = null; + DocumentBuilder xmlBuilder = null; + DOMImplementation impl = null; + Element rootElement = null; + + xmlFactory = DocumentBuilderFactory.newInstance(); + + + try { + xmlBuilder = xmlFactory.newDocumentBuilder(); + } catch(javax.xml.parsers.ParserConfigurationException e) { + /* Will (should ?) 
never happen */ + Logger.error(this, "Spider: Error while initializing XML generator: "+e.toString()); + return; + } + + + impl = xmlBuilder.getDOMImplementation(); + + /* Starting to generate index */ + + xmlDoc = impl.createDocument(null, "sub_index", null); + rootElement = xmlDoc.getDocumentElement(); + + /* Adding header to the index */ + Element headerElement = xmlDoc.createElement("header"); + + /* -> title */ + Element subHeaderElement = xmlDoc.createElement("title"); + Text subHeaderText = xmlDoc.createTextNode(indexTitle); + + subHeaderElement.appendChild(subHeaderText); + headerElement.appendChild(subHeaderElement); + + /* -> owner */ + subHeaderElement = xmlDoc.createElement("owner"); + subHeaderText = xmlDoc.createTextNode(indexOwner); + + subHeaderElement.appendChild(subHeaderText); + headerElement.appendChild(subHeaderElement); + + + /* -> owner email */ + if(indexOwnerEmail != null) { + subHeaderElement = xmlDoc.createElement("email"); + subHeaderText = xmlDoc.createTextNode(indexOwnerEmail); + + subHeaderElement.appendChild(subHeaderText); + headerElement.appendChild(subHeaderElement); + } + + + Element filesElement = xmlDoc.createElement("files"); /* filesElement != fileElement */ + + Element EntriesElement = xmlDoc.createElement("entries"); + EntriesElement.setNodeValue("0"); + EntriesElement.setAttribute("value", "0"); + //all index files are ready + /* Adding word index */ + Element keywordsElement = xmlDoc.createElement("keywords"); + + rootElement.appendChild(EntriesElement); + rootElement.appendChild(headerElement); + rootElement.appendChild(filesElement); + rootElement.appendChild(keywordsElement); + + /* Serialization */ + DOMSource domSource = new DOMSource(xmlDoc); + TransformerFactory transformFactory = TransformerFactory.newInstance(); + Transformer serializer; + + try { + serializer = transformFactory.newTransformer(); + } catch(javax.xml.transform.TransformerConfigurationException e) { + Logger.error(this, "Spider: Error while serializing XML (transformFactory.newTransformer()): "+e.toString()); + return; + } + + + serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); + serializer.setOutputProperty(OutputKeys.INDENT,"yes"); + + /* final step */ + try { + serializer.transform(domSource, resultStream); + } catch(javax.xml.transform.TransformerException e) { + Logger.error(this, "Spider: Error while serializing XML (transform()): "+e.toString()); + return; + } + + if(Logger.shouldLog(Logger.MINOR, this)) + Logger.minor(this, "Spider: indexes regenerated."); + } + + + public String handleHTTPGet(HTTPRequest request) throws PluginHTTPException{ + StringBuffer out = new StringBuffer(); + // need to produce pretty html + //later fredpluginhttpadvanced will give the interface + String action = request.getParam("action"); + if(action == null || action.length() == 0){ + //put the default post fields + appendDefaultPageStart(out,null); + } else if ("list".equals(action)) { + String listName = request.getParam("listName", null); + } + + return out.toString(); + } + private void appendDefaultPageStart(StringBuffer out, String stylesheet) { + out.append("<HTML><HEAD><TITLE>" + pluginName + "</TITLE>"); + if(stylesheet != null) + out.append("<link href=\""+stylesheet+"\" type=\"text/css\" rel=\"stylesheet\" />"); + out.append("</HEAD><BODY>\n"); + out.append("<CENTER><H1>" + pluginName + "</H1><BR/><BR/><BR/>\n"); + } + public String handleHTTPPut(HTTPRequest request) throws PluginHTTPException{ + return null; + } + public String handleHTTPPost(HTTPRequest request) 
		throws PluginHTTPException{
+		return null;
+	}
+
+	public void terminate(){
+		synchronized (this) {
+			stopped = true;
+			queuedURIList.clear();
+		}
+	}
+
+	public void runPlugin(PluginRespirator pr){
+		this.pr = pr;
+		this.core = ((Node) pr.getNode()).clientCore;
+		this.ctx = core.makeClient((short) 0).getFetchContext();
+		ctx.maxSplitfileBlockRetries = 3;
+		ctx.maxNonSplitfileRetries = 10;
+		ctx.maxTempLength = 2 * 1024 * 1024;
+		ctx.maxOutputLength = 2 * 1024 * 1024;
+		allowedMIMETypes = new HashSet();
+		allowedMIMETypes.add(new String("text/html"));
+		allowedMIMETypes.add(new String("text/plain"));
+		allowedMIMETypes.add(new String("application/xhtml+xml"));
+		ctx.allowedMIMETypes = new HashSet(allowedMIMETypes);
+		tProducedIndex = System.currentTimeMillis();
+
+		stopped = false;
+
+		Thread starterThread = new Thread("Spider Plugin Starter") {
+			public void run() {
+				try {
+					Thread.sleep(30 * 1000); // Let the node start up
+				} catch (InterruptedException e) {}
+				startSomeRequests();
+			}
+		};
+		starterThread.setDaemon(true);
+		starterThread.start();
+	}
+
+	public void onFoundEdition(long l, USK key){
+		FreenetURI uri = key.getURI();
+		if(runningFetchesByURI.containsKey(uri)) runningFetchesByURI.remove(uri);
+		uri = key.getURI().setSuggestedEdition(l);
+		queueURI(uri);
+	}
+
+}
